def test_capacity(self):
    """
    Tests if insert correctly blocks when capacity is reached.
    """
    fifo_queue = FIFOQueue(capacity=self.capacity, record_space=self.record_space)
    test = ComponentTest(component=fifo_queue, input_spaces=self.input_spaces)

    def run(expected_):
        # Wait n seconds.
        time.sleep(2)
        # Pull something out of the queue again to continue.
        test.test(("get_records", 2), expected_outputs=expected_)

    # Insert one more element than capacity.
    records = self.record_space.sample(size=self.capacity + 1)

    expected = dict()
    for key, value in flatten_op(records).items():
        expected[key] = value[:2]
    expected = unflatten_op(expected)

    # Start a thread to save this one from getting stuck due to capacity overflow.
    thread = threading.Thread(target=run, args=(expected,))
    thread.start()

    print("Going over capacity: blocking ...")
    test.test(("insert_records", records), expected_outputs=None)
    print("Dequeued some items in another thread. Unblocked.")

    thread.join()
def test_enqueue_dequeue(self):
    """
    Simply tests the insert op without checking internal logic.
    """
    fifo_queue = FIFOQueue(capacity=self.capacity, record_space=self.record_space)
    test = ComponentTest(component=fifo_queue, input_spaces=self.input_spaces)

    first_record = self.record_space.sample(size=1)
    test.test(("insert_records", first_record), expected_outputs=None)
    test.test("get_size", expected_outputs=1)

    further_records = self.record_space.sample(size=5)
    test.test(("insert_records", further_records), expected_outputs=None)
    test.test("get_size", expected_outputs=6)

    expected = dict()
    for (k1, v1), (k2, v2) in zip(
            flatten_op(first_record).items(), flatten_op(further_records).items()
    ):
        expected[k1] = np.concatenate((v1, v2[:4]))
    expected = unflatten_op(expected)

    test.test(("get_records", 5), expected_outputs=expected)
    test.test("get_size", expected_outputs=1)
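def test_flatten_unflatten_roundtrip(self):
    """
    Illustrative sketch (not part of the original suite): the expected-output construction in the
    tests above assumes that `flatten_op` maps a nested record dict to a flat dict of leaf arrays
    and that `unflatten_op` restores the nested structure from those flattened keys.
    """
    nested = dict(a=np.array([1.0, 2.0, 3.0]), b=dict(c=np.array([4, 5, 6])))
    # Trim each leaf of the flattened dict, then restore the nested structure.
    trimmed = unflatten_op({key: value[:2] for key, value in flatten_op(nested).items()})
    assert np.array_equal(trimmed["a"], np.array([1.0, 2.0]))
    assert np.array_equal(trimmed["b"]["c"], np.array([4, 5]))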
def run2():
    fifo_queue_2 = FIFOQueue(capacity=self.capacity, device="/job:source/task:0/cpu")
    test_2 = ComponentTest(
        component=fifo_queue_2, input_spaces=self.input_spaces,
        execution_spec=dict(
            mode="distributed",
            distributed_spec=dict(job="target", task_index=0, cluster_spec=cluster_spec)
        )
    )
    # Dequeue elements in target.
    print("size of target-side queue:")
    print(test_2.test("get_size", expected_outputs=None))
    print("pulling from target-side queue:")
    print(test_2.test(("get_records", 5), expected_outputs=None))
def run1():
    fifo_queue_1 = FIFOQueue(capacity=self.capacity, device="/job:source/task:0/cpu")
    test_1 = ComponentTest(
        component=fifo_queue_1, input_spaces=self.input_spaces,
        execution_spec=dict(
            mode="distributed",
            distributed_spec=dict(job="source", task_index=0, cluster_spec=cluster_spec)
        )
    )
    # Insert elements from source.
    records = self.record_space.sample(size=self.capacity)
    print("inserting into source-side queue ...")
    test_1.test(("insert_records", records), expected_outputs=None)
    print("size of source-side queue:")
    print(test_1.test("get_size", expected_outputs=None))
    # Pull two samples back out.
    print("pulling from source-side queue:")
    print(test_1.test(("get_records", 2), expected_outputs=None))
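# Note (assumption, not from the original test): `run1` and `run2` above presuppose a `cluster_spec`
# defined in the enclosing test that names the two jobs "source" and "target", along the lines of:
#   cluster_spec = dict(source=["localhost:22222"], target=["localhost:22223"])
# The host/port values here are purely illustrative.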
def __init__(self, discount=0.99, fifo_queue_spec=None, architecture="large", environment_spec=None,
             feed_previous_action_through_nn=True, feed_previous_reward_through_nn=True,
             weight_pg=None, weight_baseline=None, weight_entropy=None,
             worker_sample_size=100, **kwargs):
    """
    Args:
        discount (float): The discount factor gamma.
        fifo_queue_spec (Optional[dict,FIFOQueue]): The spec for the FIFOQueue to use for the IMPALA algorithm.
        architecture (str): Which IMPALA architecture to use. One of "small" or "large". Will be ignored if
            `network_spec` is given explicitly in kwargs. Default: "large".
        environment_spec (dict): The spec for constructing an Environment object for an actor-type IMPALA agent.
        feed_previous_action_through_nn (bool): Whether to add the previous action as another input channel to
            the ActionComponent's (NN's) input at each step. This is only possible if the state space is already
            a Dict. It will be added under the key "previous_action". Default: True.
        feed_previous_reward_through_nn (bool): Whether to add the previous reward as another input channel to
            the ActionComponent's (NN's) input at each step. This is only possible if the state space is already
            a Dict. It will be added under the key "previous_reward". Default: True.
        weight_pg (float): See IMPALALossFunction Component.
        weight_baseline (float): See IMPALALossFunction Component.
        weight_entropy (float): See IMPALALossFunction Component.
        worker_sample_size (int): How many steps the actor will perform in the environment each sample-run.

    Keyword Args:
        type (str): One of "single", "actor" or "learner". Default: "single".
    """
    type_ = kwargs.pop("type", "single")
    assert type_ in ["single", "actor", "learner"]
    self.type = type_
    self.worker_sample_size = worker_sample_size

    # Network-spec by default is a "large architecture" IMPALA network.
    self.network_spec = kwargs.pop(
        "network_spec",
        dict(type="rlgraph.components.neural_networks.impala.impala_networks.{}IMPALANetwork".format(
            "Large" if architecture == "large" else "Small"
        ))
    )
    if isinstance(self.network_spec, dict) and "type" in self.network_spec and \
            "IMPALANetwork" in self.network_spec["type"]:
        self.network_spec = default_dict(
            self.network_spec,
            dict(worker_sample_size=1 if self.type == "actor" else self.worker_sample_size + 1)
        )

    # Depending on the job-type, remove the pieces from the Agent-spec/graph we won't need.
    self.exploration_spec = kwargs.pop("exploration_spec", None)
    optimizer_spec = kwargs.pop("optimizer_spec", None)
    observe_spec = kwargs.pop("observe_spec", None)

    self.feed_previous_action_through_nn = feed_previous_action_through_nn
    self.feed_previous_reward_through_nn = feed_previous_reward_through_nn

    # Run everything in a single process.
    if self.type == "single":
        environment_spec = environment_spec or self.default_environment_spec
        update_spec = kwargs.pop("update_spec", None)
    # Actors won't need to learn (no optimizer needed in graph).
    elif self.type == "actor":
        optimizer_spec = None
        update_spec = kwargs.pop("update_spec", dict(do_updates=False))
        environment_spec = environment_spec or self.default_environment_spec
    # Learners won't need to explore (act) or observe (insert into Queue).
    else:
        observe_spec = None
        update_spec = kwargs.pop("update_spec", None)
        environment_spec = None

    # Add previous-action/reward preprocessors to the env-specific preprocessor spec.
    # TODO: remove this empty hard-coded preprocessor.
    self.preprocessing_spec = kwargs.pop(
        "preprocessing_spec",
        dict(
            type="dict-preprocessor-stack",
            preprocessors=dict(
                # Flatten actions.
                previous_action=[
                    dict(type="reshape", flatten=True,
                         flatten_categories=kwargs.get("action_space").num_categories)
                ],
                # Bump reward and convert to float32, so that it can be concatenated by the Concat layer.
                previous_reward=[dict(type="reshape", new_shape=(1,))]
            )
        )
    )

    # Limit communication in distributed mode between each actor and the learner (never between actors).
    execution_spec = kwargs.pop("execution_spec", None)
    if execution_spec is not None and execution_spec.get("mode") == "distributed":
        default_dict(
            execution_spec["session_config"],
            dict(
                type="monitored-training-session",
                allow_soft_placement=True,
                device_filters=["/job:learner/task:0"] + (
                    ["/job:actor/task:{}".format(execution_spec["distributed_spec"]["task_index"])]
                    if self.type == "actor" else ["/job:learner/task:0"]
                )
            )
        )
        # If Actor, make non-chief in either case (even if task idx == 0).
        if self.type == "actor":
            execution_spec["distributed_spec"]["is_chief"] = False
            # Hard-set device to the CPU for actors.
            execution_spec["device_strategy"] = "custom"
            execution_spec["default_device"] = "/job:{}/task:{}/cpu".format(
                self.type, execution_spec["distributed_spec"]["task_index"]
            )

    self.policy_spec = kwargs.pop("policy_spec", dict())
    # TODO: Create some auto-setting based on LSTM inside the NN.
    default_dict(
        self.policy_spec,
        dict(
            type="shared-value-function-policy",
            deterministic=False,
            reuse_variable_scope="shared-policy",
            action_space=kwargs.get("action_space")
        )
    )

    # Now that we have fixed the Agent's spec, call the super constructor.
    super(IMPALAAgent, self).__init__(
        discount=discount,
        preprocessing_spec=self.preprocessing_spec,
        network_spec=self.network_spec,
        policy_spec=self.policy_spec,
        exploration_spec=self.exploration_spec,
        optimizer_spec=optimizer_spec,
        observe_spec=observe_spec,
        update_spec=update_spec,
        execution_spec=execution_spec,
        name=kwargs.pop("name", "impala-{}-agent".format(self.type)),
        **kwargs
    )

    # Always use the 1st learner as the parameter server for all policy variables.
    if self.execution_spec["mode"] == "distributed" and \
            self.execution_spec["distributed_spec"]["cluster_spec"]:
        self.policy.propagate_sub_component_properties(
            dict(device=dict(variables="/job:learner/task:0/cpu"))
        )

    # Check whether we have an RNN.
    self.has_rnn = self.policy.neural_network.has_rnn()
    # Check whether we are running with a GPU.
    self.has_gpu = self.execution_spec["gpu_spec"]["gpus_enabled"] is True and \
        self.execution_spec["gpu_spec"]["num_gpus"] > 0

    # Some FIFO-queue specs.
    self.fifo_queue_keys = ["terminals", "states"] + \
        (["actions"] if not self.feed_previous_action_through_nn else []) + \
        (["rewards"] if not self.feed_previous_reward_through_nn else []) + \
        ["action_probs"] + \
        (["initial_internal_states"] if self.has_rnn else [])
    # Define the FIFO record space.
    # Note that only states and internal_states (RNN) contain num-steps+1 items; all other sub-records
    # only contain num-steps items.
    self.fifo_record_space = Dict(
        {
            "terminals": bool,
            "action_probs": FloatBox(shape=(self.action_space.num_categories,)),
        },
        add_batch_rank=False,
        add_time_rank=self.worker_sample_size
    )
    self.fifo_record_space["states"] = self.state_space.with_time_rank(self.worker_sample_size + 1)

    # Feed previous action/reward through the NN (as part of the state) or give them their own channels?
    if self.feed_previous_action_through_nn:
        self.fifo_record_space["states"]["previous_action"] = \
            self.action_space.with_time_rank(self.worker_sample_size + 1)
    else:
        self.fifo_record_space["actions"] = self.action_space.with_time_rank(self.worker_sample_size)
    if self.feed_previous_reward_through_nn:
        self.fifo_record_space["states"]["previous_reward"] = FloatBox(
            add_time_rank=self.worker_sample_size + 1
        )
    else:
        self.fifo_record_space["rewards"] = FloatBox(add_time_rank=self.worker_sample_size)

    if self.has_rnn:
        self.fifo_record_space["initial_internal_states"] = \
            self.internal_states_space.with_time_rank(False)

    # Create our FIFOQueue (actors will enqueue, learner(s) will dequeue).
    self.fifo_queue = FIFOQueue.from_spec(
        fifo_queue_spec or dict(capacity=1),
        reuse_variable_scope="shared-fifo-queue",
        only_insert_single_records=True,
        record_space=self.fifo_record_space,
        device="/job:learner/task:0/cpu" if self.execution_spec["mode"] == "distributed" and
        self.execution_spec["distributed_spec"]["cluster_spec"] else None
    )

    # Remove the `states` key from input_spaces: not needed.
    del self.input_spaces["states"]

    # Add all our sub-components to the core.
    if self.type == "single":
        pass

    elif self.type == "actor":
        # No learning, no loss function.
        self.loss_function = None
        # A Dict splitter to split things from the EnvStepper.
        self.env_output_splitter = ContainerSplitter(tuple_length=4, scope="env-output-splitter")
        self.states_dict_splitter = None
        # Slice some data from the EnvStepper (e.g. only the first internal states are needed).
        self.internal_states_slicer = Slice(scope="internal-states-slicer", squeeze=True)
        # Merge back to insert into FIFO.
        self.fifo_input_merger = DictMerger(*self.fifo_queue_keys)
        # Dummy flattener to calculate the action-probs space.
        dummy_flattener = ReShape(flatten=True, flatten_categories=self.action_space.num_categories)
        self.environment_stepper = EnvironmentStepper(
            environment_spec=environment_spec,
            actor_component_spec=ActorComponent(self.preprocessor, self.policy, self.exploration),
            state_space=self.state_space.with_batch_rank(),
            reward_space=float,  # TODO <- float64 for deepmind? may not work for other envs
            internal_states_space=self.internal_states_space,
            num_steps=self.worker_sample_size,
            add_previous_action_to_state=True,
            add_previous_reward_to_state=True,
            add_action_probs=True,
            action_probs_space=dummy_flattener.get_preprocessed_space(self.action_space)
        )
        sub_components = [
            self.environment_stepper, self.env_output_splitter,
            self.internal_states_slicer, self.fifo_input_merger,
            self.fifo_queue
        ]
    # Learner.
    else:
        self.environment_stepper = None

        # A Dict splitter to split up items from the queue.
        self.fifo_input_merger = None
        self.fifo_output_splitter = ContainerSplitter(*self.fifo_queue_keys, scope="fifo-output-splitter")
        self.states_dict_splitter = ContainerSplitter(
            *list(self.fifo_record_space["states"].keys()), scope="states-dict-splitter"
        )
        self.internal_states_slicer = None

        self.transposer = Transpose(scope="transposer", device=dict(ops="/job:learner/task:0/cpu"))
        self.staging_area = StagingArea(num_data=len(self.fifo_queue_keys))

        # Create an IMPALALossFunction with some parameters.
        self.loss_function = IMPALALossFunction(
            discount=self.discount,
            weight_pg=weight_pg,
            weight_baseline=weight_baseline,
            weight_entropy=weight_entropy,
            slice_actions=self.feed_previous_action_through_nn,
            slice_rewards=self.feed_previous_reward_through_nn,
            device="/job:learner/task:0/gpu"
        )

        self.policy.propagate_sub_component_properties(
            dict(device=dict(variables="/job:learner/task:0/cpu", ops="/job:learner/task:0/gpu"))
        )
        for component in [self.staging_area, self.preprocessor, self.optimizer]:
            component.propagate_sub_component_properties(dict(device="/job:learner/task:0/gpu"))

        sub_components = [
            self.fifo_output_splitter, self.fifo_queue, self.states_dict_splitter,
            self.transposer, self.staging_area, self.preprocessor, self.policy,
            self.loss_function, self.optimizer
        ]

    if self.type != "single":
        # Add all the Agent's sub-components to the root.
        self.root_component.add_components(*sub_components)

        # Define the Agent's (root Component's) API.
        self.define_graph_api(*sub_components)

    if self.type != "single" and self.auto_build:
        if self.type == "learner":
            build_options = dict(
                build_device_context="/job:learner/task:0/cpu",
                pin_global_variable_device="/job:learner/task:0/cpu"
            )
            self._build_graph(
                [self.root_component], self.input_spaces,
                optimizer=self.optimizer, build_options=build_options
            )
        else:
            self._build_graph(
                [self.root_component], self.input_spaces,
                optimizer=self.optimizer, build_options=None
            )

        self.graph_built = True

        if self.has_gpu:
            # Get the 1st return op of API-method `stage` of sub-component `staging-area`
            # (which is the stage op).
            self.stage_op = self.root_component.sub_components["staging-area"].api_methods["stage"]. \
                out_op_columns[0].op_records[0].op
            # Initialize the stage.
            self.graph_executor.monitored_session.run_step_fn(
                lambda step_context: step_context.session.run(self.stage_op)
            )

            # TODO: remove after full refactor.
            self.dequeue_op = self.root_component.sub_components["fifo-queue"].api_methods["get_records"]. \
                out_op_columns[0].op_records[0].op
            if self.type == "actor":
                self.enqueue_op = self.root_component.sub_components["fifo-queue"].api_methods["insert_records"]. \
                    out_op_columns[0].op_records[0].op
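# Usage sketch (illustrative only, not part of the original file): an actor-type IMPALA agent would be
# constructed roughly as follows; the spaces, environment spec and queue capacity below are placeholders.
#
#   agent = IMPALAAgent(
#       type="actor",
#       architecture="small",
#       state_space=...,                 # the environment's Dict state space
#       action_space=...,                # a categorical (IntBox) action space
#       environment_spec=dict(...),      # spec of the env the actor steps through
#       fifo_queue_spec=dict(capacity=1),
#       worker_sample_size=100,
#   )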