class DummyNNWithDictInput(NeuralNetwork):
    """
    Dummy NN with dict input. Takes a dict with keys "a" and "b", passes each value through its own
    (parallel, not connected in any way) dense layer and then concatenates the two outputs to yield
    the final output.
    """

    def __init__(self, num_units_a=3, num_units_b=2, scope="dummy-nn-with-dict-input", **kwargs):
        super(DummyNNWithDictInput, self).__init__(scope=scope, **kwargs)

        self.num_units_a = num_units_a
        self.num_units_b = num_units_b

        # Splits the input into two streams.
        self.splitter = ContainerSplitter("a", "b")
        self.stack_a = DenseLayer(units=self.num_units_a, scope="dense-a")
        self.stack_b = DenseLayer(units=self.num_units_b, scope="dense-b")
        self.concat_layer = ConcatLayer()

        # Add all sub-components to this one.
        self.add_components(self.splitter, self.stack_a, self.stack_b, self.concat_layer)

    @rlgraph_api
    def apply(self, input_dict):
        # Split the input dict into two streams.
        input_a, input_b = self.splitter.split(input_dict)

        # Get the two stack outputs.
        output_a = self.stack_a.apply(input_a)
        output_b = self.stack_b.apply(input_b)

        # Concat everything together, that's the output.
        concatenated_data = self.concat_layer.apply(output_a, output_b)

        return dict(output=concatenated_data)
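# A minimal usage sketch (not part of the original file): pushing a Dict-space sample through
# DummyNNWithDictInput. It assumes rlgraph's `ComponentTest` harness and the `Dict`/`FloatBox`
# space classes with the signatures used elsewhere in this codebase; the shapes are placeholders.
from rlgraph.spaces import Dict, FloatBox
from rlgraph.tests import ComponentTest

# The NN expects a dict with keys "a" and "b".
input_space = Dict({"a": FloatBox(shape=(4,)), "b": FloatBox(shape=(5,))}, add_batch_rank=True)

nn = DummyNNWithDictInput(num_units_a=3, num_units_b=2)
test = ComponentTest(component=nn, input_spaces=dict(input_dict=input_space))

# Sample a batch of 2 and run it through the network's `apply` API-method.
sample = input_space.sample(size=2)
result = test.test(("apply", sample))  # -> dict(output=<tensor of shape (2, num_units_a + num_units_b)>)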
def __init__(self, discount=0.99, fifo_queue_spec=None, architecture="large", environment_spec=None,
             feed_previous_action_through_nn=True, feed_previous_reward_through_nn=True,
             weight_pg=None, weight_baseline=None, weight_entropy=None,
             num_workers=1, worker_sample_size=100, dynamic_batching=False, visualize=False, **kwargs):
    """
    Args:
        discount (float): The discount factor gamma.
        architecture (str): Which IMPALA architecture to use. One of "small" or "large". Will be ignored if
            `network_spec` is given explicitly in kwargs. Default: "large".
        fifo_queue_spec (Optional[dict,FIFOQueue]): The spec for the FIFOQueue to use for the IMPALA algorithm.
        environment_spec (dict): The spec for constructing an Environment object for an actor-type IMPALA agent.
        feed_previous_action_through_nn (bool): Whether to add the previous action as another input channel to the
            ActionComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
            It will be added under the key "previous_action". Default: True.
        feed_previous_reward_through_nn (bool): Whether to add the previous reward as another input channel to the
            ActionComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
            It will be added under the key "previous_reward". Default: True.
        weight_pg (float): See IMPALALossFunction Component.
        weight_baseline (float): See IMPALALossFunction Component.
        weight_entropy (float): See IMPALALossFunction Component.
        num_workers (int): How many actors (workers) should be run in separate threads.
        worker_sample_size (int): How many steps the actor will perform in the environment each sample-run.
        dynamic_batching (bool): Whether to use DeepMind's custom dynamic batching op for wrapping the
            optimizer's step call. The batcher.so file must be compiled for this to work (see Docker file).
            Default: False.
        visualize (Union[int,bool]): Whether and how many workers to visualize.
            Default: False (no visualization).
    """
    # Now that we fixed the Agent's spec, call the super constructor.
    super(SingleIMPALAAgent, self).__init__(
        type="single",
        discount=discount,
        architecture=architecture,
        fifo_queue_spec=fifo_queue_spec,
        environment_spec=environment_spec,
        feed_previous_action_through_nn=feed_previous_action_through_nn,
        feed_previous_reward_through_nn=feed_previous_reward_through_nn,
        weight_pg=weight_pg,
        weight_baseline=weight_baseline,
        weight_entropy=weight_entropy,
        worker_sample_size=worker_sample_size,
        name=kwargs.pop("name", "impala-single-agent"),
        **kwargs
    )
    self.dynamic_batching = dynamic_batching
    self.num_workers = num_workers
    self.visualize = visualize

    # If we use dynamic batching, wrap the dynamic batcher around the policy's graph_fn that we
    # actually call below during our build.
    if self.dynamic_batching:
        self.policy = DynamicBatchingPolicy(policy_spec=self.policy, scope="")

    self.env_output_splitter = ContainerSplitter(
        tuple_length=3 if self.has_rnn is False else 4, scope="env-output-splitter"
    )
    self.fifo_output_splitter = ContainerSplitter(*self.fifo_queue_keys, scope="fifo-output-splitter")
    self.states_dict_splitter = ContainerSplitter(
        *list(self.fifo_record_space["states"].keys() if isinstance(self.state_space, Dict) else "dummy"),
        scope="states-dict-splitter"
    )

    self.staging_area = StagingArea(num_data=len(self.fifo_queue_keys))

    # Slice some data from the EnvStepper (e.g. only the first internal states are needed).
    if self.has_rnn:
        internal_states_slicer = Slice(scope="internal-states-slicer", squeeze=True)
    else:
        internal_states_slicer = None

    self.transposer = Transpose(scope="transposer")

    # Create an IMPALALossFunction with some parameters.
    self.loss_function = IMPALALossFunction(
        discount=self.discount, weight_pg=weight_pg, weight_baseline=weight_baseline,
        weight_entropy=weight_entropy, slice_actions=self.feed_previous_action_through_nn,
        slice_rewards=self.feed_previous_reward_through_nn
    )

    # Merge back to insert into FIFO.
    self.fifo_input_merger = DictMerger(*self.fifo_queue_keys)

    # Dummy Flattener to calculate action-probs space.
    dummy_flattener = ReShape(flatten=True, flatten_categories=self.action_space.num_categories)

    self.environment_steppers = list()
    for i in range(self.num_workers):
        environment_spec_ = copy.deepcopy(environment_spec)
        if self.visualize is True or (isinstance(self.visualize, int) and i + 1 <= self.visualize):
            environment_spec_["visualize"] = True

        # Force worker_sample_size for IMPALA NNs (LSTM) in env-stepper to be 1.
        policy_spec = copy.deepcopy(self.policy_spec)
        if isinstance(policy_spec, dict) and isinstance(policy_spec["network_spec"], dict) and \
                "type" in policy_spec["network_spec"] and "IMPALANetwork" in policy_spec["network_spec"]["type"]:
            policy_spec["network_spec"]["worker_sample_size"] = 1

        env_stepper = EnvironmentStepper(
            environment_spec=environment_spec_,
            actor_component_spec=ActorComponent(
                preprocessor_spec=self.preprocessing_spec,
                policy_spec=policy_spec,
                exploration_spec=self.exploration_spec
            ),
            state_space=self.state_space.with_batch_rank(),
            action_space=self.action_space.with_batch_rank(),
            reward_space=float,
            internal_states_space=self.internal_states_space,
            num_steps=self.worker_sample_size,
            add_action=not self.feed_previous_action_through_nn,
            add_reward=not self.feed_previous_reward_through_nn,
            add_previous_action_to_state=self.feed_previous_action_through_nn,
            add_previous_reward_to_state=self.feed_previous_reward_through_nn,
            add_action_probs=True,
            action_probs_space=dummy_flattener.get_preprocessed_space(self.action_space),
            scope="env-stepper-{}".format(i)
        )
        if self.dynamic_batching:
            env_stepper.actor_component.policy.parent_component = None
            env_stepper.actor_component.policy = DynamicBatchingPolicy(
                policy_spec=env_stepper.actor_component.policy, scope=""
            )
            env_stepper.actor_component.add_components(env_stepper.actor_component.policy)

        self.environment_steppers.append(env_stepper)

    # Create the QueueRunners (one for each env-stepper).
    self.queue_runner = QueueRunner(
        self.fifo_queue, "step", -1,  # -1: Take entire return value of API-method `step` as record to insert.
        self.env_output_splitter,
        self.fifo_input_merger,
        internal_states_slicer,
        *self.environment_steppers
    )

    sub_components = [
        self.fifo_output_splitter, self.fifo_queue, self.queue_runner, self.transposer,
        self.staging_area, self.preprocessor, self.states_dict_splitter,
        self.policy, self.loss_function, self.optimizer
    ]

    # Add all the agent's sub-components to the root.
    self.root_component.add_components(*sub_components)

    # Define the Agent's (root Component's) API.
    self.define_graph_api()

    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer, build_options=None
        )
        self.graph_built = True

        if self.has_gpu:
            # Get 1st return op of API-method `stage` of sub-component `staging-area` (which is the stage-op).
            self.stage_op = self.root_component.sub_components["staging-area"].api_methods["stage"]. \
                out_op_columns[0].op_records[0].op
            # Initialize the stage.
            self.graph_executor.monitored_session.run_step_fn(
                lambda step_context: step_context.session.run(self.stage_op)
            )

        # TODO remove after full refactor.
        self.dequeue_op = self.root_component.sub_components["fifo-queue"].api_methods["get_records"]. \
            out_op_columns[0].op_records[0].op
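# Hypothetical usage sketch (not part of the original file): constructing a single-process IMPALA
# agent from the constructor above. The environment spec, the `state_space`/`action_space` objects
# and the training loop are placeholder assumptions; the repo's own launch scripts may differ.
env_spec = dict(type="deepmind-lab", level_id="lt_hallway_slope",
                observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4)  # placeholder env spec

agent = SingleIMPALAAgent(
    architecture="large",
    environment_spec=env_spec,
    state_space=state_space,        # assumed: Dict space matching the env's observation keys
    action_space=action_space,      # assumed: IntBox over the env's discrete action set
    num_workers=4,                  # 4 EnvironmentStepper/QueueRunner pairs feeding the shared FIFO queue
    worker_sample_size=100,         # each record inserted into the queue is a 100-step rollout
    dynamic_batching=False          # True requires the compiled batcher.so op (see Docker file)
)

# Rollouts are produced by the env-steppers/QueueRunners and consumed from the FIFO queue;
# learning then amounts to repeated update() calls applying the IMPALALossFunction.
for _ in range(1000):
    agent.update()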
def __init__(self, discount=0.99, fifo_queue_spec=None, architecture="large", environment_spec=None,
             feed_previous_action_through_nn=True, feed_previous_reward_through_nn=True,
             weight_pg=None, weight_baseline=None, weight_entropy=None, worker_sample_size=100, **kwargs):
    """
    Args:
        discount (float): The discount factor gamma.
        architecture (str): Which IMPALA architecture to use. One of "small" or "large". Will be ignored if
            `network_spec` is given explicitly in kwargs. Default: "large".
        fifo_queue_spec (Optional[dict,FIFOQueue]): The spec for the FIFOQueue to use for the IMPALA algorithm.
        environment_spec (dict): The spec for constructing an Environment object for an actor-type IMPALA agent.
        feed_previous_action_through_nn (bool): Whether to add the previous action as another input channel to the
            ActionComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
            It will be added under the key "previous_action". Default: True.
        feed_previous_reward_through_nn (bool): Whether to add the previous reward as another input channel to the
            ActionComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
            It will be added under the key "previous_reward". Default: True.
        weight_pg (float): See IMPALALossFunction Component.
        weight_baseline (float): See IMPALALossFunction Component.
        weight_entropy (float): See IMPALALossFunction Component.
        worker_sample_size (int): How many steps the actor will perform in the environment each sample-run.

    Keyword Args:
        type (str): One of "single", "actor" or "learner". Default: "single".
    """
    type_ = kwargs.pop("type", "single")
    assert type_ in ["single", "actor", "learner"]
    self.type = type_
    self.worker_sample_size = worker_sample_size

    # Network-spec by default is a "large architecture" IMPALA network.
    self.network_spec = kwargs.pop(
        "network_spec",
        dict(type="rlgraph.components.neural_networks.impala.impala_networks.{}IMPALANetwork".format(
            "Large" if architecture == "large" else "Small"
        ))
    )
    if isinstance(self.network_spec, dict) and "type" in self.network_spec and \
            "IMPALANetwork" in self.network_spec["type"]:
        self.network_spec = default_dict(
            self.network_spec,
            dict(worker_sample_size=1 if self.type == "actor" else self.worker_sample_size + 1)
        )

    # Depending on the job-type, remove the pieces from the Agent-spec/graph we won't need.
    self.exploration_spec = kwargs.pop("exploration_spec", None)
    optimizer_spec = kwargs.pop("optimizer_spec", None)
    observe_spec = kwargs.pop("observe_spec", None)

    self.feed_previous_action_through_nn = feed_previous_action_through_nn
    self.feed_previous_reward_through_nn = feed_previous_reward_through_nn

    # Run everything in a single process.
    if self.type == "single":
        environment_spec = environment_spec or self.default_environment_spec
        update_spec = kwargs.pop("update_spec", None)
    # Actors won't need to learn (no optimizer needed in graph).
    elif self.type == "actor":
        optimizer_spec = None
        update_spec = kwargs.pop("update_spec", dict(do_updates=False))
        environment_spec = environment_spec or self.default_environment_spec
    # Learners won't need to explore (act) or observe (insert into Queue).
    else:
        observe_spec = None
        update_spec = kwargs.pop("update_spec", None)
        environment_spec = None

    # Add previous-action/reward preprocessors to env-specific preprocessor spec.
    # TODO: remove this empty hard-coded preprocessor.
    self.preprocessing_spec = kwargs.pop(
        "preprocessing_spec",
        dict(
            type="dict-preprocessor-stack",
            preprocessors=dict(
                # Flatten actions.
                previous_action=[
                    dict(type="reshape", flatten=True,
                         flatten_categories=kwargs.get("action_space").num_categories)
                ],
                # Bump reward and convert to float32, so that it can be concatenated by the Concat layer.
                previous_reward=[
                    dict(type="reshape", new_shape=(1,))
                ]
            )
        )
    )

    # Limit communication in distributed mode between each actor and the learner (never between actors).
    execution_spec = kwargs.pop("execution_spec", None)
    if execution_spec is not None and execution_spec.get("mode") == "distributed":
        default_dict(
            execution_spec["session_config"],
            dict(
                type="monitored-training-session",
                allow_soft_placement=True,
                device_filters=["/job:learner/task:0"] + (
                    ["/job:actor/task:{}".format(execution_spec["distributed_spec"]["task_index"])]
                    if self.type == "actor" else ["/job:learner/task:0"]
                )
            )
        )
        # If Actor, make non-chief in either case (even if task idx == 0).
        if self.type == "actor":
            execution_spec["distributed_spec"]["is_chief"] = False
            # Hard-set device to the CPU for actors.
            execution_spec["device_strategy"] = "custom"
            execution_spec["default_device"] = "/job:{}/task:{}/cpu".format(
                self.type, execution_spec["distributed_spec"]["task_index"]
            )

    self.policy_spec = kwargs.pop("policy_spec", dict())
    # TODO: Create some auto-setting based on LSTM inside the NN.
    default_dict(
        self.policy_spec,
        dict(
            type="shared-value-function-policy",
            deterministic=False,
            reuse_variable_scope="shared-policy",
            action_space=kwargs.get("action_space")
        )
    )

    # Now that we fixed the Agent's spec, call the super constructor.
    super(IMPALAAgent, self).__init__(
        discount=discount,
        preprocessing_spec=self.preprocessing_spec,
        network_spec=self.network_spec,
        policy_spec=self.policy_spec,
        exploration_spec=self.exploration_spec,
        optimizer_spec=optimizer_spec,
        observe_spec=observe_spec,
        update_spec=update_spec,
        execution_spec=execution_spec,
        name=kwargs.pop("name", "impala-{}-agent".format(self.type)),
        **kwargs
    )

    # Always use 1st learner as the parameter server for all policy variables.
    if self.execution_spec["mode"] == "distributed" and self.execution_spec["distributed_spec"]["cluster_spec"]:
        self.policy.propagate_sub_component_properties(
            dict(device=dict(variables="/job:learner/task:0/cpu"))
        )

    # Check whether we have an RNN.
    self.has_rnn = self.policy.neural_network.has_rnn()
    # Check whether we are running with a GPU.
    self.has_gpu = self.execution_spec["gpu_spec"]["gpus_enabled"] is True and \
        self.execution_spec["gpu_spec"]["num_gpus"] > 0

    # Some FIFO-queue specs.
    self.fifo_queue_keys = ["terminals", "states"] + \
        (["actions"] if not self.feed_previous_action_through_nn else []) + \
        (["rewards"] if not self.feed_previous_reward_through_nn else []) + \
        ["action_probs"] + \
        (["initial_internal_states"] if self.has_rnn else [])
    # Define FIFO record space.
    # Note that only states and internal_states (RNN) contain num-steps+1 items, all other sub-records only
    # contain num-steps items.
    self.fifo_record_space = Dict(
        {
            "terminals": bool,
            "action_probs": FloatBox(shape=(self.action_space.num_categories,)),
        },
        add_batch_rank=False,
        add_time_rank=self.worker_sample_size
    )
    self.fifo_record_space["states"] = self.state_space.with_time_rank(self.worker_sample_size + 1)
    # Fold previous action/reward into the state (as extra NN input channels) or keep them as separate records?
    if self.feed_previous_action_through_nn:
        self.fifo_record_space["states"]["previous_action"] = \
            self.action_space.with_time_rank(self.worker_sample_size + 1)
    else:
        self.fifo_record_space["actions"] = self.action_space.with_time_rank(self.worker_sample_size)
    if self.feed_previous_reward_through_nn:
        self.fifo_record_space["states"]["previous_reward"] = FloatBox(
            add_time_rank=self.worker_sample_size + 1
        )
    else:
        self.fifo_record_space["rewards"] = FloatBox(add_time_rank=self.worker_sample_size)
    if self.has_rnn:
        self.fifo_record_space["initial_internal_states"] = self.internal_states_space.with_time_rank(False)

    # Create our FIFOQueue (actors will enqueue, learner(s) will dequeue).
    self.fifo_queue = FIFOQueue.from_spec(
        fifo_queue_spec or dict(capacity=1),
        reuse_variable_scope="shared-fifo-queue",
        only_insert_single_records=True,
        record_space=self.fifo_record_space,
        device="/job:learner/task:0/cpu" if self.execution_spec["mode"] == "distributed" and
        self.execution_spec["distributed_spec"]["cluster_spec"] else None
    )

    # Remove `states` key from input_spaces: not needed.
    del self.input_spaces["states"]

    # Add all our sub-components to the core.
    if self.type == "single":
        pass

    elif self.type == "actor":
        # No learning, no loss function.
        self.loss_function = None
        # A Dict Splitter to split things from the EnvStepper.
        self.env_output_splitter = ContainerSplitter(tuple_length=4, scope="env-output-splitter")
        self.states_dict_splitter = None
        # Slice some data from the EnvStepper (e.g. only the first internal states are needed).
        self.internal_states_slicer = Slice(scope="internal-states-slicer", squeeze=True)
        # Merge back to insert into FIFO.
        self.fifo_input_merger = DictMerger(*self.fifo_queue_keys)
        # Dummy Flattener to calculate action-probs space.
        dummy_flattener = ReShape(flatten=True, flatten_categories=self.action_space.num_categories)
        self.environment_stepper = EnvironmentStepper(
            environment_spec=environment_spec,
            actor_component_spec=ActorComponent(self.preprocessor, self.policy, self.exploration),
            state_space=self.state_space.with_batch_rank(),
            reward_space=float,  # TODO <- float64 for deepmind? may not work for other envs
            internal_states_space=self.internal_states_space,
            num_steps=self.worker_sample_size,
            add_previous_action_to_state=True,
            add_previous_reward_to_state=True,
            add_action_probs=True,
            action_probs_space=dummy_flattener.get_preprocessed_space(self.action_space)
        )
        sub_components = [
            self.environment_stepper, self.env_output_splitter, self.internal_states_slicer,
            self.fifo_input_merger, self.fifo_queue
        ]
    # Learner.
    else:
        self.environment_stepper = None

        # A Dict splitter to split up items from the queue.
        self.fifo_input_merger = None
        self.fifo_output_splitter = ContainerSplitter(*self.fifo_queue_keys, scope="fifo-output-splitter")
        self.states_dict_splitter = ContainerSplitter(
            *list(self.fifo_record_space["states"].keys()), scope="states-dict-splitter"
        )
        self.internal_states_slicer = None

        self.transposer = Transpose(scope="transposer", device=dict(ops="/job:learner/task:0/cpu"))
        self.staging_area = StagingArea(num_data=len(self.fifo_queue_keys))

        # Create an IMPALALossFunction with some parameters.
        self.loss_function = IMPALALossFunction(
            discount=self.discount, weight_pg=weight_pg, weight_baseline=weight_baseline,
            weight_entropy=weight_entropy, slice_actions=self.feed_previous_action_through_nn,
            slice_rewards=self.feed_previous_reward_through_nn, device="/job:learner/task:0/gpu"
        )

        self.policy.propagate_sub_component_properties(
            dict(device=dict(variables="/job:learner/task:0/cpu", ops="/job:learner/task:0/gpu"))
        )
        for component in [self.staging_area, self.preprocessor, self.optimizer]:
            component.propagate_sub_component_properties(dict(device="/job:learner/task:0/gpu"))

        sub_components = [
            self.fifo_output_splitter, self.fifo_queue, self.states_dict_splitter,
            self.transposer, self.staging_area, self.preprocessor, self.policy,
            self.loss_function, self.optimizer
        ]

    if self.type != "single":
        # Add all the agent's sub-components to the root.
        self.root_component.add_components(*sub_components)

        # Define the Agent's (root Component's) API.
        self.define_graph_api(*sub_components)

    if self.type != "single" and self.auto_build:
        if self.type == "learner":
            build_options = dict(
                build_device_context="/job:learner/task:0/cpu",
                pin_global_variable_device="/job:learner/task:0/cpu"
            )
            self._build_graph(
                [self.root_component], self.input_spaces, optimizer=self.optimizer,
                build_options=build_options
            )
        else:
            self._build_graph(
                [self.root_component], self.input_spaces, optimizer=self.optimizer,
                build_options=None
            )

        self.graph_built = True

        if self.has_gpu:
            # Get 1st return op of API-method `stage` of sub-component `staging-area` (which is the stage-op).
            self.stage_op = self.root_component.sub_components["staging-area"].api_methods["stage"]. \
                out_op_columns[0].op_records[0].op
            # Initialize the stage.
            self.graph_executor.monitored_session.run_step_fn(
                lambda step_context: step_context.session.run(self.stage_op)
            )

        # TODO remove after full refactor.
        self.dequeue_op = self.root_component.sub_components["fifo-queue"].api_methods["get_records"]. \
            out_op_columns[0].op_records[0].op
        if self.type == "actor":
            self.enqueue_op = self.root_component.sub_components["fifo-queue"].api_methods["insert_records"]. \
                out_op_columns[0].op_records[0].op
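# Illustration only (not part of the original file): what `self.fifo_record_space` works out to for the
# defaults above (worker_sample_size=100, both previous action and reward fed through the NN, an RNN policy).
# The concrete state/action spaces below are hypothetical placeholders; only the time-rank bookkeeping
# (100 items per record vs. 100 + 1 bootstrap item for "states") mirrors the construction above.
from rlgraph.spaces import Dict, FloatBox, IntBox, TextBox

worker_sample_size = 100
action_space = IntBox(9)                                   # placeholder: 9 discrete actions
state_space = Dict(
    RGB_INTERLEAVED=FloatBox(shape=(96, 72, 3)),           # placeholder image observation
    INSTR=TextBox(),                                       # placeholder text (instruction) observation
    previous_action=IntBox(9),                             # action at t-1 (feed_previous_action_through_nn=True)
    previous_reward=FloatBox(shape=(1,))                   # reward at t-1 (feed_previous_reward_through_nn=True)
)

# terminals/action_probs carry worker_sample_size items; states carries worker_sample_size + 1 items.
fifo_record_space = Dict(
    {
        "terminals": bool,
        "action_probs": FloatBox(shape=(action_space.num_categories,)),
    },
    add_batch_rank=False,
    add_time_rank=worker_sample_size
)
fifo_record_space["states"] = state_space.with_time_rank(worker_sample_size + 1)
# With an RNN, only the internal (c, h) state at the start of the rollout is stored (no time rank).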
class IMPALANetwork(NeuralNetwork):
    """
    The base class for both the "large" and "small" architecture versions of the networks used in [1].

    [1] IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner Architectures -
        Espeholt, Soyer, Munos et al. - 2018 (https://arxiv.org/abs/1802.01561)
    """

    def __init__(self, worker_sample_size=100, scope="impala-network", **kwargs):
        """
        Args:
            worker_sample_size (int): How many time-steps an IMPALA actor will have performed in one rollout.
        """
        super(IMPALANetwork, self).__init__(scope=scope, **kwargs)

        self.worker_sample_size = worker_sample_size

        # Create all needed sub-components.

        # ContainerSplitter for the Env signal (dict of 4 keys: env image, env text, previous action and reward).
        self.splitter = ContainerSplitter("RGB_INTERLEAVED", "INSTR", "previous_action", "previous_reward",
                                          scope="input-splitter")

        # Fold the time rank into the batch rank.
        self.time_rank_fold_before_lstm = ReShape(fold_time_rank=True, scope="time-rank-fold-before-lstm")
        self.time_rank_unfold_before_lstm = ReShape(unfold_time_rank=True, time_major=True,
                                                    scope="time-rank-unfold-before-lstm")

        # The Image Processing Stack (left side of "Large Architecture" Figure 3 in [1]).
        # Conv2D column + ReLU + fc(256) + ReLU.
        self.image_processing_stack = self.build_image_processing_stack()

        # The text processing pipeline: Takes a batch of string tensors as input, creates a hash-bucket thereof,
        # and passes the output of the hash bucket through an embedding-lookup(20) layer. The output of the
        # embedding lookup is then passed through an LSTM(64).
        self.text_processing_stack = self.build_text_processing_stack()

        #self.debug_slicer = Slice(scope="internal-states-slicer", squeeze=True)

        # The concatenation layer (concatenates outputs from image/text processing stacks, previous action/reward).
        self.concat_layer = ConcatLayer()

        # The main LSTM (going into the ActionAdapter (next in the Policy Component that uses this NN Component)).
        # Use time-major as it's faster (according to the tf docs).
        self.main_lstm = LSTMLayer(units=256, scope="lstm-256", time_major=True,
                                   static_loop=self.worker_sample_size)

        # Add all sub-components to this one.
        self.add_components(
            self.splitter, self.image_processing_stack, self.text_processing_stack,
            self.concat_layer,
            self.main_lstm,
            self.time_rank_fold_before_lstm, self.time_rank_unfold_before_lstm,
            #self.debug_slicer
        )

    @staticmethod
    def build_image_processing_stack():
        """
        Builds the image processing pipeline for IMPALA and returns it.
        """
        raise NotImplementedError

    @staticmethod
    def build_text_processing_stack():
        """
        Helper function to build the text processing pipeline for both the large and small architectures,
        consisting of:
        - ReShape preprocessor to fold the incoming time rank into the batch rank.
        - StringToHashBucket Layer taking a batch of sentences and converting them to an indices-table of
          dimensions:
          cols=length of the longest sentence in the input
          rows=number of items in the batch
          The cols dimension can be interpreted as the time rank for a subsequent LSTM. The StringToHashBucket
          Component returns the sequence length of each batch item for exactly that purpose.
        - Embedding Lookup Layer of embedding size 20 and number of rows == num_hash_buckets (see previous layer).
        - LSTM processing the batched sequences of words coming from the embedding layer as batches of rows.
        """
        num_hash_buckets = 1000

        # Create a hash bucket from the sentences and use that bucket to do an embedding lookup (instead of
        # a vocabulary).
        string_to_hash_bucket = StringToHashBucket(num_hash_buckets=num_hash_buckets)
        embedding = EmbeddingLookup(embed_dim=20, vocab_size=num_hash_buckets, pad_empty=True)
        # The time rank for the LSTM is now the sequence of words in a sentence, NOT the original env time rank.
        # We will only use the last output of the LSTM-64 for further processing as that is the output after
        # having seen all words in the sentence.
        # The original env stepping time rank is currently folded into the batch rank and must be unfolded again
        # before passing it into the main LSTM.
        lstm64 = LSTMLayer(units=64, scope="lstm-64", time_major=False)

        tuple_splitter = ContainerSplitter(tuple_length=2, scope="tuple-splitter")

        def custom_apply(self, inputs):
            hash_bucket, lengths = self.sub_components["string-to-hash-bucket"].apply(inputs)

            embedding_output = self.sub_components["embedding-lookup"].apply(hash_bucket)

            # Return only the last output of the sentence; we are not interested in intermediate results
            # where the LSTM has not seen the entire sentence yet.
            # The last output is the final internal h-state (slot 1 in the returned LSTM tuple;
            # slot 0 is the final c-state).
            lstm_output = self.sub_components["lstm-64"].apply(embedding_output, sequence_length=lengths)
            lstm_final_internals = lstm_output["last_internal_states"]

            # Need to split once more because the LSTM state is always a tuple of final c- and h-states.
            _, lstm_final_h_state = self.sub_components["tuple-splitter"].split(lstm_final_internals)

            return lstm_final_h_state

        text_processing_stack = Stack(
            string_to_hash_bucket, embedding, lstm64, tuple_splitter,
            api_methods={("apply", custom_apply)},
            scope="text-stack"
        )

        return text_processing_stack

    @rlgraph_api
    def apply(self, input_dict, internal_states=None):
        # Split the input dict coming directly from the Env.
        _, _, _, orig_previous_reward = self.splitter.split(input_dict)

        folded_input = self.time_rank_fold_before_lstm.apply(input_dict)
        image, text, previous_action, previous_reward = self.splitter.split(folded_input)

        # Get the left-stack (image) and right-stack (text) output (see [1] for details).
        text_processing_output = self.text_processing_stack.apply(text)
        image_processing_output = self.image_processing_stack.apply(image)

        # Concat everything together.
        concatenated_data = self.concat_layer.apply(
            image_processing_output, text_processing_output, previous_action, previous_reward
        )

        unfolded_concatenated_data = self.time_rank_unfold_before_lstm.apply(concatenated_data,
                                                                             orig_previous_reward)

        # Feed concat'd input into main LSTM(256).
        lstm_output = self.main_lstm.apply(unfolded_concatenated_data, internal_states)

        return lstm_output
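# Sketch only (not part of the original file): what a small-architecture subclass might look like.
# The filter counts, kernel sizes and strides follow the "small architecture" described in [1];
# the Conv2DLayer/DenseLayer kwargs are assumptions about rlgraph's layer API, and the library's
# actual SmallIMPALANetwork may be implemented differently.
class SketchSmallIMPALANetwork(IMPALANetwork):
    @staticmethod
    def build_image_processing_stack():
        sub_components = []
        # Two Conv2D + ReLU layers: 16 filters 8x8 / stride 4, then 32 filters 4x4 / stride 2.
        for i, (num_filters, kernel_size, stride) in enumerate(zip([16, 32], [8, 4], [4, 2])):
            sub_components.append(Conv2DLayer(
                filters=num_filters, kernel_size=kernel_size, strides=stride,
                padding="same", activation="relu", scope="conv2d-{}".format(i)
            ))
        # Flatten the conv output and feed it through fc(256) + ReLU.
        sub_components.append(ReShape(flatten=True, scope="flatten"))
        sub_components.append(DenseLayer(units=256, activation="relu", scope="fc-256"))
        return Stack(*sub_components, scope="image-stack")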