class BatchApply(Component):
    """
    Takes an input with batch and time ranks, then folds the time rank into the batch rank,
    calls a certain API-method of some arbitrary child component, and unfolds the time rank again.
    """
    def __init__(self, sub_component, api_method_name, scope="batch-apply", **kwargs):
        """
        Args:
            sub_component (Component): The sub-Component to apply the batch to.
            api_method_name (str): The name of the API-method to call on the sub-component.
        """
        super(BatchApply, self).__init__(scope=scope, **kwargs)

        self.sub_component = sub_component
        self.api_method_name = api_method_name

        # Create the necessary reshape components.
        self.folder = ReShape(fold_time_rank=True, scope="folder")
        self.unfolder = ReShape(unfold_time_rank=True, scope="unfolder")

        self.add_components(self.sub_component, self.folder, self.unfolder)

    @rlgraph_api
    def call(self, input_):
        folded = self.folder.call(input_)
        applied = getattr(self.sub_component, self.api_method_name)(folded)
        unfolded = self.unfolder.call(applied, input_before_time_rank_folding=input_)
        return unfolded
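# --- Usage sketch (not part of the original source) ---
# A minimal, hedged example of wrapping a DenseLayer in BatchApply so that the layer only
# ever sees a single (folded) batch rank. Import paths are assumptions based on a typical
# RLGraph layout; the ComponentTest API follows the test snippets further below.
# `api_method_name="call"` matches this newer BatchApply variant; older versions (see the
# graph_fn-based variant below) use "apply" instead.
from rlgraph.components.layers.nn.dense_layer import DenseLayer  # assumed path
from rlgraph.spaces import FloatBox
from rlgraph.tests import ComponentTest  # assumed path

# Input with both batch and time ranks.
input_space = FloatBox(shape=(3,), add_batch_rank=True, add_time_rank=True)

# Dense layer applied per folded [batch * time] item.
batch_apply = BatchApply(sub_component=DenseLayer(units=4), api_method_name="call")

test = ComponentTest(component=batch_apply, input_spaces=dict(input_=input_space))
out = test.test(("call", input_space.sample((2, 5))), expected_outputs=None)
# Expect shape (2, 5, 4): time rank folded, dense layer applied, time rank unfolded.
test.terminate()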
class BatchApply(Component):
    """
    Takes an input with batch and time ranks, then folds the time rank into the batch rank,
    calls a certain API-method of some arbitrary child component, and unfolds the time rank again.
    """
    def __init__(self, sub_component, api_method_name, scope="batch-apply", **kwargs):
        """
        Args:
            sub_component (Component): The sub-Component to apply the batch to.
            api_method_name (str): The name of the API-method to call on the sub-component.
        """
        super(BatchApply, self).__init__(scope=scope, **kwargs)

        self.sub_component = sub_component
        self.api_method_name = api_method_name

        # Create the necessary reshape components.
        self.folder = ReShape(fold_time_rank=True, scope="folder")
        self.unfolder = ReShape(unfold_time_rank=True, scope="unfolder")

        self.add_components(self.sub_component, self.folder, self.unfolder)

    @rlgraph_api
    def apply(self, input_):
        folded = self._graph_fn_fold(input_)
        applied = self._graph_fn_apply(folded)
        unfolded = self._graph_fn_unfold(applied, input_)
        return unfolded

    @graph_fn(flatten_ops=True, split_ops=True)
    def _graph_fn_fold(self, input_):
        if get_backend() == "tf":
            # Fold the time rank.
            input_folded = self.folder.apply(input_)
            return input_folded

    @graph_fn
    def _graph_fn_apply(self, input_folded):
        if get_backend() == "tf":
            # Send the folded input through the sub-component.
            sub_component_out = getattr(self.sub_component, self.api_method_name)(input_folded)
            return sub_component_out

    @graph_fn(flatten_ops=True, split_ops=True)
    def _graph_fn_unfold(self, sub_component_out, orig_input):
        if get_backend() == "tf":
            # Un-fold the time rank again.
            output = self.unfolder.apply(
                sub_component_out, input_before_time_rank_folding=orig_input)
            return output
def test_keras_style_one_container_input_space(self):
    # Define one container input Space.
    input_space = Tuple(IntBox(3), FloatBox(shape=(4,)), add_batch_rank=True)

    # One-hot flatten the int tensor.
    flatten_layer_out = ReShape(flatten=True, flatten_categories=True)(input_space[0])
    # Run the float tensor through two dense layers.
    dense_1_out = DenseLayer(units=3, scope="d1")(input_space[1])
    dense_2_out = DenseLayer(units=5, scope="d2")(dense_1_out)
    # Concat everything.
    cat_out = ConcatLayer()(flatten_layer_out, dense_2_out)

    # Use the `outputs` arg to allow your network to trace back the data flow until the input space.
    # `inputs` is not needed here as we only have one single input (the Tuple).
    neural_net = NeuralNetwork(outputs=cat_out)

    test = ComponentTest(component=neural_net, input_spaces=dict(inputs=input_space))

    var_dict = neural_net.variable_registry
    w1_value = test.read_variable_values(var_dict["neural-network/d1/dense/kernel"])
    b1_value = test.read_variable_values(var_dict["neural-network/d1/dense/bias"])
    w2_value = test.read_variable_values(var_dict["neural-network/d2/dense/kernel"])
    b2_value = test.read_variable_values(var_dict["neural-network/d2/dense/bias"])

    # Batch of size=n.
    input_ = input_space.sample(4)

    expected = np.concatenate([  # concat everything
        one_hot(input_[0]),  # int flattening
        dense_layer(dense_layer(input_[1], w1_value, b1_value), w2_value, b2_value)  # float -> 2 x dense
    ], axis=-1)
    out = test.test(("call", tuple([input_])), expected_outputs=expected)

    test.terminate()
def build_image_processing_stack():
    """
    Constructs a ReShape preprocessor to fold the time rank into the batch rank.
    Then builds the 2 Conv2D Layers followed by ReLUs.
    Then adds: fc(256) + ReLU.
    """
    # Collect components for image stack before unfolding time-rank going into main LSTM.
    sub_components = list()

    # Divide by 255.
    sub_components.append(Divide(divisor=255, scope="divide-255"))

    for i, (num_filters, kernel_size, stride) in enumerate(zip([16, 32], [8, 4], [4, 2])):
        # Conv2D plus ReLU activation function.
        conv2d = Conv2DLayer(
            filters=num_filters, kernel_size=kernel_size, strides=stride,
            padding="same", activation="relu", scope="conv2d-{}".format(i)
        )
        sub_components.append(conv2d)

    # A Flatten preprocessor and then an fc block (surrounded by ReLUs) and a time-rank-unfolding.
    sub_components.extend([
        ReShape(flatten=True, scope="flatten"),  # Flattener (to flatten Conv2D output for the fc layer).
        DenseLayer(units=256),  # Dense layer.
        NNLayer(activation="relu", scope="relu-before-lstm"),
    ])

    image_stack = Stack(sub_components, scope="image-stack")

    return image_stack
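# --- Shape check (not part of the original source) ---
# A quick sanity check on the conv geometry above, assuming an 84x84 input resolution
# (the actual env resolution may differ). With "same" padding, each spatial side
# shrinks to ceil(side / stride) per layer.
import math

side = 84  # assumed square input resolution
for filters, stride in [(16, 4), (32, 2)]:
    side = math.ceil(side / stride)
    print("conv2d out: {}x{}x{}".format(side, side, filters))
# -> 21x21x16, then 11x11x32; flattening yields 11 * 11 * 32 = 3872 inputs
#    to the fc(256) layer.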
def test_keras_style_two_separate_input_spaces(self):
    # Define two input Spaces first. Independently (no container).
    input_space_1 = IntBox(3, add_batch_rank=True)
    input_space_2 = FloatBox(shape=(4,), add_batch_rank=True)

    # One-hot flatten the int tensor.
    flatten_layer_out = ReShape(flatten=True, flatten_categories=True)(input_space_1)
    # Run the float tensor through two dense layers.
    dense_1_out = DenseLayer(units=3, scope="d1")(input_space_2)
    dense_2_out = DenseLayer(units=5, scope="d2")(dense_1_out)
    # Concat everything.
    cat_out = ConcatLayer()(flatten_layer_out, dense_2_out)

    # Use the `outputs` arg to allow your network to trace back the data flow until the input space.
    neural_net = NeuralNetwork(inputs=[input_space_1, input_space_2], outputs=cat_out)

    test = ComponentTest(component=neural_net, input_spaces=dict(inputs=[input_space_1, input_space_2]))

    var_dict = neural_net.variable_registry
    w1_value = test.read_variable_values(var_dict["neural-network/d1/dense/kernel"])
    b1_value = test.read_variable_values(var_dict["neural-network/d1/dense/bias"])
    w2_value = test.read_variable_values(var_dict["neural-network/d2/dense/kernel"])
    b2_value = test.read_variable_values(var_dict["neural-network/d2/dense/bias"])

    # Batch of size=n.
    input_ = [input_space_1.sample(4), input_space_2.sample(4)]

    expected = np.concatenate([  # concat everything
        one_hot(input_[0]),  # int flattening
        dense_layer(dense_layer(input_[1], w1_value, b1_value), w2_value, b2_value)  # float -> 2 x dense
    ], axis=-1)
    out = test.test(("call", input_), expected_outputs=expected)

    test.terminate()
def __init__(self, action_space=None, final_shape=None, weights_spec=None, biases_spec=None,
             activation=None, pre_network_spec=None, scope="action-adapter", **kwargs):
    """
    Args:
        action_space (Optional[Space]): The action Space within which this Component will create actions.
            NOTE: Exactly one of `action_space` or `final_shape` must be provided.
        final_shape (Optional[Tuple[int]]): An optional final output shape (in case action_space is not
            provided). If None, will calculate the shape automatically from the given `action_space`.
            NOTE: Exactly one of `action_space` or `final_shape` must be provided.
        weights_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to
            initialize the weights of `self.action_layer`. Default: None (use default initializer).
        biases_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to
            initialize the biases of `self.action_layer`. Default: None (use default initializer,
            which is usually 0.0).
        activation (Optional[str]): The activation function to use for `self.action_layer`.
            Default: None (=linear).
        pre_network_spec (Optional[dict,NeuralNetwork]): A spec dict for a neural network coming before
            the last action layer. If None, only the action layer itself is applied.
    """
    # Build the action layer for this adapter based on the given action-space.
    self.action_space = None
    if action_space is not None:
        self.action_space = action_space.with_batch_rank()
        assert not isinstance(self.action_space, ContainerSpace), \
            "ERROR: ActionAdapter cannot handle ContainerSpaces!"

    units, self.final_shape = self.get_units_and_shape()

    action_layer = DenseLayer(
        units=units,
        activation=activation,
        weights_spec=weights_spec,
        biases_spec=biases_spec,
        scope="action-layer"
    )

    # Do we have a pre-NN?
    self.network = NeuralNetwork.from_spec(pre_network_spec, scope="action-network")  # type: NeuralNetwork
    self.network.add_layer(action_layer)

    # Add the reshape layer to match the action space's shape.
    self.network.add_layer(ReShape(new_shape=self.final_shape))

    super(ActionAdapter, self).__init__(self.network, scope=scope, **kwargs)
def build_image_processing_stack():
    """
    Constructs a ReShape preprocessor to fold the time rank into the batch rank.
    Then builds the 3 sequential Conv2D blocks that process the image information.
    Each of these 3 blocks consists of:
    - 1 Conv2D layer followed by a MaxPool2D
    - 2 residual blocks, each of which looks like:
      - ReLU + Conv2D + ReLU + Conv2D + element-wise add with original input

    Then adds: ReLU + fc(256) + ReLU.
    """
    # Collect components for image stack before unfolding time-rank going into main LSTM.
    sub_components = list()

    # Divide by 255.
    sub_components.append(Divide(divisor=255, scope="divide-255"))

    for i, num_filters in enumerate([16, 32, 32]):
        # Conv2D plus MaxPool2D.
        conv2d_plus_maxpool = Stack(
            Conv2DLayer(filters=num_filters, kernel_size=3, strides=1, padding="same"),
            MaxPool2DLayer(pool_size=3, strides=2, padding="same"),
            scope="conv-max"
        )

        # Single unit for the residual layers (ReLU + Conv2D 3x3 stride=1).
        residual_unit = Stack(
            NNLayer(activation="relu"),  # single ReLU
            Conv2DLayer(filters=num_filters, kernel_size=3, strides=1, padding="same"),
            scope="relu-conv"
        )
        # Residual Layer.
        residual_layer = ResidualLayer(residual_unit=residual_unit, repeats=2)
        # Repeat same residual layer 2x.
        residual_repeater = RepeaterStack(sub_component=residual_layer, repeats=2)

        sub_components.append(Stack(conv2d_plus_maxpool, residual_repeater, scope="conv-unit-{}".format(i)))

    # A Flatten preprocessor and then an fc block (surrounded by ReLUs) and a time-rank-unfolding.
    sub_components.extend([
        ReShape(flatten=True, scope="flatten"),  # Flattener (to flatten Conv2D output for the fc layer).
        NNLayer(activation="relu", scope="relu-1"),  # ReLU 1
        DenseLayer(units=256),  # Dense layer.
        NNLayer(activation="relu", scope="relu-2"),  # ReLU 2
    ])

    image_stack = Stack(sub_components, scope="image-stack")

    return image_stack
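# --- Usage sketch (not part of the original source) ---
# Hedged test for the residual image stack above. The 96x72x3 resolution matches the
# DeepMind Lab setup described in [1]; import paths and the Stack's "apply"/"inputs"
# naming are assumptions based on the surrounding snippets.
from rlgraph.spaces import FloatBox
from rlgraph.tests import ComponentTest  # assumed path

image_stack = build_image_processing_stack()
input_space = FloatBox(shape=(96, 72, 3), add_batch_rank=True)

test = ComponentTest(component=image_stack, input_spaces=dict(inputs=input_space))
out = test.test(("apply", input_space.sample(2)), expected_outputs=None)
# Three stride-2 max-pools: 96x72 -> 48x36 -> 24x18 -> 12x9; flatten + fc(256)
# should yield shape (2, 256).
test.terminate()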
def __init__(self, distribution_specs, scope="joint-cumulative-distribution", **kwargs):
    """
    Args:
        distribution_specs (dict): Dict with flat-keys containing the specifications of the
            single sub-distributions.
    """
    super(JointCumulativeDistribution, self).__init__(scope=scope, **kwargs)

    # Create the flattened sub-distributions and add them.
    self.flattened_sub_distributions = {
        flat_key: Distribution.from_spec(spec, scope="sub-distribution-{}".format(i))
        for i, (flat_key, spec) in enumerate(distribution_specs.items())
    }
    self.flattener = ReShape(flatten=True)

    self.add_components(self.flattener, *list(self.flattened_sub_distributions.values()))
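# --- Construction sketch (not part of the original source) ---
# Hedged example of the flat-keyed `distribution_specs` dict. The flat-key format
# ("/a", "/b") and the spec type strings are assumptions based on RLGraph's
# container-flattening scheme and the Categorical/Normal classes used elsewhere
# in this section.
joint = JointCumulativeDistribution(distribution_specs={
    "/a": dict(type="categorical"),  # assumed type string
    "/b": dict(type="normal"),       # assumed type string
})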
def __init__(self, discount=0.99, fifo_queue_spec=None, architecture="large", environment_spec=None,
             feed_previous_action_through_nn=True, feed_previous_reward_through_nn=True,
             weight_pg=None, weight_baseline=None, weight_entropy=None, worker_sample_size=100,
             **kwargs):
    """
    Args:
        discount (float): The discount factor gamma.
        architecture (str): Which IMPALA architecture to use. One of "small" or "large". Will be ignored
            if `network_spec` is given explicitly in kwargs. Default: "large".
        fifo_queue_spec (Optional[dict,FIFOQueue]): The spec for the FIFOQueue to use for the IMPALA
            algorithm.
        environment_spec (dict): The spec for constructing an Environment object for an actor-type
            IMPALA agent.
        feed_previous_action_through_nn (bool): Whether to add the previous action as another input
            channel to the ActionComponent's (NN's) input at each step. This is only possible if the
            state space is already a Dict. It will be added under the key "previous_action".
            Default: True.
        feed_previous_reward_through_nn (bool): Whether to add the previous reward as another input
            channel to the ActionComponent's (NN's) input at each step. This is only possible if the
            state space is already a Dict. It will be added under the key "previous_reward".
            Default: True.
        weight_pg (float): See IMPALALossFunction Component.
        weight_baseline (float): See IMPALALossFunction Component.
        weight_entropy (float): See IMPALALossFunction Component.
        worker_sample_size (int): How many steps the actor will perform in the environment each
            sample-run.

    Keyword Args:
        type (str): One of "single", "actor" or "learner". Default: "single".
    """
    type_ = kwargs.pop("type", "single")
    assert type_ in ["single", "actor", "learner"]
    self.type = type_
    self.worker_sample_size = worker_sample_size

    # Network-spec by default is a "large architecture" IMPALA network.
    self.network_spec = kwargs.pop(
        "network_spec",
        dict(type="rlgraph.components.neural_networks.impala.impala_networks.{}IMPALANetwork".format(
            "Large" if architecture == "large" else "Small"
        ))
    )
    if isinstance(self.network_spec, dict) and "type" in self.network_spec and \
            "IMPALANetwork" in self.network_spec["type"]:
        self.network_spec = default_dict(
            self.network_spec,
            dict(worker_sample_size=1 if self.type == "actor" else self.worker_sample_size + 1)
        )

    # Depending on the job-type, remove the pieces from the Agent-spec/graph we won't need.
    self.exploration_spec = kwargs.pop("exploration_spec", None)
    optimizer_spec = kwargs.pop("optimizer_spec", None)
    observe_spec = kwargs.pop("observe_spec", None)

    self.feed_previous_action_through_nn = feed_previous_action_through_nn
    self.feed_previous_reward_through_nn = feed_previous_reward_through_nn

    # Run everything in a single process.
    if self.type == "single":
        environment_spec = environment_spec or self.default_environment_spec
        update_spec = kwargs.pop("update_spec", None)
    # Actors won't need to learn (no optimizer needed in graph).
    elif self.type == "actor":
        optimizer_spec = None
        update_spec = kwargs.pop("update_spec", dict(do_updates=False))
        environment_spec = environment_spec or self.default_environment_spec
    # Learners won't need to explore (act) or observe (insert into Queue).
    else:
        observe_spec = None
        update_spec = kwargs.pop("update_spec", None)
        environment_spec = None

    # Add previous-action/reward preprocessors to env-specific preprocessor spec.
    # TODO: remove this empty hard-coded preprocessor.
    self.preprocessing_spec = kwargs.pop(
        "preprocessing_spec",
        dict(type="dict-preprocessor-stack", preprocessors=dict(
            # Flatten actions.
            previous_action=[
                dict(type="reshape", flatten=True,
                     flatten_categories=kwargs.get("action_space").num_categories)
            ],
            # Bump reward and convert to float32, so that it can be concatenated by the Concat layer.
            previous_reward=[
                dict(type="reshape", new_shape=(1,))
            ]
        ))
    )

    # Limit communication in distributed mode between each actor and the learner (never between actors).
    execution_spec = kwargs.pop("execution_spec", None)
    if execution_spec is not None and execution_spec.get("mode") == "distributed":
        default_dict(execution_spec["session_config"], dict(
            type="monitored-training-session",
            allow_soft_placement=True,
            device_filters=["/job:learner/task:0"] + (
                ["/job:actor/task:{}".format(execution_spec["distributed_spec"]["task_index"])]
                if self.type == "actor" else ["/job:learner/task:0"]
            )
        ))
        # If Actor, make non-chief in either case (even if task idx == 0).
        if self.type == "actor":
            execution_spec["distributed_spec"]["is_chief"] = False
            # Hard-set device to the CPU for actors.
            execution_spec["device_strategy"] = "custom"
            execution_spec["default_device"] = "/job:{}/task:{}/cpu".format(
                self.type, execution_spec["distributed_spec"]["task_index"]
            )

    self.policy_spec = kwargs.pop("policy_spec", dict())
    # TODO: Create some auto-setting based on LSTM inside the NN.
    default_dict(self.policy_spec, dict(
        type="shared-value-function-policy",
        deterministic=False,
        reuse_variable_scope="shared-policy",
        action_space=kwargs.get("action_space")
    ))

    # Now that we fixed the Agent's spec, call the super constructor.
    super(IMPALAAgent, self).__init__(
        discount=discount,
        preprocessing_spec=self.preprocessing_spec,
        network_spec=self.network_spec,
        policy_spec=self.policy_spec,
        exploration_spec=self.exploration_spec,
        optimizer_spec=optimizer_spec,
        observe_spec=observe_spec,
        update_spec=update_spec,
        execution_spec=execution_spec,
        name=kwargs.pop("name", "impala-{}-agent".format(self.type)),
        **kwargs
    )

    # Always use 1st learner as the parameter server for all policy variables.
    if self.execution_spec["mode"] == "distributed" and \
            self.execution_spec["distributed_spec"]["cluster_spec"]:
        self.policy.propagate_sub_component_properties(
            dict(device=dict(variables="/job:learner/task:0/cpu"))
        )

    # Check whether we have an RNN.
    self.has_rnn = self.policy.neural_network.has_rnn()
    # Check whether we are running with a GPU.
    self.has_gpu = self.execution_spec["gpu_spec"]["gpus_enabled"] is True and \
        self.execution_spec["gpu_spec"]["num_gpus"] > 0

    # Some FIFO-queue specs.
    self.fifo_queue_keys = ["terminals", "states"] + \
        (["actions"] if not self.feed_previous_action_through_nn else []) + \
        (["rewards"] if not self.feed_previous_reward_through_nn else []) + \
        ["action_probs"] + \
        (["initial_internal_states"] if self.has_rnn else [])
    # Define FIFO record space.
    # Note that only states and internal_states (RNN) contain num-steps+1 items, all other sub-records
    # only contain num-steps items.
    self.fifo_record_space = Dict(
        {
            "terminals": bool,
            "action_probs": FloatBox(shape=(self.action_space.num_categories,)),
        },
        add_batch_rank=False,
        add_time_rank=self.worker_sample_size
    )
    self.fifo_record_space["states"] = self.state_space.with_time_rank(self.worker_sample_size + 1)
    # Add previous action/reward to the state or give them their own FIFO channels?
    if self.feed_previous_action_through_nn:
        self.fifo_record_space["states"]["previous_action"] = \
            self.action_space.with_time_rank(self.worker_sample_size + 1)
    else:
        self.fifo_record_space["actions"] = self.action_space.with_time_rank(self.worker_sample_size)
    if self.feed_previous_reward_through_nn:
        self.fifo_record_space["states"]["previous_reward"] = FloatBox(
            add_time_rank=self.worker_sample_size + 1)
    else:
        self.fifo_record_space["rewards"] = FloatBox(add_time_rank=self.worker_sample_size)

    if self.has_rnn:
        self.fifo_record_space["initial_internal_states"] = \
            self.internal_states_space.with_time_rank(False)

    # Create our FIFOQueue (actors will enqueue, learner(s) will dequeue).
    self.fifo_queue = FIFOQueue.from_spec(
        fifo_queue_spec or dict(capacity=1),
        reuse_variable_scope="shared-fifo-queue",
        only_insert_single_records=True,
        record_space=self.fifo_record_space,
        device="/job:learner/task:0/cpu"
        if self.execution_spec["mode"] == "distributed" and
        self.execution_spec["distributed_spec"]["cluster_spec"] else None
    )

    # Remove `states` key from input_spaces: not needed.
    del self.input_spaces["states"]

    # Add all our sub-components to the core.
    if self.type == "single":
        pass

    elif self.type == "actor":
        # No learning, no loss function.
        self.loss_function = None
        # A Dict Splitter to split things from the EnvStepper.
        self.env_output_splitter = ContainerSplitter(tuple_length=4, scope="env-output-splitter")
        self.states_dict_splitter = None
        # Slice some data from the EnvStepper (e.g. only first internal states are needed).
        self.internal_states_slicer = Slice(scope="internal-states-slicer", squeeze=True)
        # Merge back to insert into FIFO.
        self.fifo_input_merger = DictMerger(*self.fifo_queue_keys)

        # Dummy Flattener to calculate action-probs space.
        dummy_flattener = ReShape(flatten=True, flatten_categories=self.action_space.num_categories)
        self.environment_stepper = EnvironmentStepper(
            environment_spec=environment_spec,
            actor_component_spec=ActorComponent(self.preprocessor, self.policy, self.exploration),
            state_space=self.state_space.with_batch_rank(),
            reward_space=float,  # TODO <- float64 for deepmind? may not work for other envs
            internal_states_space=self.internal_states_space,
            num_steps=self.worker_sample_size,
            add_previous_action_to_state=True,
            add_previous_reward_to_state=True,
            add_action_probs=True,
            action_probs_space=dummy_flattener.get_preprocessed_space(self.action_space)
        )
        sub_components = [
            self.environment_stepper, self.env_output_splitter, self.internal_states_slicer,
            self.fifo_input_merger, self.fifo_queue
        ]
    # Learner.
    else:
        self.environment_stepper = None

        # A Dict splitter to split up items from the queue.
        self.fifo_input_merger = None
        self.fifo_output_splitter = ContainerSplitter(*self.fifo_queue_keys, scope="fifo-output-splitter")
        self.states_dict_splitter = ContainerSplitter(
            *list(self.fifo_record_space["states"].keys()), scope="states-dict-splitter")
        self.internal_states_slicer = None

        self.transposer = Transpose(scope="transposer", device=dict(ops="/job:learner/task:0/cpu"))
        self.staging_area = StagingArea(num_data=len(self.fifo_queue_keys))

        # Create an IMPALALossFunction with some parameters.
        self.loss_function = IMPALALossFunction(
            discount=self.discount,
            weight_pg=weight_pg, weight_baseline=weight_baseline, weight_entropy=weight_entropy,
            slice_actions=self.feed_previous_action_through_nn,
            slice_rewards=self.feed_previous_reward_through_nn,
            device="/job:learner/task:0/gpu"
        )

        self.policy.propagate_sub_component_properties(
            dict(device=dict(variables="/job:learner/task:0/cpu", ops="/job:learner/task:0/gpu"))
        )
        for component in [self.staging_area, self.preprocessor, self.optimizer]:
            component.propagate_sub_component_properties(dict(device="/job:learner/task:0/gpu"))

        sub_components = [
            self.fifo_output_splitter, self.fifo_queue, self.states_dict_splitter,
            self.transposer, self.staging_area, self.preprocessor, self.policy,
            self.loss_function, self.optimizer
        ]

    if self.type != "single":
        # Add all the agent's sub-components to the root.
        self.root_component.add_components(*sub_components)

        # Define the Agent's (root Component's) API.
        self.define_graph_api(*sub_components)

    if self.type != "single" and self.auto_build:
        if self.type == "learner":
            build_options = dict(
                build_device_context="/job:learner/task:0/cpu",
                pin_global_variable_device="/job:learner/task:0/cpu"
            )
            self._build_graph([self.root_component], self.input_spaces, optimizer=self.optimizer,
                              build_options=build_options)
        else:
            self._build_graph([self.root_component], self.input_spaces, optimizer=self.optimizer,
                              build_options=None)

        self.graph_built = True

        if self.has_gpu:
            # Get 1st return op of API-method `stage` of sub-component `staging-area`
            # (which is the stage-op).
            self.stage_op = self.root_component.sub_components["staging-area"].api_methods["stage"]. \
                out_op_columns[0].op_records[0].op
            # Initialize the stage.
            self.graph_executor.monitored_session.run_step_fn(
                lambda step_context: step_context.session.run(self.stage_op)
            )

            # TODO remove after full refactor.
            self.dequeue_op = self.root_component.sub_components["fifo-queue"].api_methods["get_records"]. \
                out_op_columns[0].op_records[0].op
        if self.type == "actor":
            self.enqueue_op = self.root_component.sub_components["fifo-queue"].api_methods["insert_records"]. \
                out_op_columns[0].op_records[0].op
class IMPALANetwork(NeuralNetwork):
    """
    The base class for both "large and small architecture" versions of the networks used in [1].

    [1] IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner Architectures -
        Espeholt, Soyer, Munos et al. - 2018 (https://arxiv.org/abs/1802.01561)
    """
    def __init__(self, worker_sample_size=100, scope="impala-network", **kwargs):
        """
        Args:
            worker_sample_size (int): How many time-steps an IMPALA actor will have performed in one
                rollout.
        """
        super(IMPALANetwork, self).__init__(scope=scope, **kwargs)

        self.worker_sample_size = worker_sample_size

        # Create all needed sub-components.

        # ContainerSplitter for the Env signal (dict of 4 keys: for env image, env text, previous
        # action and reward).
        self.splitter = ContainerSplitter(
            "RGB_INTERLEAVED", "INSTR", "previous_action", "previous_reward",
            scope="input-splitter"
        )

        # Fold the time rank into the batch rank.
        self.time_rank_fold_before_lstm = ReShape(fold_time_rank=True,
                                                  scope="time-rank-fold-before-lstm")
        self.time_rank_unfold_before_lstm = ReShape(unfold_time_rank=True, time_major=True,
                                                    scope="time-rank-unfold-before-lstm")

        # The Image Processing Stack (left side of "Large Architecture" Figure 3 in [1]).
        # Conv2D column + ReLU + fc(256) + ReLU.
        self.image_processing_stack = self.build_image_processing_stack()

        # The text processing pipeline: Takes a batch of string tensors as input, creates a hash-bucket
        # thereof, and passes the output of the hash bucket through an embedding-lookup(20) layer. The
        # output of the embedding lookup is then passed through an LSTM(64).
        self.text_processing_stack = self.build_text_processing_stack()

        #self.debug_slicer = Slice(scope="internal-states-slicer", squeeze=True)

        # The concatenation layer (concatenates outputs from image/text processing stacks,
        # previous action/reward).
        self.concat_layer = ConcatLayer()

        # The main LSTM (going into the ActionAdapter (next in the Policy Component that uses this
        # NN Component)). Use time-major as it's faster (say tf docs).
        self.main_lstm = LSTMLayer(units=256, scope="lstm-256", time_major=True,
                                   static_loop=self.worker_sample_size)

        # Add all sub-components to this one.
        self.add_components(
            self.splitter, self.image_processing_stack, self.text_processing_stack,
            self.concat_layer,
            self.main_lstm,
            self.time_rank_fold_before_lstm, self.time_rank_unfold_before_lstm,
            #self.debug_slicer
        )

    @staticmethod
    def build_image_processing_stack():
        """
        Builds the image processing pipeline for IMPALA and returns it.
        """
        raise NotImplementedError

    @staticmethod
    def build_text_processing_stack():
        """
        Helper function to build the text processing pipeline for both the large and small
        architectures, consisting of:
        - ReShape preprocessor to fold the incoming time rank into the batch rank.
        - StringToHashBucket Layer taking a batch of sentences and converting them to an indices-table
          of dimensions:
          cols=length of longest sentences in input
          rows=number of items in the batch
          The cols dimension could be interpreted as the time rank into a consecutive LSTM. The
          StringToHashBucket Component returns the sequence length of each batch item for exactly
          that purpose.
        - Embedding Lookup Layer of embedding size 20 and number of rows == num_hash_buckets
          (see previous layer).
        - LSTM processing the batched sequences of words coming from the embedding layer as batches
          of rows.
        """
        num_hash_buckets = 1000

        # Create a hash bucket from the sentences and use that bucket to do an embedding lookup
        # (instead of a vocabulary).
        string_to_hash_bucket = StringToHashBucket(num_hash_buckets=num_hash_buckets)
        embedding = EmbeddingLookup(embed_dim=20, vocab_size=num_hash_buckets, pad_empty=True)
        # The time rank for the LSTM is now the sequence of words in a sentence, NOT the original env
        # time rank. We will only use the last output of the LSTM-64 for further processing as that is
        # the output after having seen all words in the sentence.
        # The original env stepping time rank is currently folded into the batch rank and must be
        # unfolded again before passing it into the main LSTM.
        lstm64 = LSTMLayer(units=64, scope="lstm-64", time_major=False)

        tuple_splitter = ContainerSplitter(tuple_length=2, scope="tuple-splitter")

        def custom_apply(self, inputs):
            hash_bucket, lengths = self.sub_components["string-to-hash-bucket"].apply(inputs)

            embedding_output = self.sub_components["embedding-lookup"].apply(hash_bucket)

            # Return only the last output (sentence of words, where we are not interested in
            # intermediate results where the LSTM has not seen the entire sentence yet).
            # Last output is the final internal h-state (slot 1 in the returned LSTM tuple;
            # slot 0 is final c-state).
            lstm_output = self.sub_components["lstm-64"].apply(embedding_output,
                                                               sequence_length=lengths)
            lstm_final_internals = lstm_output["last_internal_states"]

            # Need to split once more because the LSTM state is always a tuple of final c- and h-states.
            _, lstm_final_h_state = self.sub_components["tuple-splitter"].split(lstm_final_internals)

            return lstm_final_h_state

        text_processing_stack = Stack(
            string_to_hash_bucket, embedding, lstm64, tuple_splitter,
            api_methods={("apply", custom_apply)}, scope="text-stack"
        )

        return text_processing_stack

    @rlgraph_api
    def apply(self, input_dict, internal_states=None):
        # Split the input dict coming directly from the Env.
        _, _, _, orig_previous_reward = self.splitter.split(input_dict)

        folded_input = self.time_rank_fold_before_lstm.apply(input_dict)
        image, text, previous_action, previous_reward = self.splitter.split(folded_input)

        # Get the left-stack (image) and right-stack (text) output (see [1] for details).
        text_processing_output = self.text_processing_stack.apply(text)
        image_processing_output = self.image_processing_stack.apply(image)

        # Concat everything together.
        concatenated_data = self.concat_layer.apply(
            image_processing_output, text_processing_output, previous_action, previous_reward
        )

        unfolded_concatenated_data = self.time_rank_unfold_before_lstm.apply(
            concatenated_data, orig_previous_reward
        )

        # Feed concat'd input into main LSTM(256).
        lstm_output = self.main_lstm.apply(unfolded_concatenated_data, internal_states)

        return lstm_output
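# --- Usage sketch (not part of the original source) ---
# Hedged test of the text stack in isolation. TextBox usage follows the keras-style
# test below; import paths and TextBox's `add_batch_rank` kwarg are assumptions.
from rlgraph.spaces import TextBox
from rlgraph.tests import ComponentTest  # assumed path

text_stack = IMPALANetwork.build_text_processing_stack()
input_space = TextBox(add_batch_rank=True)  # batch of (time-folded) sentences

test = ComponentTest(component=text_stack, input_spaces=dict(inputs=input_space))
out = test.test(("apply", input_space.sample(3)), expected_outputs=None)
# `out` is the final h-state of the LSTM-64: expected shape (3, 64).
test.terminate()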
def __init__(self, network_spec, action_space=None, action_adapter_spec=None,
             max_likelihood=True, scope="policy", **kwargs):
    """
    Args:
        network_spec (Union[NeuralNetwork,dict]): The NeuralNetwork Component or a specification dict
            to build one.
        action_space (Space): The action Space within which this Component will create actions.
        action_adapter_spec (Optional[dict]): A spec-dict to create an ActionAdapter. Use None for the
            default ActionAdapter object.
        max_likelihood (bool): Whether to pick actions according to the max-likelihood value or via
            sampling. Default: True.
    """
    super(Policy, self).__init__(scope=scope, **kwargs)

    self.neural_network = NeuralNetwork.from_spec(network_spec)
    if action_space is None:
        self.action_adapter = ActionAdapter.from_spec(action_adapter_spec)
        action_space = self.action_adapter.action_space
    else:
        self.action_adapter = ActionAdapter.from_spec(action_adapter_spec, action_space=action_space)
    self.action_space = action_space
    self.max_likelihood = max_likelihood

    # TODO: Hacky trick to implement IMPALA post-LSTM256 time-rank folding and unfolding.
    # TODO: Replace entirely via sonnet-like BatchApply Component.
    is_impala = "IMPALANetwork" in type(self.neural_network).__name__

    # Add API-method to get baseline output (if we use an extra value function baseline node).
    if isinstance(self.action_adapter, BaselineActionAdapter):
        # TODO: IMPALA attempt to speed up final pass after LSTM.
        if is_impala:
            self.time_rank_folder = ReShape(fold_time_rank=True, scope="time-rank-fold")
            self.time_rank_unfolder_v = ReShape(unfold_time_rank=True, time_major=True,
                                                scope="time-rank-unfold-v")
            self.time_rank_unfolder_a_probs = ReShape(unfold_time_rank=True, time_major=True,
                                                      scope="time-rank-unfold-a-probs")
            self.time_rank_unfolder_logits = ReShape(unfold_time_rank=True, time_major=True,
                                                     scope="time-rank-unfold-logits")
            self.time_rank_unfolder_log_probs = ReShape(unfold_time_rank=True, time_major=True,
                                                        scope="time-rank-unfold-log-probs")

        @rlgraph_api(component=self)
        def get_state_values_logits_probabilities_log_probs(self, nn_input, internal_states=None):
            nn_output = self.neural_network.apply(nn_input, internal_states)
            last_internal_states = nn_output.get("last_internal_states")
            nn_output = nn_output["output"]

            # TODO: IMPALA attempt to speed up final pass after LSTM.
            if is_impala:
                nn_output = self.time_rank_folder.apply(nn_output)

            out = self.action_adapter.get_logits_probabilities_log_probs(nn_output)

            # TODO: IMPALA attempt to speed up final pass after LSTM.
            if is_impala:
                state_values = self.time_rank_unfolder_v.apply(out["state_values"], nn_output)
                logits = self.time_rank_unfolder_logits.apply(out["logits"], nn_output)
                probs = self.time_rank_unfolder_a_probs.apply(out["probabilities"], nn_output)
                log_probs = self.time_rank_unfolder_log_probs.apply(out["log_probs"], nn_output)
            else:
                state_values = out["state_values"]
                logits = out["logits"]
                probs = out["probabilities"]
                log_probs = out["log_probs"]

            return dict(state_values=state_values, logits=logits, probabilities=probs,
                        log_probs=log_probs, last_internal_states=last_internal_states)

    # Figure out our Distribution.
    if isinstance(action_space, IntBox):
        self.distribution = Categorical()
    # Continuous action space -> Normal distribution (each action needs mean and variance from network).
    elif isinstance(action_space, FloatBox):
        self.distribution = Normal()
    else:
        raise RLGraphError("ERROR: `action_space` is of type {} and not allowed in {} Component!".format(
            type(action_space).__name__, self.name
        ))

    self.add_components(self.neural_network, self.action_adapter, self.distribution)
    if is_impala:
        # Add the time-rank fold/unfold components (created in the BaselineActionAdapter branch
        # above) exactly once.
        self.add_components(self.time_rank_folder, self.time_rank_unfolder_v,
                            self.time_rank_unfolder_a_probs, self.time_rank_unfolder_log_probs,
                            self.time_rank_unfolder_logits)
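# --- Construction sketch (not part of the original source) ---
# The distribution is picked from the action space (see above): IntBox -> Categorical,
# FloatBox -> Normal. The list-of-layer-specs `network_spec` format and import paths
# are assumptions.
from rlgraph.spaces import IntBox

policy = Policy(
    network_spec=[dict(type="dense", units=32)],  # assumed spec format
    action_space=IntBox(4, add_batch_rank=True),
)
assert isinstance(policy.distribution, Categorical)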
def test_keras_style_complex_multi_stream_nn(self):
    # 3 inputs.
    input_spaces = [
        Dict({
            "img": FloatBox(shape=(6, 6, 3)),
            "int": IntBox(3)
        }, add_batch_rank=True, add_time_rank=True),
        FloatBox(shape=(2,), add_batch_rank=True),
        Tuple(IntBox(2), TextBox(), add_batch_rank=True, add_time_rank=True)
    ]

    # Same NN as in test above, only using some of the sub-Spaces from the input spaces.
    # Tests whether this NN can add automatically the correct splitters.
    folded_text = ReShape(fold_time_rank=True)(input_spaces[2][1])
    # String layer will create batched AND time-ranked (individual words) hash outputs (int64).
    string_bucket_out, lengths = StringToHashBucket(num_hash_buckets=5)(folded_text)
    # Batched and time-ranked embedding output (floats) with embed dim=n.
    embedding_out = EmbeddingLookup(embed_dim=10, vocab_size=5)(string_bucket_out)
    # Pass embeddings through a text LSTM and use last output (reduce time-rank).
    string_lstm_out, _ = LSTMLayer(units=2, return_sequences=False, scope="lstm-layer-txt")(
        embedding_out, sequence_length=lengths
    )
    # Unfold to get original time-rank back.
    string_lstm_out_unfolded = ReShape(unfold_time_rank=True)(string_lstm_out, input_spaces[2][1])

    # Parallel image stream via 1 CNN layer plus dense.
    folded_img = ReShape(fold_time_rank=True, scope="img-fold")(input_spaces[0]["img"])
    cnn_out = Conv2DLayer(filters=1, kernel_size=2, strides=2)(folded_img)
    unfolded_cnn_out = ReShape(unfold_time_rank=True, scope="img-unfold")(cnn_out, input_spaces[0]["img"])
    unfolded_cnn_out_flattened = ReShape(flatten=True, scope="img-flat")(unfolded_cnn_out)
    dense_out = DenseLayer(units=2, scope="dense-0")(unfolded_cnn_out_flattened)

    # Concat everything.
    concat_out = ConcatLayer()(string_lstm_out_unfolded, dense_out)

    # LSTM output has batch+time.
    main_lstm_out, internal_states = LSTMLayer(units=2, scope="lstm-layer-main")(concat_out)

    dense1_after_lstm_out = DenseLayer(units=3, scope="dense-1")(main_lstm_out)
    dense2_after_lstm_out = DenseLayer(units=2, scope="dense-2")(dense1_after_lstm_out)
    dense3_after_lstm_out = DenseLayer(units=1, scope="dense-3")(dense2_after_lstm_out)

    # A NN with 3 outputs.
    neural_net = NeuralNetwork(inputs=input_spaces,
                               outputs=[dense3_after_lstm_out, main_lstm_out, internal_states])

    test = ComponentTest(component=neural_net, input_spaces=dict(inputs=input_spaces))

    # Batch of size=n.
    sample_shape = (4, 2)
    input_ = [input_spaces[0].sample(sample_shape), input_spaces[1].sample(sample_shape[0]),
              input_spaces[2].sample(sample_shape)]

    out = test.test(("call", tuple(input_)), expected_outputs=None)

    # Main output (Dense out after LSTM).
    self.assertTrue(out[0].shape == sample_shape + (1,))  # 1=1 unit in dense layer
    self.assertTrue(out[0].dtype == np.float32)
    # main-LSTM out.
    self.assertTrue(out[1].shape == sample_shape + (2,))  # 2=2 LSTM units
    self.assertTrue(out[1].dtype == np.float32)
    # main-LSTM internal-states.
    self.assertTrue(out[2][0].shape == sample_shape[:1] + (2,))  # 2=2 LSTM units
    self.assertTrue(out[2][0].dtype == np.float32)
    self.assertTrue(out[2][1].shape == sample_shape[:1] + (2,))  # 2=2 LSTM units
    self.assertTrue(out[2][1].dtype == np.float32)

    test.terminate()
def __init__(
    self, action_space, world_option_model_network, encoder_network, num_features, num_mixtures,
    beta=0.2, post_phi_concat_network=None, reward_clipping=1.0, intrinsic_rewards_weight=0.1,
    concat_with_command_vector=False, optimizer=None, deterministic=False,
    scope="intrinsic-curiosity-world-option-model", **kwargs
):
    """
    Args:
        action_space (Space): The action Space to be fed into the model together with the latent
            feature vector for the states. Will be flattened automatically and then concatenated
            by this component.
        world_option_model_network (Union[NeuralNetwork,dict]): A specification dict (or NN object
            directly) to construct the world-option-model's neural network.
        encoder_network (Union[NeuralNetwork,dict]): A specification dict (or NN object directly) to
            construct the inverse dynamics model's encoder network leading from s to phi
            (feature vector).
        num_features (int): The size of the feature vectors phi.
        num_mixtures (int): The number of mixture Normals to use for the next-state distribution output.
        beta (float): The weight for the phi' loss (action loss is then 1.0 - beta).
        post_phi_concat_network (Optional[Union[NeuralNetwork,dict]]): A specification dict (or NN
            object directly) for the network mapping the concatenated phi and phi' to the action
            prediction.
        reward_clipping (float): 0.0 for no clipping, some other value for +/- reward value clipping.
            Default: 1.0.
        concat_with_command_vector (bool): If True, this model needs an additional command vector
            (coming from the policy above) to concat it together with the latent state vector.
        optimizer (Optional[Optimizer]): The optimizer to use for supervised learning of the two
            networks (ICM and WOM).
    """
    self.num_features = num_features
    self.num_mixtures = num_mixtures
    self.deterministic = deterministic
    self.beta = beta
    assert 0.0 < self.beta < 1.0, "ERROR: `beta` must be between 0 and 1!"
    self.reward_clipping = reward_clipping
    self.intrinsic_rewards_weight = intrinsic_rewards_weight

    # Create the encoder network inside a SupervisedPredictor (so we get the adapter + distribution
    # with it).
    self.state_encoder = SupervisedPredictor(
        network_spec=encoder_network,
        output_space=FloatBox(shape=(num_features,), add_batch_rank=True),
        scope="state-encoder"
    )

    # Create the container loss function for the two prediction tasks:
    # a) Action prediction and b) next-state prediction, each of them using a simple neg log
    # likelihood loss comparing the actual action and s' with their log-likelihood value vs the
    # respective distributions.
    self.loss_functions = dict(
        # Action prediction loss (neg log likelihood of observed action vs the parameterized
        # distribution).
        predicted_actions=NegativeLogLikelihoodLoss(
            distribution_spec=get_default_distribution_from_space(action_space),
            scope="action-loss"
        ),
        # s' prediction loss (neg log likelihood of observed s' vs the parameterized mixed normal
        # distribution).
        predicted_phi_=NegativeLogLikelihoodLoss(
            distribution_spec=dict(type="mixture",
                                   _args=["multi-variate-normal" for _ in range(num_mixtures)]),
            scope="phi-loss"
        )
    )

    # TODO: Support for command vector concatenation.
    #self.concat_with_command_vector = concat_with_command_vector

    # Define the Model's network's custom call method.
    def custom_call(self, inputs):
        phi = inputs["phi"]
        actions = inputs["actions"]
        phi_ = inputs["phi_"]
        actions_flat = self.get_sub_component_by_name("action-flattener").call(actions)
        concat_phis = self.get_sub_component_by_name("concat-phis").call(phi, phi_)
        # Predict the action that led from s to s'.
        predicted_actions = self.get_sub_component_by_name("post-phi-concat-nn").call(concat_phis)

        # Concat phi with flattened actions.
        phi_and_actions = self.get_sub_component_by_name("concat-states-and-actions").call(
            phi, actions_flat
        )
        # Add stop-gradient to phi here before predicting phi'
        # (the phis should only be trained by the inverse dynamics model, not by the world option
        # model). NOT DONE IN ORIGINAL PAPER's CODE AND ALSO NOT IN MLAGENTS EQUIVALENT.
        # phi_and_actions = self.get_sub_component_by_name("stop-gradient").stop(phi_and_actions)
        # Predict phi' (through a mixture gaussian distribution).
        predicted_phi_ = self.get_sub_component_by_name("wom-nn").call(phi_and_actions)

        return dict(
            # Predictions (actions and next-state-features (mixture distribution)).
            predicted_actions=predicted_actions,
            predicted_phi_=predicted_phi_
            ## Also return the two feature vectors for s and s'.
            #phi=phi, phi_=phi_
        )

    # Create the SupervisedPredictor's neural network.
    predictor_network = NeuralNetwork(
        # The world option model network taking action-cat-phi and mapping them to the predicted phi'.
        NeuralNetwork.from_spec(world_option_model_network, scope="wom-nn"),
        # The concat component concatenating both latent state vectors (phi and phi').
        ConcatLayer(scope="concat-phis"),
        # The NN mapping from phi-cat-phi' to the action prediction.
        NeuralNetwork.from_spec(post_phi_concat_network, scope="post-phi-concat-nn"),
        # The ReShape component for flattening all actions in arbitrary action spaces.
        ReShape(flatten=True, flatten_categories=True, flatten_containers=True,
                scope="action-flattener"),
        # The concat component concatenating latent state feature vector and incoming (flattened)
        # actions.
        ConcatLayer(scope="concat-states-and-actions"),
        # Set the `call` method.
        api_methods={("call", custom_call)}
    )

    if optimizer is None:
        optimizer = dict(type="adam", learning_rate=3e-4)

    super(IntrinsicCuriosityWorldOptionModel, self).__init__(
        predictor=dict(
            network_spec=predictor_network,
            output_space=Dict({
                "predicted_actions": action_space,
                "predicted_phi_": FloatBox(shape=(self.num_features,))
            }, add_batch_rank=action_space.has_batch_rank, add_time_rank=action_space.has_time_rank),
            distribution_adapter_spec=dict(
                # for `predicted_actions`: use default adapter
                # for predicted_phi': use normal-mixture adapter & distribution.
                predicted_phi_={"type": "normal-mixture-adapter", "num_mixtures": num_mixtures}
            ),
            deterministic=deterministic
        ),
        loss_function=self.loss_functions["predicted_actions"],
        optimizer=optimizer,
        scope=scope, **kwargs
    )

    self.add_components(self.state_encoder, self.loss_functions["predicted_phi_"])
class ActionAdapter(Component):
    """
    A Component that cleans up a neural network's flat output and gets it ready for parameterizing a
    Distribution Component.
    Processing steps include:
    - Sending the raw, flattened NN output through a Dense layer whose number of units matches the
      flattened action space.
    - Reshaping (according to the action Space).
    - Translating the reshaped outputs (logits) into probabilities (by softmaxing) and
      log-probabilities (log).
    """
    def __init__(self, action_space, add_units=0, units=None, weights_spec=None, biases_spec=None,
                 activation=None, scope="action-adapter", **kwargs):
        """
        Args:
            action_space (Space): The action Space within which this Component will create actions.
            add_units (Optional[int]): An optional number of units to add to the auto-calculated number
                of action-layer nodes. Can be negative to subtract units from the auto-calculated value.
                NOTE: Only one of either `add_units` or `units` must be provided.
            units (Optional[int]): An optional number of units to use for the action-layer. If None,
                will calculate the number of units automatically from the given action_space.
                NOTE: Only one of either `add_units` or `units` must be provided.
            weights_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to
                initialize the weights of `self.action_layer`. Default: None (use default initializer).
            biases_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to
                initialize the biases of `self.action_layer`. Default: None (use default initializer,
                which is usually 0.0).
            activation (Optional[str]): The activation function to use for `self.action_layer`.
                Default: None (=linear).
        """
        super(ActionAdapter, self).__init__(scope=scope, **kwargs)

        self.action_space = action_space.with_batch_rank()
        self.weights_spec = weights_spec
        self.biases_spec = biases_spec
        self.activation = activation

        # Our (dense) action layer representing the flattened action space.
        self.action_layer = None

        # Calculate the number of nodes in the action layer (DenseLayer object) depending on our
        # action Space or using a given fixed number (`units`).
        # Also generate the ReShape sub-Component and give it the new_shape.
        if isinstance(self.action_space, IntBox):
            if units is None:
                units = add_units + self.action_space.flat_dim_with_categories
            self.reshape = ReShape(new_shape=self.action_space.get_shape(with_category_rank=True),
                                   flatten_categories=False)
        else:
            if units is None:
                units = add_units + 2 * self.action_space.flat_dim  # Those two dimensions are the mean and log sd
            # Manually add moments after batch/time ranks.
            new_shape = tuple([2] + list(self.action_space.shape))
            self.reshape = ReShape(new_shape=new_shape)

        assert units > 0, "ERROR: Number of nodes for action-layer calculated as {}! Must be larger 0.".format(units)

        # Create the action-layer and add it to this component.
        self.action_layer = DenseLayer(
            units=units,
            activation=self.activation,
            weights_spec=self.weights_spec,
            biases_spec=self.biases_spec,
            scope="action-layer"
        )
        self.add_components(self.action_layer, self.reshape)

    def check_input_spaces(self, input_spaces, action_space=None):
        # Check the input Space.
        last_nn_layer_space = input_spaces["nn_output"]  # type: Space
        sanity_check_space(last_nn_layer_space, non_allowed_types=[ContainerSpace])

        # Check the action Space.
        sanity_check_space(self.action_space, must_have_batch_rank=True)
        if isinstance(self.action_space, IntBox):
            sanity_check_space(self.action_space, must_have_categories=True)
        else:
            # Fixme: Are there other constraints on continuous action spaces? E.g. no dueling layers?
            pass

    @rlgraph_api
    def get_action_layer_output(self, nn_output):
        """
        Returns the raw, non-reshaped output of the action-layer (DenseLayer) after passing through it
        the raw nn_output (coming from the previous Component).

        Args:
            nn_output (DataOpRecord): The NN output of the preceding neural network.

        Returns:
            DataOpRecord: The output of the action layer (a DenseLayer) after passing `nn_output`
                through it.
        """
        out = self.action_layer.apply(nn_output)
        return dict(output=out)

    @rlgraph_api
    def get_logits(self, nn_output):
        """
        Args:
            nn_output (DataOpRecord): The NN output of the preceding neural network.

        Returns:
            SingleDataOp: The logits (raw nn_output, BUT reshaped).
        """
        aa_output = self.get_action_layer_output(nn_output)
        logits = self.reshape.apply(aa_output["output"])
        return logits

    @rlgraph_api
    def get_logits_probabilities_log_probs(self, nn_output):
        """
        Args:
            nn_output (DataOpRecord): The NN output of the preceding neural network.

        Returns:
            Tuple[SingleDataOp]:
                - logits (raw nn_output, BUT reshaped)
                - probabilities (softmaxed(logits))
                - log(probabilities)
        """
        logits = self.get_logits(nn_output)
        probabilities, log_probs = self._graph_fn_get_probabilities_log_probs(logits)
        return dict(logits=logits, probabilities=probabilities, log_probs=log_probs)

    # TODO: Use a SoftMax Component instead (uses the same code as the one below).
    @graph_fn
    def _graph_fn_get_probabilities_log_probs(self, logits):
        """
        Creates probabilities/parameters and log-probs from some reshaped output.

        Args:
            logits (SingleDataOp): The output of some layer that is already reshaped according to our
                action Space.

        Returns:
            tuple (2x SingleDataOp):
                parameters (DataOp): The parameters, ready to be passed to a Distribution object's
                    get_distribution API-method (usually some probabilities or loc/scale pairs).
                log_probs (DataOp): Simply the log(parameters).
        """
        if get_backend() == "tf":
            if isinstance(self.action_space, IntBox):
                # Discrete actions.
                parameters = tf.maximum(x=tf.nn.softmax(logits=logits, axis=-1), y=SMALL_NUMBER)
                # Log probs.
                log_probs = tf.log(x=parameters)
            elif isinstance(self.action_space, FloatBox):
                # Continuous actions.
                mean, log_sd = tf.split(value=logits, num_or_size_splits=2, axis=1)
                # Remove moments rank.
                mean = tf.squeeze(input=mean, axis=1)
                log_sd = tf.squeeze(input=log_sd, axis=1)
                # Clip log_sd. log(SMALL_NUMBER) is negative.
                log_sd = tf.clip_by_value(t=log_sd, clip_value_min=log(SMALL_NUMBER),
                                          clip_value_max=-log(SMALL_NUMBER))
                # Turn log sd into sd.
                sd = tf.exp(x=log_sd)

                parameters = DataOpTuple(mean, sd)
                log_probs = DataOpTuple(tf.log(x=mean), log_sd)
            else:
                raise NotImplementedError

            return parameters, log_probs

        elif get_backend() == "pytorch":
            if isinstance(self.action_space, IntBox):
                # Discrete actions.
                softmax_logits = torch.softmax(logits, dim=-1)
                parameters = torch.max(softmax_logits, SMALL_NUMBER_TORCH)
                # Log probs.
                log_probs = torch.log(parameters)
            elif isinstance(self.action_space, FloatBox):
                # Continuous actions.
                mean, log_sd = torch.split(logits, split_size_or_sections=2, dim=1)
                # Remove moments rank.
                mean = torch.squeeze(mean, dim=1)
                log_sd = torch.squeeze(log_sd, dim=1)
                # Clip log_sd. log(SMALL_NUMBER) is negative.
                log_sd = torch.clamp(log_sd, min=LOG_SMALL_NUMBER, max=-LOG_SMALL_NUMBER)
                # Turn log sd into sd.
                sd = torch.exp(log_sd)

                parameters = DataOpTuple(mean, sd)
                log_probs = DataOpTuple(torch.log(mean), log_sd)
            else:
                raise NotImplementedError

            return parameters, log_probs
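# --- Usage sketch (not part of the original source) ---
# Hedged test of the adapter on a scalar, 3-category IntBox action space: the action
# layer then has flat_dim_with_categories = 3 units and the logits are reshaped to
# (batch, 3). Import paths and ComponentTest's `action_space` kwarg are assumptions.
from rlgraph.spaces import FloatBox, IntBox
from rlgraph.tests import ComponentTest  # assumed path

action_space = IntBox(3, add_batch_rank=True)
nn_output_space = FloatBox(shape=(16,), add_batch_rank=True)

adapter = ActionAdapter(action_space=action_space)
test = ComponentTest(component=adapter, input_spaces=dict(nn_output=nn_output_space),
                     action_space=action_space)
out = test.test(("get_logits_probabilities_log_probs", nn_output_space.sample(4)),
                expected_outputs=None)
# out["logits"]: shape (4, 3); each row of out["probabilities"] sums to ~1.0.
test.terminate()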
def __init__(self, action_space, add_units=0, units=None, weights_spec=None, biases_spec=None,
             activation=None, pre_network_spec=None, scope="action-adapter", **kwargs):
    """
    Args:
        action_space (Space): The action Space within which this Component will create actions.
        add_units (Optional[int]): An optional number of units to add to the auto-calculated number of
            action-layer nodes. Can be negative to subtract units from the auto-calculated value.
            NOTE: Only one of either `add_units` or `units` must be provided.
        units (Optional[int]): An optional number of units to use for the action-layer. If None, will
            calculate the number of units automatically from the given action_space.
            NOTE: Only one of either `add_units` or `units` must be provided.
        weights_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to
            initialize the weights of `self.action_layer`. Default: None (use default initializer).
        biases_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to
            initialize the biases of `self.action_layer`. Default: None (use default initializer,
            which is usually 0.0).
        activation (Optional[str]): The activation function to use for `self.action_layer`.
            Default: None (=linear).
        pre_network_spec (Optional[dict,NeuralNetwork]): A spec dict for a neural network coming before
            the last action layer. If None, only the action layer itself is applied.
    """
    # Build the action layer for this adapter based on the given action-space.
    self.action_space = action_space.with_batch_rank()
    assert not isinstance(self.action_space, ContainerSpace), \
        "ERROR: ActionAdapter cannot handle ContainerSpaces!"

    # Calculate the number of nodes in the action layer (DenseLayer object) depending on our action
    # Space or using a given fixed number (`units`).
    # Also generate the ReShape sub-Component and give it the new_shape.
    if isinstance(self.action_space, IntBox):
        if units is None:
            units = add_units + self.action_space.flat_dim_with_categories
        new_shape = self.action_space.get_shape(with_category_rank=True)
    else:
        if units is None:
            units = add_units + 2 * self.action_space.flat_dim  # Those two dimensions are the mean and log sd
        # Manually add moments after batch/time ranks.
        new_shape = tuple([2] + list(self.action_space.shape))

    assert units > 0, "ERROR: Number of nodes for action-layer calculated as {}! Must be larger 0.".format(units)

    action_layer = DenseLayer(
        units=units,
        activation=activation,
        weights_spec=weights_spec,
        biases_spec=biases_spec,
        scope="action-layer"
    )

    # Do we have a pre-NN?
    self.network = NeuralNetwork.from_spec(pre_network_spec, scope="action-network")  # type: NeuralNetwork
    self.network.add_layer(action_layer)

    # Add the reshape layer to match the action space's shape.
    self.network.add_layer(ReShape(new_shape=new_shape))

    super(ActionAdapter, self).__init__(self.network, scope=scope, **kwargs)
def __init__(self, discount=0.99, fifo_queue_spec=None, architecture="large", environment_spec=None,
             feed_previous_action_through_nn=True, feed_previous_reward_through_nn=True,
             weight_pg=None, weight_baseline=None, weight_entropy=None, num_workers=1,
             worker_sample_size=100, dynamic_batching=False, visualize=False, **kwargs):
    """
    Args:
        discount (float): The discount factor gamma.
        fifo_queue_spec (Optional[dict,FIFOQueue]): The spec for the FIFOQueue to use for the IMPALA
            algorithm.
        architecture (str): Which IMPALA architecture to use. One of "small" or "large". Will be
            ignored if `network_spec` is given explicitly in kwargs. Default: "large".
        environment_spec (dict): The spec for constructing an Environment object for an actor-type
            IMPALA agent.
        feed_previous_action_through_nn (bool): Whether to add the previous action as another input
            channel to the ActionComponent's (NN's) input at each step. This is only possible if the
            state space is already a Dict. It will be added under the key "previous_action".
            Default: True.
        feed_previous_reward_through_nn (bool): Whether to add the previous reward as another input
            channel to the ActionComponent's (NN's) input at each step. This is only possible if the
            state space is already a Dict. It will be added under the key "previous_reward".
            Default: True.
        weight_pg (float): See IMPALALossFunction Component.
        weight_baseline (float): See IMPALALossFunction Component.
        weight_entropy (float): See IMPALALossFunction Component.
        num_workers (int): How many actors (workers) should be run in separate threads.
        worker_sample_size (int): How many steps the actor will perform in the environment on each
            sample run.
        dynamic_batching (bool): Whether to use DeepMind's custom dynamic-batching op for wrapping
            the optimizer's step call. The batcher.so file must be compiled for this to work
            (see Docker file). Default: False.
        visualize (Union[int,bool]): Whether (and how many) workers to visualize.
            Default: False (no visualization).
    """
    # Now that we fixed the Agent's spec, call the super constructor.
    super(SingleIMPALAAgent, self).__init__(
        type="single",
        discount=discount,
        architecture=architecture,
        fifo_queue_spec=fifo_queue_spec,
        environment_spec=environment_spec,
        feed_previous_action_through_nn=feed_previous_action_through_nn,
        feed_previous_reward_through_nn=feed_previous_reward_through_nn,
        weight_pg=weight_pg,
        weight_baseline=weight_baseline,
        weight_entropy=weight_entropy,
        worker_sample_size=worker_sample_size,
        name=kwargs.pop("name", "impala-single-agent"),
        **kwargs
    )
    self.dynamic_batching = dynamic_batching
    self.num_workers = num_workers
    self.visualize = visualize

    # If we use dynamic batching, wrap the dynamic batcher around the policy's graph_fn that we
    # actually call below during our build.
    if self.dynamic_batching:
        self.policy = DynamicBatchingPolicy(policy_spec=self.policy, scope="")

    self.env_output_splitter = ContainerSplitter(
        tuple_length=3 if self.has_rnn is False else 4, scope="env-output-splitter"
    )
    self.fifo_output_splitter = ContainerSplitter(*self.fifo_queue_keys, scope="fifo-output-splitter")
    self.states_dict_splitter = ContainerSplitter(
        *list(self.fifo_record_space["states"].keys() if isinstance(self.state_space, Dict) else "dummy"),
        scope="states-dict-splitter"
    )

    self.staging_area = StagingArea(num_data=len(self.fifo_queue_keys))

    # Slice some data from the EnvStepper (e.g. only the first internal states are needed).
    if self.has_rnn:
        internal_states_slicer = Slice(scope="internal-states-slicer", squeeze=True)
    else:
        internal_states_slicer = None

    self.transposer = Transpose(scope="transposer")

    # Create an IMPALALossFunction with some parameters.
    self.loss_function = IMPALALossFunction(
        discount=self.discount, weight_pg=weight_pg, weight_baseline=weight_baseline,
        weight_entropy=weight_entropy, slice_actions=self.feed_previous_action_through_nn,
        slice_rewards=self.feed_previous_reward_through_nn
    )

    # Merge back to insert into FIFO.
    self.fifo_input_merger = DictMerger(*self.fifo_queue_keys)

    # Dummy flattener to calculate the action-probs Space.
    dummy_flattener = ReShape(flatten=True, flatten_categories=self.action_space.num_categories)

    self.environment_steppers = list()
    for i in range(self.num_workers):
        environment_spec_ = copy.deepcopy(environment_spec)
        if self.visualize is True or (isinstance(self.visualize, int) and i + 1 <= self.visualize):
            environment_spec_["visualize"] = True

        # Force worker_sample_size for IMPALA NNs (LSTM) in the env-stepper to be 1.
        policy_spec = copy.deepcopy(self.policy_spec)
        if isinstance(policy_spec, dict) and isinstance(policy_spec["network_spec"], dict) and \
                "type" in policy_spec["network_spec"] and "IMPALANetwork" in policy_spec["network_spec"]["type"]:
            policy_spec["network_spec"]["worker_sample_size"] = 1

        env_stepper = EnvironmentStepper(
            environment_spec=environment_spec_,
            actor_component_spec=ActorComponent(
                preprocessor_spec=self.preprocessing_spec,
                policy_spec=policy_spec,
                exploration_spec=self.exploration_spec
            ),
            state_space=self.state_space.with_batch_rank(),
            action_space=self.action_space.with_batch_rank(),
            reward_space=float,
            internal_states_space=self.internal_states_space,
            num_steps=self.worker_sample_size,
            add_action=not self.feed_previous_action_through_nn,
            add_reward=not self.feed_previous_reward_through_nn,
            add_previous_action_to_state=self.feed_previous_action_through_nn,
            add_previous_reward_to_state=self.feed_previous_reward_through_nn,
            add_action_probs=True,
            action_probs_space=dummy_flattener.get_preprocessed_space(self.action_space),
            scope="env-stepper-{}".format(i)
        )
        if self.dynamic_batching:
            env_stepper.actor_component.policy.parent_component = None
            env_stepper.actor_component.policy = DynamicBatchingPolicy(
                policy_spec=env_stepper.actor_component.policy, scope=""
            )
            env_stepper.actor_component.add_components(env_stepper.actor_component.policy)

        self.environment_steppers.append(env_stepper)

    # Create the QueueRunner (a single runner handling all env-steppers).
    self.queue_runner = QueueRunner(
        self.fifo_queue,
        "step",
        -1,  # -1: Take the entire return value of API-method `step` as the record to insert.
        self.env_output_splitter,
        self.fifo_input_merger,
        internal_states_slicer,
        *self.environment_steppers
    )

    sub_components = [
        self.fifo_output_splitter, self.fifo_queue, self.queue_runner, self.transposer,
        self.staging_area, self.preprocessor, self.states_dict_splitter, self.policy,
        self.loss_function, self.optimizer
    ]

    # Add all the agent's sub-components to the root.
    self.root_component.add_components(*sub_components)

    # Define the Agent's (root Component's) API.
    self.define_graph_api()

    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer, build_options=None
        )
        self.graph_built = True

        if self.has_gpu:
            # Get the 1st return op of API-method `stage` of sub-component `staging-area`
            # (which is the stage op).
            self.stage_op = self.root_component.sub_components["staging-area"].api_methods["stage"]. \
                out_op_columns[0].op_records[0].op
            # Initialize the stage.
            self.graph_executor.monitored_session.run_step_fn(
                lambda step_context: step_context.session.run(self.stage_op)
            )

            # TODO: Remove after full refactor.
            self.dequeue_op = self.root_component.sub_components["fifo-queue"].api_methods["get_records"]. \
                out_op_columns[0].op_records[0].op
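The `visualize` flag in the worker loop above accepts either a bool or an int. A tiny standalone sketch (plain Python, illustrative only; `visualized_workers` is a hypothetical helper, not part of the agent) of which workers end up rendering:

# Illustrative only: which of `num_workers` workers get visualize=True for a given setting.
def visualized_workers(visualize, num_workers):
    return [
        i for i in range(num_workers)
        if visualize is True or (isinstance(visualize, int) and i + 1 <= visualize)
    ]

print(visualized_workers(True, 4))   # [0, 1, 2, 3] -> all workers render
print(visualized_workers(2, 4))      # [0, 1]       -> only the first two render
print(visualized_workers(False, 4))  # []           -> no rendering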
def test_functional_api_multi_stream_nn(self):
    # Input Space of the network.
    input_space = Dict(
        {
            "img": FloatBox(shape=(6, 6, 3)),  # some RGB img
            "txt": TextBox()  # some text
        },
        add_batch_rank=True,
        add_time_rank=True
    )

    img, txt = ContainerSplitter("img", "txt")(input_space)

    # Complex NN assembly via our Keras-style functional API.
    # Fold the text input into a single batch rank.
    folded_text = ReShape(fold_time_rank=True)(txt)

    # The string layer will create batched AND time-ranked (individual words) hash outputs (int64).
    string_bucket_out, lengths = StringToHashBucket(num_hash_buckets=5)(folded_text)

    # Batched and time-ranked embedding output (floats) with embed dim=n.
    embedding_out = EmbeddingLookup(embed_dim=10, vocab_size=5)(string_bucket_out)

    # Pass the embeddings through a text LSTM and use the last output (reduce the time rank).
    string_lstm_out, _ = LSTMLayer(units=2, return_sequences=False, scope="lstm-layer-txt")(
        embedding_out, sequence_length=lengths
    )

    # Unfold to get the original time rank back.
    string_lstm_out_unfolded = ReShape(unfold_time_rank=True)(string_lstm_out, txt)

    # Parallel image stream via one CNN layer plus a dense layer.
    folded_img = ReShape(fold_time_rank=True, scope="img-fold")(img)
    cnn_out = Conv2DLayer(filters=1, kernel_size=2, strides=2)(folded_img)
    unfolded_cnn_out = ReShape(unfold_time_rank=True, scope="img-unfold")(cnn_out, img)
    unfolded_cnn_out_flattened = ReShape(flatten=True, scope="img-flat")(unfolded_cnn_out)
    dense_out = DenseLayer(units=2, scope="dense-0")(unfolded_cnn_out_flattened)

    # Concat everything.
    concat_out = ConcatLayer()(string_lstm_out_unfolded, dense_out)

    # LSTM output has batch+time ranks.
    main_lstm_out, internal_states = LSTMLayer(units=2, scope="lstm-layer-main")(concat_out)

    dense1_after_lstm_out = DenseLayer(units=3, scope="dense-1")(main_lstm_out)
    dense2_after_lstm_out = DenseLayer(units=2, scope="dense-2")(dense1_after_lstm_out)
    dense3_after_lstm_out = DenseLayer(units=1, scope="dense-3")(dense2_after_lstm_out)

    # A NN with 3 outputs.
    neural_net = NeuralNetwork(outputs=[dense3_after_lstm_out, main_lstm_out, internal_states])

    test = ComponentTest(component=neural_net, input_spaces=dict(inputs=input_space))

    # Batch of size=4, with 2 time steps.
    sample_shape = (4, 2)
    input_ = input_space.sample(sample_shape)

    out = test.test(("call", input_), expected_outputs=None)

    # Main output (dense out after the LSTM).
    self.assertTrue(out[0].shape == sample_shape + (1,))  # 1 unit in the last dense layer
    self.assertTrue(out[0].dtype == np.float32)
    # Main-LSTM out.
    self.assertTrue(out[1].shape == sample_shape + (2,))  # 2 LSTM units
    self.assertTrue(out[1].dtype == np.float32)
    # Main-LSTM internal states (c and h).
    self.assertTrue(out[2][0].shape == sample_shape[:1] + (2,))  # 2 LSTM units
    self.assertTrue(out[2][0].dtype == np.float32)
    self.assertTrue(out[2][1].shape == sample_shape[:1] + (2,))  # 2 LSTM units
    self.assertTrue(out[2][1].dtype == np.float32)

    test.terminate()
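The fold/unfold pattern used twice in this test simply merges the time rank into the batch rank so that time-agnostic layers (CNN, dense) can be applied, then restores it. A NumPy analogue of the shape bookkeeping (illustrative only, not RLGraph code):

# Illustrative NumPy analogue of ReShape(fold_time_rank=True) / ReShape(unfold_time_rank=True).
import numpy as np

x = np.zeros((4, 2, 6, 6, 3))             # (batch=4, time=2, 6x6 RGB image)
folded = x.reshape((-1,) + x.shape[2:])   # fold: (8, 6, 6, 3); time-agnostic layers apply here
# ... apply a CNN, dense layer, etc. on `folded` ...
unfolded = folded.reshape(x.shape[:2] + folded.shape[1:])  # unfold: back to (4, 2, 6, 6, 3)
assert unfolded.shape == x.shape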
class Policy(Component):
    """
    A Policy is a wrapper Component that contains a NeuralNetwork, an ActionAdapter and a Distribution
    Component.
    """
    def __init__(self, network_spec, action_space=None, action_adapter_spec=None, max_likelihood=True,
                 scope="policy", **kwargs):
        """
        Args:
            network_spec (Union[NeuralNetwork,dict]): The NeuralNetwork Component or a specification
                dict to build one.
            action_space (Space): The action Space within which this Component will create actions.
            action_adapter_spec (Optional[dict]): A spec dict to create an ActionAdapter. Use None for
                the default ActionAdapter object.
            max_likelihood (bool): Whether to pick actions according to the max-likelihood value or
                via sampling. Default: True.
        """
        super(Policy, self).__init__(scope=scope, **kwargs)

        self.neural_network = NeuralNetwork.from_spec(network_spec)
        if action_space is None:
            self.action_adapter = ActionAdapter.from_spec(action_adapter_spec)
            action_space = self.action_adapter.action_space
        else:
            self.action_adapter = ActionAdapter.from_spec(action_adapter_spec, action_space=action_space)
        self.action_space = action_space
        self.max_likelihood = max_likelihood

        # TODO: Hacky trick to implement IMPALA's post-LSTM256 time-rank folding and unfolding.
        # TODO: Replace entirely via a sonnet-like BatchApply Component.
        is_impala = "IMPALANetwork" in type(self.neural_network).__name__

        # Add an API-method to get the baseline output (if we use an extra value-function baseline node).
        if isinstance(self.action_adapter, BaselineActionAdapter):
            # TODO: IMPALA attempt to speed up the final pass after the LSTM.
            if is_impala:
                self.time_rank_folder = ReShape(fold_time_rank=True, scope="time-rank-fold")
                self.time_rank_unfolder_v = ReShape(
                    unfold_time_rank=True, time_major=True, scope="time-rank-unfold-v"
                )
                self.time_rank_unfolder_a_probs = ReShape(
                    unfold_time_rank=True, time_major=True, scope="time-rank-unfold-a-probs"
                )
                self.time_rank_unfolder_logits = ReShape(
                    unfold_time_rank=True, time_major=True, scope="time-rank-unfold-logits"
                )
                self.time_rank_unfolder_log_probs = ReShape(
                    unfold_time_rank=True, time_major=True, scope="time-rank-unfold-log-probs"
                )
                self.add_components(
                    self.time_rank_folder, self.time_rank_unfolder_v, self.time_rank_unfolder_a_probs,
                    self.time_rank_unfolder_log_probs, self.time_rank_unfolder_logits
                )

            @rlgraph_api(component=self)
            def get_state_values_logits_probabilities_log_probs(self, nn_input, internal_states=None):
                nn_output = self.neural_network.apply(nn_input, internal_states)
                last_internal_states = nn_output.get("last_internal_states")
                nn_output = nn_output["output"]

                # TODO: IMPALA attempt to speed up the final pass after the LSTM.
                if is_impala:
                    nn_output = self.time_rank_folder.apply(nn_output)

                out = self.action_adapter.get_logits_probabilities_log_probs(nn_output)

                # TODO: IMPALA attempt to speed up the final pass after the LSTM.
                if is_impala:
                    state_values = self.time_rank_unfolder_v.apply(out["state_values"], nn_output)
                    logits = self.time_rank_unfolder_logits.apply(out["logits"], nn_output)
                    probs = self.time_rank_unfolder_a_probs.apply(out["probabilities"], nn_output)
                    log_probs = self.time_rank_unfolder_log_probs.apply(out["log_probs"], nn_output)
                else:
                    state_values = out["state_values"]
                    logits = out["logits"]
                    probs = out["probabilities"]
                    log_probs = out["log_probs"]

                return dict(
                    state_values=state_values, logits=logits, probabilities=probs,
                    log_probs=log_probs, last_internal_states=last_internal_states
                )

        # Figure out our Distribution.
        if isinstance(action_space, IntBox):
            self.distribution = Categorical()
        # Continuous action space -> Normal distribution (each action needs a mean and an sd from the network).
        elif isinstance(action_space, FloatBox):
            self.distribution = Normal()
        else:
            raise RLGraphError(
                "ERROR: `action_space` is of type {} and not allowed in {} Component!".format(
                    type(action_space).__name__, self.name
                )
            )

        self.add_components(self.neural_network, self.action_adapter, self.distribution)
        if is_impala:
            self.add_components(
                self.time_rank_folder, self.time_rank_unfolder_v, self.time_rank_unfolder_a_probs,
                self.time_rank_unfolder_log_probs, self.time_rank_unfolder_logits
            )

    # Define our interface.
    @rlgraph_api
    def get_nn_output(self, nn_input, internal_states=None):
        """
        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based
                neural network.

        Returns:
            any: The raw output of the neural network (before it is cleaned up and passed through
                the ActionAdapter).
        """
        out = self.neural_network.apply(nn_input, internal_states)
        return dict(output=out["output"], last_internal_states=out.get("last_internal_states"))

    @rlgraph_api
    def get_action(self, nn_input, internal_states=None, max_likelihood=None):
        """
        Returns an action based on the NN output, the ActionAdapter output and distribution sampling.

        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based
                neural network.
            max_likelihood (Optional[bool]): If not None, use this to determine whether actions should
                be drawn from the distribution in max-likelihood or stochastic fashion.

        Returns:
            any: The drawn action.
        """
        max_likelihood = self.max_likelihood if max_likelihood is None else max_likelihood

        nn_output = self.get_nn_output(nn_input, internal_states)
        out = self.action_adapter.get_logits_probabilities_log_probs(nn_output["output"])

        # Skip the distribution if we have a discrete action space and act greedily (max-likelihood).
        # In that case, there is no need to create a distribution in the graph on each act, only to
        # take the argmax over the logits, which is the same as the argmax over the probabilities
        # (or log-probabilities).
        if max_likelihood is True and isinstance(self.action_space, IntBox):
            action = self._graph_fn_get_max_likelihood_action_wo_distribution(out["logits"])
        else:
            action = self.distribution.draw(out["probabilities"], max_likelihood)

        return dict(action=action, last_internal_states=nn_output["last_internal_states"])

    @rlgraph_api
    def get_max_likelihood_action(self, nn_input, internal_states=None):
        """
        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based
                neural network.

        Returns:
            any: See `get_action`, but with `max_likelihood` forced to True.
        """
        out = self.get_logits_probabilities_log_probs(nn_input, internal_states)
        if isinstance(self.action_space, IntBox):
            action = self._graph_fn_get_max_likelihood_action_wo_distribution(out["logits"])
        else:
            action = self.distribution.sample_deterministic(out["probabilities"])
        return dict(action=action, last_internal_states=out["last_internal_states"])

    @rlgraph_api
    def get_stochastic_action(self, nn_input, internal_states=None):
        """
        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based
                neural network.

        Returns:
            any: See `get_action`, but with `max_likelihood` forced to False.
        """
        out = self.get_logits_probabilities_log_probs(nn_input, internal_states)
        action = self.distribution.sample_stochastic(out["probabilities"])
        return dict(action=action, last_internal_states=out["last_internal_states"])

    @rlgraph_api
    def get_action_layer_output(self, nn_input, internal_states=None):
        """
        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based
                neural network.

        Returns:
            any: The raw output of the action layer of the ActionAdapter (possibly including the last
                internal states of an RNN-based NN).
        """
        nn_output = self.get_nn_output(nn_input, internal_states)
        action_layer_output = self.action_adapter.get_action_layer_output(nn_output["output"])
        # Add the last internal states to the return value.
        return dict(output=action_layer_output["output"],
                    last_internal_states=nn_output["last_internal_states"])

    @rlgraph_api
    def get_logits_probabilities_log_probs(self, nn_input, internal_states=None):
        """
        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based
                neural network.

        Returns:
            Dict:
                logits: The (reshaped) logits from the ActionAdapter.
                probabilities: The probabilities gained from the softmaxed logits.
                log_probs: The log(probabilities) values.
        """
        nn_output = self.get_nn_output(nn_input, internal_states)
        aa_output = self.action_adapter.get_logits_probabilities_log_probs(nn_output["output"])
        return dict(
            logits=aa_output["logits"], probabilities=aa_output["probabilities"],
            log_probs=aa_output["log_probs"], last_internal_states=nn_output["last_internal_states"]
        )

    @rlgraph_api
    def get_entropy(self, nn_input, internal_states=None):
        """
        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based
                neural network.

        Returns:
            any: See the Distribution Component.
        """
        out = self.get_logits_probabilities_log_probs(nn_input, internal_states)
        entropy = self.distribution.entropy(out["probabilities"])
        return dict(entropy=entropy, last_internal_states=out["last_internal_states"])

    @graph_fn
    def _graph_fn_get_max_likelihood_action_wo_distribution(self, logits):
        """
        Use this function only for discrete action spaces to circumvent using a full-blown,
        backend-specific distribution object (e.g. tf.distributions.Multinomial).

        Args:
            logits (SingleDataOp): Logits over which to pick the argmax (greedy action).

        Returns:
            SingleDataOp: The argmax over the last rank of the input logits.
        """
        if get_backend() == "tf":
            return tf.argmax(logits, axis=-1, output_type=tf.int32)
        elif get_backend() == "pytorch":
            return torch.argmax(logits, dim=-1).int()
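To illustrate how these Policy API methods are exercised, here is a hypothetical test-style sketch in the spirit of the ComponentTest snippets used throughout this codebase. The spaces, the layer-spec list and the `action_space` kwarg of ComponentTest are assumptions for illustration, not taken verbatim from the source:

# Hypothetical usage sketch (spaces and specs are made up for illustration).
state_space = FloatBox(shape=(4,), add_batch_rank=True)
action_space = IntBox(3, add_batch_rank=True)

policy = Policy(
    network_spec=[{"type": "dense", "units": 8}],  # assumed layer-spec-list format
    action_space=action_space
)
test = ComponentTest(
    component=policy,
    input_spaces=dict(nn_input=state_space),
    action_space=action_space  # assumed kwarg for action-space-dependent components
)

states = state_space.sample(5)
out = test.test(("get_action", states), expected_outputs=None)
assert out["action"].shape == (5,)  # one discrete action per batch item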