def testIsDiscrete(self, dtype):
  spec = array_spec.ArraySpec((2, 3), dtype=dtype)
  self.assertIs(
      tensor_spec.is_discrete(spec),
      issubclass(np.dtype(dtype).type, np.integer))
def testIsDiscrete(self, dtype):
  spec = tensor_spec.TensorSpec((2, 3), dtype=dtype)
  self.assertIs(tensor_spec.is_discrete(spec), dtype.is_integer)
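# Usage sketch for the `is_discrete` predicate exercised by the two tests
# above. Illustrative only; assumes the standard tf_agents import paths.
import tensorflow as tf
from tf_agents.specs import tensor_spec

int_spec = tensor_spec.TensorSpec((2, 3), dtype=tf.int32)
float_spec = tensor_spec.TensorSpec((2, 3), dtype=tf.float32)
assert tensor_spec.is_discrete(int_spec)        # integer dtype -> discrete
assert not tensor_spec.is_discrete(float_spec)  # float dtype -> continuous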
def _get_clip(spec):
  dims = np.prod(spec.shape.as_list())
  if tensor_spec.is_discrete(spec):
    dims *= spec.maximum - spec.minimum + 1
  return np.sqrt(action_dist_clip_per_dim * dims)
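# Illustrative call to `_get_clip` above. `action_dist_clip_per_dim` is a
# module-level constant in the original code; the value here is assumed
# purely for demonstration.
import numpy as np
import tensorflow as tf
from tf_agents.specs import tensor_spec

action_dist_clip_per_dim = 0.01  # assumed value
spec = tensor_spec.BoundedTensorSpec((), tf.int32, minimum=0, maximum=4)
# Discrete spec: dims = 1 * (4 - 0 + 1) = 5, so clip = sqrt(0.01 * 5).
clip = _get_clip(spec)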
def __init__(self,
             time_step_spec=None,
             action_spec=None,
             reward_network=None,
             observation_and_action_constraint_splitter=None,
             emit_policy_info=(),
             name=None):
  """Builds a GreedyRewardPredictionPolicy given a reward tf_agents.Network.

  This policy takes a tf_agents.Network predicting rewards and generates the
  action corresponding to the largest predicted reward.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of `BoundedTensorSpec` representing the actions.
    reward_network: An instance of a `tf_agents.network.Network`, callable
      via `network(observation, step_type) -> (output, final_state)`.
    observation_and_action_constraint_splitter: A function used for masking
      valid/invalid actions with each state of the environment. The function
      takes in a full observation and returns a tuple consisting of 1) the
      part of the observation intended as input to the network and 2) the
      mask. The mask should be a 0-1 `Tensor` of shape
      `[batch_size, num_actions]`. This function should also work with a
      `TensorSpec` as input, and should output `TensorSpec` objects for the
      observation and mask.
    emit_policy_info: (tuple of strings) what side information we want to
      get as part of the policy info. Allowed values can be found in
      `policy_utilities.PolicyInfo`.
    name: The name of this policy. All variables in this module will fall
      under that name. Defaults to the class name.

  Raises:
    NotImplementedError: If `action_spec` contains more than one
      `BoundedTensorSpec` or the `BoundedTensorSpec` is not valid.
  """
  flat_action_spec = tf.nest.flatten(action_spec)
  if len(flat_action_spec) > 1:
    raise NotImplementedError(
        'action_spec can only contain a single BoundedTensorSpec.')

  action_spec = flat_action_spec[0]
  if (not tensor_spec.is_bounded(action_spec) or
      not tensor_spec.is_discrete(action_spec) or
      action_spec.shape.rank > 1 or
      action_spec.shape.num_elements() != 1):
    raise NotImplementedError(
        'action_spec must be a BoundedTensorSpec of type int32 and '
        'shape (). Found {}.'.format(action_spec))
  self._expected_num_actions = action_spec.maximum - action_spec.minimum + 1
  self._action_offset = action_spec.minimum
  reward_network.create_variables()
  self._reward_network = reward_network

  self._emit_policy_info = emit_policy_info
  predicted_rewards_mean = ()
  if policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN in emit_policy_info:
    predicted_rewards_mean = tensor_spec.TensorSpec(
        [self._expected_num_actions])
  bandit_policy_type = ()
  if policy_utilities.InfoFields.BANDIT_POLICY_TYPE in emit_policy_info:
    bandit_policy_type = (
        policy_utilities.create_bandit_policy_type_tensor_spec(shape=[1]))
  info_spec = policy_utilities.PolicyInfo(
      predicted_rewards_mean=predicted_rewards_mean,
      bandit_policy_type=bandit_policy_type)

  super(GreedyRewardPredictionPolicy, self).__init__(
      time_step_spec,
      action_spec,
      policy_state_spec=reward_network.state_spec,
      clip=False,
      info_spec=info_spec,
      observation_and_action_constraint_splitter=(
          observation_and_action_constraint_splitter),
      name=name)
def __init__(self,
             time_step_spec: types.TimeStep,
             action_spec: types.NestedTensorSpec,
             reward_network: types.Network,
             temperature: types.FloatOrReturningFloat = 1.0,
             observation_and_action_constraint_splitter: Optional[
                 types.Splitter] = None,
             accepts_per_arm_features: bool = False,
             constraints: Tuple[constr.NeuralConstraint, ...] = (),
             emit_policy_info: Tuple[Text, ...] = (),
             name: Optional[Text] = None):
  """Builds a BoltzmannRewardPredictionPolicy given a reward network.

  This policy takes a tf_agents.Network predicting rewards and chooses an
  action with weighted probabilities (i.e., using a softmax over the network
  estimates of value for each action).

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of `BoundedTensorSpec` representing the actions.
    reward_network: An instance of a `tf_agents.network.Network`, callable
      via `network(observation, step_type) -> (output, final_state)`.
    temperature: float or callable that returns a float. The temperature
      used in the Boltzmann exploration.
    observation_and_action_constraint_splitter: A function used for masking
      valid/invalid actions with each state of the environment. The function
      takes in a full observation and returns a tuple consisting of 1) the
      part of the observation intended as input to the network and 2) the
      mask. The mask should be a 0-1 `Tensor` of shape
      `[batch_size, num_actions]`. This function should also work with a
      `TensorSpec` as input, and should output `TensorSpec` objects for the
      observation and mask.
    accepts_per_arm_features: (bool) Whether the policy accepts per-arm
      features.
    constraints: iterable of constraint objects that are instances of
      `tf_agents.bandits.agents.NeuralConstraint`.
    emit_policy_info: (tuple of strings) what side information we want to
      get as part of the policy info. Allowed values can be found in
      `policy_utilities.PolicyInfo`.
    name: The name of this policy. All variables in this module will fall
      under that name. Defaults to the class name.

  Raises:
    NotImplementedError: If `action_spec` contains more than one
      `BoundedTensorSpec` or the `BoundedTensorSpec` is not valid.
  """
  policy_utilities.check_no_mask_with_arm_features(
      accepts_per_arm_features, observation_and_action_constraint_splitter)
  flat_action_spec = tf.nest.flatten(action_spec)
  if len(flat_action_spec) > 1:
    raise NotImplementedError(
        'action_spec can only contain a single BoundedTensorSpec.')

  self._temperature = temperature
  action_spec = flat_action_spec[0]
  if (not tensor_spec.is_bounded(action_spec) or
      not tensor_spec.is_discrete(action_spec) or
      action_spec.shape.rank > 1 or
      action_spec.shape.num_elements() != 1):
    raise NotImplementedError(
        'action_spec must be a BoundedTensorSpec of type int32 and '
        'shape (). Found {}.'.format(action_spec))
  self._expected_num_actions = action_spec.maximum - action_spec.minimum + 1
  self._action_offset = action_spec.minimum
  reward_network.create_variables()
  self._reward_network = reward_network
  self._constraints = constraints

  self._emit_policy_info = emit_policy_info
  predicted_rewards_mean = ()
  if policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN in emit_policy_info:
    predicted_rewards_mean = tensor_spec.TensorSpec(
        [self._expected_num_actions])
  bandit_policy_type = ()
  if policy_utilities.InfoFields.BANDIT_POLICY_TYPE in emit_policy_info:
    bandit_policy_type = (
        policy_utilities.create_bandit_policy_type_tensor_spec(shape=[1]))
  if accepts_per_arm_features:
    # The features for the chosen arm are saved to policy_info.
    chosen_arm_features_info = (
        policy_utilities.create_chosen_arm_features_info_spec(
            time_step_spec.observation))
    info_spec = policy_utilities.PerArmPolicyInfo(
        predicted_rewards_mean=predicted_rewards_mean,
        bandit_policy_type=bandit_policy_type,
        chosen_arm_features=chosen_arm_features_info)
  else:
    info_spec = policy_utilities.PolicyInfo(
        predicted_rewards_mean=predicted_rewards_mean,
        bandit_policy_type=bandit_policy_type)

  self._accepts_per_arm_features = accepts_per_arm_features

  super(BoltzmannRewardPredictionPolicy, self).__init__(
      time_step_spec,
      action_spec,
      policy_state_spec=reward_network.state_spec,
      clip=False,
      info_spec=info_spec,
      emit_log_probability='log_probability' in emit_policy_info,
      observation_and_action_constraint_splitter=(
          observation_and_action_constraint_splitter),
      name=name)
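# Standalone sketch of the Boltzmann (softmax) action selection performed by
# the policy above. The reward estimates and temperature are made-up values;
# only the sampling pattern is the point.
import tensorflow as tf
import tensorflow_probability as tfp

predicted_rewards = tf.constant([[1.0, 2.0, 0.5]])  # [batch_size, num_actions]
temperature = 0.5  # lower -> closer to greedy; higher -> closer to uniform
logits = predicted_rewards / temperature
action = tfp.distributions.Categorical(logits=logits).sample()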
def _is_categorical_spec(spec):
  return (tensor_spec.is_discrete(spec) and tensor_spec.is_bounded(spec) and
          spec.shape == [] and spec.minimum == 0)
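# Usage sketch: a scalar, bounded, zero-based integer spec counts as
# categorical; a non-scalar spec does not. Illustrative specs only.
import tensorflow as tf
from tf_agents.specs import tensor_spec

cat_spec = tensor_spec.BoundedTensorSpec((), tf.int32, minimum=0, maximum=9)
vec_spec = tensor_spec.BoundedTensorSpec((4,), tf.int32, minimum=0, maximum=9)
assert _is_categorical_spec(cat_spec)
assert not _is_categorical_spec(vec_spec)  # fails the scalar-shape check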
def __init__(self,
             time_step_spec: ts.TimeStep,
             action_spec: types.NestedTensorSpec,
             actor_network: network.Network,
             policy_state_spec: types.NestedTensorSpec = (),
             info_spec: types.NestedTensorSpec = (),
             observation_normalizer: Optional[
                 tensor_normalizer.TensorNormalizer] = None,
             clip: bool = True,
             training: bool = False,
             observation_and_action_constraint_splitter: Optional[
                 types.Splitter] = None,
             name: Optional[Text] = None):
  """Builds an Actor Policy given an actor network.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of `BoundedTensorSpec` representing the actions.
    actor_network: An instance of a `tf_agents.networks.network.Network` to
      be used by the policy. The network will be called with
      `call(observation, step_type, policy_state)` and should return
      `(actions_or_distributions, new_state)`.
    policy_state_spec: A nest of `TensorSpec` representing the policy_state.
      If not set, defaults to actor_network.state_spec.
    info_spec: A nest of `TensorSpec` representing the policy info.
    observation_normalizer: An object to use for observation normalization.
    clip: Whether to clip actions to spec before returning them. Default
      True. Most policy-based algorithms (PCL, PPO, REINFORCE) use unclipped
      continuous actions for training.
    training: Whether the network should be called in training mode.
    observation_and_action_constraint_splitter: A function used to process
      observations with action constraints. These constraints can indicate,
      for example, a mask of valid/invalid actions for a given state of the
      environment. The function takes in a full observation and returns a
      tuple consisting of 1) the part of the observation intended as input
      to the network and 2) the constraint. An example
      `observation_and_action_constraint_splitter` could be as simple as:
      ```
      def observation_and_action_constraint_splitter(observation):
        return observation['network_input'], observation['constraint']
      ```
      *Note*: when using `observation_and_action_constraint_splitter`, make
      sure the provided `actor_network` is compatible with the
      network-specific half of the output of the
      `observation_and_action_constraint_splitter`. In particular,
      `observation_and_action_constraint_splitter` will be called on the
      observation before passing to the network. If
      `observation_and_action_constraint_splitter` is None, action
      constraints are not applied.
    name: The name of this policy. All variables in this module will fall
      under that name. Defaults to the class name.

  Raises:
    ValueError: if `actor_network` is not of type `network.Network`.
    NotImplementedError: if `observation_and_action_constraint_splitter` is
      not None but `action_spec` is not discrete.
  """
  if not isinstance(actor_network, network.Network):
    raise ValueError('actor_network must be a network.Network. Found '
                     '{}.'.format(type(actor_network)))
  actor_network.create_variables()
  self._actor_network = actor_network
  self._observation_normalizer = observation_normalizer
  self._training = training

  if observation_and_action_constraint_splitter is not None:
    if len(tf.nest.flatten(action_spec)) > 1 or (
        not tensor_spec.is_discrete(action_spec)):
      raise NotImplementedError(
          'Action constraints for ActorPolicy are currently only supported '
          'for a single spec of discrete actions. Got action_spec '
          '{}'.format(action_spec))

  if not policy_state_spec:
    policy_state_spec = actor_network.state_spec

  super(ActorPolicy, self).__init__(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      policy_state_spec=policy_state_spec,
      info_spec=info_spec,
      clip=clip,
      observation_and_action_constraint_splitter=(
          observation_and_action_constraint_splitter),
      name=name)
def __init__(self,
             input_tensor_spec,
             output_tensor_spec,
             fc_layer_params=(200, 100),
             conv_layer_params=None,
             activation_fn=tf.keras.activations.relu,
             discrete_projection_net=_categorical_projection_net,
             continuous_projection_net=_normal_projection_net,
             name='ActorDistributionNetwork'):
  """Creates an instance of `ActorDistributionNetwork`.

  Args:
    input_tensor_spec: A nest of `tensor_spec.TensorSpec` representing the
      input.
    output_tensor_spec: A nest of `tensor_spec.BoundedTensorSpec`
      representing the output.
    fc_layer_params: Optional list of fully connected parameters, where each
      item is the number of units in the layer.
    conv_layer_params: Optional list of convolution layer parameters, where
      each item is a length-three tuple indicating
      (filters, kernel_size, stride).
    activation_fn: Activation function, e.g. tf.nn.relu, slim.leaky_relu, ...
    discrete_projection_net: Callable that generates a discrete projection
      network to be called with some hidden state and the outer_rank of the
      state.
    continuous_projection_net: Callable that generates a continuous
      projection network to be called with some hidden state and the
      outer_rank of the state.
    name: A string representing the name of the network.

  Raises:
    ValueError: If `input_tensor_spec` contains more than one observation.
  """
  if len(tf.nest.flatten(input_tensor_spec)) > 1:
    raise ValueError(
        'Only a single observation is supported by this network')

  mlp_layers = utils.mlp_layers(
      conv_layer_params,
      fc_layer_params,
      activation_fn=activation_fn,
      kernel_initializer=tf.compat.v1.keras.initializers.glorot_uniform(),
      name='input_mlp')

  projection_networks = []
  for single_output_spec in tf.nest.flatten(output_tensor_spec):
    if tensor_spec.is_discrete(single_output_spec):
      projection_networks.append(discrete_projection_net(single_output_spec))
    else:
      projection_networks.append(
          continuous_projection_net(single_output_spec))

  projection_distribution_specs = [
      proj_net.output_spec for proj_net in projection_networks
  ]
  output_spec = tf.nest.pack_sequence_as(output_tensor_spec,
                                         projection_distribution_specs)

  super(ActorDistributionNetwork, self).__init__(
      input_tensor_spec=input_tensor_spec,
      state_spec=(),
      output_spec=output_spec,
      name=name)

  self._mlp_layers = mlp_layers
  self._projection_networks = projection_networks
  self._output_tensor_spec = output_tensor_spec
def __init__(self,
             action_spec,
             feature_spec,
             hidden_size=256,
             reward_adapt_speed=8.0,
             encoding_net: Network = None,
             forward_net: Network = None,
             inverse_net: Network = None,
             name="ICMAlgorithm"):
  """Create an ICMAlgorithm.

  Args:
    hidden_size (int|tuple): size of hidden layer(s)
    reward_adapt_speed (float): how fast to adapt the reward normalizer.
      Roughly speaking, the statistics for the normalization are calculated
      mostly based on the most recent T/speed samples, where T is the total
      number of samples.
    encoding_net (Network): network for encoding the observation into a
      latent feature specified by feature_spec. Its input is the same as the
      input of this algorithm.
    forward_net (Network): network for predicting the next feature based on
      the previous feature and action. It should accept input with spec
      [feature_spec, encoded_action_spec] and output a tensor of shape
      feature_spec. For a discrete action, encoded_action is a one-hot
      representation of the action. For a continuous action, the encoded
      action is the same as the original action.
    inverse_net (Network): network for predicting the previous action given
      the previous feature and the current feature. It should accept input
      with spec [feature_spec, feature_spec] and output a tensor of shape
      (num_actions,).
  """
  super(ICMAlgorithm, self).__init__(
      train_state_spec=feature_spec, name=name)

  flat_action_spec = tf.nest.flatten(action_spec)
  assert len(flat_action_spec) == 1, "ICM doesn't support nested action_spec"

  flat_feature_spec = tf.nest.flatten(feature_spec)
  assert len(flat_feature_spec) == 1, (
      "ICM doesn't support nested feature_spec")

  action_spec = flat_action_spec[0]

  if tensor_spec.is_discrete(action_spec):
    self._num_actions = action_spec.maximum - action_spec.minimum + 1
  else:
    self._num_actions = action_spec.shape[-1]

  self._action_spec = action_spec
  feature_dim = flat_feature_spec[0].shape[-1]

  self._encoding_net = encoding_net

  if isinstance(hidden_size, int):
    hidden_size = (hidden_size,)

  if forward_net is None:
    encoded_action_spec = tensor_spec.TensorSpec(
        (self._num_actions,), dtype=tf.float32)
    forward_net = EncodingNetwork(
        name="forward_net",
        input_tensor_spec=[feature_spec, encoded_action_spec],
        fc_layer_params=hidden_size,
        last_layer_size=feature_dim)
  self._forward_net = forward_net

  if inverse_net is None:
    inverse_net = EncodingNetwork(
        name="inverse_net",
        input_tensor_spec=[feature_spec, feature_spec],
        fc_layer_params=hidden_size,
        last_layer_size=self._num_actions,
        last_kernel_initializer=tf.initializers.Zeros())
  self._inverse_net = inverse_net

  self._reward_normalizer = ScalarAdaptiveNormalizer(
      speed=reward_adapt_speed)
def testExclusive(self, dtype):
  if dtype == tf.string:
    self.skipTest("Not compatible with string type.")
  spec = tensor_spec.TensorSpec((2, 3), dtype=dtype)
  self.assertIs(
      tensor_spec.is_discrete(spec) ^ tensor_spec.is_continuous(spec), True)
def __init__(self,
             input_tensor_spec,
             output_tensor_spec,
             fc_layer_params=(200, 100),
             activation_fn=tf.nn.relu,
             output_activation_fn=None,
             kernel_initializer=None,
             last_kernel_initializer=None,
             discrete_projection_net=_categorical_projection_net,
             continuous_projection_net=_normal_projection_net,
             name='PolicyNetwork'):
  """Creates an instance of `PolicyNetwork`.

  Args:
    input_tensor_spec: A possibly nested container of
      `tensor_spec.TensorSpec` representing the inputs.
    output_tensor_spec: A possibly nested container of
      `tensor_spec.TensorSpec` representing the outputs.
    fc_layer_params: Optional list of fully connected parameters after
      merging all inputs, where each item is the number of units in the
      layer.
    activation_fn: Activation function, e.g. tf.nn.relu, slim.leaky_relu, ...
    output_activation_fn: Activation function for the last layer. This can
      be used to restrict the range of the output. For example, one can pass
      tf.keras.activations.sigmoid here to restrict the output to be bounded
      between 0 and 1.
    kernel_initializer: kernel initializer for all layers except for the
      value regression layer. If None, a VarianceScaling initializer will be
      used.
    last_kernel_initializer: kernel initializer for the value regression
      layer. If None, a RandomUniform initializer will be used.
    discrete_projection_net: projection layer for discrete actions.
    continuous_projection_net: projection layer for continuous actions.
    name: A string representing the name of the network.
  """

  def map_proj(spec):
    if tensor_spec.is_discrete(spec):
      return discrete_projection_net(spec)
    else:
      return continuous_projection_net(spec)

  projection_networks = tf.nest.map_structure(map_proj, output_tensor_spec)
  output_spec = tf.nest.map_structure(
      lambda proj_net: proj_net.output_spec, projection_networks)

  if tensor_spec.is_discrete(output_tensor_spec):
    action_dim = np.unique(output_tensor_spec.maximum -
                           output_tensor_spec.minimum + 1)
  else:
    action_dim = output_tensor_spec.shape.num_elements()

  super(PolicyNetwork, self).__init__(
      input_tensor_spec=input_tensor_spec,
      state_spec=(),
      output_spec=output_spec,
      name=name)

  self._flat_specs = tf.nest.flatten(input_tensor_spec)

  if kernel_initializer is None:
    kernel_initializer = tf.compat.v1.keras.initializers.VarianceScaling(
        scale=1. / 3., mode='fan_in', distribution='uniform')
  if last_kernel_initializer is None:
    last_kernel_initializer = tf.keras.initializers.RandomUniform(
        minval=-0.003, maxval=0.003)

  self._fc_layers = utils.mlp_layers(
      None,
      fc_layer_params,
      activation_fn=activation_fn,
      kernel_initializer=kernel_initializer,
      name='mlp')
  self._fc_layers.append(
      tf.keras.layers.Dense(
          action_dim,
          activation=output_activation_fn,
          kernel_initializer=last_kernel_initializer,
          name='value'))

  self._projection_networks = projection_networks
  self._output_tensor_spec = output_tensor_spec
def __init__(self,
             observation_spec,
             feature_spec,
             action_spec,
             dynamics_module: DynamicsLearningAlgorithm,
             reward_module: RewardEstimationAlgorithm,
             planner_module: PlanAlgorithm,
             gradient_clipping=None,
             debug_summaries=False,
             name="MbrlAlgorithm"):
  """Create an MbrlAlgorithm.

  The MbrlAlgorithm takes as input the following set of modules for making
  decisions on actions based on the current observation:
  1) learnable/fixed dynamics module
  2) learnable/fixed reward module
  3) learnable/fixed planner module

  Args:
    action_spec (nested BoundedTensorSpec): representing the actions.
    dynamics_module (DDLAlgorithm): module for learning to predict the next
      feature based on the previous feature and action. It should accept
      input with spec [feature_spec, encoded_action_spec] and output a
      tensor of shape feature_spec. For a discrete action, encoded_action is
      a one-hot representation of the action. For a continuous action, the
      encoded action is the same as the original action.
    reward_module (REAlgorithm): module for calculating the reward, i.e.,
      evaluating the reward for a (s, a) pair.
    planner_module (PLANAlgorithm): module for generating planned actions
      based on the specified reward function and dynamics function.
    gradient_clipping (float): norm length to clip gradients.
    debug_summaries (bool): True if debug summaries should be created.
    name (str): The name of this algorithm.
  """
  train_state_spec = MbrlState(
      dynamics=dynamics_module.train_state_spec, reward=(), planner=())
  super().__init__(
      feature_spec,
      action_spec,
      train_state_spec=train_state_spec,
      gradient_clipping=gradient_clipping,
      debug_summaries=debug_summaries,
      name=name)
  flat_action_spec = tf.nest.flatten(action_spec)
  action_spec = flat_action_spec[0]

  assert not tensor_spec.is_discrete(action_spec), (
      "only support continuous control")
  num_actions = action_spec.shape[-1]

  flat_feature_spec = tf.nest.flatten(feature_spec)
  assert len(flat_feature_spec) == 1, (
      "Mbrl doesn't support nested feature_spec")
  feature_dim = flat_feature_spec[0].shape[-1]

  self._action_spec = action_spec
  self._num_actions = num_actions

  self._dynamics_module = dynamics_module
  self._reward_module = reward_module
  self._planner_module = planner_module
  self._planner_module.set_reward_func(self._calc_step_reward)
  self._planner_module.set_dynamics_func(self._predict_next_step)
def check_supported_spec(spec):
  if tensor_spec.is_discrete(spec):
    assert len(spec.shape) == 0 or (
        len(spec.shape) == 1 and spec.shape[0] == 1)
  else:
    assert len(spec.shape) == 1
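# Usage sketch for `check_supported_spec`: discrete specs must be scalar (or
# a length-1 vector), continuous specs must be rank-1. Illustrative specs.
import tensorflow as tf
from tf_agents.specs import tensor_spec

check_supported_spec(
    tensor_spec.BoundedTensorSpec((), tf.int32, minimum=0, maximum=3))  # ok
check_supported_spec(tensor_spec.TensorSpec((6,), tf.float32))          # ok
# check_supported_spec(tensor_spec.TensorSpec((2, 3), tf.float32))  # AssertionError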
def testExclusive(self, dtype):
  spec = tensor_spec.TensorSpec((2, 3), dtype=dtype)
  self.assertIs(
      tensor_spec.is_discrete(spec) ^ tensor_spec.is_continuous(spec), True)
def __init__(self,
             time_step_spec=None,
             action_spec=None,
             reward_network=None,
             observation_and_action_constraint_splitter=None,
             expose_predicted_rewards=False,
             name=None):
  """Builds a GreedyRewardPredictionPolicy given a reward tf_agents.Network.

  This policy takes a tf_agents.Network predicting rewards and generates the
  action corresponding to the largest predicted reward.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of `BoundedTensorSpec` representing the actions.
    reward_network: An instance of a `tf_agents.network.Network`, callable
      via `network(observation, step_type) -> (output, final_state)`.
    observation_and_action_constraint_splitter: A function used for masking
      valid/invalid actions with each state of the environment. The function
      takes in a full observation and returns a tuple consisting of 1) the
      part of the observation intended as input to the network and 2) the
      mask. The mask should be a 0-1 `Tensor` of shape
      `[batch_size, num_actions]`. This function should also work with a
      `TensorSpec` as input, and should output `TensorSpec` objects for the
      observation and mask.
    expose_predicted_rewards: (bool) Whether to expose the predicted rewards
      in the policy info field under the name 'predicted_rewards'.
    name: The name of this policy. All variables in this module will fall
      under that name. Defaults to the class name.

  Raises:
    NotImplementedError: If `action_spec` contains more than one
      `BoundedTensorSpec` or the `BoundedTensorSpec` is not valid.
  """
  self._observation_and_action_constraint_splitter = (
      observation_and_action_constraint_splitter)

  flat_action_spec = tf.nest.flatten(action_spec)
  if len(flat_action_spec) > 1:
    raise NotImplementedError(
        'action_spec can only contain a single BoundedTensorSpec.')

  action_spec = flat_action_spec[0]
  if (not tensor_spec.is_bounded(action_spec) or
      not tensor_spec.is_discrete(action_spec) or
      action_spec.shape.rank > 1 or
      action_spec.shape.num_elements() != 1):
    raise NotImplementedError(
        'action_spec must be a BoundedTensorSpec of type int32 and '
        'shape (). Found {}.'.format(action_spec))
  self._expected_num_actions = action_spec.maximum - action_spec.minimum + 1
  self._action_offset = action_spec.minimum
  self._reward_network = reward_network
  self._expose_predicted_rewards = expose_predicted_rewards
  if expose_predicted_rewards:
    info_spec = PolicyInfo(
        predicted_rewards=tensor_spec.TensorSpec(
            [self._expected_num_actions], dtype=tf.float32))
  else:
    info_spec = ()

  super(GreedyRewardPredictionPolicy, self).__init__(
      time_step_spec,
      action_spec,
      policy_state_spec=reward_network.state_spec,
      clip=False,
      info_spec=info_spec,
      name=name)
def _encode_action(self, action):
  if tensor_spec.is_discrete(self._action_spec):
    return tf.one_hot(indices=action, depth=self._num_actions)
  else:
    return action
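# Standalone illustration of the encoding above: discrete actions become
# one-hot vectors, while continuous actions would pass through unchanged.
import tensorflow as tf

actions = tf.constant([0, 2, 1])
encoded = tf.one_hot(indices=actions, depth=3)
# encoded == [[1., 0., 0.], [0., 0., 1.], [0., 1., 0.]]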
def __init__(self,
             time_step_spec: types.TimeStep,
             action_spec: types.NestedTensorSpec,
             alpha: Sequence[tf.Variable],
             beta: Sequence[tf.Variable],
             observation_and_action_constraint_splitter: Optional[
                 types.Splitter] = None,
             emit_policy_info: Sequence[Text] = (),
             name: Optional[Text] = None):
  """Builds a BernoulliThompsonSamplingPolicy.

  For a reference, see e.g., Chapter 3 in "A Tutorial on Thompson Sampling"
  by Russo et al. (https://web.stanford.edu/~bvr/pubs/TS_Tutorial.pdf).

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of `BoundedTensorSpec` representing the actions.
    alpha: list or tuple of tf.Variable's. It holds the `alpha` parameter of
      the beta distribution of each arm.
    beta: list or tuple of tf.Variable's. It holds the `beta` parameter of
      the beta distribution of each arm.
    observation_and_action_constraint_splitter: A function used for masking
      valid/invalid actions with each state of the environment. The function
      takes in a full observation and returns a tuple consisting of 1) the
      part of the observation intended as input to the network and 2) the
      mask. The mask should be a 0-1 `Tensor` of shape
      `[batch_size, num_actions]`. This function should also work with a
      `TensorSpec` as input, and should output `TensorSpec` objects for the
      observation and mask.
    emit_policy_info: (tuple of strings) what side information we want to
      get as part of the policy info. Allowed values can be found in
      `policy_utilities.PolicyInfo`.
    name: The name of this policy. All variables in this module will fall
      under that name. Defaults to the class name.

  Raises:
    NotImplementedError: If `action_spec` contains more than one
      `BoundedTensorSpec` or the `BoundedTensorSpec` is not valid.
  """
  flat_action_spec = tf.nest.flatten(action_spec)
  if len(flat_action_spec) > 1:
    raise NotImplementedError(
        'action_spec can only contain a single BoundedTensorSpec.')

  action_spec = flat_action_spec[0]
  if (not tensor_spec.is_bounded(action_spec) or
      not tensor_spec.is_discrete(action_spec) or
      action_spec.shape.rank > 1 or
      action_spec.shape.num_elements() != 1):
    raise NotImplementedError(
        'action_spec must be a BoundedTensorSpec of integer type and '
        'shape (). Found {}.'.format(action_spec))
  self._expected_num_actions = action_spec.maximum - action_spec.minimum + 1
  if len(alpha) != self._expected_num_actions:
    raise ValueError(
        'The size of alpha parameters is expected to be equal '
        'to the number of actions, but found to be {}'.format(len(alpha)))
  self._alpha = alpha
  if len(alpha) != len(beta):
    raise ValueError(
        'The size of alpha parameters is expected to be equal '
        'to the size of beta parameters')
  self._beta = beta

  self._emit_policy_info = emit_policy_info
  predicted_rewards_mean = ()
  if policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN in emit_policy_info:
    predicted_rewards_mean = tensor_spec.TensorSpec(
        [self._expected_num_actions])
  predicted_rewards_sampled = ()
  if policy_utilities.InfoFields.PREDICTED_REWARDS_SAMPLED in (
      emit_policy_info):
    predicted_rewards_sampled = tensor_spec.TensorSpec(
        [self._expected_num_actions])
  info_spec = policy_utilities.PolicyInfo(
      predicted_rewards_mean=predicted_rewards_mean,
      predicted_rewards_sampled=predicted_rewards_sampled)

  super(BernoulliThompsonSamplingPolicy, self).__init__(
      time_step_spec,
      action_spec,
      info_spec=info_spec,
      emit_log_probability='log_probability' in emit_policy_info,
      observation_and_action_constraint_splitter=(
          observation_and_action_constraint_splitter),
      name=name)
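# Standalone sketch of the Bernoulli Thompson sampling step behind the
# policy above: draw one sample per arm from its Beta posterior and play the
# argmax arm. The alpha/beta counts are made-up values.
import tensorflow as tf
import tensorflow_probability as tfp

alpha = tf.constant([2.0, 5.0, 1.0])  # per-arm successes + 1
beta = tf.constant([3.0, 2.0, 1.0])   # per-arm failures + 1
sampled_means = tfp.distributions.Beta(alpha, beta).sample()
action = tf.argmax(sampled_means)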
def policy_gradient_loss(self,
                         time_steps,
                         actions,
                         sample_action_log_probs,
                         advantages,
                         current_policy_distribution,
                         weights,
                         debug_summaries=False):
  """Create tensor for policy gradient loss.

  All tensors should have a single batch dimension.

  Args:
    time_steps: TimeSteps with observations for each timestep.
    actions: Tensor of actions for timesteps, aligned on index.
    sample_action_log_probs: Tensor of sample probability of each action.
    advantages: Tensor of advantage estimate for each timestep, aligned on
      index. Works better when advantage estimates are normalized.
    current_policy_distribution: The policy distribution, evaluated on all
      time_steps.
    weights: Optional scalar or element-wise (per-batch-entry) importance
      weights. Includes a mask for invalid timesteps.
    debug_summaries: True if debug summaries should be created.

  Returns:
    policy_gradient_loss: A tensor that will contain policy gradient loss
      for the on-policy experience.
  """
  tf.nest.assert_same_structure(time_steps, self.time_step_spec)
  action_log_prob = common.log_probability(current_policy_distribution,
                                           actions, self._action_spec)
  action_log_prob = tf.cast(action_log_prob, tf.float32)
  if self._log_prob_clipping > 0.0:
    action_log_prob = tf.clip_by_value(action_log_prob,
                                       -self._log_prob_clipping,
                                       self._log_prob_clipping)
  if self._check_numerics:
    action_log_prob = tf.debugging.check_numerics(action_log_prob,
                                                  'action_log_prob')

  # Prepare both clipped and unclipped importance ratios.
  importance_ratio = tf.exp(action_log_prob - sample_action_log_probs)
  importance_ratio_clipped = tf.clip_by_value(
      importance_ratio, 1 - self._importance_ratio_clipping,
      1 + self._importance_ratio_clipping)

  if self._check_numerics:
    importance_ratio = tf.debugging.check_numerics(importance_ratio,
                                                   'importance_ratio')
    if self._importance_ratio_clipping > 0.0:
      importance_ratio_clipped = tf.debugging.check_numerics(
          importance_ratio_clipped, 'importance_ratio_clipped')

  # Pessimistically choose the minimum objective value for clipped and
  # unclipped importance ratios.
  per_timestep_objective = importance_ratio * advantages
  per_timestep_objective_clipped = importance_ratio_clipped * advantages
  per_timestep_objective_min = tf.minimum(per_timestep_objective,
                                          per_timestep_objective_clipped)

  if self._importance_ratio_clipping > 0.0:
    policy_gradient_loss = -per_timestep_objective_min
  else:
    policy_gradient_loss = -per_timestep_objective
  policy_gradient_loss = tf.reduce_mean(
      input_tensor=policy_gradient_loss * weights)

  if debug_summaries:
    if self._importance_ratio_clipping > 0.0:
      clip_fraction = tf.reduce_mean(
          input_tensor=tf.cast(
              tf.greater(
                  tf.abs(importance_ratio - 1.0),
                  self._importance_ratio_clipping), tf.float32))
      tf.compat.v2.summary.scalar(
          name='clip_fraction',
          data=clip_fraction,
          step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='action_log_prob',
        data=action_log_prob,
        step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='action_log_prob_sample',
        data=sample_action_log_probs,
        step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='importance_ratio',
        data=importance_ratio,
        step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='importance_ratio_mean',
        data=tf.reduce_mean(input_tensor=importance_ratio),
        step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='importance_ratio_clipped',
        data=importance_ratio_clipped,
        step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='per_timestep_objective',
        data=per_timestep_objective,
        step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='per_timestep_objective_clipped',
        data=per_timestep_objective_clipped,
        step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='per_timestep_objective_min',
        data=per_timestep_objective_min,
        step=self.train_step_counter)
    entropy = common.entropy(current_policy_distribution, self.action_spec)
    tf.compat.v2.summary.histogram(
        name='policy_entropy', data=entropy, step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='policy_entropy_mean',
        data=tf.reduce_mean(input_tensor=entropy),
        step=self.train_step_counter)
    for i, (single_action, single_distribution) in enumerate(
        zip(
            tf.nest.flatten(self.action_spec),
            tf.nest.flatten(current_policy_distribution))):
      # Categorical distribution (used for discrete actions) doesn't have a
      # mean.
      distribution_index = '_{}'.format(i) if i > 0 else ''
      if not tensor_spec.is_discrete(single_action):
        tf.compat.v2.summary.histogram(
            name='actions_distribution_mean' + distribution_index,
            data=single_distribution.mean(),
            step=self.train_step_counter)
        tf.compat.v2.summary.histogram(
            name='actions_distribution_stddev' + distribution_index,
            data=single_distribution.stddev(),
            step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='policy_gradient_loss',
        data=policy_gradient_loss,
        step=self.train_step_counter)

  if self._check_numerics:
    policy_gradient_loss = tf.debugging.check_numerics(
        policy_gradient_loss, 'policy_gradient_loss')

  return policy_gradient_loss
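# Minimal numeric sketch of the clipped surrogate objective computed above
# (PPO-style pessimism between clipped and unclipped ratios). Values are
# made up for illustration.
import tensorflow as tf

ratio = tf.constant([0.5, 1.0, 1.8])       # exp(new_log_prob - old_log_prob)
advantages = tf.constant([1.0, -2.0, 0.5])
eps = 0.2                                  # importance_ratio_clipping
clipped = tf.clip_by_value(ratio, 1.0 - eps, 1.0 + eps)
loss = -tf.reduce_mean(tf.minimum(ratio * advantages, clipped * advantages))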
def evaluate_policy():
  env_load_fn = suite_mujoco.load
  categorical = True
  FLAGS = flags.FLAGS
  dim_z = FLAGS.dim_z
  mask_xy = FLAGS.mask_xy
  eval_env_name = FLAGS.env_name
  skill_epsilon = FLAGS.skill_epsilon
  epsilon = 0.75
  epsilon_greedy = False
  state_noise = False
  action_noise = False
  skill_randomization = True
  plot_actions = False

  def _env_load_fn(env_name):
    diayn_wrapper = (
        lambda x: diayn_gym_env_fixed.DiaynGymEnvFixed(x, dim_z, categorical))
    return env_load_fn(
        env_name,
        gym_env_wrappers=[diayn_wrapper],
    )

  root_dir = FLAGS.root_dir
  policy_fc_layers = (256, 256)
  env_steps = tf_metrics.EnvironmentSteps(prefix='Eval')
  _preprocessing_combiner = DictConcatenateLayer()
  global_step = tf.compat.v1.train.get_or_create_global_step()

  if eval_env_name == "Plane-v1":
    make_video = False
  else:
    make_video = True

  tf_env = tf_py_environment.TFPyEnvironment(_env_load_fn(eval_env_name))
  eval_py_env = _env_load_fn(eval_env_name)
  eval_tf_env = tf_py_environment.TFPyEnvironment(eval_py_env)

  time_step_spec = tf_env.time_step_spec()
  observation_spec = time_step_spec.observation
  action_spec = tf_env.action_spec()
  augmented_time_step_spec = tf_env.time_step_spec()
  augmented_observation_spec = augmented_time_step_spec.observation
  z_spec = augmented_observation_spec["z"]

  if tensor_spec.is_discrete(z_spec):
    _preprocessing_combiner = OneHotConcatenateLayer(dim_z)
  else:
    _preprocessing_combiner = DictConcatenateLayer()

  actor_net = actor_distribution_network.ActorDistributionNetwork(
      augmented_observation_spec,
      action_spec,
      fc_layer_params=policy_fc_layers,
      continuous_projection_net=normal_projection_net,
      preprocessing_combiner=_preprocessing_combiner,
      mask_xy=mask_xy,
      name='EvalNetwork')

  generator_checkpointer = common.Checkpointer(
      ckpt_dir=os.path.join(root_dir, 'diayn_actor'),
      actor_net=actor_net,
      global_step=global_step)
  if generator_checkpointer.checkpoint_exists:
    generator_checkpointer.initialize_or_restore()
  else:
    generator_checkpointer.initialize_or_restore()
    print("No low-level actor checkpoint exists...training from scratch")

  # Re-purpose the restored actor network.
  generator_net = actor_net
  eval_policy = actor_policy.ActorPolicy(
      time_step_spec=augmented_time_step_spec,
      action_spec=action_spec,
      actor_network=generator_net,
      training=False)
  print("Loaded evaluation policy")

  if make_video:
    print("Creating video")
    skill_path = root_dir + "/skills"
    action_path = root_dir + "/actions"
    if not path.exists(skill_path):
      os.mkdir(skill_path)
    if not path.exists(action_path):
      os.mkdir(action_path)
    color_wheel = ['b', 'r', 'g', 'c', 'm']
    for i in range(5):
      for runs in range(dim_z):
        xs_list = []
        ys_list = []
        actions_list = {new_list: [] for new_list in range(8)}
        video_filename = (root_dir + '/skills/' + str(i + 1) +
                          eval_env_name[:-3] + '.mp4')
        skill_plot_filename = (root_dir + '/skills/' + str(i + 1) +
                               eval_env_name[:-3] + 'eps' +
                               str(skill_epsilon) + '.png')
        action_plot_filename = (root_dir + '/actions/' + str(i + 1) +
                                eval_env_name[:-3] + '.png')
        path_len = 200
        num_eps = 1
        print_interval = 20
        action_interval = 10
        skill_sample_interval = 20
        print("skill {}".format(i))
        with imageio.get_writer(video_filename, fps=60) as video:
          for _ in range(num_eps):
            if categorical:
              eval_py_env.set_z(i)
            else:
              skill = [0] * dim_z
              skill[i] = 1
              eval_py_env.set_z(skill)
            _time_step = eval_py_env.reset()
            print("{} {}".format(
                _time_step.observation["observation"][:2][0],
                _time_step.observation["observation"][:2][1]))
            video.append_data(eval_py_env.render())
            steps = 0
            while steps < path_len:
              if skill_randomization:
                if steps % skill_sample_interval == 0:
                  if np.random.random() < skill_epsilon:
                    sampled_skill = np.random.choice(dim_z)
                    eval_py_env.set_z(sampled_skill)
                    print("randomly sampled skill: {}".format(sampled_skill))
                  else:
                    eval_py_env.set_z(i)
                    print("stuck with skill {}".format(i))
              if state_noise:
                _time_step.observation["observation"] = np.random.normal(
                    _time_step.observation["observation"], scale=0.25)
              if epsilon_greedy:
                sample = np.random.random_sample()
                if sample < epsilon:
                  action = tensor_spec.sample_spec_nest(action_spec).numpy()
                else:
                  action = eval_policy.action(_time_step).action.numpy()
              else:
                action = eval_policy.action(_time_step).action.numpy()
              if steps % action_interval == 0:
                for index in range(action.shape[0]):
                  actions_list[index].append(action[index])
              if action_noise:
                noisy_action = np.random.normal(action, scale=1.0)
                _time_step = eval_py_env.step(noisy_action)
              else:
                _time_step = eval_py_env.step(action)
              if steps % print_interval == 0:
                print("{} {}".format(
                    _time_step.observation["observation"][:2][0],
                    _time_step.observation["observation"][:2][1]))
              xs_list.append(_time_step.observation["observation"][:2][0])
              ys_list.append(_time_step.observation["observation"][:2][1])
              video.append_data(eval_py_env.render())
              steps += 1
        embed_mp4(video_filename)
        if plot_actions:
          plt.ylim(-1, 1)
          for i in range(8):
            plt.plot(range(int(200 / action_interval)), actions_list[i])
        plt.xlim(-25, 25)
        plt.ylim(-25, 25)
        plt.plot(xs_list, ys_list, color_wheel[i])
        plt.savefig(skill_plot_filename)
        if plot_actions:
          plt.savefig(action_plot_filename)
  else:
    print("Rendering plane skills")
    skill_path = root_dir + "/skills"
    if not path.exists(skill_path):
      os.mkdir(skill_path)
    for i in range(0, dim_z, 1):
      path_len = 10
      num_eps = 1
      xs_list = []
      ys_list = []
      skill_plot_filename = (root_dir + '/skills/' + str(i + 1) +
                             eval_env_name[:-3] + '.png')
      for _ in range(num_eps):
        if categorical:
          eval_py_env.set_z(i)
          print("skill {}".format(i + 1))
        else:
          skill = [0] * dim_z
          skill[i] = 1
          eval_py_env.set_z(skill)
        _time_step = eval_py_env.reset()
        eval_py_env.render()
        steps = 0
        while steps < path_len:
          xs_list.append(_time_step.observation["observation"][0])
          ys_list.append(_time_step.observation["observation"][1])
          action_step = eval_policy.action(_time_step)
          _time_step = eval_py_env.step(action_step.action.numpy())
          eval_py_env.render()
          steps += 1
      plt.xlim(-100, 100)
      plt.ylim(-100, 100)
      plt.plot(xs_list, ys_list)
      plt.savefig(skill_plot_filename)
def __init__(self,
             input_tensor_spec,
             output_tensor_spec,
             input_fc_layer_params=(200, 100),
             output_fc_layer_params=(200, 100),
             conv_layer_params=None,
             lstm_size=(40,),
             activation_fn=tf.keras.activations.relu,
             categorical_projection_net=_categorical_projection_net,
             normal_projection_net=_normal_projection_net,
             name='ActorDistributionRnnNetwork'):
  """Creates an instance of `ActorDistributionRnnNetwork`.

  Args:
    input_tensor_spec: A nest of `tensor_spec.TensorSpec` representing the
      input.
    output_tensor_spec: A nest of `tensor_spec.BoundedTensorSpec`
      representing the output.
    input_fc_layer_params: Optional list of fully connected parameters,
      where each item is the number of units in the layer. This is applied
      before the LSTM cell.
    output_fc_layer_params: Optional list of fully connected parameters,
      where each item is the number of units in the layer. This is applied
      after the LSTM cell.
    conv_layer_params: Optional list of convolution layer parameters, where
      each item is a length-three tuple indicating
      (filters, kernel_size, stride).
    lstm_size: An iterable of ints specifying the LSTM cell sizes to use.
    activation_fn: Activation function, e.g. tf.nn.relu, slim.leaky_relu, ...
    categorical_projection_net: Callable that generates a categorical
      projection network to be called with some hidden state and the
      outer_rank of the state.
    normal_projection_net: Callable that generates a normal projection
      network to be called with some hidden state and the outer_rank of the
      state.
    name: A string representing the name of the network.

  Raises:
    ValueError: If `input_tensor_spec` contains more than one observation.
  """
  if len(tf.nest.flatten(input_tensor_spec)) > 1:
    raise ValueError(
        'Only a single observation is supported by this network')

  input_layers = utils.mlp_layers(
      conv_layer_params,
      input_fc_layer_params,
      activation_fn=activation_fn,
      kernel_initializer=tf.compat.v1.keras.initializers.glorot_uniform(),
      name='input_mlp')

  # Create RNN cell.
  if len(lstm_size) == 1:
    cell = tf.keras.layers.LSTMCell(lstm_size[0])
  else:
    cell = tf.keras.layers.StackedRNNCells(
        [tf.keras.layers.LSTMCell(size) for size in lstm_size])

  state_spec = tf.nest.map_structure(
      functools.partial(
          tensor_spec.TensorSpec, dtype=tf.float32,
          name='network_state_spec'), cell.state_size)

  output_layers = utils.mlp_layers(
      fc_layer_params=output_fc_layer_params, name='output')

  projection_networks = []
  for single_output_spec in tf.nest.flatten(output_tensor_spec):
    if tensor_spec.is_discrete(single_output_spec):
      projection_networks.append(
          categorical_projection_net(single_output_spec))
    else:
      projection_networks.append(normal_projection_net(single_output_spec))

  projection_distribution_specs = [
      proj_net.output_spec for proj_net in projection_networks
  ]
  output_spec = tf.nest.pack_sequence_as(output_tensor_spec,
                                         projection_distribution_specs)

  super(ActorDistributionRnnNetwork, self).__init__(
      input_tensor_spec=input_tensor_spec,
      state_spec=state_spec,
      output_spec=output_spec,
      name=name)

  self._conv_layer_params = conv_layer_params
  self._input_layers = input_layers
  self._dynamic_unroll = dynamic_unroll_layer.DynamicUnroll(cell)
  self._output_layers = output_layers
  self._projection_networks = projection_networks
  self._output_tensor_spec = output_tensor_spec
def __init__(
    self,
    time_step_spec: Optional[ts.TimeStep],
    action_spec: Optional[NestedBoundedTensorSpec],
    scalarizer: multi_objective_scalarizer.Scalarizer,
    objective_networks: Sequence[Network],
    observation_and_action_constraint_splitter: types.Splitter = None,
    accepts_per_arm_features: bool = False,
    emit_policy_info: Tuple[Text] = (),
    name: Optional[Text] = None):
  """Builds a GreedyMultiObjectiveNeuralPolicy based on multiple networks.

  This policy takes an iterable of `tf_agents.Network`, each responsible for
  predicting a specific objective, along with a `Scalarizer` object to
  generate an action by maximizing the scalarized objective, i.e., the
  output of the `Scalarizer` applied to the multiple predicted objectives by
  the networks.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of `BoundedTensorSpec` representing the actions.
    scalarizer: A
      `tf_agents.bandits.multi_objective.multi_objective_scalarizer.Scalarizer`
      object that implements scalarization of multiple objectives into a
      single scalar reward.
    objective_networks: A Sequence of `tf_agents.network.Network` objects to
      be used by the policy. Each network will be called with
      call(observation, step_type) and is expected to provide a prediction
      for a specific objective for all actions.
    observation_and_action_constraint_splitter: A function used for masking
      valid/invalid actions with each state of the environment. The function
      takes in a full observation and returns a tuple consisting of 1) the
      part of the observation intended as input to the network and 2) the
      mask. The mask should be a 0-1 `Tensor` of shape
      `[batch_size, num_actions]`. This function should also work with a
      `TensorSpec` as input, and should output `TensorSpec` objects for the
      observation and mask.
    accepts_per_arm_features: (bool) Whether the policy accepts per-arm
      features.
    emit_policy_info: (tuple of strings) what side information we want to
      get as part of the policy info. Allowed values can be found in
      `policy_utilities.PolicyInfo`.
    name: The name of this policy. All variables in this module will fall
      under that name. Defaults to the class name.

  Raises:
    NotImplementedError: If `action_spec` contains more than one
      `BoundedTensorSpec` or the `BoundedTensorSpec` is not valid.
    NotImplementedError: If `action_spec` is not a `BoundedTensorSpec` of
      type int32 and shape ().
    ValueError: If `objective_networks` has fewer than two networks.
    ValueError: If `accepts_per_arm_features` is true but `time_step_spec`
      is None.
  """
  flat_action_spec = tf.nest.flatten(action_spec)
  if len(flat_action_spec) > 1:
    raise NotImplementedError(
        'action_spec can only contain a single BoundedTensorSpec.')

  action_spec = flat_action_spec[0]
  if (not tensor_spec.is_bounded(action_spec) or
      not tensor_spec.is_discrete(action_spec) or
      action_spec.shape.rank > 1 or
      action_spec.shape.num_elements() != 1):
    raise NotImplementedError(
        'action_spec must be a BoundedTensorSpec of type int32 and '
        'shape (). Found {}.'.format(action_spec))
  self._expected_num_actions = action_spec.maximum - action_spec.minimum + 1
  self._action_offset = action_spec.minimum
  policy_state_spec = []
  for network in objective_networks:
    policy_state_spec.append(network.state_spec)
    network.create_variables()
  self._objective_networks = objective_networks
  self._scalarizer = scalarizer
  self._num_objectives = len(self._objective_networks)
  if self._num_objectives < 2:
    raise ValueError(
        'Number of objectives should be at least two, but found to be {}'
        .format(self._num_objectives))

  self._emit_policy_info = emit_policy_info
  predicted_rewards_mean = ()
  if policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN in emit_policy_info:
    predicted_rewards_mean = tensor_spec.TensorSpec(
        [self._num_objectives, self._expected_num_actions])
  bandit_policy_type = ()
  if policy_utilities.InfoFields.BANDIT_POLICY_TYPE in emit_policy_info:
    bandit_policy_type = (
        policy_utilities.create_bandit_policy_type_tensor_spec(shape=[1]))
  if accepts_per_arm_features:
    if time_step_spec is None:
      raise ValueError(
          'time_step_spec should not be None for per-arm-features policies, '
          'but found to be.')
    # The features for the chosen arm are saved to policy_info.
    chosen_arm_features_info = (
        policy_utilities.create_chosen_arm_features_info_spec(
            time_step_spec.observation,
            observation_and_action_constraint_splitter))
    info_spec = policy_utilities.PerArmPolicyInfo(
        predicted_rewards_mean=predicted_rewards_mean,
        bandit_policy_type=bandit_policy_type,
        chosen_arm_features=chosen_arm_features_info)
  else:
    info_spec = policy_utilities.PolicyInfo(
        predicted_rewards_mean=predicted_rewards_mean,
        bandit_policy_type=bandit_policy_type)
  self._accepts_per_arm_features = accepts_per_arm_features

  super(GreedyMultiObjectiveNeuralPolicy, self).__init__(
      time_step_spec,
      action_spec,
      policy_state_spec=policy_state_spec,
      clip=False,
      info_spec=info_spec,
      emit_log_probability='log_probability' in emit_policy_info,
      observation_and_action_constraint_splitter=(
          observation_and_action_constraint_splitter),
      name=name)
def map_proj(spec):
  if tensor_spec.is_discrete(spec):
    return discrete_projection_net(spec)
  else:
    return continuous_projection_net(spec)
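# Sketch of applying `map_proj` over a nest of action specs, as the network
# constructors above do. The two projection factories are trivial stand-ins
# here; in tf_agents they would build CategoricalProjectionNetwork /
# NormalProjectionNetwork instances.
import tensorflow as tf
from tf_agents.specs import tensor_spec

discrete_projection_net = lambda spec: ('categorical_head', spec)
continuous_projection_net = lambda spec: ('normal_head', spec)

output_tensor_spec = {
    'gripper': tensor_spec.BoundedTensorSpec((), tf.int32, 0, 1),
    'arm': tensor_spec.BoundedTensorSpec((7,), tf.float32, -1.0, 1.0),
}
projection_networks = tf.nest.map_structure(map_proj, output_tensor_spec)
# 'gripper' gets the categorical head (discrete spec); 'arm' the normal head.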
def train_eval(
    root_dir,
    env_name=None,
    env_load_fn=suite_mujoco.load,
    random_seed=0,
    # TODO(b/127576522): rename to policy_fc_layers.
    actor_fc_layers=(200, 100),
    value_fc_layers=(200, 100),
    inference_fc_layers=(200, 100),
    use_rnns=None,
    dim_z=4,
    categorical=True,
    # Params for collect
    num_environment_steps=10000000,
    collect_episodes_per_iteration=30,
    num_parallel_environments=30,
    replay_buffer_capacity=1001,  # Per-environment
    # Params for train
    num_epochs=25,
    learning_rate=1e-4,
    entropy_regularization=None,
    kl_posteriors_penalty=None,
    mock_inference=None,
    mock_reward=None,
    l2_distance=None,
    rl_steps=None,
    inference_steps=None,
    # Params for eval
    num_eval_episodes=30,
    eval_interval=1000,
    # Params for summaries and logging
    train_checkpoint_interval=10000,
    policy_checkpoint_interval=10000,
    log_interval=1000,
    summary_interval=1000,
    summaries_flush_secs=1,
    use_tf_functions=True,
    debug_summaries=False,
    summarize_grads_and_vars=False):
  """A simple train and eval for PPO."""
  if root_dir is None:
    raise AttributeError('train_eval requires a root_dir.')

  root_dir = os.path.expanduser(root_dir)
  train_dir = os.path.join(root_dir, 'train')
  eval_dir = os.path.join(root_dir, 'eval')
  saved_model_dir = os.path.join(root_dir, 'policy_saved_model')

  train_summary_writer = tf.compat.v2.summary.create_file_writer(
      train_dir, flush_millis=summaries_flush_secs * 1000)
  train_summary_writer.set_as_default()

  eval_summary_writer = tf.compat.v2.summary.create_file_writer(
      eval_dir, flush_millis=summaries_flush_secs * 1000)
  eval_metrics = [
      tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
      tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
  ]

  global_step = tf.compat.v1.train.get_or_create_global_step()
  with tf.compat.v2.summary.record_if(
      lambda: tf.math.equal(global_step % summary_interval, 0)):
    tf.compat.v1.set_random_seed(random_seed)

    def _env_load_fn(env_name):
      diayn_wrapper = (
          lambda x: diayn_gym_env.DiaynGymEnv(x, dim_z, categorical))
      return env_load_fn(
          env_name,
          gym_env_wrappers=[diayn_wrapper],
      )

    eval_tf_env = tf_py_environment.TFPyEnvironment(_env_load_fn(env_name))
    if num_parallel_environments == 1:
      py_env = _env_load_fn(env_name)
    else:
      py_env = parallel_py_environment.ParallelPyEnvironment(
          [lambda: _env_load_fn(env_name)] * num_parallel_environments)
    tf_env = tf_py_environment.TFPyEnvironment(py_env)

    augmented_time_step_spec = tf_env.time_step_spec()
    augmented_observation_spec = augmented_time_step_spec.observation
    observation_spec = augmented_observation_spec['observation']
    z_spec = augmented_observation_spec['z']
    reward_spec = augmented_time_step_spec.reward
    action_spec = tf_env.action_spec()
    time_step_spec = ts.time_step_spec(observation_spec)

    infer_from_com = False
    if env_name == "AntRandGoalEval-v1":
      infer_from_com = True
    if infer_from_com:
      input_inference_spec = tspec.BoundedTensorSpec(
          shape=[2],
          dtype=tf.float64,
          minimum=-1.79769313e+308,
          maximum=1.79769313e+308,
          name='body_com')
    else:
      input_inference_spec = observation_spec

    if tensor_spec.is_discrete(z_spec):
      _preprocessing_combiner = OneHotConcatenateLayer(dim_z)
    else:
      _preprocessing_combiner = DictConcatenateLayer()

    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

    if use_rnns:
      actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
          augmented_observation_spec,
          action_spec,
          preprocessing_combiner=_preprocessing_combiner,
          input_fc_layer_params=actor_fc_layers,
          output_fc_layer_params=None)
      value_net = value_rnn_network.ValueRnnNetwork(
          augmented_observation_spec,
          preprocessing_combiner=_preprocessing_combiner,
          input_fc_layer_params=value_fc_layers,
          output_fc_layer_params=None)
    else:
      actor_net = actor_distribution_network.ActorDistributionNetwork(
          augmented_observation_spec,
          action_spec,
          preprocessing_combiner=_preprocessing_combiner,
          fc_layer_params=actor_fc_layers,
          name="actor_net")
      value_net = value_network.ValueNetwork(
          augmented_observation_spec,
          preprocessing_combiner=_preprocessing_combiner,
          fc_layer_params=value_fc_layers,
          name="critic_net")
    inference_net = actor_distribution_network.ActorDistributionNetwork(
        input_tensor_spec=input_inference_spec,
        output_tensor_spec=z_spec,
        fc_layer_params=inference_fc_layers,
        continuous_projection_net=normal_projection_net,
        name="inference_net")

    tf_agent = ppo_diayn_agent.PPODiaynAgent(
        augmented_time_step_spec,
        action_spec,
        z_spec,
        optimizer,
        actor_net=actor_net,
        value_net=value_net,
        inference_net=inference_net,
        num_epochs=num_epochs,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=global_step,
        entropy_regularization=entropy_regularization,
        kl_posteriors_penalty=kl_posteriors_penalty,
        mock_inference=mock_inference,
        mock_reward=mock_reward,
        infer_from_com=infer_from_com,
        l2_distance=l2_distance,
        rl_steps=rl_steps,
        inference_steps=inference_steps)
    tf_agent.initialize()

    environment_steps_metric = tf_metrics.EnvironmentSteps()
    step_metrics = [
        tf_metrics.NumberOfEpisodes(),
        environment_steps_metric,
    ]

    train_metrics = step_metrics + [
        tf_metrics.AverageReturnMetric(batch_size=num_parallel_environments),
        tf_metrics.AverageEpisodeLengthMetric(
            batch_size=num_parallel_environments),
    ]

    eval_policy = tf_agent.policy
    collect_policy = tf_agent.collect_policy

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        tf_agent.collect_data_spec,
        batch_size=num_parallel_environments,
        max_length=replay_buffer_capacity)

    actor_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(root_dir, 'diayn_actor'),
        actor_net=actor_net,
        global_step=global_step)
    train_checkpointer = common.Checkpointer(
        ckpt_dir=train_dir,
        agent=tf_agent,
        global_step=global_step,
        metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))
    policy_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(train_dir, 'diayn_policy'),
        policy=eval_policy,
        global_step=global_step)
    saved_model = policy_saver.PolicySaver(
        eval_policy, train_step=global_step)
    rb_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(root_dir, 'diayn_replay_buffer'),
        max_to_keep=1,
        replay_buffer=replay_buffer)
    inference_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(root_dir, 'diayn_inference'),
        inference_net=inference_net,
        global_step=global_step)

    actor_checkpointer.initialize_or_restore()
    train_checkpointer.initialize_or_restore()
    rb_checkpointer.initialize_or_restore()
    inference_checkpointer.initialize_or_restore()

    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        collect_policy,
        observers=[replay_buffer.add_batch] + train_metrics,
        num_episodes=collect_episodes_per_iteration)

    # option_length = 200
    # if env_name == "Plane-v1":
    #   option_length = 10
    # dataset = replay_buffer.as_dataset(
    #     num_parallel_calls=3, sample_batch_size=num_parallel_environments,
    #     num_steps=option_length)
    # iterator_dataset = iter(dataset)

    def train_step():
      trajectories = replay_buffer.gather_all()
      # trajectories, _ = next(iterator_dataset)
      return tf_agent.train(experience=trajectories)

    if use_tf_functions:
      # TODO(b/123828980): Enable once the cause for slowdown was identified.
      collect_driver.run = common.function(
          collect_driver.run, autograph=False)
      tf_agent.train = common.function(tf_agent.train, autograph=False)
      train_step = common.function(train_step)

    collect_time = 0
    train_time = 0
    timed_at_step = global_step.numpy()

    while environment_steps_metric.result() < num_environment_steps:
      global_step_val = global_step.numpy()
      if global_step_val % eval_interval == 0:
        metric_utils.eager_compute(
            eval_metrics,
            eval_tf_env,
            eval_policy,
            num_episodes=num_eval_episodes,
            train_step=global_step,
            summary_writer=eval_summary_writer,
            summary_prefix='Metrics',
        )

      start_time = time.time()
      collect_driver.run()
      collect_time += time.time() - start_time

      start_time = time.time()
      total_loss, _ = train_step()
      replay_buffer.clear()
      train_time += time.time() - start_time

      for train_metric in train_metrics:
        train_metric.tf_summaries(
            train_step=global_step, step_metrics=step_metrics)

      if global_step_val % log_interval == 0:
        logging.info('step = %d, loss = %f', global_step_val, total_loss)
        steps_per_sec = (
            (global_step_val - timed_at_step) / (collect_time + train_time))
        logging.info('%.3f steps/sec', steps_per_sec)
        logging.info('collect_time = {}, train_time = {}'.format(
            collect_time, train_time))
        with tf.compat.v2.summary.record_if(True):
          tf.compat.v2.summary.scalar(
              name='global_steps_per_sec',
              data=steps_per_sec,
              step=global_step)

      if global_step_val % train_checkpoint_interval == 0:
        train_checkpointer.save(global_step=global_step_val)
        inference_checkpointer.save(global_step=global_step_val)
        actor_checkpointer.save(global_step=global_step_val)
        rb_checkpointer.save(global_step=global_step_val)

      if global_step_val % policy_checkpoint_interval == 0:
        policy_checkpointer.save(global_step=global_step_val)
        saved_model_path = os.path.join(
            saved_model_dir, 'policy_' + ('%d' % global_step_val).zfill(9))
        saved_model.save(saved_model_path)

      timed_at_step = global_step_val
      collect_time = 0
      train_time = 0

    # One final eval before exiting.
    metric_utils.eager_compute(
        eval_metrics,
        eval_tf_env,
        eval_policy,
        num_episodes=num_eval_episodes,
        train_step=global_step,
        summary_writer=eval_summary_writer,
        summary_prefix='Metrics',
    )