Example #1
    def testUpdate(self):
        tf.compat.v1.set_random_seed(1)
        actor_network = DummyActorNet(self._action_spec)
        value_network = DummyValueNet()

        policy = ppo_policy.PPOPolicy(self._time_step_spec,
                                      self._action_spec,
                                      actor_network=actor_network,
                                      value_network=value_network)
        new_policy = ppo_policy.PPOPolicy(self._time_step_spec,
                                          self._action_spec,
                                          actor_network=actor_network,
                                          value_network=value_network)

        action_step = policy.action(self._time_step)
        new_action_step = new_policy.action(self._time_step)

        self.assertEqual(action_step.action.shape,
                         new_action_step.action.shape)
        self.assertEqual(action_step.action.dtype,
                         new_action_step.action.dtype)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.evaluate(new_policy.update(policy))
        actions_, new_actions_ = self.evaluate(
            [action_step.action, new_action_step.action])
        self.assertAllEqual(actions_, new_actions_)
Example #2
    def test_same_policy_same_output(self):
        if not tf.executing_eagerly():
            self.skipTest(
                'Skipping test: sequential networks not supported in TF1')
        observation_tensor_spec = tf.TensorSpec(shape=[1], dtype=tf.float32)
        action_tensor_spec = tensor_spec.BoundedTensorSpec((8, ), tf.float32,
                                                           -1, 1)

        value_net = value_network.ValueNetwork(observation_tensor_spec,
                                               fc_layer_params=(1, ))

        actor_net_lib = ppo_actor_network.PPOActorNetwork()
        actor_net_lib.seed_stream_class = DeterministicSeedStream
        actor_net_sequential = actor_net_lib.create_sequential_actor_net(
            fc_layer_units=(1, ),
            action_tensor_spec=action_tensor_spec,
            seed=1)
        actor_net_actor_dist = actor_distribution_network.ActorDistributionNetwork(
            observation_tensor_spec,
            action_tensor_spec,
            fc_layer_params=(1, ),
            activation_fn=tf.nn.tanh,
            kernel_initializer=tf.keras.initializers.Orthogonal(seed=1),
            seed_stream_class=DeterministicSeedStream,
            seed=1)

        tf.random.set_seed(111)
        seq_policy = ppo_policy.PPOPolicy(
            ts.time_step_spec(observation_tensor_spec),
            action_tensor_spec,
            actor_net_sequential,
            value_net,
            collect=True)
        tf.random.set_seed(111)
        actor_dist_policy = ppo_policy.PPOPolicy(
            ts.time_step_spec(observation_tensor_spec),
            action_tensor_spec,
            actor_net_actor_dist,
            value_net,
            collect=True)

        sample_timestep = ts.TimeStep(step_type=tf.constant([1, 1],
                                                            dtype=tf.int32),
                                      reward=tf.constant([1, 1],
                                                         dtype=tf.float32),
                                      discount=tf.constant([1, 1],
                                                           dtype=tf.float32),
                                      observation=tf.constant(
                                          [[1], [2]], dtype=tf.float32))
        seq_policy_step = seq_policy._distribution(sample_timestep,
                                                   policy_state=())
        act_dist_policy_step = actor_dist_policy._distribution(sample_timestep,
                                                               policy_state=())

        seq_scale = seq_policy_step.info['dist_params']['scale_diag']
        act_dist_scale = act_dist_policy_step.info['dist_params']['scale']
        self.assertAllEqual(seq_scale, act_dist_scale)
        self.assertAllEqual(seq_policy_step.info['dist_params']['loc'],
                            act_dist_policy_step.info['dist_params']['loc'])
Example #3
 def _make_policy(self, collect):
     return ppo_policy.PPOPolicy(
         time_step_spec=self.time_step_spec(),
         action_spec=self.action_spec(),
         actor_network=self._actor_net,
         value_network=self._value_net,
         observation_normalizer=self._observation_normalizer,
         clip=False,
         collect=collect)
Example #4
    def testBuild(self, network_cls):
        actor_network = network_cls(self._action_spec)
        value_network = DummyValueNet()

        policy = ppo_policy.PPOPolicy(self._time_step_spec,
                                      self._action_spec,
                                      actor_network=actor_network,
                                      value_network=value_network)

        self.assertEqual(policy.time_step_spec, self._time_step_spec)
        self.assertEqual(policy.action_spec, self._action_spec)
Example #5
    def testGaussianDistribution(self):
        actor_network = DummyActorDistributionNet(self._action_spec)
        value_network = DummyValueNet()

        policy = ppo_policy.PPOPolicy(self._time_step_spec,
                                      self._action_spec,
                                      actor_network=actor_network,
                                      value_network=value_network)

        distribution_step = policy.distribution(self._time_step)
        self.assertIsInstance(distribution_step.action,
                              tfp.distributions.Normal)
Example #6
    def testReset(self, network_cls):
        actor_network = network_cls(self._action_spec)
        value_network = DummyValueNet()

        policy = ppo_policy.PPOPolicy(self._time_step_spec,
                                      self._action_spec,
                                      actor_network=actor_network,
                                      value_network=value_network)

        policy_state = policy.get_initial_state(batch_size=1)

        # Dummy network has no policy_state so expect empty tuple from reset.
        self.assertEqual((), policy_state)
Example #7
    def testActionBatch(self, network_cls):
        actor_network = network_cls(self._action_spec)
        value_network = DummyValueNet()

        policy = ppo_policy.PPOPolicy(self._time_step_spec,
                                      self._action_spec,
                                      actor_network=actor_network,
                                      value_network=value_network)

        action_step = policy.action(self._time_step_batch)
        self.assertEqual(action_step.action.shape.as_list(), [2, 1])
        self.assertEqual(action_step.action.dtype, tf.float32)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        actions_ = self.evaluate(action_step.action)
        self.assertTrue(np.all(actions_ >= self._action_spec.minimum))
        self.assertTrue(np.all(actions_ <= self._action_spec.maximum))
Example #8
    def testActionList(self, network_cls):
        action_spec = [self._action_spec]
        actor_network = network_cls(action_spec)
        value_network = DummyValueNet()

        policy = ppo_policy.PPOPolicy(self._time_step_spec,
                                      action_spec,
                                      actor_network=actor_network,
                                      value_network=value_network)

        action_step = policy.action(self._time_step)
        self.assertIsInstance(action_step.action, list)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        actions_ = self.evaluate(action_step.action)
        self.assertTrue(np.all(actions_ >= action_spec[0].minimum))
        self.assertTrue(np.all(actions_ <= action_spec[0].maximum))
Example #9
    def testValueInPolicyInfo(self, network_cls):
        actor_network = network_cls(self._action_spec)
        value_network = DummyValueNet()

        policy = ppo_policy.PPOPolicy(self._time_step_spec,
                                      self._action_spec,
                                      actor_network=actor_network,
                                      value_network=value_network)

        policy_step = policy.action(self._time_step)
        self.assertEqual(policy_step.info['value_prediction'].shape.as_list(),
                         [1, 1])
        self.assertEqual(policy_step.info['value_prediction'].dtype,
                         tf.float32)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.evaluate(policy_step.info['value_prediction'])
Example #10
    def testDeterministicDistribution(self):
        actor_network = DummyActorNet(self._action_spec)
        value_network = DummyValueNet()

        policy = ppo_policy.PPOPolicy(self._time_step_spec,
                                      self._action_spec,
                                      actor_network=actor_network,
                                      value_network=value_network)

        action_step = policy.action(self._time_step)
        distribution_step = policy.distribution(self._time_step)
        self.assertIsInstance(distribution_step.action,
                              tfp.distributions.Deterministic)
        distribution_mean = distribution_step.action.mean()
        self.evaluate(tf.compat.v1.global_variables_initializer())
        actions_ = self.evaluate(action_step.action)
        distribution_mean_ = self.evaluate(distribution_mean)
        self.assertNear(actions_, distribution_mean_, 1e-6)
Example #11
    def testValue(self, network_cls):
        actor_network = network_cls(self._action_spec)
        value_network = DummyValueNet()

        policy = ppo_policy.PPOPolicy(self._time_step_spec,
                                      self._action_spec,
                                      actor_network=actor_network,
                                      value_network=value_network)

        batch_size = tf.compat.dimension_value(
            self._time_step.step_type.shape[0])
        policy_state = policy.get_initial_state(batch_size=batch_size)
        value_pred, unused_policy_state = policy.apply_value_network(
            self._time_step.observation, self._time_step.step_type,
            policy_state)
        self.assertEqual(value_pred.shape.as_list(), [1, 1])
        self.assertEqual(value_pred.dtype, tf.float32)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.evaluate(value_pred)
Example #12
    def testNonLegacyDistribution(self):
        if not tf.executing_eagerly():
            self.skipTest(
                'Skipping test: sequential networks not supported in TF1')

        actor_network = create_sequential_actor_net()
        action_spec = {'my_action': self._action_spec}
        value_network = DummyValueNet()

        policy = ppo_policy.PPOPolicy(self._time_step_spec,
                                      action_spec,
                                      actor_network=actor_network,
                                      value_network=value_network)

        distribution_step = policy.distribution(self._time_step)
        self.assertIsInstance(distribution_step.action['my_action'],
                              tfp.distributions.TransformedDistribution)

        expected_info_spec = {
            'dist_params': {
                'my_action': {
                    'bijector': {
                        'bijectors:0': {},
                        'bijectors:1': {},
                        'bijectors:2': {}
                    },
                    'distribution': {
                        'scale': tf.TensorSpec([1], tf.float32),
                        'loc': tf.TensorSpec([1], tf.float32)
                    },
                }
            },
            'value_prediction': tf.TensorSpec([1, 1], tf.float32)
        }

        tf.nest.map_structure(
            lambda v, s: self.assertEqual(tf.type_spec_from_value(v), s),
            distribution_step.info, expected_info_spec)
Example #13
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 optimizer=None,
                 actor_net=None,
                 value_net=None,
                 importance_ratio_clipping=0.0,
                 lambda_value=0.95,
                 discount_factor=0.99,
                 entropy_regularization=0.0,
                 policy_l2_reg=0.0,
                 value_function_l2_reg=0.0,
                 value_pred_loss_coef=0.5,
                 num_epochs=25,
                 use_gae=False,
                 use_td_lambda_return=False,
                 normalize_rewards=True,
                 reward_norm_clipping=10.0,
                 normalize_observations=True,
                 log_prob_clipping=0.0,
                 kl_cutoff_factor=2.0,
                 kl_cutoff_coef=1000.0,
                 initial_adaptive_kl_beta=1.0,
                 adaptive_kl_target=0.01,
                 adaptive_kl_tolerance=0.3,
                 gradient_clipping=None,
                 check_numerics=False,
                 debug_summaries=False,
                 summarize_grads_and_vars=False,
                 train_step_counter=None,
                 name=None):
        """Creates a PPO Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      optimizer: Optimizer to use for the agent.
      actor_net: A function actor_net(observations, action_spec) that returns
        tensor of action distribution params for each observation. Takes nested
        observation and returns nested action.
      value_net: A function value_net(time_steps) that returns value tensor from
        neural net predictions for each observation. Takes nested observation
        and returns batch of value_preds.
      importance_ratio_clipping: Epsilon in clipped, surrogate PPO objective.
        For more detail, see explanation at the top of the doc.
      lambda_value: Lambda parameter for TD-lambda computation.
      discount_factor: Discount factor for return computation.
      entropy_regularization: Coefficient for entropy regularization loss term.
      policy_l2_reg: Coefficient for l2 regularization of policy weights.
      value_function_l2_reg: Coefficient for l2 regularization of value function
        weights.
      value_pred_loss_coef: Multiplier for value prediction loss to balance with
        policy gradient loss.
      num_epochs: Number of epochs for computing policy updates.
      use_gae: If True (default False), uses generalized advantage estimation
        for computing per-timestep advantage. Else, just subtracts value
        predictions from empirical return.
      use_td_lambda_return: If True (default False), uses td_lambda_return for
        training value function. (td_lambda_return = gae_advantage +
        value_predictions)
      normalize_rewards: If true, keeps moving variance of rewards and
        normalizes incoming rewards.
      reward_norm_clipping: Value above and below to clip normalized reward.
      normalize_observations: If true, keeps moving mean and variance of
        observations and normalizes incoming observations.
      log_prob_clipping: +/- value for clipping log probs to prevent inf / NaN
        values.  Default: no clipping.
      kl_cutoff_factor: If policy KL changes more than this much for any single
        timestep, adds a squared KL penalty to loss function.
      kl_cutoff_coef: Loss coefficient for kl cutoff term.
      initial_adaptive_kl_beta: Initial value for beta coefficient of adaptive
        kl penalty.
      adaptive_kl_target: Desired kl target for policy updates. If actual kl is
        far from this target, adaptive_kl_beta will be updated.
      adaptive_kl_tolerance: A tolerance for adaptive_kl_beta. Mean KL above (1
        + tol) * adaptive_kl_target, or below (1 - tol) * adaptive_kl_target,
        will cause adaptive_kl_beta to be updated.
      gradient_clipping: Norm length to clip gradients.  Default: no clipping.
      check_numerics: If true, adds tf.debugging.check_numerics to help find
        NaN / Inf values. For debugging only.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If true, gradient summaries will be written.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      ValueError: If the actor_net is not a DistributionNetwork.
    """
        if not isinstance(actor_net, network.DistributionNetwork):
            raise ValueError(
                'actor_net must be an instance of a DistributionNetwork.')

        tf.Module.__init__(self, name=name)

        self._optimizer = optimizer
        self._actor_net = actor_net
        self._value_net = value_net
        self._importance_ratio_clipping = importance_ratio_clipping
        self._lambda = lambda_value
        self._discount_factor = discount_factor
        self._entropy_regularization = entropy_regularization
        self._policy_l2_reg = policy_l2_reg
        self._value_function_l2_reg = value_function_l2_reg
        self._value_pred_loss_coef = value_pred_loss_coef
        self._num_epochs = num_epochs
        self._use_gae = use_gae
        self._use_td_lambda_return = use_td_lambda_return
        self._reward_norm_clipping = reward_norm_clipping
        self._log_prob_clipping = log_prob_clipping
        self._kl_cutoff_factor = kl_cutoff_factor
        self._kl_cutoff_coef = kl_cutoff_coef
        self._adaptive_kl_target = adaptive_kl_target
        self._adaptive_kl_tolerance = adaptive_kl_tolerance
        self._gradient_clipping = gradient_clipping or 0.0
        self._check_numerics = check_numerics

        if initial_adaptive_kl_beta > 0.0:
            # TODO(kbanoop): Rename create_variable.
            self._adaptive_kl_beta = common.create_variable(
                'adaptive_kl_beta', initial_adaptive_kl_beta, dtype=tf.float32)
        else:
            self._adaptive_kl_beta = None

        self._reward_normalizer = None
        if normalize_rewards:
            self._reward_normalizer = tensor_normalizer.StreamingTensorNormalizer(
                tensor_spec.TensorSpec([], tf.float32),
                scope='normalize_reward')

        self._observation_normalizer = None
        if normalize_observations:
            self._observation_normalizer = (
                tensor_normalizer.StreamingTensorNormalizer(
                    time_step_spec.observation,
                    scope='normalize_observations'))

        policy = greedy_policy.GreedyPolicy(
            ppo_policy.PPOPolicy(
                time_step_spec=time_step_spec,
                action_spec=action_spec,
                actor_network=actor_net,
                value_network=value_net,
                observation_normalizer=self._observation_normalizer,
                clip=False,
                collect=False))

        collect_policy = ppo_policy.PPOPolicy(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_network=actor_net,
            value_network=value_net,
            observation_normalizer=self._observation_normalizer,
            clip=False,
            collect=True)

        self._action_distribution_spec = (self._actor_net.output_spec)

        super(PPOAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy,
                             train_sequence_length=None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter)
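
Below is a minimal construction sketch (not taken from the source) showing how the constructor documented above might be driven, assuming the standard tf_agents `ActorDistributionNetwork` and `ValueNetwork`; the specs, layer sizes and learning rate are illustrative only.

# Hypothetical construction sketch; specs and hyperparameters are illustrative.
import tensorflow as tf
from tf_agents.agents.ppo import ppo_agent
from tf_agents.networks import actor_distribution_network
from tf_agents.networks import value_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

obs_spec = tensor_spec.TensorSpec([4], tf.float32)
action_spec = tensor_spec.BoundedTensorSpec([2], tf.float32, minimum=-1.0, maximum=1.0)
time_step_spec = ts.time_step_spec(obs_spec)

# The agent expects a DistributionNetwork actor and a Network value function.
actor_net = actor_distribution_network.ActorDistributionNetwork(
    obs_spec, action_spec, fc_layer_params=(64, 64))
value_net = value_network.ValueNetwork(obs_spec, fc_layer_params=(64, 64))

agent = ppo_agent.PPOAgent(
    time_step_spec,
    action_spec,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=3e-4),
    actor_net=actor_net,
    value_net=value_net,
    num_epochs=10,
    use_gae=True,
    use_td_lambda_return=True)
agent.initialize()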
Example #14
  def __init__(self,
               time_step_spec,
               action_spec,
               optimizer=None,
               actor_net=None,
               value_net=None,
               importance_ratio_clipping=0.0,
               lambda_value=0.95,
               discount_factor=0.99,
               entropy_regularization=0.0,
               policy_l2_reg=0.0,
               value_function_l2_reg=0.0,
               shared_vars_l2_reg=0.0,
               value_pred_loss_coef=0.5,
               num_epochs=25,
               use_gae=False,
               use_td_lambda_return=False,
               normalize_rewards=True,
               reward_norm_clipping=10.0,
               normalize_observations=True,
               log_prob_clipping=0.0,
               kl_cutoff_factor=2.0,
               kl_cutoff_coef=1000.0,
               initial_adaptive_kl_beta=1.0,
               adaptive_kl_target=0.01,
               adaptive_kl_tolerance=0.3,
               gradient_clipping=None,
               check_numerics=False,
               compute_value_and_advantage_in_train=False,
               debug_summaries=False,
               summarize_grads_and_vars=False,
               train_step_counter=None,
               name=None):
    """Creates a PPO Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      optimizer: Optimizer to use for the agent, default to using
        `tf.compat.v1.train.AdamOptimizer`.
      actor_net: A `network.DistributionNetwork` which maps observations to
        action distributions. Commonly, it is set to
        `actor_distribution_network.ActorDistributionNetwork`.
      value_net: A `Network` which returns the value prediction for input
        states, with `call(observation, step_type, network_state)`. Commonly, it
        is set to `value_network.ValueNetwork`.
      importance_ratio_clipping: Epsilon in clipped, surrogate PPO objective.
        For more detail, see explanation at the top of the doc.
      lambda_value: Lambda parameter for TD-lambda computation.
      discount_factor: Discount factor for return computation. Defaults to
        `0.99`, which is the value used for all environments in (Schulman, 2017).
      entropy_regularization: Coefficient for entropy regularization loss term.
        Defaults to `0.0` because no entropy bonus was used in (Schulman, 2017).
      policy_l2_reg: Coefficient for L2 regularization of unshared actor_net
        weights. Defaults to `0.0` because no L2 regularization was applied on
        the policy network weights in (Schulman, 2017).
      value_function_l2_reg: Coefficient for L2 regularization of unshared value
        function weights. Defaults to `0.0` because no L2 regularization was
        applied on the value function weights in (Schulman, 2017).
      shared_vars_l2_reg: Coefficient for L2 regularization of weights shared
        between actor_net and value_net. Defaults to `0.0` because no L2
        regularization was applied on the policy network or value network
        weights in (Schulman, 2017).
      value_pred_loss_coef: Multiplier for value prediction loss to balance with
        policy gradient loss. Defaults to `0.5`, which was used for all
        environments in the OpenAI baseline implementation. This parameter is
        irrelevant unless you are sharing part of actor_net and value_net; in
        that case, you will want to tune this coefficient, whose value depends
        on the network architecture of your choice.
      num_epochs: Number of epochs for computing policy updates. (Schulman, 2017)
        sets this to 10 for Mujoco, 15 for Roboschool, and 3 for Atari.
      use_gae: If True (default False), uses generalized advantage estimation
        for computing per-timestep advantage. Else, just subtracts value
        predictions from empirical return.
      use_td_lambda_return: If True (default False), uses td_lambda_return for
        training the value function; here:
        `td_lambda_return = gae_advantage + value_predictions`.
        `use_gae` must be set to `True` as well to enable TD-lambda returns. If
        `use_td_lambda_return` is set to True while `use_gae` is False, the
        empirical return will be used and a warning will be logged.
      normalize_rewards: If true, keeps moving variance of rewards and
        normalizes incoming rewards. While not mentioned directly in (Schulman,
        2017), reward normalization was implemented in OpenAI baselines and
        (Ilyas et al., 2018) pointed out that it largely improves performance.
        You may refer to Figure 1 of https://arxiv.org/pdf/1811.02553.pdf for a
        comparison with and without reward scaling.
      reward_norm_clipping: Value above and below to clip normalized reward.
        Additional optimization proposed in (Ilyas et al., 2018) set to
        `5` or `10`.
      normalize_observations: If `True`, keeps moving mean and
        variance of observations and normalizes incoming observations.
        Additional optimization proposed in (Ilyas et al., 2018).
      log_prob_clipping: +/- value for clipping log probs to prevent inf / NaN
        values.  Default: no clipping.
      kl_cutoff_factor: Only meaningful when `kl_cutoff_coef > 0.0`. A multiplier
        used for calculating the KL cutoff
        (= `kl_cutoff_factor * adaptive_kl_target`). If the policy KL averaged
        across the batch changes by more than the cutoff, a squared cutoff loss
        is added to the loss function.
      kl_cutoff_coef: kl_cutoff_coef and kl_cutoff_factor are additional params
        to use if one wants a KL cutoff loss term in addition to the adaptive KL
        loss term. Set to `0.0` to disable the KL cutoff loss term, as it was
        not used in the paper. kl_cutoff_coef is the coefficient by which the KL
        cutoff loss term is multiplied before being added to the total loss.
      initial_adaptive_kl_beta: Initial value for beta coefficient of adaptive
        KL penalty. This initial value is not important in practice because the
        algorithm quickly adjusts to it. A common default is 1.0.
      adaptive_kl_target: Desired KL target for policy updates. If actual KL is
        far from this target, adaptive_kl_beta will be updated. You should tune
        this for your environment. 0.01 was found to perform well for Mujoco.
      adaptive_kl_tolerance: A tolerance for adaptive_kl_beta. Mean KL above
        `(1 + tol) * adaptive_kl_target`, or below
        `(1 - tol) * adaptive_kl_target`,
        will cause `adaptive_kl_beta` to be updated. `0.5` was chosen
        heuristically in the paper, but the algorithm is not very
        sensitive to it.
      gradient_clipping: Norm length to clip gradients.  Default: no clipping.
      check_numerics: If true, adds `tf.debugging.check_numerics` to help find
        NaN / Inf values. For debugging only.
      compute_value_and_advantage_in_train: A bool to indicate where value
        prediction and advantage calculation happen.  If True, both happen in
        agent.train(). If False, value prediction is computed during data
        collection. This argument must be set to `False` if mini batch learning
        is enabled.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If true, gradient summaries will be written.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall under
        that name. Defaults to the class name.

    Raises:
      ValueError: If the actor_net is not a DistributionNetwork or value_net is
        not a Network.
    """
    if not isinstance(actor_net, network.DistributionNetwork):
      raise ValueError(
          'actor_net must be an instance of a network.DistributionNetwork.')
    if not isinstance(value_net, network.Network):
      raise ValueError('value_net must be an instance of a network.Network.')

    actor_net.create_variables()
    value_net.create_variables()

    tf.Module.__init__(self, name=name)

    self._optimizer = optimizer
    self._actor_net = actor_net
    self._value_net = value_net
    self._importance_ratio_clipping = importance_ratio_clipping
    self._lambda = lambda_value
    self._discount_factor = discount_factor
    self._entropy_regularization = entropy_regularization
    self._policy_l2_reg = policy_l2_reg
    self._value_function_l2_reg = value_function_l2_reg
    self._shared_vars_l2_reg = shared_vars_l2_reg
    self._value_pred_loss_coef = value_pred_loss_coef
    self._num_epochs = num_epochs
    self._use_gae = use_gae
    self._use_td_lambda_return = use_td_lambda_return
    self._reward_norm_clipping = reward_norm_clipping
    self._log_prob_clipping = log_prob_clipping
    self._kl_cutoff_factor = kl_cutoff_factor
    self._kl_cutoff_coef = kl_cutoff_coef
    self._adaptive_kl_target = adaptive_kl_target
    self._adaptive_kl_tolerance = adaptive_kl_tolerance
    self._gradient_clipping = gradient_clipping or 0.0
    self._check_numerics = check_numerics
    self._compute_value_and_advantage_in_train = (
        compute_value_and_advantage_in_train)

    if initial_adaptive_kl_beta > 0.0:
      # TODO(kbanoop): Rename create_variable.
      self._adaptive_kl_beta = common.create_variable(
          'adaptive_kl_beta', initial_adaptive_kl_beta, dtype=tf.float32)
    else:
      self._adaptive_kl_beta = None

    self._reward_normalizer = None
    if normalize_rewards:
      self._reward_normalizer = tensor_normalizer.StreamingTensorNormalizer(
          tensor_spec.TensorSpec([], tf.float32), scope='normalize_reward')

    self._observation_normalizer = None
    if normalize_observations:
      self._observation_normalizer = (
          tensor_normalizer.StreamingTensorNormalizer(
              time_step_spec.observation, scope='normalize_observations'))

    policy = greedy_policy.GreedyPolicy(
        ppo_policy.PPOPolicy(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_network=actor_net,
            value_network=value_net,
            observation_normalizer=self._observation_normalizer,
            clip=False,
            collect=False))

    collect_policy = ppo_policy.PPOPolicy(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        actor_network=actor_net,
        value_network=value_net,
        observation_normalizer=self._observation_normalizer,
        clip=False,
        collect=True,
        compute_value_and_advantage_in_train=(
            self._compute_value_and_advantage_in_train),
    )

    self._action_distribution_spec = (self._actor_net.output_spec)

    super(PPOAgent, self).__init__(
        time_step_spec,
        action_spec,
        policy,
        collect_policy,
        train_sequence_length=None,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter)
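
The `use_gae` / `use_td_lambda_return` arguments above hinge on the docstring relation `td_lambda_return = gae_advantage + value_predictions`. Here is a tiny NumPy sketch of that relation for a single made-up trajectory (illustrative values, not library code):

# Illustration only: made-up rewards and value predictions for one 3-step trajectory.
import numpy as np

rewards = np.array([1.0, 1.0, 1.0], dtype=np.float32)
values = np.array([0.5, 0.4, 0.3], dtype=np.float32)   # V(s_t) predictions
final_value = 0.0                                       # bootstrap value after the last step
discount, lam = 0.99, 0.95                              # discount_factor, lambda_value

next_values = np.append(values[1:], final_value)
deltas = rewards + discount * next_values - values      # one-step TD errors

# Backward GAE recursion: A_t = delta_t + discount * lambda * A_{t+1}.
gae = np.zeros_like(values)
acc = 0.0
for t in reversed(range(len(rewards))):
    acc = deltas[t] + discount * lam * acc
    gae[t] = acc

# The docstring relation: td_lambda_return = gae_advantage + value_predictions.
td_lambda_return = gae + values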
Example #15
    def testPolicyStepWithActionMaskTurnedOn(self):
        # Create specs with action constraints (mask).
        num_categories = 5
        observation_tensor_spec = (
            tensor_spec.TensorSpec(shape=(3, ),
                                   dtype=tf.int64,
                                   name='network_spec'),
            tensor_spec.TensorSpec(shape=(num_categories, ),
                                   dtype=tf.bool,
                                   name='mask_spec'),
        )
        network_spec, _ = observation_tensor_spec
        action_tensor_spec = tensor_spec.BoundedTensorSpec((1, ), tf.int32, 0,
                                                           num_categories - 1)

        # Create policy with splitter.
        def splitter_fn(observation_and_mask):
            return observation_and_mask[0], observation_and_mask[1]

        actor_network = mask_splitter_network.MaskSplitterNetwork(
            splitter_fn,
            actor_distribution_network.ActorDistributionNetwork(
                network_spec, action_tensor_spec),
            passthrough_mask=True)
        value_network = mask_splitter_network.MaskSplitterNetwork(
            splitter_fn, value_net.ValueNetwork(network_spec))
        policy = ppo_policy.PPOPolicy(
            ts.time_step_spec(observation_tensor_spec),
            action_tensor_spec,
            actor_network=actor_network,
            value_network=value_network,
            clip=False)

        # Take a step.
        mask = np.array([True, False, True, False, True], dtype=bool)
        self.assertLen(mask, num_categories)
        time_step = ts.TimeStep(step_type=tf.constant([1], dtype=tf.int32),
                                reward=tf.constant([1], dtype=tf.float32),
                                discount=tf.constant([1], dtype=tf.float32),
                                observation=(tf.constant(
                                    [[1, 2, 3], [4, 5, 6]], dtype=tf.int64),
                                             tf.constant([mask.tolist()],
                                                         dtype=tf.bool)))
        action_step = policy.action(time_step)

        # Check the shape and type of the resulted action step.
        self.assertEqual(action_step.action.shape.as_list(), [2, 1])
        self.assertEqual(action_step.action.dtype, tf.int32)
        self.evaluate(tf.compat.v1.global_variables_initializer())

        # Check the actions in general and with respect to masking.
        actions = self.evaluate(action_step.action)
        self.assertTrue(np.all(actions >= action_tensor_spec.minimum))
        self.assertTrue(np.all(actions <= action_tensor_spec.maximum))

        # Check the logits.
        logits = np.array(self.evaluate(
            action_step.info['dist_params']['logits']),
                          dtype=np.float32)
        masked_actions = np.array(range(len(mask)))[~mask]
        self.assertTrue(
            np.all(logits[:, :, masked_actions] == np.finfo(np.float32).min))
        valid_actions = np.array(range(len(mask)))[mask]
        self.assertTrue(
            np.all(logits[:, :, valid_actions] > np.finfo(np.float32).min))