Example #1
 def _create_variables(self):
     """Creates the variables needed for EMATensorNormalizer."""
     self._mean_moving_avg = nest.map_structure(
         lambda spec: create_counter('mean', 0, spec.shape, tf.float32),
         self._tensor_spec)
     self._var_moving_avg = nest.map_structure(
         lambda spec: create_counter('var', 1, spec.shape, tf.float32),
         self._tensor_spec)
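The `create_counter` calls above allocate the moving-average variables but do not show how they are consumed. The sketch below assumes a simple exponential-moving-average update with an illustrative `decay` value and arbitrary shapes; it stands in for what such variables are typically used for and is not the EMATensorNormalizer's actual update op.

import tensorflow as tf

# Plain tf.Variables standing in for the moving averages created above
# via create_counter (shapes chosen arbitrarily for the example).
mean_avg = tf.Variable(tf.zeros([3]), name='mean')
var_avg = tf.Variable(tf.ones([3]), name='var')

def ema_update(batch, decay=0.999):
  # Move the running mean/variance a small step toward the batch statistics.
  batch_mean = tf.reduce_mean(batch, axis=0)
  batch_var = tf.reduce_mean(tf.square(batch - batch_mean), axis=0)
  mean_avg.assign_add((1.0 - decay) * (batch_mean - mean_avg))
  var_avg.assign_add((1.0 - decay) * (batch_var - var_avg))

ema_update(tf.random.normal([32, 3]))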
Example #2
 def _create_variables(self):
     """Uses self._scope and creates all variables needed for the normalizer."""
     self._count = nest.map_structure(
         lambda spec: create_counter('count', 1e-8, spec.shape, tf.float32),
         self._tensor_spec)
     self._mean_sum = nest.map_structure(
         lambda spec: create_counter('mean_sum', 0, spec.shape, tf.float32),
         self._tensor_spec)
     self._var_sum = nest.map_structure(
         lambda spec: create_counter('var_sum', 0, spec.shape, tf.float32),
         self._tensor_spec)
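For the streaming normalizer, `count`, `mean_sum`, and `var_sum` are running sums from which mean and variance are recovered by division. The sketch below shows one common way such sums can be accumulated and applied; the `streaming_update`/`normalize` helpers, the accumulation order, and the `epsilon` value are illustrative assumptions, not the library's implementation.

import tensorflow as tf

# Stand-ins for the count / mean_sum / var_sum variables created above.
count = tf.Variable(1e-8)
mean_sum = tf.Variable(0.0)
var_sum = tf.Variable(0.0)

def streaming_update(batch):
  # Accumulate running sums; mean and variance are recovered by dividing by count.
  n = tf.cast(tf.size(batch), tf.float32)
  count.assign_add(n)
  mean_sum.assign_add(tf.reduce_sum(batch))
  mean = mean_sum / count
  var_sum.assign_add(tf.reduce_sum(tf.square(batch - mean)))

def normalize(batch, epsilon=1e-6):
  mean = mean_sum / count
  var = var_sum / count
  return (batch - mean) / tf.sqrt(var + epsilon)

streaming_update(tf.random.normal([128]))
print(normalize(tf.constant([0.5, -0.5])).numpy())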
Example #3
 def __init__(self, name='NumberOfEpisodes', dtype=tf.int64):
     super(NumberOfEpisodes, self).__init__(name=name)
     self.dtype = dtype
     self.number_episodes = common.create_counter(initial_value=0,
                                                  dtype=self.dtype,
                                                  shape=(),
                                                  name='number_episodes')
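The counter created here is typically advanced as batches of steps are observed. A minimal sketch, assuming a hypothetical `record_batch` helper that receives the batch's `is_last` flags:

import tensorflow as tf

number_episodes = tf.Variable(0, dtype=tf.int64, name='number_episodes')

def record_batch(is_last):
  # Each True flag marks the final step of an episode in the batch.
  number_episodes.assign_add(tf.reduce_sum(tf.cast(is_last, tf.int64)))

record_batch(tf.constant([False, True, False, True]))
print(number_episodes.numpy())  # 2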
Example #4
 def __init__(self, name='EnvironmentSteps', dtype=tf.int64):
     super(EnvironmentSteps, self).__init__(name=name)
     self.dtype = dtype
     self.environment_steps = common.create_counter(
         initial_value=0,
         dtype=self.dtype,
         shape=(),
         name='environment_steps')
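EnvironmentSteps differs from the episode metric in that it counts transitions rather than episode ends. A minimal sketch under the assumption that boundary (reset) steps are excluded; `record_batch` is again a hypothetical helper:

import tensorflow as tf

environment_steps = tf.Variable(0, dtype=tf.int64, name='environment_steps')

def record_batch(is_boundary):
  # Boundary (reset) steps are not environment transitions, so skip them.
  steps = tf.reduce_sum(tf.cast(tf.logical_not(is_boundary), tf.int64))
  environment_steps.assign_add(steps)

record_batch(tf.constant([False, False, True, False]))
print(environment_steps.numpy())  # 3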
Example #5
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 critic_network,
                 actor_network,
                 actor_optimizer,
                 critic_optimizer,
                 alpha_optimizer,
                 actor_policy_ctor=actor_policy.ActorPolicy,
                 squash_actions=True,
                 target_update_tau=1.0,
                 target_update_period=1,
                 td_errors_loss_fn=tf.math.squared_difference,
                 gamma=1.0,
                 reward_scale_factor=1.0,
                 initial_log_alpha=0.0,
                 target_entropy=None,
                 gradient_clipping=None,
                 debug_summaries=False,
                 summarize_grads_and_vars=False,
                 name=None):
        """Creates a SAC Agent.

        Args:
          time_step_spec: A `TimeStep` spec of the expected time_steps.
          action_spec: A nest of BoundedTensorSpec representing the actions.
          critic_network: A function critic_network((observations, actions)) that
            returns the q_values for each observation and action.
          actor_network: A function actor_network(observation, action_spec) that
            returns an action distribution.
          actor_optimizer: The optimizer to use for the actor network.
          critic_optimizer: The default optimizer to use for the critic network.
          alpha_optimizer: The default optimizer to use for the alpha variable.
          actor_policy_ctor: The policy class to use.
          squash_actions: Whether or not to use tanh to squash actions between
            -1 and 1.
          target_update_tau: Factor for soft update of the target networks.
          target_update_period: Period for soft update of the target networks.
          td_errors_loss_fn: A function for computing the elementwise TD errors
            loss.
          gamma: A discount factor for future rewards.
          reward_scale_factor: Multiplicative scale for the reward.
          initial_log_alpha: Initial value for log_alpha.
          target_entropy: The target average policy entropy, for updating alpha.
          gradient_clipping: Norm length to clip gradients.
          debug_summaries: A bool to gather debug summaries.
          summarize_grads_and_vars: If True, gradient and network variable
            summaries will be written during training.
          name: The name of this agent. All variables in this module will fall
            under that name. Defaults to the class name.
        """
        tf.Module.__init__(self, name=name)

        self._critic_network1 = critic_network
        self._critic_network2 = critic_network.copy(name='CriticNetwork2')
        self._target_critic_network1 = critic_network.copy(
            name='TargetCriticNetwork1')
        self._target_critic_network2 = critic_network.copy(
            name='TargetCriticNetwork2')
        self._actor_network = actor_network

        policy = actor_policy_ctor(time_step_spec=time_step_spec,
                                   action_spec=action_spec,
                                   actor_network=self._actor_network)

        self._log_alpha = common_utils.create_counter(
            'initial_log_alpha',
            initial_value=initial_log_alpha,
            dtype=tf.float32,
            trainable=True)

        # If target_entropy was not passed, set it to the negative of the total
        # number of action dimensions.
        if target_entropy is None:
            flat_action_spec = tf.nest.flatten(action_spec)
            target_entropy = -np.sum([
                np.prod(single_spec.shape.as_list())
                for single_spec in flat_action_spec
            ])

        self._squash_actions = squash_actions
        self._target_update_tau = target_update_tau
        self._target_update_period = target_update_period
        self._actor_optimizer = actor_optimizer
        self._critic_optimizer = critic_optimizer
        self._alpha_optimizer = alpha_optimizer
        self._td_errors_loss_fn = td_errors_loss_fn
        self._gamma = gamma
        self._reward_scale_factor = reward_scale_factor
        self._target_entropy = target_entropy
        self._gradient_clipping = gradient_clipping
        self._debug_summaries = debug_summaries
        self._summarize_grads_and_vars = summarize_grads_and_vars

        super(SacAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy=policy,
                             collect_policy=policy,
                             train_sequence_length=2,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars)
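The `_log_alpha` variable is created trainable because SAC adjusts the entropy temperature by gradient descent. The following sketch shows the standard SAC temperature loss that such a variable enables; the optimizer, learning rate, `target_entropy` value, and `log_probs` input are assumptions for illustration, not the agent's actual training code.

import tensorflow as tf

log_alpha = tf.Variable(0.0, trainable=True, name='log_alpha')  # initial_log_alpha = 0.0
target_entropy = -2.0  # e.g. the negative action dimensionality computed above
optimizer = tf.keras.optimizers.Adam(3e-4)

def alpha_train_step(log_probs):
  # Standard SAC temperature loss: adjust alpha so the policy's entropy tracks
  # target_entropy. Optimizing log_alpha keeps alpha positive.
  with tf.GradientTape() as tape:
    loss = -tf.reduce_mean(log_alpha * tf.stop_gradient(log_probs + target_entropy))
  grads = tape.gradient(loss, [log_alpha])
  optimizer.apply_gradients(zip(grads, [log_alpha]))
  return tf.exp(log_alpha)  # the temperature used in the actor/critic losses

alpha = alpha_train_step(tf.constant([-1.3, -0.7, -2.1]))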
Example #6
 def testInitialValueWithShape(self):
     counter = common.create_counter('counter', 1, shape=(2, ))
     self.evaluate(tf.compat.v1.global_variables_initializer())
     self.assertAllEqual(self.evaluate(counter), [1, 1])
Example #7
 def testMultipleCounters(self):
     counter1 = common.create_counter('counter', 1)
     counter2 = common.create_counter('counter', 2)
     self.evaluate(tf.compat.v1.global_variables_initializer())
     self.assertEqual(self.evaluate(counter1), 1)
     self.assertEqual(self.evaluate(counter2), 2)
Example #8
 def testIncrement(self):
     counter = common.create_counter('counter', 0)
     inc_counter = counter.assign_add(1)
     self.evaluate(tf.compat.v1.global_variables_initializer())
     self.assertEqual(self.evaluate(inc_counter), 1)
Example #9
 def testInitialValue(self):
     counter = common.create_counter('counter', 1)
     self.evaluate(tf.compat.v1.global_variables_initializer())
     self.assertEqual(self.evaluate(counter), 1)
Example #10
 def testDefaults(self):
     counter = common.create_counter('counter')
     self.evaluate(tf.compat.v1.global_variables_initializer())
     self.assertEqual(self.evaluate(counter), 0)
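The tests in examples #6-#10 pin down the contract of `common.create_counter`: it defaults to 0, accepts an initial value and a shape (filling the shape with that value), supports `assign_add`, and allows several counters with the same name. A minimal sketch consistent with those tests, with defaults inferred rather than copied from TF-Agents (note the TODO in example #11 below about renaming it to `create_variable`):

import tensorflow as tf

def create_counter(name, initial_value=0, shape=(), dtype=tf.int64, trainable=False):
  # A variable of the given shape filled with initial_value. The defaults here
  # are inferred from the tests above, not copied from the TF-Agents source.
  return tf.Variable(
      tf.fill(shape, tf.constant(initial_value, dtype=dtype)),
      trainable=trainable,
      name=name)

counter = create_counter('counter', 1, shape=(2,))
print(counter.numpy())                                      # [1 1]
print(counter.assign_add(tf.ones((2,), tf.int64)).numpy())  # [2 2]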
Example #11
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 optimizer=None,
                 actor_net=None,
                 value_net=None,
                 importance_ratio_clipping=0.0,
                 lambda_value=0.95,
                 discount_factor=0.99,
                 entropy_regularization=0.0,
                 policy_l2_reg=0.0,
                 value_function_l2_reg=0.0,
                 value_pred_loss_coef=0.5,
                 num_epochs=25,
                 use_gae=False,
                 use_td_lambda_return=False,
                 normalize_rewards=True,
                 reward_norm_clipping=10.0,
                 normalize_observations=True,
                 log_prob_clipping=0.0,
                 kl_cutoff_factor=2.0,
                 kl_cutoff_coef=1000.0,
                 initial_adaptive_kl_beta=1.0,
                 adaptive_kl_target=0.01,
                 adaptive_kl_tolerance=0.3,
                 gradient_clipping=None,
                 check_numerics=False,
                 debug_summaries=False,
                 summarize_grads_and_vars=False):
        """Creates a PPO Agent.

        Args:
          time_step_spec: A `TimeStep` spec of the expected time_steps.
          action_spec: A nest of BoundedTensorSpec representing the actions.
          optimizer: Optimizer to use for the agent.
          actor_net: A function actor_net(observations, action_spec) that returns
            a tensor of action distribution params for each observation. Takes
            nested observation and returns nested action.
          value_net: A function value_net(time_steps) that returns a value tensor
            from neural net predictions for each observation. Takes nested
            observation and returns batch of value_preds.
          importance_ratio_clipping: Epsilon in clipped, surrogate PPO objective.
            For more detail, see explanation at the top of the doc.
          lambda_value: Lambda parameter for TD-lambda computation.
          discount_factor: Discount factor for return computation.
          entropy_regularization: Coefficient for entropy regularization loss term.
          policy_l2_reg: Coefficient for l2 regularization of policy weights.
          value_function_l2_reg: Coefficient for l2 regularization of value
            function weights.
          value_pred_loss_coef: Multiplier for value prediction loss to balance
            with policy gradient loss.
          num_epochs: Number of epochs for computing policy updates.
          use_gae: If True (default False), uses generalized advantage estimation
            for computing per-timestep advantage. Else, just subtracts value
            predictions from empirical return.
          use_td_lambda_return: If True (default False), uses td_lambda_return for
            training value function.
            (td_lambda_return = gae_advantage + value_predictions)
          normalize_rewards: If true, keeps moving variance of rewards and
            normalizes incoming rewards.
          reward_norm_clipping: Value above and below which to clip normalized
            reward.
          normalize_observations: If true, keeps moving mean and variance of
            observations and normalizes incoming observations.
          log_prob_clipping: +/- value for clipping log probs to prevent inf / NaN
            values.  Default: no clipping.
          kl_cutoff_factor: If policy KL changes more than this much for any single
            timestep, adds a squared KL penalty to loss function.
          kl_cutoff_coef: Loss coefficient for kl cutoff term.
          initial_adaptive_kl_beta: Initial value for beta coefficient of adaptive
            kl penalty.
          adaptive_kl_target: Desired kl target for policy updates. If actual kl is
            far from this target, adaptive_kl_beta will be updated.
          adaptive_kl_tolerance: A tolerance for adaptive_kl_beta. Mean KL above
            (1 + tol) * adaptive_kl_target, or below (1 - tol) * adaptive_kl_target,
            will cause adaptive_kl_beta to be updated.
          gradient_clipping: Norm length to clip gradients.  Default: no clipping.
          check_numerics: If true, adds tf.check_numerics to help find NaN / Inf
            values. For debugging only.
          debug_summaries: A bool to gather debug summaries.
          summarize_grads_and_vars: If true, gradient summaries will be written.

        Raises:
          ValueError: If the actor_net is not a DistributionNetwork.
        """
        if not isinstance(actor_net, network.DistributionNetwork):
            raise ValueError(
                'actor_net must be an instance of a DistributionNetwork.')

        self._optimizer = optimizer
        self._actor_net = actor_net
        self._value_net = value_net
        self._importance_ratio_clipping = importance_ratio_clipping
        self._lambda = lambda_value
        self._discount_factor = discount_factor
        self._entropy_regularization = entropy_regularization
        self._policy_l2_reg = policy_l2_reg
        self._value_function_l2_reg = value_function_l2_reg
        self._value_pred_loss_coef = value_pred_loss_coef
        self._num_epochs = num_epochs
        self._use_gae = use_gae
        self._use_td_lambda_return = use_td_lambda_return
        self._reward_norm_clipping = reward_norm_clipping
        self._log_prob_clipping = log_prob_clipping
        self._kl_cutoff_factor = kl_cutoff_factor
        self._kl_cutoff_coef = kl_cutoff_coef
        self._adaptive_kl_target = adaptive_kl_target
        self._adaptive_kl_tolerance = adaptive_kl_tolerance
        self._gradient_clipping = gradient_clipping or 0.0
        self._check_numerics = check_numerics

        if initial_adaptive_kl_beta > 0.0:
            # TODO(kbanoop): Rename create_variable.
            self._adaptive_kl_beta = common_utils.create_counter(
                'adaptive_kl_beta', initial_adaptive_kl_beta, dtype=tf.float32)
        else:
            self._adaptive_kl_beta = None

        self._reward_normalizer = None
        if normalize_rewards:
            self._reward_normalizer = tensor_normalizer.StreamingTensorNormalizer(
                tensor_spec.TensorSpec([], tf.float32),
                scope='normalize_reward')

        self._observation_normalizer = None
        if normalize_observations:
            self._observation_normalizer = (
                tensor_normalizer.StreamingTensorNormalizer(
                    time_step_spec.observation,
                    scope='normalize_observations'))

        policy = greedy_policy.GreedyPolicy(
            ppo_policy.PPOPolicy(
                time_step_spec=time_step_spec,
                action_spec=action_spec,
                actor_network=actor_net,
                value_network=value_net,
                observation_normalizer=self._observation_normalizer,
                clip=False,
                collect=False))

        collect_policy = ppo_policy.PPOPolicy(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_network=actor_net,
            value_network=value_net,
            observation_normalizer=self._observation_normalizer,
            clip=False,
            collect=True)

        self._action_distribution_spec = self._actor_net.output_spec

        super(PPOAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy,
                             train_sequence_length=None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars)
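`_adaptive_kl_beta`, together with `adaptive_kl_target` and `adaptive_kl_tolerance`, drives PPO's adaptive KL penalty. The sketch below shows a common update scheme for such a coefficient; the growth/shrink factor of 1.5 is an illustrative choice and not necessarily the value this agent uses.

import tensorflow as tf

adaptive_kl_beta = tf.Variable(1.0, dtype=tf.float32)  # initial_adaptive_kl_beta
adaptive_kl_target = 0.01
adaptive_kl_tolerance = 0.3

def update_adaptive_kl_beta(mean_kl):
  # Grow beta when the observed KL overshoots the target band, shrink it when
  # the KL undershoots; 1.5 is an illustrative factor, not the library's value.
  if mean_kl > (1.0 + adaptive_kl_tolerance) * adaptive_kl_target:
    adaptive_kl_beta.assign(adaptive_kl_beta * 1.5)
  elif mean_kl < (1.0 - adaptive_kl_tolerance) * adaptive_kl_target:
    adaptive_kl_beta.assign(adaptive_kl_beta / 1.5)
  return adaptive_kl_beta

update_adaptive_kl_beta(0.05)  # KL well above the band, so beta increases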