Example #1
    def _value_loss(self, observ, reward, length):
        """Compute the loss function for the value baseline.

        The value loss is the difference between empirical and approximated
        returns over the collected episodes. Returns the loss tensor and a
        summary tensor.

        Args:
          observ: Sequences of observations.
          reward: Sequences of rewards.
          length: Batch of sequence lengths.

        Returns:
          Tuple of loss tensor and summary tensor.
        """
        with tf.name_scope('value_loss'):
            value = self._network(observ, length).value
            return_ = utility.discounted_return(reward, length,
                                                self._config.discount)
            advantage = return_ - value
            value_loss = 0.5 * self._mask(advantage**2, length)
            summary = tf.summary.merge([
                tf.summary.histogram('value_loss', value_loss),
                tf.summary.scalar('avg_value_loss', tf.reduce_mean(value_loss))
            ])
            value_loss = tf.reduce_mean(value_loss)
            return tf.check_numerics(value_loss, 'value_loss'), summary
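
The method leans on two helpers that are not shown here: `utility.discounted_return`, which accumulates discounted rewards over each episode, and `self._mask`, which zeroes out entries past each sequence's valid length before the mean is taken. The NumPy sketch below illustrates what these helpers are assumed to compute; the names, shapes, and zero-padding convention are assumptions for illustration, not the project's actual implementation.

import numpy as np


def discounted_return(reward, length, discount):
    """Monte-Carlo return G_t = r_t + discount * G_{t+1} for each sequence.

    reward: float array of shape [batch, max_steps], zero-padded.
    length: int array of shape [batch] with the valid steps per sequence.
    """
    return_ = np.zeros_like(reward)
    for b in range(reward.shape[0]):
        running = 0.0
        for t in reversed(range(length[b])):
            running = reward[b, t] + discount * running
            return_[b, t] = running
    return return_


def mask(tensor, length):
    """Zero out entries beyond each sequence's valid length."""
    steps = np.arange(tensor.shape[1])[None, :]
    return np.where(steps < length[:, None], tensor, 0.0)


# Two episodes of lengths 3 and 2, discount 0.9.
reward = np.array([[1.0, 1.0, 1.0, 0.0], [2.0, 1.0, 0.0, 0.0]])
length = np.array([3, 2])
print(discounted_return(reward, length, 0.9))
# First row starts with 1 + 0.9 * (1 + 0.9 * 1) = 2.71.
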
Example #2
    def _update_policy(self, observ, action, old_mean, old_logstd, reward,
                       length):
        """Perform multiple update steps of the policy.

        The advantage is computed once at the beginning and shared across all
        update iterations. Only one iteration's summary can be returned, so we
        use the one from the iteration halfway through.

        Args:
          observ: Sequences of observations.
          action: Sequences of actions.
          old_mean: Sequences of action means of the behavioral policy.
          old_logstd: Sequences of action log stddevs of the behavioral policy.
          reward: Sequences of rewards.
          length: Batch of sequence lengths.

        Returns:
          Summary tensor.
        """
        with tf.name_scope('update_policy'):
            return_ = utility.discounted_return(reward, length,
                                                self._config.discount)
            value = self._network(observ, length).value
            if self._config.gae_lambda:
                advantage = utility.lambda_return(reward, value, length,
                                                  self._config.discount,
                                                  self._config.gae_lambda)
            else:
                advantage = return_ - value
            mean, variance = tf.nn.moments(advantage,
                                           axes=[0, 1],
                                           keep_dims=True)
            advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8)
            advantage = tf.Print(
                advantage, [tf.reduce_mean(return_),
                            tf.reduce_mean(value)], 'return and value: ')
            advantage = tf.Print(advantage, [tf.reduce_mean(advantage)],
                                 'normalized advantage: ')
            # pylint: disable=g-long-lambda
            loss, summary = tf.scan(
                lambda _1, _2: self._update_policy_step(
                    observ, action, old_mean, old_logstd, advantage, length),
                tf.range(self._config.update_epochs_policy), [0., ''],
                parallel_iterations=1)
            print_loss = tf.Print(0, [tf.reduce_mean(loss)], 'policy loss: ')
            with tf.control_dependencies([loss, print_loss]):
                return summary[self._config.update_epochs_policy // 2]
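
When `gae_lambda` is set, the lambda return produced by `utility.lambda_return` is used directly as the advantage signal (as in the code above); otherwise the advantage is the Monte-Carlo return minus the value baseline. Either way the result is normalized to roughly zero mean and unit variance. The sketch below shows the standard lambda-return recursion and the normalization step; it is a minimal illustration assuming zero-padded sequences, and the project's actual `utility.lambda_return` may differ in details such as how padding and the final bootstrap value are handled.

import numpy as np


def lambda_return(reward, value, length, discount, lambda_):
    """Standard lambda-return recursion over zero-padded sequences.

    R_t = r_t + discount * ((1 - lambda_) * V_{t+1} + lambda_ * R_{t+1}),
    treating V and R past the episode end as zero. lambda_ = 1 gives the
    Monte-Carlo return, lambda_ = 0 the one-step TD target.
    """
    return_ = np.zeros_like(reward)
    for b in range(reward.shape[0]):
        running = 0.0
        for t in reversed(range(length[b])):
            next_value = value[b, t + 1] if t + 1 < length[b] else 0.0
            running = reward[b, t] + discount * (
                (1.0 - lambda_) * next_value + lambda_ * running)
            return_[b, t] = running
    return return_


def normalize_advantage(advantage, epsilon=1e-8):
    """Shift and scale advantages to roughly zero mean and unit variance."""
    return (advantage - advantage.mean()) / (advantage.std() + epsilon)

Normalizing the advantage keeps the policy loss on a comparable scale from batch to batch, which makes a fixed learning rate behave more consistently across the multiple update epochs run by `tf.scan`.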