Example No. 1
    def compute_advantages(self, rewards, returns, discounts, value_preds):
        """Compute advantages, optionally using GAE.

        Based on baselines ppo1 implementation. Removes final timestep, as it needs
        to use this timestep for next-step value prediction for TD error
        computation.

        Args:
          rewards: Tensor of per-timestep rewards.
          returns: Tensor of per-timestep returns.
          discounts: Tensor of per-timestep discounts. Zero for terminal timesteps.
          value_preds: Cached value estimates from the data-collection policy.

        Returns:
          advantages: Tensor of length (len(rewards) - 1), because the final
            timestep is just used for next-step value prediction.
        """
        # Arg value_preds has the final next-step value appended; split it into
        #   the per-timestep value predictions and the final bootstrap value.
        final_value_pred = value_preds[:, -1]
        value_preds = value_preds[:, :-1]

        if not self._use_gae:
            with tf.name_scope('empirical_advantage'):
                advantages = returns - value_preds
        else:
            advantages = value_ops.generalized_advantage_estimation(
                values=value_preds,
                final_value=final_value_pred,
                rewards=rewards,
                discounts=discounts,
                td_lambda=self._lambda,
                time_major=False)

        return advantages
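
For reference, the GAE branch above delegates to value_ops.generalized_advantage_estimation, which computes the backward recursion delta_t = r_t + discount_t * V_{t+1} - V_t and A_t = delta_t + discount_t * td_lambda * A_{t+1}. Below is a minimal NumPy sketch of that recursion for a single unbatched trajectory, assuming the TF-Agents convention that discounts already include the discount factor and are zero at terminal steps; the helper name gae_reference is ours, not part of the library.

import numpy as np

def gae_reference(rewards, discounts, values, final_value, td_lambda):
    # Bootstrap: next_values[t] = V_{t+1}, with final_value standing in for V_T.
    next_values = np.append(values[1:], final_value)
    # One-step TD errors: delta_t = r_t + discount_t * V_{t+1} - V_t.
    deltas = rewards + discounts * next_values - values
    advantages = np.zeros_like(rewards)
    gae = 0.0
    # Backward recursion: A_t = delta_t + discount_t * td_lambda * A_{t+1}.
    for t in reversed(range(len(rewards))):
        gae = deltas[t] + discounts[t] * td_lambda * gae
        advantages[t] = gae
    return advantages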
Example No. 2
def compute_return_and_advantage(discount_factor, lambda_, rewards,
                                 next_time_steps, value_preds):
    """Compute the TD-lambda return and GAE(lambda) advantages.
    Normalization will be applied to the advantages.

    :param discount_factor: discount in [0,1]
    :param lambda_: trace_decay in [0,1]
    :param rewards: next_step rewards (possibly normalized)
    :param next_time_steps: batched tensor of TimeStep tuples after action is taken.
    :param value_preds: Batched value prediction tensor. Should have one more entry
        in time index than time_steps, with the final value corresponding to the
        value prediction of the final state.

    :return: tuple of (return, normalized_advantage), both are batched tensors.
    """
    discounts = next_time_steps.discount * tf.constant(discount_factor,
                                                       dtype=tf.float32)

    # Make discount 0.0 at the end of each episode so the cumulative sum
    #   restarts at episode boundaries.
    episode_mask = common.get_episode_mask(next_time_steps)
    discounts *= episode_mask

    # Arg value_preds has the final next-step value appended; split it into
    #   the per-timestep value predictions and the final bootstrap value.
    final_value_pred = value_preds[:, -1]
    value_preds = value_preds[:, :-1]

    # Compute advantages.
    advantages = value_ops.generalized_advantage_estimation(
        values=value_preds,
        final_value=final_value_pred,
        rewards=rewards,
        discounts=discounts,
        td_lambda=lambda_,
        time_major=False,
    )
    normalized_advantages = _normalize_advantages(advantages, axes=(0, 1))

    # Compute TD(lambda) returns.
    returns = tf.add(advantages, value_preds, name="td_lambda_returns")

    return returns, normalized_advantages
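
The helper _normalize_advantages is referenced above but not shown in this excerpt. A plausible minimal stand-in, assuming it simply standardizes the advantages to zero mean and unit variance over the batch and time axes; the signature, the variance_epsilon guard, and the use of tf.nn.moments are our assumptions, not the actual helper.

import tensorflow as tf

def _normalize_advantages(advantages, axes=(0, 1), variance_epsilon=1e-8):
    # Standardize advantages over the given axes; the epsilon guards against
    # division by zero when the variance is (near) zero.
    mean, variance = tf.nn.moments(advantages, axes=list(axes), keepdims=True)
    return (advantages - mean) / tf.sqrt(variance + variance_epsilon)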
Example No. 3
    def testAdvantagesAreCorrectlyComputed(self, batch_size, num_time_steps,
                                           td_lambda):
        rewards = np.random.rand(num_time_steps, batch_size).astype(np.float32)
        discounts = np.random.rand(num_time_steps,
                                   batch_size).astype(np.float32)
        values = np.random.rand(num_time_steps, batch_size).astype(np.float32)
        final_value = np.random.rand(batch_size).astype(np.float32)
        ground_truth = _naive_gae_as_ground_truth(discounts=discounts,
                                                  rewards=rewards,
                                                  values=values,
                                                  final_value=final_value,
                                                  td_lambda=td_lambda)

        advantages = value_ops.generalized_advantage_estimation(
            discounts=discounts,
            rewards=rewards,
            values=values,
            final_value=final_value,
            td_lambda=td_lambda)

        self.assertAllClose(advantages, ground_truth)
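
The helper _naive_gae_as_ground_truth is not included in this excerpt. A sketch of what such a ground-truth computation could look like: rather than the backward recursion, it evaluates the lambda- and discount-weighted sum of TD errors directly on time-major [T, B] arrays, which makes it a useful independent check against the library implementation. The exact body and shapes here are our assumptions.

import numpy as np

def _naive_gae_as_ground_truth(discounts, rewards, values, final_value,
                               td_lambda):
    # next_values[t] = V_{t+1}, with final_value as the bootstrap value V_T.
    next_values = np.concatenate([values[1:], final_value[None, :]], axis=0)
    deltas = rewards + discounts * next_values - values
    num_time_steps = len(rewards)
    advantages = np.zeros_like(deltas)
    for s in range(num_time_steps):
        # A_s = sum_t (product of discount * td_lambda between s and t) * delta_t.
        weight = np.ones_like(final_value)
        for t in range(s, num_time_steps):
            advantages[s] += weight * deltas[t]
            weight = weight * discounts[t] * td_lambda
    return advantages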
Example No. 4
    def testAdvantagesMatchPrecomputedResult(self):
        advantages = value_ops.generalized_advantage_estimation(
            discounts=tf.constant(
                [[1.0, 1.0, 1.0, 1.0, 0.0, 0.9, 0.9, 0.9, 0.0],
                 [1.0, 1.0, 1.0, 1.0, 0.0, 0.9, 0.9, 0.9, 0.0]]),
            rewards=tf.fill([2, 9], 1.0),
            values=tf.fill([2, 9], 3.0),
            final_value=tf.fill([2], 3.0),
            td_lambda=0.95,
            time_major=False)

        # Precomputed according to equation (16) in the paper.
        ground_truth = tf.constant(
            [[2.0808625, 1.13775, 0.145, -0.9, -2.0,
              0.56016475, -0.16355, -1.01, -2.0],
             [2.0808625, 1.13775, 0.145, -0.9, -2.0,
              0.56016475, -0.16355, -1.01, -2.0]])

        self.assertAllClose(advantages, ground_truth)
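
As a sanity check on the precomputed row, the recursion can be unrolled by hand from the terminal step backwards. With rewards of 1.0, values of 3.0, and td_lambda = 0.95, the per-step weight inside the second episode is 0.9 * 0.95 = 0.855, and the last four entries of the ground truth fall out as plain arithmetic on the constants above.

# delta_t = r_t + discount_t * V_{t+1} - V_t; A_t = delta_t + discount_t * 0.95 * A_{t+1}.
A8 = 1.0 + 0.0 * 3.0 - 3.0                   # -2.0 (discount 0 ends the episode)
A7 = (1.0 + 0.9 * 3.0 - 3.0) + 0.855 * A8    # 0.7 - 1.71        = -1.01
A6 = (1.0 + 0.9 * 3.0 - 3.0) + 0.855 * A7    # 0.7 - 0.86355     = -0.16355
A5 = (1.0 + 0.9 * 3.0 - 3.0) + 0.855 * A6    # 0.7 - 0.13983525  =  0.56016475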