Example No. 1
    def compute_return(self, next_time_steps, value_preds):
        """Compute the Monte Carlo return
        Args:
          next_time_steps: batched tensor of TimeStep tuples after action is taken.
          value_preds: Batched value prediction tensor. Should have one more entry
            in time index than time_steps, with the final value corresponding to the
            value prediction of the final state.
        Returns:
          Batched tensor of Monte Carlo returns.
        """
        discounts = next_time_steps.discount * tf.constant(
            self._discount_factor, dtype=tf.float32)

        rewards = next_time_steps.reward

        # Normalize rewards if self._reward_normalizer is defined.
        if self._reward_normalizer:
            rewards = self._reward_normalizer.normalize(
                rewards,
                center_mean=False,
                clip_value=self._reward_norm_clipping)

        # Make discount 0.0 at the end of each episode so the cumulative sum
        #   restarts at the start of the next episode.
        episode_mask = common.get_episode_mask(next_time_steps)
        discounts *= episode_mask

        # Compute Monte Carlo returns.
        final_vpreds = value_preds[:, -1, :]
        returns = cat_discounted_return(rewards, discounts, final_vpreds)

        return returns

    def compute_return_and_advantage(self, next_time_steps, value_preds):
        """Compute the Monte Carlo return and advantage.
    Normalazation will be applied to the computed returns and advantages if
    it's enabled.
    Args:
      next_time_steps: batched tensor of TimeStep tuples after action is taken.
      value_preds: Batched value prediction tensor. Should have one more entry
        in time index than time_steps, with the final value corresponding to the
        value prediction of the final state.
    Returns:
      tuple of (return, normalized_advantage), both are batched tensors.
    """

        discounts = next_time_steps.discount * tf.constant(
            self._discount_factor, dtype=tf.float32)

        rewards = next_time_steps.reward

        # Normalize rewards if self._reward_normalizer is defined.
        if self._reward_normalizer:
            rewards = self._reward_normalizer.normalize(
                rewards,
                center_mean=False,
                clip_value=self._reward_norm_clipping)

        #print("rew_n",rewards)
        # Make discount 0.0 at the end of each episode so the cumulative sum
        #   restarts at the start of the next episode.
        episode_mask = common.get_episode_mask(next_time_steps)
        discounts *= episode_mask

        # Compute Monte Carlo returns.
        returns = value_ops.discounted_return(rewards,
                                              discounts,
                                              time_major=False)
        #print("RET",returns)
        # Compute advantages.
        advantages = self.compute_advantages(rewards, returns, discounts,
                                             value_preds)
        normalized_advantages = _normalize_advantages(advantages, axes=(0, 1))

        # Return TD-Lambda returns if both use_td_lambda_return and use_gae are enabled.
        if self._use_td_lambda_return:
            if not self._use_gae:
                logging.warning(
                    'use_td_lambda_return was True, but use_gae was '
                    'False. Using Monte Carlo return.')
            else:
                returns = tf.add(advantages,
                                 value_preds[:, :-1],
                                 name='td_lambda_returns')

        return returns, normalized_advantages
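
For reference, the reverse cumulative sum that value_ops.discounted_return performs can be sketched in a few lines of NumPy. This is an illustrative reimplementation for a single unbatched sequence with per-step discounts and an optional bootstrap value; it is not the TF-Agents implementation itself.

import numpy as np

def discounted_return_sketch(rewards, discounts, final_value=0.0):
    # Accumulate backwards in time: G_t = r_t + gamma_t * G_{t+1}.
    # A discount of 0.0 at an episode boundary resets the accumulation,
    # which is exactly what the episode mask above arranges.
    returns = np.zeros(len(rewards))
    running = final_value
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discounts[t] * running
        returns[t] = running
    return returns

# Two episodes separated by a zero discount:
print(discounted_return_sketch([1.0, 1.0, 1.0, 1.0], [0.9, 0.0, 0.9, 0.0]))
# -> [1.9 1.  1.9 1. ]
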
Example No. 3
  def test(self):
    first = ts.StepType.FIRST
    mid = ts.StepType.MID
    last = ts.StepType.LAST
    step_types = [first, mid, mid, last, mid, mid, mid, last]
    discounts = [1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0]
    time_steps = ts.TimeStep(
        step_type=step_types, discount=discounts, reward=discounts,
        observation=discounts)
    episode_mask = common.get_episode_mask(time_steps)

    expected_mask = [1, 1, 1, 0, 1, 1, 1, 0]
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAllEqual(expected_mask, self.evaluate(episode_mask))
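
The mask asserted in this test is simply zero on LAST steps and one everywhere else, so the discounted sum restarts at each episode boundary. A minimal NumPy equivalent, assuming step types follow the usual ts.StepType encoding (FIRST=0, MID=1, LAST=2), might look like:

import numpy as np

def episode_mask_sketch(step_types, last_value=2):
    # 1 for every non-terminal step, 0 on LAST steps.
    return (np.asarray(step_types) != last_value).astype(np.int32)

print(episode_mask_sketch([0, 1, 1, 2, 1, 1, 1, 2]))  # [1 1 1 0 1 1 1 0]
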
Example No. 4
def compute_return_and_advantage(discount_factor, lambda_, rewards,
                                 next_time_steps, value_preds):
    """Compute the TD-lambda return and GAE(lambda) advantages.
    Normalization will be applied to the advantages.

    :param discount_factor: discount in [0,1]
    :param lambda_: trace_decay in [0,1]
    :param rewards: next_step rewards (possibly normalized)
    :param next_time_steps: batched tensor of TimeStep tuples after action is taken.
    :param value_preds: Batched value prediction tensor. Should have one more entry
        in time index than time_steps, with the final value corresponding to the
        value prediction of the final state.

    :return: tuple of (return, normalized_advantage), both are batched tensors.
    """
    discounts = next_time_steps.discount * tf.constant(discount_factor,
                                                       dtype=tf.float32)

    # Make discount 0.0 at the end of each episode so the cumulative sum
    #   restarts at the start of the next episode.
    episode_mask = common.get_episode_mask(next_time_steps)
    discounts *= episode_mask

    # Arg value_preds was appended with final next_step value. Make tensors
    #   next_value_preds by stripping first and last elements respectively.
    final_value_pred = value_preds[:, -1]
    value_preds = value_preds[:, :-1]

    # Compute advantages.
    advantages = value_ops.generalized_advantage_estimation(
        values=value_preds,
        final_value=final_value_pred,
        rewards=rewards,
        discounts=discounts,
        td_lambda=lambda_,
        time_major=False,
    )
    normalized_advantages = _normalize_advantages(advantages, axes=(0, 1))

    # Compute TD-Lambda returns.
    returns = tf.add(advantages, value_preds, name="td_lambda_returns")

    return returns, normalized_advantages
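
The recursion behind value_ops.generalized_advantage_estimation can also be written out directly. The sketch below assumes a single unbatched sequence with per-step discounts and a bootstrap value, mirroring the arguments used above; it is meant to show the math, not to replace the library call.

import numpy as np

def gae_sketch(rewards, discounts, values, final_value, td_lambda):
    rewards = np.asarray(rewards, dtype=np.float64)
    discounts = np.asarray(discounts, dtype=np.float64)
    values = np.asarray(values, dtype=np.float64)
    # TD errors: delta_t = r_t + gamma_t * V(s_{t+1}) - V(s_t).
    next_values = np.append(values[1:], final_value)
    deltas = rewards + discounts * next_values - values
    # GAE: A_t = delta_t + gamma_t * lambda * A_{t+1}, accumulated backwards.
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + discounts[t] * td_lambda * running
        advantages[t] = running
    # TD(lambda) returns are recovered as advantage + value, as in the code above.
    return advantages, advantages + values
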
Example No. 5
  def compute_return_and_advantage(self, next_time_steps, value_preds):
    """Compute the Monte Carlo return and advantage.

    Normalization will be applied to the computed returns and advantages if
    it's enabled.

    Args:
      next_time_steps: batched tensor of TimeStep tuples after action is taken.
      value_preds: Batched value prediction tensor. Should have one more entry
        in time index than time_steps, with the final value corresponding to the
        value prediction of the final state.

    Returns:
      tuple of (return, normalized_advantage), both are batched tensors.
    """
    discounts = next_time_steps.discount * tf.constant(
        self._discount_factor, dtype=tf.float32)

    rewards = next_time_steps.reward
    if self._debug_summaries:
      # Summarize rewards before they get normalized below.
      tf.compat.v2.summary.histogram(
          name='rewards', data=rewards, step=self.train_step_counter)
      tf.compat.v2.summary.scalar(
          name='rewards_mean',
          data=tf.reduce_mean(rewards),
          step=self.train_step_counter)

    # Normalize rewards if self._reward_normalizer is defined.
    if self._reward_normalizer:
      rewards = self._reward_normalizer.normalize(
          rewards, center_mean=False, clip_value=self._reward_norm_clipping)
      if self._debug_summaries:
        tf.compat.v2.summary.histogram(
            name='rewards_normalized',
            data=rewards,
            step=self.train_step_counter)
        tf.compat.v2.summary.scalar(
            name='rewards_normalized_mean',
            data=tf.reduce_mean(rewards),
            step=self.train_step_counter)

    # Make discount 0.0 at the end of each episode so the cumulative sum
    #   restarts at the start of the next episode.
    episode_mask = common.get_episode_mask(next_time_steps)
    discounts *= episode_mask

    # Compute Monte Carlo returns. Data from incomplete trajectories that do
    #   not contain the end of an episode will also be used, with a
    #   bootstrapped estimate from the last value.
    # Note that when a trajectory driver is used and the final step is
    #   terminal, the bootstrapped estimate is not used, as it is multiplied
    #   by zero (the discount on the last step).
    final_value_bootstrapped = value_preds[:, -1]
    returns = value_ops.discounted_return(
        rewards,
        discounts,
        time_major=False,
        final_value=final_value_bootstrapped)
    if self._debug_summaries:
      tf.compat.v2.summary.histogram(
          name='returns', data=returns, step=self.train_step_counter)

    # Compute advantages.
    advantages = self.compute_advantages(rewards, returns, discounts,
                                         value_preds)
    normalized_advantages = _normalize_advantages(advantages, axes=(0, 1))
    if self._debug_summaries:
      tf.compat.v2.summary.histogram(
          name='advantages', data=advantages, step=self.train_step_counter)
      tf.compat.v2.summary.histogram(
          name='advantages_normalized',
          data=normalized_advantages,
          step=self.train_step_counter)

    # Return TD-Lambda returns if both use_td_lambda_return and use_gae are enabled.
    if self._use_td_lambda_return:
      if not self._use_gae:
        logging.warning('use_td_lambda_return was True, but use_gae was '
                        'False. Using Monte Carlo return.')
      else:
        returns = tf.add(
            advantages, value_preds[:, :-1], name='td_lambda_returns')

    return returns, normalized_advantages
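
The helper _normalize_advantages is referenced but not shown in these examples. A plausible sketch, assuming it standardizes advantages to zero mean and unit variance over the batch and time axes (a common stabilization step in PPO implementations), is:

import tensorflow as tf

def normalize_advantages_sketch(advantages, axes=(0, 1), eps=1e-8):
    # Standardize over the given axes; eps guards against division by zero.
    mean, variance = tf.nn.moments(advantages, axes=list(axes), keepdims=True)
    return (advantages - mean) / (tf.sqrt(variance) + eps)
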