Example #1
    def testTimeMajorBatchMajorDiscountedReturnsAreSame(
            self, num_time_steps, batch_size, with_final_value):
        rewards = np.random.rand(num_time_steps, batch_size).astype(np.float32)
        discounts = np.random.rand(num_time_steps,
                                   batch_size).astype(np.float32)
        final_value = np.random.rand(batch_size).astype(
            np.float32) if with_final_value else None

        time_major_discounted_return = value_ops.discounted_return(
            rewards=rewards, discounts=discounts, final_value=final_value)

        batch_major_discounted_return = value_ops.discounted_return(
            rewards=tf.transpose(a=rewards),
            discounts=tf.transpose(a=discounts),
            final_value=final_value,
            time_major=False)
        self.assertAllClose(time_major_discounted_return,
                            tf.transpose(a=batch_major_discounted_return))
Example #2
    def compute_return_and_advantage(self, next_time_steps, value_preds):
        """Compute the Monte Carlo return and advantage.
    Normalazation will be applied to the computed returns and advantages if
    it's enabled.
    Args:
      next_time_steps: batched tensor of TimeStep tuples after action is taken.
      value_preds: Batched value prediction tensor. Should have one more entry
        in time index than time_steps, with the final value corresponding to the
        value prediction of the final state.
    Returns:
      tuple of (return, normalized_advantage), both are batched tensors.
    """

        discounts = next_time_steps.discount * tf.constant(
            self._discount_factor, dtype=tf.float32)

        rewards = next_time_steps.reward

        # Normalize rewards if self._reward_normalizer is defined.
        if self._reward_normalizer:
            rewards = self._reward_normalizer.normalize(
                rewards,
                center_mean=False,
                clip_value=self._reward_norm_clipping)

        #print("rew_n",rewards)
        # Make discount 0.0 at end of each episode to restart cumulative sum
        #   end of each episode.
        episode_mask = common.get_episode_mask(next_time_steps)
        discounts *= episode_mask

        # Compute Monte Carlo returns.
        returns = value_ops.discounted_return(rewards,
                                              discounts,
                                              time_major=False)
        #print("RET",returns)
        # Compute advantages.
        advantages = self.compute_advantages(rewards, returns, discounts,
                                             value_preds)
        normalized_advantages = _normalize_advantages(advantages, axes=(0, 1))

        # Return TD-Lambda returns if both use_td_lambda_return and use_gae.
        if self._use_td_lambda_return:
            if not self._use_gae:
                logging.warning(
                    'use_td_lambda_return was True, but use_gae was '
                    'False. Using Monte Carlo return.')
            else:
                returns = tf.add(advantages,
                                 value_preds[:, :-1],
                                 name='td_lambda_returns')

        return returns, normalized_advantages
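The helper `_normalize_advantages` used above is not shown in this listing. A minimal sketch of what such a helper might look like, assuming it simply standardizes the advantages over the given axes (the `variance_epsilon` name and the exact behavior are assumptions, not the actual TF-Agents implementation):

import tensorflow as tf


def _normalize_advantages(advantages, axes=(0, 1), variance_epsilon=1e-8):
    # Standardize advantages to zero mean and unit variance over the batch
    # and time axes, the common PPO-style normalization.
    adv_mean, adv_var = tf.nn.moments(advantages, axes=axes, keepdims=True)
    return (advantages - adv_mean) / tf.sqrt(adv_var + variance_epsilon)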
Example #3
    def _train(self, experience, weights=None):
        # Add a mask to ensure we reset the return calculation at episode
        # boundaries. This is needed in cases where episodes are truncated before
        # reaching a terminal state.
        non_last_mask = tf.cast(
            tf.math.not_equal(experience.next_step_type, ts.StepType.LAST),
            tf.float32)
        discounts = non_last_mask * experience.discount * self._gamma
        returns = value_ops.discounted_return(experience.reward,
                                              discounts,
                                              time_major=False)

        if self._debug_summaries:
            tf.compat.v2.summary.histogram(name='rewards',
                                           data=experience.reward,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='discounts',
                                           data=experience.discount,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='returns',
                                           data=returns,
                                           step=self.train_step_counter)

        time_step = ts.TimeStep(experience.step_type,
                                tf.zeros_like(experience.reward),
                                tf.zeros_like(experience.discount),
                                experience.observation)

        with tf.GradientTape() as tape:
            loss_info = self.total_loss(time_step,
                                        experience.action,
                                        tf.stop_gradient(returns),
                                        weights=weights)
            tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
        variables_to_train = self._actor_network.trainable_weights
        if self._baseline:
            variables_to_train += self._value_network.trainable_weights
        grads = tape.gradient(loss_info.loss, variables_to_train)

        grads_and_vars = list(zip(grads, variables_to_train))
        if self._gradient_clipping:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars,
                                        global_step=self.train_step_counter)

        return tf.nest.map_structure(tf.identity, loss_info)
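The `non_last_mask` above zeroes the discount at episode boundaries so that `value_ops.discounted_return` cannot leak reward across truncated episodes. A small illustrative sketch of that effect with plain NumPy and made-up toy numbers:

import numpy as np

# One batch row holding two back-to-back episodes of length 2; the discount
# is forced to 0.0 at the step that ends the first episode.
rewards = np.array([[1.0, 1.0, 1.0, 1.0]], dtype=np.float32)
discounts = np.array([[0.9, 0.0, 0.9, 0.9]], dtype=np.float32)

# Backward recursion used by discounted_return: ret[t] = r[t] + d[t] * ret[t+1].
returns = np.zeros_like(rewards)
running = np.zeros(rewards.shape[0], dtype=np.float32)
for t in reversed(range(rewards.shape[1])):
    running = rewards[:, t] + discounts[:, t] * running
    returns[:, t] = running

# Expected output: [[1.9, 1.0, 1.9, 1.0]]. The zero discount at the boundary
# stops reward from the second episode leaking into the first.
print(returns)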
Example #4
    def _train(self, experience, weights=None):
        returns = value_ops.discounted_return(experience.reward,
                                              experience.discount,
                                              time_major=False)

        if self._debug_summaries:
            tf.compat.v2.summary.histogram(name='rewards',
                                           data=experience.reward,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='discounts',
                                           data=experience.discount,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='returns',
                                           data=returns,
                                           step=self.train_step_counter)

        # TODO(b/126592060): replace with tensor normalizer.
        if self._normalize_returns:
            returns = _standard_normalize(returns, axes=(0, 1))
            if self._debug_summaries:
                tf.compat.v2.summary.histogram(name='normalized_returns',
                                               data=returns,
                                               step=self.train_step_counter)

        time_step = ts.TimeStep(experience.step_type,
                                tf.zeros_like(experience.reward),
                                tf.zeros_like(experience.discount),
                                experience.observation)

        variables_to_train = self._actor_network.variables
        with tf.GradientTape() as tape:
            loss_info = self._loss(time_step,
                                   experience.action,
                                   tf.stop_gradient(returns),
                                   weights=weights)
            tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
        grads = tape.gradient(loss_info.loss, variables_to_train)

        grads_and_vars = zip(grads, variables_to_train)
        if self._gradient_clipping:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars,
                                        global_step=self.train_step_counter)

        return tf.nest.map_structure(tf.identity, loss_info)
Example #5
    def testDiscountedReturnWithFinalValueMatchPrecomputedResult(self):

        discounted_return = value_ops.discounted_return(
            rewards=tf.constant([1] * 9, dtype=tf.float32),
            discounts=tf.constant([1, 1, 1, 1, 0, 0.9, 0.9, 0.9, 0.9],
                                  dtype=tf.float32),
            final_value=tf.constant(8, dtype=tf.float32))
        expected = [
            5, 4, 3, 2, 1, 8 * 0.9**4 + 3.439, 8 * 0.9**3 + 2.71,
            8 * 0.9**2 + 1.9, 8 * 0.9 + 1
        ]
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertAllClose(discounted_return, expected)
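The expected list in this test can be reproduced by the backward recursion that `discounted_return` performs, bootstrapped from `final_value`. A short sketch that recomputes it (plain Python, for illustration only):

# Reproduce the expected values above with the backward recursion
# ret[t] = r[t] + d[t] * ret[t + 1], seeded with final_value.
rewards = [1.0] * 9
discounts = [1, 1, 1, 1, 0, 0.9, 0.9, 0.9, 0.9]
final_value = 8.0

expected = []
running = final_value
for r, d in zip(reversed(rewards), reversed(discounts)):
    running = r + d * running
    expected.append(running)
expected.reverse()

# Approximately [5, 4, 3, 2, 1, 8.6878, 8.542, 8.38, 8.2], matching the test.
print(expected)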
Example #6
    def testDiscountedReturnIsCorrectlyComputed(self, num_time_steps,
                                                batch_size, with_final_value):
        rewards = np.random.rand(num_time_steps, batch_size).astype(np.float32)
        discounts = np.random.rand(num_time_steps,
                                   batch_size).astype(np.float32)
        final_value = np.random.rand(batch_size).astype(
            np.float32) if with_final_value else None

        discounted_return = value_ops.discounted_return(
            rewards=rewards, discounts=discounts, final_value=final_value)

        single_discounted_return = value_ops.discounted_return(
            rewards=rewards,
            discounts=discounts,
            final_value=final_value,
            provide_all_returns=False)

        expected = _numpy_discounted_return(rewards=rewards,
                                            discounts=discounts,
                                            final_value=final_value)

        self.assertAllClose(discounted_return, expected)
        self.assertAllClose(single_discounted_return, expected[0])
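The test above compares against `_numpy_discounted_return`, which is not shown here. A minimal sketch of a NumPy reference implementation consistent with how it is called (time-major `[T, B]` inputs, optional bootstrap value); the actual test helper may differ:

import numpy as np


def _numpy_discounted_return(rewards, discounts, final_value=None):
    """Reference discounted return for time-major [T, B] rewards/discounts."""
    num_time_steps, batch_size = rewards.shape
    if final_value is None:
        final_value = np.zeros(batch_size, dtype=rewards.dtype)

    returns = np.zeros_like(rewards)
    running = final_value
    # Backward recursion: ret[t] = r[t] + d[t] * ret[t + 1].
    for t in reversed(range(num_time_steps)):
        running = rewards[t] + discounts[t] * running
        returns[t] = running
    return returns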
Example #7
    def _train(self, experience, weights=None):
        # TODO(b/132914246): Use .is_last() to mask the end of each episode.
        returns = value_ops.discounted_return(experience.reward,
                                              experience.discount *
                                              self._gamma,
                                              time_major=False)

        if self._debug_summaries:
            tf.compat.v2.summary.histogram(name='rewards',
                                           data=experience.reward,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='discounts',
                                           data=experience.discount,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='returns',
                                           data=returns,
                                           step=self.train_step_counter)

        time_step = ts.TimeStep(experience.step_type,
                                tf.zeros_like(experience.reward),
                                tf.zeros_like(experience.discount),
                                experience.observation)

        with tf.GradientTape() as tape:
            loss_info = self.total_loss(time_step,
                                        experience.action,
                                        tf.stop_gradient(returns),
                                        weights=weights)
            tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
        variables_to_train = self._actor_network.trainable_weights
        if self._baseline:
            variables_to_train += self._value_network.trainable_weights
        grads = tape.gradient(loss_info.loss, variables_to_train)

        grads_and_vars = zip(grads, variables_to_train)
        if self._gradient_clipping:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars,
                                        global_step=self.train_step_counter)

        return tf.nest.map_structure(tf.identity, loss_info)
Example #8
File: dqn_agent.py  Project: zircote/agents
    def _loss(self,
              experience,
              td_errors_loss_fn=common.element_wise_huber_loss,
              gamma=1.0,
              reward_scale_factor=1.0,
              weights=None,
              training=False):
        """Computes loss for DQN training.

    Args:
      experience: A batch of experience data in the form of a `Trajectory`. The
        structure of `experience` must match that of `self.policy.step_spec`.
        All tensors in `experience` must be shaped `[batch, time, ...]` where
        `time` must be equal to `self.train_sequence_length` if that
        property is not `None`.
      td_errors_loss_fn: A function(td_targets, predictions) to compute the
        element wise loss.
      gamma: Discount for future rewards.
      reward_scale_factor: Multiplicative factor to scale rewards.
      weights: Optional scalar or elementwise (per-batch-entry) importance
        weights.  The output td_loss will be scaled by these weights, and
        the final scalar loss is the mean of these values.
      training: Whether this loss is being used for training.

    Returns:
      loss: An instance of `DqnLossInfo`.
    Raises:
      ValueError:
        if the number of actions is greater than 1.
    """
        # Check that `experience` includes two outer dimensions [B, T, ...]. This
        # method requires a time dimension to compute the loss properly.
        self._check_trajectory_dimensions(experience)

        squeeze_time_dim = not self._q_network.state_spec
        if self._n_step_update == 1:
            time_steps, policy_steps, next_time_steps = (
                trajectory.experience_to_transitions(experience,
                                                     squeeze_time_dim))
            actions = policy_steps.action
        else:
            # To compute n-step returns, we need the first time steps, the first
            # actions, and the last time steps. Therefore we extract the first and
            # last transitions from our Trajectory.
            first_two_steps = tf.nest.map_structure(lambda x: x[:, :2],
                                                    experience)
            last_two_steps = tf.nest.map_structure(lambda x: x[:, -2:],
                                                   experience)
            time_steps, policy_steps, _ = (
                trajectory.experience_to_transitions(first_two_steps,
                                                     squeeze_time_dim))
            actions = policy_steps.action
            _, _, next_time_steps = (trajectory.experience_to_transitions(
                last_two_steps, squeeze_time_dim))

        with tf.name_scope('loss'):
            q_values = self._compute_q_values(time_steps,
                                              actions,
                                              training=training)

            next_q_values = self._compute_next_q_values(
                next_time_steps, policy_steps.info)

            if self._n_step_update == 1:
                # Special case for n = 1 to avoid a loss of performance.
                td_targets = compute_td_targets(
                    next_q_values,
                    rewards=reward_scale_factor * next_time_steps.reward,
                    discounts=gamma * next_time_steps.discount)
            else:
                # When computing discounted return, we need to throw out the last time
                # index of both reward and discount, which are filled with dummy values
                # to match the dimensions of the observation.
                rewards = reward_scale_factor * experience.reward[:, :-1]
                discounts = gamma * experience.discount[:, :-1]

                # TODO(b/134618876): Properly handle Trajectories that include episode
                # boundaries with nonzero discount.

                td_targets = value_ops.discounted_return(
                    rewards=rewards,
                    discounts=discounts,
                    final_value=next_q_values,
                    time_major=False,
                    provide_all_returns=False)

            valid_mask = tf.cast(~time_steps.is_last(), tf.float32)
            td_error = valid_mask * (td_targets - q_values)

            td_loss = valid_mask * td_errors_loss_fn(td_targets, q_values)

            if nest_utils.is_batched_nested_tensors(time_steps,
                                                    self.time_step_spec,
                                                    num_outer_dims=2):
                # Do a sum over the time dimension.
                td_loss = tf.reduce_sum(input_tensor=td_loss, axis=1)

            # Aggregate across the elements of the batch and add regularization loss.
            # Note: We use an element wise loss above to ensure each element is always
            #   weighted by 1/N where N is the batch size, even when some of the
            #   weights are zero due to boundary transitions. Weighting by 1/K where K
            #   is the actual number of non-zero weight would artificially increase
            #   their contribution in the loss. Think about what would happen as
            #   the number of boundary samples increases.

            agg_loss = common.aggregate_losses(
                per_example_loss=td_loss,
                sample_weight=weights,
                regularization_loss=self._q_network.losses)
            total_loss = agg_loss.total_loss

            losses_dict = {
                'td_loss': agg_loss.weighted,
                'reg_loss': agg_loss.regularization,
                'total_loss': total_loss
            }

            common.summarize_scalar_dict(losses_dict,
                                         step=self.train_step_counter,
                                         name_scope='Losses/')

            if self._summarize_grads_and_vars:
                with tf.name_scope('Variables/'):
                    for var in self._q_network.trainable_weights:
                        tf.compat.v2.summary.histogram(
                            name=var.name.replace(':', '_'),
                            data=var,
                            step=self.train_step_counter)

            if self._debug_summaries:
                diff_q_values = q_values - next_q_values
                common.generate_tensor_summaries('td_error', td_error,
                                                 self.train_step_counter)
                common.generate_tensor_summaries('td_loss', td_loss,
                                                 self.train_step_counter)
                common.generate_tensor_summaries('q_values', q_values,
                                                 self.train_step_counter)
                common.generate_tensor_summaries('next_q_values',
                                                 next_q_values,
                                                 self.train_step_counter)
                common.generate_tensor_summaries('diff_q_values',
                                                 diff_q_values,
                                                 self.train_step_counter)

            return tf_agent.LossInfo(
                total_loss, DqnLossInfo(td_loss=td_loss, td_error=td_error))
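`compute_td_targets` is defined elsewhere in the agent module. A minimal sketch of the one-step TD target it presumably computes (reward plus discounted next-state value, with the target held constant via `stop_gradient`); this is an assumption about its behavior rather than the exact TF-Agents code:

import tensorflow as tf


def compute_td_targets(next_q_values, rewards, discounts):
    # One-step TD target: r_t + gamma * d_t * Q_target(s_{t+1}, a*).
    # stop_gradient keeps the target side out of the gradient computation.
    return tf.stop_gradient(rewards + discounts * next_q_values)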
Example #9
    def _loss(self,
              experience,
              td_errors_loss_fn=tf.losses.huber_loss,
              gamma=1.0,
              reward_scale_factor=1.0,
              weights=None):
        """Computes critic loss for CategoricalDQN training.

    See Algorithm 1 and the discussion immediately preceding it in page 6 of
    "A Distributional Perspective on Reinforcement Learning"
      Bellemare et al., 2017
      https://arxiv.org/abs/1707.06887

    Args:
      experience: A batch of experience data in the form of a `Trajectory`. The
        structure of `experience` must match that of `self.policy.step_spec`.
        All tensors in `experience` must be shaped `[batch, time, ...]` where
        `time` must be equal to `self.required_experience_time_steps` if that
        property is not `None`.
      td_errors_loss_fn: A function(td_targets, predictions) to compute loss.
      gamma: Discount for future rewards.
      reward_scale_factor: Multiplicative factor to scale rewards.
      weights: Optional weights used for importance sampling.
    Returns:
      critic_loss: A scalar critic loss.
    Raises:
      ValueError:
        if the number of actions is greater than 1.
    """
        # Check that `experience` includes two outer dimensions [B, T, ...]. This
        # method requires a time dimension to compute the loss properly.
        self._check_trajectory_dimensions(experience)

        if self._n_step_update == 1:
            time_steps, actions, next_time_steps = self._experience_to_transitions(
                experience)
        else:
            # To compute n-step returns, we need the first time steps, the first
            # actions, and the last time steps. Therefore we extract the first and
            # last transitions from our Trajectory.
            first_two_steps = tf.nest.map_structure(lambda x: x[:, :2],
                                                    experience)
            last_two_steps = tf.nest.map_structure(lambda x: x[:, -2:],
                                                   experience)
            time_steps, actions, _ = self._experience_to_transitions(
                first_two_steps)
            _, _, next_time_steps = self._experience_to_transitions(
                last_two_steps)

        with tf.name_scope('critic_loss'):
            tf.nest.assert_same_structure(actions, self.action_spec)
            tf.nest.assert_same_structure(time_steps, self.time_step_spec)
            tf.nest.assert_same_structure(next_time_steps, self.time_step_spec)

            rank = nest_utils.get_outer_rank(time_steps.observation,
                                             self._time_step_spec.observation)

            # If inputs have a time dimension and the q_network is stateful,
            # combine the batch and time dimension.
            batch_squash = (None if rank <= 1 or self._q_network.state_spec
                            in ((), None) else utils.BatchSquash(rank))

            # q_logits contains the Q-value logits for all actions.
            q_logits, _ = self._q_network(time_steps.observation,
                                          time_steps.step_type)
            next_q_distribution = self._next_q_distribution(
                next_time_steps, batch_squash)

            if batch_squash is not None:
                # Squash the outer dimensions into a single dimension to make
                # computing the loss below easier. Required for supporting
                # temporal inputs, for example.
                q_logits = batch_squash.flatten(q_logits)
                actions = batch_squash.flatten(actions)
                next_time_steps = tf.nest.map_structure(
                    batch_squash.flatten, next_time_steps)

            actions = tf.nest.flatten(actions)[0]
            if actions.shape.ndims > 1:
                actions = tf.squeeze(actions, range(1, actions.shape.ndims))

            # Project the sample Bellman update \hat{T}Z_{\theta} onto the original
            # support of Z_{\theta} (see Figure 1 in paper).
            batch_size = tf.shape(q_logits)[0]
            tiled_support = tf.tile(self._support, [batch_size])
            tiled_support = tf.reshape(tiled_support,
                                       [batch_size, self._num_atoms])

            if self._n_step_update == 1:
                discount = next_time_steps.discount
                if discount.shape.ndims == 1:
                    # We expect discount to have a shape of [batch_size], while
                    # tiled_support will have a shape of [batch_size, num_atoms]. To
                    # multiply these, we add a second dimension of 1 to the discount.
                    discount = discount[:, None]
                next_value_term = tf.multiply(discount,
                                              tiled_support,
                                              name='next_value_term')

                reward = next_time_steps.reward
                if reward.shape.ndims == 1:
                    # See the explanation above.
                    reward = reward[:, None]
                reward_term = tf.multiply(reward_scale_factor,
                                          reward,
                                          name='reward_term')

                target_support = tf.add(reward_term,
                                        gamma * next_value_term,
                                        name='target_support')
            else:
                # When computing discounted return, we need to throw out the last time
                # index of both reward and discount, which are filled with dummy values
                # to match the dimensions of the observation.
                rewards = reward_scale_factor * experience.reward[:, :-1]
                discounts = gamma * experience.discount[:, :-1]

                # TODO(b/134618876): Properly handle Trajectories that include episode
                # boundaries with nonzero discount.

                # TODO(b/131557265): Replace value_ops.discounted_return with a method
                # that only computes the single value needed.
                discounted_rewards = value_ops.discounted_return(
                    rewards=rewards,
                    discounts=discounts,
                    final_value=tf.zeros([batch_size], dtype=discounts.dtype),
                    time_major=False)

                # We only need the first value within the time dimension which
                # corresponds to the full final return. The remaining values are only
                # partial returns.
                discounted_rewards = discounted_rewards[:, :1]

                final_value_discount = tf.reduce_prod(discounts, axis=1)
                final_value_discount = final_value_discount[:, None]

                # Save the values of discounted_rewards and final_value_discount in
                # order to check them in unit tests.
                self._discounted_rewards = discounted_rewards
                self._final_value_discount = final_value_discount

                target_support = tf.add(discounted_rewards,
                                        final_value_discount * tiled_support,
                                        name='target_support')

            target_distribution = tf.stop_gradient(
                project_distribution(target_support, next_q_distribution,
                                     self._support))

            # Obtain the current Q-value logits for the selected actions.
            indices = tf.range(tf.shape(q_logits)[0])[:, None]
            indices = tf.cast(indices, actions.dtype)
            reshaped_actions = tf.concat([indices, actions[:, None]], 1)
            chosen_action_logits = tf.gather_nd(q_logits, reshaped_actions)

            # Compute the cross-entropy loss between the logits. If inputs have
            # a time dimension, compute the sum over the time dimension before
            # computing the mean over the batch dimension.
            if batch_squash is not None:
                target_distribution = batch_squash.unflatten(
                    target_distribution)
                chosen_action_logits = batch_squash.unflatten(
                    chosen_action_logits)
                critic_loss = tf.reduce_mean(
                    tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=target_distribution,
                        logits=chosen_action_logits),
                                  axis=1))
            else:
                critic_loss = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=target_distribution,
                        logits=chosen_action_logits))

            with tf.name_scope('Losses/'):
                tf.compat.v2.summary.scalar('critic_loss',
                                            critic_loss,
                                            step=self.train_step_counter)

            if self._debug_summaries:
                distribution_errors = target_distribution - chosen_action_logits
                with tf.name_scope('distribution_errors'):
                    common.generate_tensor_summaries(
                        'distribution_errors',
                        distribution_errors,
                        step=self.train_step_counter)
                    tf.compat.v2.summary.scalar(
                        'mean',
                        tf.reduce_mean(distribution_errors),
                        step=self.train_step_counter)
                    tf.compat.v2.summary.scalar(
                        'mean_abs',
                        tf.reduce_mean(tf.abs(distribution_errors)),
                        step=self.train_step_counter)
                    tf.compat.v2.summary.scalar(
                        'max',
                        tf.reduce_max(distribution_errors),
                        step=self.train_step_counter)
                    tf.compat.v2.summary.scalar(
                        'min',
                        tf.reduce_min(distribution_errors),
                        step=self.train_step_counter)
                with tf.name_scope('target_distribution'):
                    common.generate_tensor_summaries(
                        'target_distribution',
                        target_distribution,
                        step=self.train_step_counter)

            # TODO(b/127318640): Give appropriate values for td_loss and td_error for
            # prioritized replay.
            return tf_agent.LossInfo(
                critic_loss, dqn_agent.DqnLossInfo(td_loss=(), td_error=()))
Example #10
def to_n_step_transition(
    trajectory: Trajectory,
    gamma: types.Float
) -> Transition:
  """Create an n-step transition from a trajectory with `T=N + 1` frames.

  **NOTE** Tensors of `trajectory` are sliced along their *second* (`time`)
  dimension, to pull out the appropriate fields for the n-step transitions.

  The output transition's `next_time_step.{reward, discount}` will contain
  N-step discounted reward and discount values calculated as:

  ```
  next_time_step.reward = r_t +
                          g^{1} * d_t * r_{t+1} +
                          g^{2} * d_t * d_{t+1} * r_{t+2} +
                          g^{3} * d_t * d_{t+1} * d_{t+2} * r_{t+3} +
                          ...
                          g^{N-1} * d_t * ... * d_{t+N-2} * r_{t+N-1}
  next_time_step.discount = g^{N-1} * d_t * d_{t+1} * ... * d_{t+N-1}
  ```

  In python notation:

  ```python
  discount = gamma**(N-1) * reduce_prod(trajectory.discount[:, :-1])
  reward = discounted_return(
      rewards=trajectory.reward[:, :-1],
      discounts=gamma * trajectory.discount[:, :-1])
  ```

  When `trajectory.discount[:, :-1]` is an all-ones tensor, this is equivalent
  to:

  ```python
  next_time_step.discount = (
      gamma**(N-1) * tf.ones_like(trajectory.discount[:, 0]))
  next_time_step.reward = (
      sum_{n=0}^{N-1} gamma**n * trajectory.reward[:, n])
  ```

  Args:
    trajectory: An instance of `Trajectory`. The tensors in Trajectory must have
      shape `[B, T, ...]`.  `discount` is assumed to be a scalar float,
      hence the shape of `trajectory.discount` must be `[B, T]`.
    gamma: A floating point scalar; the discount factor.

  Returns:
    An N-step `Transition` where `N = T - 1`.  The reward and discount in
    `time_step.{reward, discount}` are NaN.  The n-step discounted reward
    and final discount are stored in `next_time_step.{reward, discount}`.
    All tensors in the `Transition` have shape `[B, ...]` (no time dimension).

  Raises:
    ValueError: if `discount.shape.rank != 2`.
    ValueError: if `discount.shape[1] < 2`.
  """
  _validate_rank(trajectory.discount, min_rank=2, max_rank=2)

  # Use static values when available, so that we can use XLA when the time
  # dimension is fixed.
  time_dim = (tf.compat.dimension_value(trajectory.discount.shape[1])
              or tf.shape(trajectory.discount)[1])

  static_time_dim = tf.get_static_value(time_dim)
  if static_time_dim in (0, 1):
    raise ValueError(
        'Trajectory frame count must be at least 2, but saw {}.  Shape of '
        'trajectory.discount: {}'.format(static_time_dim,
                                         trajectory.discount.shape))

  n = time_dim - 1

  # Use composite calculations to ensure we properly handle SparseTensor etc in
  # the observations.

  # pylint: disable=g-long-lambda

  # Pull out x[:,0] for x in trajectory
  first_frame = tf.nest.map_structure(
      lambda t: composite.squeeze(
          composite.slice_to(t, axis=1, end=1),
          axis=1),
      trajectory)

  # Pull out x[:,-1] for x in trajectory
  final_frame = tf.nest.map_structure(
      lambda t: composite.squeeze(
          composite.slice_from(t, axis=1, start=-1),
          axis=1),
      trajectory)
  # pylint: enable=g-long-lambda

  # When computing discounted return, we need to throw out the last time
  # index of both reward and discount, which are filled with dummy values
  # to match the dimensions of the observation.
  reward = trajectory.reward[:, :-1]
  discount = trajectory.discount[:, :-1]

  policy_steps = policy_step.PolicyStep(
      action=first_frame.action, state=(), info=first_frame.policy_info)

  discounted_reward = value_ops.discounted_return(
      rewards=reward,
      discounts=gamma * discount,
      time_major=False,
      provide_all_returns=False)

  # NOTE: `final_discount` will have one less discount than `discount`.
  # This is so that when the learner/update uses an additional
  # discount (e.g. gamma) we don't apply it twice.
  final_discount = gamma**(n-1) * tf.math.reduce_prod(discount, axis=1)

  time_steps = ts.TimeStep(
      first_frame.step_type,
      # unknown
      reward=tf.nest.map_structure(
          lambda r: np.nan * tf.ones_like(r), first_frame.reward),
      # unknown
      discount=np.nan * tf.ones_like(first_frame.discount),
      observation=first_frame.observation)
  next_time_steps = ts.TimeStep(
      step_type=final_frame.step_type,
      reward=discounted_reward,
      discount=final_discount,
      observation=final_frame.observation)
  return Transition(time_steps, policy_steps, next_time_steps)
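The N-step reward and discount formulas in the docstring above can be checked numerically. A small illustrative sketch with made-up toy numbers (plain NumPy; `value_ops.discounted_return` with `provide_all_returns=False` yields the same reward value):

import numpy as np

gamma = 0.9
# Toy batch of one trajectory with T = 4 frames, i.e. N = 3 (illustrative only).
reward = np.array([[1.0, 2.0, 3.0, 0.0]])    # last entry is a dummy value
discount = np.array([[1.0, 1.0, 1.0, 0.0]])  # last entry is a dummy value

r, d = reward[:, :-1], discount[:, :-1]

# next_time_step.reward = r_t + g * d_t * r_{t+1} + g^2 * d_t * d_{t+1} * r_{t+2}
n_step_reward = (r[:, 0]
                 + gamma * d[:, 0] * r[:, 1]
                 + gamma**2 * d[:, 0] * d[:, 1] * r[:, 2])

# next_time_step.discount = g^(N-1) * d_t * d_{t+1} * d_{t+2}
n_step_discount = gamma**(3 - 1) * np.prod(d, axis=1)

print(n_step_reward)    # [5.23]
print(n_step_discount)  # [0.81]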
Example #11
    def critic_loss(self, experience, gamma=1.0, weights=None):
        """Computes the critic loss for TD3 training.

        Args:
          experience: A batch of timesteps.
          gamma: reward discount factor
          weights: Optional scalar or element-wise (per-batch-entry) importance
            weights.

        Returns:
          critic_loss: A scalar critic loss.
        """
        with tf.name_scope('critic_loss'):

            self._check_trajectory_dimensions(experience)

            if self._n_step_update == 1:
                time_steps, actions, next_time_steps = self._experience_to_transitions(
                    experience)
            else:
                # To compute n-step returns, we need the first time steps, the first
                # actions, and the last time steps. Therefore we extract the first and
                # last transitions from our Trajectory.
                first_two_steps = tf.nest.map_structure(lambda x: x[:, :2], experience)
                last_two_steps = tf.nest.map_structure(lambda x: x[:, -2:], experience)
                time_steps, actions, _ = self._experience_to_transitions(first_two_steps)
                _, _, next_time_steps = self._experience_to_transitions(last_two_steps)

            # Target q-values are the min of the two networks
            # print("first pass")
            target_q_values_1 = self._compute_next_q_values(self._target_q_value_policies_1, next_time_steps)
            target_q_values_2 = self._compute_next_q_values(self._target_q_value_policies_2, next_time_steps)

            target_q_values = tf.minimum(target_q_values_1, target_q_values_2)

            if self._n_step_update == 1:
                # Special case for n = 1 to avoid a loss of performance.
                td_targets = compute_td_targets(
                    target_q_values,
                    rewards=self._reward_scale_factor * next_time_steps.reward,
                    discounts=gamma * next_time_steps.discount)
            else:
                # When computing discounted return, we need to throw out the last time
                # index of both reward and discount, which are filled with dummy values
                # to match the dimensions of the observation.
                rewards = self._reward_scale_factor * experience.reward[:, :-1]
                discounts = gamma * experience.discount[:, :-1]

                td_targets = value_ops.discounted_return(
                    rewards=rewards,
                    discounts=discounts,
                    final_value=target_q_values,
                    time_major=False,
                    provide_all_returns=False)

            pred_td_targets_1 = self._compute_q_values(self._q_value_policies_1, time_steps, actions)
            pred_td_targets_2 = self._compute_q_values(self._q_value_policies_2, time_steps, actions)
            pred_td_targets_all = [pred_td_targets_1, pred_td_targets_2]
            # print("third pass")
            if self._debug_summaries:
                tf.compat.v2.summary.histogram(
                    name='td_targets', data=td_targets, step=self.train_step_counter)
                with tf.name_scope('td_targets'):
                    tf.compat.v2.summary.scalar(
                        name='mean',
                        data=tf.reduce_mean(input_tensor=td_targets),
                        step=self.train_step_counter)
                    tf.compat.v2.summary.scalar(
                        name='max',
                        data=tf.reduce_max(input_tensor=td_targets),
                        step=self.train_step_counter)
                    tf.compat.v2.summary.scalar(
                        name='min',
                        data=tf.reduce_min(input_tensor=td_targets),
                        step=self.train_step_counter)

                for td_target_idx in range(2):
                    pred_td_targets = pred_td_targets_all[td_target_idx]
                    td_errors = td_targets - pred_td_targets
                    with tf.name_scope('critic_net_%d' % (td_target_idx + 1)):
                        tf.compat.v2.summary.histogram(
                            name='td_errors', data=td_errors, step=self.train_step_counter)
                        tf.compat.v2.summary.histogram(
                            name='pred_td_targets',
                            data=pred_td_targets,
                            step=self.train_step_counter)
                        with tf.name_scope('td_errors'):
                            tf.compat.v2.summary.scalar(
                                name='mean',
                                data=tf.reduce_mean(input_tensor=td_errors),
                                step=self.train_step_counter)
                            tf.compat.v2.summary.scalar(
                                name='mean_abs',
                                data=tf.reduce_mean(input_tensor=tf.abs(td_errors)),
                                step=self.train_step_counter)
                            tf.compat.v2.summary.scalar(
                                name='max',
                                data=tf.reduce_max(input_tensor=td_errors),
                                step=self.train_step_counter)
                            tf.compat.v2.summary.scalar(
                                name='min',
                                data=tf.reduce_min(input_tensor=td_errors),
                                step=self.train_step_counter)
                        with tf.name_scope('pred_td_targets'):
                            tf.compat.v2.summary.scalar(
                                name='mean',
                                data=tf.reduce_mean(input_tensor=pred_td_targets),
                                step=self.train_step_counter)
                            tf.compat.v2.summary.scalar(
                                name='max',
                                data=tf.reduce_max(input_tensor=pred_td_targets),
                                step=self.train_step_counter)
                            tf.compat.v2.summary.scalar(
                                name='min',
                                data=tf.reduce_min(input_tensor=pred_td_targets),
                                step=self.train_step_counter)

            critic_loss = (self._td_errors_loss_fn(td_targets, pred_td_targets_1)
                           + self._td_errors_loss_fn(td_targets, pred_td_targets_2))

            if nest_utils.is_batched_nested_tensors(
                    time_steps, self.time_step_spec, num_outer_dims=2):
                # Sum over the time dimension.
                critic_loss = tf.reduce_sum(input_tensor=critic_loss, axis=1)

            if weights is not None:
                critic_loss *= weights
            # print("forth pass")

            return tf.reduce_mean(input_tensor=critic_loss)
Example #12
  def compute_return_and_advantage(self, next_time_steps, value_preds):
    """Compute the Monte Carlo return and advantage.

    Normalization will be applied to the computed returns and advantages if
    it's enabled.

    Args:
      next_time_steps: batched tensor of TimeStep tuples after action is taken.
      value_preds: Batched value prediction tensor. Should have one more entry
        in time index than time_steps, with the final value corresponding to the
        value prediction of the final state.

    Returns:
      tuple of (return, normalized_advantage), both are batched tensors.
    """
    discounts = next_time_steps.discount * tf.constant(
        self._discount_factor, dtype=tf.float32)

    rewards = next_time_steps.reward
    if self._debug_summaries:
      # Summarize rewards before they get normalized below.
      tf.compat.v2.summary.histogram(
          name='rewards', data=rewards, step=self.train_step_counter)
      tf.compat.v2.summary.scalar(
          name='rewards_mean',
          data=tf.reduce_mean(rewards),
          step=self.train_step_counter)

    # Normalize rewards if self._reward_normalizer is defined.
    if self._reward_normalizer:
      rewards = self._reward_normalizer.normalize(
          rewards, center_mean=False, clip_value=self._reward_norm_clipping)
      if self._debug_summaries:
        tf.compat.v2.summary.histogram(
            name='rewards_normalized',
            data=rewards,
            step=self.train_step_counter)
        tf.compat.v2.summary.scalar(
            name='rewards_normalized_mean',
            data=tf.reduce_mean(rewards),
            step=self.train_step_counter)

    # Make the discount 0.0 at the end of each episode so the cumulative
    #   sum restarts at each episode boundary.
    episode_mask = common.get_episode_mask(next_time_steps)
    discounts *= episode_mask

    # Compute Monte Carlo returns. Data from incomplete trajectories (those not
    #   containing the end of an episode) will also be used, with a bootstrapped
    #   estimate from the last value.
    # Note that when a trajectory driver is used and the final step is
    #   terminal, the bootstrapped estimate will not be used, as it will be
    #   multiplied by zero (the discount on the last step).
    final_value_bootstrapped = value_preds[:, -1]
    returns = value_ops.discounted_return(
        rewards,
        discounts,
        time_major=False,
        final_value=final_value_bootstrapped)
    if self._debug_summaries:
      tf.compat.v2.summary.histogram(
          name='returns', data=returns, step=self.train_step_counter)

    # Compute advantages.
    advantages = self.compute_advantages(rewards, returns, discounts,
                                         value_preds)
    normalized_advantages = _normalize_advantages(advantages, axes=(0, 1))
    if self._debug_summaries:
      tf.compat.v2.summary.histogram(
          name='advantages', data=advantages, step=self.train_step_counter)
      tf.compat.v2.summary.histogram(
          name='advantages_normalized',
          data=normalized_advantages,
          step=self.train_step_counter)

    # Return TD-Lambda returns if both use_td_lambda_return and use_gae.
    if self._use_td_lambda_return:
      if not self._use_gae:
        logging.warning('use_td_lambda_return was True, but use_gae was '
                        'False. Using Monte Carlo return.')
      else:
        returns = tf.add(
            advantages, value_preds[:, :-1], name='td_lambda_returns')

    return returns, normalized_advantages
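`compute_advantages` is a method of the agent that this listing does not include. A minimal sketch of what it might compute, assuming the usual choice between plain Monte Carlo advantages and GAE implied by the surrounding code; the `use_gae` and `lambda_value` parameters and the exact recursion are assumptions, not the actual TF-Agents implementation:

import tensorflow as tf


def compute_advantages(rewards, returns, discounts, value_preds,
                       use_gae=False, lambda_value=0.95):
    """Sketch only: Monte Carlo or GAE advantages for batch-major [B, T] data."""
    if not use_gae:
        # Monte Carlo advantage: empirical return minus the value baseline.
        return returns - value_preds[:, :-1]

    # Generalized Advantage Estimation: discounted sum of TD residuals, where
    # `discounts` is assumed to already contain gamma and the episode mask.
    deltas = rewards + discounts * value_preds[:, 1:] - value_preds[:, :-1]
    num_steps = deltas.shape[1]  # assumes a statically known time dimension
    advantages = []
    running = tf.zeros_like(deltas[:, 0])
    for t in reversed(range(num_steps)):
        running = deltas[:, t] + lambda_value * discounts[:, t] * running
        advantages.append(running)
    return tf.stack(advantages[::-1], axis=1)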
Example #13
  def _loss(self,
            experience,
            td_errors_loss_fn=element_wise_huber_loss,
            gamma=1.0,
            reward_scale_factor=1.0,
            weights=None):
    """Computes loss for DQN training.

    Args:
      experience: A batch of experience data in the form of a `Trajectory`. The
        structure of `experience` must match that of `self.policy.step_spec`.
        All tensors in `experience` must be shaped `[batch, time, ...]` where
        `time` must be equal to `self.train_sequence_length` if that property is
        not `None`.
      td_errors_loss_fn: A function(td_targets, predictions) to compute the
        element wise loss.
      gamma: Discount for future rewards.
      reward_scale_factor: Multiplicative factor to scale rewards.
      weights: Optional scalar or elementwise (per-batch-entry) importance
        weights.  The output td_loss will be scaled by these weights, and the
        final scalar loss is the mean of these values.

    Returns:
      loss: An instance of `DqnLossInfo`.
    Raises:
      ValueError:
        if the number of actions is greater than 1.
    """
    # Check that `experience` includes two outer dimensions [B, T, ...]. This
    # method requires `experience` to include the time dimension.
    self._check_trajectory_dimensions(experience)

    if self._n_step_update == 1:
      time_steps, actions, next_time_steps = self._experience_to_transitions(
          experience)
    else:
      # To compute n-step returns, we need the first time steps, the first
      # actions, and the last time steps. Therefore we extract the first and
      # last transitions from our Trajectory.
      first_two_steps = tf.nest.map_structure(lambda x: x[:, :2], experience)
      last_two_steps = tf.nest.map_structure(lambda x: x[:, -2:], experience)
      time_steps, actions, _ = self._experience_to_transitions(first_two_steps)
      _, _, next_time_steps = self._experience_to_transitions(last_two_steps)

    with tf.name_scope('loss'):
      actions = tf.nest.flatten(actions)[0]
      q_values, _ = self._q_network(time_steps.observation,
                                    time_steps.step_type)

      # Handle action_spec.shape=(), and shape=(1,) by using the
      # multi_dim_actions param.
      multi_dim_actions = tf.nest.flatten(self._action_spec)[0].shape.ndims > 0
      q_values = common.index_with_actions(
          q_values,
          tf.cast(actions, dtype=tf.int32),
          multi_dim_actions=multi_dim_actions)

      next_q_values = self._compute_next_q_values(next_time_steps)

      if self._n_step_update == 1:
        # Special case for n = 1 to avoid a loss of performance.
        td_targets = compute_td_targets(
            next_q_values,
            rewards=reward_scale_factor * next_time_steps.reward,
            discounts=gamma * next_time_steps.discount)
      else:
        # When computing discounted return, we need to throw out the last time
        # index of both reward and discount, which are filled with dummy values
        # to match the dimensions of the observation.
        # TODO(b/131557265): Replace value_ops.discounted_return with a method
        # that only computes the single value needed.
        n_step_return = value_ops.discounted_return(
            rewards=reward_scale_factor * experience.reward[:, :-1],
            discounts=gamma * experience.discount[:, :-1],
            final_value=next_q_values,
            time_major=False)

        # We only need the first value within the time dimension which
        # corresponds to the full final return. The remaining values are only
        # partial returns.
        td_targets = n_step_return[:, 0]

      valid_mask = tf.cast(~time_steps.is_last(), tf.float32)
      td_error = valid_mask * (td_targets - q_values)

      td_loss = valid_mask * td_errors_loss_fn(td_targets, q_values)

      if nest_utils.is_batched_nested_tensors(
          time_steps, self.time_step_spec, num_outer_dims=2):
        # Do a sum over the time dimension.
        td_loss = tf.reduce_sum(input_tensor=td_loss, axis=1)

      if weights is not None:
        td_loss *= weights

      # Average across the elements of the batch.
      # Note: We use an element wise loss above to ensure each element is always
      #   weighted by 1/N where N is the batch size, even when some of the
      #   weights are zero due to boundary transitions. Weighting by 1/K where K
      #   is the actual number of non-zero weight would artificially increase
      #   their contribution in the loss. Think about what would happen as
      #   the number of boundary samples increases.
      loss = tf.reduce_mean(input_tensor=td_loss)

      with tf.name_scope('Losses/'):
        tf.compat.v1.summary.scalar(
            'loss_' + self.name, loss, collections=['train_' + self.name])

      if self._summarize_grads_and_vars:
        with tf.name_scope('Variables/'):
          for var in self._q_network.trainable_weights:
            tf.compat.v2.summary.histogram(
                name=var.name.replace(':', '_'),
                data=var,
                step=self.train_step_counter)

      if self._debug_summaries:
        diff_q_values = q_values - next_q_values
        common.generate_tensor_summaries('td_error', td_error,
                                         self.train_step_counter)
        common.generate_tensor_summaries('td_loss', td_loss,
                                         self.train_step_counter)
        common.generate_tensor_summaries('q_values', q_values,
                                         self.train_step_counter)
        common.generate_tensor_summaries('next_q_values', next_q_values,
                                         self.train_step_counter)
        common.generate_tensor_summaries('diff_q_values', diff_q_values,
                                         self.train_step_counter)

      return tf_agent.LossInfo(loss,
                               DqnLossInfo(td_loss=td_loss, td_error=td_error))
Example #14
    def _loss(self,
              experience,
              td_errors_loss_fn=common.element_wise_huber_loss,
              gamma=1.0,
              reward_scale_factor=1.0,
              weights=None):
        self._check_trajectory_dimensions(experience)

        if self._n_step_update == 1:
            time_steps, actions, next_time_steps = self._experience_to_transitions(
                experience)
        else:
            first_two_steps = tf.nest.map_structure(lambda x: x[:, :2],
                                                    experience)
            last_two_steps = tf.nest.map_structure(lambda x: x[:, -2:],
                                                   experience)
            time_steps, actions, _ = self._experience_to_transitions(
                first_two_steps)
            _, _, next_time_steps = self._experience_to_transitions(
                last_two_steps)

        with tf.name_scope("loss"):
            q_values = self._compute_q_values(time_steps, actions)
            next_q_values = self._compute_next_q_values(next_time_steps)

            if self._n_step_update == 1:
                td_targets = compute_td_targets(
                    next_q_values,
                    rewards=reward_scale_factor * next_time_steps.reward,
                    discounts=gamma * next_time_steps.discount)
            else:
                rewards = reward_scale_factor * experience.reward[:, :-1]
                discounts = gamma * experience.discount[:, :-1]
                td_targets = value_ops.discounted_return(
                    rewards=rewards,
                    discounts=discounts,
                    final_value=next_q_values,
                    time_major=False,
                    provide_all_returns=False)
            valid_mask = tf.cast(~time_steps.is_last(), tf.float32)
            td_error = valid_mask * (td_targets - q_values)
            td_loss = valid_mask * td_errors_loss_fn(td_targets, q_values)

            if nest_utils.is_batched_nested_tensors(time_steps,
                                                    self.time_step_spec,
                                                    num_outer_dims=2):
                td_loss = tf.reduce_sum(input_tensor=td_loss, axis=1)

            if weights is not None:
                td_loss *= weights

            loss = tf.reduce_mean(input_tensor=td_loss)

            if self._q_network.losses:
                loss = loss + tf.reduce_mean(self._q_network.losses)

            with tf.name_scope("Losses/"):
                tf.compat.v2.summary.scalar(name="loss",
                                            data=loss,
                                            step=self.train_step_counter)

            if self._summarize_grads_and_vars:
                with tf.name_scope("Variables/"):
                    for var in self._q_network.trainable_weights:
                        tf.compat.v2.summary.histogram(
                            name=var.name.replace(":", "_"),
                            data=var,
                            step=self.train_step_counter)

            if self._debug_summaries:
                diff_q_values = q_values - next_q_values
                common.generate_tensor_summaries("td_error", td_error,
                                                 self.train_step_counter)
                common.generate_tensor_summaries("td_loss", td_loss,
                                                 self.train_step_counter)
                common.generate_tensor_summaries("q_values", q_values,
                                                 self.train_step_counter)
                common.generate_tensor_summaries("next_q_values",
                                                 next_q_values,
                                                 self.train_step_counter)
                common.generate_tensor_summaries("diff_q_values",
                                                 diff_q_values,
                                                 self.train_step_counter)
            return tf_agent.LossInfo(
                loss, DqnLossInfo(td_loss=td_loss, td_error=td_error))