Example #1
    def actor_loss(self, time_steps):
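        """Computes a DPG-style actor loss from the mean expected Q-values."""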
        with tf.compat.v1.name_scope('actor_loss'):
            actions, _ = self._actor_network(time_steps.observation,
                                             time_steps.step_type)
            with tf.GradientTape(watch_accessed_variables=False) as tape:
                tape.watch(actions)
                avg_expected_q_values = self._get_state_values(
                    time_steps, actions, aggregate='mean')
                actions = tf.nest.flatten(actions)
            dqdas = tape.gradient([avg_expected_q_values], actions)

            actor_losses = []
            for dqda, action in zip(dqdas, actions):  # in practice there is only one action tensor here
                loss = common.element_wise_squared_loss(
                    tf.stop_gradient(dqda + action), action)
                loss = tf.reduce_sum(input_tensor=loss, axis=1)
                loss = tf.reduce_mean(input_tensor=loss)
                actor_losses.append(loss)

            actor_loss = tf.add_n(actor_losses)

            with tf.compat.v1.name_scope('Losses/'):
                tf.compat.v2.summary.scalar(name='actor_loss',
                                            data=actor_loss,
                                            step=self.train_step_counter)

        return actor_loss
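
Before the remaining variants, it helps to spell out the trick every one of them uses. Writing \pi_\theta for the actor, Q for the critic, and J for the expected return (notation introduced here for explanation, not taken from the code), the deterministic policy gradient is

\nabla_\theta J \approx \mathbb{E}\!\left[\left.\frac{\partial Q(s, a)}{\partial a}\right|_{a = \pi_\theta(s)} \frac{\partial \pi_\theta(s)}{\partial \theta}\right]

dqda in the code is the inner derivative. Because the first argument of element_wise_squared_loss(tf.stop_gradient(dqda + action), action) is treated as a fixed regression target, the gradient of that loss with respect to action is -dqda up to a positive constant, so minimizing it with respect to the actor parameters performs gradient ascent on the critic's Q-value.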
Example #2
  def actor_loss(self, time_steps):
    """Computes the actor_loss for DDPG training.

    Args:
      time_steps: A batch of timesteps.
      # TODO(kbanoop): Add an action norm regularizer.
    Returns:
      actor_loss: A scalar actor loss.
    """
    with tf.name_scope('actor_loss'):
      actions, _ = self._actor_network(time_steps.observation,
                                       time_steps.step_type)
      q_values, _ = self._critic_network(time_steps.observation, actions,
                                         time_steps.step_type)
      actions = nest.flatten(actions)
      dqdas = tf.gradients([q_values], actions)
      actor_losses = []
      for dqda, action in zip(dqdas, actions):
        if self._dqda_clipping is not None:
          dqda = tf.clip_by_value(dqda, -1 * self._dqda_clipping,
                                  self._dqda_clipping)
        loss = common_utils.element_wise_squared_loss(
            tf.stop_gradient(dqda + action), action)
        if nest_utils.is_batched_nested_tensors(
            time_steps, self.time_step_spec(), num_outer_dims=2):
          # Sum over the time dimension.
          loss = tf.reduce_sum(loss, axis=1)
        loss = tf.reduce_mean(loss)
        actor_losses.append(loss)

      actor_loss = tf.add_n(actor_losses)
      with tf.name_scope('Losses/'):
        tf.contrib.summary.scalar('actor_loss', actor_loss)

    return actor_loss
Example #3
    def actor_loss(self,
                   time_steps: ts.TimeStep,
                   weights: Optional[types.Tensor] = None,
                   training: bool = False) -> types.Tensor:
        """Computes the actor_loss for DDPG training.

    Args:
      time_steps: A batch of timesteps.
      weights: Optional scalar or element-wise (per-batch-entry) importance
        weights.
      training: Whether this loss is being used for training.
      # TODO(b/124383618): Add an action norm regularizer.
    Returns:
      actor_loss: A scalar actor loss.
    """
        with tf.name_scope('actor_loss'):
            actions, _ = self._actor_network(time_steps.observation,
                                             time_steps.step_type,
                                             training=training)
            with tf.GradientTape(watch_accessed_variables=False) as tape:
                tape.watch(actions)
                q_values, _ = self._critic_network(
                    (time_steps.observation, actions),
                    time_steps.step_type,
                    training=False)
                actions = tf.nest.flatten(actions)

            dqdas = tape.gradient([q_values], actions)

            actor_losses = []
            for dqda, action in zip(dqdas, actions):
                if self._dqda_clipping is not None:
                    dqda = tf.clip_by_value(dqda, -1 * self._dqda_clipping,
                                            self._dqda_clipping)
                loss = common.element_wise_squared_loss(
                    tf.stop_gradient(dqda + action), action)
                if nest_utils.is_batched_nested_tensors(time_steps,
                                                        self.time_step_spec,
                                                        num_outer_dims=2):
                    # Sum over the time dimension.
                    loss = tf.reduce_sum(loss, axis=1)
                if weights is not None:
                    loss *= weights
                loss = tf.reduce_mean(loss)
                actor_losses.append(loss)

            actor_loss = tf.add_n(actor_losses)

            with tf.name_scope('Losses/'):
                tf.compat.v2.summary.scalar(name='actor_loss',
                                            data=actor_loss,
                                            step=self.train_step_counter)

        return actor_loss
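
The same mechanics can be checked with a few lines of plain TensorFlow. The sketch below is an illustration under stated assumptions, not TF-Agents code: tf.square stands in for common.element_wise_squared_loss, and the one-parameter actor and quadratic critic exist only for this example.

import tensorflow as tf

# Minimal, self-contained sketch of the stop_gradient surrogate used above.
theta = tf.Variable(0.25)                          # toy actor parameter

def actor(obs):                                    # deterministic policy a = theta * s
    return theta * obs

def critic(obs, act):                              # toy critic: Q(s, a) = -(a - 1)^2
    return -tf.square(act - 1.0)

obs = tf.constant(2.0)
with tf.GradientTape() as outer:                   # stands in for the agent's training tape
    action = actor(obs)
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(action)
        q = critic(obs, action)
    dqda = tape.gradient(q, action)                # dQ/da at a = actor(obs)
    loss = tf.square(tf.stop_gradient(dqda + action) - action)

# d(loss)/d(theta) = -2 * dqda * d(action)/d(theta): minimizing the surrogate
# loss moves theta in the direction that increases Q.
print(outer.gradient(loss, theta))                 # -4.0
print(dqda * obs)                                  # 2.0 == dQ/d(theta) via the chain rule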
Example #4
    def actor_loss(self, time_steps, weights=None):
        """Computes the actor_loss for TD3 training.

    Args:
      time_steps: A batch of timesteps.
      weights: Optional scalar or element-wise (per-batch-entry) importance
        weights.

    Returns:
      actor_loss: A scalar actor loss.
    """
        with tf.name_scope('actor_loss'):
            actions, _ = self._actor_network(time_steps.observation,
                                             time_steps.step_type)

            critic_network_input = (time_steps.observation, actions)
            q_values, _ = self._critic_network_1(critic_network_input,
                                                 time_steps.step_type)

            actions = nest.flatten(actions)
            dqdas = tf.gradients([q_values], actions)
            actor_losses = []
            for dqda, action in zip(dqdas, actions):
                if self._dqda_clipping is not None:
                    # pylint: disable=invalid-unary-operand-type
                    dqda = tf.clip_by_value(dqda, -self._dqda_clipping,
                                            self._dqda_clipping)
                loss = common_utils.element_wise_squared_loss(
                    tf.stop_gradient(dqda + action), action)
                if nest_utils.is_batched_nested_tensors(time_steps,
                                                        self.time_step_spec(),
                                                        num_outer_dims=2):
                    # Sum over the time dimension.
                    loss = tf.reduce_sum(loss, axis=1)

                if weights is not None:
                    loss *= weights

                loss = tf.reduce_mean(loss)
                actor_losses.append(loss)

            # TODO(kbanoop): Add an action norm regularizer.
            return tf.add_n(actor_losses)
Example #5
    def actor_loss(self, time_steps, weights=None):
        """Computes the actor_loss for DDPG training.

    Args:
      time_steps: A batch of timesteps.
      weights: Optional scalar or element-wise (per-batch-entry) importance
        weights.
      # TODO(kbanoop): Add an action norm regularizer.
    Returns:
      actor_loss: A scalar actor loss.
    """
        with tf.name_scope('actor_loss'):
            actions, _ = self._actor_network(time_steps.observation,
                                             time_steps.step_type)
            critic_net_input = (time_steps.observation, actions)
            q_values, _ = self._critic_network(critic_net_input,
                                               time_steps.step_type)
            actions = tf.nest.flatten(actions)
            dqdas = tf.gradients(ys=[q_values], xs=actions)
            actor_losses = []
            for dqda, action in zip(dqdas, actions):
                if self._dqda_clipping is not None:
                    dqda = tf.clip_by_value(dqda, -1 * self._dqda_clipping,
                                            self._dqda_clipping)
                loss = common_utils.element_wise_squared_loss(
                    tf.stop_gradient(dqda + action), action)
                if nest_utils.is_batched_nested_tensors(time_steps,
                                                        self.time_step_spec,
                                                        num_outer_dims=2):
                    # Sum over the time dimension.
                    loss = tf.reduce_sum(input_tensor=loss, axis=1)
                if weights is not None:
                    loss *= weights
                loss = tf.reduce_mean(input_tensor=loss)
                actor_losses.append(loss)

            actor_loss = tf.add_n(actor_losses)
            with tf.name_scope('Losses/'):
                tf.compat.v2.summary.scalar(name='actor_loss',
                                            data=actor_loss,
                                            step=self.train_step_counter)

        return actor_loss
Example #6
    def actor_loss(self,
                   time_steps,
                   total_actions,
                   index,
                   weights=None,
                   training=False):
        """Computes the actor_loss for DDPG training.

    Args:
      time_steps: A batch of timesteps.
      total_actions: Actions from all agents; the entry at `index` is replaced
        with this agent's current actor output before querying the critic.
      index: Index of the agent whose actor network is being trained.
      weights: Optional scalar or element-wise (per-batch-entry) importance
        weights.
      training: Whether this loss is being used for training.
      # TODO(b/124383618): Add an action norm regularizer.
    Returns:
      actor_loss: A scalar actor loss.
    """
        with tf.name_scope('actor_loss'):
            action_current_agent, _ = self._actor_network(
                time_steps.observation[index],
                time_steps.step_type,
                training=training)
            total_actions = list(total_actions)
            main_actions = []
            for i, action in enumerate(total_actions):
                if i == index:
                    main_actions.append(action_current_agent)
                else:
                    main_actions.append(action)
            main_actions = tuple(main_actions)
            with tf.GradientTape(watch_accessed_variables=False) as tape:
                tape.watch(main_actions)
                q_values, _ = self._critic_network(
                    (time_steps.observation, main_actions),
                    time_steps.step_type,
                    training=training)
                main_actions = tf.nest.flatten(main_actions[index])

            dqdas = tape.gradient([q_values], main_actions)

            actor_losses = []
            for dqda, action in zip(dqdas, main_actions):
                if self._dqda_clipping is not None:
                    dqda = tf.clip_by_value(dqda, -1 * self._dqda_clipping,
                                            self._dqda_clipping)
                loss = common.element_wise_squared_loss(
                    tf.stop_gradient(dqda + action), action)
                if nest_utils.is_batched_nested_tensors(time_steps,
                                                        self.time_step_spec,
                                                        num_outer_dims=2):
                    # Sum over the time dimension.
                    loss = tf.reduce_sum(loss, axis=1)
                if weights is not None:
                    loss *= weights
                loss = tf.reduce_mean(loss)
                actor_losses.append(loss)

            actor_loss = tf.add_n(actor_losses)

            with tf.name_scope('Losses/'):
                tf.compat.v2.summary.scalar(name='actor_loss',
                                            data=actor_loss,
                                            step=self.train_step_counter)

        return actor_loss

Example #7
    def actor_loss(self, time_steps, discrete_actions, weights=None):
        """Computes the actor_loss for TD3 training.

    Args:
      time_steps: A batch of timesteps.
      discrete_actions: A tensor of discrete action arguments.
      weights: Optional scalar or element-wise (per-batch-entry) importance
        weights.
      # TODO(b/124383618): Add an action norm regularizer.
    Returns:
      actor_loss: A scalar actor loss.
    """
        with tf.name_scope('actor_loss'):
            with tf.GradientTape(watch_accessed_variables=True) as tape:
                q_values, continuous_actions = self._compute_q_values(
                    self._q_network_1, time_steps, discrete_actions)
                continuous_actions = tf.nest.flatten(continuous_actions)
                tape.watch(continuous_actions)

            dqdas = tape.gradient([q_values], continuous_actions)
            actor_losses = []
            for dqda, cont_action in zip(dqdas, continuous_actions):
                if self._dqda_clipping is not None:
                    dqda = tf.clip_by_value(dqda, -1 * self._dqda_clipping,
                                            self._dqda_clipping)
                # mask irrelevant continuous actions for each discrete action
                multi_dim_actions = tf.nest.flatten(
                    self._action_spec.q_network)[0].shape.ndims > 0
                if multi_dim_actions:
                    raise NotImplementedError(
                        "multidimensional action space is not supported")
                discrete_actions_shape = tf.shape(discrete_actions)
                cont_action_mask = tf.cast(
                    tf.gather_nd(self._action_params_mask,
                                 tf.reshape(discrete_actions, [-1, 1])),
                    tf.float32)
                cont_action_mask = tf.reshape(
                    cont_action_mask,
                    tf.concat([discrete_actions_shape, [-1]], axis=-1))
                loss = common.element_wise_squared_loss(
                    tf.stop_gradient(dqda + cont_action), cont_action)

                tf.nest.assert_same_structure(loss, cont_action_mask)
                loss = loss * cont_action_mask
                if nest_utils.is_batched_nested_tensors(time_steps,
                                                        self.time_step_spec,
                                                        num_outer_dims=2):
                    # Sum over the time dimension.
                    loss = tf.reduce_sum(loss, axis=1)
                if weights is not None:
                    loss *= weights
                loss = tf.reduce_mean(loss)
                actor_losses.append(loss)

            actor_loss = tf.add_n(actor_losses)

            with tf.name_scope('Losses/'):
                tf.compat.v2.summary.scalar(name='actor_loss',
                                            data=actor_loss,
                                            step=self.train_step_counter)

        return actor_loss
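
The masking step in the last example is easier to see with concrete shapes. The sketch below is self-contained and hypothetical: the 2x3 table plays the role of self._action_params_mask, mapping each discrete action to the continuous parameters it owns.

import tensorflow as tf

# Hypothetical [num_discrete_actions, num_continuous_params] ownership table.
action_params_mask = tf.constant([[1, 1, 0],   # discrete action 0 uses params 0 and 1
                                  [0, 0, 1]])  # discrete action 1 uses param 2
discrete_actions = tf.constant([0, 1, 0])      # a batch of three discrete actions

# Look up the per-example mask, as in Example #7.
cont_action_mask = tf.cast(
    tf.gather_nd(action_params_mask, tf.reshape(discrete_actions, [-1, 1])),
    tf.float32)
cont_action_mask = tf.reshape(
    cont_action_mask,
    tf.concat([tf.shape(discrete_actions), [-1]], axis=-1))
print(cont_action_mask)  # [[1. 1. 0.]
                         #  [0. 0. 1.]
                         #  [1. 1. 0.]]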