Example #1
def check_no_shared_variables(network_1, network_2):
  """Checks that there are no shared trainable variables in the two networks.

  Args:
    network_1: A network.Network.
    network_2: A network.Network.

  Raises:
    ValueError: if there are any common trainable variables.
    ValueError: if one of the networks has not yet been built
      (e.g. user must call `create_variables`).
  """
  variables_1 = object_identity.ObjectIdentitySet(network_1.trainable_variables)
  variables_2 = object_identity.ObjectIdentitySet(network_2.trainable_variables)
  shared_variables = variables_1 & variables_2
  if shared_variables:
    raise ValueError(
        'After making a copy of network \'{}\' to create a target '
        'network \'{}\', the target network shares weights with '
        'the original network.  This is not allowed.  If '
        'you want to explicitly share weights with the target network, or '
        'if your input network shares weights with others, please '
        'provide a target network which explicitly, selectively, shares '
        'layers/weights with the input network.  If you are not intending to '
        'share weights make sure all the weights are created inside the Network'
        ' since a copy will be created by creating a new Network with the same '
        'args but a new name. Shared variables found: '
        '\'{}\'.'.format(
            network_1.name, network_2.name,
            [x.name for x in shared_variables]))
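A minimal usage sketch for the check above, substituting plain Keras models for tf_agents' network.Network (the function only touches .trainable_variables and .name, and assumes object_identity is imported as in the example); model names and shapes are illustrative:

import tensorflow as tf

# Two independently built models share nothing, so the check passes.
net_1 = tf.keras.Sequential([tf.keras.layers.Dense(4)], name='online')
net_2 = tf.keras.Sequential([tf.keras.layers.Dense(4)], name='target')
net_1.build((None, 3))
net_2.build((None, 3))
check_no_shared_variables(net_1, net_2)

# Reusing a layer aliases its weights, which the check rejects.
aliased = tf.keras.Sequential([net_1.layers[0]], name='aliased')
aliased.build((None, 3))
# check_no_shared_variables(net_1, aliased)  # would raise ValueError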
Example #2
  def testDifference(self):

    class Element(object):
      pass

    a = Element()
    b = Element()
    c = Element()
    set1 = object_identity.ObjectIdentitySet([a, b])
    set2 = object_identity.ObjectIdentitySet([b, c])
    diff_set = set1.difference(set2)
    self.assertIn(a, diff_set)
    self.assertNotIn(b, diff_set)
    self.assertNotIn(c, diff_set)
Example #3
  def testDiscard(self):
    a = object()
    b = object()
    set1 = object_identity.ObjectIdentitySet([a, b])
    set1.discard(a)
    self.assertIn(b, set1)
    self.assertNotIn(a, set1)
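These tests exercise identity (is) rather than equality (==) membership. A short self-contained sketch of the difference, assuming the TensorFlow-internal module path tensorflow.python.util.object_identity for the import:

from tensorflow.python.util import object_identity


class Box(object):
  """Two boxes with equal payloads compare equal but are distinct objects."""

  def __init__(self, payload):
    self.payload = payload

  def __eq__(self, other):
    return self.payload == other.payload

  def __hash__(self):
    return hash(self.payload)


x, y = Box(1), Box(1)
assert x == y                  # equal by value
assert x in {y}                # a regular set treats them as the same element
id_set = object_identity.ObjectIdentitySet([y])
assert x not in id_set         # ObjectIdentitySet keys on object identity
assert y in id_set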
Example #4
    def _train(self, experience, weights=None):
        # TODO(b/120034503): Move the conversion to transitions to the base class.
        squeeze_time_dim = not self._actor_network.state_spec
        time_steps, policy_steps, next_time_steps = (
            trajectory.experience_to_transitions(experience, squeeze_time_dim))
        actions = policy_steps.action

        trainable_critic_variables = list(
            object_identity.ObjectIdentitySet(
                self._critic_network_1.trainable_variables +
                self._critic_network_2.trainable_variables))
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            assert trainable_critic_variables, (
                'No trainable critic variables to '
                'optimize.')
            tape.watch(trainable_critic_variables)
            critic_loss = self.critic_loss(time_steps,
                                           actions,
                                           next_time_steps,
                                           weights=weights,
                                           training=True)
        tf.debugging.check_numerics(critic_loss, 'Critic loss is inf or nan.')
        critic_grads = tape.gradient(critic_loss, trainable_critic_variables)
        self._apply_gradients(critic_grads, trainable_critic_variables,
                              self._critic_optimizer)

        trainable_actor_variables = self._actor_network.trainable_variables
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            assert trainable_actor_variables, (
                'No trainable actor variables to '
                'optimize.')
            tape.watch(trainable_actor_variables)
            actor_loss = self.actor_loss(time_steps,
                                         weights=weights,
                                         training=True)
        tf.debugging.check_numerics(actor_loss, 'Actor loss is inf or nan.')

        # We only optimize the actor every actor_update_period training steps.
        def optimize_actor():
            actor_grads = tape.gradient(actor_loss, trainable_actor_variables)
            return self._apply_gradients(actor_grads,
                                         trainable_actor_variables,
                                         self._actor_optimizer)

        remainder = tf.math.mod(self.train_step_counter,
                                self._actor_update_period)
        tf.cond(pred=tf.equal(remainder, 0),
                true_fn=optimize_actor,
                false_fn=tf.no_op)

        self.train_step_counter.assign_add(1)
        self._update_target()

        # TODO(b/124382360): Compute per element TD loss and return in loss_info.
        total_loss = actor_loss + critic_loss

        return tf_agent.LossInfo(total_loss, Td3Info(actor_loss, critic_loss))
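The delayed actor update above boils down to a small gating pattern. A standalone sketch with an illustrative counter and period (the real optimize_actor applies gradients and returns the resulting op):

import tensorflow as tf

train_step_counter = tf.Variable(0, dtype=tf.int64)
actor_update_period = 2  # illustrative value


def optimize_actor():
  # Stand-in for the real gradient step.
  return tf.constant(0.0)


def skip_actor():
  return tf.constant(0.0)


# The actor branch runs only when the counter is a multiple of the period;
# both branches must return matching structures for tf.cond.
remainder = tf.math.mod(train_step_counter, actor_update_period)
tf.cond(pred=tf.equal(remainder, 0), true_fn=optimize_actor, false_fn=skip_actor)
train_step_counter.assign_add(1)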
Example #5
def deduped_network_variables(network, *args):
  """Returns a list of variables in net1 that are not in any other nets.

  Args:
    network: A Keras network.
    *args: other networks to check for duplicate variables.
  """
  other_vars = object_identity.ObjectIdentitySet(
      [v for n in args for v in n.variables])  # pylint:disable=g-complex-comprehension
  return [v for v in network.variables if v not in other_vars]
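A usage sketch, with Keras models standing in for the networks (names and shapes are illustrative):

import tensorflow as tf

encoder = tf.keras.Sequential([tf.keras.layers.Dense(8)], name='encoder')
encoder.build((None, 4))
# The actor reuses the encoder, so their variable sets overlap.
actor = tf.keras.Sequential([encoder, tf.keras.layers.Dense(2)], name='actor')
actor.build((None, 4))

# Only the actor's own head variables remain; the shared encoder variables are dropped.
actor_only_vars = deduped_network_variables(actor, encoder)
assert len(actor_only_vars) == 2  # the Dense(2) kernel and bias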
Example #6
  @property
  def trainable_variables(self):
      """Override trainable_variables property to remove encoder_variables."""
      if self._image_encoder:
          encoder_variables = object_identity.ObjectIdentitySet(
              self._image_encoder.trainable_variables)
          return [
              v for v in super(Actor, self).trainable_variables
              if v not in encoder_variables
          ]
      else:
          return super(Actor, self).trainable_variables
Example #7
def extract_shared_variables(variables_1, variables_2):
    """Separates shared variables from the given collections.

  Args:
    variables_1: An iterable of Variables
    variables_2: An iterable of Variables

  Returns:
    A Tuple of ObjectIdentitySets described by the set operations

    ```
    (variables_1 - variables_2,
     variables_2 - variables_1,
     variables_1 & variables_2)
    ```
  """
    var_refs1 = object_identity.ObjectIdentitySet(variables_1)
    var_refs2 = object_identity.ObjectIdentitySet(variables_2)

    shared_vars = var_refs1.intersection(var_refs2)
    return (var_refs1.difference(shared_vars),
            var_refs2.difference(shared_vars), shared_vars)
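A quick usage sketch with plain tf.Variables (names are illustrative; any objects work, since membership is by identity):

import tensorflow as tf

a = tf.Variable(1.0, name='a')
b = tf.Variable(2.0, name='b')
c = tf.Variable(3.0, name='c')

only_1, only_2, shared = extract_shared_variables([a, b], [b, c])
assert next(iter(shared)) is b       # b is the only shared variable
assert len(only_1) == 1 and len(only_2) == 1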
Example #8
  @property
  def trainable_variables(self):
      tvars = super(DiagGuassianPolicy, self).trainable_variables
      if self.encoder is None:
          return tvars
      else:
          # Remove the encoder conv2d variables (the policy shouldn't update the
          # conv2d vars). Note that calling stop_gradient on the forward pass isn't
          # enough to ensure this, because the conv2d vars are shared with the
          # critic and can therefore get updated when backpropagating through the
          # critic to minimize the actor loss.
          encoder_variables = object_identity.ObjectIdentitySet(
              self.encoder.conv_stack.trainable_variables)
          return [v for v in tvars if v not in encoder_variables]
Example #9
def _filter_empty_layer_containers(layer_list):
    """Remove empty layer containers."""
    existing = object_identity.ObjectIdentitySet()
    to_visit = layer_list[::-1]
    while to_visit:
        obj = to_visit.pop()
        if obj in existing:
            continue
        existing.add(obj)
        if _is_layer(obj):
            yield obj
        else:
            sub_layers = getattr(obj, "layers", None) or []

            # Trackable data structures will not show up in ".layers" lists, but
            # the layers they contain will.
            to_visit.extend(sub_layers[::-1])
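A sketch of how the generator behaves, supplying a stand-in for the _is_layer helper (not shown above; assumed here to be a plain isinstance check) and an illustrative container class, with everything living in one script:

import tensorflow as tf


def _is_layer(obj):
  # Stand-in for the helper referenced above.
  return isinstance(obj, tf.keras.layers.Layer)


class Container(object):
  """A non-layer object that merely holds layers."""

  def __init__(self, layers):
    self.layers = layers


dense_1 = tf.keras.layers.Dense(4)
dense_2 = tf.keras.layers.Dense(2)
nested = [dense_1, Container([dense_2, dense_1]), Container([])]
flat = list(_filter_empty_layer_containers(nested))
assert flat == [dense_1, dense_2]  # duplicates and the empty container are dropped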
Example #10
    def _train(self, experience, weights):
        """Returns a train op to update the agent's networks.

    This method trains with the provided batched experience.

    Args:
      experience: A time-stacked trajectory object.
      weights: Optional scalar or elementwise (per-batch-entry) importance
        weights.

    Returns:
      A train_op.

    Raises:
      ValueError: If optimizers are None and no default value was provided to
        the constructor.
    """
        squeeze_time_dim = not self._critic_network_1.state_spec
        time_steps, policy_steps, next_time_steps = (
            trajectory.experience_to_transitions(experience, squeeze_time_dim))
        actions = policy_steps.action

        trainable_critic_variables = list(
            object_identity.ObjectIdentitySet(
                self._critic_network_1.trainable_variables +
                self._critic_network_2.trainable_variables))

        with tf.GradientTape(watch_accessed_variables=False) as tape:
            assert trainable_critic_variables, (
                'No trainable critic variables to '
                'optimize.')
            tape.watch(trainable_critic_variables)
            critic_loss = self._critic_loss_weight * self.critic_loss(
                time_steps,
                actions,
                next_time_steps,
                td_errors_loss_fn=self._td_errors_loss_fn,
                gamma=self._gamma,
                reward_scale_factor=self._reward_scale_factor,
                weights=weights,
                training=True)

        tf.debugging.check_numerics(critic_loss, 'Critic loss is inf or nan.')
        critic_grads = tape.gradient(critic_loss, trainable_critic_variables)
        self._apply_gradients(critic_grads, trainable_critic_variables,
                              self._critic_optimizer)

        critic_no_entropy_loss = None
        if self._critic_network_no_entropy_1 is not None:
            trainable_critic_no_entropy_variables = list(
                object_identity.ObjectIdentitySet(
                    self._critic_network_no_entropy_1.trainable_variables +
                    self._critic_network_no_entropy_2.trainable_variables))
            with tf.GradientTape(watch_accessed_variables=False) as tape:
                assert trainable_critic_no_entropy_variables, (
                    'No trainable critic_no_entropy variables to optimize.')
                tape.watch(trainable_critic_no_entropy_variables)
                critic_no_entropy_loss = self._critic_loss_weight * self.critic_no_entropy_loss(
                    time_steps,
                    actions,
                    next_time_steps,
                    td_errors_loss_fn=self._td_errors_loss_fn,
                    gamma=self._gamma,
                    reward_scale_factor=self._reward_scale_factor,
                    weights=weights,
                    training=True)

            tf.debugging.check_numerics(
                critic_no_entropy_loss,
                'Critic (without entropy) loss is inf or nan.')
            critic_no_entropy_grads = tape.gradient(
                critic_no_entropy_loss, trainable_critic_no_entropy_variables)
            self._apply_gradients(critic_no_entropy_grads,
                                  trainable_critic_no_entropy_variables,
                                  self._critic_no_entropy_optimizer)

        trainable_actor_variables = self._actor_network.trainable_variables
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            assert trainable_actor_variables, (
                'No trainable actor variables to '
                'optimize.')
            tape.watch(trainable_actor_variables)
            actor_loss = self._actor_loss_weight * self.actor_loss(
                time_steps, weights=weights)
        tf.debugging.check_numerics(actor_loss, 'Actor loss is inf or nan.')
        actor_grads = tape.gradient(actor_loss, trainable_actor_variables)
        self._apply_gradients(actor_grads, trainable_actor_variables,
                              self._actor_optimizer)

        alpha_variable = [self._log_alpha]
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            assert alpha_variable, 'No alpha variable to optimize.'
            tape.watch(alpha_variable)
            alpha_loss = self._alpha_loss_weight * self.alpha_loss(
                time_steps, weights=weights)
        tf.debugging.check_numerics(alpha_loss, 'Alpha loss is inf or nan.')
        alpha_grads = tape.gradient(alpha_loss, alpha_variable)
        self._apply_gradients(alpha_grads, alpha_variable,
                              self._alpha_optimizer)

        with tf.name_scope('Losses'):
            tf.compat.v2.summary.scalar(name='critic_loss_' + self.name,
                                        data=critic_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='actor_loss_' + self.name,
                                        data=actor_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='alpha_loss_' + self.name,
                                        data=alpha_loss,
                                        step=self.train_step_counter)
            if critic_no_entropy_loss is not None:
                tf.compat.v2.summary.scalar(name='critic_no_entropy_loss_' +
                                            self.name,
                                            data=critic_no_entropy_loss,
                                            step=self.train_step_counter)

        self.train_step_counter.assign_add(1)
        self._update_target()

        total_loss = critic_loss + actor_loss + alpha_loss
        if critic_no_entropy_loss is not None:
            total_loss += critic_no_entropy_loss

        extra = SacLossInfo(critic_loss=critic_loss,
                            actor_loss=actor_loss,
                            alpha_loss=alpha_loss,
                            critic_no_entropy_loss=critic_no_entropy_loss)

        return tf_agent.LossInfo(loss=total_loss, extra=extra)
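The _apply_gradients helper used throughout these _train methods is not shown in the examples. A minimal sketch of what such a helper typically does (an assumption for illustration, not the TF-Agents implementation):

import tensorflow as tf


def _apply_gradients(gradients, variables, optimizer):
  # Pair each gradient with its variable and take one optimizer step.
  grads_and_vars = list(zip(gradients, variables))
  return optimizer.apply_gradients(grads_and_vars)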
Example #11
  def _train(self, experience, weights):
    """Returns a train op to update the agent's networks.

    This method trains with the provided batched experience.

    Args:
      experience: A time-stacked trajectory object.
      weights: Optional scalar or elementwise (per-batch-entry) importance
        weights.

    Returns:
      A train_op.

    Raises:
      ValueError: If optimizers are None and no default value was provided to
        the constructor.
    """
    experience, expert_experience = experience

    if self._n_step is None:
      transition = self._as_transition(experience)
      time_steps, policy_steps, next_time_steps = transition
      future_time_steps = next_time_steps
    else:
      experience_1 = experience._replace(
          observation=experience.observation[:, :2],
          action=experience.action[:, :2],
          discount=experience.discount[:, :2],
          reward=experience.reward[:, :2],
          step_type=experience.step_type[:, :2],
          next_step_type=experience.next_step_type[:, :2],
          )
      obs_2 = tf.stack([experience.observation[:, 0],
                        experience.observation[:, -1],], axis=1)
      action_2 = tf.stack([experience.action[:, 0],
                           experience.action[:, -1],], axis=1)
      discount_2 = tf.stack([experience.discount[:, 0],
                             experience.discount[:, -1],], axis=1)
      step_type_2 = tf.stack([experience.step_type[:, 0],
                              experience.step_type[:, -1],], axis=1)
      next_step_type_2 = tf.stack([experience.next_step_type[:, 0],
                                   experience.next_step_type[:, -1],], axis=1)
      reward_2 = tf.stack([experience.reward[:, 0],
                           experience.reward[:, -1],], axis=1)
      experience_2 = experience._replace(
          observation=obs_2,
          action=action_2,
          discount=discount_2,
          step_type=step_type_2,
          next_step_type=next_step_type_2,
          reward=reward_2)
      time_steps, policy_steps, next_time_steps = self._as_transition(
          experience_1)
      _, _, future_time_steps = self._as_transition(experience_2)

    actions = policy_steps.action

    trainable_critic_variables = list(object_identity.ObjectIdentitySet(
        self._critic_network_1.trainable_variables +
        self._critic_network_2.trainable_variables))

    with tf.GradientTape(watch_accessed_variables=False) as tape:
      assert trainable_critic_variables, ('No trainable critic variables to '
                                          'optimize.')
      tape.watch(trainable_critic_variables)
      critic_loss = self._critic_loss_weight*self.critic_loss(
          time_steps,
          expert_experience,
          actions,
          next_time_steps,
          future_time_steps,
          td_errors_loss_fn=self._td_errors_loss_fn,
          gamma=self._gamma,
          reward_scale_factor=self._reward_scale_factor,
          weights=weights,
          training=True)

    tf.debugging.check_numerics(critic_loss, 'Critic loss is inf or nan.')
    critic_grads = tape.gradient(critic_loss, trainable_critic_variables)
    self._apply_gradients(critic_grads, trainable_critic_variables,
                          self._critic_optimizer)

    trainable_actor_variables = self._actor_network.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape:
      assert trainable_actor_variables, ('No trainable actor variables to '
                                         'optimize.')
      tape.watch(trainable_actor_variables)
      actor_loss = self._actor_loss_weight*self.actor_loss(
          time_steps, actions, weights=weights)
    tf.debugging.check_numerics(actor_loss, 'Actor loss is inf or nan.')
    actor_grads = tape.gradient(actor_loss, trainable_actor_variables)
    self._apply_gradients(actor_grads, trainable_actor_variables,
                          self._actor_optimizer)

    # Train the behavior policy
    if self._use_behavior_policy:
      trainable_behavior_variables = self._behavior_actor_network.trainable_variables
      with tf.GradientTape(watch_accessed_variables=False) as tape:
        assert trainable_behavior_variables, ('No trainable behavior variables '
                                              'to optimize.')
        tape.watch(trainable_behavior_variables)
        behavior_loss = self._actor_loss_weight*self.behavior_loss(
            time_steps, actions, weights=weights)
      tf.debugging.check_numerics(behavior_loss, 'Behavior loss is inf or nan.')
      behavior_grads = tape.gradient(behavior_loss,
                                     trainable_behavior_variables)
      self._apply_gradients(behavior_grads, trainable_behavior_variables,
                            self._actor_optimizer)
    else:
      behavior_loss = 0.0

    with tf.name_scope('Losses'):
      tf.compat.v2.summary.scalar(
          name='critic_loss', data=critic_loss, step=self.train_step_counter)
      tf.compat.v2.summary.scalar(
          name='actor_loss', data=actor_loss, step=self.train_step_counter)
      tf.compat.v2.summary.scalar(name='behavior_loss', data=behavior_loss,
                                  step=self.train_step_counter)

    self.train_step_counter.assign_add(1)
    self._update_target()

    total_loss = critic_loss + actor_loss

    extra = RceLossInfo(
        critic_loss=critic_loss, actor_loss=actor_loss)

    return tf_agent.LossInfo(loss=total_loss, extra=extra)
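The n-step branch above builds a two-step transition by pairing each trajectory's first and last steps. A small sketch of that slicing on a dummy [batch, time, feature] tensor (shapes are illustrative):

import tensorflow as tf

observation = tf.reshape(tf.range(24, dtype=tf.float32), (2, 4, 3))  # batch=2, time=4
obs_2 = tf.stack([observation[:, 0], observation[:, -1]], axis=1)
print(obs_2.shape)  # (2, 2, 3): only the first and last time steps remain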
Example #12
  def _train(self, experience, weights):
    # Get individual tensors from transitions.
    (time_steps, policy_steps_,
     next_time_steps) = trajectory.to_transition(experience)
    actions = policy_steps_.action

    batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]

    if self._debug_summaries:
      actions_list = tf.nest.flatten(actions)
      show_action_index = len(actions_list) != 1
      for i, single_action in enumerate(actions_list):
        action_name = ('actions_{}'.format(i)
                       if show_action_index else 'actions')
        tf.compat.v2.summary.histogram(
            name=action_name, data=single_action, step=self.train_step_counter)

    action_distribution_parameters = policy_steps_.info['dist_params']

    # Reconstruct per-timestep policy distribution from stored distribution
    #   parameters.
    old_actions_distribution = (
        distribution_spec.nested_distributions_from_specs(
            self._action_distribution_spec, action_distribution_parameters))

    # Compute log probability of actions taken during data collection, using the
    #   collect policy distribution.
    act_log_probs = common.log_probability(old_actions_distribution, actions,
                                           self._action_spec)

    valid_mask = ppo_utils.make_timestep_mask(
        next_time_steps, allow_partial_episodes=True)

    if weights is None:
      weights = valid_mask
    else:
      weights *= valid_mask

    if self._compute_value_and_advantage_in_train:
      value_state = self._collect_policy.get_initial_value_state(batch_size)
      value_preds, _ = self._collect_policy.apply_value_network(
          experience.observation,
          experience.step_type,
          value_state=value_state,
          training=False)
    else:
      value_preds = experience.policy_info['value_prediction']

    value_preds = tf.stop_gradient(value_preds)
    returns, normalized_advantages = self.compute_return_and_advantage(
        next_time_steps, value_preds)

    # Loss tensors across batches will be aggregated for summaries.
    policy_gradient_losses = []
    value_estimation_losses = []
    l2_regularization_losses = []
    entropy_regularization_losses = []
    kl_penalty_losses = []

    loss_info = None  # TODO(b/123627451): Remove.
    variables_to_train = list(
        object_identity.ObjectIdentitySet(self._actor_net.trainable_weights +
                                          self._value_net.trainable_weights))
    # For each epoch, create its own train op that depends on the previous one.
    for i_epoch in range(self._num_epochs):
      with tf.name_scope('epoch_%d' % i_epoch):
        # Only save debug summaries for first and last epochs.
        debug_summaries = (
            self._debug_summaries and
            (i_epoch == 0 or i_epoch == self._num_epochs - 1))

        # Build one epoch train op.
        with tf.GradientTape() as tape:
          loss_info = self.get_epoch_loss(
              time_steps,
              actions,
              act_log_probs,
              returns,
              normalized_advantages,
              action_distribution_parameters,
              weights,
              self.train_step_counter,
              debug_summaries,
              training=True)

        grads = tape.gradient(loss_info.loss, variables_to_train)
        # Tuple is used for py3, where zip is a generator producing values once.
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping > 0:
          grads_and_vars = eager_utils.clip_gradient_norms(
              grads_and_vars, self._gradient_clipping)

        # If summarize_gradients, create functions for summarizing both
        # gradients and variables.
        if self._summarize_grads_and_vars and debug_summaries:
          eager_utils.add_gradients_summaries(grads_and_vars,
                                              self.train_step_counter)
          eager_utils.add_variables_summaries(grads_and_vars,
                                              self.train_step_counter)

        self._optimizer.apply_gradients(
            grads_and_vars, global_step=self.train_step_counter)

        policy_gradient_losses.append(loss_info.extra.policy_gradient_loss)
        value_estimation_losses.append(loss_info.extra.value_estimation_loss)
        l2_regularization_losses.append(loss_info.extra.l2_regularization_loss)
        entropy_regularization_losses.append(
            loss_info.extra.entropy_regularization_loss)
        kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

    # After update epochs, update adaptive kl beta, then update observation
    #   normalizer and reward normalizer.
    policy_state = self._collect_policy.get_initial_state(batch_size)
    # Compute the mean kl from previous action distribution.
    kl_divergence = self._kl_divergence(
        time_steps, action_distribution_parameters,
        self._collect_policy.distribution(time_steps, policy_state).action)
    self.update_adaptive_kl_beta(kl_divergence)

    if self._observation_normalizer:
      self._observation_normalizer.update(
          time_steps.observation, outer_dims=[0, 1])
    else:
      # TODO(b/127661780): Verify performance of reward_normalizer when obs are
      #                    not normalized
      if self._reward_normalizer:
        self._reward_normalizer.update(
            next_time_steps.reward, outer_dims=[0, 1])

    loss_info = tf.nest.map_structure(tf.identity, loss_info)

    # Make summaries for total loss averaged across all epochs.
    # The *_losses lists will have been populated by
    #   calls to self.get_epoch_loss. Assumes all the losses have same length.
    with tf.name_scope('Losses/'):
      num_epochs = len(policy_gradient_losses)
      total_policy_gradient_loss = tf.add_n(policy_gradient_losses) / num_epochs
      total_value_estimation_loss = tf.add_n(
          value_estimation_losses) / num_epochs
      total_l2_regularization_loss = tf.add_n(
          l2_regularization_losses) / num_epochs
      total_entropy_regularization_loss = tf.add_n(
          entropy_regularization_losses) / num_epochs
      total_kl_penalty_loss = tf.add_n(kl_penalty_losses) / num_epochs
      tf.compat.v2.summary.scalar(
          name='policy_gradient_loss',
          data=total_policy_gradient_loss,
          step=self.train_step_counter)
      tf.compat.v2.summary.scalar(
          name='value_estimation_loss',
          data=total_value_estimation_loss,
          step=self.train_step_counter)
      tf.compat.v2.summary.scalar(
          name='l2_regularization_loss',
          data=total_l2_regularization_loss,
          step=self.train_step_counter)
      tf.compat.v2.summary.scalar(
          name='entropy_regularization_loss',
          data=total_entropy_regularization_loss,
          step=self.train_step_counter)
      tf.compat.v2.summary.scalar(
          name='kl_penalty_loss',
          data=total_kl_penalty_loss,
          step=self.train_step_counter)

      total_abs_loss = (
          tf.abs(total_policy_gradient_loss) +
          tf.abs(total_value_estimation_loss) +
          tf.abs(total_entropy_regularization_loss) +
          tf.abs(total_l2_regularization_loss) + tf.abs(total_kl_penalty_loss))

      tf.compat.v2.summary.scalar(
          name='total_abs_loss',
          data=total_abs_loss,
          step=self.train_step_counter)

    if self._summarize_grads_and_vars:
      with tf.name_scope('Variables/'):
        all_vars = (
            self._actor_net.trainable_weights +
            self._value_net.trainable_weights)
        for var in all_vars:
          tf.compat.v2.summary.histogram(
              name=var.name.replace(':', '_'),
              data=var,
              step=self.train_step_counter)

    return loss_info
Example #13
    def _train(self, experience, weights):
        """Returns a train op to update the agent's networks.

    This method trains with the provided batched experience.

    Args:
      experience: A time-stacked trajectory object.
      weights: Optional scalar or elementwise (per-batch-entry) importance
        weights.

    Returns:
      A train_op.

    Raises:
      ValueError: If optimizers are None and no default value was provided to
        the constructor.
    """
        transition = self._as_transition(experience)
        time_steps, policy_steps, next_time_steps = transition
        actions = policy_steps.action

        trainable_critic_variables = list(
            object_identity.ObjectIdentitySet(
                self._critic_network_1.trainable_variables +
                self._critic_network_2.trainable_variables))

        with tf.GradientTape(watch_accessed_variables=False) as tape:
            assert trainable_critic_variables, (
                'No trainable critic variables to '
                'optimize.')
            tape.watch(trainable_critic_variables)
            critic_loss = self._critic_loss_with_optional_entropy_term(
                time_steps,
                actions,
                next_time_steps,
                td_errors_loss_fn=self._td_errors_loss_fn,
                gamma=self._gamma,
                reward_scale_factor=self._reward_scale_factor,
                weights=weights,
                training=True)
            critic_loss *= self._critic_loss_weight

            cql_alpha = self._get_cql_alpha()
            cql_loss = self._cql_loss(time_steps, actions, training=True)

            if self._bc_debug_mode:
                cql_critic_loss = cql_loss * cql_alpha
            else:
                cql_critic_loss = critic_loss + (cql_loss * cql_alpha)

        tf.debugging.check_numerics(critic_loss, 'Critic loss is inf or nan.')
        tf.debugging.check_numerics(cql_loss, 'CQL loss is inf or nan.')
        critic_grads = tape.gradient(cql_critic_loss,
                                     trainable_critic_variables)
        self._apply_gradients(critic_grads, trainable_critic_variables,
                              self._critic_optimizer)

        trainable_actor_variables = self._actor_network.trainable_variables
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            assert trainable_actor_variables, (
                'No trainable actor variables to '
                'optimize.')
            tape.watch(trainable_actor_variables)
            actor_loss = self._actor_loss_weight * self.actor_loss(
                time_steps, actions=actions, weights=weights)
        tf.debugging.check_numerics(actor_loss, 'Actor loss is inf or nan.')
        actor_grads = tape.gradient(actor_loss, trainable_actor_variables)
        self._apply_gradients(actor_grads, trainable_actor_variables,
                              self._actor_optimizer)

        alpha_variable = [self._log_alpha]
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            assert alpha_variable, 'No alpha variable to optimize.'
            tape.watch(alpha_variable)
            alpha_loss = self._alpha_loss_weight * self.alpha_loss(
                time_steps, weights=weights)
        tf.debugging.check_numerics(alpha_loss, 'Alpha loss is inf or nan.')
        alpha_grads = tape.gradient(alpha_loss, alpha_variable)
        self._apply_gradients(alpha_grads, alpha_variable,
                              self._alpha_optimizer)

        # Based on the equation (24), which automates CQL alpha with the "budget"
        # parameter tau. CQL(H) is now CQL-Lagrange(H):
        # ```
        # min_Q max_{alpha >= 0} alpha * (log_sum_exp(Q(s, a')) - Q(s, a) - tau)
        # ```
        # If the expected difference in Q-values is less than tau, alpha
        # will adjust to be closer to 0. If the difference is higher than tau,
        # alpha is likely to take on high values and more aggressively penalize
        # Q-values.
        cql_alpha_loss = tf.constant(0.)
        if self._use_lagrange_cql_alpha:
            cql_alpha_variable = [self._log_cql_alpha]
            with tf.GradientTape(watch_accessed_variables=False) as tape:
                tape.watch(cql_alpha_variable)
                cql_alpha_loss = -self._get_cql_alpha() * (cql_loss -
                                                           self._cql_tau)
            tf.debugging.check_numerics(cql_alpha_loss,
                                        'CQL alpha loss is inf or nan.')
            cql_alpha_gradients = tape.gradient(cql_alpha_loss,
                                                cql_alpha_variable)
            self._apply_gradients(cql_alpha_gradients, cql_alpha_variable,
                                  self._cql_alpha_optimizer)

        with tf.name_scope('Losses'):
            tf.compat.v2.summary.scalar(name='critic_loss',
                                        data=critic_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='actor_loss',
                                        data=actor_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='alpha_loss',
                                        data=alpha_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='cql_loss',
                                        data=cql_loss,
                                        step=self.train_step_counter)
            if self._use_lagrange_cql_alpha:
                tf.compat.v2.summary.scalar(name='cql_alpha_loss',
                                            data=cql_alpha_loss,
                                            step=self.train_step_counter)
        tf.compat.v2.summary.scalar(name='cql_alpha',
                                    data=cql_alpha,
                                    step=self.train_step_counter)
        tf.compat.v2.summary.scalar(name='sac_alpha',
                                    data=tf.exp(self._log_alpha),
                                    step=self.train_step_counter)

        self.train_step_counter.assign_add(1)
        self._update_target()

        total_loss = cql_critic_loss + actor_loss + alpha_loss

        extra = CqlSacLossInfo(critic_loss=critic_loss,
                               actor_loss=actor_loss,
                               alpha_loss=alpha_loss,
                               cql_loss=cql_loss,
                               cql_alpha=cql_alpha,
                               cql_alpha_loss=cql_alpha_loss)

        return tf_agent.LossInfo(loss=total_loss, extra=extra)
Example #14
  def testClear(self):
    a = object()
    b = object()
    set1 = object_identity.ObjectIdentitySet([a, b])
    set1.clear()
    self.assertLen(set1, 0)
Example #15
  def _train(self, experience, weights):
    """Modifies the default _train step in two ways.

      1. Passes actions and next time steps to actor loss.
      2. Clips the dual parameter.

    Args:
      experience: A time-stacked trajectory object.
      weights: Optional scalar or elementwise (per-batch-entry) importance
        weights.

    Returns:
      A train_op.
    """
    transition = self._as_transition(experience)
    time_steps, policy_steps, next_time_steps = transition
    actions = policy_steps.action

    trainable_critic_variables = list(object_identity.ObjectIdentitySet(
        self._critic_network_1.trainable_variables +
        self._critic_network_2.trainable_variables))

    tf.debugging.check_numerics(
        tf.reduce_mean(time_steps.reward), 'ts.reward is inf or nan.')
    tf.debugging.check_numerics(
        tf.reduce_mean(next_time_steps.reward), 'next_ts.reward is inf or nan.')
    tf.debugging.check_numerics(
        tf.reduce_mean(actions), 'Actions is inf or nan.')

    with tf.GradientTape(watch_accessed_variables=False) as tape:
      assert trainable_critic_variables, ('No trainable critic variables to '
                                          'optimize.')
      tape.watch(trainable_critic_variables)
      critic_loss = self._critic_loss_weight*self.critic_loss(
          time_steps,
          actions,
          next_time_steps,
          td_errors_loss_fn=self._td_errors_loss_fn,
          gamma=self._gamma,
          reward_scale_factor=self._reward_scale_factor,
          weights=weights,
          training=True)

    tf.debugging.check_numerics(critic_loss, 'Critic loss is inf or nan.')
    critic_grads = tape.gradient(critic_loss, trainable_critic_variables)
    self._apply_gradients(critic_grads, trainable_critic_variables,
                          self._critic_optimizer)

    trainable_actor_variables = self._actor_network.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape:
      assert trainable_actor_variables, ('No trainable actor variables to '
                                         'optimize.')
      tape.watch(trainable_actor_variables)
      actor_loss = self._actor_loss_weight*self.actor_loss(
          time_steps, actions, next_time_steps, weights=weights)
    tf.debugging.check_numerics(actor_loss, 'Actor loss is inf or nan.')
    actor_grads = tape.gradient(actor_loss, trainable_actor_variables)
    self._apply_gradients(actor_grads, trainable_actor_variables,
                          self._actor_optimizer)

    alpha_variable = [self._log_alpha]
    with tf.GradientTape(watch_accessed_variables=False) as tape:
      assert alpha_variable, 'No alpha variable to optimize.'
      tape.watch(alpha_variable)
      alpha_loss = self._alpha_loss_weight*self.alpha_loss(
          time_steps, weights=weights)
    tf.debugging.check_numerics(alpha_loss, 'Alpha loss is inf or nan.')
    alpha_grads = tape.gradient(alpha_loss, alpha_variable)
    self._apply_gradients(alpha_grads, alpha_variable, self._alpha_optimizer)

    with tf.name_scope('Losses'):
      tf.compat.v2.summary.scalar(
          name='critic_loss', data=critic_loss, step=self.train_step_counter)
      tf.compat.v2.summary.scalar(
          name='actor_loss', data=actor_loss, step=self.train_step_counter)
      tf.compat.v2.summary.scalar(
          name='alpha_loss', data=alpha_loss, step=self.train_step_counter)

    self.train_step_counter.assign_add(1)
    self._update_target()

    total_loss = critic_loss + actor_loss + alpha_loss

    extra = sac_agent.SacLossInfo(
        critic_loss=critic_loss, actor_loss=actor_loss, alpha_loss=alpha_loss)

    return LossInfo(loss=total_loss, extra=extra)