Example #1
    def testToTransition(self):
        first = ts.StepType.FIRST
        mid = ts.StepType.MID
        last = ts.StepType.LAST

        # Define a batch size 1, 3-step trajectory.
        traj = trajectory.Trajectory(
            step_type=np.array([[first, mid, last]]),
            next_step_type=np.array([[mid, last, first]]),
            observation=np.array([[10.0, 20.0, 30.0]]),
            action=np.array([[11.0, 22.0, 33.0]]),
            # reward at step 0 is an invalid dummy reward.
            reward=np.array([[0.0, 1.0, 2.0]]),
            discount=np.array([[1.0, 1.0, 0.0]]),
            policy_info=np.array([[1.0, 2.0, 3.0]]))

        time_steps, policy_steps, next_time_steps = trajectory.to_transition(
            traj)

        self.assertAllEqual(time_steps.step_type, np.array([[first, mid]]))
        self.assertAllEqual(time_steps.observation, np.array([[10.0, 20.0]]))

        self.assertAllEqual(next_time_steps.step_type, np.array([[mid, last]]))
        self.assertAllEqual(next_time_steps.observation,
                            np.array([[20.0, 30.0]]))
        self.assertAllEqual(next_time_steps.reward, np.array([[0.0, 1.0]]))
        self.assertAllEqual(next_time_steps.discount, np.array([[1.0, 1.0]]))

        self.assertAllEqual(policy_steps.action, np.array([[11.0, 22.0]]))
        self.assertAllEqual(policy_steps.info, np.array([[1.0, 2.0]]))
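For reference, the assertions in this test pin down what to_transition does along the time axis: the "current" side of each transition keeps steps [:, :-1] and the "next" side keeps [:, 1:] (the same relationship Example #2 checks directly against driver output). The following is a minimal NumPy sketch of that slicing, not the library implementation, reusing the dummy arrays from the test above.

import numpy as np

# Same batch-size-1, 3-step trajectory fields as in the test (FIRST=0, MID=1, LAST=2).
step_type = np.array([[0, 1, 2]])
observation = np.array([[10.0, 20.0, 30.0]])
action = np.array([[11.0, 22.0, 33.0]])
reward = np.array([[0.0, 1.0, 2.0]])

# Each transition pairs step t with step t+1, so the current time_steps keep the
# first N-1 entries and the next_time_steps keep the last N-1 entries.
current_observation = observation[:, :-1]   # [[10., 20.]] -> time_steps.observation
next_observation = observation[:, 1:]       # [[20., 30.]] -> next_time_steps.observation
current_action = action[:, :-1]             # [[11., 22.]] -> policy_steps.action
transition_reward = reward[:, :-1]          # [[0., 1.]]   -> next_time_steps.reward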
Example #2
    def testToTransitionHandlesTrajectoryFromDriverCorrectly(self):
        env = tf_py_environment.TFPyEnvironment(test_utils.PyEnvironmentMock())
        policy = test_utils.TFPolicyMock(env.time_step_spec(),
                                         env.action_spec())
        replay_buffer = test_utils.make_replay_buffer(policy)

        driver = dynamic_episode_driver.DynamicEpisodeDriver(
            env, policy, num_episodes=3, observers=[replay_buffer.add_batch])

        run_driver = driver.run()
        rb_gather_all = replay_buffer.gather_all()

        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.evaluate(run_driver)
        trajectories = self.evaluate(rb_gather_all)

        time_steps, policy_step, next_time_steps = trajectory.to_transition(
            trajectories)

        self.assertAllEqual(time_steps.observation,
                            trajectories.observation[:, :-1])
        self.assertAllEqual(time_steps.step_type,
                            trajectories.step_type[:, :-1])
        self.assertAllEqual(next_time_steps.observation,
                            trajectories.observation[:, 1:])
        self.assertAllEqual(next_time_steps.step_type,
                            trajectories.step_type[:, 1:])
        self.assertAllEqual(next_time_steps.reward,
                            trajectories.reward[:, :-1])
        self.assertAllEqual(next_time_steps.discount,
                            trajectories.discount[:, :-1])

        self.assertAllEqual(policy_step.action, trajectories.action[:, :-1])
        self.assertAllEqual(policy_step.info, trajectories.policy_info[:, :-1])
Example #3
  def _experience_to_transitions(self, experience):
    transitions = trajectory.to_transition(experience)
    time_steps, policy_steps, next_time_steps = transitions
    actions = policy_steps.action
    # TODO(eholly): Figure out how to properly deal with time dimension.
    time_steps, actions, next_time_steps = nest.map_structure(
        lambda t: tf.squeeze(t, axis=1), (time_steps, actions, next_time_steps))
    return time_steps, actions, next_time_steps
Example #4
    def _experience_to_transitions(self, experience):
        transitions = trajectory.to_transition(experience)

        # Remove time dim if we are not using a recurrent network.
        if not self._actor_network.state_spec:
            transitions = nest.map_structure(lambda x: tf.squeeze(x, [1]),
                                             transitions)

        time_steps, policy_steps, next_time_steps = transitions
        actions = policy_steps.action
        return time_steps, actions, next_time_steps
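Both helpers above assume the experience spans exactly two adjacent time steps, so the transitions returned by to_transition carry a time dimension of length 1; the squeeze then drops that axis for feed-forward networks (Example #4 skips it when the actor network is recurrent and keeps the time dimension). A minimal sketch of just that reshaping step, with made-up shapes for illustration:

import tensorflow as tf

# Hypothetical post-to_transition tensor: [batch=4, time=1, obs_dim=3].
observation = tf.zeros([4, 1, 3])

# Feed-forward networks expect [batch, obs_dim]; squeezing axis 1 removes the
# length-1 time dimension, and raises an error if that axis is not actually 1.
squeezed = tf.squeeze(observation, axis=[1])
print(squeezed.shape)  # (4, 3)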
Example #5
  def _train(self, experience, weights):
    # Get individual tensors from transitions.
    (time_steps, policy_steps_,
     next_time_steps) = trajectory.to_transition(experience)
    actions = policy_steps_.action

    if self._debug_summaries:
      tf.compat.v2.summary.histogram(
          name='actions', data=actions, step=self.train_step_counter)

    action_distribution_parameters = policy_steps_.info

    # Reconstruct per-timestep policy distribution from stored distribution
    #   parameters.
    old_actions_distribution = (
        distribution_spec.nested_distributions_from_specs(
            self._action_distribution_spec, action_distribution_parameters))

    # Compute log probability of actions taken during data collection, using the
    #   collect policy distribution.
    act_log_probs = common.log_probability(old_actions_distribution, actions,
                                           self._action_spec)

    # Compute the value predictions for states using the current value function.
    # To be used for return & advantage computation.
    batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
    policy_state = self._collect_policy.get_initial_state(batch_size=batch_size)

    value_preds, unused_policy_state = self._collect_policy.apply_value_network(
        experience.observation, experience.step_type, policy_state=policy_state)
    value_preds = tf.stop_gradient(value_preds)

    valid_mask = ppo_utils.make_timestep_mask(next_time_steps)

    if weights is None:
      weights = valid_mask
    else:
      weights *= valid_mask

    returns, normalized_advantages = self.compute_return_and_advantage(
        next_time_steps, value_preds)

    # Loss tensors across batches will be aggregated for summaries.
    policy_gradient_losses = []
    value_estimation_losses = []
    l2_regularization_losses = []
    entropy_regularization_losses = []
    kl_penalty_losses = []

    loss_info = None  # TODO(b/123627451): Remove.
    # For each epoch, create its own train op that depends on the previous one.
    for i_epoch in range(self._num_epochs):
      with tf.name_scope('epoch_%d' % i_epoch):
        # Only save debug summaries for first and last epochs.
        debug_summaries = (
            self._debug_summaries and
            (i_epoch == 0 or i_epoch == self._num_epochs - 1))

        # Build one epoch train op.
        with tf.GradientTape() as tape:
          loss_info = self.get_epoch_loss(
              time_steps, actions, act_log_probs, returns,
              normalized_advantages, action_distribution_parameters, weights,
              self.train_step_counter, debug_summaries)

        variables_to_train = (
            self._actor_net.trainable_weights +
            self._value_net.trainable_weights)
        grads = tape.gradient(loss_info.loss, variables_to_train)
        grads_and_vars = zip(grads, variables_to_train)

        if self._gradient_clipping > 0:
          grads_and_vars = eager_utils.clip_gradient_norms(
              grads_and_vars, self._gradient_clipping)

        # If summarize_gradients, create functions for summarizing both
        # gradients and variables.
        if self._summarize_grads_and_vars and debug_summaries:
          eager_utils.add_gradients_summaries(grads_and_vars,
                                              self.train_step_counter)
          eager_utils.add_variables_summaries(grads_and_vars,
                                              self.train_step_counter)

        self._optimizer.apply_gradients(
            grads_and_vars, global_step=self.train_step_counter)

        policy_gradient_losses.append(loss_info.extra.policy_gradient_loss)
        value_estimation_losses.append(loss_info.extra.value_estimation_loss)
        l2_regularization_losses.append(loss_info.extra.l2_regularization_loss)
        entropy_regularization_losses.append(
            loss_info.extra.entropy_regularization_loss)
        kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

    # After update epochs, update adaptive kl beta, then update observation
    #   normalizer and reward normalizer.
    batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
    policy_state = self._collect_policy.get_initial_state(batch_size)
    # Compute the mean kl from previous action distribution.
    kl_divergence = self._kl_divergence(
        time_steps, action_distribution_parameters,
        self._collect_policy.distribution(time_steps, policy_state).action)
    self.update_adaptive_kl_beta(kl_divergence)

    if self._observation_normalizer:
      self._observation_normalizer.update(
          time_steps.observation, outer_dims=[0, 1])
    else:
      # TODO(b/127661780): Verify performance of reward_normalizer when obs are
      #                    not normalized
      if self._reward_normalizer:
        self._reward_normalizer.update(next_time_steps.reward,
                                       outer_dims=[0, 1])

    loss_info = tf.nest.map_structure(tf.identity, loss_info)

    # Make summaries for total loss across all epochs.
    # The *_losses lists will have been populated by
    #   calls to self.get_epoch_loss.
    with tf.name_scope('Losses/'):
      total_policy_gradient_loss = tf.add_n(policy_gradient_losses)
      total_value_estimation_loss = tf.add_n(value_estimation_losses)
      total_l2_regularization_loss = tf.add_n(l2_regularization_losses)
      total_entropy_regularization_loss = tf.add_n(
          entropy_regularization_losses)
      total_kl_penalty_loss = tf.add_n(kl_penalty_losses)
      tf.compat.v2.summary.scalar(
          name='policy_gradient_loss',
          data=total_policy_gradient_loss,
          step=self.train_step_counter)
      tf.compat.v2.summary.scalar(
          name='value_estimation_loss',
          data=total_value_estimation_loss,
          step=self.train_step_counter)
      tf.compat.v2.summary.scalar(
          name='l2_regularization_loss',
          data=total_l2_regularization_loss,
          step=self.train_step_counter)
      if self._entropy_regularization:
        tf.compat.v2.summary.scalar(
            name='entropy_regularization_loss',
            data=total_entropy_regularization_loss,
            step=self.train_step_counter)
      tf.compat.v2.summary.scalar(
          name='kl_penalty_loss',
          data=total_kl_penalty_loss,
          step=self.train_step_counter)

      total_abs_loss = (
          tf.abs(total_policy_gradient_loss) +
          tf.abs(total_value_estimation_loss) +
          tf.abs(total_entropy_regularization_loss) +
          tf.abs(total_l2_regularization_loss) +
          tf.abs(total_kl_penalty_loss))

      tf.compat.v2.summary.scalar(
          name='total_abs_loss',
          data=total_abs_loss,
          step=self.train_step_counter)

    if self._summarize_grads_and_vars:
      with tf.name_scope('Variables/'):
        all_vars = (
            self._actor_net.trainable_weights +
            self._value_net.trainable_weights)
        for var in all_vars:
          tf.compat.v2.summary.histogram(
              name=var.name.replace(':', '_'),
              data=var,
              step=self.train_step_counter)

    return loss_info
Example #6
    def _train(self, experience, weights, train_step_counter):
        # Change trajectory to transitions.
        trajectory0 = nest.map_structure(lambda t: t[:, :-1], experience)
        trajectory1 = nest.map_structure(lambda t: t[:, 1:], experience)

        # Get individual tensors from transitions.
        (time_steps, policy_steps_,
         next_time_steps) = trajectory.to_transition(trajectory0, trajectory1)
        actions = policy_steps_.action
        if self._debug_summaries:
            tf.contrib.summary.histogram('actions', actions)

        action_distribution_parameters = policy_steps_.info

        # Reconstruct per-timestep policy distribution from stored distribution
        #   parameters.
        old_actions_distribution = (
            distribution_spec.nested_distributions_from_specs(
                self._action_distribution_spec,
                action_distribution_parameters))

        # Compute log probability of actions taken during data collection, using the
        #   collect policy distribution.
        act_log_probs = common_utils.log_probability(old_actions_distribution,
                                                     actions,
                                                     self._action_spec)

        # Compute the value predictions for states using the current value function.
        # To be used for return & advantage computation.
        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        policy_state = self._collect_policy.get_initial_state(
            batch_size=batch_size)

        value_preds, unused_policy_state = self._collect_policy.apply_value_network(
            experience.observation,
            experience.step_type,
            policy_state=policy_state)
        value_preds = tf.stop_gradient(value_preds)

        valid_mask = ppo_utils.make_timestep_mask(next_time_steps)

        if weights is None:
            weights = valid_mask
        else:
            weights *= valid_mask

        returns, normalized_advantages = self.compute_return_and_advantage(
            next_time_steps, value_preds)

        # Loss tensors across batches will be aggregated for summaries.
        policy_gradient_losses = []
        value_estimation_losses = []
        l2_regularization_losses = []
        entropy_regularization_losses = []
        kl_penalty_losses = []

        # For each epoch, create its own train op that depends on the previous one.
        loss_info = tf.no_op()
        for i_epoch in range(self._num_epochs):
            with tf.name_scope('epoch_%d' % i_epoch):
                with tf.control_dependencies(nest.flatten(loss_info)):
                    # Only save debug summaries for first and last epochs.
                    debug_summaries = (self._debug_summaries and
                                       (i_epoch == 0
                                        or i_epoch == self._num_epochs - 1))

                    # Build one epoch train op.
                    loss_info = self.build_train_op(
                        time_steps, actions, act_log_probs, returns,
                        normalized_advantages, action_distribution_parameters,
                        weights, train_step_counter,
                        self._summarize_grads_and_vars,
                        self._gradient_clipping, debug_summaries)

                    policy_gradient_losses.append(
                        loss_info.extra.policy_gradient_loss)
                    value_estimation_losses.append(
                        loss_info.extra.value_estimation_loss)
                    l2_regularization_losses.append(
                        loss_info.extra.l2_regularization_loss)
                    entropy_regularization_losses.append(
                        loss_info.extra.entropy_regularization_loss)
                    kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

        # After update epochs, update adaptive kl beta, then update observation
        #   normalizer and reward normalizer.
        with tf.control_dependencies(nest.flatten(loss_info)):
            # Compute the mean kl from old.
            batch_size = nest_utils.get_outer_shape(time_steps,
                                                    self._time_step_spec)[0]
            policy_state = self._collect_policy.get_initial_state(batch_size)
            kl_divergence = self._kl_divergence(
                time_steps, action_distribution_parameters,
                self._collect_policy.distribution(time_steps,
                                                  policy_state).action)
            update_adaptive_kl_beta_op = self.update_adaptive_kl_beta(
                kl_divergence)

        with tf.control_dependencies([update_adaptive_kl_beta_op]):
            if self._observation_normalizer:
                update_obs_norm = (self._observation_normalizer.update(
                    time_steps.observation, outer_dims=[0, 1]))
            else:
                update_obs_norm = tf.no_op()
            if self._reward_normalizer:
                update_reward_norm = self._reward_normalizer.update(
                    next_time_steps.reward, outer_dims=[0, 1])
            else:
                update_reward_norm = tf.no_op()

        with tf.control_dependencies([update_obs_norm, update_reward_norm]):
            loss_info = nest.map_structure(tf.identity, loss_info)

        # Make summaries for total loss across all epochs.
        # The *_losses lists will have been populated by
        #   calls to self.build_train_op.
        with tf.name_scope('Losses/'):
            total_policy_gradient_loss = tf.add_n(policy_gradient_losses)
            total_value_estimation_loss = tf.add_n(value_estimation_losses)
            total_l2_regularization_loss = tf.add_n(l2_regularization_losses)
            total_entropy_regularization_loss = tf.add_n(
                entropy_regularization_losses)
            total_kl_penalty_loss = tf.add_n(kl_penalty_losses)
            tf.contrib.summary.scalar('policy_gradient_loss',
                                      total_policy_gradient_loss)
            tf.contrib.summary.scalar('value_estimation_loss',
                                      total_value_estimation_loss)
            tf.contrib.summary.scalar('l2_regularization_loss',
                                      total_l2_regularization_loss)
            if self._entropy_regularization:
                tf.contrib.summary.scalar('entropy_regularization_loss',
                                          total_entropy_regularization_loss)
            tf.contrib.summary.scalar('kl_penalty_loss', total_kl_penalty_loss)

            total_abs_loss = (tf.abs(total_policy_gradient_loss) +
                              tf.abs(total_value_estimation_loss) +
                              tf.abs(total_entropy_regularization_loss) +
                              tf.abs(total_l2_regularization_loss) +
                              tf.abs(total_kl_penalty_loss))

            tf.contrib.summary.scalar('total_abs_loss', total_abs_loss)

        if self._summarize_grads_and_vars:
            with tf.name_scope('Variables/'):
                all_vars = (self._actor_net.trainable_weights +
                            self._value_net.trainable_weights)
                for var in all_vars:
                    tf.contrib.summary.histogram(var.name.replace(':', '_'),
                                                 var)

        return loss_info
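Examples #5 and #6 show the two call forms of to_transition side by side: the single-argument form slices the trajectory internally, while the two-argument form is handed the adjacent [:, :-1] and [:, 1:] halves explicitly. Below is a minimal self-contained sketch of both forms on dummy data; whether this exact two-argument signature and NumPy inputs are accepted depends on the TF-Agents version these examples were written against, so treat it as illustrative rather than canonical.

import numpy as np
import tensorflow as tf
from tf_agents.trajectories import trajectory

# Dummy batch-size-1, 3-step trajectory (same layout as Example #1).
experience = trajectory.Trajectory(
    step_type=np.array([[0, 1, 2]]),
    observation=np.array([[10.0, 20.0, 30.0]]),
    action=np.array([[11.0, 22.0, 33.0]]),
    policy_info=np.array([[1.0, 2.0, 3.0]]),
    next_step_type=np.array([[1, 2, 0]]),
    reward=np.array([[0.0, 1.0, 2.0]]),
    discount=np.array([[1.0, 1.0, 0.0]]))

# Single-argument form (Example #5): to_transition pairs adjacent steps itself.
transitions_a = trajectory.to_transition(experience)

# Two-argument form (Example #6): the caller slices the adjacent halves first.
traj0 = tf.nest.map_structure(lambda t: t[:, :-1], experience)
traj1 = tf.nest.map_structure(lambda t: t[:, 1:], experience)
transitions_b = trajectory.to_transition(traj0, traj1)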