Example #1
  def testMakeTimestepMaskWithPartialEpisode(self, allow_partial):
    first, mid, last = ts.StepType.FIRST, ts.StepType.MID, ts.StepType.LAST

    next_step_types = tf.constant([[mid, mid, last, first,
                                    mid, mid, last, first,
                                    mid, mid],
                                   [mid, mid, last, first,
                                    mid, mid, mid, mid,
                                    mid, last]])
    zeros = tf.zeros_like(next_step_types)
    next_time_step = ts.TimeStep(next_step_types, zeros, zeros, zeros)

    if not allow_partial:
      # Mask should be 0.0 at transition timesteps (where the next step type is
      #   FIRST) and at every timestep of a final, incomplete episode.
      expected_mask = [[1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0],
                       [1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]
    else:
      # Zeros only between episodes. Incomplete episodes are valid and not
      # zeroed out.
      expected_mask = [[1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0],
                       [1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]
    timestep_mask = ppo_utils.make_timestep_mask(
        next_time_step, allow_partial_episodes=allow_partial)

    timestep_mask_ = self.evaluate(timestep_mask)
    self.assertAllClose(expected_mask, timestep_mask_)
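
A minimal standalone sketch of the same call (not taken from the test above), assuming only the tf_agents modules the test already uses: ppo_utils and ts (time_step). The expected mask follows the behaviour the test exercises.

import tensorflow as tf
from tf_agents.agents.ppo import ppo_utils
from tf_agents.trajectories import time_step as ts

first, mid, last = ts.StepType.FIRST, ts.StepType.MID, ts.StepType.LAST

# Step types of the *next* time step: FIRST marks an episode boundary.
next_step_types = tf.constant([[mid, last, first, mid, last, first]])
zeros = tf.zeros_like(next_step_types)
next_time_step = ts.TimeStep(next_step_types, zeros, zeros, zeros)

mask = ppo_utils.make_timestep_mask(next_time_step)
# Boundary transitions (indices 2 and 5) are zeroed out:
# mask == [[1., 1., 0., 1., 1., 0.]]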
Example #2
    def _train(self, experience, weights=None):
        # Unpack the trajectory into (time_step, policy_step, next_time_step)
        #   transitions.
        (time_steps, policy_steps_,
         next_time_steps) = trajectory.to_transition(experience)

        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        value_state = self._collect_policy.get_initial_value_state(
            batch_size=batch_size)

        # Mask boundary transitions and incomplete episodes out of the updates;
        #   note that this overwrites any externally supplied weights.
        weights = ppo_utils.make_timestep_mask(next_time_steps)

        value_preds, _ = self._collect_policy.apply_value_network(
            experience.observation,
            experience.step_type,
            value_state=value_state)
        value_preds = tf.stop_gradient(value_preds)

        rewards = next_time_steps.reward

        # Normalize rewards before computing returns and advantages.
        if self._reward_normalizer is not None:
            rewards = self._reward_normalizer.normalize(
                rewards,
                center_mean=False,
                clip_value=self._reward_norm_clipping)

        returns, normalized_advantages = compute_return_and_advantage(
            self._discount_factor, self._lambda, rewards, next_time_steps,
            value_preds)

        policy_loss = self._update_policy(time_steps, policy_steps_,
                                          normalized_advantages, weights)

        value_loss = self._update_values(time_steps, returns, weights)

        return tf_agent.LossInfo(
            loss=value_loss + policy_loss,
            extra=TRPOLossInfo(value_estimation_loss=value_loss,
                               policy_gradient_loss=policy_loss),
        )
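
In this agent the mask returned by make_timestep_mask is used directly as the per-timestep weights of the policy and value updates. A hedged sketch of how such weights are typically applied, with an illustrative helper (masked_mean_loss is not part of the example above):

import tensorflow as tf

def masked_mean_loss(per_timestep_loss, weights):
    """Averages a [batch, time] loss, ignoring timesteps where weights == 0."""
    weights = tf.cast(weights, per_timestep_loss.dtype)
    return tf.reduce_sum(per_timestep_loss * weights) / (
        tf.reduce_sum(weights) + 1e-8)

With weights produced by make_timestep_mask, boundary transitions and timesteps from incomplete episodes contribute nothing to the gradient.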
Example #3
    def _train(self, experience, weights):
        # Get individual tensors from transitions.
        (time_steps, policy_steps_,
         next_time_steps) = trajectory.to_transition(experience)
        actions = policy_steps_.action

        if self._debug_summaries:
            actions_list = tf.nest.flatten(actions)
            show_action_index = len(actions_list) != 1
            for i, single_action in enumerate(actions_list):
                action_name = ('actions_{}'.format(i)
                               if show_action_index else 'actions')
                tf.compat.v2.summary.histogram(name=action_name,
                                               data=single_action,
                                               step=self.train_step_counter)

        action_distribution_parameters = policy_steps_.info

        # Reconstruct per-timestep policy distribution from stored distribution
        #   parameters.
        old_actions_distribution = (
            distribution_spec.nested_distributions_from_specs(
                self._action_distribution_spec,
                action_distribution_parameters))

        # Compute log probability of actions taken during data collection, using the
        #   collect policy distribution.
        act_log_probs = common.log_probability(old_actions_distribution,
                                               actions, self._action_spec)

        # Compute the value predictions for states using the current value function.
        # To be used for return & advantage computation.
        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        policy_state = self._collect_policy.get_initial_state(
            batch_size=batch_size)

        value_preds, unused_policy_state = self._collect_policy.apply_value_network(
            experience.observation,
            experience.step_type,
            policy_state=policy_state)
        value_preds = tf.stop_gradient(value_preds)

        valid_mask = ppo_utils.make_timestep_mask(next_time_steps)

        if weights is None:
            weights = valid_mask
        else:
            weights *= valid_mask

        returns, normalized_advantages = self.compute_return_and_advantage(
            next_time_steps, value_preds)

        # Loss tensors from each epoch will be aggregated for summaries.
        policy_gradient_losses = []
        value_estimation_losses = []
        l2_regularization_losses = []
        entropy_regularization_losses = []
        kl_penalty_losses = []

        loss_info = None  # TODO(b/123627451): Remove.
        # For each epoch, create its own train op that depends on the previous one.
        for i_epoch in range(self._num_epochs):
            with tf.name_scope('epoch_%d' % i_epoch):
                # Only save debug summaries for first and last epochs.
                debug_summaries = (self._debug_summaries
                                   and (i_epoch == 0
                                        or i_epoch == self._num_epochs - 1))

                # Build one epoch train op.
                with tf.GradientTape() as tape:
                    loss_info = self.get_epoch_loss(
                        time_steps, actions, act_log_probs, returns,
                        normalized_advantages, action_distribution_parameters,
                        weights, self.train_step_counter, debug_summaries)

                variables_to_train = (self._actor_net.trainable_weights +
                                      self._value_net.trainable_weights)
                grads = tape.gradient(loss_info.loss, variables_to_train)
                # Materialize as a tuple: in Python 3, zip returns a one-shot iterator.
                grads_and_vars = tuple(zip(grads, variables_to_train))
                if self._gradient_clipping > 0:
                    grads_and_vars = eager_utils.clip_gradient_norms(
                        grads_and_vars, self._gradient_clipping)

                # Optionally add summaries for both gradients and variables.
                if self._summarize_grads_and_vars and debug_summaries:
                    eager_utils.add_gradients_summaries(
                        grads_and_vars, self.train_step_counter)
                    eager_utils.add_variables_summaries(
                        grads_and_vars, self.train_step_counter)

                self._optimizer.apply_gradients(
                    grads_and_vars, global_step=self.train_step_counter)

                policy_gradient_losses.append(
                    loss_info.extra.policy_gradient_loss)
                value_estimation_losses.append(
                    loss_info.extra.value_estimation_loss)
                l2_regularization_losses.append(
                    loss_info.extra.l2_regularization_loss)
                entropy_regularization_losses.append(
                    loss_info.extra.entropy_regularization_loss)
                kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

        # After update epochs, update adaptive kl beta, then update observation
        #   normalizer and reward normalizer.
        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        policy_state = self._collect_policy.get_initial_state(batch_size)
        # Compute the mean KL divergence from the previous (collect-time)
        #   action distribution.
        kl_divergence = self._kl_divergence(
            time_steps, action_distribution_parameters,
            self._collect_policy.distribution(time_steps, policy_state).action)
        self.update_adaptive_kl_beta(kl_divergence)

        if self._observation_normalizer:
            self._observation_normalizer.update(time_steps.observation,
                                                outer_dims=[0, 1])
        else:
            # TODO(b/127661780): Verify performance of reward_normalizer when obs are
            #                    not normalized
            if self._reward_normalizer:
                self._reward_normalizer.update(next_time_steps.reward,
                                               outer_dims=[0, 1])

        loss_info = tf.nest.map_structure(tf.identity, loss_info)

        # Make summaries for total loss across all epochs.
        # The *_losses lists will have been populated by
        #   calls to self.get_epoch_loss.
        with tf.name_scope('Losses/'):
            total_policy_gradient_loss = tf.add_n(policy_gradient_losses)
            total_value_estimation_loss = tf.add_n(value_estimation_losses)
            total_l2_regularization_loss = tf.add_n(l2_regularization_losses)
            total_entropy_regularization_loss = tf.add_n(
                entropy_regularization_losses)
            total_kl_penalty_loss = tf.add_n(kl_penalty_losses)
            tf.compat.v2.summary.scalar(name='policy_gradient_loss',
                                        data=total_policy_gradient_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='value_estimation_loss',
                                        data=total_value_estimation_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='l2_regularization_loss',
                                        data=total_l2_regularization_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='entropy_regularization_loss',
                                        data=total_entropy_regularization_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='kl_penalty_loss',
                                        data=total_kl_penalty_loss,
                                        step=self.train_step_counter)

            total_abs_loss = (tf.abs(total_policy_gradient_loss) +
                              tf.abs(total_value_estimation_loss) +
                              tf.abs(total_entropy_regularization_loss) +
                              tf.abs(total_l2_regularization_loss) +
                              tf.abs(total_kl_penalty_loss))

            tf.compat.v2.summary.scalar(name='total_abs_loss',
                                        data=total_abs_loss,
                                        step=self.train_step_counter)

        if self._summarize_grads_and_vars:
            with tf.name_scope('Variables/'):
                all_vars = (self._actor_net.trainable_weights +
                            self._value_net.trainable_weights)
                for var in all_vars:
                    tf.compat.v2.summary.histogram(
                        name=var.name.replace(':', '_'),
                        data=var,
                        step=self.train_step_counter)

        return loss_info
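
In contrast to Example 2, where the mask replaces the weights argument, this agent combines externally supplied sample weights with the validity mask via weights *= valid_mask. A tiny illustration with assumed values:

import tensorflow as tf

valid_mask = tf.constant([[1.0, 1.0, 0.0, 1.0]])   # 0.0 at an episode boundary
weights = tf.constant([[0.5, 1.0, 1.0, 2.0]])      # e.g. externally supplied weights
combined = weights * valid_mask                    # [[0.5, 1.0, 0.0, 2.0]]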
Example #4
    def _train(self, experience, weights, train_step_counter):
        # Convert the trajectory into transitions by pairing adjacent timesteps.
        trajectory0 = nest.map_structure(lambda t: t[:, :-1], experience)
        trajectory1 = nest.map_structure(lambda t: t[:, 1:], experience)

        # Get individual tensors from transitions.
        (time_steps, policy_steps_,
         next_time_steps) = trajectory.to_transition(trajectory0, trajectory1)
        actions = policy_steps_.action
        if self._debug_summaries:
            tf.contrib.summary.histogram('actions', actions)

        action_distribution_parameters = policy_steps_.info

        # Reconstruct per-timestep policy distribution from stored distribution
        #   parameters.
        old_actions_distribution = (
            distribution_spec.nested_distributions_from_specs(
                self._action_distribution_spec,
                action_distribution_parameters))

        # Compute log probability of actions taken during data collection, using the
        #   collect policy distribution.
        act_log_probs = common_utils.log_probability(old_actions_distribution,
                                                     actions,
                                                     self._action_spec)

        # Compute the value predictions for states using the current value function.
        # To be used for return & advantage computation.
        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        policy_state = self._collect_policy.get_initial_state(
            batch_size=batch_size)

        value_preds, unused_policy_state = self._collect_policy.apply_value_network(
            experience.observation,
            experience.step_type,
            policy_state=policy_state)
        value_preds = tf.stop_gradient(value_preds)

        valid_mask = ppo_utils.make_timestep_mask(next_time_steps)

        if weights is None:
            weights = valid_mask
        else:
            weights *= valid_mask

        returns, normalized_advantages = self.compute_return_and_advantage(
            next_time_steps, value_preds)

        # Loss tensors from each epoch will be aggregated for summaries.
        policy_gradient_losses = []
        value_estimation_losses = []
        l2_regularization_losses = []
        entropy_regularization_losses = []
        kl_penalty_losses = []

        # For each epoch, create its own train op that depends on the previous one.
        loss_info = tf.no_op()
        for i_epoch in range(self._num_epochs):
            with tf.name_scope('epoch_%d' % i_epoch):
                with tf.control_dependencies(nest.flatten(loss_info)):
                    # Only save debug summaries for first and last epochs.
                    debug_summaries = (self._debug_summaries and
                                       (i_epoch == 0
                                        or i_epoch == self._num_epochs - 1))

                    # Build one epoch train op.
                    loss_info = self.build_train_op(
                        time_steps, actions, act_log_probs, returns,
                        normalized_advantages, action_distribution_parameters,
                        weights, train_step_counter,
                        self._summarize_grads_and_vars,
                        self._gradient_clipping, debug_summaries)

                    policy_gradient_losses.append(
                        loss_info.extra.policy_gradient_loss)
                    value_estimation_losses.append(
                        loss_info.extra.value_estimation_loss)
                    l2_regularization_losses.append(
                        loss_info.extra.l2_regularization_loss)
                    entropy_regularization_losses.append(
                        loss_info.extra.entropy_regularization_loss)
                    kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

        # After update epochs, update adaptive kl beta, then update observation
        #   normalizer and reward normalizer.
        with tf.control_dependencies(nest.flatten(loss_info)):
            # Compute the mean KL divergence from the old (collect-time)
            #   action distribution.
            batch_size = nest_utils.get_outer_shape(time_steps,
                                                    self._time_step_spec)[0]
            policy_state = self._collect_policy.get_initial_state(batch_size)
            kl_divergence = self._kl_divergence(
                time_steps, action_distribution_parameters,
                self._collect_policy.distribution(time_steps,
                                                  policy_state).action)
            update_adaptive_kl_beta_op = self.update_adaptive_kl_beta(
                kl_divergence)

        with tf.control_dependencies([update_adaptive_kl_beta_op]):
            if self._observation_normalizer:
                update_obs_norm = (self._observation_normalizer.update(
                    time_steps.observation, outer_dims=[0, 1]))
            else:
                update_obs_norm = tf.no_op()
            if self._reward_normalizer:
                update_reward_norm = self._reward_normalizer.update(
                    next_time_steps.reward, outer_dims=[0, 1])
            else:
                update_reward_norm = tf.no_op()

        with tf.control_dependencies([update_obs_norm, update_reward_norm]):
            loss_info = nest.map_structure(tf.identity, loss_info)

        # Make summaries for total loss across all epochs.
        # The *_losses lists will have been populated by
        #   calls to self.build_train_op.
        with tf.name_scope('Losses/'):
            total_policy_gradient_loss = tf.add_n(policy_gradient_losses)
            total_value_estimation_loss = tf.add_n(value_estimation_losses)
            total_l2_regularization_loss = tf.add_n(l2_regularization_losses)
            total_entropy_regularization_loss = tf.add_n(
                entropy_regularization_losses)
            total_kl_penalty_loss = tf.add_n(kl_penalty_losses)
            tf.contrib.summary.scalar('policy_gradient_loss',
                                      total_policy_gradient_loss)
            tf.contrib.summary.scalar('value_estimation_loss',
                                      total_value_estimation_loss)
            tf.contrib.summary.scalar('l2_regularization_loss',
                                      total_l2_regularization_loss)
            if self._entropy_regularization:
                tf.contrib.summary.scalar('entropy_regularization_loss',
                                          total_entropy_regularization_loss)
            tf.contrib.summary.scalar('kl_penalty_loss', total_kl_penalty_loss)

            total_abs_loss = (tf.abs(total_policy_gradient_loss) +
                              tf.abs(total_value_estimation_loss) +
                              tf.abs(total_entropy_regularization_loss) +
                              tf.abs(total_l2_regularization_loss) +
                              tf.abs(total_kl_penalty_loss))

            tf.contrib.summary.scalar('total_abs_loss', total_abs_loss)

        if self._summarize_grads_and_vars:
            with tf.name_scope('Variables/'):
                all_vars = (self._actor_net.trainable_weights +
                            self._value_net.trainable_weights)
                for var in all_vars:
                    tf.contrib.summary.histogram(var.name.replace(':', '_'),
                                                 var)

        return loss_info
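
Example 4 is the graph-mode (TF1) variant of Example 3: each epoch's train op is chained to the previous one with tf.control_dependencies, and the normalizer updates are chained after the final epoch. A minimal sketch of that chaining pattern in isolation, assuming tf.compat.v1 graph mode (the counter variable is purely illustrative):

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

counter = tf.Variable(0, dtype=tf.int32)
op = tf.no_op()
for _ in range(3):
    # Each new op runs only after the previous one has executed.
    with tf.control_dependencies([op]):
        op = tf.assign_add(counter, 1)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(op)              # runs the whole chain of assign_adds
    print(sess.run(counter))  # 3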