Example #1
    def calc_loss(self, training_info: TrainingInfo):
        info = training_info.info  # SarsaInfo
        critic_loss = losses.element_wise_squared_loss(info.returns,
                                                       info.critic)
        not_first_step = tf.not_equal(training_info.step_type, StepType.FIRST)
        critic_loss *= tf.cast(not_first_step, tf.float32)

        def _summary():
            with self.name_scope:
                tf.summary.scalar("values", tf.reduce_mean(info.critic))
                tf.summary.scalar("returns", tf.reduce_mean(info.returns))
                safe_mean_hist_summary("td_error", info.returns - info.critic)
                tf.summary.scalar(
                    "explained_variance_of_return_by_value",
                    common.explained_variance(info.critic, info.returns))

        if self._debug_summaries:
            common.run_if(common.should_record_summaries(), _summary)

        return LossInfo(
            loss=info.actor_loss,
            # put critic_loss to scalar_loss because loss will be masked by
            # ~is_last at train_complete(). The critic_loss here should be
            # masked by ~is_first instead, which is done above.
            scalar_loss=tf.reduce_mean(critic_loss),
            extra=SarsaLossInfo(actor=info.actor_loss, critic=critic_loss))
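The comment above relies on the trainer masking loss by ~is_last at train_complete(), while the critic term must be masked by ~is_first instead. A minimal standalone sketch of that step-type masking (the step-type values below are stand-ins, not the library's StepType constants):

import tensorflow as tf

# Stand-in step-type values; the library uses StepType.FIRST / MID / LAST.
FIRST, MID, LAST = 0, 1, 2
step_type = tf.constant([FIRST, MID, MID, LAST])
critic_loss = tf.constant([1.0, 1.0, 1.0, 1.0])

# Zero out the critic term at FIRST steps, mirroring the masking above; the
# per-step `loss` would analogously be zeroed at LAST steps by the trainer.
not_first = tf.cast(tf.not_equal(step_type, FIRST), tf.float32)
masked_critic_loss = critic_loss * not_first
print(masked_critic_loss.numpy())  # [0. 1. 1. 1.]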
Example #2
    def _pg_loss(self, training_info: TrainingInfo, advantages):
        scope = tf.name_scope(self.__class__.__name__)
        importance_ratio, importance_ratio_clipped = value_ops.action_importance_ratio(
            action_distribution=training_info.action_distribution,
            collect_action_distribution=(
                training_info.collect_action_distribution),
            action=training_info.action,
            action_spec=self._action_spec,
            clipping_mode='double_sided',
            scope=scope,
            importance_ratio_clipping=self._importance_ratio_clipping,
            log_prob_clipping=self._log_prob_clipping,
            check_numerics=self._check_numerics,
            debug_summaries=self._debug_summaries)
        # Pessimistically choose the maximum objective value for clipped and
        # unclipped importance ratios.
        pg_objective = -importance_ratio * advantages
        pg_objective_clipped = -importance_ratio_clipped * advantages
        policy_gradient_loss = tf.maximum(pg_objective, pg_objective_clipped)

        def _summary():
            with scope:
                tf.summary.histogram('pg_objective', pg_objective)
                tf.summary.histogram('pg_objective_clipped',
                                     pg_objective_clipped)

        if self._debug_summaries:
            common.run_if(common.should_record_summaries(), _summary)

        if self._check_numerics:
            policy_gradient_loss = tf.debugging.check_numerics(
                policy_gradient_loss, 'policy_gradient_loss')

        return policy_gradient_loss
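A toy numeric sketch of the pessimistic maximum computed above; the clipping epsilon of 0.2 is an assumed value standing in for self._importance_ratio_clipping:

import tensorflow as tf

eps = 0.2  # assumed clipping epsilon
importance_ratio = tf.constant([0.5, 1.0, 1.5])
advantages = tf.constant([1.0, -1.0, 2.0])

ratio_clipped = tf.clip_by_value(importance_ratio, 1 - eps, 1 + eps)
pg_objective = -importance_ratio * advantages
pg_objective_clipped = -ratio_clipped * advantages
# Element-wise maximum picks the more pessimistic (larger) loss term.
policy_gradient_loss = tf.maximum(pg_objective, pg_objective_clipped)
print(policy_gradient_loss.numpy())  # approximately [-0.5  1.  -2.4]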
Example #3
    def calc_loss(self, training_info: EntropyTargetInfo, valid_mask=None):
        loss_info = training_info.loss
        mask = tf.cast(training_info.step_type != StepType.LAST, tf.float32)
        if valid_mask is not None:
            mask = mask * tf.cast(valid_mask, tf.float32)
        entropy = -loss_info.extra.neg_entropy * mask
        num = tf.reduce_sum(mask)
        not_empty = num > 0
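        # Guard against division by zero, then compute the masked mean and
        # std of the entropy: std = sqrt(E[x^2] - (E[x])^2).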
        num = tf.maximum(num, 1)
        entropy2 = tf.reduce_sum(tf.square(entropy)) / num
        entropy = tf.reduce_sum(entropy) / num
        entropy_std = tf.sqrt(tf.maximum(0.0, entropy2 - entropy * entropy))

        run_if(not_empty, lambda: self.adjust_alpha(entropy))

        def _summarize():
            with self.name_scope:
                tf.summary.scalar("entropy_std", entropy_std)

        if self._debug_summaries:
            run_if(
                tf.logical_and(not_empty, should_record_summaries()),
                _summarize)

        alpha = tf.exp(self._log_alpha)
        return loss_info._replace(loss=loss_info.loss * alpha)
Example #4
    def __call__(self, training_info: TrainingInfo, value):
        """Cacluate actor critic loss

        The first dimension of all the tensors is time dimension and the second
        dimesion is the batch dimension.

        Args:
            training_info (TrainingInfo): training_info collected by
                (On/Off)PolicyDriver. All tensors in training_info are time-major
            value (tf.Tensor): the time-major tensor for the value at each time
                step
            final_value (tf.Tensor): the value at one step ahead.
        Returns:
            loss_info (LossInfo): with loss_info.extra being ActorCriticLossInfo
        """

        returns, advantages = self._calc_returns_and_advantages(
            training_info, value)

        def _summary():
            with tf.name_scope('ActorCriticLoss'):
                tf.summary.scalar("values", tf.reduce_mean(value))
                tf.summary.scalar("returns", tf.reduce_mean(returns))
                tf.summary.scalar("advantages/mean",
                                  tf.reduce_mean(advantages))
                tf.summary.histogram("advantages/value", advantages)
                tf.summary.scalar("explained_variance_of_return_by_value",
                                  common.explained_variance(value, returns))

        if self._debug_summaries:
            common.run_if(common.should_record_summaries(), _summary)

        if self._normalize_advantages:
            advantages = _normalize_advantages(advantages, axes=(0, 1))

        if self._advantage_clip:
            advantages = tf.clip_by_value(advantages, -self._advantage_clip,
                                          self._advantage_clip)

        pg_loss = self._pg_loss(training_info, tf.stop_gradient(advantages))

        td_loss = self._td_error_loss_fn(tf.stop_gradient(returns), value)

        loss = pg_loss + self._td_loss_weight * td_loss

        entropy_loss = ()
        if self._entropy_regularization is not None:
            entropy, entropy_for_gradient = dist_utils.entropy_with_fallback(
                training_info.action_distribution, self._action_spec)
            entropy_loss = -entropy
            loss -= self._entropy_regularization * entropy_for_gradient

        return LossInfo(loss=loss,
                        extra=ActorCriticLossInfo(td_loss=td_loss,
                                                  pg_loss=pg_loss,
                                                  entropy_loss=entropy_loss))
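For illustration, a toy sketch of how the terms computed above combine into the final scalar; the two weights are assumed values, not the class defaults:

import tensorflow as tf

pg_loss = tf.constant(0.3)
td_loss = tf.constant(0.8)
entropy_for_gradient = tf.constant(1.2)
td_loss_weight = 0.5            # assumed value for self._td_loss_weight
entropy_regularization = 0.01   # assumed value for self._entropy_regularization

# loss = pg_loss + td_loss_weight * td_loss
#        - entropy_regularization * entropy_for_gradient
loss = (pg_loss + td_loss_weight * td_loss
        - entropy_regularization * entropy_for_gradient)
print(loss.numpy())  # 0.3 + 0.4 - 0.012 = 0.688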
Example #5
    def after_train(self, training_info):
        """Adjust actor parameter according to KL-divergence."""
        exp_array = TracExperience(
            observation=training_info.info.observation,
            step_type=training_info.step_type,
            action_param=common.get_distribution_params(
                training_info.action_distribution),
            state=training_info.info.state)
        exp_array = common.create_and_unstack_tensor_array(
            exp_array, clear_after_read=False)
        dists, steps = self._trusted_updater.adjust_step(
            lambda: self._calc_change(exp_array), self._action_dist_clips)

        def _summarize():
            with self.name_scope:
                for i, d in enumerate(tf.nest.flatten(dists)):
                    tf.summary.scalar("unadjusted_action_dist/%s" % i, d)
                tf.summary.scalar("adjust_steps", steps)

        common.run_if(common.should_record_summaries(), _summarize)
        self._ac_algorithm.after_train(
            training_info._replace(info=training_info.info.ac))
Example #6
    def calc_loss(self, training_info: EntropyTargetInfo):
        loss_info = training_info.loss
        mask = tf.cast(training_info.step_type != StepType.LAST, tf.float32)
        entropy = -loss_info.extra.entropy_loss * mask
        num = tf.reduce_sum(mask)
        entropy2 = tf.reduce_sum(tf.square(entropy)) / num
        entropy = tf.reduce_sum(entropy) / num
        entropy_std = tf.sqrt(tf.maximum(0.0, entropy2 - entropy * entropy))
        prev_avg_entropy = self._avg_entropy.get()
        avg_entropy = self._avg_entropy.average(entropy)

        def _init():
            crossing = avg_entropy < self._target_entropy
            self._stage.assign_add(tf.cast(crossing, tf.int32))

        def _adjust():
            previous_above = tf.cast(self._stage, tf.bool)
            above = avg_entropy > self._target_entropy
            self._stage.assign(tf.cast(above, tf.int32))
            crossing = above != previous_above
            update_rate = self._update_rate
            update_rate = tf.where(crossing, 0.9 * update_rate, update_rate)
            update_rate = tf.maximum(update_rate, self._slow_update_rate)
            update_rate = tf.where(entropy < self._fast_stage_thresh,
                                   np.float32(self._fast_update_rate),
                                   update_rate)
            self._update_rate.assign(update_rate)
            above = tf.cast(above, tf.float32)
            below = 1 - above
            increasing = tf.cast(avg_entropy > prev_avg_entropy, tf.float32)
            decreasing = 1 - increasing
            log_alpha = self._log_alpha + (
                (below + 0.5 * above) * decreasing -
                (above + 0.5 * below) * increasing) * update_rate
            log_alpha = tf.maximum(log_alpha, np.float32(self._min_log_alpha))
            self._log_alpha.assign(log_alpha)

        run_if(self._stage == -1, _init)
        run_if(self._stage >= 0, _adjust)
        alpha = tf.exp(self._log_alpha)

        def _summarize():
            with self.name_scope:
                tf.summary.scalar("alpha", alpha)
                tf.summary.scalar("entropy_std", entropy_std)
                tf.summary.scalar("avg_entropy", avg_entropy)
                tf.summary.scalar("stage", self._stage)
                tf.summary.scalar("update_rate", self._update_rate)

        if self._debug_summaries:
            run_if(should_record_summaries(), _summarize)

        return loss_info._replace(loss=loss_info.loss * alpha)
Example #7
def action_importance_ratio(action_distribution, collect_action_distribution,
                            action, action_spec, clipping_mode, scope,
                            importance_ratio_clipping, log_prob_clipping,
                            check_numerics, debug_summaries):
    """ ratio for importance sampling, used in PPO loss and vtrace loss.

        Caller has to save tf.name_scope() and pass scope to this function.

        Args:
            action_distribution (nested tf.distribution): Distribution over
                actions under target policy.
            collect_action_distribution (nested tf.distribution): distribution
                over actions from behavior policy, used to sample actions for
                the rollout.
            action (nested Tensor): possibly batched action tuple taken during
                rollout.
            action_spec (nested BoundedTensorSpec): representing the actions.
            clipping_mode (str): mode for clipping the importance ratio.
                'double_sided': clips the range of importance ratio into
                    [1-importance_ratio_clipping, 1+importance_ratio_clipping],
                    which is used by PPOLoss.
                'capping': clips the range of importance ratio into
                    min(1+importance_ratio_clipping, importance_ratio),
                    which is used by VTraceLoss, where c_bar or rho_bar =
                    1+importance_ratio_clipping.
            scope (name scope manager): returned by tf.name_scope(), set
                outside.
            importance_ratio_clipping (float):  Epsilon in clipped, surrogate
                PPO objective. See the cited paper for more detail.
            log_prob_clipping (float): If >0, clipping log probs to the range
                (-log_prob_clipping, log_prob_clipping) to prevent inf / NaN
                values.
            check_numerics (bool):  If true, adds tf.debugging.check_numerics to
                help find NaN / Inf values. For debugging only.
            debug_summaries (bool): If True, write debug summaries via
                tf.summary.

        Returns:
            importance_ratio (Tensor), importance_ratio_clipped (Tensor).
    """
    current_policy_distribution = action_distribution

    sample_action_log_probs = tfa_common.log_probability(
        collect_action_distribution, action, action_spec)
    sample_action_log_probs = tf.stop_gradient(sample_action_log_probs)

    action_log_prob = tfa_common.log_probability(current_policy_distribution,
                                                 action, action_spec)
    if log_prob_clipping > 0.0:
        action_log_prob = tf.clip_by_value(action_log_prob, -log_prob_clipping,
                                           log_prob_clipping)
    if check_numerics:
        action_log_prob = tf.debugging.check_numerics(action_log_prob,
                                                      'action_log_prob')

    # Prepare both clipped and unclipped importance ratios.
    importance_ratio = tf.exp(action_log_prob - sample_action_log_probs)
    if check_numerics:
        importance_ratio = tf.debugging.check_numerics(importance_ratio,
                                                       'importance_ratio')

    if clipping_mode == 'double_sided':
        importance_ratio_clipped = tf.clip_by_value(
            importance_ratio, 1 - importance_ratio_clipping,
            1 + importance_ratio_clipping)
    elif clipping_mode == 'capping':
        importance_ratio_clipped = tf.minimum(importance_ratio,
                                              1 + importance_ratio_clipping)
    else:
        raise ValueError('Unsupported clipping mode: ' + clipping_mode)

    def _summary():
        with scope:
            if importance_ratio_clipping > 0.0:
                clip_fraction = tf.reduce_mean(input_tensor=tf.cast(
                    tf.greater(tf.abs(importance_ratio - 1.0),
                               importance_ratio_clipping), tf.float32))
                tf.summary.scalar('clip_fraction', clip_fraction)

            tf.summary.histogram('action_log_prob', action_log_prob)
            tf.summary.histogram('action_log_prob_sample',
                                 sample_action_log_probs)
            tf.summary.histogram('importance_ratio', importance_ratio)
            tf.summary.scalar('importance_ratio_mean',
                              tf.reduce_mean(input_tensor=importance_ratio))
            tf.summary.histogram('importance_ratio_clipped',
                                 importance_ratio_clipped)

    if debug_summaries:
        common.run_if(common.should_record_summaries(), _summary)

    return importance_ratio, importance_ratio_clipped
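A small numeric sketch of the two clipping modes described in the docstring, assuming an importance_ratio_clipping of 0.2:

import tensorflow as tf

eps = 0.2  # assumed importance_ratio_clipping
ratio = tf.constant([0.5, 0.9, 1.1, 1.7])

# 'double_sided' (PPO): clip into [1 - eps, 1 + eps].
double_sided = tf.clip_by_value(ratio, 1 - eps, 1 + eps)
# 'capping' (V-trace): only cap from above at 1 + eps (c_bar / rho_bar).
capping = tf.minimum(ratio, 1 + eps)
print(double_sided.numpy())  # [0.8 0.9 1.1 1.2]
print(capping.numpy())       # [0.5 0.9 1.1 1.2]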
Example #8
    def wrapper(*args, **kwargs):
        from alf.utils.common import run_if
        return run_if(summary_ops_v2._should_record_summaries_v2(),
                      lambda: summary_func(*args, **kwargs))
Example #9
    def adjust_alpha(self, entropy):
        """Adjust alpha according to the current entropy.

        Args:
            entropy (scalar Tensor): the current entropy.
        Returns:
            adjusted entropy regularization
        """
        prev_avg_entropy = self._avg_entropy.get()
        avg_entropy = self._avg_entropy.average(entropy)

        def _init_entropy():
            self._max_entropy.assign(
                tf.minimum(0.8 * avg_entropy, avg_entropy / 0.8))
            self._stage.assign_add(1)

        def _init():
            below = avg_entropy < self._max_entropy
            increasing = tf.cast(avg_entropy > prev_avg_entropy, tf.float32)
            # -1 * increasing + 0.5 * (1 - increasing)
            update_rate = (
                0.5 - 1.5 * increasing) * self._very_slow_update_rate
            self._stage.assign_add(tf.cast(below, tf.int32))
            self._log_alpha.assign(
                tf.maximum(self._log_alpha + update_rate,
                           np.float32(self._min_log_alpha)))

        def _free():
            crossing = avg_entropy < self._target_entropy
            self._stage.assign_add(tf.cast(crossing, tf.int32))

        def _adjust():
            previous_above = tf.cast(self._stage, tf.bool)
            above = avg_entropy > self._target_entropy
            self._stage.assign(tf.cast(above, tf.int32))
            crossing = above != previous_above
            update_rate = self._update_rate
            update_rate = tf.where(crossing, 0.9 * update_rate, update_rate)
            update_rate = tf.maximum(update_rate, self._slow_update_rate)
            update_rate = tf.where(entropy < self._fast_stage_thresh,
                                   np.float32(self._fast_update_rate),
                                   update_rate)
            self._update_rate.assign(update_rate)
            above = tf.cast(above, tf.float32)
            below = 1 - above
            increasing = tf.cast(avg_entropy > prev_avg_entropy, tf.float32)
            decreasing = 1 - increasing
            log_alpha = self._log_alpha + (
                (below + 0.5 * above) * decreasing -
                (above + 0.5 * below) * increasing) * update_rate
            log_alpha = tf.maximum(log_alpha, np.float32(self._min_log_alpha))
            self._log_alpha.assign(log_alpha)

        run_if(self._stage < -2, _init_entropy)
        run_if(self._stage == -2, _init)
        run_if(self._stage == -1, _free)
        run_if(self._stage >= 0, _adjust)
        alpha = tf.exp(self._log_alpha)

        def _summarize():
            with self.name_scope:
                tf.summary.scalar("alpha", alpha)
                tf.summary.scalar("avg_entropy", avg_entropy)
                tf.summary.scalar("stage", self._stage)
                tf.summary.scalar("update_rate", self._update_rate)

        if self._debug_summaries:
            run_if(should_record_summaries(), _summarize)

        return alpha
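To make the sign conventions in _adjust() easier to follow, here is a toy NumPy-only sketch of the log_alpha update direction; the helper name and the sample update_rate are illustrative, not part of the library:

import numpy as np

def log_alpha_delta(above, increasing, update_rate):
    """Return the log_alpha increment used in _adjust().

    above: whether avg_entropy is above the target entropy.
    increasing: whether avg_entropy is higher than its previous average.
    """
    above = float(above)
    below = 1.0 - above
    increasing = float(increasing)
    decreasing = 1.0 - increasing
    return ((below + 0.5 * above) * decreasing
            - (above + 0.5 * below) * increasing) * update_rate

# Below target and still decreasing: raise log_alpha at the full rate.
print(log_alpha_delta(above=False, increasing=False, update_rate=0.1))  #  0.1
# Above target and still increasing: lower log_alpha at the full rate.
print(log_alpha_delta(above=True, increasing=True, update_rate=0.1))    # -0.1
# Mixed cases move at half the rate.
print(log_alpha_delta(above=True, increasing=False, update_rate=0.1))   #  0.05
print(log_alpha_delta(above=False, increasing=True, update_rate=0.1))   # -0.05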