Example #1
import tensorflow as tf


def diag_normal_kl(mean0, logstd0, mean1, logstd1):
    """KL divergence of two normals with diagonal covariance."""
    logstd0_2, logstd1_2 = 2 * logstd0, 2 * logstd1
    return 0.5 * (tf.reduce_sum(tf.exp(logstd0_2 - logstd1_2), -1) +
                  tf.reduce_sum((mean1 - mean0)**2 / tf.exp(logstd1_2), -1) +
                  tf.reduce_sum(logstd1_2, -1) - tf.reduce_sum(logstd0_2, -1) -
                  mean0.shape[-1].value)
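The function evaluates the closed-form KL between two diagonal Gaussians. As a sanity check, the same formula in plain NumPy (a minimal sketch; diag_normal_kl_np is a hypothetical helper, not part of the snippet's project):

import numpy as np

def diag_normal_kl_np(mean0, logstd0, mean1, logstd1):
    # KL(N0 || N1) = 0.5 * (tr(S1^-1 S0) + (m1-m0)^T S1^-1 (m1-m0)
    #                       + log det S1 - log det S0 - k) for diagonal S0, S1.
    var0, var1 = np.exp(2 * logstd0), np.exp(2 * logstd1)
    return 0.5 * (np.sum(var0 / var1, -1)
                  + np.sum((mean1 - mean0) ** 2 / var1, -1)
                  + np.sum(2 * (logstd1 - logstd0), -1)
                  - mean0.shape[-1])

print(diag_normal_kl_np(np.zeros(3), np.zeros(3), np.ones(3), np.zeros(3)))  # 1.5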
Example #2
File: normalize.py  Project: Gs-001/quad
    def update(self, value):
        """Update the mean and variance estimates.

        Args:
          value: Batch or single value tensor.

        Returns:
          Summary tensor.
        """
        with tf.name_scope(self._name + '/update'):
            if value.shape.ndims == self._mean.shape.ndims:
                # Add a batch dimension if necessary.
                value = value[None, ...]
            count = tf.shape(value)[0]
            with tf.control_dependencies([self._count.assign_add(count)]):
                step = tf.cast(self._count, tf.float32)
                mean_delta = tf.reduce_sum(value - self._mean[None, ...], 0)
                new_mean = self._mean + mean_delta / step
                new_mean = tf.cond(self._count > 1, lambda: new_mean,
                                   lambda: value[0])
                var_delta = (value - self._mean[None, ...]) * (
                    value - new_mean[None, ...])
                new_var_sum = self._var_sum + tf.reduce_sum(var_delta, 0)
            with tf.control_dependencies([new_mean, new_var_sum]):
                update = self._mean.assign(new_mean), self._var_sum.assign(
                    new_var_sum)
            with tf.control_dependencies(update):
                if value.shape.ndims == 1:
                    value = tf.reduce_mean(value)
                return self._summary('value', tf.reduce_mean(value))
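The arithmetic above is the batched form of Welford's online mean/variance update: the mean moves by sum(value - mean) / count, and the variance sum accumulates (value - old_mean) * (value - new_mean). The same recurrence in plain NumPy (an illustrative sketch, not the class API):

import numpy as np

count, mean, var_sum = 0, 0.0, 0.0
for batch in (np.array([1.0, 2.0]), np.array([3.0])):
    count += batch.shape[0]
    new_mean = mean + np.sum(batch - mean) / count
    var_sum += np.sum((batch - mean) * (batch - new_mean))
    mean = new_mean

print(mean, var_sum / count)  # 2.0 and 2/3, matching np.mean and np.var of [1, 2, 3]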
Example #3
import tensorflow as tf


def calc_bound_loss(x_tf, bound_min, bound_max):
    # Penalty for violating the bounds: zero inside, quadratic outside.
    violation_min = tf.minimum(x_tf - bound_min, 0)
    violation_max = tf.maximum(x_tf - bound_max, 0)
    violation = tf.reduce_sum(tf.square(violation_min),
                              axis=-1) + tf.reduce_sum(
                                  tf.square(violation_max), axis=-1)
    loss = 0.5 * tf.reduce_mean(violation)
    return loss
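The penalty is zero for components inside [bound_min, bound_max] and grows quadratically outside. A small NumPy usage sketch (bound_loss_np is a hypothetical stand-in):

import numpy as np

def bound_loss_np(x, lo, hi):
    v_min = np.minimum(x - lo, 0)  # negative where x < lo
    v_max = np.maximum(x - hi, 0)  # positive where x > hi
    return 0.5 * np.mean(np.sum(v_min ** 2 + v_max ** 2, axis=-1))

x = np.array([[-1.5, 0.0, 2.0]])
print(bound_loss_np(x, -1.0, 1.0))  # 0.5 * (0.25 + 0.0 + 1.0) = 0.625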
Example #4
import numpy as np
import tensorflow as tf


def calc_logp_gaussian(x_tf, mean_tf, std_tf):
    dim = tf.to_float(tf.shape(x_tf)[-1])

    if mean_tf is None:
        diff_tf = x_tf
    else:
        diff_tf = x_tf - mean_tf

    logp_tf = -0.5 * tf.reduce_sum(tf.square(diff_tf / std_tf), axis=-1)
    logp_tf += -0.5 * dim * np.log(2 * np.pi) - tf.reduce_sum(tf.log(std_tf),
                                                              axis=-1)

    return logp_tf
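When mean_tf is None, the input is treated as already centered, i.e. a zero-mean Gaussian. Cross-checking that branch against scipy in NumPy terms (a sketch):

import numpy as np
from scipy.stats import norm

x = np.array([0.5, -1.0])
std = np.array([1.0, 2.0])

logp = (-0.5 * np.sum((x / std) ** 2)
        - 0.5 * x.shape[-1] * np.log(2 * np.pi) - np.sum(np.log(std)))
print(logp, np.sum(norm.logpdf(x, 0.0, std)))  # identical values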
Example #5
    def append(self, transitions, rows=None):
        """Append a batch of transitions to rows of the memory.

    Args:
      transitions: Tuple of transition quantities with batch dimension.
      rows: Episodes to append to, defaults to all.

    Returns:
      Operation.
    """
        rows = tf.range(self._capacity) if rows is None else rows
        assert rows.shape.ndims == 1
        assert_capacity = tf.assert_less(rows,
                                         self._capacity,
                                         message='capacity exceeded')
        with tf.control_dependencies([assert_capacity]):
            assert_max_length = tf.assert_less(tf.gather(self._length, rows),
                                               self._max_length,
                                               message='max length exceeded')
        append_ops = []
        with tf.control_dependencies([assert_max_length]):
            for buffer_, elements in zip(self._buffers, transitions):
                timestep = tf.gather(self._length, rows)
                indices = tf.stack([rows, timestep], 1)
                append_ops.append(
                    tf.scatter_nd_update(buffer_, indices, elements))
        with tf.control_dependencies(append_ops):
            episode_mask = tf.reduce_sum(
                tf.one_hot(rows, self._capacity, dtype=tf.int32), 0)
            return self._length.assign_add(episode_mask)
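The scatter update writes each batch element at its (row, current_length) position, and the summed one-hot mask increments only the touched rows. The same bookkeeping in NumPy (a sketch with hypothetical shapes):

import numpy as np

capacity, max_length = 4, 3
buffer_ = np.zeros((capacity, max_length))  # one buffer of scalar transitions
length = np.array([1, 0, 2, 0])             # current episode lengths
rows = np.array([0, 2])
elements = np.array([7.0, 9.0])

buffer_[rows, length[rows]] = elements      # scatter at (row, timestep) pairs
length[rows] += 1                           # what the one-hot mask achieves
print(buffer_, length)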
Example #6
    def submit(self, value):
        """Submit a single or batch tensor to refine the streaming mean."""
        # Add a batch dimension if necessary.
        if value.shape.ndims == self._sum.shape.ndims:
            value = value[None, ...]
        return tf.group(self._sum.assign_add(tf.reduce_sum(value, 0)),
                        self._count.assign_add(tf.shape(value)[0]))
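Only a running sum and count are kept, so the mean can be read out as sum / count at any point. In NumPy terms (a trivial illustrative sketch):

import numpy as np

total, count = 0.0, 0
for batch in (np.array([1.0, 3.0]), np.array([5.0])):
    total += np.sum(batch, 0)
    count += batch.shape[0]
print(total / count)  # 3.0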
Example #7
    def _build_losses(self, json_data):
        actor_weight_decay = json_data.get(self.ACTOR_WEIGHT_DECAY_KEY, 0)
        critic_weight_decay = json_data.get(self.CRITIC_WEIGHT_DECAY_KEY, 0)

        norm_val_diff = self.val_norm.normalize_tf(
            self.tar_val_tf) - self.val_norm.normalize_tf(self.critic_tf)
        self.critic_loss_tf = 0.5 * tf.reduce_mean(tf.square(norm_val_diff))

        if critic_weight_decay != 0:
            self.critic_loss_tf += critic_weight_decay * self._weight_decay_loss(
                'main/critic')

        norm_a_mean_tf = self.a_norm.normalize_tf(self.actor_tf)
        norm_a_diff = self.a_norm.normalize_tf(self.a_tf) - norm_a_mean_tf

        self.actor_loss_tf = tf.reduce_sum(tf.square(norm_a_diff), axis=-1)
        self.actor_loss_tf *= self.adv_tf
        self.actor_loss_tf = 0.5 * tf.reduce_mean(self.actor_loss_tf)

        norm_a_bound_min = self.a_norm.normalize(self.a_bound_min)
        norm_a_bound_max = self.a_norm.normalize(self.a_bound_max)
        a_bound_loss = TFUtil.calc_bound_loss(norm_a_mean_tf, norm_a_bound_min,
                                              norm_a_bound_max)
        a_bound_loss /= self.exp_params_curr.noise
        self.actor_loss_tf += a_bound_loss

        if actor_weight_decay != 0:
            self.actor_loss_tf += actor_weight_decay * self._weight_decay_loss(
                'main/actor')

        return
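The actor term is an advantage-weighted squared distance between normalized sampled actions and the normalized policy mean. Isolated in NumPy (a sketch; the array names are illustrative, not the class API):

import numpy as np

norm_a = np.array([[0.2, -0.1]])   # normalized sampled actions
norm_mu = np.array([[0.0, 0.0]])   # normalized policy means
adv = np.array([1.5])              # advantages

actor_loss = 0.5 * np.mean(adv * np.sum((norm_a - norm_mu) ** 2, axis=-1))
print(actor_loss)  # 0.5 * 1.5 * (0.04 + 0.01) = 0.0375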
Example #8
    def _policy_loss(self, mean, logstd, old_mean, old_logstd, action,
                     advantage, length):
        """Compute the policy loss composed of multiple components.

    1. The policy gradient loss is importance sampled from the data-collecting
       policy at the beginning of training.
    2. The second term is a KL penalty between the policy at the beginning of
       training and the current policy.
    3. Additionally, if this KL already changed more than twice the target
       amount, we activate a strong penalty discouraging further divergence.

    Args:
      mean: Sequences of action means of the current policy.
      logstd: Sequences of action log stddevs of the current policy.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      action: Sequences of actions.
      advantage: Sequences of advantages.
      length: Batch of sequence lengths.

    Returns:
      Tuple of loss tensor and summary tensor.
    """
        with tf.name_scope('policy_loss'):
            entropy = utility.diag_normal_entropy(mean, logstd)
            kl = tf.reduce_mean(
                self._mask(
                    utility.diag_normal_kl(old_mean, old_logstd, mean, logstd),
                    length), 1)
            policy_gradient = tf.exp(
                utility.diag_normal_logpdf(mean, logstd, action) -
                utility.diag_normal_logpdf(old_mean, old_logstd, action))
            surrogate_loss = -tf.reduce_mean(
                self._mask(policy_gradient * tf.stop_gradient(advantage),
                           length), 1)
            kl_penalty = self._penalty * kl
            cutoff_threshold = self._config.kl_target * self._config.kl_cutoff_factor
            cutoff_count = tf.reduce_sum(
                tf.cast(kl > cutoff_threshold, tf.int32))
            with tf.control_dependencies([
                    tf.cond(cutoff_count > 0,
                            lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '),
                            # `int` is shorthand for `lambda: 0`, matching the
                            # scalar returned by tf.Print in the other branch.
                            int)
            ]):
                kl_cutoff = (self._config.kl_cutoff_coef *
                             tf.cast(kl > cutoff_threshold, tf.float32) *
                             (kl - cutoff_threshold)**2)
            policy_loss = surrogate_loss + kl_penalty + kl_cutoff
            summary = tf.summary.merge([
                tf.summary.histogram('entropy', entropy),
                tf.summary.histogram('kl', kl),
                tf.summary.histogram('surrogate_loss', surrogate_loss),
                tf.summary.histogram('kl_penalty', kl_penalty),
                tf.summary.histogram('kl_cutoff', kl_cutoff),
                tf.summary.histogram('kl_penalty_combined',
                                     kl_penalty + kl_cutoff),
                tf.summary.histogram('policy_loss', policy_loss),
                tf.summary.scalar('avg_surr_loss',
                                  tf.reduce_mean(surrogate_loss)),
                tf.summary.scalar('avg_kl_penalty',
                                  tf.reduce_mean(kl_penalty)),
                tf.summary.scalar('avg_policy_loss',
                                  tf.reduce_mean(policy_loss))
            ])
            policy_loss = tf.reduce_mean(policy_loss, 0)
            return tf.check_numerics(policy_loss, 'policy_loss'), summary
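The cutoff term only activates once the per-sequence KL exceeds kl_target * kl_cutoff_factor, after which it grows quadratically. Isolated in NumPy (a sketch with made-up config values):

import numpy as np

kl = np.array([0.005, 0.05])
kl_target, kl_cutoff_factor, kl_cutoff_coef = 0.01, 2.0, 1000.0

threshold = kl_target * kl_cutoff_factor
kl_cutoff = kl_cutoff_coef * (kl > threshold) * (kl - threshold) ** 2
print(kl_cutoff)  # [0.0, 0.9]; only the second sequence is penalized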
Example #9
import math

import tensorflow as tf


def diag_normal_entropy(mean, logstd):
  """Entropy of a normal with diagonal covariance."""
  constant = mean.shape[-1].value * math.log(2 * math.pi * math.e)
  return (constant + tf.reduce_sum(2 * logstd, 1)) / 2
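The returned value is the closed-form entropy 0.5 * k * log(2*pi*e) + sum(logstd). Cross-checked against scipy's univariate entropies (a sketch):

import numpy as np
from scipy.stats import norm

logstd = np.array([0.0, 0.5])
k = logstd.shape[-1]
entropy = 0.5 * (k * np.log(2 * np.pi * np.e) + np.sum(2 * logstd))
print(entropy, np.sum(norm(0.0, np.exp(logstd)).entropy()))  # equal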
Example #10
import math

import tensorflow as tf


def diag_normal_logpdf(mean, logstd, loc):
  """Log density of a normal with diagonal covariance."""
  # Per-dimension normalizer: -0.5 * log(2*pi) - log(sigma).
  constant = -0.5 * math.log(2 * math.pi) - logstd
  value = -0.5 * ((loc - mean) / tf.exp(logstd))**2
  return tf.reduce_sum(constant + value, -1)
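The per-dimension normalizer -0.5 * log(2*pi) - logstd can be cross-checked against scipy (a sketch):

import numpy as np
from scipy.stats import norm

loc = np.array([0.3, -0.7])
mean = np.array([0.0, 0.1])
logstd = np.array([0.2, -0.1])

manual = np.sum(-0.5 * np.log(2 * np.pi) - logstd
                - 0.5 * ((loc - mean) / np.exp(logstd)) ** 2)
print(manual, np.sum(norm.logpdf(loc, mean, np.exp(logstd))))  # identical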