def make_beta_update(self) -> None: """ Creates the beta parameter and its updater for GAIL """ new_beta = tf.maximum( self.beta + self.alpha * (self.kl_loss - self.mutual_information), EPSILON ) with tf.control_dependencies([self.update_batch]): self.update_beta = tf.assign(self.beta, new_beta)
def _create_losses(self, probs, old_probs, value_heads, entropy, beta, epsilon, lr, max_step): """ Creates training-specific Tensorflow ops for PPO models. :param probs: Current policy probabilities :param old_probs: Past policy probabilities :param value_heads: Value estimate tensors from each value stream :param beta: Entropy regularization strength :param entropy: Current policy entropy :param epsilon: Value for policy-divergence threshold :param lr: Learning rate :param max_step: Total number of training steps. """ self.returns_holders = {} self.old_values = {} for name in value_heads.keys(): returns_holder = tf.placeholder(shape=[None], dtype=tf.float32, name="{}_returns".format(name)) old_value = tf.placeholder(shape=[None], dtype=tf.float32, name="{}_value_estimate".format(name)) self.returns_holders[name] = returns_holder self.old_values[name] = old_value self.advantage = tf.placeholder(shape=[None], dtype=tf.float32, name="advantages") advantage = tf.expand_dims(self.advantage, -1) decay_epsilon = tf.train.polynomial_decay(epsilon, self.policy.global_step, max_step, 0.1, power=1.0) decay_beta = tf.train.polynomial_decay(beta, self.policy.global_step, max_step, 1e-5, power=1.0) value_losses = [] for name, head in value_heads.items(): clipped_value_estimate = self.old_values[name] + tf.clip_by_value( tf.reduce_sum(head, axis=1) - self.old_values[name], -decay_epsilon, decay_epsilon, ) v_opt_a = tf.squared_difference(self.returns_holders[name], tf.reduce_sum(head, axis=1)) v_opt_b = tf.squared_difference(self.returns_holders[name], clipped_value_estimate) value_loss = tf.reduce_mean( tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), self.policy.mask, 2)[1]) value_losses.append(value_loss) self.value_loss = tf.reduce_mean(value_losses) r_theta = tf.exp(probs - old_probs) p_opt_a = r_theta * advantage p_opt_b = (tf.clip_by_value(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon) * advantage) self.policy_loss = -tf.reduce_mean( tf.dynamic_partition(tf.minimum(p_opt_a, p_opt_b), self.policy.mask, 2)[1]) # For cleaner stats reporting self.abs_policy_loss = tf.abs(self.policy_loss) self.loss = ( self.policy_loss + 0.5 * self.value_loss - decay_beta * tf.reduce_mean( tf.dynamic_partition(entropy, self.policy.mask, 2)[1]))