def create_forward_model(
    self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
) -> None:
    """
    Creates forward model TensorFlow ops for Curiosity module.
    Predicts encoded future state based on encoded current state and given action.
    :param encoded_state: Tensor corresponding to encoded current state.
    :param encoded_next_state: Tensor corresponding to encoded next state.
    """
    combined_input = tf.concat(
        [encoded_state, self.policy.selected_actions], axis=1
    )
    hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish)
    pred_next_state = tf.layers.dense(
        hidden,
        self.encoding_size
        * (self.policy.vis_obs_size + int(self.policy.vec_obs_size > 0)),
        activation=None,
    )
    squared_difference = 0.5 * tf.reduce_sum(
        tf.squared_difference(pred_next_state, encoded_next_state), axis=1
    )
    self.intrinsic_reward = squared_difference
    # Only average the loss over active (non-padded) steps in the batch.
    self.forward_loss = tf.reduce_mean(
        tf.dynamic_partition(squared_difference, self.policy.mask, 2)[1]
    )
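# Illustrative sketch (not part of the module): the intrinsic reward above is
# the forward model's prediction error in encoding space,
# 0.5 * ||f(phi(s_t), a_t) - phi(s_{t+1})||^2, so the agent is rewarded for
# reaching states its world model predicts poorly. A minimal NumPy rendering
# of that arithmetic, with hypothetical encodings of shape [batch, encoding_dim]:
import numpy as np

def curiosity_reward(
    pred_next_state: np.ndarray, encoded_next_state: np.ndarray
) -> np.ndarray:
    """Per-sample half squared error between predicted and actual encodings."""
    return 0.5 * np.sum(np.square(pred_next_state - encoded_next_state), axis=1)

pred = np.array([[0.0, 0.0], [1.0, 1.0]])
actual = np.array([[0.0, 0.0], [0.0, 0.0]])
# The first transition is predicted perfectly (reward 0.0); the second is
# surprising (reward 1.0), so exploration toward it is encouraged.
print(curiosity_reward(pred, actual))  # -> [0. 1.]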
def create_inverse_model(
    self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
) -> None:
    """
    Creates inverse model TensorFlow ops for Curiosity module.
    Predicts action taken given current and future encoded states.
    :param encoded_state: Tensor corresponding to encoded current state.
    :param encoded_next_state: Tensor corresponding to encoded next state.
    """
    combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
    hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish)
    if self.policy.brain.vector_action_space_type == "continuous":
        pred_action = tf.layers.dense(
            hidden, self.policy.act_size[0], activation=None
        )
        squared_difference = tf.reduce_sum(
            tf.squared_difference(pred_action, self.policy.selected_actions),
            axis=1,
        )
        self.inverse_loss = tf.reduce_mean(
            tf.dynamic_partition(squared_difference, self.policy.mask, 2)[1]
        )
    else:
        # One softmax head per discrete action branch, concatenated.
        pred_action = tf.concat(
            [
                tf.layers.dense(
                    hidden, self.policy.act_size[i], activation=tf.nn.softmax
                )
                for i in range(len(self.policy.act_size))
            ],
            axis=1,
        )
        cross_entropy = tf.reduce_sum(
            -tf.log(pred_action + 1e-10) * self.policy.selected_actions,
            axis=1,
        )
        self.inverse_loss = tf.reduce_mean(
            tf.dynamic_partition(cross_entropy, self.policy.mask, 2)[1]
        )
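# Illustrative sketch (not part of the module): in the discrete branch above,
# selected_actions is a concatenation of one-hot vectors, so the summed
# cross-entropy collapses to the negative log-probability the inverse model
# assigns to the action actually taken. NumPy rendering with one hypothetical
# branch of three actions:
import numpy as np

pred_action = np.array([[0.7, 0.2, 0.1]])  # softmax output of the inverse model
selected = np.array([[0.0, 1.0, 0.0]])     # one-hot encoding of the taken action
cross_entropy = np.sum(-np.log(pred_action + 1e-10) * selected, axis=1)
print(cross_entropy)  # -> [1.609...] == -log(0.2); the 1e-10 guards against log(0)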
def _create_losses(
    self, probs, old_probs, value_heads, entropy, beta, epsilon, lr, max_step
):
    """
    Creates training-specific TensorFlow ops for PPO models.
    :param probs: Current policy probabilities
    :param old_probs: Past policy probabilities
    :param value_heads: Value estimate tensors from each value stream
    :param entropy: Current policy entropy
    :param beta: Entropy regularization strength
    :param epsilon: Value for policy-divergence threshold
    :param lr: Learning rate
    :param max_step: Total number of training steps.
    """
    self.returns_holders = {}
    self.old_values = {}
    for name in value_heads.keys():
        returns_holder = tf.placeholder(
            shape=[None], dtype=tf.float32, name="{}_returns".format(name)
        )
        old_value = tf.placeholder(
            shape=[None], dtype=tf.float32, name="{}_value_estimate".format(name)
        )
        self.returns_holders[name] = returns_holder
        self.old_values[name] = old_value
    self.advantage = tf.placeholder(
        shape=[None], dtype=tf.float32, name="advantages"
    )
    advantage = tf.expand_dims(self.advantage, -1)

    # Anneal the clipping threshold and entropy coefficient over training.
    decay_epsilon = tf.train.polynomial_decay(
        epsilon, self.policy.global_step, max_step, 0.1, power=1.0
    )
    decay_beta = tf.train.polynomial_decay(
        beta, self.policy.global_step, max_step, 1e-5, power=1.0
    )

    value_losses = []
    for name, head in value_heads.items():
        # Clip the value estimate around the old estimate, analogous to the
        # clipped policy objective, and take the worse of the two losses.
        clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
            tf.reduce_sum(head, axis=1) - self.old_values[name],
            -decay_epsilon,
            decay_epsilon,
        )
        v_opt_a = tf.squared_difference(
            self.returns_holders[name], tf.reduce_sum(head, axis=1)
        )
        v_opt_b = tf.squared_difference(
            self.returns_holders[name], clipped_value_estimate
        )
        value_loss = tf.reduce_mean(
            tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), self.policy.mask, 2)[1]
        )
        value_losses.append(value_loss)
    self.value_loss = tf.reduce_mean(value_losses)

    # Clipped surrogate policy objective: r_theta is the probability ratio
    # between the current and old policies.
    r_theta = tf.exp(probs - old_probs)
    p_opt_a = r_theta * advantage
    p_opt_b = (
        tf.clip_by_value(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon)
        * advantage
    )
    self.policy_loss = -tf.reduce_mean(
        tf.dynamic_partition(tf.minimum(p_opt_a, p_opt_b), self.policy.mask, 2)[1]
    )
    # For cleaner stats reporting
    self.abs_policy_loss = tf.abs(self.policy_loss)

    self.loss = (
        self.policy_loss
        + 0.5 * self.value_loss
        - decay_beta
        * tf.reduce_mean(tf.dynamic_partition(entropy, self.policy.mask, 2)[1])
    )
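# Illustrative sketch (not part of the module): the policy loss above is the
# clipped PPO surrogate, -E[min(r * A, clip(r, 1 - eps, 1 + eps) * A)] with
# r = exp(log_prob - old_log_prob). NumPy rendering with hypothetical
# log-probabilities and advantages:
import numpy as np

def ppo_surrogate(log_probs, old_log_probs, advantages, epsilon=0.2):
    """Per-sample clipped surrogate; training maximizes its mean (minimizes its negation)."""
    r_theta = np.exp(log_probs - old_log_probs)
    unclipped = r_theta * advantages
    clipped = np.clip(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantages
    # The minimum removes any incentive to push r_theta outside
    # [1 - eps, 1 + eps], which is what keeps each policy update conservative.
    return np.minimum(unclipped, clipped)

logp = np.log(np.array([0.5, 0.9]))
old_logp = np.log(np.array([0.5, 0.3]))
adv = np.array([1.0, 1.0])
print(ppo_surrogate(logp, old_logp, adv))  # -> [1.  1.2]; the ratio 3.0 is clipped to 1.2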