def discount_target_values(self, observations, actions, rewards, terminals):
    # Expected actions under the current policy and their log-probabilities,
    # evaluated on all but the final observation and masked by the terminals.
    sampled_actions = self.policy.get_expected_value(observations[:, :-1, ...])
    sampled_log_probs = terminals[:, :-1] * self.policy.get_log_probs(
        sampled_actions, observations[:, :-1, ...])
    # Entropy-regularized discounted return used as the value-function target.
    discount_target_values = discounted_sum(
        rewards - tf.exp(self.log_alpha) * sampled_log_probs, self.gamma)
    self.record(
        "value_discount_target_mean", tf.reduce_mean(discount_target_values))
    return discount_target_values
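The discounted_sum helper is used in these listings but not defined here; a minimal sketch, assuming it computes per-timestep discounted returns-to-go along the time axis of a [batch, time] tensor:

import tensorflow as tf

def discounted_sum(values, gamma):
    # Hypothetical sketch of the helper referenced above (not from the
    # original listing): R[:, t] = sum_{k >= t} gamma**(k - t) * values[:, k].
    values_time_major = tf.transpose(values, [1, 0])
    returns_time_major = tf.scan(
        lambda carry, value: value + gamma * carry,
        values_time_major,
        initializer=tf.zeros_like(values_time_major[0]),
        reverse=True)
    return tf.transpose(returns_time_major, [1, 0])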
def loss_function():
    # Closure: observations, actions, and rewards are captured from the
    # enclosing scope.
    returns = discounted_sum(rewards, self.gamma)
    advantages = returns - tf.reduce_mean(returns)
    log_probs = self.policy.get_log_probs(actions, observations[:, :-1, ...])
    # REINFORCE-style policy-gradient loss with a mean-return baseline.
    policy_loss = -1.0 * tf.reduce_mean(advantages * log_probs)
    self.record("rewards_mean", tf.reduce_mean(rewards))
    self.record("log_probs_policy_mean", tf.reduce_mean(log_probs))
    self.record("log_probs_policy_max", tf.reduce_max(log_probs))
    self.record("log_probs_policy_min", tf.reduce_min(log_probs))
    self.record("policy_loss", policy_loss)
    return policy_loss
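The zero-argument signature of loss_function suggests it is consumed as a loss closure inside a gradient step; a hedged sketch of such a step, where policy and optimizer stand in for class attributes not shown in the listing:

import tensorflow as tf

def train_step(policy, optimizer, loss_function):
    # Hypothetical usage of the loss closure above; policy and optimizer are
    # assumptions standing in for attributes of the surrounding class.
    with tf.GradientTape() as tape:
        policy_loss = loss_function()
    gradients = tape.gradient(policy_loss, policy.trainable_variables)
    optimizer.apply_gradients(zip(gradients, policy.trainable_variables))
    return policy_loss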
def discount_target_values(self, observations, actions, rewards, terminals):
    # Plain discounted return of the rewards used as the Q-function target.
    discount_target_values = discounted_sum(rewards, self.gamma)
    self.record(
        "q_discount_target_mean", tf.reduce_mean(discount_target_values))
    self.record(
        "q_discount_target_max", tf.reduce_max(discount_target_values))
    self.record(
        "q_discount_target_min", tf.reduce_min(discount_target_values))
    return discount_target_values
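For context, a hedged sketch of how discounted targets like these are typically consumed, regressing Q-estimates onto them with a stop-gradient on the target term; the q_values argument is an assumption, not part of the listing:

import tensorflow as tf

def q_regression_loss(q_values, discount_target_values):
    # Hypothetical critic loss (not from the original listing): mean-squared
    # error against the discounted targets, with gradients blocked through
    # the target term.
    return tf.reduce_mean(tf.square(
        q_values - tf.stop_gradient(discount_target_values)))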