Example #1
 def discount_target_values(self, observations, actions, rewards,
                            terminals):
     # Expected action under the current policy for all but the final
     # observation in each trajectory.
     sampled_actions = self.policy.get_expected_value(
         observations[:, :-1, ...])
     # Log-probabilities of those actions, weighted by the terminal mask.
     sampled_log_probs = terminals[:, :-1] * self.policy.get_log_probs(
         sampled_actions, observations[:, :-1, ...])
     # Entropy-regularized targets: discount the rewards with an
     # alpha-weighted entropy bonus folded in, soft-actor-critic style.
     discount_target_values = discounted_sum(
         rewards - tf.exp(self.log_alpha) * sampled_log_probs, self.gamma)
     self.record("value_discount_target_mean",
                 tf.reduce_mean(discount_target_values))
     return discount_target_values
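
All three examples call a discounted_sum helper that is not shown. For reference, here is a minimal sketch of what such a helper might look like, assuming rewards is a [batch, time] float tensor and gamma is a scalar discount factor; the implementation below is an assumption, not the original code:

 import tensorflow as tf

 def discounted_sum(rewards, gamma):
     # Reverse-time cumulative sum: out[t] = rewards[t] + gamma * out[t + 1],
     # scanned backwards over the time axis.
     def step(accumulated, reward):
         return reward + gamma * accumulated
     # tf.scan walks the leading axis, so switch to time-major and back.
     rewards_time_major = tf.transpose(rewards)
     returns_time_major = tf.scan(
         step,
         rewards_time_major,
         initializer=tf.zeros_like(rewards_time_major[0]),
         reverse=True)
     return tf.transpose(returns_time_major)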
Example #2
 def loss_function():
     # Closure over rewards, actions, observations, and self from the
     # enclosing scope (e.g. for use with optimizer.minimize).
     # Monte Carlo returns with a mean baseline as the advantage.
     returns = discounted_sum(rewards, self.gamma)
     advantages = returns - tf.reduce_mean(returns)
     log_probs = self.policy.get_log_probs(
         actions, observations[:, :-1, ...])
     # REINFORCE-style policy-gradient loss: maximize advantage-weighted
     # log-probability by minimizing its negation.
     policy_loss = -tf.reduce_mean(advantages * log_probs)
     self.record("rewards_mean", tf.reduce_mean(rewards))
     self.record("log_probs_policy_mean", tf.reduce_mean(log_probs))
     self.record("log_probs_policy_max", tf.reduce_max(log_probs))
     self.record("log_probs_policy_min", tf.reduce_min(log_probs))
     self.record("policy_loss", policy_loss)
     return policy_loss
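
The record calls throughout these examples are a logging hook whose implementation is also not shown. One plausible sketch, assuming TF2 summary writing with a default writer and step already configured (both assumptions):

 def record(self, key, value):
     # Hypothetical logging hook: write the value as a scalar summary.
     # Assumes a default tf.summary writer and step have been set up.
     tf.summary.scalar(key, value)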
Example #3
 def discount_target_values(self, observations, actions, rewards,
                            terminals):
     # Plain discounted Monte Carlo returns serve as the Q targets here;
     # observations, actions, and terminals are unused.
     discount_target_values = discounted_sum(rewards, self.gamma)
     # Log summary statistics of the targets for monitoring.
     self.record(
         "q_discount_target_mean",
         tf.reduce_mean(discount_target_values))
     self.record(
         "q_discount_target_max",
         tf.reduce_max(discount_target_values))
     self.record(
         "q_discount_target_min",
         tf.reduce_min(discount_target_values))
     return discount_target_values
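
Finally, a hypothetical sketch of how targets like these might be consumed, assuming the agent also exposes a value_function network (an assumption; nothing in the examples defines one):

 # Hypothetical usage: regress a value network onto the targets.
 targets = tf.stop_gradient(agent.discount_target_values(
     observations, actions, rewards, terminals))
 predictions = agent.value_function(observations[:, :-1, ...])
 value_loss = tf.reduce_mean(tf.square(predictions - targets))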