def __init__(self,
             behaviour_actions_log_probs,
             target_actions_log_probs,
             policy_entropy,
             dones,
             discount,
             rewards,
             values,
             bootstrap_value,
             entropy_coeff=-0.01,
             vf_loss_coeff=0.5,
             clip_rho_threshold=1.0,
             clip_pg_rho_threshold=1.0):
    """Policy gradient loss with V-trace importance weighting.

    VTraceLoss takes tensors of shape [T, B, ...], where `B` is the
    batch_size. The reason we need to know `B` is so that V-trace can
    properly handle episode cut boundaries.

    Args:
        behaviour_actions_log_probs: A float32 tensor of shape [T, B].
        target_actions_log_probs: A float32 tensor of shape [T, B].
        policy_entropy: A float32 tensor of shape [T, B].
        dones: A float32 tensor of shape [T, B].
        discount: A float32 scalar.
        rewards: A float32 tensor of shape [T, B].
        values: A float32 tensor of shape [T, B].
        bootstrap_value: A float32 tensor of shape [B].
    """
    self.vtrace_returns = from_importance_weights(
        behaviour_actions_log_probs=behaviour_actions_log_probs,
        target_actions_log_probs=target_actions_log_probs,
        discounts=inverse(dones) * discount,
        rewards=rewards,
        values=values,
        bootstrap_value=bootstrap_value,
        clip_rho_threshold=clip_rho_threshold,
        clip_pg_rho_threshold=clip_pg_rho_threshold)

    # The policy gradient loss
    self.pi_loss = -1.0 * layers.reduce_sum(
        target_actions_log_probs * self.vtrace_returns.pg_advantages)

    # The baseline (value function) loss
    delta = values - self.vtrace_returns.vs
    self.vf_loss = 0.5 * layers.reduce_sum(layers.square(delta))

    # The entropy loss (we want to maximize entropy, so entropy_coeff < 0)
    self.entropy = layers.reduce_sum(policy_entropy)

    # The summed weighted loss
    self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff +
                       self.entropy * entropy_coeff)
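
# A minimal NumPy sketch (an illustrative assumption, not part of the original
# module) of what `inverse(dones) * discount` does at episode boundaries:
# wherever `dones` is 1.0, the discount becomes 0.0, so returns are not
# propagated across an episode cut. It assumes `inverse(x)` computes 1 - x.
def _discount_mask_sketch():
    import numpy as np
    discount = 0.99
    # dones for T=4 steps, B=2 environments; env 0 terminates at t=2.
    dones = np.array([[0., 0.],
                      [0., 0.],
                      [1., 0.],
                      [0., 0.]], dtype=np.float32)
    discounts = (1.0 - dones) * discount  # same role as inverse(dones) * discount
    return discounts  # [[0.99, 0.99], [0.99, 0.99], [0.0, 0.99], [0.99, 0.99]]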
def _calc_kl(self, means, logvars, old_means, old_logvars):
    """Calculate the KL divergence between the old and new distributions.

    See: https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Kullback.E2.80.93Leibler_divergence

    Args:
        means: shape (batch_size, act_dim)
        logvars: shape (act_dim)
        old_means: shape (batch_size, act_dim)
        old_logvars: shape (act_dim)

    Returns:
        kl: shape (batch_size)
    """
    log_det_cov_old = layers.reduce_sum(old_logvars)
    log_det_cov_new = layers.reduce_sum(logvars)
    tr_old_new = layers.reduce_sum(layers.exp(old_logvars - logvars))

    kl = 0.5 * (layers.reduce_sum(
        layers.square(means - old_means) / layers.exp(logvars), dim=1) +
                (log_det_cov_new - log_det_cov_old) + tr_old_new -
                self.act_dim)
    return kl
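
# A NumPy sketch (illustrative, not part of the original module) verifying that
# the vectorized expression in _calc_kl matches the per-dimension closed form
# of the KL divergence between two diagonal Gaussians, KL(old || new).
def _kl_diag_gaussian_sketch():
    import numpy as np
    rng = np.random.RandomState(0)
    batch_size, act_dim = 4, 3
    means = rng.randn(batch_size, act_dim)
    old_means = rng.randn(batch_size, act_dim)
    logvars = rng.randn(act_dim)
    old_logvars = rng.randn(act_dim)

    # Vectorized form, mirroring _calc_kl.
    kl = 0.5 * (np.sum((means - old_means) ** 2 / np.exp(logvars), axis=1) +
                (np.sum(logvars) - np.sum(old_logvars)) +
                np.sum(np.exp(old_logvars - logvars)) - act_dim)

    # Per-dimension closed form: KL(N(mu_old, var_old) || N(mu_new, var_new)).
    per_dim = 0.5 * (logvars - old_logvars +
                     (np.exp(old_logvars) + (old_means - means) ** 2) /
                     np.exp(logvars) - 1.0)
    assert np.allclose(kl, per_dim.sum(axis=1))
    return kl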
def _calc_logprob(self, actions, means, logvars):
    """Calculate the log probabilities of actions under a normal
    distribution with the given means and logvars.

    The constant term involving sqrt(2 * pi) is omitted, since it is
    eliminated later (e.g. when log probabilities are subtracted).

    Args:
        actions: shape (batch_size, act_dim)
        means: shape (batch_size, act_dim)
        logvars: shape (act_dim)

    Returns:
        logprob: shape (batch_size)
    """
    exp_item = layers.elementwise_div(
        layers.square(actions - means), layers.exp(logvars), axis=1)
    exp_item = -0.5 * layers.reduce_sum(exp_item, dim=1)

    vars_item = -0.5 * layers.reduce_sum(logvars)
    logprob = exp_item + vars_item
    return logprob
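
# A NumPy sketch (illustrative, not part of the original module) of the
# unnormalized Gaussian log probability computed in _calc_logprob. The full
# log density differs only by the constant -0.5 * act_dim * log(2 * pi),
# which cancels whenever two log probabilities are subtracted.
def _logprob_sketch():
    import numpy as np
    rng = np.random.RandomState(0)
    batch_size, act_dim = 4, 3
    actions = rng.randn(batch_size, act_dim)
    means = rng.randn(batch_size, act_dim)
    logvars = rng.randn(act_dim)

    # Same arithmetic as _calc_logprob.
    logprob = (-0.5 * np.sum((actions - means) ** 2 / np.exp(logvars), axis=1)
               - 0.5 * np.sum(logvars))

    # Full (normalized) log density, for comparison only.
    full = logprob - 0.5 * act_dim * np.log(2.0 * np.pi)
    return logprob, full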
def learn(self, obs, actions, advantages, target_values, learning_rate,
          entropy_coeff):
    """
    Args:
        obs: A float32 tensor of shape ([B] + observation_space),
             e.g. [B, C, H, W] for Atari.
        actions: An int64 tensor of shape [B].
        advantages: A float32 tensor of shape [B].
        target_values: A float32 tensor of shape [B].
        learning_rate: A float scalar, the learning rate.
        entropy_coeff: A float scalar, the entropy coefficient.
    """
    logits = self.model.policy(obs)
    policy_distribution = CategoricalDistribution(logits)
    actions_log_probs = policy_distribution.logp(actions)

    # The policy gradient loss
    pi_loss = -1.0 * layers.reduce_sum(actions_log_probs * advantages)

    # The value function loss
    values = self.model.value(obs)
    delta = values - target_values
    vf_loss = 0.5 * layers.reduce_sum(layers.square(delta))

    # The entropy loss (we want to maximize entropy, so entropy_coeff < 0)
    policy_entropy = policy_distribution.entropy()
    entropy = layers.reduce_sum(policy_entropy)

    total_loss = (pi_loss + vf_loss * self.vf_loss_coeff +
                  entropy * entropy_coeff)

    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=40.0))

    optimizer = fluid.optimizer.AdamOptimizer(learning_rate)
    optimizer.minimize(total_loss)
    return total_loss, pi_loss, vf_loss, entropy
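
# A toy NumPy sketch (illustrative, not part of the original module) of how the
# three loss terms in learn() are combined. With entropy_coeff < 0, a higher
# policy entropy lowers the total loss, which encourages exploration. All
# numbers and coefficients below are assumed example values.
def _a2c_loss_sketch():
    import numpy as np
    actions_log_probs = np.array([-0.2, -1.5, -0.7], dtype=np.float32)
    advantages = np.array([1.0, -0.5, 2.0], dtype=np.float32)
    values = np.array([0.9, 0.1, 1.2], dtype=np.float32)
    target_values = np.array([1.0, 0.0, 1.0], dtype=np.float32)
    policy_entropy = np.array([1.1, 0.9, 1.0], dtype=np.float32)
    vf_loss_coeff, entropy_coeff = 0.5, -0.01

    pi_loss = -1.0 * np.sum(actions_log_probs * advantages)
    vf_loss = 0.5 * np.sum((values - target_values) ** 2)
    entropy = np.sum(policy_entropy)
    total_loss = pi_loss + vf_loss * vf_loss_coeff + entropy * entropy_coeff
    return total_loss, pi_loss, vf_loss, entropy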
def _actor_learn(self, obs_n, act_n):
    i = self.agent_index
    this_policy = self.model.policy(obs_n[i])
    sample_this_action = SoftPDistribution(
        logits=this_policy,
        act_space=self.act_space[self.agent_index]).sample()

    # Replace only agent i's action with the freshly sampled one, so that
    # gradients flow through this agent's policy when evaluating Q.
    action_input_n = act_n + []  # shallow copy of the action list
    action_input_n[i] = sample_this_action
    eval_q = self.Q(obs_n, action_input_n)

    # Maximize Q by minimizing its negative mean, plus a small
    # regularization term on the policy output.
    act_cost = layers.reduce_mean(-1.0 * eval_q)
    act_reg = layers.reduce_mean(layers.square(this_policy))
    cost = act_cost + act_reg * 1e-3

    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByNorm(clip_norm=0.5),
        param_list=self.model.get_actor_params())

    optimizer = fluid.optimizer.AdamOptimizer(self.lr)
    optimizer.minimize(cost, parameter_list=self.model.get_actor_params())
    return cost
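
# A small sketch (illustrative, not part of the original module) of why
# `act_n + []` is used in _actor_learn: it makes a shallow copy of the list of
# per-agent action tensors, so replacing agent i's entry does not mutate the
# caller's `act_n`.
def _action_substitution_sketch():
    act_n = ['a0', 'a1', 'a2']     # stand-ins for per-agent action tensors
    i = 1
    action_input_n = act_n + []    # shallow copy, equivalent to list(act_n)
    action_input_n[i] = 'sampled'  # only agent i's action is replaced
    assert act_n == ['a0', 'a1', 'a2']
    return action_input_n          # ['a0', 'sampled', 'a2']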