def kl(self, other):
    """
    Args:
        other: object of CategoricalDistribution

    Returns:
        kl: A float32 tensor with shape [BATCH_SIZE]
    """
    assert isinstance(other, CategoricalDistribution)

    logits = self.logits - layers.reduce_max(self.logits, dim=1)
    other_logits = other.logits - layers.reduce_max(other.logits, dim=1)

    e_logits = layers.exp(logits)
    other_e_logits = layers.exp(other_logits)

    z = layers.reduce_sum(e_logits, dim=1)
    other_z = layers.reduce_sum(other_e_logits, dim=1)

    prob = e_logits / z
    kl = layers.reduce_sum(
        prob *
        (logits - layers.log(z) - other_logits + layers.log(other_z)),
        dim=1)
    return kl
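# A minimal NumPy sketch of the same KL computation, for reference only
# (`kl_categorical_np` and its arguments are hypothetical names, not part of
# this class). It follows KL(p || q) = sum_i p_i * (log p_i - log q_i), with
# the per-row max subtracted before exponentiating for numerical stability,
# exactly as in kl() above.
import numpy as np

def kl_categorical_np(logits, other_logits):
    a = logits - logits.max(axis=1, keepdims=True)
    b = other_logits - other_logits.max(axis=1, keepdims=True)
    log_z = np.log(np.exp(a).sum(axis=1, keepdims=True))
    log_z_other = np.log(np.exp(b).sum(axis=1, keepdims=True))
    p = np.exp(a - log_z)
    # log p_i = a_i - log_z, log q_i = b_i - log_z_other
    return (p * ((a - log_z) - (b - log_z_other))).sum(axis=1)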
def sample(self):
    """
    Returns:
        sample_action: A float32 tensor with shape [BATCH_SIZE, NUM_ACTIONS],
            a soft (softmax) sample of the action distribution, with Gumbel
            noise added to keep the target close to the original action.
    """
    eps = 1e-4
    logits_shape = layers.cast(layers.shape(self.logits), dtype='int64')
    uniform = layers.uniform_random(logits_shape, min=eps, max=1.0 - eps)
    soft_uniform = layers.log(-1.0 * layers.log(uniform))
    return layers.softmax(self.logits - soft_uniform, axis=-1)
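# A rough NumPy sketch of the Gumbel trick used above, for intuition only
# (the helper and its names are hypothetical). Adding Gumbel noise
# g = -log(-log(u)) to the logits and taking a softmax yields a "soft",
# differentiable sample; the argmax of the perturbed logits is distributed
# according to softmax(logits).
import numpy as np

def gumbel_softmax_sample_np(logits, eps=1e-4, rng=np.random):
    u = rng.uniform(low=eps, high=1.0 - eps, size=logits.shape)
    gumbel_noise = -np.log(-np.log(u))
    # Same as `logits - log(-log(u))` in sample() above.
    perturbed = logits + gumbel_noise
    e = np.exp(perturbed - perturbed.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)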
def logp(self, actions, eps=1e-6):
    """
    Args:
        actions: An int64 tensor with shape [BATCH_SIZE]
        eps: A small float constant that avoids underflows when computing
            the log probability

    Returns:
        actions_log_prob: A float32 tensor with shape [BATCH_SIZE]
    """
    assert len(actions.shape) == 1

    logits = self.logits - layers.reduce_max(self.logits, dim=1)
    e_logits = layers.exp(logits)
    z = layers.reduce_sum(e_logits, dim=1)
    prob = e_logits / z

    actions = layers.unsqueeze(actions, axes=[1])
    actions_onehot = layers.one_hot(actions, prob.shape[1])
    actions_onehot = layers.cast(actions_onehot, dtype='float32')
    actions_prob = layers.reduce_sum(prob * actions_onehot, dim=1)

    actions_prob = actions_prob + eps
    actions_log_prob = layers.log(actions_prob)
    return actions_log_prob
def entropy(self):
    """
    Returns:
        entropy: A float32 tensor with shape [BATCH_SIZE] of entropy of self
            policy distribution.
    """
    logits = self.logits - layers.reduce_max(self.logits, dim=1)
    e_logits = layers.exp(logits)
    z = layers.reduce_sum(e_logits, dim=1)
    prob = e_logits / z

    entropy = -1.0 * layers.reduce_sum(
        prob * (logits - layers.log(z)), dim=1)
    return entropy
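# A small NumPy cross-check of logp() and entropy(), assuming plain 2-D
# arrays (a hypothetical helper, not used by the class). With
# log p_i = logits_i - logsumexp(logits), the action log-probability is a
# gather along axis 1 and the entropy is -sum_i p_i * log p_i.
import numpy as np

def logp_and_entropy_np(logits, actions):
    shifted = logits - logits.max(axis=1, keepdims=True)
    log_z = np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    log_p = shifted - log_z                      # [B, NUM_ACTIONS]
    p = np.exp(log_p)
    actions_log_prob = np.take_along_axis(
        log_p, actions[:, None].astype(np.int64), axis=1).squeeze(1)
    entropy = -(p * log_p).sum(axis=1)
    return actions_log_prob, entropy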
def sample(self, obs):
    mean, log_std = self.actor.policy(obs)
    std = layers.exp(log_std)
    normal = Normal(mean, std)
    # Sample from the Gaussian, squash with tanh and scale to the action range.
    x_t = normal.sample([1])[0]
    y_t = layers.tanh(x_t)
    action = y_t * self.max_action
    # Change-of-variables correction for the tanh squashing; `epsilon` is a
    # small constant from the enclosing module that keeps the log away from zero.
    log_prob = normal.log_prob(x_t)
    log_prob -= layers.log(self.max_action * (1 - layers.pow(y_t, 2)) + epsilon)
    log_prob = layers.reduce_sum(log_prob, dim=1, keep_dim=True)
    log_prob = layers.squeeze(log_prob, axes=[1])
    return action, log_prob
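# The change of variables behind the log_prob correction above, sketched in
# NumPy (a hypothetical helper). For a = max_action * tanh(x) with
# x ~ N(mean, std), log pi(a) = log N(x) - sum_j log(max_action * (1 - tanh(x_j)^2) + eps),
# i.e. the Gaussian log-density minus log |da/dx| of the squashing.
import numpy as np

def squashed_gaussian_logprob_np(x, mean, std, max_action, eps=1e-6):
    y = np.tanh(x)
    gauss_logp = (-0.5 * ((x - mean) / std) ** 2
                  - np.log(std) - 0.5 * np.log(2.0 * np.pi))
    correction = np.log(max_action * (1.0 - y ** 2) + eps)
    return (gauss_logp - correction).sum(axis=-1)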
def learn(self, obs, actions, means, log_std, rewards, dones, learning_rate,
          entropy_coeff):
    """
    Args:
        obs: A float32 tensor of shape ([B] + observation_space).
             E.g. [B, C, H, W] in atari.
        actions: A float32 tensor of shape [B, ACTION_DIM].
        means: A float32 tensor of shape [B, ACTION_DIM], means of the
               behaviour policy.
        log_std: A float32 tensor of shape [B, ACTION_DIM], log standard
                 deviations of the behaviour policy.
        rewards: A float32 tensor of shape [B].
        dones: A float32 tensor of shape [B].
        learning_rate: float scalar of learning rate.
        entropy_coeff: float scalar of entropy coefficient.
    """
    values = self.model.value(obs)

    # Behaviour policy (pi): the policy that generated the samples.
    std_pi = layers.exp(log_std)
    normal_pi = Normal(means, std_pi)
    y_t1 = actions / self.max_action
    log_prob1 = normal_pi.log_prob(actions)
    # Correction for the tanh squashing of the action.
    log_prob1 -= layers.log(
        self.max_action * (1 - layers.pow(y_t1, 2)) + epsilon)
    log_prob1 = layers.reduce_sum(log_prob1, dim=1, keep_dim=True)
    log_prob_pi = layers.squeeze(log_prob1, axes=[1])

    # Target policy (mu): the current policy being optimized.
    actions_mu, log_std_mu = self.model.policy(obs)
    std_mu = layers.exp(log_std_mu)
    normal_mu = Normal(actions_mu, std_mu)
    log_prob2 = normal_mu.log_prob(actions)
    log_prob2 -= layers.log(
        self.max_action * (1 - layers.pow(y_t1, 2)) + epsilon)
    log_prob2 = layers.reduce_sum(log_prob2, dim=1, keep_dim=True)
    log_prob_mu = layers.squeeze(log_prob2, axes=[1])

    policy_entropy = normal_mu.entropy()
    target_actions_log_probs = log_prob_mu
    behaviour_actions_log_probs = log_prob_pi

    # KL between the current and behaviour policies, returned for debugging.
    kl = normal_mu.kl_divergence(normal_pi)
    kl = layers.reduce_mean(kl, dim=1)

    """
    Split the tensor into batches at known episode cut boundaries.
    [B * T] -> [T, B]
    """
    T = self.sample_batch_steps

    def split_batches(tensor):
        B = tensor.shape[0] // T
        split_tensor = layers.reshape(tensor,
                                      [B, T] + list(tensor.shape[1:]))
        # Transpose B and T.
        return layers.transpose(
            split_tensor, [1, 0] + list(range(2, 1 + len(tensor.shape))))

    behaviour_actions_log_probs = split_batches(behaviour_actions_log_probs)
    target_actions_log_probs = split_batches(target_actions_log_probs)
    policy_entropy = split_batches(policy_entropy)
    dones = split_batches(dones)
    rewards = split_batches(rewards)
    values = split_batches(values)

    # [T, B] -> [T - 1, B] for V-trace calc.
    behaviour_actions_log_probs = layers.slice(
        behaviour_actions_log_probs, axes=[0], starts=[0], ends=[-1])
    target_actions_log_probs = layers.slice(
        target_actions_log_probs, axes=[0], starts=[0], ends=[-1])
    policy_entropy = layers.slice(
        policy_entropy, axes=[0], starts=[0], ends=[-1])
    dones = layers.slice(dones, axes=[0], starts=[0], ends=[-1])
    rewards = layers.slice(rewards, axes=[0], starts=[0], ends=[-1])
    bootstrap_value = layers.slice(
        values, axes=[0], starts=[T - 1], ends=[T])
    values = layers.slice(values, axes=[0], starts=[0], ends=[-1])

    bootstrap_value = layers.squeeze(bootstrap_value, axes=[0])

    vtrace_loss = VTraceLoss(
        behaviour_actions_log_probs=behaviour_actions_log_probs,
        target_actions_log_probs=target_actions_log_probs,
        policy_entropy=policy_entropy,
        dones=dones,
        discount=self.gamma,
        rewards=rewards,
        values=values,
        bootstrap_value=bootstrap_value,
        entropy_coeff=entropy_coeff,
        vf_loss_coeff=self.vf_loss_coeff,
        clip_rho_threshold=self.clip_rho_threshold,
        clip_pg_rho_threshold=self.clip_pg_rho_threshold)

    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=40.0))

    optimizer = fluid.optimizer.AdamOptimizer(learning_rate)
    optimizer.minimize(vtrace_loss.total_loss)

    return vtrace_loss, kl
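# For reference, a NumPy sketch of the V-trace targets that a loss like
# VTraceLoss is expected to compute (following the IMPALA paper; this is an
# illustrative, hypothetical helper, not the actual VTraceLoss code). Inputs
# use the time-major layout prepared above ([T - 1, B]); bootstrap_value is [B].
import numpy as np

def vtrace_targets_np(behaviour_logp, target_logp, rewards, dones, values,
                      bootstrap_value, discount=0.99,
                      clip_rho_threshold=1.0, clip_pg_rho_threshold=1.0):
    # Importance sampling ratios between target and behaviour policies.
    rhos = np.exp(target_logp - behaviour_logp)
    clipped_rhos = np.minimum(clip_rho_threshold, rhos)
    cs = np.minimum(1.0, rhos)
    discounts = discount * (1.0 - dones)

    T = values.shape[0]
    values_t_plus_1 = np.concatenate(
        [values[1:], bootstrap_value[None, :]], axis=0)
    deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)

    # Backward recursion:
    # vs_t - V(x_t) = delta_t + gamma_t * c_t * (vs_{t+1} - V(x_{t+1}))
    vs_minus_v = np.zeros_like(values)
    acc = np.zeros_like(bootstrap_value)
    for t in reversed(range(T)):
        acc = deltas[t] + discounts[t] * cs[t] * acc
        vs_minus_v[t] = acc
    vs = vs_minus_v + values

    # Policy-gradient advantages use vs_{t+1} and a separately clipped rho.
    vs_t_plus_1 = np.concatenate([vs[1:], bootstrap_value[None, :]], axis=0)
    clipped_pg_rhos = np.minimum(clip_pg_rho_threshold, rhos)
    pg_advantages = clipped_pg_rhos * (
        rewards + discounts * vs_t_plus_1 - values)
    return vs, pg_advantages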