def kl(self, other):
    """
    Args:
        other: object of CategoricalDistribution

    Returns:
        kl: A float32 tensor with shape [BATCH_SIZE]
    """
    assert isinstance(other, CategoricalDistribution)

    logits = self.logits - layers.reduce_max(self.logits, dim=1)
    other_logits = other.logits - layers.reduce_max(other.logits, dim=1)

    e_logits = layers.exp(logits)
    other_e_logits = layers.exp(other_logits)

    z = layers.reduce_sum(e_logits, dim=1)
    other_z = layers.reduce_sum(other_e_logits, dim=1)

    prob = e_logits / z
    kl = layers.reduce_sum(
        prob *
        (logits - layers.log(z) - other_logits + layers.log(other_z)),
        dim=1)
    return kl
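# --- Reference sketch (not from the library): the categorical KL above written in
# plain NumPy, only to make the formula explicit. `logits_p` and `logits_q` are
# assumed to be float arrays of shape [BATCH_SIZE, NUM_ACTIONS].
import numpy as np

def categorical_kl_reference(logits_p, logits_q):
    # Subtract the per-row max for numerical stability, as the fluid version does.
    logits_p = logits_p - logits_p.max(axis=1, keepdims=True)
    logits_q = logits_q - logits_q.max(axis=1, keepdims=True)
    log_zp = np.log(np.exp(logits_p).sum(axis=1, keepdims=True))
    log_zq = np.log(np.exp(logits_q).sum(axis=1, keepdims=True))
    p = np.exp(logits_p - log_zp)
    # KL(p || q) = sum_a p(a) * (log p(a) - log q(a))
    return (p * ((logits_p - log_zp) - (logits_q - log_zq))).sum(axis=1)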
def learn(self, obs, action, reward, next_obs, terminal, sample_weight):
    """ update value model self.model with Double DQN algorithm,
        weighting each transition's loss by `sample_weight`
    """
    pred_value = self.model.value(obs)
    action_onehot = layers.one_hot(action, self.act_dim)
    pred_action_value = layers.reduce_sum(action_onehot * pred_value, dim=1)

    # calculate the target q value: the online model selects the greedy
    # action, the target model evaluates it (Double DQN)
    next_action_value = self.model.value(next_obs)
    greedy_action = layers.argmax(next_action_value, axis=-1)
    greedy_action = layers.unsqueeze(greedy_action, axes=[1])
    greedy_action_onehot = layers.one_hot(greedy_action, self.act_dim)
    next_pred_value = self.target_model.value(next_obs)
    max_v = layers.reduce_sum(greedy_action_onehot * next_pred_value, dim=1)
    max_v.stop_gradient = True

    target = reward + (
        1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * max_v
    delta = layers.abs(target - pred_action_value)
    cost = sample_weight * layers.square_error_cost(pred_action_value,
                                                    target)
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-3)
    optimizer.minimize(cost)
    return cost, delta
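# --- Reference sketch (not from the library): the Double DQN target used above, in
# plain NumPy. `q_next_online` / `q_next_target` stand in for
# self.model.value(next_obs) / self.target_model.value(next_obs); the names are
# illustrative only.
import numpy as np

def double_dqn_target(reward, terminal, q_next_online, q_next_target, gamma):
    # Select the greedy action with the online network ...
    greedy_action = q_next_online.argmax(axis=1)
    # ... but evaluate it with the target network.
    max_v = q_next_target[np.arange(len(greedy_action)), greedy_action]
    return reward + (1.0 - terminal.astype('float32')) * gamma * max_v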
def logp(self, actions, eps=1e-6):
    """
    Args:
        actions: An int64 tensor with shape [BATCH_SIZE]
        eps: A small float constant that avoids underflows when
            computing the log probability

    Returns:
        actions_log_prob: A float32 tensor with shape [BATCH_SIZE]
    """
    assert len(actions.shape) == 1

    logits = self.logits - layers.reduce_max(self.logits, dim=1)
    e_logits = layers.exp(logits)
    z = layers.reduce_sum(e_logits, dim=1)
    prob = e_logits / z

    actions = layers.unsqueeze(actions, axes=[1])
    actions_onehot = layers.one_hot(actions, prob.shape[1])
    actions_onehot = layers.cast(actions_onehot, dtype='float32')
    actions_prob = layers.reduce_sum(prob * actions_onehot, dim=1)

    actions_prob = actions_prob + eps
    actions_log_prob = layers.log(actions_prob)
    return actions_log_prob
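# --- Reference sketch (not from the library): the same quantity in plain NumPy,
# i.e. log softmax(logits)[action], assuming `logits` is [BATCH_SIZE, NUM_ACTIONS]
# and `actions` is an integer array of shape [BATCH_SIZE].
import numpy as np

def categorical_logp_reference(logits, actions, eps=1e-6):
    logits = logits - logits.max(axis=1, keepdims=True)
    prob = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
    actions_prob = prob[np.arange(len(actions)), actions]
    return np.log(actions_prob + eps)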
def __init__(self,
             behaviour_actions_log_probs,
             target_actions_log_probs,
             policy_entropy,
             dones,
             discount,
             rewards,
             values,
             bootstrap_value,
             entropy_coeff=-0.01,
             vf_loss_coeff=0.5,
             clip_rho_threshold=1.0,
             clip_pg_rho_threshold=1.0):
    """Policy gradient loss with V-trace importance weighting.

    VTraceLoss takes tensors of shape [T, B, ...], where `B` is the
    batch_size. The reason we need to know `B` is for V-trace to properly
    handle episode cut boundaries.

    Args:
        behaviour_actions_log_probs: A float32 tensor of shape [T, B].
        target_actions_log_probs: A float32 tensor of shape [T, B].
        policy_entropy: A float32 tensor of shape [T, B].
        dones: A float32 tensor of shape [T, B].
        discount: A float32 scalar.
        rewards: A float32 tensor of shape [T, B].
        values: A float32 tensor of shape [T, B].
        bootstrap_value: A float32 tensor of shape [B].
    """

    self.vtrace_returns = from_importance_weights(
        behaviour_actions_log_probs=behaviour_actions_log_probs,
        target_actions_log_probs=target_actions_log_probs,
        discounts=inverse(dones) * discount,
        rewards=rewards,
        values=values,
        bootstrap_value=bootstrap_value,
        clip_rho_threshold=clip_rho_threshold,
        clip_pg_rho_threshold=clip_pg_rho_threshold)

    # The policy gradient loss
    self.pi_loss = -1.0 * layers.reduce_sum(
        target_actions_log_probs * self.vtrace_returns.pg_advantages)

    # The baseline loss
    delta = values - self.vtrace_returns.vs
    self.vf_loss = 0.5 * layers.reduce_sum(layers.square(delta))

    # The entropy loss (we want to maximize entropy, so entropy_coeff < 0)
    self.entropy = layers.reduce_sum(policy_entropy)

    # The summed weighted loss
    self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff +
                       self.entropy * entropy_coeff)
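# --- Reference sketch (not from the library): the V-trace targets that
# `from_importance_weights` is expected to produce, written as the backward
# recursion from the IMPALA paper (Espeholt et al., 2018). Shapes follow the
# docstring above: everything is [T, B] except `bootstrap_value`, which is [B].
# The library additionally supports a separate clip_pg_rho_threshold for the
# advantage term; this sketch reuses one clipped ratio for simplicity.
import numpy as np

def vtrace_reference(behaviour_logp, target_logp, discounts, rewards, values,
                     bootstrap_value, clip_rho_threshold=1.0,
                     clip_c_threshold=1.0):
    rhos = np.exp(target_logp - behaviour_logp)        # importance ratios
    clipped_rhos = np.minimum(clip_rho_threshold, rhos)
    clipped_cs = np.minimum(clip_c_threshold, rhos)

    values_t_plus_1 = np.concatenate([values[1:], bootstrap_value[None]], axis=0)
    deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)

    # vs_t = V(x_t) + delta_t + discount_t * c_t * (vs_{t+1} - V(x_{t+1}))
    vs_minus_v = np.zeros_like(values)
    acc = np.zeros_like(bootstrap_value)
    for t in reversed(range(len(values))):
        acc = deltas[t] + discounts[t] * clipped_cs[t] * acc
        vs_minus_v[t] = acc
    vs = vs_minus_v + values

    vs_t_plus_1 = np.concatenate([vs[1:], bootstrap_value[None]], axis=0)
    pg_advantages = clipped_rhos * (rewards + discounts * vs_t_plus_1 - values)
    return vs, pg_advantages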
def entropy(self):
    """
    Returns:
        entropy: A float32 tensor with shape [BATCH_SIZE] of entropy of
            self policy distribution.
    """
    logits = self.logits - layers.reduce_max(self.logits, dim=1)
    e_logits = layers.exp(logits)
    z = layers.reduce_sum(e_logits, dim=1)
    prob = e_logits / z
    entropy = -1.0 * layers.reduce_sum(
        prob * (logits - layers.log(z)), dim=1)
    return entropy
def learn(self, obs, action, reward, next_obs, terminal,
          learning_rate=None):
    """ update value model self.model with DQN algorithm
    """
    # Support the modification of learning_rate
    if learning_rate is None:
        assert isinstance(
            self.lr,
            float), "Please set the learning rate of DQN in initialization."
        learning_rate = self.lr

    pred_value = self.model.value(obs)
    next_pred_value = self.target_model.value(next_obs)
    best_v = layers.reduce_max(next_pred_value, dim=1)
    best_v.stop_gradient = True
    target = reward + (
        1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

    action_onehot = layers.one_hot(action, self.act_dim)
    action_onehot = layers.cast(action_onehot, dtype='float32')
    pred_action_value = layers.reduce_sum(
        layers.elementwise_mul(action_onehot, pred_value), dim=1)
    cost = layers.square_error_cost(pred_action_value, target)
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(
        learning_rate=learning_rate, epsilon=1e-3)
    optimizer.minimize(cost)
    return cost
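# --- Worked example (illustrative numbers only): the one-step DQN target computed
# above, target = r + (1 - terminal) * gamma * max_a' Q_target(s', a'), for a
# batch of two transitions.
import numpy as np

next_pred_value = np.array([[1.0, 3.0, 2.0],     # Q_target(s', .) for 2 transitions
                            [0.5, 0.2, 0.1]])
reward = np.array([1.0, -1.0])
terminal = np.array([0.0, 1.0])                  # second transition ends the episode
gamma = 0.99

best_v = next_pred_value.max(axis=1)             # [3.0, 0.5]
target = reward + (1.0 - terminal) * gamma * best_v
# -> [1.0 + 0.99 * 3.0, -1.0 + 0.0] = [3.97, -1.0]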
def _calc_kl(self, means, logvars, old_means, old_logvars):
    """ Calculate KL divergence between old and new distributions
        See: https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Kullback.E2.80.93Leibler_divergence

    Args:
        means: shape (batch_size, act_dim)
        logvars: shape (act_dim)
        old_means: shape (batch_size, act_dim)
        old_logvars: shape (act_dim)

    Returns:
        kl: shape (batch_size)
    """
    log_det_cov_old = layers.reduce_sum(old_logvars)
    log_det_cov_new = layers.reduce_sum(logvars)
    tr_old_new = layers.reduce_sum(layers.exp(old_logvars - logvars))
    kl = 0.5 * (layers.reduce_sum(
        layers.square(means - old_means) / layers.exp(logvars), dim=1) +
                (log_det_cov_new - log_det_cov_old) + tr_old_new -
                self.act_dim)
    return kl
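# --- Reference sketch (not from the library): the same KL in plain NumPy,
# KL(old || new) = 0.5 * (tr(S_new^-1 S_old) + (m_new - m_old)^T S_new^-1
# (m_new - m_old) - d + log det S_new - log det S_old), specialised to diagonal
# covariances S = diag(exp(logvars)).
import numpy as np

def diag_gaussian_kl_reference(means, logvars, old_means, old_logvars):
    tr_old_new = np.exp(old_logvars - logvars).sum()
    quad = (np.square(means - old_means) / np.exp(logvars)).sum(axis=1)
    log_det_diff = logvars.sum() - old_logvars.sum()
    d = means.shape[1]
    return 0.5 * (quad + log_det_diff + tr_old_new - d)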
def _calc_logprob(self, actions, means, logvars):
    """ Calculate log probabilities of actions, given the means and
        logvars of a normal (Gaussian) distribution. The constant
        sqrt(2 * pi) term is omitted, since it cancels out in later
        computations.

    Args:
        actions: shape (batch_size, act_dim)
        means: shape (batch_size, act_dim)
        logvars: shape (act_dim)

    Returns:
        logprob: shape (batch_size)
    """
    exp_item = layers.elementwise_div(
        layers.square(actions - means), layers.exp(logvars), axis=1)
    exp_item = -0.5 * layers.reduce_sum(exp_item, dim=1)

    vars_item = -0.5 * layers.reduce_sum(logvars)

    logprob = exp_item + vars_item
    return logprob
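# --- Reference sketch (not from the library): the full diagonal-Gaussian
# log-density in plain NumPy. The method above drops the constant
# -0.5 * d * log(2 * pi) term.
import numpy as np

def diag_gaussian_logprob_reference(actions, means, logvars):
    d = means.shape[1]
    quad = -0.5 * (np.square(actions - means) / np.exp(logvars)).sum(axis=1)
    log_det = -0.5 * logvars.sum()
    const = -0.5 * d * np.log(2.0 * np.pi)
    return quad + log_det + const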
def sample(self, obs):
    mean, log_std = self.actor.policy(obs)
    std = layers.exp(log_std)
    normal = Normal(mean, std)
    # sample a pre-squash action from the Gaussian
    x_t = normal.sample([1])[0]
    # squash with tanh and rescale to the action range
    y_t = layers.tanh(x_t)
    action = y_t * self.max_action
    # correct the Gaussian log-prob for the tanh change of variables
    log_prob = normal.log_prob(x_t)
    log_prob -= layers.log(self.max_action * (1 - layers.pow(y_t, 2)) +
                           epsilon)
    log_prob = layers.reduce_sum(log_prob, dim=1, keep_dim=True)
    log_prob = layers.squeeze(log_prob, axes=[1])
    return action, log_prob
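# --- Reference sketch (not from the library): the tanh change-of-variables
# correction used above, in plain NumPy. If a = max_action * tanh(x) with
# x ~ N(mean, std), then
# log p(a) = log N(x; mean, std) - sum_i log(max_action * (1 - tanh(x_i)^2)).
import numpy as np

def squashed_gaussian_logprob_reference(x, mean, std, max_action, epsilon=1e-6):
    y = np.tanh(x)
    gauss_logp = (-0.5 * np.square((x - mean) / std)
                  - np.log(std) - 0.5 * np.log(2.0 * np.pi))
    correction = np.log(max_action * (1.0 - np.square(y)) + epsilon)
    return (gauss_logp - correction).sum(axis=1)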
def learn(self, obs, actions, advantages, target_values, learning_rate,
          entropy_coeff):
    """
    Args:
        obs: A float32 tensor of shape ([B] + observation_space).
             E.g. [B, C, H, W] in atari.
        actions: An int64 tensor of shape [B].
        advantages: A float32 tensor of shape [B].
        target_values: A float32 tensor of shape [B].
        learning_rate: float scalar of learning rate.
        entropy_coeff: float scalar of entropy coefficient.
    """
    logits = self.model.policy(obs)
    policy_distribution = CategoricalDistribution(logits)
    actions_log_probs = policy_distribution.logp(actions)

    # The policy gradient loss
    pi_loss = -1.0 * layers.reduce_sum(actions_log_probs * advantages)

    # The value function loss
    values = self.model.value(obs)
    delta = values - target_values
    vf_loss = 0.5 * layers.reduce_sum(layers.square(delta))

    # The entropy loss (we want to maximize entropy, so entropy_coeff < 0)
    policy_entropy = policy_distribution.entropy()
    entropy = layers.reduce_sum(policy_entropy)

    total_loss = (pi_loss + vf_loss * self.vf_loss_coeff +
                  entropy * entropy_coeff)

    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=40.0))

    optimizer = fluid.optimizer.AdamOptimizer(learning_rate)
    optimizer.minimize(total_loss)
    return total_loss, pi_loss, vf_loss, entropy
def cal_bellman_residual(self, obs, action, reward, next_obs, terminal):
    """ use self.model to get squared Bellman residual with fed data
    """
    pred_value = self.model.value(obs)
    next_pred_value = self.target_model.value(next_obs)
    best_v = layers.reduce_max(next_pred_value, dim=1)
    best_v.stop_gradient = True
    target = reward + (
        1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

    action_onehot = layers.one_hot(action, self.act_dim)
    action_onehot = layers.cast(action_onehot, dtype='float32')
    pred_action_value = layers.reduce_sum(
        layers.elementwise_mul(action_onehot, pred_value), dim=1)
    cost = layers.square_error_cost(pred_action_value, target)
    cost = layers.reduce_mean(cost)
    return cost
def learn(self, obs, action, reward, next_obs, terminal, sample_weight):
    """ update value model self.model with DQN algorithm
    """
    pred_value = self.model.value(obs)
    next_pred_value = self.target_model.value(next_obs)
    best_v = layers.reduce_max(next_pred_value, dim=1)
    best_v.stop_gradient = True
    target = reward + (
        1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

    action_onehot = layers.one_hot(action, self.act_dim)
    action_onehot = layers.cast(action_onehot, dtype='float32')
    pred_action_value = layers.reduce_sum(action_onehot * pred_value, dim=1)
    delta = layers.abs(target - pred_action_value)
    cost = sample_weight * layers.square_error_cost(pred_action_value,
                                                    target)
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-3)
    optimizer.minimize(cost)
    return cost, delta  # `delta` is the absolute TD-error
def learn(self, obs, actions, means, log_std, rewards, dones,
          learning_rate, entropy_coeff):
    """
    Args:
        obs: A float32 tensor of shape ([B] + observation_space).
             E.g. [B, C, H, W] in atari.
        actions: A float32 tensor of shape [B, act_dim].
        means: A float32 tensor of shape [B, act_dim]; means of the
            behaviour policy's Gaussian.
        log_std: A float32 tensor; log standard deviations of the
            behaviour policy's Gaussian.
        rewards: A float32 tensor of shape [B].
        dones: A float32 tensor of shape [B].
        learning_rate: float scalar of learning rate.
        entropy_coeff: float scalar of entropy coefficient.
    """
    values = self.model.value(obs)

    # pi: the behaviour policy that generated `actions`
    std = layers.exp(log_std)
    normal_pi = Normal(means, std)
    y_t1 = actions / self.max_action
    log_prob1 = normal_pi.log_prob(actions)
    log_prob1 -= layers.log(self.max_action * (1 - layers.pow(y_t1, 2)) +
                            epsilon)
    log_prob1 = layers.reduce_sum(log_prob1, dim=1, keep_dim=True)
    log_prob_pi = layers.squeeze(log_prob1, axes=[1])

    # mu: the current policy being learned
    actions_mu, log_std_mu = self.model.policy(obs)
    std_mu = layers.exp(log_std_mu)
    normal_mu = Normal(actions_mu, std_mu)
    log_prob2 = normal_mu.log_prob(actions)
    log_prob2 -= layers.log(self.max_action * (1 - layers.pow(y_t1, 2)) +
                            epsilon)
    log_prob2 = layers.reduce_sum(log_prob2, dim=1, keep_dim=True)
    log_prob_mu = layers.squeeze(log_prob2, axes=[1])

    policy_entropy = normal_mu.entropy()

    target_actions_log_probs = log_prob_mu
    behaviour_actions_log_probs = log_prob_pi

    # Calculating kl for debug
    kl = normal_mu.kl_divergence(normal_pi)
    kl = layers.reduce_mean(kl, dim=1)

    # Split the tensor into batches at known episode cut boundaries.
    # [B * T] -> [T, B]
    T = self.sample_batch_steps

    def split_batches(tensor):
        B = tensor.shape[0] // T
        splited_tensor = layers.reshape(tensor,
                                        [B, T] + list(tensor.shape[1:]))
        # transpose B and T
        return layers.transpose(
            splited_tensor,
            [1, 0] + list(range(2, 1 + len(tensor.shape))))

    behaviour_actions_log_probs = split_batches(
        behaviour_actions_log_probs)
    target_actions_log_probs = split_batches(target_actions_log_probs)
    policy_entropy = split_batches(policy_entropy)
    dones = split_batches(dones)
    rewards = split_batches(rewards)
    values = split_batches(values)

    # [T, B] -> [T - 1, B] for V-trace calc.
    behaviour_actions_log_probs = layers.slice(
        behaviour_actions_log_probs, axes=[0], starts=[0], ends=[-1])
    target_actions_log_probs = layers.slice(
        target_actions_log_probs, axes=[0], starts=[0], ends=[-1])
    policy_entropy = layers.slice(
        policy_entropy, axes=[0], starts=[0], ends=[-1])
    dones = layers.slice(dones, axes=[0], starts=[0], ends=[-1])
    rewards = layers.slice(rewards, axes=[0], starts=[0], ends=[-1])
    bootstrap_value = layers.slice(
        values, axes=[0], starts=[T - 1], ends=[T])
    values = layers.slice(values, axes=[0], starts=[0], ends=[-1])
    bootstrap_value = layers.squeeze(bootstrap_value, axes=[0])

    vtrace_loss = VTraceLoss(
        behaviour_actions_log_probs=behaviour_actions_log_probs,
        target_actions_log_probs=target_actions_log_probs,
        policy_entropy=policy_entropy,
        dones=dones,
        discount=self.gamma,
        rewards=rewards,
        values=values,
        bootstrap_value=bootstrap_value,
        entropy_coeff=entropy_coeff,
        vf_loss_coeff=self.vf_loss_coeff,
        clip_rho_threshold=self.clip_rho_threshold,
        clip_pg_rho_threshold=self.clip_pg_rho_threshold)

    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=40.0))

    optimizer = fluid.optimizer.AdamOptimizer(learning_rate)
    optimizer.minimize(vtrace_loss.total_loss)

    return vtrace_loss, kl
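# --- Reference sketch (not from the library): the shape handling above in plain
# NumPy, assuming samples are laid out as [B * T] with T consecutive steps per
# rollout. It reshapes to [T, B, ...], keeps steps 0..T-2 for the V-trace inputs,
# and uses step T-1 as the bootstrap slice.
import numpy as np

def split_and_bootstrap(flat, T):
    B = flat.shape[0] // T
    x = flat.reshape((B, T) + flat.shape[1:])       # [B*T, ...] -> [B, T, ...]
    x = np.swapaxes(x, 0, 1)                        # [B, T, ...] -> [T, B, ...]
    body = x[:-1]                                   # [T-1, B, ...] for V-trace
    bootstrap = x[-1]                               # [B, ...] bootstrap values
    return body, bootstrap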