def surrogate_loss(self, episodes, old_pis=None):
    losses, kls, pis = [], [], []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
        if self.usePPO:
            params, grad_norm = self.adapt_ppo(train_episodes)
        else:
            params = self.adapt(train_episodes)
        self.logger.info("in surrogate_loss")

        with torch.set_grad_enabled(old_pi is None):
            if self.baseline_type == 'critic shared':
                # the shared network returns both the action distribution and the value
                pi, _ = self.policy(valid_episodes.observations, params=params)
            else:
                pi = self.policy(valid_episodes.observations, params=params)
            pis.append(detach_distribution(pi))

            if old_pi is None:
                old_pi = detach_distribution(pi)

            if self.baseline_type == 'linear':
                values = self.baseline(valid_episodes)
            elif self.baseline_type == 'critic separate':
                values = self.baseline(valid_episodes.observations)
            elif self.baseline_type == 'critic shared':
                _, values = self.policy(valid_episodes.observations, params=params)
            advantages = valid_episodes.gae(values, tau=self.tau)
            advantages = weighted_normalize(advantages,
                                            weights=valid_episodes.mask)

            log_ratio = (pi.log_prob(valid_episodes.actions)
                         - old_pi.log_prob(valid_episodes.actions))
            if log_ratio.dim() > 2:
                log_ratio = torch.sum(log_ratio, dim=2)
            ratio = torch.exp(log_ratio)

            loss = -weighted_mean(ratio * advantages, dim=0,
                                  weights=valid_episodes.mask)
            losses.append(loss)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
            kls.append(kl)

    return (torch.mean(torch.stack(losses, dim=0)),
            torch.mean(torch.stack(kls, dim=0)),
            pis)

def surrogate_loss(self, episodes, old_pis=None):
    """Surrogate loss for the TRPO meta-update.

    old_pis is non-None only during the line search: there it plays the role
    of the behaviour policy in TRPO, staying fixed while pi is recomputed
    with the candidate parameters, so the importance ratio pi/old_pi (and
    the KL constraint) measure how far the candidate update has moved.
    """
    losses, kls, pis = [], [], []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
        # adapt the policy network to a new task (inner-loop update)
        params = self.adapt(train_episodes)

        # gradients are only needed on the first pass, when old_pi is None;
        # during the line search the loss is evaluated without gradients
        with torch.set_grad_enabled(old_pi is None):
            pi = self.policy(valid_episodes.observations, params=params)
            # the set of policies adapted to each task
            pis.append(detach_distribution(pi))

            if old_pi is None:
                old_pi = detach_distribution(pi)

            values = self.baseline(valid_episodes)
            advantages = valid_episodes.gae(values, tau=self.tau)
            advantages = weighted_normalize(advantages,
                                            weights=valid_episodes.mask)

            log_ratio = (pi.log_prob(valid_episodes.actions)
                         - old_pi.log_prob(valid_episodes.actions))
            if log_ratio.dim() > 2:
                log_ratio = torch.sum(log_ratio, dim=2)
            ratio = torch.exp(log_ratio)

            loss = -weighted_mean(ratio * advantages, dim=0,
                                  weights=valid_episodes.mask)
            losses.append(loss)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
            kls.append(kl)

    return (torch.mean(torch.stack(losses, dim=0)),
            torch.mean(torch.stack(kls, dim=0)),
            pis)

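To make the docstring concrete, here is a minimal sketch of a TRPO-style line search that would reuse old_pis. The step direction `step` and the bookkeeping around `vector_to_parameters` are assumptions modelled on a typical TRPO meta-step, not code from this section.

from torch.nn.utils import parameters_to_vector, vector_to_parameters

# Hypothetical line search (sketch): old_pis is computed once with the current
# meta-parameters and then held fixed while candidate parameters are evaluated.
old_params = parameters_to_vector(self.policy.parameters())
old_loss, _, old_pis = self.surrogate_loss(episodes)              # gradients enabled
step_size = 1.0
for _ in range(ls_max_steps):
    vector_to_parameters(old_params - step_size * step,           # try candidate params
                         self.policy.parameters())
    loss, kl, _ = self.surrogate_loss(episodes, old_pis=old_pis)  # evaluated without grad
    if (loss - old_loss).item() < 0.0 and kl.item() < max_kl:     # improvement inside
        break                                                     # the trust region
    step_size *= ls_backtrack_ratio                               # otherwise backtrack
else:
    vector_to_parameters(old_params, self.policy.parameters())    # reject the update
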
def surrogate_loss(self, episodes, old_pis=None):
    losses, kls, action_dists, critic_losses = [], [], [], []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
        policy_params, critic_params = self.adapt(train_episodes)

        with torch.set_grad_enabled(old_pi is None):
            action_dist = self.policy(valid_episodes.observations,
                                      params=policy_params)
            action_dists.append(detach_distribution(action_dist))

            if old_pi is None:
                old_pi = detach_distribution(action_dist)

            values = self.critic(valid_episodes.observations,
                                 params=critic_params)
            advantages = valid_episodes.gae(values, tau=self.tau)
            # the critic is trained to drive the GAE residuals towards zero
            value_loss = weighted_mean(advantages.pow(2), dim=0,
                                       weights=valid_episodes.mask)
            critic_losses.append(value_loss)
            advantages = weighted_normalize(advantages,
                                            weights=valid_episodes.mask,
                                            epsilon=1e-5)

            log_ratio = (action_dist.log_prob(valid_episodes.actions)
                         - old_pi.log_prob(valid_episodes.actions))
            if log_ratio.dim() > 2:
                log_ratio = torch.sum(log_ratio, dim=2)
            ratio = torch.exp(log_ratio)

            # advantages are detached so the policy loss does not
            # backpropagate into the critic
            loss = -weighted_mean(ratio * advantages.detach(), dim=0,
                                  weights=valid_episodes.mask)
            losses.append(loss)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(action_dist, old_pi), dim=0,
                               weights=mask)
            kls.append(kl)

    return (torch.mean(torch.stack(losses, dim=0)),
            torch.mean(torch.stack(kls, dim=0)),
            action_dists,
            torch.mean(torch.stack(critic_losses, dim=0)))

async def surrogate_loss(self, train_futures, valid_futures, old_pi=None):
    first_order = (old_pi is not None) or self.first_order
    # Suspend this coroutine here and wait for adapt() to finish and
    # return the adapted parameters.
    params = await self.adapt(train_futures, first_order=first_order)
    """
    Execution only continues once the train_futures above have completed.
    Each train_future is paired with one valid_future, and each pair runs
    in parallel with the others; the number of futures equals the number
    of tasks in a batch.
    """
    with torch.set_grad_enabled(old_pi is None):
        # Suspend the coroutine and wait for the valid_futures awaitable
        # to resolve into the validation episodes.
        valid_episodes = await valid_futures
        pi = self.policy(valid_episodes.observations, params=params)

        if old_pi is None:
            old_pi = detach_distribution(pi)

        log_ratio = (pi.log_prob(valid_episodes.actions)
                     - old_pi.log_prob(valid_episodes.actions))
        ratio = torch.exp(log_ratio)

        losses = -weighted_mean(ratio * valid_episodes.advantages,
                                lengths=valid_episodes.lengths)
        kls = weighted_mean(kl_divergence(pi, old_pi),
                            lengths=valid_episodes.lengths)

    return losses.mean(), kls.mean(), old_pi

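For context, a minimal sketch of how these per-task coroutines might be driven. The asyncio-based gather and the pairing of train_futures with valid_futures are assumptions modelled on the comments above, not code from this section.

import asyncio

# Hypothetical driver (sketch): build one surrogate_loss coroutine per task
# and run them concurrently; each coroutine awaits its own train/valid futures.
coros = [self.surrogate_loss(train, valid, old_pi=None)
         for train, valid in zip(train_futures, valid_futures)]
losses, kls, old_pis = zip(
    *asyncio.get_event_loop().run_until_complete(asyncio.gather(*coros)))
loss = sum(losses) / len(losses)   # average the per-task surrogate losses
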
async def surrogate_loss(self, train_futures, valid_futures, old_pi=None,
                         args=None, inner=None):
    first_order = (old_pi is not None) or self.first_order
    params = await self.adapt(train_futures, first_order=first_order,
                              args=args, inner=inner)

    with torch.set_grad_enabled(old_pi is None):
        valid_episodes = await valid_futures
        pi = self.policy(valid_episodes.observations, params=params)

        if old_pi is None:
            old_pi = detach_distribution(pi)

        log_ratio = (pi.log_prob(valid_episodes.actions)
                     - old_pi.log_prob(valid_episodes.actions))
        ratio = torch.exp(log_ratio)

        losses = -weighted_mean(ratio * valid_episodes.advantages,
                                lengths=valid_episodes.lengths)
        kls = weighted_mean(kl_divergence(pi, old_pi),
                            lengths=valid_episodes.lengths)

    return losses.mean(), kls.mean(), old_pi

def kl_divergence(self, episodes, old_pis=None):
    kls = []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
        self.logger.info("in kl divergence")
        if self.usePPO:
            params, grad_norm = self.adapt_ppo(train_episodes)
        else:
            params = self.adapt(train_episodes)
            grad_norm = []

        # if self.baseline_type == 'critic shared':
        #     pi, _ = self.policy(valid_episodes.observations, params=params)
        pi = self.policy(valid_episodes.observations, params=params)

        if old_pi is None:
            old_pi = detach_distribution(pi)

        mask = valid_episodes.mask
        if valid_episodes.actions.dim() > 2:
            mask = mask.unsqueeze(2)
        kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
        kls.append(kl)

        self.logger.info("kl:")
        self.logger.info(kls)
        self.logger.info("grad_norm:")
        self.logger.info(grad_norm)
        # pdb.set_trace()

    return torch.mean(torch.stack(kls, dim=0))

def kl_divergence_ng(self, episodes):
    # episodes here are the train episodes
    pi = self.policy(episodes.observations)
    pi_detach = detach_distribution(pi)

    mask = episodes.mask
    if episodes.actions.dim() > 2:
        mask = mask.unsqueeze(2)
    kl = weighted_mean(kl_divergence(pi_detach, pi), dim=0, weights=mask)

    return kl

def surrogate_loss(self, episodes, old_pis=None):
    losses, kls, pis = [], [], []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
        params = self.adapt(train_episodes)

        with torch.set_grad_enabled(True):
            pi = self.policy(valid_episodes.observations, params=params)
            pis.append(detach_distribution(pi))

            if old_pi is None:
                old_pi = detach_distribution(pi)

            values = self.baseline(valid_episodes)
            advantages = valid_episodes.gae(values, tau=self.tau)
            advantages = weighted_normalize(advantages,
                                            weights=valid_episodes.mask)

            log_ratio = (pi.log_prob(valid_episodes.actions)
                         - old_pi.log_prob(valid_episodes.actions))
            if log_ratio.dim() > 2:
                log_ratio = torch.sum(log_ratio, dim=2)
            ratio = torch.exp(log_ratio)

            loss_clipped = ratio.clamp(1.0 - self.ppo_ratio,
                                       1.0 + self.ppo_ratio) * advantages
            loss = ratio * advantages
            loss = -torch.min(loss, loss_clipped)
            loss = weighted_mean(loss, dim=0, weights=valid_episodes.mask)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(old_pi, pi), dim=0, weights=mask)
            kls.append(kl)
            losses.append(loss + kl * 0.0005)

    return (torch.mean(torch.stack(losses, dim=0)),
            torch.mean(torch.stack(kls, dim=0)),
            pis)

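In equation form, the per-task objective this PPO-style variant minimizes is a restatement of the code above, not an external reference:

$$
\mathcal{L} \;=\; -\,\mathbb{E}\!\left[\min\!\big(r\,\hat{A},\; \operatorname{clip}(r,\,1-\epsilon,\,1+\epsilon)\,\hat{A}\big)\right] \;+\; \beta\, D_{\mathrm{KL}}\!\big(\pi_{\text{old}}\,\|\,\pi\big),
\qquad r = \frac{\pi(a\mid s)}{\pi_{\text{old}}(a\mid s)},
$$

with $\epsilon$ = self.ppo_ratio, $\beta$ = 0.0005 as hard-coded above, $\hat{A}$ the normalized GAE estimate, and the expectations taken as mask-weighted means over the validation episodes.
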
def surrogate_loss(self, episodes, old_pis=None):
    losses, kls, pis = [], [], []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
        params = self.adapt(train_episodes)

        with torch.set_grad_enabled(old_pi is None):
            pi = self.policy(valid_episodes.observations, params=params)
            # detach the mu and scale parameters of pi: no gradients flow
            # through the stored distributions, so they are never updated
            pis.append(detach_distribution(pi))

            if old_pi is None:
                old_pi = detach_distribution(pi)

            values = self.baseline(valid_episodes)
            advantages = valid_episodes.gae(values, tau=self.tau)
            advantages = weighted_normalize(advantages,
                                            weights=valid_episodes.mask)

            # log_ratio is 0 on the first pass; during the line search pi
            # changes with the candidate parameters while old_pi stays fixed
            log_ratio = (pi.log_prob(valid_episodes.actions)
                         - old_pi.log_prob(valid_episodes.actions))
            # print('log_ratio: ', log_ratio)
            if log_ratio.dim() > 2:
                log_ratio = torch.sum(log_ratio, dim=2)
            ratio = torch.exp(log_ratio)
            # print('ratio: ', ratio)
            # print('advantages: ', advantages)

            loss = -weighted_mean(ratio * advantages, dim=0,
                                  weights=valid_episodes.mask)
            # the weighted-mean loss is very small, on the order of 1e-8
            print('loss: ', loss)
            losses.append(loss)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
            kls.append(kl)

    return (torch.mean(torch.stack(losses, dim=0)),
            torch.mean(torch.stack(kls, dim=0)),
            pis)

def surrogate_loss(self, episodes, old_pis=None):
    """Computes the TRPO surrogate loss, (pi(a|s) / q(a|s)) * Q(s,a) in Eqn 14.

    Because the meta-objective looks for a theta that minimizes the loss
    obtained with the adapted parameters phi, the loss is computed on the
    validation episodes.
    """
    losses, kls, pis = [], [], []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
        params = self.adapt(train_episodes)

        with torch.set_grad_enabled(old_pi is None):
            pi = self.policy(valid_episodes.observations, params=params)
            pis.append(detach_distribution(pi))

            if old_pi is None:
                old_pi = detach_distribution(pi)

            values = self.baseline(valid_episodes)
            advantages = valid_episodes.gae(values, tau=self.tau)
            advantages = weighted_normalize(advantages,
                                            weights=valid_episodes.mask)

            log_ratio = (pi.log_prob(valid_episodes.actions)
                         - old_pi.log_prob(valid_episodes.actions))
            if log_ratio.dim() > 2:
                log_ratio = torch.sum(log_ratio, dim=2)
            ratio = torch.exp(log_ratio)  # convert back from log space to the ratio

            loss = -weighted_mean(ratio * advantages, dim=0,
                                  weights=valid_episodes.mask)
            losses.append(loss)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
            kls.append(kl)

    return (torch.mean(torch.stack(losses, dim=0)),
            torch.mean(torch.stack(kls, dim=0)),
            pis)

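Written out, the per-task quantity this snippet evaluates is the following (a restatement of the code above, with $\hat{A}$ the GAE advantage after weighted normalization):

$$
\mathcal{L}(\theta) \;=\; -\,\mathbb{E}_{(s,a)\,\sim\,\text{valid}}\!\left[\frac{\pi_{\phi}(a\mid s)}{\pi_{\text{old}}(a\mid s)}\,\hat{A}(s,a)\right],
\qquad \phi = \texttt{adapt}(\theta,\ \text{train\_episodes}),
$$

where the expectation is the mask-weighted mean over the validation episodes. On the first call, $\pi_{\text{old}}$ is a detached copy of $\pi_\phi$, so the ratio equals 1 in value but still carries the gradient with respect to $\theta$ through $\pi_\phi$.
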
def kl_divergence(self, episodes, old_pis=None):
    kls = []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
        params = self.adapt(train_episodes)
        pi = self.policy(valid_episodes.observations, params=params)

        if old_pi is None:
            old_pi = detach_distribution(pi)

        mask = valid_episodes.mask
        if valid_episodes.actions.dim() > 2:
            mask = mask.unsqueeze(2)
        kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
        kls.append(kl)

    return torch.mean(torch.stack(kls, dim=0))

def kl_divergence(self, episodes, old_pis=None):
    """In Trust Region Policy Optimization (TRPO, [4]), the heuristic
    approximation which considers the "average" KL divergence is used
    instead of the maximum KL divergence over states.
    """
    kls = []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
        params = self.adapt(train_episodes)
        pi = self.policy(valid_episodes.observations, params=params)

        if old_pi is None:
            old_pi = detach_distribution(pi)

        mask = valid_episodes.mask
        if valid_episodes.actions.dim() > 2:
            mask = mask.unsqueeze(2)
        kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
        kls.append(kl)

    return torch.mean(torch.stack(kls, dim=0))

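As a restatement of the quantity this snippet averages per task (mask-weighted over the validation states, with the argument order matching the call kl_divergence(pi, old_pi)):

$$
\bar{D}_{\mathrm{KL}}
\;=\; \mathbb{E}_{s\,\sim\,\text{valid}}\!\left[ D_{\mathrm{KL}}\!\big(\pi_\phi(\cdot\mid s)\,\big\|\,\pi_{\text{old}}(\cdot\mid s)\big)\right],
$$

and the returned value is the mean of these per-task averages across the sampled tasks.
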
def step(self, episodes, max_kl=1e-3, cg_iters=10, cg_damping=1e-2,
         ls_max_steps=10, ls_backtrack_ratio=0.5):
    """Meta-optimization step (i.e. update of the initial parameters).

    The signature mirrors the TRPO-based meta-update (TRPO, [4]), but this
    variant takes plain gradient steps with `self.optimizer`; the trust-region
    arguments (max_kl, cg_iters, cg_damping, ls_max_steps, ls_backtrack_ratio)
    are unused here.
    """
    old_pis = []
    for train_episodes, valid_episodes in episodes:
        params = self.adapt(train_episodes)
        pi = self.policy(valid_episodes.observations, params=params)
        old_pis.append(detach_distribution(pi))

    for _ in range(self.optimization_epochs):
        self.optimizer.zero_grad()
        old_loss, _, _ = self.surrogate_loss(episodes, old_pis=old_pis)
        old_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.policy.parameters(),
        #                                self.gradient_clip)
        self.optimizer.step()

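For orientation, a minimal sketch of an outer training loop that would drive this step. The sampler interface (`sample_tasks`, `sample`), `num_batches`, and `meta_batch_size` are assumptions modelled on common MAML-RL setups, not taken from this section.

# Hypothetical outer loop (sketch): sample a batch of tasks, collect
# train/valid episodes per task, then apply the meta-update.
for batch in range(num_batches):
    tasks = sampler.sample_tasks(num_tasks=meta_batch_size)
    episodes = sampler.sample(tasks)   # [(train_episodes, valid_episodes), ...]
    metalearner.step(episodes, max_kl=1e-3, cg_iters=10, cg_damping=1e-2,
                     ls_max_steps=10, ls_backtrack_ratio=0.5)
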
def surrogate_loss(self, episodes, old_pis=None):
    """
    Surrogate objective:
        E_r SmoothReLU( V_r^{adapted self.policy}
                        - \max_{\pi \in self.policies[0:policy_idx - 1]} V_r^\pi )

    V_r^{adapted self.policy} is evaluated from the valid_episodes in episodes;
    \max_{\pi \in self.policies[0:policy_idx - 1]} V_r^\pi is precomputed in
    self.values_of_optimized_policies.

    :param episodes: [(episodes before adapting, episodes after adapting)
                      for task in sampled tasks]
    :param old_pis: dummy parameter inherited from the superclass
    :return: mean of losses, mean of kls, pis
    """
    losses, kls, pis = [], [], []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for episode_index in range(len(episodes)):
        (train_episodes, valid_episodes) = episodes[episode_index]
        old_pi = old_pis[episode_index]

        if self.current_policy_idx == 0:
            dominance_correction = 1
        else:
            difference_from_best_value = total_rewards(
                valid_episodes.rewards
            ) - self.values_of_optimized_policies[episode_index]
            dominance_correction = 1 - 1 / (
                1 + math.exp(difference_from_best_value))

        params = self.adapt(train_episodes)
        with torch.set_grad_enabled(old_pi is None):
            pi = self.policy(valid_episodes.observations, params=params)
            pis.append(detach_distribution(pi))

            if old_pi is None:
                old_pi = detach_distribution(pi)

            values = self.baseline(valid_episodes)
            advantages = valid_episodes.gae(values, tau=self.tau)
            advantages = weighted_normalize(advantages,
                                            weights=valid_episodes.mask)

            log_ratio = (pi.log_prob(valid_episodes.actions)
                         - old_pi.log_prob(valid_episodes.actions))
            if log_ratio.dim() > 2:
                log_ratio = torch.sum(log_ratio, dim=2)
            ratio = torch.exp(log_ratio)

            loss = -dominance_correction * weighted_mean(
                ratio * advantages, dim=0, weights=valid_episodes.mask)
            losses.append(loss)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
            kls.append(kl)

    if len(losses) == 0 or len(kls) == 0:
        # signal to the caller that there are no losses, to avoid taking
        # the mean of empty tensors
        return (None, None, pis)
    else:
        return (torch.mean(torch.stack(losses, dim=0)),
                torch.mean(torch.stack(kls, dim=0)),
                pis)

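A short note on the dominance_correction weight, restating what the code computes (with $d$ the gap between the adapted policy's return and the best previously optimized policy's value):

$$
w \;=\; 1 - \frac{1}{1 + e^{\,d}} \;=\; \sigma(d),
\qquad d \;=\; V_r^{\text{adapted}} - \max_{\pi} V_r^{\pi},
$$

and since the sigmoid $\sigma$ is the derivative of the SmoothReLU (softplus) in the stated objective, scaling the per-task policy-gradient term by $w$ is consistent with differentiating $\operatorname{softplus}(d)$ by the chain rule.
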
def compute_ng_gradient(self, episodes, max_kl=1e-3, cg_iters=20,
                        cg_damping=1e-2, ls_max_steps=10,
                        ls_backtrack_ratio=0.5):
    ng_grads = []
    for train_episodes, valid_episodes in episodes:
        params, step_size, step = self.adapt(train_episodes)

        # compute grad = \nabla_x J^{lvc}(x) at x = \theta - \eta U(\theta)
        pi = self.policy(valid_episodes.observations, params=params)
        pi_detach = detach_distribution(pi)

        values = self.baseline(valid_episodes)
        advantages = valid_episodes.gae(values, tau=self.tau)
        advantages = weighted_normalize(advantages,
                                        weights=valid_episodes.mask)

        log_ratio = (pi.log_prob(valid_episodes.actions)
                     - pi_detach.log_prob(valid_episodes.actions))
        if log_ratio.dim() > 2:
            log_ratio = torch.sum(log_ratio, dim=2)
        ratio = torch.exp(log_ratio)

        loss = -weighted_mean(ratio * advantages, dim=0,
                              weights=valid_episodes.mask)
        ng_grad_0 = torch.autograd.grad(loss, self.policy.parameters())  # no create_graph
        ng_grad_0 = parameters_to_vector(ng_grad_0)

        # compute the inverse of the Fisher matrix at x = \theta times grad,
        # using conjugate gradient
        hessian_vector_product = self.hessian_vector_product_ng(
            train_episodes, damping=cg_damping)
        F_inv_grad = conjugate_gradient(hessian_vector_product, ng_grad_0,
                                        cg_iters=cg_iters)

        # compute ng_grad_1 = \nabla^2 J^{lvc}(x) at x = \theta times F_inv_grad;
        # create the graph for the higher-order derivative
        # self.baseline.fit(train_episodes)
        loss = self.inner_loss(train_episodes)
        grad = torch.autograd.grad(loss, self.policy.parameters(),
                                   create_graph=True)
        grad = parameters_to_vector(grad)
        grad_F_inv_grad = torch.dot(grad, F_inv_grad.detach())
        ng_grad_1 = torch.autograd.grad(grad_F_inv_grad,
                                        self.policy.parameters())
        ng_grad_1 = parameters_to_vector(ng_grad_1)

        # compute ng_grad_2 = the Jacobian of {F(x) U(\theta)} at x = \theta
        # times F_inv_grad
        hessian_vector_product = self.hessian_vector_product_ng(
            train_episodes, damping=cg_damping)
        F_U = hessian_vector_product(step)
        ng_grad_2 = torch.autograd.grad(torch.dot(F_U, F_inv_grad.detach()),
                                        self.policy.parameters())
        ng_grad_2 = parameters_to_vector(ng_grad_2)

        ng_grad = ng_grad_0 - step_size * (ng_grad_1 + ng_grad_2)
        ng_grad = parameters_to_vector(ng_grad)
        ng_grads.append(ng_grad.view(len(ng_grad), 1))

    return torch.mean(torch.stack(ng_grads, dim=1), dim=[1, 2])

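Putting the comments together, the per-task meta-gradient this routine assembles can be written as follows. This is only a restatement of what the code computes, with $g_0$ the gradient at the adapted parameters, $F$ the damped Fisher matrix, $J^{\text{lvc}}$ the inner objective, and $U(\theta)$ the natural-gradient adaptation step:

$$
g_{\text{meta}} \;=\; g_0 \;-\; \eta\,\Big(\underbrace{\nabla^2 J^{\text{lvc}}(\theta)\,v}_{\texttt{ng\_grad\_1}} \;+\; \underbrace{\nabla_\theta\!\big[(F(\theta)\,U)^{\!\top} v\big]}_{\texttt{ng\_grad\_2}}\Big),
\qquad v = F(\theta)^{-1} g_0,
\qquad g_0 = \nabla_x J^{\text{lvc}}(x)\big|_{x \,=\, \theta - \eta U(\theta)},
$$

where $v$ is obtained by conjugate gradient against the damped Fisher-vector product, and the returned value is the average of $g_{\text{meta}}$ over the sampled tasks.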