def get_returns(episodes):
    # episode.rewards has the time dimension first; summing over dim 0 gives
    # the undiscounted return of every rollout in the batch.
    return to_numpy([episode.rewards.sum(dim=0) for episode in episodes])
def get_returns(tasks, episodes):
    # Variant for goal-reaching tasks: the return of each rollout is the
    # negative Euclidean distance between its observations and the task's
    # goal, accumulated over the time dimension.
    ret = [
        -np.linalg.norm(np.array(episode.observations)
                        - np.expand_dims(tasks[taskIdx]['goal'], 0),
                        axis=2).sum(0)
        for taskIdx, episode in enumerate(episodes)
    ]
    return to_numpy(ret)
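# A minimal, self-contained sketch of the distance-based return computed by
# get_returns(tasks, episodes) above. The _FakeEpisode class and the shapes
# (T=10 steps, a batch of 5 rollouts, 2-D observations) are illustrative
# assumptions mirroring a 2-D goal-reaching setting; they are not taken from
# the original code.
def _demo_goal_returns():
    import numpy as np

    class _FakeEpisode:
        def __init__(self, observations):
            self.observations = observations  # list of (batch_size, 2) arrays

    rng = np.random.default_rng(0)
    tasks = [{'goal': rng.uniform(-0.5, 0.5, size=(2,))} for _ in range(3)]
    episodes = [_FakeEpisode([rng.normal(size=(5, 2)) for _ in range(10)])
                for _ in tasks]

    # Same computation as get_returns(tasks, episodes), minus to_numpy:
    returns = np.stack([
        -np.linalg.norm(np.array(ep.observations)
                        - np.expand_dims(tasks[i]['goal'], 0),
                        axis=2).sum(0)
        for i, ep in enumerate(episodes)
    ])
    assert returns.shape == (3, 5)  # (num_tasks, batch_size)
    return returns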
def step(self,
         train_futures,
         valid_futures,
         max_kl=1e-3,
         cg_iters=10,
         cg_damping=1e-2,
         ls_max_steps=10,
         ls_backtrack_ratio=0.5):
    num_tasks = len(train_futures[0])
    logs = {}

    # Compute the surrogate loss
    old_losses, old_kls, old_pis = self._async_gather([
        self.surrogate_loss(train, valid, old_pi=None)
        for (train, valid) in zip(zip(*train_futures), valid_futures)])

    logs['loss_before'] = to_numpy(old_losses)
    logs['kl_before'] = to_numpy(old_kls)

    old_loss = sum(old_losses) / num_tasks
    grads = torch.autograd.grad(old_loss,
                                self.policy.parameters(),
                                retain_graph=True)
    grads = parameters_to_vector(grads)

    # Compute the step direction with Conjugate Gradient
    old_kl = sum(old_kls) / num_tasks
    hessian_vector_product = self.hessian_vector_product(old_kl,
                                                         damping=cg_damping)
    stepdir = conjugate_gradient(hessian_vector_product,
                                 grads,
                                 cg_iters=cg_iters)

    # Compute the Lagrange multiplier
    shs = 0.5 * torch.dot(stepdir,
                          hessian_vector_product(stepdir, retain_graph=False))
    lagrange_multiplier = torch.sqrt(shs / max_kl)

    step = stepdir / lagrange_multiplier

    # Save the old parameters
    old_params = parameters_to_vector(self.policy.parameters())

    # Line search
    step_size = 1.0
    for _ in range(ls_max_steps):
        vector_to_parameters(old_params - step_size * step,
                             self.policy.parameters())

        losses, kls, _ = self._async_gather([
            self.surrogate_loss(train, valid, old_pi=old_pi)
            for (train, valid, old_pi)
            in zip(zip(*train_futures), valid_futures, old_pis)])

        improve = (sum(losses) / num_tasks) - old_loss
        kl = sum(kls) / num_tasks
        if (improve.item() < 0.0) and (kl.item() < max_kl):
            logs['loss_after'] = to_numpy(losses)
            logs['kl_after'] = to_numpy(kls)
            break
        step_size *= ls_backtrack_ratio
    else:
        vector_to_parameters(old_params, self.policy.parameters())

    return logs
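# The step direction above comes from solving H x = g with the conjugate
# gradient method, using only Hessian-vector products. Below is a hedged
# sketch of such a solver; the helper name and the residual_tol default are
# assumptions, not necessarily what the imported conjugate_gradient uses.
# Note on the Lagrange multiplier: with shs = 0.5 * x^T H x, dividing stepdir
# by sqrt(shs / max_kl) rescales the step so that the quadratic approximation
# of the KL divergence equals max_kl.
def _conjugate_gradient_sketch(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    import torch
    x = torch.zeros_like(b)
    r = b.clone()                  # residual of H x = b with x = 0
    p = b.clone()                  # current search direction
    rdotr = torch.dot(r, r)
    for _ in range(cg_iters):
        Ap = f_Ax(p)               # Hessian-vector product H p
        alpha = rdotr / torch.dot(p, Ap)
        x = x + alpha * p
        r = r - alpha * Ap
        new_rdotr = torch.dot(r, r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x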
def step(self,
         # futures = (train_episodes_futures, valid_episodes_futures)
         train_futures,
         valid_futures,
         max_kl=1e-3,
         cg_iters=10,
         cg_damping=1e-2,
         ls_max_steps=10,
         ls_backtrack_ratio=0.5):
    # Number of tasks
    num_tasks = len(train_futures[0])
    logs = {}

    """
    Compute the surrogate loss.
    For each task, the surrogate loss and related quantities are computed from
    its (train, valid) episode pair.
    The `policy` used here can be the pre-sampling policy that
    GradientBasedMetaLearner receives from MultiTaskSampler; it should also be
    possible to use the `original_policy` in MAMLTRPO, which comes directly
    from the constructor argument. Whether the two policies are actually the
    same still needs to be verified.
    """
    # The statement below evaluates surrogate_loss() once per task.
    # NOTE: this unpacking assumes surrogate_loss returns a
    # (train_loss, valid_loss) pair; the version of step above unpacks three
    # values (old_losses, old_kls, old_pis) from the same call, in which case
    # a third variable would be required here.
    train_loss, valid_loss = self._async_gather([
        self.surrogate_loss(train, valid, old_pi=None)
        for (train, valid) in zip(zip(*train_futures), valid_futures)])

    logs['train_loss'] = to_numpy(train_loss)
    logs['valid_loss'] = to_numpy(valid_loss)

    """
    # Average the per-task losses into a scalar
    old_loss = sum(old_losses) / num_tasks
    grads = torch.autograd.grad(old_loss,
                                self.policy.parameters(),
                                retain_graph=True)
    grads = parameters_to_vector(grads)

    # Compute the step direction with Conjugate Gradient
    # Average the per-task KL divergences into a scalar
    old_kl = sum(old_kls) / num_tasks
    hessian_vector_product = self.hessian_vector_product(old_kl,
                                                         damping=cg_damping)
    stepdir = conjugate_gradient(hessian_vector_product,
                                 grads,
                                 cg_iters=cg_iters)

    # Compute the Lagrange multiplier
    shs = 0.5 * torch.dot(stepdir,
                          hessian_vector_product(stepdir, retain_graph=False))
    lagrange_multiplier = torch.sqrt(shs / max_kl)

    step = stepdir / lagrange_multiplier

    # Save the old parameters
    old_params = parameters_to_vector(self.policy.parameters())
    # vector_to_parameters(*, self.policy.parameters()) is what writes the
    # updated parameter vector back into the network.

    # Line search
    step_size = 1.0
    for _ in range(ls_max_steps):
        vector_to_parameters(old_params - step_size * step,
                             self.policy.parameters())

        losses, kls, _ = self._async_gather([
            self.surrogate_loss(train, valid, old_pi=old_pi)
            for (train, valid, old_pi)
            in zip(zip(*train_futures), valid_futures, old_pis)])

        improve = (sum(losses) / num_tasks) - old_loss
        kl = sum(kls) / num_tasks
        if (improve.item() < 0.0) and (kl.item() < max_kl):
            logs['loss_after'] = to_numpy(losses)
            logs['kl_after'] = to_numpy(kls)
            break
        step_size *= ls_backtrack_ratio
    else:
        vector_to_parameters(old_params, self.policy.parameters())

    # Inspect the final network parameters
    params_final = self.policy.parameters()
    """

    # In the original implementation `logs` holds 'loss_before', 'kl_before',
    # 'loss_after' and 'kl_after'; this variant only populates 'train_loss'
    # and 'valid_loss'.
    return logs
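# The commented-out block above ends with a backtracking line search: take the
# full natural-gradient step and shrink it geometrically until the surrogate
# loss improves and the KL constraint holds. Below is a generic, hedged sketch
# of that pattern, detached from the policy/futures machinery; `evaluate`,
# `old_params` and `full_step` are hypothetical stand-ins, not names from the
# original code.
def _backtracking_line_search(evaluate, old_params, full_step, old_loss,
                              max_kl, max_steps=10, backtrack_ratio=0.5):
    # `evaluate(params)` is assumed to return (loss, kl) for a candidate
    # parameter vector.
    step_size = 1.0
    for _ in range(max_steps):
        candidate = old_params - step_size * full_step
        loss, kl = evaluate(candidate)
        if (loss < old_loss) and (kl < max_kl):
            return candidate, step_size
        step_size *= backtrack_ratio
    # No acceptable step was found; fall back to the old parameters.
    return old_params, 0.0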