def update():
    data = buf.get()

    pi_l_old, pi_info_old = compute_loss_pi(data)
    pi_l_old = pi_l_old.item()
    v_l_old = compute_loss_v(data).item()

    # Train policy with multiple steps of gradient descent
    for i in range(train_pi_iters):
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        kl = mpi_avg(pi_info['kl'])
        if kl > 1.5 * target_kl:
            logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break
        loss_pi.backward()
        mpi_avg_grads(ac.pi)    # average grads across MPI processes
        pi_optimizer.step()

    logger.store(StopIter=i)

    # Value function learning
    for i in range(train_v_iters):
        vf_optimizer.zero_grad()
        loss_v = compute_loss_v(data)
        loss_v.backward()
        mpi_avg_grads(ac.v)    # average grads across MPI processes
        vf_optimizer.step()

    # Log changes from update
    kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
    logger.store(LossPi=pi_l_old, LossV=v_l_old,
                 KL=kl, Entropy=ent, ClipFrac=cf,
                 DeltaLossPi=(loss_pi.item() - pi_l_old),
                 DeltaLossV=(loss_v.item() - v_l_old))
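# The update() variants here all lean on mpi_avg / mpi_avg_grads to keep the
# gradient steps synchronized across workers. A minimal sketch of those
# helpers in the style of Spinning Up's mpi_tools / mpi_pytorch modules
# (assumes mpi4py and CPU tensors; illustrative, not the canonical code):
from mpi4py import MPI
import numpy as np

def num_procs():
    # Number of active MPI processes.
    return MPI.COMM_WORLD.Get_size()

def mpi_avg(x):
    # Average a scalar or numpy array over all MPI processes.
    x, scalar = ([x], True) if np.isscalar(x) else (x, False)
    x = np.asarray(x, dtype=np.float32)
    buf = np.zeros_like(x)
    MPI.COMM_WORLD.Allreduce(x, buf, op=MPI.SUM)
    avg = buf / num_procs()
    return avg[0] if scalar else avg

def mpi_avg_grads(module):
    # Replace each parameter's gradient with its average across processes,
    # so every process applies the same optimizer step.
    if num_procs() == 1:
        return
    for p in module.parameters():
        p_grad_numpy = p.grad.numpy()    # numpy view of the grad tensor
        p_grad_numpy[:] = mpi_avg(p_grad_numpy)[:]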
def update():
    data = buf.get()

    # Get loss and info values before update
    pi_l_old, pi_info_old = compute_loss_pi(data)
    pi_l_old = pi_l_old.item()
    v_l_old = compute_loss_v(data).item()

    # Train policy with a single step of gradient descent
    pi_optimizer.zero_grad()
    loss_pi, pi_info = compute_loss_pi(data)
    loss_pi.backward()
    mpi_avg_grads(ac.pi)    # average grads across MPI processes
    pi_optimizer.step()

    # Value function learning
    for i in range(train_v_iters):
        vf_optimizer.zero_grad()
        loss_v = compute_loss_v(data)
        loss_v.backward()
        mpi_avg_grads(ac.v)    # average grads across MPI processes
        vf_optimizer.step()

    # Log changes from update
    kl, ent = pi_info['kl'], pi_info_old['ent']
    logger.store(LossPi=pi_l_old, LossV=v_l_old,
                 KL=kl, Entropy=ent,
                 DeltaLossPi=(loss_pi.item() - pi_l_old),
                 DeltaLossV=(loss_v.item() - v_l_old))
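# The single-policy-step variant above is a vanilla policy gradient update; a
# minimal sketch of the compute_loss_pi it assumes (plain PG objective with
# logging info, no clipping -- contrast with the PPO loss defined inline in a
# later variant):
def compute_loss_pi(data):
    obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp']

    # Policy gradient loss: -E[log pi(a|s) * advantage]
    pi, logp = ac.pi(obs, act)
    loss_pi = -(logp * adv).mean()

    # Useful extra info for logging
    approx_kl = (logp_old - logp).mean().item()    # sample-based KL estimate
    ent = pi.entropy().mean().item()
    pi_info = dict(kl=approx_kl, ent=ent)

    return loss_pi, pi_info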
def update(self):
    data = self.buffer.get()

    actor_loss_old, actor_info_old = self.compute_actor_loss(data)
    actor_loss_old = actor_loss_old.item()
    critic_loss_old = self.compute_critic_loss(data).item()

    # train policy
    self.actor_optimizer.zero_grad()
    actor_loss, actor_info = self.compute_actor_loss(data)
    actor_loss.backward()
    mpi_avg_grads(self.ac.actor)
    self.actor_optimizer.step()

    # train critic
    for i in range(self.num_iter_train_critic):
        self.critic_optimizer.zero_grad()
        critic_loss = self.compute_critic_loss(data)
        critic_loss.backward()
        mpi_avg_grads(self.ac.critic)
        self.critic_optimizer.step()

    # log
    kl, entropy = actor_info['kl'], actor_info['entropy']
    self.logger.store(LossPi=actor_loss_old, LossV=critic_loss_old,
                      KL=kl, Entropy=entropy,
                      DeltaLossV=(critic_loss.item() - critic_loss_old),
                      DeltaLossPi=(actor_loss.item() - actor_loss_old))
def update(buf, ac, train_v_iters, pi_optimizer, vf_optimizer, logger):
    # Pull one epoch of trajectories; data is a dict with keys
    # obs, act, ret, adv, logp
    data = buf.get()    # data['obs'].shape=(4000, obs_dim), adv.shape=(4000,)

    # Compute the losses before the update
    pi_l_old, pi_info_old = compute_loss_pi(data=data, actor=ac.pi)
    pi_l_old = pi_l_old.item()
    v_l_old = compute_loss_v(data, critic=ac.v).item()

    # Update the policy network parameters
    pi_optimizer.zero_grad()
    loss_pi, pi_info = compute_loss_pi(data=data, actor=ac.pi)
    loss_pi.backward()
    mpi_avg_grads(ac.pi)    # average grads across MPI processes
    pi_optimizer.step()

    # Update the value network parameters
    for i in range(train_v_iters):
        vf_optimizer.zero_grad()
        loss_v = compute_loss_v(data=data, critic=ac.v)
        loss_v.backward()
        mpi_avg_grads(ac.v)    # average grads across MPI processes
        vf_optimizer.step()

    # Log changes from update
    kl, ent = pi_info['kl'], pi_info_old['ent']
    logger.store(LossPi=pi_l_old, LossV=v_l_old,
                 KL=kl, Entropy=ent,
                 DeltaLossPi=(loss_pi.item() - pi_l_old),
                 DeltaLossV=(loss_v.item() - v_l_old))
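# buf.get() above hands back the whole epoch as a dict of torch tensors; a
# minimal sketch of such a buffer method, assuming GAE-lambda advantages have
# already been written into self.adv_buf and that mpi_statistics_scalar
# returns the cross-process mean and std (illustrative, not the actual buffer):
def get(self):
    # Only call at the end of an epoch, when the buffer is exactly full.
    assert self.ptr == self.max_size
    self.ptr, self.path_start_idx = 0, 0

    # Advantage normalization trick: zero mean, unit std across all processes.
    adv_mean, adv_std = mpi_statistics_scalar(self.adv_buf)
    self.adv_buf = (self.adv_buf - adv_mean) / adv_std

    data = dict(obs=self.obs_buf, act=self.act_buf, ret=self.ret_buf,
                adv=self.adv_buf, logp=self.logp_buf)
    return {k: torch.as_tensor(v, dtype=torch.float32) for k, v in data.items()}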
def update():
    data = buf.get()

    v_l_old, pi_l_old, pi_info_old = compute_loss(data)
    pi_l_old = pi_l_old.item()
    v_l_old = v_l_old.item()

    # Train policy with multiple steps of gradient descent
    for i in range(train_iters):
        optimizer.zero_grad()
        loss_v, loss_pi, pi_info = compute_loss(data)
        kl = mpi_avg(pi_info['kl'])
        if kl > 1.5 * target_kl:
            logger.log(f'Early stopping at step {i} due to reaching max kl.')
            break
        loss = loss_pi + loss_v * v_loss_coeff
        loss.backward()
        mpi_avg_grads(ac.ac)    # average grads across MPI processes
        optimizer.step()

    logger.store(StopIter=i)

    # Log changes from update
    kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
    logger.store(LossPi=pi_l_old, LossV=v_l_old,
                 KL=kl, Entropy=ent, ClipFrac=cf,
                 DeltaLossPi=(loss_pi.item() - pi_l_old),
                 DeltaLossV=(loss_v.item() - v_l_old))
def demo_update():
    data = buf.get()

    pi_l_old, pi_info_old = compute_loss_pi(data)
    pi_l_old = pi_l_old.item()
    v_l_old = compute_loss_v_pi(data).item()

    for i in range(train_pi_iters):
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        kl = mpi_avg(pi_info['kl'])
        if kl > 1.5 * target_kl:
            # logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break
        loss_pi.backward()
        mpi_avg_grads(ac.pi)    # average grads across MPI processes
        pi_optimizer.step()

    logger.store(StopIter=i)

    for i in range(train_v_iters):
        vf_pi_optimizer.zero_grad()
        loss_v = compute_loss_v_pi(data)
        loss_v.backward()
        mpi_avg_grads(ac.v_pi)
        vf_pi_optimizer.step()

    print("Pi loss: {}".format(pi_l_old))

    kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
    logger.store(LossPi=pi_l_old, LossV=v_l_old,
                 KL=kl, Entropy=ent, ClipFrac=cf,
                 DeltaLossPi=(loss_pi.item() - pi_l_old),
                 DeltaLossV=(loss_v.item() - v_l_old))
def update():
    epsilon = 0.1
    data = buf.get()

    pi_l_old, pi_info_old = compute_loss_pi(data)
    pi_l_old = pi_l_old.item()
    v_l_old = compute_loss_v(data).item()

    # Train policy with multiple steps of gradient descent
    for i in range(train_pi_iters):
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        kl = mpi_avg(pi_info['kl'])
        if kl > 1.5 * target_kl:
            logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break

        # Manually update pi.parameters
        # loss_pi.backward()
        # Fills x.grad with y - epsilon * H y, where y is the gradient of the
        # loss wrt x and H y a Hessian-vector product via double backward.
        for l in ac.pi.logits_net:
            for x in l.parameters():
                # y = dL/dx, kept in the graph so it can be differentiated again
                y, = torch.autograd.grad(loss_pi, x,
                                         create_graph=True, retain_graph=True)
                w = torch.zeros(y.size(), requires_grad=True)
                # g = d(y.w)/dx = H w, differentiable in w
                g, = torch.autograd.grad(y, x, grad_outputs=w, create_graph=True)
                # r = d(g.y)/dw = H y
                r, = torch.autograd.grad(g, w, grad_outputs=y, create_graph=False)
                x.grad = y - epsilon * r
        mpi_avg_grads(ac.pi)    # average grads across MPI processes
        pi_optimizer.step()

    logger.store(StopIter=i)

    # Value function learning
    for i in range(train_v_iters):
        vf_optimizer.zero_grad()
        loss_v = compute_loss_v(data)
        loss_v.backward()
        mpi_avg_grads(ac.v)    # average grads across MPI processes
        vf_optimizer.step()

    # Log changes from update
    kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
    logger.store(LossPi=pi_l_old, LossV=v_l_old,
                 KL=kl, Entropy=ent, ClipFrac=cf,
                 DeltaLossPi=(loss_pi.item() - pi_l_old),
                 DeltaLossV=(loss_v.item() - v_l_old))
def update_vf():
    data = buf.get()

    v_l_old = compute_loss_v(data).item()
    print("Loss for Value function: {}".format(v_l_old))

    for i in range(train_v_iters):
        vf_optimizer.zero_grad()
        loss_v = compute_loss_v(data)
        loss_v.backward()
        mpi_avg_grads(ac.v)
        vf_optimizer.step()
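# update_vf() assumes a compute_loss_v helper; a minimal sketch, matching the
# mean-squared-error loss that the inline definition in a later variant uses:
def compute_loss_v(data):
    # MSE between the critic's value estimate and the empirical return.
    obs, ret = data['obs'], data['ret']
    return ((ac.v(obs) - ret) ** 2).mean()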
def update(episode_buffer):
    # Bootstrap the value of the state following the last transition
    if episode_buffer.dones[-1]:
        next_value = 0.0
    else:
        last_obs = episode_buffer.next_observations[-1]
        previous_reward = episode_buffer.rewards[-1]
        last_obs_tensor = torch.tensor(last_obs, dtype=torch.float32).unsqueeze(0)
        previous_reward_tensor = torch.tensor([previous_reward],
                                              dtype=torch.float32).unsqueeze(0)
        context = agent.get_context()
        next_value = target_agent.predict_value(
            obs_tensor=last_obs_tensor,
            previous_reward_tensor=previous_reward_tensor,
            goal_grid_code_tensor=goal_grid_code_tensor,
            context=context).cpu().item()

    # Super critical!!
    optimizer.zero_grad()

    # Compute value and policy losses
    loss, info = agent.compute_loss(
        rewards=np.array(episode_buffer.rewards),
        dones=np.array(episode_buffer.dones),
        next_value=next_value,
        discount_factor=gamma,
        use_gae=use_gae,
        tau=tau,
        value_loss_coef=value_loss_coef,
        policy_loss_coef=policy_loss_coef,
        entropy_reg_coef=entropy_loss_coef,
        grid_layer_wreg_loss_coef=grid_layer_weight_reg_loss_coef)
    loss.backward()
    if use_MPI:
        mpi_pytorch.mpi_avg_grads(agent)

    # Optimize
    if max_grad_norm is not None:
        torch.nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
    optimizer.step()

    # Log losses and info
    logger.store(**info)

    # Finally, update target networks by polyak averaging.
    with torch.no_grad():
        for p, p_targ in zip(agent.parameters(), target_agent.parameters()):
            # NB: We use in-place operations "mul_" and "add_" to update target
            # params, as opposed to "mul" and "add", which would make new tensors.
            p_targ.data.mul_(polyak)
            p_targ.data.add_((1 - polyak) * p.data)
    if use_MPI:
        mpi_pytorch.sync_params(target_agent)
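# agent.compute_loss above takes use_gae / tau, suggesting GAE(tau) advantage
# estimates. A minimal sketch of that computation (function and argument names
# are illustrative, not the agent's actual internals):
def compute_gae(rewards, values, dones, next_value, gamma, tau):
    # A_t = sum_l (gamma * tau)^l * delta_{t+l}, computed by a backward
    # recursion over TD residuals delta_t = r_t + gamma * V(s_{t+1}) - V(s_t).
    advantages = np.zeros_like(rewards, dtype=np.float32)
    last_gae, next_v = 0.0, next_value
    for t in reversed(range(len(rewards))):
        nonterminal = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * next_v * nonterminal - values[t]
        last_gae = delta + gamma * tau * nonterminal * last_gae
        advantages[t] = last_gae
        next_v = values[t]
    return advantages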
def update():
    data = buf.get()

    # Get loss and info values before update
    pi_l_old, pi_info_old = compute_loss_pi(data)
    pi_l_old = pi_l_old.item()
    v_l_old = compute_loss_v(data).item()

    # Train policy with a single step of gradient descent
    pi_optimizer.zero_grad()
    loss_pi, pi_info = compute_loss_pi(data)

    # Manually update pi.parameters
    # loss_pi.backward()
    # Same double-backward trick as above: x.grad = y - epsilon * H y, with
    # y the gradient and H y a Hessian-vector product; epsilon is assumed to
    # be defined in the enclosing scope.
    for l in ac.pi.logits_net:
        for x in l.parameters():
            y, = torch.autograd.grad(loss_pi, x,
                                     create_graph=True, retain_graph=True)
            w = torch.zeros(y.size(), requires_grad=True)
            g, = torch.autograd.grad(y, x, grad_outputs=w, create_graph=True)
            r, = torch.autograd.grad(g, w, grad_outputs=y, create_graph=False)
            x.grad = y - epsilon * r
    mpi_avg_grads(ac.pi)    # average grads across MPI processes
    pi_optimizer.step()

    # Value function learning
    for i in range(train_v_iters):
        vf_optimizer.zero_grad()
        loss_v = compute_loss_v(data)
        loss_v.backward()
        mpi_avg_grads(ac.v)    # average grads across MPI processes
        vf_optimizer.step()

    # Log changes from update
    kl, ent = pi_info['kl'], pi_info_old['ent']
    logger.store(LossPi=pi_l_old, LossV=v_l_old,
                 KL=kl, Entropy=ent,
                 DeltaLossPi=(loss_pi.item() - pi_l_old),
                 DeltaLossV=(loss_v.item() - v_l_old))
def update():
    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp']

        # Policy loss
        pi, logp = ac.pi(obs, act)
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret) ** 2).mean()

    data = buf.get()

    pi_l_old, pi_info_old = compute_loss_pi(data)
    pi_l_old = pi_l_old.item()
    v_l_old = compute_loss_v(data).item()

    # Train policy with multiple steps of gradient descent
    for i in range(train_pi_iters):
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        kl = mpi_avg(pi_info['kl'])
        if kl > 1.5 * target_kl:
            logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break
        loss_pi.backward()
        mpi_avg_grads(ac.pi)    # average grads across MPI processes
        pi_optimizer.step()

    logger.store(StopIter=i)

    # Value function learning
    for i in range(train_v_iters):
        vf_optimizer.zero_grad()
        loss_v = compute_loss_v(data)
        loss_v.backward()
        mpi_avg_grads(ac.v)    # average grads across MPI processes
        vf_optimizer.step()

    # Log changes from update
    kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
    logger.store(LossPi=pi_l_old, LossV=v_l_old,
                 KL=kl, Entropy=ent, ClipFrac=cf,
                 DeltaLossPi=(loss_pi.item() - pi_l_old),
                 DeltaLossV=(loss_v.item() - v_l_old))
def update():
    data = buf.get()

    # compute old pi distribution
    obs, act = data['obs'], data['act']
    with torch.no_grad():
        old_pi, _ = ac.pi(obs, act)

    pi_loss = compute_loss_pi(data)
    pi_l_old = pi_loss.item()
    v_l_old = compute_loss_v(data).item()

    grads = core.flat_grads(torch.autograd.grad(pi_loss, ac.pi.parameters()))

    # Core calculations for TRPO or NPG
    Hx = lambda v: hessian_vector_product(data, old_pi, v)
    x = core.conjugate_gradients(Hx, grads, cg_iters)

    alpha = torch.sqrt(2 * delta / (torch.matmul(x, Hx(x)) + EPS))

    old_params = core.get_flat_params_from(ac.pi)

    def set_and_eval(step):
        new_params = old_params - alpha * x * step
        core.set_flat_params_to(ac.pi, new_params)
        loss_pi, kl_loss = compute_kl_loss_pi(data, old_pi)
        return kl_loss.item(), loss_pi.item()

    if algo == 'npg':
        # npg has no backtracking or hard kl constraint enforcement
        kl, pi_l_new = set_and_eval(step=1.)
    elif algo == 'trpo':
        # trpo augments npg with backtracking line search, hard kl constraint
        for j in range(backtrack_iters):
            kl, pi_l_new = set_and_eval(step=backtrack_coeff**j)
            if kl <= delta and pi_l_new <= pi_l_old:
                logger.log('Accepting new params at step %d of line search.' % j)
                logger.store(BacktrackIters=j)
                break
            if j == backtrack_iters - 1:
                logger.log('Line search failed! Keeping old params.')
                logger.store(BacktrackIters=j)
                kl, pi_l_new = set_and_eval(step=0.)

    # Value function learning
    for i in range(train_v_iters):
        vf_optimizer.zero_grad()
        loss_v = compute_loss_v(data)
        loss_v.backward()
        mpi_avg_grads(ac.v)    # average grads across MPI processes
        vf_optimizer.step()

    # Log changes from update
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl,
                 DeltaLossPi=(pi_l_new - pi_l_old),
                 DeltaLossV=(loss_v.item() - v_l_old))
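# The TRPO/NPG variant above solves H x = g with conjugate gradients, where H
# is the Hessian of the mean KL to the old policy, applied via double
# backward. Minimal sketches of the two helpers it assumes; damping_coeff is
# an assumed hyperparameter (illustrative, not the canonical core module):
def hessian_vector_product(data, old_pi, v):
    # Compute H v without ever materializing H.
    _, kl = compute_kl_loss_pi(data, old_pi)
    g = core.flat_grads(torch.autograd.grad(kl, ac.pi.parameters(),
                                            create_graph=True))
    kl_v = (g * v).sum()
    hvp = core.flat_grads(torch.autograd.grad(kl_v, ac.pi.parameters()))
    return hvp + damping_coeff * v    # damping keeps the system well-conditioned

def conjugate_gradients(Avp, b, nsteps, residual_tol=1e-10):
    # Solve A x = b using only matrix-vector products Avp(p) = A p.
    x = torch.zeros_like(b)
    r, p = b.clone(), b.clone()
    rdotr = torch.dot(r, r)
    for _ in range(nsteps):
        z = Avp(p)
        alpha = rdotr / (torch.dot(p, z) + EPS)
        x += alpha * p
        r -= alpha * z
        new_rdotr = torch.dot(r, r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x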