def update():
    inputs = {k: v for k, v in zip(all_phs, buf.get())}
    pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

    # Training
    for i in range(train_pi_iters):
        _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
        kl = mpi_avg(kl)
        if kl > 1.5 * target_kl:
            logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break
    logger.store(StopIter=i)

    for _ in range(train_v_iters):
        sess.run(train_v, feed_dict=inputs)

    # Log changes from update
    pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac],
                                         feed_dict=inputs)
    logger.store(LossPi=pi_l_old, LossV=v_l_old,
                 KL=kl, Entropy=ent, ClipFrac=cf,
                 DeltaLossPi=(pi_l_new - pi_l_old),
                 DeltaLossV=(v_l_new - v_l_old))
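For context, a hedged sketch of how the TF1 loss and diagnostic tensors consumed by update() might be defined at graph-construction time; logp, logp_old_ph, adv_ph, ret_ph, v, and clip_ratio are assumptions rather than part of the original listing.

import tensorflow as tf

# PPO-clip surrogate objective (hypothetical graph fragment)
ratio = tf.exp(logp - logp_old_ph)              # pi(a|s) / pi_old(a|s)
min_adv = tf.where(adv_ph > 0,
                   (1 + clip_ratio) * adv_ph,
                   (1 - clip_ratio) * adv_ph)
pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
v_loss = tf.reduce_mean((ret_ph - v) ** 2)

# Diagnostics
approx_kl = tf.reduce_mean(logp_old_ph - logp)  # sample estimate of KL
approx_ent = tf.reduce_mean(-logp)              # sample estimate of entropy
clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))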
def update(self, data):
    """Run gradient descent on the actor and critic.

    Args:
        data (dict): batch of agent-environment information

    Returns:
        Policy loss, value loss, KL-divergence, entropy, and clip fraction.
    """
    pi_l_old, pi_info_old = self.compute_loss_pi(data)
    pi_l_old = pi_l_old.item()
    v_l_old = self.compute_loss_v(data).item()

    # Train policy with multiple steps of gradient descent,
    # stopping early if the KL-divergence grows too large
    for i in range(self.train_pi_iters):
        self.pi_optimizer.zero_grad()
        loss_pi, pi_info = self.compute_loss_pi(data)
        kl = mpi_avg(pi_info["kl"])
        if kl > 1.5 * self.target_kl:
            print(f"Early stopping at step {i} due to reaching max kl")
            break
        loss_pi.backward()
        mpi_avg_grads(self.ac.pi)  # average grads across MPI processes
        self.pi_optimizer.step()

    # Value function learning
    for i in range(self.train_v_iters):
        self.vf_optimizer.zero_grad()
        loss_v = self.compute_loss_v(data)
        loss_v.backward()
        mpi_avg_grads(self.ac.v)  # average grads across MPI processes
        self.vf_optimizer.step()

    kl, ent, cf = pi_info["kl"], pi_info_old["ent"], pi_info["cf"]
    return pi_l_old, v_l_old, kl, ent, cf
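To make the loop above self-contained, here is a minimal sketch of a compute_loss_pi method implementing the standard PPO-clip objective; self.ac, self.clip_ratio, and the batch keys obs/act/adv/logp are assumptions about the surrounding agent class.

import torch

def compute_loss_pi(self, data):
    obs, act, adv, logp_old = data["obs"], data["act"], data["adv"], data["logp"]

    # Probability ratio pi(a|s) / pi_old(a|s) and the clipped surrogate objective
    pi, logp = self.ac.pi(obs, act)
    ratio = torch.exp(logp - logp_old)
    clip_adv = torch.clamp(ratio, 1 - self.clip_ratio, 1 + self.clip_ratio) * adv
    loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

    # Diagnostics used for early stopping and logging
    approx_kl = (logp_old - logp).mean().item()
    ent = pi.entropy().mean().item()
    clipped = ratio.gt(1 + self.clip_ratio) | ratio.lt(1 - self.clip_ratio)
    clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
    pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

    return loss_pi, pi_info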
def mpi_avg_grads(module):
    """Average contents of gradient buffers across MPI processes."""
    if num_procs() == 1:
        return
    for p in module.parameters():
        p_grad_numpy = p.grad.numpy()   # numpy view of tensor data
        avg_p_grad = mpi_avg(p.grad)
        p_grad_numpy[:] = avg_p_grad[:]
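A hedged sketch of the mpi_avg and num_procs helpers that mpi_avg_grads relies on, assuming mpi4py; the real helpers may differ in detail.

import numpy as np
from mpi4py import MPI

def num_procs():
    """Number of active MPI processes."""
    return MPI.COMM_WORLD.Get_size()

def mpi_avg(x):
    """Average a scalar or array over MPI processes."""
    x, scalar = ([x], True) if np.isscalar(x) else (x, False)
    x = np.asarray(x, dtype=np.float32)
    buff = np.zeros_like(x)
    MPI.COMM_WORLD.Allreduce(x, buff, op=MPI.SUM)  # elementwise sum across ranks
    avg = buff / num_procs()
    return avg[0] if scalar else avg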
def update():
    # Prepare hessian func, gradient eval
    inputs = {k: v for k, v in zip(all_phs, buf.get())}
    Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x}))
    g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss], feed_dict=inputs)
    g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old)

    # Core calculations for TRPO or NPG
    x = cg(Hx, g)
    alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS))
    old_params = sess.run(get_pi_params)

    def set_and_eval(step):
        sess.run(set_pi_params, feed_dict={v_ph: old_params - alpha * x * step})
        return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs))

    if algo == 'npg':
        # npg has no backtracking or hard kl constraint enforcement
        kl, pi_l_new = set_and_eval(step=1.)

    elif algo == 'trpo':
        # trpo augments npg with backtracking line search, hard kl
        for j in range(backtrack_iters):
            kl, pi_l_new = set_and_eval(step=backtrack_coeff**j)
            if kl <= delta and pi_l_new <= pi_l_old:
                logger.log('Accepting new params at step %d of line search.' % j)
                logger.store(BacktrackIters=j)
                break

            if j == backtrack_iters - 1:
                logger.log('Line search failed! Keeping old params.')
                logger.store(BacktrackIters=j)
                kl, pi_l_new = set_and_eval(step=0.)

    # Value function updates
    for _ in range(train_v_iters):
        sess.run(train_vf, feed_dict=inputs)
    v_l_new = sess.run(v_loss, feed_dict=inputs)

    # Log changes from update
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl,
                 DeltaLossPi=(pi_l_new - pi_l_old),
                 DeltaLossV=(v_l_new - v_l_old))
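The x = cg(Hx, g) step above solves Hx = g using only Hessian-vector products, never forming the Hessian explicitly. A minimal sketch of such a conjugate gradient solver follows; cg_iters and residual_tol are assumed hyperparameters.

import numpy as np

def cg(Ax, b, cg_iters=10, residual_tol=1e-10):
    """Approximately solve Ax(x) = b via conjugate gradient,
    given only a function computing matrix-vector products with A."""
    x = np.zeros_like(b)
    r = b.copy()              # residual b - A x; x starts at zero, so r = b
    p = r.copy()              # current search direction
    r_dot_old = np.dot(r, r)
    for _ in range(cg_iters):
        z = Ax(p)
        alpha = r_dot_old / (np.dot(p, z) + 1e-8)  # same role as EPS in update()
        x += alpha * p
        r -= alpha * z
        r_dot_new = np.dot(r, r)
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
        if r_dot_old < residual_tol:
            break
    return x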
def update():
    data = buf.get()

    pi_l_old, pi_info_old = compute_loss_pi(data)
    pi_l_old = pi_l_old.item()
    v_l_old = compute_loss_v(data).item()

    # Train policy with multiple steps of gradient descent
    for i in range(train_pi_iters):
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        kl = mpi_avg(pi_info['kl'])
        if kl > 1.5 * target_kl:
            logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break
        loss_pi.backward()
        mpi_avg_grads(ac.pi)    # average grads across MPI processes
        pi_optimizer.step()
    logger.store(StopIter=i)

    # Value function learning
    for i in range(train_v_iters):
        vf_optimizer.zero_grad()
        loss_v = compute_loss_v(data)
        loss_v.backward()
        mpi_avg_grads(ac.v)     # average grads across MPI processes
        vf_optimizer.step()

    # Log changes from update
    kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
    logger.store(LossPi=pi_l_old, LossV=v_l_old,
                 KL=kl, Entropy=ent, ClipFrac=cf,
                 DeltaLossPi=(loss_pi.item() - pi_l_old),
                 DeltaLossV=(loss_v.item() - v_l_old))
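For completeness, a minimal sketch of the value loss assumed by the loop above: a mean-squared error between predicted values and the returns in the batch; ac and the batch keys obs/ret are assumptions.

def compute_loss_v(data):
    obs, ret = data['obs'], data['ret']
    return ((ac.v(obs) - ret) ** 2).mean()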
def average_gradients(param_groups):
    """Average gradient buffers across MPI processes,
    operating on optimizer param groups."""
    for param_group in param_groups:
        for p in param_group['params']:
            if p.requires_grad and p.grad is not None:
                # Round-trip through numpy to average, then copy back in place
                p.grad.data.copy_(torch.Tensor(mpi_avg(p.grad.data.numpy())))
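A hedged usage sketch for average_gradients; vf_optimizer, compute_loss_v, and data are assumed from a surrounding training loop like the ones above.

# Average gradient buffers across processes after backward()
# and before the optimizer step
vf_optimizer.zero_grad()
loss_v = compute_loss_v(data)
loss_v.backward()
average_gradients(vf_optimizer.param_groups)
vf_optimizer.step()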