def update():
    # Prepare hessian func, gradient eval
    inputs = {k: v for k, v in zip(all_phs, buf.get())}
    Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x}))
    g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss], feed_dict=inputs)
    g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old)

    # Core calculations for TRPO or NPG
    x = cg(Hx, g)
    alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS))
    old_params = sess.run(get_pi_params)

    def set_and_eval(step):
        sess.run(set_pi_params, feed_dict={v_ph: old_params - alpha * x * step})
        return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs))

    if algo == 'npg':
        # npg has no backtracking or hard kl constraint enforcement
        kl, pi_l_new = set_and_eval(step=1.)

    elif algo == 'trpo':
        # trpo augments npg with backtracking line search, hard kl
        for j in range(backtrack_iters):
            kl, pi_l_new = set_and_eval(step=backtrack_coeff**j)
            if kl <= delta and pi_l_new <= pi_l_old:
                logger.log('Accepting new params at step %d of line search.' % j)
                logger.store(BacktrackIters=j)
                break

            if j == backtrack_iters - 1:
                logger.log('Line search failed! Keeping old params.')
                logger.store(BacktrackIters=j)
                kl, pi_l_new = set_and_eval(step=0.)

    # Value function updates
    for _ in range(train_v_iters):
        sess.run(train_vf, feed_dict=inputs)
    v_l_new = sess.run(v_loss, feed_dict=inputs)

    # Log changes from update
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl,
                 DeltaLossPi=(pi_l_new - pi_l_old),
                 DeltaLossV=(v_l_new - v_l_old))
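# The update above assumes a conjugate gradient helper cg(Hx, g) that
# approximately solves Hx = g, so that the step direction x combined with
# alpha = sqrt(2*delta / x^T H x) places a full step on the KL-constraint
# boundary. A minimal sketch of a standard CG solver follows; cg_iters and
# EPS are illustrative defaults, not necessarily the values used here.
import numpy as np

EPS = 1e-8

def cg(Ax, b, cg_iters=10):
    """Approximately solve Ax(x) = b via conjugate gradient, starting from x = 0."""
    x = np.zeros_like(b)
    r = b.copy()            # residual; equals b because the initial x is zero
    p = r.copy()            # search direction
    r_dot_old = np.dot(r, r)
    for _ in range(cg_iters):
        z = Ax(p)
        alpha = r_dot_old / (np.dot(p, z) + EPS)
        x += alpha * p
        r -= alpha * z
        r_dot_new = np.dot(r, r)
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return x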
def update():
    inputs = {k: v for k, v in zip(all_phs, buf.get())}
    pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

    # Training: policy gradient steps with early stopping on KL
    for i in range(train_pi_iters):
        _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
        kl = mpi_avg(kl)
        if kl > 1.5 * target_kl:
            logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break
    logger.store(StopIter=i)

    # Value function learning
    for _ in range(train_v_iters):
        sess.run(train_v, feed_dict=inputs)

    # Log changes from update
    pi_l_new, v_l_new, kl, cf = sess.run(
        [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf,
                 DeltaLossPi=(pi_l_new - pi_l_old),
                 DeltaLossV=(v_l_new - v_l_old))
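# The tensors consumed above (pi_loss, v_loss, approx_kl, approx_ent, clipfrac)
# are built once in the computation graph. Below is a minimal sketch of the
# standard PPO-clip objective and diagnostics, assuming TF1-style graph code as
# in the snippet above; the tensor names (logp, logp_old_ph, adv_ph, ret_ph, v,
# clip_ratio) are illustrative, not necessarily those used in this codebase.
import tensorflow.compat.v1 as tf

def ppo_objectives(logp, logp_old_ph, adv_ph, ret_ph, v, clip_ratio):
    ratio = tf.exp(logp - logp_old_ph)          # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0,
                       (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))   # clipped surrogate
    v_loss = tf.reduce_mean((ret_ph - v) ** 2)                       # value regression

    # Diagnostics: sample estimates of KL and entropy, and clipping frequency
    approx_kl = tf.reduce_mean(logp_old_ph - logp)
    approx_ent = tf.reduce_mean(-logp)
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))
    return pi_loss, v_loss, approx_kl, approx_ent, clipfrac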
def update(self, dataX, dataZ, mean_x, std_x, mean_y, std_y, mean_z, std_z,
           mean_r, std_r, nEpoch, train_pi_iters, train_v_iters, target_kl, logger):
    # Refresh the normalization statistics used by the dynamics model
    self.update_mean_std(mean_x, mean_y, mean_z, mean_r, std_x, std_y, std_z, std_r)

    training_loss_list = []
    range_of_indeces = np.arange(dataX.shape[0])
    nData = dataX.shape[0]
    batchsize = int(self.batchsize)

    # Dynamics model training
    for i in range(nEpoch):
        avg_loss = 0
        num_batches = 0
        indeces = npr.choice(range_of_indeces, size=(dataX.shape[0],), replace=False)
        for batch in range(int(math.floor(nData / batchsize))):
            # walk through the randomly reordered "old data"
            dataX_batch = dataX[indeces[batch * batchsize:(batch + 1) * batchsize], :]
            dataZ_batch = dataZ[indeces[batch * batchsize:(batch + 1) * batchsize], :]

            # one iteration of feedforward training
            _, loss, output, true_output = self.sess.run(
                [self.train_step, self.mse_, self.curr_nn_output, self.z_],
                feed_dict={self.x_: dataX_batch, self.z_: dataZ_batch})
            training_loss_list.append(loss)
            avg_loss += loss
            num_batches += 1
    # average dynamics loss over the most recent epoch
    dyn_avg_loss = avg_loss / num_batches

    # Policy (PPO) update on the on-policy buffer
    inputs = {k: v for k, v in zip(self.all_phs, self.buf.get())}
    pi_l_old, v_l_old, ent = self.sess.run(
        [self.pi_loss, self.v_loss, self.approx_ent], feed_dict=inputs)

    for i in range(train_pi_iters):
        _, kl = self.sess.run([self.train_pi, self.approx_kl], feed_dict=inputs)
        kl = mpi_avg(kl)
        if kl > 1.5 * target_kl:
            logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break
    logger.store(StopIter=i)

    for _ in range(train_v_iters):
        self.sess.run(self.train_v, feed_dict=inputs)

    # Log changes from update
    pi_l_new, v_l_new, kl, cf = self.sess.run(
        [self.pi_loss, self.v_loss, self.approx_kl, self.clipfrac], feed_dict=inputs)
    logger.store(LossPi=pi_l_old, LossV=v_l_old, LossDyn=dyn_avg_loss, KL=kl,
                 Entropy=ent, ClipFrac=cf,
                 DeltaLossPi=(pi_l_new - pi_l_old),
                 DeltaLossV=(v_l_new - v_l_old))

    return training_loss_list, dyn_avg_loss, pi_l_old, v_l_old
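# update() expects per-dimension mean/std statistics for states, actions,
# state-deltas, and rewards. A hypothetical helper sketching one plausible way
# to produce those arguments from the aggregated datasets (dataX, dataY, dataZ,
# dataR are illustrative names; the actual statistics are defined wherever
# update_mean_std is implemented):
import numpy as np

def compute_normalization(dataX, dataY, dataZ, dataR):
    """Return mean_x, std_x, mean_y, std_y, mean_z, std_z, mean_r, std_r."""
    stats = []
    for data in (dataX, dataY, dataZ, dataR):
        mean = np.mean(data, axis=0)
        std = np.std(data, axis=0) + 1e-8   # avoid division by zero
        stats.extend([mean, std])
    return stats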