def train(self, memory):
    """Train the actor based on sampled memory.

    Modifies the actor that was provided in __init__. Uses the
    TargetAlgorithm specified in __init__ to determine update targets,
    then calculates loss and natural gradient. Step size then depends
    on subclass (NPG or TRPO).
    """
    self.actor.train()
    states = memory[0]
    actions = memory[1]

    # ----------------------------
    # step 1: get targets
    returns = self.target_alg.targets(memory)

    # ----------------------------
    # step 2: get gradient of loss, then the natural-gradient direction
    # via conjugate gradient (the Hessian of the KL enters implicitly
    # through fisher_vector_product)
    loss = self.get_loss(returns, states, actions)
    loss_grad = torch.autograd.grad(loss, self.actor.parameters())
    loss_grad = utils.flat_grad(loss_grad)
    step_dir = self.conjugate_gradient(states, loss_grad.data, nsteps=10)

    # ----------------------------
    # step 3: get step size and update actor
    results = self.step(step_dir, states, actions, returns, loss, loss_grad)
    return results
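# A minimal sketch of self.conjugate_gradient, which train() calls but this
# excerpt does not show. Assumption: the standard TRPO formulation, solving
# H x = g for the step direction with H applied implicitly through
# fisher_vector_product. The signature matches the call site above; the body
# is an assumption, not the repo's actual implementation.
def conjugate_gradient(self, states, b, nsteps, residual_tol=1e-10):
    x = torch.zeros_like(b)
    r = b.clone()  # residual r = b - H x (x starts at zero)
    p = b.clone()  # search direction
    rdotr = torch.dot(r, r)
    for _ in range(nsteps):
        Hp = self.fisher_vector_product(states, p)
        alpha = rdotr / torch.dot(p, Hp)
        x += alpha * p
        r -= alpha * Hp
        new_rdotr = torch.dot(r, r)
        if new_rdotr < residual_tol:
            break
        # update search direction, keeping it H-conjugate to previous ones
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x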
def fisher_vector_product(self, states, p):
    p = p.detach()  # detach() is not in-place; its result must be reassigned
    kl = self.actor.get_kl(states.float())
    kl_grad = torch.autograd.grad(kl, self.actor.parameters(), create_graph=True)
    # kl_grad == 0 at the current policy (KL is measured against a detached
    # copy of itself); only its derivative, the Fisher matrix, is nonzero
    kl_grad = utils.flat_grad(kl_grad)

    kl_grad_p = (kl_grad * p).sum()
    kl_hessian_p = torch.autograd.grad(kl_grad_p, self.actor.parameters())
    kl_hessian_p = utils.flat_hessian(kl_hessian_p)

    return kl_hessian_p + 0.1 * p  # 0.1 * p is a damping term for stability
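# The methods above lean on utils.flat_grad and utils.flat_hessian, which are
# not shown in this excerpt. A minimal sketch, assuming they simply
# concatenate per-parameter tensors into one 1-D vector (torch is assumed
# imported as in the surrounding module; the bodies are assumptions):
def flat_grad(grads):
    """Flatten a tuple of per-parameter gradients into a single vector."""
    return torch.cat([g.view(-1) for g in grads])

def flat_hessian(hessians):
    """Same flattening for Hessian-vector-product outputs; contiguous()
    guards against non-contiguous views before reshaping."""
    return torch.cat([h.contiguous().view(-1) for h in hessians])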
    valGradients, valVariables = zip(*optimizer.compute_gradients(stateValueLoss))
    valGradients, _ = tf.clip_by_global_norm(valGradients, args.grad_clip)
    svfOptimizationStep = optimizer.apply_gradients(zip(valGradients, valVariables))
else:
    svfOptimizationStep = optimizer.minimize(stateValueLoss)

# other ops
policyParams = utils.get_vars(policyParamsScope)
getPolicyParams = utils.flat_concat(policyParams)
setPolicyParams = utils.assign_params_from_flat(policyParamsFlatten, policyParams)

d, HxOp = utils.hesian_vector_product(KLcontraint, policyParams)
surrogateFlatLoss = utils.flat_grad(Lloss, policyParams)

if args.damping_coef > 0:
    HxOp += args.damping_coef * d

# tf session initialization
init = tf.initialize_local_variables()
init2 = tf.initialize_all_variables()
sess.run([init, init2])

nextObs = env.reset()
nextDone = 0
epLen = 0
epTotalRew = 0
epTotalTrainRews = deque(maxlen=args.test_episodes_with_noise)
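# utils.hesian_vector_product is called above but defined elsewhere (name kept
# as spelled at the call site). A minimal sketch of what such a TF1 helper
# typically builds, under the assumption that it returns a flat placeholder d
# and a symbolic op computing H @ d via two gradient passes; the body is an
# assumption, not the repo's actual implementation.
def hesian_vector_product(f, params):
    grads = tf.gradients(f, params)
    flat_grads = tf.concat([tf.reshape(g, [-1]) for g in grads], axis=0)
    # placeholder for the vector to multiply against, fed at run time
    d = tf.placeholder(tf.float32, shape=flat_grads.shape)
    grad_dot_d = tf.reduce_sum(flat_grads * d)
    # differentiating grad . d a second time yields the Hessian-vector product
    hvp = tf.gradients(grad_dot_d, params)
    flat_hvp = tf.concat([tf.reshape(h, [-1]) for h in hvp], axis=0)
    return d, flat_hvp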
        feed_dict={
            d: newDir,
            logProbsAllPh: additionalInfos[0],
            obsPh: observations
        })
else:
    Hx = lambda newDir: sess.run(
        HxOp,
        feed_dict={
            d: newDir,
            oldActionMeanPh: additionalInfos[0],
            oldActionLogStdPh: additionalInfos[1],
            obsPh: observations
        })

# gradient of the surrogate loss at the current policy parameters
# (surrogateFlatLoss built above is the same op and could be reused here)
grad = sess.run(utils.flat_grad(Lloss, policyParams),
                feed_dict={
                    obsPh: observations,
                    aPh: actions,
                    advPh: advEst,
                    logProbSampPh: sampledLogProb
                })  # , logProbsAllPh: allLogProbs

# solve H x = g for the step direction
cjStart = time.time()
newDir = utils.conjugate_gradients(Hx, grad, args.cg_iters)
cjEnd = time.time()

# anneal the trust-region radius delta over training
curDelta = utils.annealedNoise(args.delta, args.delta_final, args.epochs, e)

LlossOld = sess.run(
    Lloss,
    feed_dict={
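# utils.annealedNoise is used above to shrink the trust-region radius delta
# over training. A minimal sketch, assuming a linear schedule from `start`
# to `final` across `total_epochs`; the actual schedule in utils may differ.
def annealedNoise(start, final, total_epochs, epoch):
    frac = min(float(epoch) / total_epochs, 1.0)
    return start + frac * (final - start)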