Example #1
    def train(self, memory):
        """Train the actor based on sampled memory.
        Modifies the actor that was provided in __init__.

        Uses the TargetAlgorithm specified in __init__ to determine update targets,
        then calculates loss and natural gradient. Step size then depends on subclass (NPG or TRPO)."""
        self.actor.train()
        states = memory[0]
        actions = memory[1]

        # ----------------------------
        # step 1: get targets
        returns = self.target_alg.targets(memory)

        # ----------------------------
        # step 2: compute the loss gradient and the natural-gradient step direction
        # (the conjugate gradient below implicitly uses the Hessian of the KL)
        loss = self.get_loss(returns, states, actions)
        loss_grad = torch.autograd.grad(loss, self.actor.parameters())
        loss_grad = utils.flat_grad(loss_grad)
        step_dir = self.conjugate_gradient(states, loss_grad.data, nsteps=10)

        # ----------------------------
        # step 3: determine the step size and update the actor (NPG or TRPO)
        results = self.step(step_dir, states, actions, returns, loss, loss_grad)

        return results
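
The conjugate_gradient method called in step 2 is not shown in this example. Below is a minimal, self-contained sketch of how such a routine is commonly written for natural policy gradient methods, assuming a callable fvp(v) that returns the Fisher-vector product H @ v (as Example #2's fisher_vector_product does for a fixed batch of states); the names and defaults are illustrative, not taken from the original class.

import torch

def conjugate_gradient(fvp, b, nsteps=10, residual_tol=1e-10):
    # Iteratively solve H x = b, where the callable fvp(v) returns H @ v
    # (here H is the KL/Fisher Hessian).
    x = torch.zeros_like(b)
    r = b.clone()            # residual r = b - H x (x starts at zero)
    p = r.clone()            # initial search direction
    rdotr = torch.dot(r, r)
    for _ in range(nsteps):
        Hp = fvp(p)
        alpha = rdotr / torch.dot(p, Hp)
        x += alpha * p
        r -= alpha * Hp
        new_rdotr = torch.dot(r, r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x

Under that assumption, self.conjugate_gradient(states, loss_grad.data, nsteps=10) would reduce to conjugate_gradient(lambda v: self.fisher_vector_product(states, v), loss_grad.data, nsteps=10).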
Example #2
    def fisher_vector_product(self, states, p):
        """Return (H + 0.1 * I) @ p, where H is the Hessian of the KL divergence
        w.r.t. the actor parameters (i.e. the Fisher information matrix)."""
        p = p.detach()
        kl = self.actor.get_kl(states.float())
        kl_grad = torch.autograd.grad(kl, self.actor.parameters(), create_graph=True)
        kl_grad = utils.flat_grad(kl_grad)  # ~0 at the current parameters, but its graph is needed below

        kl_grad_p = (kl_grad * p).sum()
        kl_hessian_p = torch.autograd.grad(kl_grad_p, self.actor.parameters())
        kl_hessian_p = utils.flat_hessian(kl_hessian_p)

        return kl_hessian_p + 0.1 * p  # 0.1 * p adds damping for numerical stability
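
utils.flat_grad and utils.flat_hessian are assumed here to simply flatten the per-parameter tensors returned by torch.autograd.grad and concatenate them into a single vector; a minimal sketch under that assumption (note the TF-side utils.flat_grad in Example #3 has a different signature):

import torch

def flat_grad(grads):
    # Concatenate per-parameter gradient tensors into one flat vector.
    return torch.cat([g.reshape(-1) for g in grads])

def flat_hessian(hessians):
    # Identical flattening, applied to the tensors that make up the
    # Hessian-vector product.
    return torch.cat([h.reshape(-1) for h in hessians])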
Example #3
        valGradients, valVariables = zip(
            *optimizer.compute_gradients(stateValueLoss))
        valGradients, _ = tf.clip_by_global_norm(valGradients, args.grad_clip)
        svfOptimizationStep = optimizer.apply_gradients(
            zip(valGradients, valVariables))
    else:
        svfOptimizationStep = optimizer.minimize(stateValueLoss)

    # other ops: flat get/set of the policy parameters, Hessian-vector product,
    # and the flattened gradient of the surrogate loss
    policyParams = utils.get_vars(policyParamsScope)
    getPolicyParams = utils.flat_concat(policyParams)
    setPolicyParams = utils.assign_params_from_flat(policyParamsFlatten,
                                                    policyParams)

    d, HxOp = utils.hesian_vector_product(KLcontraint, policyParams)
    surrogateFlatLoss = utils.flat_grad(Lloss, policyParams)

    if args.damping_coef > 0:
        HxOp += args.damping_coef * d  # damping: compute (H + coef * I) @ d

    # tf session initialization
    init = tf.local_variables_initializer()
    init2 = tf.global_variables_initializer()
    sess.run([init, init2])

    nextObs = env.reset()
    nextDone = 0
    epLen = 0
    epTotalRew = 0
    epTotalTrainRews = deque(maxlen=args.test_episodes_with_noise)
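
The helpers utils.hesian_vector_product and utils.flat_grad used above are not shown. A rough TF1-style sketch of what they plausibly look like (function and argument names here are illustrative; the real utils module may differ), with flat_concat included so the snippet is self-contained:

import tensorflow as tf

def flat_concat(xs):
    # Flatten each tensor and concatenate into a single 1-D tensor.
    return tf.concat([tf.reshape(x, (-1,)) for x in xs], axis=0)

def flat_grad(f, params):
    # Flattened gradient of f w.r.t. params (the TF-side counterpart).
    return flat_concat(tf.gradients(f, params))

def hessian_vector_product(f, params):
    # Double-backward trick: H @ x = grad(grad(f) . x) w.r.t. params.
    g = flat_concat(tf.gradients(f, params))
    x = tf.placeholder(tf.float32, shape=g.shape)
    return x, flat_concat(tf.gradients(tf.reduce_sum(g * x), params))

With such a helper, d is the placeholder fed with a direction vector (as in Example #4's feed_dict={d: newDir, ...}) and HxOp evaluates the KL Hessian applied to that vector.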
Example #4
                                         feed_dict={
                                             d: newDir,
                                             logProbsAllPh: additionalInfos[0],
                                             obsPh: observations
                                         })
        else:
            Hx = lambda newDir: sess.run(
                HxOp,
                feed_dict={
                    d: newDir,
                    oldActionMeanPh: additionalInfos[0],
                    oldActionLogStdPh: additionalInfos[1],
                    obsPh: observations
                })

        grad = sess.run(utils.flat_grad(Lloss, policyParams),
                        feed_dict={
                            obsPh: observations,
                            aPh: actions,
                            advPh: advEst,
                            logProbSampPh: sampledLogProb
                        })  #, logProbsAllPh : allLogProbs})
        cjStart = time.time()
        newDir = utils.conjugate_gradients(Hx, grad, args.cg_iters)  # solve H x = g for the natural-gradient direction
        cjEnd = time.time()

        curDelta = utils.annealedNoise(args.delta, args.delta_final,
                                       args.epochs, e)
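        # In standard TRPO (assumed here), newDir would next be scaled by
        # sqrt(2 * curDelta / (newDir^T H newDir)) so the update stays inside the
        # KL trust region, then applied via a backtracking line search that
        # compares Lloss against the LlossOld value computed below.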
        LlossOld = sess.run(
            Lloss,
            feed_dict={