Example #1
 def arrayflatgrad(self, f, symmetric=True):
     # Gradient of every entry of the matrix f with respect to the flattened
     # parameters: Res[i, j] is a vector of length self.total_size.
     shape = f.shape + (self.total_size, )
     Res = U.torchify(np.zeros(shape))
     #assert shape[0]==shape[1]
     # Only the upper triangle (j >= i) is filled; the lower triangle stays zero.
     for i in range(shape[0]):
         for j in range(i, shape[0]):
             Res[i, j] = self.flatgrad(f[i, j], retain=True)
     return Res
Example #2
 def prob_predict(self, x):
     # Class probabilities for a single sample or a batch, with the network
     # temporarily switched to eval mode.
     self.eval()
     x = U.torchify(x)
     # Add a batch dimension if a single sample was passed.
     if len(x.shape) == len(self.input_shape):
         x.unsqueeze_(0)
     y = U.get(self.logsoftmax(x).squeeze().exp())
     self.train()
     return y
Example #3
File: cholesky.py Project: aghriss/TRHPO
 def __init__(self, k, alpha=1e-2):
     super(Cholesky, self).__init__()
     # k x k linear layer whose weight is frozen (requires_grad=False) and
     # set by update_weight() instead of by the optimizer.
     self.layer = nn.Linear(k, k, bias=False)
     self.k = k
     self.alpha = alpha
     self.layer.weight.requires_grad = False
     # Sig is initialized to the identity; min_eig is a length-50 queue
     # (U.queue helper).
     self.Sig = U.torchify(np.eye(k))
     self.min_eig = U.queue(50)
     self.update_weight()
Example #4
 def predict(self, x):
     # Forward pass with the network temporarily in eval mode; accepts a
     # single sample or a batch.
     self.eval()
     x = U.torchify(x)
     if len(x.shape) == len(self.input_shape):
         x.unsqueeze_(0)
     y = U.get(self.forward(x).squeeze())
     self.train()
     return y
Example #5
    def transform(self, state0):
        # Crop the raw frame, resize it to (size, size, 3) and rescale the
        # pixel values to [0, 1].
        state = state0[self.crop['up']:self.crop['down'],
                       self.crop['left']:self.crop['right'], :].astype(int)
        state = skimage.transform.resize(state.astype(float),
                                         (self.size, self.size, 3),
                                         mode="constant") / 255.0

        self.last_frame = state.copy()
        self.episode.append(self.last_frame)

        # Reorder the axes according to self.axis and pass the frame
        # through self.spin.
        state = state.astype(float).transpose(self.axis)
        return U.get(self.spin(U.torchify(state).unsqueeze(0))).squeeze()
Example #6
 def predict(self, x):
     # Same as predict in Example #4, but without toggling eval/train mode.
     x = U.torchify(x)
     if len(x.shape) == len(self.input_shape):
         x.unsqueeze_(0)
     return U.get(self.forward(x).squeeze())
Example #7
 def set_grad(self, d_theta):
     # Write the flat gradient vector back into each parameter's .grad,
     # slicing by the stored offsets self.idx and reshaping to self.shapes[i].
     assert d_theta.shape == (self.total_size, )
     for i, v in enumerate(self.variables()):
         v.grad = U.torchify(d_theta[self.idx[i]:self.idx[i + 1]].view(
             self.shapes[i])).detach()
Example #8
 def set(self, theta):
     # Overwrite every parameter with the corresponding slice of the flat
     # vector theta.
     assert theta.shape == (self.total_size, )
     for i, v in enumerate(self.variables()):
         v.data = U.torchify(theta[self.idx[i]:self.idx[i + 1]].view(
             self.shapes[i])).detach()
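
The two setters above rely on per-parameter offsets (self.idx), shapes (self.shapes) and a total size that the flattening helper computes elsewhere. Below is a minimal sketch of how those fields and a matching get() could be built for an ordinary torch.nn.Module; the class name FlatParams and its layout are assumptions for illustration, not the repository's actual flaten implementation.

import numpy as np
import torch


class FlatParams:
    def __init__(self, module):
        self.module = module
        self.shapes = [p.shape for p in module.parameters()]
        sizes = [int(np.prod(s)) for s in self.shapes]
        # idx[i]:idx[i + 1] is the slice of the flat vector that belongs
        # to the i-th parameter tensor.
        self.idx = np.cumsum([0] + sizes)
        self.total_size = int(self.idx[-1])

    def variables(self):
        return list(self.module.parameters())

    def get(self):
        # Concatenate every parameter into one flat vector.
        return torch.cat([p.data.view(-1) for p in self.variables()])

    def set(self, theta):
        assert theta.shape == (self.total_size, )
        for i, v in enumerate(self.variables()):
            v.data = theta[self.idx[i]:self.idx[i + 1]].view(
                self.shapes[i]).detach()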
Example #9
import numpy as np
import torch
import collections
from base.baseagent import BaseAgent
from core.console import Progbar
import core.math as m_utils
import core.utils as U
from Option import OptionTRPO
import core.console as C

path = self.path_generator.__next__()
self.oldpolicy.copy(self.policy)
for p in self.options:
    p.oldpolicy.copy(p.policy)

states = U.torchify(path["states"])
options = U.torchify(path["options"]).long()
actions = U.torchify(path["actions"]).long()
advantages = U.torchify(path["baseline"])
tdlamret = U.torchify(path["tdlamret"])
vpred = U.torchify(path["vf"])  # predicted value function before update
# standardized advantage function estimate
advantages = (advantages - advantages.mean()) / advantages.std()

losses = self.calculate_losses(states, options, actions, advantages)
kl = losses["meankl"]
optimization_gain = losses["gain"]

loss_grad = self.policy.flaten.flatgrad(optimization_gain, retain=True)
grad_kl = self.policy.flaten.flatgrad(kl, create=True, retain=True)

theta_before = self.policy.flaten.get()
self.log("Init param sum", theta_before.sum())
self.log("explained variance", (vpred - tdlamret).var() / tdlamret.var())

if np.allclose(loss_grad.detach().cpu().numpy(), 0, atol=1e-15):
Example #10
File: trpo.py Project: aghriss/RL
    def _train(self):

        # Prepare for rollouts
        # ----------------------------------------

        self.oldpolicy.copy(self.policy)

        path = self.path_generator.__next__()

        states = U.torchify(path["state"])
        actions = U.torchify(path["action"]).long()
        advantages = U.torchify(path["advantage"])
        tdlamret = U.torchify(path["tdlamret"])
        vpred = U.torchify(
            path["vf"])  # predicted value function before udpate
        advantages = (advantages - advantages.mean()) / advantages.std(
        )  # standardized advantage function estimate

        losses = self.calculate_losses(states, actions, advantages, tdlamret)
        kl = losses["meankl"]
        optimization_gain = losses["gain"]

        loss_grad = self.policy.flaten.flatgrad(optimization_gain, retain=True)
        grad_kl = self.policy.flaten.flatgrad(kl, create=True, retain=True)

        theta_before = self.policy.flaten.get()
        self.log("Init param sum", theta_before.sum())
        self.log("explained variance",
                 (vpred - tdlamret).var() / tdlamret.var())

        if np.allclose(loss_grad.detach().cpu().numpy(), 0, atol=1e-15):
            print("Got zero gradient. not updating")
        else:
            print("Conjugate Gradient", end="")
            start = time.time()
            stepdir = m_utils.conjugate_gradient(self.Fvp(grad_kl),
                                                 loss_grad,
                                                 cg_iters=self.cg_iters)
            elapsed = time.time() - start
            print(", Done in %.3f" % elapsed)
            self.log("Conjugate Gradient in s", elapsed)
            assert stepdir.sum() != float("Inf")
            shs = .5 * stepdir.dot(self.Fvp(grad_kl)(stepdir))
            lm = torch.sqrt(shs / self.max_kl)
            self.log("lagrange multiplier:", lm)
            self.log("gnorm:",
                     np.linalg.norm(loss_grad.cpu().detach().numpy()))
            fullstep = stepdir / lm
            expected_improve = loss_grad.dot(fullstep)
            surrogate_before = losses["surrogate"]
            stepsize = 1.0

            print("Line Search", end="")
            start = time.time()
            for _ in range(10):
                theta_new = theta_before + fullstep * stepsize
                self.policy.flaten.set(theta_new)
                losses = self.calculate_losses(states, actions, advantages,
                                               tdlamret)
                surr = losses["surrogate"]
                improve = surr - surrogate_before
                kl = losses["meankl"]
                if surr == float("Inf") or kl == float("Inf"):
                    print("Infinite value of losses")
                elif kl > self.max_kl:
                    print("Violated KL")
                elif improve < 0:
                    print("Surrogate didn't improve. shrinking step.")
                else:
                    print("Expected: %.3f Actual: %.3f" %
                          (expected_improve, improve))
                    print("Stepsize OK!")
                    self.log("Line Search", "OK")
                    break
                stepsize *= .5
            else:
                print("couldn't compute a good step")
                self.log("Line Search", "NOPE")
                self.policy.flaten.set(theta_before)
            elapsed = time.time() - start
            print(", Done in %.3f" % elapsed)
            self.log("Line Search in s", elapsed)
            self.log("KL", kl)
            self.log("Surrogate", surr)
        start = time.time()
        print("Value Function Update", end="")
        self.value_function.fit(states[::5],
                                tdlamret[::5],
                                batch_size=50,
                                epochs=self.vf_iters)
        elapsed = time.time() - start
        print(", Done in %.3f" % elapsed)
        self.log("Value Function Fitting in s", elapsed)
        self.log("TDlamret mean", tdlamret.mean())
        self.log("Last 50 rolls mean rew", np.mean(self.episodes_reward))
        self.log("Last 50 rolls mean len", np.mean(self.episodes_len))
        self.print()
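
The natural gradient step in Example #10 depends on m_utils.conjugate_gradient and on self.Fvp(grad_kl), which returns a callable mapping a vector to a Fisher-vector product (it is invoked again as self.Fvp(grad_kl)(stepdir)). A minimal conjugate gradient sketch compatible with that calling convention follows; it is an assumption for illustration, not the project's core.math code.

import torch


def conjugate_gradient(fvp, g, cg_iters=10, residual_tol=1e-10):
    # Approximately solve F x = g, where fvp(v) returns F v.
    g = g.detach()
    x = torch.zeros_like(g)
    r = g.clone()  # residual g - F x (x starts at zero)
    p = g.clone()  # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        z = fvp(p).detach()
        alpha = rdotr / p.dot(z)
        x += alpha * p
        r -= alpha * z
        new_rdotr = r.dot(r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x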
Example #11
    def train(self):

        self.progbar.__init__(self.memory_min)
        while (self.memory.size < self.memory_min):
            self.path_generator.__next__()

        while (self.done < self.train_steps):

            to_log = 0
            self.progbar.__init__(self.update_double)
            old_theta = self.Q.flaten.get()
            self.target_Q.copy(self.Q)
            while to_log < self.update_double:

                self.path_generator.__next__()

                rollout = self.memory.sample(self.batch_size)
                state_batch = U.torchify(rollout["state"])
                action_batch = U.torchify(rollout["action"]).long()
                reward_batch = U.torchify(rollout["reward"])

                non_final_batch = U.torchify(1 - rollout["terminated"])
                next_state_batch = U.torchify(rollout["next_state"])

                #current_q = self.Q(state_batch)

                current_q = self.Q(state_batch).gather(
                    1, action_batch.unsqueeze(1)).view(-1)
                _, a_prime = self.Q(next_state_batch).max(1)

                # Compute the target of the current Q values
                next_max_q = self.target_Q(next_state_batch).gather(
                    1, a_prime.unsqueeze(1)).view(-1)
                target_q = reward_batch + self.discount * non_final_batch * next_max_q.squeeze(
                )

                # Compute loss
                loss = self.Q.loss(current_q, target_q.detach(
                ))  # loss = self.Q.total_loss(current_q, target_q)

                # Optimize the model
                self.Q.optimize(loss, clip=True)

                self.progbar.add(self.batch_size,
                                 values=[("Loss", U.get(loss))])

                to_log += self.batch_size

            self.target_Q.copy(self.Q)
            new_theta = self.Q.flaten.get()

            self.log("Delta Theta L1",
                     U.get((new_theta - old_theta).abs().mean()))
            self.log("Av 50ep  rew", np.mean(self.past_rewards))
            self.log("Max 50ep rew", np.max(self.past_rewards))
            self.log("Min 50ep rew", np.min(self.past_rewards))
            self.log("Epsilon", self.eps)
            self.log("Done", self.done)
            self.log("Total", self.train_steps)
            self.target_Q.copy(self.Q)
            self.print()
            #self.play()
            self.save()
Example #12
File: Gate.py Project: aghriss/TRHPO
    def _train(self, path):

        states = U.torchify(path["states"])
        options = U.torchify(path["options"]).long()
        actions = U.torchify(path["actions"]).long()
        advantages = U.torchify(path["baseline"])
        tdlamret = U.torchify(path["tdlamret"])
        vpred = U.torchify(
            path["vf"])  # predicted value function before udpate
        #advantages = (advantages - advantages.mean()) / advantages.std() # standardized advantage function estimate

        losses = self.calculate_losses(states, options, actions, advantages)
        kl = losses["gate_meankl"]
        optimization_gain = losses["gain"]

        loss_grad = self.policy.flaten.flatgrad(optimization_gain, retain=True)
        grad_kl = self.policy.flaten.flatgrad(kl, create=True, retain=True)

        theta_before = self.policy.flaten.get()
        self.log("Init param sum", theta_before.sum())
        self.log("explained variance",
                 (vpred - tdlamret).var() / tdlamret.var())

        if np.allclose(loss_grad.detach().cpu().numpy(), 0, atol=1e-19):
            print("Got zero gradient. not updating")
        else:
            with C.timeit("Conjugate Gradient"):
                stepdir = m_utils.conjugate_gradient(self.Fvp(grad_kl),
                                                     loss_grad,
                                                     cg_iters=self.cg_iters)

            self.log("Conjugate Gradient in s", C.elapsed)
            assert stepdir.sum() != float("Inf")
            shs = .5 * stepdir.dot(self.Fvp(grad_kl)(stepdir))
            lm = torch.sqrt(shs / self.gate_max_kl)
            self.log("lagrange multiplier:", lm)
            self.log("gnorm:",
                     np.linalg.norm(loss_grad.cpu().detach().numpy()))
            fullstep = stepdir / lm
            expected_improve = loss_grad.dot(fullstep)
            surrogate_before = losses["gain"].detach()

            with C.timeit("Line Search"):
                stepsize = 1.0
                for i in range(10):
                    theta_new = theta_before + fullstep * stepsize
                    self.policy.flaten.set(theta_new)
                    surr = losses["surr_get"]()
                    improve = surr - surrogate_before
                    kl = losses["KL_gate_get"]()
                    if surr == float("Inf") or kl == float("Inf"):
                        C.warning("Infinite value of losses")
                    elif kl > self.gate_max_kl:
                        C.warning("Violated KL")
                    elif improve < 0:
                        stepsize *= self.ls_step
                    else:
                        self.log("Line Search", "OK")
                        break
                else:
                    improve = 0
                    self.log("Line Search", "NOPE")
                    self.policy.flaten.set(theta_before)

            for op in self.options:
                losses["gain"] = losses["surr_get"](grad=True)
                op.train(states, options, actions, advantages, tdlamret,
                         losses)

            surr = losses["surr_get"]()
            improve = surr - surrogate_before
            self.log("Expected", expected_improve)
            self.log("Actual", improve)
            self.log("Line Search in s", C.elapsed)
            self.log("LS Steps", i)
            self.log("KL", kl)
            self.log("MI", -losses["MI"])
            self.log("MI improve", -losses["MI_get"]()[0] + losses["MI"])
            self.log("Surrogate", surr)
            self.log("Gate KL", losses["KL_gate_get"]())
            self.log("HRL KL", losses["KL_get"]())
            self.log("TDlamret mean", tdlamret.mean())
            del (improve, surr, kl)
        self.log("Last %i rolls mean rew" % len(self.episodes_reward),
                 np.mean(self.episodes_reward))
        self.log("Last %i rolls mean len" % len(self.episodes_len),
                 np.mean(self.episodes_len))
        del (losses, states, options, actions, advantages, tdlamret, vpred,
             optimization_gain, loss_grad, grad_kl)
        for _ in range(10):
            gc.collect()
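
Both TRPO-style updates above also assume a Fisher-vector product helper: self.Fvp(grad_kl) takes the flat gradient of the mean KL (built with create=True so its graph is kept) and returns a function computing a damped Hessian-vector product of the KL. A rough sketch of that construction follows; the name make_fvp and the damping value are illustrative assumptions, not the repository's code.

import torch


def make_fvp(flat_grad_kl, flaten, damping=1e-2):
    # flat_grad_kl: flat gradient of the mean KL, built with create=True.
    # flaten: the flattening helper that provides flatgrad().
    def fvp(v):
        # Differentiating (grad KL . v) once more yields the Hessian-vector
        # product of the KL, which equals a Fisher-vector product.
        gvp = (flat_grad_kl * v.detach()).sum()
        hvp = flaten.flatgrad(gvp, retain=True)
        return hvp + damping * v
    return fvp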