Пример #1
0
class KSCGDModel(object):
    """Kernel value-function learner driven by a running average of the TD error.

    The value function ``V`` lives in a ``KernelRepresentation`` with a single
    output. Every ``train`` call folds the current Bellman error into the
    running estimate ``y`` and uses ``y`` (not the raw error) to weight the
    kernel points that are appended.
    """

    def __init__(self, stateCount, config):
        """Read all hyper-parameters from ``config``.

        Args:
            stateCount: Forwarded to ``KernelRepresentation`` as its first
                argument (presumably the state dimension -- TODO confirm).
            config: Object exposing ``getfloat`` and accepted by
                ``KernelRepresentation`` and ``ScheduledParameter``.
        """
        self.V = KernelRepresentation(stateCount, 1, config)
        # Learning rate
        self.eta = ScheduledParameter('LearningRate', config)
        # Regularization
        self.lossL = config.getfloat('Regularization', 1e-6)
        # Representation error budget (scheduled, stepped in train)
        self.eps = ScheduledParameter('RepresentationError', config)
        # Reward discount
        self.gamma = config.getfloat('RewardDiscount')

        # TD-loss expectation approximation rate
        self.beta = ScheduledParameter('ExpectationRate', config)
        # Running estimate of our expected TD-loss
        self.y = 0.

    def bellman_error(self, s, a, r, s_):
        """Return the TD error of one transition.

        ``a`` is accepted but not used. ``s_ is None`` marks a terminal
        transition, in which case no bootstrap term is added.
        """
        if s_ is None:
            return r - self.V(s)
        return r + self.gamma * self.V(s_) - self.V(s)

    def model_error(self):
        """Return the regularization penalty ``0.5 * lossL * V.normsq()``."""
        return 0.5 * self.lossL * self.V.normsq()

    def train(self, step, sample):
        """Apply one update from a single ``(s, a, r, s_)`` sample.

        Args:
            step: Value handed to each scheduled parameter's ``step``.
            sample: 4-tuple ``(s, a, r, s_)``; ``s_`` is ``None`` when the
                transition is terminal.

        Returns:
            Tuple ``(loss, model order after pruning, eta, beta)`` where the
            loss is recomputed after the update and includes ``model_error``.
        """
        self.eta.step(step)
        self.eps.step(step)
        self.beta.step(step)
        # Unpack sample
        s, a, r, s_ = sample
        # Compute error
        delta = self.bellman_error(s, a, r, s_)
        # Running average
        self.y += self.beta.value * (delta - self.y)
        # Gradient step
        self.V.shrink(1. - self.eta.value * self.lossL)
        if s_ is None:
            # Terminal: only the current state receives a weight
            self.V.append(s, -self.eta.value * self.y * np.array([[-1.]]))
        else:
            # Weight -1 on the current state, gamma on the successor
            W = np.zeros((2, 1))
            W[0] = -1
            W[1] = self.gamma
            self.V.append(np.vstack((s, s_)), -self.eta.value * self.y * W)
        # Prune, then read the resulting dictionary size
        self.V.prune(self.eps.value * self.eta.value**2)
        modelOrder_ = len(self.V.D)
        # Compute new error
        loss = 0.5 * self.bellman_error(s, a, r, s_)**2 + self.model_error()
        return (float(loss), float(modelOrder_), self.eta.value,
                self.beta.value)

    @property
    def metrics_names(self):
        """Labels matching the tuple returned by ``train``."""
        return ('Training Loss', 'Model Order', 'Step Size',
                'Averaging Coefficient')
Пример #2
0
class KQGreedyModel(object):
    """Batch-trained kernel Q model with a second auxiliary output.

    ``Q`` is a two-output ``KernelRepresentation``; ``evaluate`` and
    ``evaluateOne`` expose output 0 only, while ``train`` writes weights to
    both outputs.
    """

    def __init__(self, stateCount, actionCount, config):
        """Create the representation and read hyper-parameters from ``config``."""
        self.Q = KernelRepresentation(stateCount + actionCount, 2, config)
        # Step size for output 0 (scheduled)
        self.eta = ScheduledParameter('LearningRate', config)
        # Regularization coefficient
        self.lossL = config.getfloat('Regularization', 1e-4)
        # Representation error budget used when pruning
        self.eps = config.getfloat('RepresentationError', 1.0)
        # Step size for output 1 (scheduled)
        self.beta = ScheduledParameter('ExpectationRate', config)
        # Running estimate of the expected TD-loss (not read by this class)
        self.y = 0  # np.zeros((0,1))

    def train(self, step, x, x_, nonterminal, delta, g, gamma):
        """Shrink, append one batch of weighted points, then prune.

        Args:
            step: Value handed to the scheduled parameters.
            x: Current (s, a) points, one per row.
            x_: Successor points; only rows ``x_[nonterminal]`` are appended.
            nonterminal: Index selecting the non-terminal rows.
            delta: Per-sample errors; its length sets the batch size.
            g: Per-sample values of the auxiliary output
                (assumed aligned with ``delta`` -- TODO confirm with caller).
            gamma: Discount applied to the successor weights.
        """
        self.eta.step(step)
        self.beta.step(step)
        self.Q.shrink(1. - self.eta.value * self.lossL)

        # Current points first, non-terminal successors after them
        n_current = len(x)
        points = np.vstack((x, x_[nonterminal]))
        weights = np.zeros((len(points), 2))
        batch = float(len(delta))

        # Column 0: main output; column 1: auxiliary output
        weights[:n_current, 0] = self.eta.value / batch * delta
        weights[n_current:, 0] = -self.eta.value / batch * gamma * g[nonterminal]
        weights[:n_current, 1] = self.beta.value / batch * (delta - g)
        self.Q.append(points, weights)

        # Prune with a budget scaled by the batch size and both step sizes
        budget = (self.eps / batch)**2 * self.eta.value**2 / self.beta.value
        self.Q.prune(budget)

    def evaluate(self, xs):
        """Evaluate the Q function for a list of (s,a) pairs as a column."""
        outputs = self.Q(np.array(xs))
        return np.reshape(outputs[:, 0], (-1, 1))

    def evaluateOne(self, x):
        """Evaluate the Q function for a single (s,a) pair."""
        outputs = self.Q(x)
        return outputs[:, 0]

    def maximize(self, ss):
        """Find the maximizing action for a batch of states."""
        return list(map(self.Q.argmax, ss))

    def maximizeOne(self, s):
        """Find the maximizing action for a single state."""
        return self.Q.argmax(s)

    def model_error(self):
        """Return the regularization penalty ``0.5 * lossL * Q.normsq()``."""
        penalty = self.lossL * self.Q.normsq()
        return 0.5 * penalty
Пример #3
0
class KTDModel(object):
    """Kernel value-function learner using the raw TD error as step weight.

    ``V`` is a single-output ``KernelRepresentation``; each ``train`` call
    appends the current state with weight ``eta * delta``.
    """

    def __init__(self, stateCount, config):
        """Read all hyper-parameters from ``config``.

        Args:
            stateCount: Forwarded to ``KernelRepresentation`` as its first
                argument (presumably the state dimension -- TODO confirm).
            config: Object exposing ``getfloat`` and accepted by
                ``KernelRepresentation`` and ``ScheduledParameter``.
        """
        self.V = KernelRepresentation(stateCount, 1, config)
        # Learning rate
        self.eta = ScheduledParameter('LearningRate', config)
        # Regularization
        self.lossL = config.getfloat('Regularization', 1e-6)
        # Representation error budget
        self.eps = ScheduledParameter('RepresentationError', config)
        # Reward discount
        self.gamma = config.getfloat('RewardDiscount')

    def bellman_error(self, s, a, r, s_):
        """Return the TD error of one transition.

        ``a`` is accepted but not used. ``s_ is None`` marks a terminal
        transition, in which case no bootstrap term is added.
        """
        if s_ is None:
            return r - self.V(s)
        return r + self.gamma * self.V(s_) - self.V(s)

    def model_error(self):
        """Return the regularization penalty ``0.5 * lossL * V.normsq()``."""
        return 0.5 * self.lossL * self.V.normsq()

    def train(self, step, sample):
        """Apply one update from a single ``(s, a, r, s_)`` sample.

        Args:
            step: Value handed to each scheduled parameter's ``step``.
            sample: 4-tuple ``(s, a, r, s_)``; ``s_`` is ``None`` when the
                transition is terminal.

        Returns:
            Tuple ``(loss, model order after pruning, eta)`` where the loss
            is recomputed after the update and includes ``model_error``.
        """
        self.eta.step(step)
        self.eps.step(step)
        # Unpack sample
        s, a, r, s_ = sample
        # Compute error
        delta = self.bellman_error(s, a, r, s_)
        # Gradient step: only the current state gets a new weight
        self.V.shrink(1. - self.eta.value * self.lossL)
        self.V.append(s, self.eta.value * delta)
        # Prune, then read the resulting dictionary size
        self.V.prune(self.eps.value * self.eta.value**2)
        modelOrder_ = len(self.V.D)
        # Compute new error
        loss = 0.5 * self.bellman_error(s, a, r, s_)**2 + self.model_error()
        return (float(loss), float(modelOrder_), self.eta.value)

    @property
    def metrics_names(self):
        """Labels matching the tuple returned by ``train``."""
        return ('Training Loss', 'Model Order', 'Step Size')
Пример #4
0
class KGreedyQModel(object):
    """Single-sample kernel Q learner with an auxiliary second output.

    ``Q`` is a two-output ``KernelRepresentation``: output 0 is read by
    ``get_q`` and output 1 by ``get_g``. The greedy successor action comes
    from ``Q.argmax``.
    """

    def __init__(self, stateCount, actionCount, config):
        """Create the representation and read hyper-parameters from ``config``."""
        self.Q = KernelRepresentation(stateCount + actionCount, 2, config)

        # Learning rates
        self.eta = ScheduledParameter('LearningRate', config)
        self.beta = ScheduledParameter('ExpectationRate', config)

        # Regularization
        self.lossL = config.getfloat('Regularization', 1e-4)

        # Representation error budget
        self.eps = config.getfloat('RepresentationError', 1.0)

        # Reward discount
        self.gamma = config.getfloat('RewardDiscount')

    @staticmethod
    def _stack_state_action(s, a):
        """Flatten ``s`` and ``a`` and join them into one ``(1, n)`` row."""
        return np.concatenate(
            (np.reshape(s, (1, -1)), np.reshape(a, (1, -1))), axis=1)

    def get_q(self, x):
        """Return output 0 of the representation at the single point ``x``."""
        return self.Q(x)[0][0]

    def get_g(self, x):
        """Return output 1 of the representation at the single point ``x``."""
        return self.Q(x)[0][1]

    def bellman_error(self, s, a, r, s_):
        """Return the TD error of a transition, choosing ``a_`` greedily.

        ``s_ is None`` marks a terminal transition (no bootstrap term).
        """
        x = self._stack_state_action(s, a)
        if s_ is None:
            return r - self.get_q(x)
        x_ = self._stack_state_action(s_, self.Q.argmax(s_))
        return r + self.gamma * self.get_q(x_) - self.get_q(x)

    def bellman_error2(self, x, r, x_):
        """Return the TD error for already stacked points ``x`` and ``x_``."""
        if x_ is None:
            return r - self.get_q(x)
        return r + self.gamma * self.get_q(x_) - self.get_q(x)

    def model_error(self):
        """Return the regularization penalty ``0.5 * lossL * Q.normsq()``."""
        return 0.5 * self.lossL * self.Q.normsq()

    @property
    def metrics_names(self):
        """Labels matching the tuple returned by ``train``."""
        return ('Training Loss', 'Model Order')

    def train(self, step, sample):
        """Apply one update from a single ``(s, a, r, s_)`` sample.

        Args:
            step: Value handed to the scheduled parameters.
            sample: 4-tuple ``(s, a, r, s_)``; ``s_`` is ``None`` when the
                transition is terminal.

        Returns:
            Tuple ``(loss, model order)``, both as floats; the loss is
            recomputed after the update and includes ``model_error``.
        """
        self.eta.step(step)
        self.beta.step(step)

        # Unpack sample and compute error
        s, a, r, s_ = sample
        x = self._stack_state_action(s, a)
        if s_ is None:
            x_ = None
        else:
            x_ = self._stack_state_action(s_, self.Q.argmax(s_))

        delta = self.bellman_error2(x, r, x_)

        # Gradient step
        self.Q.shrink(1. - self.eta.value * self.lossL)
        # The representation does not change between here and append, so the
        # auxiliary output at x is evaluated once and reused.
        g = self.get_g(x)
        if x_ is None:
            W = np.zeros((1, 2))
            W[0, 0] = self.eta.value * delta
            W[0, 1] = self.beta.value * (delta - g)
            self.Q.append(x, W)
        else:
            W = np.zeros((2, 2))
            W[0, 0] = self.eta.value * delta
            W[1, 0] = -self.eta.value * self.gamma * g
            W[0, 1] = self.beta.value * (delta - g)
            self.Q.append(np.vstack((x, x_)), W)

        # Prune
        self.Q.prune(self.eps**2 * self.eta.value**2 / self.beta.value)
        modelOrder_ = self.Q.model_order()

        # Compute new error
        loss = 0.5 * self.bellman_error2(x, r, x_)**2 + self.model_error(
        )  # TODO should we have model error here?

        return (float(loss), float(modelOrder_))
Пример #5
0
class Model:
    """Kernel regressor whose outputs are read as value, moment and variance.

    ``val``, ``mom`` and ``var`` read outputs 0, 1 and 2 of the underlying
    ``KernelRepresentation``; ``train`` always builds a 3-element weight
    vector, and ``grad_type`` selects which update rule fills it.
    """

    def __init__(self, indim, outdim, grad_type):
        """Store the dimensions and update rule and create the representation."""
        self.indim = indim
        self.outdim = outdim
        self.grad_type = grad_type
        # Running gradient estimate, read and updated only by the DELTA rule
        self.delta = 0
        self.f = KernelRepresentation(indim, outdim, ModelParameters.config)

    def model_error(self):
        """Return ``0.5 * ModelParameters.lossL * f.normsq()``."""
        penalty = ModelParameters.lossL * self.f.normsq()
        return 0.5 * penalty

    def predict(self, x):
        """Evaluate the representation on a batch of inputs."""
        return self.f(x)

    def predictOne(self, x):
        """Evaluate the representation on one input, returning its output row."""
        row = np.reshape(x, (1, -1))
        return self.f(row)[0]

    def ucb(self, x):
        """Return ``|mom(x)| + sqrt(var(x))`` for a single input."""
        magnitude = np.abs(self.mom(x))
        return magnitude + np.sqrt(self.var(x))

    def val(self, x):
        """Output 0 at ``x`` shifted by ``ModelParameters.mu``."""
        return self.predictOne(x)[0] + ModelParameters.mu

    def mom(self, x):
        """Output 1 at ``x`` shifted by the constant 100.0."""
        return self.predictOne(x)[1] + 100.0

    def var(self, x):
        """Output 2 at ``x`` shifted by ``ModelParameters.sigma``, floored at 0."""
        shifted = self.predictOne(x)[2] + ModelParameters.sigma
        return max(0, shifted)

    def train(self, sample):
        """Apply one update from an ``(x, y)`` sample.

        Returns:
            Tuple ``(grad**2 / 2, len(f.D))`` where ``grad = val(x) - y`` is
            measured before the update.
        """
        x, y = sample
        grad = self.val(x) - y
        grad_sq = grad**2

        # One weight per output: [value, moment, variance]
        W = np.zeros((3, ))
        rule = self.grad_type

        if rule == GradType.SGD:  # simple SGD
            W[0] = -ModelParameters.eta * grad
        elif rule == GradType.MOM:
            # Step along the first-moment estimate and move it toward grad
            moment = self.mom(x)
            W[0] = -ModelParameters.eta * moment
            W[1] = (-ModelParameters.beta1 * moment +
                    ModelParameters.beta1 * grad)
        elif rule == GradType.VAR:
            # As MOM, and additionally track the squared deviation from it
            moment = self.mom(x)
            grad_var = (grad - moment)**2
            W[0] = -ModelParameters.eta * moment
            W[1] = (-ModelParameters.beta1 * moment +
                    ModelParameters.beta1 * grad)
            W[2] = (-ModelParameters.beta2 * self.var(x) +
                    ModelParameters.beta2 * grad_var)
        elif rule == GradType.MOMENTUM:
            # Moment step scaled by sqrt(var + eta**2); var tracks grad**2
            moment = self.mom(x)
            W[0] = -ModelParameters.eta * moment / (
                np.sqrt(self.var(x) + ModelParameters.eta**2))
            W[1] = (-ModelParameters.beta1 * moment +
                    ModelParameters.beta1 * grad)
            W[2] = (-ModelParameters.beta2 * self.var(x) +
                    ModelParameters.beta2 * grad_sq)
        elif rule == GradType.DELTA:
            # Step along the scalar running estimate, then refresh it
            W[0] = -ModelParameters.eta * self.delta
            self.delta = self.delta + (-ModelParameters.beta1 * self.delta +
                                       ModelParameters.beta1 * grad)
        else:
            # Unknown rule: report it and continue with an all-zero weight
            print('error')

        # Gradient step
        self.f.shrink(1. - ModelParameters.lossL)
        self.f.append(np.array(x), np.reshape(W, (1, -1)))
        # Prune
        self.f.prune(ModelParameters.eps)

        return (grad_sq / 2, len(self.f.D))

    def loss(self, sample):
        """Return half the squared residual ``val(x) - y`` of one sample."""
        x, y = sample
        residual = self.val(x) - y
        return 0.5 * residual**2

    def point_density(self, x):
        """Sum the kernel evaluations between ``x`` and the stored points."""
        return np.sum(self.f.kernel.f(x, self.f.D))

    def compose(f1, f2):  #static function
        """Merge two models into a new one, point by point.

        Called with two models and no ``self``. The dictionary points of both
        are visited in random order; a point is copied only when the model it
        came from wins the comparison selected by ``f1.grad_type`` (point
        density for SGD, ``mom`` for MOM, ``ucb`` for VAR). Other grad types
        copy nothing.
        """
        merged = Model(f1.indim, f1.outdim, f1.grad_type)
        points = np.vstack([f1.f.D, f2.f.D])

        # Rows below this index came from f1, the rest from f2
        n_first = np.shape(f1.f.D)[0]

        W = np.zeros((3, ))

        def transfer(source, x):
            # Append the correction that makes merged agree with source at x
            W[1] = -merged.mom(x) + source.mom(x)
            W[2] = -merged.var(x) + source.var(x)
            W[0] = -merged.val(x) + source.val(x)
            merged.f.append(np.array(x), np.reshape(W, (1, -1)))

        for i in np.random.permutation(np.shape(points)[0]):
            x = points[i, :]
            if merged.grad_type == GradType.SGD:
                # Higher point density wins
                score1, score2 = f1.point_density(x), f2.point_density(x)
                first_wins = score1 >= score2
                second_wins = score1 < score2
            elif merged.grad_type == GradType.MOM:
                # Lower moment wins
                score1, score2 = f1.mom(x), f2.mom(x)
                first_wins = score1 <= score2
                second_wins = score1 > score2
            elif merged.grad_type == GradType.VAR:
                # Lower ucb wins
                score1, score2 = f1.ucb(x), f2.ucb(x)
                first_wins = score1 <= score2
                second_wins = score1 > score2
            else:
                continue

            from_first = i < n_first
            if first_wins and from_first:
                transfer(f1, x)
            elif second_wins and not from_first:
                transfer(f2, x)
        return merged
Пример #6
0
class KNAFModel(object):
    """Kernel model holding V, pi and L in one ``KernelRepresentation``.

    As implemented in ``get_q``:
    ``Q(s, a) = V(s) - 0.5 * (a - pi(s))^T L(s) L(s)^T (a - pi(s))``.
    Output 0 of the representation is V, outputs ``1 .. dim_p`` are pi and
    the remaining outputs feed L.
    """

    def __init__(self, stateCount, actionCount, config):
        """Create the representation and read hyper-parameters from ``config``."""

        # Get dimensions of V, pi and L
        self.dim_v = 1
        self.dim_p = actionCount
        self.dim_l = 1  #(1 + actionCount) * actionCount / 2 #TODO
        self.dim_a = self.dim_v + self.dim_p + self.dim_l

        # Get action space (JSON values stored as column vectors)
        self.min_act = np.reshape(json.loads(config.get('MinAction')), (-1, 1))
        self.max_act = np.reshape(json.loads(config.get('MaxAction')), (-1, 1))

        # Initialize L
        self.init_l = config.getfloat('InitL', 0.01)

        # Represent V, pi, L in one RKHS
        self.vpl = KernelRepresentation(stateCount, self.dim_a, config)

        # Learning rates
        self.eta_v = ScheduledParameter('LearningRateV', config)
        self.eta_p = ScheduledParameter('LearningRateP', config)
        self.eta_l = ScheduledParameter('LearningRateL', config)

        # Regularization
        self.lossL = config.getfloat('Regularization', 1e-6)

        # Representation error budget
        self.eps = config.getfloat('RepresentationError', 1.0)

        # Reward discount
        self.gamma = config.getfloat('RewardDiscount')

    def get_q(self, s, a):
        """Return Q(s, a) built from V, pi and L at state ``s``."""
        lmat = self.get_lmat(s)
        pi = self.get_pi(s)

        if self.dim_p > 1:
            return self.get_v(s) - 0.5 * (
                (a - pi).T).dot(lmat).dot(lmat.T).dot(a - pi)
        else:
            return np.array(
                [self.get_v(s) - 0.5 * (a - pi) * lmat * lmat * (a - pi)])

    def get_v(self, s):
        """Return V(s) as a one-element array."""
        return np.array([self.predictOne(s)[0, 0]])

    def get_pi(self, s):
        """Return pi(s) clipped to the action bounds, as a flat vector."""
        pi = self.predictOne(s)[0, 1:self.dim_p + 1]
        # The bounds are stored as (n, 1) columns. Clipping the flat pi
        # against a column broadcasts to a matrix, so flatten them first to
        # clip element-wise and keep one value per action dimension.
        low = np.reshape(self.min_act, (-1, ))
        high = np.reshape(self.max_act, (-1, ))
        return np.reshape(np.clip(pi, low, high), (-1, ))

    def get_lmat(self, s):
        """Return L(s): a lower-triangular matrix, or a 1-element array in 1-D.

        ``init_l`` is added to the diagonal (or to the single entry).
        """
        lmat = np.zeros((self.dim_p, self.dim_p))
        temp = self.predictOne(s)
        if self.dim_p > 1:
            # temp has a single row; take the L outputs from its columns.
            # With dim_l == 1 the one value fills every lower-triangular entry.
            lmat[np.tril_indices(self.dim_p)] = temp[0, self.dim_p + 1:]
            return lmat + self.init_l * np.eye(self.dim_p)
        else:
            return np.array([temp[0, 2] + self.init_l])

    def bellman_error(self, s, a, r, s_):
        """Return the TD error; ``s_ is None`` marks a terminal transition."""
        if s_ is None:
            return r - self.get_q(s, a)
        else:
            return r + self.gamma * self.get_v(s_) - self.get_q(s, a)

    def model_error(self):
        """Return the regularization penalty ``0.5 * lossL * vpl.normsq()``."""
        return 0.5 * self.lossL * self.vpl.normsq()

    def predict(self, s):
        """Evaluate the joint (V, pi, L) representation on a batch of states."""
        return self.vpl(s)

    def predictOne(self, s):
        """Evaluate the joint representation on one state (reshaped to a row)."""
        return self.vpl(np.reshape(s, (1, -1)))

    @property
    def metrics_names(self):
        """Labels matching the tuple returned by ``train``."""
        return ('Training Loss', 'Model Order')

    def train(self, step, sample):
        """Apply one update from a single ``(s, a, r, s_)`` sample.

        Returns:
            Tuple ``(loss, model order)`` as floats; the loss is recomputed
            after the update and excludes ``model_error``.
        """
        self.eta_v.step(step)
        self.eta_p.step(step)
        self.eta_l.step(step)
        #self.beta.step(step)

        # Unpack sample
        s, a, r, s_ = sample

        # Compute error
        delta = self.bellman_error(s, a, r, s_)

        # Gradient step
        self.vpl.shrink(1. - self.lossL)

        # V gradient
        W = np.zeros((self.dim_a, ))
        W[0] = -1 * self.eta_v.value
        lmat = self.get_lmat(s)
        pi = self.get_pi(s)

        # Pi gradient
        if self.dim_p > 1:
            W[1:self.dim_p + 1] = -self.eta_p.value * np.matmul(
                np.matmul(lmat, np.transpose(lmat)), a - pi)
            # NOTE(review): if `a` is a flat vector this product is a scalar,
            # which the tril indexing below cannot index; the multi-action
            # branch looks unfinished (see the dim_l TODO) -- confirm before
            # relying on it.
            lgrad_temp = np.matmul(np.matmul(np.transpose(lmat), a - pi),
                                   np.transpose(a - pi))
        else:
            lgrad_temp = lmat * (a - pi) * (a - pi)
            W[1] = -self.eta_p.value * lmat * lmat * (a - pi)

        # L gradient
        if self.dim_p > 1:
            W[self.dim_p + 1:self.dim_a] = np.reshape(
                lgrad_temp[np.tril_indices(self.dim_p)],
                (-1, 1)) * self.eta_l.value
        else:
            W[-1] = lgrad_temp * self.eta_l.value

        # All three weights are scaled by the (negated) TD error
        self.vpl.append(np.array(s), -delta * np.reshape(W, (1, -1)))

        # Prune
        self.vpl.prune(self.eps)

        modelOrder_ = len(self.vpl.D)
        # Compute new error
        loss = 0.5 * self.bellman_error(s, a, r, s_)**2  # + self.model_error()
        return (float(loss), float(modelOrder_))
Пример #7
0
class KQLearningModel(object):
    """Batch-trained kernel Q model supporting 'td', 'gtd' and 'hybrid' updates.

    ``Q`` is a single-output ``KernelRepresentation``; ``y`` keeps the mean of
    the smoothed errors from the previous batch.
    """

    def __init__(self, stateCount, actionCount, config):
        """Create the representation and read hyper-parameters from ``config``."""
        self.Q = KernelRepresentation(stateCount + actionCount, 1, config)
        # Update rule, lower-cased: gtd, td or hybrid
        self.algorithm = config.get('Algorithm', 'td').lower()
        # Step size (scheduled)
        self.eta = ScheduledParameter('LearningRate', config)
        # Regularization coefficient
        self.lossL = config.getfloat('Regularization', 1e-4)
        # Scale applied to the successor weights in gtd / hybrid
        self.phi = config.getfloat('Phi', 0.0)
        # Representation error budget used when pruning
        self.eps = config.getfloat('RepresentationError', 1.0)
        # Averaging rate for the smoothed error (scheduled)
        self.beta = ScheduledParameter('ExpectationRate', config)
        # Mean smoothed error of the previous batch
        self.y = 0  # np.zeros((0,1))

    def train(self, step, x, x_, nonterminal, delta, gamma, rand_act=None):
        """Shrink, append one batch of weighted points, then prune.

        Args:
            step: Value handed to the scheduled parameters.
            x: Current (s, a) points, one per row.
            x_: Successor points; used by 'gtd' and 'hybrid' only.
            nonterminal: Indices of the non-terminal samples.
            delta: Per-sample errors; its length sets the batch size.
            gamma: Discount applied to the successor weights.
            rand_act: Sample indices intersected with ``nonterminal`` when the
                algorithm is 'hybrid'.

        Raises:
            ValueError: If ``self.algorithm`` is not 'td', 'gtd' or 'hybrid'.
        """
        self.eta.step(step)
        self.beta.step(step)

        # Move the stored mean toward this batch's errors
        smoothed = self.y + self.beta.value * (delta - self.y)
        self.Q.shrink(1. - self.eta.value * self.lossL)

        mode = self.algorithm
        if mode == 'hybrid':
            # Keep only successors that are both non-terminal and in rand_act
            nonterminal = list(set(nonterminal) & set(rand_act))

        if mode in ('gtd', 'hybrid'):
            # Current points first, selected successors after them
            n_current = len(x)
            X = np.vstack((x, x_[nonterminal]))
            W = np.zeros((len(X), 1))
            N = float(len(delta))

            W[:n_current] = self.eta.value / N * smoothed
            W[n_current:] = (-self.phi * self.eta.value / N * gamma *
                             smoothed[nonterminal])
        elif mode == 'td':
            X = x
            N = float(len(delta))
            W = self.eta.value / N * smoothed
        else:
            raise ValueError('Unknown algorithm: {}'.format(self.algorithm))

        # Running average of TD error
        self.y = np.mean(smoothed)

        self.Q.append(X, W)
        # Prune
        budget = (self.eps * self.eta.value ** 2) ** 2
        self.Q.prune(budget)

    def evaluate(self, xs):
        """Evaluate the Q function for a list of (s,a) pairs."""
        batch = np.array(xs)
        return self.Q(batch)

    def evaluateOne(self, x):
        """Evaluate the Q function for a single (s,a) pair."""
        return self.Q(x)

    def maximize(self, ss):
        """Find the maximizing action for a batch of states."""
        return list(map(self.Q.argmax, ss))

    def maximizeOne(self, s):
        """Find the maximizing action for a single state."""
        return self.Q.argmax(s)

    def model_error(self):
        """Return the regularization penalty ``0.5 * lossL * Q.normsq()``."""
        penalty = self.lossL * self.Q.normsq()
        return 0.5 * penalty
Пример #8
0
class KNAFIIDModel(object):
    """Kernel model holding V, pi and L in one ``KernelRepresentation``.

    As implemented in ``get_q`` (scalar-action form):
    ``Q(s, a) = V(s) - 0.5 * (a - pi(s)) * L(s) * L(s) * (a - pi(s))``.
    ``train`` reads its transition from ``sample[0][1]``.
    """

    def __init__(self, stateCount, actionCount, config):
        """Create the representation and read hyper-parameters from ``config``."""

        # Get dimensions of V, pi and L
        self.dim_v = 1
        self.dim_p = actionCount
        self.dim_l = 1  # (1 + actionCount) * actionCount / 2 #TODO
        self.dim_a = self.dim_v + self.dim_p + self.dim_l

        # Get action space (JSON values stored as column vectors)
        self.min_act = np.reshape(json.loads(config.get('MinAction')), (-1, 1))
        self.max_act = np.reshape(json.loads(config.get('MaxAction')), (-1, 1))

        # Initialize L
        self.init_l = config.getfloat('InitL', 0.01)

        # Represent V, pi, L in one RKHS
        self.vpl = KernelRepresentation(stateCount, self.dim_a, config)

        # Learning rates
        self.eta_v = ScheduledParameter('LearningRateV', config)
        self.eta_p = ScheduledParameter('LearningRateP', config)
        self.eta_l = ScheduledParameter('LearningRateL', config)
        # Learning rate (not read by any method of this class)
        self.eta = ScheduledParameter('LearningRate', config)
        # Regularization
        self.lossL = config.getfloat('Regularization', 1e-4)
        # self.phi = config.getfloat('Phi', 1)
        # Representation error budget
        self.eps = config.getfloat('RepresentationError', 1.0)

        # Reward discount
        self.gamma = config.getfloat('RewardDiscount')

    def get_q(self, s, a):
        """Return Q(s, a) built from V, pi and L at state ``s``."""
        lmat = self.get_lmat(s)
        pi = self.get_pi(s)
        return np.array(
            [self.get_v(s) - 0.5 * (a - pi) * lmat * lmat * (a - pi)])

    def get_v(self, s):
        """Return V(s) as a one-element array."""
        return np.array([self.predictOne(s)[0, 0]])

    def get_pi(self, s):
        """Return pi(s) clipped to the action bounds, as a flat vector."""
        pi = self.predictOne(s)[0, 1:self.dim_p + 1]
        # The bounds are stored as (n, 1) columns. Clipping the flat pi
        # against a column broadcasts to a matrix, so flatten them first to
        # clip element-wise and keep one value per action dimension.
        low = np.reshape(self.min_act, (-1, ))
        high = np.reshape(self.max_act, (-1, ))
        return np.reshape(np.clip(pi, low, high), (-1, ))

    def get_lmat(self, s):
        """Return L(s): a lower-triangular matrix, or a 1-element array in 1-D.

        ``init_l`` is added to the diagonal (or to the single entry).
        """
        lmat = np.zeros((self.dim_p, self.dim_p))
        temp = self.predictOne(s)
        if self.dim_p > 1:
            # temp has a single row; take the L outputs from its columns.
            # With dim_l == 1 the one value fills every lower-triangular entry.
            lmat[np.tril_indices(self.dim_p)] = temp[0, self.dim_p + 1:]
            return lmat + self.init_l * np.eye(self.dim_p)
        else:
            return np.array([temp[0, 2] + self.init_l])

    def train(self, step, sample):
        """Apply one update from the transition stored at ``sample[0][1]``.

        Returns:
            Tuple ``(loss, model order)`` as floats; the loss is recomputed
            after the update and excludes ``model_error``.
        """
        self.eta_v.step(step)
        self.eta_p.step(step)
        self.eta_l.step(step)

        # The transition (s, a, r, s_) is the first four items of sample[0][1]
        transition = sample[0][1]
        s, a, r, s_ = (transition[0], transition[1], transition[2],
                       transition[3])
        delta = self.bellman_error(s, a, r, s_)

        # Gradient step
        self.vpl.shrink(1. - self.lossL)

        # One weight per output: V, pi, L
        W = np.zeros((self.dim_a, ))
        W[0] = -1 * self.eta_v.value
        lmat = self.get_lmat(s)
        pi = self.get_pi(s)

        lgrad_temp = lmat * (a - pi) * (a - pi)
        W[1] = -self.eta_p.value * lmat * lmat * (a - pi)
        W[-1] = lgrad_temp * self.eta_l.value
        # All three weights are scaled by the (negated) TD error
        self.vpl.append(np.array(s), -delta * np.reshape(W, (1, -1)))
        self.vpl.prune(self.eps)
        modelOrder_ = len(self.vpl.D)
        loss = 0.5 * self.bellman_error(s, a, r, s_)**2  # + self.model_error()
        return (float(loss), float(modelOrder_))

    def bellman_error(self, s, a, r, s_):
        """Return the TD error; ``s_ is None`` marks a terminal transition."""
        if s_ is None:
            return r - self.get_q(s, a)
        else:
            return r + self.gamma * self.get_v(s_) - self.get_q(s, a)

    def predict(self, s):
        """Evaluate the joint (V, pi, L) representation on a batch of states."""
        return self.vpl(s)

    def predictOne(self, s):
        """Evaluate the joint representation on one state (reshaped to a row)."""
        return self.vpl(np.reshape(s, (1, -1)))

    def model_error(self):
        """Return the regularization penalty ``0.5 * lossL * vpl.normsq()``."""
        return 0.5 * self.lossL * self.vpl.normsq()