Example #1
File: task.py  Project: liubocn/tdlearn
 def MSBE(self, theta):
     """ Mean Squared Bellman Error """
     V = np.array((theta * self.mu_phi).sum(axis=1))
     theta_trans = features.squared_tri(self.mdp.dim_S).param_forward(
         *self.bellman_operator(*self.phi.param_back(theta)))
     V2 = np.array((theta_trans * self.mu_phi_full).sum(axis=1))
     return np.mean((V - V2) ** 2)
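This excerpt computes the mean squared Bellman error, E_mu[(V_theta(s) - (T V_theta)(s))^2], with both the value estimate and its Bellman backup expressed in the squared_tri feature space. For reference, here is a minimal self-contained sketch of the same quantity on a small finite MDP with known dynamics; every name in it is hypothetical and not part of tdlearn.

import numpy as np

n_states, n_feat = 5, 3
rng = np.random.RandomState(0)
Phi = rng.normal(size=(n_states, n_feat))             # one feature row per state
P = rng.dirichlet(np.ones(n_states), size=n_states)   # row-stochastic transition matrix
R = rng.normal(size=n_states)                         # expected immediate rewards
mu = np.ones(n_states) / n_states                     # weighting over states
gamma = 0.95

def msbe(theta):
    V = Phi.dot(theta)               # current value estimates
    TV = R + gamma * P.dot(V)        # one application of the Bellman operator
    return np.average((V - TV) ** 2, weights=mu)

print(msbe(np.zeros(n_feat)))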
Example #2
File: task.py  Project: Somnus1990/tdlearn
 def MSBE(self, theta):
     """ Mean Squared Bellman Error """
     V = np.array((theta * self.mu_phi).sum(axis=1))
     theta_trans = features.squared_tri(self.mdp.dim_S).param_forward(
         *self.bellman_operator(*self.phi.param_back(theta)))
     V2 = np.array((theta_trans * self.mu_phi_full).sum(axis=1))
     return np.mean((V - V2) ** 2)
Example #3
File: task.py  Project: liubocn/tdlearn
 def MSPBE(self, theta):
     """ Mean Squared Projected Bellman Error """
     V = np.matrix((theta * np.asarray(self.mu_phi)).sum(axis=1)).T
     theta_trans = features.squared_tri(self.mdp.dim_S).param_forward(
         *self.bellman_operator(*self.phi.param_back(theta)))
     v = np.asarray(V - self.projection_operator(
     ) * np.matrix(self.mu_phi_full) * np.matrix(theta_trans).T)
     return np.mean(v ** 2)
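The projected variant first maps the Bellman backup back onto the span of the features with the mu-weighted projection operator (self.projection_operator() above), so MSPBE(theta) = ||V_theta - Pi T V_theta||^2_mu. A minimal finite-MDP sketch of that quantity, again with hypothetical names unrelated to tdlearn:

import numpy as np

n_states, n_feat = 5, 3
rng = np.random.RandomState(0)
Phi = rng.normal(size=(n_states, n_feat))
P = rng.dirichlet(np.ones(n_states), size=n_states)
R = rng.normal(size=n_states)
mu = np.ones(n_states) / n_states
gamma = 0.95

D = np.diag(mu)
# mu-weighted projection onto the span of the features
Pi = Phi.dot(np.linalg.solve(Phi.T.dot(D).dot(Phi), Phi.T.dot(D)))

def mspbe(theta):
    V = Phi.dot(theta)
    TV = R + gamma * P.dot(V)
    diff = V - Pi.dot(TV)            # distance to the projected Bellman backup
    return diff.dot(D).dot(diff)     # mu-weighted squared norm

print(mspbe(np.zeros(n_feat)))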
Example #4
File: task.py  Project: Somnus1990/tdlearn
 def MSPBE(self, theta):
     """ Mean Squared Projected Bellman Error """
     V = np.matrix((theta * np.asarray(self.mu_phi)).sum(axis=1)).T
     theta_trans = features.squared_tri(self.mdp.dim_S).param_forward(
         *self.bellman_operator(*self.phi.param_back(theta)))
     v = np.asarray(V - self.projection_operator(
     ) * np.matrix(self.mu_phi_full) * np.matrix(theta_trans).T)
     return np.mean(v ** 2)
Example #5
File: task.py  Project: liubocn/tdlearn
 def __getattr__(self, name):
     """
     some attribute such as state distribution or the true value function
     are very costly to compute, so they are only evaluated, if really needed
     """
     if name == "V_true":
         self.V_true = dynamic_prog.estimate_V_LQR(
             self.mdp, lambda x, y: self.bellman_operator(
                 x, y, policy="target"),
             gamma=self.gamma)
         return self.V_true
     elif name == "mu_phi_full":
         n = (self.mdp.dim_S+1)*(self.mdp.dim_S)+1
         self.mu_phi_full = util.apply_rowise(
             features.squared_tri(n), self.mu)
         return self.mu_phi_full
     else:
         return LinearContinuousValuePredictionTask.__getattr__(self, name)
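This works because __getattr__ is only invoked when normal attribute lookup fails; assigning the computed result to the instance therefore caches it, and the expensive computation runs at most once. A generic sketch of the same lazy-attribute pattern (hypothetical class, not tdlearn code):

class Lazy(object):
    def __getattr__(self, name):
        # only reached when normal attribute lookup fails
        if name == "expensive":
            self.expensive = sum(i * i for i in range(10 ** 6))   # cache on the instance
            return self.expensive
        raise AttributeError(name)

obj = Lazy()
print(obj.expensive)   # first access triggers the computation
print(obj.expensive)   # now served from the instance dict; __getattr__ is not called again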
Example #6
File: task.py  Project: Somnus1990/tdlearn
 def __getattr__(self, name):
     """
     some attribute such as state distribution or the true value function
     are very costly to compute, so they are only evaluated, if really needed
     """
     if name == "V_true":
         self.V_true = dynamic_prog.estimate_V_LQR(
             self.mdp, lambda x, y: self.bellman_operator(
                 x, y, policy="target"),
             gamma=self.gamma)
         return self.V_true
     elif name == "mu_phi_full":
         n = (self.mdp.dim_S+1)*(self.mdp.dim_S)+1
         self.mu_phi_full = util.apply_rowise(
             features.squared_tri(n), self.mu)
         return self.mu_phi_full
     else:
         return LinearContinuousValuePredictionTask.__getattr__(self, name)
Example #7
File: task.py  Project: liubocn/tdlearn
 def MSE(self, theta):
     """ Mean Squared Error w.r.t. the true value function """
     p = features.squared_tri(self.mdp.dim_S).param_forward(*self.phi.param_back(theta)) -\
         features.squared_tri(self.mdp.dim_S).param_forward(*self.V_true)
     return np.mean((p * self.mu_phi_full).sum(axis=1) ** 2)
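Here the error is measured against the true value function rather than against a Bellman backup, i.e. MSE(theta) = E_mu[(V_theta(s) - V_true(s))^2]. A tiny standalone sketch with made-up numbers:

import numpy as np

Phi = np.array([[1.0, 0.0], [0.5, 0.5], [0.0, 1.0]])   # features for three states
V_true = np.array([1.0, 2.0, 3.0])                      # assumed known true values
mu = np.array([0.5, 0.3, 0.2])                          # state weighting

def mse(theta):
    return np.average((Phi.dot(theta) - V_true) ** 2, weights=mu)

print(mse(np.zeros(2)))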
Example #8
import td
import examples
import numpy as np
import matplotlib.pyplot as plt
import dynamic_prog as dp
import features
import policies
from task import LinearLQRValuePredictionTask
import pickle

gamma = 0.95
sigma = np.array([0.] * 3 + [0.01])
dt = 0.1
mdp = examples.PoleBalancingMDP(sigma=sigma, dt=dt)
phi = features.squared_tri(11)


n_feat = len(phi(np.zeros(mdp.dim_S)))
theta_p, _, _ = dp.solve_LQR(mdp, gamma=gamma)
theta_p = np.array(theta_p).flatten()
theta_o = theta_p.copy()
beh_policy = policies.LinearContinuous(theta=theta_o, noise=np.array([0.01]))
target_policy = policies.LinearContinuous(theta=theta_p, noise=np.array([0.001]))
theta0 = 0. * np.ones(n_feat)

task = LinearLQRValuePredictionTask(
    mdp, gamma, phi, theta0, policy=beh_policy, target_policy=target_policy,
    normalize_phi=True, mu_next=1000)
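With the task constructed, the initial parameter vector can be scored with the error measures shown in the earlier excerpts. This usage sketch assumes LinearLQRValuePredictionTask exposes the MSE/MSBE/MSPBE methods from the task.py excerpts above, which this snippet itself does not show:

print(task.MSE(theta0))      # error against the true LQR value function
print(task.MSBE(theta0))     # mean squared Bellman error
print(task.MSPBE(theta0))    # mean squared projected Bellman error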

Example #9
#import matplotlib.pyplot as plt
import numpy as np          # needed for np.ones / np.zeros below
import examples             # provides NLinkPendulumMDP
import features
import policies
from task import LinearLQRValuePredictionTask
import util
import dynamic_prog as dp

dim = 100
gamma = 0.95
sigma = np.ones(2 * dim) * 1.
dt = 0.1
mdp = examples.NLinkPendulumMDP(np.ones(dim) * .5,
                                np.ones(dim) * .6,
                                sigma=sigma,
                                dt=dt)
phi = features.squared_tri((2 * dim) * (2 * dim + 1) / 2 + 1)

n_feat = phi.dim
print phi.dim, "features"
theta_p, _, _ = dp.solve_LQR(mdp, gamma=gamma)
theta_p = np.array(theta_p)
theta_o = theta_p.copy()
policy = policies.LinearContinuous(theta=theta_p, noise=np.ones(dim) * 0.4)
theta0 = 0. * np.ones(n_feat)

task = LinearLQRValuePredictionTask(mdp,
                                    gamma,
                                    phi,
                                    theta0,
                                    policy=policy,
                                    normalize_phi=True,
Example #10
File: task.py  Project: Somnus1990/tdlearn
 def MSE(self, theta):
     """ Mean Squared Error w.r.t. the true value function """
     p = features.squared_tri(self.mdp.dim_S).param_forward(*self.phi.param_back(theta)) -\
         features.squared_tri(self.mdp.dim_S).param_forward(*self.V_true)
     return np.mean((p * self.mu_phi_full).sum(axis=1) ** 2)
Example #11
import numpy as np          # needed for np.ones / np.zeros below
import examples             # provides NLinkPendulumMDP
import matplotlib.pyplot as plt
import dynamic_prog as dp
import util
import features
import policies
from task import LinearLQRValuePredictionTask

dim = 30
gamma = 0.95
sigma = np.ones(2 * dim) * 1.
dt = 0.1
mdp = examples.NLinkPendulumMDP(np.ones(dim) * .5,
                                np.ones(dim) * .6,
                                sigma=sigma,
                                dt=dt)
phi = features.squared_tri(dim * (dim + 1) / 2 + 1)

n_feat = len(phi(np.zeros(mdp.dim_S)))
theta_p, _, _ = dp.solve_LQR(mdp, gamma=gamma)
theta_p = np.array(theta_p)
theta_o = theta_p.copy()
beh_policy = policies.LinearContinuous(theta=theta_p, noise=np.ones(dim) * 0.4)
theta0 = 0. * np.ones(n_feat)

task = LinearLQRValuePredictionTask(mdp,
                                    gamma,
                                    phi,
                                    theta0,
                                    policy=beh_policy,
                                    normalize_phi=True,
                                    mu_next=1000)
Example #12
import td
import examples
import numpy as np
import matplotlib.pyplot as plt
import dynamic_prog as dp
import util
import features
import policies
from task import LinearLQRValuePredictionTask

dim=30
gamma = 0.95
sigma = np.ones(2*dim)*1.
dt = 0.1
mdp = examples.NLinkPendulumMDP(np.ones(dim)*.5, np.ones(dim)*.6, sigma=sigma, dt=dt)
phi = features.squared_tri(dim*(dim+1)/2+1)


n_feat = len(phi(np.zeros(mdp.dim_S)))
theta_p, _, _ = dp.solve_LQR(mdp, gamma=gamma)
theta_p = np.array(theta_p)
theta_o = theta_p.copy()
beh_policy = policies.LinearContinuous(theta=theta_p, noise=np.ones(dim)*0.4)
theta0 = 0. * np.ones(n_feat)

task = LinearLQRValuePredictionTask(mdp, gamma, phi, theta0,
                                    policy=beh_policy,
                                    normalize_phi=True, mu_next=1000)


Example #13
import numpy as np          # needed for np.ones / np.zeros below
import examples             # provides NLinkPendulumMDP
import regtd
#import matplotlib.pyplot as plt
import features
import policies
from task import LinearLQRValuePredictionTask
import util
import dynamic_prog as dp


dim = 100
gamma = 0.95
sigma = np.ones(2 * dim) * 1.
dt = 0.1
mdp = examples.NLinkPendulumMDP(
    np.ones(dim) * .5, np.ones(dim) * .6, sigma=sigma, dt=dt)
phi = features.squared_tri((2 * dim) * (2 * dim + 1) / 2 + 1)

n_feat = phi.dim
print phi.dim, "features"
theta_p, _, _ = dp.solve_LQR(mdp, gamma=gamma)
theta_p = np.array(theta_p)
theta_o = theta_p.copy()
policy = policies.LinearContinuous(theta=theta_p, noise=np.ones(dim) * 0.4)
theta0 = 0. * np.ones(n_feat)

task = LinearLQRValuePredictionTask(mdp, gamma, phi, theta0,
                                    policy=policy,
                                    normalize_phi=True, mu_next=1000, mu_iter=1000,
                                    mu_restarts=8)