def MSBE(self, theta):
    """ Mean Squared Bellman Error """
    V = np.array((theta * self.mu_phi).sum(axis=1))
    theta_trans = features.squared_tri(self.mdp.dim_S).param_forward(
        *self.bellman_operator(*self.phi.param_back(theta)))
    V2 = np.array((theta_trans * self.mu_phi_full).sum(axis=1))
    return np.mean((V - V2) ** 2)
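# For reference: assuming mu_phi holds the row-wise feature vectors of states
# sampled from the state distribution mu, MSBE above estimates
#     E_mu[ (V_theta(s) - (T V_theta)(s))^2 ],
# where T is the Bellman operator; theta_trans are the weights of T V_theta
# expressed in the full quadratic feature space (mu_phi_full).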
def MSPBE(self, theta):
    """ Mean Squared Projected Bellman Error """
    V = np.matrix((theta * np.asarray(self.mu_phi)).sum(axis=1)).T
    theta_trans = features.squared_tri(self.mdp.dim_S).param_forward(
        *self.bellman_operator(*self.phi.param_back(theta)))
    v = np.asarray(V - self.projection_operator()
                   * np.matrix(self.mu_phi_full) * np.matrix(theta_trans).T)
    return np.mean(v ** 2)
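# For reference: MSPBE above estimates
#     E_mu[ (V_theta(s) - (Pi T V_theta)(s))^2 ],
# i.e. the Bellman error after projecting T V_theta back onto the span of the
# features with the mu-weighted projection Pi returned by
# self.projection_operator().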
def __getattr__(self, name):
    """
    Some attributes, such as the state distribution or the true value
    function, are very costly to compute, so they are only evaluated
    if they are really needed.
    """
    if name == "V_true":
        self.V_true = dynamic_prog.estimate_V_LQR(
            self.mdp, lambda x, y: self.bellman_operator(
                x, y, policy="target"), gamma=self.gamma)
        return self.V_true
    elif name == "mu_phi_full":
        # dimension of the full quadratic feature space for a dim_S-dim state
        n = (self.mdp.dim_S + 1) * self.mdp.dim_S / 2 + 1
        self.mu_phi_full = util.apply_rowise(
            features.squared_tri(n), self.mu)
        return self.mu_phi_full
    else:
        return LinearContinuousValuePredictionTask.__getattr__(self, name)
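# Note: V_true and mu_phi_full are therefore computed lazily on first access
# and cached as instance attributes, so e.g. the first call to MSE(theta)
# pays the one-off cost of estimating the true value function.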
def MSE(self, theta):
    """ Mean Squared Error w.r.t. the true value function """
    p = features.squared_tri(self.mdp.dim_S).param_forward(*self.phi.param_back(theta)) -\
        features.squared_tri(self.mdp.dim_S).param_forward(*self.V_true)
    return np.mean((p * self.mu_phi_full).sum(axis=1) ** 2)
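# For reference: MSE above expresses both the current estimate and the true
# LQR value function V_true in the full quadratic feature space and averages
# the squared difference of the resulting values over the sampled states mu.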
import td
import examples
import numpy as np
import matplotlib.pyplot as plt
import dynamic_prog as dp
import features
import policies
from task import LinearLQRValuePredictionTask
import pickle

# Off-policy value prediction on the pole-balancing LQR task: the behaviour
# policy explores with larger noise than the target policy being evaluated.
gamma = 0.95
sigma = np.array([0.] * 3 + [0.01])
dt = 0.1
mdp = examples.PoleBalancingMDP(sigma=sigma, dt=dt)
phi = features.squared_tri(11)

n_feat = len(phi(np.zeros(mdp.dim_S)))
theta_p, _, _ = dp.solve_LQR(mdp, gamma=gamma)
theta_p = np.array(theta_p).flatten()
theta_o = theta_p.copy()

beh_policy = policies.LinearContinuous(theta=theta_o, noise=np.array([0.01]))
target_policy = policies.LinearContinuous(theta=theta_p, noise=np.array([0.001]))

theta0 = 0. * np.ones(n_feat)

task = LinearLQRValuePredictionTask(
    mdp, gamma, phi, theta0, policy=beh_policy, target_policy=target_policy,
    normalize_phi=True, mu_next=1000)
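# Sanity check on the feature count (assuming squared_tri's argument is the
# number of output features and the pole-balancing state is 4-dimensional,
# as the 4-entry noise vector sigma suggests): the quadratic features have
# dim_S * (dim_S + 1) / 2 + 1 = 4 * 5 / 2 + 1 = 11 components, matching
# squared_tri(11) above.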
import examples
import numpy as np
#import matplotlib.pyplot as plt
import features
import policies
from task import LinearLQRValuePredictionTask
import util
import dynamic_prog as dp

# On-policy value prediction for a 100-link pendulum LQR problem.
dim = 100
gamma = 0.95
sigma = np.ones(2 * dim) * 1.
dt = 0.1
mdp = examples.NLinkPendulumMDP(np.ones(dim) * .5, np.ones(dim) * .6,
                                sigma=sigma, dt=dt)
phi = features.squared_tri((2 * dim) * (2 * dim + 1) / 2 + 1)

n_feat = phi.dim
print phi.dim, "features"
theta_p, _, _ = dp.solve_LQR(mdp, gamma=gamma)
theta_p = np.array(theta_p)
theta_o = theta_p.copy()
policy = policies.LinearContinuous(theta=theta_p, noise=np.ones(dim) * 0.4)
theta0 = 0. * np.ones(n_feat)

task = LinearLQRValuePredictionTask(mdp, gamma, phi, theta0, policy=policy,
                                    normalize_phi=True,
import examples
import numpy as np
import matplotlib.pyplot as plt
import dynamic_prog as dp
import util
import features
import policies
from task import LinearLQRValuePredictionTask

# On-policy value prediction for a 30-link pendulum LQR problem.
dim = 30
gamma = 0.95
sigma = np.ones(2 * dim) * 1.
dt = 0.1
mdp = examples.NLinkPendulumMDP(np.ones(dim) * .5, np.ones(dim) * .6,
                                sigma=sigma, dt=dt)
phi = features.squared_tri(dim * (dim + 1) / 2 + 1)

n_feat = len(phi(np.zeros(mdp.dim_S)))
theta_p, _, _ = dp.solve_LQR(mdp, gamma=gamma)
theta_p = np.array(theta_p)
theta_o = theta_p.copy()
beh_policy = policies.LinearContinuous(theta=theta_p, noise=np.ones(dim) * 0.4)
theta0 = 0. * np.ones(n_feat)

task = LinearLQRValuePredictionTask(mdp, gamma, phi, theta0, policy=beh_policy,
                                    normalize_phi=True, mu_next=1000)
import td
import examples
import numpy as np
import matplotlib.pyplot as plt
import dynamic_prog as dp
import util
import features
import policies
from task import LinearLQRValuePredictionTask

dim = 30
gamma = 0.95
sigma = np.ones(2 * dim) * 1.
dt = 0.1
mdp = examples.NLinkPendulumMDP(np.ones(dim) * .5, np.ones(dim) * .6,
                                sigma=sigma, dt=dt)
phi = features.squared_tri(dim * (dim + 1) / 2 + 1)

n_feat = len(phi(np.zeros(mdp.dim_S)))
theta_p, _, _ = dp.solve_LQR(mdp, gamma=gamma)
theta_p = np.array(theta_p)
theta_o = theta_p.copy()
beh_policy = policies.LinearContinuous(theta=theta_p, noise=np.ones(dim) * 0.4)
theta0 = 0. * np.ones(n_feat)

task = LinearLQRValuePredictionTask(mdp, gamma, phi, theta0, policy=beh_policy,
                                    normalize_phi=True, mu_next=1000)
import examples
import numpy as np
import regtd
#import matplotlib.pyplot as plt
import features
import policies
from task import LinearLQRValuePredictionTask
import util
import dynamic_prog as dp

# On-policy value prediction for a 100-link pendulum LQR problem
# (variant with regularized TD methods available via regtd).
dim = 100
gamma = 0.95
sigma = np.ones(2 * dim) * 1.
dt = 0.1
mdp = examples.NLinkPendulumMDP(
    np.ones(dim) * .5, np.ones(dim) * .6, sigma=sigma, dt=dt)
phi = features.squared_tri((2 * dim) * (2 * dim + 1) / 2 + 1)

n_feat = phi.dim
print phi.dim, "features"
theta_p, _, _ = dp.solve_LQR(mdp, gamma=gamma)
theta_p = np.array(theta_p)
theta_o = theta_p.copy()
policy = policies.LinearContinuous(theta=theta_p, noise=np.ones(dim) * 0.4)
theta0 = 0. * np.ones(n_feat)

task = LinearLQRValuePredictionTask(mdp, gamma, phi, theta0, policy=policy,
                                    normalize_phi=True, mu_next=1000,
                                    mu_iter=1000, mu_restarts=8)
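# Sanity check on the feature count (assuming squared_tri's argument is the
# number of output features and the state stacks angles and angular velocities,
# i.e. 2 * dim = 200 dimensions as the noise vector sigma suggests): the
# quadratic features have 200 * 201 / 2 + 1 = 20101 components.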