# Experiment: on-policy value-function estimation for the pole-balancing LQR
# task with diagonal squared features.
import numpy as np
import td
import examples
import features
import policies
import dynamic_prog as dp  # assumed project module providing solve_LQR
from task import LinearLQRValuePredictionTask
import pickle

gamma = 0.95
sigma = np.array([0.] * 3 + [0.01])  # noise only on the last state dimension
dt = 0.1
mdp = examples.PoleBalancingMDP(sigma=sigma, dt=dt)
phi = features.squared_diag(4)
n_feat = len(phi(np.zeros(mdp.dim_S)))

# LQR-optimal linear policy used to generate data (no exploration noise)
theta_p, _, _ = dp.solve_LQR(mdp, gamma=gamma)
theta_p = np.array(theta_p).flatten()
policy = policies.LinearContinuous(theta=theta_p, noise=np.zeros((1)))

theta0 = 0. * np.ones(n_feat)
task = LinearLQRValuePredictionTask(mdp, gamma, phi, theta0, policy=policy,
                                    normalize_phi=False, mu_next=1000)

methods = []
alpha = 0.2
mu = 2
gtd = td.GTD(alpha=alpha, beta=mu * alpha, phi=phi)  # two-timescale steps: beta = mu * alpha
gtd.name = r"GTD $\alpha$={} $\mu$={}".format(alpha, mu)
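# --- Illustrative sketch (not part of the project) -------------------------
# A minimal single-transition GTD update in the common two-timescale form,
# with primary step size alpha and secondary step size beta = mu * alpha.
# It only shows the kind of update td.GTD is assumed to perform; the function
# name and signature below are hypothetical.
import numpy as np

def gtd_step(theta, w, phi_t, phi_tp1, reward, gamma, alpha, beta):
    # TD error under the current value estimate
    delta = reward + gamma * np.dot(theta, phi_tp1) - np.dot(theta, phi_t)
    # primary weights move along the gradient-correction direction
    theta = theta + alpha * (phi_t - gamma * phi_tp1) * np.dot(phi_t, w)
    # auxiliary weights track an estimate of E[delta * phi]
    w = w + beta * (delta * phi_t - w)
    return theta, w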
# Experiment: value-function estimation for a 100-link pendulum LQR task with
# full quadratic (triangular) features.
import numpy as np
import examples
import features
import policies
import dynamic_prog as dp  # assumed project module providing solve_LQR
from task import LinearLQRValuePredictionTask

dim = 100
gamma = 0.95
sigma = np.ones(2 * dim) * 1.
dt = 0.1
mdp = examples.NLinkPendulumMDP(np.ones(dim) * .5, np.ones(dim) * .6,
                                sigma=sigma, dt=dt)
phi = features.squared_tri((2 * dim) * (2 * dim + 1) / 2 + 1)
n_feat = phi.dim
print phi.dim, "features"

# LQR-optimal linear policy with exploration noise
theta_p, _, _ = dp.solve_LQR(mdp, gamma=gamma)
theta_p = np.array(theta_p)
theta_o = theta_p.copy()
policy = policies.LinearContinuous(theta=theta_p, noise=np.ones(dim) * 0.4)

theta0 = 0. * np.ones(n_feat)
task = LinearLQRValuePredictionTask(mdp, gamma, phi, theta0, policy=policy,
                                    normalize_phi=True, mu_next=1000,
                                    mu_iter=1000, mu_restarts=8)
#states, _, _, _, _ = mdp.samples_cached(n_iter=1000, n_restarts=15,
#                                        policy=policy, seed=8000)
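# --- Illustrative sketch (not part of the project) -------------------------
# features.squared_tri is assumed to produce all quadratic monomials of the
# state plus a constant, which explains the feature count d*(d+1)/2 + 1 used
# above (with d = 2*dim). The helper below is a hypothetical stand-in.
import numpy as np

def squared_tri_features(s):
    # products s_i * s_j for i <= j, followed by a constant bias feature
    d = len(s)
    idx = np.triu_indices(d)
    return np.concatenate([np.outer(s, s)[idx], [1.]])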
# Experiment: off-policy value-function estimation for pole balancing; the
# behavior policy is noisier than the target policy.
import numpy as np
import td
import examples
import features
import policies
import dynamic_prog as dp  # assumed project module providing solve_LQR
from task import LinearLQRValuePredictionTask
import pickle

gamma = 0.95
sigma = np.array([0.] * 3 + [0.01])
dt = 0.1
mdp = examples.PoleBalancingMDP(sigma=sigma, dt=dt)
phi = features.squared_tri(11)
n_feat = len(phi(np.zeros(mdp.dim_S)))

theta_p, _, _ = dp.solve_LQR(mdp, gamma=gamma)
theta_p = np.array(theta_p).flatten()
theta_o = theta_p.copy()
beh_policy = policies.LinearContinuous(theta=theta_o, noise=np.array([0.01]))
target_policy = policies.LinearContinuous(theta=theta_p, noise=np.array([0.001]))

theta0 = 0. * np.ones(n_feat)
task = LinearLQRValuePredictionTask(
    mdp, gamma, phi, theta0, policy=beh_policy, target_policy=target_policy,
    normalize_phi=True, mu_next=1000)

methods = []
alpha = 0.001
mu = .0001
gtd = td.GTD(alpha=alpha, beta=mu * alpha, phi=phi)  # beta = mu * alpha
gtd.name = r"GTD $\alpha$={} $\mu$={}".format(alpha, mu)
gtd.color = "r"
methods.append(gtd)
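# --- Illustrative sketch (not part of the project) -------------------------
# With separate behavior and target policies, off-policy corrections weight
# each transition by rho = pi_target(a|s) / pi_behavior(a|s). For the 1-D
# linear-Gaussian policies configured above this ratio can be computed as
# below; how the task applies the correction internally is an assumption.
import numpy as np

def gaussian_importance_weight(a, s, theta_target, sigma_target,
                               theta_behavior, sigma_behavior):
    def log_pdf(x, mean, sigma):
        return -0.5 * ((x - mean) / sigma) ** 2 - np.log(sigma * np.sqrt(2 * np.pi))
    return np.exp(log_pdf(a, np.dot(theta_target, s), sigma_target)
                  - log_pdf(a, np.dot(theta_behavior, s), sigma_behavior))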
# Experiment: off-policy value-function estimation for pole balancing with
# diagonal squared features.
import numpy as np
import td
import examples
import features
import policies
import dynamic_prog as dp  # assumed project module providing solve_LQR
from task import LinearLQRValuePredictionTask

gamma = 0.95
sigma = np.zeros(4)
sigma[-1] = 0.01
dt = 0.1
mdp = examples.PoleBalancingMDP(sigma=sigma, dt=dt)
phi = features.squared_diag(4)
n_feat = len(phi(np.zeros(mdp.dim_S)))

theta_p, _, _ = dp.solve_LQR(mdp, gamma=gamma)
print theta_p
theta_p = np.array(theta_p).flatten()
theta_o = theta_p.copy()
beh_policy = policies.LinearContinuous(theta=theta_o, noise=np.ones(1) * 0.01)
target_policy = policies.LinearContinuous(theta=theta_p,
                                          noise=np.ones(1) * 0.001)

theta0 = 0. * np.ones(n_feat)
task = LinearLQRValuePredictionTask(mdp, gamma, phi, theta0, policy=beh_policy,
                                    target_policy=target_policy,
                                    normalize_phi=True, mu_next=1000)

methods = []
alpha = 0.002
mu = .1
gtd = td.GTD(alpha=alpha, beta=mu * alpha, phi=phi)  # beta = mu * alpha
gtd.name = r"GTD $\alpha$={} $\mu$={}".format(alpha, mu)
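# --- Illustrative sketch (not part of the project) -------------------------
# policies.LinearContinuous is assumed to act as a linear map of the state
# plus Gaussian exploration noise, so the behavior policy above (noise 0.01)
# explores more than the target policy (noise 0.001). Hypothetical helper:
import numpy as np

def linear_gaussian_action(s, theta, noise_std, rng=np.random):
    # a = theta^T s + noise_std * standard-normal sample
    return np.dot(theta, s) + noise_std * rng.randn()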