Example No. 1
import numpy as np
import td
import examples
import dynamic_prog as dp  # assumed alias for the module providing solve_LQR
import features
import policies
from task import LinearLQRValuePredictionTask
import pickle

gamma = 0.95
sigma = np.array([0.] * 3 + [0.01])
dt = 0.1
mdp = examples.PoleBalancingMDP(sigma=sigma, dt=dt)
phi = features.squared_diag(4)

n_feat = len(phi(np.zeros(mdp.dim_S)))
theta_p, _, _ = dp.solve_LQR(mdp, gamma=gamma)
theta_p = np.array(theta_p).flatten()

policy = policies.LinearContinuous(theta=theta_p, noise=np.zeros((1)))
theta0 = 0. * np.ones(n_feat)

task = LinearLQRValuePredictionTask(mdp,
                                    gamma,
                                    phi,
                                    theta0,
                                    policy=policy,
                                    normalize_phi=False,
                                    mu_next=1000)

methods = []
alpha = 0.2
mu = 2
gtd = td.GTD(alpha=alpha, beta=mu * alpha, phi=phi)
gtd.name = r"GTD $\alpha$={} $\mu$={}".format(alpha, mu)
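
For reference, td.GTD as configured above presumably implements the original gradient-TD (GTD) algorithm of Sutton, Szepesvári and Maei, with primary step size alpha and auxiliary step size beta = mu * alpha. A minimal numpy sketch of that textbook update rule follows; the function and argument names are placeholders for illustration, not tdlearn API:

import numpy as np

def gtd_step(theta, u, f, f_next, reward, gamma, alpha, beta):
    # One textbook GTD(0) update for linear value prediction.
    # theta: value weights, u: auxiliary weights, f / f_next: phi(s) / phi(s').
    delta = reward + gamma * np.dot(theta, f_next) - np.dot(theta, f)
    # move the value weights along the gradient-correction direction
    theta = theta + alpha * (f - gamma * f_next) * np.dot(f, u)
    # track an estimate of E[delta * phi] with the secondary step size beta
    u = u + beta * (delta * f - u)
    return theta, u
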
Example No. 2
import numpy as np
import examples
import dynamic_prog as dp  # assumed alias for the module providing solve_LQR
import features
import policies
from task import LinearLQRValuePredictionTask

dim = 100
gamma = 0.95
sigma = np.ones(2 * dim) * 1.
dt = 0.1
mdp = examples.NLinkPendulumMDP(np.ones(dim) * .5,
                                np.ones(dim) * .6,
                                sigma=sigma,
                                dt=dt)
phi = features.squared_tri((2 * dim) * (2 * dim + 1) // 2 + 1)

n_feat = phi.dim
print(phi.dim, "features")
theta_p, _, _ = dp.solve_LQR(mdp, gamma=gamma)
theta_p = np.array(theta_p)
theta_o = theta_p.copy()
policy = policies.LinearContinuous(theta=theta_p, noise=np.ones(dim) * 0.4)
theta0 = 0. * np.ones(n_feat)

task = LinearLQRValuePredictionTask(mdp,
                                    gamma,
                                    phi,
                                    theta0,
                                    policy=policy,
                                    normalize_phi=True,
                                    mu_next=1000,
                                    mu_iter=1000,
                                    mu_restarts=8)

#states, _, _, _, _ = mdp.samples_cached(n_iter=1000, n_restarts=15,
#                                        policy=policy, seed=8000)
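
The argument passed to features.squared_tri above is presumably the number of distinct quadratic terms of the 2 * dim-dimensional state (the upper triangle of the outer product s s^T) plus one constant feature. A quick numpy check of that count, as an illustration only and not part of tdlearn:

import numpy as np

n = 2 * 100                # state dimension used above (angles and velocities)
iu = np.triu_indices(n)    # index pairs (i, j) with i <= j
n_quad = len(iu[0])        # n * (n + 1) // 2 = 20100 distinct products s_i * s_j
print(n_quad + 1)          # 20101, the value passed to squared_tri
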
Example No. 3
import numpy as np
import td
import examples
import dynamic_prog as dp  # assumed alias for the module providing solve_LQR
import features
import policies
from task import LinearLQRValuePredictionTask
import pickle

gamma = 0.95
sigma = np.array([0.] * 3 + [0.01])
dt = 0.1
mdp = examples.PoleBalancingMDP(sigma=sigma, dt=dt)
phi = features.squared_tri(11)


n_feat = len(phi(np.zeros(mdp.dim_S)))
theta_p, _, _ = dp.solve_LQR(mdp, gamma=gamma)
theta_p = np.array(theta_p).flatten()
theta_o = theta_p.copy()
beh_policy = policies.LinearContinuous(theta=theta_o, noise=np.array([0.01]))
target_policy = policies.LinearContinuous(theta=theta_p, noise=np.array([0.001]))
theta0 = 0. * np.ones(n_feat)

task = LinearLQRValuePredictionTask(
    mdp, gamma, phi, theta0, policy=beh_policy, target_policy=target_policy,
    normalize_phi=True, mu_next=1000)


methods = []
alpha = 0.001
mu = .0001
gtd = td.GTD(alpha=alpha, beta=mu * alpha, phi=phi)
gtd.name = r"GTD $\alpha$={} $\mu$={}".format(alpha, mu)
gtd.color = "r"
methods.append(gtd)
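
This example is off-policy: trajectories are generated by beh_policy while the task predicts the value function of target_policy. Off-policy TD methods such as GTD reweight each transition by the importance ratio rho = pi(a|s) / mu(a|s). Below is a minimal sketch for two linear-Gaussian policies like the ones above, assuming the policy mean is theta · s and the noise parameter acts as a per-dimension standard deviation; this is plain numpy for illustration, not tdlearn's internal code:

import numpy as np

def gaussian_logpdf(a, mean, std):
    # log-density of a diagonal Gaussian at action a
    return np.sum(-0.5 * ((a - mean) / std) ** 2
                  - np.log(std) - 0.5 * np.log(2.0 * np.pi))

def importance_weight(s, a, theta_target, std_target, theta_beh, std_beh):
    # rho = pi(a|s) / mu(a|s) for linear-Gaussian target and behaviour policies
    log_rho = (gaussian_logpdf(a, np.dot(theta_target, s), std_target)
               - gaussian_logpdf(a, np.dot(theta_beh, s), std_beh))
    return np.exp(log_rho)
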
Example No. 4
import numpy as np
import td
import examples
import dynamic_prog as dp  # assumed alias for the module providing solve_LQR
import features
import policies
from task import LinearLQRValuePredictionTask

gamma = 0.95
sigma = np.zeros(4)
sigma[-1] = 0.01
dt = 0.1
mdp = examples.PoleBalancingMDP(sigma=sigma, dt=dt)
phi = features.squared_diag(4)


n_feat = len(phi(np.zeros(mdp.dim_S)))
theta_p, _, _ = dp.solve_LQR(mdp, gamma=gamma)
print(theta_p)
theta_p = np.array(theta_p).flatten()
theta_o = theta_p.copy()
beh_policy = policies.LinearContinuous(theta=theta_o, noise=np.ones(1) * 0.01)
target_policy = policies.LinearContinuous(
    theta=theta_p, noise=np.ones(1) * 0.001)
theta0 = 0. * np.ones(n_feat)

task = LinearLQRValuePredictionTask(mdp, gamma, phi, theta0,
                                    policy=beh_policy, target_policy=target_policy,
                                    normalize_phi=True, mu_next=1000)



methods = []
alpha = 0.002
mu = .1
gtd = td.GTD(alpha=alpha, beta=mu * alpha, phi=phi)
gtd.name = r"GTD $\alpha$={} $\mu$={}".format(alpha, mu)