def lambda_errors(phi, lambdas, noises):
    # Grid of relative value-prediction errors: one row per lambda (i.e. per
    # entry of the module-level `methods` list), one column per noise level.
    # Relies on module-level globals: np, policies,
    # LinearDiscreteValuePredictionTask, run_experiment, n, mdp, gamma.
    mserrors = np.zeros((len(lambdas), len(noises)))
    variances = np.zeros(len(noises))
    # Behavior policy: mixture of a uniform distribution (a) and a
    # deterministic shift pattern (b).
    a = np.ones((n, n)) / n
    b = np.zeros((n, n))
    for i in range(n - 1):
        b[i, i + 1] = 1.
    b[-1, 0] = 1.
    for i, noise in enumerate(noises):
        c = noise * a + (1 - noise) * b
        c /= c.sum(axis=1)[:, None]  # renormalize rows to proper distributions
        beh_pol = policies.Discrete(c)
        task = LinearDiscreteValuePredictionTask(
            mdp, gamma, phi, np.zeros(phi.dim), policy=beh_pol)
        # The experiment runner takes its configuration from the module
        # globals, with phi and task overridden for this noise level.
        d = globals().copy()
        d["phi"] = phi
        d["task"] = task
        mean, std, raw = run_experiment(n_jobs=-1, **d)
        # Final errors per method, clipped at the first method's initial
        # error so that diverged runs do not dominate the average.
        val = mean[:, -1, n:]
        val[mean[:, -1, n:] > mean[0, -1, 0]] = mean[0, -1, 0]
        val = val.mean(axis=1)
        mserrors[:, i] = val - np.mean(val)
        print(noise, lambdas[np.argmin(val)])
    #mserrors -= mserrors.min(axis=1)[:,None]
    #mserrors /= mserrors.max(axis=1)[:,None]
    return mserrors
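# A minimal usage sketch for lambda_errors, assuming the surrounding
# experiment file provides phi, a `methods` list with one entry per
# lambda value, and run_experiment in its module globals. The parameter
# grids and the heatmap below are illustrative, not part of the
# original script.
import numpy as np
import matplotlib.pyplot as plt

lambdas = np.linspace(0., 1., 6)
noises = np.array([0.01, 0.05, 0.1, 0.2, 0.5])
errors = lambda_errors(phi, lambdas, noises)

plt.imshow(errors, aspect="auto", interpolation="nearest")
plt.xlabel("noise level index")
plt.ylabel(r"$\lambda$ index")
plt.colorbar()
plt.show()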
import td
import examples
import numpy as np
import matplotlib.pyplot as plt
import features
import policies
from task import LinearDiscreteValuePredictionTask

n = 20
n_random = 800
mdp = examples.CorruptedChain(n_states=n)
phi = features.corrupted_rbfs(n_S=n, n_rbfs=5, n_random=n_random)
gamma = .9
n_feat = phi.dim
p0 = np.zeros(n_feat)

# Deterministic policy: action 0 in the first half of the chain,
# action 1 in the second half (10 = n / 2 for n = 20).
pol = np.zeros((n, 2))
pol[:10, 0] = 1
pol[10:, 1] = 1
policy = policies.Discrete(prop_table=pol)
task = LinearDiscreteValuePredictionTask(mdp, gamma, phi, p0, policy=policy)

# define the methods to examine
methods = []  # [td0, gtd, gtd2]

lstd = td.RecursiveLSTDLambdaJP(lam=0, eps=1000, phi=phi)
lstd.name = r"LSTD({}) $\ell_2 \tau={}$".format(0, 0)
lstd.color = "b"
methods.append(lstd)

#for eps in np.power(10, np.arange(-1, 4)):
lstd = td.LSTDLambdaJP(lam=0, tau=0.8, phi=phi)
lstd.name = r"LSTD({}) $\ell_2 \tau={}$".format(0, .8)
lstd.color = "b"
#methods.append(lstd)
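# Sketch: the single-method setup above naturally extends to a sweep over
# the eligibility-trace parameter, one LSTD method per lambda value, which
# is the shape lambda_errors expects of `methods`. The grid of lambda
# values here is an assumption, not from the original script.
for lam in np.linspace(0., 1., 6):
    m = td.RecursiveLSTDLambdaJP(lam=lam, eps=1000, phi=phi)
    m.name = r"LSTD({}) $\ell_2 \tau={}$".format(lam, 0)
    m.color = "b"
    methods.append(m)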
import td
import examples
from task import LinearDiscreteValuePredictionTask
import numpy as np
import features
import policies
import regtd

n = 400
n_a = 10
n_feat = 200
mdp = examples.RandomMDP(n, n_a)
phi = features.lin_random(n_feat, n, constant=True)
gamma = .95
np.random.seed(3)
beh_pol = policies.Discrete(np.random.rand(n, n_a))
tar_pol = policies.Discrete(np.random.rand(n, n_a))
task = LinearDiscreteValuePredictionTask(mdp, gamma, phi, np.zeros(phi.dim),
                                         policy=beh_pol,
                                         target_policy=tar_pol)

methods = []
alpha = 0.007
mu = .0001
gtd = td.GTD(alpha=alpha, beta=mu * alpha, phi=phi)
gtd.name = r"GTD $\alpha$={} $\mu$={}".format(alpha, mu)
gtd.color = "r"
methods.append(gtd)
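# Sketch: how this setup would be executed, mirroring the
# run_experiment(n_jobs=-1, **globals) calling convention used in
# lambda_errors above. That run_experiment reads task, methods, phi,
# etc. from its keyword arguments is an assumption about the
# surrounding experiment framework.
if __name__ == "__main__":
    config = globals().copy()
    mean, std, raw = run_experiment(n_jobs=-1, **config)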
""" Experiment that shows arbitrary off-policy behavior of TD """ __author__ = "Christoph Dann <*****@*****.**>" import td import examples import numpy as np import features import matplotlib.pyplot as plt from task import LinearDiscreteValuePredictionTask import policies n = 7 beh_pi = np.ones((n + 1, 2)) beh_pi[:, 0] = float(n) / (n + 1) beh_pi[:, 1] = float(1) / (n + 1) beh_pol = policies.Discrete(prop_table=beh_pi) target_pi = np.zeros((n + 1, 2)) target_pi[:, 0] = 0 target_pi[:, 1] = 1 target_pol = policies.Discrete(prop_table=target_pi) mdp = examples.BairdStarExample(n) phi = features.linear_blended(n + 1) methods = [] gamma = 0.99 task = LinearDiscreteValuePredictionTask(mdp, gamma, phi, np.asarray(n * [1.] + [10., 1.]),