Example #1
 def __init__(
     self, mdp, gamma, phi, theta0, policy, target_policy=None, normalize_phi=False, mu_iter=1000,
         mu_restarts=5, mu_seed=1000, mu_subsample=1, mu_next=50):
     self.mdp = mdp
     self.mu_n_next = mu_next
     self.mu_iter = mu_iter
     self.mu_seed = mu_seed
     self.mu_restarts = mu_restarts
     self.gamma = gamma
     self.phi = phi
     self.theta0 = theta0
     self.behavior_policy = policy
     self.mu_subsample = mu_subsample
     if target_policy is not None:
         self.off_policy = True
         self.target_policy = target_policy
     else:
         self.target_policy = policy
         self.off_policy = False
     if normalize_phi:
         mu, _, _, _, _ = self.mdp.samples_cached(policy=self.target_policy,
                                                  n_iter=self.mu_iter,
                                                  n_restarts=self.mu_restarts,
                                                  no_next_noise=True,
                                                  seed=self.mu_seed)
         Phi = util.apply_rowise(phi, mu)
         phi.normalization = np.std(Phi, axis=0)
         phi.normalization[phi.normalization == 0] = 1.
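The constructor above standardizes the features when normalize_phi is set: it samples states from the target policy, applies phi to every sampled state, and stores the per-feature standard deviation as a scale, replacing zeros with 1 so no feature is later divided by zero. A minimal NumPy-only sketch of that standardization step (the helper names here are illustrative stand-ins, not the project's util functions):

import numpy as np

def apply_rowwise(f, arr):
    # Apply the feature map to every sampled state (row) of arr.
    return np.array([f(row) for row in arr])

def std_normalizer(phi, sampled_states):
    # Per-feature standard deviation over the sampled state distribution;
    # constant features get a scale of 1 so that dividing by the
    # normalizer never divides by zero.
    Phi = apply_rowwise(phi, sampled_states)
    scale = np.std(Phi, axis=0)
    scale[scale == 0] = 1.
    return scale

# Hypothetical usage:
# phi = lambda s: np.array([1.0, s[0], s[0] * s[1]])
# scale = std_normalizer(phi, np.random.randn(500, 2))
# normalized_features = phi(sampled_states[0]) / scale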
Example #2
File: task.py Project: liubocn/tdlearn
 def __init__(
     self, mdp, gamma, phi, theta0, policy, target_policy=None, normalize_phi=False, mu_iter=1000,
         mu_restarts=5, mu_seed=1000, mu_subsample=1, mu_next=50):
     self.mdp = mdp
     self.mu_n_next = mu_next
     self.mu_iter = mu_iter
     self.mu_seed = mu_seed
     self.mu_restarts = mu_restarts
     self.gamma = gamma
     self.phi = phi
     self.theta0 = theta0
     self.behavior_policy = policy
     self.mu_subsample = mu_subsample
     if target_policy is not None:
         self.off_policy = True
         self.target_policy = target_policy
     else:
         self.target_policy = policy
         self.off_policy = False
     if normalize_phi:
         mu, _, _, _, _ = self.mdp.samples_cached(policy=self.target_policy,
                                                  n_iter=self.mu_iter,
                                                  n_restarts=self.mu_restarts,
                                                  no_next_noise=True,
                                                  seed=self.mu_seed)
         Phi = util.apply_rowise(phi, mu)
         phi.normalization = np.std(Phi, axis=0)
         phi.normalization[phi.normalization == 0] = 1.
Example #3
File: mdp.py Project: amoliu/tdlearn
def samples_distribution(mymdp, policy, phi, policy_traj=None, n_subsample=1,
                         n_iter=1000, n_restarts=100, n_next=20, seed=1, verbose=True):
    assert(n_subsample == 1)  # not implemented, do that if you need it
    states = np.ones([n_restarts * n_iter, mymdp.dim_S])
    if policy_traj is None:
        policy_traj = policy
    states_next = np.ones([n_restarts * n_iter, mymdp.dim_S])
    feat = np.zeros((n_restarts * n_iter, phi.dim))
    feat_next = np.zeros_like(feat)
    rewards = np.ones(n_restarts * n_iter)
    np.random.seed(seed)

    k = 0
    s = mymdp.start()
    c = 0
    with ProgressBar(enabled=verbose) as p:
        for k in xrange(n_restarts * n_iter):
            if mymdp.terminal_f(s) or c >= n_iter:
                s = mymdp.start()
                c = 0
            p.update(k, n_restarts * n_iter, "Sampling MDP Distribution")
            s0, a, s1, r = mymdp.sample_step(
                s, policy=policy, n_samples=n_next)
            states[k, :] = s0
            feat[k, :] = phi(s0)
            fn = apply_rowise(phi, s1)
            feat_next[k, :] = np.mean(fn, axis=0)
            states_next[k, :] = np.mean(s1, axis=0)
            rewards[k] = np.mean(r)
            _, _, s, _ = mymdp.sample_step(s, policy=policy_traj, n_samples=1)
            c += 1

    return states, rewards, states_next, feat, feat_next
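samples_distribution rolls out the behavior policy, restarting whenever a terminal state or the per-trajectory limit n_iter is reached, and for every visited state it averages the next-state features and rewards over n_next one-step samples. One illustrative (and purely hypothetical, not taken from the project) way to consume the returned arrays is a least-squares TD estimate of the value-function weights:

import numpy as np

def lstd_weights(feat, feat_next, rewards, gamma, ridge=1e-6):
    # Least-squares TD: solve A theta = b with
    # A = Phi^T (Phi - gamma * Phi'), b = Phi^T r.
    # feat and feat_next have shape (n_samples, n_features); rewards has shape (n_samples,).
    A = np.dot(feat.T, feat - gamma * feat_next)
    b = np.dot(feat.T, rewards)
    # A small ridge term keeps the solve well-posed when features are redundant.
    return np.linalg.solve(A + ridge * np.eye(A.shape[0]), b)

# theta = lstd_weights(feat, feat_next, rewards, gamma=0.95)
# np.dot(feat[k], theta)  # predicted value of the k-th sampled state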
Example #4
def samples_distribution_from_states(mymdp,
                                     policy,
                                     phi,
                                     states,
                                     n_next=20,
                                     seed=1,
                                     verbose=True):
    n = states.shape[0]
    states_next = np.ones([n, mymdp.dim_S])
    feat = np.zeros((n, phi.dim))
    feat_next = np.zeros_like(feat)
    rewards = np.ones(n)
    np.random.seed(seed)

    with ProgressBar(enabled=verbose) as p:
        for k in xrange(n):
            p.update(k, n, "Sampling MDP Distribution")
            s = states[k, :]
            s0, a, s1, r = mymdp.sample_step(s,
                                             policy=policy,
                                             n_samples=n_next)
            states[k, :] = s0
            feat[k, :] = phi(s0)
            fn = apply_rowise(phi, s1)
            feat_next[k, :] = np.mean(fn, axis=0)
            states_next[k, :] = np.mean(s1, axis=0)
            rewards[k] = np.mean(r)

    return states, rewards, states_next, feat, feat_next
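samples_distribution_from_states performs the same per-state averaging, but at a fixed set of evaluation states instead of along a trajectory. As a hedged illustration of how such arrays can be used (not code from the project), the expected TD error at each evaluation state under a given weight vector is:

import numpy as np

def td_errors(feat, feat_next, rewards, theta, gamma):
    # Expected TD error at each evaluation state under weights theta:
    # delta_k = r_k + gamma * phi(s'_k) . theta - phi(s_k) . theta,
    # where the next-state features are already averaged over n_next samples.
    return rewards + gamma * np.dot(feat_next, theta) - np.dot(feat, theta)

# deltas = td_errors(feat, feat_next, rewards, theta, gamma=0.95)
# np.mean(deltas ** 2)  # mean squared TD error over the chosen states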
Example #5
File: task.py Project: liubocn/tdlearn
 def __getattr__(self, name):
     """
     some attribute such as state distribution or the true value function
     are very costly to compute, so they are only evaluated, if really needed
     """
     if name == "V_true":
         self.V_true = dynamic_prog.estimate_V_LQR(
             self.mdp, lambda x, y: self.bellman_operator(
                 x, y, policy="target"),
             gamma=self.gamma)
         return self.V_true
     elif name == "mu_phi_full":
         n = (self.mdp.dim_S+1)*(self.mdp.dim_S)+1
         self.mu_phi_full = util.apply_rowise(
             features.squared_tri(n), self.mu)
         return self.mu_phi_full
     else:
         return LinearContinuousValuePredictionTask.__getattr__(self, name)
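The __getattr__ hook implements lazily computed, cached attributes: Python only calls __getattr__ when normal attribute lookup fails, so the first access triggers the expensive computation, and the assignment to self means later accesses bypass the hook entirely. A minimal, generic sketch of the same pattern (names are placeholders, not from the project):

class LazyExpensive(object):
    def __getattr__(self, name):
        # Called only when normal attribute lookup fails, i.e. on the
        # first access; afterwards the cached instance attribute shadows
        # this hook entirely.
        if name == "big_table":
            self.big_table = self._compute_big_table()
            return self.big_table
        raise AttributeError(name)

    def _compute_big_table(self):
        # Stand-in for a costly computation such as a true value function.
        return [i * i for i in range(10)]

# obj = LazyExpensive()
# obj.big_table  # computed and cached here
# obj.big_table  # served directly from the instance dict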
Example #6
 def __getattr__(self, name):
     """
     some attribute such as state distribution or the true value function
     are very costly to compute, so they are only evaluated, if really needed
     """
     if name == "V_true":
         self.V_true = dynamic_prog.estimate_V_LQR(
             self.mdp, lambda x, y: self.bellman_operator(
                 x, y, policy="target"),
             gamma=self.gamma)
         return self.V_true
     elif name == "mu_phi_full":
         n = (self.mdp.dim_S+1)*(self.mdp.dim_S)+1
         self.mu_phi_full = util.apply_rowise(
             features.squared_tri(n), self.mu)
         return self.mu_phi_full
     else:
         return LinearContinuousValuePredictionTask.__getattr__(self, name)
Example #7
def samples_distribution(mymdp,
                         policy,
                         phi,
                         policy_traj=None,
                         n_subsample=1,
                         n_iter=1000,
                         n_restarts=100,
                         n_next=20,
                         seed=1,
                         verbose=True):
    assert (n_subsample == 1)  # not implemented, do that if you need it
    states = np.ones([n_restarts * n_iter, mymdp.dim_S])
    if policy_traj is None:
        policy_traj = policy
    states_next = np.ones([n_restarts * n_iter, mymdp.dim_S])
    feat = np.zeros((n_restarts * n_iter, phi.dim))
    feat_next = np.zeros_like(feat)
    rewards = np.ones(n_restarts * n_iter)
    np.random.seed(seed)

    k = 0
    s = mymdp.start()
    c = 0
    with ProgressBar(enabled=verbose) as p:
        for k in xrange(n_restarts * n_iter):
            if mymdp.terminal_f(s) or c >= n_iter:
                s = mymdp.start()
                c = 0
            p.update(k, n_restarts * n_iter, "Sampling MDP Distribution")
            s0, a, s1, r = mymdp.sample_step(s,
                                             policy=policy,
                                             n_samples=n_next)
            states[k, :] = s0
            feat[k, :] = phi(s0)
            fn = apply_rowise(phi, s1)
            feat_next[k, :] = np.mean(fn, axis=0)
            states_next[k, :] = np.mean(s1, axis=0)
            rewards[k] = np.mean(r)
            _, _, s, _ = mymdp.sample_step(s, policy=policy_traj, n_samples=1)
            c += 1

    return states, rewards, states_next, feat, feat_next
Example #8
File: mdp.py Project: amoliu/tdlearn
def samples_distribution_from_states(mymdp, policy, phi, states, n_next=20, seed=1, verbose=True):
    n = states.shape[0]
    states_next = np.ones([n, mymdp.dim_S])
    feat = np.zeros((n, phi.dim))
    feat_next = np.zeros_like(feat)
    rewards = np.ones(n)
    np.random.seed(seed)

    with ProgressBar(enabled=verbose) as p:
        for k in xrange(n):
            p.update(k, n, "Sampling MDP Distribution")
            s = states[k, :]
            s0, a, s1, r = mymdp.sample_step(
                s, policy=policy, n_samples=n_next)
            states[k, :] = s0
            feat[k, :] = phi(s0)
            fn = apply_rowise(phi, s1)
            feat_next[k, :] = np.mean(fn, axis=0)
            states_next[k, :] = np.mean(s1, axis=0)
            rewards[k] = np.mean(r)

    return states, rewards, states_next, feat, feat_next
Example #9
states, _, _, _, _ = mdp.samples_cached(n_iter=200,
                                        n_restarts=30,
                                        policy=policy,
                                        seed=8000)

n_slices = [3, 5, 7, 10]
bounds = [[0, 35], [-3, 4], [-12, 12], [-3, 3]]
s = [make_slice(b[0], b[1], n) for b, n in zip(bounds, n_slices)]
bounds = np.array(bounds, dtype="float")
means = np.mgrid[s[0], s[1], s[2], s[3]].reshape(4, -1).T

sigmas = np.ones_like(means) * ((bounds[:, 1] - bounds[:, 0]) / 2. /
                                (np.array(n_slices) - 1)).flatten()
phi = features.gaussians(means, sigmas, constant=False)
A = util.apply_rowise(arr=states, f=phi)
a = np.nonzero(np.sum(A > 0.05, axis=0) > 20)[0]
phi = features.gaussians(means[a], sigmas[a], constant=True)
print phi.dim, "features are used"
theta0 = 0. * np.ones(phi.dim)

task = LinearContinuousValuePredictionTask(mdp,
                                           gamma,
                                           phi,
                                           theta0,
                                           policy=policy,
                                           normalize_phi=False,
                                           mu_seed=1100,
                                           mu_subsample=1,
                                           mu_iter=200,
                                           mu_restarts=150,
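The pruning step in the snippet above (apply_rowise followed by np.nonzero) keeps a Gaussian center only if its activation exceeds 0.05 on more than 20 of the sampled states, so basis functions that the sampled trajectories never reach are dropped before the prediction task is built. A NumPy-only sketch of that selection, assuming an (n_samples, n_centers) activation matrix:

import numpy as np

def prune_centers(activations, threshold=0.05, min_hits=20):
    # activations[k, j] is the response of center j at sampled state k.
    # Keep a center only if enough sampled states activate it noticeably;
    # otherwise its weight would be fit from almost no data.
    hits = np.sum(activations > threshold, axis=0)
    return np.nonzero(hits > min_hits)[0]

# keep = prune_centers(A)  # A as computed with util.apply_rowise above
# phi = features.gaussians(means[keep], sigmas[keep], constant=True)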
Example #10
states,_,_,_,_ = mdp.samples_cached(n_iter=15000, n_restarts=1,
                                policy=policy,seed=8000)

def make_slice(l, u, n):
    return slice(l, u + float(u - l) / (n - 1) / 2., float(u - l) / (n - 1))

n_slices = [3, 5, 7,10]
bounds = [[-0.012, 0.012], [-0.02, 0.02], [-.6, .6], [-.6, .6]]
s = [make_slice(b[0], b[1], n) for b, n in zip(bounds, n_slices)]
bounds = np.array(bounds, dtype="float")
means = np.mgrid[s[0], s[1], s[2], s[3]].reshape(4, -1).T

sigmas = np.ones_like(means) * (
    (bounds[:, 1] - bounds[:, 0]) / 2. / (np.array(n_slices) - 1)).flatten()
phi = features.gaussians(means, sigmas, constant=False)
A = util.apply_rowise(arr=states, f=phi)
a = np.nonzero(np.sum(A > 0.05, axis=0) > 20)[0]
phi = features.gaussians(means[a], sigmas[a], constant=True)
print phi.dim, "features are used"



theta0 = np.zeros(phi.dim)

task = LinearContinuousValuePredictionTask(
    mdp, gamma, phi, theta0, policy=policy, normalize_phi=False, mu_next=200)

methods = []
alpha = 0.001
mu = .01
gtd = td.GTD(alpha=alpha, beta=mu * alpha, phi=phi)
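The make_slice helper nudges the slice's stop half a step past the upper bound so that np.mgrid produces exactly n evenly spaced centers including both endpoints, i.e. the same grid as np.linspace(l, u, n). A quick self-contained check of that equivalence (illustrative only):

import numpy as np

def make_slice(l, u, n):
    # Step of (u - l) / (n - 1); the stop is half a step beyond u so that
    # np.mgrid's half-open range still includes the endpoint u.
    return slice(l, u + float(u - l) / (n - 1) / 2., float(u - l) / (n - 1))

grid = np.mgrid[make_slice(-0.012, 0.012, 3)]
assert np.allclose(grid, np.linspace(-0.012, 0.012, 3))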