def __init__(self, mdp, gamma, phi, theta0, policy, target_policy=None,
             normalize_phi=False, mu_iter=1000, mu_restarts=5,
             mu_seed=1000, mu_subsample=1, mu_next=50):
    self.mdp = mdp
    self.mu_n_next = mu_next
    self.mu_iter = mu_iter
    self.mu_seed = mu_seed
    self.mu_restarts = mu_restarts
    self.gamma = gamma
    self.phi = phi
    self.theta0 = theta0
    self.behavior_policy = policy
    self.mu_subsample = mu_subsample
    if target_policy is not None:
        self.off_policy = True
        self.target_policy = target_policy
    else:
        self.target_policy = policy
        self.off_policy = False
    if normalize_phi:
        # Estimate the state distribution under the target policy and
        # rescale each feature to unit standard deviation over it.
        mu, _, _, _, _ = self.mdp.samples_cached(
            policy=self.target_policy, n_iter=self.mu_iter,
            n_restarts=self.mu_restarts, no_next_noise=True,
            seed=self.mu_seed)
        Phi = util.apply_rowise(phi, mu)
        phi.normalization = np.std(Phi, axis=0)
        phi.normalization[phi.normalization == 0] = 1.
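# A minimal, self-contained sketch of the normalize_phi branch above, with
# a toy matrix standing in for Phi = util.apply_rowise(phi, mu) (illustrative
# values, not from the codebase): each feature column is rescaled to unit
# standard deviation, and zero-variance columns are left unscaled to avoid
# division by zero.
import numpy as np

Phi_demo = np.array([[1., 2., 5.],
                     [1., 4., 5.],
                     [1., 6., 5.]])
normalization = np.std(Phi_demo, axis=0)
normalization[normalization == 0] = 1.  # keep constant columns unscaled
print (Phi_demo / normalization).std(axis=0)  # non-constant columns -> std 1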
def samples_distribution(mymdp, policy, phi, policy_traj=None, n_subsample=1,
                         n_iter=1000, n_restarts=100, n_next=20, seed=1,
                         verbose=True):
    """Sample a trajectory of policy_traj; for each visited state, estimate
    the expected next features and reward from n_next successor samples
    drawn under policy."""
    assert n_subsample == 1  # subsampling not implemented; add it if needed
    states = np.ones([n_restarts * n_iter, mymdp.dim_S])
    if policy_traj is None:
        policy_traj = policy
    states_next = np.ones([n_restarts * n_iter, mymdp.dim_S])
    feat = np.zeros((n_restarts * n_iter, phi.dim))
    feat_next = np.zeros_like(feat)
    rewards = np.ones(n_restarts * n_iter)
    np.random.seed(seed)
    s = mymdp.start()
    c = 0
    with ProgressBar(enabled=verbose) as p:
        for k in xrange(n_restarts * n_iter):
            if mymdp.terminal_f(s) or c >= n_iter:
                # restart the trajectory from a fresh start state
                s = mymdp.start()
                c = 0
            p.update(k, n_restarts * n_iter, "Sampling MDP Distribution")
            s0, a, s1, r = mymdp.sample_step(s, policy=policy,
                                             n_samples=n_next)
            states[k, :] = s0
            feat[k, :] = phi(s0)
            fn = apply_rowise(phi, s1)
            feat_next[k, :] = np.mean(fn, axis=0)
            states_next[k, :] = np.mean(s1, axis=0)
            rewards[k] = np.mean(r)
            # advance the actual trajectory by one step of policy_traj
            _, _, s, _ = mymdp.sample_step(s, policy=policy_traj, n_samples=1)
            c += 1
    return states, rewards, states_next, feat, feat_next
def samples_distribution_from_states(mymdp, policy, phi, states, n_next=20,
                                     seed=1, verbose=True):
    """Like samples_distribution, but estimates expected next features and
    rewards for a fixed set of given states instead of along a sampled
    trajectory."""
    n = states.shape[0]
    states_next = np.ones([n, mymdp.dim_S])
    feat = np.zeros((n, phi.dim))
    feat_next = np.zeros_like(feat)
    rewards = np.ones(n)
    np.random.seed(seed)
    with ProgressBar(enabled=verbose) as p:
        for k in xrange(n):
            p.update(k, n, "Sampling MDP Distribution")
            s = states[k, :]
            s0, a, s1, r = mymdp.sample_step(s, policy=policy,
                                             n_samples=n_next)
            states[k, :] = s0
            feat[k, :] = phi(s0)
            fn = apply_rowise(phi, s1)
            feat_next[k, :] = np.mean(fn, axis=0)
            states_next[k, :] = np.mean(s1, axis=0)
            rewards[k] = np.mean(r)
    return states, rewards, states_next, feat, feat_next
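# Both sampling helpers above average features and rewards over n_next
# sampled successors, i.e. they form Monte Carlo estimates of E[phi(s')]
# and E[r] at each state. A self-contained toy sketch of that averaging
# step (the Gaussian random-walk dynamics and reward below are made up
# for illustration, not part of the codebase):
import numpy as np

np.random.seed(1)
phi_demo = lambda x: np.array([1., x, x ** 2])  # simple polynomial features

s = 0.5
n_next = 1000
s1 = s + 0.1 * np.random.randn(n_next)  # n_next sampled successor states
r = -s1 ** 2                            # made-up reward per successor

feat_next = np.mean(np.array([phi_demo(x) for x in s1]), axis=0)  # ~ E[phi(s')]
reward = np.mean(r)                                               # ~ E[r]
print feat_next, reward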
def __getattr__(self, name):
    """Some attributes, such as the state distribution or the true value
    function, are very costly to compute, so they are only evaluated
    lazily when they are actually needed."""
    if name == "V_true":
        self.V_true = dynamic_prog.estimate_V_LQR(
            self.mdp,
            lambda x, y: self.bellman_operator(x, y, policy="target"),
            gamma=self.gamma)
        return self.V_true
    elif name == "mu_phi_full":
        # dimension of the full squared (triangular) feature representation
        n = (self.mdp.dim_S + 1) * (self.mdp.dim_S) + 1
        self.mu_phi_full = util.apply_rowise(features.squared_tri(n), self.mu)
        return self.mu_phi_full
    else:
        return LinearContinuousValuePredictionTask.__getattr__(self, name)
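# __getattr__ is only invoked when normal attribute lookup fails, so
# assigning self.V_true on first access caches the result: later reads find
# it in the instance dict and never re-enter __getattr__. A minimal
# self-contained sketch of this lazy-attribute pattern (the class and
# values are illustrative):
class _LazyDemo(object):
    def __getattr__(self, name):
        if name == "V_true":
            print "computing V_true..."  # runs only once
            self.V_true = 42             # cache on the instance
            return self.V_true
        raise AttributeError(name)

_d = _LazyDemo()
print _d.V_true  # triggers the computation
print _d.V_true  # served from the cache, no recomputation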
states, _, _, _, _ = mdp.samples_cached(n_iter=200, n_restarts=30,
                                        policy=policy, seed=8000)

# make_slice is repeated here from the companion script below so this
# snippet is self-contained
def make_slice(l, u, n):
    return slice(l, u + float(u - l) / (n - 1) / 2., float(u - l) / (n - 1))

n_slices = [3, 5, 7, 10]
bounds = [[0, 35], [-3, 4], [-12, 12], [-3, 3]]
s = [make_slice(b[0], b[1], n) for b, n in zip(bounds, n_slices)]
bounds = np.array(bounds, dtype="float")
# grid of Gaussian centers over the state-space bounds; widths scale with
# the grid spacing in each dimension
means = np.mgrid[s[0], s[1], s[2], s[3]].reshape(4, -1).T
sigmas = np.ones_like(means) * (
    (bounds[:, 1] - bounds[:, 0]) / 2. / (np.array(n_slices) - 1)).flatten()

# discard Gaussian centers that are rarely active on the sampled states
phi = features.gaussians(means, sigmas, constant=False)
A = util.apply_rowise(arr=states, f=phi)
a = np.nonzero(np.sum(A > 0.05, axis=0) > 20)[0]
phi = features.gaussians(means[a], sigmas[a], constant=True)
print phi.dim, "features are used"

theta0 = np.zeros(phi.dim)
# NOTE: the call below is truncated in the source after mu_restarts=150;
# the closing parenthesis is added so the snippet parses
task = LinearContinuousValuePredictionTask(
    mdp, gamma, phi, theta0, policy=policy, normalize_phi=False,
    mu_seed=1100, mu_subsample=1, mu_iter=200, mu_restarts=150)
states, _, _, _, _ = mdp.samples_cached(n_iter=15000, n_restarts=1,
                                        policy=policy, seed=8000)

def make_slice(l, u, n):
    # n evenly spaced grid points covering [l, u], encoded as a slice for
    # np.mgrid; the stop is padded by half a step so that u itself is included
    return slice(l, u + float(u - l) / (n - 1) / 2., float(u - l) / (n - 1))

n_slices = [3, 5, 7, 10]
bounds = [[-0.012, 0.012], [-0.02, 0.02], [-.6, .6], [-.6, .6]]
s = [make_slice(b[0], b[1], n) for b, n in zip(bounds, n_slices)]
bounds = np.array(bounds, dtype="float")
means = np.mgrid[s[0], s[1], s[2], s[3]].reshape(4, -1).T
sigmas = np.ones_like(means) * (
    (bounds[:, 1] - bounds[:, 0]) / 2. / (np.array(n_slices) - 1)).flatten()

# discard Gaussian centers that are rarely active on the sampled states
phi = features.gaussians(means, sigmas, constant=False)
A = util.apply_rowise(arr=states, f=phi)
a = np.nonzero(np.sum(A > 0.05, axis=0) > 20)[0]
phi = features.gaussians(means[a], sigmas[a], constant=True)
print phi.dim, "features are used"

theta0 = np.zeros(phi.dim)
task = LinearContinuousValuePredictionTask(
    mdp, gamma, phi, theta0, policy=policy, normalize_phi=False,
    mu_next=200)

methods = []
alpha = 0.001
mu = .01
gtd = td.GTD(alpha=alpha, beta=mu * alpha, phi=phi)
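# make_slice encodes "n evenly spaced points covering [l, u]" as a slice for
# np.mgrid: the step is (u - l) / (n - 1), and padding the stop by half a
# step guarantees the endpoint u is included. A self-contained check with
# illustrative values:
import numpy as np

def _make_slice(l, u, n):
    return slice(l, u + float(u - l) / (n - 1) / 2., float(u - l) / (n - 1))

grid = np.mgrid[_make_slice(0., 1., 3), _make_slice(-1., 1., 5)]
centers = grid.reshape(2, -1).T  # a 3 x 5 grid of centers -> 15 rows
print centers.shape              # (15, 2)
print centers[:, 0].min(), centers[:, 0].max()  # endpoints 0.0 and 1.0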