def generate_polytope_densities():
    n_states, n_actions = 2, 2
    pis = utils.gen_grid_policies(41)

    nx = 4
    ny = 5
    plt.figure(figsize=(16, 16))
    for i in range(nx * ny):
        print(i)
        mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
        Vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

        # just set all to be the same probability
        p_pi = 0.1
        pVs = [
            density_value_functional(p_pi, mdp.P, mdp.r, pi, mdp.discount)
            for pi in pis
        ]

        plt.subplot(nx, ny, i + 1)
        fig = plt.scatter(Vs[:, 0], Vs[:, 1], c=pVs)
        # plt.colorbar()
        fig.axes.get_xaxis().set_visible(False)
        fig.axes.get_yaxis().set_visible(False)

    plt.tight_layout()
    plt.show()
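# For reference, a minimal sketch of the kind of policy grid assumed from
# utils.gen_grid_policies in the 2-state, 2-action case: every combination of
# P(a=0 | s) on an n x n grid. The repo's actual helper may differ; the name
# below is hypothetical and only meant to make the `pis` input concrete.
def gen_grid_policies_2x2_sketch(n):
    ps = np.linspace(0.0, 1.0, n)
    return [np.array([[p0, 1.0 - p0], [p1, 1.0 - p1]])
            for p0 in ps for p1 in ps]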
def generate_model_iteration():
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = utils.gen_grid_policies(7)

    # needs its own init. alternatively could find init that matches value of
    # other inits?!?
    init = rnd.standard_normal((mdp.S * mdp.S * mdp.A + mdp.S * mdp.A))

    vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    plt.figure(figsize=(16, 16))
    plt.scatter(vs[:, 0], vs[:, 1], c='b', s=10, alpha=0.75)

    lr = 0.01
    pi_star = utils.solve(
        policy_iteration(mdp),
        utils.softmax(rnd.standard_normal((mdp.S, mdp.A))))[-1]

    # adversarial pis
    apis = utils.get_deterministic_policies(mdp.S, mdp.A)
    apis = np.stack(apis)

    update_fn = model_iteration(mdp, lr, apis)
    params = utils.solve(update_fn, init)
    params = [parse_model_params(mdp.S, mdp.A, p) for p in params]

    vs = np.vstack([
        utils.value_functional(utils.softmax(p_logits), r, pi_star,
                               mdp.discount).T
        for p_logits, r in params
    ])
    n = vs.shape[0]
    plt.scatter(vs[0, 0], vs[0, 1], c='g', label='PG')
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n - 2), cmap='spring', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='g', marker='x')

    p_logits, r = params[-1]
    vs = utils.polytope(utils.softmax(p_logits), r, mdp.discount, pis)
    plt.scatter(vs[:, 0], vs[:, 1], c='r', s=10, alpha=0.75)

    plt.title('Model iteration')
    plt.xlabel('Value of state 1')
    plt.ylabel('Value of state 2')
    # plt.show()
    plt.savefig('figs/model_iteration_1.png')

    learned_mdp = utils.MDP(mdp.S, mdp.A, utils.softmax(p_logits), r,
                            mdp.discount, mdp.d0)
    pi_star_approx = utils.solve(
        policy_iteration(learned_mdp),
        utils.softmax(rnd.standard_normal((mdp.S, mdp.A))))[-1]
    print(pi_star_approx, '\n', pi_star)
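# A hypothetical sketch of the flat parameter layout assumed by the init above
# and by parse_model_params: the first S*S*A entries are transition logits
# (reshaped to (S, S, A)) and the remaining S*A entries are the reward table
# (reshaped to (S, A)). The real parser may differ; this is only illustrative.
def parse_model_params_sketch(n_states, n_actions, params):
    n_p = n_states * n_states * n_actions
    p_logits = params[:n_p].reshape((n_states, n_states, n_actions))
    r = params[n_p:].reshape((n_states, n_actions))
    return p_logits, r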
def lmdp_field():
    """
    For each policy: calculate its dynamics P_pi, estimate its value via the
    LMDP, and plot the change under the linearised Bellman operator, next to
    the change under the usual Bellman optimality operator.
    """
    n_states, n_actions = 2, 2
    pis = utils.gen_grid_policies(11)
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    p, q = lmdps.mdp_encoder(mdp.P, mdp.r)

    vs = []
    dvs = []
    for pi in pis:
        u = np.einsum('ijk,jk->ij', mdp.P, pi)
        v = lmdps.linear_value_functional(p, q, u, mdp.discount)
        z = np.exp(v)
        Tz = lmdps.linear_bellman_operator(p, q, z, mdp.discount)
        dv = np.log(Tz) - np.log(z)
        vs.append(v)
        dvs.append(dv)

    dvs = np.vstack(dvs)
    vs = np.vstack(vs)
    normed_dvs = utils.normalize(dvs)

    plt.figure(figsize=(16, 16))
    plt.subplot(1, 2, 1)
    plt.title('Linearised Bellman operator')
    plt.quiver(vs[:, 0], vs[:, 1], normed_dvs[:, 0], normed_dvs[:, 1],
               np.linalg.norm(dvs, axis=1))

    # plot the Bellman operator field
    Vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)
    diff_op = lambda V: utils.bellman_optimality_operator(
        mdp.P, mdp.r, np.expand_dims(V, 1), mdp.discount) - np.expand_dims(V, 1)
    dVs = np.stack([np.max(diff_op(V), axis=1) for V in Vs])
    normed_dVs = utils.normalize(dVs)

    plt.subplot(1, 2, 2)
    plt.title('Bellman operator')
    plt.quiver(Vs[:, 0], Vs[:, 1], normed_dVs[:, 0], normed_dVs[:, 1],
               np.linalg.norm(dVs, axis=1))

    # plt.savefig('figs/LBO_BO.png')
    plt.show()
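# A minimal sketch of the per-state-action backup that
# utils.bellman_optimality_operator is assumed to return above (the max over
# actions is taken outside, in diff_op):
#   B(s, a) = r(s, a) + gamma * sum_{s'} P(s' | s, a) V(s')
# with P indexed as (next state, state, action), matching the einsum in
# lmdp_field. This is an assumption about the helper's exact form.
def bellman_backup_sketch(P, r, V, discount):
    return r + discount * np.einsum('ijk,i->jk', P, np.ravel(V))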
def compare_mdp_lmdp():
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)
    pis = utils.gen_grid_policies(7)
    vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    plt.figure(figsize=(16, 16))
    plt.scatter(vs[:, 0], vs[:, 1], s=10, alpha=0.75)

    # solve via LMDPs
    p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
    u, v = lmdps.lmdp_solver(p, q, mdp.discount)
    pi_u_star = lmdps.lmdp_decoder(u, mdp.P)
    pi_p = lmdps.lmdp_decoder(p, mdp.P)

    # solve the MDP
    init = np.random.standard_normal((n_states, n_actions))
    pi_star = utils.solve(search_spaces.policy_iteration(mdp), init)[-1]
    # pi_star = onehot(np.argmax(qs, axis=1), n_actions)

    # evaluate both policies
    v_star = utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount)
    v_u_star = utils.value_functional(mdp.P, mdp.r, pi_u_star, mdp.discount)
    v_p = utils.value_functional(mdp.P, mdp.r, pi_p, mdp.discount)

    plt.scatter(v_star[0, 0], v_star[1, 0], c='m', alpha=0.5, marker='x',
                label='mdp')
    plt.scatter(v_u_star[0, 0], v_u_star[1, 0], c='g', alpha=0.5, marker='x',
                label='lmdp')
    plt.scatter(v_p[0, 0], v_p[1, 0], c='k', marker='x', alpha=0.5, label='p')

    plt.legend()
    plt.show()
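# A hypothetical sketch of the decoding step assumed in lmdps.lmdp_decoder:
# find an action mixture per state whose induced state dynamics are close
# (in KL) to the LMDP's controlled dynamics u. The repo's actual decoder may
# work differently; the random-search approach below is illustrative only.
def lmdp_decoder_sketch(u, P, n_samples=1000, eps=1e-8):
    # P: (next state, state, action), u: (next state, state).
    n_states, _, n_actions = P.shape
    best_pi, best_kl = None, np.inf
    for _ in range(n_samples):
        pi = np.random.dirichlet(np.ones(n_actions), size=n_states)
        P_pi = np.einsum('ijk,jk->ij', P, pi)  # state dynamics under pi
        kl = np.sum(u * (np.log(u + eps) - np.log(P_pi + eps)))
        if kl < best_kl:
            best_pi, best_kl = pi, kl
    return best_pi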
def plot():
    n_states = 2
    n_actions = 2

    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    value = vmap(
        lambda pi: utils.value_functional(mdp.P, mdp.r, pi, mdp.discount))

    pis = np.stack(utils.gen_grid_policies(101), axis=0)
    vs = value(pis)
    plt.scatter(vs[:, 0], vs[:, 1], s=10)

    pis = np.stack(utils.get_deterministic_policies(2, 2), axis=0)
    vs = value(pis)
    plt.scatter(vs[:, 0], vs[:, 1], s=10, c='r')

    plt.xlabel('The value of state 1')
    plt.ylabel('The value of state 2')
    plt.title('The value polytope')
    plt.show()
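# For reference, a minimal sketch of the closed-form policy evaluation that
# utils.value_functional is assumed to compute (an assumption about its exact
# form), with P indexed as (next state, state, action):
#   V_pi = (I - gamma * P_pi^T)^{-1} r_pi
def value_functional_sketch(P, r, pi, discount):
    n_states = P.shape[0]
    P_pi = np.einsum('ijk,jk->ij', P, pi)  # (next state, state)
    r_pi = np.einsum('jk,jk->j', r, pi)    # expected reward per state
    return np.linalg.solve(np.eye(n_states) - discount * P_pi.T, r_pi)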
def emp_est_snr_map():
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = utils.gen_grid_policies(5)
    vals = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    vars = []
    hs = []
    for i, pi in enumerate(pis):
        print('\r{}'.format(i), end='', flush=True)
        vars.append(est_var_R(mdp, pi))
        hs.append(utils.entropy(pi))

    plt.subplot(2, 1, 1)
    plt.scatter(vals[:, 0], vals[:, 1], c=hs)
    plt.subplot(2, 1, 2)
    plt.scatter(vals[:, 0], vals[:, 1], c=vars)
    # plt.subplot(3, 1, 1)
    # plt.scatter(vals[:, 0], vals[:, 0], c=hs)
    plt.show()
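# A hypothetical sketch of the kind of estimator est_var_R is assumed to be:
# roll the policy out from the initial state distribution, accumulate
# discounted returns, and report their empirical variance. The name, signature
# and horizon below are illustrative, not the repo's actual API.
def est_var_R_sketch(mdp, pi, n_rollouts=100, horizon=50):
    returns = []
    for _ in range(n_rollouts):
        s = np.random.choice(mdp.S, p=np.ravel(mdp.d0))
        g, w = 0.0, 1.0
        for _ in range(horizon):
            a = np.random.choice(mdp.A, p=pi[s])
            g += w * mdp.r[s, a]
            w *= mdp.discount
            s = np.random.choice(mdp.S, p=mdp.P[:, s, a])  # P: (next, state, action)
        returns.append(g)
    return np.var(returns)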
def hyperbolic_polytope():
    # https://arxiv.org/abs/1902.06865
    n_states, n_actions = 2, 2
    N = 21
    pis = utils.gen_grid_policies(N)
    mdp = utils.build_random_mdp(n_states, n_actions, None)

    n = 10
    discounts = np.linspace(0.1, 1 - 1e-4, n)
    Vs = []
    for discount in discounts:
        Vs.append((1 - discount) * utils.polytope(mdp.P, mdp.r, discount, pis))
    h_V = sum(Vs) / n

    plt.subplot(2, 1, 1)
    plt.scatter(h_V[:, 0], h_V[:, 1])

    plt.subplot(2, 1, 2)
    V = (1 - 0.9) * utils.polytope(mdp.P, mdp.r, 0.9, pis)
    plt.scatter(V[:, 0], V[:, 1])

    plt.show()
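# The averaging above leans on the identity used in Fedus et al. (2019,
# https://arxiv.org/abs/1902.06865): integrating exponential discounts over
# gamma yields a hyperbolic discount, integral_0^1 gamma^t d gamma = 1 / (1 + t).
# So the mean over a grid of discounts is a Riemann-sum approximation of a
# hyperbolically discounted value (the (1 - discount) factor here additionally
# rescales each V to a per-step scale). A quick numerical check of the identity:
def check_hyperbolic_identity(t=5, n=10001):
    gammas = np.linspace(0.0, 1.0, n)
    return np.trapz(gammas ** t, gammas), 1.0 / (1.0 + t)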
        trajs.append(traj)
    return trajs


class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)


if __name__ == '__main__':
    rnd.seed(42)
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = utils.gen_grid_policies(4)

    use_momentum = False
    fname = 'test1.json'
    with open(fname, 'w') as f:
        for lr in np.logspace(-2, -1, 2):
            traj = value_iteration(mdp, pis, lr)
            data = {
                '{}-{}-{}'.format(value_iteration.__name__, lr, use_momentum):
                    [np.array(t).tolist() for t in traj]
            }
            s = json.dumps(data, cls=NumpyEncoder)
            f.write(s + '\n')
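# A small usage sketch for reading the log back (assumption: one JSON object
# per line, as written above; the loader name is hypothetical).
def load_trajs(fname):
    runs = {}
    with open(fname) as f:
        for line in f:
            runs.update(json.loads(line))
    return {k: [np.array(t) for t in v] for k, v in runs.items()}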
    plt.scatter(vs[0, 0], vs[0, 1], c=c, s=30, label='{}'.format(n - 2))
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n - 2), cmap='viridis', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='m', marker='x')

    for i in range(len(vs) - 2):
        dv = 0.1 * (vs[i + 1, :] - vs[i, :])
        plt.arrow(vs[i, 0], vs[i, 1], dv[0], dv[1], color=c, alpha=0.5,
                  width=0.005)


if __name__ == '__main__':
    # rnd.seed(42)
    print('start')
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

    print('\nBuilding polytope')
    pis = np.stack(utils.gen_grid_policies(41))
    vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    plt.figure(figsize=(16, 16))
    plt.scatter(vs[:, 0], vs[:, 1], s=10, alpha=0.75)

    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
    for i, c in zip(range(4), colors):
        print('\nRunning experiment {}'.format(i))
        # generate_vi(mdp, c)
        generate_pg(mdp, c)
        # generate_pi(mdp, c)

    plt.legend()
    plt.colorbar()
    plt.show()