def graph_PI():
    n_states = 10
    n_actions = 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    print('n pis: {}'.format(len(det_pis)))
    mdp = utils.build_random_sparse_mdp(n_states, n_actions, 0.5)

    A = graph.mdp_topology(det_pis)
    G = nx.from_numpy_array(A)
    pos = nx.spring_layout(G, iterations=200)

    basis = graph.construct_mdp_basis(det_pis, mdp)

    init_pi = utils.softmax(np.random.standard_normal((n_states, n_actions)))
    init_v = utils.value_functional(mdp.P, mdp.r, init_pi, mdp.discount).squeeze()
    a = graph.sparse_coeffs(basis, init_v, lr=0.1)

    pis = utils.solve(search_spaces.policy_iteration(mdp), init_pi)
    print("\n{} policies to vis".format(len(pis)))

    for i, pi in enumerate(pis[:-1]):
        print('Iteration: {}'.format(i))
        v = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()
        a = graph.sparse_coeffs(basis, v, lr=0.1, a_init=a)

        plt.figure(figsize=(16, 16))
        nx.draw(G, pos, node_color=a, node_size=150)
        # plt.show()
        plt.savefig('figs/pi_graphs/{}.png'.format(i))
        plt.close()
def graph_PG():
    # ffmpeg -framerate 10 -start_number 0 -i %d.png -c:v libx264 -r 30 -pix_fmt yuv420p out.mp4
    n_states = 6
    n_actions = 4

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    print('n pis: {}'.format(len(det_pis)))
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)

    A = graph.mdp_topology(det_pis)
    G = nx.from_numpy_array(A)
    pos = nx.spring_layout(G, iterations=200)

    basis = graph.construct_mdp_basis(det_pis, mdp)

    init_logits = np.random.standard_normal((n_states, n_actions))
    init_v = utils.value_functional(mdp.P, mdp.r, utils.softmax(init_logits), mdp.discount).squeeze()
    a = graph.sparse_coeffs(basis, init_v, lr=0.1)

    print('\nSolving PG')
    pis = utils.solve(search_spaces.policy_gradient_iteration_logits(mdp, 0.1), init_logits)
    print("\n{} policies to vis".format(len(pis)))
    n = len(pis)
    # pis = pis[::n//100]
    pis = pis[0:20]

    for i, logits in enumerate(pis[:-1]):
        print('Iteration: {}'.format(i))
        # the solver returns a trajectory of logits, so convert to a policy
        # with a softmax before evaluating (as in generate_pg below).
        v = utils.value_functional(mdp.P, mdp.r, utils.softmax(logits), mdp.discount).squeeze()
        a = graph.sparse_coeffs(basis, v, lr=0.1, a_init=a)

        plt.figure()
        nx.draw(G, pos, node_color=a)
        # plt.show()
        plt.savefig('figs/pg_graphs/{}.png'.format(i))
        plt.close()
def generate_model_cs():
    """
    Compare using all deterministic policies versus fewer mixed policies.
    Starts to get interesting in higher dims?
    """
    n_states = 32
    n_actions = 2
    lr = 0.01
    k = 64

    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    init = rnd.standard_normal((mdp.S * mdp.S * mdp.A + mdp.S * mdp.A))
    pi_star = utils.solve(policy_iteration(mdp), utils.softmax(rnd.standard_normal((mdp.S, mdp.A))))[-1]
    print('pi_star\n', pi_star)

    # adversarial pis
    # apis = utils.get_deterministic_policies(mdp.S, mdp.A)
    apis = np.stack([utils.random_det_policy(mdp.S, mdp.A) for _ in range(k)])

    update_fn = model_iteration(mdp, lr, apis)
    params = utils.solve(update_fn, init)
    p_logits, r = parse_model_params(mdp.S, mdp.A, params[-1])
    error = np.mean((utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount) -
                     utils.value_functional(utils.softmax(p_logits), r, pi_star, mdp.discount))**2)
    print('\n', error)

    new_mdp = utils.MDP(mdp.S, mdp.A, utils.softmax(p_logits), r, mdp.discount, mdp.d0)
    pi_star = utils.solve(policy_iteration(new_mdp), utils.softmax(rnd.standard_normal((mdp.S, mdp.A))))[-1]
    print(pi_star)

    apis = np.stack([utils.random_policy(mdp.S, mdp.A) for _ in range(k)])

    update_fn = model_iteration(mdp, lr, apis)
    params = utils.solve(update_fn, init)
    p_logits, r = parse_model_params(mdp.S, mdp.A, params[-1])
    error = np.mean((utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount) -
                     utils.value_functional(utils.softmax(p_logits), r, pi_star, mdp.discount))**2)
    print('\n', error)

    new_mdp = utils.MDP(mdp.S, mdp.A, utils.softmax(p_logits), r, mdp.discount, mdp.d0)
    pi_star = utils.solve(policy_iteration(new_mdp), utils.softmax(rnd.standard_normal((mdp.S, mdp.A))))[-1]
    print(pi_star)
def value_graph():
    # vs = [np.sum(utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()**2) for pi in det_pis]
    # plt.figure(figsize=(16,16))
    # nx.draw(G, pos, node_color=vs, node_size=150)
    # plt.savefig('figs/pi_graphs/val.png')
    # plt.close()
    n_states = 10
    n_actions = 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    n = len(det_pis)
    print('n pis: {}'.format(n))
    # how does the discount affect these!?
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

    values = [utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze() for pi in det_pis]
    vs = np.stack(values).reshape((n, n_states))
    # weight each edge by the inverse distance between the values of neighbouring policies
    W = 1/(np.abs(np.sum(vs[None, :, :] - vs[:, None, :], axis=-1)) + 1e-8)

    A = graph.mdp_topology(det_pis)
    adj = A*W
    G = nx.from_numpy_array(adj)
    pos = nx.spring_layout(G, iterations=200)

    plt.figure(figsize=(16,16))
    nx.draw(G, pos, node_color=[np.sum(v) for v in values], node_size=150)
    plt.savefig('figs/value_graphs/value_graph-{}-{}.png'.format(n_states, n_actions))
    plt.close()
def generate_iteration_figures(mdp, pis, iteration_fn, name):
    """
    How many steps does it take to converge to the optimum from different starting points?
    """
    # NOTE: a variant of this function that sweeps a batch of mdps (rather than
    # learning rates) is defined below.
    n = 3
    lrs = np.linspace(0.0001, 0.1, n**2)  # 0.5 - 0.00195...

    plt.figure(figsize=(16, 16))
    value = vmap(lambda pi: utils.value_functional(mdp.P, mdp.r, pi, mdp.discount))
    Vs = value(np.stack(pis))[:, :, 0]

    # pool = multiprocessing.Pool(n**2)
    # # couldnt serialise the mdp collection. so just unwrap them here.
    # lens_n_pi_stars = pool.map(iteration_fn, [(mdp.P, mdp.r, mdp.discount, mdp.d0, pis, lr) for lr in lrs])
    # for i, lr, results in zip(range(n**2), lrs, lens_n_pi_stars):
    #     lens, pi_stars = results
    for i, lr in enumerate(lrs):
        print('\n{}: {}\n'.format(i, lr))
        lens, pi_stars = iteration_fn((mdp.P, mdp.r, mdp.discount, mdp.d0, pis, lr))

        plt.subplot(n, n, i + 1)
        plt.title('Learning rate: {}'.format(lr))
        fig = plt.scatter(Vs[:, 0], Vs[:, 1], c=lens, s=5)
        fig.axes.get_xaxis().set_visible(False)
        fig.axes.get_yaxis().set_visible(False)

    plt.tight_layout()
    plt.savefig('figs/iteration-lrs/0-{}.png'.format(name))
def model_iteration(mdp, lr, pis):
    # gradient-descent updates on the model parameters (transition logits and rewards)
    # so that the learned model reproduces the true values of the policies in `pis`.
    V_true = vmap(lambda pi: utils.value_functional(mdp.P, mdp.r, pi, mdp.discount))
    V_guess = vmap(lambda P, r, pi: utils.value_functional(P, r, pi, mdp.discount),
                   in_axes=(None, None, 0))

    def loss_fn(params):
        p_logits, r = parse_model_params(mdp.S, mdp.A, params)
        return np.sum((V_true(pis) - V_guess(utils.softmax(p_logits), r, pis))**2)

    dLdp = grad(loss_fn)

    @jit
    def update_fn(params):
        return params - lr*utils.clip_by_norm(dLdp(params), 100)

    return update_fn
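# A minimal usage sketch for `model_iteration` (my own example, mirroring
# `generate_model_cs` above): fit a flat parameter vector of transition logits
# and rewards so the learned model matches the values of a batch of policies.
def example_model_iteration():
    mdp = utils.build_random_mdp(4, 2, 0.9)
    pis = np.stack([utils.random_policy(mdp.S, mdp.A) for _ in range(8)])
    init = rnd.standard_normal((mdp.S * mdp.S * mdp.A + mdp.S * mdp.A))
    params = utils.solve(model_iteration(mdp, 0.01, pis), init)[-1]
    p_logits, r = parse_model_params(mdp.S, mdp.A, params)
    # value-equivalent models need not recover P exactly; this is just a rough check
    print('transition error:', np.mean((utils.softmax(p_logits) - mdp.P)**2))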
def Q(init, M, f):
    # solve
    V_init = utils.value_functional(M.P, M.r, init, M.discount)
    Q_init = utils.bellman_operator(M.P, M.r, V_init, M.discount)
    Q_star = utils.solve(ss.q_learning(M, 0.01), Q_init)[-1]
    # lift
    return np.dot(f.T, np.max(Q_star, axis=1, keepdims=True))
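# `Q` above (and `VI` below) share a solve-then-lift pattern: solve for values
# in the possibly abstracted MDP `M`, then lift them back to the ground state
# space with the abstraction matrix `f`. A toy sketch of the lift step, under
# my assumption that `f` is an |abstract| x |ground| partition matrix:
def example_lift():
    f = np.array([[1., 1., 0.],   # ground states 0, 1 -> abstract state 0
                  [0., 0., 1.]])  # ground state 2 -> abstract state 1
    V_abstract = np.array([[0.5], [2.0]])
    V_ground = np.dot(f.T, V_abstract)  # each ground state inherits its cluster's value
    print(V_ground)  # [[0.5], [0.5], [2.0]]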
def onoffpolicy_abstraction(mdp, pis):
    tol = 0.01
    init = np.random.random((mdp.S, mdp.A))
    init = init / np.sum(init, axis=1, keepdims=True)

    # ### all policy abstraction
    # # n x |S| x |A|
    # Qs = np.stack([utils.bellman_operator(mdp.P, mdp.r, utils.value_functional(mdp.P, mdp.r, pi, mdp.discount), mdp.discount) for pi in pis], axis=0)
    # similar_states = np.sum(np.sum(np.abs(Qs[:, :, None, :] - Qs[:, None, :, :]), axis=3), axis=0)  # |S| x |S|
    # all_idx, all_abstracted_mdp, all_f = abs.build_state_abstraction(similar_states, mdp)

    ### optimal policy abstraction
    pi_star = utils.solve(ss.policy_iteration(mdp), np.log(init))[-1]
    Q_star = utils.bellman_operator(mdp.P, mdp.r, utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount), mdp.discount)
    # similar_states = np.sum(np.abs(Q_star[:, None, :] - Q_star[None, :, :]), axis=-1)  # |S| x |S|. preserves optimal policy's value (for all actions)
    # similar_states = np.abs(np.max(Q_star[:, None, :], axis=-1) - np.max(Q_star[None, :, :], axis=-1))  # |S| x |S|. preserves optimal action's value

    V = utils.value_functional(mdp.P, mdp.r, init, mdp.discount)
    similar_states = np.abs(V[None, :, :] - V[:, None, :])[:, :, 0]
    optimal_idx, optimal_abstracted_mdp, optimal_f = abs.build_state_abstraction(similar_states, mdp, tol)

    mdps = [mdp, optimal_abstracted_mdp]
    names = ['ground', 'optimal_abstracted_mdp']
    solvers = [abs.Q, abs.SARSA, abs.VI]
    lifts = [np.eye(mdp.S), optimal_f]
    idxs = [range(mdp.S), optimal_idx]

    # if all_f.shape[0] == optimal_f.shape[0]:
    #     raise ValueError('Abstractions are the same so we probably wont see any difference...')

    print('\nAbstraction:', optimal_f.shape)
    truth = abs.PI(init, mdp, np.eye(mdp.S))

    results = []
    for n, M, idx, f in zip(names, mdps, idxs, lifts):
        for solve in solvers:
            err = np.max(np.abs(truth - solve(init[idx, :], M, f)))
            results.append((n, solve.__name__, err))
    return results
def generate_vi(mdp, c, lr=0.1):
    init_pi = utils.random_policy(mdp.S, mdp.A)
    init_v = utils.value_functional(mdp.P, mdp.r, init_pi, mdp.discount)
    vs = np.stack(utils.solve(ss.value_iteration(mdp, lr), init_v))[:, :, 0]
    n = vs.shape[0]
    plt.scatter(vs[0, 0], vs[0, 1], c=c, s=30, label='{}'.format(n))
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n-2), cmap='viridis', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='m', marker='x')
def VI(init, M, f):
    # solve
    V_init = utils.value_functional(M.P, M.r, init, M.discount)
    V_star = utils.solve(ss.value_iteration(M, 0.01), V_init)[-1]
    # lift
    return np.dot(f.T, V_star)
def generate_pg(mdp, c, lr=0.01):
    init_pi = utils.random_policy(mdp.S, mdp.A)
    init_logit = np.log(init_pi)
    logits = utils.solve(ss.policy_gradient_iteration_logits(mdp, lr), init_logit)
    vs = np.stack([utils.value_functional(mdp.P, mdp.r, utils.softmax(logit), mdp.discount) for logit in logits])[:, :, 0]
    n = vs.shape[0]
    plt.scatter(vs[0, 0], vs[0, 1], c=c, s=30, label='{}'.format(n))
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n-2), cmap='viridis', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='m', marker='x')
def value_iteration(mdp, pis, lr):
    # returns the full value-iteration trajectory from each starting policy;
    # the two-argument variant below returns only the lengths and fixed points.
    trajs = []
    for pi in pis:
        init_V = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount)
        traj = utils.solve(ss.value_iteration(mdp, lr), init_V)
        v_star = traj[-1]
        trajs.append(traj)
    return trajs
def compare_mdp_lmdp():
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)

    pis = utils.gen_grid_policies(7)
    vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)
    plt.figure(figsize=(16, 16))
    plt.scatter(vs[:, 0], vs[:, 1], s=10, alpha=0.75)

    # solve via LMDPs
    p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
    u, v = lmdps.lmdp_solver(p, q, mdp.discount)
    pi_u_star = lmdps.lmdp_decoder(u, mdp.P)
    pi_p = lmdps.lmdp_decoder(p, mdp.P)

    # solve the MDP
    init = np.random.standard_normal((n_states, n_actions))
    pi_star = utils.solve(search_spaces.policy_iteration(mdp), init)[-1]
    # pi_star = onehot(np.argmax(qs, axis=1), n_actions)

    # evaluate both policies.
    v_star = utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount)
    v_u_star = utils.value_functional(mdp.P, mdp.r, pi_u_star, mdp.discount)
    v_p = utils.value_functional(mdp.P, mdp.r, pi_p, mdp.discount)

    plt.scatter(v_star[0, 0], v_star[1, 0], c='m', alpha=0.5, marker='x', label='mdp')
    plt.scatter(v_u_star[0, 0], v_u_star[1, 0], c='g', alpha=0.5, marker='x', label='lmdp')
    plt.scatter(v_p[0, 0], v_p[1, 0], c='k', marker='x', alpha=0.5, label='p')

    plt.legend()
    plt.show()
def generate_pi(mdp, c):
    init_pi = utils.random_policy(mdp.S, mdp.A)
    pis = utils.solve(ss.policy_iteration(mdp), init_pi)
    vs = np.stack([utils.value_functional(mdp.P, mdp.r, pi, mdp.discount) for pi in pis])[:, :, 0]
    n = vs.shape[0]
    plt.scatter(vs[0, 0], vs[0, 1], c=c, s=30, label='{}'.format(n-2))
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n-2), cmap='viridis', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='m', marker='x')

    # draw short arrows showing the direction of each policy-iteration step
    for i in range(len(vs)-2):
        dv = 0.1*(vs[i+1, :] - vs[i, :])
        plt.arrow(vs[i, 0], vs[i, 1], dv[0], dv[1], color=c, alpha=0.5, width=0.005)
def value_iteration(mdp, pis):
    lens, pi_stars = [], []
    for pi in pis:
        init_V = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount)
        pi_traj = utils.solve(ss.value_iteration(mdp, 0.01), init_V)
        pi_star = pi_traj[-1]
        pi_stars.append(pi_star)
        lens.append(len(pi_traj))
    return lens, pi_stars
def generate_model_iteration():
    n_states, n_actions = 2, 2

    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = utils.gen_grid_policies(7)

    # needs its own init. alternatively, could find an init that matches the value of the other inits?!?
    init = rnd.standard_normal((mdp.S * mdp.S * mdp.A + mdp.S * mdp.A))

    vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)
    plt.figure(figsize=(16, 16))
    plt.scatter(vs[:, 0], vs[:, 1], c='b', s=10, alpha=0.75)

    lr = 0.01
    pi_star = utils.solve(policy_iteration(mdp), utils.softmax(rnd.standard_normal((mdp.S, mdp.A))))[-1]

    # adversarial pis
    apis = utils.get_deterministic_policies(mdp.S, mdp.A)
    apis = np.stack(apis)

    update_fn = model_iteration(mdp, lr, apis)
    params = utils.solve(update_fn, init)
    params = [parse_model_params(mdp.S, mdp.A, p) for p in params]

    vs = np.vstack([utils.value_functional(utils.softmax(p_logits), r, pi_star, mdp.discount).T
                    for p_logits, r in params])
    n = vs.shape[0]
    plt.scatter(vs[0, 0], vs[0, 1], c='g', label='PG')
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n - 2), cmap='spring', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='g', marker='x')

    p_logits, r = params[-1]
    vs = utils.polytope(utils.softmax(p_logits), r, mdp.discount, pis)
    plt.scatter(vs[:, 0], vs[:, 1], c='r', s=10, alpha=0.75)

    plt.title('Model iteration')
    plt.xlabel('Value of state 1')
    plt.ylabel('Value of state 2')
    # plt.show()
    plt.savefig('figs/model_iteration_1.png')

    learned_mdp = utils.MDP(mdp.S, mdp.A, utils.softmax(p_logits), r, mdp.discount, mdp.d0)
    pi_star_approx = utils.solve(policy_iteration(learned_mdp), utils.softmax(rnd.standard_normal((mdp.S, mdp.A))))[-1]
    print(pi_star_approx, '\n', pi_star)
def compare_acc():
    n_states, n_actions = 2, 2

    lmdp = []
    lmdp_rnd = []
    for _ in range(10):
        mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

        # solve via LMDPs
        p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
        u, v = lmdps.lmdp_solver(p, q, mdp.discount)
        pi_u_star = lmdps.lmdp_decoder(u, mdp.P)

        # solve the MDP
        init = np.random.standard_normal((n_states, n_actions))
        pi_star = utils.solve(search_spaces.policy_iteration(mdp), init)[-1]

        # solve via LMDPs, with p set to the random (uniform) dynamics
        p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
        p = np.einsum('ijk,jk->ij', mdp.P, np.ones((n_states, n_actions)) / n_actions)
        # q = np.max(mdp.r, axis=1, keepdims=True)
        u, v = lmdps.lmdp_solver(p, q, mdp.discount)
        pi_u_star_random = lmdps.lmdp_decoder(u, mdp.P)

        # evaluate the policies.
        v_star = utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount)
        v_u_star = utils.value_functional(mdp.P, mdp.r, pi_u_star, mdp.discount)
        v_u_star_random = utils.value_functional(mdp.P, mdp.r, pi_u_star_random, mdp.discount)

        lmdp.append(np.isclose(v_star, v_u_star, 1e-3).all())
        lmdp_rnd.append(np.isclose(v_star, v_u_star_random, 1e-3).all())

    print([np.sum(lmdp), np.sum(lmdp_rnd)])
    plt.bar(range(2), [np.sum(lmdp), np.sum(lmdp_rnd)])
    plt.show()
def test_sparse_estimation():
    n_states = 5
    n_actions = 2

    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)
    det_pis = utils.get_deterministic_policies(mdp.S, mdp.A)
    basis = graph.construct_mdp_basis(det_pis, mdp)

    v = utils.value_functional(mdp.P, mdp.r, det_pis[2], mdp.discount).squeeze()
    a = graph.sparse_coeffs(basis, v)
    print(a)
def thompson(mdp, lr):
    """
    Can we do TD with values from different MDPs?
    """
    # NOTE: relies on `pis`, `mdp_sampler`, `symmetric_sampler` and `mse`
    # being available in the enclosing scope.
    V_true = vmap(lambda pi: utils.value_functional(mdp.P, mdp.r, pi, mdp.discount))
    V_guess = vmap(lambda params, pi: utils.value_functional(*mdp_sampler(params), pi, mdp.discount),
                   in_axes=(None, 0))
    dLdp = grad(lambda params: mse(V_true(pis), V_guess(params, pis)))

    @jit
    def update_fn(params, Q):
        m = symmetric_sampler(params)
        Q_ = utils.bellman_optimality_operator(m.P, m.r, Q, m.discount)
        params_tp1 = params - lr * dLdp(params)  # done based on observations... could use model iteration!?
        Q_tp1 = Q + lr * (Q_ - Q)
        return Q_tp1, params_tp1

    return update_fn
def mom_value_iteration(mdp, pis):
    lens, pi_stars = [], []
    for pi in pis:
        init_V = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount)
        pi_traj = utils.solve(
            ss.momentum_bundler(ss.value_iteration(mdp, 0.01), 0.9),
            (init_V, np.zeros_like(init_V)))
        pi_star, _ = pi_traj[-1]
        pi_stars.append(pi_star)
        lens.append(len(pi_traj))
    return lens, pi_stars
def mom_param_value_iteration(mdp, pis):
    lens, pi_stars = [], []
    core_init = ss.random_parameterised_matrix(2, 2, 32, 4)
    for pi in pis:
        init_V = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount)
        core_init = ss.approximate(init_V, core_init)
        params = utils.solve(
            ss.momentum_bundler(ss.parameterised_value_iteration(mdp, 0.01/len(core_init)), 0.8),
            (core_init, [np.zeros_like(c) for c in core_init]))
        pi_star, _ = params[-1]
        pi_stars.append(pi_star)
        lens.append(len(params))
    return lens, pi_stars
def test_everything():
    n_states = 5
    n_actions = 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)
    A = graph.mdp_topology(det_pis)
    basis = graph.construct_mdp_basis(det_pis, mdp)

    # v = np.random.random((n_states, ))
    v = utils.value_functional(mdp.P, mdp.r, det_pis[2], mdp.discount).squeeze()
    a = graph.sparse_coeffs(basis, v)

    G = nx.from_numpy_array(A)
    pos = nx.spring_layout(G, iterations=200)
    nx.draw(G, pos, node_color=a)
    plt.show()
def param_value_iteration(mdp, pis):
    # hypothesis: we are going to see some weirdness in the momentum partitions.
    # oscillations will depend on the shape of the polytope?!?
    lens, pi_stars = [], []
    core_init = ss.random_parameterised_matrix(2, 2, 32, 4)
    for pi in pis:
        init_V = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount)
        core_init = ss.approximate(init_V, core_init)
        params = utils.solve(
            ss.parameterised_value_iteration(mdp, 0.01/len(core_init)),
            core_init)
        pi_star = params[-1]
        pi_stars.append(pi_star)
        lens.append(len(params))
    return lens, pi_stars
def policy_gradient_iteration_logits(mdp, lr):
    # NOTE: this doesn't seem to behave nicely in larger state spaces!?
    # policy-gradient identity: d/dlogits E_pi[V] = E[V . d/dlogits log pi]
    # dlogpi_dlogit = jacrev(lambda logits: np.log(utils.softmax(logits)+1e-8))
    dHdlogit = grad(lambda logits: utils.entropy(utils.softmax(logits)))
    dVdlogit = grad(lambda logits: np.sum(utils.value_functional(mdp.P, mdp.r, utils.softmax(logits), mdp.discount)))

    @jit
    def update_fn(logits):
        # NOTE this is actually soft A2C.
        # V = utils.value_functional(mdp.P, mdp.r, utils.softmax(logits), mdp.discount)
        # Q = utils.bellman_operator(mdp.P, mdp.r, V, mdp.discount)
        # A = Q-V
        # g = np.einsum('ijkl,ij->kl', dlogpi_dlogit(logits), A)
        g = dVdlogit(logits)
        return logits + lr * utils.clip_by_norm(g, 500) + 1e-8*dHdlogit(logits)

    return update_fn
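# Usage sketch (my own example; `generate_pg` elsewhere in this file does the
# same with plotting): run the entropy-regularised policy-gradient updates from
# random logits and read off the final policy.
def example_policy_gradient():
    mdp = utils.build_random_mdp(2, 2, 0.9)
    init_logits = np.random.standard_normal((mdp.S, mdp.A))
    logits = utils.solve(policy_gradient_iteration_logits(mdp, 0.01), init_logits)
    print('final policy:\n', utils.softmax(logits[-1]))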
def value_graph_laplacian():
    n_states = 8
    n_actions = 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    n = len(det_pis)
    print('n pis: {}'.format(n))
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

    values = [utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze() for pi in det_pis]
    Vs = np.stack(values).reshape((n, n_states))

    A = graph.mdp_topology(det_pis)
    W = 1/(np.abs(np.sum(Vs[None, :, :] - Vs[:, None, :], axis=-1)) + 1e-8)
    adj = A*W
    G = nx.from_numpy_array(adj)
    pos = nx.spring_layout(G, iterations=200)

    plt.figure(figsize=(16,16))
    nx.draw(G, pos, node_color=[np.sum(v) for v in values], node_size=150)
    plt.savefig('figs/value_graphs/value_graph-{}-{}.png'.format(n_states, n_actions))
    plt.close()

    # how can you calculate the expected eigenvalues!?
    # observation: the underlying complexity of the value topology is linear!?!?
    # how hard is it to estimate the main eigenvector from noisy observations!?
    # that would tell us the complexity!?!?
    for i, alpha in enumerate(np.linspace(0, 1, 10)):
        us = []
        for _ in range(50):
            vs = Vs + alpha*np.random.standard_normal(Vs.shape)
            W = 1/(np.abs(np.sum(vs[None, :, :] - vs[:, None, :], axis=-1)) + 1e-8)
            adj = A*W
            u, v = graph_laplacian_spectra(adj)
            us.append(u)

        us = np.stack(us, axis=0)
        mean = np.mean(us, axis=0)
        var = np.var(us, axis=0)

        plt.bar(range(len(mean)), mean, yerr=np.sqrt(var))
        plt.savefig('figs/value_graphs/{}-lap.png'.format(i))
        plt.close()
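# `graph_laplacian_spectra` (used above and below) is assumed to return the
# eigenvalues and eigenvectors of the graph Laplacian of a weighted adjacency
# matrix. A minimal sketch of what it might look like, assuming the
# unnormalised Laplacian L = D - W:
def graph_laplacian_spectra_sketch(adj):
    L = np.diag(np.sum(adj, axis=1)) - adj  # degree matrix minus adjacency
    u, v = np.linalg.eigh(L)                # L is symmetric; eigenvalues ascending
    return u, v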
def plot():
    n_states = 2
    n_actions = 2

    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    value = vmap(lambda pi: utils.value_functional(mdp.P, mdp.r, pi, mdp.discount))

    pis = np.stack(utils.gen_grid_policies(101), axis=0)
    vs = value(pis)
    plt.scatter(vs[:, 0], vs[:, 1], s=10)

    pis = np.stack(utils.get_deterministic_policies(2, 2), axis=0)
    vs = value(pis)
    plt.scatter(vs[:, 0], vs[:, 1], s=10, c='r')

    plt.xlabel('The value of state 1')
    plt.ylabel('The value of state 2')
    plt.title('The value polytope')
    plt.show()
def generate_iteration_figures(mdps, pis, iteration_fn, name):
    """
    How many steps does it take to converge to the optimum from different starting points?
    """
    n = int(np.sqrt(len(mdps)))  # plot the mdps on an n x n grid

    plt.figure(figsize=(16, 16))
    for i, mdp in enumerate(mdps):
        print(i)
        Vs = np.hstack([utils.value_functional(mdp.P, mdp.r, pi, mdp.discount) for pi in pis])
        lens, pi_stars = iteration_fn(mdp, pis)

        plt.subplot(n, n, i + 1)
        fig = plt.scatter(Vs[0, :], Vs[1, :], c=lens, s=5)
        fig.axes.get_xaxis().set_visible(False)
        fig.axes.get_yaxis().set_visible(False)

    plt.tight_layout()
    plt.savefig('figs/iterations/{}.png'.format(name))
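# Usage sketch (my own example): sweep a grid of random MDPs and colour each
# value polytope by how many value-iteration steps each start point needs.
# Assumes the two-argument `value_iteration(mdp, pis)` defined above, whose
# `(lens, pi_stars)` return matches the signature this variant expects.
def example_iteration_figures():
    mdps = [utils.build_random_mdp(2, 2, 0.5) for _ in range(9)]
    pis = utils.gen_grid_policies(11)
    generate_iteration_figures(mdps, pis, value_iteration, 'vi')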
def value_graph_laplacians():
    n_states = 8
    n_actions = 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    N = len(det_pis)
    print('n pis: {}'.format(N))

    for i in range(1):
        mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
        values = [utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze() for pi in det_pis]
        Vs = np.stack(values).reshape((N, n_states))

        A = graph.mdp_topology(det_pis)
        W = np.exp(-np.linalg.norm(Vs[None, :, :] - Vs[:, None, :], ord=np.inf, axis=-1)+1e-8)
        # mVs = np.mean(Vs, axis=0)  # n_states
        # W = np.dot((Vs - mVs), (Vs - mVs).T)
        adj = W * A
        G = nx.from_numpy_array(adj)
        pos = nx.spectral_layout(G)  # , iterations=500)

        plt.figure(figsize=(16,16))
        nx.draw(G, pos, node_color=[np.sum(v) for v in values], node_size=150)
        plt.savefig('figs/value_graphs/{}-value_graph-{}-{}.png'.format(i, n_states, n_actions))
        plt.close()

        u, v = graph_laplacian_spectra(adj)

        plt.figure(figsize=(8,8))
        plt.bar(range(len(u)), u)
        plt.savefig('figs/value_graphs/{}-lap.png'.format(i))
        plt.close()

        plt.figure(figsize=(16,16))
        n = 5
        for j in range(n*n):
            plt.subplot(n, n, j+1)
            nx.draw(G, pos, node_color=u[10*j] * v[10*j], node_size=150)
        plt.savefig('figs/value_graphs/{}-spectra.png'.format(i))
        plt.close()
def find_symmetric_mdp(n_states, n_actions, discount, lr=1e-2):
    """
    Approximately find an MDP with ??? symmetry.
    """
    model_init = rnd.standard_normal(n_states * n_states * n_actions + n_states * n_actions)
    pis = utils.get_deterministic_policies(n_states, n_actions)
    # pis = [utils.random_policy(n_states, n_actions) for _ in range(100)]
    pis = np.stack(pis)
    # print(pis.shape)

    V = vmap(lambda P, r, pi: utils.value_functional(P, r, pi, discount), in_axes=(None, None, 0))

    def loss_fn(model_params):
        # policy symmetry: flipping the action labels should not change the values
        P, r = ss.parse_model_params(n_states, n_actions, model_params)
        return np.sum(np.square(V(utils.softmax(P), r, pis) - V(utils.softmax(P), r, np.flip(pis, 1))))

    # def loss_fn(model_params):
    #     # value symmetry
    #     P, r = ss.parse_model_params(n_states, n_actions, model_params)
    #     vals = V(utils.softmax(P), r, pis)
    #     n = n_states//2
    #     return np.sum(np.square(vals[:, :n] - vals[:, n:]))

    dldp = grad(loss_fn)
    update_fn = lambda model: model - lr * dldp(model)

    init = (model_init, np.zeros_like(model_init))
    model_params, momentum_var = utils.solve(ss.momentum_bundler(update_fn, 0.9), init)[-1]
    P, r = ss.parse_model_params(n_states, n_actions, model_params)
    d0 = rnd.random((n_states, 1))
    # `parse_model_params` returns transition logits, so normalise with a softmax
    # before building the MDP (matching how `loss_fn` evaluates values above).
    return utils.MDP(n_states, n_actions, utils.softmax(P), r, discount, d0)
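# Usage sketch (my own example): search for a small MDP whose values are
# approximately invariant to flipping the action labels, then spot-check the
# symmetry on a random policy. Assumes the returned transition tensor is
# normalised (see the softmax note above).
def example_symmetric_mdp():
    mdp = find_symmetric_mdp(4, 2, 0.9)
    pi = utils.random_policy(mdp.S, mdp.A)
    v = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount)
    v_flipped = utils.value_functional(mdp.P, mdp.r, np.flip(pi, 1), mdp.discount)
    print(np.max(np.abs(v - v_flipped)))  # small if a symmetric MDP was found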
def variance(P, r, pi, discount):
    # Var = E_{a ~ pi(.|s), s' ~ P(.|s,a)}[(r(s,a) + gamma * V(s') - V(s))^2]
    V = utils.value_functional(P, r, pi, discount)[:, 0]
    d = (r[None, :, :] + discount * V[:, None, None] - V[None, :, None])**2
    expected_d = np.einsum('ijk,ijk->j', P * pi[None, :, :], d)  # per-state variance of the Bellman residual
    return np.sum(expected_d)
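# Usage sketch (my own example): evaluate the one-step Bellman-residual
# variance of a random policy on a random MDP.
def example_variance():
    mdp = utils.build_random_mdp(4, 2, 0.9)
    pi = utils.random_policy(mdp.S, mdp.A)
    print('variance:', variance(mdp.P, mdp.r, pi, mdp.discount))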