def graph_PI():
    """Animate a policy-iteration trajectory on the deterministic-policy graph.

    One frame per PI step is written to figs/pi_graphs/<i>.png; each node (a
    deterministic policy) is coloured by its sparse-basis coefficient.
    """
    n_states, n_actions = 10, 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    print('n pis: {}'.format(len(det_pis)))
    mdp = utils.build_random_sparse_mdp(n_states, n_actions, 0.5)

    # Fixed layout of the policy-adjacency graph, reused for every frame.
    adjacency = graph.mdp_topology(det_pis)
    policy_graph = nx.from_numpy_array(adjacency)
    layout = nx.spring_layout(policy_graph, iterations=200)

    basis = graph.construct_mdp_basis(det_pis, mdp)

    # Seed the coefficient estimate from a random stochastic policy's value.
    init_pi = utils.softmax(np.random.standard_normal((n_states, n_actions)))
    init_v = utils.value_functional(mdp.P, mdp.r, init_pi, mdp.discount).squeeze()
    a = graph.sparse_coeffs(basis, init_v, lr=0.1)

    pis = utils.solve(search_spaces.policy_iteration(mdp), init_pi)
    print("\n{} policies to vis".format(len(pis)))

    for i, pi in enumerate(pis[:-1]):
        print('Iteration: {}'.format(i))
        v = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()
        # Warm-start each fit from the previous frame's coefficients.
        a = graph.sparse_coeffs(basis, v, lr=0.1, a_init=a)
        plt.figure(figsize=(16, 16))
        nx.draw(policy_graph, layout, node_color=a, node_size=150)
        plt.savefig('figs/pi_graphs/{}.png'.format(i))
        plt.close()
def value_graph():
    """Draw the deterministic-policy graph weighted by value similarity.

    Edge weights are inverse summed value differences between adjacent
    policies; node colour is the policy's total value. Saves the figure to
    figs/value_graphs/.
    """
    n_states, n_actions = 10, 2
    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    n = len(det_pis)
    print('n pis: {}'.format(n))

    # how does discount effect these!?
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    values = [utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()
              for pi in det_pis]
    vs = np.stack(values).reshape((n, n_states))

    # Pairwise inverse value gaps; the epsilon keeps equal values finite.
    W = 1 / (np.abs(np.sum(vs[None, :, :] - vs[:, None, :], axis=-1)) + 1e-8)
    A = graph.mdp_topology(det_pis)
    weighted = A * W

    value_net = nx.from_numpy_array(weighted)
    layout = nx.spring_layout(value_net, iterations=200)
    plt.figure(figsize=(16, 16))
    nx.draw(value_net, layout, node_color=[np.sum(v) for v in values], node_size=150)
    plt.savefig('figs/value_graphs/value_graph-{}-{}.png'.format(n_states, n_actions))
    plt.close()
def graph_PG():
    """Animate a policy-gradient trajectory on the deterministic-policy graph.

    Frames go to figs/pg_graphs/<i>.png; stitch into a video with e.g.:
    ffmpeg -framerate 10 -start_number 0 -i %d.png -c:v libx264 -r 30 -pix_fmt yuv420p out.mp4
    """
    n_states, n_actions = 6, 4

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    print('n pis: {}'.format(len(det_pis)))
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)

    # Fixed graph layout reused across frames.
    adjacency = graph.mdp_topology(det_pis)
    policy_graph = nx.from_numpy_array(adjacency)
    layout = nx.spring_layout(policy_graph, iterations=200)
    basis = graph.construct_mdp_basis(det_pis, mdp)

    init_logits = np.random.standard_normal((n_states, n_actions))
    init_v = utils.value_functional(
        mdp.P, mdp.r, utils.softmax(init_logits), mdp.discount).squeeze()
    a = graph.sparse_coeffs(basis, init_v, lr=0.1)

    print('\nSolving PG')
    pis = utils.solve(search_spaces.policy_gradient_iteration_logits(mdp, 0.1), init_logits)
    print("\n{} policies to vis".format(len(pis)))
    pis = pis[0:20]  # only visualise the first 20 PG steps

    for i, pi in enumerate(pis[:-1]):
        print('Iteration: {}'.format(i))
        v = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()
        # Warm-start each fit from the previous frame's coefficients.
        a = graph.sparse_coeffs(basis, v, lr=0.1, a_init=a)
        plt.figure()
        nx.draw(policy_graph, layout, node_color=a)
        plt.savefig('figs/pg_graphs/{}.png'.format(i))
        plt.close()
def test_topology():
    """Smoke test: build the policy adjacency matrix and display its graph."""
    n_states, n_actions = 5, 2
    policies = utils.get_deterministic_policies(n_states, n_actions)
    adjacency = graph.mdp_topology(policies)
    print(adjacency)
    nx.draw(nx.from_numpy_array(adjacency))
    plt.show()
def test_estimation():
    """Smoke test for graph.estimate_coeffs on a random target vector.

    Bug fix: the original read mdp.S / mdp.A on the line *before* `mdp` was
    created, which raises NameError. Build the MDP first, then derive the
    deterministic policies from it.
    """
    n_states = 5
    n_actions = 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)
    det_pis = utils.get_deterministic_policies(mdp.S, mdp.A)
    basis = graph.construct_mdp_basis(det_pis, mdp)
    v = np.random.random((n_states, ))  # arbitrary target vector to project
    a = graph.estimate_coeffs(basis.T, v)
    print(a)
def generate_model_iteration():
    """Visualise model iteration on a 2-state MDP's value polytope.

    Plots the true polytope (blue), the value of the true pi* under each
    intermediate learned model (spring colormap trajectory), and the final
    learned model's polytope (red); saves to figs/model_iteration_1.png.
    Finally checks whether planning in the learned model recovers the true
    optimal policy.
    """
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = utils.gen_grid_policies(7)

    # Flat model parameters: S*S*A transition logits + S*A rewards.
    # needs its own init. alternatively could find init that matches value
    # of other inits?!?
    init = rnd.standard_normal((mdp.S * mdp.S * mdp.A + mdp.S * mdp.A))

    # True polytope as blue background.
    vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)
    plt.figure(figsize=(16, 16))
    plt.scatter(vs[:, 0], vs[:, 1], c='b', s=10, alpha=0.75)

    lr = 0.01
    # Optimal policy of the *true* MDP; used to evaluate each learned model.
    pi_star = utils.solve(
        policy_iteration(mdp),
        utils.softmax(rnd.standard_normal((mdp.S, mdp.A))))[-1]

    # adversarial pis: the model is fit against every deterministic policy.
    apis = utils.get_deterministic_policies(mdp.S, mdp.A)
    apis = np.stack(apis)

    update_fn = model_iteration(mdp, lr, apis)
    params = utils.solve(update_fn, init)
    params = [parse_model_params(mdp.S, mdp.A, p) for p in params]

    # Value of pi_star under every intermediate learned model.
    vs = np.vstack([
        utils.value_functional(utils.softmax(p_logits), r, pi_star, mdp.discount).T
        for p_logits, r in params
    ])
    n = vs.shape[0]
    plt.scatter(vs[0, 0], vs[0, 1], c='g', label='PG')  # trajectory start
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n - 2), cmap='spring', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='g', marker='x')  # trajectory end

    # Polytope of the final learned model, overlaid in red.
    p_logits, r = params[-1]
    vs = utils.polytope(utils.softmax(p_logits), r, mdp.discount, pis)
    plt.scatter(vs[:, 0], vs[:, 1], c='r', s=10, alpha=0.75)

    plt.title('Model iteration')
    plt.xlabel('Value of state 1')
    plt.ylabel('Value of state 2')
    # plt.show()
    plt.savefig('figs/model_iteration_1.png')

    # Does planning in the learned model recover the true optimal policy?
    learned_mdp = utils.MDP(mdp.S, mdp.A, utils.softmax(p_logits), r,
                            mdp.discount, mdp.d0)
    pi_star_approx = utils.solve(
        policy_iteration(learned_mdp),
        utils.softmax(rnd.standard_normal((mdp.S, mdp.A))))[-1]
    print(pi_star_approx, '\n', pi_star)
def test_sparse_estimation():
    """Smoke test: fit sparse coefficients to a known policy's exact value."""
    n_states, n_actions = 5, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)
    policies = utils.get_deterministic_policies(mdp.S, mdp.A)
    basis = graph.construct_mdp_basis(policies, mdp)
    # Target is the exact value of one deterministic policy from the basis.
    target_v = utils.value_functional(
        mdp.P, mdp.r, policies[2], mdp.discount).squeeze()
    coeffs = graph.sparse_coeffs(basis, target_v)
    print(coeffs)
def estimation_err():
    """
    Compare using all deterministic policies versus fewer mixed policies.
    Starts to get interesting in higher dims?

    Measures the max abs error between the loss gradient computed from all
    deterministic policies and from k randomly sampled deterministic
    policies, for increasing k, then plots error against k.

    Fixes: the sample-count range was written out twice (loop and plot
    x-axis) and could drift apart -- hoisted into `ks`; removed the unused
    local `lr`.
    """
    n_states = 4
    n_actions = 2
    discount = 0.5

    dpis = utils.get_deterministic_policies(n_states, n_actions)
    # Flat model parameters: S*S*A transition logits + S*A rewards.
    params = rnd.standard_normal(
        (n_states * n_states * n_actions + n_states * n_actions))

    def value(P, r, pis):
        # jax doesnt seem to like me changing the batch size to a vmap?!?
        return np.array([
            utils.value_functional(P, r, pi, discount) for pi in pis
        ])

    def loss_fn(params, pis):
        p_logits, r = parse_model_params(n_states, n_actions, params)
        return np.sum(value(utils.softmax(p_logits), r, pis)**2)

    dVdp = jit(lambda *x: np.array(grad(loss_fn, 0)(*x)))
    det_dVdp = dVdp(params, dpis)  # reference gradient: all det policies

    # Sample counts to test -- shared by the loop and the plot's x-axis.
    ks = range(n_states, n_actions**n_states + 1, n_states // 2)
    k_estim_err = []
    for k in ks:
        print('\n{} det policies. Testing with {}\n'.format(
            n_actions**n_states, k))
        diffs = []
        for _ in range(6):  # average over 6 random draws per k
            rnd_pis = np.stack([
                utils.random_det_policy(n_states, n_actions) for _ in range(k)
            ])
            diffs.append(np.max(np.abs(det_dVdp - dVdp(params, rnd_pis))))
        # plain numpy mean over a python list of scalars
        k_estim_err.append(numpy.mean(diffs))

    plt.plot(ks, k_estim_err)
    plt.xlabel('Number of randomly sampled policies')
    plt.ylabel('Max error in gradient estimation')
    plt.show()
def test_everything():
    """End-to-end smoke test: topology + basis + sparse coeffs + drawing."""
    n_states, n_actions = 5, 2
    policies = utils.get_deterministic_policies(n_states, n_actions)
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)

    adjacency = graph.mdp_topology(policies)
    basis = graph.construct_mdp_basis(policies, mdp)

    # Target: the exact value function of one deterministic policy.
    v = utils.value_functional(mdp.P, mdp.r, policies[2], mdp.discount).squeeze()
    coeffs = graph.sparse_coeffs(basis, v)

    g = nx.from_numpy_array(adjacency)
    layout = nx.spring_layout(g, iterations=200)
    nx.draw(g, layout, node_color=coeffs)
    plt.show()
def value_graph_laplacian():
    """Plot the value-weighted policy graph, then probe how additive noise on
    the values perturbs the graph-Laplacian eigenvalue spectrum.

    Saves the graph plus one spectrum bar chart per noise level to
    figs/value_graphs/.
    """
    n_states, n_actions = 8, 2
    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    n = len(det_pis)
    print('n pis: {}'.format(n))

    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    values = [utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()
              for pi in det_pis]
    Vs = np.stack(values).reshape((n, n_states))
    A = graph.mdp_topology(det_pis)

    def inverse_gap_weights(vals):
        # Inverse summed pairwise value differences (+eps keeps zeros finite).
        return 1 / (np.abs(np.sum(vals[None, :, :] - vals[:, None, :], axis=-1)) + 1e-8)

    G = nx.from_numpy_array(A * inverse_gap_weights(Vs))
    pos = nx.spring_layout(G, iterations=200)
    plt.figure(figsize=(16, 16))
    nx.draw(G, pos, node_color=[np.sum(v) for v in values], node_size=150)
    plt.savefig('figs/value_graphs/value_graph-{}-{}.png'.format(n_states, n_actions))
    plt.close()

    # how can you calulate expected eignenvalues!?
    # observation. the underlying complexity of the value topology is linear!?!?
    # how hard is it to estimate the main eigen vec from noisy observations!?
    # that would tell us the complexity!?!?
    for i, alpha in enumerate(np.linspace(0, 1, 10)):
        spectra = []
        for _ in range(50):  # 50 noisy resamples per noise level
            noisy = Vs + alpha * np.random.standard_normal(Vs.shape)
            u, v = graph_laplacian_spectra(A * inverse_gap_weights(noisy))
            spectra.append(u)
        spectra = np.stack(spectra, axis=0)
        mean = np.mean(spectra, axis=0)
        var = np.var(spectra, axis=0)
        plt.bar(range(len(mean)), mean, yerr=np.sqrt(var))
        plt.savefig('figs/value_graphs/{}-lap.png'.format(i))
        plt.close()
def plot():
    """Plot the value polytope of a random 2-state, 2-action MDP.

    Blue: values of a dense grid of stochastic policies.
    Red: values of the deterministic policies.

    Fix: the deterministic policies were requested with hard-coded (2, 2)
    instead of n_states/n_actions -- identical today, but the sizes could
    silently drift apart from the MDP built above.
    """
    n_states = 2
    n_actions = 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    value = vmap(
        lambda pi: utils.value_functional(mdp.P, mdp.r, pi, mdp.discount))

    pis = np.stack(utils.gen_grid_policies(101), axis=0)
    vs = value(pis)
    plt.scatter(vs[:, 0], vs[:, 1], s=10)

    pis = np.stack(utils.get_deterministic_policies(n_states, n_actions), axis=0)
    vs = value(pis)
    plt.scatter(vs[:, 0], vs[:, 1], s=10, c='r')

    plt.xlabel('The value of state 1')
    plt.ylabel('The value of state 2')
    plt.title('The value polytope')
    plt.show()
def value_graph_laplacians():
    """For each sampled MDP, draw its value-weighted policy graph, the
    graph-Laplacian eigenvalue spectrum, and a grid of spectral components.

    Outputs under figs/value_graphs/:
      {i}-value_graph-{S}-{A}.png, {i}-lap.png, {i}-spectra-{S}-{A}.png

    Fix: the final savefig format string had one placeholder but was given
    three arguments (str.format silently drops extras), so the state/action
    counts never appeared in the filename as they do for the sibling plot.
    """
    n_states = 8
    n_actions = 2
    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    N = len(det_pis)
    print('n pis: {}'.format(N))

    for i in range(1):  # raise the range bound to sample more MDPs
        mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
        values = [utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()
                  for pi in det_pis]
        Vs = np.stack(values).reshape((N, n_states))
        A = graph.mdp_topology(det_pis)

        # Similarity weights: exp of negative max-norm value difference.
        W = np.exp(-np.linalg.norm(Vs[None, :, :] - Vs[:, None, :],
                                   ord=np.inf, axis=-1) + 1e-8)
        adj = W * A

        G = nx.from_numpy_array(adj)
        pos = nx.spectral_layout(G)
        plt.figure(figsize=(16, 16))
        nx.draw(G, pos, node_color=[np.sum(v) for v in values], node_size=150)
        plt.savefig('figs/value_graphs/{}-value_graph-{}-{}.png'.format(
            i, n_states, n_actions))
        plt.close()

        # Eigenvalue spectrum of the weighted graph Laplacian.
        u, v = graph_laplacian_spectra(adj)
        plt.figure(figsize=(8, 8))
        plt.bar(range(len(u)), u)
        plt.savefig('figs/value_graphs/{}-lap.png'.format(i))
        plt.close()

        # Grid of every 10th spectral component scaled by its eigenvalue.
        plt.figure(figsize=(16, 16))
        n = 5
        for j in range(n * n):
            plt.subplot(n, n, j + 1)
            nx.draw(G, pos, node_color=u[10 * j] * v[10 * j], node_size=150)
        plt.savefig('figs/value_graphs/{}-spectra-{}-{}.png'.format(
            i, n_states, n_actions))
        plt.close()
def find_symmetric_mdp(n_states, n_actions, discount, lr=1e-2):
    """Approximately find a mdp with ??? symmetry.

    Gradient-descends (with momentum 0.9) on flat model parameters so that
    every deterministic policy has the same value as its np.flip(pis, 1)
    twin (axis 1 of the stacked policies -- looks like the state axis;
    confirm the intended symmetry).

    Args:
        n_states: number of states S.
        n_actions: number of actions A.
        discount: discount factor for the value functional.
        lr: gradient-descent step size.

    Returns:
        utils.MDP built from the optimised parameters.
    """
    model_init = rnd.standard_normal(n_states * n_states * n_actions
                                     + n_states * n_actions)
    pis = utils.get_deterministic_policies(n_states, n_actions)
    # pis = [utils.random_policy(n_states, n_actions) for _ in range(100)]
    pis = np.stack(pis)

    # Batched value functional: value of each policy under one model.
    V = vmap(lambda P, r, pi: utils.value_functional(P, r, pi, discount),
             in_axes=(None, None, 0))

    def loss_fn(model_params):
        # policy symmetry: penalise value gaps between each policy and its
        # flipped counterpart.
        P, r = ss.parse_model_params(n_states, n_actions, model_params)
        return np.sum(np.square(
            V(utils.softmax(P), r, pis)
            - V(utils.softmax(P), r, np.flip(pis, 1))))

    dldp = grad(loss_fn)
    update_fn = lambda model: model - lr * dldp(model)
    init = (model_init, np.zeros_like(model_init))
    model_params, momentum_var = utils.solve(
        ss.momentum_bundler(update_fn, 0.9), init)[-1]

    P, r = ss.parse_model_params(n_states, n_actions, model_params)
    d0 = rnd.random((n_states, 1))
    # Fix: normalise the transition logits before building the MDP -- the
    # loss above evaluates softmax(P), and generate_model_iteration builds
    # its learned MDP the same way; raw logits are not probabilities.
    return utils.MDP(n_states, n_actions, utils.softmax(P), r, discount, d0)