def graph_PG():
    # To stitch the saved frames into a video:
    # ffmpeg -framerate 10 -start_number 0 -i %d.png -c:v libx264 -r 30 -pix_fmt yuv420p out.mp4
    """Visualise a policy-gradient trajectory on the deterministic-policy graph.

    Builds a random 6-state/4-action MDP, runs policy-gradient iteration from a
    random logit init, and for each of the first iterates saves a figure that
    colours the policy-graph nodes by the sparse coefficients of the iterate's
    value function in the deterministic-policy basis.

    Side effects: prints progress and writes PNGs under figs/pg_graphs/.
    """
    n_states = 6
    n_actions = 4

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    print('n pis: {}'.format(len(det_pis)))
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)

    # Topology of the deterministic policies, laid out once for all frames.
    A = graph.mdp_topology(det_pis)
    G = nx.from_numpy_array(A)
    pos = nx.spring_layout(G, iterations=200)
    basis = graph.construct_mdp_basis(det_pis, mdp)

    init_logits = np.random.standard_normal((n_states, n_actions))
    init_v = utils.value_functional(
        mdp.P, mdp.r, utils.softmax(init_logits), mdp.discount).squeeze()
    # Warm-start coefficients; each frame below reuses the previous solution.
    a = graph.sparse_coeffs(basis, init_v, lr=0.1)

    print('\nSolving PG')
    pis = utils.solve(
        search_spaces.policy_gradient_iteration_logits(mdp, 0.1), init_logits)
    print("\n{} policies to vis".format(len(pis)))
    n = len(pis)
    # pis = pis[::n//100]
    pis = pis[0:20]

    for i, pi in enumerate(pis[:-1]):
        print('Iteration: {}'.format(i))
        v = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()
        a = graph.sparse_coeffs(basis, v, lr=0.1, a_init=a)
        plt.figure()
        nx.draw(G, pos, node_color=a)
        # plt.show()
        plt.savefig('figs/pg_graphs/{}.png'.format(i))
        plt.close()
def generate_pg(mdp, c, lr=0.01):
    """Run policy gradient from a random init and scatter the value trajectory.

    The trajectory is projected onto the values of the first two states:
    the start point is drawn in colour `c` (labelled with the trajectory
    length), intermediate iterates on a viridis colour ramp, and the final
    iterate as a magenta cross.

    Args:
        mdp: MDP namedtuple-like object with P, r, discount, S, A attributes.
        c: matplotlib colour for the initial point.
        lr: policy-gradient learning rate.
    """
    start_pi = utils.random_policy(mdp.S, mdp.A)
    start_logit = np.log(start_pi)
    logits = utils.solve(
        ss.policy_gradient_iteration_logits(mdp, lr), start_logit)

    # Value function of every iterate, squeezed to (n_iters, n_states).
    vs = np.stack([
        utils.value_functional(mdp.P, mdp.r, utils.softmax(logit), mdp.discount)
        for logit in logits
    ])[:, :, 0]

    n = vs.shape[0]
    plt.scatter(vs[0, 0], vs[0, 1], c=c, s=30, label='{}'.format(n))
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n-2), cmap='viridis', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='m', marker='x')
def policy_gradient(mdp, pis):
    """Run policy-gradient iteration to convergence from each start policy.

    NOTE(review): a later `policy_gradient(args)` definition in this file
    rebinds this name at import time — confirm which version callers expect.

    Args:
        mdp: MDP object with P, r, discount attributes.
        pis: iterable of start policies (stochastic matrices).

    Returns:
        (lens, pi_stars): trajectory length and converged logits per start.
    """
    lens, pi_stars = [], []
    for pi in pis:
        # 1e-8 keeps log() finite for zero-probability actions.
        traj = utils.solve(
            ss.policy_gradient_iteration_logits(mdp, 0.01),
            np.log(pi + 1e-8))
        pi_stars.append(traj[-1])
        lens.append(len(traj))
    return lens, pi_stars
def policy_gradient(args):
    """Tuple-argument variant of policy gradient (e.g. for pool.map workers).

    NOTE(review): this rebinds the `policy_gradient(mdp, pis)` name defined
    earlier in the file — the tuple version is the one visible after import.

    Args:
        args: (P, r, discount, d0, pis, lr) — raw MDP components, the start
            policies, and the learning rate, packed into one tuple.

    Returns:
        (lens, pi_stars): trajectory length and converged logits per start.
    """
    P, r, discount, d0, pis, lr = args
    mdp = utils.MDP(r.shape[0], r.shape[1], P, r, discount, d0)

    lens, pi_stars = [], []
    for pi in pis:
        # 1e-8 keeps log() finite for zero-probability actions.
        traj = utils.solve(
            ss.policy_gradient_iteration_logits(mdp, lr),
            np.log(pi + 1e-8))
        pi_stars.append(traj[-1])
        lens.append(len(traj))
    return lens, pi_stars
def mom_policy_gradient(mdp, pis):
    """Run momentum policy gradient (beta=0.9) from each start policy.

    Each solve carries (logits, momentum-buffer) state; the buffer starts at
    zero. If the solver raises ValueError — presumably on divergence/NaNs,
    TODO confirm against utils.solve — the first start policy is recorded
    with a sentinel length of 10000.

    Args:
        mdp: MDP object with P, r, discount attributes.
        pis: iterable of start policies (stochastic matrices).

    Returns:
        (lens, pi_stars): trajectory length and final logits per start.
    """
    lens, pi_stars = [], []
    for pi in pis:
        try:
            traj = utils.solve(
                ss.momentum_bundler(
                    ss.policy_gradient_iteration_logits(mdp, 0.01), 0.9),
                (np.log(pi + 1e-8), np.zeros_like(pi)))
            final, _ = traj[-1]
            steps = len(traj)
        except ValueError:
            final = pis[0]
            steps = 10000
        pi_stars.append(final)
        lens.append(steps)
    return lens, pi_stars