Example #1
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

import graph
import search_spaces
import utils

def graph_PG():
    # Stitch the saved frames into a video with:
    # ffmpeg -framerate 10 -start_number 0 -i %d.png -c:v libx264 -r 30 -pix_fmt yuv420p out.mp4
    n_states = 6
    n_actions = 4

    # Enumerate all deterministic policies and build a random MDP.
    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    print('n pis: {}'.format(len(det_pis)))
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)

    # Lay the policy graph out once so every frame shares node positions.
    A = graph.mdp_topology(det_pis)
    G = nx.from_numpy_array(A)
    pos = nx.spring_layout(G, iterations=200)

    # One basis element (value function) per deterministic policy.
    basis = graph.construct_mdp_basis(det_pis, mdp)

    # Random initial logits; their value function seeds the sparse coefficients.
    init_logits = np.random.standard_normal((n_states, n_actions))
    init_v = utils.value_functional(mdp.P, mdp.r, utils.softmax(init_logits), mdp.discount).squeeze()
    a = graph.sparse_coeffs(basis, init_v, lr=0.1)

    print('\nSolving PG')
    pis = utils.solve(search_spaces.policy_gradient_iteration_logits(mdp, 0.1), init_logits)
    print("\n{} policies to vis".format(len(pis)))
    n = len(pis)
    # pis = pis[::n//100]  # alternative: subsample ~100 evenly spaced iterates
    pis = pis[0:20]  # visualise only the first 20 iterates

    for i, pi in enumerate(pis[:-1]):
        print('Iteration: {}'.format(i))
        # The solver iterates are logits (cf. init_v above), so softmax before evaluating.
        v = utils.value_functional(mdp.P, mdp.r, utils.softmax(pi), mdp.discount).squeeze()
        # Warm-start the coefficients from the previous frame.
        a = graph.sparse_coeffs(basis, v, lr=0.1, a_init=a)
        plt.figure()
        nx.draw(G, pos, node_color=a)
        # plt.show()
        plt.savefig('figs/pg_graphs/{}.png'.format(i))  # figs/pg_graphs/ must exist
        plt.close()
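
Every example on this page evaluates policies through utils.value_functional(mdp.P, mdp.r, pi, mdp.discount). That call is standard closed-form policy evaluation, V = (I - discount * P_pi)^{-1} r_pi; below is a minimal sketch of such a helper, assuming P has shape [S, A, S] and r has shape [S, A] (the repo's actual layout may differ, hence the .squeeze() calls above).

import numpy as np

def value_functional_sketch(P, r, pi, discount):
    # Closed-form policy evaluation: V = (I - discount * P_pi)^{-1} r_pi.
    # Assumed shapes (illustrative only): P [S, A, S], r [S, A], pi [S, A].
    P_pi = np.einsum('sat,sa->st', P, pi)  # state-to-state transitions under pi
    r_pi = np.einsum('sa,sa->s', r, pi)    # expected one-step reward per state
    S = P.shape[0]
    return np.linalg.solve(np.eye(S) - discount * P_pi, r_pi)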
Example #2
import matplotlib.pyplot as plt
import numpy as np

import search_spaces as ss
import utils

def generate_pg(mdp, c, lr=0.01):
    # Run policy gradient from a random initial policy and scatter the
    # trajectory of value functions (one axis per state value).
    init_pi = utils.random_policy(mdp.S, mdp.A)
    init_logit = np.log(init_pi + 1e-8)  # guard against log(0), as in the examples below
    logits = utils.solve(ss.policy_gradient_iteration_logits(mdp, lr), init_logit)
    vs = np.stack([utils.value_functional(mdp.P, mdp.r, utils.softmax(logit), mdp.discount)
                   for logit in logits])[:, :, 0]
    n = vs.shape[0]
    plt.scatter(vs[0, 0], vs[0, 1], c=c, s=30, label='{}'.format(n))  # start, labelled with trajectory length
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n-2), cmap='viridis', s=10)  # coloured by iteration
    plt.scatter(vs[-1, 0], vs[-1, 1], c='m', marker='x')  # final iterate
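
A hypothetical driver for generate_pg, borrowing build_random_mdp from Example #1; the two scatter axes are the first two state values, so a 2-state MDP keeps the picture faithful.

import matplotlib.pyplot as plt
import utils

mdp = utils.build_random_mdp(2, 2, 0.9)  # 2 states: V lives in the plane
for c in ['r', 'g', 'b']:                # one colour per random restart
    generate_pg(mdp, c, lr=0.01)
plt.xlabel('V(s=0)')
plt.ylabel('V(s=1)')
plt.legend(title='trajectory length')
plt.show()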
Example #3
import numpy as np

import search_spaces as ss
import utils

def policy_gradient(mdp, pis):
    # For each initial policy, run policy gradient to convergence and record
    # the final policy and the number of iterations taken.
    lens, pi_stars = [], []

    for pi in pis:
        pi_traj = utils.solve(ss.policy_gradient_iteration_logits(mdp, 0.01),
                              np.log(pi + 1e-8))  # 1e-8 guards log(0) for deterministic pis
        pi_star = pi_traj[-1]

        pi_stars.append(pi_star)
        lens.append(len(pi_traj))

    return lens, pi_stars
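
Hypothetical usage, reusing the helpers from Example #1: start a run from every deterministic policy of a small random MDP and look at how long each takes.

import matplotlib.pyplot as plt
import utils

mdp = utils.build_random_mdp(2, 2, 0.9)
pis = utils.get_deterministic_policies(2, 2)  # one start per deterministic policy
lens, pi_stars = policy_gradient(mdp, pis)
plt.hist(lens, bins=20)
plt.xlabel('iterations to converge')
plt.show()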
Example #4
import numpy as np

import search_spaces as ss
import utils

def policy_gradient(args):
    # Single-tuple signature: convenient for a parallel map (e.g. multiprocessing.Pool.map).
    P, r, discount, d0, pis, lr = args

    # Rebuild the MDP inside the worker; r has shape [n_states, n_actions].
    mdp = utils.MDP(r.shape[0], r.shape[1], P, r, discount, d0)
    lens, pi_stars = [], []

    for pi in pis:
        pi_traj = utils.solve(ss.policy_gradient_iteration_logits(mdp, lr),
                              np.log(pi + 1e-8))
        pi_star = pi_traj[-1]

        pi_stars.append(pi_star)
        lens.append(len(pi_traj))

    return lens, pi_stars
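
The single-args signature suggests this variant is meant for a parallel map. A hypothetical driver with multiprocessing, again borrowing the Example #1 helpers and assuming build_random_mdp populates a d0 field (as the utils.MDP constructor above implies):

from multiprocessing import Pool

import numpy as np
import utils

mdp = utils.build_random_mdp(2, 2, 0.9)
pis = utils.get_deterministic_policies(2, 2)
lrs = [0.1, 0.01, 0.001]
jobs = [(mdp.P, mdp.r, mdp.discount, mdp.d0, pis, lr) for lr in lrs]
with Pool(len(jobs)) as pool:  # one worker per learning rate
    results = pool.map(policy_gradient, jobs)
for lr, (lens, _) in zip(lrs, results):
    print('lr={}: mean iterations {}'.format(lr, np.mean(lens)))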
Example #5
import numpy as np

import search_spaces as ss
import utils

def mom_policy_gradient(mdp, pis):
    # Policy gradient with heavy-ball momentum (decay 0.9) on the logits; the
    # solver state is a (logits, momentum buffer) pair.
    lens, pi_stars = [], []

    for pi in pis:
        try:
            pi_traj = utils.solve(
                ss.momentum_bundler(
                    ss.policy_gradient_iteration_logits(mdp, 0.01), 0.9),
                (np.log(pi + 1e-8), np.zeros_like(pi)))
            pi_star, _ = pi_traj[-1]
            L = len(pi_traj)
        except ValueError:
            # Diverged (e.g. NaNs): fall back to a placeholder policy and a
            # sentinel length so failed runs are easy to spot downstream.
            pi_star = pis[0]
            L = 10000

        pi_stars.append(pi_star)
        lens.append(L)

    return lens, pi_stars
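
ss.momentum_bundler is not shown on this page; judging from the (logits, zeros) initial state above, it lifts a plain update into one on (iterate, velocity) pairs. A guess at that contract, illustrative only:

import numpy as np

def momentum_bundler_sketch(update_fn, decay):
    # Illustrative guess: wrap an update x -> x' with heavy-ball momentum,
    # matching the (np.log(pi + 1e-8), np.zeros_like(pi)) initial state above.
    def bundled(state):
        x, w = state
        dx = update_fn(x) - x  # the wrapped iteration's step
        w = decay * w + dx     # accumulate velocity
        return x + w, w
    return bundled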