Example #1
def generate_polytope_densities():
    n_states, n_actions = 2, 2
    pis = utils.gen_grid_policies(41)

    nx = 4
    ny = 5
    plt.figure(figsize=(16, 16))

    for i in range(nx * ny):
        print(i)
        mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
        Vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)
        # just set all to be the same probability
        p_pi = 0.1
        pVs = [
            density_value_functional(p_pi, mdp.P, mdp.r, pi, mdp.discount)
            for pi in pis
        ]

        plt.subplot(nx, ny, i + 1)

        fig = plt.scatter(Vs[:, 0], Vs[:, 1], c=pVs)
        # plt.colorbar()
        fig.axes.get_xaxis().set_visible(False)
        fig.axes.get_yaxis().set_visible(False)

    plt.tight_layout()
    plt.show()
Example #2
    def random_test():
        """
        Explore how the unconstrained dynamics behave in a random setting.
        """
        n_states, n_actions = 3, 2
        mdp = utils.build_random_mdp(n_states, n_actions, 0.9)
        P = mdp.P
        r = mdp.r

        # a distribution over future states
        assert np.isclose(np.sum(P, axis=0), np.ones(
            (n_states, n_actions))).all()

        p, q = mdp_encoder(P, r)

        # print('P', P)
        # print('r', r)
        # print('q', q)
        # print('p', p)

        # r(s, a) = q(s) - KL(P(. | s, a) || p(. | s))
        # TODO: how to do this with matrices?
        # kl = - (np.einsum('ijk,ij->jk', P, np.log(p)) - np.einsum('ijk,ijk->jk', P, np.log(P)))
        ce = np.zeros((n_states, n_actions))
        for j in range(n_states):
            for k in range(n_actions):  # actions
                ce[j, k] = CE(P[:, j, k], p[:, j])

        r_approx = q[:, np.newaxis] + ce

        print('r', np.around(r, 3), r.shape)
        print('r_approx', np.around(r_approx, 3), r_approx.shape)
        print('r ~= q - CE(P || p): {}'.format(
            np.isclose(r, r_approx, atol=1e-3).all()))
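The TODO above about doing this with matrices is essentially answered by the commented-out einsum. A minimal vectorised sketch, assuming P has shape (n_states, n_states, n_actions) with the next state on the first axis (as the assert implies) and p has shape (n_states, n_states); the sign convention of the repo's CE helper is not shown, so treat this as an illustration rather than a drop-in replacement:

import numpy as np

def kl_matrix(P, p):
    # KL(P(. | s, a) || p(. | s)) for every (s, a), vectorised with einsum.
    # P: (n_next_states, n_states, n_actions), p: (n_next_states, n_states).
    return (np.einsum('ijk,ijk->jk', P, np.log(P))
            - np.einsum('ijk,ij->jk', P, np.log(p)))

# r_approx = q[:, np.newaxis] - kl_matrix(P, p)  # i.e. r(s, a) = q(s) - KL(P || p)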
Example #3
def graph_PG():
    # ffmpeg -framerate 10 -start_number 0 -i %d.png -c:v libx264 -r 30 -pix_fmt yuv420p out.mp4
    n_states = 6
    n_actions = 4

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    print('n pis: {}'.format(len(det_pis)))
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)

    A = graph.mdp_topology(det_pis)
    G = nx.from_numpy_array(A)
    pos = nx.spring_layout(G, iterations=200)

    basis = graph.construct_mdp_basis(det_pis, mdp)

    init_logits = np.random.standard_normal((n_states, n_actions))
    init_v = utils.value_functional(mdp.P, mdp.r, utils.softmax(init_logits), mdp.discount).squeeze()
    a = graph.sparse_coeffs(basis, init_v, lr=0.1)

    print('\nSolving PG')
    pis = utils.solve(search_spaces.policy_gradient_iteration_logits(mdp, 0.1), init_logits)
    print("\n{} policies to vis".format(len(pis)))
    n = len(pis)
    # pis = pis[::n//100]
    pis = pis[0:20]

    for i, pi in enumerate(pis[:-1]):
        print('Iteration: {}'.format(i))
        v = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()
        a = graph.sparse_coeffs(basis, v, lr=0.1, a_init=a)
        plt.figure()
        nx.draw(G, pos, node_color=a)
        # plt.show()
        plt.savefig('figs/pg_graphs/{}.png'.format(i))
        plt.close()
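graph.mdp_topology comes from the act65/mdps repo and is not reproduced here. A minimal sketch of the behaviour its usage suggests (an assumption: two deterministic policies are adjacent when they choose a different action in exactly one state):

import numpy as np

def mdp_topology_sketch(det_pis):
    # det_pis: list of one-hot (n_states, n_actions) policies.
    # Connect two policies iff their chosen actions differ in exactly one state.
    actions = np.stack([np.argmax(pi, axis=1) for pi in det_pis])
    n_diff = np.sum(actions[:, None, :] != actions[None, :, :], axis=-1)
    return (n_diff == 1).astype(float)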
Example #4
def value_graph():

    # vs = [np.sum(utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()**2) for pi in det_pis]
    # plt.figure(figsize=(16,16))
    # nx.draw(G, pos, node_color=vs, node_size=150)
    # plt.savefig('figs/pi_graphs/val.png')
    # plt.close()

    n_states = 10
    n_actions = 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    n = len(det_pis)
    print('n pis: {}'.format(n))
    # how does discount affect these?
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

    values = [utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze() for pi in det_pis]
    vs = np.stack(values).reshape((n, n_states))
    W = 1/(np.abs(np.sum(vs[None, :, :] - vs[:, None, :], axis=-1)) + 1e-8)
    A = graph.mdp_topology(det_pis)
    adj = A*W
    G = nx.from_numpy_array(adj)
    pos = nx.spring_layout(G, iterations=200)

    plt.figure(figsize=(16,16))
    nx.draw(G, pos, node_color=[np.sum(v) for v in values], node_size=150)
    plt.savefig('figs/value_graphs/value_graph-{}-{}.png'.format(n_states, n_actions))
    plt.close()
Example #5
def generate_model_cs():
    """
    Compare using all deterministic policies versus fewer mixed policies.
    Starts to get interesting in higher dims?



    """
    n_states = 32
    n_actions = 2
    lr = 0.01
    k = 64

    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    init = rnd.standard_normal((mdp.S * mdp.S * mdp.A + mdp.S * mdp.A))

    pi_star = utils.solve(policy_iteration(mdp),
                          utils.softmax(rnd.standard_normal(
                              (mdp.S, mdp.A))))[-1]
    print('pi_star\n', pi_star)

    # adversarial pis
    # apis = utils.get_deterministic_policies(mdp.S, mdp.A)
    apis = np.stack([utils.random_det_policy(mdp.S, mdp.A) for _ in range(k)])

    update_fn = model_iteration(mdp, lr, apis)
    params = utils.solve(update_fn, init)
    p_logits, r = parse_model_params(mdp.S, mdp.A, params[-1])
    error = np.mean(
        (utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount) -
         utils.value_functional(utils.softmax(p_logits), r, pi_star,
                                mdp.discount))**2)
    print('\n', error)
    new_mdp = utils.MDP(mdp.S, mdp.A, utils.softmax(p_logits), r, mdp.discount,
                        mdp.d0)
    pi_star = utils.solve(policy_iteration(new_mdp),
                          utils.softmax(rnd.standard_normal(
                              (mdp.S, mdp.A))))[-1]
    print(pi_star)

    apis = np.stack([utils.random_policy(mdp.S, mdp.A) for _ in range(k)])

    update_fn = model_iteration(mdp, lr, apis)
    params = utils.solve(update_fn, init)
    p_logits, r = parse_model_params(mdp.S, mdp.A, params[-1])
    error = np.mean(
        (utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount) -
         utils.value_functional(utils.softmax(p_logits), r, pi_star,
                                mdp.discount))**2)
    print('\n', error)
    new_mdp = utils.MDP(mdp.S, mdp.A, utils.softmax(p_logits), r, mdp.discount,
                        mdp.d0)
    pi_star = utils.solve(policy_iteration(new_mdp),
                          utils.softmax(rnd.standard_normal(
                              (mdp.S, mdp.A))))[-1]
    print(pi_star)
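parse_model_params is another repo helper. Given the flat init of size S*S*A + S*A used above, a plausible sketch (the split and the shapes are assumptions):

import numpy as np

def parse_model_params_sketch(n_states, n_actions, params):
    # Split a flat parameter vector into transition logits and rewards.
    n_p = n_states * n_states * n_actions
    p_logits = np.reshape(params[:n_p], (n_states, n_states, n_actions))
    r = np.reshape(params[n_p:], (n_states, n_actions))
    return p_logits, r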
Example #6
def k_step_option_similarity():
    n_states, n_actions = 6, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

    pi = utils.random_policy(n_states, n_actions)
    P = multi_step_transition_fn(mdp.P, pi, 3)
    # P[:,-1] = P[:,-2]
    # s(o1, o2) = sum_s' P(s' | s1) * log( P(s' | s2)  /  P(s' | s1))
    kl = -np.sum(P[:, :, None] * np.log(P[:, None, :] / P[:, :, None]), axis=0)
    print(kl)
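multi_step_transition_fn is not shown. A sketch consistent with how P is used above (columns index the starting state; the policy is marginalised out and the resulting matrix is chained k times; this is an assumption, not the repo's code):

import numpy as np

def multi_step_transition_fn_sketch(P, pi, k):
    # P[s', s, a] = Pr(s' | s, a); pi[s, a] = Pr(a | s).
    P_pi = np.einsum('ijk,jk->ij', P, pi)    # one-step P_pi[s', s]
    return np.linalg.matrix_power(P_pi, k)   # k-step transition matrix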
Example #7
File: graph_tests.py Project: act65/mdps
def test_estimation():
    n_states = 5
    n_actions = 2

    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)
    det_pis = utils.get_deterministic_policies(mdp.S, mdp.A)
    basis = graph.construct_mdp_basis(det_pis, mdp)

    v = np.random.random((n_states, ))
    a = graph.estimate_coeffs(basis.T, v)
    print(a)
Example #8
def lmdp_field():
    """
    For each policy.
    Calculate its dynamics, P_pi.
    Estimate the value via the LMDP.
    Plot difference under linearTD operator.
    """
    n_states, n_actions = 2, 2
    pis = utils.gen_grid_policies(11)

    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

    p, q = lmdps.mdp_encoder(mdp.P, mdp.r)

    vs = []
    dvs = []
    for pi in pis:
        u = np.einsum('ijk,jk->ij', mdp.P, pi)
        v = lmdps.linear_value_functional(p, q, u, mdp.discount)
        z = np.exp(v)
        Tz = lmdps.linear_bellman_operator(p, q, z, mdp.discount)
        dv = np.log(Tz) - np.log(z)

        vs.append(v)
        dvs.append(dv)

    dvs = np.vstack(dvs)
    vs = np.vstack(vs)

    normed_dvs = utils.normalize(dvs)

    plt.figure(figsize=(16, 16))
    plt.subplot(1, 2, 1)
    plt.title('Linearised Bellman operator')
    plt.quiver(vs[:, 0], vs[:, 1], normed_dvs[:, 0], normed_dvs[:, 1],
               np.linalg.norm(dvs, axis=1))

    # plot bellman
    Vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)
    diff_op = lambda V: utils.bellman_optimality_operator(
        mdp.P, mdp.r, np.expand_dims(V, 1), mdp.discount) - np.expand_dims(
            V, 1)
    dVs = np.stack([np.max(diff_op(V), axis=1) for V in Vs])

    normed_dVs = utils.normalize(dVs)

    plt.subplot(1, 2, 2)
    plt.title('Bellman operator')
    plt.quiver(Vs[:, 0], Vs[:, 1], normed_dVs[:, 0], normed_dVs[:, 1],
               np.linalg.norm(dVs, axis=1))

    # plt.savefig('figs/LBO_BO.png')
    plt.show()
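lmdps.linear_bellman_operator and linear_value_functional are repo functions. For reference, the usual discounted linearised (Todorov-style) backup on z = exp(v) is z(s) = exp(q(s)) * sum_s' p(s' | s) z(s')^gamma. A minimal sketch, assuming p is indexed as p[s', s] and q, z are length-n_states vectors (the repo may handle the discount differently):

import numpy as np

def linear_bellman_backup_sketch(p, q, z, discount):
    # One application of the exponentiated Bellman backup on z = exp(v).
    return np.exp(q) * (p.T @ (z ** discount))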
Example #9
def generate_cvi():
    print('\nRunning complex value iteration')
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)


    fn = ss.complex_value_iteration(mdp, 0.01)

    Q = rnd.standard_normal((n_states, 1)) + 1j*rnd.standard_normal((n_states, 1))

    results = utils.solve(fn, Q)
    print(results)
Example #10
def generate_model_iteration():
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = utils.gen_grid_policies(7)

    init = rnd.standard_normal(
        (mdp.S * mdp.S * mdp.A + mdp.S * mdp.A)
    )  # needs its own init. alternatively, could find an init that matches the value of the other inits?

    vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    plt.figure(figsize=(16, 16))
    plt.scatter(vs[:, 0], vs[:, 1], c='b', s=10, alpha=0.75)

    lr = 0.01
    pi_star = utils.solve(policy_iteration(mdp),
                          utils.softmax(rnd.standard_normal(
                              (mdp.S, mdp.A))))[-1]

    # adversarial pis
    apis = utils.get_deterministic_policies(mdp.S, mdp.A)
    apis = np.stack(apis)

    update_fn = model_iteration(mdp, lr, apis)
    params = utils.solve(update_fn, init)
    params = [parse_model_params(mdp.S, mdp.A, p) for p in params]

    vs = np.vstack([
        utils.value_functional(utils.softmax(p_logits), r, pi_star,
                               mdp.discount).T for p_logits, r in params
    ])

    n = vs.shape[0]
    plt.scatter(vs[0, 0], vs[0, 1], c='g', label='PG')
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n - 2), cmap='spring', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='g', marker='x')

    p_logits, r = params[-1]
    vs = utils.polytope(utils.softmax(p_logits), r, mdp.discount, pis)
    plt.scatter(vs[:, 0], vs[:, 1], c='r', s=10, alpha=0.75)
    plt.title('Model iteration')
    plt.xlabel('Value of state 1')
    plt.ylabel('Value of state 2')

    # plt.show()
    plt.savefig('figs/model_iteration_1.png')

    learned_mdp = utils.MDP(mdp.S, mdp.A, utils.softmax(p_logits), r,
                            mdp.discount, mdp.d0)
    pi_star_approx = utils.solve(
        policy_iteration(learned_mdp),
        utils.softmax(rnd.standard_normal((mdp.S, mdp.A))))[-1]
    print(pi_star_approx, '\n', pi_star)
Example #11
File: graph_tests.py Project: act65/mdps
def test_sparse_estimation():
    n_states = 5
    n_actions = 2

    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)
    det_pis = utils.get_deterministic_policies(mdp.S, mdp.A)
    basis = graph.construct_mdp_basis(det_pis, mdp)

    v = utils.value_functional(mdp.P, mdp.r, det_pis[2],
                               mdp.discount).squeeze()
    a = graph.sparse_coeffs(basis, v)

    print(a)
Example #12
def state_action_vis():
    # want to pick policies that maximise exploration.
    # but how to solve for this analytically? not sure this is going to work...
    # unless there is a way to analytically set pi = 1/visitation?
    # if we iterate (estimate visitation under pi, set pi = 1/visitation),
    # does it converge? where does it converge?
    # it shouldn't converge?!

    mdp = utils.build_random_mdp(12, 2, 0.5)
    pi = utils.random_policy(mdp.S, mdp.A)
    v_sa_sa = state_action_visitation_distribution(mdp, pi)

    # sum over initial conditions to get discounted state-action visitation probability
    d0_sa = np.reshape(np.einsum('jk,jl->jk', pi, mdp.d0), (mdp.S * mdp.A, ))
    ps = np.einsum('ik,k->i', v_sa_sa, d0_sa)

    plt.imshow(v_sa_sa)
    plt.show()
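state_action_visitation_distribution is a repo helper; the imshow and the einsum over d0 above suggest it returns an (S*A, S*A) resolvent of the state-action transition matrix. A sketch under that assumption:

import numpy as np

def state_action_visitation_sketch(P, pi, discount):
    # P[s', s, a] = Pr(s' | s, a); pi[s', a'] = Pr(a' | s').
    # M[(s', a'), (s, a)] = pi(a' | s') * P(s' | s, a).
    S, A = pi.shape
    M = np.einsum('lk,lij->lkij', pi, P).reshape((S * A, S * A))
    return np.linalg.inv(np.eye(S * A) - discount * M)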
Example #13
def emp_est_snr_graph():
    n_states, n_actions = 12, 3
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = [utils.random_policy(n_states, n_actions) for _ in range(100)]

    vs = []
    hs = []
    for i, pi in enumerate(pis):
        print('\r{}'.format(i), end='', flush=True)

        # try:
        vs.append(est_var_R(mdp, pi))
        hs.append(utils.entropy(pi))
        # except ValueError as err:
        #     print(err)

    plt.scatter(hs, vs)
    plt.show()
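utils.entropy is used here as a scalar summary of how stochastic a policy is. A minimal sketch (whether the repo sums or averages over states is an assumption):

import numpy as np

def policy_entropy_sketch(pi):
    # Mean (over states) entropy of the action distribution.
    return float(np.mean(-np.sum(pi * np.log(pi + 1e-12), axis=1)))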
Example #14
File: graph_tests.py Project: act65/mdps
def test_everything():
    n_states = 5
    n_actions = 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)

    A = graph.mdp_topology(det_pis)
    basis = graph.construct_mdp_basis(det_pis, mdp)

    # v = np.random.random((n_states, ))
    v = utils.value_functional(mdp.P, mdp.r, det_pis[2],
                               mdp.discount).squeeze()
    a = graph.sparse_coeffs(basis, v)

    G = nx.from_numpy_array(A)
    pos = nx.spring_layout(G, iterations=200)
    nx.draw(G, pos, node_color=a)
    plt.show()
Example #15
def value_graph_laplacian():
    n_states = 8
    n_actions = 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    n = len(det_pis)
    print('n pis: {}'.format(n))
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

    values = [utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze() for pi in det_pis]
    Vs = np.stack(values).reshape((n, n_states))
    A = graph.mdp_topology(det_pis)

    W = 1/(np.abs(np.sum(Vs[None, :, :] - Vs[:, None, :], axis=-1)) + 1e-8)
    adj = A*W

    G = nx.from_numpy_array(adj)
    pos = nx.spring_layout(G, iterations=200)
    plt.figure(figsize=(16,16))
    nx.draw(G, pos, node_color=[np.sum(v) for v in values], node_size=150)
    plt.savefig('figs/value_graphs/value_graph-{}-{}.png'.format(n_states, n_actions))
    plt.close()

    # how can you calculate the expected eigenvalues?
    # observation: the underlying complexity of the value topology is linear!?
    # how hard is it to estimate the main eigenvector from noisy observations?
    # that would tell us the complexity!?
    for i, alpha in enumerate(np.linspace(0, 1, 10)):
        us = []
        for _ in range(50):
            vs = Vs + alpha*np.random.standard_normal(Vs.shape)
            W = 1/(np.abs(np.sum(vs[None, :, :] - vs[:, None, :], axis=-1)) + 1e-8)
            adj = A*W

            u, v = graph_laplacian_spectra(adj)
            us.append(u)
        us = np.stack(us, axis=0)
        mean = np.mean(us, axis=0)
        var = np.var(us, axis=0)
        plt.bar(range(len(mean)), mean, yerr=np.sqrt(var))
        plt.savefig('figs/value_graphs/{}-lap.png'.format(i))
        plt.close()
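graph_laplacian_spectra is not defined in these snippets. A minimal sketch, assuming the unnormalised Laplacian L = D - W and a symmetric eigendecomposition (np.linalg.eigh returns eigenvalues in ascending order and eigenvectors as columns):

import numpy as np

def graph_laplacian_spectra_sketch(adj):
    L = np.diag(adj.sum(axis=1)) - adj   # unnormalised graph Laplacian
    u, v = np.linalg.eigh(L)             # u: eigenvalues, v: eigenvectors
    return u, v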
Example #16
def compare_mdp_lmdp():
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)
    pis = utils.gen_grid_policies(7)
    vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    plt.figure(figsize=(16, 16))
    plt.scatter(vs[:, 0], vs[:, 1], s=10, alpha=0.75)

    # solve via LMDPs
    p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
    u, v = lmdps.lmdp_solver(p, q, mdp.discount)
    pi_u_star = lmdps.lmdp_decoder(u, mdp.P)

    pi_p = lmdps.lmdp_decoder(p, mdp.P)

    # solve MDP
    init = np.random.standard_normal((n_states, n_actions))
    pi_star = utils.solve(search_spaces.policy_iteration(mdp), init)[-1]
    # pi_star = onehot(np.argmax(qs, axis=1), n_actions)

    # evaluate both policies.
    v_star = utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount)
    v_u_star = utils.value_functional(mdp.P, mdp.r, pi_u_star, mdp.discount)
    v_p = utils.value_functional(mdp.P, mdp.r, pi_p, mdp.discount)

    plt.scatter(v_star[0, 0],
                v_star[1, 0],
                c='m',
                alpha=0.5,
                marker='x',
                label='mdp')
    plt.scatter(v_u_star[0, 0],
                v_u_star[1, 0],
                c='g',
                alpha=0.5,
                marker='x',
                label='lmdp')
    plt.scatter(v_p[0, 0], v_p[1, 0], c='k', marker='x', alpha=0.5, label='p')
    plt.legend()
    plt.show()
Example #17
def emp_est_snr_map():
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = utils.gen_grid_policies(5)
    vals = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    vars = []
    hs = []
    for i, pi in enumerate(pis):
        print('\r{}'.format(i), end='', flush=True)

        vars.append(est_var_R(mdp, pi))
        hs.append(utils.entropy(pi))

    plt.subplot(2, 1, 1)
    plt.scatter(vals[:, 0], vals[:, 1], c=hs)
    plt.subplot(2, 1, 2)
    plt.scatter(vals[:, 0], vals[:, 1], c=vars)
    # plt.subplot(3,1,1)
    # plt.scatter(vals[:, 0], vals[:, 0], c=hs)
    plt.show()
Example #18
def hyperbolic_polytope():
    # https://arxiv.org/abs/1902.06865
    n_states, n_actions = 2, 2
    N = 21
    pis = utils.gen_grid_policies(N)
    mdp = utils.build_random_mdp(n_states, n_actions, None)

    n = 10
    discounts = np.linspace(0.1, 1-1e-4, n)
    Vs = []
    for discount in discounts:
        Vs.append((1-discount)*utils.polytope(mdp.P, mdp.r, discount, pis))

    h_V = sum(Vs)/n

    plt.subplot(2, 1, 1)
    plt.scatter(h_V[:, 0], h_V[:, 1])
    plt.subplot(2, 1, 2)
    V = (1-0.9)*utils.polytope(mdp.P, mdp.r, 0.9, pis)
    plt.scatter(V[:, 0], V[:, 1])
    plt.show()
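The averaging over discounts follows the hyperbolic-discounting paper linked above: since the integral of gamma^(k*t) over gamma in [0, 1] equals 1 / (1 + k*t), an average of exponentially discounted values approximates a hyperbolically discounted one. A quick numerical check of that identity (plain numpy, not repo code):

import numpy as np

k, t = 1.0, 5.0
gammas = np.linspace(0.0, 1.0, 100001)
lhs = np.trapz(gammas ** (k * t), gammas)   # integral of gamma^(k*t)
rhs = 1.0 / (1.0 + k * t)                   # hyperbolic discount at time t
print(lhs, rhs)                             # both ~ 0.1667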
Example #19
File: polytope_vis.py Project: act65/mdps
def plot():
    n_states = 2
    n_actions = 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    value = vmap(
        lambda pi: utils.value_functional(mdp.P, mdp.r, pi, mdp.discount))

    pis = np.stack(utils.gen_grid_policies(101), axis=0)
    vs = value(pis)
    plt.scatter(vs[:, 0], vs[:, 1], s=10)

    pis = np.stack(utils.get_deterministic_policies(2, 2), axis=0)
    vs = value(pis)
    plt.scatter(vs[:, 0], vs[:, 1], s=10, c='r')

    plt.xlabel('The value of state 1')
    plt.ylabel('The value of state 2')

    plt.title('The value polytope')

    plt.show()
Example #20
def value_graph_laplacians():
    n_states = 8
    n_actions = 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    N = len(det_pis)
    print('n pis: {}'.format(N))
    for i in range(1):
        mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

        values = [utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze() for pi in det_pis]
        Vs = np.stack(values).reshape((N, n_states))
        A = graph.mdp_topology(det_pis)

        W = np.exp(-np.linalg.norm(Vs[None, :, :] - Vs[:, None, :], ord=np.inf, axis=-1)+1e-8)

        # mVs = np.mean(Vs, axis=0)  # n_states
        # W = np.dot((Vs - mVs) , (Vs - mVs).T)
        adj = W * A

        G = nx.from_numpy_array(adj)
        pos = nx.spectral_layout(G) #, iterations=500)
        plt.figure(figsize=(16,16))
        nx.draw(G, pos, node_color=[np.sum(v) for v in values], node_size=150)
        plt.savefig('figs/value_graphs/{}-value_graph-{}-{}.png'.format(i, n_states, n_actions))
        plt.close()

        u, v = graph_laplacian_spectra(adj)
        plt.figure(figsize=(8,8))
        plt.bar(range(len(u)), u)
        plt.savefig('figs/value_graphs/{}-lap.png'.format(i))
        plt.close()

        plt.figure(figsize=(16,16))
        n = 5
        for j in range(n*n):
            plt.subplot(n,n,j+1)
            nx.draw(G, pos, node_color=u[10*j] * v[10*j], node_size=150)
        plt.savefig('figs/value_graphs/{}-spectra-{}-{}.png'.format(i, n_states, n_actions))
        plt.close()
Example #21
def compare_acc():
    n_states, n_actions = 2, 2

    lmdp = []
    lmdp_rnd = []
    for _ in range(10):
        mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

        # solve via LMDPs
        p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
        u, v = lmdps.lmdp_solver(p, q, mdp.discount)
        pi_u_star = lmdps.lmdp_decoder(u, mdp.P)

        # solve MDP
        init = np.random.standard_normal((n_states, n_actions))
        pi_star = utils.solve(search_spaces.policy_iteration(mdp), init)[-1]

        # solve via LMDPs
        # with p set to the random dynamics
        p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
        p = np.einsum('ijk,jk->ij', mdp.P,
                      np.ones((n_states, n_actions)) / n_actions)
        # q = np.max(mdp.r, axis=1, keepdims=True)
        u, v = lmdps.lmdp_solver(p, q, mdp.discount)
        pi_u_star_random = lmdps.lmdp_decoder(u, mdp.P)

        # evaluate both policies.
        v_star = utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount)
        v_u_star = utils.value_functional(mdp.P, mdp.r, pi_u_star,
                                          mdp.discount)
        v_u_star_random = utils.value_functional(mdp.P, mdp.r,
                                                 pi_u_star_random,
                                                 mdp.discount)

        lmdp.append(np.isclose(v_star, v_u_star, 1e-3).all())
        lmdp_rnd.append(np.isclose(v_star, v_u_star_random, 1e-3).all())

    print([np.sum(lmdp), np.sum(lmdp_rnd)])
    plt.bar(range(2), [np.sum(lmdp), np.sum(lmdp_rnd)])
    plt.show()
Example #22
def mdp_lmdp_optimality():
    n_states, n_actions = 2, 2

    n = 5
    plt.figure(figsize=(8, 16))
    plt.title('Optimal control (LMDP) vs optimal policy (MDP)')
    for i in range(n):
        mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
        # solve via LMDPs
        p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
        u, v = lmdps.lmdp_solver(p, q, mdp.discount)

        init = np.random.standard_normal((n_states, n_actions))
        pi_star = utils.solve(search_spaces.policy_iteration(mdp), init)[-1]

        P_pi_star = np.einsum('ijk,jk->ij', mdp.P, pi_star)
        plt.subplot(n, 2, 2 * i + 1)
        plt.imshow(u)
        plt.subplot(n, 2, 2 * i + 2)
        plt.imshow(P_pi_star)
    plt.savefig('figs/lmdp_mdp_optimal_dynamics.png')
    plt.show()
Example #23
def generate_snr_map():
    n_states, n_actions = 2, 3
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    # pis = utils.gen_grid_policies(11)
    pis = [utils.random_policy(n_states, n_actions) for _ in range(512)]
    Vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    mags = [grad_mag(mdp.P, mdp.r, pi, mdp.discount) for pi in pis]
    uncert = [variance(mdp.P, mdp.r, pi, mdp.discount) for pi in pis]

    snr = [s / n for s, n in zip(mags, uncert)]

    plt.subplot(3, 1, 1)
    plt.title('Magnitude')
    plt.scatter(Vs[:, 0], Vs[:, 1], c=mags)

    plt.subplot(3, 1, 2)
    plt.title('Variance')
    plt.scatter(Vs[:, 0], Vs[:, 1], c=uncert)

    plt.subplot(3, 1, 3)
    plt.title('SNR')
    plt.scatter(Vs[:, 0], Vs[:, 1], c=snr)
    plt.show()
Example #24
File: density_tests.py Project: act65/mdps
    def test_density():
        mdp = utils.build_random_mdp(2, 2, 0.9)

        pi = utils.softmax(rnd.standard_normal((2,2)), axis=1)
        p_V = density_value_functional(0.1, mdp.P, mdp.r, pi, 0.9)
        print(p_V)
Example #25
File: iteration_ds.py Project: act65/mdps
        v_star = traj[-1]
        trajs.append(traj)
    return trajs


class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)


if __name__ == '__main__':
    rnd.seed(42)
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = utils.gen_grid_policies(4)

    use_momentum = False
    fname = 'test1.json'
    with open(fname, 'w') as f:

        for lr in np.logspace(-2, -1, 2):
            traj = value_iteration(mdp, pis, lr)

            data = {
                '{}-{}-{}'.format(value_iteration.__name__, lr, use_momentum):
                [np.array(t).tolist() for t in traj]
            }
            s = json.dumps(data, cls=NumpyEncoder)
            f.write(s + '\n')
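Reading the dump back is one json.loads per line. A minimal sketch, assuming the file name used above:

import json
import numpy as np

with open('test1.json') as f:
    for line in f:
        for key, traj in json.loads(line).items():
            print(key, np.array(traj[0]).shape)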
Example #26
    # pool = multiprocessing.Pool(n**2)
    # # couldn't serialise the mdp collection, so just unwrap them here.
    # lens_n_pi_stars = pool.map(iteration_fn, [(mdp.P, mdp.r, mdp.discount, mdp.d0, pis, lr) for lr in lrs])
    # for i, lr, results in zip(range(n**2), lrs, lens_n_pi_stars):
    #     len, pi_star = results

    for i, lr in enumerate(lrs):
        print('\n{}: {}\n'.format(i, lr))
        lens, pi_stars = iteration_fn(
            (mdp.P, mdp.r, mdp.discount, mdp.d0, pis, lr))

        plt.subplot(n, n, i + 1)
        plt.title('Learning rate: {}'.format(lr))
        fig = plt.scatter(Vs[:, 0], Vs[:, 1], c=lens, s=5)
        fig.axes.get_xaxis().set_visible(False)
        fig.axes.get_yaxis().set_visible(False)

    plt.tight_layout()
    plt.savefig('figs/iteration-lrs/0-{}.png'.format(name))


if __name__ == '__main__':
    rnd.seed(41)
    n_states, n_actions = 2, 2
    mdps = [utils.build_random_mdp(n_states, n_actions, 0.5) for _ in range(5)]
    pis = utils.gen_grid_policies(31)

    for i, mdp in enumerate(mdps):
        print('\nMDP {}\n'.format(i))
        generate_iteration_figures(mdp, pis, param_policy_gradient, str(i))
Example #27
import numpy as np
import numpy.random as rnd
import mdp.utils as utils
from mdp.search_spaces import *


def clip_solver_traj(traj):
    if np.isclose(traj[-1], traj[-2], 1e-8).all():
        return traj[:-1]
    else:
        return traj


mdp = utils.build_random_mdp(2, 2, 0.5)
init = utils.softmax(rnd.standard_normal((mdp.S, mdp.A)), axis=1)
pi_traj = clip_solver_traj(utils.solve(policy_iteration(mdp), init))
print(pi_traj)
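A tiny usage check of clip_solver_traj (not from the repo): a converged trajectory whose last two iterates coincide gets its duplicate trimmed.

import numpy as np

traj = [np.zeros((2, 2)), np.ones((2, 2)), np.ones((2, 2))]
assert len(clip_solver_traj(traj)) == 2                              # duplicate dropped
assert len(clip_solver_traj([np.zeros((2, 2)), np.ones((2, 2))])) == 2  # unchanged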