# Enumerate the two policy families exposed by the abstract MDP. mdp1 and mdp2
# are created outside this excerpt.
pi_g_list = mdp2.piecewise_constant_policies()
pi_a_list = mdp2.abstract_policies()

# Run vi() once per policy and keep element [0] of each result (treated as the
# value function): pi_g_list is run against mdp1, pi_a_list against mdp2.
v_g_list = [vi(mdp1, pi)[0] for pi in pi_g_list]
v_a_list = [vi(mdp2, pi)[0] for pi in pi_a_list]

# Output of sort_value_fns() stacked into one array and rounded to 4 decimals.
order_v_g = np.stack(sort_value_fns(v_g_list)).round(4)
order_v_a = np.stack(sort_value_fns(v_a_list)).round(4)

# Bare expression: only shows something when run in an interactive session.
mdp2.p0
# Boolean mask over the columns of phi whose entries sum to more than 1.
# NOTE(review): presumably the abstract states that aggregate several ground
# states -- confirm against the definition of phi.
agg_state = mdp2.phi.sum(axis=0) > 1
# mdp2.B(pi, t=1) restricted to the masked entries, one row per policy in
# pi_g_list. The result is not stored.
np.stack([mdp2.B(pi, t=1)[agg_state] for pi in pi_g_list])

# vi() without a policy yields three values; the first and third are kept.
# NOTE(review): presumably the optimal abstract value function and policy.
v_phi_pi_phi_star, _, pi_phi_star = vi(mdp2)
# Map that abstract policy to a ground policy and run it on mdp1.
v_pi_phi_star = vi(mdp1, mdp2.get_ground_policy(pi_phi_star))[0]

# Look for examples of v_pi_phi_star < v
# The loop stops at the first match, so `v` stays bound to that value function
# afterwards; the else branch runs only when no match was found.
for v in v_g_list:
    if compare_value_fns(v_pi_phi_star, v) == "<":
        print('Found example of order mismatch.')
        break
else:
    print('All examples had proper ordering.')
#%%
# Plot both families of value functions. The second argument is commented out
# here; other calls in this file pass a 'graphviz/...' path in that position.
graph_value_fns(v_g_list)  #, 'graphviz/non_markov_b/ground_17')
graph_value_fns(v_a_list)  #, 'graphviz/non_markov_b/abstract_17')

# Bare expressions for interactive inspection; the results are not stored.
v_pi_phi_star
np.asarray(v_g_list).round(3)
np.asarray(v_a_list).round(3)
    [4, 0, 0, 0, 0, 0]
])/4

# Ground MDP built from the same T and the same R passed twice, with discount
# 0.9. T and R are defined outside this excerpt.
# NOTE(review): presumably one list entry per action, i.e. two identical
# actions -- confirm against the MDP constructor.
mdp1 = MDP([T, T], [R, R], gamma=0.9)
# 6x4 one-hot matrix: the last three rows all select the last column.
# NOTE(review): presumably rows are ground states and columns abstract states.
phi = np.array([
    [1, 0, 0, 0],
    [0, 1, 0, 0],
    [0, 0, 1, 0],
    [0, 0, 0, 1],
    [0, 0, 0, 1],
    [0, 0, 0, 1],
])
mdp2 = AbstractMDP(mdp1, phi)

# vi() without a policy yields three values, unpacked here as v, q and pi.
v_star, q_star, pi_star = vi(mdp1)
# Bare tuple expression for interactive display only.
v_star, pi_star

# Run vi() for every piecewise-constant policy on mdp1 and every abstract
# policy on mdp2, keeping element [0] of each result.
pi_g_list = mdp2.piecewise_constant_policies()
pi_a_list = mdp2.abstract_policies()
v_g_list = [vi(mdp1, pi)[0] for pi in pi_g_list]
v_a_list = [vi(mdp2, pi)[0] for pi in pi_a_list]

# Checks whether every entry of v_g_list is close to the first one; the
# boolean is discarded unless this runs interactively.
np.allclose(v_g_list, v_g_list[0])

# The outputs of sorted_order() for both families must match numerically.
order_v_g = sorted_order(v_g_list)
order_v_a = sorted_order(v_a_list)
assert np.allclose(order_v_a, order_v_g)

graph_value_fns(v_a_list)
graph_value_fns(v_g_list)
# Example #3
0
    # NOTE(review): this indented body belongs to an enclosing statement whose
    # header is not visible in this excerpt; mdp1 and mdp2 come from there.
    # vi() without a policy yields three values, unpacked as v, q and pi.
    v_star, q_star, pi_star = vi(mdp1)
    # Bare tuple expression; has no effect outside an interactive session.
    v_star, pi_star

    # Run vi() for each policy of both families, keeping element [0].
    pi_g_list = mdp2.piecewise_constant_policies()
    pi_a_list = mdp2.abstract_policies()
    v_g_list = [vi(mdp1, pi)[0] for pi in pi_g_list]
    v_a_list = [vi(mdp2, pi)[0] for pi in pi_a_list]

    # The outputs of sorted_order() for both families must match numerically.
    order_v_g = sorted_order(v_g_list)
    order_v_a = sorted_order(v_a_list)
    assert np.allclose(order_v_a, order_v_g)
# Reached only if none of the preceding assertions failed.
print('All tests passed.')

#%%
# Plot both families of value functions, passing a 'graphviz/...' path as the
# second argument.
graph_value_fns(v_g_list, 'graphviz/arbitrary_both/ground_10')
graph_value_fns(v_a_list, 'graphviz/arbitrary_both/abstract_10')


#%%
# vi() without a policy on the abstract MDP; unpacked as v, q and pi.
v_phi_star, q_phi_star, pi_phi_star = vi(mdp2)
# Bare expression for interactive display only.
v_phi_star

# Number of ways to assign one of mdp1.n_actions actions to each of
# mdp1.n_states states.
n_policies = mdp1.n_actions**mdp1.n_states
def get_policy(mdp, i):
    """Return the ``i``-th deterministic policy of ``mdp`` as an action array.

    ``i`` is interpreted as a number written in base ``mdp.n_actions`` and
    left-padded with zeros to ``mdp.n_states`` digits. The digit at position
    ``k`` (most significant first) is the action index stored at position
    ``k`` of the result.

    Args:
        mdp: Object exposing integer ``n_actions`` and ``n_states`` attributes.
        i: Policy index with ``0 <= i < mdp.n_actions ** mdp.n_states``.

    Returns:
        Integer ``numpy`` array of length ``mdp.n_states``.

    Raises:
        AssertionError: If ``i`` is outside the valid range for ``mdp``.
    """
    n_actions, n_states = mdp.n_actions, mdp.n_states
    # Bound the index by the MDP that was actually passed in, not by a
    # module-level count derived from some other MDP; otherwise an index that
    # is too large would yield more digits than there are states.
    limit = n_actions ** n_states
    assert 0 <= i < limit, (
        "policy index {!r} out of range [0, {!r})".format(i, limit))

    # Plain divmod arithmetic instead of a digit string: string digits turn
    # into letters once the base exceeds 10 and cannot be parsed as ints, and
    # base 1 is not a valid radix for digit-string conversion.
    pi = np.zeros(n_states, dtype=int)
    remainder = int(i)
    for state in reversed(range(n_states)):
        remainder, pi[state] = divmod(remainder, n_actions)
    return pi

# for each ground-state policy
# Ground MDP whose second list entries are the transposes of T and R, with
# discount 0.9. T and R are defined outside this excerpt.
# NOTE(review): presumably one list entry per action -- confirm against MDP.
mdp1 = MDP([T, T.transpose()], [R, R.transpose()], gamma=0.9)
# 4x3 one-hot matrix: the first and last rows both select column 0.
# NOTE(review): presumably rows are ground states and columns abstract states.
phi = np.array([
    [1, 0, 0],
    [0, 1, 0],
    [0, 0, 1],
    [1, 0, 0],
])
mdp2 = UniformAbstractMDP(mdp1, phi)
# Bare call: the result is discarded unless this runs interactively.
is_markov(mdp2)

# vi() without a policy on the ground MDP; unpacked as v, q and pi.
v_g_star, q_g_star, pi_g_star = vi(mdp1)
v_g_star, pi_g_star

# Same call on the abstract MDP.
v_a_star, q_a_star, pi_a_star = vi(mdp2)
v_a_star, pi_a_star

# Run vi() for each policy of both families, keeping element [0].
pi_g_list = mdp2.piecewise_constant_policies()
pi_a_list = mdp2.abstract_policies()
v_g_list = [vi(mdp1, pi)[0] for pi in pi_g_list]
v_a_list = [vi(mdp2, pi)[0] for pi in pi_a_list]

# Print partial_ordering() of each family before asserting that the
# sorted_order() outputs match, so the orderings are visible even if the
# assertion fails.
order_v_g = sorted_order(v_g_list)
order_v_a = sorted_order(v_a_list)
print(partial_ordering(v_g_list))
print(partial_ordering(v_a_list))
assert np.allclose(order_v_a, order_v_g)

#%%
# Plot both families, passing a 'graphviz/...' path as the second argument.
graph_value_fns(v_g_list, 'graphviz/non_markov/ground_11')
graph_value_fns(v_a_list, 'graphviz/non_markov/abstract_11')
# Example #5
0
# v_g_list, v_a_list, mdp1 and mdp2 are defined outside this excerpt.
order_v_g = sort_value_fns(v_g_list)
order_v_a = sort_value_fns(v_a_list)

# vi() without a policy yields three values; the first and third are kept.
# NOTE(review): presumably the optimal abstract value function and policy.
v_phi_pi_phi_star, _, pi_phi_star = vi(mdp2)
# Map that abstract policy to a ground policy and run it on mdp1.
v_pi_phi_star = vi(mdp1, mdp2.get_ground_policy(pi_phi_star))[0]

# Look for examples of v_pi_phi_star < v
# Nothing is printed on a match; the loop just stops, leaving `v` bound to the
# matching value function. The else branch runs only when no match was found.
for v in v_g_list:
    if compare_value_fns(v_pi_phi_star, v) == "<":
        break
else:
    print('No examples found.')

#%%
graph_value_fns(v_g_list)
graph_value_fns(v_a_list)

# Bare expressions for interactive inspection; the results are not stored.
v_pi_phi_star
np.asarray(v_g_list).round(3)
np.asarray(v_a_list).round(3)

#%%
# This illustrates an example where V^{\pi_\phi^*} < max_{\pi\in \Pi_\phi} V^{\pi}
# Note the fixed weighting scheme.
# T_list and R_list are 2x3x3 literals; every row of each T_list matrix has a
# single 1.
# NOTE(review): presumably indexed [action, state, next_state] -- confirm.
T_list = np.array([[[1., 0., 0.], [1., 0., 0.], [0., 0., 1.]],
                   [[0., 1., 0.], [0., 0., 1.], [0., 1., 0.]]])
R_list = np.array([[[1., 0., 0.], [0.5, 0., 0.], [0., 0., 0.5]],
                   [[0., 1., 0.], [0., 0., 1.], [0., 0.1, 0.]]])
# 3x2 one-hot matrix: the first and last rows select column 1, the middle row
# selects column 0.
phi = np.array([[0, 1], [1, 0], [0, 1]])