def test():
    T1 = np.array([
        [0, 3 / 4, 1 / 4],
        [1 / 3, 1 / 2, 1 / 6],
        [1, 0, 0],
    ])
    T2 = np.array([
        [0, 3 / 4, 1 / 4],
        [2 / 3, 1 / 4, 1 / 12],
        [0, 3 / 4, 1 / 4],
    ])
    R = np.array([
        [0, 1, 1],
        [1, 0, 0],
        [1, 0, 0],
    ])
    mdp_gnd = MDP([T1, T2], [R, R], gamma=0.9)
    phi = np.array([
        [1, 0],
        [0, 1],
        [0, 1],
    ])
    mdp_abs = AbstractMDP(mdp_gnd, phi)
    assert is_markov(mdp_abs)
def generate_markov_mdp_pair(n_states, n_abs_states, n_actions, sparsity=0,
                             gamma=0.9, equal_block_rewards=True,
                             equal_block_transitions=True):
    # Sometimes numerical precision causes the abstract MDP to appear
    # non-Markov, so we just keep generating until the problem goes away.
    # Usually it's fine.
    while True:
        # Generate an MDP and an abstraction function
        mdp_gnd = MDP.generate(n_states=n_states, n_actions=n_actions,
                               sparsity=sparsity, gamma=gamma)
        assert n_abs_states < n_states
        phi = random_phi(n_states, n_abs_states)

        agg_states = ((phi.sum(axis=0) > 1) @ phi.transpose()).astype(bool)
        other_states = ((phi.sum(axis=0) == 1) @ phi.transpose()).astype(bool)

        random_weights = random_transition_matrix((1, n_states - n_abs_states + 1))

        # Adjust T and R to achieve desired properties
        R = np.copy(mdp_gnd.R)
        T = np.copy(mdp_gnd.T)
        for a in range(mdp_gnd.n_actions):
            if equal_block_rewards:
                R[a][agg_states[:, None] * agg_states] = np.mean(
                    mdp_gnd.R[a][agg_states[:, None] * agg_states])
                R[a][other_states[:, None] * agg_states] = np.mean(
                    mdp_gnd.R[a][other_states[:, None] * agg_states])
                R[a][agg_states[:, None] * other_states] = np.mean(
                    mdp_gnd.R[a][agg_states[:, None] * other_states])
            T[a][:, agg_states] = random_weights * np.sum(
                mdp_gnd.T[a][:, agg_states], axis=1, keepdims=True)
            if equal_block_transitions:
                T[a][agg_states] = np.mean(T[a][agg_states, :], axis=0)
                # np.ix_ is needed here: chained fancy indexing like
                # T[a][agg_states][:, agg_states] would assign into a copy
                T[a][np.ix_(agg_states, agg_states)] = random_weights * np.sum(
                    T[a][np.ix_(agg_states, agg_states)], axis=1, keepdims=True)
            # T[a][:, other_states] = random_transition_matrix((1, mdp_gnd.n_states - 2)) * np.sum(mdp_gnd.T[a][:, other_states], axis=1, keepdims=True)
            assert is_stochastic(T[a])
        mdp_gnd.R = R
        mdp_gnd.T = T

        p0 = random_transition_matrix((1, n_states)).squeeze()
        mdp_abs = AbstractMDP(mdp_gnd, phi, p0=p0)
        # Ensure that the abstraction is Markov by checking inverse models and ratios
        if is_markov(mdp_abs):
            break
    return mdp_gnd, mdp_abs
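# Minimal usage sketch for the generator above (the specific sizes are just an
# example): draw a (ground MDP, abstract MDP) pair and confirm that the
# abstraction passes the Markov check by construction.
mdp_gnd, mdp_abs = generate_markov_mdp_pair(n_states=6, n_abs_states=4, n_actions=2)
assert is_markov(mdp_abs)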
def test():
    mdp = MDP.generate(n_states=4, n_actions=2)
    pi_list = mdp.all_policies()
    v_list = [vi(mdp, pi)[0] for pi in pi_list]
    v_ranks = sorted_order(v_list)
    # Sort by rank only; on tied ranks, comparing the numpy arrays directly
    # would raise an ambiguity error
    sorted_v = [v for _, v in sorted(zip(v_ranks, v_list), key=lambda x: x[0])]
    # Each value function should weakly dominate the next one in the ordering
    for v1, v2 in zip(sorted_v[:-1], sorted_v[1:]):
        assert compare_value_fns(v1, v2) != '<'

    # for pi1, v1 in zip(pi_list, v_list):
    #     for pi2, v2 in zip(pi_list, v_list):
    #         print(v1.round(4))
    #         print(compare_value_fns(v1, v2), v2.round(4))
    #         print()

    v_star, _, pi_star = vi(mdp)
    assert compare_value_fns(v_star, sorted_v[0]) == '='
def main():
    mdp = BlockMDP(MDP.generate(n_states=5, n_actions=6), n_obs_per_block=3)
    v, q, pi = vi(mdp)

    # v should agree with reading q off at the greedy policy's actions
    v_alt = np.zeros_like(v)
    for s in range(mdp.n_states):
        v_alt[s] = q[pi[s]][s]
    v_alt = v_alt.squeeze()
    assert np.allclose(v_alt, v)

    # Evaluating the optimal policy should recover v
    v_pi = vi(mdp, pi)[0]
    assert np.allclose(v_pi, v)

    # Solving the base MDP and grounding its policy should match the block MDP's
    m_phi = mdp.base_mdp
    v_phi, q_phi, pi_phi = vi(m_phi)
    pi_phi_grounded = np.kron(pi_phi, np.ones((1, mdp.n_states // m_phi.n_states)))
    assert np.allclose(pi_phi_grounded, pi)

    print('All tests passed.')
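# Tiny illustration of the np.kron grounding used above (pure numpy): each
# abstract action is repeated once per observation in its block.
pi_small = np.array([[0, 1]])               # actions for 2 abstract states
print(np.kron(pi_small, np.ones((1, 3))))   # -> [[0. 0. 0. 1. 1. 1.]]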
def generate_non_markov_mdp_pair(n_states, n_abs_states, n_actions,
                                 sparsity=0, gamma=0.9, fixed_w=False):
    while True:
        mdp_gnd = MDP.generate(n_states=n_states, n_actions=n_actions,
                               sparsity=sparsity, gamma=gamma)
        assert n_abs_states < n_states
        phi = random_phi(n_states, n_abs_states)

        if fixed_w:
            mdp_abs = UniformAbstractMDP(mdp_gnd, phi)
        else:
            mdp_abs = AbstractMDP(mdp_gnd, phi)

        # Ensure non-Markov by checking inverse models and ratios
        if not is_markov(mdp_abs):
            break
    return mdp_gnd, mdp_abs
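# Minimal usage sketch for the generator above (sizes are just an example):
# draw pairs whose abstractions fail the Markov check, with the default
# AbstractMDP weighting and with the fixed-weight UniformAbstractMDP variant.
mdp_gnd, mdp_abs = generate_non_markov_mdp_pair(n_states=6, n_abs_states=4, n_actions=2)
assert not is_markov(mdp_abs)
mdp_gnd_u, mdp_abs_u = generate_non_markov_mdp_pair(n_states=6, n_abs_states=4,
                                                    n_actions=2, fixed_w=True)
assert not is_markov(mdp_abs_u)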
def test_non_markov_B():
    T = np.array([
        [0, .5, .5, 0, 0, 0],
        [0, 0, 0, .5, .5, 0],
        [0, 0, 0, 0, .5, .5],
        [1, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
    ])
    R = (T > 0).astype(float)
    mdp_gnd = MDP([T, T], [R, R], gamma=0.9)
    phi = np.array([
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 1],
        [0, 0, 0, 1],
    ])
    mdp_abs = AbstractMDP(mdp_gnd, phi)
    # Even though this abstract MDP is Markov, is_markov() will return False,
    # since its conditions (while sufficient) are stricter than necessary
    assert not is_markov(mdp_abs)
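# Hand-check of the comment in test_non_markov_B(), using only numpy (pass in
# the T, R, phi defined there). Every ground state in the aggregated abstract
# state has the same next-abstract-state distribution and the same expected
# reward, so this particular abstraction really is Markov even though
# is_markov() rejects it.
def check_aggregated_block(T, R, phi):
    # indicator over ground states that share an abstract state (here: 3, 4, 5)
    agg = ((phi.sum(axis=0) > 1) @ phi.transpose()).astype(bool)
    projected = T[agg] @ phi  # next-abstract-state distribution per ground state
    assert np.allclose(projected, projected[0])
    expected_r = (T[agg] * R[agg]).sum(axis=1)  # expected one-step reward
    assert np.allclose(expected_r, expected_r[0])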
def test_non_I_possibly_markov():
    T1 = np.array([
        #0   1   2   3   4
        [0, .5,  0, .5,  0],  # 0
        [0,  0,  1,  0,  0],  # 1 (action 1)
        [1,  0,  0,  0,  0],  # 2
        [0, .5,  0, .5,  0],  # 3 (action 1)
        [1,  0,  0,  0,  0],  # 4
    ])
    T2 = np.array([
        #0   1   2   3   4
        [0, .5,  0, .5,  0],  # 0
        [0, .5,  0, .5,  0],  # 1 (action 2)
        [1,  0,  0,  0,  0],  # 2
        [0,  0,  0,  0,  1],  # 3 (action 2)
        [1,  0,  0,  0,  0],  # 4
    ])
    T = .2 * T1 + .8 * T2
    R = ((T1 + T2) > 0).astype(float)
    # mdp_gnd = MDP([T1, T2], [R, R], gamma=0.9)
    mdp_gnd = MDP([T, T], [R, R], gamma=0.9)
    phi = np.array([
        [1, 0, 0],  # 0
        [0, 1, 0],  # 1
        [0, 0, 1],  # 2
        [0, 1, 0],  # 3
        [0, 0, 1],  # 4
    ])
    p0 = np.array([1 / 3, 1 / 6, 1 / 6, 1 / 6, 1 / 6])
    mdp_abs = AbstractMDP(mdp_gnd, phi, p0=p0)
    matching_I(mdp_abs)

    pi = mdp_gnd.get_policy(0)
    mdp_gnd.stationary_distribution(p0=p0, max_steps=200).round(4)
    p0_abs = np.array([1 / 3, 1 / 3, 1 / 3])
    mdp_abs.stationary_distribution(p0=p0_abs, max_steps=100).round(3)
    mdp_gnd.get_N(pi=pi)
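# Cross-check for the stationary_distribution() calls above, using plain numpy
# power iteration (a sketch; pass the T and p0 defined inside the test). Both
# actions of mdp_gnd share the same transition matrix T, so iterating
# p <- p @ T converges to the chain's stationary distribution for any policy.
def power_iterate(T, p0, steps=200):
    p = np.asarray(p0, dtype=float)
    for _ in range(steps):
        p = p @ T
    return p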
#%%
# Note that because B(x|z) depends on the action selected at s0, B is not
# Markov. Similarly, R(z',a,z) depends on the same additional history, so the
# abstraction is not Markov either.
T_list = np.array([
    [[0, 1, 0, 0.0],
     [0, 0, 0, 1],
     [0, 0, 0, 1],
     [0, 0, 0, 1]],
    [[0, 0, 1, 0],
     [0, 0, 0, 1],
     [0, 0, 0, 1],
     [0, 0, 0, 1]],
])
R = np.array([
    [0, 0.5, 0, 0],
    [0, 0, 0, 1],
    [0, 0, 0, 2],
    [0, 0, 0, 0],
])
phi = np.array([
    [1, 0, 0],
    [0, 1, 0],
    [0, 1, 0],
    [0, 0, 1],
])
mdp1 = MDP(T_list, [R, R], gamma=0.9)
# mdp2 = AbstractMDP(mdp1, phi, p0=np.array([1, 0, 0, 0]), t=1)
mdp2 = AbstractMDP(mdp1, phi)
is_markov(mdp2)

pi_g_list = mdp2.piecewise_constant_policies()
pi_a_list = mdp2.abstract_policies()
v_g_list = [vi(mdp1, pi)[0] for pi in pi_g_list]
v_a_list = [vi(mdp2, pi)[0] for pi in pi_a_list]
order_v_g = np.stack(sort_value_fns(v_g_list)).round(4)
order_v_a = np.stack(sort_value_fns(v_a_list)).round(4)
mdp2.p0
agg_state = mdp2.phi.sum(axis=0) > 1
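# Hand-check of the note above, using the arrays just defined: both actions
# from ground state 0 lead to the same abstract state, but to different ground
# states (1 vs 2) with different next-step rewards (1 vs 2), so the abstract
# reward depends on history beyond the current abstract state.
s_a0 = np.argmax(T_list[0][0])  # ground state reached via action 0 -> 1
s_a1 = np.argmax(T_list[1][0])  # ground state reached via action 1 -> 2
assert phi[s_a0].argmax() == phi[s_a1].argmax()  # same abstract state
assert R[s_a0, 3] != R[s_a1, 3]                  # different rewards (1 vs 2)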
T = np.array([
    [0, .5, .5, 0, 0, 0],
    [0, 0, 0, .5, .5, 0],
    [0, 0, 0, 0, .5, .5],
    [1, 0, 0, 0, 0, 0],
    [1, 0, 0, 0, 0, 0],
    [1, 0, 0, 0, 0, 0],
])
R = np.array([
    [0, 1, 1, 0, 0, 0],
    [0, 0, 0, 2, 2, 0],
    [0, 0, 0, 0, 2, 2],
    [2, 0, 0, 0, 0, 0],
    [3, 0, 0, 0, 0, 0],
    [4, 0, 0, 0, 0, 0],
]) / 4
mdp1 = MDP([T, T], [R, R], gamma=0.9)
phi = np.array([
    [1, 0, 0, 0],
    [0, 1, 0, 0],
    [0, 0, 1, 0],
    [0, 0, 0, 1],
    [0, 0, 0, 1],
    [0, 0, 0, 1],
])
mdp2 = AbstractMDP(mdp1, phi)
v_star, q_star, pi_star = vi(mdp1)
v_star, pi_star

pi_g_list = mdp2.piecewise_constant_policies()
pi_a_list = mdp2.abstract_policies()
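# Quick observation on this example: the aggregated ground states {3, 4, 5}
# all transition deterministically to state 0, but with different rewards
# (2/4, 3/4, and 4/4), so the reward function is not constant within the
# aggregated block.
assert not np.allclose(R[3:, 0], R[3, 0])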
T1 = np.array([
    [0, 3 / 4, 1 / 4],
    [1 / 3, 1 / 2, 1 / 6],
    [1, 0, 0],
])
T2 = np.array([
    [0, 3 / 4, 1 / 4],
    [2 / 3, 1 / 4, 1 / 12],
    [0, 3 / 4, 1 / 4],
])
# T_alt = np.array([
#     [1 / 2, 3 / 8, 1 / 8],
#     [1, 0, 0],
#     [1, 0, 0],
# ])
R = np.array([
    [0, 1, 1],
    [1, 0, 0],
    [1, 0, 0],
])
mdp1 = MDP([T1, T2], [R, R], gamma=0.9)
v_star, q_star, pi_star = vi(mdp1)
v_star, pi_star
phi = np.array([
    [1, 0],
    [0, 1],
    [0, 1],
])
mdp2 = AbstractMDP(mdp1, phi)
v_phi_star, q_phi_star, pi_phi_star = vi(mdp2)
v_phi_star

# For each ground-state policy
n_policies = mdp1.n_actions**mdp1.n_states
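# Sketch of the enumeration that n_policies counts, using only itertools:
# every deterministic ground policy assigns one of n_actions actions to each
# of the n_states states, so there are n_actions ** n_states of them.
from itertools import product
all_ground_policies = list(product(range(mdp1.n_actions), repeat=mdp1.n_states))
assert len(all_ground_policies) == n_policies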
        [0, 0, 1, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 1],
    ])
else:
    # However, if we enforce a Markov abstraction, the policy
    # ranking remains good even for arbitrary rewards
    phi = np.array([
        [1, 0, 0],
        [0, 1, 0],
        [0, 1, 0],
        [0, 0, 1],
        [0, 0, 1],
    ])

mdp1 = MDP(T_list, R_list, gamma=0.9)
# mdp2 = AbstractMDP(mdp1, phi, p0=np.array([0, 0, 0, 1, 0]), t=200)
mdp2 = AbstractMDP(mdp1, phi)
is_markov(mdp2)
is_hutter_markov(mdp2)
has_block_dynamics(mdp2)

pi_g_list = mdp2.piecewise_constant_policies()
pi_a_list = mdp2.abstract_policies()
v_g_list = [vi(mdp1, pi)[0] for pi in pi_g_list]
v_a_list = [vi(mdp2, pi)[0] for pi in pi_a_list]
order_v_g = np.stack(sort_value_fns(v_g_list)).round(4)
order_v_a = np.stack(sort_value_fns(v_a_list)).round(4)
#%%
# Generate (MDP, abstract MDP) pair
T = np.array([
    [0, 1, 0, 0],
    [0, 0, 1, 0],
    [0, 0, 0, 1],
    [1, 0, 0, 0],
])
R = np.array([
    [3, 3, 3, 3],
    [4, 4, 4, 4],
    [2, 2, 2, 2],
    [1, 1, 1, 1],
]) / 4
mdp1 = MDP([T, T.transpose()], [R, R.transpose()], gamma=0.9)
phi = np.array([
    [1, 0, 0],
    [0, 1, 0],
    [0, 0, 1],
    [1, 0, 0],
])
mdp2 = UniformAbstractMDP(mdp1, phi)
is_markov(mdp2)

v_g_star, q_g_star, pi_g_star = vi(mdp1)
v_g_star, pi_g_star
v_a_star, q_a_star, pi_a_star = vi(mdp2)
v_a_star, pi_a_star
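# Quick observation: phi aggregates ground states 0 and 3, whose reward rows
# differ everywhere (3/4 vs 1/4), so the abstract values here depend on how
# UniformAbstractMDP weights the two states within the block.
assert not np.allclose(R[0], R[3])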
T1 = np.array([
    [0, 3 / 4, 1 / 4],
    [1 / 3, 1 / 2, 1 / 6],
    [1, 0, 0],
])
T2 = np.array([
    [0, 3 / 4, 1 / 4],
    [2 / 3, 1 / 4, 1 / 12],
    [0, 3 / 4, 1 / 4],
])
# T_alt = np.array([
#     [1 / 2, 3 / 8, 1 / 8],
#     [1, 0, 0],
#     [1, 0, 0],
# ])
R = np.array([
    [0, 1, 1],
    [1, 0, 0],
    [1, 0, 0],
])
mdp1 = MDP([T1, T2], [R, R], gamma=0.9)
mdp2 = AbstractMDP(mdp1, np.array([[1, 0], [0, 1], [0, 1]]))
is_hutter_markov(mdp2)
is_markov(mdp2)

v_star, q_star, pi_star = vi(mdp1)
v_star, pi_star
phi = np.array([
    [1, 0],
    [0, 1],
    [0, 1],
])
mdp2 = AbstractMDP(mdp1, phi)
assert is_markov(mdp2)
assert has_block_dynamics(mdp2)