def test_missing_terminal_state_fail():
    """Validation must reject a spec that defines no terminal state."""
    with pytest.raises(ValueError):
        with dsl.new() as spec:
            dsl.state()
            dsl.action()
            spec.validate()
def test_multi_actions_fail():
    """Joining two actions with `&` must raise a DSL syntax error."""
    with pytest.raises(dsl.SyntaxError):
        with dsl.new():
            first_action = dsl.action()
            second_action = dsl.action()
            first_action & second_action
def tiny_counterexample_env():
    """Build a small 4-state / 2-action MDP fixture and return the validated spec.

    From s0, a0 leads to s1 and a1 leads to s2 (both with reward 0); from
    s1 and s2 every action reaches the terminal state s3, with asymmetric
    rewards (s1/a0 -> 2, s2/a1 -> 1, the rest 0).
    """
    with dsl.new() as mdp:
        s0 = dsl.state()
        s1 = dsl.state()
        s2 = dsl.state()
        s3 = dsl.terminal_state()
        a0 = dsl.action()
        a1 = dsl.action()
        # Transitions.
        s0 & a0 > s1
        s0 & a1 > s2
        s1 & a0 > s3
        s1 & a1 > s3
        s2 & a0 > s3
        s2 & a1 > s3
        # Rewards for the same state/action pairs.
        s0 & a0 > dsl.reward(0)
        s0 & a1 > dsl.reward(0)
        s1 & a0 > dsl.reward(2)
        s1 & a1 > dsl.reward(0)
        s2 & a0 > dsl.reward(0)
        s2 & a1 > dsl.reward(1)
    return mdp.validate()
def test_terminal_state():
    """A spec with only a terminal state has a zero Q-table."""
    with dsl.new() as spec:
        dsl.terminal_state()
        dsl.action()
    solver = lp.LinearProgramming(spec)
    assert np.isclose(solver.compute_q_table(), [0])
def test_alternatives():
    """Alternatives built with `|` are accepted on both sides of a mapping."""
    with dsl.new():
        state_a = dsl.state()
        state_b = dsl.state()
        action_a = dsl.action()
        action_b = dsl.action()
        (state_a | state_b) & (action_a | action_b) > (state_a | state_b)
        dsl.to_env()
def test_alternatives3():
    """Alternatives of partial mappings (`action > state`) are accepted."""
    with dsl.new():
        state_a = dsl.state()
        state_b = dsl.state()
        action_a = dsl.action()
        action_b = dsl.action()
        (state_a | state_b) & ((action_a > state_a) | (action_b > state_b))
        dsl.to_env()
def _multi_round_nmdp():
    """Non-deterministic multi-round MDP fixture with discount 0.5.

    Two anonymous actions from `start`: one pays reward 5 and moves to
    `start` or `end` (weight 2); the other pays reward 3 and moves to
    `start` (weight 2) or `end`.  Returns the validated spec.
    """
    with dsl.new() as mdp:
        start = dsl.state()
        end = dsl.terminal_state()
        # `* n` attaches a weight to an outcome alternative.
        start & dsl.action() > dsl.reward(5) | start | end * 2
        start & dsl.action() > dsl.reward(3) | start * 2 | end
        dsl.discount(0.5)
    return mdp.validate()
def test_coverage_nmrp():
    """A spec where every state/action pair has a transition converts to an env."""
    with dsl.new():
        state_a = dsl.state()
        state_b = dsl.state()
        action_a = dsl.action()
        action_b = dsl.action()
        state_a & action_a > state_a
        state_a & action_b > state_b
        state_b & (action_a | action_b) > state_b
        dsl.to_env()
def _one_round_dmdp():
    """Deterministic one-round MDP fixture.

    Both actions go straight from `start` to the terminal state; only
    action_1 additionally pays reward 1.  Returns the validated spec.
    """
    with dsl.new() as mdp:
        start = dsl.state()
        end = dsl.terminal_state()
        action_0 = dsl.action()
        action_1 = dsl.action()
        start & (action_0 | action_1) > end
        start & action_1 > dsl.reward(1.)
    return mdp.validate()
def test_coverage_conversion_paths():
    """Exercise to_env(), to_graph(), and validate() on a small two-state spec.

    NOTE(review): this function was originally also named ``test_coverage``,
    the same name as a later test in this file — the redefinition meant
    pytest never collected this one.  Renamed so both tests run.  The
    ``return new_mdp.validate()`` was also dropped: test functions should
    not return non-None values (pytest warns); validate() is still called.
    """
    with dsl.new() as new_mdp:
        state_a = dsl.state()
        state_b = dsl.state()
        action_a = dsl.action()
        action_b = dsl.action()
        state_a & action_a > state_a
        state_a & action_b > state_b
        state_b & (action_a | action_b) > state_b
        new_mdp.to_env()
        new_mdp.to_graph()
    new_mdp.validate()
def long_counterexample_env(middle_steps=1):
    """Build an MDP with two parallel chains of length `middle_steps`.

    From s0, a0 enters the left chain and a1 the right chain.  Inside a
    chain, one action advances and the other steps back; all intermediate
    rewards are 0.  At the chain ends, both actions reach the terminal
    state, with reward 2 for a0 on the left end and reward 1 for a1 on
    the right end.  Returns the validated spec.
    """
    with dsl.new() as mdp:
        s0 = dsl.state()
        a0 = dsl.action()
        a1 = dsl.action()
        # Each chain list starts with s0 so that [-3] indexing below also
        # works for the first appended state.
        middle_l = [s0, dsl.state()]
        middle_r = [s0, dsl.state()]
        s0 & a0 > middle_l[-1]
        s0 & a1 > middle_r[-1]
        s0 & a0 > dsl.reward(0)
        s0 & a1 > dsl.reward(0)
        for step in range(middle_steps - 1):
            # Left chain: a0 advances, a1 steps back.
            middle_l.append(dsl.state())
            middle_l[-2] & a0 > middle_l[-1]
            middle_l[-2] & a1 > middle_l[-3]
            middle_l[-2] & a0 > dsl.reward(0)
            middle_l[-2] & a1 > dsl.reward(0)
            # Right chain: mirrored — a1 advances, a0 steps back.
            middle_r.append(dsl.state())
            middle_r[-2] & a0 > middle_r[-3]
            middle_r[-2] & a1 > middle_r[-1]
            middle_r[-2] & a0 > dsl.reward(0)
            middle_r[-2] & a1 > dsl.reward(0)
        s3 = dsl.terminal_state()
        # Both actions from either chain end reach the terminal state.
        middle_l[-1] & a0 > s3
        middle_l[-1] & a1 > s3
        middle_r[-1] & a0 > s3
        middle_r[-1] & a1 > s3
        # Final rewards differ between the chains.
        middle_l[-1] & a0 > dsl.reward(2)
        middle_l[-1] & a1 > dsl.reward(0)
        middle_r[-1] & a0 > dsl.reward(0)
        middle_r[-1] & a1 > dsl.reward(1)
    return mdp.validate()
def counterexample_env():
    """Two-state MDP with no terminal state.

    From either state, a1 moves to s2 with reward +1 and a2 moves to s1
    with reward 0.  Returns the validated spec.
    """
    with dsl.new() as mdp:
        s1 = dsl.state()
        s2 = dsl.state()
        a1 = dsl.action()
        a2 = dsl.action()
        (s1 | s2) & a1 > s2
        (s1 | s2) & a2 > s1
        (s1 | s2) & a1 > dsl.reward(+1)
        (s1 | s2) & a2 > dsl.reward(0)
    return mdp.validate()
def test_coverage():
    """End-to-end check of a one-transition spec: counts, graph, env, rendering."""
    with dsl.new() as new_mdp:  # type: mdp.MDPSpec
        start = dsl.state()
        finish = dsl.terminal_state()
        action = dsl.action()
        # One transition: start --action--> finish, paying reward 1.
        start & action > finish | dsl.reward(1)
        assert new_mdp.num_states == 2
        assert new_mdp.num_actions == 1
    new_mdp.to_graph()
    env: mdp.MDPEnv = new_mdp.to_env()
    state = env.reset()
    assert state == 0
    # 4-tuple step result (classic gym API: obs, reward, done, info).
    state, reward, is_done, info = env.step(0)
    assert state == 1
    assert reward == 1
    assert is_done
    # Rendering is exercised for coverage in both supported modes.
    env.render(mode='rgb_array')
    env.render(mode='png')
def test_mapping_alternative_mismatch_fail():
    """Mapping a state onto a mixed state/action alternative is a syntax error."""
    with pytest.raises(dsl.SyntaxError):
        with dsl.new():
            some_state = dsl.state()
            some_action = dsl.action()
            some_state > some_state | some_action
def test_weighted_rewards():
    """Weighted reward alternatives (`dsl.reward(r) * w`) are accepted by the DSL.

    Only spec construction is exercised here — the spec is never validated
    or solved.  NOTE(review): the context-manager result was previously
    bound to an unused ``new_mdp`` local; the dead binding is dropped.
    """
    with dsl.new():
        state = dsl.state()
        action = dsl.action()
        state & action > dsl.reward(1) * 1
        state & action > dsl.reward(1) * 1 | dsl.reward(2) * 3
def test_weighted_next_states():
    """Weighted next-state alternatives (`state * w`) pass validation."""
    with dsl.new() as spec:
        only_state = dsl.state()
        only_action = dsl.action()
        only_state & only_action > only_state * 0.5
        only_state & only_action > only_state * 2 | only_state * 5
        spec.validate()
def test_divergence_raises():
    """An undiscounted self-loop with positive reward has no finite value."""
    with dsl.new() as spec:
        loop_state = dsl.state()
        loop_action = dsl.action()
        loop_state & loop_action > loop_state | dsl.reward(1)
    solver = lp.LinearProgramming(spec)
    with pytest.raises(ValueError):
        solver.compute_v_vector(max_iterations=10)
def _two_round_dmdp():
    """Deterministic two-round MDP fixture.

    Round one branches from `start` (action_0 -> better, action_1 -> worse);
    round two pays the branch's reward (3 for better/action_1; 1 or 2 in
    worse) and every action then reaches the terminal state.  Returns the
    validated spec.
    """
    with dsl.new() as mdp:
        start = dsl.state()
        better = dsl.state()
        worse = dsl.state()
        end = dsl.terminal_state()
        action_0 = dsl.action()
        action_1 = dsl.action()
        start & action_0 > better
        better & action_1 > dsl.reward(3)
        start & action_1 > worse
        worse & action_0 > dsl.reward(1)
        worse & action_1 > dsl.reward(2)
        (better | worse) & (action_0 | action_1) > end
    return mdp.validate()
def _two_round_nmdp():
    """Non-deterministic two-round MDP fixture with stochastic rewards.

    Round one branches from `start` to `a` or `b`; round two draws a
    reward from a weighted alternative (e.g. reward 0 with weight 2 vs
    reward 9) and every action then reaches the terminal state.  Returns
    the validated spec.
    """
    with dsl.new() as mdp:
        start = dsl.state()
        a = dsl.state()
        b = dsl.state()
        end = dsl.terminal_state()
        action_0 = dsl.action()
        action_1 = dsl.action()
        start & action_0 > a
        a & action_0 > dsl.reward(-1) | dsl.reward(1)
        a & action_1 > dsl.reward(0) * 2 | dsl.reward(9)
        start & action_1 > b
        b & action_0 > dsl.reward(0) | dsl.reward(2)
        b & action_1 > dsl.reward(2) | dsl.reward(3)
        (a | b) & (action_0 | action_1) > end
    return mdp.validate()
def new_counterexample_env():
    """Four-state MDP with left/right actions and uniform step cost -1.

    The terminal state s5 is reachable via s3 & l or s4 & r; all other
    moves shuttle between s1..s4.  Every non-terminal state/action pair
    pays reward -1.  Returns the validated spec.
    """
    with dsl.new() as mdp:
        s1 = dsl.state()
        s2 = dsl.state()
        s3 = dsl.state()
        s4 = dsl.state()
        s5 = dsl.terminal_state()
        l = dsl.action()
        r = dsl.action()
        # Transitions.
        s1 & l > s2
        s1 & r > s4
        s2 & l > s3
        s2 & r > s1
        s3 & l > s5
        s3 & r > s2
        s4 & l > s1
        s4 & r > s5
        # Uniform -1 reward for every state/action pair.
        s1 & l > dsl.reward(-1)
        s1 & r > dsl.reward(-1)
        s2 & l > dsl.reward(-1)
        s2 & r > dsl.reward(-1)
        s3 & l > dsl.reward(-1)
        s3 & r > dsl.reward(-1)
        s4 & l > dsl.reward(-1)
        s4 & r > dsl.reward(-1)
    return mdp.validate()
def test_geometric_series():
    """Self-loop paying reward 1 at discount 0.5: V = 1/(1 - 0.5) = 2."""
    with dsl.new() as spec:
        looping_state = dsl.state()
        only_action = dsl.action()
        looping_state & only_action > dsl.reward(1) | looping_state
        dsl.discount(0.5)
    solver = lp.LinearProgramming(spec)
    assert np.allclose(solver.compute_v_vector(), [2.0])
    assert np.allclose(solver.compute_q_table(), [[2.0]])
def test_multiple_actions():
    """Solver values for a three-state spec with two absorbing reward loops.

    state_a self-loops with reward 1 and state_b with reward 2; at
    discount 1/3 their values are 1.5 and 3, and the start state's value
    is 1 (best action reaches state_b).  The expected V-vector and
    Q-table below were computed for exactly this spec.
    """
    with dsl.new() as new_mdp:
        start = dsl.state()
        state_a = dsl.state()
        state_b = dsl.state()
        action_a = dsl.action()
        action_b = dsl.action()
        either_action = action_a | action_b
        start & action_a > state_a
        start & action_b > state_b
        state_a & either_action > state_a | dsl.reward(1)
        state_b & either_action > state_b | dsl.reward(2)
        dsl.discount(1 / 3)
    solver = lp.LinearProgramming(new_mdp)
    assert np.allclose(solver.compute_v_vector(), [1, 1.5, 3])
    assert np.allclose(solver.compute_q_table(), [[0.5, 1], [1.5, 1.5], [3, 3]])