예제 #1
0
def test_V_pi():
    """Check V_pi of the uniform-random policy against reference values.

    Builds the MDP returned by ``build_SB_example35`` (presumably Example 3.5
    from Sutton & Barto -- confirm against the builder), evaluates the policy
    that picks every action with equal probability, and compares the result,
    rounded to one decimal place, with 25 hard-coded reference values.
    """
    mdp = build_SB_example35()

    print(mdp.reward)

    # Uniform random policy: one row per state, equal mass on each action.
    # The first two axes of P are used as the (state, action) counts.
    num_states, num_actions = mdp.P.shape[0], mdp.P.shape[1]
    policy = np.full((num_states, num_actions), 1.0 / num_actions)

    V_pi = calculate_V_pi(mdp.P, mdp.reward, policy, mdp.discount)

    # Reference state values, listed five per line.
    expected = np.array([
        3.3, 8.8, 4.4, 5.3, 1.5,
        1.5, 3.0, 2.3, 1.9, 0.5,
        0.1, 0.7, 0.7, 0.4, -0.4,
        -1.0, -0.4, -0.4, -0.6, -1.2,
        -1.9, -1.3, -1.2, -1.4, -2.0,
    ])
    rounded = np.round(V_pi, 1)
    assert np.allclose(rounded, expected)
예제 #2
0
파일: test_examples.py 프로젝트: d3sm0/emdp
def test_cakeworld_mdp():
    """Numerical test for cake world mdp.

    This numerical test ensures that calculations from implemented MDP represent
    those that are obtained from calculations in the paper.

    The MDP is built from the same ``epsilon`` and ``discount`` values that
    feed the closed-form expectation, so the two cannot drift apart when one
    of the constants is edited.
    """
    epsilon = 0.1
    discount = 0.99
    # Pass the named constants rather than repeating the literals, keeping the
    # constructed MDP and the expected-value formula below in sync.
    built_mdp = build_cake_world_mdp(epsilon=epsilon, discount=discount)
    # Policy with probability 0.5 on each of two actions, in both rows.
    eval_policy = np.array([[0.5, 0.5], [0.5, 0.5]])

    calc_v_pi = analytic.calculate_V_pi(
        built_mdp.P, built_mdp.reward, eval_policy, built_mdp.discount)

    # Value of "Bad State" is independent of policy.
    # Index 1 is taken to be the "Bad State" -- confirm against the builder.
    expected_value = -2.0 * (1 + epsilon) / discount
    calculated_value = calc_v_pi[1]
    assert np.isclose(calculated_value, expected_value)