    def test_evaluate_mrp(self):
        # Solve for the value function of the two-state flip-flop MRP.
        v = evaluate_mrp_result(self.finite_flip_flop, gamma=0.99)

        self.assertEqual(len(v), 2)

        # With gamma = 0.99 and an expected reward of 1.7 per step, each
        # state's value should be close to 1.7 / (1 - 0.99) = 170.
        for s in v:
            self.assertLess(abs(v[s] - 170), 0.1)

    def test_compare_to_backward_induction(self):
        # Expand the flip-flop MRP over a 10-step horizon: 2 states at each
        # of the 10 non-terminal time steps gives 20 states in total.
        finite_horizon = finite_horizon_MRP(self.finite_flip_flop, 10)

        v = evaluate_mrp_result(finite_horizon, gamma=1)
        self.assertEqual(len(v), 20)

        # Backward induction on the unwrapped finite-horizon MRP should give
        # the same values as evaluating the time-expanded MRP directly.
        finite_v = \
            list(evaluate(unwrap_finite_horizon_MRP(finite_horizon), gamma=1))

        for time in range(0, 10):
            self.assertAlmostEqual(v[WithTime(state=True, time=time)],
                                   finite_v[time][True])
            self.assertAlmostEqual(v[WithTime(state=False, time=time)],
                                   finite_v[time][False])
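
For context, here is a minimal, self-contained sketch of how a flip-flop MRP like the self.finite_flip_flop fixture could be built and evaluated outside the test class. It assumes the FiniteMarkovRewardProcess and Categorical classes from this codebase (rl.markov_process and rl.distribution); the specific rewards and probabilities are an assumption, chosen so the expected reward per step is 1.7 and the gamma = 0.99 value is roughly 1.7 / (1 - 0.99) = 170, matching the assertion above.

from rl.distribution import Categorical
from rl.markov_process import FiniteMarkovRewardProcess
from rl.dynamic_programming import evaluate_mrp_result

# Assumed flip-flop fixture: from either boolean state, flip (reward 2.0)
# with probability 0.7 or stay (reward 1.0) with probability 0.3.
flip_flop = FiniteMarkovRewardProcess({
    b: Categorical({(not b, 2.0): 0.7, (b, 1.0): 0.3})
    for b in (True, False)
})

v = evaluate_mrp_result(flip_flop, gamma=0.99)
for s, value in v.items():
    print(s, value)  # each state's value should be close to 170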
Example #3
    print("---------------")
    implied_mrp.display_reward_function()
    print()

    print("Implied MRP Value Function")
    print("--------------")
    implied_mrp.display_value_function(gamma=user_gamma)
    print()

    from rl.dynamic_programming import evaluate_mrp_result
    from rl.dynamic_programming import policy_iteration_result
    from rl.dynamic_programming import value_iteration_result

    print("Implied MRP Policy Evaluation Value Function")
    print("--------------")
    pprint(evaluate_mrp_result(implied_mrp, gamma=user_gamma))
    print()

    print("MDP Policy Iteration Optimal Value Function and Optimal Policy")
    print("--------------")
    opt_vf_pi, opt_policy_pi = policy_iteration_result(fe_mdp,
                                                       gamma=user_gamma)
    pprint(opt_vf_pi)
    print(opt_policy_pi)
    print()

    print("MDP Value Iteration Optimal Value Function and Optimal Policy")
    print("--------------")
    opt_vf_vi, opt_policy_vi = value_iteration_result(fe_mdp, gamma=user_gamma)
    pprint(opt_vf_vi)
    print(opt_policy_vi)
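
    # Supplementary sketch (not part of the original example): the "implied
    # MRP" above is typically obtained by fixing a policy on the finite MDP.
    # Assuming the apply_finite_policy method of FiniteMarkovDecisionProcess
    # in this codebase, evaluating the MRP implied by the optimal policy
    # should reproduce the optimal value function opt_vf_vi printed above.
    implied_mrp_opt = fe_mdp.apply_finite_policy(opt_policy_vi)
    pprint(evaluate_mrp_result(implied_mrp_opt, gamma=user_gamma))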