def test_iterate_value_q_pi_with_pdf():
    """
    Run `iterate_value_q_pi` with a PDF save path, then check epsilon validation and greedy policy improvement.

    The test performs three steps:

    1. Calls `iterate_value_q_pi` on the example-4.1 gridworld for 3000 improvements, passing
       `num_improvements_per_plot=1500` and a temporary `pdf_save_path`.
    2. Sets a negative epsilon on the estimator and expects `improve_policy` to raise a `ValueError` whose message
       matches 'Epsilon must be >= 0'.
    3. Resets epsilon to 0.0 and expects `improve_policy` to return 14 (presumably the number of states whose policy
       was improved -- confirm against `improve_policy`).
    """

    # Local import: needed only for removing the temporary file below.
    import os

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.1, None)

    mdp_agent = StochasticMdpAgent(
        'test',
        random_state,
        q_S_A.get_initial_policy(),
        1
    )

    # Reserve a unique path but close the handle immediately. Leaving the handle open (as an unreferenced
    # NamedTemporaryFile would) leaks a file descriptor until garbage collection and can prevent the path from being
    # reopened on some platforms.
    with tempfile.NamedTemporaryFile(delete=False) as pdf_file:
        pdf_save_path = pdf_file.name

    try:
        iterate_value_q_pi(
            agent=mdp_agent,
            environment=mdp_environment,
            num_improvements=3000,
            num_episodes_per_improvement=1,
            update_upon_every_visit=False,
            planning_environment=None,
            make_final_policy_greedy=False,
            q_S_A=q_S_A,
            num_improvements_per_plot=1500,
            pdf_save_path=pdf_save_path
        )
    finally:
        # Best-effort cleanup so repeated test runs do not accumulate temporary files. A failure to delete must not
        # mask the outcome of the call above.
        try:
            os.remove(pdf_save_path)
        except OSError:
            pass

    # A negative epsilon must be rejected when the policy is improved.
    with pytest.raises(ValueError, match='Epsilon must be >= 0'):
        q_S_A.epsilon = -1.0
        q_S_A.improve_policy(mdp_agent, states=None, event=PolicyImprovementEvent.MAKING_POLICY_GREEDY)

    # With a valid epsilon of zero, improvement succeeds and reports the expected count.
    q_S_A.epsilon = 0.0
    assert q_S_A.improve_policy(mdp_agent, None, PolicyImprovementEvent.MAKING_POLICY_GREEDY) == 14
def test_invalid_iterate_value_q_pi():
    """
    Exercise `iterate_value_q_pi` with argument combinations that are rejected or flagged.

    First, a planning environment is supplied and the call is expected to raise a `ValueError` stating that planning
    environments are not supported for Monte Carlo iteration. Second, the function is invoked with epsilon set to 0.0
    and no off-policy agent; per the original inline note this is the "warning" scenario, although no assertion is
    made about the warning itself.
    """

    rng = RandomState(12345)

    gridworld: Gridworld = Gridworld.example_4_1(rng, None)

    estimator = TabularStateActionValueEstimator(gridworld, 0.0, None)

    # Agent whose policy is the target of improvement.
    target_agent = StochasticMdpAgent('test', rng, estimator.get_initial_policy(), 1)

    # Agent whose (behavior) policy generates the episodes.
    behavior_agent = StochasticMdpAgent('test', rng, estimator.get_initial_policy(), 1)

    # Keyword arguments common to both invocations below.
    shared_kwargs = dict(
        agent=target_agent,
        environment=gridworld,
        num_improvements=100,
        num_episodes_per_improvement=1,
        update_upon_every_visit=True,
        make_final_policy_greedy=False,
        q_S_A=estimator
    )

    expected_message = 'Planning environments are not currently supported for Monte Carlo iteration.'

    with pytest.raises(ValueError, match=expected_message):
        iterate_value_q_pi(
            planning_environment=TrajectorySamplingMdpPlanningEnvironment(
                'foo',
                rng,
                StochasticEnvironmentModel(),
                100,
                None
            ),
            off_policy_agent=behavior_agent,
            **shared_kwargs
        )

    # Warning scenario:  epsilon of 0.0 combined with no off-policy agent.
    estimator.epsilon = 0.0
    iterate_value_q_pi(
        planning_environment=None,
        off_policy_agent=None,
        **shared_kwargs
    )