def test_value_iteration():

    # run policy iteration on v_pi
    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent_v_pi_policy_iteration = ActionValueMdpAgent(
        'test', random_state, 1, TabularStateActionValueEstimator(mdp_environment, None, None))
    iterate_policy_v_pi(mdp_agent_v_pi_policy_iteration, mdp_environment, 0.001, True)

    # run value iteration on v_pi
    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent_v_pi_value_iteration = ActionValueMdpAgent(
        'test', random_state, 1, TabularStateActionValueEstimator(mdp_environment, None, None))
    iterate_value_v_pi(mdp_agent_v_pi_value_iteration, mdp_environment, 0.001, 1, True)

    assert mdp_agent_v_pi_policy_iteration.pi == mdp_agent_v_pi_value_iteration.pi

    # run value iteration on q_pi
    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent_q_pi_value_iteration = ActionValueMdpAgent(
        'test', random_state, 1, TabularStateActionValueEstimator(mdp_environment, None, None))
    iterate_value_q_pi(mdp_agent_q_pi_value_iteration, mdp_environment, 0.001, 1, True)

    assert mdp_agent_q_pi_value_iteration.pi == mdp_agent_v_pi_policy_iteration.pi

def test_run():

    random_state = RandomState(12345)
    mdp_environment: GamblersProblem = GamblersProblem(
        'gamblers problem', random_state=random_state, T=None, p_h=0.4)
    agent = ActionValueMdpAgent(
        'test', random_state, 1, TabularStateActionValueEstimator(mdp_environment, None, None))

    monitor = Monitor()
    state = mdp_environment.reset_for_new_run(agent)
    agent.reset_for_new_run(state)
    mdp_environment.run(agent, monitor)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_run.pickle', 'wb') as file:
    #     pickle.dump(monitor, file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_run.pickle', 'rb') as file:
        fixture = pickle.load(file)

    assert monitor.t_average_reward == fixture.t_average_reward

def test_human_player():

    random_state = RandomState(12345)
    human = Human()

    def mock_input(*_) -> str:
        s = human.most_recent_state
        selected_a = sample_list_item(s.AA, probs=None, random_state=random_state)
        return selected_a.name

    human.get_input = mock_input

    mancala: Mancala = Mancala(random_state=random_state, T=None, initial_count=4, player_2=human)

    epsilon = 0.05
    p1 = ActionValueMdpAgent(
        'player 1', random_state, 1, TabularStateActionValueEstimator(mancala, epsilon, None))

    state = mancala.reset_for_new_run(p1)
    p1.reset_for_new_run(state)
    a = p1.act(0)
    state, reward = mancala.advance(state, 0, a, p1)

    assert mancala.board[7].count == 0 and state.i == 1 and reward.i == 2

def test_off_policy_monte_carlo_with_function_approximation():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment,
        0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        None,
        False,
        None,
        None
    )

    # target agent
    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1, q_S_A
    )

    # episode generation (behavior) policy
    off_policy_agent = ActionValueMdpAgent(
        'test', random_state, 1, TabularStateActionValueEstimator(mdp_environment, None, None)
    )

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=100,
        num_episodes_per_improvement=1,
        update_upon_every_visit=True,
        planning_environment=None,
        make_final_policy_greedy=False,
        off_policy_agent=off_policy_agent
    )

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_off_policy_monte_carlo_with_function_approximationo.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_off_policy_monte_carlo_with_function_approximationo.pickle', 'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert mdp_agent.pi == pi_fixture and q_S_A == q_S_A_fixture
    assert str(mdp_agent.pi.estimator[mdp_environment.SS[5]][mdp_environment.SS[5].AA[1]]).startswith('-2.4305')

    # make greedy
    q_S_A.epsilon = 0.0
    assert q_S_A.improve_policy(mdp_agent, None, PolicyImprovementEvent.MAKING_POLICY_GREEDY) == -1
    assert mdp_agent.pi.estimator.epsilon == 0.0

def test_learn():

    random_state = RandomState(12345)
    mancala: Mancala = Mancala(random_state=random_state, T=None, initial_count=4, player_2=None)
    p1 = ActionValueMdpAgent(
        'player 1', random_state, 1, TabularStateActionValueEstimator(mancala, 0.05, None))

    checkpoint_path = iterate_value_q_pi(
        agent=p1,
        environment=mancala,
        num_improvements=3,
        num_episodes_per_improvement=100,
        update_upon_every_visit=False,
        planning_environment=None,
        make_final_policy_greedy=False,
        num_improvements_per_checkpoint=3,
        checkpoint_path=tempfile.NamedTemporaryFile(delete=False).name)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_mancala.pickle', 'wb') as file:
    #     pickle.dump(p1.pi, file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_mancala.pickle', 'rb') as file:
        fixture = pickle.load(file)

    assert tabular_pi_legacy_eq(p1.pi, fixture)

    resumed_p1 = resume_from_checkpoint(
        checkpoint_path=checkpoint_path,
        resume_function=iterate_value_q_pi,
        num_improvements=2)

    # run same number of improvements without checkpoint...result should be the same.
    random_state = RandomState(12345)
    mancala: Mancala = Mancala(random_state=random_state, T=None, initial_count=4, player_2=None)
    no_checkpoint_p1 = ActionValueMdpAgent(
        'player 1', random_state, 1, TabularStateActionValueEstimator(mancala, 0.05, None))

    iterate_value_q_pi(
        agent=no_checkpoint_p1,
        environment=mancala,
        num_improvements=5,
        num_episodes_per_improvement=100,
        update_upon_every_visit=False,
        planning_environment=None,
        make_final_policy_greedy=False)

    assert no_checkpoint_p1.pi == resumed_p1.pi

def test_learn():

    random_state = RandomState(12345)
    gym = Gym(random_state=random_state, T=None, gym_id='CartPole-v1')
    q_S_A = TabularStateActionValueEstimator(gym, 0.05, 0.001)
    mdp_agent = ActionValueMdpAgent('agent', random_state, 1, q_S_A)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=gym,
        num_improvements=10,
        num_episodes_per_improvement=100,
        num_updates_per_improvement=None,
        alpha=0.1,
        mode=Mode.SARSA,
        n_steps=1,
        planning_environment=None,
        make_final_policy_greedy=False)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_gym.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_gym.pickle', 'rb') as file:
        fixture_pi, fixture_q_S_A = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, fixture_pi) and tabular_estimator_legacy_eq(q_S_A, fixture_q_S_A)

def test_sarsa_iterate_value_q_pi_with_trajectory_planning():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.05, None)
    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    planning_environment = TrajectorySamplingMdpPlanningEnvironment(
        'test planning', random_state, StochasticEnvironmentModel(), 10, None)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=100,
        num_episodes_per_improvement=1,
        num_updates_per_improvement=None,
        alpha=0.1,
        mode=Mode.SARSA,
        n_steps=1,
        planning_environment=planning_environment,
        make_final_policy_greedy=True)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_td_iteration_of_value_q_pi_planning.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_td_iteration_of_value_q_pi_planning.pickle', 'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, pi_fixture) and tabular_estimator_legacy_eq(q_S_A, q_S_A_fixture)

def test_evaluate_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    q_S_A = TabularStateActionValueEstimator(mdp_environment, None, None)
    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    evaluated_states, _ = evaluate_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_episodes=1000,
        exploring_starts=True,
        update_upon_every_visit=False)

    assert len(q_S_A) == len(evaluated_states) + 2  # terminal states aren't evaluated
    assert all(s in q_S_A for s in evaluated_states)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_evaluation_of_state_action_value.pickle', 'wb') as file:
    #     pickle.dump(q_S_A, file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_evaluation_of_state_action_value.pickle', 'rb') as file:
        fixture = pickle.load(file)

    assert tabular_estimator_legacy_eq(q_S_A, fixture)

def test_iterate_value_q_pi_with_pdf():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.1, None)
    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1, q_S_A
    )

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=3000,
        num_episodes_per_improvement=1,
        update_upon_every_visit=False,
        planning_environment=None,
        make_final_policy_greedy=False,
        num_improvements_per_plot=1500,
        pdf_save_path=tempfile.NamedTemporaryFile(delete=False).name
    )

    with pytest.raises(ValueError, match='Epsilon must be >= 0'):
        q_S_A.epsilon = -1.0
        q_S_A.improve_policy(mdp_agent, states=None, event=PolicyImprovementEvent.MAKING_POLICY_GREEDY)

    q_S_A.epsilon = 0.0
    assert q_S_A.improve_policy(mdp_agent, None, PolicyImprovementEvent.MAKING_POLICY_GREEDY) == 14

def test_n_step_q_learning_iterate_value_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.05, None)
    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=10,
        num_episodes_per_improvement=100,
        num_updates_per_improvement=None,
        alpha=0.1,
        mode=Mode.Q_LEARNING,
        n_steps=3,
        planning_environment=None,
        make_final_policy_greedy=False)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_td_n_step_q_learning_iteration_of_value_q_pi.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_td_n_step_q_learning_iteration_of_value_q_pi.pickle', 'rb') as file:
        fixture_pi, fixture_q_S_A = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, fixture_pi) and tabular_estimator_legacy_eq(q_S_A, fixture_q_S_A)

def test_iterate_value_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.1, None)
    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1, q_S_A
    )

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=3000,
        num_episodes_per_improvement=1,
        update_upon_every_visit=False,
        planning_environment=None,
        make_final_policy_greedy=False
    )

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_iteration_of_value_q_pi.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_iteration_of_value_q_pi.pickle', 'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, pi_fixture) and tabular_estimator_legacy_eq(q_S_A, q_S_A_fixture)

def test_q_learning_iterate_value_q_pi_function_approximation_with_formula():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)
    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment,
        0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        StateActionIdentityFeatureExtractor(mdp_environment),
        f'C(s, levels={[s.i for s in mdp_environment.SS]}):C(a, levels={[a.i for a in mdp_environment.SS[0].AA]})',
        False,
        None,
        None)
    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=5,
        num_episodes_per_improvement=5,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=False)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation.pickle', 'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert np.allclose(mdp_agent.pi.estimator.model.model.coef_, pi_fixture.estimator.model.model.coef_)

def test_invalid_iterate_value_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    # target agent
    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1, TabularStateActionValueEstimator(mdp_environment, 0.0, None)
    )

    # episode generation (behavior) policy
    off_policy_agent = ActionValueMdpAgent(
        'test', random_state, 1, TabularStateActionValueEstimator(mdp_environment, 0.0, None)
    )

    with pytest.raises(ValueError, match='Planning environments are not currently supported for Monte Carlo iteration.'):
        iterate_value_q_pi(
            agent=mdp_agent,
            environment=mdp_environment,
            num_improvements=100,
            num_episodes_per_improvement=1,
            update_upon_every_visit=True,
            planning_environment=TrajectorySamplingMdpPlanningEnvironment(
                'foo', random_state, StochasticEnvironmentModel(), 100, None),
            make_final_policy_greedy=False,
            off_policy_agent=off_policy_agent
        )

    # test warning...no off-policy agent with epsilon=0.0
    mdp_agent.q_S_A.epsilon = 0.0
    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=100,
        num_episodes_per_improvement=1,
        update_upon_every_visit=True,
        planning_environment=None,
        make_final_policy_greedy=False,
        off_policy_agent=None
    )

def test_q_learning_iterate_value_q_pi_function_approximation_policy_ne():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    epsilon = 0.05
    q_S_A_1 = ApproximateStateActionValueEstimator(
        mdp_environment,
        epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        None,
        False,
        None,
        None)
    mdp_agent_1 = ActionValueMdpAgent('test', random_state, 1, q_S_A_1)

    iterate_value_q_pi(
        agent=mdp_agent_1,
        environment=mdp_environment,
        num_improvements=5,
        num_episodes_per_improvement=10,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True)

    q_S_A_2 = ApproximateStateActionValueEstimator(
        mdp_environment,
        epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        None,
        False,
        None,
        None)
    mdp_agent_2 = ActionValueMdpAgent('test', random_state, 1, q_S_A_2)

    iterate_value_q_pi(
        agent=mdp_agent_2,
        environment=mdp_environment,
        num_improvements=5,
        num_episodes_per_improvement=5,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True)

    assert mdp_agent_1.pi.estimator != mdp_agent_2.pi.estimator
    assert mdp_agent_1.pi.estimator.model != mdp_agent_2.pi.estimator.model

def test_invalid_improve_policy_with_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    epsilon = 0.0
    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1, TabularStateActionValueEstimator(mdp_environment, epsilon, None))

    with pytest.raises(ValueError, match='Epsilon must be >= 0'):
        improve_policy_with_q_pi(mdp_agent, {}, -1)

def dump_agent() -> str:

    # create dummy mdp agent for runner
    # noinspection PyTypeChecker
    stochastic_mdp_agent = ActionValueMdpAgent('foo', RandomState(12345), 1.0, DummyQSA())

    agent_path = tempfile.NamedTemporaryFile(delete=False).name
    with open(agent_path, 'wb') as f:
        pickle.dump(stochastic_mdp_agent, f)

    return agent_path

def test_q_learning_iterate_value_q_pi_tabular_policy_ne():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    epsilon = 0.05
    q_S_A_1 = TabularStateActionValueEstimator(mdp_environment, epsilon, None)
    mdp_agent_1 = ActionValueMdpAgent('test', random_state, 1, q_S_A_1)

    iterate_value_q_pi(
        agent=mdp_agent_1,
        environment=mdp_environment,
        num_improvements=5,
        num_episodes_per_improvement=10,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True)

    q_S_A_2 = TabularStateActionValueEstimator(mdp_environment, epsilon, None)
    mdp_agent_2 = ActionValueMdpAgent('test', random_state, 1, q_S_A_2)

    iterate_value_q_pi(
        agent=mdp_agent_2,
        environment=mdp_environment,
        num_improvements=5,
        num_episodes_per_improvement=5,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True)

    test_state = mdp_environment.SS[5]
    test_action = test_state.AA[0]

    assert q_S_A_1 != q_S_A_2
    assert q_S_A_1[test_state] != q_S_A_2[test_state]
    assert q_S_A_1[test_state][test_action] != q_S_A_2[test_state][test_action]

def test_policy_iteration():

    # state-value policy iteration
    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent_v_pi = ActionValueMdpAgent(
        'test', random_state, 1, TabularStateActionValueEstimator(mdp_environment, None, None))
    iterate_policy_v_pi(mdp_agent_v_pi, mdp_environment, 0.001, True)

    # action-value policy iteration
    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent_q_pi = ActionValueMdpAgent(
        'test', random_state, 1, TabularStateActionValueEstimator(mdp_environment, None, None))
    iterate_policy_q_pi(mdp_agent_q_pi, mdp_environment, 0.001, True)

    # should get the same policy
    assert mdp_agent_v_pi.pi == mdp_agent_q_pi.pi

def test_evaluate_v_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1, TabularStateActionValueEstimator(mdp_environment, None, None))

    v_pi, _ = evaluate_v_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        theta=0.001,
        num_iterations=None,
        update_in_place=True)

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1, TabularStateActionValueEstimator(mdp_environment, None, None))

    v_pi_not_in_place, _ = evaluate_v_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        theta=0.001,
        num_iterations=None,
        update_in_place=False)

    assert list(v_pi.keys()) == list(v_pi_not_in_place.keys())
    np.testing.assert_allclose(list(v_pi.values()), list(v_pi_not_in_place.values()), atol=0.01)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_iterative_policy_evaluation_of_state_value.pickle', 'wb') as file:
    #     pickle.dump(v_pi, file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_iterative_policy_evaluation_of_state_value.pickle', 'rb') as file:
        fixture = pickle.load(file)

    assert v_pi == fixture

def test_agent_invalid_action():

    random = RandomState()
    agent = ActionValueMdpAgent(
        'foo', random, 1.0,
        TabularStateActionValueEstimator(Gridworld.example_4_1(random, None), None, None))

    # test None action
    agent.__act__ = lambda t: None
    with pytest.raises(ValueError, match='Agent returned action of None'):
        agent.act(0)

    # test infeasible action
    action = Action(1, 'foo')
    agent.__act__ = lambda t: action
    state = MdpState(1, [], False)
    agent.sense(state, 0)
    with pytest.raises(ValueError, match=f'Action {action} is not feasible in state {state}'):
        agent.act(0)

def test_invalid_epsilon_iterate_value_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1, TabularStateActionValueEstimator(mdp_environment, 0.0, None))

    with pytest.raises(ValueError, match='epsilon must be strictly > 0 for TD-learning'):
        iterate_value_q_pi(
            agent=mdp_agent,
            environment=mdp_environment,
            num_improvements=10,
            num_episodes_per_improvement=100,
            num_updates_per_improvement=None,
            alpha=0.1,
            mode=Mode.Q_LEARNING,
            n_steps=3,
            planning_environment=None,
            make_final_policy_greedy=False)

def train_thread_target():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1, TabularStateActionValueEstimator(mdp_environment, 0.1, None))

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=1000000,
        num_episodes_per_improvement=10,
        num_updates_per_improvement=None,
        alpha=0.1,
        mode=Mode.SARSA,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=False,
        thread_manager=thread_manager,
        num_improvements_per_plot=10)

def test_iterate_value_q_pi_with_pdf():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1, TabularStateActionValueEstimator(mdp_environment, 0.05, None))

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=10,
        num_episodes_per_improvement=100,
        num_updates_per_improvement=None,
        alpha=0.1,
        mode=Mode.Q_LEARNING,
        n_steps=1,
        planning_environment=None,
        make_final_policy_greedy=False,
        num_improvements_per_plot=5,
        pdf_save_path=tempfile.NamedTemporaryFile(delete=False).name)

def test_gamblers_problem():

    random_state = RandomState(12345)
    mdp_environment: GamblersProblem = GamblersProblem(
        'gamblers problem', random_state=random_state, T=None, p_h=0.4)
    mdp_agent_v_pi_value_iteration = ActionValueMdpAgent(
        'test', random_state, 1, TabularStateActionValueEstimator(mdp_environment, None, None))

    v_pi = iterate_value_v_pi(mdp_agent_v_pi_value_iteration, mdp_environment, 0.001, 1, True)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_gamblers_problem.pickle', 'wb') as file:
    #     pickle.dump(v_pi, file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_gamblers_problem.pickle', 'rb') as file:
        fixture = pickle.load(file)

    assert v_pi == fixture

def test_evaluate_q_pi_invalid_n_steps():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    epsilon = 0.05
    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1, TabularStateActionValueEstimator(mdp_environment, epsilon, None)
    )

    with pytest.raises(ValueError):
        evaluate_q_pi(
            agent=mdp_agent,
            environment=mdp_environment,
            num_episodes=5,
            num_updates_per_improvement=None,
            alpha=0.1,
            mode=Mode.Q_LEARNING,
            n_steps=-1,
            planning_environment=None
        )

def test_q_learning_iterate_value_q_pi_function_approximation_invalid_formula():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)
    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment,
        0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        f'C(s, levels={[s.i for s in mdp_environment.SS]}):C(a, levels={[a.i for a in mdp_environment.SS[0].AA]})',
        False,
        None,
        None)
    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    with pytest.raises(ValueError, match='Invalid combination of formula'):
        iterate_value_q_pi(
            agent=mdp_agent,
            environment=mdp_environment,
            num_improvements=5,
            num_episodes_per_improvement=5,
            num_updates_per_improvement=None,
            alpha=None,
            mode=Mode.Q_LEARNING,
            n_steps=None,
            planning_environment=None,
            make_final_policy_greedy=False)

def test_evaluate_v_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1, TabularStateActionValueEstimator(mdp_environment, None, None))

    v_pi = evaluate_v_pi(agent=mdp_agent, environment=mdp_environment, num_episodes=1000)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_evaluation_of_state_value.pickle', 'wb') as file:
    #     pickle.dump(v_pi, file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_evaluation_of_state_value.pickle', 'rb') as file:
        fixture = pickle.load(file)

    assert v_pi == fixture

def evaluate_v_pi(
        agent: ActionValueMdpAgent,
        environment: MdpEnvironment,
        num_episodes: int
) -> Dict[MdpState, float]:
    """
    Perform Monte Carlo evaluation of an agent's policy within an environment, returning state values. Uses a random
    action on the first time step to maintain exploration (exploring starts). This evaluation approach is only
    marginally useful in practice, as the state-value estimates require a model of the environmental dynamics (i.e.,
    the transition-reward probability distribution) in order to be applied. See `evaluate_q_pi` in this module for a
    more feature-rich and useful evaluation approach (i.e., state-action value estimation). This evaluation function
    operates over rewards obtained at the end of episodes, so it is only appropriate for episodic tasks.

    :param agent: Agent.
    :param environment: Environment.
    :param num_episodes: Number of episodes to execute.
    :return: Dictionary of MDP states and their estimated values under the agent's policy.
    """

    logging.info(f'Running Monte Carlo evaluation of v_pi for {num_episodes} episode(s).')

    v_pi: Dict[MdpState, IncrementalSampleAverager] = {
        terminal_state: IncrementalSampleAverager()
        for terminal_state in environment.terminal_states
    }

    episodes_per_print = max(1, int(num_episodes * 0.05))
    for episode_i in range(num_episodes):

        # start the environment in a random state
        state = environment.reset_for_new_run(agent)
        agent.reset_for_new_run(state)

        # simulate until episode termination, keeping a trace of states and their immediate rewards, as well as the
        # times of their first visits.
        t = 0
        state_first_t = {}
        t_state_reward = []
        while not state.terminal and (environment.T is None or t < environment.T):

            if state not in state_first_t:
                state_first_t[state] = t

            # take a random action on the first time step (exploring starts); otherwise, follow the agent's policy.
            if t == 0:
                a = sample_list_item(state.AA, None, environment.random_state)
            else:
                a = agent.act(t)

            next_state, reward = environment.advance(state, t, a, agent)
            t_state_reward.append((t, state, reward))
            state = next_state
            t += 1
            agent.sense(state, t)

        # work backwards through the trace to calculate discounted returns. need to work backward in order for the
        # value of g at each time step t to be properly discounted.
        g = 0
        for t, state, reward in reversed(t_state_reward):

            g = agent.gamma * g + reward.r

            # if the current time step was the first visit to the state, then g is the discounted sample value. add
            # it to our average.
            if state_first_t[state] == t:
                if state not in v_pi:
                    v_pi[state] = IncrementalSampleAverager()
                v_pi[state].update(g)

        episodes_finished = episode_i + 1
        if episodes_finished % episodes_per_print == 0:
            logging.info(f'Finished {episodes_finished} of {num_episodes} episode(s).')

    return {
        s: v_pi[s].get_value()
        for s in v_pi
    }

def evaluate_q_pi(
        agent: ActionValueMdpAgent,
        environment: MdpEnvironment,
        num_episodes: int,
        num_updates_per_improvement: Optional[int],
        alpha: Optional[float],
        mode: Mode,
        n_steps: Optional[int],
        planning_environment: Optional[MdpPlanningEnvironment]
) -> Tuple[Set[MdpState], float]:
    """
    Perform temporal-difference (TD) evaluation of an agent's policy within an environment, returning state-action
    values. This evaluation function implements both on-policy TD learning (SARSA) as well as off-policy TD learning
    (Q-learning and expected SARSA), and n-step updates are implemented for all learning modes.

    :param agent: Agent containing target policy to be optimized.
    :param environment: Environment.
    :param num_episodes: Number of episodes to execute.
    :param num_updates_per_improvement: Number of state-action value updates to execute for each iteration of policy
    improvement, or None for policy improvement per specified number of episodes.
    :param alpha: Constant step size to use when updating Q-values, or None for 1/n step size.
    :param mode: Evaluation mode (see `rlai.gpi.temporal_difference.evaluation.Mode`).
    :param n_steps: Number of steps to accumulate rewards before updating estimated state-action values. Must be in
    the range [1, inf], or None for infinite steps (i.e., Monte Carlo evaluation).
    :param planning_environment: Planning environment to learn through experience gained during evaluation, or None
    to not learn an environment model.
    :return: 2-tuple of (1) the set of only those states that were evaluated, and (2) the average reward obtained per
    episode.
    """

    if n_steps is not None and n_steps < 1:
        raise ValueError('The value of n_steps must be in range [1, inf], or None.')

    logging.info(f'Running temporal-difference evaluation of q_pi for {num_episodes} episode(s).')

    evaluated_states = set()

    planning = isinstance(environment, MdpPlanningEnvironment)

    # prioritized sweeping requires access to the bootstrapped state-action value function, and it also requires
    # access to the state-action value estimators.
    if isinstance(environment, PrioritizedSweepingMdpPlanningEnvironment):
        environment.bootstrap_function = partial(
            get_bootstrapped_state_action_value,
            mode=mode,
            agent=agent,
            q_S_A=agent.q_S_A,
            environment=environment)
        environment.q_S_A = agent.q_S_A

    # run episodes
    episode_reward_averager = IncrementalSampleAverager()
    episodes_per_print = max(1, int(num_episodes * 0.05))
    for episode_i in range(num_episodes):

        # reset the environment for the new run, and reset the agent accordingly.
        curr_state = environment.reset_for_new_run(agent)
        agent.reset_for_new_run(curr_state)

        # simulate until episode termination. begin by taking an action in the first state.
        curr_t = 0
        curr_a = agent.act(curr_t)
        total_reward = 0.0

        # dictionary from time steps to tuples of state, action, and truncated return.
        t_state_a_g: Dict[int, Tuple[MdpState, Action, float]] = {}

        next_state_q_s_a = 0.0
        while not curr_state.terminal and (environment.T is None or curr_t < environment.T):

            advance_result, next_reward = environment.advance(state=curr_state, t=curr_t, a=curr_a, agent=agent)

            logging.debug(f'Obtained reward: {next_reward}\n')

            # in the case of a planning-based advancement, the planning environment returns a 3-tuple of the current
            # state, current action, and next state. this is because the planning environment may revise any one of
            # these variables to conduct the planning process (e.g., by prioritized sweeping).
            if planning:
                curr_state, curr_a, next_state = advance_result
            else:
                next_state = advance_result

            next_t = curr_t + 1
            agent.sense(next_state, next_t)

            # if we're building an environment model, then update it with the transition we just observed.
            if planning_environment is not None:
                planning_environment.model.update(curr_state, curr_a, next_state, next_reward)

            # initialize the n-step, truncated return accumulator at the current time for the current state and
            # action.
            t_state_a_g[curr_t] = (curr_state, curr_a, 0.0)

            # ask the agent to shape the reward, returning the time steps whose returns should be updated and the
            # shaped reward associated with each. if n_steps is None, then shape the reward all the way back to the
            # start (equivalent to infinite n_steps, or monte carlo returns). if n_steps is not None, then shape the
            # reward for n-step updates.
            if n_steps is None:
                first_t = 0
            else:
                # in 1-step td, the earliest time step is the final time step; in 2-step, the earliest time step is
                # the prior time step, etc.
                first_t = max(0, curr_t - n_steps + 1)

            t_shaped_reward = agent.shape_reward(reward=next_reward, first_t=first_t, final_t=curr_t)

            # update return accumulators with shaped rewards
            t_state_a_g.update({
                return_t: (
                    t_state_a_g[return_t][0],
                    t_state_a_g[return_t][1],
                    t_state_a_g[return_t][2] + shaped_reward
                )
                for return_t, shaped_reward in t_shaped_reward.items()

                # reward shapers might return invalid time steps. ignore these.
                if return_t in t_state_a_g
            })

            # get the next state's bootstrapped value and next action, based on the bootstrapping mode. note that the
            # bootstrapped next state-action value is only used if we're performing n-step updates below.
            next_state_q_s_a, next_a = get_bootstrapped_state_action_value(
                state=next_state,
                t=next_t,
                mode=mode,
                agent=agent,
                q_S_A=agent.q_S_A,
                environment=environment)

            # only update if n_steps is finite (not monte carlo)
            if n_steps is not None:
                update_q_S_A(
                    q_S_A=agent.q_S_A,
                    n_steps=n_steps,
                    curr_t=curr_t,
                    t_state_a_g=t_state_a_g,
                    agent=agent,
                    next_state_q_s_a=next_state_q_s_a,
                    alpha=alpha,
                    evaluated_states=evaluated_states,
                    planning_environment=planning_environment,
                    num_updates_per_improvement=num_updates_per_improvement)

            # advance the episode
            curr_t = next_t
            curr_state = next_state
            curr_a = next_a
            total_reward += next_reward.r

        # flush out the remaining n-step updates. if we terminated because we reached a terminal state, then all
        # next-state values for the updates are zero. if instead we terminated because we reached the maximum number
        # of time steps, then use the bootstrapped next-state value instead. this is an important distinction because
        # these values can be dramatically different, and when discounting is not used (or is very small), the
        # resulting value estimates can be correspondingly different.
        if curr_state.terminal:
            next_state_q_s_a = 0.0

        flush_n_steps = len(t_state_a_g) + 1
        while len(t_state_a_g) > 0:
            update_q_S_A(
                q_S_A=agent.q_S_A,
                n_steps=flush_n_steps,
                curr_t=curr_t,
                t_state_a_g=t_state_a_g,
                agent=agent,
                next_state_q_s_a=next_state_q_s_a,
                alpha=alpha,
                evaluated_states=evaluated_states,
                planning_environment=planning_environment,
                num_updates_per_improvement=num_updates_per_improvement)
            curr_t += 1

        episode_reward_averager.update(total_reward)

        episodes_finished = episode_i + 1
        if episodes_finished % episodes_per_print == 0:
            logging.info(f'Finished {episodes_finished} of {num_episodes} episode(s).')

    return evaluated_states, episode_reward_averager.get_value()

def test_policy_overrides():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    epsilon = 0.05
    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment,
        epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        None,
        False,
        None,
        None)
    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=10,
        num_episodes_per_improvement=20,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True,
    )

    random_state = RandomState(12345)
    mdp_environment_2: Gridworld = Gridworld.example_4_1(random_state, 20)
    q_S_A_2 = ApproximateStateActionValueEstimator(
        mdp_environment_2,
        epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment_2),
        None,
        False,
        None,
        None)
    mdp_agent_2 = ActionValueMdpAgent('test', random_state, 1, q_S_A_2)

    iterate_value_q_pi(
        agent=mdp_agent_2,
        environment=mdp_environment_2,
        num_improvements=10,
        num_episodes_per_improvement=20,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True)

    assert isinstance(mdp_agent_2.most_recent_state, MdpState) and mdp_agent_2.most_recent_state in mdp_agent_2.pi

    with pytest.raises(ValueError, match='Attempted to check for None in policy.'):
        # noinspection PyTypeChecker
        if None in mdp_agent_2.pi:  # pragma no cover
            pass

    assert mdp_agent.pi == mdp_agent_2.pi
    assert not (mdp_agent.pi != mdp_agent_2.pi)