Example No. 1
def test_evaluate_q_pi():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = TabularStateActionValueEstimator(mdp_environment, None, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    evaluated_states, _ = evaluate_q_pi(agent=mdp_agent,
                                        environment=mdp_environment,
                                        num_episodes=1000,
                                        exploring_starts=True,
                                        update_upon_every_visit=False,
                                        q_S_A=q_S_A)

    assert len(q_S_A) == len(evaluated_states) + 2  # terminal states aren't evaluated
    assert all(s in q_S_A for s in evaluated_states)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_evaluation_of_state_action_value.pickle', 'wb') as file:
    #     pickle.dump(q_S_A, file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_evaluation_of_state_action_value.pickle',
            'rb') as file:
        fixture = pickle.load(file)

    assert tabular_estimator_legacy_eq(q_S_A, fixture)
Example No. 2
def test_rescale():

    random_state = RandomState(12345)

    gym = Gym(random_state=random_state,
              T=None,
              gym_id='LunarLanderContinuous-v2')

    fex = ContinuousFeatureExtractor()

    policy = ContinuousActionBetaDistributionPolicy(gym, fex, False)

    agent = StochasticMdpAgent('test', random_state, policy, 0.9)
    state = gym.reset_for_new_run(agent)
    policy.set_action(state)

    assert np.allclose(policy.rescale(np.array([0.0, 0.5])),
                       np.array([-1.0, 0.0]))
    assert np.allclose(policy.rescale(np.array([0.5, 1.0])),
                       np.array([0.0, 1.0]))

    assert np.allclose(
        policy.invert_rescale(policy.rescale(np.array([0.0, 0.5]))),
        np.array([0.0, 0.5]))
    assert np.allclose(
        policy.invert_rescale(policy.rescale(np.array([0.5, 1.0]))),
        np.array([0.5, 1.0]))
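
The expected values in the assertions above follow from a simple affine map between the beta distribution's [0, 1] support and the environment's action bounds. Below is a minimal sketch of that arithmetic, assuming the [-1, 1] action bounds of LunarLanderContinuous-v2; the helper names are illustrative and not part of the library.

import numpy as np

# Assumed action bounds for LunarLanderContinuous-v2.
low, high = np.array([-1.0, -1.0]), np.array([1.0, 1.0])

def rescale_01_to_bounds(x):
    # Map values from the beta distribution's [0, 1] support onto [low, high].
    return low + x * (high - low)

def invert_to_01(a):
    # Map values from [low, high] back onto [0, 1].
    return (a - low) / (high - low)

# These mirror the assertions in the test above.
assert np.allclose(rescale_01_to_bounds(np.array([0.0, 0.5])), np.array([-1.0, 0.0]))
assert np.allclose(invert_to_01(rescale_01_to_bounds(np.array([0.5, 1.0]))), np.array([0.5, 1.0]))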
Example No. 3
def test_gamblers_problem():

    random_state = RandomState(12345)

    mdp_environment: GamblersProblem = GamblersProblem(
        'gamblers problem',
        random_state=random_state,
        T=None,
        p_h=0.4
    )

    mdp_agent_v_pi_value_iteration = StochasticMdpAgent(
        'test',
        random_state,
        TabularPolicy(None, mdp_environment.SS),
        1
    )

    v_pi = iterate_value_v_pi(
        mdp_agent_v_pi_value_iteration,
        mdp_environment,
        0.001,
        1,
        True
    )

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_gamblers_problem.pickle', 'wb') as file:
    #     pickle.dump(v_pi, file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_gamblers_problem.pickle', 'rb') as file:
        fixture = pickle.load(file)

    assert v_pi == fixture
Example No. 4
def test_iterate_value_q_pi_with_pdf():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.05, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=10,
        num_episodes_per_improvement=100,
        num_updates_per_improvement=None,
        alpha=0.1,
        mode=Mode.Q_LEARNING,
        n_steps=1,
        planning_environment=None,
        make_final_policy_greedy=False,
        q_S_A=q_S_A,
        num_improvements_per_plot=5,
        pdf_save_path=tempfile.NamedTemporaryFile(delete=False).name)
Example No. 5
def test_n_step_q_learning_iterate_value_q_pi():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.05, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=10,
                       num_episodes_per_improvement=100,
                       num_updates_per_improvement=None,
                       alpha=0.1,
                       mode=Mode.Q_LEARNING,
                       n_steps=3,
                       planning_environment=None,
                       make_final_policy_greedy=False,
                       q_S_A=q_S_A)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_td_n_step_q_learning_iteration_of_value_q_pi.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_td_n_step_q_learning_iteration_of_value_q_pi.pickle',
            'rb') as file:
        fixture_pi, fixture_q_S_A = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, fixture_pi)
    assert tabular_estimator_legacy_eq(q_S_A, fixture_q_S_A)
Example No. 6
def test_q_learning_iterate_value_q_pi_function_approximation_invalid_formula():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment, 0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        f'C(s, levels={[s.i for s in mdp_environment.SS]}):C(a, levels={[a.i for a in mdp_environment.SS[0].AA]})',
        False, None, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    with pytest.raises(ValueError, match='Invalid combination of formula'):
        iterate_value_q_pi(agent=mdp_agent,
                           environment=mdp_environment,
                           num_improvements=5,
                           num_episodes_per_improvement=5,
                           num_updates_per_improvement=None,
                           alpha=None,
                           mode=Mode.Q_LEARNING,
                           n_steps=None,
                           planning_environment=None,
                           make_final_policy_greedy=False,
                           q_S_A=q_S_A)
Example No. 7
def test_iterate_value_q_pi_with_pdf():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.1, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=3000,
        num_episodes_per_improvement=1,
        update_upon_every_visit=False,
        planning_environment=None,
        make_final_policy_greedy=False,
        q_S_A=q_S_A,
        num_improvements_per_plot=1500,
        pdf_save_path=tempfile.NamedTemporaryFile(delete=False).name)

    with pytest.raises(ValueError, match='Epsilon must be >= 0'):
        q_S_A.epsilon = -1.0
        q_S_A.improve_policy(mdp_agent,
                             states=None,
                             event=PolicyImprovementEvent.MAKING_POLICY_GREEDY)

    q_S_A.epsilon = 0.0
    assert q_S_A.improve_policy(
        mdp_agent, None, PolicyImprovementEvent.MAKING_POLICY_GREEDY) == 14
Example No. 8
def test_learn():

    random_state = RandomState(12345)

    gym = Gym(random_state=random_state, T=None, gym_id='CartPole-v1')

    q_S_A = TabularStateActionValueEstimator(gym, 0.05, 0.001)

    mdp_agent = StochasticMdpAgent('agent', random_state,
                                   q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=gym,
                       num_improvements=10,
                       num_episodes_per_improvement=100,
                       num_updates_per_improvement=None,
                       alpha=0.1,
                       mode=Mode.SARSA,
                       n_steps=1,
                       planning_environment=None,
                       make_final_policy_greedy=False,
                       q_S_A=q_S_A)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_gym.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_gym.pickle',
              'rb') as file:
        fixture_pi, fixture_q_S_A = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, fixture_pi)
    assert tabular_estimator_legacy_eq(q_S_A, fixture_q_S_A)
Example No. 9
def test_iterate_value_q_pi():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.1, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=3000,
                       num_episodes_per_improvement=1,
                       update_upon_every_visit=False,
                       planning_environment=None,
                       make_final_policy_greedy=False,
                       q_S_A=q_S_A)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_iteration_of_value_q_pi.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_iteration_of_value_q_pi.pickle',
            'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, pi_fixture)
    assert tabular_estimator_legacy_eq(q_S_A, q_S_A_fixture)
Example No. 10
def test_off_policy_monte_carlo_with_function_approximation():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment, 0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment), None, False, None, None)

    # target agent
    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    # episode generation (behavior) policy
    off_policy_agent = StochasticMdpAgent('test', random_state,
                                          TabularPolicy(None, None), 1)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=100,
                       num_episodes_per_improvement=1,
                       update_upon_every_visit=True,
                       planning_environment=None,
                       make_final_policy_greedy=False,
                       q_S_A=q_S_A,
                       off_policy_agent=off_policy_agent)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_off_policy_monte_carlo_with_function_approximationo.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_off_policy_monte_carlo_with_function_approximationo.pickle',
            'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert mdp_agent.pi == pi_fixture
    assert q_S_A == q_S_A_fixture
    state_5 = mdp_environment.SS[5]
    assert str(mdp_agent.pi.estimator[state_5][state_5.AA[1]]).startswith('-1.4524')

    # make greedy
    q_S_A.epsilon = 0.0
    assert q_S_A.improve_policy(
        mdp_agent, None, PolicyImprovementEvent.MAKING_POLICY_GREEDY) == -1
    assert mdp_agent.pi.estimator.epsilon == 0.0
Example No. 11
def test_agent_invalid_action():

    random = RandomState()
    agent = StochasticMdpAgent('foo', random, TabularPolicy(None, None), 1.0)

    # test None action
    agent.__act__ = lambda t: None

    with pytest.raises(ValueError, match='Agent returned action of None'):
        agent.act(0)

    # test infeasible action
    action = Action(1, 'foo')
    agent.__act__ = lambda t: action
    state = MdpState(1, [], False)
    agent.sense(state, 0)
    with pytest.raises(ValueError, match=f'Action {action} is not feasible in state {state}'):
        agent.act(0)
Example No. 12
def test_human_player_mutator():

    random = RandomState()
    mancala = Mancala(
        random, None, 5,
        StochasticMdpAgent('foo', random, TabularPolicy(None, []), 1.0))
    Mancala.human_player_mutator(mancala)

    assert isinstance(mancala.player_2, Human)
Example No. 13
def test_invalid_iterate_value_q_pi():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.0, None)

    # target agent
    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    # episode generation (behavior) policy
    off_policy_agent = StochasticMdpAgent('test', random_state,
                                          q_S_A.get_initial_policy(), 1)

    with pytest.raises(
            ValueError,
            match='Planning environments are not currently supported for Monte Carlo iteration.'
    ):
        iterate_value_q_pi(
            agent=mdp_agent,
            environment=mdp_environment,
            num_improvements=100,
            num_episodes_per_improvement=1,
            update_upon_every_visit=True,
            planning_environment=TrajectorySamplingMdpPlanningEnvironment(
                'foo', random_state, StochasticEnvironmentModel(), 100, None),
            make_final_policy_greedy=False,
            q_S_A=q_S_A,
            off_policy_agent=off_policy_agent)

    # test warning...no off-policy agent with epsilon=0.0
    q_S_A.epsilon = 0.0
    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=100,
                       num_episodes_per_improvement=1,
                       update_upon_every_visit=True,
                       planning_environment=None,
                       make_final_policy_greedy=False,
                       q_S_A=q_S_A,
                       off_policy_agent=None)
Example No. 14
def main():

    random_state = RandomState(12345)

    environment = Gym(
        random_state=random_state,
        T=None,
        gym_id='CartPole-v1',
        continuous_action_discretization_resolution=None,
        render_every_nth_episode=100
    )

    model = SKLearnSGD(
        loss='squared_loss',
        alpha=0.0,
        learning_rate='constant',
        eta0=0.0001,
        scale_eta0_for_y=False
    )

    feature_extractor = CartpoleFeatureExtractor(
        environment=environment
    )

    q_S_A = ApproximateStateActionValueEstimator(
        environment=environment,
        epsilon=0.02,
        model=model,
        feature_extractor=feature_extractor,
        formula=None,
        plot_model=False,
        plot_model_per_improvements=None,
        plot_model_bins=None
    )

    agent = StochasticMdpAgent(
        name='Cartpole Agent',
        random_state=random_state,
        pi=q_S_A.get_initial_policy(),
        gamma=0.95
    )

    iterate_value_q_pi(
        agent=agent,
        environment=environment,
        num_improvements=15000,
        num_episodes_per_improvement=1,
        num_updates_per_improvement=1,
        alpha=None,
        mode=Mode.SARSA,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True,
        q_S_A=q_S_A,
        num_improvements_per_plot=100
    )
Example No. 15
def main():

    random = RandomState(12345)
    gridworld = Gridworld.example_4_1(random, None)

    # the bottom-right corner (3,3) is a goal state. get the states surrounding this goal. these will become the sticky
    # states.
    sticky_states = [
        gridworld.grid[2, 2], gridworld.grid[2, 3], gridworld.grid[3, 2]
    ]

    # amplify all negative rewards in the sticky states by a factor of 10, keeping the probabilities the same.
    for sticky_state in sticky_states:
        for a in gridworld.p_S_prime_R_given_S_A[sticky_state]:
            for s_prime in gridworld.p_S_prime_R_given_S_A[sticky_state][a]:
                reward_probs = gridworld.p_S_prime_R_given_S_A[sticky_state][a][s_prime]
                gridworld.p_S_prime_R_given_S_A[sticky_state][a][s_prime] = {
                    Reward(r.i, r.r * 10.0 if r.r < 0.0 else r.r): reward_probs[r]
                    for r in reward_probs
                }

    epsilon = 0.1

    q_S_A = TabularStateActionValueEstimator(
        environment=gridworld,
        epsilon=epsilon,
        continuous_state_discretization_resolution=None)

    pi = q_S_A.get_initial_policy()

    mdp_agent = StochasticMdpAgent(name='agent',
                                   random_state=random,
                                   pi=pi,
                                   gamma=1.0)

    # iterate the agent's policy using Q-learning temporal differencing
    iterate_value_q_pi(agent=mdp_agent,
                       environment=gridworld,
                       num_improvements=20,
                       num_episodes_per_improvement=100,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True,
                       q_S_A=q_S_A,
                       num_improvements_per_plot=20)

    for s in pi:
        print(f'State {s.i}:')
        for a in pi[s]:
            if pi[s][a] > 0.0:
                print(f'\tPr({a.name}):  {pi[s][a]}')
Example No. 16
def dump_agent() -> str:

    # create dummy mdp agent for runner
    stochastic_mdp_agent = StochasticMdpAgent('foo', RandomState(12345),
                                              TabularPolicy(None, None), 1.0)
    agent_path = tempfile.NamedTemporaryFile(delete=False).name
    with open(agent_path, 'wb') as f:
        pickle.dump(stochastic_mdp_agent, f)

    return agent_path
Example No. 17
def test_policy_iteration():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    # state-value policy iteration
    mdp_agent_v_pi = StochasticMdpAgent(
        'test',
        random_state,
        TabularPolicy(None, mdp_environment.SS),
        1
    )

    iterate_policy_v_pi(
        mdp_agent_v_pi,
        mdp_environment,
        0.001,
        True
    )

    # action-value policy iteration
    mdp_agent_q_pi = StochasticMdpAgent(
        'test',
        random_state,
        TabularPolicy(None, mdp_environment.SS),
        1
    )

    iterate_policy_q_pi(
        mdp_agent_q_pi,
        mdp_environment,
        0.001,
        True
    )

    # should get the same policy
    assert mdp_agent_v_pi.pi == mdp_agent_q_pi.pi
Example No. 18
def test_invalid_improve_policy_with_q_pi():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    epsilon = 0.0

    q_S_A = TabularStateActionValueEstimator(mdp_environment, epsilon, None)

    # target agent
    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    with pytest.raises(ValueError, match='Epsilon must be >= 0'):
        improve_policy_with_q_pi(mdp_agent, {}, -1)
Example No. 19
def test_q_learning_iterate_value_q_pi_function_approximation_no_formula():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment, 0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment), None, False, None, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=10,
                       num_episodes_per_improvement=20,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True,
                       q_S_A=q_S_A)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation_no_formula.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation_no_formula.pickle',
            'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert np.allclose(mdp_agent.pi.estimator.model.model.coef_,
                       pi_fixture.estimator.model.model.coef_)
    assert (mdp_agent.pi.format_state_action_probs(mdp_environment.SS) ==
            pi_fixture.format_state_action_probs(mdp_environment.SS))
    assert (mdp_agent.pi.format_state_action_values(mdp_environment.SS) ==
            pi_fixture.format_state_action_values(mdp_environment.SS))
Example No. 20
    def train_thread_target():
        random_state = RandomState(12345)

        mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

        q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.1, None)

        mdp_agent = StochasticMdpAgent('test', random_state,
                                       q_S_A.get_initial_policy(), 1)

        iterate_value_q_pi(agent=mdp_agent,
                           environment=mdp_environment,
                           num_improvements=1000000,
                           num_episodes_per_improvement=10,
                           update_upon_every_visit=False,
                           planning_environment=None,
                           make_final_policy_greedy=False,
                           q_S_A=q_S_A,
                           thread_manager=thread_manager,
                           num_improvements_per_plot=10)
Example No. 21
    @classmethod
    def init_from_arguments(
            cls, args: List[str],
            random_state: RandomState) -> Tuple[Environment, List[str]]:
        """
        Initialize an environment from arguments.

        :param args: Arguments.
        :param random_state: Random state.
        :return: 2-tuple of an environment and a list of unparsed arguments.
        """

        parsed_args, unparsed_args = parse_arguments(cls, args)

        mancala = cls(
            random_state=random_state,
            player_2=StochasticMdpAgent(
                'environmental agent', random_state, TabularPolicy(None, None), 1
            ),
            **vars(parsed_args)
        )

        return mancala, unparsed_args
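
A rough usage sketch of the factory method above. The concrete command-line flags accepted by parse_arguments are not shown in this excerpt, so the --initial-count flag below is only assumed to mirror the initial_count constructor parameter used in the other Mancala examples.

from numpy.random import RandomState

# Hypothetical invocation; unparsed arguments are returned so a top-level runner can handle them.
mancala, unparsed = Mancala.init_from_arguments(
    args=['--initial-count', '4'],  # assumed flag name
    random_state=RandomState(12345)
)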
Example No. 22
def test_sarsa_iterate_value_q_pi_with_trajectory_planning():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.05, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    planning_environment = TrajectorySamplingMdpPlanningEnvironment(
        'test planning', random_state, StochasticEnvironmentModel(), 10, None)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=100,
                       num_episodes_per_improvement=1,
                       num_updates_per_improvement=None,
                       alpha=0.1,
                       mode=Mode.SARSA,
                       n_steps=1,
                       planning_environment=planning_environment,
                       make_final_policy_greedy=True,
                       q_S_A=q_S_A)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_td_iteration_of_value_q_pi_planning.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_td_iteration_of_value_q_pi_planning.pickle',
            'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, pi_fixture)
    assert tabular_estimator_legacy_eq(q_S_A, q_S_A_fixture)
Example No. 23
def test_evaluate_v_pi():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   TabularPolicy(None, mdp_environment.SS), 1)

    v_pi = evaluate_v_pi(agent=mdp_agent,
                         environment=mdp_environment,
                         num_episodes=1000)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_evaluation_of_state_value.pickle', 'wb') as file:
    #     pickle.dump(v_pi, file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_evaluation_of_state_value.pickle',
            'rb') as file:
        fixture = pickle.load(file)

    assert v_pi == fixture
Example No. 24
def test_evaluate_q_pi_invalid_n_steps():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    epsilon = 0.05

    q_S_A = TabularStateActionValueEstimator(mdp_environment, epsilon, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    with pytest.raises(ValueError):
        evaluate_q_pi(agent=mdp_agent,
                      environment=mdp_environment,
                      num_episodes=5,
                      num_updates_per_improvement=None,
                      alpha=0.1,
                      mode=Mode.Q_LEARNING,
                      n_steps=-1,
                      planning_environment=None,
                      q_S_A=q_S_A)
Example No. 25
def test_evaluate_q_pi():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   TabularPolicy(None, mdp_environment.SS), 1)

    q_pi, _ = evaluate_q_pi(agent=mdp_agent,
                            environment=mdp_environment,
                            theta=0.001,
                            num_iterations=100,
                            update_in_place=True)

    q_pi_not_in_place, _ = evaluate_q_pi(agent=mdp_agent,
                                         environment=mdp_environment,
                                         theta=0.001,
                                         num_iterations=200,
                                         update_in_place=False)

    assert list(q_pi.keys()) == list(q_pi_not_in_place.keys())

    for s in q_pi:
        for a in q_pi[s]:
            assert np.allclose(q_pi[s][a], q_pi_not_in_place[s][a], atol=0.01)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_iterative_policy_evaluation_of_action_value.pickle', 'wb') as file:
    #     pickle.dump(q_pi, file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_iterative_policy_evaluation_of_action_value.pickle',
            'rb') as file:
        fixture = pickle.load(file)

    assert q_pi == fixture
Example No. 26
def test_invalid_epsilon_iterate_value_q_pi():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.0, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    with pytest.raises(ValueError,
                       match='epsilon must be strictly > 0 for TD-learning'):
        iterate_value_q_pi(agent=mdp_agent,
                           environment=mdp_environment,
                           num_improvements=10,
                           num_episodes_per_improvement=100,
                           num_updates_per_improvement=None,
                           alpha=0.1,
                           mode=Mode.Q_LEARNING,
                           n_steps=3,
                           planning_environment=None,
                           make_final_policy_greedy=False,
                           q_S_A=q_S_A)
Example No. 27
def test_policy_overrides():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    epsilon = 0.05

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment, epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment), None, False, None, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=10,
                       num_episodes_per_improvement=20,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True,
                       q_S_A=q_S_A)

    random_state = RandomState(12345)

    mdp_environment_2: Gridworld = Gridworld.example_4_1(random_state, 20)

    q_S_A_2 = ApproximateStateActionValueEstimator(
        mdp_environment_2, epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment_2), None, False, None, None)

    mdp_agent_2 = StochasticMdpAgent('test', random_state,
                                     q_S_A_2.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent_2,
                       environment=mdp_environment_2,
                       num_improvements=10,
                       num_episodes_per_improvement=20,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True,
                       q_S_A=q_S_A_2)

    assert isinstance(mdp_agent_2.most_recent_state, MdpState)
    assert mdp_agent_2.most_recent_state in mdp_agent_2.pi

    with pytest.raises(ValueError,
                       match='Attempted to check for None in policy.'):
        # noinspection PyTypeChecker
        if None in mdp_agent_2.pi:  # pragma no cover
            pass

    assert mdp_agent.pi == mdp_agent_2.pi
    assert not (mdp_agent.pi != mdp_agent_2.pi)
Example No. 28
def improve(agent: StochasticMdpAgent,
            policy: ParameterizedPolicy,
            environment: MdpEnvironment,
            num_episodes: int,
            update_upon_every_visit: bool,
            alpha: float,
            v_S: Optional[StateValueEstimator],
            thread_manager: RunThreadManager,
            plot_state_value: bool,
            num_episodes_per_checkpoint: Optional[int] = None,
            checkpoint_path: Optional[str] = None) -> Optional[str]:
    """
    Perform Monte Carlo improvement of an agent's policy within an environment via the REINFORCE policy gradient method.
    This improvement function operates over rewards obtained at the end of episodes, so it is only appropriate for
    episodic tasks.

    :param agent: Agent containing target policy to be optimized.
    :param policy: Parameterized policy to be optimized.
    :param environment: Environment.
    :param num_episodes: Number of episodes to execute.
    :param update_upon_every_visit: True to update each state-action pair upon each visit within an episode, or False to
    update each state-action pair upon the first visit within an episode.
    :param alpha: Policy gradient step size.
    :param v_S: Baseline state-value estimator, or None for no baseline.
    :param thread_manager: Thread manager. The current function (and the thread running it) will wait on this manager
    before starting each iteration. This provides a mechanism for pausing, resuming, and aborting training. Omit for no
    waiting.
    :param plot_state_value: Whether or not to plot the state-value.
    :param num_episodes_per_checkpoint: Number of episodes per checkpoint save.
    :param checkpoint_path: Checkpoint path. Must be provided if `num_episodes_per_checkpoint` is provided.
    :return: Final checkpoint path, or None if checkpoints were not saved.
    """

    if thread_manager is not None:
        warnings.warn(
            'This optimization method will ignore the thread_manager.')

    if checkpoint_path is not None:
        checkpoint_path = os.path.expanduser(checkpoint_path)

    state_value_plot = None
    if plot_state_value and v_S is not None:
        state_value_plot = ScatterPlot('REINFORCE:  State Value', ['Estimate'],
                                       None)

    logging.info(
        f'Running Monte Carlo-based REINFORCE improvement for {num_episodes} episode(s).'
    )

    episode_reward_averager = IncrementalSampleAverager()
    episodes_per_print = max(1, int(num_episodes * 0.05))
    final_checkpoint_path = None
    for episode_i in range(num_episodes):

        # reset the environment for the new run (always use the agent we're learning about, as state identifiers come
        # from it), and reset the agent accordingly.
        state = environment.reset_for_new_run(agent)
        agent.reset_for_new_run(state)

        # simulate until episode termination, keeping a trace of state-action pairs and their immediate rewards, as well
        # as the times of their first visits (only if we're doing first-visit evaluation).
        t = 0
        state_action_first_t = None if update_upon_every_visit else {}
        t_state_action_reward = []
        total_reward = 0.0
        while not state.terminal and (environment.T is None
                                      or t < environment.T):

            a = agent.act(t)
            state_a = (state, a)

            if state_value_plot is not None:
                state_value_plot.update(np.array([v_S[state].get_value()]))

            # mark time step of first visit, if we're doing first-visit evaluation.
            if state_action_first_t is not None and state_a not in state_action_first_t:
                state_action_first_t[state_a] = t

            next_state, next_reward = environment.advance(state, t, a, agent)
            t_state_action_reward.append((t, state_a, next_reward))
            total_reward += next_reward.r
            state = next_state
            t += 1

            agent.sense(state, t)

        # work backwards through the trace to calculate discounted returns. need to work backward in order for the value
        # of g at each time step t to be properly discounted.
        g = 0
        for i, (t, state_a, reward) in enumerate(reversed(t_state_action_reward)):

            g = agent.gamma * g + reward.r

            # if we're doing every-visit, or if the current time step was the first visit to the state-action, then g
            # is the discounted sample value. use it to update the policy.
            if state_action_first_t is None or state_action_first_t[state_a] == t:

                state, a = state_a

                # if we don't have a baseline, then the target is the return.
                if v_S is None:
                    target = g

                # otherwise, update the baseline state-value estimator and set the target to be the difference between
                # observed return and the baseline. actions that produce an above-baseline return will be reinforced.
                else:
                    v_S[state].update(g)
                    v_S.improve()
                    estimate = v_S[state].get_value()
                    target = g - estimate

                policy.append_update(a, state, alpha, target)

        policy.commit_updates()
        episode_reward_averager.update(total_reward)

        if num_episodes_per_checkpoint is not None and episode_i % num_episodes_per_checkpoint == 0:

            resume_args = {
                'agent': agent,
                'policy': policy,
                'environment': environment,
                'num_episodes': num_episodes,
                'update_upon_every_visit': update_upon_every_visit,
                'alpha': alpha,
                'plot_state_value': plot_state_value,
                'v_S': v_S,
                'num_episodes_per_checkpoint': num_episodes_per_checkpoint,
                'checkpoint_path': checkpoint_path
            }

            checkpoint_path_with_index = insert_index_into_path(
                checkpoint_path, episode_i)
            final_checkpoint_path = checkpoint_path_with_index
            with open(checkpoint_path_with_index, 'wb') as checkpoint_file:
                pickle.dump(resume_args, checkpoint_file)

        episodes_finished = episode_i + 1
        if episodes_finished % episodes_per_print == 0:
            logging.info(
                f'Finished {episodes_finished} of {num_episodes} episode(s).')

    logging.info(
        f'Completed optimization. Average reward per episode:  {episode_reward_averager.get_value()}'
    )

    return final_checkpoint_path
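
A minimal sketch of invoking improve with the continuous-action Gym setup from Example No. 2, assuming ContinuousActionBetaDistributionPolicy is an acceptable ParameterizedPolicy as that example suggests. The episode count and step size are arbitrary placeholders, and no baseline estimator, thread manager, or checkpointing is used.

from numpy.random import RandomState

# Assumes the same classes imported in the earlier examples.
random_state = RandomState(12345)
gym = Gym(random_state=random_state, T=None, gym_id='LunarLanderContinuous-v2')
policy = ContinuousActionBetaDistributionPolicy(gym, ContinuousFeatureExtractor(), False)
agent = StochasticMdpAgent('reinforce', random_state, policy, 0.9)

final_checkpoint = improve(
    agent=agent,
    policy=policy,
    environment=gym,
    num_episodes=100,              # placeholder episode count
    update_upon_every_visit=True,
    alpha=0.0001,                  # placeholder policy gradient step size
    v_S=None,                      # no baseline state-value estimator
    thread_manager=None,
    plot_state_value=False
)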
Example No. 29
def test_learn():

    random_state = RandomState(12345)

    mancala: Mancala = Mancala(random_state=random_state,
                               T=None,
                               initial_count=4,
                               player_2=StochasticMdpAgent(
                                   'player 2', random_state,
                                   TabularPolicy(None, None), 1))

    q_S_A = TabularStateActionValueEstimator(mancala, 0.05, None)

    p1 = StochasticMdpAgent('player 1', random_state,
                            q_S_A.get_initial_policy(), 1)

    checkpoint_path = iterate_value_q_pi(
        agent=p1,
        environment=mancala,
        num_improvements=3,
        num_episodes_per_improvement=100,
        update_upon_every_visit=False,
        planning_environment=None,
        make_final_policy_greedy=False,
        q_S_A=q_S_A,
        num_improvements_per_checkpoint=3,
        checkpoint_path=tempfile.NamedTemporaryFile(delete=False).name)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_mancala.pickle', 'wb') as file:
    #     pickle.dump(p1.pi, file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_mancala.pickle',
              'rb') as file:
        fixture = pickle.load(file)

    assert tabular_pi_legacy_eq(p1.pi, fixture)

    resumed_p1 = resume_from_checkpoint(checkpoint_path=checkpoint_path,
                                        resume_function=iterate_value_q_pi,
                                        num_improvements=2)

    # run same number of improvements without checkpoint...result should be the same.
    random_state = RandomState(12345)

    mancala: Mancala = Mancala(random_state=random_state,
                               T=None,
                               initial_count=4,
                               player_2=StochasticMdpAgent(
                                   'player 2', random_state,
                                   TabularPolicy(None, None), 1))

    q_S_A = TabularStateActionValueEstimator(mancala, 0.05, None)

    no_checkpoint_p1 = StochasticMdpAgent('player 1', random_state,
                                          q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(agent=no_checkpoint_p1,
                       environment=mancala,
                       num_improvements=5,
                       num_episodes_per_improvement=100,
                       update_upon_every_visit=False,
                       planning_environment=None,
                       make_final_policy_greedy=False,
                       q_S_A=q_S_A)

    assert no_checkpoint_p1.pi == resumed_p1.pi