Example 1
def test_q_learning_iterate_value_q_pi_function_approximation_policy_ne():

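    # Train two agents that differ only in the number of episodes per policy
    # improvement (10 vs. 5); the resulting estimators and fitted models
    # should compare unequal.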
    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    epsilon = 0.05

    q_S_A_1 = ApproximateStateActionValueEstimator(
        mdp_environment, epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment), None, False, None, None)

    mdp_agent_1 = StochasticMdpAgent('test', random_state,
                                     q_S_A_1.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent_1,
                       environment=mdp_environment,
                       num_improvements=5,
                       num_episodes_per_improvement=10,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True,
                       q_S_A=q_S_A_1)

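    # Same construction as above, but trained with half the episodes per
    # improvement.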
    q_S_A_2 = ApproximateStateActionValueEstimator(
        mdp_environment, epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment), None, False, None, None)

    mdp_agent_2 = StochasticMdpAgent('test', random_state,
                                     q_S_A_2.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent_2,
                       environment=mdp_environment,
                       num_improvements=5,
                       num_episodes_per_improvement=5,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True,
                       q_S_A=q_S_A_2)

    assert mdp_agent_1.pi.estimator != mdp_agent_2.pi.estimator
    assert mdp_agent_1.pi.estimator.model != mdp_agent_2.pi.estimator.model
Example 2
def test_q_learning_iterate_value_q_pi_function_approximation_invalid_formula():

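    # Supplying an interaction formula alongside a feature extractor that does
    # not support formulas should raise a ValueError during iteration.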
    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment, 0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        f'C(s, levels={[s.i for s in mdp_environment.SS]}):C(a, levels={[a.i for a in mdp_environment.SS[0].AA]})',
        False, None, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    with pytest.raises(ValueError, match='Invalid combination of formula'):
        iterate_value_q_pi(agent=mdp_agent,
                           environment=mdp_environment,
                           num_improvements=5,
                           num_episodes_per_improvement=5,
                           num_updates_per_improvement=None,
                           alpha=None,
                           mode=Mode.Q_LEARNING,
                           n_steps=None,
                           planning_environment=None,
                           make_final_policy_greedy=False,
                           q_S_A=q_S_A)
Example 3
def test_q_learning_iterate_value_q_pi_function_approximation_with_formula():

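    # A patsy-style interaction formula over categorical state and action
    # indices, paired with an identity feature extractor; the fitted
    # coefficients are checked against a pickled fixture.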
    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)
    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment, 0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        StateActionIdentityFeatureExtractor(mdp_environment),
        f'C(s, levels={[s.i for s in mdp_environment.SS]}):C(a, levels={[a.i for a in mdp_environment.SS[0].AA]})',
        False, None, None)
    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=5,
                       num_episodes_per_improvement=5,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=False)

    # uncomment the following lines and run the test to update the fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation.pickle',
            'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert np.allclose(mdp_agent.pi.estimator.model.model.coef_,
                       pi_fixture.estimator.model.model.coef_)
Example 4
def test_off_policy_monte_carlo_with_function_approximation():

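    # Off-policy Monte Carlo control: a tabular behavior agent generates the
    # episodes, and the target agent's approximate estimator is updated from
    # them.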
    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment,
        0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        None,
        False,
        None,
        None
    )

    # target agent
    mdp_agent = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        q_S_A
    )

    # episode generation (behavior) policy
    off_policy_agent = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        TabularStateActionValueEstimator(mdp_environment, None, None)
    )

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=100,
        num_episodes_per_improvement=1,
        update_upon_every_visit=True,
        planning_environment=None,
        make_final_policy_greedy=False,
        off_policy_agent=off_policy_agent
    )

    # uncomment the following lines and run the test to update the fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_off_policy_monte_carlo_with_function_approximationo.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_off_policy_monte_carlo_with_function_approximationo.pickle', 'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert mdp_agent.pi == pi_fixture and q_S_A == q_S_A_fixture
    assert str(mdp_agent.pi.estimator[mdp_environment.SS[5]][mdp_environment.SS[5].AA[1]]).startswith('-2.4305')

    # make greedy
    q_S_A.epsilon = 0.0
    assert q_S_A.improve_policy(mdp_agent, None, PolicyImprovementEvent.MAKING_POLICY_GREEDY) == -1
    assert mdp_agent.pi.estimator.epsilon == 0.0
Example 5
def main():

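    # Train a SARSA agent on Gym's CartPole-v1, rendering every 100th episode.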
    random_state = RandomState(12345)

    environment = Gym(
        random_state=random_state,
        T=None,
        gym_id='CartPole-v1',
        continuous_action_discretization_resolution=None,
        render_every_nth_episode=100
    )

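    # Constant learning rate without regularization. Note that recent versions
    # of scikit-learn renamed the 'squared_loss' option to 'squared_error'.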
    model = SKLearnSGD(
        loss='squared_loss',
        alpha=0.0,
        learning_rate='constant',
        eta0=0.0001,
        scale_eta0_for_y=False
    )

    feature_extractor = CartpoleFeatureExtractor(
        environment=environment
    )

    q_S_A = ApproximateStateActionValueEstimator(
        environment=environment,
        epsilon=0.02,
        model=model,
        feature_extractor=feature_extractor,
        formula=None,
        plot_model=False,
        plot_model_per_improvements=None,
        plot_model_bins=None
    )

    agent = StochasticMdpAgent(
        name='Cartpole Agent',
        random_state=random_state,
        pi=q_S_A.get_initial_policy(),
        gamma=0.95
    )

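    # One episode and one value update per improvement, with a plot every 100
    # improvements.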
    iterate_value_q_pi(
        agent=agent,
        environment=environment,
        num_improvements=15000,
        num_episodes_per_improvement=1,
        num_updates_per_improvement=1,
        alpha=None,
        mode=Mode.SARSA,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True,
        q_S_A=q_S_A,
        num_improvements_per_plot=100
    )
Example 6
def test_q_learning_iterate_value_q_pi_function_approximation_no_formula():

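    # Feature-extractor-only variant (no formula). The fitted coefficients and
    # the formatted policy output are checked against a pickled fixture.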
    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment, 0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment), None, False, None, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=10,
                       num_episodes_per_improvement=20,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True,
                       q_S_A=q_S_A)

    # uncomment the following lines and run the test to update the fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation_no_formula.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation_no_formula.pickle',
            'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert np.allclose(mdp_agent.pi.estimator.model.model.coef_,
                       pi_fixture.estimator.model.model.coef_)
    assert mdp_agent.pi.format_state_action_probs(mdp_environment.SS) == \
        pi_fixture.format_state_action_probs(mdp_environment.SS)
    assert mdp_agent.pi.format_state_action_values(mdp_environment.SS) == \
        pi_fixture.format_state_action_values(mdp_environment.SS)
Example 7
def test_policy_overrides():

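    # Two identical training runs from the same seed should yield equal
    # policies; the test also exercises policy membership checks.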
    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    epsilon = 0.05

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment, epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment), None, False, None, None)

    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=10,
        num_episodes_per_improvement=20,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True,
    )

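    # Re-seed and repeat the identical run with a fresh environment,
    # estimator, and agent.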
    random_state = RandomState(12345)

    mdp_environment_2: Gridworld = Gridworld.example_4_1(random_state, 20)

    q_S_A_2 = ApproximateStateActionValueEstimator(
        mdp_environment_2, epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment_2), None, False, None, None)

    mdp_agent_2 = ActionValueMdpAgent('test', random_state, 1, q_S_A_2)

    iterate_value_q_pi(agent=mdp_agent_2,
                       environment=mdp_environment_2,
                       num_improvements=10,
                       num_episodes_per_improvement=20,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True)

    assert isinstance(mdp_agent_2.most_recent_state, MdpState)
    assert mdp_agent_2.most_recent_state in mdp_agent_2.pi

    with pytest.raises(ValueError,
                       match='Attempted to check for None in policy.'):
        # noinspection PyTypeChecker
        if None in mdp_agent_2.pi:  # pragma: no cover
            pass

    assert mdp_agent.pi == mdp_agent_2.pi
    assert not (mdp_agent.pi != mdp_agent_2.pi)