Exemplo n.º 1
0
def test_q_learning_deterministic():
    """
    Check ``QLearning.predict`` on the 'FrozonLakeNoSlippery-v0' environment.

    A hand-written table of state-action values is handed to ``predict``,
    and the returned states, actions and rewards are compared element-wise
    against a fixed expected trajectory.
    """
    from code import QLearning

    # Seed NumPy's global RNG and the environment so the run is repeatable.
    np.random.seed(0)

    lake = gym.make('FrozonLakeNoSlippery-v0')
    lake.seed(0)

    learner = QLearning(epsilon=0.5, discount=0.95)

    # 16 rows x 4 columns; presumably indexed as [state, action].
    q_table = np.array([
        [0.0, 0.7, 0.3, 0.0],    # row 0
        [0.0, 1.0, 0.0, 0.0],    # row 1
        [0.0, 0.0, 1.0, 0.0],    # row 2
        [0.0, 1.0, 0.0, 0.0],    # row 3
        [0.0, 0.51, 0.49, 0.0],  # row 4
        [0.0, 0.0, 0.0, 0.0],    # row 5
        [0.5, 0.0, 0.5, 0.0],    # row 6
        [0.0, 0.0, 0.0, 0.0],    # row 7
        [0.0, 0.2, 0.8, 0.0],    # row 8
        [0.0, 0.2, 0.8, 0.0],    # row 9
        [0.0, 0.6, 0.4, 0.0],    # row 10
        [0.0, 0.0, 0.0, 0.0],    # row 11
        [0.0, 0.0, 0.0, 0.0],    # row 12
        [1.0, 0.0, 0.0, 0.0],    # row 13
        [0.0, 0.0, 1.0, 0.0],    # row 14
        [0.0, 0.0, 0.0, 0.0],    # row 15
    ])

    visited_states, chosen_actions, collected_rewards = learner.predict(
        lake, q_table)

    expected_states = np.array([4, 8, 9, 10, 14, 15])
    expected_actions = np.array([1, 1, 2, 2, 1, 2])
    expected_rewards = np.array([0, 0, 0, 0, 0, 1])

    assert np.all(visited_states == expected_states)
    assert np.all(chosen_actions == expected_actions)
    assert np.all(collected_rewards == expected_rewards)
Exemplo n.º 2
0
def test_q_learning_slots():
    """
    Check that QLearning identifies the best machine in 'SlotMachines-v0'.

    The agent is fitted for 10000 steps on an environment with 10 machines.
    The learned state-action values must have shape (1, 10) and their
    argmax must match the machine with the largest mean; ``predict`` must
    then return exactly one state (0), one action (that machine) and one
    reward.
    """
    from code import QLearning

    # Seed NumPy's global RNG and the environment so the run is repeatable.
    np.random.seed(0)

    slots = gym.make(
        'SlotMachines-v0',
        n_machines=10,
        mean_range=(-10, 10),
        std_range=(5, 10),
    )
    slots.seed(0)

    # Record each machine's mean and the index of the largest one.
    machine_means = np.array([machine.mean for machine in slots.machines])
    best_machine = np.argmax(machine_means)

    learner = QLearning(epsilon=0.2, discount=0)
    q_table, fit_rewards = learner.fit(slots, steps=10000)

    assert q_table.shape == (1, 10)
    assert len(fit_rewards) == 100
    assert best_machine == np.argmax(q_table)

    seen_states, taken_actions, seen_rewards = learner.predict(slots, q_table)

    assert len(taken_actions) == 1
    assert taken_actions[0] == best_machine
    assert len(seen_states) == 1
    assert seen_states[0] == 0
    assert len(seen_rewards) == 1