Example #1
def test_q_learning_slots():
    """
    Tests that the QLearning implementation successfully finds the slot
    machine with the largest expected reward.
    """
    import gym
    import numpy as np
    from src import QLearning

    np.random.seed(0)

    env = gym.make('SlotMachines-v0',
                   n_machines=10,
                   mean_range=(-10, 10),
                   std_range=(5, 10))
    env.seed(0)
    means = np.array([m.mean for m in env.machines])

    # With discount=0 only the immediate payout matters, so this test reduces
    # Q-learning to a ten-armed bandit problem.
    agent = QLearning(epsilon=0.2, discount=0)
    state_action_values, rewards = agent.fit(env, steps=10000)

    assert state_action_values.shape == (1, 10)
    assert len(rewards) == 100
    assert np.argmax(means) == np.argmax(state_action_values)

    states, actions, rewards = agent.predict(env, state_action_values)
    assert len(actions) == 1 and actions[0] == np.argmax(means)
    assert len(states) == 1 and states[0] == 0
    assert len(rewards) == 1
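
The tests above pin down only the public behavior of src.QLearning: the shape
of the value table, a reward curve of length 100, and a greedy rollout from
predict(). A minimal sketch of a class that would satisfy them, assuming
tabular epsilon-greedy learning, the old gym reset()/step() API, a learning
rate alpha that no test here sets, and steps divisible by 100:

import numpy as np

class QLearning:
    def __init__(self, epsilon=0.2, discount=0.95, alpha=0.5):
        self.epsilon = epsilon    # probability of taking a random action
        self.discount = discount  # discount factor on future rewards
        self.alpha = alpha        # learning rate (assumed; not set by any test)

    def fit(self, env, steps=10000):
        n_states = env.observation_space.n
        n_actions = env.action_space.n
        Q = np.zeros((n_states, n_actions))
        step_rewards = np.zeros(steps)
        state = env.reset()
        for t in range(steps):
            # epsilon-greedy action selection
            if np.random.random() < self.epsilon:
                action = np.random.randint(n_actions)
            else:
                action = np.argmax(Q[state])
            next_state, reward, done, _ = env.step(action)
            # one-step Q-learning update toward the bootstrapped target
            target = reward + self.discount * np.max(Q[next_state])
            Q[state, action] += self.alpha * (target - Q[state, action])
            step_rewards[t] = reward
            state = env.reset() if done else next_state
        # average per-step rewards into 100 bins, matching len(rewards) == 100
        rewards = step_rewards.reshape(100, -1).mean(axis=1)
        return Q, rewards

    def predict(self, env, state_action_values):
        # roll out one episode greedily, recording the state reached after
        # each action (so a one-step episode records the terminal state)
        states, actions, rewards = [], [], []
        state, done = env.reset(), False
        while not done:
            action = np.argmax(state_action_values[state])
            state, reward, done, _ = env.step(action)
            states.append(state)
            actions.append(action)
            rewards.append(reward)
        return np.array(states), np.array(actions), np.array(rewards)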
Example #2
def test_q_learning_deterministic():
    """
    Tests that the QLearning implementation successfully navigates a
    deterministic environment with provided state-action-values.
    """
    import gym
    import numpy as np
    from src import QLearning

    np.random.seed(0)

    env = gym.make('FrozenLakeNoSlippery-v0')
    env.seed(0)

    agent = QLearning(epsilon=0.5, discount=0.95)
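    # FrozenLake action encoding: 0 = left, 1 = down, 2 = right, 3 = up.
    # Each row below holds those four action values for one of the 16 states.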
    state_action_values = np.array([[0.0, 0.7, 0.3, 0.0], [0.0, 1.0, 0.0, 0.0],
                                    [0.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 0.0],
                                    [0.0, 0.51, 0.49, 0.0],
                                    [0.0, 0.0, 0.0, 0.0], [0.5, 0.0, 0.5, 0.0],
                                    [0.0, 0.0, 0.0, 0.0], [0.0, 0.2, 0.8, 0.0],
                                    [0.0, 0.2, 0.8, 0.0], [0.0, 0.6, 0.4, 0.0],
                                    [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0],
                                    [1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0],
                                    [0.0, 0.0, 0.0, 0.0]])

    states, actions, rewards = agent.predict(env, state_action_values)
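    # Greedy rollout from state 0: down, down, right, right, down, right,
    # passing through states 4, 8, 9, 10, 14 and ending at the goal, state 15.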
    assert np.all(states == np.array([4, 8, 9, 10, 14, 15]))
    assert np.all(actions == np.array([1, 1, 2, 2, 1, 2]))
    assert np.all(rewards == np.array([0, 0, 0, 0, 0, 1]))
Example #3
import gym
import matplotlib.pyplot as plt
import numpy as np

from src import MultiArmedBandit, QLearning


def FRQ_2_e():

    env = gym.make('FrozenLake-v0')
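    # Collect one 100-bin reward curve from each of 10 independent bandit runs.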
    multiarmed_rewards_ls = []
    for i in range(10):
        agent = MultiArmedBandit()
        action_values, multiarmed_rewards = agent.fit(env, steps=100000)
        multiarmed_rewards_ls.append(multiarmed_rewards)

    average_10_trials = np.mean(multiarmed_rewards_ls, axis=0)
    plt.plot(np.linspace(0, 99, 100), average_10_trials)

    Qlearning_rewards_ls = []
    for i in range(10):
        agent = QLearning()
        action_values, Qlearning_rewards = agent.fit(env, steps=1000)
        Qlearning_rewards_ls.append(Qlearning_rewards)
    average_10_trials = np.mean(Qlearning_rewards_ls, axis=0)
    plt.plot(np.linspace(0, 99, 100), average_10_trials)

    plt.legend(('MultiArmedBandit', 'QLearning'), loc='lower right')
    plt.title("FRQ_2e_QLearning_MultiArmedBandit_Rewards_Comparison")
    plt.xlabel("Steps")
    plt.ylabel("Reward Values")
    plt.show()
Example #4
import numpy as np
import torch
from prettytable import PrettyTable

from src import QLearning


def test_agent(agent, env, n_runs):
    t = PrettyTable()
    t.field_names = ['# objects', 'avg accuracy']

    env.generate_random_nobj = False

    for n_objects in range(1, env.max_CL_objects + 1):
        env.max_episode_objects = n_objects

        correct_label_bools = []

        # Assumption: the snippet never defines state_visit_history; start it
        # empty here (its exact structure is a detail of this custom env).
        state_visit_history = []

        for n_run in range(n_runs):
            env.reset()
            done = False

            # Roll out one episode, passing the agent's Q-values to the env.
            while not done:
                state = torch.Tensor(env.state)
                q_values = QLearning.get_qvalues(state, agent)
                __, __, done, correct_label = env.step(q_values,
                                                       env.max_train_iters,
                                                       state_visit_history)

            if correct_label:
                correct_label_bools.append(1)
            else:
                correct_label_bools.append(0)

        t.add_row([n_objects, np.mean(correct_label_bools)])

    print(t)
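
QLearning.get_qvalues is not shown in any of these examples; given the
torch.Tensor state above, a plausible (hypothetical) reading is a plain
forward pass through a network-based agent:

import torch

def get_qvalues(state, agent):
    # Hypothetical stand-in for QLearning.get_qvalues: evaluate a torch
    # network `agent` on a single state and return its action values.
    with torch.no_grad():  # inference only; no gradient tracking
        return agent(state.unsqueeze(0)).squeeze(0)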
Example #5
import gym
import matplotlib.pyplot as plt
import numpy as np

from src import QLearning


def FRQ_3_a():

    env = gym.make('FrozenLake-v0')

    rewards_001_ls = []
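    # epsilon=0.01: near-greedy, exploring on only 1% of steps.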
    for i in range(10):
        agent = QLearning(epsilon=0.01)
        action_values, Qlearning_rewards = agent.fit(env, steps=100000)
        rewards_001_ls.append(Qlearning_rewards)
    average_10_trials = np.mean(rewards_001_ls, axis=0)
    plt.plot(np.linspace(0, 99, 100), average_10_trials)

    rewards_05_ls = []
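    # epsilon=0.5: explores on half of its steps.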
    for i in range(10):
        agent = QLearning(epsilon=0.5)
        action_values, Qlearning_rewards = agent.fit(env, steps=100000)
        rewards_05_ls.append(Qlearning_rewards)
    average_10_trials = np.mean(rewards_05_ls, axis=0)
    plt.plot(np.linspace(0, 99, 100), average_10_trials)

    plt.legend(('epsilon=0.01', 'epsilon=0.5'), loc='lower right')
    plt.title("FRQ_3a_Different_Epsilon_Rewards_Comparison")
    plt.xlabel("Steps")
    plt.ylabel("Reward Values")
    plt.show()
Example #6
def test_q_learning_frozen_lake():
    """
    Tests that the QLearning implementation successfully learns the
    FrozenLake-v0 environment.
    """
    import gym
    import numpy as np
    from src import QLearning

    np.random.seed(0)

    env = gym.make('FrozenLake-v0')
    env.seed(0)

    agent = QLearning(epsilon=0.2, discount=0.95)
    state_action_values, rewards = agent.fit(env, steps=10000)

    state_values = np.max(state_action_values, axis=1)

    assert state_action_values.shape == (16, 4)
    assert len(rewards) == 100
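
    # Holes (5, 7, 11, 12) and the goal (15) are terminal: no action is ever
    # taken from them, so their learned values stay zero.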

    assert np.allclose(state_values[np.array([5, 7, 11, 12, 15])], np.zeros(5))
    assert np.all(
        state_values[np.array([0, 1, 2, 3, 4, 6, 8, 9, 10, 13, 14])] > 0)
Example #7
import time

import gym
import gym_game
import numpy as np
from src import QLearning

start_time = time.time()

env = gym.make("RoobetCrash-v0")

agent = QLearning(epsilon=0.2, discount=0.6, adaptive=True)
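# adaptive=True presumably decays epsilon over training (a custom extension of
# this repo's QLearning; it is not set in any of the other examples).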

state_action_values, observation, N = agent.fit(env)

env.crash.train_or_test = "test"
agent.predict(env=env,
              state_action_values=state_action_values,
              observation=observation,
              N=N)

elapsed_time = time.time() - start_time
print("Time elapsed: ", elapsed_time)
Example #8
import gym
import matplotlib.pyplot as plt
import numpy as np

from src import QLearning

print('Starting example experiment')

env = gym.make('FrozenLake-v0')
epsilon1 = 0.01
epsilon2 = 0.5
steps = 100000
trials = 10
discount = 0.95

reward_matrix_q1 = np.zeros((trials, 100))
reward_matrix_q2 = np.zeros((trials, 100))
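# fit() reports rewards as a 100-bin curve (see the tests above); store one
# row per trial so the trials can be averaged column-wise.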

for trial in range(trials):
    agent_q1 = QLearning(epsilon=epsilon1, discount=discount)
    agent_q2 = QLearning(epsilon=epsilon2, discount=discount)
    action_values_q1, rewards_q1 = agent_q1.fit(env, steps=steps)
    action_values_q2, rewards_q2 = agent_q2.fit(env, steps=steps)

    reward_matrix_q1[trial, :] = rewards_q1
    reward_matrix_q2[trial, :] = rewards_q2

average_rewards_q1 = np.mean(reward_matrix_q1, axis=0)
average_rewards_q2 = np.mean(reward_matrix_q2, axis=0)

plt.figure()
plt.plot(range(len(average_rewards_q1)),
         average_rewards_q1,
         color='blue',
         label='Average rewards of QLearning for epsilon = 0.01')
Example #9
import gym
import matplotlib.pyplot as plt
import numpy as np

from src import MultiArmedBandit, QLearning

print('Starting example experiment')

env = gym.make('SlotMachines-v0')
steps = 100000
trials = 10
action_matrix_bandits = np.zeros((trials, 10))
reward_matrix_bandits = np.zeros((trials, 100))
action_matrix_q = np.zeros((trials, 10))
reward_matrix_q = np.zeros((trials, 100))

for trial in range(trials):
    agent_bandits = MultiArmedBandit()
    agent_q = QLearning()
    action_values_bandits, rewards_bandits = agent_bandits.fit(env,
                                                               steps=steps)
    action_values_q, rewards_q = agent_q.fit(env, steps=steps)

    action_matrix_bandits[trial, :] = action_values_bandits
    reward_matrix_bandits[trial, :] = rewards_bandits
    action_matrix_q[trial, :] = action_values_q
    reward_matrix_q[trial, :] = rewards_q

average_5_rewards_bandits = np.mean(reward_matrix_bandits[:5], axis=0)
average_10_rewards_bandits = np.mean(reward_matrix_bandits, axis=0)
average_10_rewards_q = np.mean(reward_matrix_q, axis=0)

plt.figure()
plt.plot(range(len(reward_matrix_bandits[0])),