Example no. 1
def test_collect_Q():
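    # Train SARSA on a 3x3 GridWorld and check the Q-values gathered by the
    # CollectQ and CollectMaxQ callbacks against reference values.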
    np.random.seed(88)
    mdp = GridWorld(3, 3, (2, 2))

    eps = Parameter(0.1)
    pi = EpsGreedy(eps)
    alpha = Parameter(0.1)
    agent = SARSA(pi, mdp.info, alpha)

    callback_q = CollectQ(agent.Q)
    callback_max_q = CollectMaxQ(agent.Q, np.array([2]))

    core = Core(agent, mdp, callbacks=[callback_q, callback_max_q])

    core.learn(n_steps=1000, n_steps_per_fit=1, quiet=True)

    V_test = np.array([2.4477574, 0.02246188, 1.6210059, 6.01867052])
    V = callback_q.get()[-1]

    assert np.allclose(V[0, :], V_test)

    V_max = np.array([np.max(x[2, :], axis=-1) for x in callback_q.get()])
    max_q = np.array(callback_max_q.get())

    assert np.allclose(V_max, max_q)
Example no. 2
def experiment():
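    # Q-Learning on a five-state chain MDP, fitting after every step.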
    np.random.seed()

    # MDP
    mdp = generate_simple_chain(state_n=5,
                                goal_states=[2],
                                prob=.8,
                                rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)
Example no. 3
def experiment(boosted):
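    # Fit FQI with extra trees on CarOnHill (optionally boosted, with three
    # summed models) and return the mean discounted return over a 3x3 grid
    # of initial states.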
    np.random.seed(20)

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    if not boosted:
        approximator_params = dict(
            input_shape=mdp.info.observation_space.shape,
            n_actions=mdp.info.action_space.n,
            n_estimators=50,
            min_samples_split=5,
            min_samples_leaf=2)
    else:
        approximator_params = dict(
            input_shape=mdp.info.observation_space.shape,
            n_actions=mdp.info.action_space.n,
            n_models=3,
            prediction='sum',
            n_estimators=50,
            min_samples_split=5,
            min_samples_leaf=2)

    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=3, boosted=boosted, quiet=True)
    fit_params = dict()
    agent_params = {
        'approximator_params': approximator_params,
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = FQI(approximator, pi, mdp.info, agent_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=50, n_episodes_per_fit=50, quiet=True)

    # Test
    test_epsilon = Parameter(0)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((9, 2))
    cont = 0
    for i in range(-8, 9, 8):
        for j in range(-8, 9, 8):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states, quiet=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))
Example no. 4
def experiment(policy, value):
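    # SARSA on a taxi grid; returns the average per-step reward collected
    # over 2000 training steps.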
    np.random.seed(45)

    # MDP
    mdp = generate_taxi('tests/taxi/grid.txt', rew=(0, 1, 5))

    # Policy
    pi = policy(Parameter(value=value))

    # Agent
    learning_rate = Parameter(value=.15)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = SARSA(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    n_steps = 2000
    core.learn(n_steps=n_steps, n_steps_per_fit=1, quiet=True)

    return np.sum(np.array(collect_dataset.get())[:, 2]) / float(n_steps)
Example no. 5
def learn(alg, alg_params):
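    # Train the batch algorithm passed in as `alg` on CarOnHill with extra
    # trees, then evaluate with a mostly-random policy (epsilon = 0.75).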
    mdp = CarOnHill()
    np.random.seed(1)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    agent = alg(approximator, pi, mdp.info,
                approximator_params=approximator_params, **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=5, n_episodes_per_fit=5)

    test_epsilon = Parameter(0.75)
    agent.policy.set_epsilon(test_epsilon)
    dataset = core.evaluate(n_episodes=2)

    return np.mean(compute_J(dataset, mdp.info.gamma))
Example no. 6
def experiment():
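    # Q-Learning on a five-state chain MDP, evaluating the policy before
    # and after training.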
    np.random.seed()

    # MDP
    mdp = generate_simple_chain(state_n=5,
                                goal_states=[2],
                                prob=.8,
                                rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(pi, mdp.info, **algorithm_params)

    # Core
    core = Core(agent, mdp)

    # Initial policy Evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    print('J start:', J)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)

    # Final Policy Evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    print('J final:', J)
Example no. 7
def experiment2():
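    # Flat Q-Learning on the chain MDP; returns the learned Q-table.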
    np.random.seed(3)
    print('mushroom     :')

    # MDP
    mdp = generate_simple_chain(state_n=5,
                                goal_states=[2],
                                prob=.8,
                                rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)
    dataset = collect_dataset.get()
    return agent.Q.table
Example no. 8
def test_dataset_utils():
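    # Exercise the dataset utilities (compute_J, episodes_length, episode
    # and sample selection, parse_dataset, compute_metrics) on evaluation
    # runs of an untrained SARSA agent.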
    np.random.seed(88)

    mdp = GridWorld(3, 3, (2, 2))
    epsilon = Parameter(value=0.)
    alpha = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    agent = SARSA(pi, mdp.info, alpha)
    core = Core(agent, mdp)

    dataset = core.evaluate(n_episodes=10)

    J = compute_J(dataset, mdp.info.gamma)
    J_test = np.array([
        1.16106307e-03, 2.78128389e-01, 1.66771817e+00, 3.09031544e-01,
        1.19725152e-01, 9.84770902e-01, 1.06111661e-02, 2.05891132e+00,
        2.28767925e+00, 4.23911583e-01
    ])
    assert np.allclose(J, J_test)

    L = episodes_length(dataset)
    L_test = np.array([87, 35, 18, 34, 43, 23, 66, 16, 15, 31])
    assert np.array_equal(L, L_test)

    dataset_ep = select_first_episodes(dataset, 3)
    J = compute_J(dataset_ep, mdp.info.gamma)
    assert np.allclose(J, J_test[:3])

    L = episodes_length(dataset_ep)
    assert np.allclose(L, L_test[:3])

    samples = select_random_samples(dataset, 2)
    s, a, r, ss, ab, last = parse_dataset(samples)
    s_test = np.array([[6.], [1.]])
    a_test = np.array([[0.], [1.]])
    r_test = np.zeros(2)
    ss_test = np.array([[3], [4]])
    ab_test = np.zeros(2)
    last_test = np.zeros(2)
    assert np.array_equal(s, s_test)
    assert np.array_equal(a, a_test)
    assert np.array_equal(r, r_test)
    assert np.array_equal(ss, ss_test)
    assert np.array_equal(ab, ab_test)
    assert np.array_equal(last, last_test)

    index = np.sum(L_test[:2]) + L_test[2] // 2
    min_J, max_J, mean_J, n_episodes = compute_metrics(dataset[:index],
                                                       mdp.info.gamma)
    assert min_J == 0.0
    assert max_J == 0.0011610630703530948
    assert mean_J == 0.0005805315351765474
    assert n_episodes == 2
Example no. 9
def experiment(n_epochs, n_episodes):
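    # COPDAC-Q on the inverted pendulum with tile-coding features and a
    # Gaussian policy with a linear mean; `Display` is assumed to be a
    # visualization callback defined elsewhere in the module.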
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy, mu, mdp.info,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high,
                                     phi, phi)
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)
Example no. 10
def test_rq_learning():
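    # RQLearning in its four configurations (beta or delta, on- or
    # off-policy), each checked against a reference Q-table.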
    pi, mdp, _ = initialize()

    agent = RQLearning(pi, mdp.info, Parameter(.1), beta=Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[0.32411217, 2.9698436, 0.46474438, 1.10269504],
                       [2.99505139, 5.217031, 0.40933461, 0.37687883],
                       [0.41942675, 0.32363486, 0., 4.68559],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)

    agent = RQLearning(pi, mdp.info, Parameter(.1), delta=Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[1.04081115e-2, 5.14662188e-1, 1.73951634e-2, 1.24081875e-01],
                       [0., 2.71, 1.73137500e-4, 4.10062500e-6],
                       [0., 4.50000000e-2, 0., 4.68559],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)

    agent = RQLearning(pi, mdp.info, Parameter(.1), off_policy=True,
                       beta=Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[3.55204022, 4.54235939, 3.42601165, 2.95170908],
                       [2.73877031, 3.439, 2.42031528, 2.86634531],
                       [3.43274708, 3.8592342, 3.72637395, 5.217031],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)

    agent = RQLearning(pi, mdp.info, Parameter(.1), off_policy=True,
                       delta=Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[0.18947806, 1.57782254, 0.21911489, 1.05197011],
                       [0.82309759, 5.217031, 0.04167492, 0.61472604],
                       [0.23620541, 0.59828262, 1.25299991, 5.217031],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
Example no. 11
def experiment():
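    # FQI with extra trees on CarOnHill; returns the mean discounted return
    # over a 17x17 grid of initial states, rendering an episode before
    # training and three after.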
    np.random.seed()

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=20)
    agent = FQI(approximator, pi, mdp.info,
                approximator_params=approximator_params, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Render
    core.evaluate(n_episodes=1, render=True)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=1000)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((289, 2))
    cont = 0
    for i in range(-8, 9):
        for j in range(-8, 9):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states)

    # Render
    core.evaluate(n_episodes=3, render=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))
Example no. 12
def test_r_learning():
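    # RLearning trained on the test MDP and checked against a reference
    # Q-table.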
    pi, mdp, _ = initialize()
    agent = RLearning(pi, mdp.info, Parameter(.1), Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[-6.19137991, -3.9368055, -5.11544257, -3.43673781],
                       [-2.52319391, 1.92201829, -2.77602918, -2.45972955],
                       [-5.38824415, -2.43019918, -1.09965936, 2.04202511],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
Example no. 13
def test_r_learning():
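    # Direct _update calls on a hand-initialized Q-table; `pi` and `mdp`
    # are assumed to be module-level test fixtures.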
    alg = RLearning(pi, mdp.info, Parameter(.1), Parameter(.5))

    alg.Q.table = np.arange(np.prod(mdp.info.size)).reshape(
        mdp.info.size).astype(float)

    alg._update(0, 1, 100, 1, 0)
    alg._update(1, 0, 10, 3, 1)
    alg._update(3, 1, 50, 3, 0)
    alg._update(2, 2, -100, 3, 1)

    test_q = np.array([[0, 11.6, 2, 3], [-.17, 5, 6, 7], [8, 9, -5.77, 11],
                       [12, 13.43, 14, 15]])

    assert np.allclose(alg.Q.table, test_q)
Example no. 14
def test_lspi():
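    # LSPI on CartPole with a single polynomial basis feature; checks the
    # learned weights.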
    mdp = CartPole()
    np.random.seed(1)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]
    features = Features(basis_list=basis)

    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(pi,
                 mdp.info,
                 fit_params=dict(),
                 approximator_params=approximator_params,
                 features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    w = agent.approximator.get_weights()
    w_test = np.array([-2.23880597, -2.27427603, -2.25])

    assert np.allclose(w, w_test)
Example no. 15
def test_sarsa_lambda_continuous_nn():
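    # SARSA(lambda) with a torch network approximator; `Network` is assumed
    # to be defined at module level.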
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    features = Features(
        n_outputs=mdp_continuous.info.observation_space.shape[0]
    )

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        network=Network,
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = SARSALambdaContinuous(TorchApproximator, pi, mdp_continuous.info,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-0.18968964, 0.4296857, 0.52967095, 0.5674884,
                       -0.12784956, -0.10572472, -0.14546978, -0.67001086,
                       -0.93925357])

    assert np.allclose(agent.Q.get_weights(), test_w)
Example no. 16
def test_sarsa_lambda_continuous_linear():
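    # SARSA(lambda) with a linear approximator over tile-coding features.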
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = SARSALambdaContinuous(LinearApproximator, pi, mdp_continuous.info,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-16.38428419, 0., -14.31250136, 0., -15.68571525, 0.,
                       -10.15663821, 0., -15.0545445, 0., -8.3683605, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)
Example no. 17
def test_boltzmann():
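    # Boltzmann policy: state and state-action probabilities, action draw,
    # and the effect of swapping in a decaying beta.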
    np.random.seed(88)
    beta = Parameter(0.1)
    pi = Boltzmann(beta)

    Q = Table((10, 3))
    Q.table = np.random.randn(10, 3)

    pi.set_q(Q)

    s = np.array([2])
    a = np.array([1])

    p_s = pi(s)
    p_s_test = np.array([0.30676679, 0.36223227, 0.33100094])
    assert np.allclose(p_s, p_s_test)

    p_sa = pi(s, a)
    p_sa_test = np.array([0.36223227])
    assert np.allclose(p_sa, p_sa_test)

    a = pi.draw_action(s)
    a_test = 2
    assert a.item() == a_test

    beta_2 = LinearParameter(0.2, 0.1, 2)
    pi.set_beta(beta_2)
    p_sa_2 = pi(s, a)
    assert p_sa_2 < p_sa

    pi.update(s, a)
    p_sa_3 = pi(s, a)
    p_sa_3_test = np.array([0.33100094])
    assert np.allclose(p_sa_3, p_sa_3_test)
Example no. 18
def test_sarsa_lambda_continuous():
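    # Direct _update calls over a grid of states; `pi` and `mdp_continuous`
    # are assumed to be module-level test fixtures.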
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size, ),
        output_shape=(mdp_continuous.info.action_space.n, ),
        n_actions=mdp_continuous.info.action_space.n)
    alg = SARSALambdaContinuous(LinearApproximator,
                                pi,
                                mdp_continuous.info,
                                Parameter(.1),
                                .9,
                                features=features,
                                approximator_params=approximator_params)

    s_1 = np.linspace(mdp_continuous.info.observation_space.low[0],
                      mdp_continuous.info.observation_space.high[0], 10)
    s_2 = np.linspace(mdp_continuous.info.observation_space.low[1],
                      mdp_continuous.info.observation_space.high[1], 10)
    for i in s_1:
        for j in s_2:
            alg._update(np.array([i, j]), np.array([1]), 100, np.array([0, 0]),
                        0)

    test_w = np.array([[0, 0, 0, 0], [320.43, 399.8616, 340.397, 417.218],
                       [0, 0, 0, 0]])

    assert np.allclose(alg.Q.get_weights(), test_w.ravel())
Example no. 19
def test_true_online_sarsa_lambda():
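    # True online SARSA(lambda) with tile-coding features, trained through
    # Core and checked against reference weights.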
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = TrueOnlineSARSALambda(pi, mdp_continuous.info,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-17.27410736, 0., -15.04386343, 0., -16.6551805, 0.,
                       -11.31383707, 0., -16.11782002, 0., -9.6927357, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)
Example no. 20
def experiment(algorithm_class, decay_exp):
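    # Run a tabular algorithm on a FiniteMDP built from saved transition
    # and reward arrays, collecting Q-table values with a CollectQ callback.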
    np.random.seed()

    # MDP
    p = np.load('chain_structure/p.npy')
    rew = np.load('chain_structure/rew.npy')
    mdp = FiniteMDP(p, rew, gamma=.9)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1.,
                                              decay_exp=decay_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(pi, mdp.info, **algorithm_params)

    # Algorithm
    collect_Q = CollectQ(agent.approximator)
    callbacks = [collect_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=20000, n_steps_per_fit=1, quiet=True)

    Qs = collect_Q.get_values()

    return Qs
Example no. 21
def test_true_online_sarsa_lambda():
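    # Direct _update calls over a grid of states; `pi` and `mdp_continuous`
    # are assumed to be module-level test fixtures.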
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size, ),
        output_shape=(mdp_continuous.info.action_space.n, ),
        n_actions=mdp_continuous.info.action_space.n)
    alg = TrueOnlineSARSALambda(pi,
                                mdp_continuous.info,
                                Parameter(.1),
                                .9,
                                features=features,
                                approximator_params=approximator_params)

    s_1 = np.linspace(mdp_continuous.info.observation_space.low[0],
                      mdp_continuous.info.observation_space.high[0], 10)
    s_2 = np.linspace(mdp_continuous.info.observation_space.low[1],
                      mdp_continuous.info.observation_space.high[1], 10)
    for i in s_1:
        for j in s_2:
            alg._update(np.array([i, j]), np.array([1]), 100, np.array([0, 0]),
                        0)

    test_w = np.array([[0, 0, 0, 0], [927.0283, 798.57594, 876.8018, 705.227],
                       [0, 0, 0, 0]])

    assert np.allclose(alg.Q.get_weights(), test_w.ravel())
Example no. 22
def test_eps_greedy():
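    # Epsilon-greedy policy: probabilities, action draw, and epsilon decay.
    # LinearParameter(0.2, 0.1, 2) reaches 0.1 after two updates, so the
    # final probability matches the one under the original eps = 0.1.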
    np.random.seed(88)
    eps = Parameter(0.1)
    pi = EpsGreedy(eps)

    Q = Table((10, 3))
    Q.table = np.random.randn(10, 3)

    pi.set_q(Q)

    s = np.array([2])
    a = np.array([1])

    p_s = pi(s)
    p_s_test = np.array([0.03333333, 0.93333333, 0.03333333])
    assert np.allclose(p_s, p_s_test)

    p_sa = pi(s, a)
    p_sa_test = np.array([0.93333333])
    assert np.allclose(p_sa, p_sa_test)

    a = pi.draw_action(s)
    a_test = 1
    assert a.item() == a_test

    eps_2 = LinearParameter(0.2, 0.1, 2)
    pi.set_epsilon(eps_2)
    p_sa_2 = pi(s, a)
    assert p_sa_2 < p_sa

    pi.update(s, a)
    pi.update(s, a)
    p_sa_3 = pi(s, a)
    print(eps_2.get_value())
    assert p_sa_3 == p_sa
Example no. 23
def experiment():
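    # LSPI on CartPole with polynomial and Gaussian RBF features; returns
    # the mean episode length of a greedy evaluation episode.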
    np.random.seed()

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]

    s1 = np.array([-np.pi, 0, np.pi]) * .25
    s2 = np.array([-1, 0, 1])
    for i in s1:
        for j in s2:
            basis.append(GaussianRBF(np.array([i, j]), np.array([1.])))
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(pi,
                 mdp.info,
                 fit_params=fit_params,
                 approximator_params=approximator_params,
                 features=features)

    # Algorithm
    core = Core(agent, mdp)
    core.evaluate(n_episodes=3, render=True)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    dataset = core.evaluate(n_episodes=1, quiet=True)

    core.evaluate(n_steps=100, render=True)

    return np.mean(episodes_length(dataset))
Example no. 24
    def __init__(self, epsilon=None):
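        # Default to a fixed zero exploration rate when no epsilon is given.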
        if epsilon is None:
            epsilon = Parameter(0.)

        super(WeightedGaussianPolicy, self).__init__()

        self._epsilon = epsilon
        self._evaluation = False
Example no. 25
def experiment():
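    # Hierarchical setup on the chain MDP: a model block plus two control
    # blocks running SimpleAgents at different timescales, wired into a
    # ComputationalGraph.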
    np.random.seed(3)
    # MDP
    mdp = generate_simple_chain(state_n=5,
                                goal_states=[2],
                                prob=.8,
                                rew=1,
                                gamma=.9)

    action_space = mdp._mdp_info.action_space
    observation_space = mdp._mdp_info.observation_space
    gamma = mdp._mdp_info.gamma

    # Model Block
    model_block = MBlock(env=mdp, render=False)

    # Policy
    epsilon = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon)
    table = Table(mdp.info.size)
    pi.set_q(table)

    # Agents
    mdp_info_agent1 = MDPInfo(observation_space=observation_space,
                              action_space=spaces.Discrete(5),
                              gamma=1,
                              horizon=20)
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Discrete(5),
                              action_space=action_space,
                              gamma=gamma,
                              horizon=10)
    agent1 = SimpleAgent(name='HIGH', mdp_info=mdp_info_agent1, policy=pi)
    agent2 = SimpleAgent(name='LOW', mdp_info=mdp_info_agent2, policy=pi)

    # Control Blocks
    control_block1 = ControlBlock(wake_time=10,
                                  agent=agent1,
                                  n_eps_per_fit=None,
                                  n_steps_per_fit=1)
    control_block2 = ControlBlock(wake_time=1,
                                  agent=agent2,
                                  n_eps_per_fit=None,
                                  n_steps_per_fit=1)

    # Algorithm
    blocks = [model_block, control_block1, control_block2]
    order = [0, 1, 2]
    model_block.add_input(control_block2)
    control_block1.add_input(model_block)
    control_block1.add_reward(model_block)
    control_block2.add_input(control_block1)
    control_block2.add_reward(model_block)
    computational_graph = ComputationalGraph(blocks=blocks, order=order)
    core = HierarchicalCore(computational_graph)

    # Train
    core.learn(n_steps=40, quiet=True)
    return
Example no. 26
def experiment():
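    # Hierarchical counterpart of the flat Q-Learning experiment: a single
    # control block wrapped in a computational graph; returns the Q-table.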
    np.random.seed(3)
    print('hierarchical     :')
    # MDP
    mdp = generate_simple_chain(state_n=5,
                                goal_states=[2],
                                prob=.8,
                                rew=1,
                                gamma=.9)

    # Model Block
    model_block = MBlock(env=mdp, render=False)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = QLearning(pi, mdp.info, agent_params)

    # Control Block
    control_block = ControlBlock(wake_time=1,
                                 agent=agent,
                                 n_eps_per_fit=None,
                                 n_steps_per_fit=1)

    # Algorithm
    blocks = [model_block, control_block]
    order = [0, 1]
    model_block.add_input(control_block)
    control_block.add_input(model_block)
    control_block.add_reward(model_block)
    computational_graph = ComputationalGraph(blocks=blocks, order=order)
    core = HierarchicalCore(computational_graph)

    # Train
    core.learn(n_steps=100, quiet=True)
    return agent.Q.table
Example no. 27
    def __init__(self, n_approximators, epsilon=None):
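        # Default to a fixed zero exploration rate when no epsilon is given.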
        if epsilon is None:
            epsilon = Parameter(0.)

        super(WeightedPolicy, self).__init__()

        self._n_approximators = n_approximators
        self._epsilon = epsilon
        self._evaluation = False
Example no. 28
def experiment(alpha):
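    # True online SARSA(lambda) with tile coding on Gym's MountainCar-v0;
    # returns the mean undiscounted return from two fixed initial states.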
    gym.logger.setLevel(0)
    np.random.seed(386)

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=10000, gamma=1.)
    mdp.seed(201)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(alpha)
    tilings = Tiles.generate(10, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate,
                        'lambda': .9}
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = TrueOnlineSARSALambda(pi, mdp.info, agent_params, features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.array([[0., 0.], [.1, .1]])
    dataset = core.evaluate(initial_states=initial_states, quiet=True)

    return np.mean(compute_J(dataset, 1.))
Example no. 29
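    # Callback that counts completed episodes and divides the policy's
    # epsilon by 1.01 every 50 episodes.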
    def __call__(self, **kwargs):
        dataset = kwargs.get('dataset')
        for step in dataset:
            last = step[-1]
            if last:
                self.counter += 1
        if self.counter % 50 == 0 and self.last_counter != self.counter:
            new_epsilon = Parameter(self._policy._epsilon.get_value() / 1.01)
            self._policy.set_epsilon(new_epsilon)
            self.last_counter = self.counter
Example no. 30
    def __init__(self, n_approximators, epsilon=None):
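        # Default to a fixed zero exploration rate when no epsilon is given.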
        if epsilon is None:
            epsilon = Parameter(0.)

        super(BootPolicy, self).__init__()

        self._n_approximators = n_approximators
        self._epsilon = epsilon
        self._evaluation = False
        self._idx = None
        self.plotter = None