Example #1
def experiment():
    np.random.seed()

    # MDP
    mdp = generate_simple_chain(state_n=5,
                                goal_states=[2],
                                prob=.8,
                                rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(pi, mdp.info, **algorithm_params)

    # Core
    core = Core(agent, mdp)

    # Initial policy evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    print('J start:', J)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)

    # Final policy evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    print('J final:', J)
Example #2
def experiment(algorithm_class, decay_exp):
    np.random.seed()

    # MDP
    p = np.load('chain_structure/p.npy')
    rew = np.load('chain_structure/rew.npy')
    mdp = FiniteMDP(p, rew, gamma=.9)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1.,
                                              decay_exp=decay_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(pi, mdp.info, **algorithm_params)

    # Algorithm
    collect_Q = CollectQ(agent.approximator)
    callbacks = [collect_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=20000, n_steps_per_fit=1, quiet=True)

    Qs = collect_Q.get_values()

    return Qs
Example #3
def test_collect_Q():
    np.random.seed(88)
    mdp = GridWorld(3, 3, (2, 2))

    eps = Parameter(0.1)
    pi = EpsGreedy(eps)
    alpha = Parameter(0.1)
    agent = SARSA(pi, mdp.info, alpha)

    callback_q = CollectQ(agent.Q)
    callback_max_q = CollectMaxQ(agent.Q, np.array([2]))

    core = Core(agent, mdp, callbacks=[callback_q, callback_max_q])

    core.learn(n_steps=1000, n_steps_per_fit=1, quiet=True)

    V_test = np.array([2.4477574 , 0.02246188, 1.6210059 , 6.01867052])
    V = callback_q.get()[-1]

    assert np.allclose(V[0, :], V_test)

    V_max = np.array([np.max(x[2, :], axis=-1) for x in callback_q.get()])
    max_q = np.array(callback_max_q.get())

    assert np.allclose(V_max, max_q)
Example #4
def experiment_others(alg, decay_exp):
    np.random.seed()

    # MDP

    grid_map = "simple_gridmap.txt"
    mdp = GridWorldGenerator(grid_map=grid_map)

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                      size=mdp.info.size)

    algorithm_params = dict(learning_rate=alpha)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = alg(pi, mdp.info, agent_params)

    # Algorithm
    collect_max_Q = CollectMaxQ(agent.Q,
                                mdp.convert_to_int(mdp._start, mdp._width))
    collect_dataset = CollectDataset()
    callbacks = [collect_max_Q, collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
Example #5
def experiment2():
    np.random.seed(3)
    print('mushroom     :')

    # MDP
    mdp = generate_simple_chain(state_n=5,
                                goal_states=[2],
                                prob=.8,
                                rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)
    dataset = collect_dataset.get()
    return agent.Q.table
Example #6
def experiment():
    np.random.seed()

    # MDP
    mdp = generate_simple_chain(state_n=5,
                                goal_states=[2],
                                prob=.8,
                                rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)
Example #7
def learn(alg, alg_params):
    mdp = CarOnHill()
    np.random.seed(1)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    agent = alg(approximator, pi, mdp.info,
                approximator_params=approximator_params, **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=5, n_episodes_per_fit=5)

    test_epsilon = Parameter(0.75)
    agent.policy.set_epsilon(test_epsilon)
    dataset = core.evaluate(n_episodes=2)

    return np.mean(compute_J(dataset, mdp.info.gamma))
Example #8
def test_lspi():
    mdp = CartPole()
    np.random.seed(1)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]
    features = Features(basis_list=basis)

    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(pi,
                 mdp.info,
                 fit_params=dict(),
                 approximator_params=approximator_params,
                 features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    w = agent.approximator.get_weights()
    w_test = np.array([-2.23880597, -2.27427603, -2.25])

    assert np.allclose(w, w_test)
Example #9
def experiment(boosted):
    np.random.seed(20)

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    if not boosted:
        approximator_params = dict(
            input_shape=mdp.info.observation_space.shape,
            n_actions=mdp.info.action_space.n,
            n_estimators=50,
            min_samples_split=5,
            min_samples_leaf=2)
    else:
        approximator_params = dict(
            input_shape=mdp.info.observation_space.shape,
            n_actions=mdp.info.action_space.n,
            n_models=3,
            prediction='sum',
            n_estimators=50,
            min_samples_split=5,
            min_samples_leaf=2)

    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=3, boosted=boosted, quiet=True)
    fit_params = dict()
    agent_params = {
        'approximator_params': approximator_params,
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = FQI(approximator, pi, mdp.info, agent_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=50, n_episodes_per_fit=50, quiet=True)

    # Test
    test_epsilon = Parameter(0)
    agent.policy.set_epsilon(test_epsilon)

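    # Evaluate from a 3x3 grid of initial states over position and velocity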
    initial_states = np.zeros((9, 2))
    cont = 0
    for i in range(-8, 9, 8):
        for j in range(-8, 9, 8):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states, quiet=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))
Example #10
def experiment(algorithm_class, decay_exp):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(pi, mdp.info, **algorithm_params)

    # Algorithm
    start = mdp.convert_to_int(mdp._start, mdp._width)
    collect_max_Q = CollectMaxQ(agent.approximator, start)
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset, collect_max_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
Example #11
def build_high_level_agent(alg, params, mdp, epsilon):
    pi = EpsGreedy(epsilon=epsilon)
    mdp_info_high = MDPInfo(observation_space=spaces.Discrete(16),
                            action_space=spaces.Discrete(4),
                            gamma=mdp.info.gamma,
                            horizon=100)

    agent = alg(pi, mdp_info_high, **params)

    return agent
Example #12
def test_dataset_utils():
    np.random.seed(88)

    mdp = GridWorld(3, 3, (2, 2))
    epsilon = Parameter(value=0.)
    alpha = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    agent = SARSA(pi, mdp.info, alpha)
    core = Core(agent, mdp)

    dataset = core.evaluate(n_episodes=10)

    J = compute_J(dataset, mdp.info.gamma)
    J_test = np.array([
        1.16106307e-03, 2.78128389e-01, 1.66771817e+00, 3.09031544e-01,
        1.19725152e-01, 9.84770902e-01, 1.06111661e-02, 2.05891132e+00,
        2.28767925e+00, 4.23911583e-01
    ])
    assert np.allclose(J, J_test)

    L = episodes_length(dataset)
    L_test = np.array([87, 35, 18, 34, 43, 23, 66, 16, 15, 31])
    assert np.array_equal(L, L_test)

    dataset_ep = select_first_episodes(dataset, 3)
    J = compute_J(dataset_ep, mdp.info.gamma)
    assert np.allclose(J, J_test[:3])

    L = episodes_length(dataset_ep)
    assert np.allclose(L, L_test[:3])

    samples = select_random_samples(dataset, 2)
    s, a, r, ss, ab, last = parse_dataset(samples)
    s_test = np.array([[6.], [1.]])
    a_test = np.array([[0.], [1.]])
    r_test = np.zeros(2)
    ss_test = np.array([[3], [4]])
    ab_test = np.zeros(2)
    last_test = np.zeros(2)
    assert np.array_equal(s, s_test)
    assert np.array_equal(a, a_test)
    assert np.array_equal(r, r_test)
    assert np.array_equal(ss, ss_test)
    assert np.array_equal(ab, ab_test)
    assert np.array_equal(last, last_test)

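    # Cut the dataset halfway through the third episode, so only the two
    # complete episodes contribute to the metrics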
    index = np.sum(L_test[:2]) + L_test[2] // 2
    min_J, max_J, mean_J, n_episodes = compute_metrics(dataset[:index],
                                                       mdp.info.gamma)
    assert min_J == 0.0
    assert max_J == 0.0011610630703530948
    assert mean_J == 0.0005805315351765474
    assert n_episodes == 2
Example #13
def build_high_level_agent(alg, params, mdp):
    epsilon = Parameter(value=0.1)
    pi = EpsGreedy(epsilon=epsilon)
    gamma = 1.0
    mdp_info_agentH = MDPInfo(observation_space=spaces.Discrete(400),
                              action_space=spaces.Discrete(8),
                              gamma=gamma,
                              horizon=10000)

    agent = alg(policy=pi, mdp_info=mdp_info_agentH, **params)

    return agent
Example #14
def learn(alg, alg_params):
    # MDP
    mdp = CartPole()
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Policy
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Approximator
    input_shape = mdp.info.observation_space.shape
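    # CategoricalDQN is given a feature-extractor network; the other DQN
    # variants take the full Q-network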
    approximator_params = dict(
        network=Network if alg is not CategoricalDQN else FeatureNetwork,
        optimizer={
            'class': optim.Adam,
            'params': {
                'lr': .001
            }
        },
        loss=F.smooth_l1_loss,
        input_shape=input_shape,
        output_shape=mdp.info.action_space.size,
        n_actions=mdp.info.action_space.n,
        n_features=2,
        use_cuda=False)

    # Agent
    if alg is not CategoricalDQN:
        agent = alg(TorchApproximator,
                    pi,
                    mdp.info,
                    approximator_params=approximator_params,
                    **alg_params)
    else:
        agent = alg(pi,
                    mdp.info,
                    n_atoms=2,
                    v_min=-1,
                    v_max=1,
                    approximator_params=approximator_params,
                    **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=500, n_steps_per_fit=5)

    return agent.approximator
Example #15
def experiment1(decay_exp, beta_type):
    np.random.seed()

    # MDP
    p = np.load('p.npy')
    rew = np.load('rew.npy')
    mdp = FiniteMDP(p, rew, gamma=.9)

    # Policy
    epsilon = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1,
                                      decay_exp=decay_exp,
                                      size=mdp.info.size)

    if beta_type == 'Win':
        beta = WindowedVarianceIncreasingParameter(value=1,
                                                   size=mdp.info.size,
                                                   tol=10.,
                                                   window=50)
    else:
        beta = VarianceIncreasingParameter(value=1,
                                           size=mdp.info.size,
                                           tol=10.)

    algorithm_params = dict(learning_rate=alpha, beta=beta, off_policy=True)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = RQLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_q = CollectQ(agent.Q)
    collect_lr_1 = CollectParameters(beta, np.array([0]))
    collect_lr_5 = CollectParameters(beta, np.array([4]))
    callbacks = [collect_q, collect_lr_1, collect_lr_5]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=20000, n_steps_per_fit=1, quiet=True)

    Qs = collect_q.get_values()
    lr_1 = collect_lr_1.get_values()
    lr_5 = collect_lr_5.get_values()

    return Qs, lr_1, lr_5
Example #16
def experiment():
    np.random.seed()

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=20)
    agent = FQI(approximator, pi, mdp.info,
                approximator_params=approximator_params, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Render
    core.evaluate(n_episodes=1, render=True)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=1000)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

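    # Evaluate from a 17x17 grid of initial states over position and velocity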
    initial_states = np.zeros((289, 2))
    cont = 0
    for i in range(-8, 9):
        for j in range(-8, 9):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states)

    # Render
    core.evaluate(n_episodes=3, render=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))
Example #17
def experiment(decay_exp, windowed, tol):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1,
                                        decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1,
                                      decay_exp=decay_exp,
                                      size=mdp.info.size)
    if windowed:
        beta = WindowedVarianceIncreasingParameter(value=1,
                                                   size=mdp.info.size,
                                                   tol=tol,
                                                   window=50)
    else:
        beta = VarianceIncreasingParameter(value=1,
                                           size=mdp.info.size,
                                           tol=tol)
    algorithm_params = dict(learning_rate=alpha, beta=beta, off_policy=True)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = RQLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_max_Q = CollectMaxQ(agent.Q,
                                mdp.convert_to_int(mdp._start, mdp._width))
    collect_dataset = CollectDataset()
    callbacks = [collect_max_Q, collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
Example #18
def experiment():
    np.random.seed()

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]

    s1 = np.array([-np.pi, 0, np.pi]) * .25
    s2 = np.array([-1, 0, 1])
    for i in s1:
        for j in s2:
            basis.append(GaussianRBF(np.array([i, j]), np.array([1.])))
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(pi,
                 mdp.info,
                 fit_params=fit_params,
                 approximator_params=approximator_params,
                 features=features)

    # Algorithm
    core = Core(agent, mdp)
    core.evaluate(n_episodes=3, render=True)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    dataset = core.evaluate(n_episodes=1, quiet=True)

    core.evaluate(n_steps=100, render=True)

    return np.mean(episodes_length(dataset))
Example #19
def experiment():
    np.random.seed(3)
    print('hierarchical     :')
    # MDP
    mdp = generate_simple_chain(state_n=5,
                                goal_states=[2],
                                prob=.8,
                                rew=1,
                                gamma=.9)

    # Model Block
    model_block = MBlock(env=mdp, render=False)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = QLearning(pi, mdp.info, agent_params)

    # Control Block
    control_block = ControlBlock(wake_time=1,
                                 agent=agent,
                                 n_eps_per_fit=None,
                                 n_steps_per_fit=1)

    # Algorithm
    blocks = [model_block, control_block]
    order = [0, 1]
    model_block.add_input(control_block)
    control_block.add_input(model_block)
    control_block.add_reward(model_block)
    computational_graph = ComputationalGraph(blocks=blocks, order=order)
    core = HierarchicalCore(computational_graph)

    # Train
    core.learn(n_steps=100, quiet=True)
    return agent.Q.table
Example #20
def experiment():
    np.random.seed(3)
    print('hierarchical     :')

    mdp = GridWorldVanHasselt()

    # Model Block
    model_block = MBlock(env=mdp, render=False)

    # Policy
    epsilon = ExponentialDecayParameter(value=1,
                                        decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1.,
                                              decay_exp=1.,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = QLearning(pi, mdp.info, agent_params)

    # Control Block
    control_block = ControlBlock(name='controller',
                                 agent=agent,
                                 n_steps_per_fit=1)

    # Algorithm
    blocks = [model_block, control_block]
    order = [0, 1]
    model_block.add_input(control_block)
    control_block.add_input(model_block)
    control_block.add_reward(model_block)
    computational_graph = ComputationalGraph(blocks=blocks, order=order)
    core = HierarchicalCore(computational_graph)

    # Train
    core.learn(n_steps=2000, quiet=True)
    return agent.Q.table
Example #21
def experiment(alpha):
    gym.logger.setLevel(0)
    np.random.seed(386)

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=10000, gamma=1.)
    mdp.seed(201)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(alpha)
    tilings = Tiles.generate(10, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate,
                        'lambda': .9}
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = TrueOnlineSARSALambda(pi, mdp.info, agent_params, features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.array([[0., 0.], [.1, .1]])
    dataset = core.evaluate(initial_states=initial_states, quiet=True)

    return np.mean(compute_J(dataset, 1.))
Example #22
def test_collect_parameter():
    np.random.seed(88)
    mdp = GridWorld(3, 3, (2, 2))

    eps = ExponentialParameter(value=1, exp=.5,
                               size=mdp.info.observation_space.size)
    pi = EpsGreedy(eps)
    alpha = Parameter(0.1)
    agent = SARSA(pi, mdp.info, alpha)

    callback_eps = CollectParameters(eps, 1)

    core = Core(agent, mdp, callbacks=[callback_eps])

    core.learn(n_steps=10, n_steps_per_fit=1, quiet=True)

    eps_test = np.array([1., 0.70710678, 0.70710678, 0.57735027, 0.57735027,
                         0.57735027, 0.57735027, 0.57735027, 0.57735027, 0.57735027])
    eps = callback_eps.get()

    assert np.allclose(eps, eps_test)
Example #23
def experiment():
    np.random.seed()

    # MDP
    mdp = InvertedPendulum()

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    rbfs = GaussianRBF.generate(10, [10, 10], mdp.info.observation_space.low,
                                mdp.info.observation_space.high)
    features = Features(basis_list=rbfs)

    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = dict(n_iterations=100)  # iteration count chosen arbitrarily
    fit_params = dict()
    agent_params = {
        'approximator_params': approximator_params,
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = LSPI(pi, mdp.info, agent_params, features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=20)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    dataset = core.evaluate(n_episodes=20)

    return np.mean(compute_J(dataset, 1.))
Example #24
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(alpha)
    tilings = Tiles.generate(10, [10, 10], mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate, 'lambda': .9}
    fit_params = dict()
    agent_params = {
        'approximator_params': approximator_params,
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = TrueOnlineSARSALambda(pi, mdp.info, agent_params, features)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks=callbacks)

    # Train
    core.learn(n_episodes=20, n_steps_per_fit=1, render=False)

    dataset = collect_dataset.get()
    return np.mean(compute_J(dataset, 1.))
Example #25
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    n_tilings = 10
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

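    # Scale the step size by the number of tilings: each update touches one
    # active tile per tiling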
    learning_rate = Parameter(alpha / n_tilings)

    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate, 'lambda_coeff': .9}

    agent = TrueOnlineSARSALambda(pi,
                                  mdp.info,
                                  approximator_params=approximator_params,
                                  features=features,
                                  **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=40, n_steps_per_fit=1, render=False)
    dataset = core.evaluate(n_episodes=1, render=True)

    return np.mean(compute_J(dataset, 1.))
Example #26
def test_collect_dataset():
    np.random.seed(88)
    callback = CollectDataset()

    mdp = GridWorld(4, 4, (2, 2))

    eps = Parameter(0.1)
    pi = EpsGreedy(eps)
    alpha = Parameter(0.2)
    agent = SARSA(pi, mdp.info, alpha)

    core = Core(agent, mdp, callbacks=[callback])

    core.learn(n_steps=10, n_steps_per_fit=1, quiet=True)

    dataset = callback.get()
    assert len(dataset) == 10
    core.learn(n_steps=5, n_steps_per_fit=1, quiet=True)
    assert len(dataset) == 15

    callback.clean()
    dataset = callback.get()
    assert len(dataset) == 0
Example #27
def experiment2():
    np.random.seed(3)
    print('mushroom     :')

    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1,
                                        decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1.,
                                              decay_exp=1.,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    # Visualize the collected dataset
    dataset = collect_dataset.get()
    VisualizeControlBlock(dataset)
    return agent.Q.table
Example #28
def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_game = parser.add_argument_group('Game')
    arg_game.add_argument("--name",
                          type=str,
                          default='BreakoutDeterministic-v4',
                          help='Gym ID of the Atari game.')
    arg_game.add_argument("--screen-width", type=int, default=84,
                          help='Width of the game screen.')
    arg_game.add_argument("--screen-height", type=int, default=84,
                          help='Height of the game screen.')

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size", type=int, default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size", type=int, default=500000,
                         help='Max size of the replay memory.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument("--optimizer",
                         choices=['adadelta',
                                  'adam',
                                  'rmsprop',
                                  'rmspropcentered'],
                         default='adam',
                         help='Name of the optimizer to use to learn.')
    arg_net.add_argument("--learning-rate", type=float, default=.00025,
                         help='Learning rate value of the optimizer. Only used '
                              'in rmspropcentered.')
    arg_net.add_argument("--decay", type=float, default=.95,
                         help='Discount factor for the history coming from the '
                              'gradient momentum in rmspropcentered.')
    arg_net.add_argument("--epsilon", type=float, default=.01,
                         help='Epsilon term used in rmspropcentered')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm", choices=['dqn', 'ddqn', 'adqn'],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard '
                              'DQN, ddqn is for Double DQN and adqn is for '
                              'Averaged DQN.')
    arg_alg.add_argument("--n-approximators", type=int, default=1,
                         help="Number of approximators used in the ensemble for "
                              "Averaged DQN.")
    arg_alg.add_argument("--batch-size", type=int, default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length", type=int, default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency", type=int, default=10000,
                         help='Number of learning steps before each update of '
                              'the target network.')
    arg_alg.add_argument("--evaluation-frequency", type=int, default=250000,
                         help='Number of learning steps before each evaluation. '
                              'This number represents an epoch.')
    arg_alg.add_argument("--train-frequency", type=int, default=4,
                         help='Number of learning steps before each fit of the '
                              'neural network.')
    arg_alg.add_argument("--max-steps", type=int, default=50000000,
                         help='Total number of learning steps.')
    arg_alg.add_argument("--final-exploration-frame", type=int, default=1000000,
                         help='Number of steps until the exploration rate stops '
                              'decreasing.')
    arg_alg.add_argument("--initial-exploration-rate", type=float, default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate", type=float, default=.1,
                         help='Final value of the exploration rate. When it '
                              'reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate", type=float, default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples", type=int, default=125000,
                         help='Number of steps for each evaluation.')
    arg_alg.add_argument("--max-no-op-actions", type=int, default=8,
                         help='Maximum number of no-op actions performed at the '
                              'beginning of the episodes. The minimum number is '
                              'history_length. This number is 30 in the DQN '
                              'Deepmind paper, but they consider the first 30 '
                              'frames without frame skipping.')
    arg_alg.add_argument("--no-op-action-value", type=int, default=0,
                         help='Value of the no-op action.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--load-path', type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--save', action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--render', action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet', action='store_true',
                           help='Flag specifying whether to hide the progress '
                                'bar.')
    arg_utils.add_argument('--debug', action='store_true',
                           help='Flag specifying whether the script has to be '
                                'run in debug mode.')

    args = parser.parse_args()

    scores = list()

    # Evaluation of the model provided by the user.
    if args.load_path:
        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=False)

        # Policy
        epsilon_test = Parameter(value=args.test_exploration_rate)
        pi = EpsGreedy(epsilon=epsilon_test)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            name='test',
            load_path=args.load_path,
            optimizer={'name': args.optimizer,
                       'lr': args.learning_rate,
                       'decay': args.decay,
                       'epsilon': args.epsilon}
        )

        approximator = ConvNet

        # Agent
        algorithm_params = dict(
            batch_size=1,
            train_frequency=1,
            target_update_frequency=1,
            initial_replay_size=0,
            max_replay_size=0,
            history_length=args.history_length,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value,
            dtype=np.uint8
        )
        agent = DQN(approximator, pi, mdp.info,
                    approximator_params=approximator_params, **algorithm_params)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        pi.set_epsilon(epsilon_test)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset)
    else:
        # DQN learning run

        # Summary folder
        folder_name = './logs/atari_' + args.algorithm + '_' + args.name +\
            '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

        # Settings
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=True)

        # Policy
        epsilon = LinearDecayParameter(value=args.initial_exploration_rate,
                                       min_value=args.final_exploration_rate,
                                       n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            folder_name=folder_name,
            optimizer={'name': args.optimizer,
                       'lr': args.learning_rate,
                       'decay': args.decay,
                       'epsilon': args.epsilon}
        )

        approximator = ConvNet

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            n_approximators=args.n_approximators,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            history_length=args.history_length,
            train_frequency=train_frequency,
            target_update_frequency=target_update_frequency,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value,
            dtype=np.uint8
        )

        if args.algorithm == 'dqn':
            agent = DQN(approximator, pi, mdp.info,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            agent = DoubleDQN(approximator, pi, mdp.info,
                              approximator_params=approximator_params,
                              **algorithm_params)
        elif args.algorithm == 'adqn':
            agent = AveragedDQN(approximator, pi, mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size, quiet=args.quiet)

        if args.save:
            agent.approximator.model.save()

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        if args.algorithm == 'ddqn':
            agent.policy.set_q(agent.target_approximator)
        mdp.set_episode_end(False)
        dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset))
        if args.algorithm == 'ddqn':
            agent.policy.set_q(agent.approximator)

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            mdp.set_episode_end(True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency, quiet=args.quiet)

            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            if args.algorithm == 'ddqn':
                agent.policy.set_q(agent.target_approximator)
            mdp.set_episode_end(False)
            dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset))
            if args.algorithm == 'ddqn':
                agent.policy.set_q(agent.approximator)

            np.save(folder_name + '/scores.npy', scores)

    return scores
Example #29
def experiment_ghavamzade(alg_high, alg_low, params, subdir, i):

    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=False, n_steps_action=3)

    # State placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last-action placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # FeaturesH
    low_hi = 0
    lim_hi = 1000 + 1e-8
    n_tiles_high = [20, 20]
    n_tilings = 1

    # Discretization Block
    discretization_block = DiscretizationBlock(low=low_hi,
                                               high=lim_hi,
                                               n_tiles=n_tiles_high)

    # PolicyH
    epsilon = Parameter(value=0.1)
    piH = EpsGreedy(epsilon=epsilon)

    # AgentH
    learning_rate = params.get('learning_rate_high')

    mdp_info_agentH = MDPInfo(observation_space=spaces.Discrete(
        n_tiles_high[0] * n_tiles_high[1]),
                              action_space=spaces.Discrete(8),
                              gamma=1,
                              horizon=10000)

    agentH = alg_high(policy=piH,
                      mdp_info=mdp_info_agentH,
                      learning_rate=learning_rate,
                      lambda_coeff=0.9)

    epsilon_update = EpsilonUpdate(piH)

    # Control Block H
    control_blockH = ControlBlock(name='control block H',
                                  agent=agentH,
                                  n_steps_per_fit=1)

    # FeaturesL
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 10]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 3

    tilingsL = Tiles.generate(n_tilings=n_tilings,
                              n_tiles=n_tiles,
                              low=low,
                              high=high)

    featuresL = Features(tilings=tilingsL)

    mdp_info_agentL = MDPInfo(observation_space=spaces.Box(
        low=np.array([0, 0]), high=np.array([150, 150]), shape=(2, )),
                              action_space=mdp.info.action_space,
                              gamma=0.99,
                              horizon=10000)

    # Approximators
    input_shape = (featuresL.size, )

    approximator_params = dict(input_dim=input_shape[0])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=input_shape,
                              output_shape=mdp.info.action_space.shape,
                              **approximator_params)
    approximator2 = Regressor(LinearApproximator,
                              input_shape=input_shape,
                              output_shape=mdp.info.action_space.shape,
                              **approximator_params)

    # Policy1
    std1 = np.array([3e-2])
    pi1 = DiagonalGaussianPolicy(mu=approximator1, std=std1)

    # Policy2
    std2 = np.array([3e-2])
    pi2 = DiagonalGaussianPolicy(mu=approximator2, std=std2)

    # Agent1
    learning_rate1 = params.get('learning_rate_low')
    agent1 = alg_low(pi1, mdp_info_agentL, learning_rate1, featuresL)

    # Agent2
    learning_rate2 = params.get('learning_rate_low')
    agent2 = alg_low(pi2, mdp_info_agentL, learning_rate2, featuresL)

    # Termination conditions
    termination_condition1 = TerminationCondition(active_dir='+')
    termination_condition2 = TerminationCondition(active_dir='x')

    low_ep_per_fit = params.get('low_ep_per_fit')

    # Control Block +
    control_block_plus = ControlBlock(
        name='control block 1',
        agent=agent1,
        n_eps_per_fit=low_ep_per_fit,
        termination_condition=termination_condition1)

    # Control Block x
    control_block_cross = ControlBlock(
        name='control block 2',
        agent=agent2,
        n_eps_per_fit=low_ep_per_fit,
        termination_condition=termination_condition2)

    # Function Block 1: picks the state for the high-level controller
    function_block1 = fBlock(phi=pick_state, name='f1 pickstate')

    # Function Block 2: maps the environment state to the low-level controller state
    function_block2 = fBlock(phi=rototranslate, name='f2 rotot')

    # Function Block 3: holds the current state as reference
    function_block3 = hold_state(name='f3 holdstate')

    # Function Block 4: adds the high-level rewards
    function_block4 = addBlock(name='f4 add')

    # Function Block 5: adds the low-level rewards
    function_block5 = addBlock(name='f5 add')

    # Function Block 6: external reward of the high-level controller
    function_block6 = fBlock(phi=G_high, name='f6 G_hi')

    # Function Block 7: external reward of the low-level controller
    function_block7 = fBlock(phi=G_low, name='f7 G_lo')

    # Reward accumulator H
    reward_acc_H = reward_accumulator_block(gamma=mdp_info_agentH.gamma,
                                            name='reward_acc_H')

    # Selector Block
    function_block8 = fBlock(phi=selector_function, name='f8 selector')

    # Mux block
    mux_block = MuxBlock(name='mux')
    mux_block.add_block_list([control_block_plus])
    mux_block.add_block_list([control_block_cross])

    # Algorithm
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_blockH, mux_block,
        function_block1, function_block2, function_block3, function_block4,
        function_block5, function_block6, function_block7, function_block8,
        reward_acc_H, discretization_block
    ]

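    # Wire the computational graph: inputs, rewards and alarm connections
    # between the placeholders, control blocks and function blocks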
    reward_acc_H.add_input(reward_ph)
    reward_acc_H.add_alarm_connection(control_block_plus)
    reward_acc_H.add_alarm_connection(control_block_cross)

    control_blockH.add_input(discretization_block)
    control_blockH.add_reward(function_block4)
    control_blockH.add_alarm_connection(control_block_plus)
    control_blockH.add_alarm_connection(control_block_cross)

    mux_block.add_input(function_block8)
    mux_block.add_input(function_block2)

    control_block_plus.add_reward(function_block5)
    control_block_cross.add_reward(function_block5)

    function_block1.add_input(state_ph)

    function_block2.add_input(control_blockH)
    function_block2.add_input(state_ph)
    function_block2.add_input(function_block3)

    function_block3.add_input(state_ph)
    function_block3.add_alarm_connection(control_block_plus)
    function_block3.add_alarm_connection(control_block_cross)

    function_block4.add_input(function_block6)
    function_block4.add_input(reward_acc_H)

    function_block5.add_input(function_block7)

    function_block6.add_input(reward_ph)

    function_block7.add_input(control_blockH)
    function_block7.add_input(function_block2)

    function_block8.add_input(control_blockH)

    discretization_block.add_input(function_block1)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    low_level_dataset_eval1 = list()
    low_level_dataset_eval2 = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    dataset_eval += dataset_eval_run
    print('J at start : ' + str(np.mean(J)))
    for n in range(n_runs):
        print('ITERATION', n)

        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        dataset_eval += dataset_eval_run

        dataset_plus = control_block_plus.dataset.get()
        J_plus = compute_J(dataset_plus, mdp.info.gamma)
        dataset_cross = control_block_cross.dataset.get()
        J_cross = compute_J(dataset_cross, mdp.info.gamma)

        low_level_dataset_eval1.append(dataset_plus)
        low_level_dataset_eval2.append(dataset_cross)

        print('J ll PLUS at iteration  ' + str(n) + ': ' +
              str(np.mean(J_plus)))
        print('J ll CROSS at iteration ' + str(n) + ': ' +
              str(np.mean(J_cross)))
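        # From the fifth iteration onward, attach the epsilon-update callback
        # to the high-level control block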
        if n == 4:
            control_blockH.callbacks = [epsilon_update]

    # Tile data
    hi_lev_params = agentH.Q.table
    max_q_val = np.zeros(n_tiles_high[0]**2)
    act_max_q_val = np.zeros(n_tiles_high[0]**2)
    for n in range(n_tiles_high[0]**2):
        max_q_val[n] = np.amax(hi_lev_params[n])
        act_max_q_val[n] = np.argmax(hi_lev_params[n])

    mk_dir_recursive('./' + subdir + str(i))

    np.save(subdir + str(i) + '/low_level_dataset1_file',
            low_level_dataset_eval1)
    np.save(subdir + str(i) + '/low_level_dataset2_file',
            low_level_dataset_eval2)
    np.save(subdir + str(i) + '/max_q_val_tiled_file', max_q_val)
    np.save(subdir + str(i) + '/act_max_q_val_tiled_file', act_max_q_val)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)

    return
Example #30
def experiment(n_epochs, n_steps, n_steps_test):
    np.random.seed()

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    epsilon = LinearDecayParameter(value=1., min_value=.01, n=5000)
    epsilon_test = Parameter(value=0.)
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    target_update_frequency = 100
    batch_size = 200
    n_features = 80
    train_frequency = 1

    # Approximator
    input_shape = mdp.info.observation_space.shape
    approximator_params = dict(network=Network,
                               optimizer={
                                   'class': optim.Adam,
                                   'params': {
                                       'lr': .001
                                   }
                               },
                               loss=F.smooth_l1_loss,
                               n_features=n_features,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n)

    # Agent
    agent = DQN(PyTorchApproximator,
                pi,
                mdp.info,
                approximator_params=approximator_params,
                batch_size=batch_size,
                n_approximators=1,
                initial_replay_size=initial_replay_size,
                max_replay_size=max_replay_size,
                target_update_frequency=target_update_frequency)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    # RUN
    pi.set_epsilon(epsilon_test)
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = compute_J(dataset, gamma_eval)
    print('J: ', np.mean(J))

    for n in range(n_epochs):
        print('Epoch: ', n)
        pi.set_epsilon(epsilon)
        core.learn(n_steps=n_steps, n_steps_per_fit=train_frequency)
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma_eval)
        print('J: ', np.mean(J))

    print('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)
Example #31
def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_game = parser.add_argument_group('Game')
    arg_game.add_argument("--name",
                          type=str,
                          default='BreakoutDeterministic-v4',
                          help='Gym ID of the Atari game.')
    arg_game.add_argument("--screen-width", type=int, default=84,
                          help='Width of the game screen.')
    arg_game.add_argument("--screen-height", type=int, default=84,
                          help='Height of the game screen.')

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size", type=int, default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size", type=int, default=500000,
                         help='Max size of the replay memory.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument("--optimizer",
                         choices=['adadelta',
                                  'adam',
                                  'rmsprop',
                                  'rmspropcentered'],
                         default='adam',
                         help='Name of the optimizer to use.')
    arg_net.add_argument("--learning-rate", type=float, default=.00025,
                         help='Learning rate value of the optimizer.')
    arg_net.add_argument("--decay", type=float, default=.95,
                         help='Discount factor for the history coming from the '
                              'gradient momentum in rmspropcentered and '
                              'rmsprop.')
    arg_net.add_argument("--epsilon", type=float, default=.01,
                         help='Epsilon term used in rmspropcentered and '
                              'rmsprop.')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm", choices=['dqn', 'ddqn', 'adqn'],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard '
                              'DQN, ddqn is for Double DQN and adqn is for '
                              'Averaged DQN.')
    arg_alg.add_argument("--n-approximators", type=int, default=1,
                         help="Number of approximators used in the ensemble for "
                              "Averaged DQN.")
    arg_alg.add_argument("--batch-size", type=int, default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length", type=int, default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency", type=int, default=10000,
                         help='Number of collected samples before each update '
                              'of the target network.')
    arg_alg.add_argument("--evaluation-frequency", type=int, default=250000,
                         help='Number of collected samples before each '
                              'evaluation. An epoch ends after this number of '
                              'steps.')
    arg_alg.add_argument("--train-frequency", type=int, default=4,
                         help='Number of collected samples before each fit of '
                              'the neural network.')
    arg_alg.add_argument("--max-steps", type=int, default=50000000,
                         help='Total number of collected samples.')
    arg_alg.add_argument("--final-exploration-frame", type=int, default=1000000,
                         help='Number of collected samples until the exploration '
                              'rate stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate", type=float, default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate", type=float, default=.1,
                         help='Final value of the exploration rate. When it '
                              'reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate", type=float, default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples", type=int, default=125000,
                         help='Number of collected samples for each '
                              'evaluation.')
    arg_alg.add_argument("--max-no-op-actions", type=int, default=8,
                         help='Maximum number of no-op actions performed at the '
                              'beginning of the episodes. The minimum number is '
                              'history_length. This number is reported to be 30 '
                              'in the DeepMind DQN paper but, since they '
                              'consider the first 30 frames without frame '
                              'skipping and the number of skipped frames is '
                              'generally 4, we set it to 8.')
    arg_alg.add_argument("--no-op-action-value", type=int, default=0,
                         help='Value of the no-op action.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--device', type=int, default=None,
                           help='ID of the GPU device to use. If None, CPU is '
                                'used.')
    arg_utils.add_argument('--load-path', type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--save', action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--render', action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet', action='store_true',
                           help='Flag specifying whether to hide the progress '
                                'bar.')
    arg_utils.add_argument('--debug', action='store_true',
                           help='Flag specifying whether the script has to be '
                                'run in debug mode.')

    args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if args.optimizer == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=args.learning_rate)
    elif args.optimizer == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=args.learning_rate)
    elif args.optimizer == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon,
                                   centered=True)
    else:
        raise ValueError('Unsupported optimizer: ' + args.optimizer)

    # Evaluation of the model provided by the user.
    if args.load_path:
        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=False)

        # Policy
        epsilon_test = Parameter(value=args.test_exploration_rate)
        pi = EpsGreedy(epsilon=epsilon_test)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            network=Network,
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            load_path=args.load_path,
            optimizer=optimizer,
            loss=F.smooth_l1_loss,
            device=args.device
        )

        approximator = PyTorchApproximator

        # Agent
        algorithm_params = dict(
            batch_size=1,
            train_frequency=1,
            target_update_frequency=1,
            initial_replay_size=0,
            max_replay_size=0,
            history_length=args.history_length,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value,
            dtype=np.uint8
        )
        agent = DQN(approximator, pi, mdp.info,
                    approximator_params=approximator_params, **algorithm_params)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        pi.set_epsilon(epsilon_test)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset)
    else:
        # DQN learning run

        # Summary folder
        folder_name = './logs/atari_' + args.algorithm + '_' + args.name +\
            '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        pathlib.Path(folder_name).mkdir(parents=True)

        # Settings
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=True)

        # Policy
        epsilon = LinearDecayParameter(value=args.initial_exploration_rate,
                                       min_value=args.final_exploration_rate,
                                       n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            network=Network,
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            folder_name=folder_name,
            optimizer=optimizer,
            loss=F.smooth_l1_loss,
            device=args.device
        )

        approximator = PyTorchApproximator

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            n_approximators=args.n_approximators,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            history_length=args.history_length,
            target_update_frequency=target_update_frequency//train_frequency,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value,
            dtype=np.uint8
        )

        if args.algorithm == 'dqn':
            agent = DQN(approximator, pi, mdp.info,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            agent = DoubleDQN(approximator, pi, mdp.info,
                              approximator_params=approximator_params,
                              **algorithm_params)
        elif args.algorithm == 'adqn':
            agent = AveragedDQN(approximator, pi, mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size, quiet=args.quiet)

        if args.save:
            agent.approximator.model.save()

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        if args.algorithm == 'ddqn':
            agent.policy.set_q(agent.target_approximator)
        mdp.set_episode_end(False)
        dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset))
        if args.algorithm == 'ddqn':
            agent.policy.set_q(agent.approximator)

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            mdp.set_episode_end(True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency, quiet=args.quiet)

            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            if args.algorithm == 'ddqn':
                agent.policy.set_q(agent.target_approximator)
            mdp.set_episode_end(False)
            dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset))
            if args.algorithm == 'ddqn':
                agent.policy.set_q(agent.approximator)

            np.save(folder_name + '/scores.npy', scores)

    return scores
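
If the original file is meant to be run as a standalone script (which the argparse usage suggests), an entry point along these lines would complete it:

# Hypothetical entry point (not in the original snippet)
if __name__ == '__main__':
    experiment()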
Example #32
def experiment(n_epochs, n_steps, n_steps_test):
    np.random.seed()

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    epsilon = LinearDecayParameter(value=1., min_value=.01, n=5000)
    epsilon_test = Parameter(value=0.)
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    target_update_frequency = 100
    batch_size = 200
    n_features = 80
    train_frequency = 1

    # Approximator
    input_shape = (1,) + mdp.info.observation_space.shape
    approximator_params = dict(network=Network,
                               optimizer={'class': optim.Adam,
                                          'params': {'lr': .001}},
                               loss=F.smooth_l1_loss, n_features=n_features,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n)

    # Agent
    agent = DQN(PyTorchApproximator, pi, mdp.info,
                approximator_params=approximator_params, batch_size=batch_size,
                n_approximators=1, initial_replay_size=initial_replay_size,
                max_replay_size=max_replay_size, history_length=1,
                target_update_frequency=target_update_frequency,
                max_no_op_actions=0, no_op_action_value=0, dtype=np.float32)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size, n_steps_per_fit=initial_replay_size)

    # RUN
    pi.set_epsilon(epsilon_test)
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = compute_J(dataset, gamma_eval)
    print('J: ', np.mean(J))

    for n in range(n_epochs):
        print('Epoch: ', n)
        pi.set_epsilon(epsilon)
        core.learn(n_steps=n_steps, n_steps_per_fit=train_frequency)
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma_eval)
        print('J: ', np.mean(J))

    print('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)
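
A typical invocation of this experiment; the argument values below are illustrative and not taken from the original script:

# Hypothetical invocation (values are illustrative, not from the original)
if __name__ == '__main__':
    experiment(n_epochs=20, n_steps=1000, n_steps_test=1000)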
Example #33
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor

from mushroom.algorithms.value import FQI
from mushroom.core import Core
from mushroom.environments import CarOnHill
from mushroom.policy import EpsGreedy
from mushroom.utils.dataset import compute_J
from mushroom.utils.parameters import Parameter

# MDP
mdp = CarOnHill()

# Policy
epsilon = Parameter(value=1.)
pi = EpsGreedy(epsilon=epsilon)

# Approximator
approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                           n_actions=mdp.info.action_space.n,
                           n_estimators=50,
                           min_samples_split=5,
                           min_samples_leaf=2)
approximator = ExtraTreesRegressor

# Agent
agent = FQI(approximator, pi, mdp.info, n_iterations=20,
            approximator_params=approximator_params)

core = Core(agent, mdp)

core.learn(n_episodes=1000, n_episodes_per_fit=1000)
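
The snippet stops after fitting, even though compute_J is imported. A minimal evaluation of the learned policy, following the pattern of the other examples (the greedy evaluation with a zero-epsilon Parameter is an assumption, not part of the original snippet):

# Assumed evaluation step (not in the original snippet): act greedily and
# measure the discounted return with compute_J.
pi.set_epsilon(Parameter(value=0.))
dataset = core.evaluate(n_episodes=20, render=False)
J = compute_J(dataset, mdp.info.gamma)
print('J:', np.mean(J))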