Example #1
def flat_experiment(mdp, agent, n_epochs, n_iterations,
                    ep_per_iteration, ep_per_eval):
    np.random.seed()

    J_list = list()
    L_list = list()
    core = Core(agent, mdp)

    # Evaluate before any learning to record the initial performance
    dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    L = episodes_length(dataset)
    L_list.append(np.mean(L))

    for n in range(n_epochs):
        # One epoch = n_iterations policy fits, each on ep_per_iteration
        # freshly collected episodes
        core.learn(n_episodes=n_iterations * ep_per_iteration,
                   n_episodes_per_fit=ep_per_iteration, quiet=True)
        dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        L = episodes_length(dataset)
        L_list.append(np.mean(L))

    return J_list, L_list
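These snippets assume that numpy and the MushroomRL primitives (Core, compute_J, episodes_length) are already imported. A hypothetical driver for flat_experiment, together with the imports it relies on, is sketched below; the module paths follow the mushroom_rl package layout and are an assumption (the snippets may target the older mushroom layout, where paths and constructor argument order differ), the epoch/episode counts are placeholders, and the environment/agent construction is the CartPole/LSPI setup copied from Example #6 further down.

import numpy as np

from mushroom_rl.core import Core
from mushroom_rl.environments import CartPole
from mushroom_rl.policy import EpsGreedy
from mushroom_rl.algorithms.value import LSPI
from mushroom_rl.features import Features
from mushroom_rl.features.basis import PolynomialBasis, GaussianRBF
from mushroom_rl.utils.dataset import compute_J, episodes_length
from mushroom_rl.utils.parameters import Parameter

# Environment, exploration policy and LSPI agent, built as in Example #6
mdp = CartPole()
pi = EpsGreedy(epsilon=Parameter(value=1.))

basis = [PolynomialBasis()]
for i in np.array([-np.pi, 0, np.pi]) * .25:
    for j in np.array([-1, 0, 1]):
        basis.append(GaussianRBF(np.array([i, j]), np.array([1.])))
features = Features(basis_list=basis)

approximator_params = dict(input_shape=(features.size,),
                           output_shape=(mdp.info.action_space.n,),
                           n_actions=mdp.info.action_space.n)
agent = LSPI(pi, mdp.info, fit_params=dict(),
             approximator_params=approximator_params, features=features)

# Hypothetical epoch/episode counts
J_list, L_list = flat_experiment(mdp, agent, n_epochs=10, n_iterations=5,
                                 ep_per_iteration=20, ep_per_eval=20)
print('Average discounted return per epoch:', J_list)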
Example #2
def hierarchical_experiment(mdp, agent_low, agent_high, n_epochs, n_episodes,
                            ep_per_fit_low, ep_per_fit_high, ep_per_eval):
    np.random.seed()

    computational_graph, control_block_h = build_computational_graph(
        mdp, agent_low, agent_high, ep_per_fit_low, ep_per_fit_high)

    core = HierarchicalCore(computational_graph)
    J_list = list()
    L_list = list()

    dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    L = episodes_length(dataset)
    L_list.append(np.mean(L))

    for n in range(n_epochs):
        if n == 2:
            # After the first two epochs, remove the mask from the
            # high-level control block
            control_block_h.unset_mask()
        core.learn(n_episodes=n_episodes, skip=True, quiet=True)
        dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        L = episodes_length(dataset)
        L_list.append(np.mean(L))

    return J_list, L_list
Example #3
def ghavamzadeh_experiment(mdp, agent_plus, agent_cross, agent_high, n_epochs,
                           n_episodes, ep_per_eval, ep_per_iteration_low):
    np.random.seed()

    computational_graph, control_blockH = build_ghavamzadeh_graph(
        mdp, agent_plus, agent_cross, agent_high, ep_per_iteration_low)

    core = HierarchicalCore(computational_graph)
    J_list = list()
    L_list = list()

    epsilon_update = EpsilonUpdate(agent_high.policy)

    dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    L = episodes_length(dataset)
    L_list.append(np.mean(L))

    for n in range(n_epochs):
        core.learn(n_episodes=n_episodes, skip=True, quiet=True)
        dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        L = episodes_length(dataset)
        L_list.append(np.mean(L))

        if n == 4:
            # After the fifth epoch, attach the epsilon-update callback to
            # the high-level control block
            control_blockH.callbacks = [epsilon_update]

    return J_list, L_list
Example #4
def discretized_experiment(mdp, agent, n_actions, n_epochs, n_episodes,
                           ep_per_eval, display, print_j, quiet):
    np.random.seed()

    computational_graph = build_computational_graph_discretized(
        mdp, agent, n_actions)

    core = HierarchicalCore(computational_graph)
    J_list = list()
    L_list = list()

    dataset = core.evaluate(n_episodes=ep_per_eval, quiet=quiet)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    L = episodes_length(dataset)
    L_list.append(np.mean(L))
    if print_j:
        print('Reward at start :', J_list[-1])

    for n in range(n_epochs):
        core.learn(n_episodes=n_episodes, skip=True, quiet=quiet)
        dataset = core.evaluate(n_episodes=ep_per_eval, quiet=quiet)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        L = episodes_length(dataset)
        L_list.append(np.mean(L))

        if print_j:
            print('Reward at epoch ', n, ':', J_list[-1])

        if display:
            core.evaluate(n_episodes=1, render=True)

    return J_list, L_list
Example #5
def experiment(mdp, agent_high, agent_low, n_epochs, n_episodes, ep_per_eval,
               ep_per_fit_low, display, print_j, quiet):
    np.random.seed()

    dataset_callback = CollectDataset()

    computational_graph = build_computational_graph(mdp, agent_low, agent_high,
                                                    ep_per_fit_low,
                                                    [dataset_callback])

    core = HierarchicalCore(computational_graph)
    J_list = list()
    L_list = list()

    dataset = core.evaluate(n_episodes=ep_per_eval, quiet=quiet)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    J_low_list = list()
    L = episodes_length(dataset)
    L_list.append(np.mean(L))
    if print_j:
        print('Reward at start :', J_list[-1])

    for n in range(n_epochs):
        core.learn(n_episodes=n_episodes, skip=True, quiet=quiet)

        # Low-level transitions collected by the callback during learning
        ll_dataset = dataset_callback.get()
        dataset_callback.clean()
        J_low = compute_J(ll_dataset, mdp.info.gamma)
        J_low_list.append(np.mean(J_low))
        if print_j:
            print('Low level reward at epoch', n, ':', np.mean(J_low))

        dataset = core.evaluate(n_episodes=ep_per_eval, quiet=quiet)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        L = episodes_length(dataset)
        L_list.append(np.mean(L))

        if print_j:
            print('Reward at epoch ', n, ':', J_list[-1])

        if display:
            core.evaluate(n_episodes=1, render=True)

    return J_list, L_list, J_low_list
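Each of these experiment helpers returns one J/L entry for the initial evaluation plus one per training epoch. A minimal sketch for plotting the resulting learning curves is shown below; matplotlib is an assumption here and is not used by the original snippets.

import matplotlib.pyplot as plt

def plot_learning_curves(J_list, L_list):
    # Index 0 is the evaluation before learning, then one point per epoch
    epochs = range(len(J_list))

    fig, (ax_j, ax_l) = plt.subplots(1, 2, figsize=(10, 4))
    ax_j.plot(epochs, J_list)
    ax_j.set_xlabel('epoch')
    ax_j.set_ylabel('average discounted return J')
    ax_l.plot(epochs, L_list)
    ax_l.set_xlabel('epoch')
    ax_l.set_ylabel('average episode length')
    fig.tight_layout()
    plt.show()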
Example #6
def experiment():
    np.random.seed()

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]

    s1 = np.array([-np.pi, 0, np.pi]) * .25
    s2 = np.array([-1, 0, 1])
    for i in s1:
        for j in s2:
            basis.append(GaussianRBF(np.array([i, j]), np.array([1.])))
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(pi,
                 mdp.info,
                 fit_params=fit_params,
                 approximator_params=approximator_params,
                 features=features)

    # Algorithm
    core = Core(agent, mdp)
    # Show a few episodes of the initial (untrained) policy
    core.evaluate(n_episodes=3, render=True)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    # Test: switch to a greedy policy (epsilon = 0) for evaluation
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    dataset = core.evaluate(n_episodes=1, quiet=True)

    core.evaluate(n_steps=100, render=True)

    return np.mean(episodes_length(dataset))
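Since this last experiment takes no arguments, it can be run directly; a minimal entry point (not part of the original listing) would be:

if __name__ == '__main__':
    mean_length = experiment()
    print('Average episode length of the greedy policy:', mean_length)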