Example #1
def experiment(algorithm_class, decay_exp):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(pi, mdp.info, **algorithm_params)

    # Algorithm
    start = mdp.convert_to_int(mdp._start, mdp._width)
    collect_max_Q = CollectMaxQ(agent.approximator, start)
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset, collect_max_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
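
No driver is shown in the example, but a function like this is usually dispatched many times in parallel and the returned traces averaged across runs. A minimal driver sketch, assuming QLearning and DoubleQLearning are importable from mushroom's value-based algorithms and that joblib is available (run count and decay exponents are illustrative):

from joblib import Parallel, delayed
import numpy as np

# Assumed import path; adjust to the installed mushroom version.
from mushroom.algorithms.value import QLearning, DoubleQLearning

n_runs = 100  # illustrative
for alg, decay_exp in [(QLearning, .51), (DoubleQLearning, 1.)]:
    out = Parallel(n_jobs=-1)(
        delayed(experiment)(alg, decay_exp) for _ in range(n_runs))
    rewards = np.array([o[0] for o in out])  # per-run reward traces
    max_Qs = np.array([o[1] for o in out])   # per-run max-Q traces
    print(alg.__name__, rewards.mean(), max_Qs[:, -1].mean())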
Example #2
def experiment_others(alg, decay_exp):
    np.random.seed()

    # MDP
    grid_map = "simple_gridmap.txt"
    mdp = GridWorldGenerator(grid_map=grid_map)

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                      size=mdp.info.size)

    algorithm_params = dict(learning_rate=alpha)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = alg(pi, mdp.info, agent_params)

    # Algorithm
    start = mdp.convert_to_int(mdp._start, mdp._width)
    collect_max_Q = CollectMaxQ(agent.Q, start)
    collect_dataset = CollectDataset()
    callbacks = [collect_max_Q, collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
Example #3
def experiment(algorithm_class, decay_exp):
    np.random.seed()

    # MDP
    p = np.load('chain_structure/p.npy')
    rew = np.load('chain_structure/rew.npy')
    mdp = FiniteMDP(p, rew, gamma=.9)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1.,
                                              decay_exp=decay_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(pi, mdp.info, **algorithm_params)

    # Algorithm
    collect_Q = CollectQ(agent.approximator)
    callbacks = [collect_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=20000, n_steps_per_fit=1, quiet=True)

    Qs = collect_Q.get_values()

    return Qs
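
The per-step Q tables returned by CollectQ can then be averaged over runs or plotted directly; a minimal plotting sketch, assuming CollectQ stores one table snapshot per step and QLearning is imported from mushroom (decay exponent and indices are illustrative):

import numpy as np
import matplotlib.pyplot as plt

Qs = np.array(experiment(QLearning, .51))  # assumed shape: (n_steps, n_states, n_actions)
plt.plot(Qs[:, 0, 0])  # Q(s=0, a=0) over training steps
plt.xlabel('step')
plt.ylabel('Q')
plt.show()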
Example #4
def experiment(decay_exp, windowed, tol):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1,
                                        decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1,
                                      decay_exp=decay_exp,
                                      size=mdp.info.size)
    if windowed:
        beta = WindowedVarianceIncreasingParameter(value=1,
                                                   size=mdp.info.size,
                                                   tol=tol,
                                                   window=50)
    else:
        beta = VarianceIncreasingParameter(value=1,
                                           size=mdp.info.size,
                                           tol=tol)
    algorithm_params = dict(learning_rate=alpha, beta=beta, off_policy=True)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = RQLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_max_Q = CollectMaxQ(agent.Q,
                                mdp.convert_to_int(mdp._start, mdp._width))
    collect_dataset = CollectDataset()
    callbacks = [collect_max_Q, collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
Example #5
def experiment(n_epochs, n_steps, n_eval_episodes):
    np.random.seed()

    # MDP
    mdp = InvertedPendulum()

    # Agent
    n_tilings = 10
    alpha_theta = ExponentialDecayParameter(1, decay_exp=1.0)
    alpha_omega = ExponentialDecayParameter(1.5 / n_tilings, decay_exp=2 / 3)
    alpha_v = ExponentialDecayParameter(1 / n_tilings, decay_exp=2 / 3)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size, )

    mu = Regressor(LinearApproximator,
                   input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-3 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy,
                     mu,
                     mdp.info,
                     alpha_theta,
                     alpha_omega,
                     alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=n_eval_episodes)
    J = compute_J(dataset_eval, gamma=1.0)
    print('Total Reward per episode at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset_eval = core.evaluate(n_episodes=n_eval_episodes, render=False)
        J = compute_J(dataset_eval, gamma=1.0)
        print('Total Reward per episode at iteration ' + str(i) + ': ' +
              str(np.mean(J)))
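
A direct invocation sketch for this actor-critic experiment (epoch and step counts are illustrative):

if __name__ == '__main__':
    experiment(n_epochs=20, n_steps=1000, n_eval_episodes=10)

Note that COPDAC_Q here shares the tile-coding features phi between the value function and the policy, so a single Features object is passed for both.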
Example #6
def experiment():
    np.random.seed(3)
    print('hierarchical     :')

    mdp = GridWorldVanHasselt()

    # Model Block
    model_block = MBlock(env=mdp, render=False)

    # Policy
    epsilon = ExponentialDecayParameter(value=1,
                                        decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1.,
                                              decay_exp=1.,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = QLearning(pi, mdp.info, agent_params)

    # Control Block
    control_block = ControlBlock(name='controller',
                                 agent=agent,
                                 n_steps_per_fit=1)

    # Algorithm
    blocks = [model_block, control_block]
    order = [0, 1]
    model_block.add_input(control_block)
    control_block.add_input(model_block)
    control_block.add_reward(model_block)
    computational_graph = ComputationalGraph(blocks=blocks, order=order)
    core = HierarchicalCore(computational_graph)

    # Train
    core.learn(n_steps=2000, quiet=True)
    return agent.Q.table
Example #7
def experiment1(decay_exp, beta_type):
    np.random.seed()

    # MDP
    p = np.load('p.npy')
    rew = np.load('rew.npy')
    mdp = FiniteMDP(p, rew, gamma=.9)

    # Policy
    epsilon = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1,
                                      decay_exp=decay_exp,
                                      size=mdp.info.size)

    if beta_type == 'Win':
        beta = WindowedVarianceIncreasingParameter(value=1,
                                                   size=mdp.info.size,
                                                   tol=10.,
                                                   window=50)
    else:
        beta = VarianceIncreasingParameter(value=1,
                                           size=mdp.info.size,
                                           tol=10.)

    algorithm_params = dict(learning_rate=alpha, beta=beta, off_policy=True)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = RQLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_q = CollectQ(agent.Q)
    collect_lr_1 = CollectParameters(beta, np.array([0]))
    collect_lr_5 = CollectParameters(beta, np.array([4]))
    callbacks = [collect_q, collect_lr_1, collect_lr_5]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=20000, n_steps_per_fit=1, quiet=True)

    Qs = collect_q.get_values()
    lr_1 = collect_lr_1.get_values()
    lr_5 = collect_lr_5.get_values()

    return Qs, lr_1, lr_5
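
The two beta schedules can be compared by plotting the collected learning-rate traces for the two monitored states; a sketch, assuming matplotlib is available (the decay exponent is illustrative):

import matplotlib.pyplot as plt

Qs, lr_1, lr_5 = experiment1(decay_exp=.51, beta_type='Win')
plt.plot(lr_1, label='beta at state 0')
plt.plot(lr_5, label='beta at state 4')
plt.xlabel('step')
plt.ylabel('learning rate')
plt.legend()
plt.show()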
Example #8
def experiment2():
    np.random.seed(3)
    print('mushroom     :')

    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1,
                                        decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1.,
                                              decay_exp=1.,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    # Visualize
    dataset = collect_dataset.get()
    VisualizeControlBlock(dataset)
    return agent.Q.table
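
Examples #6 and #8 fix the same seed (3), build the same GridWorldVanHasselt MDP, and use identical agent parameters, so the Q table learned through the hierarchical blocks should match the one learned by the plain mushroom core; a minimal check, assuming both functions are in scope (the assertion presumably mirrors what the original test verifies):

Q_hierarchical = experiment()   # Example #6
Q_flat = experiment2()          # Example #8
assert np.allclose(Q_hierarchical, Q_flat), 'hierarchical and flat runs diverged'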
Example #9
def experiment(policy, name, alg_version):
    np.random.seed()

    # MDP
    if name == "Taxi":
        mdp = generate_taxi('../grid.txt')
        max_steps = 100000
        evaluation_frequency = 2000
        test_samples = 10000
    elif name == "NChain-v0":
        mdp = generate_chain(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    elif name == "Loop":
        mdp = generate_loop(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    elif name == "SixArms":
        mdp = generate_arms(horizon=1000)
        max_steps = 25000
        evaluation_frequency = 500
        test_samples = 10000
    elif name == "RiverSwim":
        mdp = generate_river(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    else:
        raise NotImplementedError
    # Policy
    epsilon_train = ExponentialDecayParameter(
        value=1., decay_exp=.5, size=mdp.info.observation_space.size)
    epsilon_test = Parameter(0)
    pi = policy(epsilon=epsilon_train)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1.,
                                              decay_exp=.2,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(pi, mdp.info, **algorithm_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)
    scores = list()
    scores_train = list()
    # Train
    for n_epoch in range(1, max_steps // evaluation_frequency + 1):
        print('- Learning:')
        # learning step
        pi.set_epsilon(epsilon_train)
        core.learn(n_steps=evaluation_frequency,
                   n_steps_per_fit=1,
                   quiet=False)
        dataset = collect_dataset.get()
        if name == "Taxi":
            scores_train.append(get_stats(dataset))
        elif name in ["SixArms"]:
            scores_train.append(compute_scores_Loop(dataset, horizon=500))
        else:
            scores_train.append(compute_scores_Loop(dataset))
        collect_dataset.clean()
        mdp.reset()
        print('- Evaluation:')
        # evaluation step
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=test_samples, quiet=False)
        mdp.reset()
        scores.append(get_stats(dataset))
        #np.save(env + '/'+alg_version+'_scores.npy', scores)

    return scores_train, scores
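
A sweep sketch over the environments handled above, saving both training and evaluation scores (the policy class and file layout are illustrative; EpsGreedy is assumed importable from mushroom):

from mushroom.policy import EpsGreedy

for env_name in ['Taxi', 'NChain-v0', 'Loop', 'SixArms', 'RiverSwim']:
    scores_train, scores = experiment(EpsGreedy, env_name, 'ql')
    np.save(env_name + '_train_scores.npy', scores_train)
    np.save(env_name + '_test_scores.npy', scores)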
Example #10
def experiment(algorithm, name, update_mode, update_type, policy,
               n_approximators, q_max, q_min, lr_exp, double, file_name,
               out_dir, collect_qs, seed):
    set_global_seeds(seed)
    print('Using seed %s' % seed)

    # MDP
    if name == 'Taxi':
        mdp = generate_taxi('../../grid.txt', horizon=5000)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1.,
                                              decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)

    if algorithm == 'ql':
        if policy not in ['boltzmann', 'eps-greedy']:
            warnings.warn(
                'QL available with only boltzmann and eps-greedy policies!')
            policy = 'eps-greedy'

        if policy == 'eps-greedy':
            epsilon_train = ExponentialDecayParameter(
                value=1., decay_exp=.5, size=mdp.info.observation_space.size)
            pi = policy_dict[policy](epsilon=epsilon_train)
        else:
            beta_train = ExponentialDecayParameter(
                value=1.5 * q_max,
                decay_exp=.5,
                size=mdp.info.observation_space.size)
            pi = policy_dict[policy](beta=beta_train)
        if double:
            agent = DoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = QLearning(pi, mdp.info, **algorithm_params)
    elif algorithm == 'boot-ql':
        if policy not in ['boot', 'weighted']:
            warnings.warn(
                'Bootstrapped QL available with only boot and weighted policies!'
            )
            policy = 'boot'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                mu=(q_max + q_min) / 2,
                                sigma=q_max - q_min,
                                **algorithm_params)
        if double:
            agent = BootstrappedDoubleQLearning(pi, mdp.info,
                                                **algorithm_params)
        else:
            agent = BootstrappedQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    elif algorithm == 'particle-ql':
        if policy not in ['weighted', 'vpi']:
            warnings.warn(
                'Particle QL available with only vpi and weighted policies!')
            policy = 'weighted'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                **algorithm_params)
        if double:
            agent = ParticleDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = ParticleQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    else:
        raise ValueError()

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    if collect_qs:
        collect_qs_callback = CollectQs(agent.approximator)
        callbacks += [collect_qs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):

        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1, quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)

        # print('Train: ', scores)
        train_scores.append(scores)

        collect_dataset.clean()
        mdp.reset()

        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Evaluation: ', scores)
        test_scores.append(scores)
    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)
    return train_scores, test_scores
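
Because the seed is an explicit argument here, repeated runs are reproducible; a seed-sweep sketch (every argument value below is illustrative, including the update_mode/update_type strings, which must match what ParticleQLearning accepts):

for seed in range(10):
    train, test = experiment(algorithm='particle-ql', name='RiverSwim',
                             update_mode='deterministic', update_type='weighted',
                             policy='weighted', n_approximators=10,
                             q_max=500, q_min=0, lr_exp=.2, double=False,
                             file_name='qs_%d.npy' % seed,
                             out_dir='out/riverswim',
                             collect_qs=(seed == 0), seed=seed)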
Example #11
def experiment(algorithm,
               name,
               update_mode,
               update_type,
               policy,
               n_approximators,
               q_max,
               q_min,
               lr_exp,
               file_name,
               out_dir,
               particles,
               R=1,
               m=1,
               collect_qs=False,
               seed=0):
    set_global_seeds(seed)
    print('Using seed %s' % seed)

    # MDP
    if name == 'Taxi':
        mdp = generate_taxi('../../grid.txt', horizon=5000)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        mdp = None
        try:
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        except Exception:
            register(
                id='KnightQuest-v0',
                entry_point='envs.knight_quest:KnightQuest',
            )
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1.,
                                              decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)

    if algorithm == 'particle-ql':
        delta = 0.1
        if policy not in ['weighted', 'vpi', 'ucb']:
            warnings.warn(
                'Particle QL available with only vpi and weighted policies!')
            policy = 'weighted'
        if policy == 'ucb':
            pi = UCBPolicy(delta=delta, q_max=R / (1 - mdp.info.gamma))
        else:
            pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                delta=delta,
                                init_values=particles,
                                **algorithm_params)

        agent = ParticleQLearning(pi, mdp.info, **algorithm_params)
        if policy == 'ucb':
            q = agent.approximator
            quantiles = [
                i * 1. / (n_approximators - 1) for i in range(n_approximators)
            ]
            for p in range(n_approximators):
                if quantiles[p] >= 1 - delta:
                    particle_bound = p
                    break

            def quantile_func(state, quantile):
                q_list = list()
                for i in range(n_approximators):
                    q_list.append(q.predict(state, idx=i))
                qs = np.array(q_list)
                out = np.zeros(qs.shape[1])
                out[:] = qs[particle_bound, :]
                return out

            def mu(state):
                q_list = list()
                for i in range(n_approximators):
                    q_list.append(q.predict(state, idx=i))
                qs = np.array(q_list)
                return np.mean(qs, axis=0)

            pi.set_quantile_func(quantile_func)
            pi.set_mu(mu)
        epsilon_train = Parameter(0)
    elif algorithm == 'delayed-ql':
        algorithm_params = dict(R=R, m=m, **algorithm_params)

        agent = DelayedQLearning(mdp.info, **algorithm_params)
        pi = agent
        epsilon_train = Parameter(0)
    else:
        raise ValueError()

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    if collect_qs:
        collect_qs_callback = CollectQs(agent.approximator)
        callbacks += [collect_qs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):

        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1, quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)

        # print('Train: ', scores)
        train_scores.append(scores)

        collect_dataset.clean()
        mdp.reset()

        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Evaluation: ', scores)
        test_scores.append(scores)
    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)
    return train_scores, test_scores
Example #12
File: run.py  Project: czgdp1807/wql
def experiment(algorithm, name, update_mode, update_type, policy,
               n_approximators, q_max, q_min, lr_exp, R, log_lr, r_max_m,
               delayed_m, delayed_epsilon, delta, debug, double, regret_test,
               a, b, mbie_C, value_iterations, tolerance, file_name, out_dir,
               collect_qs, seed):
    set_global_seeds(seed)
    print('Using seed %s' % seed)
    # MDP
    if name == 'Taxi':
        mdp = generate_taxi('../grid.txt', horizon=5000, gamma=0.99)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Gridworld':
        mdp = generate_gridworld(horizon=100, gamma=0.99)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
        mbie_C = 0.4
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
        mbie_C = 0.8
    elif name == 'ThreeArms':
        horizon = 100
        mdp = generate_three_arms(horizon=horizon, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        mdp = None
        try:
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        except Exception:
            register(
                id='KnightQuest-v0',
                entry_point='envs.knight_quest:KnightQuest',
            )
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    if regret_test:
        max_steps = int(args.max_steps_regret * 1e6)
        evaluation_frequency = max_steps // 100
        test_samples = 1000
        if debug:
            max_steps = 100000
            evaluation_frequency = max_steps // 100
            test_samples = 1000

    if algorithm == 'ql':
        if policy not in ['boltzmann', 'eps-greedy']:
            warnings.warn('QL available with only boltzmann and eps-greedy policies!')
            policy = 'eps-greedy'

        if policy == 'eps-greedy':
            epsilon_train = ExponentialDecayParameter(value=1., decay_exp=lr_exp,
                                                      size=mdp.info.observation_space.size)
            pi = policy_dict[policy](epsilon=epsilon_train)
        else:
            beta_train = ExponentialDecayParameter(
                value=1.5 * q_max, decay_exp=.5,
                size=mdp.info.observation_space.size)
            pi = policy_dict[policy](beta=beta_train)
        if double:
            agent = DoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = QLearning(pi, mdp.info, **algorithm_params)
    elif algorithm == 'boot-ql':
        if policy not in ['boot', 'weighted']:
            warnings.warn('Bootstrapped QL available with only boot and weighted policies!')
            policy = 'boot'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                mu=(q_max + q_min) / 2,
                                sigma=(q_max - q_min)/2,
                                **algorithm_params)
        if double:
            agent = BootstrappedDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = BootstrappedQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    elif algorithm == 'particle-ql':
        if policy not in ['weighted', 'ucb']:
            warnings.warn('Particle QL available with only ucb and weighted policies!')
            policy = 'weighted'
        if policy == 'ucb':
            pi = UCBPolicy(delta=delta, q_max=R/(1-mdp.info.gamma))
        else:
            pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                delta=delta,
                                **algorithm_params)
        if double:
            agent = ParticleDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = ParticleQLearning(pi, mdp.info, **algorithm_params)

        epsilon_train = Parameter(0)
    elif algorithm == 'r-max':
        thr_1 = int(np.ceil((4 * mdp.info.size[0] * 1.0 / (1 - mdp.info.gamma) * R) ** 3))

        algorithm_params = dict(
            rmax=R,
            s_a_threshold=r_max_m
        )
        agent = RMaxAgent(mdp.info, **algorithm_params)
        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'mbie':
        algorithm_params = dict(
            rmax=R,
            C=mbie_C,
            value_iterations=value_iterations,
            tolerance=tolerance
        )
        agent = MBIE_EB(mdp.info, **algorithm_params)

        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'delayed-ql':
        theoretic_m = delayed_m
        if regret_test:
            gamma = mdp.info.gamma
            Vmax = R / (1 - gamma)
            epsilon = args.delayed_ratio * Vmax
            delayed_epsilon = epsilon*(1-gamma)
            delta = 0.1
            S, A = mdp.info.size

            theoretic_m = (1 + gamma * Vmax) ** 2 / (2 * delayed_epsilon ** 2) * \
                np.log(3 * S * A / delta * (1 + S * A / (delayed_epsilon * (1 - gamma))))
            if debug:
                print("Delta:{}".format(delta))
                print("R:{}".format(R))
                print("Vmax:{}".format(Vmax))
                print("Gamma:{}".format(mdp.info.gamma))
                print("Epsilon:{}".format(epsilon))
                #print("k:{}".format(k))
                print("m:{}".format(theoretic_m))
                print("S:{}".format(S))
                print("A:{}".format(A))
                input()
            def evaluate_policy(P, R, policy):
                P_pi = np.zeros((S, S))
                R_pi = np.zeros(S)

                for s in range(S):
                    for s1 in range(S):
                        P_pi[s, s1] = np.sum(policy[s, :] * P[s, :, s1])
                    R_pi[s] = np.sum(policy[s, :] *
                                     np.sum(P[s, :, :] * R[s, :, :], axis=-1))
                I = np.eye(S)
                V = np.linalg.solve(I - gamma * P_pi, R_pi)

                return V
        algorithm_params = dict(
            R=R,
            m=theoretic_m,
            delta=delta,
            epsilon=delayed_epsilon,
            **algorithm_params)

        agent = DelayedQLearning(mdp.info, **algorithm_params)
        if regret_test:
            collect_vs_callback = CollectVs(mdp, agent, evaluate_policy, args.freq_collection)
            if debug:
                print("Q:")
                print(agent.get_approximator()[:, :])
                print("Policy:")
                print(agent.get_policy())
                print("V:{}".format(evaluate_policy(mdp.p,mdp.r,agent.get_policy())))
                input()

        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'gaussian-ql':
        if policy not in ['weighted-gaussian', 'ucb']:
            warnings.warn('Particle QL available with only ucb and weighted policies!')
            policy = 'weighted-gaussian'
        if policy == 'ucb':
            pi = UCBPolicy(delta=delta, q_max=R/(1-mdp.info.gamma))
        else:
            pi = policy_dict[policy]()
        q_0 = (q_max - q_min) / 2
        sigma_0 = (q_max - q_min) / np.sqrt(12)
        C = 2 * R / (np.sqrt(2 * np.pi) * (1 - mdp.info.gamma) * sigma_0)
        sigma_lr = None
        if log_lr:
            sigma_lr = LogarithmicDecayParameter(value=1., C=C,
                                                 size=mdp.info.size)
        init_values = (q_0, sigma_0)
        if regret_test:
            sigma_lr = None
            gamma = mdp.info.gamma
            T = max_steps
            S, A = mdp.info.size
            a = (2 + gamma) / (2 * (1 - gamma))
            b = a - 1
            c = 1
            d = b
            q_max = R / (1 - gamma)
            standard_bound = norm.ppf(1 - delta, loc=0, scale=1)
            #first_fac = np.sqrt(b + T)
            #second_fac = np.sqrt(a * np.log(S*A*T / delta))
            #sigma2_factor = min(np.sqrt(b + T), np.sqrt(a * np.log(S*A*T / delta)))

            q_0 = q_max
            sigma1_0 = 0
            #sigma2_0 = (R + gamma * q_max) / (standard_bound * np.sqrt(c-1)) * sigma2_factor

            sigma2_0 = (gamma * q_max) / (c * standard_bound) * \
                np.sqrt(a * np.log(S * A * T / delta))
            init_values = (q_0, sigma1_0, sigma2_0)
            learning_rate = TheoreticalParameter(a=a, b=b, decay_exp=1,
                                                 size=mdp.info.size)
            learning_rate_sigma1 = TheoreticalParameter(a=a, b=b, decay_exp=1,
                                                 size=mdp.info.size)
            algorithm_params = dict(learning_rate=learning_rate,
                                    sigma_1_learning_rate=learning_rate_sigma1)

            sigma_lr = BetaParameter(c=c, d=d, size=mdp.info.size)
            def evaluate_policy(P, R, policy):
                P_pi = np.zeros((S, S))
                R_pi = np.zeros(S)

                for s in range(S):
                    for s1 in range(S):
                        P_pi[s, s1] = np.sum(policy[s, :] * P[s, :, s1])
                    R_pi[s] = np.sum(policy[s, :] *
                                     np.sum(P[s, :, :] * R[s, :, :], axis=-1))
                I = np.eye(S)
                V = np.linalg.solve(I - gamma * P_pi, R_pi)
                return V
            if debug:
                print("Delta:{}".format(delta))
                print("R:{}".format(R))
                print("Gamma:{}".format(mdp.info.gamma))
                print("mu0:{}".format(q_0))
                print("Sigma1_0:{}".format(sigma1_0))
                print("Sigma2_0:{}".format(sigma2_0))
                print("a:{}".format(a))
                print("b:{}".format(b))
                print("c:{}".format(c))
                print("d:{}".format(d))
                print("T:{}".format(T))
                print("S:{}".format(S))
                print("A:{}".format(A))
                input()

        algorithm_params = dict(
            update_mode=update_mode,
            update_type=update_type,
            sigma_learning_rate=sigma_lr,
            init_values=init_values,
            delta=delta,
            q_max=q_max,
            **algorithm_params)
        if double and not regret_test:
            agent = GaussianDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = GaussianQLearning(pi, mdp.info, **algorithm_params)
        if regret_test:
            if debug:
                freq = 10
            else:
                freq = args.freq_collection
            collect_vs_callback = CollectVs(mdp, agent, evaluate_policy, freq)
        if debug:
            print("Policy:")
            print(agent.get_policy())
            print("Q")
            for state in range(S):
                means = np.array(agent.approximator.predict(np.array([state]), idx=0))
                sigmas1 = np.array(agent.approximator.predict(np.array([state]), idx=1))
                sigmas2 = np.array(agent.approximator.predict(np.array([state]), idx=2))
                print("Means:{}".format(means))
                print("Sigmas1:{}".format(sigmas1))
                print("Sigmas2:{}".format(sigmas2))
            print("V:{}".format(evaluate_policy(mdp.p,mdp.r,agent.get_policy())))
            input()
        if policy == 'ucb':
            q = agent.approximator
            standard_bound = norm.ppf(1 - delta, loc=0, scale=1)
            def quantile_func(state):
                means = np.array(q.predict(state, idx=0))
                if regret_test:
                    sigmas1 = np.array(q.predict(state, idx=1))
                    sigmas2 = np.array(q.predict(state, idx=2))
                    sigmas = sigmas2
                    #print(sigmas1, sigmas2)
                else:
                    sigmas = np.array(q.predict(state, idx=1))
                out = sigmas * standard_bound + means
                return out

            def mu(state):
                q_list = q.predict(state, idx=0)
                means = np.array(q_list)

                return means
            pi.set_quantile_func(quantile_func)
            pi.set_mu(mu)
        epsilon_train = Parameter(0)
    else:
        raise ValueError()

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    if collect_qs:
        if algorithm not in ['r-max']:
            collect_qs_callback = CollectQs(agent.approximator)
            callbacks += [collect_qs_callback]

    if regret_test:
        callbacks += [collect_vs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):

        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        if regret_test:
            collect_vs_callback.on()
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1, quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)

        #print('Train: ', scores)
        train_scores.append(scores)

        collect_dataset.clean()
        mdp.reset()
        if regret_test:
            vs = collect_vs_callback.get_values()
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)
            print("Finished {} steps.".format(n_epoch * evaluation_frequency))
            np.save(out_dir + "/vs_" + algorithm+"_"+str(seed), vs)
            np.save(out_dir+"/scores_online" + str(seed), train_scores)
            collect_vs_callback.off()
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        print('Evaluation #%d: %s' % (n_epoch, scores))
        if debug:  # note: assumes the gaussian-ql regret_test setup (S, evaluate_policy, 3-head approximator)
            print("Policy:")
            print(agent.get_policy())
            print("Q")
            for state in range(S):
                means = np.array(agent.approximator.predict(np.array([state]), idx=0))
                sigmas1 = np.array(agent.approximator.predict(np.array([state]), idx=1))
                sigmas2 = np.array(agent.approximator.predict(np.array([state]), idx=2))
                print("Means:{}".format(means))
                print("Sigmas1:{}".format(sigmas1))
                print("Sigmas2:{}".format(sigmas2))
            print("V:{}".format(evaluate_policy(mdp.p, mdp.r, agent.get_policy())))
            input()
        test_scores.append(scores)
        if regret_test:
            np.save(out_dir + "/scores_offline" + str(seed), test_scores)
    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)

    return train_scores, test_scores
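
This function reads a module-level args object (args.max_steps_regret, args.delayed_ratio, args.freq_collection), so run.py presumably builds one with argparse before experiment is called. A minimal sketch of the parser those references assume (attribute names taken from the usages above; defaults are illustrative):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--max-steps-regret', dest='max_steps_regret', type=float,
                    default=1.)  # interpreted above as millions of steps
parser.add_argument('--delayed-ratio', dest='delayed_ratio', type=float,
                    default=.1)  # fraction of Vmax used as the delayed-QL epsilon
parser.add_argument('--freq-collection', dest='freq_collection', type=int,
                    default=100)  # collection frequency passed to CollectVs
args = parser.parse_args()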