Example #1
def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy, mu, mdp.info,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high,
                                     phi, phi)
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)
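A minimal driver for this snippet might look as follows. The epoch and episode counts are illustrative placeholders, and the names used above (numpy as np plus the mushroom classes Core, InvertedPendulum, COPDAC_Q, Tiles, Features, Regressor, LinearApproximator, GaussianPolicy, Parameter, CollectDataset, compute_J and the local Display callback) are assumed to be imported elsewhere in the file.

if __name__ == '__main__':
    # Illustrative run: 20 training epochs of 100 episodes each.
    experiment(n_epochs=20, n_episodes=100)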
Example #2
def experiment(mdp, agent_high, agent_low, n_epochs, n_episodes, ep_per_eval,
               ep_per_fit_low, display, print_j, quiet):
    np.random.seed()

    dataset_callback = CollectDataset()

    computational_graph = build_computational_graph(mdp, agent_low, agent_high,
                                                    ep_per_fit_low,
                                                    [dataset_callback])

    core = HierarchicalCore(computational_graph)
    J_list = list()
    L_list = list()

    dataset = core.evaluate(n_episodes=ep_per_eval, quiet=quiet)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    J_low_list = list()
    L = episodes_length(dataset)
    L_list.append(np.mean(L))
    if print_j:
        print('Reward at start:', J_list[-1])

    for n in range(n_epochs):
        core.learn(n_episodes=n_episodes, skip=True, quiet=quiet)

        ll_dataset = dataset_callback.get()
        dataset_callback.clean()
        J_low = compute_J(ll_dataset, mdp.info.gamma)
        J_low_list.append(np.mean(J_low))
        if print_j:
            print('Low level reward at epoch', n, ':', np.mean(J_low))

        dataset = core.evaluate(n_episodes=ep_per_eval, quiet=quiet)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        L = episodes_length(dataset)
        L_list.append(np.mean(L))

        if print_j:
            print('Reward at epoch', n, ':', J_list[-1])

        if display:
            core.evaluate(n_episodes=1, render=True)

    return J_list, L_list, J_low_list
Example #3
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    # MDP
    mdp = Segway()

    # Policy
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = 2e-0 * np.ones(n_weights)
    policy = DeterministicPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(dist, policy, mdp.info, **params)

    # Train
    print(alg.__name__)
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit,
                   render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()

        print('mu:    ', p[:n_weights])
        print('sigma: ', p[n_weights:])
        print('Reward at iteration ' + str(i) + ': ' + str(np.mean(J)))

    print('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
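A sketch of how this experiment could be launched, assuming REPS from mushroom's black-box policy-search algorithms (constructed here as alg(dist, policy, mdp.info, **params), so params carries its eps hyperparameter); the algorithm choice and every numeric value below are illustrative, not settings from the original project.

if __name__ == '__main__':
    # Hypothetical run: REPS on the Segway task with placeholder hyperparameters.
    experiment(alg=REPS, params=dict(eps=0.05),
               n_epochs=25, n_episodes=100, n_ep_per_fit=25)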
Example #4
def experiment(policy, name, alg_version):
    np.random.seed()

    # MDP

    if name == "Taxi":
        mdp = generate_taxi('../grid.txt')
        max_steps = 100000
        evaluation_frequency = 2000
        test_samples = 10000
    elif name == "NChain-v0":
        mdp = generate_chain(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    elif name == "Loop":
        mdp = generate_loop(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    elif name == "SixArms":
        mdp = generate_arms(horizon=1000)
        max_steps = 25000
        evaluation_frequency = 500
        test_samples = 10000
    elif name == "RiverSwim":
        mdp = generate_river(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    else:
        raise NotImplementedError
    # Policy
    # epsilon = ExponentialDecayParameter(value=1., decay_exp=.5,
    #                                     size=mdp.info.observation_space.size)
    epsilon_train = ExponentialDecayParameter(
        value=1., decay_exp=.5, size=mdp.info.observation_space.size)
    epsilon_test = Parameter(0)
    pi = policy(epsilon=epsilon_train)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1.,
                                              decay_exp=.2,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(pi, mdp.info, **algorithm_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)
    scores = list()
    scores_train = list()
    # Train
    for n_epoch in range(1, max_steps // evaluation_frequency + 1):
        print('- Learning:')
        # learning step
        pi.set_epsilon(epsilon_train)
        core.learn(n_steps=evaluation_frequency,
                   n_steps_per_fit=1,
                   quiet=False)
        dataset = collect_dataset.get()
        if name == "Taxi":
            scores_train.append(get_stats(dataset))
        elif name in ["SixArms"]:
            scores_train.append(compute_scores_Loop(dataset, horizon=500))
        else:
            scores_train.append(compute_scores_Loop(dataset))
        collect_dataset.clean()
        mdp.reset()
        print('- Evaluation:')
        # evaluation step
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=test_samples, quiet=False)
        mdp.reset()
        scores.append(get_stats(dataset))
        #np.save(env + '/'+alg_version+'_scores.npy', scores)

    return scores_train, scores
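One possible call, assuming mushroom's EpsGreedy policy class (the function instantiates the policy as policy(epsilon=...) and later switches epsilon via set_epsilon); the environment name must match one of the branches above, and the version tag only affects the commented-out save path.

if __name__ == '__main__':
    # Hypothetical call: tabular Q-Learning with an eps-greedy policy on Taxi.
    scores_train, scores = experiment(EpsGreedy, "Taxi", "q_learning")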
Example #5
def experiment(algorithm, name, update_mode, update_type, policy,
               n_approximators, q_max, q_min, lr_exp, double, file_name,
               out_dir, collect_qs, seed):
    set_global_seeds(seed)
    print('Using seed %s' % seed)

    # MDP
    if name == 'Taxi':
        mdp = generate_taxi('../../grid.txt', horizon=5000)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1.,
                                              decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)

    if algorithm == 'ql':
        if policy not in ['boltzmann', 'eps-greedy']:
            warnings.warn(
                'QL available with only boltzmann and eps-greedy policies!')
            policy = 'eps-greedy'

        if policy == 'eps-greedy':
            epsilon_train = ExponentialDecayParameter(
                value=1., decay_exp=.5, size=mdp.info.observation_space.size)
            pi = policy_dict[policy](epsilon=epsilon_train)
        else:
            beta_train = ExponentialDecayParameter(
                value=1.5 * q_max,
                decay_exp=.5,
                size=mdp.info.observation_space.size)
            pi = policy_dict[policy](beta=beta_train)
        if double:
            agent = DoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = QLearning(pi, mdp.info, **algorithm_params)
    elif algorithm == 'boot-ql':
        if policy not in ['boot', 'weighted']:
            warnings.warn(
                'Bootstrapped QL available with only boot and weighted policies!'
            )
            policy = 'boot'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                mu=(q_max + q_min) / 2,
                                sigma=q_max - q_min,
                                **algorithm_params)
        if double:
            agent = BootstrappedDoubleQLearning(pi, mdp.info,
                                                **algorithm_params)
        else:
            agent = BootstrappedQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    elif algorithm == 'particle-ql':
        if policy not in ['weighted', 'vpi']:
            warnings.warn(
                'Particle QL available with only vpi and weighted policies!')
            policy = 'weighted'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                **algorithm_params)
        if double:
            agent = ParticleDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = ParticleQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    else:
        raise ValueError()

    # Algorithm
    collect_dataset = CollectDataset()
    collect_qs_callback = CollectQs(agent.approximator)
    callbacks = [collect_dataset]
    if collect_qs:
        callbacks += [collect_qs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):

        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1, quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)

        # print('Train: ', scores)
        train_scores.append(scores)

        collect_dataset.clean()
        mdp.reset()

        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Evaluation: ', scores)
        test_scores.append(scores)
    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)
    return train_scores, test_scores
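A possible invocation; every keyword value below is an illustrative placeholder rather than a setting taken from the original project (in particular update_mode, update_type and the q_max/q_min bounds depend on the environment and on the ParticleQLearning implementation).

if __name__ == '__main__':
    train_scores, test_scores = experiment(
        algorithm='particle-ql', name='Chain',
        update_mode='deterministic', update_type='weighted',
        policy='weighted', n_approximators=10,
        q_max=400, q_min=0, lr_exp=0.2, double=False,
        file_name='qs_chain.npy', out_dir='./logs',
        collect_qs=False, seed=0)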
Example #6
def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 11
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings-1, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(1, [1, 1],
                                         mdp.info.observation_space.low,
                                         mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator, input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    agent = SAC_AVG(policy, mdp.info,
                    alpha_theta, alpha_v, alpha_r,
                    lambda_par=.5,
                    value_function_features=psi,
                    policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    display_callback = Display(agent._V, mu, std,
                               mdp.info.observation_space.low,
                               mdp.info.observation_space.high,
                               phi, psi)
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.)
        dataset_callback.clean()
        display_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    core.evaluate(n_steps=n_steps, render=True)
Example #7
def experiment(algorithm,
               name,
               update_mode,
               update_type,
               policy,
               n_approximators,
               q_max,
               q_min,
               lr_exp,
               file_name,
               out_dir,
               particles,
               R=1,
               m=1,
               collect_qs=False,
               seed=0):
    set_global_seeds(seed)
    print('Using seed %s' % seed)

    # MDP
    if name == 'Taxi':
        mdp = generate_taxi('../../grid.txt', horizon=5000)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        mdp = None
        try:
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        except Exception:
            register(
                id='KnightQuest-v0',
                entry_point='envs.knight_quest:KnightQuest',
            )
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1.,
                                              decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)

    if algorithm == 'particle-ql':
        delta = 0.1
        if policy not in ['weighted', 'vpi', 'ucb']:
            warnings.warn(
                'Particle QL available with only vpi, weighted and ucb policies!')
            policy = 'weighted'
        if policy == 'ucb':
            pi = UCBPolicy(delta=delta, q_max=R / (1 - mdp.info.gamma))
        else:
            pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                delta=delta,
                                init_values=particles,
                                **algorithm_params)

        agent = ParticleQLearning(pi, mdp.info, **algorithm_params)
        if policy == 'ucb':
            q = agent.approximator
            quantiles = [
                i * 1. / (n_approximators - 1) for i in range(n_approximators)
            ]
            for p in range(n_approximators):
                if quantiles[p] >= 1 - delta:
                    particle_bound = p
                    break

            def quantile_func(state):
                q_list = list()
                for i in range(n_approximators):
                    q_list.append(q.predict(state, idx=i))
                qs = np.array(q_list)
                out = np.zeros(qs.shape[1])
                out[:] = qs[particle_bound, :]
                return out

            def mu(state):
                q_list = list()
                for i in range(n_approximators):
                    q_list.append(q.predict(state, idx=i))
                qs = np.array(q_list)
                return np.mean(qs, axis=0)

            pi.set_quantile_func(quantile_func)
            pi.set_mu(mu)
        epsilon_train = Parameter(0)
    elif algorithm == 'delayed-ql':
        algorithm_params = dict(R=R, m=m, **algorithm_params)

        agent = DelayedQLearning(mdp.info, **algorithm_params)
        pi = agent
        epsilon_train = Parameter(0)
    else:
        raise ValueError()

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    if collect_qs:
        collect_qs_callback = CollectQs(agent.approximator)
        callbacks += [collect_qs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):

        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1, quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)

        # print('Train: ', scores)
        train_scores.append(scores)

        collect_dataset.clean()
        mdp.reset()

        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Evaluation: ', scores)
        test_scores.append(scores)
    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)
    return train_scores, test_scores
Example #8
File: run.py, Project: czgdp1807/wql
def experiment(algorithm, name, update_mode, update_type, policy, n_approximators, q_max, q_min,
               lr_exp, R, log_lr, r_max_m, delayed_m, delayed_epsilon, delta, debug, double,
               regret_test, a, b, mbie_C, value_iterations, tolerance, file_name, out_dir,
               collect_qs,  seed):
    set_global_seeds(seed)
    print('Using seed %s' % seed)
    # MDP
    if name == 'Taxi':
        mdp = generate_taxi('../grid.txt', horizon=5000, gamma=0.99)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Gridworld':
        mdp = generate_gridworld(horizon=100, gamma=0.99)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
        mbie_C = 0.4
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
        mbie_C = 0.8
    elif name == 'ThreeArms':
        horizon = 100
        mdp = generate_three_arms(horizon=horizon, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        mdp = None
        try:
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        except Exception:
            register(
                id='KnightQuest-v0',
                entry_point='envs.knight_quest:KnightQuest',
            )
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    if regret_test:

        max_steps = int(args.max_steps_regret * 1e6)
        evaluation_frequency = max_steps // 100
        test_samples = 1000
        if debug:
            max_steps = 100000
            evaluation_frequency = max_steps // 100
            test_samples = 1000
        
    if algorithm == 'ql':
        if policy not in ['boltzmann', 'eps-greedy']:
            warnings.warn('QL available with only boltzmann and eps-greedy policies!')
            policy = 'eps-greedy'

        if policy == 'eps-greedy':
            epsilon_train = ExponentialDecayParameter(value=1., decay_exp=lr_exp,
                                                      size=mdp.info.observation_space.size)
            pi = policy_dict[policy](epsilon=epsilon_train)
        else:
            beta_train = ExponentialDecayParameter(value=1.5 * q_max, decay_exp=.5,
                                                      size=mdp.info.observation_space.size)
            pi = policy_dict[policy](beta=beta_train)
        if double:
            agent = DoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = QLearning(pi, mdp.info, **algorithm_params)
    elif algorithm == 'boot-ql':
        if policy not in ['boot', 'weighted']:
            warnings.warn('Bootstrapped QL available with only boot and weighted policies!')
            policy = 'boot'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                mu=(q_max + q_min) / 2,
                                sigma=(q_max - q_min)/2,
                                **algorithm_params)
        if double:
            agent = BootstrappedDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = BootstrappedQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    elif algorithm == 'particle-ql':
        if policy not in ['weighted', 'ucb']:
            warnings.warn('Particle QL available with only ucb and weighted policies!')
            policy = 'weighted'
        if policy == 'ucb':
            pi = UCBPolicy(delta=delta, q_max=R/(1-mdp.info.gamma))
        else:
            pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                delta=delta,
                                **algorithm_params)
        if double:
            agent = ParticleDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = ParticleQLearning(pi, mdp.info, **algorithm_params)

        epsilon_train = Parameter(0)
    elif algorithm == 'r-max':
        thr_1 = int(np.ceil((4 * mdp.info.size[0] * 1.0 / (1 - mdp.info.gamma) * R) ** 3))

        algorithm_params = dict(
            rmax=R,
            s_a_threshold=r_max_m
        )
        agent = RMaxAgent(mdp.info, **algorithm_params)
        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'mbie':
        algorithm_params = dict(
            rmax=R,
            C=mbie_C,
            value_iterations=value_iterations,
            tolerance=tolerance
        )
        agent = MBIE_EB(mdp.info, **algorithm_params)

        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'delayed-ql':
        theoretic_m = delayed_m
        if regret_test:
            gamma = mdp.info.gamma
            Vmax = R / (1 - gamma)
            epsilon = args.delayed_ratio * Vmax
            delayed_epsilon = epsilon*(1-gamma)
            delta = 0.1
            S, A = mdp.info.size

            theoretic_m = (1 + gamma*Vmax)**2 / (2*delayed_epsilon**2) * np.log(3*S*A/delta * (1 + S*A/(delayed_epsilon*(1-gamma))))
            if debug:
                print("Delta:{}".format(delta))
                print("R:{}".format(R))
                print("Vmax:{}".format(Vmax))
                print("Gamma:{}".format(mdp.info.gamma))
                print("Epsilon:{}".format(epsilon))
                #print("k:{}".format(k))
                print("m:{}".format(theoretic_m))
                print("S:{}".format(S))
                print("A:{}".format(A))
                input()
            def evaluate_policy(P, R, policy):

                P_pi = np.zeros((S, S))
                R_pi = np.zeros(S)

                for s in range(S):
                    for s1 in range(S):
                        P_pi[s,s1] = np.sum(policy[s, :] * P[s, :, s1])
                    R_pi[s] = np.sum(policy[s, :] * np.sum(P[s, :, :] * R[s, :, :], axis=-1))
                I = np.diag(np.ones(S))
                V = np.linalg.solve(I - gamma * P_pi, R_pi)

                return V
        algorithm_params = dict(
            R=R,
            m=theoretic_m,
            delta=delta,
            epsilon=delayed_epsilon,
            **algorithm_params)

        agent = DelayedQLearning(mdp.info, **algorithm_params)
        if regret_test:
            collect_vs_callback = CollectVs(mdp, agent, evaluate_policy, args.freq_collection)
            if debug:
                print("Q:")
                print(agent.get_approximator()[:, :])
                print("Policy:")
                print(agent.get_policy())
                print("V:{}".format(evaluate_policy(mdp.p,mdp.r,agent.get_policy())))
                input()

        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'gaussian-ql':
        if policy not in ['weighted-gaussian', 'ucb']:
            warnings.warn('Gaussian QL available with only ucb and weighted-gaussian policies!')
            policy = 'weighted-gaussian'
        if policy == 'ucb':
            pi = UCBPolicy(delta=delta, q_max=R/(1-mdp.info.gamma))
        else:
            pi = policy_dict[policy]()
        q_0 = (q_max - q_min) / 2
        sigma_0 = (q_max - q_min) / np.sqrt(12)
        C = 2 * R / (np.sqrt(2 * np.pi) * (1 - mdp.info.gamma) * sigma_0)
        sigma_lr = None
        if log_lr:
            sigma_lr = LogarithmicDecayParameter(value=1., C=C,
                                             size=mdp.info.size)
        init_values = (q_0, sigma_0)
        if regret_test:
            sigma_lr = None
            gamma = mdp.info.gamma
            T = max_steps
            S, A = mdp.info.size
            a = (2 + gamma) / (2 * (1 - gamma))
            b = a - 1
            c = 1
            d = b
            q_max = R / (1 - gamma)
            standard_bound = norm.ppf(1 - delta, loc=0, scale=1)
            #first_fac = np.sqrt(b + T)
            #second_fac = np.sqrt(a * np.log(S*A*T / delta))
            #sigma2_factor = min(np.sqrt(b + T), np.sqrt(a * np.log(S*A*T / delta)))

            q_0 = q_max
            sigma1_0 = 0
            #sigma2_0 = (R + gamma * q_max) / (standard_bound * np.sqrt(c-1)) * sigma2_factor

            sigma2_0 = (gamma * q_max) / (c * standard_bound) * np.sqrt(a * np.log(S * A * T / delta))
            init_values = (q_0, sigma1_0, sigma2_0)
            learning_rate = TheoreticalParameter(a=a, b=b, decay_exp=1,
                                                 size=mdp.info.size)
            learning_rate_sigma1 = TheoreticalParameter(a=a, b=b, decay_exp=1,
                                                 size=mdp.info.size)
            algorithm_params = dict(learning_rate=learning_rate,
                                    sigma_1_learning_rate=learning_rate_sigma1)

            sigma_lr = BetaParameter(c=c, d=d, size=mdp.info.size)
            def evaluate_policy(P, R, policy):

                P_pi = np.zeros((S, S))
                R_pi = np.zeros(S)

                for s in range(S):
                    for s1 in range(S):
                        P_pi[s,s1] = np.sum(policy[s, :] * P[s, :, s1])

                    R_pi[s] = np.sum(policy[s, :] * np.sum(P[s, :, :] * R[s, :, :],axis=-1))
                I = np.diag(np.ones(S))

                V = np.linalg.solve(I - gamma * P_pi, R_pi)
                return V
            if debug:
                print("Delta:{}".format(delta))
                print("R:{}".format(R))
                print("Gamma:{}".format(mdp.info.gamma))
                print("mu0:{}".format(q_0))
                print("Sigma1_0:{}".format(sigma1_0))
                print("Sigma2_0:{}".format(sigma2_0))
                print("a:{}".format(a))
                print("b:{}".format(b))
                print("c:{}".format(c))
                print("d:{}".format(d))
                print("T:{}".format(T))
                print("S:{}".format(S))
                print("A:{}".format(A))
                input()

        algorithm_params = dict(
            update_mode=update_mode,
            update_type=update_type,
            sigma_learning_rate=sigma_lr,
            init_values=init_values,
            delta=delta,
            q_max=q_max,
            **algorithm_params)
        if double and not regret_test:
            agent = GaussianDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = GaussianQLearning(pi, mdp.info, **algorithm_params)
        if regret_test:
            if debug:
                freq = 10
            else:
                freq = args.freq_collection
            collect_vs_callback = CollectVs(mdp, agent, evaluate_policy, freq)
        if debug:
            print("Policy:")
            print(agent.get_policy())
            print("Q")
            for state in range(S):
                means = np.array(agent.approximator.predict(np.array([state]), idx=0))
                sigmas1 = np.array(agent.approximator.predict(np.array([state]), idx=1))
                sigmas2 = np.array(agent.approximator.predict(np.array([state]), idx=2))
                print("Means:{}".format(means))
                print("Sigmas1:{}".format(sigmas1))
                print("Sigmas2:{}".format(sigmas2))
            print("V:{}".format(evaluate_policy(mdp.p,mdp.r,agent.get_policy())))
            input()
        if policy == 'ucb':
            q = agent.approximator
            standard_bound = norm.ppf(1 - delta, loc=0, scale=1)
            def quantile_func(state):
                means = np.array(q.predict(state, idx=0))
                if regret_test:
                    sigmas1 = np.array(q.predict(state, idx=1))
                    sigmas2 = np.array(q.predict(state, idx=2))
                    sigmas = sigmas2
                    #print(sigmas1, sigmas2)
                else:
                    sigmas = np.array(q.predict(state, idx=1))
                out = sigmas * standard_bound + means
                return out

            def mu(state):
                q_list = q.predict(state, idx=0)
                means = np.array(q_list)

                return means
            pi.set_quantile_func(quantile_func)
            pi.set_mu(mu)
        epsilon_train = Parameter(0)
    else:
        raise ValueError()

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    if collect_qs:
        if algorithm not in ['r-max']:
            collect_qs_callback = CollectQs(agent.approximator)
            callbacks += [collect_qs_callback]

    if regret_test:
        callbacks += [collect_vs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):

        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        if regret_test:
            collect_vs_callback.on()
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1, quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)

        #print('Train: ', scores)
        train_scores.append(scores)

        collect_dataset.clean()
        mdp.reset()
        if regret_test:
            vs = collect_vs_callback.get_values()
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)
            print("Finished {} steps.".format(n_epoch * evaluation_frequency))
            np.save(out_dir + "/vs_" + algorithm+"_"+str(seed), vs)
            np.save(out_dir+"/scores_online" + str(seed), train_scores)
            collect_vs_callback.off()
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        s = mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        print('Evaluation #%d: %s' % (n_epoch, scores))
        if debug:
            print("Policy:")
            print(agent.get_policy())
            print("Q")
            for state in range(S):
                means = np.array(agent.approximator.predict(np.array([state]), idx=0))
                sigmas1 = np.array(agent.approximator.predict(np.array([state]), idx=1))
                sigmas2 = np.array(agent.approximator.predict(np.array([state]), idx=2))
                print("Means:{}".format(means))
                print("Sigmas1:{}".format(sigmas1))
                print("Sigmas2:{}".format(sigmas2))
            print("V:{}".format(evaluate_policy(mdp.p, mdp.r, agent.get_policy())))
            input()
        test_scores.append(scores)
        if regret_test:
            np.save(out_dir + "/scores_offline" + str(seed), test_scores)
    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)

    return train_scores, test_scores