Example #1
def experiment():
    np.random.seed()

    # MDP
    mdp = generate_simple_chain(state_n=5,
                                goal_states=[2],
                                prob=.8,
                                rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)
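
A possible continuation of this experiment, sketched here rather than taken from the original snippet: after training, the policy can be made greedy and evaluated with the same Core object, assuming compute_J is imported as in Examples #6 and #10 below. These lines would be appended to the end of experiment().

    # Test (sketch): epsilon=0 makes the epsilon-greedy policy act greedily
    agent.policy.set_epsilon(Parameter(value=0.))
    dataset = core.evaluate(n_episodes=10, quiet=True)
    print('J:', np.mean(compute_J(dataset, mdp.info.gamma)))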
Example #2
def experiment(algorithm_class):
    np.random.seed(20)

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=1.,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = algorithm_class(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    start = mdp.convert_to_int(mdp._start, mdp._width)
    collect_max_Q = CollectMaxQ(agent.approximator, start)
    callbacks = [collect_dataset, collect_max_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
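
A hedged driver sketch for the function above: algorithm_class can be any TD agent constructor with the (policy, mdp_info, params) signature used in these snippets; QLearning from Example #1 is the only such class shown on this page, so other class names would be assumptions about the library.

# Run the GridWorldVanHasselt experiment with a concrete algorithm class
reward, max_Qs = experiment(QLearning)
print('mean per-step reward:', np.mean(reward))
print('final max Q at the start state:', max_Qs[-1])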
Example #3
def experiment(algorithm_class, decay_exp):
    np.random.seed(3)

    # MDP
    p = np.load('tests/double_chain/chain_structure/p.npy')
    rew = np.load('tests/double_chain/chain_structure/rew.npy')
    mdp = FiniteMDP(p, rew, gamma=.9)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=decay_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = algorithm_class(pi, mdp.info, agent_params)

    # Algorithm
    collect_Q = CollectQ(agent.approximator)
    callbacks = [collect_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    Qs = collect_Q.get_values()

    return Qs
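
A usage sketch for the function above; the algorithm class and the decay exponents are illustrative choices, not values taken from the original test.

# Compare Q-value traces on the double chain for two learning-rate decays
Qs_fast_decay = experiment(QLearning, decay_exp=1.)
Qs_slow_decay = experiment(QLearning, decay_exp=.51)
print(np.asarray(Qs_fast_decay).shape, np.asarray(Qs_slow_decay).shape)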
Example #4
def experiment2():
    np.random.seed(3)
    print('mushroom     :')

    # MDP
    mdp = generate_simple_chain(state_n=5,
                                goal_states=[2],
                                prob=.8,
                                rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)
    dataset = collect_dataset.get()
    return agent.Q.table
Example #5
def experiment_others(alg, decay_exp):
    np.random.seed()

    # MDP

    grid_map = "simple_gridmap.txt"
    mdp = GridWorldGenerator(grid_map=grid_map)

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                      size=mdp.info.size)

    algorithm_params = dict(learning_rate=alpha)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = alg(pi, mdp.info, agent_params)

    # Algorithm
    collect_max_Q = CollectMaxQ(agent.Q,
                                mdp.convert_to_int(mdp._start, mdp._width))
    collect_dataset = CollectDataset()
    callbacks = [collect_max_Q, collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
Example #6
def experiment(boosted):
    np.random.seed(20)

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    if not boosted:
        approximator_params = dict(
            input_shape=mdp.info.observation_space.shape,
            n_actions=mdp.info.action_space.n,
            n_estimators=50,
            min_samples_split=5,
            min_samples_leaf=2)
    else:
        approximator_params = dict(
            input_shape=mdp.info.observation_space.shape,
            n_actions=mdp.info.action_space.n,
            n_models=3,
            prediction='sum',
            n_estimators=50,
            min_samples_split=5,
            min_samples_leaf=2)

    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=3, boosted=boosted, quiet=True)
    fit_params = dict()
    agent_params = {
        'approximator_params': approximator_params,
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = FQI(approximator, pi, mdp.info, agent_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=50, n_episodes_per_fit=50, quiet=True)

    # Test
    test_epsilon = Parameter(0)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((9, 2))
    cont = 0
    for i in range(-8, 9, 8):
        for j in range(-8, 9, 8):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states, quiet=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))
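
A minimal comparison sketch built on the function above (not part of the original source); experiment() already returns the mean discounted return over the fixed grid of initial states.

J_plain = experiment(boosted=False)
J_boosted = experiment(boosted=True)
print('FQI         J:', J_plain)
print('Boosted FQI J:', J_boosted)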
Example #7
def experiment(n_iterations, n_runs, ep_per_run, use_tensorflow):
    np.random.seed()

    # MDP
    mdp = ShipSteering()

    # Policy
    if use_tensorflow:
        tensor_list = gaussian_tensor.generate(
            [3, 3, 6, 2], [[0., 150.], [0., 150.], [-np.pi, np.pi],
                           [-np.pi / 12, np.pi / 12]])

        phi = Features(tensor_list=tensor_list,
                       name='phi',
                       input_dim=mdp.info.observation_space.shape[0])
    else:
        basis = GaussianRBF.generate([3, 3, 6, 2],
                                     [[0., 150.], [0., 150.], [-np.pi, np.pi],
                                      [-np.pi / 12, np.pi / 12]])

        phi = Features(basis_list=basis)

    input_shape = (phi.size, )

    approximator_params = dict(input_dim=phi.size)
    approximator = Regressor(LinearApproximator,
                             input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)
    #sigma = Parameter(value=.05)
    #policy = GaussianPolicy(mu=approximator, sigma=sigma)

    sigma = np.array([[.05]])
    policy = MultivariateGaussianPolicy(mu=approximator, sigma=sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = REINFORCE(policy, mdp.info, agent_params, phi)

    # Train
    core = Core(agent, mdp)
    for i in range(n_runs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))

    np.save('ship_steering.npy', dataset_eval)
Example #8
def experiment1(decay_exp, beta_type):
    np.random.seed()

    # MDP
    p = np.load('p.npy')
    rew = np.load('rew.npy')
    mdp = FiniteMDP(p, rew, gamma=.9)

    # Policy
    epsilon = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1,
                                      decay_exp=decay_exp,
                                      size=mdp.info.size)

    if beta_type == 'Win':
        beta = WindowedVarianceIncreasingParameter(value=1,
                                                   size=mdp.info.size,
                                                   tol=10.,
                                                   window=50)
    else:
        beta = VarianceIncreasingParameter(value=1,
                                           size=mdp.info.size,
                                           tol=10.)

    algorithm_params = dict(learning_rate=alpha, beta=beta, off_policy=True)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = RQLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_q = CollectQ(agent.Q)
    collect_lr_1 = CollectParameters(beta, np.array([0]))
    collect_lr_5 = CollectParameters(beta, np.array([4]))
    callbacks = [collect_q, collect_lr_1, collect_lr_5]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=20000, n_steps_per_fit=1, quiet=True)

    Qs = collect_q.get_values()
    lr_1 = collect_lr_1.get_values()
    lr_5 = collect_lr_5.get_values()

    return Qs, lr_1, lr_5
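
A usage sketch with illustrative arguments (the decay exponent is an assumption, not taken from the original sweep); any beta_type other than 'Win' selects the non-windowed variance-based parameter.

Qs_win, lr1_win, lr5_win = experiment1(decay_exp=.51, beta_type='Win')
Qs_var, lr1_var, lr5_var = experiment1(decay_exp=.51, beta_type='Var')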
Example #9
def experiment(decay_exp, windowed, tol):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1,
                                        decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1,
                                      decay_exp=decay_exp,
                                      size=mdp.info.size)
    if windowed:
        beta = WindowedVarianceIncreasingParameter(value=1,
                                                   size=mdp.info.size,
                                                   tol=tol,
                                                   window=50)
    else:
        beta = VarianceIncreasingParameter(value=1,
                                           size=mdp.info.size,
                                           tol=tol)
    algorithm_params = dict(learning_rate=alpha, beta=beta, off_policy=True)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = RQLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_max_Q = CollectMaxQ(agent.Q,
                                mdp.convert_to_int(mdp._start, mdp._width))
    collect_dataset = CollectDataset()
    callbacks = [collect_max_Q, collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
Example #10
def experiment(alpha):
    gym.logger.setLevel(0)
    np.random.seed(386)

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=10000, gamma=1.)
    mdp.seed(201)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(alpha)
    tilings = Tiles.generate(10, [10, 10], mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate, 'lambda': .9}
    fit_params = dict()
    agent_params = {
        'approximator_params': approximator_params,
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = TrueOnlineSARSALambda(pi, mdp.info, agent_params, features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.array([[0., 0.], [.1, .1]])
    dataset = core.evaluate(initial_states=initial_states, quiet=True)

    return np.mean(compute_J(dataset, 1.))
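
A small learning-rate sweep sketch using the function above; the alpha values are illustrative and not taken from the original script.

for alpha in [.1, .2, .3]:
    J = experiment(alpha)
    print('alpha = {}: J = {}'.format(alpha, J))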
Example #11
def experiment():
    np.random.seed()

    # MDP
    mdp = InvertedPendulum()

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    rbfs = GaussianRBF.generate(10, [10, 10], mdp.info.observation_space.low,
                                mdp.info.observation_space.high)
    features = Features(basis_list=rbfs)

    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    n_iterations = 20  # assumed value; left undefined in the original snippet
    algorithm_params = dict(n_iterations=n_iterations)
    fit_params = dict()
    agent_params = {
        'approximator_params': approximator_params,
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = LSPI(pi, mdp.info, agent_params, features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=20)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    dataset = core.evaluate(n_episodes=20)

    return np.mean(compute_J(dataset, 1.))
Example #12
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(alpha)
    tilings = Tiles.generate(10, [10, 10], mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate, 'lambda': .9}
    fit_params = dict()
    agent_params = {
        'approximator_params': approximator_params,
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = TrueOnlineSARSALambda(pi, mdp.info, agent_params, features)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks=callbacks)

    # Train
    core.learn(n_episodes=20, n_steps_per_fit=1, render=False)

    dataset = collect_dataset.get()
    return np.mean(compute_J(dataset, 1.))
Example #13
def experiment2():
    np.random.seed(3)
    print('mushroom     :')

    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1,
                                        decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1.,
                                              decay_exp=1.,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    # Visualize
    dataset = collect_dataset.get()
    VisualizeControlBlock(dataset)
    return agent.Q.table
Example #14
def experiment(alg):
    gym.logger.setLevel(0)
    np.random.seed(88)
    tf.set_random_seed(88)

    # DQN settings
    initial_replay_size = 500
    max_replay_size = 1000
    train_frequency = 50
    target_update_frequency = 100
    evaluation_frequency = 200
    max_steps = 2000

    # MDP train
    mdp = Atari('BreakoutDeterministic-v4', 84, 84, ends_at_life=True)

    # Policy
    epsilon = LinearDecayParameter(value=1, min_value=.1, n=10)
    epsilon_test = Parameter(value=.05)
    epsilon_random = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Approximator
    input_shape = (84, 84, 4)
    approximator_params = dict(
        input_shape=input_shape,
        output_shape=(mdp.info.action_space.n, ),
        n_actions=mdp.info.action_space.n,
        input_preprocessor=[Scaler(mdp.info.observation_space.high[0, 0])],
        optimizer={
            'name': 'rmsprop',
            'lr': .00025,
            'decay': .95,
            'epsilon': 1e-10
        })

    approximator = ConvNet

    # Agent
    algorithm_params = dict(batch_size=32,
                            initial_replay_size=initial_replay_size,
                            n_approximators=2 if alg == 'adqn' else 1,
                            max_replay_size=max_replay_size,
                            history_length=4,
                            train_frequency=train_frequency,
                            target_update_frequency=target_update_frequency,
                            max_no_op_actions=10,
                            no_op_action_value=0)
    fit_params = dict()
    agent_params = {
        'approximator_params': approximator_params,
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }

    if alg == 'dqn':
        agent = DQN(approximator, pi, mdp.info, agent_params)
    elif alg == 'ddqn':
        agent = DoubleDQN(approximator, pi, mdp.info, agent_params)
    elif alg == 'adqn':
        agent = AveragedDQN(approximator, pi, mdp.info, agent_params)

    # Algorithm
    core = Core(agent, mdp)

    # DQN

    # fill replay memory with random dataset
    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size,
               quiet=True)

    # evaluate initial policy
    pi.set_epsilon(epsilon_test)
    mdp.set_episode_end(ends_at_life=False)
    for n_epoch in range(1, max_steps // evaluation_frequency + 1):
        # learning step
        pi.set_epsilon(epsilon)
        mdp.set_episode_end(ends_at_life=True)
        core.learn(n_steps=evaluation_frequency,
                   n_steps_per_fit=train_frequency,
                   quiet=True)

        # evaluation step
        pi.set_epsilon(epsilon_test)
        mdp.set_episode_end(ends_at_life=False)
    w = agent.approximator.model.get_weights(only_trainable=True)

    return w
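
A hedged driver sketch for the function above, assuming get_weights returns a list of NumPy arrays (which the only_trainable flag suggests); the algorithm names are the ones handled inside experiment().

w_dqn = experiment('dqn')
w_ddqn = experiment('ddqn')
print(len(w_dqn), 'trainable tensors; shapes match:',
      all(a.shape == b.shape for a, b in zip(w_dqn, w_ddqn)))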
Example #15
def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_game = parser.add_argument_group('Game')
    arg_game.add_argument("--name",
                          type=str,
                          default='BreakoutDeterministic-v4',
                          help='Gym ID of the Atari game.')
    arg_game.add_argument("--screen-width",
                          type=int,
                          default=84,
                          help='Width of the game screen.')
    arg_game.add_argument("--screen-height",
                          type=int,
                          default=84,
                          help='Height of the game screen.')

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size",
                         type=int,
                         default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size",
                         type=int,
                         default=500000,
                         help='Max size of the replay memory.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument(
        "--optimizer",
        choices=['adadelta', 'adam', 'rmsprop', 'rmspropcentered'],
        default='adam',
        help='Name of the optimizer to use to learn.')
    arg_net.add_argument("--learning-rate",
                         type=float,
                         default=.00025,
                         help='Learning rate value of the optimizer. Only used '
                         'in rmspropcentered')
    arg_net.add_argument("--decay",
                         type=float,
                         default=.95,
                         help='Discount factor for the history coming from the '
                         'gradient momentum in rmspropcentered')
    arg_net.add_argument("--epsilon",
                         type=float,
                         default=.01,
                         help='Epsilon term used in rmspropcentered')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm",
                         choices=['dqn', 'ddqn', 'wdqn', 'adqn'],
                         default='dqn',
                         help='Name of the algorithm. dqn stands for standard '
                         'DQN, ddqn stands for Double DQN, wdqn '
                         'stands for Weighted DQN and adqn stands for '
                         'Averaged DQN.')
    arg_alg.add_argument("--n-approximators",
                         type=int,
                         default=1,
                         help="Number of approximators used in the ensemble"
                         "for Weighted DQN and Averaged DQN.")
    arg_alg.add_argument("--batch-size",
                         type=int,
                         default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length",
                         type=int,
                         default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency",
                         type=int,
                         default=10000,
                         help='Number of learning steps before each update of '
                         'the target network.')
    arg_alg.add_argument("--evaluation-frequency",
                         type=int,
                         default=250000,
                         help='Number of learning steps before each evaluation. '
                         'This number represents an epoch.')
    arg_alg.add_argument("--train-frequency",
                         type=int,
                         default=4,
                         help='Number of learning steps before each fit of the '
                         'neural network.')
    arg_alg.add_argument("--max-steps",
                         type=int,
                         default=50000000,
                         help='Total number of learning steps.')
    arg_alg.add_argument(
        "--final-exploration-frame",
        type=int,
        default=1000000,
        help='Number of steps until the exploration rate stops '
        'decreasing.')
    arg_alg.add_argument("--initial-exploration-rate",
                         type=float,
                         default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate",
                         type=float,
                         default=.1,
                         help='Final value of the exploration rate. When it '
                         'reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate",
                         type=float,
                         default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples",
                         type=int,
                         default=125000,
                         help='Number of steps for each evaluation.')
    arg_alg.add_argument("--max-no-op-actions",
                         type=int,
                         default=30,
                         help='Maximum number of no-op actions performed at the '
                         'beginning of the episodes. The minimum number is '
                         'history_length.')
    arg_alg.add_argument("--no-op-action-value",
                         type=int,
                         default=0,
                         help='Value of the no-op action.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--load-path',
                           type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--save',
                           action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--render',
                           action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet',
                           action='store_true',
                           help='Flag specifying whether to hide the progress '
                           'bar.')
    arg_utils.add_argument('--debug',
                           action='store_true',
                           help='Flag specifying whether the script has to be '
                           'run in debug mode.')

    args = parser.parse_args()

    scores = list()

    # Evaluation of the model provided by the user.
    if args.load_path:
        # MDP
        mdp = Atari(args.name,
                    args.screen_width,
                    args.screen_height,
                    ends_at_life=True)

        # Policy
        epsilon_test = Parameter(value=args.test_exploration_rate)
        pi = EpsGreedy(epsilon=epsilon_test)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n, ),
            n_actions=mdp.info.action_space.n,
            input_preprocessor=[Scaler(mdp.info.observation_space.high[0, 0])],
            name='test',
            load_path=args.load_path,
            optimizer={
                'name': args.optimizer,
                'lr': args.learning_rate,
                'decay': args.decay,
                'epsilon': args.epsilon
            })

        approximator = ConvNet

        # Agent
        algorithm_params = dict(max_replay_size=0,
                                history_length=args.history_length,
                                max_no_op_actions=args.max_no_op_actions,
                                no_op_action_value=args.no_op_action_value)
        fit_params = dict()
        agent_params = {
            'approximator_params': approximator_params,
            'algorithm_params': algorithm_params,
            'fit_params': fit_params
        }
        agent = DQN(approximator, pi, mdp.info, agent_params)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        pi.set_epsilon(epsilon_test)
        mdp.set_episode_end(ends_at_life=False)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset)
    else:
        # DQN learning run

        # Summary folder
        folder_name = './logs/atari_' + args.algorithm + '_' + args.name +\
            '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

        # Settings
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP
        mdp = Atari(args.name,
                    args.screen_width,
                    args.screen_height,
                    ends_at_life=True)

        # Policy
        epsilon = LinearDecayParameter(value=args.initial_exploration_rate,
                                       min_value=args.final_exploration_rate,
                                       n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n, ),
            n_actions=mdp.info.action_space.n,
            input_preprocessor=[Scaler(mdp.info.observation_space.high[0, 0])],
            folder_name=folder_name,
            optimizer={
                'name': args.optimizer,
                'lr': args.learning_rate,
                'decay': args.decay,
                'epsilon': args.epsilon
            })

        approximator = ConvNet

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            n_approximators=args.n_approximators,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            history_length=args.history_length,
            train_frequency=train_frequency,
            target_update_frequency=target_update_frequency,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value)
        fit_params = dict()
        agent_params = {
            'approximator_params': approximator_params,
            'algorithm_params': algorithm_params,
            'fit_params': fit_params
        }

        if args.algorithm == 'dqn':
            agent = DQN(approximator, pi, mdp.info, agent_params)
        elif args.algorithm == 'ddqn':
            agent = DoubleDQN(approximator, pi, mdp.info, agent_params)
        elif args.algorithm == 'wdqn':
            agent = WeightedDQN(approximator, pi, mdp.info, agent_params)
        elif args.algorithm == 'adqn':
            agent = AveragedDQN(approximator, pi, mdp.info, agent_params)

        # Algorithm
        core = Core(agent, mdp)
        core_test = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size,
                   quiet=args.quiet)

        if args.save:
            agent.approximator.model.save()

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        mdp.set_episode_end(ends_at_life=False)
        if args.algorithm == 'ddqn':
            agent.policy.set_q(agent.target_approximator)
        dataset = core_test.evaluate(n_steps=test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        scores.append(get_stats(dataset))
        if args.algorithm == 'ddqn':
            agent.policy.set_q(agent.approximator)

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            mdp.set_episode_end(ends_at_life=True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency,
                       quiet=args.quiet)

            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            mdp.set_episode_end(ends_at_life=False)
            core_test.reset()
            if args.algorithm == 'ddqn':
                agent.policy.set_q(agent.target_approximator)
            dataset = core_test.evaluate(n_steps=test_samples,
                                         render=args.render,
                                         quiet=args.quiet)
            scores.append(get_stats(dataset))
            if args.algorithm == 'ddqn':
                agent.policy.set_q(agent.approximator)

            np.save(folder_name + '/scores.npy', scores)

    return scores
Example #16
def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_game = parser.add_argument_group('Game')
    arg_game.add_argument("--name",
                          type=str,
                          default='BreakoutNoFrameskip-v4',
                          help='Gym ID of the Atari game.')
    arg_game.add_argument("--screen-width",
                          type=int,
                          default=84,
                          help='Width of the game screen.')
    arg_game.add_argument("--screen-height",
                          type=int,
                          default=84,
                          help='Height of the game screen.')

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size",
                         type=int,
                         default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size",
                         type=int,
                         default=1000000,
                         help='Max size of the replay memory.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument(
        "--optimizer",
        choices=['adadelta', 'adam', 'rmsprop', 'rmspropcentered'],
        default='rmsprop',
        help='Name of the optimizer to use to learn.')
    arg_net.add_argument("--learning-rate",
                         type=float,
                         default=.00025,
                         help='Learning rate value of the optimizer. Only used '
                         'in rmspropcentered')
    arg_net.add_argument("--decay", type=float, default=.95)
    arg_net.add_argument("--epsilon", type=float, default=1e-10)
    arg_net.add_argument("--bootInit",
                         action='store_true',
                         help='Initialize weights as in Bootstrapped DQN')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--weighted", action='store_true')
    arg_alg.add_argument("--double", action='store_true')
    arg_alg.add_argument("--weighted-update", action='store_true')
    arg_alg.add_argument(
        "--n-approximators",
        type=int,
        default=10,
        help="Number of approximators used in the ensemble for"
        "Averaged DQN.")
    arg_alg.add_argument("--loss",
                         choices=[
                             'squared_loss',
                             'huber_loss',
                         ],
                         default='squared_loss',
                         help="Loss functions used in the approximator")
    arg_alg.add_argument(
        "--q-max",
        type=float,
        default=10,
        help='Upper bound for initializing the heads of the network')
    arg_alg.add_argument(
        "--q-min",
        type=float,
        default=-10,
        help='Lower bound for initializing the heads of the network')
    arg_alg.add_argument("--batch-size",
                         type=int,
                         default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length",
                         type=int,
                         default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency",
                         type=int,
                         default=10000,
                         help='Number of learning steps before each update of '
                         'the target network.')
    arg_alg.add_argument("--evaluation-frequency",
                         type=int,
                         default=250000,
                         help='Number of learning steps before each evaluation. '
                         'This number represents an epoch.')
    arg_alg.add_argument("--train-frequency",
                         type=int,
                         default=4,
                         help='Number of learning steps before each fit of the '
                         'neural network.')
    arg_alg.add_argument("--max-steps",
                         type=int,
                         default=50000000,
                         help='Total number of learning steps.')
    arg_alg.add_argument(
        "--final-exploration-frame",
        type=int,
        default=1,
        help='Number of steps until the exploration rate stops '
        'decreasing.')
    arg_alg.add_argument("--initial-exploration-rate",
                         type=float,
                         default=0.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate",
                         type=float,
                         default=0.,
                         help='Final value of the exploration rate. When it '
                         'reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate",
                         type=float,
                         default=.005,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples",
                         type=int,
                         default=125000,
                         help='Number of steps for each evaluation.')
    arg_alg.add_argument("--max-no-op-actions",
                         type=int,
                         default=8,
                         help='Maximum number of no-op actions performed at the '
                         'beginning of the episodes. The minimum number is '
                         'history_length.')
    arg_alg.add_argument("--no-op-action-value",
                         type=int,
                         default=0,
                         help='Value of the no-op action.')
    arg_alg.add_argument("--p-mask", type=float, default=1.)

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--experiment-number',
                           type=int,
                           default=1,
                           help='To differentiate experiment results')
    arg_utils.add_argument('--load-path',
                           type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--save',
                           action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--render',
                           action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet',
                           action='store_true',
                           help='Flag specifying whether to hide the progress '
                           'bar.')
    arg_utils.add_argument('--debug',
                           action='store_true',
                           help='Flag specifying whether the script has to be '
                           'run in debug mode.')

    args = parser.parse_args()

    scores = list()

    # Evaluation of the model provided by the user.
    if args.load_path:
        mdp = Atari(args.name,
                    args.screen_width,
                    args.screen_height,
                    ends_at_life=False)
        print("Evaluation Run")

        # Policy
        epsilon_test = Parameter(value=args.test_exploration_rate)
        pi = VPIPolicy(args.n_approximators, epsilon=epsilon_test)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(input_shape=input_shape,
                                   output_shape=(mdp.info.action_space.n, ),
                                   n_actions=mdp.info.action_space.n,
                                   n_approximators=args.n_approximators,
                                   name='test',
                                   load_path=args.load_path,
                                   q_min=args.q_min,
                                   q_max=args.q_max,
                                   optimizer={
                                       'name': args.optimizer,
                                       'lr': args.learning_rate,
                                       'decay': args.decay,
                                       'epsilon': args.epsilon
                                   },
                                   loss=args.loss)

        approximator = ConvNet

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            initial_replay_size=1,
            max_replay_size=1,
            history_length=args.history_length,
            clip_reward=True,
            train_frequency=args.train_frequency,
            n_approximators=args.n_approximators,
            target_update_frequency=args.target_update_frequency,
            max_no_op_actions=4,
            no_op_action_value=args.no_op_action_value,
            p_mask=args.p_mask,
            dtype=np.uint8,
            weighted_update=args.weighted_update)
        if args.double:
            agent = DoubleDQN(approximator,
                              pi,
                              mdp.info,
                              approximator_params=approximator_params,
                              **algorithm_params)
        else:
            agent = DQN(approximator,
                        pi,
                        mdp.info,
                        approximator_params=approximator_params,
                        **algorithm_params)
        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        pi.set_eval(True)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset)
    else:
        # DQN learning run
        print("Learning Run")
        policy_name = 'weighted' if args.weighted else 'vpi'
        update_rule = 'weighted_update' if args.weighted_update else 'max_mean_update'
        # Summary folder
        folder_name = './logs/{}/{}/{}/{}/{}/{}_particles'.format(
            args.experiment_number, policy_name, update_rule, args.name,
            args.loss, args.n_approximators)

        # Settings
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP
        mdp = Atari(args.name,
                    args.screen_width,
                    args.screen_height,
                    ends_at_life=True)

        # Policy
        epsilon = LinearDecayParameter(value=args.initial_exploration_rate,
                                       min_value=args.final_exploration_rate,
                                       n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1.)

        if not args.weighted:
            pi = VPIPolicy(args.n_approximators, epsilon=epsilon_random)
        else:
            pi = WeightedPolicy(args.n_approximators, epsilon=epsilon_random)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(input_shape=input_shape,
                                   output_shape=(mdp.info.action_space.n, ),
                                   n_actions=mdp.info.action_space.n,
                                   n_approximators=args.n_approximators,
                                   folder_name=folder_name,
                                   q_min=args.q_min,
                                   q_max=args.q_max,
                                   loss=args.loss,
                                   optimizer={
                                       'name': args.optimizer,
                                       'lr': args.learning_rate,
                                       'decay': args.decay,
                                       'epsilon': args.epsilon
                                   })

        approximator = ConvNet

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            history_length=args.history_length,
            clip_reward=True,
            train_frequency=args.train_frequency,
            n_approximators=args.n_approximators,
            target_update_frequency=target_update_frequency,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value,
            p_mask=args.p_mask,
            dtype=np.uint8,
            weighted_update=args.weighted_update)

        if args.double:
            agent = DoubleDQN(approximator,
                              pi,
                              mdp.info,
                              approximator_params=approximator_params,
                              **algorithm_params)
        else:
            agent = DQN(approximator,
                        pi,
                        mdp.info,
                        approximator_params=approximator_params,
                        **algorithm_params)
        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size,
                   quiet=args.quiet)

        if args.save:
            agent.approximator.model.save()

        # Evaluate initial policy
        pi.set_eval(True)
        pi.set_epsilon(epsilon_test)
        mdp.set_episode_end(False)
        dataset = core.evaluate(n_steps=test_samples,
                                render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset))

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            pi.set_eval(False)
            pi.set_epsilon(epsilon)
            mdp.set_episode_end(True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency,
                       quiet=args.quiet)

            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # evaluation step
            pi.set_eval(True)
            pi.set_epsilon(epsilon_test)
            mdp.set_episode_end(False)
            dataset = core.evaluate(n_steps=test_samples,
                                    render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset))

            np.save(folder_name + '/scores.npy', scores)

    return scores
Example #17
def experiment(policy, name, folder_name):
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_mdp = parser.add_argument_group('Environment')
    arg_mdp.add_argument("--horizon", type=int)
    arg_mdp.add_argument("--gamma", type=float)

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size",
                         type=int,
                         default=100,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size",
                         type=int,
                         default=5000,
                         help='Max size of the replay memory.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument("--n-features", type=int, default=80)
    arg_net.add_argument(
        "--optimizer",
        choices=['adadelta', 'adam', 'rmsprop', 'rmspropcentered'],
        default='adam',
        help='Name of the optimizer to use to learn.')
    arg_net.add_argument("--learning-rate",
                         type=float,
                         default=.0001,
                         help='Learning rate value of the optimizer. Only used '
                         'in rmspropcentered')
    arg_net.add_argument("--decay",
                         type=float,
                         default=.95,
                         help='Discount factor for the history coming from the '
                         'gradient momentum in rmspropcentered')
    arg_net.add_argument("--epsilon",
                         type=float,
                         default=.01,
                         help='Epsilon term used in rmspropcentered')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--weighted-update", action='store_true')
    arg_alg.add_argument(
        "--n-approximators",
        type=int,
        default=10,
        help="Number of approximators used in the ensemble for"
        "Averaged DQN.")
    arg_alg.add_argument("--batch-size",
                         type=int,
                         default=100,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length",
                         type=int,
                         default=1,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency",
                         type=int,
                         default=100,
                         help='Number of collected samples before each update '
                         'of the target network.')
    arg_alg.add_argument("--evaluation-frequency",
                         type=int,
                         default=1000,
                         help='Number of learning steps before each evaluation. '
                         'This number represents an epoch.')
    arg_alg.add_argument("--train-frequency",
                         type=int,
                         default=1,
                         help='Number of learning steps before each fit of the '
                         'neural network.')
    arg_alg.add_argument("--max-steps",
                         type=int,
                         default=50000,
                         help='Total number of learning steps.')
    arg_alg.add_argument(
        "--final-exploration-frame",
        type=int,
        default=1,
        help='Number of steps until the exploration rate stops '
        'decreasing.')
    arg_alg.add_argument("--initial-exploration-rate",
                         type=float,
                         default=0.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate",
                         type=float,
                         default=0.,
                         help='Final value of the exploration rate. When it '
                         'reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate",
                         type=float,
                         default=0.,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples",
                         type=int,
                         default=1000,
                         help='Number of steps for each evaluation.')
    arg_alg.add_argument("--max-no-op-actions",
                         type=int,
                         default=0,
                         help='Maximum number of no-op actions performed at the '
                         'beginning of the episodes. The minimum number is '
                         'history_length.')
    arg_alg.add_argument("--no-op-action-value",
                         type=int,
                         default=0,
                         help='Value of the no-op action.')
    arg_alg.add_argument("--p-mask", type=float, default=1.)

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--load-path',
                           type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--save',
                           action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--render',
                           action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet',
                           action='store_true',
                           help='Flag specifying whether to hide the progress '
                           'bar.')
    arg_utils.add_argument('--debug',
                           action='store_true',
                           help='Flag specifying whether the script has to be '
                           'run in debug mode.')

    args = parser.parse_args()

    scores = list()

    # Evaluation of the model provided by the user.
    if args.load_path:
        # MDP
        if name != 'Taxi':
            mdp = Gym(name, args.horizon, args.gamma)
            n_states = None
            gamma_eval = 1.
        else:
            mdp = generate_taxi('../../grid.txt')
            n_states = mdp.info.observation_space.size[0]
            gamma_eval = mdp.info.gamma

        # Policy
        epsilon_test = Parameter(value=args.test_exploration_rate)
        pi = BootPolicy(args.n_approximators, epsilon=epsilon_test)

        # Approximator
        input_shape = mdp.info.observation_space.shape + (
            args.history_length, )
        input_preprocessor = list()
        approximator_params = dict(input_shape=input_shape,
                                   output_shape=(mdp.info.action_space.n, ),
                                   n_states=n_states,
                                   n_actions=mdp.info.action_space.n,
                                   n_features=args.n_features,
                                   n_approximators=args.n_approximators,
                                   input_preprocessor=input_preprocessor,
                                   name='test',
                                   load_path=args.load_path,
                                   optimizer={
                                       'name': args.optimizer,
                                       'lr': args.learning_rate,
                                       'decay': args.decay,
                                       'epsilon': args.epsilon
                                   })

        approximator = SimpleNet

        # Agent
        algorithm_params = dict(batch_size=0,
                                initial_replay_size=0,
                                max_replay_size=0,
                                history_length=1,
                                clip_reward=False,
                                n_approximators=args.n_approximators,
                                train_frequency=1,
                                target_update_frequency=1,
                                max_no_op_actions=args.max_no_op_actions,
                                no_op_action_value=args.no_op_action_value,
                                p_mask=args.p_mask,
                                weighted_update=args.weighted_update)
        agent = DoubleDQN(approximator,
                          pi,
                          mdp.info,
                          approximator_params=approximator_params,
                          **algorithm_params)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        pi.set_eval(True)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset, gamma_eval)
    else:
        # DQN learning run

        # Settings
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP
        if name != 'Taxi':
            mdp = Gym(name, args.horizon, args.gamma)
            n_states = None
            gamma_eval = 1.
        else:
            mdp = generate_taxi('../../grid.txt')
            n_states = mdp.info.observation_space.size[0]
            gamma_eval = mdp.info.gamma

        # Policy
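        # Three exploration schedules: a linearly decaying epsilon for training,
        # a fixed epsilon for evaluation, and a fully random policy (epsilon=1)
        # used while filling the replay memory.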
        epsilon = LinearDecayParameter(value=args.initial_exploration_rate,
                                       min_value=args.final_exploration_rate,
                                       n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1.)

        if policy == 'boot':
            pi = BootPolicy(args.n_approximators, epsilon=epsilon_random)
        elif policy == 'weighted':
            pi = WeightedPolicy(args.n_approximators, epsilon=epsilon_random)
        else:
            raise ValueError

        # Approximator
        input_shape = mdp.info.observation_space.shape + (
            args.history_length, )
        input_preprocessor = list()
        approximator_params = dict(input_shape=input_shape,
                                   output_shape=(mdp.info.action_space.n, ),
                                   n_states=n_states,
                                   n_actions=mdp.info.action_space.n,
                                   n_features=args.n_features,
                                   n_approximators=args.n_approximators,
                                   input_preprocessor=input_preprocessor,
                                   folder_name=folder_name,
                                   optimizer={
                                       'name': args.optimizer,
                                       'lr': args.learning_rate,
                                       'decay': args.decay,
                                       'epsilon': args.epsilon
                                   })

        approximator = SimpleNet

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            history_length=args.history_length,
            clip_reward=False,
            n_approximators=args.n_approximators,
            train_frequency=train_frequency,
            target_update_frequency=target_update_frequency,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value,
            p_mask=args.p_mask,
            weighted_update=args.weighted_update)

        agent = DoubleDQN(approximator,
                          pi,
                          mdp.info,
                          approximator_params=approximator_params,
                          **algorithm_params)

        # Algorithm
        core = Core(agent, mdp)
        core_test = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
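        # With n_steps_per_fit equal to the number of collected steps, the whole
        # initial dataset is gathered before the agent is fit for the first time.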
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size,
                   quiet=args.quiet)

        if args.save:
            agent.approximator.model.save()

        # Evaluate initial policy
        pi.set_eval(True)
        pi.set_epsilon(epsilon_test)
        dataset = core_test.evaluate(n_steps=test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        scores.append(get_stats(dataset, gamma_eval))

        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            pi.set_eval(False)
            pi.set_epsilon(epsilon)
            # learning step
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency,
                       quiet=args.quiet)

            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # evaluation step
            core_test.reset()
            pi.set_eval(True)
            pi.set_epsilon(epsilon_test)
            dataset = core_test.evaluate(n_steps=test_samples,
                                         render=args.render,
                                         quiet=args.quiet)
            scores.append(get_stats(dataset, gamma_eval))

    return scores
Example #18
0
def experiment(args, agent_algorithm):
    np.random.seed()

    scores = list()
    #add timestamp to results
    ts = str(time.time())
    # Evaluation of the model provided by the user.
    if args.load_path and args.evaluation:
        # MDP
        if args.name not in ['Taxi', 'Gridworld']:
            mdp = Gym(args.name, args.horizon, args.gamma)
            n_states = None
            gamma_eval = 1.
        elif args.name == 'Taxi':
            mdp = generate_taxi('../../grid.txt')
            n_states = mdp.info.observation_space.size[0]
            gamma_eval = mdp.info.gamma
        else:
            rew_weights = [args.fast_zone, args.slow_zone, args.goal]
            grid_size = args.grid_size
            env = GridWorld(gamma=args.gamma,
                            rew_weights=rew_weights,
                            shape=(grid_size, grid_size),
                            randomized_initial=args.rand_initial,
                            horizon=args.horizon)
            gamma_eval = args.gamma
            mdp = env.generate_mdp()
            n_states = mdp.info.observation_space.size[0]
        # Policy
        epsilon_test = Parameter(value=args.test_exploration_rate)
        pi = BootPolicy(args.n_approximators, epsilon=epsilon_test)

        # Approximator
        input_shape = mdp.info.observation_space.shape + (1, )
        input_preprocessor = list()
        approximator_params = dict(input_shape=input_shape,
                                   output_shape=(mdp.info.action_space.n, ),
                                   n_states=n_states,
                                   n_actions=mdp.info.action_space.n,
                                   n_features=args.n_features,
                                   n_approximators=args.n_approximators,
                                   input_preprocessor=input_preprocessor,
                                   name='test',
                                   load_path=args.load_path,
                                   net_type=args.net_type,
                                   optimizer={
                                       'name': args.optimizer,
                                       'lr': args.learning_rate,
                                       'lr_sigma': args.learning_rate,
                                       'decay': args.decay,
                                       'epsilon': args.epsilon
                                   })

        approximator = SimpleNet

        # Agent
        algorithm_params = dict(batch_size=0,
                                initial_replay_size=0,
                                max_replay_size=0,
                                clip_reward=False,
                                target_update_frequency=1)
        if args.alg == 'boot':
            algorithm_params['p_mask'] = args.p_mask
            pi = BootPolicy(args.n_approximators, epsilon=epsilon_test)
        elif args.alg == 'gaussian':
            if args.ucb:
                pi = UCBPolicy(delta=args.delta, q_max=1. / (1. - args.gamma))
            else:
                pi = WeightedGaussianPolicy(epsilon=epsilon_test)
        elif args.alg == 'dqn':
            pi = EpsGreedy(epsilon=epsilon_test)
        elif args.alg == 'particle':
            if args.ucb:
                pi = UCBPolicy(delta=args.delta, q_max=1. / (1. - args.gamma))
            else:
                pi = WeightedPolicy(args.n_approximators, epsilon=epsilon_test)

        else:
            raise ValueError("Algorithm uknown")

        if args.alg in ['gaussian', 'particle']:
            algorithm_params['update_type'] = args.update_type
            algorithm_params['delta'] = args.delta
            algorithm_params['store_prob'] = args.store_prob
            if args.clip_target:
                algorithm_params['max_spread'] = args.q_max - args.q_min
            approximator_params['q_min'] = args.q_min
            approximator_params['q_max'] = args.q_max
            approximator_params['loss'] = args.loss
            approximator_params['init_type'] = args.init_type
            approximator_params['sigma_weight'] = args.sigma_weight
        if args.alg in ['particle', 'boot']:
            approximator_params['n_approximators'] = args.n_approximators
            algorithm_params['n_approximators'] = args.n_approximators
        agent = agent_algorithm(approximator,
                                pi,
                                mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        pi.set_eval(True)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset)
    else:
        # DQN learning run
        print("Learning Run")

        # Settings
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP
        if args.name not in ['Taxi', 'Gridworld']:
            mdp = Gym(args.name, args.horizon, args.gamma)
            n_states = None
            gamma_eval = 1.
        elif args.name == 'Taxi':
            mdp = generate_taxi('../../grid.txt')
            n_states = mdp.info.observation_space.size[0]
            gamma_eval = mdp.info.gamma
        else:
            rew_weights = [args.fast_zone, args.slow_zone, args.goal]
            grid_size = args.grid_size
            env = GridWorld(gamma=args.gamma,
                            rew_weights=rew_weights,
                            shape=(grid_size, grid_size),
                            randomized_initial=args.rand_initial,
                            horizon=args.horizon)
            mdp = env.generate_mdp()
            n_states = mdp.info.observation_space.size[0]
            print(mdp.info.gamma)
            gamma_eval = args.gamma
        # Policy
        epsilon = LinearDecayParameter(value=args.initial_exploration_rate,
                                       min_value=args.final_exploration_rate,
                                       n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1.)

        policy_name = 'weighted'
        update_rule = args.update_type + "_update"
        if args.alg == 'boot':
            pi = BootPolicy(args.n_approximators, epsilon=epsilon)
            policy_name = 'boot'
            update_rule = 'boot'
        elif args.alg == 'dqn':
            pi = EpsGreedy(epsilon=epsilon)
            policy_name = 'eps_greedy'
            update_rule = 'td'
        elif args.alg == 'particle':
            if args.ucb:
                policy_name = 'ucb'
                pi = UCBPolicy(delta=args.delta, q_max=1. / (1. - args.gamma))
            else:
                pi = WeightedPolicy(args.n_approximators)
        elif args.alg == 'gaussian':
            if args.ucb:
                policy_name = 'ucb'
                pi = UCBPolicy(delta=args.delta, q_max=1. / (1. - args.gamma))
            else:
                pi = WeightedGaussianPolicy()
        else:
            raise ValueError("Algorithm unknown")
        # Summary folder
        folder_name = os.path.join(
            './logs', args.alg, policy_name, update_rule, args.name, args.loss,
            str(args.n_approximators) + "_particles",
            args.init_type + "_init", str(args.learning_rate), ts)

        # Approximator
        input_shape = mdp.info.observation_space.shape
        input_preprocessor = list()
        approximator_params = dict(input_shape=input_shape,
                                   output_shape=(mdp.info.action_space.n, ),
                                   n_states=n_states,
                                   n_actions=mdp.info.action_space.n,
                                   n_features=args.n_features,
                                   n_approximators=args.n_approximators,
                                   input_preprocessor=input_preprocessor,
                                   folder_name=folder_name,
                                   net_type=args.net_type,
                                   sigma_weight=args.sigma_weight,
                                   optimizer={
                                       'name': args.optimizer,
                                       'lr': args.learning_rate,
                                       'lr_sigma': args.learning_rate,
                                       'decay': args.decay,
                                       'epsilon': args.epsilon
                                   })
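        # When resuming from a checkpoint, reuse its folder, reload the logged
        # scores and shorten the remaining number of learning steps accordingly.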
        if args.load_path:
            ts = os.path.basename(os.path.normpath(args.load_path))
            approximator_params['load_path'] = args.load_path
            approximator_params['folder_name'] = args.load_path
            folder_name = args.load_path
            p = "scores_" + str(ts) + ".npy"
            scores = np.load(p).tolist()
            max_steps = max_steps - evaluation_frequency * len(scores)
        approximator = SimpleNet

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            clip_reward=False,
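            # The target-network update period is expressed in fit calls (one fit
            # every train_frequency environment steps) rather than in raw steps.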
            target_update_frequency=target_update_frequency // train_frequency,
        )
        if args.alg == 'boot':
            algorithm_params['p_mask'] = args.p_mask
        elif args.alg in ['particle', 'gaussian']:
            algorithm_params['update_type'] = args.update_type
            algorithm_params['delta'] = args.delta
            algorithm_params['store_prob'] = args.store_prob
            if args.clip_target:
                algorithm_params['max_spread'] = args.q_max - args.q_min
            approximator_params['q_min'] = args.q_min
            approximator_params['q_max'] = args.q_max
            approximator_params['loss'] = args.loss
            approximator_params['init_type'] = args.init_type

        if args.alg in ['boot', 'particle']:
            approximator_params['n_approximators'] = args.n_approximators
            algorithm_params['n_approximators'] = args.n_approximators

        agent = agent_algorithm(approximator,
                                pi,
                                mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)

        if args.ucb:
            q = agent.approximator
            if args.alg == 'particle':

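                # The UCB policy needs a mean and an upper quantile of Q(s, .):
                # mu averages the particle predictions, while quantile_func picks
                # the first sorted particle whose quantile level is >= 1 - delta.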
                def mu(state):
                    q_list = q.predict(state).squeeze()
                    qs = np.array(q_list)
                    return qs.mean(axis=0)

                quantiles = [
                    i * 1. / (args.n_approximators - 1)
                    for i in range(args.n_approximators)
                ]
                for p in range(args.n_approximators):
                    if quantiles[p] >= 1 - args.delta:
                        delta_index = p
                        break

                def quantile_func(state):
                    q_list = q.predict(state).squeeze()

                    qs = np.sort(np.array(q_list), axis=0)
                    return qs[delta_index, :]

                print("Setting up ucb policy")
                pi.set_mu(mu)
                pi.set_quantile_func(quantile_func)

            if args.alg == 'gaussian':
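                # With the Gaussian parameterisation the network predicts a mean
                # and a sigma per action, so the (1 - delta) quantile is obtained
                # in closed form as mean + sigma * Phi^{-1}(1 - delta).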
                standard_bound = norm.ppf(1 - args.delta, loc=0, scale=1)

                def mu(state):
                    q_and_sigma = q.predict(state).squeeze()
                    means = q_and_sigma[0]
                    return means

                def quantile_func(state):
                    q_and_sigma = q.predict(state).squeeze()
                    means = q_and_sigma[0]
                    sigmas = q_and_sigma[1]
                    return sigmas * standard_bound + means

                print("Setting up ucb policy")
                pi.set_mu(mu)
                pi.set_quantile_func(quantile_func)
        args.count = 100
        if args.plot_qs:
            import matplotlib.pyplot as plt
            colors = ['red', 'blue', 'green']
            labels = ['left', 'nop', 'right']

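            # Live plot of the value distributions: for each action a Gaussian is
            # fitted to the head estimates and its pdf is drawn (action labels are
            # hard-coded for a three-action environment).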
            def plot_probs(qs):
                args.count += 1
                if args.count < 1:
                    return
                ax.clear()
                for i in range(qs.shape[-1]):
                    mu = np.mean(qs[..., i], axis=0)
                    sigma = np.std(qs[..., i], axis=0)
                    x = np.linspace(mu - 3 * sigma, mu + 3 * sigma, 20)
                    ax.plot(x,
                            stats.norm.pdf(x, mu, sigma),
                            label=labels[i],
                            color=colors[i])

                ax.set_xlabel('Q-value')
                ax.set_ylabel('Probability')
                ax.set_title('Q-distributions')
                #ax.set_ylim(bottom=0, top=1)

                plt.draw()
                plt.pause(0.02)
                #print("Plotted")
                args.count = 0
                #return probs

            plt.ion()
            fig, ax = plt.subplots()

            plot_probs(
                np.array(agent.approximator.predict(np.array(mdp.reset()))))

            input()
            args.count = 100
            qs = np.array([
                np.linspace(-1000, 0, 10),
                np.linspace(-2000, -1000, 10),
                np.linspace(-750, -250, 10)
            ])
            plot_probs(qs.T)
        # Algorithm
        core = Core(agent, mdp)
        core_test = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset

        print_epoch(0)
        core.learn(
            n_steps=initial_replay_size,
            n_steps_per_fit=initial_replay_size,
            quiet=args.quiet,
        )

        if args.save:
            agent.approximator.model.save()

        # Evaluate initial policy
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        pi.set_epsilon(epsilon_test)
        dataset = core_test.evaluate(n_steps=test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        scores.append(get_stats(dataset))
        if args.plot_qs:
            pi.set_plotter(plot_probs)
        np.save(folder_name + '/scores_' + str(ts) + '.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            if hasattr(pi, 'set_eval'):
                pi.set_eval(False)

            pi.set_epsilon(epsilon)
            # learning step
            if args.plot_qs:
                pi.set_plotter(None)
            core.learn(
                n_steps=evaluation_frequency,
                n_steps_per_fit=train_frequency,
                quiet=args.quiet,
            )

            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # evaluation step
            if hasattr(pi, 'set_eval'):
                pi.set_eval(True)
            pi.set_epsilon(epsilon_test)
            if args.plot_qs:
                pi.set_plotter(plot_probs)
            dataset = core_test.evaluate(n_steps=test_samples,
                                         render=args.render,
                                         quiet=args.quiet)
            scores.append(get_stats(dataset))
            np.save(folder_name + '/scores_' + str(ts) + '.npy', scores)

    return scores
Example #19
0
File: run.py  Project: czgdp1807/wql
def experiment():
    np.random.seed()
    #tf.set_random_seed()
    # Argument parser
    parser = argparse.ArgumentParser()

    arg_game = parser.add_argument_group('Game')
    arg_game.add_argument("--name",
                          type=str,
                          default='BreakoutNoFrameskip-v4',
                          help='Gym ID of the Atari game.')
    arg_game.add_argument("--screen-width",
                          type=int,
                          default=84,
                          help='Width of the game screen.')
    arg_game.add_argument("--screen-height",
                          type=int,
                          default=84,
                          help='Height of the game screen.')
    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size",
                         type=int,
                         default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size",
                         type=int,
                         default=1000000,
                         help='Max size of the replay memory.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument(
        "--optimizer",
        choices=['adadelta', 'adam', 'rmsprop', 'rmspropcentered'],
        default='rmsprop',
        help='Name of the optimizer used for learning.')
    arg_net.add_argument("--learning-rate",
                         type=float,
                         default=.00025,
                         help='Learning rate value of the optimizer. Only used '
                         'in rmspropcentered.')
    arg_net.add_argument(
        "--lr-sigma",
        type=float,
        default=.1e-6,
        help='Learning rate value of the optimizer for sigma. Only used '
        'in GaussianDQN.')
    arg_net.add_argument("--decay", type=float, default=.95)
    arg_net.add_argument("--epsilon", type=float, default=1e-8)

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--alg",
                         choices=['boot', 'particle', 'gaussian', 'dqn'],
                         default='particle',
                         help='Algorithm to use')
    arg_alg.add_argument("--weighted", action='store_true')
    arg_alg.add_argument("--ucb", action='store_true')
    arg_alg.add_argument("--boot",
                         action='store_true',
                         help="Flag to use BootstrappedDQN.")
    arg_alg.add_argument("--gaussian",
                         action='store_true',
                         help="Flag to use GaussianDQN.")
    arg_alg.add_argument(
        "--double",
        action='store_true',
        help="Flag to use the DoubleDQN version of the algorithm.")
    arg_alg.add_argument(
        "--update-type",
        choices=['mean', 'weighted', 'optimistic'],
        default='mean',
        help='Kind of update to perform (only WQL algorithms).')
    arg_alg.add_argument("--multiple-nets",
                         action='store_true',
                         help="if to use separate nets for every environment")
    arg_alg.add_argument(
        "--n-approximators",
        type=int,
        default=10,
        help="Number of approximators used in the ensemble for"
        "Averaged DQN.")
    arg_alg.add_argument("--loss",
                         choices=['squared_loss', 'huber_loss', 'triple_loss'],
                         default='huber_loss',
                         help="Loss functions used in the approximator")
    arg_alg.add_argument("--delta",
                         type=float,
                         default=0.1,
                         help='Parameter of ucb policy')
    arg_alg.add_argument(
        "--q-max",
        type=float,
        default=100,
        help='Upper bound for initializing the heads of the network')
    arg_alg.add_argument(
        "--q-min",
        type=float,
        default=0,
        help='Lower bound for initializing the heads of the network')
    arg_alg.add_argument("--sigma-weight",
                         type=float,
                         default=1.0,
                         help='Used in gaussian learning to explore more')
    arg_alg.add_argument("--init-type",
                         choices=['boot', 'linspace'],
                         default='linspace',
                         help='Type of initialization for the network')
    arg_alg.add_argument("--batch-size",
                         type=int,
                         default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length",
                         type=int,
                         default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency",
                         type=int,
                         default=10000,
                         help='Number of learning steps before each update of '
                         'the target network.')
    arg_alg.add_argument("--evaluation-frequency",
                         type=int,
                         default=250000,
                         help='Number of learning steps before each evaluation. '
                         'This number represents an epoch.')
    arg_alg.add_argument("--train-frequency",
                         type=int,
                         default=4,
                         help='Number of learning steps before each fit of the '
                         'neural network.')
    arg_alg.add_argument("--max-steps",
                         type=int,
                         default=50000000,
                         help='Total number of learning steps.')
    arg_alg.add_argument(
        "--final-exploration-frame",
        type=int,
        default=1000000,
        help='Number of steps until the exploration rate stops '
        'decreasing.')
    arg_alg.add_argument("--initial-exploration-rate",
                         type=float,
                         default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate",
                         type=float,
                         default=0.05,
                         help='Final value of the exploration rate. When it '
                         'reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate",
                         type=float,
                         default=0.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples",
                         type=int,
                         default=125000,
                         help='Number of steps for each evaluation.')
    arg_alg.add_argument("--max-no-op-actions",
                         type=int,
                         default=30,
                         help='Maximum number of no-op actions performed at '
                         'the beginning of the episodes. The minimum number '
                         'is history_length.')
    arg_alg.add_argument("--p-mask", type=float, default=1.)

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--load-path',
                           type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--save',
                           action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument(
        '--evaluation',
        action='store_true',
        help='Flag specifying whether the model loaded will be evaluated.')
    arg_utils.add_argument('--render',
                           action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet',
                           action='store_true',
                           help='Flag specifying whether to hide the progress '
                           'bar.')
    arg_utils.add_argument('--debug',
                           action='store_true',
                           help='Flag specifying whether the script has to be '
                           'run in debug mode.')
    arg_utils.add_argument("--device",
                           type=int,
                           default=0,
                           help='Index of the GPU.')

    args = parser.parse_args()

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.device)

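    # Deep-learning imports are deferred until after the GPU has been selected,
    # so that the backend only sees the chosen CUDA device.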
    from particle_dqn import ParticleDQN, ParticleDoubleDQN
    from bootstrapped_dqn import BootstrappedDoubleDQN, BootstrappedDQN
    from gaussian_dqn import GaussianDQN
    from dqn import DoubleDQN, DQN
    from mushroom.core.core import Core
    from mushroom.environments import Atari

    from mushroom.utils.dataset import compute_scores
    from mushroom.utils.parameters import LinearDecayParameter, Parameter

    from policy import BootPolicy, WeightedPolicy, WeightedGaussianPolicy, EpsGreedy, UCBPolicy
    if args.alg == 'boot':
        from boot_net import ConvNet
        if args.double:
            agent_algorithm = BootstrappedDoubleDQN
        else:
            agent_algorithm = BootstrappedDQN
    elif args.alg == 'gaussian':
        from gaussian_net import GaussianNet as ConvNet
        agent_algorithm = GaussianDQN
    elif args.alg == 'dqn':
        from dqn_net import ConvNet
        if args.double:
            agent_algorithm = DoubleDQN
        else:
            agent_algorithm = DQN
    else:
        if args.multiple_nets:
            from net_multiple import ConvNet
            print("Using Multiple Nets")
        else:
            from net import ConvNet
        if args.double:
            agent_algorithm = ParticleDoubleDQN
        else:
            agent_algorithm = ParticleDQN

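    # Helper printing min / max / mean reward and the number of completed games
    # for an evaluation dataset.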
    def get_stats(dataset):
        score = compute_scores(dataset)
        print('min_reward: %f, max_reward: %f, mean_reward: %f,'
              ' games_completed: %d' % score)

        return score

    scores = list()
    #add timestamp to results
    ts = str(time.time())
    # Evaluation of the model provided by the user.

    if args.load_path and args.evaluation:
        mdp = Atari(args.name,
                    args.screen_width,
                    args.screen_height,
                    ends_at_life=False,
                    history_length=args.history_length,
                    max_no_op_actions=args.max_no_op_actions)
        print("Evaluation Run")

        # Policy
        epsilon_test = Parameter(value=args.test_exploration_rate)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n, ),
            n_actions=mdp.info.action_space.n,
            name='test',
            load_path=args.load_path,
            optimizer={
                'name': args.optimizer,
                'lr': args.learning_rate,
                'lr_sigma': args.lr_sigma,
                'decay': args.decay,
                'epsilon': args.epsilon
            },
        )

        approximator = ConvNet

        # Agent
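        # The replay memory is reduced to a single transition: in this branch the
        # agent only acts for evaluation and is never trained.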
        algorithm_params = dict(
            batch_size=args.batch_size,
            initial_replay_size=1,
            max_replay_size=1,
            clip_reward=True,
            target_update_frequency=args.target_update_frequency //
            args.train_frequency,
        )
        if args.alg == 'boot':
            algorithm_params['p_mask'] = args.p_mask
            pi = BootPolicy(args.n_approximators, epsilon=epsilon_test)
        elif args.alg == 'gaussian':
            if args.ucb:
                pi = UCBPolicy(delta=args.delta)
            else:
                pi = WeightedGaussianPolicy(epsilon=epsilon_test)
        elif args.alg == 'dqn':
            pi = EpsGreedy(epsilon=epsilon_test)
        elif args.alg == 'particle':
            if args.ucb:
                pi = UCBPolicy(delta=args.delta)
            else:
                pi = WeightedPolicy(args.n_approximators, epsilon=epsilon_test)
        else:
            raise ValueError("Algorithm uknown")

        if args.alg in ['gaussian', 'particle']:
            algorithm_params['update_type'] = args.update_type
            algorithm_params['delta'] = args.delta
            approximator_params['q_min'] = args.q_min
            approximator_params['q_max'] = args.q_max
            approximator_params['loss'] = args.loss
            approximator_params['init_type'] = args.init_type
            approximator_params['sigma_weight'] = args.sigma_weight
        if args.alg in ['particle', 'boot']:
            approximator_params['n_approximators'] = args.n_approximators
            algorithm_params['n_approximators'] = args.n_approximators
        agent = agent_algorithm(approximator,
                                pi,
                                mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)
        #print(agent)
        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        pi.set_eval(True)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset)
    else:
        # DQN learning run
        print("Learning Run")

        # Settings
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP
        mdp = Atari(args.name,
                    args.screen_width,
                    args.screen_height,
                    ends_at_life=True)

        # Policy
        epsilon = LinearDecayParameter(value=args.initial_exploration_rate,
                                       min_value=args.final_exploration_rate,
                                       n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1.)

        policy_name = 'weighted'
        update_rule = args.update_type + "_update"
        if args.alg == 'boot':
            pi = BootPolicy(args.n_approximators, epsilon=epsilon)
            policy_name = 'boot'
            update_rule = 'boot'
        elif args.alg == 'dqn':
            pi = EpsGreedy(epsilon=epsilon)
            policy_name = 'eps_greedy'
            update_rule = 'td'
        elif args.alg == 'particle':
            if args.ucb:
                pi = UCBPolicy(delta=args.delta)
            else:
                pi = WeightedPolicy(args.n_approximators)
        elif args.alg == 'gaussian':
            if args.ucb:
                pi = UCBPolicy(delta=args.delta)
            else:
                pi = WeightedGaussianPolicy()
        else:
            raise ValueError("Algorithm unknown")
        # Summary folder
        folder_name = os.path.join(
            './logs', args.alg, policy_name, update_rule, args.name, args.loss,
            str(args.n_approximators) + "_particles",
            args.init_type + "_init", str(args.learning_rate), ts)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(input_shape=input_shape,
                                   output_shape=(mdp.info.action_space.n, ),
                                   n_actions=mdp.info.action_space.n,
                                   folder_name=folder_name,
                                   sigma_weight=args.sigma_weight,
                                   optimizer={
                                       'name': args.optimizer,
                                       'lr': args.learning_rate,
                                       'decay': args.decay,
                                       'epsilon': args.epsilon,
                                       'lr_sigma': args.lr_sigma
                                   })
        if args.load_path:
            ts = os.path.basename(os.path.normpath(args.load_path))
            approximator_params['load_path'] = args.load_path
            approximator_params['folder_name'] = args.load_path
            folder_name = args.load_path
            p = "scores.npy"
            scores = np.load(p).tolist()
            max_steps = max_steps - evaluation_frequency * len(scores)
        approximator = ConvNet

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            clip_reward=True,
            target_update_frequency=target_update_frequency //
            args.train_frequency)
        if args.alg == 'boot':
            algorithm_params['p_mask'] = args.p_mask
        elif args.alg in ['particle', 'gaussian']:
            algorithm_params['update_type'] = args.update_type
            algorithm_params['delta'] = args.delta
            approximator_params['q_min'] = args.q_min
            approximator_params['q_max'] = args.q_max
            approximator_params['loss'] = args.loss
            approximator_params['init_type'] = args.init_type

        if args.alg in ['boot', 'particle']:
            approximator_params['n_approximators'] = args.n_approximators
            algorithm_params['n_approximators'] = args.n_approximators

        agent = agent_algorithm(approximator,
                                pi,
                                mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)
        if args.ucb:
            q = agent.approximator
            if args.alg == 'particle':

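                # Provide the UCB policy with a mean estimate and an
                # upper-quantile estimate built from the particle heads, as in
                # the previous example.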
                def mu(state):
                    q_list = q.predict(state).squeeze()
                    qs = np.array(q_list)
                    return qs.mean(axis=0)

                quantiles = [
                    i * 1. / (args.n_approximators - 1)
                    for i in range(args.n_approximators)
                ]
                for p in range(args.n_approximators):
                    if quantiles[p] >= 1 - args.delta:
                        delta_index = p
                        break

                def quantile_func(state):
                    q_list = q.predict(state).squeeze()
                    qs = np.array(q_list)
                    return qs[delta_index, :]

                pi.set_mu(mu)
                pi.set_quantile_func(quantile_func)

            if args.alg == 'gaussian':
                raise ValueError("Not implemented")

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset

        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size,
                   quiet=args.quiet)

        if args.save:
            agent.approximator.model.save()

        # Evaluate initial policy
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        pi.set_epsilon(epsilon_test)
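        # During evaluation full games are played: episodes are not truncated at
        # each life loss as they are during training.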
        mdp.set_episode_end(False)
        dataset = core.evaluate(n_steps=test_samples,
                                render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset))

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            if hasattr(pi, 'set_eval'):
                pi.set_eval(False)

            pi.set_epsilon(epsilon)
            mdp.set_episode_end(True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency,
                       quiet=args.quiet)

            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # evaluation step
            if hasattr(pi, 'set_eval'):
                pi.set_eval(True)
            pi.set_epsilon(epsilon_test)
            mdp.set_episode_end(False)
            dataset = core.evaluate(n_steps=test_samples,
                                    render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset))

            np.save(folder_name + '/scores.npy', scores)

    return scores