示例#1
0
def main():  # noqa: D103
    """Build and train a (double-)DQN agent on an Atari environment."""
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o', '--output', default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    args = parser.parse_args()

    # Environment and its discrete action count.
    env = gym.make(str(args.env))
    num_actions = env.action_space.n

    # Agent hyper-parameters (values unchanged from the original setup).
    frames_per_state = 4
    input_shape = (84, 84)
    gamma = .99
    num_iterations = 5000000
    target_update_freq = 10000
    num_burn_in = 32
    train_freq = 0
    batch_size = 32
    replay_mem_size = 1000000
    replay_start_size = 50000
    max_episode_len = 10000
    held_out_states_size = 1000
    is_double_q = True

    # Online and target Q-networks share the same architecture.
    model = create_model(frames_per_state, input_shape, num_actions,
                         model_name='linear q_network')
    plot_model(model, to_file='model.png')
    target = create_model(frames_per_state, input_shape, num_actions,
                          model_name='linear q_network target')

    preprocessor = HistoryPreprocessor(frames_per_state - 1)
    memory = ReplayMemory(replay_mem_size, frames_per_state)
    held_out_states = ReplayMemory(held_out_states_size, frames_per_state)
    policy = LinearDecayGreedyEpsilonPolicy(1, .05, int(1e6))

    agent = DQNAgent(model, target, preprocessor, memory, policy,
                     held_out_states, held_out_states_size, gamma,
                     target_update_freq, num_burn_in, train_freq, batch_size,
                     replay_start_size, num_actions, is_double_q)

    # Compile with Adam + mean Huber loss, then train.
    agent.compile(Adam(lr=0.0001), mean_huber_loss)
    agent.fit(env, num_iterations, max_episode_len)
示例#2
0
def main():
    """Parse CLI options, assemble a DQN agent, and train it on Atari."""
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='SpaceInvaders-v0',
                        help='Atari env name')
    parser.add_argument('-o', '--output', default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--model', default=1, type=int, help='model')
    parser.add_argument('--double', action='store_true')
    args = parser.parse_args()

    print('Using Tensorflow Version of ' + tf.__version__)

    args.output = get_output_folder(args.output, args.env)
    print("Output Folder: " + args.output)

    # Bind a fresh TF session to Keras before building any model.
    sess = tf.Session()
    K.set_session(sess)

    env = gym.make(args.env)
    n_actions = env.action_space.n

    # args.model selects the architecture: 0 linear; 1 deep; 2 dueling.
    q_network = create_model(WINDOW, INPUT_SHAPE, n_actions, args.model)

    # Per-frame preprocessing followed by frame-history stacking.
    preprocessor = PreprocessorSequence([
        AtariPreprocessor(INPUT_SHAPE),
        HistoryPreprocessor(HIST_LENGTH),
    ])
    memory = ReplayMemory(MAX_MEMORY, WINDOW)
    policy = LinearDecayGreedyEpsilonPolicy(START_EPSILON, END_EPSILON,
                                            NUM_STEPS)

    agent = DQNAgent(q_network, n_actions, preprocessor, memory, policy,
                     GAMMA, TARGET_UPDATE_FREQ, INIT_MEMORY, TRAIN_FREQ,
                     BATCH_SIZE, double=args.double)

    # Compile with Adam + mean Huber loss, then train.
    agent.compile(Adam(lr=LEARNING_RATE, epsilon=MIN_SQ_GRAD),
                  mean_huber_loss)
    agent.fit(env, NUM_ITERATIONS, MAX_EPISODE_LENGTH)
示例#3
0
def main():  # noqa: D103
    """Train a linear Q-network with DQN on an Atari environment."""
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='SpaceInvaders-v0',
                        help='Atari env name')
    parser.add_argument('-o', '--output', default='linearQ',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=703, type=int, help='Random seed')

    args = parser.parse_args()
    args.output = get_output_folder(args.output, args.env)
    os.makedirs(args.output, exist_ok=True)

    # Make the environment and a Q-network sized to its action space.
    env = gym.make(args.env)
    num_actions = env.action_space.n
    q_net = create_model(4, (84, 84), num_actions, model_name='Linear_Q_Net')

    # Frame preprocessor, replay memory, and a linearly-decayed
    # epsilon-greedy policy wrapping a fixed-epsilon greedy policy.
    atari_preprocessor = tfrl.preprocessors.AtariPreprocessor((84, 84))
    replay_memory = tfrl.core.ReplayMemory(1000000, 4)
    _policy = tfrl.policy.GreedyEpsilonPolicy(0.05, num_actions)
    policy = tfrl.policy.LinearDecayGreedyEpsilonPolicy(_policy, 1, 0.1,
                                                        1000000)

    # BUG FIX: the instance was previously named `DQNAgent`, shadowing the
    # tfrl.dqn.DQNAgent class name; renamed to `agent` (local name only).
    agent = tfrl.dqn.DQNAgent(q_net, atari_preprocessor, replay_memory,
                              policy, gamma=0.99, target_update_freq=10000,
                              num_burn_in=100000, train_freq=4,
                              batch_size=32, window_size=4)

    # Compile the underlying Keras model, then train.
    adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    q_net.compile(optimizer=adam, loss=mean_huber_loss)
    agent.fit(env, args.env, args.output, 5000000, 100000)
示例#4
0
def main():  # noqa: D103
    """Train a DQN (online + target network) as specified by the config."""
    config = get_config(True)
    env = gym.make(config.env_name)

    # Online and target networks share architecture, optimizer, and loss.
    q = create_model(4, (84, 84), env.action_space.n,
                     model_name=config.modelname)
    q_target = create_model(4, (84, 84), env.action_space.n,
                            model_name=config.modelname)
    huber_loss = tfrl.objectives.mean_huber_loss
    adam = Adam(lr=config.learning_rate)
    q.compile(adam, huber_loss, metrics=['accuracy'])
    q_target.compile(adam, huber_loss, metrics=['accuracy'])

    policy = LinearDecayGreedyEpsilonPolicy(
        0.9, 0.05, config.iteration_num / 50)  # Deprecated

    # Start fresh CSV logs for the loss and reward curves.
    with open(config.losslog, "w") as log:
        log.write("Iteraton,Loss,Accuarcy\n")
    with open(config.rewardlog, "w") as log:
        log.write("Iteraton,reward\n")

    Agent = DQNAgent(q, q_target, policy, config.gamma, config.num_burn_in,
                     config.train_freq, config.batch_size, config)

    mse_loss, mae_metric, q, q_target = Agent.fit(env, config.iteration_num, 0)

    # Timestamp the final weight file so runs don't overwrite each other.
    stamp = datetime.datetime.strftime(datetime.datetime.now(),
                                       "%y-%m-%d_%H-%M")
    q.save_weights(str(config.modelname) + '_' + stamp + '_final_weights.h5')
示例#5
0
def main():  # noqa: D103
    """Train a linear Q-network agent (no replay memory) on Atari."""
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o', '--output', default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    args = parser.parse_args()

    # Environment and its discrete action count.
    env = gym.make(str(args.env))
    num_actions = env.action_space.n

    # Agent hyper-parameters; several are zeroed because this variant
    # trains without burn-in, target updates, or batched replay.
    frames_per_state = 4
    input_shape = (84, 84)
    gamma = .99
    num_iterations = 500000
    target_update_freq = 0
    num_burn_in = 0
    train_freq = 0
    batch_size = 0

    model = create_model(frames_per_state, input_shape, num_actions,
                         model_name='linear q_network')
    preprocessor = HistoryPreprocessor(frames_per_state - 1)
    memory = None  # no experience replay in this setup
    policy = LinearDecayGreedyEpsilonPolicy(1, .05, 10e6)
    agent = DQNAgent(model, preprocessor, memory, policy, gamma,
                     target_update_freq, num_burn_in, train_freq, batch_size)

    # Compile with Adam + MSE loss, then train.
    agent.compile(Adam(lr=0.0001), losses.mean_squared_error)
    agent.fit(env, num_iterations)
示例#6
0
def main():  # noqa: D103
    """Train a vanilla or double DQN agent on an Atari environment."""
    parser = argparse.ArgumentParser(
        description='Run DQN on Atari SpaceInvaders')
    parser.add_argument('--env',
                        default='SpaceInvaders-v0',
                        help='Atari env name')
    parser.add_argument('--mode',
                        default='vanilla',
                        type=str,
                        help='vanilla or double dqn')

    args = parser.parse_args()
    # Python 2 print statement (this script predates print()).
    print " MODE IS", args.mode

    # How often (in iterations) to record video / run evaluation.
    video_every_nth = 50000
    eval_every_nth = 50000

    # Expand short aliases into the full gym environment ids; enduro also
    # (redundantly) resets the video/eval intervals to the same defaults.
    if args.env == "breakout":
        args.env = 'Breakout-v0'
        video_every_nth = 50000
    if args.env == "space_invaders":
        args.env = 'SpaceInvaders-v0'
    if args.env == 'enduro':
        args.env = 'Enduro-v0'
        video_every_nth = 50000
        eval_every_nth = 50000

    # The agent owns environment construction here; `mode` selects
    # vanilla vs. double DQN.
    agent = DQNAgent(env=args.env,
                     gamma=0.99,
                     target_update_freq=10000,
                     num_burn_in=50000,
                     train_freq=4,
                     batch_size=32,
                     mode=args.mode)
    agent.fit(num_iterations=int(5e6),
              max_episode_length=100000,
              save_model_every_nth=10000,
              eval_every_nth=eval_every_nth,
              log_loss_every_nth=1000,
              video_every_nth=video_every_nth)
示例#7
0
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description='Run DQN on Atari Game')
    parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name', required=True)
    parser.add_argument(
        '-o', '--output', default='atari-v0', help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')

    args = parser.parse_args()
    print 'Using Tensorflow Version of ' + tf.__version__
    #args.input_shape = tuple(args.input_shape)

    args.output = get_output_folder(args.output, args.env)
    print "Output Folder: " + args.output

    # here is where you should start up a session,
    # create your DQN agent, create your model, etc.
    # then you can run your fit method.

    sess = tf.Session()
    K.set_session(sess)

    env = gym.make(args.env)
    #env = gym.wrappers.Monitor(env, args.output + '/gym')
    num_actions = env.action_space.n
    # 0 linear; 1 deep; 2 dueling
    model = create_model(WINDOW, INPUT_SHAPE, num_actions, 1)
    atari_preprocessor = AtariPreprocessor(INPUT_SHAPE)
    history_preprocessor = HistoryPreprocessor(4)
    preprocessor = PreprocessorSequence([atari_preprocessor, history_preprocessor])
    memory = ReplayMemory(MAX_MEMORY, WINDOW)
    policy = GreedyEpsilonPolicy(0.05)

    dqn_agent = DQNAgent(model, num_actions, preprocessor, memory, policy, GAMMA,
                         TARGET_UPDATE_FREQ, INIT_MEMORY, TRAIN_FREQ, BATCH_SIZE, double=False)

    optimizer = Adam(lr=0.00025, epsilon=10-3)
    loss_func = mean_huber_loss
    dqn_agent.compile(optimizer, loss_func)
    #dqn_agent.calc_q_values(state)
    dqn_agent.fit(env, 100000,  MAX_EPISODE_LENGTH)
示例#8
0
def main():  # noqa: D103
    """Seed everything, wrap the Atari env DeepMind-style, and train DQN."""
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env',
                        default='SpaceInvaders-v0',
                        help='Atari env name')
    parser.add_argument('--seed', default=23333, type=int, help='Random seed')
    parser.add_argument('--memory_size',
                        default=1000000,
                        type=int,
                        help='memory_size')
    args = parser.parse_args()

    seed_all(args.seed)

    # Count actions before wrapping (n = 6 for SpaceInvaders-v0).
    env = gym.make(args.env)
    n_actions = env.action_space.n
    # Standard DeepMind preprocessing: episodic life, reward clipping,
    # 4-frame stacking; pixel scaling left off.  # todo
    env = wrap_deepmind(env, episode_life=True, clip_rewards=True,
                        frame_stack=True, scale=False)

    model = Model(in_channels=4, n_actions=n_actions)
    memory = ReplayMemory(max_size=args.memory_size)
    policy = LinearDecayGreedyEpsilonPolicy(n_actions=n_actions,
                                            start_value=1,
                                            end_value=0.1,
                                            num_steps=1000000)
    agent = DQNAgent(q_network=model,
                     memory=memory,
                     policy=policy,
                     gamma=0.99,
                     target_update_freq=1000,
                     num_burn_in=200000,
                     train_freq=32,
                     batch_size=256)
    agent.fit(env, num_iterations=100000, max_episode_length=10000)
示例#9
0
def main():
    """Train a Q-network selected from the command line.

    Usage: ``script.py [model_name] [env_name]`` — both arguments optional.
    """
    # Model name defaults to "q2"; argv[1] overrides it.
    model_name = "q2"
    if len(sys.argv) >= 2:
        model_name = sys.argv[1]

    # Environment defaults to SpaceInvaders; argv[2] overrides it.
    if len(sys.argv) >= 3:
        env = gym.make(sys.argv[2])
    else:
        env = gym.make("SpaceInvaders-v0")

    # Disable frame skipping so every frame is observed.
    env.frameskip = 1

    input_shape = (84, 84)
    batch_size = 1
    num_actions = env.action_space.n
    # 2 slots: the memory only needs the current and the next state, and
    # sampling will always just pick the earlier one.
    memory_size = 2
    memory_burn_in_num = 1  # kept for the (disabled) burn-in step below
    start_epsilon = 1
    end_epsilon = 0.01
    decay_steps = 1000000
    target_update_freq = 1  # effectively no separate target network
    train_freq = 4  # how often the network is trained
    history_size = 4

    # Preprocessors are applied from left to right.
    history_prep = HistoryPreprocessor(history_size)
    atari_prep = AtariPreprocessor(input_shape, 0, 999)
    numpy_prep = NumpyPreprocessor()
    preprocessors = PreprocessorSequence(
        [atari_prep, history_prep, numpy_prep])

    policy = LinearDecayGreedyEpsilonPolicy(start_epsilon, end_epsilon,
                                            decay_steps)

    linear_model = create_model(history_size, input_shape, num_actions,
                                model_name)
    optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08,
                     decay=0.0)
    loss_func = huber_loss
    linear_model.summary()
    # Random policy for memory burn-in (the burn-in call itself is disabled).
    random_policy = UniformRandomPolicy(num_actions)
    memory = ActionReplayMemory(memory_size, history_size)

    agent = DQNAgent(linear_model, preprocessors, memory, policy, 0.99,
                     target_update_freq, None, train_freq, batch_size)
    agent.compile(optimizer, loss_func)
    agent.save_models()
    agent.fit(env, 1000000, 100000)
示例#10
0
def main():
    """Build a DQN / double-DQN / dueling agent for Atari, then train or test.

    All hyper-parameters are attached to ``args`` so one namespace carries
    the full experiment configuration.
    """
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o',
                        '--output',
                        default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--mode', choices=['train', 'test'], default='test')
    parser.add_argument('--network',
                        choices=['deep', 'linear'],
                        default='deep')
    parser.add_argument('--method',
                        choices=['dqn', 'double', 'dueling'],
                        default='dqn')
    # BUG FIX: argparse `type=bool` treats every non-empty string as True
    # (bool('False') is True), so `--monitor False` could never disable
    # monitoring.  Parse the usual false spellings; the default is unchanged.
    parser.add_argument('--monitor',
                        type=lambda s: s.strip().lower() not in
                        ('false', '0', 'no'),
                        default=True)
    parser.add_argument('--iter', type=int, default=2400000)
    parser.add_argument('--test_policy',
                        choices=['Greedy', 'GreedyEpsilon'],
                        default='GreedyEpsilon')

    args = parser.parse_args()
    # The CLI --seed is intentionally overridden with a random draw.
    args.seed = np.random.randint(0, 1000000, 1)[0]
    args.weights = 'models/dqn_{}_weights_{}_{}_{}.h5f'.format(
        args.env, args.method, args.network, args.iter)
    args.monitor_path = 'tmp/dqn_{}_weights_{}_{}_{}_{}'.format(
        args.env, args.method, args.network, args.iter, args.test_policy)
    if args.mode == 'train':
        args.monitor = False

    env = gym.make(args.env)
    if args.monitor:
        env = wrappers.Monitor(env, args.monitor_path)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Core training hyper-parameters.
    args.gamma = 0.99
    args.learning_rate = 0.0001
    args.epsilon = 0.05
    args.num_iterations = 5000000
    args.batch_size = 32

    args.window_length = 4
    args.num_burn_in = 50000
    args.target_update_freq = 10000
    args.log_interval = 10000
    args.model_checkpoint_interval = 10000
    args.train_freq = 4

    args.num_actions = env.action_space.n
    args.input_shape = (84, 84)
    args.memory_max_size = 1000000

    args.output = get_output_folder(args.output, args.env)

    # Map --method onto the two agent feature flags.
    args.suffix = args.method + '_' + args.network
    if (args.method == 'dqn'):
        args.enable_double_dqn = False
        args.enable_dueling_network = False
    elif (args.method == 'double'):
        args.enable_double_dqn = True
        args.enable_dueling_network = False
    elif (args.method == 'dueling'):
        args.enable_double_dqn = False
        args.enable_dueling_network = True
    else:
        # Unreachable: argparse `choices` already restricts --method.
        print('Attention! Method Wrong!!!')  # BUG FIX: was misspelled "Worng"

    if args.test_policy == 'Greedy':
        test_policy = GreedyPolicy()
    elif args.test_policy == 'GreedyEpsilon':
        test_policy = GreedyEpsilonPolicy(args.epsilon)

    print(args)

    K.tensorflow_backend.set_session(get_session())
    model = create_model(args.window_length, args.input_shape,
                         args.num_actions, args.network)

    # The AtariPreprocessor processes only the current frame; the sequence
    # preprocessor builds the state by concatenating 3 previous frames from
    # HistoryPreprocessor with the current processed frame.
    Processor = {}
    Processor['Atari'] = AtariPreprocessor(args.input_shape)
    Processor['History'] = HistoryPreprocessor(args.window_length)
    ProcessorSequence = PreprocessorSequence(Processor)  # construct 84x84x4

    # Replay memory for all experience collected during training.
    memory = ReplayMemory(max_size=args.memory_max_size,
                          input_shape=args.input_shape,
                          window_length=args.window_length)

    # Linear-decay greedy-epsilon policy: epsilon 1 -> 0.1 over the first 1M
    # iterations, then fixed at 0.1 for the rest of training.
    policy = LinearDecayGreedyEpsilonPolicy(GreedyEpsilonPolicy(args.epsilon),
                                            attr_name='eps',
                                            start_value=1,
                                            end_value=0.1,
                                            num_steps=1000000)

    # Agent: discount 0.99, batch size 32, update every 4 iterations; the
    # first 50000 iterations only fill the memory without updating the model.
    dqn = DQNAgent(q_network=model,
                   policy=policy,
                   memory=memory,
                   num_actions=args.num_actions,
                   test_policy=test_policy,
                   preprocessor=ProcessorSequence,
                   gamma=args.gamma,
                   target_update_freq=args.target_update_freq,
                   num_burn_in=args.num_burn_in,
                   train_freq=args.train_freq,
                   batch_size=args.batch_size,
                   enable_double_dqn=args.enable_double_dqn,
                   enable_dueling_network=args.enable_dueling_network)

    adam = Adam(lr=args.learning_rate)
    dqn.compile(optimizer=adam)

    if args.mode == 'train':
        weights_filename = 'dqn_{}_weights_{}.h5f'.format(
            args.env, args.suffix)
        checkpoint_weights_filename = 'dqn_' + args.env + '_weights_' + args.suffix + '_{step}.h5f'
        log_filename = 'dqn_{}_log_{}.json'.format(args.env, args.suffix)
        log_dir = '../tensorboard_{}_log_{}'.format(args.env, args.suffix)
        callbacks = [
            ModelIntervalCheckpoint(checkpoint_weights_filename,
                                    interval=args.model_checkpoint_interval)
        ]
        callbacks += [FileLogger(log_filename, interval=100)]
        callbacks += [
            TensorboardStepVisualization(log_dir=log_dir,
                                         histogram_freq=1,
                                         write_graph=True,
                                         write_images=True)
        ]

        # Start training.  No explicit action repetition: the game itself
        # randomly skips frames.
        dqn.fit(env,
                callbacks=callbacks,
                verbose=1,
                num_iterations=args.num_iterations,
                action_repetition=1,
                log_interval=args.log_interval,
                visualize=True)

        dqn.save_weights(weights_filename, overwrite=True)
        dqn.evaluate(env,
                     num_episodes=10,
                     visualize=True,
                     num_burn_in=5,
                     action_repetition=1)
    elif args.mode == 'test':
        weights_filename = 'dqn_{}_weights_{}.h5f'.format(
            args.env, args.suffix)
        if args.weights:
            weights_filename = args.weights
        dqn.load_weights(weights_filename)
        dqn.evaluate(env,
                     num_episodes=250,
                     visualize=True,
                     num_burn_in=5,
                     action_repetition=1)

        # Upload the recorded result to the OpenAI gym scoreboard.
        if args.monitor:
            env.close()
            gym.upload(args.monitor_path, api_key='sk_J62obX9PQg2ExrM6H9rvzQ')
示例#11
0
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('-e',
                        '--env',
                        default='Enduro-v0',
                        help='Atari env name')
    parser.add_argument('-o',
                        '--output',
                        default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('-n', '--network', default='dqn', help='Network Type')

    args = parser.parse_args()

    print args

    # define params
    gamma = 0.99
    target_update_freq = 10000
    num_burn_in = 50000
    train_freq = 4
    batch_size = 32
    hist_length = 4
    memory_size = 1000000
    num_iterations = 5000000
    params = {
        'action_update_freq': 1,
        'epsilon': 0.05,
        'eps_start': 1.0,
        'eps_end': 0.1,
        'eps_num_steps': 1000000,
        'disp_loss_freq': 4000,
        'eval_freq': 10000,
        'weight_save_freq': 50000,
        'eval_episodes': 20,
        'print_freq': 100,
    }

    # create environment
    env = gym.make(args.env)
    env_test = gym.make(args.env)
    num_actions = env.action_space.n

    #create Tensor Flow Session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)

    # set up preprocessors
    atari_preprocessor = AtariPreprocessor((84, 84))
    hist_preprocessor = HistoryPreprocessor(hist_length)
    preprocessor = PreprocessorSequence(
        (atari_preprocessor, hist_preprocessor))

    test_atari_preprocessor = AtariPreprocessor((84, 84))
    test_hist_preprocessor = HistoryPreprocessor(hist_length)
    test_preprocessor = PreprocessorSequence(
        (test_atari_preprocessor, test_hist_preprocessor))
    print("Set up preprocessors")

    # set up replay memory
    memory = ReplayMemory(memory_size, memory_size)
    print("Set up memory")

    # get model and set up agent
    if args.network == 'dqn':
        q_network = create_model_dqn(hist_length, (84, 84), num_actions)
        agent = DQNAgent(q_network, preprocessor, test_preprocessor, memory,
                         gamma, target_update_freq, num_burn_in, train_freq,
                         batch_size, params)
    elif args.network == 'ddqn':
        q_network = create_model_dqn(hist_length, (84, 84), num_actions)
        agent = DoubleDQNAgent(q_network, preprocessor, test_preprocessor,
                               memory, gamma, target_update_freq, num_burn_in,
                               train_freq, batch_size, params)
    elif args.network == 'duel':
        q_network = create_model_dueling(hist_length, (84, 84), num_actions)
        agent = DQNAgent(q_network, preprocessor, test_preprocessor, memory,
                         gamma, target_update_freq, num_burn_in, train_freq,
                         batch_size, params)
    elif args.network == 'linear_naive':
        params['use_replay'] = False
        params['use_target'] = False
        q_network = create_model_linear(hist_length, (84, 84), num_actions)

        # set params for no replay and no target
        memory.resize(1)
        num_burn_in = 0

        agent = LinearDQNAgent(q_network, preprocessor, test_preprocessor,
                               memory, gamma, target_update_freq, num_burn_in,
                               train_freq, batch_size, params)
    elif args.network == 'linear_soph':
        params['use_replay'] = True
        params['use_target'] = True
        q_network = create_model_linear(hist_length, (84, 84), num_actions)

        agent = LinearDQNAgent(q_network, preprocessor, test_preprocessor,
                               memory, gamma, target_update_freq, num_burn_in,
                               train_freq, batch_size, params)
    elif args.network == 'linear_double':
        q_network = create_model_linear(hist_length, (84, 84), num_actions)

        agent = DoubleDQNAgent(q_network, preprocessor, test_preprocessor,
                               memory, gamma, target_update_freq, num_burn_in,
                               train_freq, batch_size, params)

    # Compile model in agent
    adam = Adam(lr=1e-4)
    agent.compile(adam, mean_huber_loss, args.output)
    print("Set up agent.")

    # fit model
    print("Fitting Model.")
    agent.fit(env, env_test, num_iterations, args.output, 1e4)
def main():  # noqa: D103
    """Train a linear Q-network DQN agent on an Atari environment."""
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env',
                        default='SpaceInvaders-v0',
                        help='Atari env name')
    parser.add_argument('-o',
                        '--output',
                        default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    args = parser.parse_args()

    model_name = 'linear'
    env = gym.make(args.env)

    # Training schedule.
    num_iter = 2000000
    max_epi_iter = 1000

    # Agent hyper-parameters.
    epsilon = 0.4
    window = 4
    gamma = 0.99
    target_update_freq = 5000
    train_freq = 1
    batch_size = 32
    num_burn_in = 5000
    num_actions = 3  # hard-coded instead of env.action_space.n
    state_size = (84, 84, 1)
    new_size = state_size
    max_size = 1000000

    # Adam optimizer settings.
    lr = 0.00020
    beta_1 = 0.9
    beta_2 = 0.999
    epsilon2 = 1e-08
    decay = 0.0

    # The agent receives a dict of policies and selects one per phase.
    policy = {
        'u_policy': UniformRandomPolicy(num_actions),
        'ge_policy': GreedyEpsilonPolicy(epsilon),
        'g_policy': GreedyPolicy(),
    }
    preprocessor = AtariPreprocessor(new_size)
    memory = SequentialMemory(max_size=max_size, window_length=window)

    model = create_model(window, state_size, num_actions)
    print(model.summary())
    dqnA = DQNAgent(q_network=model,
                    preprocessor=preprocessor,
                    memory=memory,
                    policy=policy,
                    gamma=gamma,
                    target_update_freq=target_update_freq,
                    num_burn_in=num_burn_in,
                    train_freq=train_freq,
                    batch_size=batch_size,
                    model_name=model_name)

    optimizer = Adam(lr=lr, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon2,
                     decay=decay)
    dqnA.compile(optimizer, huber_loss)

    dqnA.fit(env, num_iterations=num_iter, max_episode_length=max_epi_iter)
示例#13
0
def main():  # noqa: D103
    """Parse command-line flags, build a DQN agent, then train or evaluate it."""
    parser = argparse.ArgumentParser(
        description='Run DQN on Atari environment')
    parser.add_argument('--env',
                        default='SpaceInvaders-v0',
                        help='Atari env name')
    parser.add_argument('-o',
                        '--output',
                        default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--iters',
                        default=5000000,
                        type=int,
                        help='Number of interactions with environment')
    parser.add_argument('--mb_size',
                        default=32,
                        type=int,
                        help='Minibatch size')
    parser.add_argument('--max_episode_len',
                        default=2000,
                        type=int,
                        help='Maximum length of episode')
    parser.add_argument('--frame_count',
                        default=4,
                        type=int,
                        help='Number of frames to feed to Q-network')
    parser.add_argument('--eps',
                        default=0.05,
                        type=float,
                        help='Epsilon value for epsilon-greedy exploration')
    parser.add_argument('--learning_rate',
                        default=0.0001,
                        type=float,
                        help='Learning rate for training')
    parser.add_argument('--discount',
                        default=0.99,
                        type=float,
                        help='Discounting factor')
    parser.add_argument('--replay_mem_size',
                        default=500000,
                        type=int,
                        help='Maximum size of replay memory')
    parser.add_argument('--train_freq',
                        default=3,
                        type=int,
                        help='Frequency of updating Q-network')
    parser.add_argument('--target_update_freq',
                        default=10000,
                        type=int,
                        help='Frequency of updating target network')
    parser.add_argument(
        '--eval',
        action='store_true',
        help='Indicator to evaluate model on given environment')
    parser.add_argument(
        '--filename',
        type=str,
        help='Filename for saved model to load during evaluation')
    parser.add_argument(
        '--model_type',
        type=str,
        help=
        'Type of model to use: naive, linear, deep, linear_double, deep_double, dueling'
    )
    parser.add_argument(
        '--initial_replay_size',
        default=50000,
        type=int,
        help=
        'Initial size of the replay memory upto which a uniform random policy should be used'
    )
    parser.add_argument('--evaluate_every',
                        default=5000,
                        type=int,
                        help='Number of updates to run evaluation after')

    args = parser.parse_args()

    # Resolve a fresh, run-specific output directory.
    args.output = get_output_folder(args.output, args.env)

    # Create environment
    env = gym.make(args.env)
    env.reset()

    # Create model; frames are downsampled to 84x84 before stacking.
    preprocessed_input_shape = (84, 84)
    model = create_model(args.frame_count, preprocessed_input_shape,
                         env.action_space.n, args.env + "-test",
                         args.model_type)

    # Initialize replay memory
    replay_mem = ReplayMemory(args.replay_mem_size, args.frame_count)

    # Create agent
    preprocessor_seq = PreprocessorSequence(
        [AtariPreprocessor(preprocessed_input_shape)])

    dqn = DQNAgent(model, preprocessor_seq, replay_mem, args.discount,
                   args.target_update_freq, args.initial_replay_size,
                   args.train_freq, args.mb_size, args.eps, args.output,
                   args.evaluate_every, args.model_type)

    dqn.compile()
    if args.eval:
        dqn.eval_on_file(env, args.filename)
    elif args.model_type in ('naive', 'linear_double'):
        # These two model types go through the dedicated training path.
        dqn.fit_naive(env, args.iters, args.max_episode_len)
    else:
        dqn.fit(env, args.iters, args.max_episode_len)
示例#14
0
def main():  # noqa: D103
    """Entry point for the Smash DQN setup.

    Depending on flags, this process acts as one of:
    - fixed-sample generator (``--generate_fixed_samples``): dumps states and exits;
    - worker (``--is_worker``): plays (or evaluates) and writes results to disk;
    - manager (default): polls worker output directories and trains the network.
    """
    parser = argparse.ArgumentParser(description='Run DQN on Atari Space Invaders')
    parser.add_argument('--seed', default=10703, type=int, help='Random seed')
    parser.add_argument('--input_shape', default=SIZE_OF_STATE, help='Input shape')
    parser.add_argument('--gamma', default=0.99, help='Discount factor')
    # TODO experiment with this value.
    parser.add_argument('--epsilon', default=0.1, help='Final exploration probability in epsilon-greedy')
    parser.add_argument('--learning_rate', default=0.00025, help='Training learning rate.')
    parser.add_argument('--batch_size', default=32, type = int, help=
                                'Batch size of the training part')
    parser.add_argument('--question', type=int, default=7,
                        help='Which hw question to run.')


    parser.add_argument('--evaluate', action='store_true',
                        help='Only affects worker. Run evaluation instead of training.')
    parser.add_argument('--worker_epsilon', type=float,
                        help='Only affects worker. Override epsilon to use (instead of one in file).')
    parser.add_argument('--skip_model_restore', action='store_true',
                        help='Only affects worker. Use a newly initialized model instead of restoring one.')
    parser.add_argument('--generate_fixed_samples', action='store_true',
                        help=('Special case execution. Generate fixed samples and close. ' +
                             'This is necessary to run whenever the network or action space changes.'))
    parser.add_argument('--ai_input_dir', default='gcloud/inputs/',
                        help='Input directory with initialization files.')
    parser.add_argument('--ai_output_dir', default='gcloud/outputs/',
                        help='Output directory for gameplay files.')
    # --is_worker and --is_manager both write the same dest; last flag wins.
    parser.add_argument('--is_worker', dest='is_manager',
                        action='store_false',
                        help='Whether this is a worker (no training).')
    parser.add_argument('--is_manager', dest='is_manager',
                        action='store_true',
                        help='Whether this is a manager (trains).')
    parser.set_defaults(is_manager=True)


    parser.add_argument('--psc', action='store_true',
                        help=('Only affects manager. Whether on PSC, ' +
                              'and should for example reduce disk usage.'))

    # Copied from original phillip code (run.py).
    for opt in CPU.full_opts():
      opt.update_parser(parser)
    parser.add_argument("--dolphin", action="store_true", default=None, help="run dolphin")
    for opt in DolphinRunner.full_opts():
      opt.update_parser(parser)

    args = parser.parse_args()
    # run.sh might pass these in via environment variable, so user directory
    # might not already be expanded.
    args.ai_input_dir = os.path.expanduser(args.ai_input_dir)
    args.ai_output_dir = os.path.expanduser(args.ai_output_dir)
    # NOTE(review): only the manager seeds the RNGs; workers keep default
    # (nondeterministic) state -- confirm this is intentional.
    if args.is_manager:
        random.seed(args.seed)
        np.random.seed(args.seed)
        tf.set_random_seed(args.seed)

    # A worker evaluates either when asked explicitly or by random draw.
    do_evaluation = args.evaluate or random.random() < WORKER_EVALUATION_PROBABILITY
    if do_evaluation or args.generate_fixed_samples:
        args.cpu = EVAL_CPU_LEVEL
        print('OVERRIDING cpu level to: ' + str(EVAL_CPU_LEVEL))

    if args.generate_fixed_samples and args.is_manager:
        raise Exception('Can not generate fixed samples as manager. Must use ' +
                        '--is_worker and all other necessary flags (e.g. --iso ISO_PATH)')

    env = SmashEnv()
    if not args.is_manager:
        env.make(args)  # Opens Dolphin.

    question_settings = get_question_settings(args.question, args.batch_size)

    online_model, online_params = create_model(
        input_shape=args.input_shape,
        num_actions=env.action_space.n, model_name='online_model',
        create_network_fn=question_settings['create_network_fn'],
        learning_rate=args.learning_rate)

    # Without target fixing or a double network, the target IS the online model.
    target_model = online_model
    update_target_params_ops = []
    if (question_settings['target_update_freq'] is not None or
        question_settings['is_double_network']):
        target_model, target_params = create_model(
            input_shape=args.input_shape,
            num_actions=env.action_space.n, model_name='target_model',
            create_network_fn=question_settings['create_network_fn'],
            learning_rate=args.learning_rate)
        # Ops that copy each online parameter into its target counterpart.
        update_target_params_ops = [t.assign(s) for s, t in zip(online_params, target_params)]


    replay_memory = ReplayMemory(
        max_size=question_settings['replay_memory_size'],
        error_if_full=(not args.is_manager))


    saver = tf.train.Saver(max_to_keep=None)
    agent = DQNAgent(online_model=online_model,
                    target_model = target_model,
                    memory=replay_memory,
                    gamma=args.gamma,
                    target_update_freq=question_settings['target_update_freq'],
                    update_target_params_ops=update_target_params_ops,
                    batch_size=args.batch_size,
                    is_double_network=question_settings['is_double_network'],
                    is_double_dqn=question_settings['is_double_dqn'])

    sess = tf.Session()

    with sess.as_default():
        # Special case: dump fixed evaluation states to disk and exit.
        if args.generate_fixed_samples:
            print('Generating ' + str(NUM_FIXED_SAMPLES) + ' fixed samples and saving to ./' + FIXED_SAMPLES_FILENAME)
            print('This file is only ever used on the manager.')
            agent.compile(sess)
            fix_samples = agent.prepare_fixed_samples(
                env, sess, UniformRandomPolicy(env.action_space.n),
                NUM_FIXED_SAMPLES, MAX_EPISODE_LENGTH)
            env.terminate()
            with open(FIXED_SAMPLES_FILENAME, 'wb') as f:
                pickle.dump(fix_samples, f)
            return

        if args.is_manager or args.skip_model_restore:
            agent.compile(sess)
        else:
            saver.restore(sess, os.path.join(args.ai_input_dir, WORKER_INPUT_MODEL_FILENAME))

        print('_________________')
        print('number_actions: ' + str(env.action_space.n))

        # Worker code.
        if not args.is_manager:
          print('ai_input_dir: ' + args.ai_input_dir)
          print('ai_output_dir: ' + args.ai_output_dir)

          # Evaluation worker: greedy rollouts plus a mean-max-Q estimate
          # on the fixed samples, pickled for the manager to pick up.
          if do_evaluation:
              evaluation = agent.evaluate(env, sess, GreedyPolicy(), EVAL_EPISODES, MAX_EPISODE_LENGTH)
              print('Evaluation: ' + str(evaluation))
              with open(FIXED_SAMPLES_FILENAME, 'rb') as fixed_samples_f:
                fix_samples = pickle.load(fixed_samples_f)
              mean_max_Q = calculate_mean_max_Q(sess, online_model, fix_samples)

              evaluation = evaluation + (mean_max_Q,)
              with open(os.path.join(args.ai_output_dir, WORKER_OUTPUT_EVALUATE_FILENAME), 'wb') as f:
                  pickle.dump(evaluation, f)
              env.terminate()
              return

          # Gameplay worker: resolve epsilon (flag overrides the input file).
          worker_epsilon = args.worker_epsilon
          if worker_epsilon is None:
              with open(os.path.join(args.ai_input_dir, WORKER_INPUT_EPSILON_FILENAME)) as f:
                  lines = f.readlines()
                  # TODO handle unexpected lines better than just ignoring?
                  worker_epsilon = float(lines[0])
          print('Worker epsilon: ' + str(worker_epsilon))
          train_policy = GreedyEpsilonPolicy(worker_epsilon)

          agent.play(env, sess, train_policy, total_seconds=PLAY_TOTAL_SECONDS, max_episode_length=MAX_EPISODE_LENGTH)
          replay_memory.save_to_file(os.path.join(args.ai_output_dir, WORKER_OUTPUT_GAMEPLAY_FILENAME))
          env.terminate()
          return



        # Manager code.
        mprint('Loading fix samples')
        with open(FIXED_SAMPLES_FILENAME, 'rb') as fixed_samples_f:
            fix_samples = pickle.load(fixed_samples_f)

        evaluation_dirs = set()
        play_dirs = set()
        save_model(saver, sess, args.ai_input_dir, epsilon=1.0)
        epsilon_generator = LinearDecayGreedyEpsilonPolicy(
            1.0, args.epsilon, TOTAL_WORKER_JOBS / 5.0)
        fits_so_far = 0
        mprint('Begin to train (now safe to run gcloud)')
        mprint('Initial mean_max_q: ' + str(calculate_mean_max_Q(sess, online_model, fix_samples)))

        # Poll the output directory until every worker job has been consumed.
        while len(play_dirs) < TOTAL_WORKER_JOBS:
            output_dirs = os.listdir(args.ai_output_dir)
            output_dirs = [os.path.join(args.ai_output_dir, x) for x in output_dirs]
            output_dirs = set(x for x in output_dirs if os.path.isdir(x))
            new_dirs = sorted(output_dirs - evaluation_dirs - play_dirs)

            if len(new_dirs) == 0:
                time.sleep(0.1)
                continue

            new_dir = new_dirs[-1]  # Most recent gameplay.
            evaluation_path = os.path.join(new_dir, WORKER_OUTPUT_EVALUATE_FILENAME)

            # Evaluation outputs are only logged, never added to replay memory.
            if os.path.isfile(evaluation_path):
                evaluation_dirs.add(new_dir)
                with open(evaluation_path, 'rb') as evaluation_file:
                    rewards, game_lengths, mean_max_Q = pickle.load(evaluation_file)
                evaluation = [np.mean(rewards), np.std(rewards),
                              np.mean(game_lengths), np.std(game_lengths),
                              mean_max_Q]
                mprint('Evaluation: ' + '\t'.join(str(x) for x in evaluation))
                continue

            memory_path = os.path.join(new_dir, WORKER_OUTPUT_GAMEPLAY_FILENAME)
            try:
                if os.path.getsize(memory_path) == 0:
                    # TODO Figure out why this happens despite temporary directory work.
                    #      Also sometimes the file doesn't exist? Hence the try/except.
                    mprint('Output not ready somehow: ' + memory_path)
                    time.sleep(0.1)
                    continue

                with open(memory_path, 'rb') as memory_file:
                    worker_memories = pickle.load(memory_file)
            except Exception as exception:
                print('Error reading ' + memory_path + ': ' + str(exception.args))
                time.sleep(0.1)
                continue
            # Fold the worker's transitions into the shared replay memory.
            for worker_memory in worker_memories:
                replay_memory.append(*worker_memory)
            if args.psc:
                os.remove(memory_path)


            play_dirs.add(new_dir)
            if len(play_dirs) <= NUM_BURN_IN_JOBS:
                mprint('Skip training because still burn in.')
                mprint('len(worker_memories): ' + str(len(worker_memories)))
                continue

            # Number of gradient steps scales with how much gameplay arrived.
            for _ in range(int(len(worker_memories) * FITS_PER_SINGLE_MEMORY)):
                agent.fit(sess, fits_so_far)
                fits_so_far += 1

            # Partial evaluation to give frequent insight into agent progress.
            # Last time checked, this took ~0.1 seconds to complete.
            mprint('mean_max_q, len(worker_memories): ' +
                   str(calculate_mean_max_Q(sess, online_model, fix_samples)) +
                   ', ' + str(len(worker_memories)))

            # Always decrement epsilon (e.g. not just when saving model).
            model_epsilon = epsilon_generator.get_epsilon(decay_epsilon=True)
            if len(play_dirs) % SAVE_MODEL_EVERY == 0:
                save_model(saver, sess, args.ai_input_dir, model_epsilon)
示例#15
0
def main():  # noqa: D103
    """Parse flags, build the environment (gym-atari or RLE), and train or
    evaluate a DQN agent on it."""
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o',
                        '--output',
                        default='../log/',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--gamma',
                        default=0.99,
                        type=float,
                        help='Discount factor')
    parser.add_argument('--batch_size',
                        default=32,
                        type=int,
                        help='Minibatch size')
    parser.add_argument('--learning_rate',
                        default=0.0001,
                        type=float,
                        help='Learning rate')
    parser.add_argument(
        '--initial_epsilon',
        default=1.0,
        type=float,
        help='Initial exploration probability in epsilon-greedy')
    parser.add_argument('--final_epsilon',
                        default=0.05,
                        type=float,
                        help='Final exploration probability in epsilon-greedy')
    parser.add_argument(
        '--exploration_steps',
        default=2000000,
        type=int,
        help=
        'Number of steps over which the initial value of epsilon is linearly annealed to its final value'
    )
    parser.add_argument(
        '--num_samples',
        default=10000000,
        type=int,
        help='Number of training samples from the environment in training')
    parser.add_argument('--num_frames',
                        default=4,
                        type=int,
                        help='Number of frames to feed to Q-Network')
    parser.add_argument('--num_frames_mv',
                        default=10,
                        type=int,
                        help='Number of frames to used to detect movement')
    parser.add_argument('--frame_width',
                        default=84,
                        type=int,
                        help='Resized frame width')
    parser.add_argument('--frame_height',
                        default=84,
                        type=int,
                        help='Resized frame height')
    parser.add_argument(
        '--replay_memory_size',
        default=1000000,
        type=int,
        help='Number of replay memory the agent uses for training')
    parser.add_argument(
        '--target_update_freq',
        default=10000,
        type=int,
        help='The frequency with which the target network is updated')
    parser.add_argument('--train_freq',
                        default=4,
                        type=int,
                        help='The frequency of actions wrt Q-network update')
    parser.add_argument('--save_freq',
                        default=200000,
                        type=int,
                        help='The frequency with which the network is saved')
    parser.add_argument('--eval_freq',
                        default=200000,
                        type=int,
                        help='The frequency with which the policy is evlauted')
    parser.add_argument(
        '--num_burn_in',
        default=50000,
        type=int,
        help=
        'Number of steps to populate the replay memory before training starts')
    parser.add_argument('--load_network',
                        default=False,
                        action='store_true',
                        help='Load trained mode')
    parser.add_argument('--load_network_path',
                        default='',
                        help='the path to the trained mode file')
    parser.add_argument(
        '--net_mode',
        default='dqn',
        help='choose the mode of net, can be linear, dqn, duel')
    parser.add_argument('--max_episode_length',
                        default=10000,
                        type=int,
                        help='max length of each episode')
    parser.add_argument('--num_episodes_at_test',
                        default=10,
                        type=int,
                        help='Number of episodes the agent plays at test')
    parser.add_argument('--ddqn',
                        default=False,
                        dest='ddqn',
                        action='store_true',
                        help='enable ddqn')
    parser.add_argument('--train',
                        default=True,
                        dest='train',
                        action='store_true',
                        help='Train mode')
    parser.add_argument('--test',
                        dest='train',
                        action='store_false',
                        help='Test mode')
    parser.add_argument('--no_experience',
                        default=False,
                        action='store_true',
                        help='do not use experience replay')
    parser.add_argument('--no_target',
                        default=False,
                        action='store_true',
                        help='do not use target fixing')
    parser.add_argument('--no_monitor',
                        default=False,
                        action='store_true',
                        help='do not record video')
    parser.add_argument('-p',
                        '--platform',
                        default='rle',
                        help='rle or atari. rle: rle; atari: gym-atari')
    parser.add_argument('-pl',
                        '--perlife',
                        default=False,
                        action='store_true',
                        help='use per life or not. ')
    parser.add_argument('-mv',
                        '--mv_reward',
                        default=False,
                        action='store_true',
                        help='use movement reward or not')
    parser.add_argument('-c',
                        '--clip_reward',
                        default=False,
                        action='store_true',
                        help='clip reward or not')
    parser.add_argument('--decay_reward',
                        default=False,
                        action='store_true',
                        help='decay reward or not')
    parser.add_argument('--expert_memory',
                        default=None,
                        help='path of the expert memory')
    parser.add_argument(
        '--initial_prob_replaying_expert',
        default=1.0,
        type=float,
        help='Initial probability of using expert replaying memory')
    parser.add_argument(
        '--final_prob_replaying_expert',
        default=0.05,
        type=float,
        help='Final probability of using expert replaying memory')
    parser.add_argument(
        '--steps_replaying_expert',
        default=1000000,
        type=float,
        help=
        '# steps over which the initial prob of replaying expert memory is linearly annealed to its final value'
    )
    parser.add_argument('--trace_dir',
                        default='',
                        help='the trace dir for expert')
    parser.add_argument('--trace2mem',
                        default=False,
                        action='store_true',
                        help='convert trace to memory')
    parser.add_argument('--mem_dump',
                        default='',
                        help='the path of memory dump')
    args = parser.parse_args()
    args.output = get_output_folder(args.output, args.env)

    # Special mode: convert expert traces into a memory dump, then quit.
    if args.trace2mem:
        trace2mem(args)
        exit(0)

    if args.platform == 'atari':
        env = gym.make(args.env)
    else:
        rom_path = 'roms/' + args.env
        # Bug fix: this branch was inverted -- it recorded video exactly when
        # --no_monitor ('do not record video') was set. Record only when
        # monitoring is enabled, matching `not args.no_monitor` below.
        if args.no_monitor:
            env = rle(rom_path)
        else:
            env = rle(rom_path, record=True, path=args.output)
    print("Output saved to: ", args.output)
    print("Args used:")
    print(args)

    num_actions = env.action_space.n
    print("Game ", args.env, " #actions: ", num_actions)
    dqn = DQNAgent(args, num_actions)
    if args.train:
        print("Training mode.")
        if args.perlife:
            # Wrap the env so each life is treated as its own episode.
            env = RLEEnvPerLifeWrapper(env)
        dqn.fit(env, args.num_samples, args.max_episode_length)
    else:
        print("Evaluation mode.")
        dqn.evaluate(env, args.num_episodes_at_test, args.max_episode_length,
                     not args.no_monitor)
示例#16
0
def main():  # noqa: D103
    """Build the selected Q-network variant and train a DQN agent, recording
    evaluation videos through a separately monitored environment."""
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env',
                        default='SpaceInvadersDeterministic-v3',
                        help='Atari env name')
    parser.add_argument('-o',
                        '--output',
                        default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--model',
                        default='dqn',
                        help='Q Network type to use.')
    parser.add_argument('--double', action='store_true')

    # Maps CLI model names to network classes.
    model_map = {
        'linear': LinearQN,
        'mlp': MLP,
        'dqn': DQN,
        'dueling': DuelingDQN
    }

    args = parser.parse_args()

    args.model = args.model.lower()
    if args.model not in model_map:
        # list() so the message shows a plain list, not a dict_keys view.
        print("Invalid model type. Valid types are", list(model_map))
        sys.exit(1)

    args.output = get_output_folder(args.output, args.env)

    env = gym.make(args.env)

    # Second env records a video every EVAL_NUM_EPISODES-th episode.
    monitored_env = gym.wrappers.Monitor(
        gym.make(args.env),
        args.output,
        video_callable=lambda i: i % EVAL_NUM_EPISODES == 0)

    atari = not args.env.startswith("CartPole")

    # Named zero-arg factory functions rather than lambda assignments (E731).
    if atari:
        input_shape = (IMAGE_SIZE, IMAGE_SIZE)

        def preprocessor():
            return PreprocessorSequence(
                AtariPreprocessor(new_size=input_shape),
                HistoryPreprocessor(history_length=WINDOW_SIZE, max_over=True))
    else:
        input_shape = (4, )

        def preprocessor():
            return HistoryPreprocessor(history_length=WINDOW_SIZE)

    memory = ExperienceReplay(max_size=REPLAY_BUFFER_SIZE,
                              window_length=WINDOW_SIZE)

    NUM_ACTIONS = env.action_space.n
    # Epsilon linearly decays from 1.0 to EPSILON over the decay horizon.
    policy = LinearDecayGreedyEpsilonPolicy(NUM_ACTIONS, 1.0, EPSILON,
                                            NUM_ITERATIONS_LINEAR_DECAY)

    model = model_map[args.model](exp_name=args.output)

    agent = DQNAgent(q_network=model,
                     preprocessor=preprocessor,
                     memory=memory,
                     policy=policy,
                     gamma=GAMMA,
                     target_update_freq=TARGET_UPDATE_FREQ,
                     replay_buffer_size=REPLAY_BUFFER_SIZE,
                     train_freq=TRAIN_FREQ,
                     batch_size=BATCH_SIZE,
                     output_dir=args.output,
                     double_dqn=args.double)

    agent.compile(window=WINDOW_SIZE,
                  input_shape=input_shape,
                  num_actions=NUM_ACTIONS,
                  model_name='q_network')

    # Route interrupt/termination/hangup signals to the agent's handler.
    signal.signal(signal.SIGINT, agent.signal_handler)
    signal.signal(signal.SIGTERM, agent.signal_handler)
    signal.signal(signal.SIGHUP, agent.signal_handler)

    agent.fit(env, monitored_env, num_iterations=NUM_ITERATIONS)
示例#17
0
def main():  # noqa: D103
    """Parse flags, then either evaluate a previously saved model or train a
    new online/target Q-network pair."""
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name')
    parser.add_argument('--network_name', default='linear_q_network', type=str, help='Type of model to use')
    parser.add_argument('--window', default=4, type=int, help='how many frames are used each time')
    parser.add_argument('--new_size', default=(84, 84), type=tuple, help='new size')
    parser.add_argument('--batch_size', default=32, type=int, help='Batch size')
    parser.add_argument('--replay_buffer_size', default=750000, type=int, help='Replay buffer size')
    parser.add_argument('--gamma', default=0.99, type=float, help='Discount factor')
    parser.add_argument('--alpha', default=0.0001, type=float, help='Learning rate')
    parser.add_argument('--epsilon', default=0.05, type=float, help='Exploration probability for epsilon-greedy')
    parser.add_argument('--target_update_freq', default=10000, type=int,
                        help='Frequency for copying weights to target network')
    parser.add_argument('--num_burn_in', default=50000, type=int,
                        help='Number of prefilled samples in the replay buffer')
    parser.add_argument('--num_iterations', default=5000000, type=int,
                        help='Number of overal interactions to the environment')
    parser.add_argument('--max_episode_length', default=200000, type=int, help='Terminate earlier for one episode')
    parser.add_argument('--train_freq', default=4, type=int, help='Frequency for training')
    parser.add_argument('--repetition_times', default=3, type=int, help='Parameter for action repetition')
    parser.add_argument('-o', '--output', default='atari-v0', type=str, help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    # NOTE(review): type=bool is an argparse trap -- any non-empty string
    # (including '--train False') parses as True. Left as-is to preserve the
    # CLI; consider store_true/store_false actions instead.
    parser.add_argument('--experience_replay', default=False, type=bool,
                        help='Choose whether or not to use experience replay')
    parser.add_argument('--train', default=True, type=bool, help='Train/Evaluate, set True if train the model')
    parser.add_argument('--model_path', default='/media/hongbao/Study/Courses/10703/hw2/lqn_noexp',
                        type=str, help='specify model path to evaluation')
    parser.add_argument('--max_grad', default=1.0, type=float, help='Parameter for huber loss')
    parser.add_argument('--model_num', default=5000000, type=int, help='specify saved model number during train')
    parser.add_argument('--log_dir', default='log', type=str, help='specify log folder to save evaluate result')
    parser.add_argument('--eval_num', default=100, type=int, help='number of evaluation to run')
    parser.add_argument('--save_freq', default=100000, type=int, help='model save frequency')

    args = parser.parse_args()
    # Python 3 print calls (original used Python 2 print statements).
    print("\nParameters:")
    for arg in vars(args):
        print(arg, getattr(args, arg))
    print("")

    env = gym.make(args.env)
    num_actions = env.action_space.n
    # define model object
    preprocessor = AtariPreprocessor(args.new_size)
    memory = ReplayMemory(args.replay_buffer_size, args.window)

    # Initiating policy for both tasks (training and evaluating)
    policy = LinearDecayGreedyEpsilonPolicy(args.epsilon, 0, 1000000)

    if not args.train:
        # Evaluate the model.
        # Bug fix: was "args.model_path is ''" -- identity comparison with a
        # string literal depends on interning; use equality instead.
        if args.model_path == '':
            print("Model path must be set when evaluate")
            exit(1)

        # specific log file to save result
        log_file = os.path.join(args.log_dir, args.network_name, str(args.model_num))
        model_dir = os.path.join(args.model_path, args.network_name, str(args.model_num))

        with tf.Session() as sess:
            # Rebuild both networks from the serialized architecture...
            with open(model_dir + ".json", 'r') as json_file:
                loaded_model_json = json_file.read()
                q_network_online = model_from_json(loaded_model_json)
                q_network_target = model_from_json(loaded_model_json)

            sess.run(tf.global_variables_initializer())

            # ...then load the same trained weights into both.
            q_network_online.load_weights(model_dir + ".h5")
            q_network_target.load_weights(model_dir + ".h5")

            dqn_agent = DQNAgent((q_network_online, q_network_target), preprocessor, memory, policy, num_actions,
                                 args.gamma, args.target_update_freq, args.num_burn_in, args.train_freq,
                                 args.batch_size,
                                 args.experience_replay, args.repetition_times, args.network_name, args.max_grad,
                                 args.env, sess)

            dqn_agent.evaluate(env, log_file, args.eval_num)
        exit(0)

    # Train the model.
    q_network_online = create_model(args.window, args.new_size, num_actions, args.network_name, True)
    q_network_target = create_model(args.window, args.new_size, num_actions, args.network_name, False)

    # create output dir, meant to pop up error when dir exist to avoid over written
    os.mkdir(os.path.join(args.output, args.network_name))

    with tf.Session() as sess:
        dqn_agent = DQNAgent((q_network_online, q_network_target), preprocessor, memory, policy, num_actions,
                             args.gamma, args.target_update_freq, args.num_burn_in, args.train_freq, args.batch_size,
                             args.experience_replay, args.repetition_times, args.network_name, args.max_grad, args.env,
                             sess)

        optimizer = tf.train.AdamOptimizer(learning_rate=args.alpha)
        dqn_agent.compile(optimizer, mean_huber_loss)
        dqn_agent.fit(env, args.num_iterations, os.path.join(args.output, args.network_name), args.save_freq,
                      args.max_episode_length)
示例#18
0
def main():  # noqa: D103
    """Build and train a DQN-family agent on an Atari environment.

    The model variant (linear/deep; q/double/dueling; with or without
    replay and target fixing) is selected from the command line, and all
    results are written under a run-specific folder inside --output.
    """
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env',
                        default='SpaceInvaders-v0',
                        help='Atari env name')
    parser.add_argument('--output',
                        default='results',
                        help='Directory to save data to')
    parser.add_argument('-l',
                        '--isLinear',
                        default=0,
                        type=int,
                        choices=range(0, 2),
                        help='1: use linear model; 0: use deep model')
    parser.add_argument(
        '-m',
        '--modelType',
        default='q',
        choices=['q', 'double', 'dueling'],
        help=
        'q: q learning; double: double q learning; dueling: dueling q learning'
    )
    parser.add_argument(
        '-s',
        '--simple',
        default=0,
        type=int,
        choices=range(0, 2),
        help=
        '1: without replay or target fixing ; 0: use replay and target fixing')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')

    args = parser.parse_args()

    if not os.path.exists(args.output):
        os.makedirs(args.output)
    # Encode the chosen variant in the run folder name, e.g. "deep_q_simple".
    model_name = ('linear_' if args.isLinear else 'deep_') + args.modelType + (
        '_simple' if args.simple else '')
    args.output = get_output_folder(args.output + '/' + model_name, args.env)
    env = gym.make(args.env)
    env.seed(args.seed)

    # Grow GPU memory on demand rather than reserving it all up front.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)
    # tf.initialize_all_variables() is deprecated; use the replacement API
    # (consistent with the rest of this file).
    K.get_session().run(tf.global_variables_initializer())

    is_linear = args.isLinear
    agent = DQNAgent(
        q_network=create_model(4, (84, 84), env.action_space.n, is_linear,
                               args.modelType),
        q_network2=create_model(4, (84, 84), env.action_space.n, is_linear,
                                args.modelType),
        preprocessor=AtariPreprocessor((84, 84)),
        memory=ReplayMemory(1000000, 4),
        gamma=0.99,
        target_update_freq=10000,
        num_burn_in=50000,
        train_freq=4,
        batch_size=32,
        is_linear=is_linear,
        model_type=args.modelType,
        use_replay_and_target_fixing=(not args.simple),
        epsilon=0,  # alternative value was 0.05
        action_interval=4,
        output_path=args.output,
        save_freq=100000)

    agent.compile(lr=0.0001)
    agent.fit(env, 5000000)
    # Reload saved weights before the final evaluation pass.
    agent.load_weights()
    agent.evaluate(env, 100, video_path_suffix='final')
    env.close()
示例#19
0
def main():
    """Train a DQN-family agent on an Atari environment and save the model.

    The trained online network's architecture (JSON) and weights (HDF5)
    are written to <NETWORK_TYPE>model.json / <NETWORK_TYPE>model.h5.

    Raises:
        ValueError: if --type is not one of the supported network types.
    """
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o',
                        '--output',
                        default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--type',
                        default="DQN",
                        help='Type of network to train (Linear, LinearERTF, '
                             'DoubleLinear, DQN, DDQN, Duling)')

    args = parser.parse_args()

    # Validate the requested network type before building anything.
    network_types = [
        "Linear", "LinearERTF", "DoubleLinear", "DQN", "DDQN", "Duling"
    ]
    if args.type not in network_types:
        raise ValueError("Invalid network type.")

    NETWORK_TYPE = args.type

    # Set up the environment.
    env = gym.make(str(args.env))
    NUM_ACTIONS = env.action_space.n

    # Full-scale hyperparameters, kept for reference:
    #   NUM_ITERATIONS = 1000000, TARGET_UPDATE_FREQ = 100000,
    #   REPLAY_START_SIZE = 50000, MAX_EPISODE_LEN = 100
    # Current (reduced/debug) settings:
    FRAMES_PER_STATE = 4
    INPUT_SHAPE = (84, 84)
    GAMMA = .99
    NUM_ITERATIONS = 20000
    TARGET_UPDATE_FREQ = 1000
    BATCH_SIZE = 32
    REPLAY_MEM_SIZE = 1000000
    REPLAY_START_SIZE = 1000
    MAX_EPISODE_LEN = 10
    REWARD_SAMPLE = 1000
    HELD_OUT_STATES_SIZE = 1000

    # create_model returns a list of models:
    # [Online, None], [Online, Target], or [OnlineA, OnlineB].
    models = create_model(FRAMES_PER_STATE, INPUT_SHAPE, NUM_ACTIONS,
                          NETWORK_TYPE)
    history = HistoryPreprocessor(FRAMES_PER_STATE - 1)
    preprocessor = Preprocessor()
    # A plain linear network trains without experience replay.
    if NETWORK_TYPE != "Linear":
        memory = ReplayMemory(REPLAY_MEM_SIZE, FRAMES_PER_STATE)
    else:
        memory = None
    held_out_states = ReplayMemory(HELD_OUT_STATES_SIZE, FRAMES_PER_STATE)
    policy = LinearDecayGreedyEpsilonPolicy(1, .05, int(1e6))
    agent = DQNAgent(models[0], models[1], preprocessor, history, memory,
                     policy, GAMMA, TARGET_UPDATE_FREQ, BATCH_SIZE,
                     REPLAY_START_SIZE, NUM_ACTIONS, NETWORK_TYPE,
                     REWARD_SAMPLE, held_out_states, HELD_OUT_STATES_SIZE)

    # Compile and train the agent.
    adam = Adam(lr=0.0001)
    agent.compile(adam, mean_huber_loss)
    agent.fit(env, NUM_ITERATIONS, MAX_EPISODE_LEN)

    # Serialize the online network: architecture to JSON, weights to HDF5.
    model_json = models[0].to_json()
    with open(NETWORK_TYPE + "model.json", "w") as json_file:
        json_file.write(model_json)
    models[0].save_weights(NETWORK_TYPE + "model.h5")
    print("Saved model to disk")
示例#20
0
def main():  # noqa: D103
    """Train or evaluate a DQN agent on an Atari environment.

    --mode train runs fit() for up to 5M frames; --mode test restores a
    saved checkpoint and runs evaluation episodes.
    """
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Enduro-v0', help='Atari env name')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--model_type', default='dqn',
                        help='Model type: linear, dqn, double_linear, double_dqn')
    # BUG FIX: the following calls were misspelled "add_arguement", which
    # raised AttributeError before any argument could be parsed.
    parser.add_argument('--mode', default='train',
                        help='Mode: train for training, test for testing')
    parser.add_argument('--memory_size', default=200000, type=int,
                        help='Replay memory size')
    parser.add_argument('--save_every', default=50000, type=int,
                        help='Frequency for saving weights')
    parser.add_argument('--max_ep_length', default=50000, type=int,
                        help='Maximum episode length during training')
    parser.add_argument('--use_target_fixing', action='store_true',
                        help='Use target fixing')
    parser.add_argument('--use_replay_memory', action='store_true',
                        help='Use replay memory')

    args = parser.parse_args()

    # Load the requested environment (was hard-coded to 'Enduro-v0',
    # silently ignoring --env).
    env = gym.make(args.env)

    window = 4
    input_shape = (84, 84)
    num_actions = env.action_space.n

    # Limit GPU use: grow memory on demand instead of reserving it all.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Set mode and model type to train.
    mode = args.mode
    model_type = args.model_type

    # Initialize the preprocessor, replay memory, and training policy.
    preproc = Preprocessor()
    memory = ReplayMemory(args.memory_size)
    # Decay epsilon from 1 to 0.1 over 1 million steps.
    policy = LinearDecayGreedyEpsilonPolicy(1, 0.1, 1000000, num_actions)

    # Experimental parameters - details of choices specified in the write-up.
    gamma = 0.99
    target_update_freq = 10000
    num_burn_in = 1000
    train_freq = 0  # not using this parameter
    batch_size = 32
    # BUG FIX: attribute names must match the declared flags; the old
    # args.target_fixing / args.replay_memory raised AttributeError.
    target_fix_flag = args.use_target_fixing
    replay_mem_flag = args.use_replay_memory
    save_every = args.save_every

    print(sess)

    # Create a DQN agent with the specified parameters.
    dqn = DQNAgent(sess, window, input_shape, num_actions, model_type,
                   preproc, memory, policy, gamma, target_fix_flag,
                   target_update_freq, replay_mem_flag, num_burn_in,
                   train_freq, batch_size, save_every)

    if mode == 'train':
        # Train on 5 million frames with the given maximum episode length.
        dqn.fit(env, 5000000, args.max_ep_length)
    elif mode == 'test':
        # Load the model for testing.
        model_file = 'saved_models_dqn/model_100000.ckpt'
        dqn.restore_model(model_file)
        # Evaluate the model.
        dqn.evaluate(env, 20, 5000, 'test', lambda x: True, False, True)
示例#21
0
def main():  # noqa: D103
    """Set up, train, and evaluate a DQN agent on an Atari environment.

    All hyperparameters of the run are recorded in a text file inside the
    run-specific output directory.
    """
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument(
        '-o', '--output', default='atari-v0', help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('-ni', '--num_iterations', default=10, type=int,
                        help='Num of iterations for training')
    parser.add_argument('-m', '--max_episode_length', default=60, type=int,
                        help='Max episode length of a sequence')
    parser.add_argument('-ne', '--num_episodes', default=10, type=int,
                        help='Num of episodes for evaluating')
    parser.add_argument('-r', '--replay_memory', default=10, type=int,
                        help='The size of replay memory')
    parser.add_argument('-gamma', '--discount_factor', default=0.99,
                        type=float, help='Discount factor of MDP')
    parser.add_argument('-ge', '--Greedy_epsilon', default=0.95, type=float,
                        help='The probability to choose a greedy action')

    args = parser.parse_args()

    args.output = get_output_folder(args.output, args.env)

    # Create the run directory and write all results relative to it.
    os.makedirs(args.output)
    os.chdir(args.output)

    # Honor --env (was hard-coded to 'Breakout-v0').
    env = gym.make(args.env)
    env.reset()

    # Preprocessors: one view for the network, one for replay memory.
    preprocess_network = preprocessors.PreprocessorSequence('network')
    preprocess_memory = preprocessors.PreprocessorSequence('memory')

    # Policies: fixed greedy-epsilon and a linearly-decaying variant.
    Greedy = policy.GreedyEpsilonPolicy(0.95)
    DG = policy.LinearDecayGreedyEpsilonPolicy('attr_name', 1, 0.1, 1000000)

    # Create the network from the Atari DQN paper architecture.
    model = create_model(window=4, input_shape=(84, 84), num_actions=4)

    # TensorBoard callback for training diagnostics.
    tensorboard = keras.callbacks.TensorBoard(
        log_dir='./logs', histogram_freq=0, write_graph=True,
        write_images=True)

    optimizer = Adam(lr=0.00025)

    # Replay memory sized by --replay_memory.
    memory = core.ReplayMemory(
        max_size=args.replay_memory, phi_length=4, window_height=84,
        window_length=84, rng=np.random.RandomState(100))
    # NOTE(review): q_network and target share the same model object here,
    # which defeats target fixing - confirm this is intentional.
    agent = DQNAgent(
        q_network=model, target=model,
        preprocessor={'network': preprocess_network,
                      'memory': preprocess_memory},
        memory=memory, policy={'Greedy': Greedy, 'DG': DG},
        gamma=args.discount_factor, target_update_freq=100000,
        num_burn_in=args.replay_memory, train_freq=4, batch_size=32,
        callbacks=tensorboard)
    agent.compile(optimizer=optimizer, loss_func=objectives.mean_huber_loss)
    agent.init_memory(env=env, max_episode_length=30)
    agent.fit(env=env, num_iterations=args.num_iterations,
              max_episode_length=args.max_episode_length)
    agent.evaluate(env=env, num_episodes=args.num_episodes,
                   max_episode_length=args.max_episode_length)

    # Record the hyperparameters of this run next to its results
    # (filename and label typos fixed: "hypermeters", "epsidoe").
    file_abs = "./hyperparameters"
    with open(file_abs, "w") as f:
        f.write("Num of iterations:")
        f.write(str(args.num_iterations) + '\n')
        f.write("Max episode length:")
        f.write(str(args.max_episode_length) + '\n')
        f.write("Num of episodes:")
        f.write(str(args.num_episodes) + '\n')
        f.write("Replay memory:")
        f.write(str(args.replay_memory) + '\n')
        f.write("Discount factor:")
        f.write(str(args.discount_factor) + '\n')
示例#22
0
def main():  # noqa: D103
    """Configure and train a DQN agent with a linear Q-network.

    Output goes to a per-run folder under --output; passing
    --experiment_id reuses an existing run folder to resume/reload.
    """
    parser = argparse.ArgumentParser(
        description='Run DQN on given game environment')
    parser.add_argument('--env',
                        default='SpaceInvaders-v0',
                        help='Atari env name')
    parser.add_argument('-o',
                        '--output',
                        default='train',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--gamma',
                        default=0.99,
                        type=float,
                        help='Discount factor')
    parser.add_argument(
        '--target_update_freq',
        default=10000,
        type=int,
        help='interval between two updates of the target network')
    parser.add_argument(
        '--num_burn_in',
        default=10,
        type=int,
        help=
        'number of samples to be filled into the replay memory before updating the network'
    )
    parser.add_argument('--train_freq',
                        default=1,
                        type=int,
                        help='How often to update the Q-network')
    parser.add_argument('--batch_size',
                        default=32,
                        type=int,
                        help='batch_size')
    parser.add_argument('--num_iterations',
                        default=50000,
                        type=int,
                        help="num of iterations to run for the training")
    parser.add_argument('--max_episode_length',
                        default=10000,
                        type=int,
                        help='max length of one episode')
    parser.add_argument('--lr',
                        default=0.0001,
                        type=float,
                        help='learning rate')
    parser.add_argument('--epsilon',
                        default=0.05,
                        type=float,
                        help='epsilon for exploration')
    parser.add_argument('--experiment_id',
                        default=None,
                        type=int,
                        help='index of experiment to reload checkpoint')
    parser.add_argument('--save_freq',
                        default=10000,
                        type=int,
                        help='checkpoint saving frequency')
    parser.add_argument(
        '--evaluate_freq',
        default=10000,
        type=int,
        help='frequency to do evaluation and record video by wrapper')
    parser.add_argument('--test_num_episodes',
                        default=20,
                        type=int,
                        help='number of episodes to play at each evaluation')

    args = parser.parse_args()

    # BUG FIX: use an explicit None test — `not args.experiment_id` also
    # treated a legitimate `--experiment_id 0` as "unset".
    if args.experiment_id is None:
        args.output = get_output_folder(args.output, args.env)
    else:
        args.output = os.path.join(args.output, args.env) + '-run{}'.format(
            args.experiment_id)
    game_env = gym.make(args.env)
    num_actions = game_env.action_space.n
    input_shape = (84, 84)

    #todo: setup logger
    #writer = tf.summary.FileWriter()

    # Set up the linear Q-network.
    model = create_model(window=4,
                         input_shape=input_shape,
                         num_actions=num_actions,
                         model_name='linear_model')

    # Set up the optimizer.
    optimizer = tf.train.AdamOptimizer(learning_rate=args.lr)

    # Preprocessing: frame resize/crop followed by frame-history stacking.
    atari_preprocessor = AtariPreprocessor(input_shape)
    history_preprocessor = HistoryPreprocessor(history_length=3)
    preprocessor = PreprocessorSequence(
        [atari_preprocessor, history_preprocessor])

    # Exploration policy.
    policy = GreedyEpsilonPolicy(epsilon=args.epsilon, num_actions=num_actions)

    # Assemble and train the DQN agent.
    agent = DQNAgent(q_network=model,
                     preprocessor=preprocessor,
                     memory=None,
                     policy=policy,
                     gamma=args.gamma,
                     target_update_freq=args.target_update_freq,
                     num_burn_in=args.num_burn_in,
                     train_freq=args.train_freq,
                     batch_size=args.batch_size,
                     logdir=args.output,
                     save_freq=args.save_freq,
                     evaluate_freq=args.evaluate_freq,
                     test_num_episodes=args.test_num_episodes)
    agent.compile(optimizer=optimizer, loss_func=mean_huber_loss)
    agent.fit(env=game_env,
              num_iterations=args.num_iterations,
              max_episode_length=args.max_episode_length)
示例#23
0
def main(args):
    """Train or evaluate a DQN-family agent according to parsed `args`.

    args fields used: env, mode ('train'/'test'), type (network variant),
    init, output, submit, epoch.

    Raises:
        ValueError: if args.type is not a recognized network variant.
    """
    # make env
    env = gym.make(args.env)
    if args.mode == 'test' and args.submit:
        monitor_log = os.path.join(args.output, 'monitor.log')
        env = wrappers.Monitor(env, monitor_log, force=True)
    # build model
    # actions 0-5: 0 do nothing, 1 fire, 2 right, 3 left, 4 right+fire, 5 left+fire
    num_actions = env.action_space.n
    mem_size = 1000000
    window = 4
    input_shape = (84, 84)
    # Pick online/target network builders by variant.
    if args.type in ['DQN', 'double-DQN']:
        model = create_model(window, input_shape, num_actions, args.init)
        target = create_model(window, input_shape, num_actions, args.init)
    elif args.type in ['linear', 'linear-simple', 'double-Q']:
        model = create_model_linear(window, input_shape, num_actions,
                                    args.init)
        target = create_model_linear(window, input_shape, num_actions,
                                     args.init)
    elif args.type == 'duel':
        model = create_model_duel(window, input_shape, num_actions, args.init)
        target = create_model_duel(window, input_shape, num_actions, args.init)
    else:
        # BUG FIX: an unrecognized type previously left `model`/`target`
        # unbound and crashed later with NameError; fail fast instead.
        raise ValueError('Unknown network type: %s' % args.type)

    target_update_freq = 10000
    num_burn_in = 50000
    train_freq = 4
    batch_size = 32
    gamma = 0.99
    epsilon = 0.05
    updates_per_epoch = 50000
    num_iterations = 50000000
    eval_episodes = 100
    max_episode_length = 10000

    # simple: no experience replay and no target fixing
    if args.type == 'linear-simple':
        num_burn_in = 0

    memory = ReplayMemoryEfficient(mem_size, window, input_shape)

    # Grow GPU memory on demand; cap intra-op parallelism at 8 threads.
    config = tf.ConfigProto(intra_op_parallelism_threads=8)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    # preprocessor
    preprocessor = PreprocessorSequence()
    # policies: decaying epsilon for training, fixed epsilon for evaluation
    policy = LinearDecayGreedyEpsilonPolicy(1, 0.1, 1000000)
    policy_eval = GreedyEpsilonPolicy(epsilon)
    # build agent
    dqn_agent = DQNAgent(sess, env, args.type, model, target, preprocessor,
                         memory, policy, policy_eval, gamma,
                         target_update_freq, num_burn_in, train_freq,
                         batch_size, num_actions, updates_per_epoch,
                         args.output)
    if args.mode == 'train':  # compile net and train with fit
        adam = Adam(lr=0.0001)
        dqn_agent.compile_networks(adam, mean_huber_loss)
        if args.type == 'linear-simple':
            dqn_agent.fit_simple(num_iterations, max_episode_length)
        else:
            dqn_agent.fit(num_iterations, max_episode_length)
    elif args.mode == 'test':  # load net and evaluate
        model_path = os.path.join(args.output, 'model_epoch%03d' % args.epoch)
        dqn_agent.load_networks(model_path)
        if args.submit:
            # Monitor submission only needs a single episode.
            eval_episodes = 1
        dqn_agent.play(eval_episodes, max_episode_length)
    env.close()