def main():  # noqa: D103
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o', '--output', default='atari-v0', help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    args = parser.parse_args()
    # args.input_shape = tuple(args.input_shape)
    # args.output = get_output_folder(args.output, args.env)

    # set up environment model
    env = gym.make(str(args.env))
    NUM_ACTIONS = env.action_space.n  # env.get_action_space().num_actions()

    # make dqn agent
    FRAMES_PER_STATE = 4
    INPUT_SHAPE = (84, 84)
    GAMMA = .99
    NUM_ITERATIONS = 5000000
    TARGET_UPDATE_FREQ = 10000
    NUM_BURN_IN = 32
    TRAIN_FREQ = 0
    BATCH_SIZE = 32
    REPLAY_MEM_SIZE = 1000000
    REPLAY_START_SIZE = 50000
    MAX_EPISODE_LEN = 10000
    HELD_OUT_STATES_SIZE = 1000
    IS_DOUBLE_Q = True

    model = create_model(FRAMES_PER_STATE, INPUT_SHAPE, NUM_ACTIONS, model_name='linear q_network')
    plot_model(model, to_file='model.png')
    target = create_model(FRAMES_PER_STATE, INPUT_SHAPE, NUM_ACTIONS, model_name='linear q_network target')
    preprocessor = HistoryPreprocessor(FRAMES_PER_STATE - 1)
    memory = ReplayMemory(REPLAY_MEM_SIZE, FRAMES_PER_STATE)
    held_out_states = ReplayMemory(HELD_OUT_STATES_SIZE, FRAMES_PER_STATE)
    policy = LinearDecayGreedyEpsilonPolicy(1, .05, int(1e6))
    agent = DQNAgent(model, target, preprocessor, memory, policy, held_out_states,
                     HELD_OUT_STATES_SIZE, GAMMA, TARGET_UPDATE_FREQ, NUM_BURN_IN,
                     TRAIN_FREQ, BATCH_SIZE, REPLAY_START_SIZE, NUM_ACTIONS, IS_DOUBLE_Q)

    # compile agent
    adam = Adam(lr=0.0001)
    agent.compile(adam, mean_huber_loss)
    agent.fit(env, NUM_ITERATIONS, MAX_EPISODE_LEN)
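# Hedged aside, not taken from any of the repositories above: most agents in this
# file compile against a `mean_huber_loss` objective that is defined elsewhere.
# A minimal Keras-backend sketch of that loss (quadratic for small errors, linear
# beyond `max_grad`, averaged over the batch) might look like the following; the
# exact signature each project uses may differ.
import keras.backend as K

def sketch_mean_huber_loss(y_true, y_pred, max_grad=1.0):
    """Mean Huber loss between targets and Q-value predictions (illustrative)."""
    abs_err = K.abs(y_true - y_pred)
    quadratic = K.minimum(abs_err, max_grad)   # |error| clipped at max_grad
    linear = abs_err - quadratic               # remainder beyond max_grad
    return K.mean(0.5 * K.square(quadratic) + max_grad * linear)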
def main():
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name')
    parser.add_argument('-o', '--output', default='atari-v0', help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--model', default=1, type=int, help='model')
    parser.add_argument('--double', action='store_true')
    args = parser.parse_args()

    print('Using Tensorflow Version of ' + tf.__version__)
    # args.input_shape = tuple(args.input_shape)
    args.output = get_output_folder(args.output, args.env)
    print("Output Folder: " + args.output)

    # here is where you should start up a session,
    # create your DQN agent, create your model, etc.
    # then you can run your fit method.
    sess = tf.Session()
    K.set_session(sess)
    env = gym.make(args.env)
    num_actions = env.action_space.n

    # 0 linear; 1 deep; 2 dueling
    model = create_model(WINDOW, INPUT_SHAPE, num_actions, args.model)
    atari_preprocessor = AtariPreprocessor(INPUT_SHAPE)
    history_preprocessor = HistoryPreprocessor(HIST_LENGTH)
    preprocessor = PreprocessorSequence([atari_preprocessor, history_preprocessor])
    memory = ReplayMemory(MAX_MEMORY, WINDOW)
    policy = LinearDecayGreedyEpsilonPolicy(START_EPSILON, END_EPSILON, NUM_STEPS)
    dqn_agent = DQNAgent(model, num_actions, preprocessor, memory, policy, GAMMA,
                         TARGET_UPDATE_FREQ, INIT_MEMORY, TRAIN_FREQ, BATCH_SIZE,
                         double=args.double)

    optimizer = Adam(lr=LEARNING_RATE, epsilon=MIN_SQ_GRAD)
    loss_func = mean_huber_loss
    dqn_agent.compile(optimizer, loss_func)
    dqn_agent.fit(env, NUM_ITERATIONS, MAX_EPISODE_LENGTH)
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name')
    parser.add_argument('-o', '--output', default='linearQ', help='Directory to save data to')
    parser.add_argument('--seed', default=703, type=int, help='Random seed')
    args = parser.parse_args()

    args.output = get_output_folder(args.output, args.env)
    # args.output = '/home/thupxd/deeprl_for_atari_games/' + args.output  # Comment out when running locally!
    os.makedirs(args.output, exist_ok=True)

    # here is where you should start up a session,
    # create your DQN agent, create your model, etc.
    # then you can run your fit method.

    # Make the environment
    env = gym.make(args.env)
    # input('************************** Hit to begin training... ******************************')

    # Create a Q network
    num_actions = env.action_space.n
    q_net = create_model(4, (84, 84), num_actions, model_name='Linear_Q_Net')
    # print('======================== Keras Q-network model is created. =========================')

    # Initialize a preprocessor sequence object
    atari_preprocessor = tfrl.preprocessors.AtariPreprocessor((84, 84))
    # print('======================== Preprocessor object is created. =========================')

    # Initialize a replay memory
    replay_memory = tfrl.core.ReplayMemory(1000000, 4)
    # print('======================== Replay_memory object is created. =========================')

    # Initialize a policy
    _policy = tfrl.policy.GreedyEpsilonPolicy(0.05, num_actions)
    policy = tfrl.policy.LinearDecayGreedyEpsilonPolicy(_policy, 1, 0.1, 1000000)
    # print('======================== (linear-decay) Eps-Greedy Policy object is created. =========================')

    # Initialize a DQNAgent
    DQNAgent = tfrl.dqn.DQNAgent(q_net, atari_preprocessor, replay_memory, policy,
                                 gamma=0.99, target_update_freq=10000,
                                 num_burn_in=100000, train_freq=4, batch_size=32,
                                 window_size=4)
    # print('======================== DQN agent is created. =========================')

    # Compiling, Training, Test
    # print('======================== Model compilation begin! =========================')
    adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    q_net.compile(optimizer=adam, loss=mean_huber_loss)
    # print('======================== Model compilation finished! =========================')
    # print('======================== Model training begin! =========================')
    DQNAgent.fit(env, args.env, args.output, 5000000, 100000)
def main():  # noqa: D103
    config = get_config(True)
    env = gym.make(config.env_name)
    q = create_model(4, (84, 84), env.action_space.n, model_name=config.modelname)
    q_target = create_model(4, (84, 84), env.action_space.n, model_name=config.modelname)
    huber_loss = tfrl.objectives.mean_huber_loss
    adam = Adam(lr=config.learning_rate)
    q.compile(adam, huber_loss, metrics=['accuracy'])
    q_target.compile(adam, huber_loss, metrics=['accuracy'])
    policy = LinearDecayGreedyEpsilonPolicy(0.9, 0.05, config.iteration_num / 50)  # Deprecated

    with open(config.losslog, "w") as log:
        log.write("Iteration,Loss,Accuracy\n")
    with open(config.rewardlog, "w") as log:
        log.write("Iteration,reward\n")

    #####
    # Agent = DoubleDQNAgent(q, q_target, policy, config.gamma, config.num_burn_in,
    #                        config.train_freq, config.batch_size, config)
    Agent = DQNAgent(q, q_target, policy, config.gamma, config.num_burn_in,
                     config.train_freq, config.batch_size, config)
    mse_loss, mae_metric, q, q_target = Agent.fit(env, config.iteration_num, 0)

    TimeStamp = datetime.datetime.strftime(datetime.datetime.now(), "%y-%m-%d_%H-%M")
    q.save_weights(str(config.modelname) + '_' + TimeStamp + '_final_weights.h5')
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o', '--output', default='atari-v0', help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    args = parser.parse_args()
    # args.input_shape = tuple(args.input_shape)
    # args.output = get_output_folder(args.output, args.env)

    # set up environment model
    env = gym.make(str(args.env))
    NUM_ACTIONS = env.action_space.n  # env.get_action_space().num_actions()

    # make dqn agent
    FRAMES_PER_STATE = 4
    INPUT_SHAPE = (84, 84)
    GAMMA = .99
    NUM_ITERATIONS = 500000
    TARGET_UPDATE_FREQ = 0
    NUM_BURN_IN = 0
    TRAIN_FREQ = 0
    BATCH_SIZE = 0

    model = create_model(FRAMES_PER_STATE, INPUT_SHAPE, NUM_ACTIONS, model_name='linear q_network')
    preprocessor = HistoryPreprocessor(FRAMES_PER_STATE - 1)
    memory = None
    policy = LinearDecayGreedyEpsilonPolicy(1, .05, 10e6)
    agent = DQNAgent(model, preprocessor, memory, policy, GAMMA, TARGET_UPDATE_FREQ,
                     NUM_BURN_IN, TRAIN_FREQ, BATCH_SIZE)

    # compile agent
    adam = Adam(lr=0.0001)
    loss = losses.mean_squared_error
    agent.compile(adam, loss)
    agent.fit(env, NUM_ITERATIONS)
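# Hedged aside: the `HistoryPreprocessor(FRAMES_PER_STATE - 1)` calls above suggest
# a preprocessor that remembers the previous three frames and stacks them with the
# incoming frame to form the 4-frame state the Q-network expects. The class below is
# only an illustrative sketch of that idea; the real implementations in these
# repositories differ in interface and detail.
from collections import deque

import numpy as np

class SketchHistoryPreprocessor:
    """Keeps the last `history_length` frames and stacks them with the current one."""

    def __init__(self, history_length=3):
        self.history_length = history_length
        self.frames = deque(maxlen=history_length)

    def process_state_for_network(self, frame):
        # Until enough history exists (e.g. at episode start), pad with copies
        # of the current frame so the stacked state always has a fixed depth.
        while len(self.frames) < self.history_length:
            self.frames.append(frame)
        state = np.stack(list(self.frames) + [frame], axis=-1)
        self.frames.append(frame)
        return state

    def reset(self):
        self.frames.clear()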
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description='Run DQN on Atari SpaceInvaders')
    parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name')
    parser.add_argument('--mode', default='vanilla', type=str, help='vanilla or double dqn')
    args = parser.parse_args()
    print(" MODE IS", args.mode)

    video_every_nth = 50000
    eval_every_nth = 50000
    if args.env == "breakout":
        args.env = 'Breakout-v0'
        video_every_nth = 50000
    if args.env == "space_invaders":
        args.env = 'SpaceInvaders-v0'
    if args.env == 'enduro':
        args.env = 'Enduro-v0'
        video_every_nth = 50000
        eval_every_nth = 50000

    agent = DQNAgent(env=args.env, gamma=0.99, target_update_freq=10000,
                     num_burn_in=50000, train_freq=4, batch_size=32, mode=args.mode)
    agent.fit(num_iterations=int(5e6), max_episode_length=100000,
              save_model_every_nth=10000, eval_every_nth=eval_every_nth,
              log_loss_every_nth=1000, video_every_nth=video_every_nth)
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description='Run DQN on Atari Game')
    parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name', required=True)
    parser.add_argument('-o', '--output', default='atari-v0', help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    args = parser.parse_args()

    print('Using Tensorflow Version of ' + tf.__version__)
    # args.input_shape = tuple(args.input_shape)
    args.output = get_output_folder(args.output, args.env)
    print("Output Folder: " + args.output)

    # here is where you should start up a session,
    # create your DQN agent, create your model, etc.
    # then you can run your fit method.
    sess = tf.Session()
    K.set_session(sess)
    env = gym.make(args.env)
    # env = gym.wrappers.Monitor(env, args.output + '/gym')
    num_actions = env.action_space.n

    # 0 linear; 1 deep; 2 dueling
    model = create_model(WINDOW, INPUT_SHAPE, num_actions, 1)
    atari_preprocessor = AtariPreprocessor(INPUT_SHAPE)
    history_preprocessor = HistoryPreprocessor(4)
    preprocessor = PreprocessorSequence([atari_preprocessor, history_preprocessor])
    memory = ReplayMemory(MAX_MEMORY, WINDOW)
    policy = GreedyEpsilonPolicy(0.05)
    dqn_agent = DQNAgent(model, num_actions, preprocessor, memory, policy, GAMMA,
                         TARGET_UPDATE_FREQ, INIT_MEMORY, TRAIN_FREQ, BATCH_SIZE,
                         double=False)

    optimizer = Adam(lr=0.00025, epsilon=1e-3)
    loss_func = mean_huber_loss
    dqn_agent.compile(optimizer, loss_func)
    # dqn_agent.calc_q_values(state)
    dqn_agent.fit(env, 100000, MAX_EPISODE_LENGTH)
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name')
    parser.add_argument('--seed', default=23333, type=int, help='Random seed')
    parser.add_argument('--memory_size', default=1000000, type=int, help='memory_size')
    args = parser.parse_args()

    seed_all(args.seed)
    env = gym.make(args.env)
    n_actions = env.action_space.n  # n = 6 for SpaceInvaders-v0
    env = wrap_deepmind(env, episode_life=True, clip_rewards=True,
                        frame_stack=True, scale=False)  # todo

    model = Model(in_channels=4, n_actions=n_actions)
    memory = ReplayMemory(max_size=args.memory_size)
    policy = LinearDecayGreedyEpsilonPolicy(n_actions=n_actions, start_value=1,
                                            end_value=0.1, num_steps=1000000)
    agent = DQNAgent(q_network=model, memory=memory, gamma=0.99,
                     target_update_freq=1000, num_burn_in=200000, batch_size=256,
                     policy=policy, train_freq=32)
    agent.fit(env, num_iterations=100000, max_episode_length=10000)
def main():
    # env = gym.make("Enduro-v0")
    # env = gym.make("SpaceInvaders-v0")
    # env = gym.make("Breakout-v0")
    model_name = "q2"
    if len(sys.argv) >= 2:
        model_name = sys.argv[1]
    if len(sys.argv) >= 3:
        env = gym.make(sys.argv[2])
    else:
        # env = gym.make("Enduro-v0")
        env = gym.make("SpaceInvaders-v0")
        # env = gym.make("Breakout-v0")

    # no skip frames
    env.frameskip = 1

    input_shape = (84, 84)
    batch_size = 1
    num_actions = env.action_space.n
    # 2 because it needs to save the current state and the future state;
    # no matter what it gets, it will always just pick the earlier one
    memory_size = 2
    memory_burn_in_num = 1
    start_epsilon = 1
    end_epsilon = 0.01
    decay_steps = 1000000
    target_update_freq = 1  # no targeting
    train_freq = 4  # How often you train the network
    history_size = 4

    history_prep = HistoryPreprocessor(history_size)
    atari_prep = AtariPreprocessor(input_shape, 0, 999)
    numpy_prep = NumpyPreprocessor()
    preprocessors = PreprocessorSequence([atari_prep, history_prep, numpy_prep])  # from left to right

    policy = LinearDecayGreedyEpsilonPolicy(start_epsilon, end_epsilon, decay_steps)
    linear_model = create_model(history_size, input_shape, num_actions, model_name)
    optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    loss_func = huber_loss
    # linear_model.compile(optimizer, loss_func)
    linear_model.summary()
    random_policy = UniformRandomPolicy(num_actions)
    # memory = ActionReplayMemory(1000000, 4)
    memory = ActionReplayMemory(memory_size, history_size)
    # memory_burn_in(env, memory, preprocessors, memory_burn_in_num, random_policy)

    # print(reward_arr)
    # print(curr_state_arr)
    agent = DQNAgent(linear_model, preprocessors, memory, policy, 0.99,
                     target_update_freq, None, train_freq, batch_size)
    agent.compile(optimizer, loss_func)
    agent.save_models()
    agent.fit(env, 1000000, 100000)
def main():
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o', '--output', default='atari-v0', help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--mode', choices=['train', 'test'], default='test')
    parser.add_argument('--network', choices=['deep', 'linear'], default='deep')
    parser.add_argument('--method', choices=['dqn', 'double', 'dueling'], default='dqn')
    parser.add_argument('--monitor', type=bool, default=True)
    parser.add_argument('--iter', type=int, default=2400000)
    parser.add_argument('--test_policy', choices=['Greedy', 'GreedyEpsilon'], default='GreedyEpsilon')
    args = parser.parse_args()

    args.seed = np.random.randint(0, 1000000, 1)[0]
    args.weights = 'models/dqn_{}_weights_{}_{}_{}.h5f'.format(args.env, args.method, args.network, args.iter)
    args.monitor_path = 'tmp/dqn_{}_weights_{}_{}_{}_{}'.format(args.env, args.method, args.network, args.iter, args.test_policy)
    if args.mode == 'train':
        args.monitor = False

    env = gym.make(args.env)
    if args.monitor:
        env = wrappers.Monitor(env, args.monitor_path)
    np.random.seed(args.seed)
    env.seed(args.seed)

    args.gamma = 0.99
    args.learning_rate = 0.0001
    args.epsilon = 0.05
    args.num_iterations = 5000000
    args.batch_size = 32
    args.window_length = 4
    args.num_burn_in = 50000
    args.target_update_freq = 10000
    args.log_interval = 10000
    args.model_checkpoint_interval = 10000
    args.train_freq = 4
    args.num_actions = env.action_space.n
    args.input_shape = (84, 84)
    args.memory_max_size = 1000000
    args.output = get_output_folder(args.output, args.env)
    args.suffix = args.method + '_' + args.network

    if args.method == 'dqn':
        args.enable_double_dqn = False
        args.enable_dueling_network = False
    elif args.method == 'double':
        args.enable_double_dqn = True
        args.enable_dueling_network = False
    elif args.method == 'dueling':
        args.enable_double_dqn = False
        args.enable_dueling_network = True
    else:
        print('Attention! Method wrong!!!')

    if args.test_policy == 'Greedy':
        test_policy = GreedyPolicy()
    elif args.test_policy == 'GreedyEpsilon':
        test_policy = GreedyEpsilonPolicy(args.epsilon)

    print(args)
    K.tensorflow_backend.set_session(get_session())
    model = create_model(args.window_length, args.input_shape, args.num_actions, args.network)

    # We create our preprocessors: the AtariPreprocessor only processes the current
    # frame the agent is seeing, and the sequence preprocessor constructs the state by
    # concatenating the 3 previous frames from the HistoryPreprocessor with the current
    # processed frame.
    Processor = {}
    Processor['Atari'] = AtariPreprocessor(args.input_shape)
    Processor['History'] = HistoryPreprocessor(args.window_length)
    ProcessorSequence = PreprocessorSequence(Processor)  # construct 84x84x4

    # We create our memory for saving all experience collected during training with
    # window length 4.
    memory = ReplayMemory(max_size=args.memory_max_size, input_shape=args.input_shape,
                          window_length=args.window_length)

    # We use a linear-decay greedy-epsilon policy that anneals epsilon from 1 to 0.1
    # over the first 1,000,000 iterations and then keeps epsilon at 0.1 to further
    # train the network.
    policy = LinearDecayGreedyEpsilonPolicy(GreedyEpsilonPolicy(args.epsilon), attr_name='eps',
                                            start_value=1, end_value=0.1, num_steps=1000000)

    # We construct our agent with 0.99 as the discount factor and 32 as the batch size.
    # We update the model every 4 iterations, but during the first 50000 iterations we
    # only collect data into the memory and do not update the model.
    dqn = DQNAgent(q_network=model, policy=policy, memory=memory,
                   num_actions=args.num_actions, test_policy=test_policy,
                   preprocessor=ProcessorSequence, gamma=args.gamma,
                   target_update_freq=args.target_update_freq,
                   num_burn_in=args.num_burn_in, train_freq=args.train_freq,
                   batch_size=args.batch_size,
                   enable_double_dqn=args.enable_double_dqn,
                   enable_dueling_network=args.enable_dueling_network)

    adam = Adam(lr=args.learning_rate)
    dqn.compile(optimizer=adam)

    if args.mode == 'train':
        weights_filename = 'dqn_{}_weights_{}.h5f'.format(args.env, args.suffix)
        checkpoint_weights_filename = 'dqn_' + args.env + '_weights_' + args.suffix + '_{step}.h5f'
        log_filename = 'dqn_{}_log_{}.json'.format(args.env, args.suffix)
        log_dir = '../tensorboard_{}_log_{}'.format(args.env, args.suffix)
        callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename,
                                             interval=args.model_checkpoint_interval)]
        callbacks += [FileLogger(log_filename, interval=100)]
        callbacks += [TensorboardStepVisualization(log_dir=log_dir, histogram_freq=1,
                                                   write_graph=True, write_images=True)]

        # Start training. We don't apply action repetition explicitly since the game
        # will randomly skip frames itself.
        dqn.fit(env, callbacks=callbacks, verbose=1, num_iterations=args.num_iterations,
                action_repetition=1, log_interval=args.log_interval, visualize=True)

        dqn.save_weights(weights_filename, overwrite=True)
        dqn.evaluate(env, num_episodes=10, visualize=True, num_burn_in=5, action_repetition=1)
    elif args.mode == 'test':
        weights_filename = 'dqn_{}_weights_{}.h5f'.format(args.env, args.suffix)
        if args.weights:
            weights_filename = args.weights
        dqn.load_weights(weights_filename)
        dqn.evaluate(env, num_episodes=250, visualize=True, num_burn_in=5, action_repetition=1)

        # We upload our result to OpenAI Gym.
        if args.monitor:
            env.close()
            gym.upload(args.monitor_path, api_key='sk_J62obX9PQg2ExrM6H9rvzQ')
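# Hedged aside: the comments in the previous function describe annealing epsilon
# from 1.0 to 0.1 over the first 1,000,000 steps and then holding it fixed. A
# minimal policy with that behaviour is sketched below; the constructor signatures
# used above vary from project to project (some wrap a GreedyEpsilonPolicy, some
# take the action count), so treat these parameter names as assumptions.
import numpy as np

class SketchLinearDecayGreedyEpsilonPolicy:
    def __init__(self, start_value=1.0, end_value=0.1, num_steps=1000000):
        self.start_value = start_value
        self.end_value = end_value
        self.num_steps = num_steps
        self.step = 0

    def select_action(self, q_values):
        # Linearly interpolate epsilon, then clamp at the final value.
        frac = min(float(self.step) / self.num_steps, 1.0)
        epsilon = self.start_value + frac * (self.end_value - self.start_value)
        self.step += 1
        if np.random.rand() < epsilon:
            return np.random.randint(len(q_values))
        return int(np.argmax(q_values))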
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('-e', '--env', default='Enduro-v0', help='Atari env name')
    parser.add_argument('-o', '--output', default='atari-v0', help='Directory to save data to')
    parser.add_argument('-n', '--network', default='dqn', help='Network Type')
    args = parser.parse_args()
    print(args)

    # define params
    gamma = 0.99
    target_update_freq = 10000
    num_burn_in = 50000
    train_freq = 4
    batch_size = 32
    hist_length = 4
    memory_size = 1000000
    num_iterations = 5000000
    params = {
        'action_update_freq': 1,
        'epsilon': 0.05,
        'eps_start': 1.0,
        'eps_end': 0.1,
        'eps_num_steps': 1000000,
        'disp_loss_freq': 4000,
        'eval_freq': 10000,
        'weight_save_freq': 50000,
        'eval_episodes': 20,
        'print_freq': 100,
    }

    # create environment
    env = gym.make(args.env)
    env_test = gym.make(args.env)
    num_actions = env.action_space.n

    # create TensorFlow session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)

    # set up preprocessors
    atari_preprocessor = AtariPreprocessor((84, 84))
    hist_preprocessor = HistoryPreprocessor(hist_length)
    preprocessor = PreprocessorSequence((atari_preprocessor, hist_preprocessor))
    test_atari_preprocessor = AtariPreprocessor((84, 84))
    test_hist_preprocessor = HistoryPreprocessor(hist_length)
    test_preprocessor = PreprocessorSequence((test_atari_preprocessor, test_hist_preprocessor))
    print("Set up preprocessors")

    # set up replay memory
    memory = ReplayMemory(memory_size, memory_size)
    print("Set up memory")

    # get model and set up agent
    if args.network == 'dqn':
        q_network = create_model_dqn(hist_length, (84, 84), num_actions)
        agent = DQNAgent(q_network, preprocessor, test_preprocessor, memory, gamma,
                         target_update_freq, num_burn_in, train_freq, batch_size, params)
    elif args.network == 'ddqn':
        q_network = create_model_dqn(hist_length, (84, 84), num_actions)
        agent = DoubleDQNAgent(q_network, preprocessor, test_preprocessor, memory, gamma,
                               target_update_freq, num_burn_in, train_freq, batch_size, params)
    elif args.network == 'duel':
        q_network = create_model_dueling(hist_length, (84, 84), num_actions)
        agent = DQNAgent(q_network, preprocessor, test_preprocessor, memory, gamma,
                         target_update_freq, num_burn_in, train_freq, batch_size, params)
    elif args.network == 'linear_naive':
        params['use_replay'] = False
        params['use_target'] = False
        q_network = create_model_linear(hist_length, (84, 84), num_actions)
        # set params for no replay and no target
        memory.resize(1)
        num_burn_in = 0
        agent = LinearDQNAgent(q_network, preprocessor, test_preprocessor, memory, gamma,
                               target_update_freq, num_burn_in, train_freq, batch_size, params)
    elif args.network == 'linear_soph':
        params['use_replay'] = True
        params['use_target'] = True
        q_network = create_model_linear(hist_length, (84, 84), num_actions)
        agent = LinearDQNAgent(q_network, preprocessor, test_preprocessor, memory, gamma,
                               target_update_freq, num_burn_in, train_freq, batch_size, params)
    elif args.network == 'linear_double':
        q_network = create_model_linear(hist_length, (84, 84), num_actions)
        agent = DoubleDQNAgent(q_network, preprocessor, test_preprocessor, memory, gamma,
                               target_update_freq, num_burn_in, train_freq, batch_size, params)

    # Compile model in agent
    adam = Adam(lr=1e-4)
    agent.compile(adam, mean_huber_loss, args.output)
    print("Set up agent.")

    # fit model
    print("Fitting Model.")
    agent.fit(env, env_test, num_iterations, args.output, 1e4)
def main():  # noqa: D103
    # SpaceInvaders-v0
    # Enduro-v0
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name')
    # parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name')
    # parser.add_argument('--env', default='PendulumSai-v0', help='Atari env name')
    parser.add_argument('-o', '--output', default='atari-v0', help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    args = parser.parse_args()
    # args.input_shape = tuple(args.input_shape)
    # args.output = get_output_folder(args.output, args.env)

    # here is where you should start up a session,
    # create your DQN agent, create your model, etc.
    # then you can run your fit method.
    model_name = 'linear'
    env = gym.make(args.env)
    num_iter = 2000000
    max_epi_iter = 1000
    epsilon = 0.4
    window = 4
    gamma = 0.99
    target_update_freq = 5000
    train_freq = 1
    batch_size = 32
    num_burn_in = 5000
    num_actions = 3  # env.action_space.n
    state_size = (84, 84, 1)
    new_size = state_size
    max_size = 1000000
    lr = 0.00020
    beta_1 = 0.9
    beta_2 = 0.999
    epsilon2 = 1e-08
    decay = 0.0

    u_policy = UniformRandomPolicy(num_actions)
    ge_policy = GreedyEpsilonPolicy(epsilon)
    g_policy = GreedyPolicy()
    policy = {'u_policy': u_policy, 'ge_policy': ge_policy, 'g_policy': g_policy}
    # preprocessor = PreprocessorSequence([AtariPreprocessor(new_size), HistoryPreprocessor(window)])
    preprocessor = AtariPreprocessor(new_size)
    memory = SequentialMemory(max_size=max_size, window_length=window)
    model = create_model(window, state_size, num_actions)
    print(model.summary())

    dqnA = DQNAgent(q_network=model, preprocessor=preprocessor, memory=memory,
                    policy=policy, gamma=gamma, target_update_freq=target_update_freq,
                    num_burn_in=num_burn_in, train_freq=train_freq,
                    batch_size=batch_size, model_name=model_name)
    # testing
    # selected_action = dqnA.select_action(np.random.rand(1, 210, 160, 12), train=1, warmup_phase=0)
    h_loss = huber_loss
    optimizer = Adam(lr=lr, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon2, decay=decay)
    dqnA.compile(optimizer, h_loss)
    # callback1 = ProgbarLogger(count_mode='samples')
    dqnA.fit(env, num_iterations=num_iter, max_episode_length=max_epi_iter)
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description='Run DQN on Atari environment')
    parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name')
    parser.add_argument('-o', '--output', default='atari-v0', help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--iters', default=5000000, type=int, help='Number of interactions with environment')
    parser.add_argument('--mb_size', default=32, type=int, help='Minibatch size')
    parser.add_argument('--max_episode_len', default=2000, type=int, help='Maximum length of episode')
    parser.add_argument('--frame_count', default=4, type=int, help='Number of frames to feed to Q-network')
    parser.add_argument('--eps', default=0.05, type=float, help='Epsilon value for epsilon-greedy exploration')
    parser.add_argument('--learning_rate', default=0.0001, type=float, help='Learning rate for training')
    parser.add_argument('--discount', default=0.99, type=float, help='Discounting factor')
    parser.add_argument('--replay_mem_size', default=500000, type=int, help='Maximum size of replay memory')
    parser.add_argument('--train_freq', default=3, type=int, help='Frequency of updating Q-network')
    parser.add_argument('--target_update_freq', default=10000, type=int, help='Frequency of updating target network')
    parser.add_argument('--eval', action='store_true', help='Indicator to evaluate model on given environment')
    parser.add_argument('--filename', type=str, help='Filename for saved model to load during evaluation')
    parser.add_argument('--model_type', type=str,
                        help='Type of model to use: naive, linear, deep, linear_double, deep_double, dueling')
    parser.add_argument('--initial_replay_size', default=50000, type=int,
                        help='Initial size of the replay memory up to which a uniform random policy should be used')
    parser.add_argument('--evaluate_every', default=5000, type=int, help='Number of updates to run evaluation after')
    args = parser.parse_args()
    # args.input_shape = tuple(args.input_shape)

    # Get output folder
    args.output = get_output_folder(args.output, args.env)

    # Create environment
    env = gym.make(args.env)
    env.reset()

    # Create model
    preprocessed_input_shape = (84, 84)
    model = create_model(args.frame_count, preprocessed_input_shape, env.action_space.n,
                         args.env + "-test", args.model_type)

    # Initialize replay memory
    replay_mem = ReplayMemory(args.replay_mem_size, args.frame_count)

    # Create agent
    preprocessor_seq = PreprocessorSequence([AtariPreprocessor(preprocessed_input_shape)])
    dqn = DQNAgent(model, preprocessor_seq, replay_mem, args.discount, args.target_update_freq,
                   args.initial_replay_size, args.train_freq, args.mb_size, args.eps,
                   args.output, args.evaluate_every, args.model_type)
    dqn.compile()

    if args.eval:
        dqn.eval_on_file(env, args.filename)
    else:
        if args.model_type == 'naive' or args.model_type == 'linear_double':
            dqn.fit_naive(env, args.iters, args.max_episode_len)
        else:
            dqn.fit(env, args.iters, args.max_episode_len)
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description='Run DQN on Atari Space Invaders')
    parser.add_argument('--seed', default=10703, type=int, help='Random seed')
    parser.add_argument('--input_shape', default=SIZE_OF_STATE, help='Input shape')
    parser.add_argument('--gamma', default=0.99, help='Discount factor')
    # TODO experiment with this value.
    parser.add_argument('--epsilon', default=0.1, help='Final exploration probability in epsilon-greedy')
    parser.add_argument('--learning_rate', default=0.00025, help='Training learning rate.')
    parser.add_argument('--batch_size', default=32, type=int, help='Batch size of the training part')
    parser.add_argument('--question', type=int, default=7, help='Which hw question to run.')
    parser.add_argument('--evaluate', action='store_true', help='Only affects worker. Run evaluation instead of training.')
    parser.add_argument('--worker_epsilon', type=float, help='Only affects worker. Override epsilon to use (instead of one in file).')
    parser.add_argument('--skip_model_restore', action='store_true', help='Only affects worker. Use a newly initialized model instead of restoring one.')
    parser.add_argument('--generate_fixed_samples', action='store_true',
                        help=('Special case execution. Generate fixed samples and close. ' +
                              'This is necessary to run whenever the network or action space changes.'))
    parser.add_argument('--ai_input_dir', default='gcloud/inputs/', help='Input directory with initialization files.')
    parser.add_argument('--ai_output_dir', default='gcloud/outputs/', help='Output directory for gameplay files.')
    parser.add_argument('--is_worker', dest='is_manager', action='store_false', help='Whether this is a worker (no training).')
    parser.add_argument('--is_manager', dest='is_manager', action='store_true', help='Whether this is a manager (trains).')
    parser.set_defaults(is_manager=True)
    parser.add_argument('--psc', action='store_true',
                        help=('Only affects manager. Whether on PSC, ' +
                              'and should for example reduce disk usage.'))

    # Copied from original phillip code (run.py).
    for opt in CPU.full_opts():
        opt.update_parser(parser)
    parser.add_argument("--dolphin", action="store_true", default=None, help="run dolphin")
    for opt in DolphinRunner.full_opts():
        opt.update_parser(parser)

    args = parser.parse_args()
    # run.sh might pass these in via environment variable, so user directory
    # might not already be expanded.
    args.ai_input_dir = os.path.expanduser(args.ai_input_dir)
    args.ai_output_dir = os.path.expanduser(args.ai_output_dir)

    if args.is_manager:
        random.seed(args.seed)
        np.random.seed(args.seed)
        tf.set_random_seed(args.seed)

    do_evaluation = args.evaluate or random.random() < WORKER_EVALUATION_PROBABILITY
    if do_evaluation or args.generate_fixed_samples:
        args.cpu = EVAL_CPU_LEVEL
        print('OVERRIDING cpu level to: ' + str(EVAL_CPU_LEVEL))

    if args.generate_fixed_samples and args.is_manager:
        raise Exception('Can not generate fixed samples as manager. Must use ' +
                        '--is_worker and all other necessary flags (e.g. --iso ISO_PATH)')

    env = SmashEnv()
    if not args.is_manager:
        env.make(args)  # Opens Dolphin.

    question_settings = get_question_settings(args.question, args.batch_size)

    online_model, online_params = create_model(
        input_shape=args.input_shape,
        num_actions=env.action_space.n,
        model_name='online_model',
        create_network_fn=question_settings['create_network_fn'],
        learning_rate=args.learning_rate)

    target_model = online_model
    update_target_params_ops = []
    if (question_settings['target_update_freq'] is not None or
            question_settings['is_double_network']):
        target_model, target_params = create_model(
            input_shape=args.input_shape,
            num_actions=env.action_space.n,
            model_name='target_model',
            create_network_fn=question_settings['create_network_fn'],
            learning_rate=args.learning_rate)
        update_target_params_ops = [t.assign(s) for s, t in zip(online_params, target_params)]

    replay_memory = ReplayMemory(
        max_size=question_settings['replay_memory_size'],
        error_if_full=(not args.is_manager))

    saver = tf.train.Saver(max_to_keep=None)
    agent = DQNAgent(online_model=online_model,
                     target_model=target_model,
                     memory=replay_memory,
                     gamma=args.gamma,
                     target_update_freq=question_settings['target_update_freq'],
                     update_target_params_ops=update_target_params_ops,
                     batch_size=args.batch_size,
                     is_double_network=question_settings['is_double_network'],
                     is_double_dqn=question_settings['is_double_dqn'])

    sess = tf.Session()
    with sess.as_default():
        if args.generate_fixed_samples:
            print('Generating ' + str(NUM_FIXED_SAMPLES) + ' fixed samples and saving to ./' + FIXED_SAMPLES_FILENAME)
            print('This file is only ever used on the manager.')
            agent.compile(sess)
            fix_samples = agent.prepare_fixed_samples(
                env, sess, UniformRandomPolicy(env.action_space.n),
                NUM_FIXED_SAMPLES, MAX_EPISODE_LENGTH)
            env.terminate()
            with open(FIXED_SAMPLES_FILENAME, 'wb') as f:
                pickle.dump(fix_samples, f)
            return

        if args.is_manager or args.skip_model_restore:
            agent.compile(sess)
        else:
            saver.restore(sess, os.path.join(args.ai_input_dir, WORKER_INPUT_MODEL_FILENAME))

        print('_________________')
        print('number_actions: ' + str(env.action_space.n))

        # Worker code.
        if not args.is_manager:
            print('ai_input_dir: ' + args.ai_input_dir)
            print('ai_output_dir: ' + args.ai_output_dir)

            if do_evaluation:
                evaluation = agent.evaluate(env, sess, GreedyPolicy(), EVAL_EPISODES, MAX_EPISODE_LENGTH)
                print('Evaluation: ' + str(evaluation))
                with open(FIXED_SAMPLES_FILENAME, 'rb') as fixed_samples_f:
                    fix_samples = pickle.load(fixed_samples_f)
                mean_max_Q = calculate_mean_max_Q(sess, online_model, fix_samples)
                evaluation = evaluation + (mean_max_Q,)
                with open(os.path.join(args.ai_output_dir, WORKER_OUTPUT_EVALUATE_FILENAME), 'wb') as f:
                    pickle.dump(evaluation, f)
                env.terminate()
                return

            worker_epsilon = args.worker_epsilon
            if worker_epsilon is None:
                with open(os.path.join(args.ai_input_dir, WORKER_INPUT_EPSILON_FILENAME)) as f:
                    lines = f.readlines()
                    # TODO handle unexpected lines better than just ignoring?
                    worker_epsilon = float(lines[0])
            print('Worker epsilon: ' + str(worker_epsilon))
            train_policy = GreedyEpsilonPolicy(worker_epsilon)

            agent.play(env, sess, train_policy, total_seconds=PLAY_TOTAL_SECONDS,
                       max_episode_length=MAX_EPISODE_LENGTH)
            replay_memory.save_to_file(os.path.join(args.ai_output_dir, WORKER_OUTPUT_GAMEPLAY_FILENAME))
            env.terminate()
            return

        # Manager code.
        mprint('Loading fix samples')
        with open(FIXED_SAMPLES_FILENAME, 'rb') as fixed_samples_f:
            fix_samples = pickle.load(fixed_samples_f)

        evaluation_dirs = set()
        play_dirs = set()
        save_model(saver, sess, args.ai_input_dir, epsilon=1.0)
        epsilon_generator = LinearDecayGreedyEpsilonPolicy(1.0, args.epsilon, TOTAL_WORKER_JOBS / 5.0)
        fits_so_far = 0
        mprint('Begin to train (now safe to run gcloud)')
        mprint('Initial mean_max_q: ' + str(calculate_mean_max_Q(sess, online_model, fix_samples)))

        while len(play_dirs) < TOTAL_WORKER_JOBS:
            output_dirs = os.listdir(args.ai_output_dir)
            output_dirs = [os.path.join(args.ai_output_dir, x) for x in output_dirs]
            output_dirs = set(x for x in output_dirs if os.path.isdir(x))
            new_dirs = sorted(output_dirs - evaluation_dirs - play_dirs)

            if len(new_dirs) == 0:
                time.sleep(0.1)
                continue

            new_dir = new_dirs[-1]  # Most recent gameplay.
            evaluation_path = os.path.join(new_dir, WORKER_OUTPUT_EVALUATE_FILENAME)

            if os.path.isfile(evaluation_path):
                evaluation_dirs.add(new_dir)
                with open(evaluation_path, 'rb') as evaluation_file:
                    rewards, game_lengths, mean_max_Q = pickle.load(evaluation_file)
                evaluation = [np.mean(rewards), np.std(rewards),
                              np.mean(game_lengths), np.std(game_lengths),
                              mean_max_Q]
                mprint('Evaluation: ' + '\t'.join(str(x) for x in evaluation))
                continue

            memory_path = os.path.join(new_dir, WORKER_OUTPUT_GAMEPLAY_FILENAME)
            try:
                if os.path.getsize(memory_path) == 0:
                    # TODO Figure out why this happens despite temporary directory work.
                    # Also sometimes the file doesn't exist? Hence the try/except.
                    mprint('Output not ready somehow: ' + memory_path)
                    time.sleep(0.1)
                    continue

                with open(memory_path, 'rb') as memory_file:
                    worker_memories = pickle.load(memory_file)
            except Exception as exception:
                print('Error reading ' + memory_path + ': ' + str(exception.args))
                time.sleep(0.1)
                continue

            for worker_memory in worker_memories:
                replay_memory.append(*worker_memory)
            if args.psc:
                os.remove(memory_path)

            play_dirs.add(new_dir)
            if len(play_dirs) <= NUM_BURN_IN_JOBS:
                mprint('Skip training because still burn in.')
                mprint('len(worker_memories): ' + str(len(worker_memories)))
                continue

            for _ in range(int(len(worker_memories) * FITS_PER_SINGLE_MEMORY)):
                agent.fit(sess, fits_so_far)
                fits_so_far += 1

            # Partial evaluation to give frequent insight into agent progress.
            # Last time checked, this took ~0.1 seconds to complete.
            mprint('mean_max_q, len(worker_memories): ' +
                   str(calculate_mean_max_Q(sess, online_model, fix_samples)) +
                   ', ' + str(len(worker_memories)))

            # Always decrement epsilon (e.g. not just when saving model).
            model_epsilon = epsilon_generator.get_epsilon(decay_epsilon=True)
            if len(play_dirs) % SAVE_MODEL_EVERY == 0:
                save_model(saver, sess, args.ai_input_dir, model_epsilon)
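# Hedged aside: the manager loop above logs `calculate_mean_max_Q(sess, online_model,
# fix_samples)` as a cheap progress metric -- the average, over a fixed set of held-out
# states, of the maximum Q-value predicted by the online network (as in Mnih et al.,
# 2015). The real helper is not shown in this file; the sketch below assumes the model
# exposes `input` and `output` tensors, which may not match the actual project code.
import numpy as np

def sketch_calculate_mean_max_Q(sess, online_model, fix_samples):
    q_values = sess.run(online_model.output,
                        feed_dict={online_model.input: fix_samples})
    return float(np.mean(np.max(q_values, axis=1)))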
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o', '--output', default='../log/', help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--gamma', default=0.99, type=float, help='Discount factor')
    parser.add_argument('--batch_size', default=32, type=int, help='Minibatch size')
    parser.add_argument('--learning_rate', default=0.0001, type=float, help='Learning rate')
    parser.add_argument('--initial_epsilon', default=1.0, type=float, help='Initial exploration probability in epsilon-greedy')
    parser.add_argument('--final_epsilon', default=0.05, type=float, help='Final exploration probability in epsilon-greedy')
    parser.add_argument('--exploration_steps', default=2000000, type=int,
                        help='Number of steps over which the initial value of epsilon is linearly annealed to its final value')
    parser.add_argument('--num_samples', default=10000000, type=int, help='Number of training samples from the environment in training')
    parser.add_argument('--num_frames', default=4, type=int, help='Number of frames to feed to Q-Network')
    parser.add_argument('--num_frames_mv', default=10, type=int, help='Number of frames used to detect movement')
    parser.add_argument('--frame_width', default=84, type=int, help='Resized frame width')
    parser.add_argument('--frame_height', default=84, type=int, help='Resized frame height')
    parser.add_argument('--replay_memory_size', default=1000000, type=int, help='Number of replay memory the agent uses for training')
    parser.add_argument('--target_update_freq', default=10000, type=int, help='The frequency with which the target network is updated')
    parser.add_argument('--train_freq', default=4, type=int, help='The frequency of actions wrt Q-network update')
    parser.add_argument('--save_freq', default=200000, type=int, help='The frequency with which the network is saved')
    parser.add_argument('--eval_freq', default=200000, type=int, help='The frequency with which the policy is evaluated')
    parser.add_argument('--num_burn_in', default=50000, type=int,
                        help='Number of steps to populate the replay memory before training starts')
    parser.add_argument('--load_network', default=False, action='store_true', help='Load trained model')
    parser.add_argument('--load_network_path', default='', help='the path to the trained model file')
    parser.add_argument('--net_mode', default='dqn', help='choose the mode of net, can be linear, dqn, duel')
    parser.add_argument('--max_episode_length', default=10000, type=int, help='max length of each episode')
    parser.add_argument('--num_episodes_at_test', default=10, type=int, help='Number of episodes the agent plays at test')
    parser.add_argument('--ddqn', default=False, dest='ddqn', action='store_true', help='enable ddqn')
    parser.add_argument('--train', default=True, dest='train', action='store_true', help='Train mode')
    parser.add_argument('--test', dest='train', action='store_false', help='Test mode')
    parser.add_argument('--no_experience', default=False, action='store_true', help='do not use experience replay')
    parser.add_argument('--no_target', default=False, action='store_true', help='do not use target fixing')
    parser.add_argument('--no_monitor', default=False, action='store_true', help='do not record video')
    parser.add_argument('-p', '--platform', default='rle', help='rle or atari. rle: rle; atari: gym-atari')
    parser.add_argument('-pl', '--perlife', default=False, action='store_true', help='use per life or not')
    parser.add_argument('-mv', '--mv_reward', default=False, action='store_true', help='use movement reward or not')
    parser.add_argument('-c', '--clip_reward', default=False, action='store_true', help='clip reward or not')
    parser.add_argument('--decay_reward', default=False, action='store_true', help='decay reward or not')
    parser.add_argument('--expert_memory', default=None, help='path of the expert memory')
    parser.add_argument('--initial_prob_replaying_expert', default=1.0, type=float, help='Initial probability of using expert replaying memory')
    parser.add_argument('--final_prob_replaying_expert', default=0.05, type=float, help='Final probability of using expert replaying memory')
    parser.add_argument('--steps_replaying_expert', default=1000000, type=float,
                        help='# steps over which the initial prob of replaying expert memory is linearly annealed to its final value')
    parser.add_argument('--trace_dir', default='', help='the trace dir for expert')
    parser.add_argument('--trace2mem', default=False, action='store_true', help='convert trace to memory')
    parser.add_argument('--mem_dump', default='', help='the path of memory dump')
    args = parser.parse_args()
    args.output = get_output_folder(args.output, args.env)

    if args.trace2mem:
        trace2mem(args)
        exit(0)

    if args.platform == 'atari':
        env = gym.make(args.env)
    else:
        rom_path = 'roms/' + args.env
        if args.no_monitor:
            env = rle(rom_path, record=True, path=args.output)
        else:
            env = rle(rom_path)

    print("Output saved to: ", args.output)
    print("Args used:")
    print(args)

    # here is where you should start up a session,
    # create your DQN agent, create your model, etc.
    # then you can run your fit method.
    num_actions = env.action_space.n
    print("Game ", args.env, " #actions: ", num_actions)
    dqn = DQNAgent(args, num_actions)

    if args.train:
        print("Training mode.")
        if args.perlife:
            env = RLEEnvPerLifeWrapper(env)
        dqn.fit(env, args.num_samples, args.max_episode_length)
    else:
        print("Evaluation mode.")
        dqn.evaluate(env, args.num_episodes_at_test, args.max_episode_length, not args.no_monitor)
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='SpaceInvadersDeterministic-v3', help='Atari env name')
    parser.add_argument('-o', '--output', default='atari-v0', help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--model', default='dqn', help='Q Network type to use.')
    parser.add_argument('--double', action='store_true')

    model_map = {
        'linear': LinearQN,
        'mlp': MLP,
        'dqn': DQN,
        'dueling': DuelingDQN
    }

    args = parser.parse_args()
    args.model = args.model.lower()
    if args.model not in model_map:
        print("Invalid model type. Valid types are", model_map.keys())
        sys.exit(1)

    args.output = get_output_folder(args.output, args.env)

    # here is where you should start up a session,
    # create your DQN agent, create your model, etc.
    # then you can run your fit method.
    env = gym.make(args.env)
    monitored_env = gym.wrappers.Monitor(
        gym.make(args.env), args.output,
        video_callable=lambda i: i % EVAL_NUM_EPISODES == 0)

    atari = not args.env.startswith("CartPole")
    if atari:
        input_shape = (IMAGE_SIZE, IMAGE_SIZE)
        preprocessor = lambda: PreprocessorSequence(
            AtariPreprocessor(new_size=input_shape),
            HistoryPreprocessor(history_length=WINDOW_SIZE, max_over=True))
    else:
        input_shape = (4,)
        preprocessor = lambda: HistoryPreprocessor(history_length=WINDOW_SIZE)

    memory = ExperienceReplay(max_size=REPLAY_BUFFER_SIZE, window_length=WINDOW_SIZE)
    NUM_ACTIONS = env.action_space.n
    # policy = UniformRandomPolicy(num_actions=NUM_ACTIONS)
    # policy = GreedyEpsilonPolicy(NUM_ACTIONS, EPSILON)
    policy = LinearDecayGreedyEpsilonPolicy(NUM_ACTIONS, 1.0, EPSILON, NUM_ITERATIONS_LINEAR_DECAY)
    model = model_map[args.model](exp_name=args.output)

    agent = DQNAgent(q_network=model, preprocessor=preprocessor, memory=memory,
                     policy=policy, gamma=GAMMA, target_update_freq=TARGET_UPDATE_FREQ,
                     replay_buffer_size=REPLAY_BUFFER_SIZE, train_freq=TRAIN_FREQ,
                     batch_size=BATCH_SIZE, output_dir=args.output, double_dqn=args.double)
    agent.compile(window=WINDOW_SIZE, input_shape=input_shape,
                  num_actions=NUM_ACTIONS, model_name='q_network')

    signal.signal(signal.SIGINT, agent.signal_handler)
    signal.signal(signal.SIGTERM, agent.signal_handler)
    signal.signal(signal.SIGHUP, agent.signal_handler)

    agent.fit(env, monitored_env, num_iterations=NUM_ITERATIONS)
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name')
    parser.add_argument('--network_name', default='linear_q_network', type=str, help='Type of model to use')
    parser.add_argument('--window', default=4, type=int, help='how many frames are used each time')
    parser.add_argument('--new_size', default=(84, 84), type=tuple, help='new size')
    parser.add_argument('--batch_size', default=32, type=int, help='Batch size')
    parser.add_argument('--replay_buffer_size', default=750000, type=int, help='Replay buffer size')
    parser.add_argument('--gamma', default=0.99, type=float, help='Discount factor')
    parser.add_argument('--alpha', default=0.0001, type=float, help='Learning rate')
    parser.add_argument('--epsilon', default=0.05, type=float, help='Exploration probability for epsilon-greedy')
    parser.add_argument('--target_update_freq', default=10000, type=int, help='Frequency for copying weights to target network')
    parser.add_argument('--num_burn_in', default=50000, type=int, help='Number of prefilled samples in the replay buffer')
    parser.add_argument('--num_iterations', default=5000000, type=int, help='Number of overall interactions with the environment')
    parser.add_argument('--max_episode_length', default=200000, type=int, help='Terminate earlier for one episode')
    parser.add_argument('--train_freq', default=4, type=int, help='Frequency for training')
    parser.add_argument('--repetition_times', default=3, type=int, help='Parameter for action repetition')
    parser.add_argument('-o', '--output', default='atari-v0', type=str, help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--experience_replay', default=False, type=bool, help='Choose whether or not to use experience replay')
    parser.add_argument('--train', default=True, type=bool, help='Train/Evaluate, set True if train the model')
    parser.add_argument('--model_path', default='/media/hongbao/Study/Courses/10703/hw2/lqn_noexp', type=str, help='specify model path to evaluation')
    parser.add_argument('--max_grad', default=1.0, type=float, help='Parameter for huber loss')
    parser.add_argument('--model_num', default=5000000, type=int, help='specify saved model number during train')
    parser.add_argument('--log_dir', default='log', type=str, help='specify log folder to save evaluate result')
    parser.add_argument('--eval_num', default=100, type=int, help='number of evaluation to run')
    parser.add_argument('--save_freq', default=100000, type=int, help='model save frequency')
    args = parser.parse_args()

    print("\nParameters:")
    for arg in vars(args):
        print(arg, getattr(args, arg))
    print("")

    env = gym.make(args.env)
    num_actions = env.action_space.n

    # define model object
    preprocessor = AtariPreprocessor(args.new_size)
    memory = ReplayMemory(args.replay_buffer_size, args.window)

    # Initiating policy for both tasks (training and evaluating)
    policy = LinearDecayGreedyEpsilonPolicy(args.epsilon, 0, 1000000)

    if not args.train:
        '''Evaluate the model'''
        # check model path
        if args.model_path == '':
            print("Model path must be set when evaluating")
            exit(1)

        # specific log file to save result
        log_file = os.path.join(args.log_dir, args.network_name, str(args.model_num))
        model_dir = os.path.join(args.model_path, args.network_name, str(args.model_num))

        with tf.Session() as sess:
            # load model
            with open(model_dir + ".json", 'r') as json_file:
                loaded_model_json = json_file.read()
                q_network_online = model_from_json(loaded_model_json)
                q_network_target = model_from_json(loaded_model_json)
            sess.run(tf.global_variables_initializer())

            # load weights into model
            q_network_online.load_weights(model_dir + ".h5")
            q_network_target.load_weights(model_dir + ".h5")

            dqn_agent = DQNAgent((q_network_online, q_network_target), preprocessor,
                                 memory, policy, num_actions, args.gamma,
                                 args.target_update_freq, args.num_burn_in,
                                 args.train_freq, args.batch_size,
                                 args.experience_replay, args.repetition_times,
                                 args.network_name, args.max_grad, args.env, sess)
            dqn_agent.evaluate(env, log_file, args.eval_num)
        exit(0)

    '''Train the model'''
    q_network_online = create_model(args.window, args.new_size, num_actions, args.network_name, True)
    q_network_target = create_model(args.window, args.new_size, num_actions, args.network_name, False)

    # create output dir; meant to raise an error when the dir exists, to avoid overwriting
    os.mkdir(os.path.join(args.output, args.network_name))

    with tf.Session() as sess:
        dqn_agent = DQNAgent((q_network_online, q_network_target), preprocessor, memory,
                             policy, num_actions, args.gamma, args.target_update_freq,
                             args.num_burn_in, args.train_freq, args.batch_size,
                             args.experience_replay, args.repetition_times,
                             args.network_name, args.max_grad, args.env, sess)
        optimizer = tf.train.AdamOptimizer(learning_rate=args.alpha)
        dqn_agent.compile(optimizer, mean_huber_loss)
        dqn_agent.fit(env, args.num_iterations, os.path.join(args.output, args.network_name),
                      args.save_freq, args.max_episode_length)
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    # parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name')
    parser.add_argument('--output', default='results', help='Directory to save data to')
    parser.add_argument('-l', '--isLinear', default=0, type=int, choices=range(0, 2),
                        help='1: use linear model; 0: use deep model')
    parser.add_argument('-m', '--modelType', default='q', choices=['q', 'double', 'dueling'],
                        help='q: q learning; double: double q learning; dueling: dueling q learning')
    parser.add_argument('-s', '--simple', default=0, type=int, choices=range(0, 2),
                        help='1: without replay or target fixing; 0: use replay and target fixing')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    args = parser.parse_args()
    # args.input_shape = tuple(args.input_shape)

    if not os.path.exists(args.output):
        os.makedirs(args.output)
    model_name = ('linear_' if args.isLinear else 'deep_') + args.modelType + ('_simple' if args.simple else '')
    args.output = get_output_folder(args.output + '/' + model_name, args.env)

    env = gym.make(args.env)
    # env = gym.wrappers.Monitor(env, args.output)
    env.seed(args.seed)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)
    K.get_session().run(tf.initialize_all_variables())

    is_linear = args.isLinear
    agent = DQNAgent(
        q_network=create_model(4, (84, 84), env.action_space.n, is_linear, args.modelType),
        q_network2=create_model(4, (84, 84), env.action_space.n, is_linear, args.modelType),
        preprocessor=AtariPreprocessor((84, 84)),
        memory=ReplayMemory(1000000, 4),
        gamma=0.99,
        target_update_freq=10000,
        num_burn_in=50000,
        train_freq=4,
        batch_size=32,
        is_linear=is_linear,
        model_type=args.modelType,
        use_replay_and_target_fixing=(not args.simple),
        epsilon=0,  # 0.05
        action_interval=4,
        output_path=args.output,
        save_freq=100000)
    agent.compile(lr=0.0001)
    agent.fit(env, 5000000)
    agent.load_weights()
    agent.evaluate(env, 100, video_path_suffix='final')
    env.close()
def main():
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o', '--output', default='atari-v0', help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--type', default="DQN", help='Type of network to train. ()')
    args = parser.parse_args()

    # check if valid network type
    network_types = ["Linear", "LinearERTF", "DoubleLinear", "DQN", "DDQN", "Duling"]
    if not (args.type in network_types):
        raise ValueError("Invalid network type.")
    NETWORK_TYPE = args.type

    # set up environment model
    env = gym.make(str(args.env))
    NUM_ACTIONS = env.action_space.n

    # make dqn agent
    """
    FRAMES_PER_STATE = 4
    INPUT_SHAPE = (84, 84)
    GAMMA = .99
    NUM_ITERATIONS = 1000000
    TARGET_UPDATE_FREQ = 100000
    BATCH_SIZE = 32
    REPLAY_MEM_SIZE = 1000000
    REPLAY_START_SIZE = 50000
    MAX_EPISODE_LEN = 100
    REWARD_SAMPLE = 1000
    HELD_OUT_STATES_SIZE = 1000
    """
    FRAMES_PER_STATE = 4
    INPUT_SHAPE = (84, 84)
    GAMMA = .99
    NUM_ITERATIONS = 20000
    TARGET_UPDATE_FREQ = 1000
    BATCH_SIZE = 32
    REPLAY_MEM_SIZE = 1000000
    REPLAY_START_SIZE = 1000
    MAX_EPISODE_LEN = 10
    REWARD_SAMPLE = 1000
    HELD_OUT_STATES_SIZE = 1000

    # returns a list of models, i.e. [Online, None] or [Online, Target] or [OnlineA, OnlineB]
    models = create_model(FRAMES_PER_STATE, INPUT_SHAPE, NUM_ACTIONS, NETWORK_TYPE)
    history = HistoryPreprocessor(FRAMES_PER_STATE - 1)
    preprocessor = Preprocessor()
    if NETWORK_TYPE != "Linear":
        memory = ReplayMemory(REPLAY_MEM_SIZE, FRAMES_PER_STATE)
    else:
        memory = None
    held_out_states = ReplayMemory(HELD_OUT_STATES_SIZE, FRAMES_PER_STATE)
    policy = LinearDecayGreedyEpsilonPolicy(1, .05, int(1e6))
    agent = DQNAgent(models[0], models[1], preprocessor, history, memory, policy, GAMMA,
                     TARGET_UPDATE_FREQ, BATCH_SIZE, REPLAY_START_SIZE, NUM_ACTIONS,
                     NETWORK_TYPE, REWARD_SAMPLE, held_out_states, HELD_OUT_STATES_SIZE)

    # compile agent
    adam = Adam(lr=0.0001)
    loss = mean_huber_loss
    agent.compile(adam, loss)
    agent.fit(env, NUM_ITERATIONS, MAX_EPISODE_LEN)

    model_json = models[0].to_json()
    with open(NETWORK_TYPE + "model.json", "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    models[0].save_weights(NETWORK_TYPE + "model.h5")
    print("Saved model to disk")
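# Hedged aside: the [online, target] model pairs used above imply standard target
# fixing -- every TARGET_UPDATE_FREQ steps the target network is overwritten with
# the online network's weights. With Keras models that sync is a one-liner; the
# helper below is illustrative only and is not part of any repository above.
def sketch_maybe_update_target(step, online_model, target_model, target_update_freq=10000):
    if target_model is not None and step % target_update_freq == 0:
        target_model.set_weights(online_model.get_weights())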
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Enduro-v0', help='Atari env name')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--model_type', default='dqn', help='Model type: linear, dqn, double_linear, double_dqn')
    parser.add_argument('--mode', default='train', help='Mode: train for training, test for testing')
    parser.add_argument('--memory_size', default=200000, type=int, help='Replay memory size')
    parser.add_argument('--save_every', default=50000, type=int, help='Frequency for saving weights')
    parser.add_argument('--max_ep_length', default=50000, type=int, help='Maximum episode length during training')
    parser.add_argument('--use_target_fixing', action='store_true', help='Use target fixing')
    parser.add_argument('--use_replay_memory', action='store_true', help='Use replay memory')
    args = parser.parse_args()

    # Loading the appropriate environment.
    env = gym.make('Enduro-v0')
    window = 4
    input_shape = (84, 84)
    num_actions = env.action_space.n

    # Limit GPU use
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Set mode
    mode = args.mode

    # Set model variables
    # Model type to train.
    model_type = args.model_type

    # Initialize the Preprocessor, Memory, and policy for training.
    preproc = Preprocessor()
    memory = ReplayMemory(args.memory_size)
    policy = LinearDecayGreedyEpsilonPolicy(1, 0.1, 1000000, num_actions)  # decay epsilon from 1 to 0.1 over 1 million steps

    # Setting experimental parameters - details of choices specified in the write-up.
    gamma = 0.99
    target_update_freq = 10000
    num_burn_in = 1000
    train_freq = 0  # not using this parameter
    batch_size = 32
    target_fix_flag = args.use_target_fixing
    replay_mem_flag = args.use_replay_memory
    save_every = args.save_every
    print(sess)

    # Create a DQN agent with the specified parameters.
    dqn = DQNAgent(sess, window, input_shape, num_actions, model_type, preproc, memory,
                   policy, gamma, target_fix_flag, target_update_freq, replay_mem_flag,
                   num_burn_in, train_freq, batch_size, save_every)

    # Train the model on 3-5 million frames, with the given maximum episode length.
    if mode == 'train':
        dqn.fit(env, 5000000, args.max_ep_length)
    elif mode == 'test':
        # Load the model for testing.
        model_file = 'saved_models_dqn/model_100000.ckpt'
        dqn.restore_model(model_file)
        # Evaluate the model.
        dqn.evaluate(env, 20, 5000, 'test', lambda x: True, False, True)
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o', '--output', default='atari-v0', help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('-ni', '--num_iterations', default=10, type=int, help='Num of iterations for training')
    parser.add_argument('-m', '--max_episode_length', default=60, type=int, help='Max episode length of a sequence')
    parser.add_argument('-ne', '--num_episodes', default=10, type=int, help='Num of episodes for evaluating')
    parser.add_argument('-r', '--replay_memory', default=10, type=int, help='The size of replay memory')
    parser.add_argument('-gamma', '--discount_factor', default=0.99, type=float, help='Discount factor of MDP')
    parser.add_argument('-ge', '--Greedy_epsilon', default=0.95, type=float, help='The probability to choose a greedy action')
    args = parser.parse_args()
    # args.input_shape = tuple(args.input_shape)
    args.output = get_output_folder(args.output, args.env)

    # the dirs to store results
    os.makedirs(args.output)
    os.chdir(args.output)

    # here is where you should start up a session,
    # create your DQN agent, create your model, etc.
    # then you can run your fit method.
    env = gym.make('Breakout-v0')
    env.reset()

    # Preprocess image
    preprocess_network = preprocessors.PreprocessorSequence('network')
    preprocess_memory = preprocessors.PreprocessorSequence('memory')

    # Policy choice
    Greedy = policy.GreedyEpsilonPolicy(0.95)
    DG = policy.LinearDecayGreedyEpsilonPolicy('attr_name', 1, 0.1, 1000000)

    # Create model from Atari paper
    model = create_model(window=4, input_shape=(84, 84), num_actions=4)

    # load weights
    location = '/'

    # Define tensorboard
    tensorboard = keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=0,
                                              write_graph=True, write_images=True)

    # Optimizer
    optimizer = Adam(lr=0.00025)

    # Create memory
    memory = core.ReplayMemory(max_size=args.replay_memory, phi_length=4,
                               window_height=84, window_length=84,
                               rng=np.random.RandomState(100))

    agent = DQNAgent(q_network=model, target=model,
                     preprocessor={'network': preprocess_network, 'memory': preprocess_memory},
                     memory=memory, policy={'Greedy': Greedy, 'DG': DG},
                     gamma=args.discount_factor, target_update_freq=100000,
                     num_burn_in=args.replay_memory, train_freq=4, batch_size=32,
                     callbacks=tensorboard)
    agent.compile(optimizer=optimizer, loss_func=objectives.mean_huber_loss)
    agent.init_memory(env=env, max_episode_length=30)
    agent.fit(env=env, num_iterations=args.num_iterations, max_episode_length=args.max_episode_length)
    agent.evaluate(env=env, num_episodes=args.num_episodes, max_episode_length=args.max_episode_length)

    # store the hyperparameters
    file_abs = "./hyperparameters"
    with open(file_abs, "w") as f:
        f.write("Num of iterations:")
        f.write(str(args.num_iterations) + '\n')
        f.write("Max episode length:")
        f.write(str(args.max_episode_length) + '\n')
        f.write("Num of episodes:")
        f.write(str(args.num_episodes) + '\n')
        f.write("Replay memory:")
        f.write(str(args.replay_memory) + '\n')
        f.write("Discount factor:")
        f.write(str(args.discount_factor) + '\n')
def main():  # noqa: D103
    parser = argparse.ArgumentParser(
        description='Run DQN on given game environment')
    parser.add_argument('--env', default='SpaceInvaders-v0',
                        help='Atari env name')
    parser.add_argument('-o', '--output', default='train',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--gamma', default=0.99, type=float,
                        help='Discount factor')
    parser.add_argument('--target_update_freq', default=10000, type=int,
                        help='Interval between two updates of the target network')
    parser.add_argument('--num_burn_in', default=10, type=int,
                        help='Number of samples to fill into the replay memory '
                             'before updating the network')
    parser.add_argument('--train_freq', default=1, type=int,
                        help='How often to update the Q-network')
    parser.add_argument('--batch_size', default=32, type=int,
                        help='Batch size')
    parser.add_argument('--num_iterations', default=50000, type=int,
                        help='Number of iterations to run for the training')
    parser.add_argument('--max_episode_length', default=10000, type=int,
                        help='Max length of one episode')
    parser.add_argument('--lr', default=0.0001, type=float,
                        help='Learning rate')
    parser.add_argument('--epsilon', default=0.05, type=float,
                        help='Epsilon for exploration')
    parser.add_argument('--experiment_id', default=None, type=int,
                        help='Index of experiment to reload checkpoint')
    parser.add_argument('--save_freq', default=10000, type=int,
                        help='Checkpoint saving frequency')
    parser.add_argument('--evaluate_freq', default=10000, type=int,
                        help='Frequency to do evaluation and record video by wrapper')
    parser.add_argument('--test_num_episodes', default=20, type=int,
                        help='Number of episodes to play at each evaluation')

    args = parser.parse_args()
    if not args.experiment_id:
        args.output = get_output_folder(args.output, args.env)
    else:
        args.output = os.path.join(args.output, args.env) + '-run{}'.format(
            args.experiment_id)

    game_env = gym.make(args.env)
    num_actions = game_env.action_space.n
    input_shape = (84, 84)

    # TODO: set up logger
    # writer = tf.summary.FileWriter()

    # set up model
    model = create_model(window=4, input_shape=input_shape,
                         num_actions=num_actions, model_name='linear_model')

    # set up optimizer
    # optimizer = Adam(lr=args.lr)
    optimizer = tf.train.AdamOptimizer(learning_rate=args.lr)

    # set up preprocessor
    atari_preprocessor = AtariPreprocessor(input_shape)
    history_preprocessor = HistoryPreprocessor(history_length=3)
    preprocessor = PreprocessorSequence(
        [atari_preprocessor, history_preprocessor])

    # set up policy
    policy = GreedyEpsilonPolicy(epsilon=args.epsilon, num_actions=num_actions)

    # set up DQN agent
    agent = DQNAgent(q_network=model,
                     preprocessor=preprocessor,
                     memory=None,
                     policy=policy,
                     gamma=args.gamma,
                     target_update_freq=args.target_update_freq,
                     num_burn_in=args.num_burn_in,
                     train_freq=args.train_freq,
                     batch_size=args.batch_size,
                     logdir=args.output,
                     save_freq=args.save_freq,
                     evaluate_freq=args.evaluate_freq,
                     test_num_episodes=args.test_num_episodes)
    agent.compile(optimizer=optimizer, loss_func=mean_huber_loss)
    agent.fit(env=game_env,
              num_iterations=args.num_iterations,
              max_episode_length=args.max_episode_length)
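# --- Hedged sketch, illustrative only: the AtariPreprocessor used above is assumed
# to convert a raw 210x160x3 RGB Atari frame into an 84x84 grayscale image, roughly
# as in Mnih et al. (2015). The PIL-based resize and the method names below are one
# plausible choice, not the actual project implementation.
import numpy as np
from PIL import Image


class AtariPreprocessorSketch:
    def __init__(self, new_size=(84, 84)):
        self.new_size = new_size

    def process_state_for_memory(self, frame):
        """Grayscale + resize; store as uint8 to keep the replay memory small."""
        img = Image.fromarray(frame).convert('L').resize(self.new_size)
        return np.asarray(img, dtype=np.uint8)

    def process_state_for_network(self, frame):
        """Same transform, but scaled to [0, 1] floats for the Q-network input."""
        return self.process_state_for_memory(frame).astype(np.float32) / 255.0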
def main(args):
    # gpu id
    # gpu_id = args.gpu
    # os.environ['CUDA_VISIBLE_DEVICES'] = '%d' % gpu_id

    # make env
    env = gym.make(args.env)
    if args.mode == 'test' and args.submit:
        monitor_log = os.path.join(args.output, 'monitor.log')
        env = wrappers.Monitor(env, monitor_log, force=True)

    # build model
    # actions 0-5: 0 do nothing, 1 fire, 2 right, 3 left, 4 right+fire, 5 left+fire
    num_actions = env.action_space.n
    mem_size = 1000000
    window = 4
    input_shape = (84, 84)
    if args.type in ['DQN', 'double-DQN']:
        model = create_model(window, input_shape, num_actions, args.init)
        target = create_model(window, input_shape, num_actions, args.init)
    elif args.type in ['linear', 'linear-simple', 'double-Q']:
        model = create_model_linear(window, input_shape, num_actions, args.init)
        target = create_model_linear(window, input_shape, num_actions, args.init)
    elif args.type == 'duel':
        model = create_model_duel(window, input_shape, num_actions, args.init)
        target = create_model_duel(window, input_shape, num_actions, args.init)

    # memory = ReplayMemory(1000000, 100)  # window length is arbitrary
    # target_update_freq = 10000
    # num_burn_in = 50000
    target_update_freq = 10000
    num_burn_in = 50000
    train_freq = 4
    batch_size = 32
    gamma = 0.99
    epsilon = 0.05
    updates_per_epoch = 50000
    num_iterations = 50000000
    eval_episodes = 100
    max_episode_length = 10000

    # simple: no experience replay and no target fixing
    # if args.type == 'linear-simple':
    #     mem_size = 5
    #     target_update_freq = 1
    #     num_burn_in = 0
    #     batch_size = 1
    if args.type == 'linear-simple':
        num_burn_in = 0

    memory = ReplayMemoryEfficient(mem_size, window, input_shape)

    # with tf.device('/gpu:%d' % gpu_id):
    config = tf.ConfigProto(intra_op_parallelism_threads=8)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # preprocessor
    preprocessor = PreprocessorSequence()

    # policy
    policy = LinearDecayGreedyEpsilonPolicy(1, 0.1, 1000000)
    policy_eval = GreedyEpsilonPolicy(epsilon)

    # build agent
    dqn_agent = DQNAgent(sess, env, args.type, model, target, preprocessor,
                         memory, policy, policy_eval, gamma,
                         target_update_freq, num_burn_in, train_freq,
                         batch_size, num_actions, updates_per_epoch,
                         args.output)

    if args.mode == 'train':
        # compile net and train with fit
        # rmsprop = RMSprop(lr=0.00025, rho=0.95, epsilon=0.01)
        # dqn_agent.compile_networks(rmsprop, mean_huber_loss)
        # adam = Adam(lr=0.00025, beta_1=0.95, beta_2=0.95, epsilon=0.1)
        adam = Adam(lr=0.0001)
        dqn_agent.compile_networks(adam, mean_huber_loss)
        if args.type == 'linear-simple':
            dqn_agent.fit_simple(num_iterations, max_episode_length)
        else:
            dqn_agent.fit(num_iterations, max_episode_length)
    elif args.mode == 'test':
        # load net and evaluate
        model_path = os.path.join(args.output, 'model_epoch%03d' % args.epoch)
        dqn_agent.load_networks(model_path)
        if args.submit:
            eval_episodes = 1
        dqn_agent.play(eval_episodes, max_episode_length)
        # if args.submit:
        #     gym.upload(monitor_log, api_key='sk_wa5MgeDTnOQ209qBCP7jQ')
        # else:
        #     log_file = open(os.path.join(args.output, 'evaluation.txt'), 'a+')
        #     log_file.write('%d %f %f %f %f\n' % (args.epoch,
        #                                          np.mean(lengths),
        #                                          np.std(lengths),
        #                                          np.mean(rewards),
        #                                          np.std(rewards)))
        #     log_file.close()

    env.close()
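# --- Hedged sketch, an assumption rather than the ReplayMemoryEfficient class used
# above: that class presumably stores uint8 frames in a fixed-size ring buffer and
# samples stacked windows. The simpler version below stores whole transitions and
# samples them uniformly, just to illustrate the uniform-replay idea; every name
# here is hypothetical.
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition',
                        ['state', 'action', 'reward', 'next_state', 'done'])


class ReplayMemorySketch:
    def __init__(self, max_size):
        # deque with maxlen evicts the oldest transition automatically.
        self.buffer = deque(maxlen=max_size)

    def append(self, state, action, reward, next_state, done):
        self.buffer.append(Transition(state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniform sampling without replacement from the stored transitions.
        return random.sample(self.buffer, batch_size)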