def main():
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Enduro-v0', help='Atari env name')
    parser.add_argument('-o', '--output', default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    args = parser.parse_args()

    # Set up parameters, create the model and the agent, then run the fit method.
    env = gym.make(args.env)
    num_actions = env.action_space.n
    replay = True        # apply replay memory or not
    double = False       # apply double network or not
    model_type = 'deep'

    model1 = create_model((105, 80), num_actions, model_type=model_type)
    model2 = create_model((105, 80), num_actions, model_type=model_type)
    preprocessor = core.Preprocessor()

    if replay:
        mem_size = 40000  # 1000000 in the paper
        window = 4
        mem = core.ReplayMemory(mem_size, window)
    else:
        mem = list()

    p = policy.LinearDecayGreedyEpsilonPolicy(1, 0.01, 500, num_actions)
    agent = Agent(model_type=model_type,
                  q_network=model1,
                  preprocessor=preprocessor,
                  memory=mem,
                  policy=p,
                  gamma=0.99,
                  target_update_freq=1000,
                  num_burn_in=4000,
                  train_freq=10,
                  batch_size=32,
                  replay=replay,
                  target_network=model2,
                  double=double)
    agent.compile()
    agent.fit(env, num_iterations=400)
    agent.q_network.save_weights('last-weights')
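
# A minimal sketch of what create_model() above might build, assuming a Keras
# backend and the 4-frame window stacked on the channel axis; the function
# name, layer sizes, and strides here are illustrative guesses, not the
# repository's actual implementation.
from keras.models import Sequential
from keras.layers import Conv2D, Dense, Flatten

def create_model_sketch(input_shape, num_actions, model_type='deep', window=4):
    """Map a stack of `window` preprocessed frames to one Q-value per action."""
    model = Sequential()
    if model_type == 'linear':
        # Linear Q-network: a single dense layer over the flattened frames.
        model.add(Flatten(input_shape=input_shape + (window,)))
        model.add(Dense(num_actions, activation='linear'))
    else:
        # 'deep': a DQN-style convolutional stack.
        model.add(Conv2D(32, (8, 8), strides=4, activation='relu',
                         input_shape=input_shape + (window,)))
        model.add(Conv2D(64, (4, 4), strides=2, activation='relu'))
        model.add(Conv2D(64, (3, 3), strides=1, activation='relu'))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dense(num_actions, activation='linear'))
    return model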
alpha = 0.001                # learning rate
num_iteration = 300000

# ATTENTION TO THE FOLLOWING VARIABLE:
Max_TimeStep = 10000         # this is T described in reference 2
# Keep Max_TimeStep small so that I can debug on my PC;
# change it to a big value when uploading to the supercomputer.
num_update_target = 10000    # update the target after this number of iterations
# Again, keep it small first to debug locally and make it big when submitting!
update_counter = 0           # used for debugging only, will not affect the algorithm
batch_size = 32

env = gym.make('SpaceInvaders-v0')
# If we use SpaceInvaders as suggested by the paper, we should use 3,
# while all the other games require that we use 4.
num_frame_skip = 3
output_num = env.action_space.n
LinearPolicy = policy.LinearDecayGreedyEpsilonPolicy(output_num, 1, 0.05, 100000)
experience = Experience()

# ============ BUILD NETWORK ===========================
# Define the session to run:
sess = tf.Session()

# Define placeholders for state, action, reward, next state, terminal.
index1 = tf.placeholder(tf.int32, shape=[None, 2], name='index1')
index2 = tf.placeholder(tf.int32, shape=[None, 2], name='index2')
action = tf.placeholder(tf.int32, shape=[None, 2], name='action')
terminal = tf.placeholder(tf.float32, shape=[None, 1], name='terminal')
r = tf.placeholder(tf.float32, shape=[None, 1], name='r')
x = tf.placeholder(tf.float32, shape=[None, num_frame_skip * 84 * 84], name='x')
next_x = tf.placeholder(tf.float32, shape=[None, num_frame_skip * 84 * 84],
                        name='next_x')

# Weights for the Q1 linear Q network
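
# A hedged sketch (not the original graph) of one way the linear Q1/Q2
# networks and their TD loss could be wired from the placeholders above:
# a single weight matrix per network, tf.gather_nd over the [row, action]
# index pairs to pick Q(s, a), and the usual target masked by the terminal
# flag. Variable names and the 0.99 discount are assumptions.
W1 = tf.Variable(tf.truncated_normal([num_frame_skip * 84 * 84, output_num],
                                     stddev=0.01), name='W1')
b1 = tf.Variable(tf.zeros([output_num]), name='b1')
W2 = tf.Variable(tf.truncated_normal([num_frame_skip * 84 * 84, output_num],
                                     stddev=0.01), name='W2')  # target network
b2 = tf.Variable(tf.zeros([output_num]), name='b2')

q_online = tf.matmul(x, W1) + b1         # Q1(s, .) for the current states
q_target = tf.matmul(next_x, W2) + b2    # Q2(s', .) from the target network

q_sa = tf.gather_nd(q_online, action)                        # Q1(s, a), shape [batch]
max_next_q = tf.expand_dims(tf.reduce_max(q_target, axis=1), 1)
y = r + 0.99 * (1.0 - terminal) * max_next_q                 # TD target
loss = tf.reduce_mean(tf.square(tf.stop_gradient(y) - tf.expand_dims(q_sa, 1)))
train_step = tf.train.RMSPropOptimizer(alpha).minimize(loss, var_list=[W1, b1])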
rewardOneEpisode = 0         # sum of the reward obtained in one episode

# ATTENTION TO THE FOLLOWING VARIABLE:
Max_TimeStep = 10000         # this is T described in reference 2
# Keep Max_TimeStep small so that I can debug on my PC;
# change it to a big value when uploading to the supercomputer!!!
num_update_target = 10000    # update the target after this number of iterations
# Again, keep it small first to debug locally and make it big when submitting!
update_counter = 0           # used for debugging only, will not affect the algorithm
batch_size = 32

env = gym.make('SpaceInvaders-v0')
# If we use SpaceInvaders as suggested by the paper, we should use 3,
# while all the other games require that we use 4.
num_frame_skip = 3

# ========== Neural net parameters are here =================
output_num_final = env.action_space.n
LinearPolicy = policy.LinearDecayGreedyEpsilonPolicy(output_num_final, 1, 0.05, 200000)
out_dim1 = 32                # number of filters for convolutional layer 1
out_dim2 = 64                # number of filters for convolutional layer 2
out_dim3 = 64                # number of filters for convolutional layer 3
experience = Experience()

# ============ BUILD NETWORK ===========================
# The data format used here is "NCHW"!!!
# Define the session to run:
sess = tf.Session()

# Define placeholders for state, action, reward, next state, terminal.
action = tf.placeholder(tf.int32, shape=[None, 2], name='action')
terminal = tf.placeholder(tf.float32, shape=[None, 1], name='terminal')
r = tf.placeholder(tf.float32, shape=[None, 1], name='r')
x = tf.placeholder(tf.float32,
                   shape=[None, num_frame_skip, 84, 84],  # shape assumed for NCHW input
                   name='x')
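
# A hedged sketch (not the original graph) of how the three convolutional
# layers could be built from out_dim1/2/3 in the "NCHW" layout via tf.layers;
# the kernel sizes, strides, and the 512-unit hidden layer follow the Nature
# DQN architecture and are assumptions here.
net = tf.layers.conv2d(x, out_dim1, 8, strides=4, activation=tf.nn.relu,
                       data_format='channels_first', name='conv1')
net = tf.layers.conv2d(net, out_dim2, 4, strides=2, activation=tf.nn.relu,
                       data_format='channels_first', name='conv2')
net = tf.layers.conv2d(net, out_dim3, 3, strides=1, activation=tf.nn.relu,
                       data_format='channels_first', name='conv3')
net = tf.layers.flatten(net)
net = tf.layers.dense(net, 512, activation=tf.nn.relu, name='fc1')
q_values = tf.layers.dense(net, output_num_final, name='q_values')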
def main():
    parser = argparse.ArgumentParser(description='Run Atari')
    parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name')
    parser.add_argument('-o', '--output', default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--gpu', default='0', type=str, help='GPU to use.')
    parser.add_argument('--optimizer', default='rmsprop', type=str,
                        help='optimizer (rmsprop/adam).')
    parser.add_argument('--learning_rate', default=0.00025, type=float,
                        help='Learning rate.')
    parser.add_argument('--model', default='convnet', type=str,
                        help='Type of model to use.')
    parser.add_argument('--max_iters', default=100000000, type=int,
                        help='Max num of iterations to run for.')
    parser.add_argument('--checkpoint', default='', type=str,
                        help='Checkpoint to load from.')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render what we got, or train?')
    parser.add_argument('--render_path', type=str, default='/dev/null/',
                        help='Path to store the render in.')
    parser.add_argument('--input_image', action='store_true', default=False,
                        help='Standardize (-1,1) the image or not.')
    parser.add_argument('--part_gpu', action='store_true', default=True,
                        help='Use part of GPU.')
    parser.add_argument('--train_policy', type=str, default='anneal',
                        help='anneal/epgreedy')
    parser.add_argument('--exp_id', type=int, default=None,
                        help='For assoc between scripts and results, '
                             'give script number.')
    parser.add_argument('--target_update_freq', type=int, default=10000,
                        help='Sync the target and live networks.')
    parser.add_argument('--train_freq', type=int, default=4,
                        help='Number of iters to push into replay before training.')
    parser.add_argument('--mem_size', type=int, default=100000,
                        help='Size of replay memory, 1M is too large.')
    parser.add_argument('--learning_type', type=str, default='normal',
                        help='Set normal, or double for DDQN.')
    parser.add_argument('--final_eval', action='store_true', default=False,
                        help='Perform the final 100 episode evaluation.')
    args = parser.parse_args()

    args.output, experiment_id = get_output_folder(args.output, args.env, args.exp_id)
    np.random.seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    if args.part_gpu:
        from keras.backend.tensorflow_backend import set_session
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        set_session(tf.Session(config=config))

    input_size = 64
    frame_history = 4
    batch_size = 32
    input_shape = (input_size, input_size, frame_history)
    mem_size = args.mem_size
    gamma = 0.99
    target_update_freq = args.target_update_freq
    num_burn_in = 20000
    train_freq = args.train_freq

    summary_writer = None
    if not args.render and not args.final_eval:
        summary_writer = tf.summary.FileWriter(logdir=args.output)

    env = gym.make(args.env)
    env.seed(args.seed)
    env.reset()

    model = networks.get_model(args.model, input_shape, env.action_space.n)
    target_model = networks.get_model(args.model, input_shape, env.action_space.n)
    preproc = preprocessor.SequentialPreprocessor(
        [preprocessor.AtariPreprocessor(input_size, args.input_image),
         preprocessor.PreservePreprocessor(frame_history)])

    if args.train_policy == 'anneal':
        pol = policy.LinearDecayGreedyEpsilonPolicy(1, 0.1, 1000000)
    elif args.train_policy == 'epgreedy':
        pol = policy.GreedyEpsilonPolicy(0.1)
    else:
        raise ValueError()

    if args.optimizer == 'rmsprop':
        optimizer = RMSprop(args.learning_rate, rho=0.95, epsilon=0.01)
    elif args.optimizer == 'adam':
        optimizer = Adam(args.learning_rate)

    mem = memory.BasicReplayMemory(mem_size)
    learning_type = dqn_utils.CASE_NORMAL
    if args.learning_type == 'double':
        learning_type = dqn_utils.CASE_DOUBLE

    D = dqn_utils.DQNAgent(
        model, target_model, preproc, mem, pol, gamma, target_update_freq,
        num_burn_in, train_freq, batch_size,
        optimizer=optimizer,
        loss_func='mse',
        summary_writer=summary_writer,
        checkpoint_dir=args.output,
        experiment_id=experiment_id,
        env_name=args.env,
        learning_type=learning_type)

    if args.checkpoint:
        D.load(args.checkpoint)

    if args.final_eval:
        D.evaluate(env, 100, 0, max_episode_length=1e4, final_eval=True)
    elif args.render:
        # testing_env = copy.deepcopy(env)
        D.evaluate(env, 1, 0, max_episode_length=1e4,
                   render=True, render_path=args.render_path)
    else:
        D.fit(env, args.max_iters)
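
# A hedged sketch, not part of dqn_utils: it only illustrates the target
# computation that the CASE_NORMAL / CASE_DOUBLE switch above selects between.
# Vanilla DQN bootstraps with max_a' Q_target(s', a'); double DQN picks a'
# with the online network and evaluates it with the target network.
import numpy as np

def td_targets_sketch(model, target_model, rewards, next_states, terminals,
                      gamma=0.99, double=False):
    """Return TD targets for a batch; inputs are NumPy arrays, models are Keras."""
    q_next_target = target_model.predict(next_states)       # Q_target(s', .)
    if double:
        q_next_online = model.predict(next_states)           # Q_online(s', .)
        best_actions = np.argmax(q_next_online, axis=1)
        bootstrap = q_next_target[np.arange(len(best_actions)), best_actions]
    else:
        bootstrap = np.max(q_next_target, axis=1)
    return rewards + gamma * (1.0 - terminals) * bootstrap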