def main():
    date = datetime.now().strftime('%Y%m%d%H%M%S')

    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--load', type=str)
    parser.add_argument('--logdir', type=str, default=date)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    args = parser.parse_args()

    outdir = os.path.join(os.path.dirname(__file__), 'results/' + args.logdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    logdir = os.path.join(os.path.dirname(__file__), 'logs/' + args.logdir)

    env_name = args.env
    tmp_env = gym.make(env_name)
    is_atari = len(tmp_env.observation_space.shape) != 1
    if not is_atari:
        observation_space = tmp_env.observation_space
        constants = box_constants
        if isinstance(tmp_env.action_space, gym.spaces.Box):
            num_actions = tmp_env.action_space.shape[0]
        else:
            num_actions = tmp_env.action_space.n
        state_shape = [observation_space.shape[0], constants.STATE_WINDOW]
        state_preprocess = lambda s: s
        reward_preprocess = lambda r: r / 10.0
        # (window_size, dim) -> (dim, window_size)
        phi = lambda s: np.transpose(s, [1, 0])
    else:
        constants = atari_constants
        num_actions = tmp_env.action_space.n
        state_shape = constants.STATE_SHAPE + [constants.STATE_WINDOW]

        def state_preprocess(state):
            state = atari_preprocess(state, constants.STATE_SHAPE)
            state = np.array(state, dtype=np.float32)
            return state / 255.0

        reward_preprocess = lambda r: np.clip(r, -1.0, 1.0)
        # (window_size, H, W) -> (H, W, window_size); see the transpose sketch after this function
        phi = lambda s: np.transpose(s, [1, 2, 0])

    # flag of continuous action space
    continuous = isinstance(tmp_env.action_space, gym.spaces.Box)
    upper_bound = tmp_env.action_space.high if continuous else None

    # save settings
    dump_constants(constants, os.path.join(outdir, 'constants.json'))

    sess = tf.Session()
    sess.__enter__()

    model = make_network(
        constants.CONVS, constants.FCS, use_lstm=constants.LSTM,
        padding=constants.PADDING, continuous=continuous)

    # learning rate with decay operation
    if constants.LR_DECAY == 'linear':
        lr = LinearScheduler(constants.LR, constants.FINAL_STEP, 'lr')
        epsilon = LinearScheduler(
            constants.EPSILON, constants.FINAL_STEP, 'epsilon')
    else:
        lr = ConstantScheduler(constants.LR, 'lr')
        epsilon = ConstantScheduler(constants.EPSILON, 'epsilon')

    agent = Agent(
        model,
        num_actions,
        nenvs=constants.ACTORS,
        lr=lr,
        epsilon=epsilon,
        gamma=constants.GAMMA,
        lam=constants.LAM,
        lstm_unit=constants.LSTM_UNIT,
        value_factor=constants.VALUE_FACTOR,
        entropy_factor=constants.ENTROPY_FACTOR,
        time_horizon=constants.TIME_HORIZON,
        batch_size=constants.BATCH_SIZE,
        grad_clip=constants.GRAD_CLIP,
        state_shape=state_shape,
        epoch=constants.EPOCH,
        phi=phi,
        use_lstm=constants.LSTM,
        continuous=continuous,
        upper_bound=upper_bound
    )

    saver = tf.train.Saver()
    if args.load:
        saver.restore(sess, args.load)

    # create environments
    envs = []
    for i in range(constants.ACTORS):
        env = gym.make(args.env)
        env.seed(constants.RANDOM_SEED)
        if is_atari:
            env = NoopResetEnv(env, noop_max=30)
            env = MaxAndSkipEnv(env)
            env = EpisodicLifeEnv(env)
        wrapped_env = EnvWrapper(
            env,
            r_preprocess=reward_preprocess,
            s_preprocess=state_preprocess
        )
        envs.append(wrapped_env)
    batch_env = BatchEnvWrapper(envs)

    sess.run(tf.global_variables_initializer())

    summary_writer = tf.summary.FileWriter(logdir, sess.graph)
    logger = TfBoardLogger(summary_writer)
    logger.register('reward', dtype=tf.float32)
    end_episode = lambda r, s, e: logger.plot('reward', r, s)

    def after_action(state, reward, global_step, local_step):
        if global_step % 10 ** 6 == 0:
            path = os.path.join(outdir, 'model.ckpt')
            saver.save(sess, path, global_step=global_step)

    trainer = BatchTrainer(
        env=batch_env,
        agent=agent,
        render=args.render,
        state_shape=state_shape[:-1],
        state_window=constants.STATE_WINDOW,
        final_step=constants.FINAL_STEP,
        after_action=after_action,
        end_episode=end_episode,
        training=not args.demo
    )
    trainer.start()
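
# --- Illustrative aside (standalone sketch, not part of the script above) ---
# The `phi` used above only moves the frame-stack axis to the channel position
# that the convolutional layers expect. The shapes below are hypothetical
# placeholders, not the repo's constants.
import numpy as np

window, height, width = 4, 84, 84
stacked = np.zeros((window, height, width), dtype=np.float32)  # (window_size, H, W)
net_input = np.transpose(stacked, [1, 2, 0])                   # -> (H, W, window_size)
assert net_input.shape == (height, width, window)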
def main():
    date = datetime.now().strftime('%Y%m%d%H%M%S')

    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='PongDeterministic-v4')
    parser.add_argument('--threads', type=int, default=8)
    parser.add_argument('--load', type=str)
    parser.add_argument('--logdir', type=str, default=date)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    parser.add_argument('--record', action='store_true')
    args = parser.parse_args()

    outdir = os.path.join(os.path.dirname(__file__), 'results/' + args.logdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    logdir = os.path.join(os.path.dirname(__file__), 'logs/' + args.logdir)

    env_name = args.env
    tmp_env = gym.make(env_name)
    is_atari = len(tmp_env.observation_space.shape) != 1
    # box environment
    if not is_atari:
        observation_space = tmp_env.observation_space
        constants = box_constants
        actions = range(tmp_env.action_space.n)
        state_shape = [observation_space.shape[0], constants.STATE_WINDOW]
        state_preprocess = lambda s: s
        # (window_size, dim) -> (dim, window_size)
        phi = lambda s: np.transpose(s, [1, 0])
    # atari environment
    else:
        constants = atari_constants
        actions = get_action_space(env_name)
        state_shape = constants.STATE_SHAPE + [constants.STATE_WINDOW]

        def state_preprocess(state):
            # atari specific preprocessing
            state = atari_preprocess(state, constants.STATE_SHAPE)
            state = np.array(state, dtype=np.float32)
            return state / 255.0

        # (window_size, H, W) -> (H, W, window_size)
        phi = lambda s: np.transpose(s, [1, 2, 0])

    # save settings
    dump_constants(constants, os.path.join(outdir, 'constants.json'))

    sess = tf.Session()
    sess.__enter__()

    model = make_network(
        constants.CONVS, constants.FCS,
        lstm=constants.LSTM, padding=constants.PADDING)

    # share a single optimizer with all worker threads!
    lr = tf.Variable(constants.LR)
    decayed_lr = tf.placeholder(tf.float32)
    decay_lr_op = lr.assign(decayed_lr)
    if constants.OPTIMIZER == 'rmsprop':
        optimizer = tf.train.RMSPropOptimizer(lr, decay=0.99, epsilon=0.1)
    else:
        optimizer = tf.train.AdamOptimizer(lr)

    master = make_agent(
        model, actions, optimizer, state_shape, phi, 'global', constants)

    global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
    saver = tf.train.Saver(global_vars)
    if args.load:
        saver.restore(sess, args.load)

    agents = []
    envs = []
    for i in range(args.threads):
        name = 'worker{}'.format(i)
        agent = make_agent(
            model, actions, optimizer, state_shape, phi, name, constants)
        agents.append(agent)
        env = gym.make(args.env)
        env.seed(constants.RANDOM_SEED)
        if is_atari:
            env = NoopResetEnv(env)
            env = EpisodicLifeEnv(env)
        wrapped_env = EnvWrapper(
            env,
            r_preprocess=lambda r: np.clip(r, -1, 1),
            s_preprocess=state_preprocess
        )
        envs.append(wrapped_env)

    sess.run(tf.global_variables_initializer())

    summary_writer = tf.summary.FileWriter(logdir, sess.graph)
    tflogger = TfBoardLogger(summary_writer)
    tflogger.register('reward', dtype=tf.float32)
    tflogger.register('eval_reward', dtype=tf.float32)
    end_episode = lambda r, gs, s, ge, e: tflogger.plot('reward', r, gs)

    def after_action(state, reward, shared_step, global_step, local_step):
        # linearly decay the shared learning rate; see the decay sketch after this function
        if constants.LR_DECAY == 'linear':
            decay = 1.0 - (float(shared_step) / constants.FINAL_STEP)
            if decay < 0.0:
                decay = 0.0
            sess.run(decay_lr_op, feed_dict={decayed_lr: constants.LR * decay})
        if shared_step % 10 ** 6 == 0:
            path = os.path.join(outdir, 'model.ckpt')
            saver.save(sess, path, global_step=shared_step)

    trainer = AsyncTrainer(
        envs=envs,
        agents=agents,
        render=args.render,
        state_shape=state_shape[:-1],
        state_window=constants.STATE_WINDOW,
        final_step=constants.FINAL_STEP,
        after_action=after_action,
        end_episode=end_episode,
        training=not args.demo,
        n_threads=args.threads
    )
    trainer.start()
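
# --- Illustrative aside (standalone sketch, not part of the script above) ---
# The linear learning-rate decay applied in after_action above, pulled out as a
# pure function for clarity. The base LR and final step below are made-up
# placeholders, not the repo's constants.
def linear_decayed_lr(base_lr, shared_step, final_step):
    decay = max(0.0, 1.0 - float(shared_step) / final_step)
    return base_lr * decay

assert linear_decayed_lr(7e-4, 0, 10 ** 7) == 7e-4           # full LR at the start
assert linear_decayed_lr(7e-4, 10 ** 7, 10 ** 7) == 0.0      # decayed to zero at the end
assert linear_decayed_lr(7e-4, 2 * 10 ** 7, 10 ** 7) == 0.0  # clamped, never negative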
def main():
    date = datetime.now().strftime('%Y%m%d%H%M%S')

    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--load', type=str)  # path of a checkpoint to load
    parser.add_argument('--logdir', type=str, default=date)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')  # run without training
    args = parser.parse_args()

    outdir = os.path.join(os.path.dirname(__file__), 'results/' + args.logdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    logdir = os.path.join(os.path.dirname(__file__), 'logs/' + args.logdir)

    env_name = args.env
    tmp_env = gym.make(env_name)
    is_atari = len(tmp_env.observation_space.shape) != 1
    if not is_atari:
        observation_space = tmp_env.observation_space
        constants = box_constants
        if isinstance(tmp_env.action_space, gym.spaces.Box):
            # continuous action space: num_actions is the action dimension
            num_actions = tmp_env.action_space.shape[0]
        else:
            # discrete action space: num_actions is the number of selectable actions
            num_actions = tmp_env.action_space.n
        state_shape = [observation_space.shape[0], constants.STATE_WINDOW]
        state_preprocess = lambda s: s
        reward_preprocess = lambda r: r / 10.0
        # (window_size, dim) -> (dim, window_size)
        phi = lambda s: np.transpose(s, [1, 0])
    else:
        constants = atari_constants
        num_actions = tmp_env.action_space.n
        state_shape = constants.STATE_SHAPE + [constants.STATE_WINDOW]

        def state_preprocess(state):
            state = atari_preprocess(state, constants.STATE_SHAPE)
            state = np.array(state, dtype=np.float32)
            return state / 255.0

        reward_preprocess = lambda r: np.clip(r, -1.0, 1.0)
        # transformation function: (window_size, H, W) -> (H, W, window_size)
        phi = lambda s: np.transpose(s, [1, 2, 0])

    # flag of continuous action space ('gym.spaces.Box' means continuous)
    continuous = isinstance(tmp_env.action_space, gym.spaces.Box)
    upper_bound = tmp_env.action_space.high if continuous else None

    # save settings
    dump_constants(constants, os.path.join(outdir, 'constants.json'))

    sess = tf.Session()
    sess.__enter__()

    # make_network returns a function that builds the model:
    # an MLP for continuous action spaces, a CNN for discrete ones
    model = make_network(
        constants.CONVS, constants.FCS, use_lstm=constants.LSTM,
        padding=constants.PADDING, continuous=continuous)

    # learning rate with decay operation
    if constants.LR_DECAY == 'linear':
        lr = LinearScheduler(constants.LR, constants.FINAL_STEP, 'lr')
        epsilon = LinearScheduler(
            constants.EPSILON, constants.FINAL_STEP, 'epsilon')
    else:
        lr = ConstantScheduler(constants.LR, 'lr')
        epsilon = ConstantScheduler(constants.EPSILON, 'epsilon')

    agent = Agent(
        model,
        num_actions,
        nenvs=constants.ACTORS,
        lr=lr,
        epsilon=epsilon,
        gamma=constants.GAMMA,
        lam=constants.LAM,
        lstm_unit=constants.LSTM_UNIT,
        value_factor=constants.VALUE_FACTOR,
        entropy_factor=constants.ENTROPY_FACTOR,
        time_horizon=constants.TIME_HORIZON,
        batch_size=constants.BATCH_SIZE,
        grad_clip=constants.GRAD_CLIP,
        state_shape=state_shape,
        epoch=constants.EPOCH,
        phi=phi,
        use_lstm=constants.LSTM,
        continuous=continuous,
        upper_bound=upper_bound
    )

    saver = tf.train.Saver(max_to_keep=5)
    if args.load:
        saver.restore(sess, args.load)
    else:
        # important: only initialize variables when no checkpoint is loaded,
        # otherwise the initializer would overwrite the restored weights
        sess.run(tf.global_variables_initializer())

    # create environments, one per actor (constants.ACTORS of them)
    envs = []
    for i in range(constants.ACTORS):
        env = gym.make(args.env)
        env.seed(constants.RANDOM_SEED)
        if is_atari:
            env = NoopResetEnv(env, noop_max=30)
            env = MaxAndSkipEnv(env)
            env = EpisodicLifeEnv(env)
        wrapped_env = EnvWrapper(
            env,
            r_preprocess=reward_preprocess,
            s_preprocess=state_preprocess
        )
        envs.append(wrapped_env)  # collect all wrapped envs
    batch_env = BatchEnvWrapper(envs)  # envs is a list

    # note: running tf.global_variables_initializer() here would override the
    # loaded checkpoint, so it was moved into the `else` branch above

    summary_writer = tf.summary.FileWriter(logdir, sess.graph)
    logger = TfBoardLogger(summary_writer)
    logger.register('reward', dtype=tf.float32)
    # record the reward of each episode
    end_episode = lambda r, s, e: logger.plot('reward', r, s)

    def after_action(state, reward, global_step, local_step):
        # after each action, check whether the model should be saved;
        # demo mode does not save the model parameters.
        # save roughly every 10 ** 5 steps: an exact `global_step % 10 ** 5 == 0`
        # check may never fire because global_step may not land exactly on a
        # multiple of 10 ** 5, so a small window is used instead
        # (see the sketch after this function)
        if global_step % 10 ** 5 <= 10 and not args.demo:
            path = os.path.join(outdir, 'model.ckpt')
            print('model saved, global step: {}'.format(global_step))
            saver.save(sess, path, global_step=global_step)

    trainer = BatchTrainer(
        env=batch_env,
        agent=agent,  # Agent instance
        render=args.render,
        state_shape=state_shape[:-1],
        state_window=constants.STATE_WINDOW,
        final_step=constants.FINAL_STEP,  # total time step limit
        # final_step=12345,
        after_action=after_action,  # callback invoked after each action
        end_episode=end_episode,
        training=not args.demo  # with --demo, run without training the policy and value networks
    )
    trainer.start()
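
# --- Illustrative aside (standalone sketch, not part of the script above) ---
# Why the windowed save check above instead of an exact `% 10 ** 5 == 0` test:
# if the step counter advances in increments larger than 1, it can jump over
# exact multiples of 10 ** 5 entirely. The increment below is a made-up value
# chosen only to illustrate the point, not the trainer's real step size.
increment = 6
steps = range(increment, 3 * 10 ** 5, increment)
assert not any(s % 10 ** 5 == 0 for s in steps)  # exact check would never fire
assert any(10 ** 5 < s < 2 * 10 ** 5 and s % 10 ** 5 <= 10 for s in steps)  # window still fires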