def main():
    FLAGS(sys.argv)
    # Choose which RL algorithm to train.
    print("env : %s" % FLAGS.env)

    # 1. Create gym environment
    env = gym.make(FLAGS.env)
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    agent = RandomAgent(env.action_space)

    episode_count = 100
    reward = 0
    done = False

    for i in range(episode_count):
        ob = env.reset()
        while True:
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            if done:
                break
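# RandomAgent is defined elsewhere in the repo. A minimal sketch, assuming it
# follows the classic gym random-agent interface used above (act() ignores the
# observation and samples a random action):
class RandomAgent(object):
    """Agent that picks a random action regardless of the observation."""

    def __init__(self, action_space):
        self.action_space = action_space

    def act(self, observation, reward, done):
        return self.action_space.sample()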
def main():
    # 1. Create gym environment
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    agent = RandomAgent(env.action_space)

    episode_count = 100
    reward = 0
    done = False

    # for i in range(episode_count):
    #     ob = env.reset()
    #     while True:
    #         action = agent.act(ob, reward, done)
    #         ob, reward, done, _ = env.step(1)
    #         if done:
    #             break

    for i in range(episode_count):
        ob = env.reset()
        while True:
            key = readchar.readkey()  # Choose an action from the keyboard
            if key not in arrow_keys.keys():
                print("Game aborted!")
                break
            action = arrow_keys[key]
            state, reward, done, info = env.step(action)
            if done:
                print("Finished with reward", reward)
                break
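# arrow_keys is defined elsewhere in the repo. A minimal sketch, assuming the
# readchar key constants; the action indices below are placeholders, since the
# real mapping depends on the action list exposed by MarioActionSpaceWrapper.
from readchar import key as keyboard

arrow_keys = {
    keyboard.UP: 2,     # hypothetical index for "jump"
    keyboard.DOWN: 5,   # hypothetical index for "crouch"
    keyboard.LEFT: 6,   # hypothetical index for "walk left"
    keyboard.RIGHT: 1,  # hypothetical index for "walk right"
}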
def main():
    MAX_BUFFER_SIZE = 100000
    MAX_EPISODES = 10000
    TRAIN_EPISODE = 100
    TARGET_UPDATE_EPS = 1000
    batch_size = 32
    n_size = 84
    discount = 0.99
    checkpoint_dir = './checkpoints'
    save_file_name = 'mario_weight.ckpt'

    # 1. Create gym environment
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    sess = tf.Session()
    targetDQN = DQN(sess, name="target")
    dqn_var_list = targetDQN.var_list

    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver(var_list=dqn_var_list)
    saver.restore(sess, os.path.join(checkpoint_dir, save_file_name))

    for eps in range(MAX_EPISODES):
        done = False
        step_count = 0
        state = env.reset()

        state_queue = deque(maxlen=4)
        state_queue.append(state)

        while not done:
            step_count += 1

            # Accumulate 4 frames before the network starts predicting
            if step_count < 4:
                action = env.action_space.sample()
                next_state, reward, done, _ = env.step(action)
                state_queue.append(next_state)
                continue

            action = np.argmax(
                targetDQN.predict(
                    np.reshape(np.array(state_queue), [1, n_size, n_size, 4])))

            # Get new state and reward from environment
            next_state, reward, done, _ = env.step(action)
            state_queue.append(next_state)
def main():
    FLAGS(sys.argv)
    # 1. Create gym environment
    env = gym.make(FLAGS.env)
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    if FLAGS.algorithm == "deepq":
        act = deepq.load("models/deepq/%s" % FLAGS.file)

        nstack = 4
        nh, nw, nc = env.observation_space.shape
        history = np.zeros((1, nh, nw, nc * nstack), dtype=np.uint8)

        while True:
            obs, done = env.reset(), False
            history = update_history(history, obs)
            episode_rew = 0
            while not done:
                env.render()
                action = act(history)[0]
                obs, rew, done, _ = env.step(action)
                history = update_history(history, obs)
                episode_rew += rew
                print("action : %s reward : %s" % (action, rew))
            print("Episode reward", episode_rew)

    elif FLAGS.algorithm == "acktr":
        policy_fn = CnnPolicy
        model = acktr_disc.load(policy_fn, env, seed=0, total_timesteps=1,
                                nprocs=4, filename="models/acktr/%s" % FLAGS.file)

        nstack = 4
        nh, nw, nc = env.observation_space.shape
        history = np.zeros((1, nh, nw, nc * nstack), dtype=np.uint8)

        while True:
            obs, done = env.reset(), False
            history = update_history(history, obs)
            episode_rew = 0
            while not done:
                env.render()
                action = model.step(history)[0][0]
                obs, rew, done, _ = env.step(action)
                history = update_history(history, obs)
                episode_rew += rew
                print("action : %s reward : %s" % (action, rew))
            print("Episode reward", episode_rew)
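# update_history is defined elsewhere in the repo. A minimal sketch, assuming a
# frame-stacking helper consistent with how it is called above: drop the oldest
# frame's channels and append the newest observation at the end.
def update_history(history, obs):
    nc = obs.shape[-1]
    history = np.roll(history, shift=-nc, axis=3)
    history[:, :, :, -nc:] = obs
    return history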
def train_dqn(env_id, num_timesteps):
    """Train a DQN model.

    Parameters
    ----------
    env_id : str
        environment to train on
    num_timesteps : int
        number of env steps to optimize for
    """
    # 1. Create gym environment
    env = gym.make(FLAGS.env)
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)
    # 4. Create a CNN model for the Q-function
    model = cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=FLAGS.dueling
    )
    # 5. Train the model
    act = deepq.learn(
        env,
        q_func=model,
        lr=FLAGS.lr,
        max_timesteps=FLAGS.timesteps,
        buffer_size=10000,
        exploration_fraction=FLAGS.exploration_fraction,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=FLAGS.prioritized,
        callback=deepq_callback
    )
    act.save("mario_model.pkl")
    env.close()
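# deepq_callback is defined elsewhere in the repo. A minimal sketch, assuming
# the old OpenAI baselines convention where the callback receives locals() and
# globals() every step and returning True stops training early; the reward
# threshold below is a placeholder.
def deepq_callback(lcl, glb):
    if lcl['t'] > 100 and len(lcl['episode_rewards']) > 100:
        mean_100ep_reward = sum(lcl['episode_rewards'][-101:-1]) / 100
        return mean_100ep_reward >= 2000
    return False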
def _thunk():
    # 1. Create gym environment
    env = gym.make(env_id)
    env.seed(seed + rank)
    if logger.get_dir():
        env = bench.Monitor(
            env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
    gym.logger.setLevel(logging.WARN)
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)
    return env
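# A minimal sketch (not part of the original file) of how _thunk-style factories
# are typically consumed, assuming the OpenAI baselines SubprocVecEnv API; the
# make_mario_env helper and its arguments are illustrative only.
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_mario_env(env_id, num_env, seed):
    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = MarioActionSpaceWrapper(env)
            env = ProcessFrame84(env)
            return env
        return _thunk
    # SubprocVecEnv runs each environment in its own subprocess.
    return SubprocVecEnv([make_env(i) for i in range(num_env)])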
    summary_placeholders = [
        tf.placeholder(tf.float32) for _ in range(len(summary_vars))
    ]
    update_ops = [
        summary_vars[i].assign(summary_placeholders[i])
        for i in range(len(summary_vars))
    ]
    summary_op = tf.summary.merge_all()
    return summary_placeholders, update_ops, summary_op


if __name__ == "__main__":
    # Create the environment and the DQN agent
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    n_action = 5
    agent = DQNAgent(n_action=n_action)

    scores, episodes, global_step = [], [], 0

    for e in range(EPISODES):
        done = False
        dead = False
        step, score, start_life = 0, 0, 5
        observe = env.reset()  # [224, 256, 3] -> [84, 84, 1]
    summary_vars = [episode_total_reward, episode_avg_max_q,
                    episode_duration, episode_avg_loss]
    summary_placeholders = [tf.placeholder(tf.float32)
                            for _ in range(len(summary_vars))]
    update_ops = [summary_vars[i].assign(summary_placeholders[i])
                  for i in range(len(summary_vars))]
    summary_op = tf.summary.merge_all()
    return summary_placeholders, update_ops, summary_op


if __name__ == "__main__":
    # Create the environment and the DQN agent
    # Environment variants: SuperMarioBros-1-1-v0 ([224, 256, 3]),
    # SuperMarioBros-1-1-Tiles-v0 ([13, 16, 1]),
    # meta-SuperMarioBros-Tiles-v0 (moves on to the next stage when cleared)
    env = gym.make("ppaquette/SuperMarioBros-1-1-Tiles-v0")
    # Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # Apply observation space wrapper to reduce input size [224, 256, 3] -> [84, 84, 1]
    # env = ProcessFrame84(env)

    n_action = 7
    agent = DQNAgent(n_action=n_action)

    scores, episodes, global_step = [], [], 0

    for e in range(EPISODES):
        done = False
        dead = False
        step, score, start_life = 0, 0, 5
        observe = env.reset()  # [13, 16, 1]
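# A minimal sketch (not in the original file) of how the placeholders and
# update_ops returned above are typically fed at the end of an episode; sess,
# summary_writer, and the per-episode statistic names are assumptions.
stats = [score, avg_q_max / float(step), step, avg_loss / float(step)]
for i in range(len(stats)):
    sess.run(update_ops[i], feed_dict={summary_placeholders[i]: float(stats[i])})
summary_str = sess.run(summary_op)
summary_writer.add_summary(summary_str, e + 1)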
import gym
import gym_pull
import super_mario_bros
from wrappers import ProcessFrame84, MarioActionSpaceWrapper

# gym_pull.pull('github.com/ppaquette/gym-super-mario')

env = gym.make('ppaquette/meta-SuperMarioBros-v0')
print(env.observation_space, env.action_space)

env = MarioActionSpaceWrapper(env)
print(env.observation_space, env.action_space)

env = ProcessFrame84(env)
print(env.observation_space, env.action_space)

env.reset()
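# ProcessFrame84 lives in wrappers.py. A minimal sketch, assuming it follows
# the usual baselines-style Atari preprocessing (grayscale + resize to 84x84);
# the class below is illustrative only and not the repo's actual implementation.
import cv2
import numpy as np

class ProcessFrame84Sketch(gym.ObservationWrapper):
    def __init__(self, env=None):
        super(ProcessFrame84Sketch, self).__init__(env)
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(84, 84, 1))

    def observation(self, obs):
        # Convert the RGB frame to grayscale and resize it to 84x84x1.
        gray = cv2.cvtColor(obs.astype(np.uint8), cv2.COLOR_RGB2GRAY)
        resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
        return resized.reshape(84, 84, 1)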
def main():
    MAX_BUFFER_SIZE = 100000
    MAX_EPISODES = 10000
    TRAIN_EPISODE = 100
    TARGET_UPDATE_EPS = 1000
    batch_size = 32
    n_size = 84
    discount = 0.99
    checkpoint_dir = './checkpoints'
    save_file_name = 'mario_weight_2.ckpt'

    # 1. Create gym environment
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    # replay_buffer = PrioritizedReplayBuffer(MAX_BUFFER_SIZE, alpha=prioritized_replay_alpha)
    replay_buffer = ReplayBuffer(MAX_BUFFER_SIZE)

    sess = tf.Session()
    mainDQN = DQN(sess, name="main")
    targetDQN = DQN(sess, name="target")
    dqn_var_list = targetDQN.var_list

    sess.run(tf.global_variables_initializer())

    copy_ops = get_copy_var_ops(dest_scope_name="target", src_scope_name="main")
    sess.run(copy_ops)

    saver = tf.train.Saver(var_list=dqn_var_list)

    for eps in range(MAX_EPISODES):
        # decaying epsilon-greedy exploration
        e = 1. / ((eps / 10) + 1)
        done = False
        step_count = 0
        state = env.reset()

        state_queue = deque(maxlen=4)
        next_state_queue = deque(maxlen=4)
        state_queue.append(state)
        next_state_queue.append(state)

        prev_100 = 0
        curr_100 = 0

        while not done:
            step_count += 1

            # Accumulate 4 frames before training starts
            if step_count < 4:
                action = env.action_space.sample()
                next_state, reward, done, _ = env.step(action)
                state_queue.append(next_state)
                next_state_queue.append(next_state)
                continue

            # Training starts
            if np.random.rand() < e:
                action = env.action_space.sample()
            else:
                # Choose an action greedily from the Q-network
                action = np.argmax(
                    mainDQN.predict(
                        np.reshape(np.array(state_queue), [1, n_size, n_size, 4])))

            # Get new state and reward from environment
            next_state, reward, done, _ = env.step(action)
            if done:
                # Penalty
                reward = -100

            curr_100 += reward
            next_state_queue.append(next_state)

            replay_buffer.add(np.array(state_queue), action, reward,
                              np.array(next_state_queue), done)

            if step_count % TRAIN_EPISODE == 0:
                states, actions, rewards, next_states, _ = replay_buffer.sample(batch_size)
                states = np.reshape(states, [batch_size, n_size, n_size, 4])
                next_states = np.reshape(next_states, [batch_size, n_size, n_size, 4])

                Q_t = targetDQN.predict(next_states)
                Q_m = mainDQN.predict(states)
                Q_t = np.max(Q_t, axis=1)
                estimates = rewards + discount * Q_t
                Q_m[np.arange(batch_size), actions] = estimates

                loss = mainDQN.update(states, Q_m)
                print("eps: {} step: {} loss: {}".format(eps, step_count, loss))

                if curr_100 > prev_100:
                    save_path = saver.save(
                        sess, os.path.join(checkpoint_dir, save_file_name))
                    print("Model saved in file: %s" % save_path)
                    prev_100 = curr_100
                curr_100 = 0

            if step_count % TARGET_UPDATE_EPS == 0:
                sess.run(copy_ops)

            state_queue.append(next_state)
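# get_copy_var_ops is defined elsewhere in the repo. A minimal sketch, assuming
# the standard pattern of assigning the "main" network's trainable variables to
# the "target" network, consistent with how it is called above:
def get_copy_var_ops(dest_scope_name, src_scope_name):
    op_holder = []
    src_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
    dest_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)
    for src_var, dest_var in zip(src_vars, dest_vars):
        # Overwrite each target variable with the corresponding main variable.
        op_holder.append(dest_var.assign(src_var.value()))
    return op_holder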