class DdqnPong():
    def __init__(self, playback_mode, env, render=True, mod=None):
        self._playback_mode = playback_mode
        self._env = env
        self._render = render
        self._sess = tf.Session()
        self._agent = DDQNAgent(self._sess, DIM_STATE, DIM_ACTION, LR, TAU, net_name='flat')
        self._sess.run(tf.global_variables_initializer())
        self._agent.update_target_paras()
        self._saver = tf.train.Saver()
        self._replay_buffer = ReplayBuffer(BUFFER_SIZE)
        self._explorer = Explorer(EPS_BEGIN, EPS_END, EPS_STEPS, playback_mode)
        self.summary = Summary(self._sess, DIR_SUM)
        self.summary.add_variable(tf.Variable(0.), 'reward')
        self.summary.add_variable(tf.Variable(0.), 'loss')
        self.summary.add_variable(tf.Variable(0.), 'maxq')
        self.summary.build()
        self.summary.write_variables(FLAGS)
        self._steps = 0
        if mod and os.path.exists(FLAGS.dir_mod.format(mod)):
            checkpoint = tf.train.get_checkpoint_state(FLAGS.dir_mod.format(mod))
            self._saver.restore(self._sess, save_path=checkpoint.model_checkpoint_path)
            print("Loaded checkpoints {0}".format(checkpoint.model_checkpoint_path))

    def start(self):
        for ep in range(MAX_EP):
            sum_reward = 0
            # Build the initial state by stacking the same pre-processed frame.
            last_state = []
            last_img = self._env.reset()
            last_img = pre_process_image(last_img, SCREEN_WIDTH, SCREEN_HEIGHT)
            for _ in range(STATE_FRAMES):
                last_state.append(last_img)
            last_state = np.dstack(last_state)
            for step in range(EP_STEPS):
                if self._render:
                    self._env.render()
                q_value = self._agent.predict([last_state])[0]
                last_max_qvalue = np.max(q_value)
                act_1_hot = self._explorer.get_action(q_value)
                act_index = np.argmax(act_1_hot)
                # ['NOOP-XX', 'FIRE->NOOP->0', 'RIGHT->UP->1', 'LEFT->DOWN->2', 'RIGHTFIRE-XX', 'LEFTFIRE-XX']
                observation, reward, done, info = self._env.step(act_index + 1)
                if reward == 0:
                    reward += 0.1
                state = pre_process_image(observation, SCREEN_WIDTH, SCREEN_HEIGHT)
                state = np.reshape(state, (SCREEN_WIDTH, SCREEN_HEIGHT, 1))
                state = np.append(state, last_state[:, :, :3], axis=2)
                self._replay_buffer.add(last_state, act_1_hot, reward, state, done)
                loss = None
                if not self._playback_mode and len(self._replay_buffer) > OBV_STEPS:
                    loss = self._train()
                last_state = state
                sum_reward += reward
                self._steps += 1
                if done or step == EP_STEPS - 1:
                    print('| Step: %i' % self._steps,
                          '| Episode: %i' % ep,
                          '| Epoch: %i' % step,
                          '| qvalue: %.5f' % last_max_qvalue,
                          '| Sum_Reward: %i' % sum_reward)
                    if loss is not None:
                        self.summary.run(feed_dict={
                            'loss': loss,
                            'reward': sum_reward,
                            'maxq': last_max_qvalue})
                    break

    def _train(self):
        batch_state, batch_action, batch_reward, batch_state_next, batch_done = \
            self._replay_buffer.sample_batch(MINI_BATCH)
        # Double DQN: the online network selects the next action, the target network evaluates it.
        q_value = self._agent.predict(batch_state_next)
        max_q_value_index = np.argmax(q_value, axis=1)
        target_q_value = self._agent.predict_target(batch_state_next)
        double_q = target_q_value[range(len(target_q_value)), max_q_value_index]
        batch_y = []
        for r, q, d in zip(batch_reward, double_q, batch_done):
            if d:
                batch_y.append(r)
            else:
                batch_y.append(r + GAMMA * q)
        _, loss = self._agent.train(batch_state, batch_action, batch_y)
        self._agent.update_target_paras()
        if not self._steps % CKP_STEP:
            self._saver.save(self._sess, DIR_MOD + '/net', global_step=self._steps)
            print('Mod saved!')
        return loss
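# Hypothetical entry point (not in the original source): a minimal sketch of how
# DdqnPong could be driven, assuming an OpenAI Gym Atari Pong environment whose
# step() returns (observation, reward, done, info), as the class above expects.
# The environment id 'Pong-v0' is an assumption, not taken from the original code.
if __name__ == '__main__':
    import gym
    pong_env = gym.make('Pong-v0')
    trainer = DdqnPong(playback_mode=False, env=pong_env, render=True, mod=None)
    trainer.start()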
class DqnHalfPongSyr(PyGamePlayer):
    def __init__(self, playback_mode, mod=None, net_name='pong_syr'):
        self._playback_mode = playback_mode
        self._last_reward = 0
        super(DqnHalfPongSyr, self).__init__(force_game_fps=8, run_real_time=playback_mode)
        self._last_state = None
        self._last_action = np.zeros(DIM_ACTION)
        self._last_action[1] = 1
        self.sess = tf.Session()
        self.agent = DDQNAgent(self.sess, DIM_STATE, DIM_ACTION, LR, TAU, net_name=net_name)
        self.sess.run(tf.global_variables_initializer())
        self.agent.update_target_paras()
        self.saver = tf.train.Saver()
        self.replay_buffer = ReplayBuffer(BUFFER_SIZE)
        self.explorer = Explorer(EPS_BEGIN, EPS_END, EPS_STEPS, playback_mode)
        self.summary = Summary(self.sess, DIR_SUM)
        self.summary.add_variable(tf.Variable(0.), 'reward')
        self.summary.add_variable(tf.Variable(0.), 'loss')
        self.summary.build()
        self.summary.write_variables(FLAGS)
        self._steps = 0
        self._sum_reward = [0]
        self._dif_reward = deque(maxlen=EP_STEPS)
        if mod and os.path.exists(FLAGS.dir_mod.format(mod)):
            checkpoint = tf.train.get_checkpoint_state(FLAGS.dir_mod.format(mod))
            self.saver.restore(self.sess, save_path=checkpoint.model_checkpoint_path)
            print("Loaded checkpoints {0}".format(checkpoint.model_checkpoint_path))

    def get_keys_pressed(self, screen_array, feedback, terminal):
        _, screen_binary = cv2.threshold(
            cv2.cvtColor(screen_array, cv2.COLOR_BGR2GRAY), 1, 255, cv2.THRESH_BINARY)
        if self._last_state is None:
            self._last_state = np.stack(tuple(screen_binary for _ in range(STATE_FRAMES)), axis=2)
            return DqnHalfPongSyr._key_presses_from_action(self._last_action)
        screen_binary = np.reshape(screen_binary, (SCREEN_WIDTH, SCREEN_HEIGHT, 1))
        current_state = np.append(self._last_state[:, :, 1:], screen_binary, axis=2)
        if not self._playback_mode:
            self.replay_buffer.add(self._last_state, self._last_action, feedback, current_state, terminal)
            if len(self.replay_buffer) > OBV_STEPS:
                loss = self._train()
                self._sum_reward.append(feedback)
                if feedback != 0.0:
                    self._dif_reward.append(feedback)
                if not self._steps % EP_STEPS:
                    print('| Step: %i' % self._steps,
                          '| Epoch: %i' % (self._steps / EP_STEPS),
                          '| Sum_Reward: %i' % sum(self._sum_reward),
                          '| Dif_Reward: %.4f' % (sum(self._dif_reward) / len(self._dif_reward)))
                    if not self._steps % (EP_STEPS * 10):
                        self.summary.run(feed_dict={
                            'loss': loss,
                            'reward': sum(self._sum_reward)})
                    self._sum_reward = [0]
        self._last_state = current_state
        self._last_action = self._get_action()
        return DqnHalfPongSyr._key_presses_from_action(self._last_action)

    def _get_action(self):
        target_q = self.agent.predict([self._last_state])[0]
        return self.explorer.get_action(target_q)

    def _train(self):
        self._steps += 1
        batch_state, batch_action, batch_reward, batch_state_next, batch_done = \
            self.replay_buffer.sample_batch(MINI_BATCH)
        # Double DQN: the online network selects the next action, the target network evaluates it.
        q_value = self.agent.predict(batch_state_next)
        max_q_value_index = np.argmax(q_value, axis=1)
        target_q_value = self.agent.predict_target(batch_state_next)
        double_q = target_q_value[range(len(target_q_value)), max_q_value_index]
        batch_y = []
        for r, q, d in zip(batch_reward, double_q, batch_done):
            if d:
                batch_y.append(r)
            else:
                batch_y.append(r + GAMMA * q)
        _, loss = self.agent.train(batch_state, batch_action, batch_y)
        self.agent.update_target_paras()
        if not self._steps % CKP_STEP:
            self.saver.save(self.sess, DIR_MOD + '/net', global_step=self._steps)
            print('Mod saved!')
        return loss

    def get_feedback(self):
        from Env.games.half_pong import score
        # get the difference in score between this and the last run
        score_change = score - self._last_reward
        self._last_reward = score
        return float(score_change), score_change == -1

    @staticmethod
    def _key_presses_from_action(action_set):
        if action_set[0] == 1:
            return [K_DOWN]
        elif action_set[1] == 1:
            return []
        elif action_set[2] == 1:
            return [K_UP]
        raise Exception("Unexpected action")

    def start(self):
        super(DqnHalfPongSyr, self).start()
        from Env.games.half_pong import run
        run(screen_width=SCREEN_WIDTH, screen_height=SCREEN_HEIGHT)
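# Hypothetical entry point (not in the original source): a minimal sketch of how
# DqnHalfPongSyr might be launched. Its start() method wires up the PyGamePlayer
# hooks and then runs the bundled half_pong game itself, so no external environment
# is passed in.
if __name__ == '__main__':
    player = DqnHalfPongSyr(playback_mode=False)
    player.start()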
class DdqnBirdSyr():
    def __init__(self, playback_mode, mod=None):
        self._playback_mode = playback_mode
        env = FlappyBird(pipe_gap=200)
        self._ple = PLE(env, fps=30, display_screen=DISPLAY)
        self._ple.init()
        self._sess = tf.Session()
        self._agent = DDQNAgent(self._sess, DIM_STATE, DIM_ACTION, LR, TAU, net_name='cnn_bird')
        self._sess.run(tf.global_variables_initializer())
        self._agent.update_target_paras()
        self._saver = tf.train.Saver()
        self._replay_buffer = ReplayBuffer(BUFFER_SIZE)
        self._explorer = Explorer(EPS_BEGIN, EPS_END, EPS_STEPS, playback_mode)
        self.summary = Summary(self._sess, DIR_SUM)
        self.summary.add_variable(tf.Variable(0.), 'reward')
        self.summary.add_variable(tf.Variable(0.), 'loss')
        self.summary.add_variable(tf.Variable(0.), 'maxq')
        self.summary.build()
        self.summary.write_variables(FLAGS)
        self._steps = 0
        if mod and os.path.exists(FLAGS.dir_mod.format(mod)):
            checkpoint = tf.train.get_checkpoint_state(FLAGS.dir_mod.format(mod))
            self._saver.restore(self._sess, save_path=checkpoint.model_checkpoint_path)
            print("Loaded checkpoints {0}".format(checkpoint.model_checkpoint_path))

    def start(self):
        for ep in range(MAX_EP):
            sum_reward = 0
            last_state = []
            for _ in range(STATE_FRAMES):
                last_state.append(self._ple.getScreenGrayscale())
            last_state = np.dstack(last_state)
            last_max_qvalue = 0
            for step in range(EP_STEPS):
                time.sleep(0.01)
                if not step % STATE_FRAMES:
                    q_value = self._agent.predict([last_state])[0]
                    last_max_qvalue = np.max(q_value)
                    act_1_hot = self._explorer.get_action(q_value)
                    act_index = np.argmax(act_1_hot)
                else:
                    # do nothing
                    act_index = 1
                    act_1_hot = np.zeros(DIM_ACTION)
                    act_1_hot[act_index] = 1
                reward = self._ple.act(self._ple.getActionSet()[act_index])
                if reward == 0:
                    reward = 0.1
                elif reward == -5:
                    reward = -1
                state = np.reshape(self._ple.getScreenGrayscale(), (SCREEN_WIDTH, SCREEN_HEIGHT, 1))
                state = np.append(state, last_state[:, :, :3], axis=2)
                done = self._ple.game_over()
                self._replay_buffer.add(last_state, act_1_hot, reward, state, done)
                loss = None
                if not self._playback_mode and len(self._replay_buffer) > OBV_STEPS:
                    loss = self._train()
                last_state = state
                sum_reward += reward
                self._steps += 1
                if done or step == EP_STEPS - 1:
                    print('| Step: %i' % self._steps,
                          '| Episode: %i' % ep,
                          '| Epoch: %i' % step,
                          '| qvalue: %.5f' % last_max_qvalue,
                          '| Sum_Reward: %i' % sum_reward)
                    if loss is not None:
                        self.summary.run(feed_dict={
                            'loss': loss,
                            'reward': sum_reward,
                            'maxq': last_max_qvalue})
                    self._ple.reset_game()
                    break

    def _train(self):
        batch_state, batch_action, batch_reward, batch_state_next, batch_done = \
            self._replay_buffer.sample_batch(MINI_BATCH)
        # Double DQN: the online network selects the next action, the target network evaluates it.
        q_value = self._agent.predict(batch_state_next)
        max_q_value_index = np.argmax(q_value, axis=1)
        target_q_value = self._agent.predict_target(batch_state_next)
        double_q = target_q_value[range(len(target_q_value)), max_q_value_index]
        batch_y = []
        for r, q, d in zip(batch_reward, double_q, batch_done):
            if d:
                batch_y.append(r)
            else:
                batch_y.append(r + GAMMA * q)
        _, loss = self._agent.train(batch_state, batch_action, batch_y)
        self._agent.update_target_paras()
        if not self._steps % CKP_STEP:
            self._saver.save(self._sess, DIR_MOD + '/net', global_step=self._steps)
            print('Mod saved!')
        return loss
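# Illustration (not part of the original source): the Double DQN target that every
# _train() above computes, written out on toy numpy arrays. The online network only
# selects the greedy next action; the target network evaluates it; terminal
# transitions keep the raw reward. All numbers below are made up for the example.
if __name__ == '__main__':
    import numpy as np

    gamma = 0.99
    q_online_next = np.array([[0.1, 0.5, 0.2],   # online net Q(s', .) -> action selection
                              [0.3, 0.1, 0.4]])
    q_target_next = np.array([[0.2, 0.4, 0.3],   # target net Q(s', .) -> action evaluation
                              [0.5, 0.2, 0.6]])
    rewards = np.array([0.1, -1.0])
    done = np.array([False, True])

    best_a = np.argmax(q_online_next, axis=1)                 # [1, 2]
    double_q = q_target_next[np.arange(len(best_a)), best_a]  # [0.4, 0.6]
    y = np.where(done, rewards, rewards + gamma * double_q)   # bootstrap only if not terminal
    print(y)  # [0.496, -1.0]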