def play(self, current_weights, epsilon=0.01):
    #: hide GPU from the remote actor process
    tf.config.set_visible_devices([], 'GPU')

    self.qnet.set_weights(current_weights)

    episode_steps, episode_rewards = 0, 0
    frame = preprocess_frame(self.env.reset())
    for _ in range(self.n_frames):
        self.frames.append(frame)

    done = False
    while not done:
        state = np.stack(self.frames, axis=2)[np.newaxis, ...]
        action = self.qnet.sample_action(state, epsilon=epsilon)
        next_frame, reward, done, _ = self.env.step(action)
        self.frames.append(preprocess_frame(next_frame))
        episode_steps += 1
        episode_rewards += reward
        #: abandon episodes that stall without scoring
        if episode_steps > 1000 and episode_rewards < 10:
            break

    return episode_steps, episode_rewards
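#: NOTE: preprocess_frame is called throughout this file but is not shown in
#: this excerpt. The sketch below is a minimal stand-in following the common
#: Atari convention (grayscale, resize to 84x84, scale to [0, 1]); the exact
#: crop and normalization used by the original util module are assumptions.
import cv2
import numpy as np

def preprocess_frame(frame):
    """Convert a raw (210, 160, 3) RGB Atari frame to an (84, 84) float frame."""
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
    return resized.astype(np.float32) / 255.0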
def test_play(self, n_testplay=1, monitor_dir=None, checkpoint_path=None):
    if checkpoint_path:
        env = gym.make(self.env_name)
        frame = util.preprocess_frame(env.reset())
        frames = collections.deque([frame] * self.n_frames,
                                   maxlen=self.n_frames)
        state = np.stack(frames, axis=2)[np.newaxis, ...]
        #: build the network variables with a dummy forward pass
        #: before loading the checkpoint
        self.qnet(state)
        self.qnet.load_weights(checkpoint_path)

    if monitor_dir:
        monitor_dir = Path(monitor_dir)
        if monitor_dir.exists():
            shutil.rmtree(monitor_dir)
        monitor_dir.mkdir()
        env = gym.wrappers.Monitor(gym.make(self.env_name), monitor_dir,
                                   force=True,
                                   video_callable=(lambda ep: True))
    else:
        env = gym.make(self.env_name)

    scores = []
    steps = []
    for _ in range(n_testplay):
        frame = util.preprocess_frame(env.reset())
        frames = collections.deque([frame] * self.n_frames,
                                   maxlen=self.n_frames)

        done = False
        episode_steps = 0
        episode_rewards = 0
        while not done:
            state = np.stack(frames, axis=2)[np.newaxis, ...]
            epsilon = 0 if self.use_noisy else 0.05
            action = self.qnet.sample_action(state, epsilon)
            next_frame, reward, done, _ = env.step(action)
            frames.append(util.preprocess_frame(next_frame))
            episode_rewards += reward
            episode_steps += 1
            if episode_steps > 500 and episode_rewards < 3:
                #: handle episodes that stall without the game
                #: ever being started (action: 0)
                break

        scores.append(episode_rewards)
        steps.append(episode_steps)

    return scores, steps
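#: NOTE: qnet.sample_action is called above but not defined in this excerpt.
#: A minimal epsilon-greedy sketch consistent with its call sites; the
#: standalone name and signature below are hypothetical.
import numpy as np
import tensorflow as tf

def sample_action_sketch(qnet, state, epsilon, action_space):
    if np.random.random() < epsilon:
        return np.random.randint(action_space)  #: explore: uniform random action
    qvalues = qnet(state)                       #: exploit: greedy w.r.t. Q(s, a)
    return int(tf.argmax(qvalues, axis=1)[0])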
def define_network(self):
    #: hide GPU from remote actor
    tf.config.set_visible_devices([], 'GPU')

    #: define by run
    frame = preprocess_frame(self.env.reset())
    for _ in range(self.n_frames):
        self.frames.append(frame)
    state = np.stack(self.frames, axis=2)[np.newaxis, ...]
    self.local_qnet(state)
def play_with_video(self, checkpoint_path, monitor_dir, epsilon=0.01):
    monitor_dir = Path(monitor_dir)
    if monitor_dir.exists():
        shutil.rmtree(monitor_dir)
    monitor_dir.mkdir()
    env = gym.wrappers.Monitor(gym.make(self.env_name), monitor_dir,
                               force=True,
                               video_callable=(lambda ep: True))

    frame = preprocess_frame(env.reset())
    frames = collections.deque([frame] * self.n_frames, maxlen=self.n_frames)

    #: build the network variables before loading the checkpoint
    state = np.stack(frames, axis=2)[np.newaxis, ...]
    self.qnet(state)
    self.qnet.load_weights(checkpoint_path)

    episode_steps, episode_rewards = 0, 0
    done = False
    while not done:
        state = np.stack(frames, axis=2)[np.newaxis, ...]
        action = self.qnet.sample_action(state, epsilon)
        next_frame, reward, done, _ = env.step(action)
        frames.append(preprocess_frame(next_frame))
        episode_steps += 1
        episode_rewards += reward

    return episode_rewards
def define_network(self):
    env = gym.make(self.env_name)
    frame = preprocess_frame(env.reset())
    frames = [frame] * self.n_frames
    state = np.stack(frames, axis=2)[np.newaxis, ...]

    #: define by run
    self.qnet(state)
    self.target_qnet(state)
    self.target_qnet.set_weights(self.qnet.get_weights())

    return self.qnet.get_weights()
def learn(self, n_episodes, logdir="log"):
    logdir = Path(__file__).parent / logdir
    if logdir.exists():
        shutil.rmtree(logdir)
    self.summary_writer = tf.summary.create_file_writer(str(logdir))

    for episode in range(1, n_episodes + 1):
        env = gym.make(self.env_name)
        frame = util.preprocess_frame(env.reset())
        frames = collections.deque([frame] * self.n_frames,
                                   maxlen=self.n_frames)

        episode_rewards = 0
        episode_steps = 0
        done = False
        lives = 5
        while not done:
            self.steps, episode_steps = self.steps + 1, episode_steps + 1
            state = np.stack(frames, axis=2)[np.newaxis, ...]
            action = self.qnet.sample_action(state, self.epsilon)
            next_frame, reward, done, info = env.step(action)
            episode_rewards += reward
            frames.append(util.preprocess_frame(next_frame))
            next_state = np.stack(frames, axis=2)[np.newaxis, ...]

            if info["ale.lives"] != lives:
                #: treat loss of life as the end of an episode
                lives = info["ale.lives"]
                transition = (state, action, reward, next_state, True)
            else:
                transition = (state, action, reward, next_state, done)
            self.replay_buffer.push(transition)

            #: start learning once the buffer has warmed up
            if len(self.replay_buffer) >= 50000:
                if self.steps % self.update_period == 0:
                    if self.use_categorical:
                        loss = self.update_categorical_network()
                    else:
                        loss = self.update_network()

                    with self.summary_writer.as_default():
                        tf.summary.scalar("loss", loss, step=self.steps)
                        tf.summary.scalar("buffer_size", len(self.replay_buffer),
                                          step=self.steps)
                        tf.summary.scalar("epsilon", self.epsilon, step=self.steps)
                        tf.summary.scalar("train_score", episode_rewards,
                                          step=self.steps)
                        tf.summary.scalar("train_steps", episode_steps,
                                          step=self.steps)

                if self.steps % self.target_update_period == 0:
                    self.target_qnet.set_weights(self.qnet.get_weights())

        print(f"Episode: {episode}, score: {episode_rewards}, steps: {episode_steps}")

        if episode % 20 == 0:
            test_scores, test_steps = self.test_play(n_testplay=1)
            with self.summary_writer.as_default():
                tf.summary.scalar("test_score", test_scores[0], step=self.steps)
                tf.summary.scalar("test_step", test_steps[0], step=self.steps)
                for layer in self.qnet.layers[-3:]:
                    for var in layer.variables:
                        tf.summary.histogram(var.name, var, step=self.steps)

        if episode % 500 == 0:
            self.qnet.save_weights("checkpoints/qnet")
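#: NOTE: update_network() is not shown in this excerpt. The sketch below is
#: one plausible implementation (a standard double-DQN update) consistent
#: with the attributes used above (qnet, target_qnet, replay_buffer, gamma,
#: action_space). The buffer's sample() signature, the batch size, and
#: self.optimizer are assumptions.
def update_network_sketch(self, batch_size=32):
    states, actions, rewards, next_states, dones = \
        self.replay_buffer.sample(batch_size)
    #: select next actions with the online network,
    #: evaluate them with the target network
    next_actions = tf.argmax(self.qnet(next_states), axis=1)
    next_mask = tf.one_hot(next_actions, self.action_space)
    max_next_q = tf.reduce_sum(self.target_qnet(next_states) * next_mask,
                               axis=1, keepdims=True)
    TQ = rewards + self.gamma * (1.0 - dones) * max_next_q
    with tf.GradientTape() as tape:
        mask = tf.one_hot(actions.flatten().astype(np.int32), self.action_space)
        Q = tf.reduce_sum(self.qnet(states) * mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(tf.square(TQ - Q))
    grads = tape.gradient(loss, self.qnet.trainable_variables)
    self.optimizer.apply_gradients(zip(grads, self.qnet.trainable_variables))
    return loss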
def rollout(self, current_weights):
    #: hide GPU from the remote actor process
    tf.config.set_visible_devices([], 'GPU')
    self.local_qnet.set_weights(current_weights)

    #: collect buffer_size transitions with the local network
    for _ in range(self.buffer_size):
        state = np.stack(self.frames, axis=2)[np.newaxis, ...]
        action = self.local_qnet.sample_action(state, self.epsilon)
        next_frame, reward, done, info = self.env.step(action)
        self.episode_steps += 1
        self.episode_rewards += reward
        self.frames.append(preprocess_frame(next_frame))
        next_state = np.stack(self.frames, axis=2)[np.newaxis, ...]

        if self.lives != info["ale.lives"]:
            #: loss of life as episode ends
            transition = (state, action, reward, next_state, True)
            self.lives = info["ale.lives"]
        else:
            transition = (state, action, reward, next_state, done)
        self.local_buffer.push(transition)

        if done:
            print(self.pid, self.episode_steps, self.episode_rewards,
                  round(self.epsilon, 3))
            self.episode_steps = 0
            self.episode_rewards = 0
            self.lives = 5
            frame = preprocess_frame(self.env.reset())
            for _ in range(self.n_frames):
                self.frames.append(frame)

    #: compute initial priorities for the collected experiences
    experiences = self.local_buffer.pull()
    states = np.vstack([exp.state for exp in experiences]).astype(np.float32)
    actions = np.vstack([exp.action for exp in experiences])
    rewards = np.array([exp.reward for exp in experiences]).reshape(-1, 1)
    next_states = np.vstack(
        [exp.next_state for exp in experiences]).astype(np.float32)
    dones = np.array([exp.done for exp in experiences]).reshape(-1, 1)

    next_actions, next_qvalues = self.local_qnet.sample_actions(next_states)
    next_actions_onehot = tf.one_hot(next_actions, self.action_space)
    max_next_qvalues = tf.reduce_sum(next_qvalues * next_actions_onehot,
                                     axis=1, keepdims=True)
    #: n-step TD target
    TQ = rewards + self.gamma ** self.nstep * (1 - dones) * max_next_qvalues

    qvalues = self.local_qnet(states)
    actions_onehot = tf.one_hot(actions.flatten().astype(np.int32),
                                self.action_space)
    Q = tf.reduce_sum(qvalues * actions_onehot, axis=1, keepdims=True)

    priorities = ((np.abs(TQ - Q) + 0.001) ** self.alpha).flatten()
    experiences = [zlib.compress(pickle.dumps(exp)) for exp in experiences]

    return priorities, experiences, self.pid
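#: NOTE on the priorities returned above: a proportional prioritized replay
#: buffer (Schaul et al., 2016) samples transition i with probability
#: P(i) = p_i / sum_k p_k. A minimal sketch of that sampling step; the
#: standalone function below is illustrative, not the repository's buffer.
import numpy as np

def sample_prioritized(priorities, batch_size):
    probs = priorities / priorities.sum()
    return np.random.choice(len(priorities), size=batch_size, p=probs)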
        x2 = self.dense2(x)
        advantages = self.advantages(x2)
        advantages = tf.reshape(
            advantages, (batch_size, self.action_space, self.n_atoms))
        advantages_mean = tf.reduce_mean(advantages, axis=1, keepdims=True)
        advantages_scaled = advantages - advantages_mean

        #: dueling aggregation, then a distribution over atoms per action
        logits = value + advantages_scaled
        probs = tf.nn.softmax(logits, axis=2)

        return probs


if __name__ == "__main__":
    import gym

    import util

    env = gym.make("BreakoutDeterministic-v4")
    frame = util.preprocess_frame(env.reset())
    frames = [frame] * 4
    state = np.stack(frames, axis=2)[np.newaxis, ...]

    action_space = 4
    model = NoisyQNetwork(action_space)
    out = model(state)
    print(out)
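    #: NOTE: the output above has shape (batch, action_space, n_atoms).
    #: C51 recovers scalar action values as Q(s, a) = sum_j z_j * p_j(s, a)
    #: over a fixed support z. The continuation below extends the demo;
    #: Vmin and Vmax are the values commonly used for Atari, not necessarily
    #: the ones configured in this repository.
    Vmin, Vmax = -10.0, 10.0
    n_atoms = out.shape[2]
    z = np.linspace(Vmin, Vmax, n_atoms).astype(np.float32)  #: support atoms
    qvalues = tf.reduce_sum(out * z, axis=2)                 #: (1, action_space)
    greedy_action = tf.argmax(qvalues, axis=1)
    print(qvalues, greedy_action)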