class Execute:
    def __init__(self, path):
        self.config = Configuration.construct(path)
        self.env = Environment(self.config)
        self.memory = ReplayMemory(self.config)
        self.model = Model(self.config)
        self.ep = None

    def get_epsilon(self, is_play):
        # epsilon-greedy schedule: linear decay from ep_start to ep_final over num_frames
        if is_play:
            return self.config.play.ep
        ep_start = self.config.train.ep.start
        ep_final = self.config.train.ep.final
        ep_num_frames = self.config.train.ep.num_frames
        decay = (ep_start - ep_final) / ep_num_frames
        if self.ep is None:
            self.ep = ep_start
        self.ep = max(self.ep - decay, ep_final)
        return self.ep

    def log(self, **kwargs):
        log = ""
        for name, value in kwargs.items():
            log += f"{name}: {value}, "
        print(log)

    def run_episode(self, episode=1, steps=0, is_play=True, debug=False):
        config = self.config
        self.env.reset()
        action = 1
        _, _, curr_state, is_done = self.env.step(action)
        total_reward = 0
        update_net = 0
        C = config.train.network_update_freq
        t = 0
        T = config.max_episode_length

        while not is_done and t < T:
            # pick a new action only every action_repeat frames
            if t % config.action_repeat == 0:
                ep = self.get_epsilon(is_play)
                action = self.model.choose_action(curr_state, ep)
            prev_state, reward, curr_state, is_done = self.env.step(action)
            total_reward += reward
            t += 1

            if is_play:
                self.env.render("human")
                if debug and t % config.play.debug.time == 0:
                    self.log(ftype=self.env.get_frame_type(), action=action, reward=total_reward)
                continue

            self.memory.add((prev_state, action, reward, curr_state, is_done))
            # only start optimizing once the replay buffer is warm
            if self.memory.get_size() > config.train.replay_start_size:
                for i in range(config.train.batch_run):
                    batch = self.memory.sample()
                    self.model.optimize(batch)
                    steps = (steps + 1) % C
                    # synchronize the target network every C optimization steps
                    if steps % C == 0:
                        self.model.update_qhat()
                        update_net += 1

        if not is_play and debug and episode % config.train.debug.time == 0:
            self.log(ftype=self.env.get_frame_type(), total_reward=total_reward,
                     network_update_steps=update_net, episode_time=t, ep=ep)

        return total_reward, steps

    def load_model(self):
        ftype = self.env.get_frame_type()
        in_size = self.env.get_in_size()
        num_actions = self.env.get_num_actions()
        self.model.load_model(ftype, in_size, num_actions)

    def play(self, debug=False):
        self.load_model()
        for ep in range(1):
            self.run_episode(is_play=True, debug=debug)

    def train(self, debug=False):
        self.load_model()
        optimize_steps = 0
        episodes = self.config.train.episodes
        for episode in range(1, episodes + 1):
            reward, steps = self.run_episode(episode=episode, steps=optimize_steps,
                                             is_play=False, debug=debug)
            optimize_steps += steps
            if episode % self.config.train.save_model_episode == 0:
                self.model.save_model()
        self.model.update_qhat()
        self.model.save_model()

    def close(self):
        self.env.close()
        self.memory.close()
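To see how this class is meant to be driven, here is a minimal, hypothetical driver: the config path, the CLI flags, and the train/play switch are illustrative assumptions, not part of the original code.

# Hypothetical driver for Execute; the flag names and config path are illustrative only.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default="configs/pong.yaml")  # assumed config location
    parser.add_argument("--mode", choices=["train", "play"], default="train")
    parser.add_argument("--debug", action="store_true")
    cli = parser.parse_args()

    runner = Execute(cli.config)
    try:
        if cli.mode == "train":
            runner.train(debug=cli.debug)
        else:
            runner.play(debug=cli.debug)
    finally:
        # release the environment and replay memory even on interruption
        runner.close()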
class Agent:
    def __init__(self, args):
        # which Gym environment to load
        self.env_id = "PongNoFrameskip-v4"
        # create the environment
        self.env = Environment(self.env_id)
        # discount factor in the Q-value formula
        self.discount_factor = 0.99
        self.batch_size = 64
        # how often to update the main network (backpropagation)
        self.update_frequency = 4
        # how often to synchronize with the target network
        self.target_network_update_freq = 1000
        # number of stacked frames fed to the network
        self.agent_history_length = 4
        # keeps track of the frames for training, and retrieves them in batches
        self.memory = ReplayMemory(capacity=10000, batch_size=self.batch_size)
        # two neural networks: one main (online) network and one target network
        self.main_network = PongNetwork(num_actions=self.env.get_action_space_size(),
                                        agent_history_length=self.agent_history_length)
        self.target_network = PongNetwork(num_actions=self.env.get_action_space_size(),
                                          agent_history_length=self.agent_history_length)
        # Adam optimizer, a standard choice
        self.optimizer = Adam(learning_rate=1e-4, epsilon=1e-6)
        # start with a high exploration rate, then slowly decrease it
        self.init_explr = 1.0
        self.final_explr = 0.1
        self.final_explr_frame = 1000000
        self.replay_start_size = 10000
        # Huber loss for the TD error
        self.loss = tf.keras.losses.Huber()
        # running mean of the training loss
        self.loss_metric = tf.keras.metrics.Mean(name="loss")
        # running mean of the predicted Q values
        self.q_metric = tf.keras.metrics.Mean(name="Q_value")
        # maximum number of frames to train; training usually stops well before this
        self.training_frames = int(1e7)
        # paths for checkpoints, logs and weights
        self.checkpoint_path = "./checkpoints/" + args.run_name
        self.tensorboard_writer = tf.summary.create_file_writer(self.checkpoint_path + "/runs/")
        self.print_log_interval = 10
        self.save_weight_interval = 10
        self.env.reset()

    # calculate the network loss on the replay buffer (Q-learning)
    def update_main_q_network(self, state_batch, action_batch, reward_batch, next_state_batch, terminal_batch):
        with tf.GradientTape() as tape:
            # THIS IS WHERE THE MAGIC HAPPENS!
            # L = Q(s, a) - (r + discount_factor * max_a' Q(s', a'))
            next_state_q = self.target_network(next_state_batch)
            next_state_max_q = tf.math.reduce_max(next_state_q, axis=1)
            expected_q = reward_batch + self.discount_factor * next_state_max_q * (1.0 - tf.cast(terminal_batch, tf.float32))
            main_q = tf.reduce_sum(self.main_network(state_batch) * tf.one_hot(action_batch, self.env.get_action_space_size(), 1.0, 0.0), axis=1)
            loss = self.loss(tf.stop_gradient(expected_q), main_q)

        gradients = tape.gradient(loss, self.main_network.trainable_variables)
        clipped_gradients = [tf.clip_by_norm(grad, 10) for grad in gradients]
        self.optimizer.apply_gradients(zip(clipped_gradients, self.main_network.trainable_variables))

        self.loss_metric.update_state(loss)
        self.q_metric.update_state(main_q)
        return loss

    # calculate the network loss on the replay buffer (Double Q-learning)
    def update_main_dq_network(self, state_batch, action_batch, reward_batch, next_state_batch, terminal_batch):
        with tf.GradientTape() as tape:
            # THIS IS WHERE THE MAGIC HAPPENS!
            # here we maintain two Q estimates: the online network picks the best action
            # in the next state, and the target network evaluates that action
            q_online = self.main_network(next_state_batch)       # Q values from the online network
            action_q_online = tf.math.argmax(q_online, axis=1)   # optimal actions according to q_online
            q_target = self.target_network(next_state_batch)     # Q values from the target network
            ddqn_q = tf.reduce_sum(q_target * tf.one_hot(action_q_online, self.env.get_action_space_size(), 1.0, 0.0), axis=1)
            expected_q = reward_batch + self.discount_factor * ddqn_q * (1.0 - tf.cast(terminal_batch, tf.float32))  # corresponds to equation (4) in the DDQN paper
            main_q = tf.reduce_sum(self.main_network(state_batch) * tf.one_hot(action_batch, self.env.get_action_space_size(), 1.0, 0.0), axis=1)
            loss = self.loss(tf.stop_gradient(expected_q), main_q)

        gradients = tape.gradient(loss, self.main_network.trainable_variables)
        clipped_gradients = [tf.clip_by_norm(grad, 10) for grad in gradients]
        self.optimizer.apply_gradients(zip(clipped_gradients, self.main_network.trainable_variables))

        self.loss_metric.update_state(loss)
        self.q_metric.update_state(main_q)
        return loss

    # get the next action index based on the state (84, 84, 4) and the exploration rate
    def get_action(self, state, exploration_rate):
        recent_state = tf.expand_dims(state, axis=0)
        if tf.random.uniform((), minval=0, maxval=1, dtype=tf.float32) < exploration_rate:
            action = tf.random.uniform((), minval=0, maxval=self.env.get_action_space_size(), dtype=tf.int32)
        else:
            q_value = self.main_network(tf.cast(recent_state, tf.float32))
            action = tf.cast(tf.squeeze(tf.math.argmax(q_value, axis=1)), dtype=tf.int32)
        return action

    # get the epsilon value for the current step. Similar to https://openai.com/blog/openai-baselines-dqn/
    def get_eps(self, current_step, terminal_eps=0.01, terminal_frame_factor=25):
        terminal_eps_frame = self.final_explr_frame * terminal_frame_factor

        if current_step < self.replay_start_size:
            eps = self.init_explr
        elif self.replay_start_size <= current_step < self.final_explr_frame:
            eps = (self.final_explr - self.init_explr) / (self.final_explr_frame - self.replay_start_size) * (current_step - self.replay_start_size) + self.init_explr
        elif self.final_explr_frame <= current_step < terminal_eps_frame:
            eps = (terminal_eps - self.final_explr) / (terminal_eps_frame - self.final_explr_frame) * (current_step - self.final_explr_frame) + self.final_explr
        else:
            eps = terminal_eps
        return eps

    # copy the weights from the main network to the target network to synchronize them
    def update_target_network(self):
        main_vars = self.main_network.trainable_variables
        target_vars = self.target_network.trainable_variables
        for main_var, target_var in zip(main_vars, target_vars):
            target_var.assign(main_var)

    def train(self, algorithm='q'):
        total_step = 0
        episode = 0
        latest_mean_score = -99.99
        latest_100_score = deque(maxlen=100)
        # somewhat arbitrary, but a well-trained agent tops out around +20 in Pong
        max_reward = 20.0

        # train until the mean reward over the last 100 episodes reaches max_reward
        while latest_mean_score < max_reward:
            # reset the variables for the upcoming episode
            state = self.env.reset()
            episode_step = 0
            episode_score = 0.0
            done = False

            while not done:
                # while the episode is not done, compute epsilon and pick the next action
                eps = self.get_eps(tf.constant(total_step, tf.float32))
                action = self.get_action(tf.constant(state), tf.constant(eps, tf.float32))

                next_state, reward, done, info = self.env.step(action)
                episode_score += reward

                self.memory.push(state, action, reward, next_state, done)
                state = next_state

                # update the network
                if (total_step % self.update_frequency == 0) and (total_step > self.replay_start_size):
                    indices = self.memory.get_minibatch_indices()
                    state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = self.memory.generate_minibatch_samples(indices)
                    if algorithm == 'q':
                        self.update_main_q_network(state_batch, action_batch, reward_batch, next_state_batch, terminal_batch)
                    else:
                        self.update_main_dq_network(state_batch, action_batch, reward_batch, next_state_batch, terminal_batch)

                if (total_step % self.target_network_update_freq == 0) and (total_step > self.replay_start_size):
                    self.update_target_network()

                total_step += 1
                episode_step += 1

                if done:
                    latest_100_score.append(episode_score)
                    self.write_summary(episode, latest_100_score, episode_score, total_step, eps)
                    episode += 1

                    if episode % self.print_log_interval == 0:
                        print("Episode: ", episode)
                        print("Latest 100 avg: {:.4f}".format(np.mean(latest_100_score)))
                        print("Progress: {} / {} ( {:.2f} % )".format(total_step, self.training_frames, np.round(total_step / self.training_frames, 3) * 100))
                        latest_mean_score = np.mean(latest_100_score)

                    if episode % self.save_weight_interval == 0:
                        print("Saving weights...")
                        self.main_network.save_weights(self.checkpoint_path + "/weights/episode_{}".format(episode))

    # write the summaries to TensorBoard
    def write_summary(self, episode, latest_100_score, episode_score, total_step, eps):
        with self.tensorboard_writer.as_default():
            tf.summary.scalar("Reward", episode_score, step=episode)
            tf.summary.scalar("Latest 100 avg rewards", np.mean(latest_100_score), step=episode)
            tf.summary.scalar("Loss", self.loss_metric.result(), step=episode)
            tf.summary.scalar("Average Q", self.q_metric.result(), step=episode)
            tf.summary.scalar("Total Frames", total_step, step=episode)
            tf.summary.scalar("Epsilon", eps, step=episode)

        self.loss_metric.reset_states()
        self.q_metric.reset_states()
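Assuming the imports used above (tensorflow as tf, numpy as np, collections.deque, the Adam optimizer, and the Environment, ReplayMemory and PongNetwork helpers) are available, a minimal entry point could look like the sketch below; Agent.__init__ only reads args.run_name, and the --algorithm flag is an illustrative addition.

# Illustrative entry point; only run_name is actually read by Agent.__init__.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--run_name", default="pong_ddqn")                  # used for checkpoint/log paths
    parser.add_argument("--algorithm", choices=["q", "dq"], default="dq")   # 'q' = DQN, 'dq' = Double DQN
    args = parser.parse_args()

    agent = Agent(args)
    # train() loops until the 100-episode average reward reaches 20
    agent.train(algorithm=args.algorithm)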
    encoding_specific = args.frozen_model
elif args.spatial_encoding == 'pc-gauss' or args.spatial_encoding == 'pc-gauss-softmax':
    encoding_specific = args.pc_gauss_sigma
elif args.spatial_encoding == 'pc-dog':
    encoding_specific = '{}-{}'.format(args.pc_gauss_sigma, args.pc_diff_sigma)
elif args.spatial_encoding == 'hex-trig':
    encoding_specific = args.hex_freq_coef
elif args.spatial_encoding == 'tile-coding':
    encoding_specific = '{}tiles_{}bins'.format(args.n_tiles, args.n_bins)

if not os.path.exists('data'):
    os.makedirs('data')

fname = 'data/random_walk_{}_{}_{}to{}_{}dim_{}steps.npz'.format(
    args.spatial_encoding,
    encoding_specific,
    args.limit_low,
    args.limit_high,
    args.dim,
    args.n_steps,
)

positions = np.zeros((args.n_steps, 2))
activations = np.zeros((args.n_steps, dim))

for n in range(args.n_steps):
    print('\x1b[2K\r Step {} of {}'.format(n + 1, args.n_steps), end="\r")
    activations[n, :], positions[n, :] = env.step()

np.savez(fname, positions=positions, activations=activations)
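A quick way to verify the archive is to reload it with np.load; this small check reuses the fname built above and only assumes the script ran to completion.

# Sanity check on the generated dataset (fname as constructed above).
data = np.load(fname)
positions = data['positions']      # shape: (n_steps, 2) -- x, y coordinates of the walk
activations = data['activations']  # shape: (n_steps, dim) -- encoding activations per step
print(positions.shape, activations.shape)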
ax = plt.axes(xlim=(args.limit_low, args.limit_high), ylim=(args.limit_low, args.limit_high))
line, = ax.plot([], [], lw=3)

x = []
y = []

fig_pca = plt.figure()
# ax_pca = plt.axes(xlim=(0, args.n_components), ylim=(0, args.n_components))
ax_pca = plt.axes()
image = ax_pca.imshow(
    np.random.uniform(0, 1, size=(args.n_components, args.n_components)))

plt.show(block=False)

for n in range(n_steps):
    activations, pos = env.step()

    if all_activations is None:
        all_activations = activations.reshape((1, dim)).copy()
    else:
        all_activations = np.append(all_activations, activations.reshape((1, dim)), axis=0)

    if all_pos is None:
        all_pos = pos.reshape((1, 2)).copy()
    else:
        all_pos = np.append(all_pos, pos.reshape((1, 2)), axis=0)

    if show_pca:
        # pca.fit(all_activations)
class Game(object):
    """A single episode of interaction with the environment."""

    def __init__(self, action_space_size: int, discount: float):
        self.environment = Environment()  # Game specific environment.
        self.history = []
        self.rewards = []
        self.child_visits = []
        self.root_values = []
        self.action_space_size = action_space_size
        self.discount = discount

    def terminal(self) -> bool:
        # Game specific termination rules.
        pass

    def legal_actions(self) -> List[Action]:
        # Game specific calculation of legal actions.
        return []

    def apply(self, action: Action):
        reward = self.environment.step(action)
        self.rewards.append(reward)
        self.history.append(action)

    def store_search_statistics(self, root: Node):
        sum_visits = sum(child.visit_count for child in root.children.values())
        action_space = (Action(index) for index in range(self.action_space_size))
        self.child_visits.append([
            root.children[a].visit_count / sum_visits if a in root.children else 0
            for a in action_space
        ])
        self.root_values.append(root.value())

    def make_image(self, state_index: int):
        # Game specific feature planes.
        return []

    def make_target(self, state_index: int, num_unroll_steps: int, td_steps: int,
                    to_play: Player):
        # The value target is the discounted root value of the search tree N steps
        # into the future, plus the discounted sum of all rewards until then.
        targets = []
        for current_index in range(state_index, state_index + num_unroll_steps + 1):
            bootstrap_index = current_index + td_steps
            if bootstrap_index < len(self.root_values):
                value = self.root_values[bootstrap_index] * self.discount**td_steps
            else:
                value = 0

            if self.rewards:
                for i, reward in enumerate(self.rewards[current_index:bootstrap_index]):
                    value += reward * self.discount**i  # pytype: disable=unsupported-operands

            if current_index < len(self.root_values):
                if self.rewards:
                    targets.append((value, self.rewards[current_index],
                                    self.child_visits[current_index]))
                else:
                    targets.append((value, 0, self.child_visits[current_index]))
            else:
                # States past the end of games are treated as absorbing states.
                targets.append((0, 0, []))
        return targets

    def to_play(self) -> Player:
        return Player()

    def action_history(self) -> ActionHistory:
        return ActionHistory(self.history, self.action_space_size)
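As a rough illustration of how these pieces feed training, the sketch below samples one training example from a finished game; the sample_targets helper, the random starting position, and the unroll/TD settings are illustrative assumptions rather than part of the pseudocode above.

# Illustrative use of Game.make_target when building one training example.
# num_unroll_steps and td_steps are placeholder hyperparameters; the random
# starting position stands in for a replay buffer's sampling rule.
import random

def sample_targets(game: Game, num_unroll_steps: int = 5, td_steps: int = 10):
    state_index = random.randrange(len(game.root_values))
    image = game.make_image(state_index)
    actions = game.history[state_index:state_index + num_unroll_steps]
    targets = game.make_target(state_index, num_unroll_steps, td_steps, game.to_play())
    # each target is a (value, reward, policy) tuple for one unrolled step
    return image, actions, targets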