import datetime
import json
from pathlib import Path

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

import game
#note: the modules providing ExperienceMemory and QLearner are assumed to be
#named experience_memory and qlearner; adjust to this project's actual layout
from experience_memory import ExperienceMemory
from qlearner import QLearner


class QTrainer:
    def __init__(self, config_or_model, load_model=False):
        self.config = None
        self.model_loaded = False

        #load a saved model
        if load_model:
            print("Loading model from: {}".format(config_or_model))
            load_path = Path(config_or_model)
            if (not load_path.exists()) or (not load_path.is_dir()):
                raise FileNotFoundError(
                    "Model directory doesn't exist: {}".format(load_path))
            config_filename = load_path.joinpath("config.json")
            self.config = self.load_config(str(config_filename))
        else:
            self.config = self.load_config(config_or_model)

        #select game
        self.game_name = self.config["game"]
        self.game = None
        if self.game_name == "snake":
            self.game = game.Snake
        elif self.game_name == "box":
            self.game = game.Box
        else:
            raise ValueError("Unknown game {}".format(self.game_name))

        self.nn_config = self.config["nn"]

        #parameters of experience memory
        self.memory_size = self.config["memory_size"]
        self.memory_alpha = self.config["memory_alpha"]
        self.memory_beta_start = self.config["memory_beta_start"]
        self.memory_beta_end = self.config["memory_beta_end"]
        self.memory_beta_num_steps = self.config["memory_beta_num_steps"]
        self.memory_beta_step = (self.memory_beta_end - self.memory_beta_start
                                 ) / self.memory_beta_num_steps
        self.exp_memory_start_size = self.config["memory_start_size"]

        #game parameters: image size, board size, num_goals, ...
        self.width = self.config["width"]
        self.height = self.config["height"]
        self.image_scale_factor = self.config["image_scale_factor"]
        self.num_goals = self.config["num_goals"]
        self.img_width = self.width * self.image_scale_factor
        self.img_height = self.height * self.image_scale_factor
        self.num_img_channels = self.game.num_channels
        self.num_actions = self.game.num_actions

        #random policy parameters
        self.epsilon_start = self.config["epsilon_start"]
        self.epsilon_min = self.config["epsilon_min"]
        self.num_epsilon_steps = self.config["num_epsilon_steps"]
        self.epsilon_step = (self.epsilon_start -
                             self.epsilon_min) / self.num_epsilon_steps

        #scale rewards; training might be more stable if q-values converge to the range [-1, 1]
        self.scale_reward_max = None
        if "scale_reward_max" in self.config:
            self.scale_reward_max = self.config["scale_reward_max"]
            self.game.max_reward *= self.scale_reward_max
            self.game.min_reward *= self.scale_reward_max
            self.game.empty_reward *= self.scale_reward_max
            print("Scaling rewards by {}".format(self.scale_reward_max))

        #frequency parameters for updating the target network, output, saving, tensorboard, evaluation
        self.max_steps = self.config["max_steps"]
        self.output_freq = self.config["output_freq"]
        self.update_freq = self.config["update_freq"]
        self.target_network_update_mode = self.config[
            "target_network_update_mode"]
        self.target_network_update_tau = None
        self.target_network_update_freq = None
        if self.target_network_update_mode == "hard":
            self.target_network_update_freq = self.config[
                "target_network_update_freq"]
        else:
            self.target_network_update_tau = self.config[
                "target_network_update_tau"]
        self.eval_freq = self.config["eval_freq"]
        self.eval_steps = self.config["eval_steps"]
        self.tensorboard_log_freq = self.config["tensorboard_log_freq"]
        self.tensorboard_log_path = self.config["tensorboard_log_path"]
        self.save_freq = self.config["save_freq"]
        self.save_path = self.config["save_path"]
        self.batch_size = self.config["batch_size"]

        #parameters that change while training; these need to be saved and loaded
        self.curr_step = 0
        self.epsilon = self.epsilon_start
        self.memory_beta = self.memory_beta_start
        self.best_average_score = 0

        #create experience memory
        self.exp_memory = ExperienceMemory(self.memory_size, self.img_width,
                                           self.img_height,
                                           self.num_img_channels,
                                           self.memory_alpha)
        #create QLearner object, loading a saved neural network model if necessary
        self.qlearner = None
        if load_model:
            load_path = str(
                Path(config_or_model).joinpath("nn").joinpath("model"))
            self.qlearner = QLearner(
                self.nn_config,
                self.num_actions,
                self.img_width,
                self.img_height,
                self.num_img_channels,
                self.memory_size,
                load_model=load_path,
                target_network_update_tau=self.target_network_update_tau)
            self.curr_step = self.config["curr_step"]
            self.epsilon = self.config["epsilon"]
            self.memory_beta = self.config["memory_beta"]
            self.best_average_score = self.config["best_average_score"]
            print("Model loaded successfully")
            self.model_loaded = True
        else:
            self.qlearner = QLearner(
                self.nn_config,
                self.num_actions,
                self.img_width,
                self.img_height,
                self.num_img_channels,
                self.memory_size,
                target_network_update_tau=self.target_network_update_tau)
        if self.tensorboard_log_freq > 0:
            self.qlearner.add_tensorboard_ops(self.tensorboard_log_path)

    #return a new game instance
    def get_game(self):
        return self.game(self.width, self.height, self.image_scale_factor,
                         self.num_goals)

    #initialize experience memory by random play, i.e. at each step the agent
    #chooses an action uniformly at random
    def init_random_exp_memory(self, size):
        size = min(size, self.memory_size)
        game = self.get_game()
        self.exp_memory.add(game.get_state(), 0, 0, 0)
        for i in range(size):
            random_action = np.random.randint(0, self.num_actions)
            reward, is_terminal = game.execute_action(random_action)
            state = game.get_state()
            self.exp_memory.add(state, random_action, reward, is_terminal)
            if is_terminal:
                game.reset()
                self.exp_memory.add(game.get_state(), 0, 0, 0)

    #initialize experience memory with the epsilon-greedy policy
    def init_exp_memory(self, size):
        size = min(size, self.memory_size)
        game = self.get_game()
        self.exp_memory.add(game.get_state(), 0, 0, 0)
        for i in range(size):
            if np.random.rand() < self.epsilon:
                action = np.random.randint(0, self.num_actions)
            else:
                action = self.qlearner.compute_action(game.get_state())[0]
            reward, is_terminal = game.execute_action(action)
            state = game.get_state()
            self.exp_memory.add(state, action, reward, is_terminal)
            if is_terminal:
                game.reset()
                self.exp_memory.add(game.get_state(), 0, 0, 0)

    def train(self):
        if self.model_loaded:
            self.init_exp_memory(self.exp_memory_start_size)
        else:
            self.init_random_exp_memory(self.exp_memory_start_size)
        total_reward = 0.0
        games_played = 1
        game = self.get_game()
        self.exp_memory.add(game.get_state(), 0, 0, 0)
        while self.curr_step < self.max_steps:
            #play one game step according to the epsilon-greedy policy
            if np.random.rand() < self.epsilon:
                action = np.random.randint(0, self.num_actions)
            else:
                action = self.qlearner.compute_action(game.get_state())[0]
            reward, is_terminal = game.execute_action(action)
            self.exp_memory.add(game.get_state(), action, reward, is_terminal)
            if is_terminal:
                game.reset()
                self.exp_memory.add(game.get_state(), 0, 0, 0)
                games_played += 1
            total_reward += self.renormalize_reward(reward)

            #compute the next epsilon and importance-sampling exponent beta
            self.epsilon = np.maximum(self.epsilon_min,
                                      self.epsilon - self.epsilon_step)
            self.memory_beta = np.minimum(
                self.memory_beta_end, self.memory_beta + self.memory_beta_step)

            if self.curr_step % self.update_freq == 0:
                #sample a batch of transitions from experience memory
                s, a, r, s2, t, indices, p_values = self.exp_memory.sample(
                    self.batch_size)
                #output tensorboard summaries
                write_summary = False
                if (self.tensorboard_log_freq > 0) and (
                        self.curr_step % self.tensorboard_log_freq == 0):
                    write_summary = True
                #beta is divided by 2 here because the squared-error loss squares beta
                _, _, td = self.qlearner.train_step(
                    s,
                    a,
                    r,
                    s2,
                    t,
                    p_values,
                    self.memory_beta / 2.0,
                    write_summary=write_summary)
                self.exp_memory.update_p(indices, td)

            #update target network
            if self.target_network_update_mode == "soft":
                if self.curr_step % self.update_freq == 0:
                    self.qlearner.update_target_network()
            else:
                if self.curr_step % self.target_network_update_freq == 0:
                    self.qlearner.update_target_network()

            #output current training status
            if self.curr_step % self.output_freq == 0:
                average_reward = total_reward / games_played
                total_reward = 0
                games_played = 1
                print("step: {} epsilon: {} average reward per game: {}".
                      format(self.curr_step, self.epsilon, average_reward))

            #evaluate the current target network and save the model if the
            #average score per game has improved
            if self.curr_step % self.eval_freq == 0:
                score, num_games, average, max_score = self.eval(
                    self.eval_steps)
                print("Evaluating model with {} steps:".format(
                    self.eval_steps))
                print(
                    "Total score: {} Games: {} Average: {} Max: {}".format(
                        score, num_games, average, max_score))
                if average >= self.best_average_score:
                    print("Improved average score")
                    print("Saving model...")
                    self.save()
                    self.best_average_score = average
                #add average score to tensorboard; only log if the tensorboard
                #ops were actually created above
                if self.tensorboard_log_freq > 0:
                    summary = tf.Summary()
                    summary.value.add(tag='average_score',
                                      simple_value=average)
                    summary.value.add(tag='max_score', simple_value=max_score)
                    self.qlearner.summary_writer.add_summary(
                        summary, self.curr_step)
            self.curr_step += 1

    #evaluate the model for a given number of steps
    def eval(self, num_steps):
        game = self.get_game()
        total_score = 0.0
        current_score = 0.0
        num_games = 1.0
        max_score = 0.0
        for i in range(num_steps):
            action = self.qlearner.compute_action(game.get_state())[0]
            reward, is_terminal = game.execute_action(action)
            reward = self.renormalize_reward(reward)
            current_score += reward
            total_score += reward
            if is_terminal:
                game.reset()
                #don't count a game that would start on the last step
                if i < (num_steps - 1):
                    num_games += 1
                if current_score > max_score:
                    max_score = current_score
                current_score = 0
        average = total_score / num_games
        return total_score, num_games, average, max_score

    #compute original values for scaled rewards
    def renormalize_reward(self, reward):
        if self.scale_reward_max is not None:
            return reward / self.scale_reward_max
        return reward

    def load_config(self, filename):
        with open(filename, 'r') as fp:
            return json.load(fp)

    def save(self):
        base_path = Path(self.save_path)
        if not base_path.exists():
            base_path.mkdir()
        date_str = datetime.datetime.today().strftime("%Y-%m-%d--%H-%M")
        save_path = date_str + "--step" + str(self.curr_step)
        save_path = base_path.joinpath(save_path)
        #create the path if it doesn't exist
        if not save_path.exists():
            save_path.mkdir()
        self.config["epsilon"] = self.epsilon
        self.config["curr_step"] = self.curr_step
        self.config["memory_beta"] = self.memory_beta
        self.config["best_average_score"] = self.best_average_score
        #save config
        config_filename = save_path.joinpath("config.json")
        with config_filename.open('w') as fp:
            json.dump(self.config, fp, indent=4)
        #save neural network
        nn_path = save_path.joinpath("nn")
        if not nn_path.exists():
            nn_path.mkdir()
        self.qlearner.save_model(str(nn_path.joinpath("model")))

    #output game images
    def eval_with_images(self, num_steps, path):
        image_id = 0
        game = self.get_game()
        self.save_image(game.get_state(), path, image_id, 0, 0, 0, 0.0)
        total_score = 0
        games_finished = 0
        max_game_score = 0
        current_game_score = 0.0
        for i in range(num_steps):
            image_id += 1
            action = self.qlearner.compute_action(game.get_state())[0]
            reward, is_terminal = game.execute_action(action)
            reward = self.renormalize_reward(reward)
            total_score += reward
            current_game_score += reward
            self.save_image(game.get_state(), path, image_id, action, reward,
                            is_terminal, score=current_game_score)
            if is_terminal:
                game.reset()
                games_finished += 1
                if current_game_score > max_game_score:
                    max_game_score = current_game_score
                current_game_score = 0.0
                #save the initial state of the new game under a fresh id so
                #the terminal frame isn't overwritten
                image_id += 1
                self.save_image(game.get_state(), path, image_id, action,
                                reward, is_terminal,
                                score=current_game_score)
        print("Max score: {}".format(max_game_score))

    #output images for games whose score is above a given threshold
    def find_max_games(self, num_steps, path, score_threshold):
        image_id = 0
        game = self.get_game()
        frames = [(np.copy(game.get_state()), 0.0)]
        max_game_score = 0
        current_game_score = 0.0
        #guard against num_steps < 10, which would make the modulus zero
        progress_freq = max(1, num_steps // 10)
        for i in range(num_steps):
            if i % progress_freq == 0:
                print("At step {}".format(i))
            action = self.qlearner.compute_action(game.get_state())[0]
            reward, is_terminal = game.execute_action(action)
            reward = self.renormalize_reward(reward)
            current_game_score += reward
            frames.append((np.copy(game.get_state()), current_game_score))
            if is_terminal:
                game.reset()
                if current_game_score > max_game_score:
                    max_game_score = current_game_score
                if current_game_score > score_threshold:
                    print("Saving images...")
                    for frame in frames:
                        self.save_image(frame[0], path, image_id, 0, 0, 0,
                                        score=frame[1])
                        image_id += 1
                frames = [(np.copy(game.get_state()), 0.0)]
                current_game_score = 0.0
        print("Max score: {}".format(max_game_score))

    #output transition images
    def test_experience_memory(self, num_steps, path):
        image_id = 0
        self.init_random_exp_memory(self.exp_memory_start_size)
        #sample also returns memory indices and priorities, which aren't
        #needed here
        s, a, r, s2, t, _, _ = self.exp_memory.sample(num_steps)
        for i in range(num_steps):
            image_id += 1
            action = a[i]
            reward = r[i]
            is_terminal = t[i]
            self.save_transition(s[i], action, reward, s2[i], is_terminal,
                                 path, image_id)

    def save_transition(self, s, a, r, s2, t, path, image_id):
        self.save_image(self.combine_images(s, s2), path, image_id, a, r, t)

    #lay the stacked frames of two states out side by side, then stack the
    #two states with a separator line
    def combine_images(self, image1, image2, sep_width=10):
        image1 = np.squeeze(image1)
        image2 = np.squeeze(image2)
        shape = image1.shape
        sep = np.ones([shape[0], sep_width, self.num_img_channels],
                      dtype=float)
        #the number of stacked frames is derived from the channel dimension
        num_frames = shape[2] // self.num_img_channels
        frames1 = []
        frames2 = []
        for j in range(num_frames):
            start_index = j * self.num_img_channels
            end_index = (j + 1) * self.num_img_channels
            frames1.append(image1[:, :, start_index:end_index])
            frames2.append(image2[:, :, start_index:end_index])
            if j != (num_frames - 1):
                frames1.append(sep)
                frames2.append(sep)
        image1 = np.concatenate(frames1, axis=1)
        image2 = np.concatenate(frames2, axis=1)
        shape = image1.shape
        sep = np.ones([sep_width, shape[1], self.num_img_channels],
                      dtype=float)
        return np.concatenate((image2, sep, image1), axis=0)

    def save_image(self, img, path, image_id, action, reward, is_terminal,
                   score=None):
        save_file = Path(path).joinpath("img{}.png".format(image_id))
        with save_file.open('wb') as fp:
            fig = plt.figure()
            plt.imshow(np.squeeze(img), origin="lower")
            plt.axis("off")
            if score is not None:
                plt.title("Score: {}".format(score))
            else:
                plt.title("action: {} reward: {} terminal: {}".format(
                    self.game.action_names[action], reward, is_terminal))
            fig.savefig(fp, bbox_inches='tight', format="png")
            plt.close()
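
#a minimal usage sketch, not part of the original class: the filename
#"config.json" and the save directory below are placeholders, and the
#config file is assumed to contain all the keys read in QTrainer.__init__
if __name__ == "__main__":
    #train a new model from a config file
    trainer = QTrainer("config.json")
    trainer.train()

    #alternatively, resume training from a directory produced by
    #QTrainer.save(), e.g.:
    #trainer = QTrainer("saves/2024-01-01--12-00--step10000",
    #                   load_model=True)
    #trainer.train()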