def save(self):
    """
    Save a model.

    Used by train at the interval specified by save_interval.
    """
    path = f"models/{self.model_name}.pt"
    save(self.model, path)
    debug(f"model saved in {path}")
def load(self):
    """
    Load a model.

    Used by visualize to load a trained model.
    """
    files = [
        f.rpartition(".pt")[0] for f in os.listdir("models")
        if f != ".gitignore"
    ]
    if self.model_name not in files:
        valid_model_names = ", ".join(files)
        raise AlgorithmError(
            f"Choose a valid model name: {valid_model_names}")
    path = f"models/{self.model_name}.pt"
    debug(f"model loaded from {path}")
    return load(path)
def push(self, episode, reward, epsilon):
    # episodes per second since the previous push
    eps = (episode - self.current_episode) / (time.time() - self.current_time)
    self.current_episode = episode
    self.current_time = time.time()
    self.total_rewards.append(reward)
    mean_reward = np.mean(self.total_rewards[-100:])
    self.writer.add_scalar("epsilon", epsilon, episode)
    self.writer.add_scalar("episodes_per_second", eps, episode)
    self.writer.add_scalar("reward_avg_100", mean_reward, episode)
    self.writer.add_scalar("reward", reward, episode)
    if episode % self.debug_every == 0:
        debug(f"episode {episode:6d} finished - avg. reward: {mean_reward:.2f}")
def visualize(self):
    self.model = self.load()
    self.acmodel = self.model["acmodel"]
    self.acmodel.eval()
    env = gym.make(self.env.spec.id)
    env.seed(self.seed)
    self.env = SubprocVecEnv([env])
    self.obs = self.env.reset()
    done = False
    while True:
        action, _, _ = self.select_action(self.obs)
        obs, reward, done, _ = self.env.step(action.cpu().numpy())
        self.obs = obs
        self.env.render()
        time.sleep(1 / self.fps)
        if done[0]:
            debug(f"reward: {reward[0]}")
def run(action, args):
    algorithms = {}
    debug("algorithms detected:")
    for algorithm in ReinforcementLearningAlgorithm.subclasses:
        debug(algorithm.__name__)
        algorithms[algorithm.__name__] = algorithm
    debug("")
    try:
        if args.algorithm not in algorithms:
            valid_algorithms = ", ".join(algorithms.keys())
            error(f"choose a valid algorithm: {valid_algorithms}")
            return 1
        try:
            env = gym.make(args.environment)
        except gym.error.Error:
            valid_environments = "\n".join([
                env.id for env in gym.envs.registry.all()
                if env.id.startswith("MiniGrid")
            ])
            error(f"choose a valid gym environment:\n{valid_environments}")
            return 1
        algo = algorithms[args.algorithm](env=env, args=args)
        with logger.catch(reraise=True):
            if action == "train":
                algo.train()
            if action == "visualize":
                algo.visualize()
    except AlgorithmError as e:
        error(e.msg)
        return 1
    except Exception as e:
        error(e)
        return 1
    return 0
def __exit__(self, *args):
    debug("closing tensorboard")
    self.writer.close()
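# A minimal usage sketch for the logging helper above, assuming the
# surrounding class also defines a matching __enter__ and creates self.writer
# as a torch.utils.tensorboard.SummaryWriter (neither is shown here); the
# class name and constructor arguments below are hypothetical:
#
#     with TensorboardLogger(model_name="my_model", debug_every=100) as tb:
#         for episode in range(num_episodes):
#             reward, epsilon = ...  # run one episode
#             tb.push(episode, reward, epsilon)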
def train(self):
    Q = self.model["q_table"]
    eps = self.start_eps
    rewards = []
    for i in range(1, self.max_num_updates + 1):
        # reduce chance for random action
        if eps > self.end_eps:
            eps -= self.eps_decay
        if self.seed:
            self.env.seed(self.seed)
        obs = self.env.reset()
        obs = preprocess_obs(obs, self.q_table_length, self.discrete_obs_space)
        current_reward = 0
        done = False
        while True:
            # get q values
            q = Q[obs, :]
            # epsilon-greedy action selection
            if np.random.rand(1) < eps:
                # sample random action from action space
                a = self.env.action_space.sample()
            else:
                # choose action with highest Q value
                a = np.argmax(q)
            # get next observation, reward and done from environment
            next_obs, reward, done, _ = self.env.step(a)
            next_obs = preprocess_obs(next_obs, self.q_table_length,
                                      self.discrete_obs_space)
            # construct a target
            next_q_max = np.max(Q[next_obs, :])
            target_q = next_q_max * self.y + reward
            # update q-table with new knowledge
            Q[obs, a] = (1 - self.lr) * Q[obs, a] + self.lr * target_q
            # update variables for next iteration
            current_reward += reward
            obs = next_obs
            if self.render_interval != 0 and i % self.render_interval == 0:
                self.env.render()
                time.sleep(1 / self.fps)
            if done:
                break
        rewards.append(current_reward)
        if i % 100 == 0:
            debug(
                f"episode {i:5d} finished - avg. reward: {np.average(rewards[-100:]):.2f}"
            )
        if self.save_interval != 0 and i % self.save_interval == 0:
            self.save()
    success(f"all {self.max_num_updates:5d} episodes finished!")
    info(f"reward for the final episode: {rewards[-1]:.2f}")
    if self.save_interval != 0:
        self.save()
    debug("plotting reward over episodes")
    matplotlib.rcParams["figure.dpi"] = 100
    plt.plot(rewards)
    plt.plot(savgol_filter(rewards, 23, 3), "-r", linewidth=2.0)
    plt.title(self.model_name)
    plt.xlabel("episode")
    plt.ylabel("reward")
    plt.show()
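# For reference, the tabular update above is the standard one-step Q-learning
# rule, restated here with lr as the learning rate and y as the discount
# factor (this is a restatement of the code, not an addition to it):
#
#     Q(s, a) <- (1 - lr) * Q(s, a) + lr * (reward + y * max_a' Q(s', a'))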
def train(self):
    q_net = self.model["q_network"]
    q_net.train()
    # loss function, could experiment with alternatives like Huber loss (F.smooth_l1_loss) too
    criterion = F.mse_loss
    # optimizer, could experiment with alternatives like AdaBound (adabound.AdaBound) too
    optimizer = optim.SGD(q_net.parameters(), lr=self.lr)
    eps = self.start_eps
    rewards = []
    for i in range(1, self.max_num_updates + 1):
        # reduce chance for random action
        if eps > self.end_eps:
            eps -= self.eps_decay
        if self.seed:
            self.env.seed(self.seed)
        obs = self.env.reset()
        obs = preprocess_obs(obs, self.in_features, self.discrete_obs_space)
        current_reward = 0
        done = False
        while True:
            # get q values
            q = q_net(obs.unsqueeze(0))
            # epsilon-greedy action selection
            if np.random.rand(1) < eps:
                # sample random action from action space
                a = self.env.action_space.sample()
            else:
                with torch.no_grad():
                    # choose action with highest Q value
                    a = q.argmax().item()
            # get next observation, reward and done from environment
            next_obs, reward, done, _ = self.env.step(a)
            next_obs = preprocess_obs(next_obs, self.in_features,
                                      self.discrete_obs_space)
            # construct a target (compare this to a label in supervised learning) by taking
            # our current q values and replacing the q value for the action chosen with:
            # the max q value in the next observation * discount factor + the reward
            next_q = q_net(next_obs.unsqueeze(0))
            next_q_max = next_q.max().item()
            target_q = q.detach().clone()  # clone an independent copy
            target_q[0, a] = next_q_max * self.y + reward
            # compute loss
            loss = criterion(q, target_q)
            # optimize: backprop and update weights
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # update variables for next iteration
            current_reward += reward
            obs = next_obs
            if self.render_interval != 0 and i % self.render_interval == 0:
                self.env.render()
                time.sleep(1 / self.fps)
            if done:
                break
        rewards.append(current_reward)
        if i % 100 == 0:
            debug(
                f"episode {i:5d} finished - avg. reward: {np.average(rewards[-100:]):.2f}"
            )
        if self.save_interval != 0 and i % self.save_interval == 0:
            self.save()
    success(f"all {self.max_num_updates:5d} episodes finished!")
    info(f"reward for the final episode: {rewards[-1]:.2f}")
    if self.save_interval != 0:
        self.save()
    debug("plotting reward over episodes")
    matplotlib.rcParams["figure.dpi"] = 200
    plt.plot(rewards)
    plt.plot(savgol_filter(rewards, 23, 3), "-r", linewidth=2.0)
    plt.title(self.model_name)
    plt.xlabel("episode")
    plt.ylabel("reward")
    plt.show()
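# For reference, with the target constructed above the MSE loss is driven
# entirely by the one-step temporal-difference error of the chosen action a;
# every other entry of target_q equals the corresponding entry of q and so
# contributes zero. Up to averaging over the action dimension:
#
#     loss ∝ (Q(s, a) - (reward + y * max_a' Q(s', a')))^2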
def main():
    p = argparse.ArgumentParser()
    p.add_argument("--log-time-stamps", action="store_true", default=False)
    subp = p.add_subparsers(dest="subcmd_name")

    p_train = subp.add_parser("train",
                              formatter_class=argparse.RawTextHelpFormatter)
    p_train.add_argument(
        "--algorithm",
        type=str,
        required=True,
        metavar="algo",
        help="str: reinforcement learning algorithm algo to use.",
    )
    p_train.add_argument(
        "--environment",
        type=str,
        required=True,
        metavar="env",
        help="str: minigrid environment env to use.",
    )
    p_train.add_argument(
        "--learning-rate",
        type=float,
        default=None,
        metavar="α",
        help="float: learning rate α to use.",
    )
    p_train.add_argument(
        "--discount-factor",
        type=float,
        default=None,
        metavar="γ",
        help="float: discount factor γ to use.",
    )
    p_train.add_argument(
        "--start-eps",
        type=float,
        default=None,
        metavar="se",
        help="float: anneal epsilon used in epsilon-greedy from se.",
    )
    p_train.add_argument(
        "--end-eps",
        type=float,
        default=None,
        metavar="ee",
        help="float: anneal epsilon used in epsilon-greedy to ee.",
    )
    p_train.add_argument(
        "--annealing-steps",
        type=float,
        default=None,
        metavar="as",
        help="float: decay epsilon over as steps.",
    )
    p_train.add_argument(
        "--updates",
        type=int,
        default=None,
        metavar="n",
        help="int: train model for up to n updates.",
    )
    p_train.add_argument(
        "--render-interval",
        type=int,
        default=None,
        metavar="i",
        help="int: if i > 0, render every i:th episode.",
    )
    p_train.add_argument(
        "--save-interval",
        type=int,
        default=None,
        metavar="j",
        help="int: if j > 0, save model every j:th episode.",
    )
    p_train.add_argument(
        "--model-name",
        type=str,
        default=None,
        metavar="name",
        help="str: save model as models/<name>.pt when (if) the model is saved.",
    )
    p_train.add_argument("--seed",
                         type=int,
                         default=None,
                         metavar="seed",
                         help="int: seed used for all randomness.")
    p_train.add_argument(
        "--fps",
        type=int,
        default=None,
        metavar="fps",
        help="int: rendering delay = 1/fps + time to compute next action.",
    )
    p_train.add_argument("--tensorboard",
                         action="store_true",
                         help="bool: use tensorboard.")
    p_train.set_defaults(action="train")

    p_visualize = subp.add_parser("visualize")
    p_visualize.add_argument(
        "--algorithm",
        type=str,
        required=True,
        metavar="algo",
        help="str: reinforcement learning algorithm algo to use.",
    )
    p_visualize.add_argument(
        "--environment",
        type=str,
        required=True,
        metavar="env",
        help="str: minigrid environment env to use.",
    )
    p_visualize.add_argument(
        "--model-name",
        type=str,
        default=None,
        metavar="name",
        help="str: load model from models/<name>.pt.",
    )
    p_visualize.add_argument("--seed",
                             type=int,
                             default=None,
                             metavar="seed",
                             help="int: seed used for all randomness.")
    p_visualize.add_argument(
        "--fps",
        type=int,
        default=None,
        metavar="fps",
        help="int: rendering delay = 1/fps + time to compute next action.",
    )
    p_visualize.set_defaults(action="visualize")

    args = p.parse_args()

    fmt = get_format(args.log_time_stamps)
    config = {"handlers": [{"sink": stderr, "format": fmt}]}
    logger.configure(**config)

    if not hasattr(args, "action"):
        error("You need to select a subcommand {train, visualize}")
        info("\n" + p_train.format_usage() + p_visualize.format_usage())
        return 1

    try:
        result = run(args.action, args)
        debug(f"{args.subcmd_name} returned {result}")
    except KeyboardInterrupt:
        error("Interrupted by user")
        return 1

    return result
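# Example invocations (a sketch: the script name and the algorithm name
# "QLearning" are assumptions; valid algorithm names are the detected
# ReinforcementLearningAlgorithm subclasses, and valid environments are the
# installed MiniGrid environment ids):
#
#     python main.py train --algorithm QLearning \
#         --environment MiniGrid-Empty-8x8-v0 --model-name q_empty --tensorboard
#     python main.py visualize --algorithm QLearning \
#         --environment MiniGrid-Empty-8x8-v0 --model-name q_empty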