def _setup_learn(
    self,
    total_timesteps: int,
    eval_env: Optional[GymEnv],
    callback: Union[None, Callable, List[BaseCallback], BaseCallback] = None,
    eval_freq: int = 10000,
    n_eval_episodes: int = 5,
    log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
    tb_log_name: str = "run",
) -> Tuple[int, BaseCallback]:
    """
    Initialize different variables needed for training.

    :param total_timesteps: (int) The total number of samples (env steps) to train on
    :param eval_env: (Optional[GymEnv]) Environment to use for evaluation
    :param callback: (Union[None, Callable, List[BaseCallback], BaseCallback]) Callback(s) called at every step
    :param eval_freq: (int) How many steps between evaluations
    :param n_eval_episodes: (int) How many episodes to play per evaluation
    :param log_path: (Optional[str]) Path to a log folder
    :param reset_num_timesteps: (bool) Whether to reset or not the ``num_timesteps`` attribute
    :param tb_log_name: (str) the name of the run for tensorboard log
    :return: (Tuple[int, BaseCallback]) The total number of timesteps (shifted if continuing training) and the callback
    """
    self.start_time = time.time()
    self.ep_info_buffer = deque(maxlen=100)
    self.ep_success_buffer = deque(maxlen=100)

    if self.action_noise is not None:
        self.action_noise.reset()

    if reset_num_timesteps:
        self.num_timesteps = 0
        self._episode_num = 0
    else:
        # Make sure training timesteps are ahead of the internal counter
        total_timesteps += self.num_timesteps
    self._total_timesteps = total_timesteps

    # Avoid resetting the environment when calling ``.learn()`` consecutive times
    if reset_num_timesteps or self._last_obs is None:
        self._last_obs = self.env.reset()
        # Retrieve unnormalized observation for saving into the buffer
        if self._vec_normalize_env is not None:
            self._last_original_obs = self._vec_normalize_env.get_original_obs()

    if eval_env is not None and self.seed is not None:
        eval_env.seed(self.seed)

    eval_env = self._get_eval_env(eval_env)

    # Configure logger's outputs
    utils.configure_logger(self.verbose, self.tensorboard_log, tb_log_name, reset_num_timesteps)

    # Create eval callback if needed
    callback = self._init_callback(callback, eval_env, eval_freq, n_eval_episodes, log_path)

    return total_timesteps, callback
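# Usage sketch (an assumption, not part of the original file): ``_setup_learn`` is called
# internally by ``learn()``. With an SB3-style algorithm such as PPO, a second ``learn()``
# call with ``reset_num_timesteps=False`` exercises the branch above that shifts
# ``total_timesteps`` ahead of ``self.num_timesteps`` and skips the ``env.reset()``.
from stable_baselines3 import PPO

model = PPO("MlpPolicy", "CartPole-v1", verbose=0)
model.learn(total_timesteps=10_000, tb_log_name="run")
# Continue training: the timestep counter and the last observation are kept.
model.learn(total_timesteps=10_000, tb_log_name="run", reset_num_timesteps=False)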
def learn(self, total_timesteps, n_steps, n_iter, batch_size, save_path, tb_log_path=None):
    configure_logger(verbose=self.verbose, tensorboard_log=tb_log_path, tb_log_name="HAC", reset_num_timesteps=True)
    step_count = 0
    i_episode = 1
    while step_count <= total_timesteps:
        self.reward = 0
        self.timestep = 0
        state = self.env.reset()

        # Collect experience in the environment with the full hierarchy
        last_state, done, _step_count = self.run_HAC(
            self.env, self.k_level - 1, state, self.goal_state, is_subgoal_test=False
        )
        step_count += _step_count

        # Update with the collected data roughly every ``n_steps`` environment steps
        if step_count > n_steps * i_episode:
            vio_num = get_violation_count(self.env)
            if vio_num is not None:
                logger.record("rollout/violation", vio_num)
            logger.record("rollout/ep_rew_mean", self.reward)
            self.update(n_iter, batch_size)
            i_episode += 1
            logger.dump(step_count)

    self.save(save_path)
    return self
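# Invocation sketch (hypothetical: the HAC constructor arguments and the environment id
# below are assumptions, not taken from this file). ``learn`` alternates between collecting
# hierarchical rollouts with ``run_HAC`` and calling ``update`` roughly every ``n_steps``
# environment steps, logging and dumping metrics at each update.
import gym

env = gym.make("MountainCarContinuous-v0")        # placeholder environment
agent = HAC(env=env, k_level=2, goal_state=None)  # hypothetical constructor arguments
agent.learn(
    total_timesteps=100_000,
    n_steps=2_000,        # trigger an update roughly every 2000 collected steps
    n_iter=100,           # gradient iterations per update
    batch_size=256,
    save_path="./hac_checkpoint",
    tb_log_path="./tb_logs",
)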
def _setup_learn(
    self,
    total_timesteps: int,
    eval_env: Optional[GymEnv],
    callback: MaybeCallback = None,
    eval_freq: int = 10000,
    n_eval_episodes: int = 5,
    log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
    tb_log_name: str = "run",
) -> Tuple[int, BaseCallback]:
    """
    Initialize different variables needed for training.

    :param total_timesteps: The total number of samples (env steps) to train on
    :param eval_env: Environment to use for evaluation.
    :param callback: Callback(s) called at every step with state of the algorithm.
    :param eval_freq: How many steps between evaluations
    :param n_eval_episodes: How many episodes to play per evaluation
    :param log_path: Path to a folder where the evaluations will be saved
    :param reset_num_timesteps: Whether to reset or not the ``num_timesteps`` attribute
    :param tb_log_name: the name of the run for tensorboard log
    :return:
    """
    self.start_time = time.time()

    if self.ep_info_buffer is None or reset_num_timesteps:
        # Initialize buffers if they don't exist, or reinitialize if resetting counters
        self.ep_info_buffer = deque(maxlen=100)
        self.ep_success_buffer = deque(maxlen=100)

    if self.action_noise is not None:
        self.action_noise.reset()

    if reset_num_timesteps:
        self.num_timesteps = 0
        self._episode_num = 0
    else:
        # Make sure training timesteps are ahead of the internal counter
        total_timesteps += self.num_timesteps
    self._total_timesteps = total_timesteps

    # Avoid resetting the environment when calling ``.learn()`` consecutive times
    if reset_num_timesteps or self._last_obs is None:
        self._last_obs = self.env.reset()
        self._last_dones = np.zeros((self.env.num_envs,), dtype=bool)
        # Retrieve unnormalized observation for saving into the buffer
        if self._vec_normalize_env is not None:
            self._last_original_obs = self._vec_normalize_env.get_original_obs()

    if eval_env is not None and self.seed is not None:
        eval_env.seed(self.seed)

    eval_env = self._get_eval_env(eval_env)

    # Configure logger's outputs
    utils.configure_logger(self.verbose, self.tensorboard_log, tb_log_name, reset_num_timesteps)

    # Create eval callback if needed
    callback = self._init_callback(callback, eval_env, eval_freq, n_eval_episodes, log_path)

    return total_timesteps, callback
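# Sketch (assumes a stable_baselines3 1.x-style API where ``learn()`` still accepts
# evaluation arguments; not part of the original file). The ``eval_env`` / ``eval_freq`` /
# ``n_eval_episodes`` / ``log_path`` parameters above are turned into an eval callback by
# ``_init_callback``; from user code the equivalent is to pass them through ``learn()``:
import gym
from stable_baselines3 import SAC

eval_env = gym.make("Pendulum-v1")  # environment id may differ depending on the gym version
model = SAC("MlpPolicy", "Pendulum-v1", verbose=0)
model.learn(
    total_timesteps=50_000,
    eval_env=eval_env,
    eval_freq=10_000,
    n_eval_episodes=5,
    eval_log_path="./eval_logs",
)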
def main():  # noqa: C901
    parser = argparse.ArgumentParser()
    parser.add_argument("-tb", "--tensorboard-log", help="Tensorboard log dir", default="", type=str)
    parser.add_argument("--env", help="environment ID", type=str, default="CartPole-v1")
    parser.add_argument("-f", "--folder", help="Log folder", type=str, default="rl-trained-agents")
    parser.add_argument("--algo", help="RL Algorithm", default="ppo", type=str, required=False, choices=list(ALGOS.keys()))
    parser.add_argument("-n", "--n-timesteps", help="number of timesteps", default=1000, type=int)
    parser.add_argument("--num-threads", help="Number of threads for PyTorch (-1 to use default)", default=-1, type=int)
    parser.add_argument("--n-envs", help="number of environments", default=1, type=int)
    # parser.add_argument("--exp-id", help="Experiment ID (default: 0: latest, -1: no exp folder)", default=0, type=int)
    parser.add_argument("--exp-id", help="Experiment ID (default: 0: latest, -1: no exp folder)", default="0", type=str)
    parser.add_argument("--verbose", help="Verbose mode (0: no output, 1: INFO)", default=1, type=int)
    parser.add_argument("--no-render", action="store_true", default=False, help="Do not render the environment (useful for tests)")
    parser.add_argument("--render-mode", default="step", help="Whether to render at each step or at the end of an episode")
    parser.add_argument("--deterministic", action="store_true", default=False, help="Use deterministic actions")
    parser.add_argument("--load-best", action="store_true", default=False, help="Load best model instead of last model if available")
    parser.add_argument(
        "--load-checkpoint",
        type=int,
        help="Load checkpoint instead of last model if available, "
        "you must pass the number of timesteps corresponding to it",
    )
    parser.add_argument("--stochastic", action="store_true", default=False, help="Use stochastic actions")
    parser.add_argument("--norm-reward", action="store_true", default=False, help="Normalize reward if applicable (trained with VecNormalize)")
    parser.add_argument("--seed", help="Random generator seed", type=int, default=0)
    parser.add_argument("--info-freq", help="Frequency at which info values are logged", type=int, default=10)
    parser.add_argument("--reward-log", help="Where to log reward", default="", type=str)
    parser.add_argument(
        "--gym-packages",
        type=str,
        nargs="+",
        default=[],
        help="Additional external Gym environment package modules to import (e.g. gym_minigrid)",
    )
    parser.add_argument("--env-kwargs", type=str, nargs="+", action=StoreDict, help="Optional keyword argument to pass to the env constructor")
    args = parser.parse_args()

    # Go through custom gym packages to let them register in the global registry
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    env_id = args.env
    algo = args.algo
    folder = args.folder

    if args.exp_id == "0":
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print(f"Loading latest experiment, id={args.exp_id}")

    # Sanity checks
    if args.exp_id != "0" and args.exp_id != "-1":
        log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}")
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), f"The {log_path} folder was not found"

    found = False
    for ext in ["zip"]:
        model_path = os.path.join(log_path, f"{env_id}.{ext}")
        found = os.path.isfile(model_path)
        if found:
            break

    if args.load_best:
        model_path = os.path.join(log_path, "best_model.zip")
        found = os.path.isfile(model_path)

    if args.load_checkpoint is not None:
        model_path = os.path.join(log_path, f"rl_model_{args.load_checkpoint}_steps.zip")
        found = os.path.isfile(model_path)

    if not found:
        raise ValueError(f"No model found for {algo} on {env_id}, path: {model_path}")
    else:
        print(f"Loading model for {algo} on {env_id}, path: {model_path}")

    off_policy_algos = ["qrdqn", "dqn", "ddpg", "sac", "her", "td3", "tqc"]

    if algo in off_policy_algos:
        args.n_envs = 1

    set_random_seed(args.seed)

    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    is_atari = ExperimentManager.is_atari(env_id)

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(stats_path, norm_reward=args.norm_reward, test_mode=True)

    # Load env_kwargs if existing
    env_kwargs = {}
    args_path = os.path.join(log_path, env_id, "args.yml")
    if os.path.isfile(args_path):
        with open(args_path, "r") as f:
            loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader)  # pytype: disable=module-attr
            if loaded_args["env_kwargs"] is not None:
                env_kwargs = loaded_args["env_kwargs"]
    # Overwrite with command line arguments
    if args.env_kwargs is not None:
        env_kwargs.update(args.env_kwargs)

    log_dir = args.reward_log if args.reward_log != "" else None

    env = create_test_env(
        env_id,
        n_envs=args.n_envs,
        stats_path=stats_path,
        seed=args.seed,
        log_dir=log_dir,
        should_render=not args.no_render,
        hyperparams=hyperparams,
        env_kwargs=env_kwargs,
    )

    kwargs = dict(seed=args.seed)
    if algo in off_policy_algos:
        # Dummy buffer size as we don't need memory to enjoy the trained agent
        kwargs.update(dict(buffer_size=1))

    # Check if we are running python 3.8+
    # we need to patch saved models under python 3.6/3.7 to load them
    newer_python_version = sys.version_info.major == 3 and sys.version_info.minor >= 8

    custom_objects = {}
    if newer_python_version:
        custom_objects = {
            "learning_rate": 0.0,
            "lr_schedule": lambda _: 0.0,
            "clip_range": lambda _: 0.0,
        }

    model = ALGOS[algo].load(model_path, env=env, custom_objects=custom_objects, **kwargs)

    # tb_path = ""
    # for i in range(0, 100000, 1):
    #     tb_path = os.path.join(args.tensorboard_log, env_id, algo.upper() + "_" + str(i))
    #     if not os.path.exists(tb_path):
    #         break
    # print("algo=", algo, " logdir=", tb_path)
    # writer = SummaryWriter(log_dir=tb_path)

    obs = env.reset()

    # Deterministic by default except for atari games
    stochastic = args.stochastic or is_atari and not args.deterministic
    deterministic = not stochastic

    state = None
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    ep_count = 0
    # For HER, monitor success rate
    successes = []

    sbcommon_utils.configure_logger(args.verbose, os.path.join(args.tensorboard_log, env_id), algo.upper(), reset_num_timesteps=True)
    xlsx_logpath = os.path.join(args.tensorboard_log, env_id) if logger.get_dir() is None else logger.get_dir()
    xlsx_logger = Xlsx_Logger(xlsx_logpath, env_id)
    with open(os.path.join(xlsx_logpath, "args.yaml"), "w") as file:
        yaml.dump(args, file)

    fig: plt.Figure = None
    info_freq = args.info_freq
    try:
        for step in range(args.n_timesteps):
            action, state = model.predict(obs, state=state, deterministic=deterministic)
            obs, reward, done, infos = env.step(action)
            episode_reward += reward[0]
            ep_len += 1

            if args.n_envs == 1:
                # Log info variables to tensorboard
                if (step % info_freq == 0 or done) and type(infos[0]) is dict:
                    if not args.no_render:
                        if not done and args.render_mode == "step":
                            fig = env.render("human")
                        elif done and args.render_mode == "episode":
                            fig = env.envs[0].rendered_episode
                    xlsx_logger.set_step_ep(ep_count, step)
                    for key in infos[0]:
                        if key == "episode" or key == "terminal_observation" or key == "render":
                            continue
                        val = infos[0].get(key)
                        logger.record("eval/" + key, val, exclude="stdout")
                        xlsx_logger.log(key, val)
                    if fig is not None:
                        log_fig = logger.Figure(fig, False)
                        logger.record("eval/figure", log_fig, exclude="stdout")
                        # writer.add_scalar("eval/" + key, val, step)
                    logger.dump(step=step)

                # For atari the return reward is not the atari score
                # so we have to get it from the infos dict
                if is_atari and infos is not None and args.verbose >= 1:
                    episode_infos = infos[0].get("episode")
                    if episode_infos is not None:
                        print(f"Atari Episode Score: {episode_infos['r']:.2f}")
                        print("Atari Episode Length", episode_infos["l"])

                if done and not is_atari and args.verbose > 0:
                    # NOTE: for env using VecNormalize, the mean reward
                    # is a normalized reward when `--norm_reward` flag is passed
                    print("Episode #{}, step#{}".format(ep_count, step))
                    print(f"Episode Reward: {episode_reward:.2f}")
                    print("Episode Length", ep_len)
                    episode_rewards.append(episode_reward)
                    logger.record("eval/ep_len", ep_len, exclude="stdout")
                    logger.record("eval/ep_reward", episode_reward, exclude="stdout")
                    xlsx_logger.log("ep_len", ep_len)
                    xlsx_logger.log("reward", episode_reward)
                    logger.dump(step=step)
                    episode_lengths.append(ep_len)
                    episode_reward = 0.0
                    ep_len = 0
                    ep_count += 1
                    state = None

                # Reset also when the goal is achieved when using HER
                if done and infos[0].get("is_success") is not None:
                    if args.verbose > 1:
                        print("Success?", infos[0].get("is_success", False))

                    if infos[0].get("is_success") is not None:
                        successes.append(infos[0].get("is_success", False))
                        episode_reward, ep_len = 0.0, 0
                        ep_count += 1

            # if (not args.no_render) and args.render_mode == "step":
            #     fig = env.render("human")
            # else:
            #     fig = None

    except KeyboardInterrupt:
        pass

    logger.dump(step=step)
    xlsx_logger.close()

    if args.verbose > 0 and len(successes) > 0:
        print(f"Success rate: {100 * np.mean(successes):.2f}%")

    if args.verbose > 0 and len(episode_rewards) > 0:
        print(f"{len(episode_rewards)} Episodes")
        print(f"Mean reward: {np.mean(episode_rewards):.2f} +/- {np.std(episode_rewards):.2f}")

    if args.verbose > 0 and len(episode_lengths) > 0:
        print(f"Mean episode length: {np.mean(episode_lengths):.2f} +/- {np.std(episode_lengths):.2f}")

    env.close()
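# Example invocation (a sketch: the script name ``enjoy.py`` and the paths below are
# placeholders; the flags follow the argparse definitions above):
#
#   python enjoy.py --algo ppo --env CartPole-v1 -f rl-trained-agents -n 2000 \
#       --load-best --no-render -tb ./tb_eval_logs --info-freq 10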