def _dump_logs(self) -> None:
    """Write log."""
    fps = int(self.num_timesteps / (time.time() - self.start_time))
    logger.record("time/episodes", self._episode_num, exclude="tensorboard")
    if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
        logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
        logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
    logger.record("time/fps", fps)
    logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard")
    logger.record("time/total timesteps", self.num_timesteps, exclude="tensorboard")
    if self.use_sde:
        logger.record("train/std", (self.actor.get_std()).mean().item())
    if len(self.ep_success_buffer) > 0:
        logger.record("rollout/success rate", safe_mean(self.ep_success_buffer))
    # Pass the number of timesteps for tensorboard
    logger.dump(step=self.num_timesteps)
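# Minimal sketch (not from the original source) of the record/dump pattern that
# `_dump_logs` above relies on, assuming the older module-level logger API of
# stable-baselines3 used throughout these snippets: values are buffered by
# `logger.record` and only written out when `logger.dump` is called. Key names
# and the step value below are illustrative.
from stable_baselines3.common import logger

logger.record("rollout/ep_rew_mean", 123.4)
logger.record("time/fps", 60, exclude="tensorboard")  # kept out of the tensorboard writer
logger.dump(step=10_000)  # flush everything recorded for this step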
def test_main(tmp_path):
    """
    tests for the logger module
    """
    info("hi")
    debug("shouldn't appear")
    set_level(DEBUG)
    debug("should appear")
    configure(folder=str(tmp_path))
    record("a", 3)
    record("b", 2.5)
    dump()
    record("b", -2.5)
    record("a", 5.5)
    dump()
    info("^^^ should see a = 5.5")
    record_mean("b", -22.5)
    record_mean("b", -44.4)
    record("a", 5.5)
    dump()
    with ScopedConfigure(None, None):
        info("^^^ should see b = 33.3")
    with ScopedConfigure(str(tmp_path / "test-logger"), ["json"]):
        record("b", -2.5)
        dump()
    reset()
    record("a", "longasslongasslongasslongasslongasslongassvalue")
    dump()
    warn("hey")
    error("oh")
    record_dict({"test": 1})
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 1,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "OnPolicyAlgorithm",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> "OnPolicyAlgorithm":
    iteration = 0
    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name
    )

    callback.on_training_start(locals(), globals())

    # debug ===============================================================
    # `mode` is assumed to be a module-level debug flag defined elsewhere in this file.
    if mode == 'debug':
        print(['OPA.learn started, ready to loop (OPA.collect_rollouts + OPA.train)'])

    while self.num_timesteps < total_timesteps:
        continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)

        if continue_training is False:
            break

        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # debug ===========================================================
        if mode == 'debug':
            print(['OPA.learn', 'num_timesteps:', self.num_timesteps, 'total_timesteps:', total_timesteps])

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(self.num_timesteps / (time.time() - self.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard")
            logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            logger.dump(step=self.num_timesteps)

        # debug ===========================================================
        if mode == 'debug':
            print(['OPA.learn finished, ready to OPA.train'])

        self.train()

    callback.on_training_end()

    return self
def learn( self, total_timesteps: int, callback: MaybeCallback = None, log_interval: int = 1, eval_env: Optional[GymEnv] = None, eval_freq: int = -1, n_eval_episodes: int = 5, tb_log_name: str = "OnPolicyAlgorithm", eval_log_path: Optional[str] = None, reset_num_timesteps: bool = True, ) -> "OnPolicyAlgorithm": iteration = 0 total_timesteps, callback = self._setup_learn( total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name ) callback.on_training_start(locals(), globals()) while self.num_timesteps < total_timesteps: continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps) if continue_training is False: break iteration += 1 self._update_current_progress_remaining(self.num_timesteps, total_timesteps) # Display training infos if log_interval is not None and iteration % log_interval == 0: self.fps = int(self.num_timesteps / (time.time() - self.start_time)) logger.record("time/iterations", iteration, exclude="tensorboard") if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0: # logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer])) # logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer])) logger.record("rollout/ep_reward_mean", safe_mean([ep_info['r'] for ep_info in self.ep_info_buffer])) logger.record("rollout/ep_len_mean", safe_mean([ep_info['l'] for ep_info in self.ep_info_buffer])) if len(self.specific_reward_info_buffer) > 0 and len(self.specific_reward_info_buffer[0]) > 0: logger.record('rollout/mimic_qpos_reward', safe_mean([specific_reward_info['mimic_qpos_reward'] for specific_reward_info in self.specific_reward_info_buffer])) logger.record('rollout/mimic_qvel_reward', safe_mean([specific_reward_info['mimic_qvel_reward'] for specific_reward_info in self.specific_reward_info_buffer])) #logger.record('rollout/mimic_ee_reward', safe_mean([specific_reward_info['mimic_ee_reward'] for specific_reward_info in self.specific_reward_info_buffer])) logger.record('rollout/mimic_body_orientation_reward', safe_mean([specific_reward_info['mimic_body_orientation_reward'] for specific_reward_info in self.specific_reward_info_buffer])) logger.record('rollout/mimic_body_reward', safe_mean([specific_reward_info['mimic_body_reward'] for specific_reward_info in self.specific_reward_info_buffer])) logger.record('rollout/mimic_body_vel_reward', safe_mean([specific_reward_info['mimic_body_vel_reward'] for specific_reward_info in self.specific_reward_info_buffer])) logger.record('rollout/mimic_contact_reward', safe_mean([specific_reward_info['mimic_contact_reward'] for specific_reward_info in self.specific_reward_info_buffer])) logger.record("time/fps", self.fps) logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard") logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard") logger.dump(step=self.num_timesteps) self.train() callback.on_training_end() return self
def train(
    self,
    n_epochs: int = 100,
    *,
    on_epoch_end: Callable[[dict], None] = None,
    log_interval: int = 100,
):
    """Train with supervised learning for some number of epochs.

    Here an 'epoch' is just a complete pass through the expert transition dataset.

    Args:
        n_epochs: number of complete passes made through dataset.
        on_epoch_end: optional callback to run at the end of each epoch. Will
            receive all locals from this function as dictionary argument (!!).
        log_interval: log stats after every log_interval batches
    """
    assert self.batch_size >= 1
    samples_so_far = 0
    batch_num = 0
    for epoch_num in trange(n_epochs, desc="BC epoch"):
        while samples_so_far < (epoch_num + 1) * self.expert_dataset.size():
            batch_num += 1
            trans = self.expert_dataset.sample(self.batch_size)
            assert len(trans) == self.batch_size
            samples_so_far += self.batch_size

            obs_tensor = th.as_tensor(trans.obs).to(self.policy.device)
            acts_tensor = th.as_tensor(trans.acts).to(self.policy.device)
            loss, stats_dict = self._calculate_loss(obs_tensor, acts_tensor)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            stats_dict["epoch_num"] = epoch_num
            stats_dict["n_updates"] = batch_num
            stats_dict["batch_size"] = len(trans)

            if batch_num % log_interval == 0:
                for k, v in stats_dict.items():
                    logger.record(k, v)
                logger.dump(batch_num)
        if on_epoch_end is not None:
            on_epoch_end(locals())
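# Hypothetical usage sketch (not part of the original source): the docstring above notes
# that `on_epoch_end` receives this function's locals() as a dict, so a callback can read
# values such as `epoch_num` or the latest `stats_dict`. `bc_trainer` is an assumed,
# already-constructed instance of the class this method belongs to.
def print_epoch(train_locals: dict) -> None:
    # `epoch_num` and `stats_dict` are locals of train() at the time of the call
    print("finished epoch", train_locals["epoch_num"], train_locals["stats_dict"])

bc_trainer.train(n_epochs=10, on_epoch_end=print_epoch, log_interval=50)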
def learn(self, total_timesteps, log_interval, n_eval_episodes=5):
    start_time = time.time()
    iteration = 0
    while self.num_timesteps < total_timesteps:
        progress = round(self.num_timesteps / total_timesteps * 100, 2)
        self.collect_samples()
        iteration += 1
        if log_interval is not None and iteration % log_interval == 0:
            logger.record("Progress", str(progress) + '%')
            logger.record("time/total timesteps", self.num_timesteps)
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.record("rollout/ep_rew_mean", np.mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/ep_len_mean", np.mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
            fps = int(self.num_timesteps / (time.time() - start_time))
            logger.record("time/total_time", (time.time() - start_time))
            logger.dump(step=self.num_timesteps)
        self.train(self.rollout)
        # Note: np.random.randn() samples a standard normal, so this branch fires roughly
        # 60% of the time; np.random.rand() would give the (presumably intended) 25% chance.
        if np.random.randn() < 0.25:
            self.train_rnd(self.rollout)

    logger.record("Complete", '.')
    logger.record("time/total timesteps", self.num_timesteps)
    if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
        logger.record("rollout/ep_rew_mean", np.mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
        logger.record("rollout/ep_len_mean", np.mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
    fps = int(self.num_timesteps / (time.time() - start_time))
    logger.record("time/total_time", (time.time() - start_time))
    logger.dump(step=self.num_timesteps)
    return self
def train(
    self,
    *,
    n_epochs: Optional[int] = None,
    n_batches: Optional[int] = None,
    on_epoch_end: Callable[[], None] = None,
    log_interval: int = 100,
):
    """Train with supervised learning for some number of epochs.

    Here an 'epoch' is just a complete pass through the expert data loader,
    as set by `self.set_expert_data_loader()`.

    Args:
        n_epochs: Number of complete passes made through expert data before ending
            training. Provide exactly one of `n_epochs` and `n_batches`.
        n_batches: Number of batches loaded from dataset before ending training.
            Provide exactly one of `n_epochs` and `n_batches`.
        on_epoch_end: Optional callback with no parameters to run at the end of each epoch.
        log_interval: Log stats after every log_interval batches.
    """
    it = EpochOrBatchIteratorWithProgress(
        self.expert_data_loader,
        n_epochs=n_epochs,
        n_batches=n_batches,
        on_epoch_end=on_epoch_end,
    )

    batch_num = 0
    for batch, stats_dict_it in it:
        loss, stats_dict_loss = self._calculate_loss(batch["obs"], batch["acts"])

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if batch_num % log_interval == 0:
            for stats in [stats_dict_it, stats_dict_loss]:
                for k, v in stats.items():
                    logger.record(k, v)
            logger.dump(batch_num)
        batch_num += 1
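# Hypothetical usage sketch (not part of the original source): per the docstring above,
# exactly one of `n_epochs` and `n_batches` should be given. `bc_trainer` is an assumed,
# already-built instance whose expert data loader was set beforehand.
bc_trainer.train(n_epochs=5)        # stop after 5 complete passes through the expert data
# bc_trainer.train(n_batches=2000)  # ...or stop after a fixed number of batches instead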
def learn(self, total_timesteps, n_steps, n_iter, batch_size, save_path, tb_log_path=None):
    configure_logger(verbose=self.verbose, tensorboard_log=tb_log_path, tb_log_name="HAC", reset_num_timesteps=True)
    step_count = 0
    i_episode = 1
    while step_count <= total_timesteps:
        self.reward = 0
        self.timestep = 0

        state = self.env.reset()
        # collecting experience in environment
        last_state, done, _step_count = self.run_HAC(self.env, self.k_level - 1, state, self.goal_state, is_subgoal_test=False)
        step_count += _step_count

        # updating with collected data
        if step_count > n_steps * i_episode:
            vio_num = get_violation_count(self.env)
            if vio_num is not None:
                logger.record("rollout/violation", vio_num)
            logger.record(f"rollout/ep_rew_mean", self.reward)
            self.update(n_iter, batch_size)
            i_episode += 1
            logger.dump(step_count)

    self.save(save_path)
    return self
def test_no_accum(tmpdir):
    logger.configure(tmpdir, ["csv"])
    sb_logger.record("A", 1)
    sb_logger.record("B", 1)
    sb_logger.dump()
    sb_logger.record("A", 2)
    sb_logger.dump()
    sb_logger.record("B", 3)
    sb_logger.dump()
    expect = {"A": [1, 2, ""], "B": [1, "", 3]}
    _compare_csv_lines(osp.join(tmpdir, "progress.csv"), expect)
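# For reference, a reconstruction (not copied from a real run) of the progress.csv the
# `expect` dict above describes: the CSV writer keeps one column per key ever recorded
# and leaves blanks for keys missing at a given dump. Column order may differ in practice.
#
#   A,B
#   1,1
#   2,
#   ,3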
def learn( self, total_timesteps: int, callback: MaybeCallback = None, log_interval: int = 1, eval_env: Optional[GymEnv] = None, eval_freq: int = -1, n_eval_episodes: int = 5, tb_log_name: str = "OnPolicyAlgorithm", eval_log_path: Optional[str] = None, reset_num_timesteps: bool = True, param_noise: bool = False, sigma: float = 0.1, ) -> "OnPolicyAlgorithm": iteration = 0 total_timesteps, callback = self._setup_learn( total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name) callback.on_training_start(locals(), globals()) while self.num_timesteps < total_timesteps: #during rollout we collect batches of states and rewards continue_training = self.collect_rollouts( self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps, param_noise=param_noise, sigma=sigma) if continue_training is False: break iteration += 1 self._update_current_progress_remaining(self.num_timesteps, total_timesteps) # Display training infos if log_interval is not None and iteration % log_interval == 0: fps = int(self.num_timesteps / (time.time() - self.start_time)) logger.record("time/iterations", iteration, exclude="tensorboard") if len(self.ep_info_buffer) > 0 and len( self.ep_info_buffer[0]) > 0: logger.record( "rollout/ep_rew_mean", safe_mean( [ep_info["r"] for ep_info in self.ep_info_buffer])) logger.record( "rollout/ep_len_mean", safe_mean( [ep_info["l"] for ep_info in self.ep_info_buffer])) logger.record("time/fps", fps) logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard") logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard") logger.dump(step=self.num_timesteps) # during training gradient descent is done self.train(param_noise, sigma) if param_noise: sigma = self.update_sigma(sigma) # print("current_sigma") # print(sigma) callback.on_training_end() return self
def learn( self, total_timesteps: int, callback: MaybeCallback = None, log_interval: int = 1, eval_env: Optional[GymEnv] = None, eval_freq: int = -1, n_eval_episodes: int = 5, tb_log_name: str = "OnPolicyAlgorithm", eval_log_path: Optional[str] = None, reset_num_timesteps: bool = True, ) -> "OnPolicyAlgorithm": iteration = 0 total_timesteps, callback = self._setup_learn( total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name) callback.on_training_start(locals(), globals()) while self.num_timesteps < total_timesteps: # Collect n_steps (e.g. 512) number of steps. Total timesteps = n_steps * num_envs (e.g. 512 * 8 = 4096) # Hence each rollout has a total of 4096 timesteps continue_training = self.collect_rollouts( self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps) if continue_training is False: break iteration += 1 self._update_current_progress_remaining(self.num_timesteps, total_timesteps) # Display training infos if log_interval is not None and iteration % log_interval == 0: fps = int(self.num_timesteps / (time.time() - self.start_time)) logger.record("time/iterations", iteration, exclude="tensorboard") #logger.record("rollout/ep_rew_mean", safe_mean([goal_diff for goal_diff in self.ep_info_buffer])) #if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0: # logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer])) # logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer])) #logger.record("time/fps", fps) logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard") logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard") logger.dump(step=self.num_timesteps) # Save model every 50 iterations if iteration > 0 and iteration % 50 == 0: # Save Pytorch Model locally if self.model_checkpoints_path is not None: th.save( self.policy.state_dict(), self.model_checkpoints_path + f"/model_v{iteration}") # Save Pytorch model to wanb local dir and upload to wandb cloud dashboard if self.log_handler is not None: self.log_handler.save( self.model_checkpoints_path + f"/model_v{iteration}", base_path=self.model_checkpoints_path) # Save the best model if achieve a new high score if self.save_best_model: print( f"Model achieve best score: {self.best_score} at iteration {iteration}" ) # Save Pytorch Model locally if self.model_checkpoints_path is not None: th.save(self.policy.state_dict(), self.model_checkpoints_path + "/model_bestscore") # Save Pytorch model to wanb local dir and upload to wandb cloud dashboard if self.log_handler is not None: self.log_handler.save( self.model_checkpoints_path + "/model_bestscore", base_path=self.model_checkpoints_path) self.save_best_model = False # PPO Training self.train() callback.on_training_end() return self
def learn( self, total_timesteps: int, callback: MaybeCallback = None, log_interval: int = 100 * 16 * 5, eval_env: Optional[GymEnv] = None, eval_freq: int = -1, n_eval_episodes: int = 5, tb_log_name: str = "A2C", eval_log_path: Optional[str] = None, reset_num_timesteps: bool = True, ) -> "A2C": DEVICE = th.device('cuda') self.ep_info_buffer = deque(maxlen=100) self.ep_success_buffer = deque(maxlen=100) self.num_timesteps = 0 self._episode_num = 0 self._last_obs = self.env.reset() self._last_dones = np.zeros((self.env.num_envs, ), dtype=np.bool) self.rollout_buffer = buffers.RolloutBuffer( self.n_steps, self.env.observation_space, self.env.action_space, DEVICE, gamma=self.gamma, gae_lambda=1.0, n_envs=self.env.num_envs, ) self.policy = policies.ActorCriticPolicy( self.env.observation_space, self.env.action_space, lambda _: 7e-4, features_extractor_class=torch_layers.NatureCNN).to(DEVICE) writer = tensorboard.SummaryWriter( datetime.datetime.now().strftime('logs/a2c/%d-%m-%Y %H-%M')) while True: self.rollout_buffer.reset() for n_steps in range(self.n_steps): with th.no_grad(): # Convert to pytorch tensor obs_tensor = th.as_tensor(self._last_obs).to(DEVICE) actions, values, log_probs = self.policy.forward( obs_tensor) actions = actions.cpu().numpy() new_obs, rewards, dones, infos = self.env.step(actions) self.num_timesteps += self.env.num_envs self._update_info_buffer(infos) # Reshape in case of discrete action actions = actions.reshape(-1, 1) self.rollout_buffer.add(self._last_obs, actions, rewards, self._last_dones, values, log_probs) self._last_obs = new_obs self._last_dones = dones self.rollout_buffer.compute_returns_and_advantage(values, dones=dones) if self.num_timesteps % log_interval == 0: logger.dump(step=self.num_timesteps) writer.add_scalar( 'Score', np.mean([ep_info["r"] for ep_info in self.ep_info_buffer]), self.num_timesteps // log_interval) self.train()
def test_hard(tmpdir):
    logger.configure(tmpdir)

    # Part One: Test logging outside of the accumulating scope, and within scopes
    # with two different logging keys (including a repeat).
    sb_logger.record("no_context", 1)

    with logger.accumulate_means("disc"):
        sb_logger.record("C", 2)
        sb_logger.record("D", 2)
        sb_logger.dump()
        sb_logger.record("C", 4)
        sb_logger.dump()

    with logger.accumulate_means("gen"):
        sb_logger.record("E", 2)
        sb_logger.dump()
        sb_logger.record("E", 0)
        sb_logger.dump()

    with logger.accumulate_means("disc"):
        sb_logger.record("C", 3)
        sb_logger.dump()

    sb_logger.dump()  # Writes 1 mean each from "gen" and "disc".

    expect_raw_gen = {"raw/gen/E": [2, 0]}
    expect_raw_disc = {
        "raw/disc/C": [2, 4, 3],
        "raw/disc/D": [2, "", ""],
    }
    expect_default = {
        "mean/gen/E": [1],
        "mean/disc/C": [3],
        "mean/disc/D": [2],
        "no_context": [1],
    }

    _compare_csv_lines(osp.join(tmpdir, "progress.csv"), expect_default)
    _compare_csv_lines(osp.join(tmpdir, "raw", "gen", "progress.csv"), expect_raw_gen)
    _compare_csv_lines(osp.join(tmpdir, "raw", "disc", "progress.csv"), expect_raw_disc)

    # Part Two:
    # Check that we append to the same logs after the first dump to "means/*".
    with logger.accumulate_means("disc"):
        sb_logger.record("D", 100)
        sb_logger.dump()

    sb_logger.record("no_context", 2)
    sb_logger.dump()  # Writes 1 mean from "disc". "gen" is blank.

    expect_raw_gen = {"raw/gen/E": [2, 0]}
    expect_raw_disc = {
        "raw/disc/C": [2, 4, 3, ""],
        "raw/disc/D": [2, "", "", 100],
    }
    expect_default = {
        "mean/gen/E": [1, ""],
        "mean/disc/C": [3, ""],
        "mean/disc/D": [2, 100],
        "no_context": [1, 2],
    }

    _compare_csv_lines(osp.join(tmpdir, "progress.csv"), expect_default)
    _compare_csv_lines(osp.join(tmpdir, "raw", "gen", "progress.csv"), expect_raw_gen)
    _compare_csv_lines(osp.join(tmpdir, "raw", "disc", "progress.csv"), expect_raw_disc)
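# Minimal sketch of the pattern the test above exercises (behavior inferred from its
# `expect_*` dicts, not from additional documentation): inside `accumulate_means("disc")`,
# raw values are dumped under raw/disc/*, and only their mean reaches the top-level
# progress.csv (as mean/disc/*) at the next top-level dump. "some_dir" is illustrative.
logger.configure("some_dir")
with logger.accumulate_means("disc"):
    sb_logger.record("C", 1.0)
    sb_logger.dump()   # goes to raw/disc/progress.csv
sb_logger.dump()       # writes mean/disc/C to the top-level progress.csv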
def main(): def env_contr(): return gym.make("CartPole-v0") # # env = multiwalker_v0.env() # env = pad_observations(env) # env = pad_action_space(env) # markov_env = aec_to_markov(env) # venv = MarkovVectorEnv(markov_env) # return venv n_envs = 6 # def nest_env_const(): # cat = ConcatVecEnv([env_contr]*envs_per_proc) # return cat example_env = env_contr() num_envs = n_envs * 1 #example_env.num_envs #cat = ProcConcatVec([nest_env_const]*n_procs,example_env.observation_space, example_env.action_space, num_envs) cat = MakeCPUAsyncConstructor(0)([env_contr] * n_envs, example_env.observation_space, example_env.action_space) #, num_envs) cat = VecEnvWrapper(cat) env = cat policy = "MlpPolicy" logger = make_logger("log") stable_baselines3.common.logger.Logger.CURRENT = logger a2c = PPO(policy, cat, n_steps=4, batch_size=6, n_epochs=3) print(type(a2c.env)) #a2c.learn(1000000) total_timesteps, callback = a2c._setup_learn(10000, None, None, None, n_eval_episodes=5, reset_num_timesteps=None, tb_log_name="PPo") #total_timesteps = 100 iteration = 0 log_interval = 1 for i in range(total_timesteps): continue_training = a2c.collect_rollouts(env, callback, a2c.rollout_buffer, n_rollout_steps=a2c.n_steps) print(a2c.ep_info_buffer) if continue_training is False: break iteration += 1 a2c._update_current_progress_remaining(a2c.num_timesteps, total_timesteps) # Display training infos if log_interval is not None and iteration % log_interval == 0: fps = int(a2c.num_timesteps / (time.time() - a2c.start_time)) logger.record("time/iterations", iteration, exclude="tensorboard") print(a2c.ep_info_buffer) if len(a2c.ep_info_buffer) > 0 and len(a2c.ep_info_buffer[0]) > 0: logger.record( "rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in a2c.ep_info_buffer])) logger.record( "rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in a2c.ep_info_buffer])) logger.record("time/fps", fps) logger.record("time/time_elapsed", int(time.time() - a2c.start_time), exclude="tensorboard") logger.record("time/total_timesteps", a2c.num_timesteps, exclude="tensorboard") logger.dump(step=a2c.num_timesteps) a2c.train()
def learn( self, total_timesteps: int, callback: MaybeCallback = None, log_interval: int = 1, eval_env: Optional[GymEnv] = None, eval_freq: int = -1, n_eval_episodes: int = 5, tb_log_name: str = "OnPolicyAlgorithm", eval_log_path: Optional[str] = None, reset_num_timesteps: bool = True, ) -> "OnPolicyAlgorithm": iteration = 0 total_timesteps, callback = self._setup_learn( total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name) callback.on_training_start(locals(), globals()) while self.num_timesteps < total_timesteps: for partner_idx in range(self.policy.num_partners): try: self.env.envs[0].switch_to_env(partner_idx) except: pass continue_training = self.collect_rollouts( self.env, callback, self.rollout_buffer[partner_idx], n_rollout_steps=self.n_steps, partner_idx=partner_idx) #continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer[partner_idx], n_rollout_steps=self.n_steps) if continue_training is False: break iteration += 1 self._update_current_progress_remaining(self.num_timesteps, total_timesteps) # Display training infos if log_interval is not None and iteration % log_interval == 0: fps = int(self.num_timesteps / (time.time() - self.start_time)) logger.record("time/iterations", iteration, exclude="tensorboard") if len(self.ep_info_buffer) > 0 and len( self.ep_info_buffer[0]) > 0: logger.record( "rollout/ep_rew_mean", safe_mean( [ep_info["r"] for ep_info in self.ep_info_buffer])) logger.record( "rollout/ep_len_mean", safe_mean( [ep_info["l"] for ep_info in self.ep_info_buffer])) logger.record("time/fps", fps) logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard") logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard") logger.dump(step=self.num_timesteps) self.train() callback.on_training_end() return self
def collect_rollouts(self,  # noqa: C901
                     env: VecEnv,
                     # Type hint as string to avoid circular import
                     callback: 'BaseCallback',
                     n_episodes: int = 1,
                     n_steps: int = -1,
                     action_noise: Optional[ActionNoise] = None,
                     learning_starts: int = 0,
                     replay_buffer: Optional[ReplayBuffer] = None,
                     log_interval: Optional[int] = None) -> RolloutReturn:
    """
    Collect experiences and store them into a ReplayBuffer.

    :param env: (VecEnv) The training environment
    :param callback: (BaseCallback) Callback that will be called
        at each step (and at the beginning and end of the rollout)
    :param n_episodes: (int) Number of episodes to use to collect rollout data
        You can also specify a ``n_steps`` instead
    :param n_steps: (int) Number of steps to use to collect rollout data
        You can also specify a ``n_episodes`` instead.
    :param action_noise: (Optional[ActionNoise]) Action noise that will be used for exploration
        Required for deterministic policy (e.g. TD3). This can also be used
        in addition to the stochastic policy for SAC.
    :param learning_starts: (int) Number of steps before learning for the warm-up phase.
    :param replay_buffer: (ReplayBuffer)
    :param log_interval: (int) Log data every ``log_interval`` episodes
    :return: (RolloutReturn)
    """
    episode_rewards, total_timesteps = [], []
    total_steps, total_episodes = 0, 0

    assert isinstance(env, VecEnv), "You must pass a VecEnv"
    assert env.num_envs == 1, "OffPolicyAlgorithm only support single environment"

    if n_episodes > 0 and n_steps > 0:
        # Note we are referring to the constructor arguments
        # that are named `train_freq` and `n_episodes_rollout`
        # but correspond to `n_steps` and `n_episodes` here
        warnings.warn(
            "You passed a positive value for `train_freq` and `n_episodes_rollout`."
            "Please make sure this is intended. "
            "The agent will collect data by stepping in the environment "
            "until both conditions are true: "
            "`number of steps in the env` >= `train_freq` and "
            "`number of episodes` > `n_episodes_rollout`")

    if self.use_sde:
        self.actor.reset_noise()

    callback.on_rollout_start()
    continue_training = True

    while total_steps < n_steps or total_episodes < n_episodes:
        done = False
        episode_reward, episode_timesteps = 0.0, 0

        while not done:
            if self.use_sde and self.sde_sample_freq > 0 and total_steps % self.sde_sample_freq == 0:
                # Sample a new noise matrix
                self.actor.reset_noise()

            # Select action randomly or according to policy
            if self.num_timesteps < learning_starts and not (self.use_sde and self.use_sde_at_warmup):
                # Warmup phase
                unscaled_action = np.array([self.action_space.sample()])
            else:
                # Note: we assume that the policy uses tanh to scale the action
                # We use non-deterministic action in the case of SAC, for TD3, it does not matter
                unscaled_action, _ = self.predict(self._last_obs, deterministic=False)

            # Rescale the action from [low, high] to [-1, 1]
            if isinstance(self.action_space, gym.spaces.Box):
                scaled_action = self.policy.scale_action(unscaled_action)

                # Add noise to the action (improve exploration)
                if action_noise is not None:
                    # NOTE: in the original implementation of TD3, the noise was applied to the unscaled action
                    # Update(October 2019): Not anymore
                    scaled_action = np.clip(scaled_action + action_noise(), -1, 1)

                # We store the scaled action in the buffer
                buffer_action = scaled_action
                action = self.policy.unscale_action(scaled_action)
            else:
                # Discrete case, no need to normalize or clip
                buffer_action = unscaled_action
                action = buffer_action

            # Rescale and perform action
            new_obs, reward, done, infos = env.step(action)

            # Only stop training if return value is False, not when it is None.
            if callback.on_step() is False:
                return RolloutReturn(0.0, total_steps, total_episodes, continue_training=False)

            episode_reward += reward

            # Retrieve reward and episode length if using Monitor wrapper
            self._update_info_buffer(infos, done)

            # Store data in replay buffer
            if replay_buffer is not None:
                # Store only the unnormalized version
                if self._vec_normalize_env is not None:
                    new_obs_ = self._vec_normalize_env.get_original_obs()
                    reward_ = self._vec_normalize_env.get_original_reward()
                else:
                    # Avoid changing the original ones
                    self._last_original_obs, new_obs_, reward_ = self._last_obs, new_obs, reward

                replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done)

            self._last_obs = new_obs
            # Save the unnormalized observation
            if self._vec_normalize_env is not None:
                self._last_original_obs = new_obs_

            self.num_timesteps += 1
            episode_timesteps += 1
            total_steps += 1
            if 0 < n_steps <= total_steps:
                break

        if done:
            total_episodes += 1
            self._episode_num += 1
            episode_rewards.append(episode_reward)
            total_timesteps.append(episode_timesteps)

            if action_noise is not None:
                action_noise.reset()

            # Log training infos
            if log_interval is not None and self._episode_num % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/episodes", self._episode_num, exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                    logger.record('rollout/ep_rew_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buffer]))
                    logger.record('rollout/ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record('time/time_elapsed', int(time.time() - self.start_time), exclude="tensorboard")
                logger.record("time/total timesteps", self.num_timesteps, exclude="tensorboard")
                if self.use_sde:
                    logger.record("train/std", (self.actor.get_std()).mean().item())

                if len(self.ep_success_buffer) > 0:
                    logger.record('rollout/success rate', safe_mean(self.ep_success_buffer))
                # Pass the number of timesteps for tensorboard
                logger.dump(step=self.num_timesteps)

    mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0

    callback.on_rollout_end()

    return RolloutReturn(mean_reward, total_steps, total_episodes, continue_training)
def main(): # noqa: C901 parser = argparse.ArgumentParser() parser.add_argument("-tb", "--tensorboard-log", help="Tensorboard log dir", default="", type=str) parser.add_argument("--env", help="environment ID", type=str, default="CartPole-v1") parser.add_argument("-f", "--folder", help="Log folder", type=str, default="rl-trained-agents") parser.add_argument("--algo", help="RL Algorithm", default="ppo", type=str, required=False, choices=list(ALGOS.keys())) parser.add_argument("-n", "--n-timesteps", help="number of timesteps", default=1000, type=int) parser.add_argument( "--num-threads", help="Number of threads for PyTorch (-1 to use default)", default=-1, type=int) parser.add_argument("--n-envs", help="number of environments", default=1, type=int) # parser.add_argument("--exp-id", help="Experiment ID (default: 0: latest, -1: no exp folder)", default=0, type=int) parser.add_argument( "--exp-id", help="Experiment ID (default: 0: latest, -1: no exp folder)", default='0', type=str) parser.add_argument("--verbose", help="Verbose mode (0: no output, 1: INFO)", default=1, type=int) parser.add_argument( "--no-render", action="store_true", default=False, help="Do not render the environment (useful for tests)") parser.add_argument( "--render-mode", default='step', help="Whether to render at each step or at the end of an episode") parser.add_argument("--deterministic", action="store_true", default=False, help="Use deterministic actions") parser.add_argument( "--load-best", action="store_true", default=False, help="Load best model instead of last model if available") parser.add_argument( "--load-checkpoint", type=int, help="Load checkpoint instead of last model if available, " "you must pass the number of timesteps corresponding to it", ) parser.add_argument("--stochastic", action="store_true", default=False, help="Use stochastic actions") parser.add_argument( "--norm-reward", action="store_true", default=False, help="Normalize reward if applicable (trained with VecNormalize)") parser.add_argument("--seed", help="Random generator seed", type=int, default=0) parser.add_argument("--info-freq", help="Frequency on which info valuers are logged", type=int, default=10) parser.add_argument("--reward-log", help="Where to log reward", default="", type=str) parser.add_argument( "--gym-packages", type=str, nargs="+", default=[], help= "Additional external Gym environemnt package modules to import (e.g. 
gym_minigrid)", ) parser.add_argument( "--env-kwargs", type=str, nargs="+", action=StoreDict, help="Optional keyword argument to pass to the env constructor") args = parser.parse_args() # Going through custom gym packages to let them register in the global registory for env_module in args.gym_packages: importlib.import_module(env_module) env_id = args.env algo = args.algo folder = args.folder if args.exp_id == '0': args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id) print(f"Loading latest experiment, id={args.exp_id}") # Sanity checks if args.exp_id != '0' and args.exp_id != '-1': log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}") else: log_path = os.path.join(folder, algo) assert os.path.isdir(log_path), f"The {log_path} folder was not found" found = False for ext in ["zip"]: model_path = os.path.join(log_path, f"{env_id}.{ext}") found = os.path.isfile(model_path) if found: break if args.load_best: model_path = os.path.join(log_path, "best_model.zip") found = os.path.isfile(model_path) if args.load_checkpoint is not None: model_path = os.path.join( log_path, f"rl_model_{args.load_checkpoint}_steps.zip") found = os.path.isfile(model_path) if not found: raise ValueError( f"No model found for {algo} on {env_id}, path: {model_path}") else: print(f"Loading model for {algo} on {env_id}, path: {model_path}") off_policy_algos = ["qrdqn", "dqn", "ddpg", "sac", "her", "td3", "tqc"] if algo in off_policy_algos: args.n_envs = 1 set_random_seed(args.seed) if args.num_threads > 0: if args.verbose > 1: print(f"Setting torch.num_threads to {args.num_threads}") th.set_num_threads(args.num_threads) is_atari = ExperimentManager.is_atari(env_id) stats_path = os.path.join(log_path, env_id) hyperparams, stats_path = get_saved_hyperparams( stats_path, norm_reward=args.norm_reward, test_mode=True) # load env_kwargs if existing env_kwargs = {} args_path = os.path.join(log_path, env_id, "args.yml") if os.path.isfile(args_path): with open(args_path, "r") as f: loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader) # pytype: disable=module-attr if loaded_args["env_kwargs"] is not None: env_kwargs = loaded_args["env_kwargs"] # overwrite with command line arguments if args.env_kwargs is not None: env_kwargs.update(args.env_kwargs) log_dir = args.reward_log if args.reward_log != "" else None env = create_test_env( env_id, n_envs=args.n_envs, stats_path=stats_path, seed=args.seed, log_dir=log_dir, should_render=not args.no_render, hyperparams=hyperparams, env_kwargs=env_kwargs, ) kwargs = dict(seed=args.seed) if algo in off_policy_algos: # Dummy buffer size as we don't need memory to enjoy the trained agent kwargs.update(dict(buffer_size=1)) # Check if we are running python 3.8+ # we need to patch saved model under python 3.6/3.7 to load them newer_python_version = sys.version_info.major == 3 and sys.version_info.minor >= 8 custom_objects = {} if newer_python_version: custom_objects = { "learning_rate": 0.0, "lr_schedule": lambda _: 0.0, "clip_range": lambda _: 0.0, } model = ALGOS[algo].load(model_path, env=env, custom_objects=custom_objects, **kwargs) # tb_path = '' # for i in range(0,100000,1): # tb_path = os.path.join(args.tensorboard_log, env_id, algo.upper() + "_" + str(i)) # if not os.path.exists(tb_path): # break # print("algo=",algo, " logdir=", tb_path) # writer = SummaryWriter(log_dir=tb_path) obs = env.reset() # Deterministic by default except for atari games stochastic = args.stochastic or is_atari and not args.deterministic deterministic = not stochastic state = None 
episode_reward = 0.0 episode_rewards, episode_lengths = [], [] ep_len = 0 ep_count = 0 # For HER, monitor success rate successes = [] sbcommon_utils.configure_logger(args.verbose, os.path.join(args.tensorboard_log, env_id), algo.upper(), reset_num_timesteps=True) xlsx_logpath = os.path.join( args.tensorboard_log, env_id) if logger.get_dir() is None else logger.get_dir() xlsx_logger = Xlsx_Logger(xlsx_logpath, env_id) with open(os.path.join(xlsx_logpath, 'args.yaml'), 'w') as file: yaml.dump(args, file) fig: plt.Figure = None info_freq = args.info_freq try: for step in range(args.n_timesteps): action, state = model.predict(obs, state=state, deterministic=deterministic) obs, reward, done, infos = env.step(action) episode_reward += reward[0] ep_len += 1 if args.n_envs == 1: # log info variables to tensorboard if (step % info_freq == 0 or done) and type(infos[0]) is dict: if not args.no_render: if not done and args.render_mode == 'step': fig = env.render("human") elif done and args.render_mode == 'episode': fig = env.envs[0].rendered_episode xlsx_logger.set_step_ep(ep_count, step) for key in infos[0]: if key == 'episode' or key == 'terminal_observation' or key == 'render': continue val = infos[0].get(key) logger.record("eval/" + key, val, exclude='stdout') xlsx_logger.log(key, val) if fig is not None: log_fig = logger.Figure(fig, False) logger.record("eval/figure", log_fig, exclude='stdout') # writer.add_scalar("eval/"+key, val, step) logger.dump(step=step) # For atari the return reward is not the atari score # so we have to get it from the infos dict if is_atari and infos is not None and args.verbose >= 1: episode_infos = infos[0].get("episode") if episode_infos is not None: print(f"Atari Episode Score: {episode_infos['r']:.2f}") print("Atari Episode Length", episode_infos["l"]) if done and not is_atari and args.verbose > 0: # NOTE: for env using VecNormalize, the mean reward # is a normalized reward when `--norm_reward` flag is passed print("Episode #{}, step#{}".format(ep_count, step)) print(f" Episode Reward: {episode_reward:.2f}") print(" Episode Length", ep_len) episode_rewards.append(episode_reward) logger.record("eval/ep_len", ep_len, exclude='stdout') logger.record("eval/ep_reward", episode_reward, exclude='stdout') xlsx_logger.log('ep_len', ep_len) xlsx_logger.log('reward', episode_reward) logger.dump(step=step) episode_lengths.append(ep_len) episode_reward = 0.0 ep_len = 0 ep_count += 1 state = None # Reset also when the goal is achieved when using HER if done and infos[0].get("is_success") is not None: if args.verbose > 1: print("Success?", infos[0].get("is_success", False)) if infos[0].get("is_success") is not None: successes.append(infos[0].get("is_success", False)) episode_reward, ep_len = 0.0, 0 ep_count += 1 # if (not args.no_render) and args.render_mode=='step': # fig = env.render("human") # else: # fig = None except KeyboardInterrupt: pass logger.dump(step=step) xlsx_logger.close() if args.verbose > 0 and len(successes) > 0: print(f"Success rate: {100 * np.mean(successes):.2f}%") if args.verbose > 0 and len(episode_rewards) > 0: print(f"{len(episode_rewards)} Episodes") print( f"Mean reward: {np.mean(episode_rewards):.2f} +/- {np.std(episode_rewards):.2f}" ) if args.verbose > 0 and len(episode_lengths) > 0: print( f"Mean episode length: {np.mean(episode_lengths):.2f} +/- {np.std(episode_lengths):.2f}" ) env.close()
def train_normal(self):
    # Update optimizer learning rate
    self._update_learning_rate(self.policy.optimizer)
    # Compute current clip range
    clip_range = self.clip_range(self._current_progress_remaining)
    # Optional: clip range for the value function
    if self.clip_range_vf is not None:
        clip_range_vf = self.clip_range_vf(self._current_progress_remaining)

    entropy_losses, all_kl_divs = [], []
    pg_losses, value_losses = [], []
    clip_fractions = []

    # train for n_epochs epochs
    for epoch in range(self.n_epochs):
        approx_kl_divs = []
        # Do a complete pass on the rollout buffer
        for rollout_data in self.rollout_buffer.get(self.batch_size):
            actions = rollout_data.actions
            if isinstance(self.action_space, spaces.Discrete):
                # Convert discrete action from float to long
                actions = rollout_data.actions.long().flatten()

            # Re-sample the noise matrix because the log_std has changed
            # TODO: investigate why there is no issue with the gradient
            # if that line is commented (as in SAC)
            if self.use_sde:
                self.policy.reset_noise(self.batch_size)

            # Sida: Change the input to evaluate_actions()
            with self.policy.features_extractor.start_training(
                rollout_data.short_hidden_states, rollout_data.long_hidden_states
            ):
                values, log_prob, entropy = self.policy.evaluate_actions(rollout_data.observations, actions)

            values = values.flatten()
            # Normalize advantage
            advantages = rollout_data.advantages
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

            # ratio between old and new policy, should be one at the first iteration
            ratio = th.exp(log_prob - rollout_data.old_log_prob)

            # clipped surrogate loss
            policy_loss_1 = advantages * ratio
            policy_loss_2 = advantages * th.clamp(ratio, 1 - clip_range, 1 + clip_range)
            policy_loss = -th.min(policy_loss_1, policy_loss_2).mean()

            # Logging
            pg_losses.append(policy_loss.item())
            clip_fraction = th.mean((th.abs(ratio - 1) > clip_range).float()).item()
            clip_fractions.append(clip_fraction)

            if self.clip_range_vf is None:
                # No clipping
                values_pred = values
            else:
                # Clip the different between old and new value
                # NOTE: this depends on the reward scaling
                values_pred = rollout_data.old_values + th.clamp(values - rollout_data.old_values, -clip_range_vf, clip_range_vf)
            # Value loss using the TD(gae_lambda) target
            value_loss = F.mse_loss(rollout_data.returns, values_pred)
            value_losses.append(value_loss.item())

            # Entropy loss favor exploration
            if entropy is None:
                # Approximate entropy when no analytical form
                entropy_loss = -th.mean(-log_prob)
            else:
                entropy_loss = -th.mean(entropy)

            entropy_losses.append(entropy_loss.item())

            loss = policy_loss + self.ent_coef * entropy_loss + self.vf_coef * value_loss

            # Optimization step
            self.policy.optimizer.zero_grad()
            loss.backward()
            # Clip grad norm
            th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
            self.policy.optimizer.step()
            approx_kl_divs.append(th.mean(rollout_data.old_log_prob - log_prob).detach().cpu().numpy())

        all_kl_divs.append(np.mean(approx_kl_divs))

        if self.target_kl is not None and np.mean(approx_kl_divs) > 1.5 * self.target_kl:
            print(f"Early stopping at step {epoch} due to reaching max kl: {np.mean(approx_kl_divs):.2f}")
            break

    self._n_updates += self.n_epochs
    explained_var = explained_variance(self.rollout_buffer.values.flatten(), self.rollout_buffer.returns.flatten())

    # Logs
    logger.record("train_normal/entropy_loss", np.mean(entropy_losses))
    logger.record("train_normal/policy_gradient_loss", np.mean(pg_losses))
    logger.record("train_normal/value_loss", np.mean(value_losses))
    logger.record("train_normal/approx_kl", np.mean(approx_kl_divs))
    logger.record("train_normal/clip_fraction", np.mean(clip_fractions))
    logger.record("train_normal/loss", loss.item())
    logger.record("train_normal/explained_variance", explained_var)
    if hasattr(self.policy, "log_std"):
        logger.record("train_normal/std", th.exp(self.policy.log_std).mean().item())

    logger.record("train_normal/n_updates", self._n_updates, exclude="tensorboard")
    logger.record("train_normal/clip_range", clip_range)
    if self.clip_range_vf is not None:
        logger.record("train_normal/clip_range_vf", clip_range_vf)

    logger.dump(step=self.num_timesteps)
def learn( self, total_timesteps: int, callback: MaybeCallback = None, log_interval: int = 1, eval_env: Optional[GymEnv] = None, eval_freq: int = -1, n_eval_episodes: int = 5, tb_log_name: str = "OnPolicyAlgorithm", eval_log_path: Optional[str] = None, reset_num_timesteps: bool = True, ) -> "OnPolicyAlgorithm": iteration = 0 print('setup training') total_timesteps, callback = self._setup_learn( total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name) callback.on_training_start(locals(), globals()) print(f'start training, total timesteps is {total_timesteps}') while self.num_timesteps < total_timesteps: print(f'num timesteps: {self.num_timesteps}/{total_timesteps}') print(f'collect rollouts, rollout steps = {self.n_steps}') continue_training = self.collect_rollouts( self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps) if continue_training is False: print( 'stop training (only happens if callback on_step returns false)' ) break iteration += 1 self._update_current_progress_remaining(self.num_timesteps, total_timesteps) # Display training infos print('display training infos') # print(f'len(self.ep_info_buffer)={len(self.ep_info_buffer)}, len(self.ep_info_buffer[0])={len(self.ep_info_buffer[0])}') if log_interval is not None and iteration % log_interval == 0: fps = int(self.num_timesteps / (time.time() - self.start_time)) logger.record("time/iterations", iteration, exclude="tensorboard") if len(self.ep_info_buffer) > 0 and len( self.ep_info_buffer[0]) > 0: logger.record( "rollout/ep_rew_mean", safe_mean( [ep_info["r"] for ep_info in self.ep_info_buffer])) logger.record( "rollout/ep_len_mean", safe_mean( [ep_info["l"] for ep_info in self.ep_info_buffer])) logger.record("time/fps", fps) logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard") logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard") logger.dump(step=self.num_timesteps) print('train') self.train() callback.on_training_end() return self
def dump(step=0) -> None:
    """Alias for `stable_baselines3.logger.dump`."""
    sb_logger.dump(step)
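# Hypothetical usage sketch: the wrapper above simply forwards to the stable-baselines3
# logger, so the usual record-then-dump pattern applies. The key name is illustrative.
sb_logger.record("gen/loss", 0.5)
dump(step=1000)  # equivalent to sb_logger.dump(1000)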
def learn( self, total_timesteps: int, callback: MaybeCallback = None, log_interval: int = 1, eval_env: Optional[GymEnv] = None, eval_freq: int = -1, n_eval_episodes: int = 5, tb_log_name: str = "OnPolicyAlgorithm", eval_log_path: Optional[str] = None, reset_num_timesteps: bool = True, parameter_noise: bool = False, ) -> "OnPolicyAlgorithm": iteration = 0 total_timesteps, callback = self._setup_learn( total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name) callback.on_training_start(locals(), globals()) #initiatilizing value of noise std current_sigma = 1.0 while self.num_timesteps < total_timesteps: continue_training = self.collect_rollouts( self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps, parameter_noise=parameter_noise, sigma=0.5) if continue_training is False: break iteration += 1 self._update_current_progress_remaining(self.num_timesteps, total_timesteps) # Display training infos if log_interval is not None and iteration % log_interval == 0: fps = int(self.num_timesteps / (time.time() - self.start_time)) logger.record("time/iterations", iteration, exclude="tensorboard") if len(self.ep_info_buffer) > 0 and len( self.ep_info_buffer[0]) > 0: logger.record( "rollout/ep_rew_mean", safe_mean( [ep_info["r"] for ep_info in self.ep_info_buffer])) logger.record( "rollout/ep_len_mean", safe_mean( [ep_info["l"] for ep_info in self.ep_info_buffer])) logger.record("time/fps", fps) logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard") logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard") logger.dump(step=self.num_timesteps) self.train() if parameter_noise: states = self.rollout_buffer.observations states = th.tensor(states) actions_unnoisy, values_unnoisy, log_prob_unnoisy = self.policy( states, parameter_noise=False) actions_noisy, values_noisy, log_prob_noisy = self.policy( states, parameter_noise=True, sigma=current_sigma) distance = th.sum((actions_unnoisy - actions_noisy)**2)**0.5 distance_threshold = 1 sigma_scalefactor = 1.01 if distance > distance_threshold: current_sigma /= sigma_scalefactor else: current_sigma *= sigma_scalefactor callback.on_training_end() return self
def learn( self, total_timesteps: int, callback: MaybeCallback = None, log_interval: int = 1, eval_env: Optional[GymEnv] = None, eval_freq: int = -1, n_eval_episodes: int = 5, tb_log_name: str = "OnPolicyAlgorithm", eval_log_path: Optional[str] = None, reset_num_timesteps: bool = True, ) -> "OnPolicyAlgorithm": iteration = 0 total_timesteps, callback = self._setup_learn( total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name) callback.on_training_start(locals(), globals()) while self.num_timesteps < total_timesteps: rollout = self.collect_rollouts( self.env, n_episodes=-1, n_steps=1, action_noise=self.action_noise, callback=callback, learning_starts=self.learning_starts, replay_buffer=self.replay_buffer, log_interval=log_interval, ) if rollout.continue_training is False: break iteration += 1 self._update_current_progress_remaining(self.num_timesteps, total_timesteps) # Display training infos if log_interval is not None and iteration % log_interval == 0: fps = int(self.num_timesteps / (time.time() - self.start_time)) logger.record("time/iterations", iteration, exclude="tensorboard") if len(self.ep_info_buffer) > 0 and len( self.ep_info_buffer[0]) > 0: logger.record( "rollout/ep_rew_mean", safe_mean( [ep_info["r"] for ep_info in self.ep_info_buffer])) logger.record( "rollout/ep_len_mean", safe_mean( [ep_info["l"] for ep_info in self.ep_info_buffer])) logger.record("time/fps", fps) logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard") logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard") logger.dump(step=self.num_timesteps) if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts: # If no `gradient_steps` is specified, # do as many gradients steps as steps performed during the rollout self.train(gradient_steps=1, batch_size=self.batch_size) callback.on_training_end() return self
def test_main(tmp_path):
    """
    tests for the logger module
    """
    info("hi")
    debug("shouldn't appear")
    assert get_level() == INFO
    set_level(DEBUG)
    assert get_level() == DEBUG
    debug("should appear")
    configure(folder=str(tmp_path))
    assert get_dir() == str(tmp_path)
    record("a", 3)
    record("b", 2.5)
    dump()
    record("b", -2.5)
    record("a", 5.5)
    dump()
    info("^^^ should see a = 5.5")
    record("f", "this text \n \r should appear in one line")
    dump()
    info('^^^ should see f = "this text \n \r should appear in one line"')
    record_mean("b", -22.5)
    record_mean("b", -44.4)
    record("a", 5.5)
    dump()
    with ScopedConfigure(None, None):
        info("^^^ should see b = 33.3")
    with ScopedConfigure(str(tmp_path / "test-logger"), ["json"]):
        record("b", -2.5)
        dump()
    reset()
    record("a", "longasslongasslongasslongasslongasslongassvalue")
    dump()
    warn("hey")
    error("oh")
    record_dict({"test": 1})
    assert isinstance(get_log_dict(), dict) and set(get_log_dict().keys()) == {"test"}
def learn( self, total_timesteps: int, callback: MaybeCallback = None, log_interval: int = 1, eval_env: Optional[GymEnv] = None, eval_freq: int = -1, n_eval_episodes: int = 5, tb_log_name: str = "OnPolicyAlgorithm", eval_log_path: Optional[str] = None, reset_num_timesteps: bool = True, ) -> "OnPolicyAlgorithm": iteration = 0 total_timesteps, callback = self._setup_learn( total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name) callback.on_training_start(locals(), globals()) while self.num_timesteps < total_timesteps: continue_training = self.collect_rollouts( self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps) if continue_training is False: break iteration += 1 self._update_current_progress_remaining(self.num_timesteps, total_timesteps) # Display training infos if log_interval is not None and iteration % log_interval == 0: fps = int(self.num_timesteps / (time.time() - self.start_time)) logger.record("time/iterations", iteration, exclude="tensorboard") if len(self.ep_info_buffer) > 0 and len( self.ep_info_buffer[0]) > 0: logger.record( "rollout/ep_rew_mean", safe_mean( [ep_info["r"] for ep_info in self.ep_info_buffer])) logger.record( "rollout/ep_len_mean", safe_mean( [ep_info["l"] for ep_info in self.ep_info_buffer])) for k in self.ep_info_buffer[0].keys(): if k not in "lrt": logger.record( f"progress/{k}", safe_mean([ ep_info[k] for ep_info in self.ep_info_buffer ])) logger.record("time/fps", fps) logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard") logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard") logger.dump(step=self.num_timesteps) if iteration % (log_interval * 10) == 0: #save parameters every 10 log steps self.save('./interim_trained_models/') self.train() callback.on_training_end() return self