Example No. 1
    def _dump_logs(self) -> None:
        """
        Write log.
        """
        fps = int(self.num_timesteps / (time.time() - self.start_time))
        logger.record("time/episodes",
                      self._episode_num,
                      exclude="tensorboard")
        if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
            logger.record(
                "rollout/ep_rew_mean",
                safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
            logger.record(
                "rollout/ep_len_mean",
                safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
        logger.record("time/fps", fps)
        logger.record("time/time_elapsed",
                      int(time.time() - self.start_time),
                      exclude="tensorboard")
        logger.record("time/total timesteps",
                      self.num_timesteps,
                      exclude="tensorboard")
        if self.use_sde:
            logger.record("train/std", (self.actor.get_std()).mean().item())

        if len(self.ep_success_buffer) > 0:
            logger.record("rollout/success rate",
                          safe_mean(self.ep_success_buffer))
        # Pass the number of timesteps for tensorboard
        logger.dump(step=self.num_timesteps)
Example No. 2
def test_main(tmp_path):
    """
    tests for the logger module
    """
    info("hi")
    debug("shouldn't appear")
    set_level(DEBUG)
    debug("should appear")
    configure(folder=str(tmp_path))
    record("a", 3)
    record("b", 2.5)
    dump()
    record("b", -2.5)
    record("a", 5.5)
    dump()
    info("^^^ should see a = 5.5")
    record_mean("b", -22.5)
    record_mean("b", -44.4)
    record("a", 5.5)
    dump()
    with ScopedConfigure(None, None):
        info("^^^ should see b = 33.3")

    with ScopedConfigure(str(tmp_path / "test-logger"), ["json"]):
        record("b", -2.5)
        dump()

    reset()
    record("a", "longasslongasslongasslongasslongasslongassvalue")
    dump()
    warn("hey")
    error("oh")
    record_dict({"test": 1})
Example No. 3
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0
        
        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name
        )

        callback.on_training_start(locals(), globals())
        
        # debug ===============================================================
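        # NOTE: 'mode' is presumably a module-level debug flag defined elsewhere in this project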
        if mode == 'debug':
            print(['OPA.learn started, ready to loop (OPA.collect_rollouts + OPA.train)'])
            
        
        while self.num_timesteps < total_timesteps:

            continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps, total_timesteps)
            
            # debug ===========================================================
            if mode == 'debug':
                print(['OPA.learn', 'num_timesteps:', self.num_timesteps, 'total_timesteps:', total_timesteps])

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations", iteration, exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                    logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard")
                logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
                logger.dump(step=self.num_timesteps)
            # debug ===============================================================
            if mode == 'debug':
                print(['OPA.learn finished, ready to OPA.train'])
            self.train()

        callback.on_training_end()

        return self
Example No. 4
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name
        )

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:

            continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                self.fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations", iteration, exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                    # logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                    # logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
                    logger.record("rollout/ep_reward_mean", safe_mean([ep_info['r'] for ep_info in self.ep_info_buffer]))
                    logger.record("rollout/ep_len_mean", safe_mean([ep_info['l'] for ep_info in self.ep_info_buffer]))
                if len(self.specific_reward_info_buffer) > 0 and len(self.specific_reward_info_buffer[0]) > 0:
                    logger.record('rollout/mimic_qpos_reward', safe_mean([specific_reward_info['mimic_qpos_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                    logger.record('rollout/mimic_qvel_reward', safe_mean([specific_reward_info['mimic_qvel_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                    #logger.record('rollout/mimic_ee_reward', safe_mean([specific_reward_info['mimic_ee_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                    logger.record('rollout/mimic_body_orientation_reward', safe_mean([specific_reward_info['mimic_body_orientation_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                    logger.record('rollout/mimic_body_reward', safe_mean([specific_reward_info['mimic_body_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                    logger.record('rollout/mimic_body_vel_reward', safe_mean([specific_reward_info['mimic_body_vel_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                    logger.record('rollout/mimic_contact_reward', safe_mean([specific_reward_info['mimic_contact_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                logger.record("time/fps", self.fps)
                logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard")
                logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            self.train()

        callback.on_training_end()

        return self
Example No. 5
    def train(
        self,
        n_epochs: int = 100,
        *,
        on_epoch_end: Callable[[dict], None] = None,
        log_interval: int = 100,
    ):
        """Train with supervised learning for some number of epochs.

        Here an 'epoch' is just a complete pass through the expert transition
        dataset.

        Args:
          n_epochs: number of complete passes made through dataset.
          on_epoch_end: optional callback to run at
            the end of each epoch. Will receive all locals from this function as
            dictionary argument (!!).
          log_interval: log stats after every log_interval batches
        """
        assert self.batch_size >= 1
        samples_so_far = 0
        batch_num = 0
        for epoch_num in trange(n_epochs, desc="BC epoch"):
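            # draw batches until roughly one full dataset's worth of samples is consumed this epoch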
            while samples_so_far < (epoch_num + 1) * self.expert_dataset.size():
                batch_num += 1
                trans = self.expert_dataset.sample(self.batch_size)
                assert len(trans) == self.batch_size
                samples_so_far += self.batch_size

                obs_tensor = th.as_tensor(trans.obs).to(self.policy.device)
                acts_tensor = th.as_tensor(trans.acts).to(self.policy.device)
                loss, stats_dict = self._calculate_loss(obs_tensor, acts_tensor)

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                stats_dict["epoch_num"] = epoch_num
                stats_dict["n_updates"] = batch_num
                stats_dict["batch_size"] = len(trans)

                if batch_num % log_interval == 0:
                    for k, v in stats_dict.items():
                        logger.record(k, v)
                    logger.dump(batch_num)

            if on_epoch_end is not None:
                on_epoch_end(locals())
Example No. 6
    def learn(self, total_timesteps, log_interval, n_eval_episodes=5):
        start_time = time.time()
        iteration = 0

        while self.num_timesteps < total_timesteps:
            progress = round(self.num_timesteps / total_timesteps * 100, 2)
            self.collect_samples()

            iteration += 1
            if log_interval is not None and iteration % log_interval == 0:
                logger.record("Progress", str(progress) + '%')
                logger.record("time/total timesteps", self.num_timesteps)
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        np.mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record(
                        "rollout/ep_len_mean",
                        np.mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))
                fps = int(self.num_timesteps / (time.time() - start_time))
                logger.record("time/total_time", (time.time() - start_time))
                logger.dump(step=self.num_timesteps)

            self.train(self.rollout)
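            # note: np.random.randn() samples a standard normal, so this branch runs
            # ~60% of the time (P(Z < 0.25) ≈ 0.6), not 25%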
            if np.random.randn() < 0.25:
                self.train_rnd(self.rollout)

        logger.record("Complete", '.')
        logger.record("time/total timesteps", self.num_timesteps)
        if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
            logger.record(
                "rollout/ep_rew_mean",
                np.mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
            logger.record(
                "rollout/ep_len_mean",
                np.mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
        fps = int(self.num_timesteps / (time.time() - start_time))
        logger.record("time/total_time", (time.time() - start_time))
        logger.dump(step=self.num_timesteps)

        return self
Example No. 7
    def train(
        self,
        *,
        n_epochs: Optional[int] = None,
        n_batches: Optional[int] = None,
        on_epoch_end: Callable[[], None] = None,
        log_interval: int = 100,
    ):
        """Train with supervised learning for some number of epochs.

        Here an 'epoch' is just a complete pass through the expert data loader,
        as set by `self.set_expert_data_loader()`.

        Args:
            n_epochs: Number of complete passes made through expert data before ending
                training. Provide exactly one of `n_epochs` and `n_batches`.
            n_batches: Number of batches loaded from dataset before ending training.
                Provide exactly one of `n_epochs` and `n_batches`.
            on_epoch_end: Optional callback with no parameters to run at the end of each
                epoch.
            log_interval: Log stats after every log_interval batches.
        """
        it = EpochOrBatchIteratorWithProgress(
            self.expert_data_loader,
            n_epochs=n_epochs,
            n_batches=n_batches,
            on_epoch_end=on_epoch_end,
        )

        batch_num = 0
        for batch, stats_dict_it in it:
            loss, stats_dict_loss = self._calculate_loss(
                batch["obs"], batch["acts"])

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            if batch_num % log_interval == 0:
                for stats in [stats_dict_it, stats_dict_loss]:
                    for k, v in stats.items():
                        logger.record(k, v)
                logger.dump(batch_num)
            batch_num += 1
Example No. 8
    def learn(self,
              total_timesteps,
              n_steps,
              n_iter,
              batch_size,
              save_path,
              tb_log_path=None):
        configure_logger(verbose=self.verbose,
                         tensorboard_log=tb_log_path,
                         tb_log_name="HAC",
                         reset_num_timesteps=True)

        step_count = 0
        i_episode = 1
        while step_count <= total_timesteps:
            self.reward = 0
            self.timestep = 0

            state = self.env.reset()
            # collecting experience in environment
            last_state, done, _step_count = self.run_HAC(self.env,
                                                         self.k_level - 1,
                                                         state,
                                                         self.goal_state,
                                                         is_subgoal_test=False)
            step_count += _step_count

            # updating with collected data
            if step_count > n_steps * i_episode:
                vio_num = get_violation_count(self.env)
                if vio_num is not None:
                    logger.record("rollout/violation", vio_num)
                logger.record(f"rollout/ep_rew_mean", self.reward)

                self.update(n_iter, batch_size)
                i_episode += 1

                logger.dump(step_count)

        self.save(save_path)
        return self
Example No. 9
def test_no_accum(tmpdir):
    logger.configure(tmpdir, ["csv"])
    sb_logger.record("A", 1)
    sb_logger.record("B", 1)
    sb_logger.dump()
    sb_logger.record("A", 2)
    sb_logger.dump()
    sb_logger.record("B", 3)
    sb_logger.dump()
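    # Keys that were not recorded before a given dump appear as empty cells in that CSV row.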
    expect = {"A": [1, 2, ""], "B": [1, "", 3]}
    _compare_csv_lines(osp.join(tmpdir, "progress.csv"), expect)
Example No. 10
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
        param_noise: bool = False,
        sigma: float = 0.1,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:

            # during rollout we collect batches of states and rewards
            continue_training = self.collect_rollouts(
                self.env,
                callback,
                self.rollout_buffer,
                n_rollout_steps=self.n_steps,
                param_noise=param_noise,
                sigma=sigma)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations",
                              iteration,
                              exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        safe_mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record(
                        "rollout/ep_len_mean",
                        safe_mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record("time/time_elapsed",
                              int(time.time() - self.start_time),
                              exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps,
                              exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            # during training gradient descent is done
            self.train(param_noise, sigma)

            if param_noise:
                sigma = self.update_sigma(sigma)
                # print("current_sigma")
                # print(sigma)

        callback.on_training_end()

        return self
Example No. 11
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:

            # Collect n_steps (e.g. 512) number of steps. Total timesteps = n_steps * num_envs (e.g. 512 * 8 = 4096)
            # Hence each rollout has a total of 4096 timesteps
            continue_training = self.collect_rollouts(
                self.env,
                callback,
                self.rollout_buffer,
                n_rollout_steps=self.n_steps)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations",
                              iteration,
                              exclude="tensorboard")
                #logger.record("rollout/ep_rew_mean", safe_mean([goal_diff for goal_diff in self.ep_info_buffer]))
                #if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                #    logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                #    logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
                #logger.record("time/fps", fps)
                logger.record("time/time_elapsed",
                              int(time.time() - self.start_time),
                              exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps,
                              exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            # Save model every 50 iterations
            if iteration > 0 and iteration % 50 == 0:
                # Save Pytorch Model locally
                if self.model_checkpoints_path is not None:
                    th.save(
                        self.policy.state_dict(),
                        self.model_checkpoints_path + f"/model_v{iteration}")

                    # Save PyTorch model to wandb local dir and upload to wandb cloud dashboard
                    if self.log_handler is not None:
                        self.log_handler.save(
                            self.model_checkpoints_path +
                            f"/model_v{iteration}",
                            base_path=self.model_checkpoints_path)

            # Save the best model if it achieves a new high score
            if self.save_best_model:
                print(
                    f"Model achieved best score: {self.best_score} at iteration {iteration}"
                )

                # Save Pytorch Model locally
                if self.model_checkpoints_path is not None:
                    th.save(self.policy.state_dict(),
                            self.model_checkpoints_path + "/model_bestscore")

                    # Save PyTorch model to wandb local dir and upload to wandb cloud dashboard
                    if self.log_handler is not None:
                        self.log_handler.save(
                            self.model_checkpoints_path + "/model_bestscore",
                            base_path=self.model_checkpoints_path)

                self.save_best_model = False

            # PPO Training
            self.train()

        callback.on_training_end()

        return self
Example No. 12
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 100 * 16 * 5,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "A2C",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "A2C":
        DEVICE = th.device('cuda')
        self.ep_info_buffer = deque(maxlen=100)
        self.ep_success_buffer = deque(maxlen=100)
        self.num_timesteps = 0
        self._episode_num = 0
        self._last_obs = self.env.reset()
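        # note: np.bool is deprecated in recent NumPy releases; the built-in bool
        # (or np.bool_) is the drop-in replacement here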
        self._last_dones = np.zeros((self.env.num_envs, ), dtype=np.bool)

        self.rollout_buffer = buffers.RolloutBuffer(
            self.n_steps,
            self.env.observation_space,
            self.env.action_space,
            DEVICE,
            gamma=self.gamma,
            gae_lambda=1.0,
            n_envs=self.env.num_envs,
        )
        self.policy = policies.ActorCriticPolicy(
            self.env.observation_space,
            self.env.action_space,
            lambda _: 7e-4,
            features_extractor_class=torch_layers.NatureCNN).to(DEVICE)

        writer = tensorboard.SummaryWriter(
            datetime.datetime.now().strftime('logs/a2c/%d-%m-%Y %H-%M'))
        while True:
            self.rollout_buffer.reset()

            for n_steps in range(self.n_steps):
                with th.no_grad():
                    # Convert to pytorch tensor
                    obs_tensor = th.as_tensor(self._last_obs).to(DEVICE)
                    actions, values, log_probs = self.policy.forward(
                        obs_tensor)
                actions = actions.cpu().numpy()

                new_obs, rewards, dones, infos = self.env.step(actions)

                self.num_timesteps += self.env.num_envs

                self._update_info_buffer(infos)

                # Reshape in case of discrete action
                actions = actions.reshape(-1, 1)
                self.rollout_buffer.add(self._last_obs, actions, rewards,
                                        self._last_dones, values, log_probs)
                self._last_obs = new_obs
                self._last_dones = dones

            self.rollout_buffer.compute_returns_and_advantage(values,
                                                              dones=dones)

            if self.num_timesteps % log_interval == 0:
                logger.dump(step=self.num_timesteps)
                writer.add_scalar(
                    'Score',
                    np.mean([ep_info["r"] for ep_info in self.ep_info_buffer]),
                    self.num_timesteps // log_interval)
            self.train()
Example No. 13
def test_hard(tmpdir):
    logger.configure(tmpdir)

    # Part One: Test logging outside of the accumulating scope, and within scopes
    # with two different different logging keys (including a repeat).

    sb_logger.record("no_context", 1)

    with logger.accumulate_means("disc"):
        sb_logger.record("C", 2)
        sb_logger.record("D", 2)
        sb_logger.dump()
        sb_logger.record("C", 4)
        sb_logger.dump()

    with logger.accumulate_means("gen"):
        sb_logger.record("E", 2)
        sb_logger.dump()
        sb_logger.record("E", 0)
        sb_logger.dump()

    with logger.accumulate_means("disc"):
        sb_logger.record("C", 3)
        sb_logger.dump()

    sb_logger.dump()  # Writes 1 mean each from "gen" and "disc".
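    # (Each mean/* value is the mean of the raw values recorded under that key since the
    # last top-level dump, e.g. mean/disc/C == mean(2, 4, 3) == 3.)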

    expect_raw_gen = {"raw/gen/E": [2, 0]}
    expect_raw_disc = {
        "raw/disc/C": [2, 4, 3],
        "raw/disc/D": [2, "", ""],
    }
    expect_default = {
        "mean/gen/E": [1],
        "mean/disc/C": [3],
        "mean/disc/D": [2],
        "no_context": [1],
    }

    _compare_csv_lines(osp.join(tmpdir, "progress.csv"), expect_default)
    _compare_csv_lines(osp.join(tmpdir, "raw", "gen", "progress.csv"),
                       expect_raw_gen)
    _compare_csv_lines(osp.join(tmpdir, "raw", "disc", "progress.csv"),
                       expect_raw_disc)

    # Part Two:
    # Check that we append to the same logs after the first dump to "means/*".

    with logger.accumulate_means("disc"):
        sb_logger.record("D", 100)
        sb_logger.dump()

    sb_logger.record("no_context", 2)

    sb_logger.dump()  # Writes 1 mean from "disc". "gen" is blank.

    expect_raw_gen = {"raw/gen/E": [2, 0]}
    expect_raw_disc = {
        "raw/disc/C": [2, 4, 3, ""],
        "raw/disc/D": [2, "", "", 100],
    }
    expect_default = {
        "mean/gen/E": [1, ""],
        "mean/disc/C": [3, ""],
        "mean/disc/D": [2, 100],
        "no_context": [1, 2],
    }

    _compare_csv_lines(osp.join(tmpdir, "progress.csv"), expect_default)
    _compare_csv_lines(osp.join(tmpdir, "raw", "gen", "progress.csv"),
                       expect_raw_gen)
    _compare_csv_lines(osp.join(tmpdir, "raw", "disc", "progress.csv"),
                       expect_raw_disc)
Example No. 14
def main():
    def env_contr():
        return gym.make("CartPole-v0")  #
        # env = multiwalker_v0.env()
        # env = pad_observations(env)
        # env = pad_action_space(env)
        # markov_env = aec_to_markov(env)
        # venv = MarkovVectorEnv(markov_env)
        # return venv

    n_envs = 6
    # def nest_env_const():
    #     cat = ConcatVecEnv([env_contr]*envs_per_proc)
    #     return cat
    example_env = env_contr()
    num_envs = n_envs * 1  #example_env.num_envs
    #cat = ProcConcatVec([nest_env_const]*n_procs,example_env.observation_space, example_env.action_space, num_envs)
    cat = MakeCPUAsyncConstructor(0)([env_contr] * n_envs,
                                     example_env.observation_space,
                                     example_env.action_space)  #, num_envs)
    cat = VecEnvWrapper(cat)
    env = cat
    policy = "MlpPolicy"
    logger = make_logger("log")
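    # assumption: in the stable-baselines3 version targeted here, the module-level logging
    # helpers delegate to Logger.CURRENT, so overwriting it routes records through this logger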
    stable_baselines3.common.logger.Logger.CURRENT = logger
    a2c = PPO(policy, cat, n_steps=4, batch_size=6, n_epochs=3)
    print(type(a2c.env))
    #a2c.learn(1000000)

    total_timesteps, callback = a2c._setup_learn(10000,
                                                 None,
                                                 None,
                                                 None,
                                                 n_eval_episodes=5,
                                                 reset_num_timesteps=None,
                                                 tb_log_name="PPo")

    #total_timesteps = 100
    iteration = 0
    log_interval = 1
    for i in range(total_timesteps):
        continue_training = a2c.collect_rollouts(env,
                                                 callback,
                                                 a2c.rollout_buffer,
                                                 n_rollout_steps=a2c.n_steps)
        print(a2c.ep_info_buffer)
        if continue_training is False:
            break

        iteration += 1
        a2c._update_current_progress_remaining(a2c.num_timesteps,
                                               total_timesteps)

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(a2c.num_timesteps / (time.time() - a2c.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            print(a2c.ep_info_buffer)
            if len(a2c.ep_info_buffer) > 0 and len(a2c.ep_info_buffer[0]) > 0:
                logger.record(
                    "rollout/ep_rew_mean",
                    safe_mean([ep_info["r"]
                               for ep_info in a2c.ep_info_buffer]))
                logger.record(
                    "rollout/ep_len_mean",
                    safe_mean([ep_info["l"]
                               for ep_info in a2c.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed",
                          int(time.time() - a2c.start_time),
                          exclude="tensorboard")
            logger.record("time/total_timesteps",
                          a2c.num_timesteps,
                          exclude="tensorboard")
            logger.dump(step=a2c.num_timesteps)

        a2c.train()
Example No. 15
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:

            for partner_idx in range(self.policy.num_partners):
                try:
                    self.env.envs[0].switch_to_env(partner_idx)
                except:
                    pass
                continue_training = self.collect_rollouts(
                    self.env,
                    callback,
                    self.rollout_buffer[partner_idx],
                    n_rollout_steps=self.n_steps,
                    partner_idx=partner_idx)
            #continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer[partner_idx], n_rollout_steps=self.n_steps)
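            # note: continue_training keeps only the result from the last partner's rollout above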

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations",
                              iteration,
                              exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        safe_mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record(
                        "rollout/ep_len_mean",
                        safe_mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record("time/time_elapsed",
                              int(time.time() - self.start_time),
                              exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps,
                              exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            self.train()

        callback.on_training_end()

        return self
Example No. 16
    def collect_rollouts(
            self,  # noqa: C901
            env: VecEnv,
            # Type hint as string to avoid circular import
            callback: 'BaseCallback',
            n_episodes: int = 1,
            n_steps: int = -1,
            action_noise: Optional[ActionNoise] = None,
            learning_starts: int = 0,
            replay_buffer: Optional[ReplayBuffer] = None,
            log_interval: Optional[int] = None) -> RolloutReturn:
        """
        Collect experiences and store them into a ReplayBuffer.

        :param env: (VecEnv) The training environment
        :param callback: (BaseCallback) Callback that will be called at each step
            (and at the beginning and end of the rollout)
        :param n_episodes: (int) Number of episodes to use to collect rollout data
            You can also specify a ``n_steps`` instead
        :param n_steps: (int) Number of steps to use to collect rollout data
            You can also specify a ``n_episodes`` instead.
        :param action_noise: (Optional[ActionNoise]) Action noise that will be used for exploration
            Required for deterministic policy (e.g. TD3). This can also be used
            in addition to the stochastic policy for SAC.
        :param learning_starts: (int) Number of steps before learning for the warm-up phase.
        :param replay_buffer: (ReplayBuffer)
        :param log_interval: (int) Log data every ``log_interval`` episodes
        :return: (RolloutReturn)
        """
        episode_rewards, total_timesteps = [], []
        total_steps, total_episodes = 0, 0

        assert isinstance(env, VecEnv), "You must pass a VecEnv"
        assert env.num_envs == 1, "OffPolicyAlgorithm only support single environment"

        if n_episodes > 0 and n_steps > 0:
            # Note we are referring to the constructor arguments
            # that are named `train_freq` and `n_episodes_rollout`
            # but correspond to `n_steps` and `n_episodes` here
            warnings.warn(
                "You passed a positive value for `train_freq` and `n_episodes_rollout`. "
                "Please make sure this is intended. "
                "The agent will collect data by stepping in the environment "
                "until both conditions are true: "
                "`number of steps in the env` >= `train_freq` and "
                "`number of episodes` > `n_episodes_rollout`")

        if self.use_sde:
            self.actor.reset_noise()

        callback.on_rollout_start()
        continue_training = True

        while total_steps < n_steps or total_episodes < n_episodes:
            done = False
            episode_reward, episode_timesteps = 0.0, 0

            while not done:

                if self.use_sde and self.sde_sample_freq > 0 and total_steps % self.sde_sample_freq == 0:
                    # Sample a new noise matrix
                    self.actor.reset_noise()

                # Select action randomly or according to policy
                if self.num_timesteps < learning_starts and not (
                        self.use_sde and self.use_sde_at_warmup):
                    # Warmup phase
                    unscaled_action = np.array([self.action_space.sample()])
                else:
                    # Note: we assume that the policy uses tanh to scale the action
                    # We use non-deterministic action in the case of SAC, for TD3, it does not matter
                    unscaled_action, _ = self.predict(self._last_obs,
                                                      deterministic=False)

                # Rescale the action from [low, high] to [-1, 1]
                if isinstance(self.action_space, gym.spaces.Box):
                    scaled_action = self.policy.scale_action(unscaled_action)

                    # Add noise to the action (improve exploration)
                    if action_noise is not None:
                        # NOTE: in the original implementation of TD3, the noise was applied to the unscaled action
                        # Update(October 2019): Not anymore
                        scaled_action = np.clip(scaled_action + action_noise(),
                                                -1, 1)

                    # We store the scaled action in the buffer
                    buffer_action = scaled_action
                    action = self.policy.unscale_action(scaled_action)
                else:
                    # Discrete case, no need to normalize or clip
                    buffer_action = unscaled_action
                    action = buffer_action

                # Rescale and perform action
                new_obs, reward, done, infos = env.step(action)

                # Only stop training if return value is False, not when it is None.
                if callback.on_step() is False:
                    return RolloutReturn(0.0,
                                         total_steps,
                                         total_episodes,
                                         continue_training=False)

                episode_reward += reward

                # Retrieve reward and episode length if using Monitor wrapper
                self._update_info_buffer(infos, done)

                # Store data in replay buffer
                if replay_buffer is not None:
                    # Store only the unnormalized version
                    if self._vec_normalize_env is not None:
                        new_obs_ = self._vec_normalize_env.get_original_obs()
                        reward_ = self._vec_normalize_env.get_original_reward()
                    else:
                        # Avoid changing the original ones
                        self._last_original_obs, new_obs_, reward_ = self._last_obs, new_obs, reward

                    replay_buffer.add(self._last_original_obs, new_obs_,
                                      buffer_action, reward_, done)

                self._last_obs = new_obs
                # Save the unnormalized observation
                if self._vec_normalize_env is not None:
                    self._last_original_obs = new_obs_

                self.num_timesteps += 1
                episode_timesteps += 1
                total_steps += 1
                if 0 < n_steps <= total_steps:
                    break

            if done:
                total_episodes += 1
                self._episode_num += 1
                episode_rewards.append(episode_reward)
                total_timesteps.append(episode_timesteps)

                if action_noise is not None:
                    action_noise.reset()

                # Log training infos
                if log_interval is not None and self._episode_num % log_interval == 0:
                    fps = int(self.num_timesteps /
                              (time.time() - self.start_time))
                    logger.record("time/episodes",
                                  self._episode_num,
                                  exclude="tensorboard")
                    if len(self.ep_info_buffer) > 0 and len(
                            self.ep_info_buffer[0]) > 0:
                        logger.record(
                            'rollout/ep_rew_mean',
                            safe_mean([
                                ep_info['r'] for ep_info in self.ep_info_buffer
                            ]))
                        logger.record(
                            'rollout/ep_len_mean',
                            safe_mean([
                                ep_info['l'] for ep_info in self.ep_info_buffer
                            ]))
                    logger.record("time/fps", fps)
                    logger.record('time/time_elapsed',
                                  int(time.time() - self.start_time),
                                  exclude="tensorboard")
                    logger.record("time/total timesteps",
                                  self.num_timesteps,
                                  exclude="tensorboard")
                    if self.use_sde:
                        logger.record("train/std",
                                      (self.actor.get_std()).mean().item())

                    if len(self.ep_success_buffer) > 0:
                        logger.record('rollout/success rate',
                                      safe_mean(self.ep_success_buffer))
                    # Pass the number of timesteps for tensorboard
                    logger.dump(step=self.num_timesteps)

        mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0

        callback.on_rollout_end()

        return RolloutReturn(mean_reward, total_steps, total_episodes,
                             continue_training)
Example No. 17
def main():  # noqa: C901
    parser = argparse.ArgumentParser()
    parser.add_argument("-tb",
                        "--tensorboard-log",
                        help="Tensorboard log dir",
                        default="",
                        type=str)
    parser.add_argument("--env",
                        help="environment ID",
                        type=str,
                        default="CartPole-v1")
    parser.add_argument("-f",
                        "--folder",
                        help="Log folder",
                        type=str,
                        default="rl-trained-agents")
    parser.add_argument("--algo",
                        help="RL Algorithm",
                        default="ppo",
                        type=str,
                        required=False,
                        choices=list(ALGOS.keys()))
    parser.add_argument("-n",
                        "--n-timesteps",
                        help="number of timesteps",
                        default=1000,
                        type=int)
    parser.add_argument(
        "--num-threads",
        help="Number of threads for PyTorch (-1 to use default)",
        default=-1,
        type=int)
    parser.add_argument("--n-envs",
                        help="number of environments",
                        default=1,
                        type=int)
    # parser.add_argument("--exp-id", help="Experiment ID (default: 0: latest, -1: no exp folder)", default=0, type=int)
    parser.add_argument(
        "--exp-id",
        help="Experiment ID (default: 0: latest, -1: no exp folder)",
        default='0',
        type=str)
    parser.add_argument("--verbose",
                        help="Verbose mode (0: no output, 1: INFO)",
                        default=1,
                        type=int)
    parser.add_argument(
        "--no-render",
        action="store_true",
        default=False,
        help="Do not render the environment (useful for tests)")
    parser.add_argument(
        "--render-mode",
        default='step',
        help="Whether to render at each step or at the end of an episode")
    parser.add_argument("--deterministic",
                        action="store_true",
                        default=False,
                        help="Use deterministic actions")
    parser.add_argument(
        "--load-best",
        action="store_true",
        default=False,
        help="Load best model instead of last model if available")
    parser.add_argument(
        "--load-checkpoint",
        type=int,
        help="Load checkpoint instead of last model if available, "
        "you must pass the number of timesteps corresponding to it",
    )
    parser.add_argument("--stochastic",
                        action="store_true",
                        default=False,
                        help="Use stochastic actions")
    parser.add_argument(
        "--norm-reward",
        action="store_true",
        default=False,
        help="Normalize reward if applicable (trained with VecNormalize)")
    parser.add_argument("--seed",
                        help="Random generator seed",
                        type=int,
                        default=0)
    parser.add_argument("--info-freq",
                        help="Frequency at which info values are logged",
                        type=int,
                        default=10)
    parser.add_argument("--reward-log",
                        help="Where to log reward",
                        default="",
                        type=str)
    parser.add_argument(
        "--gym-packages",
        type=str,
        nargs="+",
        default=[],
        help=
        "Additional external Gym environment package modules to import (e.g. gym_minigrid)",
    )
    parser.add_argument(
        "--env-kwargs",
        type=str,
        nargs="+",
        action=StoreDict,
        help="Optional keyword argument to pass to the env constructor")
    args = parser.parse_args()

    # Going through custom gym packages to let them register in the global registry
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    env_id = args.env
    algo = args.algo
    folder = args.folder

    if args.exp_id == '0':
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print(f"Loading latest experiment, id={args.exp_id}")

    # Sanity checks
    if args.exp_id != '0' and args.exp_id != '-1':
        log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}")
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), f"The {log_path} folder was not found"

    found = False
    for ext in ["zip"]:
        model_path = os.path.join(log_path, f"{env_id}.{ext}")
        found = os.path.isfile(model_path)
        if found:
            break

    if args.load_best:
        model_path = os.path.join(log_path, "best_model.zip")
        found = os.path.isfile(model_path)

    if args.load_checkpoint is not None:
        model_path = os.path.join(
            log_path, f"rl_model_{args.load_checkpoint}_steps.zip")
        found = os.path.isfile(model_path)

    if not found:
        raise ValueError(
            f"No model found for {algo} on {env_id}, path: {model_path}")
    else:
        print(f"Loading model for {algo} on {env_id}, path: {model_path}")

    off_policy_algos = ["qrdqn", "dqn", "ddpg", "sac", "her", "td3", "tqc"]

    if algo in off_policy_algos:
        args.n_envs = 1

    set_random_seed(args.seed)

    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    is_atari = ExperimentManager.is_atari(env_id)

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(
        stats_path, norm_reward=args.norm_reward, test_mode=True)

    # load env_kwargs if existing
    env_kwargs = {}
    args_path = os.path.join(log_path, env_id, "args.yml")
    if os.path.isfile(args_path):
        with open(args_path, "r") as f:
            loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader)  # pytype: disable=module-attr
            if loaded_args["env_kwargs"] is not None:
                env_kwargs = loaded_args["env_kwargs"]
    # overwrite with command line arguments
    if args.env_kwargs is not None:
        env_kwargs.update(args.env_kwargs)

    log_dir = args.reward_log if args.reward_log != "" else None

    env = create_test_env(
        env_id,
        n_envs=args.n_envs,
        stats_path=stats_path,
        seed=args.seed,
        log_dir=log_dir,
        should_render=not args.no_render,
        hyperparams=hyperparams,
        env_kwargs=env_kwargs,
    )

    kwargs = dict(seed=args.seed)
    if algo in off_policy_algos:
        # Dummy buffer size as we don't need memory to enjoy the trained agent
        kwargs.update(dict(buffer_size=1))

    # Check if we are running python 3.8+
    # we need to patch saved model under python 3.6/3.7 to load them
    newer_python_version = sys.version_info.major == 3 and sys.version_info.minor >= 8

    custom_objects = {}
    if newer_python_version:
        custom_objects = {
            "learning_rate": 0.0,
            "lr_schedule": lambda _: 0.0,
            "clip_range": lambda _: 0.0,
        }

    model = ALGOS[algo].load(model_path,
                             env=env,
                             custom_objects=custom_objects,
                             **kwargs)

    # tb_path = ''
    # for i in range(0,100000,1):
    #     tb_path = os.path.join(args.tensorboard_log, env_id, algo.upper() + "_" + str(i))
    #     if not os.path.exists(tb_path):
    #         break
    # print("algo=",algo, "  logdir=", tb_path)
    # writer = SummaryWriter(log_dir=tb_path)

    obs = env.reset()

    # Deterministic by default except for atari games
    stochastic = args.stochastic or is_atari and not args.deterministic
    deterministic = not stochastic

    state = None
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    ep_count = 0
    # For HER, monitor success rate
    successes = []
    sbcommon_utils.configure_logger(args.verbose,
                                    os.path.join(args.tensorboard_log, env_id),
                                    algo.upper(),
                                    reset_num_timesteps=True)
    xlsx_logpath = os.path.join(
        args.tensorboard_log,
        env_id) if logger.get_dir() is None else logger.get_dir()
    xlsx_logger = Xlsx_Logger(xlsx_logpath, env_id)
    with open(os.path.join(xlsx_logpath, 'args.yaml'), 'w') as file:
        yaml.dump(args, file)
    fig: plt.Figure = None
    info_freq = args.info_freq
    try:
        for step in range(args.n_timesteps):
            action, state = model.predict(obs,
                                          state=state,
                                          deterministic=deterministic)
            obs, reward, done, infos = env.step(action)
            episode_reward += reward[0]
            ep_len += 1

            if args.n_envs == 1:

                # log info variables to tensorboard
                if (step % info_freq == 0 or done) and type(infos[0]) is dict:
                    if not args.no_render:
                        if not done and args.render_mode == 'step':
                            fig = env.render("human")
                        elif done and args.render_mode == 'episode':
                            fig = env.envs[0].rendered_episode
                    xlsx_logger.set_step_ep(ep_count, step)
                    for key in infos[0]:
                        if key == 'episode' or key == 'terminal_observation' or key == 'render':
                            continue
                        val = infos[0].get(key)
                        logger.record("eval/" + key, val, exclude='stdout')
                        xlsx_logger.log(key, val)
                    if fig is not None:
                        log_fig = logger.Figure(fig, False)
                        logger.record("eval/figure", log_fig, exclude='stdout')
                        # writer.add_scalar("eval/"+key, val, step)
                    logger.dump(step=step)

                # For atari the return reward is not the atari score
                # so we have to get it from the infos dict
                if is_atari and infos is not None and args.verbose >= 1:
                    episode_infos = infos[0].get("episode")
                    if episode_infos is not None:
                        print(f"Atari Episode Score: {episode_infos['r']:.2f}")
                        print("Atari Episode Length", episode_infos["l"])

                if done and not is_atari and args.verbose > 0:
                    # NOTE: for env using VecNormalize, the mean reward
                    # is a normalized reward when `--norm_reward` flag is passed

                    print("Episode #{}, step#{}".format(ep_count, step))
                    print(f"  Episode Reward: {episode_reward:.2f}")
                    print("  Episode Length", ep_len)
                    episode_rewards.append(episode_reward)
                    logger.record("eval/ep_len", ep_len, exclude='stdout')
                    logger.record("eval/ep_reward",
                                  episode_reward,
                                  exclude='stdout')
                    xlsx_logger.log('ep_len', ep_len)
                    xlsx_logger.log('reward', episode_reward)
                    logger.dump(step=step)
                    episode_lengths.append(ep_len)
                    episode_reward = 0.0
                    ep_len = 0
                    ep_count += 1
                    state = None

                # Reset also when the goal is achieved when using HER
                if done and infos[0].get("is_success") is not None:
                    if args.verbose > 1:
                        print("Success?", infos[0].get("is_success", False))

                    if infos[0].get("is_success") is not None:
                        successes.append(infos[0].get("is_success", False))
                        episode_reward, ep_len = 0.0, 0
                        ep_count += 1

            # if (not args.no_render) and args.render_mode=='step':
            #     fig = env.render("human")
            # else:
            #     fig = None

    except KeyboardInterrupt:
        pass

    logger.dump(step=step)
    xlsx_logger.close()

    if args.verbose > 0 and len(successes) > 0:
        print(f"Success rate: {100 * np.mean(successes):.2f}%")

    if args.verbose > 0 and len(episode_rewards) > 0:
        print(f"{len(episode_rewards)} Episodes")
        print(
            f"Mean reward: {np.mean(episode_rewards):.2f} +/- {np.std(episode_rewards):.2f}"
        )

    if args.verbose > 0 and len(episode_lengths) > 0:
        print(
            f"Mean episode length: {np.mean(episode_lengths):.2f} +/- {np.std(episode_lengths):.2f}"
        )

    env.close()
Example No. 18
    def train_normal(self):
        # Update optimizer learning rate
        self._update_learning_rate(self.policy.optimizer)
        # Compute current clip range
        clip_range = self.clip_range(self._current_progress_remaining)
        # Optional: clip range for the value function
        if self.clip_range_vf is not None:
            clip_range_vf = self.clip_range_vf(
                self._current_progress_remaining)

        entropy_losses, all_kl_divs = [], []
        pg_losses, value_losses = [], []
        clip_fractions = []

        # train for n_epochs epochs
        for epoch in range(self.n_epochs):
            approx_kl_divs = []
            # Do a complete pass on the rollout buffer
            for rollout_data in self.rollout_buffer.get(self.batch_size):
                actions = rollout_data.actions
                if isinstance(self.action_space, spaces.Discrete):
                    # Convert discrete action from float to long
                    actions = rollout_data.actions.long().flatten()

                # Re-sample the noise matrix because the log_std has changed
                # TODO: investigate why there is no issue with the gradient
                # if that line is commented (as in SAC)
                if self.use_sde:
                    self.policy.reset_noise(self.batch_size)
                """
                Sida: Change the input to evaluate_actions()
                """
                with self.policy.features_extractor.start_training(
                        rollout_data.short_hidden_states,
                        rollout_data.long_hidden_states):
                    values, log_prob, entropy = self.policy.evaluate_actions(
                        rollout_data.observations, actions)

                values = values.flatten()
                # Normalize advantage
                advantages = rollout_data.advantages
                advantages = (advantages -
                              advantages.mean()) / (advantages.std() + 1e-8)

                # ratio between old and new policy, should be one at the first iteration
                ratio = th.exp(log_prob - rollout_data.old_log_prob)

                # clipped surrogate loss
                policy_loss_1 = advantages * ratio
                policy_loss_2 = advantages * th.clamp(ratio, 1 - clip_range,
                                                      1 + clip_range)
                policy_loss = -th.min(policy_loss_1, policy_loss_2).mean()

                # Logging
                pg_losses.append(policy_loss.item())
                clip_fraction = th.mean(
                    (th.abs(ratio - 1) > clip_range).float()).item()
                clip_fractions.append(clip_fraction)

                if self.clip_range_vf is None:
                    # No clipping
                    values_pred = values
                else:
                    # Clip the difference between the old and new value
                    # NOTE: this depends on the reward scaling
                    values_pred = rollout_data.old_values + th.clamp(
                        values - rollout_data.old_values, -clip_range_vf,
                        clip_range_vf)
                # Value loss using the TD(gae_lambda) target
                value_loss = F.mse_loss(rollout_data.returns, values_pred)
                value_losses.append(value_loss.item())

                # Entropy loss favors exploration
                if entropy is None:
                    # Approximate entropy when no analytical form
                    entropy_loss = -th.mean(-log_prob)
                else:
                    entropy_loss = -th.mean(entropy)

                entropy_losses.append(entropy_loss.item())

                loss = policy_loss + self.ent_coef * entropy_loss + self.vf_coef * value_loss

                # Optimization step
                self.policy.optimizer.zero_grad()
                loss.backward()
                # Clip grad norm
                th.nn.utils.clip_grad_norm_(self.policy.parameters(),
                                            self.max_grad_norm)
                self.policy.optimizer.step()
                approx_kl_divs.append(
                    th.mean(rollout_data.old_log_prob -
                            log_prob).detach().cpu().numpy())

            all_kl_divs.append(np.mean(approx_kl_divs))

            if self.target_kl is not None and np.mean(
                    approx_kl_divs) > 1.5 * self.target_kl:
                print(
                    f"Early stopping at step {epoch} due to reaching max kl: {np.mean(approx_kl_divs):.2f}"
                )
                break

        self._n_updates += self.n_epochs
        explained_var = explained_variance(
            self.rollout_buffer.values.flatten(),
            self.rollout_buffer.returns.flatten())

        # Logs
        logger.record("train_normal/entropy_loss", np.mean(entropy_losses))
        logger.record("train_normal/policy_gradient_loss", np.mean(pg_losses))
        logger.record("train_normal/value_loss", np.mean(value_losses))
        logger.record("train_normal/approx_kl", np.mean(approx_kl_divs))
        logger.record("train_normal/clip_fraction", np.mean(clip_fractions))
        logger.record("train_normal/loss", loss.item())
        logger.record("train_normal/explained_variance", explained_var)
        if hasattr(self.policy, "log_std"):
            logger.record("train_normal/std",
                          th.exp(self.policy.log_std).mean().item())

        logger.record("train_normal/n_updates",
                      self._n_updates,
                      exclude="tensorboard")
        logger.record("train_normal/clip_range", clip_range)
        if self.clip_range_vf is not None:
            logger.record("train_normal/clip_range_vf", clip_range_vf)
        logger.dump(step=self.num_timesteps)
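The core of `train_normal` above is PPO's clipped surrogate objective. A toy, self-contained re-implementation for illustration only (not the code path used above):

import torch as th

def clipped_surrogate_loss(advantages, log_prob, old_log_prob, clip_range=0.2):
    # Ratio between the new and old policy; equals 1 on the first pass
    ratio = th.exp(log_prob - old_log_prob)
    policy_loss_1 = advantages * ratio
    policy_loss_2 = advantages * th.clamp(ratio, 1 - clip_range, 1 + clip_range)
    return -th.min(policy_loss_1, policy_loss_2).mean()

advantages = th.tensor([1.0, -0.5, 2.0])
new_log_prob = th.tensor([-0.9, -1.1, -0.7])
old_log_prob = th.tensor([-1.0, -1.0, -1.0])
print(clipped_surrogate_loss(advantages, new_log_prob, old_log_prob))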
Exemplo n.º 19
0
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        print('setup training')

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        print(f'start training, total timesteps is {total_timesteps}')

        while self.num_timesteps < total_timesteps:

            print(f'num timesteps: {self.num_timesteps}/{total_timesteps}')
            print(f'collect rollouts, rollout steps = {self.n_steps}')

            continue_training = self.collect_rollouts(
                self.env,
                callback,
                self.rollout_buffer,
                n_rollout_steps=self.n_steps)

            if continue_training is False:
                print(
                    'stop training (only happens if callback on_step returns false)'
                )
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            print('display training infos')
            # print(f'len(self.ep_info_buffer)={len(self.ep_info_buffer)}, len(self.ep_info_buffer[0])={len(self.ep_info_buffer[0])}')

            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations",
                              iteration,
                              exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        safe_mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record(
                        "rollout/ep_len_mean",
                        safe_mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record("time/time_elapsed",
                              int(time.time() - self.start_time),
                              exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps,
                              exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            print('train')
            self.train()

        callback.on_training_end()

        return self
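`safe_mean` above guards the episode statistics against an empty buffer. A minimal stand-in with the behaviour assumed here (nan for an empty sequence instead of an exception):

import numpy as np

def safe_mean(arr):
    return np.nan if len(arr) == 0 else float(np.mean(arr))

ep_info_buffer = [{"r": 10.0, "l": 200}, {"r": 12.5, "l": 180}]
print(safe_mean([ep_info["r"] for ep_info in ep_info_buffer]))  # 11.25
print(safe_mean([]))                                            # nan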
Exemplo n.º 20
0
def dump(step=0) -> None:
    """Alias for `stable_baselines3.logger.dump`."""
    sb_logger.dump(step)
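A short usage sketch for the alias above, assuming it lives in a module where `sb_logger` is `stable_baselines3.common.logger` (a version that still exposes the module-level logger functions used throughout these examples):

from stable_baselines3.common import logger as sb_logger

sb_logger.record("eval/return", 42.0)
dump(step=1000)  # forwards to sb_logger.dump(1000)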
Exemplo n.º 21
0
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
        parameter_noise: bool = False,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        # Initializing the value of the noise std
        current_sigma = 1.0
        while self.num_timesteps < total_timesteps:

            continue_training = self.collect_rollouts(
                self.env,
                callback,
                self.rollout_buffer,
                n_rollout_steps=self.n_steps,
                parameter_noise=parameter_noise,
                sigma=current_sigma)  # use the adapted noise std

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations",
                              iteration,
                              exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        safe_mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record(
                        "rollout/ep_len_mean",
                        safe_mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record("time/time_elapsed",
                              int(time.time() - self.start_time),
                              exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps,
                              exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            self.train()

            if parameter_noise:

                states = self.rollout_buffer.observations
                states = th.tensor(states)

                actions_unnoisy, values_unnoisy, log_prob_unnoisy = self.policy(
                    states, parameter_noise=False)
                actions_noisy, values_noisy, log_prob_noisy = self.policy(
                    states, parameter_noise=True, sigma=current_sigma)

                distance = th.sum((actions_unnoisy - actions_noisy)**2)**0.5

                distance_threshold = 1
                sigma_scalefactor = 1.01
                if distance > distance_threshold:
                    current_sigma /= sigma_scalefactor
                else:
                    current_sigma *= sigma_scalefactor

        callback.on_training_end()

        return self
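The sigma update at the end of the loop is the usual adaptive parameter-space-noise rule: shrink the noise when the noisy and unnoisy actions drift too far apart in action space, grow it otherwise. A toy NumPy-only illustration of that rule:

import numpy as np

def adapt_sigma(sigma, actions_unnoisy, actions_noisy,
                distance_threshold=1.0, scale_factor=1.01):
    # Euclidean distance between the two action batches
    distance = np.sqrt(np.sum((actions_unnoisy - actions_noisy) ** 2))
    return sigma / scale_factor if distance > distance_threshold else sigma * scale_factor

sigma = 1.0
sigma = adapt_sigma(sigma, np.array([0.2, -0.1]), np.array([0.9, 0.8]))
print(sigma)  # distance ~1.14 > 1.0, so sigma shrinks to ~0.990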
Exemplo n.º 22
0
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:

            rollout = self.collect_rollouts(
                self.env,
                n_episodes=-1,
                n_steps=1,
                action_noise=self.action_noise,
                callback=callback,
                learning_starts=self.learning_starts,
                replay_buffer=self.replay_buffer,
                log_interval=log_interval,
            )

            if rollout.continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations",
                              iteration,
                              exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        safe_mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record(
                        "rollout/ep_len_mean",
                        safe_mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record("time/time_elapsed",
                              int(time.time() - self.start_time),
                              exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps,
                              exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
                # Do one gradient step per collected environment step
                self.train(gradient_steps=1, batch_size=self.batch_size)

        callback.on_training_end()

        return self
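A toy illustration of the warm-up gate used above: gradient updates only begin once more than `learning_starts` environment steps have been collected (the numbers below are made up):

learning_starts = 1_000

for num_timesteps in (500, 1_000, 1_500):
    should_train = num_timesteps > 0 and num_timesteps > learning_starts
    print(num_timesteps, "train" if should_train else "collect only")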
Exemplo n.º 23
0
def test_main(tmp_path):
    """
    tests for the logger module
    """
    info("hi")
    debug("shouldn't appear")
    assert get_level() == INFO
    set_level(DEBUG)
    assert get_level() == DEBUG
    debug("should appear")
    configure(folder=str(tmp_path))
    assert get_dir() == str(tmp_path)
    record("a", 3)
    record("b", 2.5)
    dump()
    record("b", -2.5)
    record("a", 5.5)
    dump()
    info("^^^ should see a = 5.5")
    record("f", "this text \n \r should appear in one line")
    dump()
    info('^^^ should see f = "this text \n \r should appear in one line"')
    record_mean("b", -22.5)
    record_mean("b", -44.4)
    record("a", 5.5)
    dump()
    with ScopedConfigure(None, None):
        info("^^^ should see b = 33.3")

    with ScopedConfigure(str(tmp_path / "test-logger"), ["json"]):
        record("b", -2.5)
        dump()

    reset()
    record("a", "longasslongasslongasslongasslongasslongassvalue")
    dump()
    warn("hey")
    error("oh")
    record_dict({"test": 1})
    assert isinstance(get_log_dict(), dict) and set(get_log_dict().keys()) == {"test"}
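A minimal sketch of the same module-level logger API outside the test, assuming a stable-baselines3 release (<= 1.x) where `stable_baselines3.common.logger` still exposes these functions directly, as the test above does:

import tempfile
from stable_baselines3.common import logger

folder = tempfile.mkdtemp()
logger.configure(folder=folder, format_strings=["stdout", "csv", "json"])
logger.record("a", 3)
logger.record_mean("b", 1.0)
logger.record_mean("b", 3.0)  # reported as the running mean, 2.0
logger.dump(step=0)
print(logger.get_dir())       # == folder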
Exemplo n.º 24
0
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:

            continue_training = self.collect_rollouts(
                self.env,
                callback,
                self.rollout_buffer,
                n_rollout_steps=self.n_steps)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations",
                              iteration,
                              exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        safe_mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record(
                        "rollout/ep_len_mean",
                        safe_mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))

                    for k in self.ep_info_buffer[0].keys():
                        if k not in "lrt":
                            logger.record(
                                f"progress/{k}",
                                safe_mean([
                                    ep_info[k]
                                    for ep_info in self.ep_info_buffer
                                ]))

                logger.record("time/fps", fps)
                logger.record("time/time_elapsed",
                              int(time.time() - self.start_time),
                              exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps,
                              exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

                if iteration % (log_interval * 10) == 0:
                    # Save parameters every 10 logging intervals
                    self.save('./interim_trained_models/')

            self.train()

        callback.on_training_end()

        return self
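The `progress/{k}` block above assumes the episode-info dicts carry more than the default "l"/"r"/"t" keys. A hedged sketch of how extra keys typically get there via SB3's `Monitor` wrapper and its `info_keywords` argument; `ToyGoalEnv` and the `is_success` key are made up for the example:

import gym
from stable_baselines3.common.monitor import Monitor

class ToyGoalEnv(gym.Wrapper):
    """Hypothetical wrapper that reports a success flag in `info`."""
    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        info["is_success"] = float(reward > 0)
        return obs, reward, done, info

env = Monitor(ToyGoalEnv(gym.make("CartPole-v1")), info_keywords=("is_success",))
obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())

# Monitor adds the episode summary, including the extra keyword:
print(info["episode"])  # e.g. {'r': 17.0, 'l': 17, 't': 0.003, 'is_success': 1.0}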