def collect_rollouts(self, env: VecEnv,
                     callback: BaseCallback,
                     rollout_buffer: GraphRolloutBuffer,
                     n_rollout_steps: int) -> bool:
    """
    Collect rollouts using the current policy and fill a `RolloutBuffer`.

    :param env: (VecEnv) The training environment
    :param callback: (BaseCallback) Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param rollout_buffer: (RolloutBuffer) Buffer to fill with rollouts
    :param n_rollout_steps: (int) Number of experiences to collect per environment
    :return: (bool) True if function returned with at least `n_rollout_steps`
        collected, False if callback terminated rollout prematurely.
    """
    assert self._last_obs is not None, "No previous observation was provided"
    n_steps = 0
    rollout_buffer.reset()
    # Sample new weights for the state dependent exploration
    callback.on_rollout_start()

    while n_steps < n_rollout_steps:
        with th.no_grad():
            # Convert to pytorch tensor
            # EDIT: obs_tensor = th.as_tensor(self._last_obs).to(self.device)
            obs_tensor = to_batch(obs_to_graph(self._last_obs))
            actions, values, log_probs = self.policy.forward(obs_tensor)
        # TODO: May need to edit to support nodes.
        actions = [action.cpu().numpy() for action in actions]

        # Rescale and perform action
        clipped_actions = actions
        # Clip the actions to avoid out of bound error
        if isinstance(self.action_space, gym.spaces.Box):
            clipped_actions = [
                np.clip(action, self.action_space.low, self.action_space.high)
                for action in actions
            ]

        new_obs, rewards, dones, infos = env.step(clipped_actions)

        if callback.on_step() is False:
            return False

        self._update_info_buffer(infos)
        n_steps += 1
        self.num_timesteps += env.num_envs

        if isinstance(self.action_space, gym.spaces.Discrete):
            # Reshape in case of discrete action
            actions = actions.reshape(-1, 1)
        rollout_buffer.add(self._last_obs, actions, rewards, dones, values, log_probs)
        self._last_obs = new_obs

    rollout_buffer.compute_returns_and_advantage(values, dones=dones)

    callback.on_rollout_end()

    return True
def collect_rollouts(self, env: VecEnv, callback: BaseCallback, rollout_buffer: RolloutBuffer, n_rollout_steps: int = 256) -> bool: assert self._last_obs is not None, "No previous observation was provided" n_steps = 0 rollout_buffer.reset() # Sample new weights for the state dependent exploration if self.use_sde: self.policy.reset_noise(env.num_envs) callback.on_rollout_start() while n_steps < n_rollout_steps: if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.policy.reset_noise(env.num_envs) with th.no_grad(): # Convert to pytorch tensor obs_tensor = th.as_tensor(self._last_obs).to(self.device) actions, values, log_probs = self.policy.forward(obs_tensor) actions = actions.cpu().numpy() # Rescale and perform action clipped_actions = actions # Clip the actions to avoid out of bound error if isinstance(self.action_space, gym.spaces.Box): clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) new_obs, rewards, dones, infos = env.step(clipped_actions) if callback.on_step() is False: return False self._update_info_buffer(infos) n_steps += 1 self.num_timesteps += env.num_envs if isinstance(self.action_space, gym.spaces.Discrete): # Reshape in case of discrete action actions = actions.reshape(-1, 1) rollout_buffer.add(self._last_obs, actions, rewards, dones, values, log_probs) self._last_obs = new_obs rollout_buffer.compute_returns_and_advantage(values, dones=dones) callback.on_rollout_end() return True
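# A minimal sketch of the generalized advantage estimation that
# `rollout_buffer.compute_returns_and_advantage` typically performs for this kind of
# on-policy buffer. It is an illustration only, not the buffer's actual code; the
# array shapes (n_steps, n_envs) and the gamma / gae_lambda defaults are assumptions.
import numpy as np

def compute_gae(rewards, values, dones, last_values, gamma=0.99, gae_lambda=0.95):
    # rewards, values, dones: arrays of shape (n_steps, n_envs);
    # dones[t] == 1 means the episode ended after step t.
    # last_values: value estimate for the observation after the final step, shape (n_envs,).
    n_steps = len(rewards)
    advantages = np.zeros_like(rewards)
    last_gae = 0.0
    for t in reversed(range(n_steps)):
        next_values = last_values if t == n_steps - 1 else values[t + 1]
        next_non_terminal = 1.0 - dones[t]
        # TD error, with the bootstrap term masked out at episode boundaries
        delta = rewards[t] + gamma * next_values * next_non_terminal - values[t]
        last_gae = delta + gamma * gae_lambda * next_non_terminal * last_gae
        advantages[t] = last_gae
    returns = advantages + values
    return advantages, returns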
def collect_rollouts(self, env: VecEnv, callback: BaseCallback, rollout_buffer: TrajRolloutBuffer, n_rollout_steps: int) -> bool: """ Collect rollouts using the current policy and fill a `RolloutBuffer`. :param env: (VecEnv) The training environment :param callback: (BaseCallback) Callback that will be called at each step (and at the beginning and end of the rollout) :param rollout_buffer: (RolloutBuffer) Buffer to fill with rollouts :param n_steps: (int) Number of experiences to collect per environment :return: (bool) True if function returned with at least `n_rollout_steps` collected, False if callback terminated rollout prematurely. """ assert self._last_obs is not None, "No previous observation was provided" n_steps = 0 rollout_buffer.reset() # Sample new weights for the state dependent exploration if self.use_sde: self.policy.reset_noise(env.num_envs) callback.on_rollout_start() # while n_steps < n_rollout_steps: while not rollout_buffer.full: if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.policy.reset_noise(env.num_envs) with th.no_grad(): # Convert to pytorch tensor obs_ctx_tensor = th.as_tensor(self._last_obs).to( self.device) # (num_agents,) + (obs_dim,) actions, values, log_probs = self.policy.forward( obs_ctx_tensor) actions = actions.cpu().numpy() # Rescale and perform action clipped_actions = actions # Clip the actions to avoid out of bound error if isinstance(self.action_space, gym.spaces.Box): clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) # action_dict = zip(enumerate(clipped_actions)) new_obs, rewards, dones, infos = env.step( clipped_actions) # env step takes np.array, returns np.array? # TODO: figure out where to put this reset: maybe in an env wrapper like they did if dones[self.env.num_agents] == 1: env.reset() if callback.on_step() is False: return False self._update_info_buffer(infos) n_steps += 1 # self.num_timesteps += env.num_envs self.num_timesteps += 1 if isinstance(self.action_space, gym.spaces.Discrete): # Reshape in case of discrete action actions = actions.reshape(-1, 1) _obs = self._last_obs[..., 0:self.env.obs_size] _ctx = self._last_obs[..., self.env.obs_size:] # TODO: need to fix in the case of new number of agents, since range(len(last_obs)) will be incorrect for i in range(len(self._last_obs)): rollout_buffer.add(agent_id=i, context=_ctx[i], done=self._last_dones[i], obs=_obs[i], action=actions[i], reward=rewards[i], value=values[i], log_prob=log_probs[i]) # for i, state_ctx_pair in enumerate(zip(*_obs, _ctx)): # print(state_ctx_pair) # rollout_buffer.add(agent_id=i, # context=state_ctx_pair[-1], # done=self._last_dones[i], # obs=state_ctx_pair[0:-1], # action=actions[i], # reward=rewards[i], # value=values[i], # log_prob=log_probs[i]) # TODO: needs modification! # rollout_buffer.add(self._last_obs, actions, rewards, self._last_dones, values, log_probs) self._last_obs = new_obs self._last_dones = dones rollout_buffer.compute_returns_and_advantage(values) callback.on_rollout_end() return True
def collect_rollouts(self, env: VecEnv, callback: BaseCallback, rollout_buffer: RolloutBuffer, n_rollout_steps: int = 256) -> bool: assert self._last_obs is not None, "No previous observation was provided" n_steps = 0 rollout_buffer.reset() # Sample new weights for the state dependent exploration if self.use_sde: self.policy.reset_noise(env.num_envs) callback.on_rollout_start() while n_steps < n_rollout_steps: if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.policy.reset_noise(env.num_envs) with th.no_grad(): # Convert to pytorch tensor obs_tensor = th.as_tensor(self._last_obs).to(self.device) actions, values, log_probs = self.policy.forward(obs_tensor) actions = actions.cpu().numpy() # Rescale and perform action clipped_actions = actions # Clip the actions to avoid out of bound error if isinstance(self.action_space, gym.spaces.Box): clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) new_obs, rewards, dones, infos = env.step(clipped_actions) if callback.on_step() is False: return False self._update_info_buffer(infos) n_steps += 1 self.num_timesteps += env.num_envs if isinstance(self.action_space, gym.spaces.Discrete): # Reshape in case of discrete action actions = actions.reshape(-1, 1) rollout_buffer.add(self._last_obs, actions, rewards, dones, values, log_probs) self._last_obs = new_obs rollout_buffer.compute_returns_and_advantage(values, dones=dones) # # MSA debugging learning # try: # import copy # c_rb = copy.copy (rollout_buffer) # self.rollout_buffer_hist.append(c_rb) # except: # pass # if len(self.rollout_buffer_hist) == 25: # import matplotlib.pyplot as plt # n_envs = 4 # V = np.empty((0,n_envs), float) # A = np.empty((0,n_envs), float) # R = np.empty((0,n_envs), float) # lp = np.empty((0,n_envs), float) # r = np.empty((0,n_envs), float) # a = np.empty((0,n_envs, actions.shape[1]), float) # S = np.empty((0,n_envs, new_obs.shape[1]), float) # for rb in self.rollout_buffer_hist: # V = np.append (V, rb.values, axis=0) # A = np.append (A, rb.advantages, axis=0) # R = np.append (R, rb.returns, axis=0) # lp = np.append (lp, rb.log_probs, axis=0) # r = np.append (r, rb.rewards, axis=0) # a = np.append (a, rb.actions, axis=0) # S = np.append (S, rb.observations, axis=0) # plt.plot (V) # plt.title ('Values') # dir_no = "2" # filename = "RL_detailed_plots/"+ dir_no + "/V.png" # plt.savefig(filename) # plt.close () # # plt.plot (A) # plt.title ('Advantages') # filename = "RL_detailed_plots/"+ dir_no + "/A.png" # plt.savefig(filename) # plt.close () # # plt.plot (R) # plt.title ('Returns') # filename = "RL_detailed_plots/"+ dir_no + "/R.png" # plt.savefig(filename) # plt.close () # # plt.plot (lp) # plt.title ('Log Probs') # filename = "RL_detailed_plots/"+ dir_no + "/lp.png" # plt.savefig(filename) # plt.close () # # plt.plot (r) # plt.title ('rewards') # filename = "RL_detailed_plots/"+ dir_no + "/rew.png" # plt.savefig(filename) # plt.close () # # try: # fig, axes = plt.subplots (nrows=actions.shape[1], ncols=1, figsize=(8, actions.shape[1])) # for i in range (actions.shape[1]): # axes[i].plot (a[:, :, i]) # plt.suptitle ('Actions', y=1) # filename = "RL_detailed_plots/" + dir_no + "/act.png" # plt.savefig (filename) # plt.close() # except: # plt.plot (a[:, :, 0]) # plt.title ('Actions') # filename = "RL_detailed_plots/" + dir_no + "/act.png" # plt.savefig (filename) # plt.close() # # fig, axes = plt.subplots (nrows= new_obs.shape[1], ncols=1, figsize=(8, 2*new_obs.shape[1])) # for i in 
range ( new_obs.shape[1]): # axes[i].plot (S[:, :, i]) # axes[i].plot (S[:, :, i]) # plt.suptitle ('States', y=1) # filename = "RL_detailed_plots/" + dir_no + "/S.png" # plt.savefig (filename) # plt.close() callback.on_rollout_end() return True
def collect_rollouts(self, env: VecEnv, callback: BaseCallback, rollout_buffer: RolloutBuffer, n_rollout_steps: int) -> bool: """ Collect rollouts using the current policy and fill a `RolloutBuffer`. :param env: The training environment :param callback: Callback that will be called at each step (and at the beginning and end of the rollout) :param rollout_buffer: Buffer to fill with rollouts :param n_steps: Number of experiences to collect per environment :return: True if function returned with at least `n_rollout_steps` collected, False if callback terminated rollout prematurely. """ assert self._last_obs is not None, "No previous observation was provided" n_steps = 0 rollout_buffer.reset() # Sample new weights for the state dependent exploration if self.use_sde: self.policy.reset_noise(env.num_envs) callback.on_rollout_start() while n_steps < n_rollout_steps: self.policy.set_robot_id( self.policy.all_robot_ids ) # reset robot id before collecting rollouts if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.policy.reset_noise(env.num_envs) with th.no_grad(): # Convert to pytorch tensor obs_tensor = th.as_tensor(self._last_obs).to(self.device) actions, values, log_probs = self.policy.forward(obs_tensor) actions = actions.cpu().numpy() # Rescale and perform action clipped_actions = actions # Clip the actions to avoid out of bound error if isinstance(self.action_space, gym.spaces.Box): clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) new_obs, rewards, dones, infos = env.step(clipped_actions) self.num_timesteps += env.num_envs # Give access to local variables callback.update_locals(locals()) if callback.on_step() is False: return False self._update_info_buffer(infos) n_steps += 1 if isinstance(self.action_space, gym.spaces.Discrete): # Reshape in case of discrete action actions = actions.reshape(-1, 1) rollout_buffer.add(self._last_obs, actions, rewards, self._last_dones, values, log_probs) self._last_obs = new_obs self._last_dones = dones with th.no_grad(): # Compute value for the last timestep obs_tensor = th.as_tensor(new_obs).to(self.device) _, values, _ = self.policy.forward(obs_tensor) rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones) callback.on_rollout_end() return True
def generate_trajectories(
    policy,
    venv: VecEnv,
    sample_until: GenTrajTerminationFn,
    *,
    deterministic_policy: bool = False,
    rng: np.random.RandomState = np.random,
) -> Sequence[types.TrajectoryWithRew]:
    """Generate trajectory dictionaries from a policy and an environment.

    Args:
        policy (Callable, BasePolicy, or BaseAlgorithm): A function mapping
            observation to action, a stable_baselines3 policy, or an algorithm
            trained on the gym environment.
        venv: The vectorized environments to interact with.
        sample_until: A function determining the termination condition.
            It takes a sequence of trajectories and returns a bool.
            Most users will want to use one of `min_episodes` or `min_timesteps`.
        deterministic_policy: If True, asks policy to deterministically return action.
            Note the trajectories might still be non-deterministic if the environment
            has non-determinism!
        rng: used for shuffling trajectories.

    Returns:
        Sequence of trajectories, satisfying `sample_until`. Additional trajectories
        may be collected to avoid biasing the process towards short episodes; the
        user should truncate if required.
    """
    if isinstance(policy, BaseAlgorithm):
        policy.set_env(venv)

    # Collect rollout tuples.
    trajectories = []
    # accumulator for incomplete trajectories
    trajectories_accum = TrajectoryAccumulator()
    obs = venv.reset()
    for env_idx, ob in enumerate(obs):
        # Seed with first obs only. Inside loop, we'll only add second obs from
        # each (s,a,r,s') tuple, under the same "obs" key again. That way we still
        # get all observations, but they're not duplicated into "next obs" and
        # "previous obs" (this matters for, e.g., Atari, where observations are
        # really big).
        trajectories_accum.add_step(dict(obs=ob), env_idx)

    # Now, we sample until `sample_until(trajectories)` is true.
    # If we just stopped then this would introduce a bias towards shorter episodes,
    # since longer episodes are more likely to still be active, i.e. in the process
    # of being sampled from. To avoid this, we continue sampling until all episodes
    # are complete.
    #
    # To start with, all environments are active.
    active = np.ones(venv.num_envs, dtype=bool)
    while np.any(active):
        if isinstance(policy, Callable):
            acts = policy(obs)
        else:
            acts, _ = policy.predict(obs, deterministic=deterministic_policy)
        obs, rews, dones, infos = venv.step(acts)

        # If an environment is inactive, i.e. the episode completed for that
        # environment after `sample_until(trajectories)` was true, then we do
        # *not* want to add any subsequent trajectories from it. We avoid this
        # by just making it never done.
        dones &= active

        new_trajs = trajectories_accum.add_steps_and_auto_finish(
            acts, obs, rews, dones, infos)
        trajectories.extend(new_trajs)

        if sample_until(trajectories):
            # Termination condition has been reached. Mark as inactive any
            # environments where a trajectory was completed this timestep.
            active &= ~dones

    # Note that we just drop partial trajectories. This is not ideal for some
    # algos; e.g. BC can probably benefit from partial trajectories, too.

    # Each trajectory is sampled i.i.d.; however, shorter episodes are added to
    # `trajectories` sooner. Shuffle to avoid bias in order. This is important
    # when callees end up truncating the number of trajectories or transitions.
    # It is also cheap, since we're just shuffling pointers.
    rng.shuffle(trajectories)

    # Sanity checks.
    for trajectory in trajectories:
        n_steps = len(trajectory.acts)
        # extra 1 for the end
        exp_obs = (n_steps + 1, ) + venv.observation_space.shape
        real_obs = trajectory.obs.shape
        assert real_obs == exp_obs, f"expected shape {exp_obs}, got {real_obs}"
        exp_act = (n_steps, ) + venv.action_space.shape
        real_act = trajectory.acts.shape
        assert real_act == exp_act, f"expected shape {exp_act}, got {real_act}"
        exp_rew = (n_steps, )
        real_rew = trajectory.rews.shape
        assert real_rew == exp_rew, f"expected shape {exp_rew}, got {real_rew}"

    return trajectories
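# Hypothetical usage sketch for `generate_trajectories` above. The names `venv`
# (a VecEnv) and `algo` (a trained stable-baselines3 algorithm) are assumed to be
# defined elsewhere; the termination predicate only follows the documented contract
# (takes a sequence of trajectories, returns a bool).
def at_least_n_episodes(n: int):
    def sample_until(trajectories) -> bool:
        return len(trajectories) >= n
    return sample_until

# trajs = generate_trajectories(algo, venv,
#                               sample_until=at_least_n_episodes(10),
#                               deterministic_policy=True)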
def collect_rollouts(
    self,
    env: VecEnv,
    callback: BaseCallback,
    rollout_buffer: RolloutBuffer,
    n_rollout_steps: int,
) -> bool:
    """
    Collect experiences using the current policy and fill a ``RolloutBuffer``.

    The term rollout here refers to the model-free notion and should not
    be used with the concept of rollout used in model-based RL or planning.

    :param env: The training environment
    :param callback: Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param rollout_buffer: Buffer to fill with rollouts
    :param n_rollout_steps: Number of experiences to collect per environment
    :return: True if function returned with at least `n_rollout_steps`
        collected, False if callback terminated rollout prematurely.
    """
    assert self._last_obs is not None, "No previous observation was provided"
    # Switch to eval mode (this affects batch norm / dropout)
    self.policy.set_training_mode(False)

    n_steps = 0
    rollout_buffer.reset()
    # Sample new weights for the state dependent exploration
    if self.use_sde:
        self.policy.reset_noise(env.num_envs)

    callback.on_rollout_start()

    while n_steps < n_rollout_steps:
        if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
            # Sample a new noise matrix
            self.policy.reset_noise(env.num_envs)

        with th.no_grad():
            # Convert to pytorch tensor or to TensorDict
            obs_tensor = obs_as_tensor(self._last_obs, self.device)
            actions, values, log_probs = self.policy(obs_tensor)
        actions = actions.cpu().numpy()

        # Rescale and perform action
        clipped_actions = actions
        # Clip the actions to avoid out of bound error
        if isinstance(self.action_space, gym.spaces.Box):
            clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)

        new_obs, rewards, dones, infos = env.step(clipped_actions)

        self.num_timesteps += env.num_envs

        # Give access to local variables
        callback.update_locals(locals())
        if callback.on_step() is False:
            return False

        self._update_info_buffer(infos)
        n_steps += 1

        if isinstance(self.action_space, gym.spaces.Discrete):
            # Reshape in case of discrete action
            actions = actions.reshape(-1, 1)

        # Handle timeout by bootstrapping with value function
        # see GitHub issue #633
        for idx, done in enumerate(dones):
            if (
                done
                and infos[idx].get("terminal_observation") is not None
                and infos[idx].get("TimeLimit.truncated", False)
            ):
                terminal_obs = self.policy.obs_to_tensor(infos[idx]["terminal_observation"])[0]
                with th.no_grad():
                    terminal_value = self.policy.predict_values(terminal_obs)[0]
                rewards[idx] += self.gamma * terminal_value

        rollout_buffer.add(self._last_obs, actions, rewards, self._last_episode_starts, values, log_probs)
        self._last_obs = new_obs
        self._last_episode_starts = dones

    with th.no_grad():
        # Compute value for the last timestep
        values = self.policy.predict_values(obs_as_tensor(new_obs, self.device))

    rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones)

    callback.on_rollout_end()

    return True
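# Standalone illustration of the time-limit bootstrap applied above: when an episode
# is truncated by a TimeLimit wrapper rather than genuinely terminating, the missing
# future return is approximated by gamma * V(terminal_obs) and folded into the reward.
# The names here are placeholders for this sketch, not attributes of the algorithm class.
def bootstrap_truncated_reward(reward: float, terminal_value: float, gamma: float,
                               truncated: bool) -> float:
    # Only truncated episodes receive the correction; true terminal states keep the raw reward.
    return reward + gamma * terminal_value if truncated else reward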
def collect_rollouts(self, env: VecEnv,
                     callback: BaseCallback,
                     n_episodes: int = 1,
                     n_steps: int = -1,
                     action_noise: Optional[ActionNoise] = None,
                     learning_starts: int = 0,
                     replay_buffer: Optional[ReplayBuffer] = None,
                     log_interval: Optional[int] = None) -> RolloutReturn:
    """
    Collect experiences and store them into a ReplayBuffer.

    :param env: (VecEnv) The training environment
    :param callback: (BaseCallback) Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param n_episodes: (int) Number of episodes to use to collect rollout data.
        You can also specify ``n_steps`` instead.
    :param n_steps: (int) Number of steps to use to collect rollout data.
        You can also specify ``n_episodes`` instead.
    :param action_noise: (Optional[ActionNoise]) Action noise that will be used for exploration.
        Required for deterministic policy (e.g. TD3). This can also be used
        in addition to the stochastic policy for SAC.
    :param learning_starts: (int) Number of steps before learning for the warm-up phase.
    :param replay_buffer: (ReplayBuffer)
    :param log_interval: (int) Log data every ``log_interval`` episodes
    :return: (RolloutReturn)
    """
    episode_rewards, total_timesteps = [], []
    total_steps, total_episodes = 0, 0

    assert isinstance(env, VecEnv), "You must pass a VecEnv"
    assert env.num_envs == 1, "OffPolicyAlgorithm only supports a single environment"

    if self.use_sde:
        self.actor.reset_noise()

    callback.on_rollout_start()
    continue_training = True

    while total_steps < n_steps or total_episodes < n_episodes:
        done = False
        episode_reward, episode_timesteps = 0.0, 0

        while not done:
            if self.use_sde and self.sde_sample_freq > 0 and total_steps % self.sde_sample_freq == 0:
                # Sample a new noise matrix
                self.actor.reset_noise()

            # Select action randomly or according to policy
            action, buffer_action, is_random_action = self._sample_action(learning_starts, action_noise)
            # print("action/buffer action/israndomaction ", action, buffer_action, is_random_action)
            # action, buffer_action = self._sample_action(learning_starts, action_noise)

            # Rescale and perform action
            new_obs, reward, done, infos = env.step(action)
            # print("Reward: ", reward)
            # print("Observation:", new_obs)
            self.period_counter += 1
            self.current_observation = new_obs

            # Only stop training if return value is False, not when it is None.
            if callback.on_step() is False:
                return RolloutReturn(0.0, total_steps, total_episodes, continue_training=False)

            episode_reward += reward

            # Retrieve reward and episode length if using Monitor wrapper
            self._update_info_buffer(infos, done)

            # Store data in replay buffer
            if replay_buffer is not None:
                # Store only the unnormalized version
                if self._vec_normalize_env is not None:
                    new_obs_ = self._vec_normalize_env.get_original_obs()
                    reward_ = self._vec_normalize_env.get_original_reward()
                else:
                    # Avoid changing the original ones
                    self._last_original_obs, new_obs_, reward_ = self._last_obs, new_obs, reward

                replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done)

            old_observation = self._last_original_obs
            self._last_obs = new_obs
            # Save the unnormalized observation
            if self._vec_normalize_env is not None:
                self._last_original_obs = new_obs_

            self.num_timesteps += 1
            episode_timesteps += 1
            total_steps += 1

            self._update_current_progress_remaining(self.num_timesteps, self._total_timesteps)

            # For DQN, check if the target network should be updated
            # and update the exploration schedule
            # For SAC/TD3, the update is done as the same time as the gradient update
            # see https://github.com/hill-a/stable-baselines/issues/900
            self._on_step()

            if 0 < n_steps <= total_steps:
                break

            # Compute the target Q values
            target_st1 = self.q_net_target(th.tensor(new_obs))
            # Follow greedy policy: use the one with the highest value
            target_st1, _ = target_st1.max(dim=1)
            # Avoid potential broadcast issue
            target_st1 = float(target_st1.reshape(-1, 1))

            # Compute the target Q values
            target_st = self.q_net_target(th.tensor(old_observation))
            # Follow greedy policy: use the one with the highest value
            target_st, _ = target_st.max(dim=1)
            # Avoid potential broadcast issue -> IS THIS NECESSARY?
            target_st = float(target_st.reshape(-1, 1))

            # TODO: Which of the three indices of target_st do we take? -> the maximum
            # TODO 2: according to the paper, rho is computed first, but here we do it one step delayed
            # buffer_action.astype(int)[0]
            if is_random_action == 0:
                decayed_alpha = self.exp_decay_alpha()
                self.rho = (1 - decayed_alpha) * self.rho + decayed_alpha * (reward_ + target_st1 - target_st)

            # Fixed observation for debugging purposes
            # TODO: GENERATE RANDOM NUMBERS OR COME UP WITH BETTER NUMBERS
            obs2 = th.tensor([[10., 0., 0., 0., 5., 0., 7., 1., 0., 0., 5., 1.,
                               6., 1., 0., 0., 3., 0., 8., 0., 0., 0., 3., 2.,
                               14., 2., 0., 0., 4., 2., 17., 1., 0., 0., 3., 1.,
                               10., 0., 0., 0., 5., 0., 7., 1., 0., 0., 5., 1.,
                               6., 1., 0., 0., 3., 0., 8., 0., 0., 0., 3., 2.,
                               14., 2., 0., 0., 4., 2., 17., 1., 0., 0., 3., 1.,
                               10., 0., 0., 0., 5., 0., 7., 1., 0., 0., 5., 1.,
                               6., 1., 0., 0., 3., 0., 8., 0., 0., 0., 3., 2.,
                               10., 0., 0., 0., 5., 0., 7., 1., 0., 0., 5., 1.,
                               10., 0., 0., 0., 5., 0., 7., 1., 0., 0., 5., 1.,
                               10., 0., 0., 0., 5., 0., 7., 1., 0., 0., 5., 1.]])
            fix_observation = self.q_net._predict(obs2)[1][0]

            with open('../' + 'q_values_learned_results.csv', mode='a') as results_CSV:
                results_writer = csv.writer(results_CSV, delimiter='\t', quotechar='"',
                                            quoting=csv.QUOTE_MINIMAL)
                results_writer.writerow(
                    [self.period_counter, float(fix_observation[0]),
                     float(fix_observation[1]), float(fix_observation[2])])

            # Write reward to CSV file after each period
            with open('../' + 'rewards_per_period.csv', mode='a') as rewards_per_period_CSV:
                results_writer = csv.writer(rewards_per_period_CSV, delimiter='\t', quotechar='"',
                                            quoting=csv.QUOTE_MINIMAL)
                results_writer.writerow([self.period_counter, float(reward_), float(self.rho)])

        if done:
            total_episodes += 1
            self._episode_num += 1
            episode_rewards.append(episode_reward)
            total_timesteps.append(episode_timesteps)

            if action_noise is not None:
                action_noise.reset()

            # Log training infos
            if log_interval is not None and self._episode_num % log_interval == 0:
                self._dump_logs()

    mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0

    callback.on_rollout_end()

    return RolloutReturn(mean_reward, total_steps, total_episodes, continue_training)
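# Isolated sketch of the average-reward update used above: `rho` tracks an
# exponentially decayed estimate of r + max_a Q(s', a) - max_a Q(s, a), and is updated
# only on greedy (non-random) actions. The argument names are placeholders for the
# tensors computed in the snippet.
def update_rho(rho: float, alpha: float, reward: float,
               q_next_max: float, q_curr_max: float) -> float:
    # rho <- (1 - alpha) * rho + alpha * (r + max_a Q(s', a) - max_a Q(s, a))
    return (1.0 - alpha) * rho + alpha * (reward + q_next_max - q_curr_max)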
def collect_rollouts( self, env: VecEnv, callback: BaseCallback, n_episodes: int = 1, n_steps: int = -1, action_noise: Optional[ActionNoise] = None, learning_starts: int = 0, replay_buffer: Optional[ReplayBuffer] = None, log_interval: Optional[int] = None, ) -> RolloutReturn: """ Collect experiences and store them into a ``ReplayBuffer``. :param env: The training environment :param callback: Callback that will be called at each step (and at the beginning and end of the rollout) :param n_episodes: Number of episodes to use to collect rollout data You can also specify a ``n_steps`` instead :param n_steps: Number of steps to use to collect rollout data You can also specify a ``n_episodes`` instead. :param action_noise: Action noise that will be used for exploration Required for deterministic policy (e.g. TD3). This can also be used in addition to the stochastic policy for SAC. :param learning_starts: Number of steps before learning for the warm-up phase. :param replay_buffer: :param log_interval: Log data every ``log_interval`` episodes :return: """ episode_rewards, total_timesteps = [], [] total_steps, total_episodes = 0, 0 assert isinstance(env, VecEnv), "You must pass a VecEnv" assert env.num_envs == 1, "OffPolicyAlgorithm only support single environment" if self.use_sde: self.actor.reset_noise() callback.on_rollout_start() continue_training = True while total_steps < n_steps or total_episodes < n_episodes: done = False episode_reward, episode_timesteps = 0.0, 0 while not done: if self.use_sde and self.sde_sample_freq > 0 and total_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.actor.reset_noise() # Select action randomly or according to policy action, buffer_action = self._sample_action( learning_starts, action_noise) # Rescale and perform action new_obs, reward, done, infos = env.step(action) self.num_timesteps += 1 episode_timesteps += 1 total_steps += 1 # Give access to local variables callback.update_locals(locals()) # Only stop training if return value is False, not when it is None. 
if callback.on_step() is False: return RolloutReturn(0.0, total_steps, total_episodes, continue_training=False) episode_reward += reward # Retrieve reward and episode length if using Monitor wrapper self._update_info_buffer(infos, done) # Store data in replay buffer if replay_buffer is not None: # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs() reward_ = self._vec_normalize_env.get_original_reward() else: # Avoid changing the original ones self._last_original_obs, new_obs_, reward_ = self._last_obs, new_obs, reward replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done) self._last_obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: self._last_original_obs = new_obs_ self._update_current_progress_remaining( self.num_timesteps, self._total_timesteps) # For DQN, check if the target network should be updated # and update the exploration schedule # For SAC/TD3, the update is done as the same time as the gradient update # see https://github.com/hill-a/stable-baselines/issues/900 self._on_step() if 0 < n_steps <= total_steps: break if done: total_episodes += 1 self._episode_num += 1 episode_rewards.append(episode_reward) total_timesteps.append(episode_timesteps) if action_noise is not None: action_noise.reset() # Log training infos if log_interval is not None and self._episode_num % log_interval == 0: self._dump_logs() mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0 callback.on_rollout_end() return RolloutReturn(mean_reward, total_steps, total_episodes, continue_training)
def collect_rollouts( self, env: VecEnv, callback: BaseCallback, train_freq: TrainFreq, action_noise: Optional[ActionNoise] = None, learning_starts: int = 0, log_interval: Optional[int] = None, ) -> RolloutReturn: """ Collect experiences and store them into a ReplayBuffer. :param env: The training environment :param callback: Callback that will be called at each step (and at the beginning and end of the rollout) :param train_freq: How much experience to collect by doing rollouts of current policy. Either ``TrainFreq(<n>, TrainFrequencyUnit.STEP)`` or ``TrainFreq(<n>, TrainFrequencyUnit.EPISODE)`` with ``<n>`` being an integer greater than 0. :param action_noise: Action noise that will be used for exploration Required for deterministic policy (e.g. TD3). This can also be used in addition to the stochastic policy for SAC. :param learning_starts: Number of steps before learning for the warm-up phase. :param log_interval: Log data every ``log_interval`` episodes :return: """ episode_rewards, total_timesteps = [], [] num_collected_steps, num_collected_episodes = 0, 0 assert isinstance(env, VecEnv), "You must pass a VecEnv" assert env.num_envs == 1, "OffPolicyAlgorithm only support single environment" assert train_freq.frequency > 0, "Should at least collect one step or episode." if self.model.use_sde: self.actor.reset_noise() callback.on_rollout_start() continue_training = True while should_collect_more_steps(train_freq, num_collected_steps, num_collected_episodes): done = False episode_reward, episode_timesteps = 0.0, 0 while not done: # concatenate observation and (desired) goal observation = self._last_obs self._last_obs = ObsDictWrapper.convert_dict(observation) if (self.model.use_sde and self.model.sde_sample_freq > 0 and num_collected_steps % self.model.sde_sample_freq == 0): # Sample a new noise matrix self.actor.reset_noise() # Select action randomly or according to policy self.model._last_obs = self._last_obs action, buffer_action = self._sample_action( learning_starts, action_noise) # Perform action new_obs, reward, done, infos = env.step(action) self.num_timesteps += 1 self.model.num_timesteps = self.num_timesteps episode_timesteps += 1 num_collected_steps += 1 # Only stop training if return value is False, not when it is None. 
if callback.on_step() is False: return RolloutReturn(0.0, num_collected_steps, num_collected_episodes, continue_training=False) episode_reward += reward # Retrieve reward and episode length if using Monitor wrapper self._update_info_buffer(infos, done) self.model.ep_info_buffer = self.ep_info_buffer self.model.ep_success_buffer = self.ep_success_buffer # == Store transition in the replay buffer and/or in the episode storage == if self._vec_normalize_env is not None: # Store only the unnormalized version new_obs_ = self._vec_normalize_env.get_original_obs() reward_ = self._vec_normalize_env.get_original_reward() else: # Avoid changing the original ones self._last_original_obs, new_obs_, reward_ = observation, new_obs, reward self.model._last_original_obs = self._last_original_obs # As the VecEnv resets automatically, new_obs is already the # first observation of the next episode if done and infos[0].get("terminal_observation") is not None: next_obs = infos[0]["terminal_observation"] # VecNormalize normalizes the terminal observation if self._vec_normalize_env is not None: next_obs = self._vec_normalize_env.unnormalize_obs( next_obs) else: next_obs = new_obs_ if self.online_sampling: self.replay_buffer.add(self._last_original_obs, next_obs, buffer_action, reward_, done, infos) else: # concatenate observation with (desired) goal flattened_obs = ObsDictWrapper.convert_dict( self._last_original_obs) flattened_next_obs = ObsDictWrapper.convert_dict(next_obs) # add to replay buffer self.replay_buffer.add(flattened_obs, flattened_next_obs, buffer_action, reward_, done) # add current transition to episode storage self._episode_storage.add(self._last_original_obs, next_obs, buffer_action, reward_, done, infos) self._last_obs = new_obs self.model._last_obs = self._last_obs # Save the unnormalized new observation if self._vec_normalize_env is not None: self._last_original_obs = new_obs_ self.model._last_original_obs = self._last_original_obs self.model._update_current_progress_remaining( self.num_timesteps, self._total_timesteps) # For DQN, check if the target network should be updated # and update the exploration schedule # For SAC/TD3, the update is done as the same time as the gradient update # see https://github.com/hill-a/stable-baselines/issues/900 self.model._on_step() self.episode_steps += 1 if not should_collect_more_steps(train_freq, num_collected_steps, num_collected_episodes): break if done or self.episode_steps >= self.max_episode_length: if self.online_sampling: self.replay_buffer.store_episode() else: self._episode_storage.store_episode() # sample virtual transitions and store them in replay buffer self._sample_her_transitions() # clear storage for current episode self._episode_storage.reset() num_collected_episodes += 1 self._episode_num += 1 self.model._episode_num = self._episode_num episode_rewards.append(episode_reward) total_timesteps.append(episode_timesteps) if action_noise is not None: action_noise.reset() # Log training infos if log_interval is not None and self._episode_num % log_interval == 0: self._dump_logs() self.episode_steps = 0 mean_reward = np.mean( episode_rewards) if num_collected_episodes > 0 else 0.0 callback.on_rollout_end() return RolloutReturn(mean_reward, num_collected_steps, num_collected_episodes, continue_training)
def collect_rollouts( self, env: VecEnv, callback: BaseCallback, train_freq: TrainFreq, buffer: TrajectoryBuffer, action_noise: Optional[ActionNoise] = None, learning_starts: int = 0, log_interval: Optional[int] = None, ) -> RolloutReturn: """ Collect experiences and store them into a ``TrajectoryBuffer``. :param env: The training environment :param callback: Callback that will be called at each step (and at the beginning and end of the rollout) :param train_freq: How much experience to collect by doing rollouts of current policy. Either ``TrainFreq(<n>, TrainFrequencyUnit.STEP)`` or ``TrainFreq(<n>, TrainFrequencyUnit.EPISODE)`` with ``<n>`` being an integer greater than 0. :param action_noise: Action noise that will be used for exploration Required for deterministic policy (e.g. TD3). This can also be used in addition to the stochastic policy for SAC. :param learning_starts: Number of steps before learning for the warm-up phase. :param trajectory_buffer: :param log_interval: Log data every ``log_interval`` episodes :return: """ episode_rewards, total_timesteps = [], [] num_collected_steps, num_collected_episodes = 0, 0 assert isinstance(env, VecEnv), "You must pass a VecEnv" # assert env.num_envs == 1, "OffPolicyAlgorithm only support single environment" assert train_freq.frequency > 0, "Should at least collect one step or episode." if self.use_sde: self.actor.reset_noise() callback.on_rollout_start() continue_training = True self.rollout_buffer.reset() done = np.array([False for i in range(self.n_envs)]) episode_reward, episode_timesteps = [0.0 for i in range(self.n_envs)], [0 for i in range(self.n_envs)] if train_freq.unit == TrainFrequencyUnit.STEP: self.trajectories = [Trajectory(self.device) for i in range(self.n_envs)] while True: ms = [0] get_ms(ms) if self.use_sde and self.sde_sample_freq > 0 and num_collected_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.actor.reset_noise() # Select action randomly or according to policy with th.no_grad(): # action, buffer_action = self._sample_action(learning_starts, action_noise, use_behav=False) # log_probs = self.policy.get_action_log_probs(th.tensor(np.array(self._last_obs)).to(self.device), th.tensor(np.array([action])).T.to(self.device), use_behav=False) action, buffer_action = self._sample_action(learning_starts, action_noise, use_behav=True) log_probs = self.policy.get_action_log_probs(th.tensor(np.array(self._last_obs)).to(self.device), th.tensor(np.array([action])).T.to(self.device), use_behav=True) prob = th.exp(log_probs) prob = (1 - self.exploration_rate) * prob + (self.exploration_rate) * (1.0 / self.action_space.n) prob = prob.cpu().numpy() if (prob > 1).any(): print("prob > 1!!! => Code in offpac.py") print(prob) print(th.tensor(log_probs)) exit() new_obs, reward, done, infos = env.step(action) with th.no_grad(): if self.use_v_net: latent_pi, latent_vf, latent_sde = self.policy._get_latent(th.tensor(self._last_obs).to(self.device)) values = self.value_net(latent_vf).detach() else: values = self.policy.compute_value(th.tensor(self._last_obs).to(self.device), use_target_v=False).detach() # self.rollout_buffer.add(self._last_obs, action.reshape(-1, 1), reward, self._last_episode_starts, values, log_probs.flatten()) self.num_timesteps += env.num_envs num_collected_steps += env.num_envs # Give access to local variables callback.update_locals(locals()) # Only stop training if return value is False, not when it is None. 
if callback.on_step() is False: return RolloutReturn(0.0, num_collected_steps, num_collected_episodes, continue_training=False) episode_reward += reward # Retrieve reward and episode length if using Monitor wrapper self._update_info_buffer(infos, done) for i in range(len(self.trajectories)): # trajectories[i].add(Transition(self._last_obs[i], action[i], reward[i], new_obs[i], done[i], prob[i])) if done[i]: if infos[i]['terminal_observation'].dtype == np.float64: self.trajectories[i].add(Transition(self._last_obs[i], action[i], reward[i], infos[i]['terminal_observation'].astype(np.float32), done[i], prob[i])) else: self.trajectories[i].add(Transition(self._last_obs[i], action[i], reward[i], infos[i]['terminal_observation'], done[i], prob[i])) else: self.trajectories[i].add(Transition(self._last_obs[i], action[i], reward[i], new_obs[i], done[i], prob[i])) self._last_obs = new_obs self._last_episode_starts = done self._update_current_progress_remaining(self.num_timesteps, self._total_timesteps) # For DQN, check if the target network should be updated # and update the exploration schedule # For SAC/TD3, the update is done as the same time as the gradient update # see https://github.com/hill-a/stable-baselines/issues/900 self._on_step() ''' if not should_collect_more_steps(train_freq, num_collected_steps, num_collected_episodes): # even if the episdoe is not finished, we store the trajectory because no more steps can be performed for traj_i, traj in enumerate(trajectories): self._store_transition(buffer, traj) total_timesteps.append(len(traj)) trajectories[traj_i] = Trajectory(self.device) episode_rewards.append(episode_reward[traj_i]) episode_reward[traj_i] = 0.0 break ''' # store transition of finished episode, but if not more steps can be collected, treat any trajectory as an episode if done.any(): num_collected_episodes += np.sum(done) self._episode_num += np.sum(done) if log_interval is not None and self._episode_num % log_interval == 0: self._dump_logs() if train_freq.unit == TrainFrequencyUnit.STEP: ending = not should_collect_more_steps(train_freq, num_collected_steps//self.n_envs, num_collected_episodes//self.n_envs) # if ending, save all trajectories, otherwise only save done episode if ending: for traj_i, traj in enumerate(self.trajectories): self._store_transition(buffer, traj) # total_timesteps.append(len(traj)) # is this line affecting anything???? self.trajectories[traj_i] = Trajectory(self.device) episode_rewards.append(episode_reward[traj_i]) episode_reward[traj_i] = 0.0 break else: if done.any(): traj_indexes = [i for i in np.arange(len(self.trajectories))[done]] for traj_i in traj_indexes: self._store_transition(buffer, self.trajectories[traj_i]) # total_timesteps.append(len(traj)) # is this line affecting anything???? self.trajectories[traj_i] = Trajectory(self.device) episode_rewards.append(episode_reward[traj_i]) episode_reward[traj_i] = 0.0 elif train_freq.unit == TrainFrequencyUnit.EPISODE: ending = not should_collect_more_steps(train_freq, num_collected_steps//self.n_envs, num_collected_episodes//self.n_envs) if done.any(): # if ending, save all trajectories even if not finished # if not ending: traj_indexes = [i for i in np.arange(len(self.trajectories))[done]] for traj_i in traj_indexes: self._store_transition(buffer, self.trajectories[traj_i]) # total_timesteps.append(len(traj)) # is this line affecting anything???? 
self.trajectories[traj_i] = Trajectory(self.device) episode_rewards.append(episode_reward[traj_i]) episode_reward[traj_i] = 0.0 ''' else: _trajectories = trajectories for traj_i, traj in enumerate(_trajectories): self._store_transition(buffer, traj) total_timesteps.append(len(traj)) # is this line affecting anything???? self.trajectories[traj_i] = Trajectory(self.device) episode_rewards.append(episode_reward[traj_i]) episode_reward[traj_i] = 0.0 ''' if ending: break else: print(train_freq.unit) raise Exception("Weird train_freq.unit...") exit(-1) if done.any(): if action_noise is not None: action_noise.reset() with th.no_grad(): obs_tensor = th.as_tensor(new_obs).squeeze(1).to(self.device) if self.use_v_net: latent_pi, latent_vf, latent_sde = self.policy._get_latent(obs_tensor) values = self.value_net(latent_vf).detach() else: values = self.policy.compute_value(obs_tensor, use_target_v=False) self.rollout_buffer.compute_returns_and_advantage(last_values=values, dones=done) mean_reward = np.mean(episode_rewards) if num_collected_episodes > 0 else 0.0 callback.on_rollout_end() return RolloutReturn(mean_reward, num_collected_steps, num_collected_episodes, continue_training)
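# Small sketch of the behaviour-policy probability stored with each transition in the
# trajectory collector above: the policy's probability is mixed with uniform
# exploration, matching prob = (1 - eps) * pi(a|s) + eps / |A|. Tensor shapes and the
# function name are assumptions for illustration only.
import torch as th

def behaviour_probability(log_prob: th.Tensor, exploration_rate: float, n_actions: int) -> th.Tensor:
    pi = th.exp(log_prob)
    return (1.0 - exploration_rate) * pi + exploration_rate / n_actions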
def collect_rollouts( self, env: VecEnv, callback: BaseCallback, rollout_buffer: RolloutBuffer, n_rollout_steps: int, ) -> bool: """ Collect experiences using the current policy and fill a ``RolloutBuffer``. The term rollout here refers to the model-free notion and should not be used with the concept of rollout used in model-based RL or planning. :param env: The training environment :param callback: Callback that will be called at each step (and at the beginning and end of the rollout) :param rollout_buffer: Buffer to fill with rollouts :param n_steps: Number of experiences to collect per environment :return: True if function returned with at least `n_rollout_steps` collected, False if callback terminated rollout prematurely. """ assert self._last_obs is not None, "No previous observation was provided" n_steps = 0 rollout_buffer.reset() # Sample new weights for the state dependent exploration if self.use_sde: self.policy.reset_noise(env.num_envs) callback.on_rollout_start() while n_steps < n_rollout_steps * self.outer_steps: # here n_rollout_steps is n_steps in PPO args. Noted by Chenyin # while n_steps < n_rollout_steps: if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.policy.reset_noise(env.num_envs) with th.no_grad(): # Convert to pytorch tensor or to TensorDict obs_tensor = obs_as_tensor(self._last_obs, self.device) actions, values, log_probs = self.policy.forward(obs_tensor) actions = actions.cpu().numpy() # Rescale and perform action clipped_actions = actions # Clip the actions to avoid out of bound error if isinstance(self.action_space, gym.spaces.Box): clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) new_obs, rewards, dones, infos = env.step(clipped_actions) self.num_timesteps += env.num_envs # Give access to local variables callback.update_locals(locals()) if callback.on_step() is False: return False self._update_info_buffer(infos) n_steps += 1 # (1) if at the T-th step, the env is going to reset, so we shall store the terminal states in advance # (2) if done, new_obs is the new state after resetting the env, so we need to get terminal state from infos if n_steps % n_rollout_steps == 0 or dones.any(): # if dones.any(): # second case: do not reset the env when encountering step T terminal_obs = new_obs.copy() infos_array = np.array(infos) # change list to numpy array i = 0 for done in dones: if done: terminal_obs[i] = infos_array[i][ "terminal_observation"] i += 1 with th.no_grad(): # Convert to pytorch tensor or to TensorDict obs_tensor = obs_as_tensor(terminal_obs, self.device) _, terminal_values, _ = self.policy.forward( obs_tensor) # in the infinite game, V(s_T) is defined else: # when dones = [False, ..., False] terminal_values = None if isinstance(self.action_space, gym.spaces.Discrete): # Reshape in case of discrete action actions = actions.reshape(-1, 1) rollout_buffer.add(self._last_obs, actions, rewards, self._last_episode_starts, values, log_probs, terminal_values) # Chenyin if n_steps % n_rollout_steps == 0: self._last_obs = env.reset() self._last_episode_starts = np.ones((env.num_envs, ), dtype=bool) else: self._last_obs = new_obs self._last_episode_starts = dones # self._last_obs = new_obs # self._last_episode_starts = dones with th.no_grad(): # Compute value for the last timestep if n_steps % n_rollout_steps == 0 or dones.any(): # if dones.any(): # obs_tensor = obs_as_tensor(terminal_obs, self.device) # _, values, _ = self.policy.forward(obs_tensor) values = 
terminal_values assert values is not None else: obs_tensor = obs_as_tensor(new_obs, self.device) _, values, _ = self.policy.forward(obs_tensor) rollout_buffer.compute_returns_and_advantage(last_values=values) callback.on_rollout_end() return True
def collect_rollouts(self, env: VecEnv, callback: BaseCallback, rollout_buffer: CustomizedRolloutBuffer, n_rollout_steps: int) -> bool: """ Collect experiences using the current policy and fill a ``RolloutBuffer``. The term rollout here refers to the model-free notion and should not be used with the concept of rollout used in model-based RL or planning. :param env: The training environment :param callback: Callback that will be called at each step (and at the beginning and end of the rollout) :param rollout_buffer: Buffer to fill with rollouts :param n_steps: Number of experiences to collect per environment :return: True if function returned with at least `n_rollout_steps` collected, False if callback terminated rollout prematurely. """ assert self._last_obs is not None, "No previous observation was provided" n_steps = 0 rollout_buffer.reset() # Sample new weights for the state dependent exploration if self.use_sde: self.policy.reset_noise(env.num_envs) callback.on_rollout_start() """ Sida """ short_hidden_states, long_hidden_states = None, None dones = None while n_steps < n_rollout_steps: if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.policy.reset_noise(env.num_envs) with th.no_grad(): # Convert to pytorch tensor obs_tensor = th.as_tensor(self._last_obs).to(self.device) """ Sida: get memory before passing forward, assuming there's only one rnn module for now. """ if self.policy.features_extractor.num_parallel_rnns: short_hidden_states = self.policy.features_extractor.cx_rollout.cpu( ).numpy() long_hidden_states = self.policy.features_extractor.hx_rollout.cpu( ).numpy() if dones is not None: dones = th.as_tensor(dones).to(self.device) actions, values, log_probs = self.policy.forward( obs_tensor, new_start=dones) actions = actions.cpu().numpy() # Rescale and perform action clipped_actions = actions # Clip the actions to avoid out of bound error if isinstance(self.action_space, gym.spaces.Box): clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) new_obs, rewards, dones, infos = env.step(clipped_actions) self.num_timesteps += env.num_envs # Give access to local variables callback.update_locals(locals()) if callback.on_step() is False: return False self._update_info_buffer(infos) n_steps += 1 if isinstance(self.action_space, gym.spaces.Discrete): # Reshape in case of discrete action actions = actions.reshape(-1, 1) """ Sida: add memory to rollout buffer """ rollout_buffer.add(short_hidden_states, long_hidden_states, self._last_obs, actions, rewards, self._last_dones, values, log_probs) self._last_obs = new_obs self._last_dones = dones with th.no_grad(): # Compute value for the last timestep obs_tensor = th.as_tensor(new_obs).to(self.device) _, values, _ = self.policy.forward(obs_tensor) rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones) callback.on_rollout_end() return True
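# Hedged sketch of how recurrent state is commonly masked at episode boundaries, in
# the spirit of the `new_start=dones` handling above. This is an assumption about what
# the feature extractor does internally, not its actual code; the (n_envs, hidden_dim)
# shapes and the helper name are placeholders.
import numpy as np
import torch as th

def mask_hidden_on_reset(hx: th.Tensor, cx: th.Tensor, dones: np.ndarray):
    # Zero the long- and short-term states for environments that just finished an episode.
    mask = th.as_tensor(1.0 - dones, dtype=hx.dtype, device=hx.device).unsqueeze(-1)
    return hx * mask, cx * mask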
def collect_rollouts(self, env: VecEnv, callback: BaseCallback, rollout_buffer: RolloutBuffer, n_rollout_steps: int) -> bool: """ Collect experiences using the current policy and fill a ``RolloutBuffer``. The term rollout here refers to the model-free notion and should not be used with the concept of rollout used in model-based RL or planning. :param env: The training environment :param callback: Callback that will be called at each step (and at the beginning and end of the rollout) :param rollout_buffer: Buffer to fill with rollouts :param n_steps: Number of experiences to collect per environment :return: True if function returned with at least `n_rollout_steps` collected, False if callback terminated rollout prematurely. """ assert self._last_obs is not None, "No previous observation was provided" n_steps = 0 rollout_buffer.reset() # Sample new weights for the state dependent exploration if self.use_sde: self.policy.reset_noise(env.num_envs) callback.on_rollout_start() while n_steps < n_rollout_steps: if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.policy.reset_noise(env.num_envs) with th.no_grad(): # Convert to pytorch tensor obs_tensor = th.as_tensor(self._last_obs).to(self.device) actions, values, log_probs = self.policy.forward(obs_tensor) actions = actions.cpu().numpy() # Rescale and perform action clipped_actions = actions # Clip the actions to avoid out of bound error if isinstance(self.action_space, gym.spaces.Box): clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) new_obs, rewards, dones, infos = env.step(clipped_actions) if dones[0]: for info in infos: goal_diff = info['l_score'] - info['r_score'] print( f"Rewards: {goal_diff} | Score: [{info['l_score']} : {info['r_score']}]" ) self.scores.append(goal_diff) avg_score = sum(self.scores) / len(self.scores) print(f"Average Reward: {avg_score}") print("") if avg_score > self.best_score: self.best_score = avg_score self.save_best_model = True if self.log_handler is not None: self.log_handler.log({"Average Reward": avg_score}) self.num_timesteps += env.num_envs # Give access to local variables callback.update_locals(locals()) if callback.on_step() is False: return False self._update_info_buffer(infos) n_steps += 1 if isinstance(self.action_space, gym.spaces.Discrete): # Reshape in case of discrete action actions = actions.reshape(-1, 1) rollout_buffer.add(self._last_obs, actions, rewards, self._last_dones, values, log_probs) self._last_obs = new_obs self._last_dones = dones with th.no_grad(): # Compute value for the last timestep obs_tensor = th.as_tensor(new_obs).to(self.device) _, values, _ = self.policy.forward(obs_tensor) rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones) callback.on_rollout_end() return True
def collect_rollouts(self, env: VecEnv, # Type hint as string to avoid circular import callback: 'BaseCallback', n_episodes: int = 1, n_steps: int = -1, action_noise: Optional[ActionNoise] = None, learning_starts: int = 0, replay_buffer: Optional[ReplayBuffer] = None, log_interval: Optional[int] = None) -> RolloutReturn: """ Collect rollout using the current policy (and possibly fill the replay buffer) :param env: (VecEnv) The training environment :param n_episodes: (int) Number of episodes to use to collect rollout data You can also specify a ``n_steps`` instead :param n_steps: (int) Number of steps to use to collect rollout data You can also specify a ``n_episodes`` instead. :param action_noise: (Optional[ActionNoise]) Action noise that will be used for exploration Required for deterministic policy (e.g. TD3). This can also be used in addition to the stochastic policy for SAC. :param callback: (BaseCallback) Callback that will be called at each step (and at the beginning and end of the rollout) :param learning_starts: (int) Number of steps before learning for the warm-up phase. :param replay_buffer: (ReplayBuffer) :param log_interval: (int) Log data every ``log_interval`` episodes :return: (RolloutReturn) """ episode_rewards, total_timesteps = [], [] total_steps, total_episodes = 0, 0 assert isinstance(env, VecEnv), "You must pass a VecEnv" assert env.num_envs == 1, "OffPolicyRLModel only support single environment" if self.use_sde: self.actor.reset_noise() callback.on_rollout_start() continue_training = True while total_steps < n_steps or total_episodes < n_episodes: done = False episode_reward, episode_timesteps = 0.0, 0 while not done: if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.actor.reset_noise() # Select action randomly or according to policy if self.num_timesteps < learning_starts and not (self.use_sde and self.use_sde_at_warmup): # Warmup phase unscaled_action = np.array([self.action_space.sample()]) else: # Note: we assume that the policy uses tanh to scale the action # We use non-deterministic action in the case of SAC, for TD3, it does not matter unscaled_action, _ = self.predict(self._last_obs, deterministic=False) # Rescale the action from [low, high] to [-1, 1] if isinstance(self.action_space, gym.spaces.Box): scaled_action = self.policy.scale_action(unscaled_action) # Add noise to the action (improve exploration) if action_noise is not None: # NOTE: in the original implementation of TD3, the noise was applied to the unscaled action # Update(October 2019): Not anymore scaled_action = np.clip(scaled_action + action_noise(), -1, 1) # We store the scaled action in the buffer buffer_action = scaled_action action = self.policy.unscale_action(scaled_action) else: # Discrete case, no need to normalize or clip buffer_action = unscaled_action action = buffer_action # Rescale and perform action new_obs, reward, done, infos = env.step(action) # Only stop training if return value is False, not when it is None. 
if callback.on_step() is False: return RolloutReturn(0.0, total_steps, total_episodes, continue_training=False) episode_reward += reward # Retrieve reward and episode length if using Monitor wrapper self._update_info_buffer(infos, done) # Store data in replay buffer if replay_buffer is not None: # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs() reward_ = self._vec_normalize_env.get_original_reward() else: # Avoid changing the original ones self._last_original_obs, new_obs_, reward_ = self._last_obs, new_obs, reward replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done) self._last_obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: self._last_original_obs = new_obs_ self.num_timesteps += 1 episode_timesteps += 1 total_steps += 1 if 0 < n_steps <= total_steps: break if done: total_episodes += 1 self._episode_num += 1 episode_rewards.append(episode_reward) total_timesteps.append(episode_timesteps) if action_noise is not None: action_noise.reset() # Display training infos if self.verbose >= 1 and log_interval is not None and self._episode_num % log_interval == 0: fps = int(self.num_timesteps / (time.time() - self.start_time)) self.logger.logkv("episodes", self._episode_num) if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0: self.logger.logkv('ep_rew_mean', self.safe_mean([ep_info['r'] for ep_info in self.ep_info_buffer])) self.logger.logkv('ep_len_mean', self.safe_mean([ep_info['l'] for ep_info in self.ep_info_buffer])) self.logger.logkv("fps", fps) self.logger.logkv('time_elapsed', int(time.time() - self.start_time)) self.logger.logkv("total timesteps", self.num_timesteps) if self.use_sde: self.logger.logkv("std", (self.actor.get_std()).mean().item()) if len(self.ep_success_buffer) > 0: self.logger.logkv('success rate', self.safe_mean(self.ep_success_buffer)) self.logger.dumpkvs() mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0 callback.on_rollout_end() return RolloutReturn(mean_reward, total_steps, total_episodes, continue_training)
def collect_rollouts(self, env: VecEnv, callback: BaseCallback,
                     rollout_buffer: RolloutBuffer, n_rollout_steps: int) -> bool:
    """
    Collect experiences using the current policy and fill a ``RolloutBuffer``.

    The term rollout here refers to the model-free notion and should
    not be used with the concept of rollout used in model-based RL or planning.

    :param env: The training environment
    :param callback: Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param rollout_buffer: Buffer to fill with rollouts
    :param n_rollout_steps: Number of experiences to collect per environment
    :return: True if function returned with at least `n_rollout_steps`
        collected, False if callback terminated rollout prematurely.
    """
    assert self._last_obs is not None, "No previous observation was provided"
    n_steps = 0
    rollout_buffer.reset()
    # Sample new weights for the state dependent exploration
    if self.use_sde:
        self.policy.reset_noise(env.num_envs)

    callback.on_rollout_start()

    while n_steps < n_rollout_steps:
        if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
            # Sample a new noise matrix
            self.policy.reset_noise(env.num_envs)

        with th.no_grad():
            # Convert to pytorch tensor
            obs_tensor = th.as_tensor(self._last_obs).to(self.device)
            actions, values, log_probs = self.policy.forward(obs_tensor)
        actions = actions.cpu().numpy()

        # Rescale and perform action
        clipped_actions = actions
        # Clip the actions to avoid out of bound error
        if isinstance(self.action_space, gym.spaces.Box):
            clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)

        # Tag on the other agent's action
        submit_actions = clipped_actions
        if self.bridge and self.bridge.other(self.is_protagonist):
            other_actions = self.bridge.other(self.is_protagonist).predict(obs_tensor.cpu().numpy())[0]
            if len(other_actions.shape) < len(clipped_actions.shape):
                # `predict` returns a numpy array, so add the missing action axis with numpy
                # (the original torch-style `.unsqueeze(dim=1)` would fail here)
                other_actions = np.expand_dims(other_actions, axis=1)
            submit_actions = np.concatenate(
                [other_actions, clipped_actions] if self.is_protagonist else [clipped_actions, other_actions],
                axis=1)
        elif self.adv_action_space:
            submit_actions = np.concatenate(
                (np.array([np.full(self.adv_action_space.shape, np.nan)]), clipped_actions), axis=1)

        new_obs, rewards, dones, infos = env.step(submit_actions)
        if not self.is_protagonist:
            rewards = -rewards

        self.num_timesteps += env.num_envs

        # Give access to local variables
        callback.update_locals(locals())
        if callback.on_step() is False:
            return False

        self._update_info_buffer(infos)
        n_steps += 1

        if isinstance(self.action_space, gym.spaces.Discrete):
            # Reshape in case of discrete action
            actions = actions.reshape(-1, 1)
        rollout_buffer.add(self._last_obs, actions, rewards, self._last_dones, values, log_probs)
        self._last_obs = new_obs
        self._last_dones = dones

    with th.no_grad():
        # Compute value for the last timestep
        obs_tensor = th.as_tensor(new_obs).to(self.device)
        _, values, _ = self.policy.forward(obs_tensor)

    rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones)

    callback.on_rollout_end()

    return True
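# ----------------------------------------------------------------------------
# Minimal, self-contained illustration (values are made up) of the two-agent
# action stacking used above: the other agent's actions and this policy's
# clipped actions are concatenated along axis=1, adversary columns first when
# this policy is the protagonist, and NaN columns stand in for a missing
# adversary.
import numpy as np

protagonist_actions = np.array([[0.2, -0.1]])  # (n_envs, act_dim) from this policy
adversary_actions = np.array([0.5])            # (n_envs,) from the other agent's predict()

# The other agent's actions may come back without the action axis, so add it first
if adversary_actions.ndim < protagonist_actions.ndim:
    adversary_actions = np.expand_dims(adversary_actions, axis=1)

print(np.concatenate([adversary_actions, protagonist_actions], axis=1))
# [[ 0.5  0.2 -0.1]]

# Without a live adversary, NaNs mark its columns
placeholder = np.array([np.full((1,), np.nan)])
print(np.concatenate((placeholder, protagonist_actions), axis=1))
# [[ nan  0.2 -0.1]]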
def collect_rollouts(
    self,
    env: VecEnv,
    callback: BaseCallback,
    train_freq: TrainFreq,
    replay_buffer: ReplayBuffer,
    action_noise: Optional[ActionNoise] = None,
    learning_starts: int = 0,
    log_interval: Optional[int] = None,
) -> RolloutReturn:
    """
    Collect experiences and store them into a ``ReplayBuffer``.

    :param env: The training environment
    :param callback: Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param train_freq: How much experience to collect
        by doing rollouts of current policy.
        Either ``TrainFreq(<n>, TrainFrequencyUnit.STEP)``
        or ``TrainFreq(<n>, TrainFrequencyUnit.EPISODE)``
        with ``<n>`` being an integer greater than 0.
    :param action_noise: Action noise that will be used for exploration.
        Required for deterministic policy (e.g. TD3). This can also be used
        in addition to the stochastic policy for SAC.
    :param learning_starts: Number of steps before learning for the warm-up phase.
    :param replay_buffer:
    :param log_interval: Log data every ``log_interval`` episodes
    :return:
    """
    episode_rewards, total_timesteps = [], []
    num_collected_steps, num_collected_episodes = 0, 0

    assert isinstance(env, VecEnv), "You must pass a VecEnv"
    assert env.num_envs == 1, "OffPolicyAlgorithm only supports a single environment"
    assert train_freq.frequency > 0, "Should at least collect one step or episode."

    if self.use_sde:
        self.actor.reset_noise()

    callback.on_rollout_start()
    continue_training = True

    while should_collect_more_steps(train_freq, num_collected_steps, num_collected_episodes):
        done = False
        episode_reward, episode_timesteps = 0.0, 0

        while not done:
            if self.use_sde and self.sde_sample_freq > 0 and num_collected_steps % self.sde_sample_freq == 0:
                # Sample a new noise matrix
                self.actor.reset_noise()

            # Select action randomly or according to policy
            action, buffer_action = self._sample_action(learning_starts, action_noise)

            # Rescale and perform action
            new_obs, reward, done, infos = env.step(action)

            self.num_timesteps += 1
            episode_timesteps += 1
            num_collected_steps += 1

            # Give access to local variables
            callback.update_locals(locals())
            # Only stop training if return value is False, not when it is None.
            if callback.on_step() is False:
                return RolloutReturn(0.0, num_collected_steps, num_collected_episodes, continue_training=False)

            episode_reward += reward

            # Retrieve reward and episode length if using Monitor wrapper
            self._update_info_buffer(infos, done)

            # Store data in replay buffer (normalized action and unnormalized observation)
            self._store_transition(replay_buffer, buffer_action, new_obs, reward, done, infos)

            self._update_current_progress_remaining(self.num_timesteps, self._total_timesteps)

            # For DQN, check if the target network should be updated
            # and update the exploration schedule.
            # For SAC/TD3, the update is done at the same time as the gradient update,
            # see https://github.com/hill-a/stable-baselines/issues/900
            self._on_step()

            if not should_collect_more_steps(train_freq, num_collected_steps, num_collected_episodes):
                break

        if done:
            num_collected_episodes += 1
            self._episode_num += 1
            episode_rewards.append(episode_reward)
            total_timesteps.append(episode_timesteps)

            if action_noise is not None:
                action_noise.reset()

            # Log training infos
            if log_interval is not None and self._episode_num % log_interval == 0:
                self._dump_logs()

    mean_reward = np.mean(episode_rewards) if num_collected_episodes > 0 else 0.0

    callback.on_rollout_end()

    return RolloutReturn(mean_reward, num_collected_steps, num_collected_episodes, continue_training)
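# ----------------------------------------------------------------------------
# Rough sketch (an assumption for illustration, not the library source) of the
# ``should_collect_more_steps`` predicate used above: with a step-based
# ``TrainFreq`` the rollout runs until ``frequency`` steps are collected, with
# an episode-based one until ``frequency`` episodes have finished.
from stable_baselines3.common.type_aliases import TrainFreq, TrainFrequencyUnit

def should_collect_more_steps_sketch(train_freq: TrainFreq,
                                     num_collected_steps: int,
                                     num_collected_episodes: int) -> bool:
    if train_freq.unit == TrainFrequencyUnit.STEP:
        return num_collected_steps < train_freq.frequency
    return num_collected_episodes < train_freq.frequency

# e.g. collecting exactly one full episode per rollout before each training phase:
# collect_rollouts(env, callback, TrainFreq(1, TrainFrequencyUnit.EPISODE), replay_buffer)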
def collect_rollouts(
    self,
    env: VecEnv,
    callback: BaseCallback,
    rollout_buffer: RolloutBuffer,
    n_rollout_steps: int
) -> bool:
    """
    Collect experiences using the current policy and fill a ``RolloutBuffer``.

    The term rollout here refers to the model-free notion and should
    not be used with the concept of rollout used in model-based RL or planning.

    :param env: The training environment
    :param callback: Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param rollout_buffer: Buffer to fill with rollouts
    :param n_rollout_steps: Number of experiences to collect per environment
    :return: True if function returned with at least `n_rollout_steps`
        collected, False if callback terminated rollout prematurely.
    """
    assert self._last_obs is not None, "No previous observation was provided"
    n_steps = 0
    rollout_buffer.reset()
    # Sample new weights for the state dependent exploration
    if self.use_sde:
        self.policy.reset_noise(env.num_envs)

    callback.on_rollout_start()

    # debug ===============================================================
    # Note: `mode` is assumed to be a module-level debug flag defined elsewhere in this file.
    if mode == 'debug':
        print(["OPA.collect_rollouts started, let's roll!"])

    while n_steps < n_rollout_steps:
        if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
            # Sample a new noise matrix
            self.policy.reset_noise(env.num_envs)

        # notes ===========================================================
        # use last observation to generate action (with log probs) and value
        with th.no_grad():
            # Convert to pytorch tensor
            obs_tensor = th.as_tensor(self._last_obs).to(self.device)
            actions, values, log_probs = self.policy.forward(obs_tensor)
        actions = actions.cpu().numpy()

        # debug ===========================================================
        if mode == 'debug':
            print(['OPA.collect_rollouts loop',
                   'n_rollout_steps:', n_rollout_steps, 'n_steps:', n_steps])
            print(['OPA.collect_rollouts loop eval',
                   'last_obs:', self._last_obs, 'actions', actions,
                   'values', values, 'log_probs', log_probs])

        # Rescale and perform action
        clipped_actions = actions
        # Clip the actions to avoid out of bound error
        if isinstance(self.action_space, gym.spaces.Box):
            clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)

        # notes ===========================================================
        # use clipped_actions to interact with env
        new_obs, rewards, dones, infos = env.step(clipped_actions)

        self.num_timesteps += env.num_envs

        # Give access to local variables
        callback.update_locals(locals())
        if callback.on_step() is False:
            return False

        self._update_info_buffer(infos)
        n_steps += 1

        if isinstance(self.action_space, gym.spaces.Discrete):
            # Reshape in case of discrete action
            actions = actions.reshape(-1, 1)
        rollout_buffer.add(self._last_obs, actions, rewards, self._last_dones, values, log_probs)

        # debug ===========================================================
        if mode == 'debug':
            print(['OPA.collect_rollouts loop save',
                   'last_obs:', self._last_obs, 'actions', actions,
                   'values', values, 'log_probs', log_probs,
                   'rewards', rewards, 'last_dones', self._last_dones])

        # notes ===========================================================
        # 6 things to save in buffer: last_obs, actions, rewards, last_dones, values, log_probs
        self._last_obs = new_obs
        self._last_dones = dones

    with th.no_grad():
        # Compute value for the last timestep
        obs_tensor = th.as_tensor(new_obs).to(self.device)
        _, values, _ = self.policy.forward(obs_tensor)

    # debug ===============================================================
    if mode == 'debug':
        print(['OPA.collect_rollouts last',
               'new_obs:', new_obs, 'values:', values, 'dones:', dones])
        print(['OPA.collect_rollouts finished, ready to compute_returns'])

    rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones)

    callback.on_rollout_end()

    return True
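# ----------------------------------------------------------------------------
# Simplified, self-contained sketch (assumptions: numpy inputs of shape
# (buffer_size, n_envs)) of what ``compute_returns_and_advantage(last_values,
# dones)`` produces at the end of the on-policy rollouts above: GAE(lambda)
# advantages bootstrapped with the value of the final observation, and
# returns = advantages + values.
import numpy as np

def compute_gae_sketch(rewards, values, dones, last_values, last_dones,
                       gamma=0.99, gae_lambda=0.95):
    buffer_size = rewards.shape[0]
    advantages = np.zeros_like(rewards)
    last_gae = 0.0
    for step in reversed(range(buffer_size)):
        if step == buffer_size - 1:
            # Bootstrap with the value of the observation after the last stored step
            next_non_terminal = 1.0 - last_dones
            next_values = last_values
        else:
            next_non_terminal = 1.0 - dones[step + 1]
            next_values = values[step + 1]
        # TD error, then the GAE recursion
        delta = rewards[step] + gamma * next_values * next_non_terminal - values[step]
        last_gae = delta + gamma * gae_lambda * next_non_terminal * last_gae
        advantages[step] = last_gae
    returns = advantages + values
    return advantages, returns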