    def collect_rollouts(self, env: VecEnv, callback: BaseCallback,
                         rollout_buffer: TrajRolloutBuffer,
                         n_rollout_steps: int) -> bool:
        """
        Collect rollouts using the current policy and fill a `TrajRolloutBuffer`.

        :param env: (VecEnv) The training environment
        :param callback: (BaseCallback) Callback that will be called at each step
            (and at the beginning and end of the rollout)
        :param rollout_buffer: (TrajRolloutBuffer) Buffer to fill with rollouts
        :param n_rollout_steps: (int) Number of experiences to collect per environment
        :return: (bool) True if function returned with at least `n_rollout_steps`
            collected, False if callback terminated rollout prematurely.
        """
        assert self._last_obs is not None, "No previous observation was provided"
        n_steps = 0
        rollout_buffer.reset()
        # Sample new weights for the state dependent exploration
        if self.use_sde:
            self.policy.reset_noise(env.num_envs)

        callback.on_rollout_start()

        # while n_steps < n_rollout_steps:
        while not rollout_buffer.full:
            if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
                # Sample a new noise matrix
                self.policy.reset_noise(env.num_envs)

            with th.no_grad():
                # Convert to pytorch tensor
                obs_ctx_tensor = th.as_tensor(self._last_obs).to(
                    self.device)  # (num_agents,) + (obs_dim,)
                actions, values, log_probs = self.policy.forward(
                    obs_ctx_tensor)
            actions = actions.cpu().numpy()

            # Rescale and perform action
            clipped_actions = actions
            # Clip the actions to avoid out of bound error
            if isinstance(self.action_space, gym.spaces.Box):
                clipped_actions = np.clip(actions, self.action_space.low,
                                          self.action_space.high)

            new_obs, rewards, dones, infos = env.step(
                clipped_actions)  # VecEnv.step takes and returns np.ndarray

            # TODO: move this reset into an env wrapper, as stable-baselines3 does
            if dones[self.env.num_agents] == 1:
                env.reset()

            if callback.on_step() is False:
                return False

            self._update_info_buffer(infos)
            n_steps += 1
            # self.num_timesteps += env.num_envs
            self.num_timesteps += 1

            if isinstance(self.action_space, gym.spaces.Discrete):
                # Reshape in case of discrete action
                actions = actions.reshape(-1, 1)

            _obs = self._last_obs[..., 0:self.env.obs_size]
            _ctx = self._last_obs[..., self.env.obs_size:]

            # TODO: handle a changing number of agents; range(len(self._last_obs)) would then be incorrect
            for i in range(len(self._last_obs)):
                rollout_buffer.add(agent_id=i,
                                   context=_ctx[i],
                                   done=self._last_dones[i],
                                   obs=_obs[i],
                                   action=actions[i],
                                   reward=rewards[i],
                                   value=values[i],
                                   log_prob=log_probs[i])
            self._last_obs = new_obs
            self._last_dones = dones

        rollout_buffer.compute_returns_and_advantage(values)

        callback.on_rollout_end()

        return True
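Below is a minimal driver sketch, not part of the original source, showing how a training loop might call this method; `model`, `total_timesteps`, `env`, `callback`, and `rollout_buffer` are assumed names and may differ in the actual project.

# Hypothetical outer loop (all names are assumptions, see note above)
while model.num_timesteps < total_timesteps:
    continue_training = model.collect_rollouts(env, callback, rollout_buffer,
                                                n_rollout_steps=model.n_steps)
    if not continue_training:
        break
    model.train()  # consume the freshly filled buffer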
Example #2
    def collect_rollouts(
        self,
        env: VecEnv,
        callback: BaseCallback,
        n_episodes: int = 1,
        n_steps: int = -1,
        action_noise: Optional[ActionNoise] = None,
        learning_starts: int = 0,
        replay_buffer: Optional[ReplayBuffer] = None,
        log_interval: Optional[int] = None,
    ) -> RolloutReturn:
        """
        Collect experiences and store them into a ``ReplayBuffer``.

        :param env: (VecEnv) The training environment
        :param callback: (BaseCallback) Callback that will be called at each step
            (and at the beginning and end of the rollout)
        :param n_episodes: (int) Number of episodes to use to collect rollout data.
            You can also specify ``n_steps`` instead.
        :param n_steps: (int) Number of steps to use to collect rollout data.
            You can also specify ``n_episodes`` instead.
        :param action_noise: (Optional[ActionNoise]) Action noise that will be used for exploration.
            Required for deterministic policies (e.g. TD3). This can also be used
            in addition to the stochastic policy for SAC.
        :param learning_starts: (int) Number of steps before learning for the warm-up phase.
        :param replay_buffer: (ReplayBuffer) Buffer in which to store the collected transitions
        :param log_interval: (int) Log data every ``log_interval`` episodes
        :return: (RolloutReturn)
        """
        episode_rewards, total_timesteps = [], []
        total_steps, total_episodes = 0, 0

        assert isinstance(env, VecEnv), "You must pass a VecEnv"
        # assert env.num_envs == 1, "OffPolicyAlgorithm only support single environment"

        if self.use_sde:
            self.actor.reset_noise()

        callback.on_rollout_start()
        continue_training = True

        while total_steps < n_steps or total_episodes < n_episodes:
            # Reset all sub-environments at the start of each collected episode
            self._last_obs = env.reset()
            done = False
            episode_reward, episode_timesteps = 0.0, 0

            while not done:

                if self.use_sde and self.sde_sample_freq > 0 and total_steps % self.sde_sample_freq == 0:
                    # Sample a new noise matrix
                    self.actor.reset_noise()

                # Select action randomly or according to policy
                action, buffer_action = self._sample_action(
                    learning_starts, action_noise)

                # Rescale and perform action
                new_obs, reward, done_array, infos = env.step(action)
                # The episode is over only when every sub-environment reports done
                done = all(done_array)

                self.num_timesteps += 1
                episode_timesteps += 1
                total_steps += 1

                # Give access to local variables
                callback.update_locals(locals())
                # Only stop training if return value is False, not when it is None.
                if callback.on_step() is False:
                    return RolloutReturn(0.0,
                                         total_steps,
                                         total_episodes,
                                         continue_training=False)

                episode_reward += np.sum(np.asarray(reward))

                # Retrieve reward and episode length if using Monitor wrapper
                self._update_info_buffer(infos[0], done)

                # Store data in replay buffer
                if replay_buffer is not None:
                    # Store only the unnormalized version
                    if self._vec_normalize_env is not None:
                        new_obs_ = self._vec_normalize_env.get_original_obs()
                        reward_ = self._vec_normalize_env.get_original_reward()
                    else:
                        # Avoid changing the original ones
                        self._last_original_obs, new_obs_, reward_ = self._last_obs, new_obs, reward
                    for i in range(env.num_envs):
                        replay_buffer.add(self._last_original_obs[i],
                                          new_obs_[i], buffer_action[i],
                                          reward_[i], done_array[i])

                self._last_obs = new_obs
                # Save the unnormalized observation
                if self._vec_normalize_env is not None:
                    self._last_original_obs = new_obs_

                self._update_current_progress_remaining(
                    self.num_timesteps, self._total_timesteps)

                # For DQN, check if the target network should be updated
                # and update the exploration schedule
                # For SAC/TD3, the update is done as the same time as the gradient update
                # see https://github.com/hill-a/stable-baselines/issues/900
                self._on_step()

                if 0 < n_steps <= total_steps:
                    break

            if done:
                total_episodes += 1
                self._episode_num += 1
                episode_rewards.append(episode_reward)
                total_timesteps.append(episode_timesteps)

                if action_noise is not None:
                    action_noise.reset()

                # Log training infos
                if log_interval is not None and self._episode_num % log_interval == 0:
                    self._dump_logs()

        mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0

        callback.on_rollout_end()

        return RolloutReturn(mean_reward, total_steps, total_episodes,
                             continue_training)
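For context, here is a minimal sketch of the off-policy learn() loop that typically drives this method; `rollout.continue_training` matches the RolloutReturn constructed above, while the remaining attributes on `model` are assumed names.

# Hypothetical outer loop (attribute names are assumptions)
while model.num_timesteps < total_timesteps:
    rollout = model.collect_rollouts(
        env,
        callback,
        n_episodes=1,
        n_steps=-1,
        action_noise=model.action_noise,
        learning_starts=model.learning_starts,
        replay_buffer=model.replay_buffer,
        log_interval=log_interval,
    )
    if rollout.continue_training is False:
        break
    if model.num_timesteps > model.learning_starts:
        model.train(gradient_steps=model.gradient_steps)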
Example #3
def generate_trajectories(
    policy,
    venv: VecEnv,
    sample_until: GenTrajTerminationFn,
    *,
    deterministic_policy: bool = False,
    rng: np.random.RandomState = np.random,
) -> Sequence[types.TrajectoryWithRew]:
    """Generate trajectory dictionaries from a policy and an environment.

    Args:
      policy (Callable, BasePolicy, or BaseAlgorithm): A function mapping
          observations to actions, a stable_baselines3 policy, or an algorithm
          trained on the gym environment.
      venv: The vectorized environments to interact with.
      sample_until: A function determining the termination condition.
          It takes a sequence of trajectories, and returns a bool.
          Most users will want to use one of `min_episodes` or `min_timesteps`.
      deterministic_policy: If True, asks policy to deterministically return
          action. Note the trajectories might still be non-deterministic if the
          environment has non-determinism!
      rng: used for shuffling trajectories.

    Returns:
      Sequence of trajectories, satisfying `sample_until`. Additional trajectories
      may be collected to avoid biasing process towards short episodes; the user
      should truncate if required.
    """
    if isinstance(policy, BaseAlgorithm):
        policy.set_env(venv)

    # Collect rollout tuples.
    trajectories = []
    # accumulator for incomplete trajectories
    trajectories_accum = TrajectoryAccumulator()
    obs = venv.reset()
    for env_idx, ob in enumerate(obs):
        # Seed with first obs only. Inside loop, we'll only add second obs from
        # each (s,a,r,s') tuple, under the same "obs" key again. That way we still
        # get all observations, but they're not duplicated into "next obs" and
        # "previous obs" (this matters for, e.g., Atari, where observations are
        # really big).
        trajectories_accum.add_step(dict(obs=ob), env_idx)

    # Now, we sample until `sample_until(trajectories)` is true.
    # If we just stopped then this would introduce a bias towards shorter episodes,
    # since longer episodes are more likely to still be active, i.e. in the process
    # of being sampled from. To avoid this, we continue sampling until all episodes
    # are complete.
    #
    # To start with, all environments are active.
    active = np.ones(venv.num_envs, dtype=bool)  # np.bool is deprecated; use the builtin bool
    while np.any(active):
        if isinstance(policy, Callable):
            acts = policy(obs)
        else:
            acts, _ = policy.predict(obs, deterministic=deterministic_policy)
        obs, rews, dones, infos = venv.step(acts)

        # If an environment is inactive, i.e. the episode completed for that
        # environment after `sample_until(trajectories)` was true, then we do
        # *not* want to add any subsequent trajectories from it. We avoid this
        # by just making it never done.
        dones &= active

        new_trajs = trajectories_accum.add_steps_and_auto_finish(
            acts, obs, rews, dones, infos)
        trajectories.extend(new_trajs)

        if sample_until(trajectories):
            # Termination condition has been reached. Mark as inactive any environments
            # where a trajectory was completed this timestep.
            active &= ~dones

    # Note that we just drop partial trajectories. This is not ideal for some
    # algos; e.g. BC can probably benefit from partial trajectories, too.

    # Each trajectory is sampled i.i.d.; however, shorter episodes are added to
    # `trajectories` sooner. Shuffle to avoid bias in order. This is important
    # when callees end up truncating the number of trajectories or transitions.
    # It is also cheap, since we're just shuffling pointers.
    rng.shuffle(trajectories)

    # Sanity checks.
    for trajectory in trajectories:
        n_steps = len(trajectory.acts)
        # extra 1 for the end
        exp_obs = (n_steps + 1, ) + venv.observation_space.shape
        real_obs = trajectory.obs.shape
        assert real_obs == exp_obs, f"expected shape {exp_obs}, got {real_obs}"
        exp_act = (n_steps, ) + venv.action_space.shape
        real_act = trajectory.acts.shape
        assert real_act == exp_act, f"expected shape {exp_act}, got {real_act}"
        exp_rew = (n_steps, )
        real_rew = trajectory.rews.shape
        assert real_rew == exp_rew, f"expected shape {exp_rew}, got {real_rew}"

    return trajectories
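A minimal usage sketch, not from the original source: the `min_episodes` helper referenced in the docstring is assumed to be importable from the same module as `generate_trajectories`, and the environment/policy below are purely illustrative.

import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

venv = DummyVecEnv([lambda: gym.make("CartPole-v1")])
expert = PPO("MlpPolicy", venv).learn(total_timesteps=10_000)  # returns the trained model
trajectories = generate_trajectories(
    expert,
    venv,
    sample_until=min_episodes(10),  # stop once 10 complete episodes are collected
    deterministic_policy=True,
)
print(f"collected {len(trajectories)} trajectories")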
Example #4
    def collect_rollouts(
        self,
        env: VecEnv,
        callback: BaseCallback,
        rollout_buffer: RolloutBuffer,
        n_rollout_steps: int,
    ) -> bool:
        """
        Collect experiences using the current policy and fill a ``RolloutBuffer``.
        The term rollout here refers to the model-free notion and should not
        be used with the concept of rollout used in model-based RL or planning.

        :param env: The training environment
        :param callback: Callback that will be called at each step
            (and at the beginning and end of the rollout)
        :param rollout_buffer: Buffer to fill with rollouts
        :param n_rollout_steps: Number of experiences to collect per environment
        :return: True if function returned with at least `n_rollout_steps`
            collected, False if callback terminated rollout prematurely.
        """
        assert self._last_obs is not None, "No previous observation was provided"
        n_steps = 0
        rollout_buffer.reset()
        # Sample new weights for the state dependent exploration
        if self.use_sde:
            self.policy.reset_noise(env.num_envs)

        callback.on_rollout_start()

        # Note (Chenyin): n_rollout_steps corresponds to the `n_steps` PPO argument,
        # so one call collects n_rollout_steps * outer_steps transitions per environment.
        while n_steps < n_rollout_steps * self.outer_steps:
            if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
                # Sample a new noise matrix
                self.policy.reset_noise(env.num_envs)

            with th.no_grad():
                # Convert to pytorch tensor or to TensorDict
                obs_tensor = obs_as_tensor(self._last_obs, self.device)
                actions, values, log_probs = self.policy.forward(obs_tensor)
            actions = actions.cpu().numpy()

            # Rescale and perform action
            clipped_actions = actions
            # Clip the actions to avoid out of bound error
            if isinstance(self.action_space, gym.spaces.Box):
                clipped_actions = np.clip(actions, self.action_space.low,
                                          self.action_space.high)

            new_obs, rewards, dones, infos = env.step(clipped_actions)

            self.num_timesteps += env.num_envs

            # Give access to local variables
            callback.update_locals(locals())
            if callback.on_step() is False:
                return False

            self._update_info_buffer(infos)
            n_steps += 1

            # (1) At the T-th step the env is about to reset, so store the terminal states in advance.
            # (2) If done, new_obs is the state after the env reset, so read the terminal state from infos.
            if n_steps % n_rollout_steps == 0 or dones.any():
                terminal_obs = new_obs.copy()
                for i, done in enumerate(dones):
                    if done:
                        terminal_obs[i] = infos[i]["terminal_observation"]
                with th.no_grad():
                    # Convert to pytorch tensor or to TensorDict
                    obs_tensor = obs_as_tensor(terminal_obs, self.device)
                    _, terminal_values, _ = self.policy.forward(
                        obs_tensor)  # in the infinite-horizon setting, V(s_T) is well defined
            else:  # when dones = [False, ..., False]
                terminal_values = None

            if isinstance(self.action_space, gym.spaces.Discrete):
                # Reshape in case of discrete action
                actions = actions.reshape(-1, 1)
            rollout_buffer.add(self._last_obs, actions, rewards,
                               self._last_episode_starts, values, log_probs,
                               terminal_values)

            # (Chenyin) Reset the environment every n_rollout_steps and mark fresh episode starts
            if n_steps % n_rollout_steps == 0:
                self._last_obs = env.reset()
                self._last_episode_starts = np.ones((env.num_envs, ),
                                                    dtype=bool)
            else:
                self._last_obs = new_obs
                self._last_episode_starts = dones
            # self._last_obs = new_obs
            # self._last_episode_starts = dones

        with th.no_grad():
            # Compute value for the last timestep
            if n_steps % n_rollout_steps == 0 or dones.any():
                # if dones.any():
                # obs_tensor = obs_as_tensor(terminal_obs, self.device)
                # _, values, _ = self.policy.forward(obs_tensor)
                values = terminal_values
                assert values is not None
            else:
                obs_tensor = obs_as_tensor(new_obs, self.device)
                _, values, _ = self.policy.forward(obs_tensor)

        rollout_buffer.compute_returns_and_advantage(last_values=values)

        callback.on_rollout_end()

        return True
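Because the loop above runs for n_rollout_steps * outer_steps iterations per call, the buffer passed in must be sized accordingly. The sketch below is one plausible allocation, not from the original source: the constructor arguments follow stable-baselines3's RolloutBuffer, while `outer_steps` and the buffer subclass whose `add()` accepts `terminal_values` are assumptions specific to this variant.

from stable_baselines3.common.buffers import RolloutBuffer

# Hypothetical sizing: one collect_rollouts() call stores
# n_steps * outer_steps transitions per environment.
rollout_buffer = RolloutBuffer(
    buffer_size=model.n_steps * model.outer_steps,
    observation_space=env.observation_space,
    action_space=env.action_space,
    device=model.device,
    gae_lambda=model.gae_lambda,
    gamma=model.gamma,
    n_envs=env.num_envs,
)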