def _wrap_env(env: GymEnv, verbose: int = 0, monitor_wrapper: bool = True) -> VecEnv:
    """
    Wrap environment with the appropriate wrappers if needed.
    For instance, to have a vectorized environment
    or to re-order the image channels.

    :param env: The environment to wrap.
    :param verbose: Verbosity level (a value >= 1 prints wrapping info).
    :param monitor_wrapper: Whether to wrap the env in a ``Monitor`` when possible.
    :return: The wrapped environment.
    """
    if not isinstance(env, VecEnv):
        if not is_wrapped(env, Monitor) and monitor_wrapper:
            if verbose >= 1:
                print("Wrapping the env with a `Monitor` wrapper")
            env = Monitor(env)
        if verbose >= 1:
            print("Wrapping the env in a DummyVecEnv.")
        env = DummyVecEnv([lambda: env])

    if (
        is_image_space(env.observation_space)
        and not is_vecenv_wrapped(env, VecTransposeImage)
        and not is_image_space_channels_first(env.observation_space)
    ):
        if verbose >= 1:
            print("Wrapping the env in a VecTransposeImage.")
        env = VecTransposeImage(env)

    # Check if wrapper for dict support is needed when using HER
    if isinstance(env.observation_space, gym.spaces.dict.Dict):
        env = ObsDictWrapper(env)

    return env
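
def _example_wrap_env_usage() -> None:
    # Minimal usage sketch of `_wrap_env` above (illustrative only): a plain gym
    # env receives a `Monitor` and a `DummyVecEnv`, while an env that is already
    # vectorized is returned unchanged. The env id "CartPole-v1" is an assumption
    # used purely for demonstration.
    import gym

    env = gym.make("CartPole-v1")
    vec_env = _wrap_env(env, verbose=1)  # wrapped with Monitor + DummyVecEnv
    assert isinstance(vec_env, VecEnv)

    # A non-image, non-dict VecEnv passes through untouched
    assert _wrap_env(vec_env) is vec_env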
def create_envs(self, n_envs: int, eval_env: bool = False, no_log: bool = False) -> VecEnv:
    """
    Create the environment and wrap it if necessary.

    :param n_envs: Number of parallel environments to create.
    :param eval_env: Whether it is an environment used for evaluation or not.
    :param no_log: Do not log training when doing hyperparameter optim
        (issue with writing the same file)
    :return: the vectorized environment, with appropriate wrappers
    """
    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env or no_log else self.save_path

    # env = SubprocVecEnv([make_env(env_id, i, self.seed) for i in range(n_envs)])
    # On most envs, SubprocVecEnv does not help and is quite memory hungry
    env = make_vec_env(
        env_id=self.env_id,
        n_envs=n_envs,
        seed=self.seed,
        env_kwargs=self.env_kwargs,
        monitor_dir=log_dir,
        wrapper_class=self.env_wrapper,
        vec_env_cls=self.vec_env_class,
        vec_env_kwargs=self.vec_env_kwargs,
    )

    # Special case for GoalEnvs: log success rate too
    if "Neck" in self.env_id or self.is_robotics_env(self.env_id):
        self._log_success_rate(env)

    # Wrap the env into a VecNormalize wrapper if needed
    # and load saved statistics when present
    env = self._maybe_normalize(env, eval_env)

    # Optional frame-stacking
    if self.frame_stack is not None:
        n_stack = self.frame_stack
        env = VecFrameStack(env, n_stack)
        if self.verbose > 0:
            print(f"Stacking {n_stack} frames")

    # Wrap if needed to re-order channels
    # (switch from channel-last to channel-first convention)
    if is_image_space(env.observation_space):
        if self.verbose > 0:
            print("Wrapping into a VecTransposeImage")
        env = VecTransposeImage(env)

    # Check if wrapper for dict support is needed
    if self.algo == "her":
        if self.verbose > 0:
            print("Wrapping into a ObsDictWrapper")
        env = ObsDictWrapper(env)

    return env
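
def _example_make_vec_env_usage() -> None:
    # Minimal sketch of the pipeline `create_envs` above builds, stripped of the
    # experiment-manager plumbing and using the same helpers referenced there:
    # `make_vec_env` creates the (monitored) vectorized env, then optional frame
    # stacking and channel re-ordering are applied. The env id
    # "PongNoFrameskip-v4", n_envs=2 and n_stack=4 are assumptions for
    # illustration only.
    env = make_vec_env("PongNoFrameskip-v4", n_envs=2, seed=0)
    env = VecFrameStack(env, n_stack=4)
    if is_image_space(env.observation_space):
        env = VecTransposeImage(env)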
def predict(
    self,
    observation: np.ndarray,
    state: Optional[np.ndarray] = None,
    mask: Optional[np.ndarray] = None,
    deterministic: bool = False,
) -> Tuple[np.ndarray, Optional[np.ndarray]]:
    """
    Overridden to create a proper Octree batch.

    Get the policy action and state from an observation (and optional state).

    :param observation: the input observation
    :param state: The last states (can be None, used in recurrent policies)
    :param mask: The last masks (can be None, used in recurrent policies)
    :param deterministic: Whether or not to return deterministic actions.
    :return: the model's action and the next state
        (used in recurrent policies)
    """
    if isinstance(observation, dict):
        observation = ObsDictWrapper.convert_dict(observation)
    else:
        observation = np.array(observation)

    vectorized_env = is_vectorized_observation(observation, self.observation_space)

    if self._debug_write_octree:
        ocnn.write_octree(th.from_numpy(observation[-1]), 'octree.octree')

    # Make batch out of tensor (consisting of n-stacked octrees)
    octree_batch = preprocess_stacked_octree_batch(
        observation, self.device, separate_batches=self._separate_networks_for_stacks
    )

    with th.no_grad():
        actions = self._predict(octree_batch, deterministic=deterministic)
    # Convert to numpy
    actions = actions.cpu().numpy()

    if isinstance(self.action_space, gym.spaces.Box):
        if self.squash_output:
            # Rescale to proper domain when using squashing
            actions = self.unscale_action(actions)
        else:
            # Actions could be on arbitrary scale, so clip the actions to avoid
            # out of bound error (e.g. if sampling from a Gaussian distribution)
            actions = np.clip(actions, self.action_space.low, self.action_space.high)

    if not vectorized_env:
        if state is not None:
            raise ValueError("Error: The environment must be vectorized when using recurrent policies.")
        actions = actions[0]

    return actions, state
def predict(
    self,
    observation: np.ndarray,
    state: Optional[np.ndarray] = None,
    mask: Optional[np.ndarray] = None,
    deterministic: bool = False,
) -> Tuple[np.ndarray, Optional[np.ndarray]]:
    """
    Get the policy action and state from an observation (and optional state).
    Includes sugar-coating to handle different observations (e.g. normalizing images).

    :param observation: the input observation
    :param state: The last states (can be None, used in recurrent policies)
    :param mask: The last masks (can be None, used in recurrent policies)
    :param deterministic: Whether or not to return deterministic actions.
    :return: the model's action and the next state
        (used in recurrent policies)
    """
    # TODO (GH/1): add support for RNN policies
    # if state is None:
    #     state = self.initial_state
    # if mask is None:
    #     mask = [False for _ in range(self.n_envs)]
    if isinstance(observation, dict):
        observation = ObsDictWrapper.convert_dict(observation)
    else:
        observation = np.array(observation)

    # Handle the different cases for images
    # as PyTorch uses the channel-first format
    observation = maybe_transpose(observation, self.observation_space)

    vectorized_env = is_vectorized_observation(observation, self.observation_space)

    observation = observation.reshape((-1,) + self.observation_space.shape)

    observation = th.as_tensor(observation).to(self.device)
    with th.no_grad():
        actions = self._predict(observation, deterministic=deterministic)
    # Convert to numpy
    actions = actions.cpu().numpy()

    if isinstance(self.action_space, gym.spaces.Box):
        if self.squash_output:
            # Rescale to proper domain when using squashing
            actions = self.unscale_action(actions)
        else:
            # Actions could be on arbitrary scale, so clip the actions to avoid
            # out of bound error (e.g. if sampling from a Gaussian distribution)
            actions = np.clip(actions, self.action_space.low, self.action_space.high)

    if not vectorized_env:
        if state is not None:
            raise ValueError("Error: The environment must be vectorized when using recurrent policies.")
        actions = actions[0]

    return actions, state
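
def _example_predict_usage(model, vec_env, n_steps: int = 1000) -> None:
    # Minimal sketch of the typical evaluation loop around `predict` above
    # (illustrative only): `model` is assumed to be an already trained
    # Stable-Baselines3 model and `vec_env` its vectorized environment.
    obs = vec_env.reset()
    for _ in range(n_steps):
        # Deterministic actions for evaluation; `state` is only used by recurrent policies
        action, state = model.predict(obs, deterministic=True)
        obs, reward, done, info = vec_env.step(action)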
def _wrap_env(env: GymEnv, verbose: int = 0) -> VecEnv:
    if not isinstance(env, VecEnv):
        if verbose >= 1:
            print("Wrapping the env in a DummyVecEnv.")
        env = DummyVecEnv([lambda: env])

    if is_image_space(env.observation_space) and not is_wrapped(env, VecTransposeImage):
        if verbose >= 1:
            print("Wrapping the env in a VecTransposeImage.")
        env = VecTransposeImage(env)

    # Check if wrapper for dict support is needed when using HER
    if isinstance(env.observation_space, gym.spaces.dict.Dict):
        env = ObsDictWrapper(env)

    return env
def test_eval_success_logging(tmp_path):
    n_bits = 2
    env = BitFlippingEnv(n_bits=n_bits)
    eval_env = DummyVecEnv([lambda: BitFlippingEnv(n_bits=n_bits)])
    eval_callback = EvalCallback(
        ObsDictWrapper(eval_env),
        eval_freq=250,
        log_path=tmp_path,
        warn=False,
    )
    model = HER("MlpPolicy", env, DQN, learning_starts=100, seed=0, max_episode_length=n_bits)
    model.learn(500, callback=eval_callback)
    assert len(eval_callback._is_success_buffer) > 0
    # More than 50% success rate
    assert np.mean(eval_callback._is_success_buffer) > 0.5
def test_save_load(tmp_path, model_class, use_sde, online_sampling):
    """
    Test if 'save' and 'load' saves and loads model correctly
    """
    if use_sde and model_class != SAC:
        pytest.skip("Only SAC has gSDE support")

    n_bits = 4
    env = BitFlippingEnv(n_bits=n_bits, continuous=not (model_class == DQN))

    kwargs = dict(use_sde=True) if use_sde else {}

    # Create model
    model = HER(
        "MlpPolicy",
        env,
        model_class,
        n_sampled_goal=5,
        goal_selection_strategy="future",
        online_sampling=online_sampling,
        verbose=0,
        tau=0.05,
        batch_size=128,
        learning_rate=0.001,
        policy_kwargs=dict(net_arch=[64]),
        buffer_size=int(1e6),
        gamma=0.98,
        gradient_steps=1,
        train_freq=4,
        learning_starts=100,
        max_episode_length=n_bits,
        **kwargs,
    )

    model.learn(total_timesteps=300)

    env.reset()

    observations_list = []
    for _ in range(10):
        obs = env.step(env.action_space.sample())[0]
        observation = ObsDictWrapper.convert_dict(obs)
        observations_list.append(observation)
    observations = np.array(observations_list)

    # Get dictionary of current parameters
    params = deepcopy(model.policy.state_dict())

    # Modify all parameters to be random values
    random_params = dict((param_name, th.rand_like(param)) for param_name, param in params.items())

    # Update model parameters with the new random values
    model.policy.load_state_dict(random_params)

    new_params = model.policy.state_dict()
    # Check that all params are different now
    for k in params:
        assert not th.allclose(params[k], new_params[k]), "Parameters did not change as expected."

    params = new_params

    # Get selected actions
    selected_actions, _ = model.predict(observations, deterministic=True)

    # Check
    model.save(tmp_path / "test_save.zip")
    del model

    # Test custom_objects
    # Load with custom objects
    custom_objects = dict(learning_rate=2e-5, dummy=1.0)
    model_ = HER.load(str(tmp_path / "test_save.zip"), env=env, custom_objects=custom_objects, verbose=2)
    assert model_.verbose == 2
    # Check that the custom object was taken into account
    assert model_.learning_rate == custom_objects["learning_rate"]
    # Check that only parameters that are here already are replaced
    assert not hasattr(model_, "dummy")

    model = HER.load(str(tmp_path / "test_save.zip"), env=env)

    # Check if params are still the same after load
    new_params = model.policy.state_dict()

    # Check that all params are the same as before the save/load procedure
    for key in params:
        assert th.allclose(params[key], new_params[key]), "Model parameters not the same after save and load."

    # Check if model still selects the same actions
    new_selected_actions, _ = model.predict(observations, deterministic=True)
    assert np.allclose(selected_actions, new_selected_actions, 1e-4)

    # Check if learn still works
    model.learn(total_timesteps=300)

    # Test that the change of parameters works
    model = HER.load(str(tmp_path / "test_save.zip"), env=env, verbose=3, learning_rate=2.0)
    assert model.model.learning_rate == 2.0
    assert model.verbose == 3

    # Clear file from os
    os.remove(tmp_path / "test_save.zip")
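
def _example_convert_dict() -> None:
    # Minimal sketch of what `ObsDictWrapper.convert_dict` is used for in the
    # tests above: flattening a GoalEnv dict observation by concatenating the
    # observation with the (desired) goal, as the comments elsewhere in this
    # code indicate. The array values below are made up for illustration.
    obs = {
        "observation": np.array([0.0, 1.0]),
        "achieved_goal": np.array([0.0, 0.0]),
        "desired_goal": np.array([1.0, 1.0]),
    }
    flat = ObsDictWrapper.convert_dict(obs)
    # observation (2,) + desired_goal (2,) -> flat vector of shape (4,)
    assert flat.shape == (4,)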
def create_envs(self, n_envs: int, eval_env: bool = False, no_log: bool = False) -> VecEnv:
    """
    Create the environment and wrap it if necessary.

    :param n_envs: Number of parallel environments to create.
    :param eval_env: Whether it is an environment used for evaluation or not.
    :param no_log: Do not log training when doing hyperparameter optim
        (issue with writing the same file)
    :return: the vectorized environment, with appropriate wrappers
    """
    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env or no_log else self.save_path

    monitor_kwargs = {}
    # Special case for GoalEnvs: log success rate too
    if "Neck" in self.env_id or self.is_robotics_env(self.env_id) or "parking-v0" in self.env_id:
        monitor_kwargs = dict(info_keywords=("is_success",))

    # Note: made custom to support Gazebo Runtime wrapping
    def make_env():
        def _init():
            env = self.env_wrapper(env=self.env_id, **self.env_kwargs)
            env.seed(self.seed)
            env.action_space.seed(self.seed)
            monitor_path = log_dir if log_dir is not None else None
            if monitor_path is not None:
                os.makedirs(log_dir, exist_ok=True)
            env = Monitor(env, filename=monitor_path, **monitor_kwargs)
            return env

        return _init

    if self.vec_env_class is None:
        self.vec_env_class = DummyVecEnv

    env = self.vec_env_class([make_env()], **self.vec_env_kwargs)

    # Wrap the env into a VecNormalize wrapper if needed
    # and load saved statistics when present
    env = self._maybe_normalize(env, eval_env)

    # Optional frame-stacking
    if self.frame_stack is not None:
        n_stack = self.frame_stack
        env = VecFrameStack(env, n_stack)
        if self.verbose > 0:
            print(f"Stacking {n_stack} frames")

    # Wrap if needed to re-order channels
    # (switch from channel-last to channel-first convention)
    if is_image_space(env.observation_space) and not is_image_space_channels_first(env.observation_space):
        if self.verbose > 0:
            print("Wrapping into a VecTransposeImage")
        env = VecTransposeImage(env)

    # Check if wrapper for dict support is needed
    if self.algo == "her":
        if self.verbose > 0:
            print("Wrapping into a ObsDictWrapper")
        env = ObsDictWrapper(env)

    return env
def collect_rollouts(
    self,
    env: VecEnv,
    callback: BaseCallback,
    train_freq: TrainFreq,
    action_noise: Optional[ActionNoise] = None,
    learning_starts: int = 0,
    log_interval: Optional[int] = None,
) -> RolloutReturn:
    """
    Collect experiences and store them into a ``ReplayBuffer``.

    :param env: The training environment
    :param callback: Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param train_freq: How much experience to collect
        by doing rollouts of the current policy.
        Either ``TrainFreq(<n>, TrainFrequencyUnit.STEP)``
        or ``TrainFreq(<n>, TrainFrequencyUnit.EPISODE)``
        with ``<n>`` being an integer greater than 0.
    :param action_noise: Action noise that will be used for exploration.
        Required for deterministic policy (e.g. TD3). This can also be used
        in addition to the stochastic policy for SAC.
    :param learning_starts: Number of steps before learning for the warm-up phase.
    :param log_interval: Log data every ``log_interval`` episodes
    :return: A ``RolloutReturn`` with the mean reward and the number of collected steps and episodes.
    """
    episode_rewards, total_timesteps = [], []
    num_collected_steps, num_collected_episodes = 0, 0

    assert isinstance(env, VecEnv), "You must pass a VecEnv"
    assert env.num_envs == 1, "OffPolicyAlgorithm only supports a single environment"
    assert train_freq.frequency > 0, "Should at least collect one step or episode."

    if self.model.use_sde:
        self.actor.reset_noise()

    callback.on_rollout_start()
    continue_training = True

    while should_collect_more_steps(train_freq, num_collected_steps, num_collected_episodes):
        done = False
        episode_reward, episode_timesteps = 0.0, 0

        while not done:
            # Concatenate observation and (desired) goal
            observation = self._last_obs
            self._last_obs = ObsDictWrapper.convert_dict(observation)

            if (
                self.model.use_sde
                and self.model.sde_sample_freq > 0
                and num_collected_steps % self.model.sde_sample_freq == 0
            ):
                # Sample a new noise matrix
                self.actor.reset_noise()

            # Select action randomly or according to policy
            self.model._last_obs = self._last_obs
            action, buffer_action = self._sample_action(learning_starts, action_noise)

            # Perform action
            new_obs, reward, done, infos = env.step(action)

            self.num_timesteps += 1
            self.model.num_timesteps = self.num_timesteps
            episode_timesteps += 1
            num_collected_steps += 1

            # Only stop training if return value is False, not when it is None.
            if callback.on_step() is False:
                return RolloutReturn(0.0, num_collected_steps, num_collected_episodes, continue_training=False)

            episode_reward += reward

            # Retrieve reward and episode length if using Monitor wrapper
            self._update_info_buffer(infos, done)
            self.model.ep_info_buffer = self.ep_info_buffer
            self.model.ep_success_buffer = self.ep_success_buffer

            # == Store transition in the replay buffer and/or in the episode storage ==

            if self._vec_normalize_env is not None:
                # Store only the unnormalized version
                new_obs_ = self._vec_normalize_env.get_original_obs()
                reward_ = self._vec_normalize_env.get_original_reward()
            else:
                # Avoid changing the original ones
                self._last_original_obs, new_obs_, reward_ = observation, new_obs, reward
                self.model._last_original_obs = self._last_original_obs

            # As the VecEnv resets automatically, new_obs is already the
            # first observation of the next episode
            if done and infos[0].get("terminal_observation") is not None:
                next_obs = infos[0]["terminal_observation"]
                # VecNormalize normalizes the terminal observation
                if self._vec_normalize_env is not None:
                    next_obs = self._vec_normalize_env.unnormalize_obs(next_obs)
            else:
                next_obs = new_obs_

            if self.online_sampling:
                self.replay_buffer.add(self._last_original_obs, next_obs, buffer_action, reward_, done, infos)
            else:
                # Concatenate observation with (desired) goal
                flattened_obs = ObsDictWrapper.convert_dict(self._last_original_obs)
                flattened_next_obs = ObsDictWrapper.convert_dict(next_obs)
                # Add to replay buffer
                self.replay_buffer.add(flattened_obs, flattened_next_obs, buffer_action, reward_, done)
                # Add current transition to episode storage
                self._episode_storage.add(self._last_original_obs, next_obs, buffer_action, reward_, done, infos)

            self._last_obs = new_obs
            self.model._last_obs = self._last_obs

            # Save the unnormalized new observation
            if self._vec_normalize_env is not None:
                self._last_original_obs = new_obs_
                self.model._last_original_obs = self._last_original_obs

            self.model._update_current_progress_remaining(self.num_timesteps, self._total_timesteps)

            # For DQN, check if the target network should be updated
            # and update the exploration schedule.
            # For SAC/TD3, the update is done at the same time as the gradient update,
            # see https://github.com/hill-a/stable-baselines/issues/900
            self.model._on_step()

            self.episode_steps += 1

            if not should_collect_more_steps(train_freq, num_collected_steps, num_collected_episodes):
                break

        if done or self.episode_steps >= self.max_episode_length:
            if self.online_sampling:
                self.replay_buffer.store_episode()
            else:
                self._episode_storage.store_episode()
                # Sample virtual transitions and store them in the replay buffer
                self._sample_her_transitions()
                # Clear storage for current episode
                self._episode_storage.reset()

            num_collected_episodes += 1
            self._episode_num += 1
            self.model._episode_num = self._episode_num
            episode_rewards.append(episode_reward)
            total_timesteps.append(episode_timesteps)

            if action_noise is not None:
                action_noise.reset()

            # Log training infos
            if log_interval is not None and self._episode_num % log_interval == 0:
                self._dump_logs()

            self.episode_steps = 0

    mean_reward = np.mean(episode_rewards) if num_collected_episodes > 0 else 0.0

    callback.on_rollout_end()

    return RolloutReturn(mean_reward, num_collected_steps, num_collected_episodes, continue_training)
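
def _example_train_freq_usage() -> None:
    # Minimal sketch of the `train_freq` argument used by `collect_rollouts`
    # above (illustrative only): collection continues while
    # `should_collect_more_steps` is True for the chosen unit. The concrete
    # frequencies (4 steps, 1 episode) are assumptions for demonstration.
    collect_four_steps = TrainFreq(4, TrainFrequencyUnit.STEP)
    collect_one_episode = TrainFreq(1, TrainFrequencyUnit.EPISODE)

    # After 4 environment steps, no more steps should be collected
    assert not should_collect_more_steps(collect_four_steps, num_collected_steps=4, num_collected_episodes=0)
    # After 1 full episode, no more episodes should be collected
    assert not should_collect_more_steps(collect_one_episode, num_collected_steps=10, num_collected_episodes=1)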
def _sample_transitions(
    self,
    batch_size: Optional[int],
    maybe_vec_env: Optional[VecNormalize],
    online_sampling: bool,
    n_sampled_goal: Optional[int] = None,
) -> Union[ReplayBufferSamples, Tuple[np.ndarray, ...]]:
    """
    :param batch_size: Number of elements to sample (only used for online sampling)
    :param maybe_vec_env: Associated gym VecEnv to normalize the observations/rewards.
        Only valid when using online sampling
    :param online_sampling: Using online_sampling for HER or not.
    :param n_sampled_goal: Number of sampled goals for replay (offline sampling).
    :return: Samples.
    """
    # Select which episodes to use
    if online_sampling:
        assert batch_size is not None, "No batch_size specified for online sampling of HER transitions"
        # Do not sample the episode with index `self.pos` as the episode is invalid
        if self.full:
            episode_indices = (
                np.random.randint(1, self.n_episodes_stored, batch_size) + self.pos
            ) % self.n_episodes_stored
        else:
            episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size)
        # A subset of the transitions will be relabeled using the HER algorithm
        her_indices = np.arange(batch_size)[: int(self.her_ratio * batch_size)]
    else:
        assert maybe_vec_env is None, "Transitions must be stored unnormalized in the replay buffer"
        assert n_sampled_goal is not None, "No n_sampled_goal specified for offline sampling of HER transitions"

        # Offline sampling: there is only one episode stored
        episode_length = self.episode_lengths[0]
        # We sample n_sampled_goal per timestep in the episode (only one is stored).
        episode_indices = np.tile(0, (episode_length * n_sampled_goal))
        # We only sample virtual transitions
        # as real transitions are already stored in the replay buffer
        her_indices = np.arange(len(episode_indices))

    ep_lengths = self.episode_lengths[episode_indices]

    # Special case when using the "future" goal sampling strategy:
    # we cannot sample all transitions, we have to remove the last timestep
    if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE:
        # Restrict the sampling domain when ep_lengths > 1,
        # otherwise filter out the indices
        her_indices = her_indices[ep_lengths[her_indices] > 1]
        ep_lengths[her_indices] -= 1

    if online_sampling:
        # Select which transitions to use
        transitions_indices = np.random.randint(ep_lengths)
    else:
        if her_indices.size == 0:
            # Episode of one timestep, not enough for using the "future" strategy:
            # no virtual transitions are created in that case
            return np.zeros(0), np.zeros(0), np.zeros(0), np.zeros(0)
        else:
            # Repeat every transition index n_sampled_goal times
            # to sample n_sampled_goal per timestep in the episode (only one is stored),
            # now with the corrected episode length when using the "future" strategy
            transitions_indices = np.tile(np.arange(ep_lengths[0]), n_sampled_goal)
            episode_indices = episode_indices[transitions_indices]
            her_indices = np.arange(len(episode_indices))

    # Get selected transitions
    transitions = {key: self.buffer[key][episode_indices, transitions_indices].copy() for key in self.buffer.keys()}

    # Sample new desired goals and relabel the transitions
    new_goals = self.sample_goals(episode_indices, her_indices, transitions_indices)
    transitions["desired_goal"][her_indices] = new_goals

    # Convert info buffer to numpy array
    transitions["info"] = np.array(
        [
            self.info_buffer[episode_idx][transition_idx]
            for episode_idx, transition_idx in zip(episode_indices, transitions_indices)
        ]
    )

    # Vectorized computation of the new reward
    transitions["reward"][her_indices, 0] = self.env.env_method(
        "compute_reward",
        # The new state depends on the previous state and action
        # s_{t+1} = f(s_t, a_t),
        # so the next_achieved_goal depends also on the previous state and action.
        # Because we are in a GoalEnv:
        # r_t = reward(s_t, a_t) = reward(next_achieved_goal, desired_goal),
        # therefore we have to use "next_achieved_goal" and not "achieved_goal"
        transitions["next_achieved_goal"][her_indices, 0],
        # Here we use the new desired goal
        transitions["desired_goal"][her_indices, 0],
        transitions["info"][her_indices, 0],
    )

    # Concatenate observation with (desired) goal
    observations = ObsDictWrapper.convert_dict(self._normalize_obs(transitions, maybe_vec_env))
    # HACK to make normalize obs work with the next observation
    transitions["observation"] = transitions["next_obs"]
    next_observations = ObsDictWrapper.convert_dict(self._normalize_obs(transitions, maybe_vec_env))

    if online_sampling:
        data = (
            observations[:, 0],
            transitions["action"],
            next_observations[:, 0],
            transitions["done"],
            self._normalize_reward(transitions["reward"], maybe_vec_env),
        )

        return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
    else:
        return observations, next_observations, transitions["action"], transitions["reward"]
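
def _example_her_ratio() -> None:
    # Minimal sketch of how `self.her_ratio` (used in `_sample_transitions`
    # above for online sampling) relates to `n_sampled_goal`, assuming the
    # common convention her_ratio = 1 - 1 / (n_sampled_goal + 1); the concrete
    # numbers below are for illustration only.
    n_sampled_goal = 4
    her_ratio = 1 - 1.0 / (n_sampled_goal + 1)
    assert abs(her_ratio - 0.8) < 1e-8

    # With a batch of 256, the first int(0.8 * 256) = 204 sampled transitions
    # would be relabeled with new (virtual) goals
    batch_size = 256
    n_relabeled = int(her_ratio * batch_size)
    assert n_relabeled == 204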