示例#1
0
    def _setup_worker(self, env_indices, tasks):
        """Rebuild the vectorized environment for a group of tasks.

        An already existing vectorized environment is closed first. Then, for
        every index in ``env_indices``, ``self._envs_per_worker`` deep copies
        of ``self.env`` are created and given ``tasks[index]``. When a global
        seed is set, the i-th copy is seeded with ``seed + i``.

        Args:
            env_indices (List[Int]): Indices of environments to be assigned
                to workers for sampling.
            tasks (List[dict]): List of tasks to assign.

        """
        previous = self._vec_env
        if previous is not None:
            previous.close()

        replicas = []
        for task_index in env_indices:
            for _ in range(self._envs_per_worker):
                replica = copy.deepcopy(self.env)
                replica.set_task(tasks[task_index])
                replicas.append(replica)

        # Derive one distinct seed per copy from the global seed, if any.
        base_seed = deterministic.get_seed()
        if base_seed is not None:
            for offset, replica in enumerate(replicas):
                replica.seed(base_seed + offset)

        horizon = self.algo.max_path_length
        self._vec_env = VecEnvExecutor(envs=replicas, max_path_length=horizon)
示例#2
0
    def start_worker(self):
        """Create the vectorized environment used for sampling.

        ``self._n_envs`` independent copies of ``self.env`` are produced by a
        pickle round-trip and wrapped in a ``VecEnvExecutor`` stored in
        ``self._vec_env``.
        """
        replicas = []
        for _ in range(self._n_envs):
            replicas.append(pickle.loads(pickle.dumps(self.env)))

        # When a global seed is set, give the i-th copy the seed `seed + i`.
        base_seed = deterministic.get_seed()
        if base_seed is not None:
            for offset, replica in enumerate(replicas):
                replica.seed(base_seed + offset)

        horizon = self.algo.max_path_length
        self._vec_env = VecEnvExecutor(envs=replicas, max_path_length=horizon)
示例#3
0
class RL2Sampler(BaseSampler):
    """Sampler which uses VecEnvExecutor to run multiple environments.

    This sampler is for RL^2. See https://arxiv.org/pdf/1611.02779.pdf.

    In RL^2, there are n environments/tasks and paths in each of them
    will be concatenated at some point and fed to the policy. Each path
    returned by this sampler records, under the key ``batch_idx``, the
    position of its environment inside the vectorized executor it was
    sampled from.

    Args:
        algo (metarl.np.algos.RLAlgorithm): An algorithm instance.
        env (metarl.envs.MetaRLEnv): Environment to sample from.
        meta_batch_size (int): Meta batch size for sampling. If it is
            larger than n_envs, it must be a multiple of n_envs so it can be
            evenly distributed among environments.
        n_envs (int): Number of environment instances for sampling. If it is
            larger than meta_batch_size, it must be a multiple of
            meta_batch_size so batch can be evenly distributed among
            environments. Defaults to ``singleton_pool.n_parallel * 4``.

    Raises:
        ValueError: If meta_batch_size > n_envs and meta_batch_size is not
            a multiple of n_envs, or if n_envs > meta_batch_size and n_envs
            is not a multiple of meta_batch_size.

    """

    def __init__(self, algo, env, meta_batch_size, n_envs=None):
        super().__init__(algo, env)
        if n_envs is None:
            n_envs = singleton_pool.n_parallel * 4

        self._n_envs = n_envs
        self._meta_batch_size = meta_batch_size
        self._vec_env = None
        self._envs_per_worker = None
        self._vec_envs_indices = None

        if self._meta_batch_size > self._n_envs:
            # More tasks than environments: tasks are sampled in several
            # groups, one environment copy per task.
            if self._meta_batch_size % self._n_envs != 0:
                raise ValueError(
                    'meta_batch_size must be a multiple of n_envs')
            self._envs_per_worker = 1
            # NOTE(review): this yields n_envs groups of
            # meta_batch_size // n_envs tasks each, so a group can hold more
            # than n_envs environments. Confirm whether the intended section
            # count is meta_batch_size // n_envs instead.
            self._vec_envs_indices = np.split(np.arange(self._meta_batch_size),
                                              self._n_envs)
        else:
            # At least as many environments as tasks: a single group in
            # which every task gets the same number of environment copies.
            if self._n_envs % self._meta_batch_size != 0:
                raise ValueError(
                    'n_envs must be a multiple of meta_batch_size')
            self._envs_per_worker = self._n_envs // self._meta_batch_size
            self._vec_envs_indices = [np.arange(self._meta_batch_size)]

    def start_worker(self):
        """This function is deprecated."""

    def shutdown_worker(self):
        """Shutdown workers.

        Safe to call before any sampling happened: the vectorized
        environment is only created inside obtain_samples(), so there may
        be nothing to close yet.
        """
        if self._vec_env is not None:
            self._vec_env.close()
            # Drop the reference so the executor is not closed a second time
            # by a later _setup_worker() call.
            self._vec_env = None

    def _setup_worker(self, env_indices, tasks):
        """Setup workers.

        Closes the current vectorized environment (if any) and builds a new
        one containing ``self._envs_per_worker`` deep copies of ``self.env``
        for every index in ``env_indices``, each set to ``tasks[index]``.

        Args:
            env_indices (List[Int]): Indices of environments to be assigned
                to workers for sampling.
            tasks (List[dict]): List of tasks to assign.

        """
        if self._vec_env is not None:
            self._vec_env.close()

        vec_envs = []
        for env_ind in env_indices:
            for _ in range(self._envs_per_worker):
                vec_env = copy.deepcopy(self.env)
                vec_env.set_task(tasks[env_ind])
                vec_envs.append(vec_env)

        # Deterministically set environment seeds based on the global seed.
        seed0 = deterministic.get_seed()
        if seed0 is not None:
            for (i, e) in enumerate(vec_envs):
                e.seed(seed0 + i)

        self._vec_env = VecEnvExecutor(
            envs=vec_envs, max_path_length=self.algo.max_path_length)

    # pylint: disable=too-many-statements
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Sample the policy for new trajectories.

        If batch size is not specified, episode per task by default is 1 so
        batch size will be meta_batch_size * max_path_length.

        When number of workers are less than meta batch size, sampling will
        be performed for each of self._vec_envs_indices in series. The
        i-th value of self._vec_envs_indices represents the indices of the
        environments/tasks to be sampled for the i-th iteration.

        Args:
            itr (int): Iteration number.
            batch_size (int): Number of samples to be collected. If None,
                it defaults to algo.max_path_length * meta_batch_size.
            whole_paths (bool): Whether return all the paths or not. True
                by default. It's possible for the paths to have total actual
                sample size larger than batch_size; they are truncated to
                batch_size only if this flag is false.

        Returns:
            list[dict]: Sample paths. Each path additionally stores, under
                the key ``batch_idx``, the index of its environment inside
                the vectorized executor it was sampled from.


        Note:
            Each path is a dictionary, with keys and values as following:
                * observations: numpy.ndarray with shape :math:`[N, S^*]`
                * actions: numpy.ndarray with shape :math:`[N, S^*]`
                * rewards: numpy.ndarray with shape :math:`[N, S^*]`
                * dones: numpy.ndarray with shape :math:`[N, S^*]`
                * env_infos: A dictionary with each key representing one
                  environment info, value being a numpy.ndarray with shape
                  :math:`[N, S^*]`. One example is "ale.lives" for atari
                  environments.
                * agent_infos: A dictionary with each key representing one
                  agent info, value being a numpy.ndarray with shape
                  :math:`[N, S^*]`. One example is "prev_action", which is used
                  for recurrent policy as previous action input, merged with
                  the observation input as the state input.

        """
        logger.log('Obtaining samples for iteration %d...' % itr)

        if batch_size is None:
            batch_size = self.algo.max_path_length * self._meta_batch_size

        paths = []

        tasks = self.env.sample_tasks(self._meta_batch_size)

        # One progress bar and one set of timers for the whole call, so the
        # bar is stopped exactly once and the recorded times cover every
        # group of environments rather than only the last one.
        pbar = ProgBarCounter(batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy

        # Start main loop
        batch_size_per_loop = batch_size // len(self._vec_envs_indices)
        for vec_envs_indices in self._vec_envs_indices:
            self._setup_worker(vec_envs_indices, tasks)

            n_samples = 0
            obses = self._vec_env.reset()
            dones = np.asarray([True] * self._vec_env.num_envs)
            running_paths = [None] * self._vec_env.num_envs

            # Only reset policies at the beginning of a meta batch, not
            # whenever an individual episode terminates.
            policy.reset(dones)

            while n_samples < batch_size_per_loop:
                t = time.time()

                actions, agent_infos = policy.get_actions(obses)

                policy_time += time.time() - t
                t = time.time()
                next_obses, rewards, dones, env_infos = self._vec_env.step(
                    actions)
                env_time += time.time() - t
                t = time.time()

                agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
                env_infos = tensor_utils.split_tensor_dict_list(env_infos)
                if env_infos is None:
                    env_infos = [dict() for _ in range(self._vec_env.num_envs)]
                if agent_infos is None:
                    agent_infos = [
                        dict() for _ in range(self._vec_env.num_envs)
                    ]
                for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                        itertools.count(), obses, actions, rewards, env_infos,
                        agent_infos, dones):
                    if running_paths[idx] is None:
                        running_paths[idx] = dict(
                            observations=[],
                            actions=[],
                            rewards=[],
                            dones=[],
                            env_infos=[],
                            agent_infos=[],
                        )
                    path = running_paths[idx]
                    path['observations'].append(observation)
                    path['actions'].append(action)
                    path['rewards'].append(reward)
                    path['dones'].append(done)
                    path['env_infos'].append(env_info)
                    path['agent_infos'].append(agent_info)
                    if done:
                        # Episode finished: freeze the step lists into
                        # arrays and only now count its samples.
                        paths.append(
                            dict(observations=np.asarray(
                                path['observations']),
                                 actions=np.asarray(path['actions']),
                                 rewards=np.asarray(path['rewards']),
                                 dones=np.asarray(path['dones']),
                                 env_infos=tensor_utils.stack_tensor_dict_list(
                                     path['env_infos']),
                                 agent_infos=tensor_utils.
                                 stack_tensor_dict_list(path['agent_infos']),
                                 batch_idx=idx))
                        n_samples += len(path['rewards'])
                        running_paths[idx] = None

                process_time += time.time() - t
                pbar.inc(len(obses))
                obses = next_obses

        pbar.stop()

        tabular.record('PolicyExecTime', policy_time)
        tabular.record('EnvExecTime', env_time)
        tabular.record('ProcessExecTime', process_time)

        return paths if whole_paths else truncate_paths(paths, batch_size)
示例#4
0
class OnPolicyVectorizedSampler(BatchSampler):
    """BatchSampler which uses VecEnvExecutor to run multiple environments.

    Instantiating this class emits a DeprecationWarning.

    Args:
        algo (metarl.np.algos.RLAlgorithm): An algorithm instance.
        env (metarl.envs.MetaRLEnv): An environment instance.
        n_envs (int): Number of environment instances to setup.
            This parameter has effect on sampling performance. Defaults to
            ``singleton_pool.n_parallel * 4``.

    """
    def __init__(self, algo, env, n_envs=None):
        if n_envs is None:
            n_envs = singleton_pool.n_parallel * 4
        super().__init__(algo, env)
        self._n_envs = n_envs

        self._vec_env = None
        self._env_spec = self.env.spec

        warnings.warn(
            DeprecationWarning(
                'OnPolicyVectorizedSampler is deprecated, and will be '
                'removed in the next release. Please use VecWorker and one of '
                'the new samplers which implement metarl.sampler.Sampler, '
                'such as RaySampler.'))

    def start_worker(self):
        """Start workers.

        Creates ``n_envs`` copies of the environment through a cloudpickle
        round-trip and wraps them in a VecEnvExecutor.
        """
        n_envs = self._n_envs
        envs = [
            cloudpickle.loads(cloudpickle.dumps(self.env))
            for _ in range(n_envs)
        ]

        # Deterministically set environment seeds based on the global seed.
        seed0 = deterministic.get_seed()
        if seed0 is not None:
            for (i, e) in enumerate(envs):
                e.seed(seed0 + i)

        self._vec_env = VecEnvExecutor(
            envs=envs, max_path_length=self.algo.max_path_length)

    def shutdown_worker(self):
        """Shutdown workers.

        Does nothing if start_worker() was never called.
        """
        if self._vec_env is not None:
            self._vec_env.close()

    # pylint: disable=too-many-statements
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Sample the policy for new trajectories.

        Args:
            itr (int): Iteration number.
            batch_size (int): Number of samples to be collected. If None
                (or 0), it defaults to algo.max_path_length * n_envs.
            whole_paths (bool): Whether return all the paths or not. True
                by default. It's possible for the paths to have total actual
                sample size larger than batch_size; they are truncated to
                batch_size only if this flag is false.

        Returns:
            list[dict]: Sample paths.

        Note:
            Each path is a dictionary, with keys and values as following:
                * observations: numpy.ndarray with shape [Batch, *obs_dims]
                * actions: numpy.ndarray with shape [Batch, *act_dims]
                * rewards: numpy.ndarray with shape [Batch, ]
                * env_infos: A dictionary with each key representing one
                  environment info, value being a numpy.ndarray with shape
                  [Batch, ?]. One example is "ale.lives" for atari
                  environments.
                * agent_infos: A dictionary with each key representing one
                  agent info, value being a numpy.ndarray with shape
                  [Batch, ?]. One example is "prev_action", which is used
                  for recurrent policy as previous action input, merged with
                  the observation input as the state input.
                * dones: numpy.ndarray with shape [Batch, ]

        """
        logger.log('Obtaining samples for iteration %d...' % itr)

        if not batch_size:
            batch_size = self.algo.max_path_length * self._n_envs

        paths = []
        n_samples = 0
        obses = self._vec_env.reset()
        # Every environment starts as "done" so the first policy.reset()
        # call covers all of them.
        dones = np.asarray([True] * self._vec_env.num_envs)
        running_paths = [None] * self._vec_env.num_envs

        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy

        with click.progressbar(length=batch_size, label='Sampling') as pbar:
            while n_samples < batch_size:
                t = time.time()
                policy.reset(dones)

                actions, agent_infos = policy.get_actions(obses)

                policy_time += time.time() - t
                t = time.time()
                next_obses, rewards, dones, env_infos = \
                    self._vec_env.step(actions)
                env_time += time.time() - t
                t = time.time()

                agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
                env_infos = tensor_utils.split_tensor_dict_list(env_infos)
                if env_infos is None:
                    env_infos = [dict() for _ in range(self._vec_env.num_envs)]
                if agent_infos is None:
                    agent_infos = [
                        dict() for _ in range(self._vec_env.num_envs)
                    ]
                for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                        itertools.count(), obses, actions, rewards, env_infos,
                        agent_infos, dones):
                    if running_paths[idx] is None:
                        running_paths[idx] = dict(observations=[],
                                                  actions=[],
                                                  rewards=[],
                                                  env_infos=[],
                                                  agent_infos=[],
                                                  dones=[])
                    path = running_paths[idx]
                    path['observations'].append(observation)
                    path['actions'].append(action)
                    path['rewards'].append(reward)
                    path['env_infos'].append(env_info)
                    path['agent_infos'].append(agent_info)
                    path['dones'].append(done)
                    if done:
                        # Episode finished: freeze the step lists into
                        # arrays and only now count its samples.
                        paths.append(
                            dict(observations=np.asarray(
                                path['observations']),
                                 actions=np.asarray(path['actions']),
                                 rewards=np.asarray(path['rewards']),
                                 env_infos=tensor_utils.stack_tensor_dict_list(
                                     path['env_infos']),
                                 agent_infos=tensor_utils.
                                 stack_tensor_dict_list(path['agent_infos']),
                                 dones=np.asarray(path['dones'])))
                        n_samples += len(path['rewards'])
                        running_paths[idx] = None

                process_time += time.time() - t
                pbar.update(len(obses))
                obses = next_obses

        tabular.record('PolicyExecTime', policy_time)
        tabular.record('EnvExecTime', env_time)
        tabular.record('ProcessExecTime', process_time)

        return paths if whole_paths else truncate_paths(paths, batch_size)
class OffPolicyVectorizedSampler(BatchSampler):
    """This class implements OffPolicyVectorizedSampler.

    Sampled transitions are added, path by path, to
    ``algo.replay_buffer``; obtain_samples() returns per-path statistics.
    Instantiating this class emits a DeprecationWarning.

    Args:
        algo (metarl.np.RLAlgorithm): Algorithm.
        env (metarl.envs.MetaRLEnv): Environment.
        n_envs (int): Number of parallel environments managed by sampler.
            Defaults to ``int(algo.rollout_batch_size)``.
        no_reset (bool): If True, environments are not reset between calls
            to obtain_samples(); sampling resumes from the last
            observations instead.

    """
    def __init__(self, algo, env, n_envs=None, no_reset=True):
        if n_envs is None:
            n_envs = int(algo.rollout_batch_size)
        super().__init__(algo, env)
        self._n_envs = n_envs
        self._no_reset = no_reset

        self._vec_env = None
        self._env_spec = self.env.spec

        # Per-environment state carried across obtain_samples() calls so
        # that statistics of a path spanning several batches stay correct.
        self._last_obses = None
        self._last_uncounted_discount = [0] * n_envs
        self._last_running_length = [0] * n_envs
        self._last_success_count = [0] * n_envs

        warnings.warn(
            DeprecationWarning(
                'OffPolicyVectorizedSampler is deprecated, and will be '
                'removed in the next release. Please use VecWorker and one of '
                'the new samplers which implement metarl.sampler.Sampler, '
                'such as RaySampler.'))

    def start_worker(self):
        """Initialize the sampler.

        Creates ``n_envs`` copies of the environment through a cloudpickle
        round-trip and wraps them in a VecEnvExecutor.
        """
        n_envs = self._n_envs
        envs = [
            cloudpickle.loads(cloudpickle.dumps(self.env))
            for _ in range(n_envs)
        ]

        # Deterministically set environment seeds based on the global seed.
        seed0 = deterministic.get_seed()
        if seed0 is not None:
            for (i, e) in enumerate(envs):
                e.seed(seed0 + i)

        self._vec_env = VecEnvExecutor(
            envs=envs, max_path_length=self.algo.max_path_length)

    def shutdown_worker(self):
        """Terminate workers if necessary.

        Does nothing if start_worker() was never called.
        """
        if self._vec_env is not None:
            self._vec_env.close()

    # pylint: disable=too-many-branches, too-many-statements
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Collect samples for the given iteration number.

        Every finished or flushed path is also added to
        ``algo.replay_buffer`` via ``add_path``.

        Args:
            itr(int): Iteration number.
            batch_size(int): Number of environment interactions in one batch.
                Must not be None.
            whole_paths(bool): Not effective. Only keep here to comply
                with base class.

        Raises:
            ValueError: If the algorithm doesn't have an exploration_policy
                field.

        Returns:
            list: A list of paths. Each path is a dict with the keys
                rewards, dones, env_infos, running_length,
                undiscounted_return and success_count.

        """
        assert batch_size is not None

        paths = []
        if not self._no_reset or self._last_obses is None:
            obses = self._vec_env.reset()
        else:
            obses = self._last_obses
        completes = np.asarray([True] * self._vec_env.num_envs)
        running_paths = [None] * self._vec_env.num_envs
        n_samples = 0

        policy = self.algo.exploration_policy
        if policy is None:
            raise ValueError('OffPolicyVectorizedSampler should only be used '
                             'with an exploration_policy.')

        # The spaces do not change while sampling; look them up once.
        obs_space = self.algo.env_spec.observation_space
        act_space = self._env_spec.action_space

        while n_samples < batch_size:
            policy.reset(completes)
            input_obses = obs_space.flatten_n(obses)

            actions, agent_infos = policy.get_actions(input_obses)

            next_obses, rewards, dones, env_infos = \
                self._vec_env.step(actions)
            # Must be read before env_infos is split into per-env dicts.
            completes = env_infos['vec_env_executor.complete']
            self._last_obses = next_obses
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            n_samples += len(next_obses)

            if agent_infos is None:
                agent_infos = [dict() for _ in range(self._vec_env.num_envs)]
            if env_infos is None:
                env_infos = [dict() for _ in range(self._vec_env.num_envs)]

            for idx, reward, env_info, done, obs, next_obs, action in zip(
                    itertools.count(), rewards, env_infos, dones, obses,
                    next_obses, actions):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        rewards=[],
                        observations=[],
                        next_observations=[],
                        actions=[],
                        env_infos=[],
                        dones=[],
                        undiscounted_return=self._last_uncounted_discount[idx],
                        # running_length: Length of path up to now
                        # Note that running_length is not len(rewards)
                        # Because a path may not be complete in one batch
                        running_length=self._last_running_length[idx],
                        success_count=self._last_success_count[idx])

                path = running_paths[idx]
                # `is_success` may be missing or None; count that as 0.
                success = env_info.get('is_success') or 0

                path['rewards'].append(reward)
                path['observations'].append(obs)
                path['next_observations'].append(next_obs)
                path['actions'].append(action)
                path['env_infos'].append(env_info)
                path['dones'].append(done)
                path['running_length'] += 1
                path['undiscounted_return'] += reward
                path['success_count'] += success

                self._last_uncounted_discount[idx] += reward
                self._last_success_count[idx] += success
                self._last_running_length[idx] += 1

                # Flush when the episode ended, or when the batch quota is
                # reached and the (possibly unfinished) path must be
                # reported.
                if done or n_samples >= batch_size:
                    paths.append(
                        dict(
                            rewards=np.asarray(path['rewards']),
                            dones=np.asarray(path['dones']),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                path['env_infos']),
                            running_length=path['running_length'],
                            undiscounted_return=path['undiscounted_return'],
                            success_count=path['success_count']))

                    path_dict = {}

                    path_dict['observations'] = obs_space.flatten_n(
                        path['observations'])
                    path_dict['next_observations'] = obs_space.flatten_n(
                        path['next_observations'])
                    path_dict['rewards'] = np.asarray(
                        path['rewards']).reshape(-1, 1)
                    path_dict['terminals'] = np.asarray(
                        path['dones']).reshape(-1, 1)
                    path_dict['actions'] = act_space.flatten_n(
                        path['actions'])

                    self.algo.replay_buffer.add_path(path_dict)
                    running_paths[idx] = None

                    # Only a real episode end clears the carried-over
                    # statistics; an unfinished path continues in the next
                    # batch.
                    if done:
                        self._last_running_length[idx] = 0
                        self._last_success_count[idx] = 0
                        self._last_uncounted_discount[idx] = 0
            obses = next_obses
        return paths
示例#6
0
class OffPolicyVectorizedSampler(BatchSampler):
    """This class implements OffPolicyVectorizedSampler.

    Sampled transitions are added step by step to ``algo.replay_buffer``;
    obtain_samples() returns per-path statistics.

    Args:
        algo (metarl.np.RLAlgorithm): Algorithm.
        env (metarl.envs.MetaRLEnv): Environment.
        n_envs (int): Number of parallel environments managed by sampler.
            If None is passed explicitly, ``int(algo.rollout_batch_size)``
            is used.
        no_reset (bool): If True, environments are not reset between calls
            to obtain_samples(); sampling resumes from the last
            observations instead.

    """
    def __init__(self, algo, env, n_envs=1, no_reset=False):
        if n_envs is None:
            n_envs = int(algo.rollout_batch_size)
        super().__init__(algo, env)
        self._n_envs = n_envs
        self._no_reset = no_reset

        self._vec_env = None
        self._env_spec = self.env.spec

        # Per-environment state carried across obtain_samples() calls so
        # that statistics of a path spanning several batches stay correct.
        self._last_obses = None
        self._last_uncounted_discount = [0] * n_envs
        self._last_running_length = [0] * n_envs
        self._last_success_count = [0] * n_envs

    def start_worker(self):
        """Initialize the sampler.

        Creates ``n_envs`` copies of the environment through a pickle
        round-trip and wraps them in a VecEnvExecutor.
        """
        n_envs = self._n_envs
        envs = [pickle.loads(pickle.dumps(self.env)) for _ in range(n_envs)]

        # Deterministically set environment seeds based on the global seed.
        seed0 = deterministic.get_seed()
        if seed0 is not None:
            for (i, e) in enumerate(envs):
                e.seed(seed0 + i)

        self._vec_env = VecEnvExecutor(
            envs=envs, max_path_length=self.algo.max_path_length)

    def shutdown_worker(self):
        """Terminate workers if necessary.

        Does nothing if start_worker() was never called.
        """
        if self._vec_env is not None:
            self._vec_env.close()

    # pylint: disable=too-many-branches, too-many-statements
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Collect samples for the given iteration number.

        Every environment step is also added to ``algo.replay_buffer`` via
        ``add_transitions``.

        Args:
            itr(int): Iteration number.
            batch_size(int): Number of environment interactions in one batch.
                Must not be None.
            whole_paths(bool): Not effective. Only keep here to comply
                with base class.

        Returns:
            list: A list of paths. Each path is a dict with the keys
                rewards, dones, env_infos, running_length,
                undiscounted_return and success_count.

        """
        assert batch_size is not None

        paths = []
        if not self._no_reset or self._last_obses is None:
            obses = self._vec_env.reset()
        else:
            obses = self._last_obses
        dones = np.asarray([True] * self._vec_env.num_envs)
        running_paths = [None] * self._vec_env.num_envs
        n_samples = 0

        policy = self.algo.policy
        if self.algo.es:
            self.algo.es.reset()

        while n_samples < batch_size:
            policy.reset(dones)
            if self.algo.input_include_goal:
                # Goal-conditioned observations are dicts; the policy input
                # is the observation concatenated with the desired goal.
                obs = [o['observation'] for o in obses]
                d_g = [o['desired_goal'] for o in obses]
                a_g = [o['achieved_goal'] for o in obses]
                input_obses = np.concatenate((obs, d_g), axis=-1)
            else:
                input_obses = obses
            obs_normalized = tensor_utils.normalize_pixel_batch(
                self._env_spec, input_obses)
            if self.algo.es:
                actions, agent_infos = self.algo.es.get_actions(
                    itr, obs_normalized, self.algo.policy)
            else:
                actions, agent_infos = self.algo.policy.get_actions(
                    obs_normalized)

            next_obses, rewards, dones, env_infos = self._vec_env.step(actions)
            self._last_obses = next_obses
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            n_samples += len(next_obses)

            if agent_infos is None:
                agent_infos = [dict() for _ in range(self._vec_env.num_envs)]
            if env_infos is None:
                env_infos = [dict() for _ in range(self._vec_env.num_envs)]

            if self.algo.input_include_goal:
                # No reward is passed to the buffer in the goal-conditioned
                # case.
                self.algo.replay_buffer.add_transitions(
                    observation=obs,
                    action=actions,
                    goal=d_g,
                    achieved_goal=a_g,
                    terminal=dones,
                    next_observation=[
                        next_obs['observation'] for next_obs in next_obses
                    ],
                    next_achieved_goal=[
                        next_obs['achieved_goal'] for next_obs in next_obses
                    ],
                )
            else:
                self.algo.replay_buffer.add_transitions(
                    observation=obses,
                    action=actions,
                    reward=rewards,
                    terminal=dones,
                    next_observation=next_obses,
                )

            for idx, reward, env_info, done in zip(itertools.count(), rewards,
                                                   env_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        rewards=[],
                        env_infos=[],
                        dones=[],
                        undiscounted_return=self._last_uncounted_discount[idx],
                        # running_length: Length of path up to now
                        # Note that running_length is not len(rewards)
                        # Because a path may not be complete in one batch
                        running_length=self._last_running_length[idx],
                        success_count=self._last_success_count[idx])

                path = running_paths[idx]
                # `is_success` may be missing or None; count that as 0.
                success = env_info.get('is_success') or 0

                path['rewards'].append(reward)
                path['env_infos'].append(env_info)
                path['dones'].append(done)
                path['running_length'] += 1
                path['undiscounted_return'] += reward
                path['success_count'] += success

                self._last_uncounted_discount[idx] += reward
                self._last_success_count[idx] += success
                self._last_running_length[idx] += 1

                # Flush when the episode ended, or when the batch quota is
                # reached and the (possibly unfinished) path must be
                # reported.
                if done or n_samples >= batch_size:
                    paths.append(
                        dict(
                            rewards=np.asarray(path['rewards']),
                            dones=np.asarray(path['dones']),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                path['env_infos']),
                            running_length=path['running_length'],
                            undiscounted_return=path['undiscounted_return'],
                            success_count=path['success_count']))
                    running_paths[idx] = None

                    # Only a real episode end clears the carried-over
                    # statistics; an unfinished path continues in the next
                    # batch.
                    if done:
                        self._last_running_length[idx] = 0
                        self._last_success_count[idx] = 0
                        self._last_uncounted_discount[idx] = 0

                    if self.algo.es:
                        self.algo.es.reset()
            obses = next_obses
        return paths