예제 #1
0
class OffPolicyVectorizedSampler(BatchSampler):
    """Vectorized sampler that feeds transitions to an off-policy buffer.

    Several environments are stepped in lockstep. Every step is pushed into
    ``algo.replay_buffer``; per path only the rewards and environment infos
    are retained for later statistics.
    """

    def __init__(self, algo, n_envs=None):
        """
        Store the sampler configuration.

        :param algo: Algorithm whose env, policy, exploration strategy and
            replay buffer are used while sampling.
        :param n_envs: Number of environments stepped in parallel. When
            ``None`` the count is derived in ``start_worker``.
        """
        super(OffPolicyVectorizedSampler, self).__init__(algo)
        self.n_envs = n_envs

    @overrides
    def start_worker(self):
        """Create the vectorized env executor and cache the env spec."""
        algo = self.algo
        env_count = self.n_envs
        if env_count is None:
            # Fall back to algo.rollout_batch_size, clamped to [1, 100].
            env_count = max(1, min(int(algo.rollout_batch_size), 100))

        if getattr(algo.env, 'vectorized', False):
            # The environment provides its own executor.
            self.vec_env = algo.env.vec_env_executor(
                n_envs=env_count, max_path_length=algo.max_path_length)
        else:
            # A pickle round trip yields a separate copy per slot.
            env_copies = []
            for _ in range(env_count):
                env_copies.append(pickle.loads(pickle.dumps(algo.env)))
            self.vec_env = VecEnvExecutor(
                envs=env_copies, max_path_length=algo.max_path_length)
        self.env_spec = algo.env.spec

    @overrides
    def shutdown_worker(self):
        """Close the vectorized environment executor."""
        self.vec_env.close()

    @overrides
    def obtain_samples(self, itr):
        """
        Step the environments and record transitions for one iteration.

        Sampling stops once the steps contained in *finished* paths reach
        ``num_envs * algo.max_path_length``. Each step is added to the
        algorithm's replay buffer as a side effect.

        :param itr: Iteration number (not used by this implementation).
        :return: A list of path dicts with keys ``rewards`` and
            ``env_infos``.
        """
        algo = self.algo
        vec_env = self.vec_env

        paths = []
        obses = vec_env.reset()
        dones = np.asarray([True] * vec_env.num_envs)
        running_paths = [None] * vec_env.num_envs
        n_samples = 0
        sample_target = vec_env.num_envs * algo.max_path_length

        policy = algo.policy
        if algo.es:
            algo.es.reset()

        while n_samples < sample_target:
            policy.reset(dones)

            # Goal-conditioned envs return dict observations; the policy
            # input is the observation concatenated with the desired goal.
            if algo.input_include_goal:
                obs = [entry["observation"] for entry in obses]
                d_g = [entry["desired_goal"] for entry in obses]
                a_g = [entry["achieved_goal"] for entry in obses]
                input_obses = np.concatenate((obs, d_g), axis=-1)
            else:
                input_obses = obses

            # Act through the exploration strategy when one is configured.
            if algo.es:
                actions, agent_infos = algo.es.get_actions(
                    input_obses, algo.policy)
            else:
                actions, agent_infos = algo.policy.get_actions(input_obses)

            next_obses, rewards, dones, env_infos = vec_env.step(actions)
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if agent_infos is None:
                agent_infos = [dict() for _ in range(vec_env.num_envs)]
            if env_infos is None:
                env_infos = [dict() for _ in range(vec_env.num_envs)]

            if algo.input_include_goal:
                # No reward is passed in this branch.
                next_observation = [
                    entry["observation"] for entry in next_obses
                ]
                next_achieved_goal = [
                    entry["achieved_goal"] for entry in next_obses
                ]
                algo.replay_buffer.add_transition(
                    observation=obs,
                    action=actions,
                    goal=d_g,
                    achieved_goal=a_g,
                    terminal=dones,
                    next_observation=next_observation,
                    next_achieved_goal=next_achieved_goal,
                )
            else:
                algo.replay_buffer.add_transition(
                    observation=obses,
                    action=actions,
                    reward=rewards * algo.reward_scale,
                    terminal=dones,
                    next_observation=next_obses,
                )

            step_results = zip(rewards, env_infos, dones)
            for idx, (reward, env_info, done) in enumerate(step_results):
                current = running_paths[idx]
                if current is None:
                    current = dict(rewards=[], env_infos=[])
                    running_paths[idx] = current
                current["rewards"].append(reward)
                current["env_infos"].append(env_info)

                if not done:
                    continue

                # The path ended: stack it, count its steps, free the slot.
                paths.append(
                    dict(rewards=tensor_utils.stack_tensor_list(
                        current["rewards"]),
                         env_infos=tensor_utils.stack_tensor_dict_list(
                             current["env_infos"])))
                n_samples += len(current["rewards"])
                running_paths[idx] = None

                if algo.es:
                    algo.es.reset()
            obses = next_obses

        return paths

    @overrides
    def process_samples(self, itr, paths):
        """
        Summarize collected paths into returns and success rates.

        :param itr: Iteration number (not used by this implementation).
        :param paths: A list of collected paths.
        :return: Dict with ``undiscounted_returns`` (sum of rewards per
            path) and ``success_history`` (mean of ``is_success`` for each
            path whose env infos contain that key).
        """
        success_history = [
            np.mean(np.array(path["env_infos"]["is_success"]))
            for path in paths
            if "is_success" in path["env_infos"]
        ]
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        return dict(undiscounted_returns=undiscounted_returns,
                    success_history=success_history)
예제 #2
0
class OnPolicyVectorizedSampler(BatchSampler):
    """Sampler that collects complete on-policy paths from parallel envs.

    Args:
        algo: Algorithm providing ``policy`` and ``max_path_length``.
        env: Environment; forwarded to ``BatchSampler.__init__``.
        n_envs(int): Number of environment copies stepped in parallel.
    """

    def __init__(self, algo, env, n_envs=1):
        super(OnPolicyVectorizedSampler, self).__init__(algo, env, n_envs)
        self.n_envs = n_envs

    @overrides
    def start_worker(self):
        """Create ``n_envs`` pickled copies of the env and wrap them."""
        # NOTE(review): self.env is presumably set by BatchSampler.__init__.
        n_envs = self.n_envs
        envs = [pickle.loads(pickle.dumps(self.env)) for _ in range(n_envs)]
        self.vec_env = VecEnvExecutor(
            envs=envs, max_path_length=self.algo.max_path_length)
        self.env_spec = self.env.spec

    @overrides
    def shutdown_worker(self):
        """Close the vectorized environment executor."""
        self.vec_env.close()

    def _stack_running_path(self, running_path):
        """Turn the per-step lists of a finished path into a path dict."""
        return dict(
            observations=self.env_spec.observation_space.flatten_n(
                running_path['observations']),
            actions=self.env_spec.action_space.flatten_n(
                running_path['actions']),
            rewards=tensor_utils.stack_tensor_list(running_path['rewards']),
            env_infos=tensor_utils.stack_tensor_dict_list(
                running_path['env_infos']),
            agent_infos=tensor_utils.stack_tensor_dict_list(
                running_path['agent_infos']))

    @overrides
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Collect finished paths until enough samples are gathered.

        Only steps belonging to finished paths count toward ``batch_size``;
        paths still in progress when the loop ends are not returned.
        Time spent in the policy, the environments and in bookkeeping is
        recorded through ``tabular``.

        Args:
            itr(int): Iteration number, used in the log message.
            batch_size(int): Number of samples to collect. A falsy value
                (``None`` or ``0``) selects ``max_path_length * n_envs``.
            whole_paths(bool): If False, the result is passed through
                ``truncate_paths(paths, batch_size)`` before returning.

        Returns:
            list[dict]: Paths with keys ``observations``, ``actions``,
                ``rewards``, ``env_infos`` and ``agent_infos``.
        """
        import time

        logger.log('Obtaining samples for iteration %d...' % itr)

        if not batch_size:
            batch_size = self.algo.max_path_length * self.n_envs

        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy

        # perf_counter is monotonic; wall-clock time.time() can jump and
        # would then distort the recorded durations.
        clock = time.perf_counter
        try:
            while n_samples < batch_size:
                t = clock()
                policy.reset(dones)

                actions, agent_infos = policy.get_actions(obses)

                policy_time += clock() - t
                t = clock()
                next_obses, rewards, dones, env_infos = self.vec_env.step(
                    actions)
                env_time += clock() - t
                t = clock()

                agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
                env_infos = tensor_utils.split_tensor_dict_list(env_infos)
                if env_infos is None:
                    env_infos = [
                        dict() for _ in range(self.vec_env.num_envs)
                    ]
                if agent_infos is None:
                    agent_infos = [
                        dict() for _ in range(self.vec_env.num_envs)
                    ]

                step_data = zip(obses, actions, rewards, env_infos,
                                agent_infos, dones)
                for idx, (observation, action, reward, env_info, agent_info,
                          done) in enumerate(step_data):
                    if running_paths[idx] is None:
                        running_paths[idx] = dict(
                            observations=[],
                            actions=[],
                            rewards=[],
                            env_infos=[],
                            agent_infos=[],
                        )
                    running_path = running_paths[idx]
                    running_path['observations'].append(observation)
                    running_path['actions'].append(action)
                    running_path['rewards'].append(reward)
                    running_path['env_infos'].append(env_info)
                    running_path['agent_infos'].append(agent_info)
                    if done:
                        paths.append(self._stack_running_path(running_path))
                        n_samples += len(running_path['rewards'])
                        running_paths[idx] = None

                process_time += clock() - t
                pbar.inc(len(obses))
                obses = next_obses
        finally:
            # Stop the progress bar even if the policy or an env raised.
            pbar.stop()

        tabular.record('PolicyExecTime', policy_time)
        tabular.record('EnvExecTime', env_time)
        tabular.record('ProcessExecTime', process_time)

        if whole_paths:
            return paths
        return truncate_paths(paths, batch_size)
예제 #3
0
class OffPolicyVectorizedSampler(BatchSampler):
    """Vectorized off-policy sampler whose paths may span several batches.

    Every environment step is added to ``algo.replay_buffer``. A path that
    is cut off at the end of a batch keeps its running statistics (return,
    length, success count) so that the next batch continues them.

    Args:
        algo(garage.np.RLAlgorithm): Algorithm.
        env(garage.envs.GarageEnv): Environment.
        n_envs(int): Number of parallel environments managed by sampler.
            ``None`` selects ``int(algo.rollout_batch_size)``.
        no_reset(bool): If True, ``obtain_samples`` resumes from the
            observations stored by the previous call (once there is one)
            instead of resetting the environments.
    """

    def __init__(self, algo, env, n_envs=None, no_reset=True):
        n_envs = int(algo.rollout_batch_size) if n_envs is None else n_envs
        super(OffPolicyVectorizedSampler, self).__init__(algo, env, n_envs)
        self.n_envs = n_envs
        self.no_reset = no_reset

        # State carried over between consecutive obtain_samples() calls,
        # one slot per environment.
        self._last_obses = None
        self._last_uncounted_discount = [0 for _ in range(n_envs)]
        self._last_running_length = [0 for _ in range(n_envs)]
        self._last_success_count = [0 for _ in range(n_envs)]

    @overrides
    def start_worker(self):
        """Create the pickled environment copies and their executor."""
        env_copies = []
        for _ in range(self.n_envs):
            env_copies.append(pickle.loads(pickle.dumps(self.env)))
        self.vec_env = VecEnvExecutor(
            envs=env_copies, max_path_length=self.algo.max_path_length)
        self.env_spec = self.env.spec

    @overrides
    def shutdown_worker(self):
        """Close the vectorized environment executor."""
        self.vec_env.close()

    @overrides
    def obtain_samples(self, itr, batch_size):
        """Step the environments until ``batch_size`` interactions are made.

        Each loop iteration counts one interaction per environment. A path
        is emitted when its episode ends or when the batch is full; in the
        latter case the per-environment counters are left untouched so the
        next call can continue them.

        Args:
            itr(int): Iteration number, forwarded to the exploration
                strategy.
            batch_size(int): Number of environment interactions in one batch.

        Returns:
            list: A list of path dicts with keys ``rewards``, ``dones``,
                ``env_infos``, ``running_length``, ``undiscounted_return``
                and ``success_count``.
        """
        algo = self.algo
        vec_env = self.vec_env

        paths = []
        must_reset = not self.no_reset or self._last_obses is None
        obses = vec_env.reset() if must_reset else self._last_obses
        dones = np.asarray([True] * vec_env.num_envs)
        running_paths = [None] * vec_env.num_envs
        n_samples = 0

        policy = algo.policy
        if algo.es:
            algo.es.reset()

        while n_samples < batch_size:
            policy.reset(dones)

            # Goal-conditioned envs return dict observations; the policy
            # input is the observation concatenated with the desired goal.
            if algo.input_include_goal:
                obs = [entry['observation'] for entry in obses]
                d_g = [entry['desired_goal'] for entry in obses]
                a_g = [entry['achieved_goal'] for entry in obses]
                input_obses = np.concatenate((obs, d_g), axis=-1)
            else:
                input_obses = obses

            if algo.es:
                actions, agent_infos = algo.es.get_actions(
                    itr, input_obses, algo.policy)
            else:
                actions, agent_infos = algo.policy.get_actions(input_obses)

            next_obses, rewards, dones, env_infos = vec_env.step(actions)
            self._last_obses = next_obses
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            # Counted before the per-env loop so that the truncation check
            # below already sees the updated total.
            n_samples += len(next_obses)

            if agent_infos is None:
                agent_infos = [dict() for _ in range(vec_env.num_envs)]
            if env_infos is None:
                env_infos = [dict() for _ in range(vec_env.num_envs)]

            if algo.input_include_goal:
                next_observation = [
                    entry['observation'] for entry in next_obses
                ]
                next_achieved_goal = [
                    entry['achieved_goal'] for entry in next_obses
                ]
                algo.replay_buffer.add_transitions(
                    observation=obs,
                    action=actions,
                    goal=d_g,
                    achieved_goal=a_g,
                    terminal=dones,
                    next_observation=next_observation,
                    next_achieved_goal=next_achieved_goal,
                )
            else:
                algo.replay_buffer.add_transitions(
                    observation=obses,
                    action=actions,
                    reward=rewards * algo.reward_scale,
                    terminal=dones,
                    next_observation=next_obses,
                )

            step_results = zip(rewards, env_infos, dones)
            for idx, (reward, env_info, done) in enumerate(step_results):
                current = running_paths[idx]
                if current is None:
                    # Seed a new segment with the totals carried over from
                    # earlier batches; running_length is therefore not
                    # necessarily len(rewards).
                    current = dict(
                        rewards=[],
                        env_infos=[],
                        dones=[],
                        undiscounted_return=self._last_uncounted_discount[idx],
                        running_length=self._last_running_length[idx],
                        success_count=self._last_success_count[idx])
                    running_paths[idx] = current

                success = env_info.get('is_success') or 0

                current['rewards'].append(reward)
                current['env_infos'].append(env_info)
                current['dones'].append(done)
                current['running_length'] += 1
                current['undiscounted_return'] += reward
                current['success_count'] += success

                self._last_uncounted_discount[idx] += reward
                self._last_success_count[idx] += success
                self._last_running_length[idx] += 1

                if not done and n_samples < batch_size:
                    continue

                paths.append(
                    dict(
                        rewards=tensor_utils.stack_tensor_list(
                            current['rewards']),
                        dones=tensor_utils.stack_tensor_list(
                            current['dones']),
                        env_infos=tensor_utils.stack_tensor_dict_list(
                            current['env_infos']),
                        running_length=current['running_length'],
                        undiscounted_return=current['undiscounted_return'],
                        success_count=current['success_count']))
                running_paths[idx] = None

                if done:
                    # Episode finished: clear the carried-over counters.
                    self._last_running_length[idx] = 0
                    self._last_success_count[idx] = 0
                    self._last_uncounted_discount[idx] = 0

                if algo.es:
                    algo.es.reset()
            obses = next_obses

        return paths

    @overrides
    def process_samples(self, itr, paths):
        """Summarize collected paths into returns and success rates.

        Args:
            itr(int): Iteration number (not used by this implementation).
            paths(list): A list of collected paths.

        Returns:
            dict: ``undiscounted_returns`` holds each path's accumulated
                return; ``success_history`` holds each path's
                ``success_count / running_length``.
        """
        undiscounted_returns = []
        success_history = []
        for path in paths:
            success_history.append(
                path['success_count'] / path['running_length'])
        for path in paths:
            undiscounted_returns.append(path['undiscounted_return'])
        return dict(
            undiscounted_returns=undiscounted_returns,
            success_history=success_history)
예제 #4
0
class OnPolicyVectorizedSampler(BatchSampler):
    """Sampler that collects complete on-policy paths from parallel envs."""

    def __init__(self, algo, n_envs=None):
        """
        Store the sampler configuration.

        :param algo: Algorithm providing ``env``, ``policy``,
            ``batch_size`` and ``max_path_length``.
        :param n_envs: Number of environments stepped in parallel. When
            ``None`` the count is derived in ``start_worker``.
        """
        super(OnPolicyVectorizedSampler, self).__init__(algo)
        self.n_envs = n_envs

    @overrides
    def start_worker(self):
        """Create the vectorized env executor and cache the env spec."""
        n_envs = self.n_envs
        if n_envs is None:
            # Default: paths per batch, clamped to the range [1, 100].
            n_envs = int(self.algo.batch_size / self.algo.max_path_length)
            n_envs = max(1, min(n_envs, 100))

        if getattr(self.algo.env, 'vectorized', False):
            # The environment provides its own executor.
            self.vec_env = self.algo.env.vec_env_executor(
                n_envs=n_envs, max_path_length=self.algo.max_path_length)
        else:
            # A pickle round trip yields a separate copy per slot.
            envs = [
                pickle.loads(pickle.dumps(self.algo.env))
                for _ in range(n_envs)
            ]
            self.vec_env = VecEnvExecutor(
                envs=envs, max_path_length=self.algo.max_path_length)
        self.env_spec = self.algo.env.spec

    @overrides
    def shutdown_worker(self):
        """Close the vectorized environment executor."""
        self.vec_env.close()

    def _stack_running_path(self, running_path):
        """Turn the per-step lists of a finished path into a path dict."""
        # NOTE(review): the output key is the singular "next_observation"
        # while the running list is "next_observations"; kept unchanged
        # because consumers of the paths may rely on it.
        return dict(
            observations=self.env_spec.observation_space.flatten_n(
                running_path["observations"]),
            actions=self.env_spec.action_space.flatten_n(
                running_path["actions"]),
            rewards=tensor_utils.stack_tensor_list(running_path["rewards"]),
            next_observation=tensor_utils.stack_tensor_list(
                running_path["next_observations"]),
            env_infos=tensor_utils.stack_tensor_dict_list(
                running_path["env_infos"]),
            agent_infos=tensor_utils.stack_tensor_dict_list(
                running_path["agent_infos"]))

    @overrides
    def obtain_samples(self, itr):
        """
        Collect finished paths until ``algo.batch_size`` samples exist.

        Only steps belonging to finished paths count toward the batch size;
        paths still in progress when the loop ends are not returned. Time
        spent in the policy, the environments and in bookkeeping is
        recorded through ``logger.record_tabular``.

        :param itr: Iteration number, used in the log message.
        :return: A list of path dicts with keys ``observations``,
            ``actions``, ``rewards``, ``next_observation``, ``env_infos``
            and ``agent_infos``.
        """
        import time

        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy

        # perf_counter is monotonic; wall-clock time.time() can jump and
        # would then distort the recorded durations.
        clock = time.perf_counter
        try:
            while n_samples < self.algo.batch_size:
                t = clock()
                policy.reset(dones)

                actions, agent_infos = policy.get_actions(obses)

                policy_time += clock() - t
                t = clock()
                next_obses, rewards, dones, env_infos = self.vec_env.step(
                    actions)
                env_time += clock() - t
                t = clock()

                agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
                env_infos = tensor_utils.split_tensor_dict_list(env_infos)
                if env_infos is None:
                    env_infos = [
                        dict() for _ in range(self.vec_env.num_envs)
                    ]
                if agent_infos is None:
                    agent_infos = [
                        dict() for _ in range(self.vec_env.num_envs)
                    ]

                step_data = zip(obses, actions, rewards, next_obses,
                                env_infos, agent_infos, dones)
                for idx, (observation, action, reward, next_observation,
                          env_info, agent_info, done) in enumerate(step_data):
                    if running_paths[idx] is None:
                        running_paths[idx] = dict(
                            observations=[],
                            actions=[],
                            rewards=[],
                            next_observations=[],
                            env_infos=[],
                            agent_infos=[],
                        )
                    running_path = running_paths[idx]
                    running_path["observations"].append(observation)
                    running_path["actions"].append(action)
                    running_path["rewards"].append(reward)
                    running_path["next_observations"].append(
                        next_observation)
                    running_path["env_infos"].append(env_info)
                    running_path["agent_infos"].append(agent_info)
                    if done:
                        paths.append(self._stack_running_path(running_path))
                        n_samples += len(running_path["rewards"])
                        running_paths[idx] = None

                process_time += clock() - t
                pbar.inc(len(obses))
                obses = next_obses
        finally:
            # Stop the progress bar even if the policy or an env raised.
            pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths