Example #1
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Collect samples for the given iteration number.

        Args:
            itr (int): Iteration number.
            batch_size (int): Number of environment steps in one batch.
            whole_paths (bool): Whether to return whole paths or truncate
                them to batch_size samples.

        Returns:
            list[dict]: A list of paths.

        """
        if not batch_size:
            batch_size = self.algo.max_path_length * self.n_envs

        cur_policy_params = self.algo.policy.get_param_values()
        paths = parallel_sampler.sample_paths(
            policy_params=cur_policy_params,
            max_samples=batch_size,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
        )
        if whole_paths:
            return paths
        else:
            paths_truncated = truncate_paths(paths, batch_size)
            return paths_truncated
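
The truncate_paths helper used throughout these examples is not shown on this page. Below is a minimal sketch, assuming the behavior exercised by the tests in Examples #3, #4, #11 and #12 further down (whole paths are kept until the sample budget is reached, the last retained path is cut down, nested info dicts are truncated recursively, and unexpected values raise ValueError); the actual library implementation may differ.

    import numpy as np

    def _truncate_value(value, n):
        # Recursively truncate arrays, lists, and dicts of them to the first n entries.
        if isinstance(value, dict):
            return {k: _truncate_value(v, n) for k, v in value.items()}
        if isinstance(value, (list, np.ndarray)):
            return value[:n]
        raise ValueError('Unsupported value in path: {!r}'.format(value))

    def truncate_paths(paths, max_samples):
        # Keep whole paths until max_samples is reached, then truncate the last one.
        # The input list and its paths are left unmodified.
        truncated = []
        remaining = max_samples
        for path in paths:
            path_len = len(path['rewards'])
            if path_len <= remaining:
                truncated.append(path)
            else:
                truncated.append(
                    {k: _truncate_value(v, remaining) for k, v in path.items()})
            remaining -= min(path_len, remaining)
            if remaining == 0:
                break
        return truncated
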
Example #2
    def _obtain_is_samples(self, _itr, batch_size=None, whole_paths=True):
        """Collect IS samples for the given iteration number.

        Args:
            _itr (int): Iteration number (unused).
            batch_size (int): Number of samples to collect in one batch.
            whole_paths (bool): Whether to return whole paths or truncate
                them to batch_size samples.

        Returns:
            list: A list of paths.

        """
        if batch_size is None:
            batch_size = self.algo.max_path_length

        paths = []
        for hist_policy_distribution, hist_paths in self.get_history_list(
                self.n_backtrack):
            h_paths = self._sample_isweighted_paths(
                policy=self.algo.policy,
                hist_policy_distribution=hist_policy_distribution,
                max_samples=batch_size,
                paths=hist_paths,
                hist_variance_penalty=self.hist_variance_penalty,
                max_is_ratio=self.max_is_ratio,
                ess_threshold=self.ess_threshold,
            )
            paths.extend(h_paths)

        if len(paths) > batch_size:
            paths = random.sample(paths, batch_size)

        return paths if whole_paths else truncate_paths(paths, batch_size)
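
The _sample_isweighted_paths helper used in Examples #2, #8 and #9 is not shown here. The sketch below is a rough, hypothetical illustration of the underlying idea only; the function and its callables are assumptions, not the library's API. Historical paths are reweighted by a clipped importance-sampling ratio between the current policy and the policy that generated them, and the batch is dropped if the effective sample size falls below ess_threshold. The hist_variance_penalty smoothing of the historical distribution is not modeled here.

    import numpy as np

    def isweighted_paths_sketch(paths, cur_log_prob, hist_log_prob,
                                max_is_ratio=10.0, ess_threshold=0.0):
        # cur_log_prob(path) / hist_log_prob(path) are assumed callables that
        # return the summed log-likelihood of the path's actions under the
        # current and the historical policy, respectively.
        weighted, ratios = [], []
        for path in paths:
            # Importance ratio pi_cur / pi_hist, computed in log space and clipped.
            ratio = float(np.exp(cur_log_prob(path) - hist_log_prob(path)))
            ratio = min(ratio, max_is_ratio)
            new_path = dict(path)
            new_path['rewards'] = np.asarray(path['rewards']) * ratio
            weighted.append(new_path)
            ratios.append(ratio)

        # Effective sample size of the weights: ESS = (sum w)^2 / sum(w^2).
        w = np.asarray(ratios)
        if ess_threshold > 0 and len(w) and w.sum() ** 2 / (w ** 2).sum() < ess_threshold:
            return []  # the historical batch is too far off-policy to reuse
        return weighted
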
Example #3
    def test_truncate_paths(self):
        paths = [
            dict(
                observations=np.zeros((100, 1)),
                actions=np.zeros((100, 1)),
                rewards=np.zeros(100),
                env_infos=dict(),
                agent_infos=dict(lala=np.zeros(100)),
            ),
            dict(
                observations=np.zeros((50, 1)),
                actions=np.zeros((50, 1)),
                rewards=np.zeros(50),
                env_infos=dict(),
                agent_infos=dict(lala=np.zeros(50)),
            ),
        ]

        truncated = truncate_paths(paths, 130)
        assert len(truncated) == 2
        assert len(truncated[-1]['observations']) == 30
        assert len(truncated[0]['observations']) == 100
        # make sure not to change the original one
        assert len(paths) == 2
        assert len(paths[-1]['observations']) == 50
Example #4
 def test_truncates(self):
     truncated = utils.truncate_paths(self.paths, 130)
     assert len(truncated) == 2
     assert len(truncated[-1]['observations']) == 30
     assert len(truncated[0]['observations']) == 100
     # make sure not to change the original one
     assert len(self.paths) == 2
     assert len(self.paths[-1]['observations']) == 50
Example #5
 def obtain_samples(self, itr):
     cur_policy_params = self.algo.policy.get_param_values()
     paths = parallel_sampler.sample_paths(
         policy_params=cur_policy_params,
         max_samples=self.algo.batch_size,
         max_path_length=self.algo.max_path_length,
         scope=self.algo.scope,
     )
     if self.algo.whole_paths:
         return paths
     else:
         paths_truncated = truncate_paths(paths, self.algo.batch_size)
         return paths_truncated
Example #6
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Obtain samples."""
        if not batch_size:
            batch_size = self.algo.max_path_length

        cur_params = self.algo.policy.get_param_values()
        paths = parallel_sampler.sample_paths(
            policy_params=cur_params,
            max_samples=batch_size,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
        )

        return paths if whole_paths else truncate_paths(paths, batch_size)
Example #7
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        if not batch_size:
            batch_size = self.algo.max_path_length * self.n_envs

        cur_policy_params = self.algo.policy.get_param_values()
        paths = parallel_sampler.sample_paths(
            policy_params=cur_policy_params,
            max_samples=batch_size,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
        )
        if whole_paths:
            return paths
        else:
            paths_truncated = truncate_paths(paths, batch_size)
            return paths_truncated
Example #8
 def obtain_is_samples(self, itr):
     paths = []
     for hist_policy_distribution, hist_paths in self.get_history_list(
             self.n_backtrack):
         h_paths = self.sample_isweighted_paths(
             policy=self.algo.policy,
             hist_policy_distribution=hist_policy_distribution,
             max_samples=self.algo.batch_size,
             max_path_length=self.algo.max_path_length,
             paths=hist_paths,
             hist_variance_penalty=self.hist_variance_penalty,
             max_is_ratio=self.max_is_ratio,
             ess_threshold=self.ess_threshold,
         )
         paths.extend(h_paths)
     if len(paths) > self.algo.batch_size:
         paths = random.sample(paths, self.algo.batch_size)
     if self.algo.whole_paths:
         return paths
     else:
         paths_truncated = truncate_paths(paths, self.algo.batch_size)
         return paths_truncated
Example #9
    def _obtain_is_samples(self, itr, batch_size=None, whole_paths=True):
        if batch_size is None:
            batch_size = self.algo.max_path_length

        paths = []
        for hist_policy_distribution, hist_paths in self.get_history_list(
                self.n_backtrack):
            h_paths = self._sample_isweighted_paths(
                policy=self.algo.policy,
                hist_policy_distribution=hist_policy_distribution,
                max_samples=batch_size,
                paths=hist_paths,
                hist_variance_penalty=self.hist_variance_penalty,
                max_is_ratio=self.max_is_ratio,
                ess_threshold=self.ess_threshold,
            )
            paths.extend(h_paths)

        if len(paths) > batch_size:
            paths = random.sample(paths, batch_size)

        return paths if whole_paths else truncate_paths(paths, batch_size)
Example #10
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Sample the policy for new trajectories.

        Args:
            itr (int): Iteration number.
            batch_size (int): Number of environment steps in one batch.
            whole_paths (bool): Whether to return whole paths or truncate
                them to batch_size samples.

        Returns:
            list[dict]: A list of paths.

        """
        if not batch_size:
            batch_size = self.algo.max_path_length

        cur_params = self.algo.policy.get_param_values()
        paths = parallel_sampler.sample_paths(
            policy_params=cur_params,
            max_samples=batch_size,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
        )

        return paths if whole_paths else truncate_paths(paths, batch_size)
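
For context, samplers like the ones above are typically driven once per training iteration. The driver loop below is hypothetical; only obtain_samples(itr) comes from the examples on this page, and the other algo/sampler method names are illustrative assumptions.

    def train(algo, sampler, n_itr=100):
        # Hypothetical driver loop; method names other than obtain_samples are assumed.
        sampler.start_worker()            # e.g. spin up parallel/vectorized envs
        try:
            for itr in range(n_itr):
                paths = sampler.obtain_samples(itr)          # one batch of trajectories
                samples_data = algo.process_samples(itr, paths)
                algo.optimize_policy(itr, samples_data)
        finally:
            sampler.shutdown_worker()     # always release sampling resources
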
Example #11
 def test_invalid_path(self):
     self.paths[0]['invalid'] = None
     with pytest.raises(ValueError):
         utils.truncate_paths(self.paths, 3)
Example #12
 def test_truncates_dict(self):
     truncated = utils.truncate_paths(self.paths_dict, 130)
     assert len(truncated) == 2
     assert len(truncated[-1]['agent_infos']['lala']['baba']) == 30
     assert len(truncated[0]['agent_infos']['lala']['baba']) == 100
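
Examples #4, #11 and #12 reference self.paths and self.paths_dict fixtures that are not part of the snippets. A plausible setup_method, modeled on the inline paths from Example #3 (the nested lala/baba layout is inferred from the assertions in Example #12), might look like this:

    import numpy as np

    class TestTruncatePaths:

        def setup_method(self):
            # Two paths of length 100 and 50, mirroring Example #3.
            self.paths = [
                dict(observations=np.zeros((100, 1)),
                     actions=np.zeros((100, 1)),
                     rewards=np.zeros(100),
                     env_infos=dict(),
                     agent_infos=dict(lala=np.zeros(100))),
                dict(observations=np.zeros((50, 1)),
                     actions=np.zeros((50, 1)),
                     rewards=np.zeros(50),
                     env_infos=dict(),
                     agent_infos=dict(lala=np.zeros(50))),
            ]
            # Same layout, but with a nested dict inside agent_infos.
            self.paths_dict = [
                dict(observations=np.zeros((100, 1)),
                     actions=np.zeros((100, 1)),
                     rewards=np.zeros(100),
                     env_infos=dict(),
                     agent_infos=dict(lala=dict(baba=np.zeros(100)))),
                dict(observations=np.zeros((50, 1)),
                     actions=np.zeros((50, 1)),
                     rewards=np.zeros(50),
                     env_infos=dict(),
                     agent_infos=dict(lala=dict(baba=np.zeros(50)))),
            ]
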
Example #13
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Obtain samples."""
        logger.log('Obtaining samples for iteration %d...' % itr)

        if not batch_size:
            batch_size = self.algo.max_path_length * self.n_envs

        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy

        import time
        while n_samples < batch_size:
            t = time.time()
            policy.reset(dones)

            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t
            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]['observations'].append(observation)
                running_paths[idx]['actions'].append(action)
                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['agent_infos'].append(agent_info)
                if done:
                    paths.append(
                        dict(observations=self.env_spec.observation_space.
                             flatten_n(running_paths[idx]['observations']),
                             actions=self.env_spec.action_space.flatten_n(
                                 running_paths[idx]['actions']),
                             rewards=tensor_utils.stack_tensor_list(
                                 running_paths[idx]['rewards']),
                             env_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['env_infos']),
                             agent_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['agent_infos'])))
                    n_samples += len(running_paths[idx]['rewards'])
                    running_paths[idx] = None

            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        tabular.record('PolicyExecTime', policy_time)
        tabular.record('EnvExecTime', env_time)
        tabular.record('ProcessExecTime', process_time)

        if whole_paths:
            return paths
        else:
            paths_truncated = truncate_paths(paths, batch_size)
            return paths_truncated
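
Examples #13 through #16 rely on tensor_utils.split_tensor_dict_list and tensor_utils.stack_tensor_dict_list to convert between a batched info dict (one array per key, leading dimension n_envs) and a list of per-env dicts. A simplified, non-recursive sketch of what those helpers do (the real versions also handle nested dicts):

    import numpy as np

    def split_tensor_dict_list(tensor_dict):
        # {'key': array of shape [n_envs, ...]} -> list of n_envs per-env dicts,
        # or None when there is nothing to split.
        if not tensor_dict:
            return None
        keys = list(tensor_dict.keys())
        n = len(tensor_dict[keys[0]])
        return [{k: tensor_dict[k][i] for k in keys} for i in range(n)]

    def stack_tensor_dict_list(dict_list):
        # list of per-step dicts -> {'key': stacked array with leading dimension T}.
        if not dict_list:
            return dict()
        keys = dict_list[0].keys()
        return {k: np.asarray([d[k] for d in dict_list]) for k in keys}
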
Example #14
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Sample the policy for new trajectories.

        Args:
            itr (int): Iteration number.
            batch_size (int): Number of samples to be collected. If None,
                it defaults to algo.max_path_length * n_envs.
            whole_paths (bool): Whether to return all the paths or not. True
                by default. The collected paths may contain more samples in
                total than batch_size; they are truncated to batch_size when
                this flag is False.

        Returns:
            list[dict]: Sample paths, each path with key
                * observations: (numpy.ndarray)
                * actions: (numpy.ndarray)
                * rewards: (numpy.ndarray)
                * agent_infos: (dict)
                * env_infos: (dict)

        """
        logger.log('Obtaining samples for iteration %d...' % itr)

        if not batch_size:
            batch_size = self.algo.max_path_length * self._n_envs

        paths = []
        n_samples = 0
        obses = self._vec_env.reset()
        dones = np.asarray([True] * self._vec_env.num_envs)
        running_paths = [None] * self._vec_env.num_envs

        pbar = ProgBarCounter(batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy

        while n_samples < batch_size:
            t = time.time()
            policy.reset(dones)

            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self._vec_env.step(actions)
            env_time += time.time() - t
            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self._vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self._vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]['observations'].append(observation)
                running_paths[idx]['actions'].append(action)
                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['agent_infos'].append(agent_info)
                if done:
                    obs = np.asarray(running_paths[idx]['observations'])
                    actions = np.asarray(running_paths[idx]['actions'])
                    paths.append(
                        dict(observations=obs,
                             actions=actions,
                             rewards=np.asarray(running_paths[idx]['rewards']),
                             env_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['env_infos']),
                             agent_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['agent_infos'])))
                    n_samples += len(running_paths[idx]['rewards'])
                    running_paths[idx] = None

            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        tabular.record('PolicyExecTime', policy_time)
        tabular.record('EnvExecTime', env_time)
        tabular.record('ProcessExecTime', process_time)

        return paths if whole_paths else truncate_paths(paths, batch_size)
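
A small, hypothetical sanity check of the per-path structure documented above (the concrete shapes depend on the environment; sampler can be any of the samplers in these examples):

    def check_paths(sampler, itr=0, batch_size=4000):
        # Hypothetical helper: verify each returned path matches the documented keys.
        paths = sampler.obtain_samples(itr=itr, batch_size=batch_size)
        for path in paths:
            steps = len(path['rewards'])
            assert path['observations'].shape[0] == steps
            assert path['actions'].shape[0] == steps
            assert isinstance(path['env_infos'], dict)
            assert isinstance(path['agent_infos'], dict)
        total = sum(len(p['rewards']) for p in paths)
        print('collected %d environment steps in %d paths' % (total, len(paths)))
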
Example #15
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Sample the policy for new trajectories.

        If batch size is not specified, one episode per task is collected by
        default, so the batch size will be meta_batch_size * max_path_length.

        When the number of workers is less than the meta batch size, sampling
        is performed for each element of self._vec_envs_indices in series. The
        i-th value of self._vec_envs_indices holds the indices of the
        environments/tasks to be sampled in the i-th pass.

        Args:
            itr (int): Iteration number.
            batch_size (int): Number of samples to be collected. If None,
                it defaults to algo.max_path_length * meta_batch_size.
            whole_paths (bool): Whether to return all the paths or not. True
                by default. The collected paths may contain more samples in
                total than batch_size; they are truncated to batch_size when
                this flag is False.

        Returns:
            list[dict]: Sample paths. Each path carries a batch_idx key
                identifying the environment/task it was sampled from, so a
                caller can group paths per task (see the sketch after this
                example).


        Note:
            Each path is a dictionary, with keys and values as following:
                * observations: numpy.ndarray with shape :math:`[N, S^*]`
                * actions: numpy.ndarray with shape :math:`[N, S^*]`
                * rewards: numpy.ndarray with shape :math:`[N, S^*]`
                * dones: numpy.ndarray with shape :math:`[N, S^*]`
                * env_infos: A dictionary with each key representing one
                  environment info, value being a numpy.ndarray with shape
                  :math:`[N, S^*]`. One example is "ale.lives" for atari
                  environments.
                * agent_infos: A dictionary with each key representing one
                  agent info, value being a numpy.ndarray with shape
                  :math:`[N, S^*]`. One example is "prev_action", which is used
                  for recurrent policy as previous action input, merged with
                  the observation input as the state input.

        """
        logger.log('Obtaining samples for iteration %d...' % itr)

        if batch_size is None:
            batch_size = self.algo.max_path_length * self._meta_batch_size

        paths = []

        tasks = self.env.sample_tasks(self._meta_batch_size)

        # Start main loop
        batch_size_per_loop = batch_size // len(self._vec_envs_indices)
        for vec_envs_indices in self._vec_envs_indices:
            self._setup_worker(vec_envs_indices, tasks)

            n_samples = 0
            obses = self._vec_env.reset()
            dones = np.asarray([True] * self._vec_env.num_envs)
            running_paths = [None] * self._vec_env.num_envs

            pbar = ProgBarCounter(batch_size)
            policy_time = 0
            env_time = 0
            process_time = 0

            policy = self.algo.policy
            # Only reset policies at the beginning of a meta batch
            policy.reset(dones)

            while n_samples < batch_size_per_loop:
                t = time.time()

                actions, agent_infos = policy.get_actions(obses)

                policy_time += time.time() - t
                t = time.time()
                next_obses, rewards, dones, env_infos = self._vec_env.step(
                    actions)
                env_time += time.time() - t
                t = time.time()

                agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
                env_infos = tensor_utils.split_tensor_dict_list(env_infos)
                if env_infos is None:
                    env_infos = [dict() for _ in range(self._vec_env.num_envs)]
                if agent_infos is None:
                    agent_infos = [
                        dict() for _ in range(self._vec_env.num_envs)
                    ]
                for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                        itertools.count(), obses, actions, rewards, env_infos,
                        agent_infos, dones):
                    if running_paths[idx] is None:
                        running_paths[idx] = dict(
                            observations=[],
                            actions=[],
                            rewards=[],
                            dones=[],
                            env_infos=[],
                            agent_infos=[],
                        )
                    running_paths[idx]['observations'].append(observation)
                    running_paths[idx]['actions'].append(action)
                    running_paths[idx]['rewards'].append(reward)
                    running_paths[idx]['dones'].append(done)
                    running_paths[idx]['env_infos'].append(env_info)
                    running_paths[idx]['agent_infos'].append(agent_info)
                    if done:
                        obs = np.asarray(running_paths[idx]['observations'])
                        actions = np.asarray(running_paths[idx]['actions'])
                        paths.append(
                            dict(observations=obs,
                                 actions=actions,
                                 rewards=np.asarray(
                                     running_paths[idx]['rewards']),
                                 dones=np.asarray(running_paths[idx]['dones']),
                                 env_infos=tensor_utils.stack_tensor_dict_list(
                                     running_paths[idx]['env_infos']),
                                 agent_infos=tensor_utils.
                                 stack_tensor_dict_list(
                                     running_paths[idx]['agent_infos']),
                                 batch_idx=idx))
                        n_samples += len(running_paths[idx]['rewards'])
                        running_paths[idx] = None

                process_time += time.time() - t
                pbar.inc(len(obses))
                obses = next_obses

        pbar.stop()

        tabular.record('PolicyExecTime', policy_time)
        tabular.record('EnvExecTime', env_time)
        tabular.record('ProcessExecTime', process_time)

        return paths if whole_paths else truncate_paths(paths, batch_size)
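
Each path returned above carries a batch_idx key identifying the environment/task it was sampled from, so a caller that wants a per-task grouping can build it afterwards. A hypothetical caller-side helper:

    import collections

    def group_paths_by_task(paths):
        # Group sampled paths into an OrderedDict keyed by their batch_idx.
        by_task = collections.OrderedDict()
        for path in paths:
            by_task.setdefault(path['batch_idx'], []).append(path)
        return by_task
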
Example #16
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Sample the policy for new trajectories.

        Args:
            itr (int): Iteration number.
            batch_size (int): Number of samples to be collected. If None,
                it defaults to algo.max_path_length * n_envs.
            whole_paths (bool): Whether to return all the paths or not. True
                by default. The collected paths may contain more samples in
                total than batch_size; they are truncated to batch_size when
                this flag is False.

        Returns:
            list[dict]: Sample paths.

        Note:
            Each path is a dictionary, with keys and values as following:
                * observations: numpy.ndarray with shape [Batch, *obs_dims]
                * actions: numpy.ndarray with shape [Batch, *act_dims]
                * rewards: numpy.ndarray with shape [Batch, ]
                * env_infos: A dictionary with each key representing one
                  environment info, value being a numpy.ndarray with shape
                  [Batch, ?]. One example is "ale.lives" for atari
                  environments.
                * agent_infos: A dictionary with each key representing one
                  agent info, value being a numpy.ndarray with shape
                  [Batch, ?]. One example is "prev_action", which is used
                  for recurrent policy as previous action input, merged with
                  the observation input as the state input.
                * dones: numpy.ndarray with shape [Batch, ]

        """
        logger.log('Obtaining samples for iteration %d...' % itr)

        if not batch_size:
            batch_size = self.algo.max_path_length * self._n_envs

        paths = []
        n_samples = 0
        obses = self._vec_env.reset()
        dones = np.asarray([True] * self._vec_env.num_envs)
        running_paths = [None] * self._vec_env.num_envs

        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy

        with click.progressbar(length=batch_size, label='Sampling') as pbar:
            while n_samples < batch_size:
                t = time.time()
                policy.reset(dones)

                actions, agent_infos = policy.get_actions(obses)

                policy_time += time.time() - t
                t = time.time()
                next_obses, rewards, dones, env_infos = \
                    self._vec_env.step(actions)
                env_time += time.time() - t
                t = time.time()

                agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
                env_infos = tensor_utils.split_tensor_dict_list(env_infos)
                if env_infos is None:
                    env_infos = [dict() for _ in range(self._vec_env.num_envs)]
                if agent_infos is None:
                    agent_infos = [
                        dict() for _ in range(self._vec_env.num_envs)
                    ]
                for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                        itertools.count(), obses, actions, rewards, env_infos,
                        agent_infos, dones):
                    if running_paths[idx] is None:
                        running_paths[idx] = dict(observations=[],
                                                  actions=[],
                                                  rewards=[],
                                                  env_infos=[],
                                                  agent_infos=[],
                                                  dones=[])
                    running_paths[idx]['observations'].append(observation)
                    running_paths[idx]['actions'].append(action)
                    running_paths[idx]['rewards'].append(reward)
                    running_paths[idx]['env_infos'].append(env_info)
                    running_paths[idx]['agent_infos'].append(agent_info)
                    running_paths[idx]['dones'].append(done)
                    if done:
                        obs = np.asarray(running_paths[idx]['observations'])
                        actions = np.asarray(running_paths[idx]['actions'])
                        paths.append(
                            dict(observations=obs,
                                 actions=actions,
                                 rewards=np.asarray(
                                     running_paths[idx]['rewards']),
                                 env_infos=tensor_utils.stack_tensor_dict_list(
                                     running_paths[idx]['env_infos']),
                                 agent_infos=tensor_utils.
                                 stack_tensor_dict_list(
                                     running_paths[idx]['agent_infos']),
                                 dones=np.asarray(
                                     running_paths[idx]['dones'])))
                        n_samples += len(running_paths[idx]['rewards'])
                        running_paths[idx] = None

                process_time += time.time() - t
                pbar.update(len(obses))
                obses = next_obses

        tabular.record('PolicyExecTime', policy_time)
        tabular.record('EnvExecTime', env_time)
        tabular.record('ProcessExecTime', process_time)

        return paths if whole_paths else truncate_paths(paths, batch_size)
Example #17
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Collect samples for the given iteration number.

        Parameters
        ----------
        itr : int
            Iteration number.
        batch_size : int, optional
            How many simulation steps to run in each epoch.
        whole_paths : bool, optional
            Whether to return the full rollout paths data.
        """
        if not batch_size:
            batch_size = self.algo.max_path_length * self.n_envs

        cur_policy_params = self.algo.policy.get_param_values()
        cur_env_params = self.algo.env.get_param_values()
        paths = parallel_sampler.sample_paths(
            policy_params=cur_policy_params,
            max_samples=batch_size,
            max_path_length=self.algo.max_path_length,
            env_params=cur_env_params,
            scope=self.algo.scope,
        )
        # TODO: Doing the path correction here means the simulations will not be parallel.
        #  Need to make own parallel sampler and put it there to make that work
        if self.open_loop:
            if self.batch_simulate:
                paths = self.sim.batch_simulate_paths(
                    paths=paths, reward_function=self.reward_function)
            else:
                for path in paths:
                    s_0 = path["observations"][0]
                    actions = path['actions']
                    end_idx, info = self.sim.simulate(actions=actions, s_0=s_0)
                    if end_idx >= 0:
                        self.slice_dict(path, end_idx)
                    rewards = self.reward_function.give_reward(
                        action=actions[end_idx],
                        info=self.sim.get_reward_info()
                    )
                    path["rewards"][end_idx] = rewards
                    path['env_infos']['cache'] = np.zeros_like(path["rewards"])

        if whole_paths:
            return paths
        else:
            paths_truncated = truncate_paths(paths, batch_size)
            return paths_truncated
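
Example #17 calls self.slice_dict(path, end_idx), which is not shown on this page. A hypothetical sketch of such a helper, assuming it trims every array (and nested dict of arrays) in the path to the first end_idx + 1 steps, in place:

    import numpy as np

    def slice_dict(path, end_idx):
        # Hypothetical in-place trim of a path dict to steps [0, end_idx].
        # Nested dicts (env_infos, agent_infos) are trimmed recursively;
        # other values are left untouched.
        for key, value in path.items():
            if isinstance(value, dict):
                slice_dict(value, end_idx)
            elif isinstance(value, (list, np.ndarray)):
                path[key] = value[:end_idx + 1]
        return path
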