def test_stack_and_pad_tensor_dict_list(self): result = stack_and_pad_tensor_dict_list(self.data, max_len=5) assert np.array_equal(result['obs'], np.array([[1, 1, 1, 0, 0], [1, 1, 1, 0, 0]])) assert np.array_equal(result['info']['lala'], np.array([[1, 1, 0, 0, 0], [1, 1, 0, 0, 0]])) assert np.array_equal(result['info']['baba'], np.array([[2, 2, 0, 0, 0], [2, 2, 0, 0, 0]]))
def _process_samples(self, itr, paths): # pylint: disable=too-many-statements """Return processed sample data based on the collected paths. Args: itr (int): Iteration number. paths (OrderedDict[dict]): A list of collected paths for each task. In RL^2, there are n environments/tasks and paths in each of them will be concatenated at some point and fed to the policy. Returns: dict: Processed sample data, with key * observations: (numpy.ndarray) * actions: (numpy.ndarray) * rewards: (numpy.ndarray) * returns: (numpy.ndarray) * valids: (numpy.ndarray) * agent_infos: (dict) * env_infos: (dict) * paths: (list[dict]) * average_return: (numpy.float64) Raises: ValueError: If 'batch_idx' is not found. """ concatenated_paths = [] paths_by_task = collections.defaultdict(list) for path in paths: path['returns'] = np_tensor_utils.discount_cumsum( path['rewards'], self._discount) path['lengths'] = [len(path['rewards'])] if 'batch_idx' in path: paths_by_task[path['batch_idx']].append(path) elif 'batch_idx' in path['agent_infos']: paths_by_task[path['agent_infos']['batch_idx'][0]].append(path) else: raise ValueError( 'Batch idx is required for RL2 but not found, ' 'Make sure to use garage.tf.algos.rl2.RL2Worker ' 'for sampling') # all path in paths_by_task[i] are sampled from task[i] for _paths in paths_by_task.values(): concatenated_path = self._concatenate_paths(_paths) concatenated_paths.append(concatenated_path) # stack and pad to max path length of the concatenated # path, which will be fed to inner algo # i.e. max_path_length * episode_per_task concatenated_paths_stacked = ( np_tensor_utils.stack_and_pad_tensor_dict_list( concatenated_paths, self._inner_algo.max_path_length)) name_map = None if hasattr(self._task_sampler, '_envs') and hasattr( self._task_sampler._envs[0].env, 'all_task_names'): names = [ env.env.all_task_names[0] for env in self._task_sampler._envs ] name_map = dict(enumerate(names)) undiscounted_returns = log_multitask_performance( itr, TrajectoryBatch.from_trajectory_list(self._env_spec, paths), self._inner_algo.discount, name_map=name_map) concatenated_paths_stacked['paths'] = concatenated_paths concatenated_paths_stacked['average_return'] = np.mean( undiscounted_returns) return concatenated_paths_stacked