def process_samples(self, paths): r"""Process sample data based on the collected paths. Notes: P is the maximum path length (self.max_path_length) Args: paths (list[dict]): A list of collected paths Returns: torch.Tensor: The observations of the environment with shape :math:`(N, P, O*)`. torch.Tensor: The actions fed to the environment with shape :math:`(N, P, A*)`. torch.Tensor: The acquired rewards with shape :math:`(N, P)`. list[int]: Numbers of valid steps in each paths. torch.Tensor: Value function estimation at each step with shape :math:`(N, P)`. """ if self.exploration_bonus > 0.: paths = self._add_exploration_bonus(paths) valids = torch.Tensor([len(path['actions']) for path in paths]).int().to(ptu.device) obs = torch.stack([ pad_to_last(path['observations'], total_length=self.max_path_length, axis=0) for path in paths ]).to(ptu.device) actions = torch.stack([ pad_to_last(path['actions'], total_length=self.max_path_length, axis=0) for path in paths ]).to(ptu.device) rewards = torch.stack([ pad_to_last(path['rewards'].reshape(-1), total_length=self.max_path_length) for path in paths ]).to(ptu.device) returns = torch.stack([ pad_to_last(tu.discount_cumsum(path['rewards'].reshape(-1), self.discount).copy(), total_length=self.max_path_length) for path in paths ]).to(ptu.device) # batch x label_num x label_dim env_infos = [ ppp.list_of_dicts__to__dict_of_lists(p['env_infos']) for p in paths ] labels = torch.stack([ pad_to_last(env_info['sup_labels'], total_length=self.max_path_length, axis=0) for env_info in env_infos ]).to(ptu.device) with torch.no_grad(): baselines = self._value_function(obs).squeeze(-1) return obs, actions, rewards, returns, valids, baselines, labels
def process_samples(self, paths): r"""Process sample data based on the collected paths. Notes: P is the maximum path length (self.max_path_length) Args: paths (list[dict]): A list of collected paths Returns: torch.Tensor: The observations of the environment with shape :math:`(N, P, O*)`. torch.Tensor: The actions fed to the environment with shape :math:`(N, P, A*)`. torch.Tensor: The acquired rewards with shape :math:`(N, P)`. list[int]: Numbers of valid steps in each paths. torch.Tensor: Value function estimation at each step with shape :math:`(N, P)`. """ valids = torch.Tensor([len(path['actions']) for path in paths]).int().to(ptu.device) obs_n = torch.stack([ pad_to_last(path['observations'], total_length=self.max_path_length, axis=0) for path in paths ]).to(ptu.device) actions_n = torch.stack([ pad_to_last(path['actions'], total_length=self.max_path_length, axis=0) for path in paths ]).to(ptu.device) if 'raw_actions' in paths[0].keys(): raw_actions_n = torch.stack([ pad_to_last(path['raw_actions'], total_length=self.max_path_length, axis=0) for path in paths ]).to(ptu.device) else: raw_actions_n = None rewards_n = torch.stack([ pad_to_last(path['rewards'], total_length=self.max_path_length, axis=0) for path in paths ]).to(ptu.device) returns_n = torch.stack([ pad_to_last(tu.discount_cumsum(path['rewards'], self.discount).copy(), total_length=self.max_path_length, axis=0) for path in paths ]).to(ptu.device) return obs_n, actions_n, rewards_n, returns_n, valids, raw_actions_n
def process_samples(self, paths): r"""Process sample data based on the collected paths. Notes: P is the maximum path length (self.max_path_length) Args: paths (list[dict]): A list of collected paths Returns: torch.Tensor: The observations of the environment with shape :math:`(N, P, O*)`. torch.Tensor: The actions fed to the environment with shape :math:`(N, P, A*)`. torch.Tensor: The acquired rewards with shape :math:`(N, P)`. list[int]: Numbers of valid steps in each paths. torch.Tensor: Value function estimation at each step with shape :math:`(N, P)`. """ valids = torch.Tensor([len(path['actions']) for path in paths]).int() obs = torch.stack([ pad_to_last(path['observations'], total_length=self.max_path_length, axis=0) for path in paths ]) actions = torch.stack([ pad_to_last(path['actions'], total_length=self.max_path_length, axis=0) for path in paths ]) rewards = torch.stack([ pad_to_last(path['rewards'].reshape(-1), total_length=self.max_path_length) for path in paths ]) returns = torch.stack([ pad_to_last(tu.discount_cumsum(path['rewards'].reshape(-1), self.discount).copy(), total_length=self.max_path_length) for path in paths ]) with torch.no_grad(): baselines = self._value_function(obs).squeeze(-1) return obs, actions, rewards, returns, valids, baselines