def _compute_advantage(self, rewards, valids, baselines):
    r"""Compute flattened advantage values.

    Notes: P is the maximum path length (self.max_path_length)

    Args:
        rewards (torch.Tensor): Acquired rewards with shape :math:`(N, P)`.
        valids (list[int]): Numbers of valid steps in each path.
        baselines (torch.Tensor): Value function estimation at each step
            with shape :math:`(N, P)`.

    Returns:
        torch.Tensor: Calculated advantage values given rewards and
            baselines with shape :math:`(N \cdot [T], )`.

    """
    advantages = compute_advantages(self.discount, self._gae_lambda,
                                    self.max_path_length, baselines,
                                    rewards)
    # Drop the padded steps of each path and flatten into one 1-D tensor.
    advantage_flat = torch.cat(filter_valids(advantages, valids))

    if self._center_adv:
        # Standardize: subtract the mean and divide by the standard
        # deviation.
        means = advantage_flat.mean()
        variance = advantage_flat.var()
        advantage_flat = (advantage_flat - means) / (variance.sqrt() + 1e-8)

    if self._positive_adv:
        # Shift so that the smallest advantage is zero.
        advantage_flat -= advantage_flat.min()

    return advantage_flat
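# Illustrative sketch, not part of the algorithm: a self-contained,
# hypothetical helper showing the post-processing applied to the flattened
# advantages above (standardize, then optionally shift to be non-negative),
# using plain torch ops instead of the class attributes and garage helpers.
def _sketch_advantage_postprocessing():
    import torch

    advantage_flat = torch.tensor([2.0, -1.0, 0.5, 3.5])
    # Standardize: subtract the mean, divide by the standard deviation.
    centered = (advantage_flat - advantage_flat.mean()) / (
        advantage_flat.var().sqrt() + 1e-8)
    # Shift so the smallest advantage becomes zero.
    positive = centered - centered.min()
    return centered, positive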
def _compute_loss(self, itr, obs, avail_actions, actions, rewards, valids,
                  baselines):
    """Compute mean value of loss.

    Args:
        itr (int): Iteration number.
        obs (torch.Tensor): Observation from the environment.
        avail_actions (torch.Tensor): Mask of actions available at each
            step.
        actions (torch.Tensor): Predicted action.
        rewards (torch.Tensor): Feedback from the environment.
        valids (list[int]): Numbers of valid steps in each path.
        baselines (torch.Tensor): Value function estimation at each step.

    Returns:
        torch.Tensor: Calculated mean value of loss (negative mean of the
            valid objective values).

    """
    del itr

    # Recurrent policies need the action sequence to compute entropy.
    if self.policy.recurrent:
        policy_entropies = self._compute_policy_entropy(
            obs, avail_actions, actions)
    else:
        policy_entropies = self._compute_policy_entropy(obs, avail_actions)

    # Maximum-entropy RL folds the entropy bonus directly into the rewards.
    if self._maximum_entropy:
        rewards += self._policy_ent_coeff * policy_entropies

    advantages = compute_advantages(self.discount, self._gae_lambda,
                                    self.max_path_length, baselines,
                                    rewards)

    if self._center_adv:
        # Whiten each path's advantages by its own mean and variance.
        means, variances = list(
            zip(*[(valid_adv.mean(), valid_adv.var(unbiased=False))
                  for valid_adv in filter_valids(advantages, valids)]))
        advantages = F.batch_norm(advantages.t(),
                                  torch.Tensor(means),
                                  torch.Tensor(variances),
                                  eps=self._eps).t()

    if self._positive_adv:
        advantages -= advantages.min()

    objective = self._compute_objective(advantages, valids, obs,
                                        avail_actions, actions, rewards)

    # Entropy regularization adds the bonus to the objective instead of the
    # rewards.
    if self._entropy_regularzied:
        objective += self._policy_ent_coeff * policy_entropies

    valid_objectives = filter_valids(objective, valids)
    return -torch.cat(valid_objectives).mean()
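# Illustrative sketch, not part of the algorithm: the F.batch_norm call above
# (with training=False, the default) normalizes each channel of its input by
# the supplied mean and variance, so transposing the (N, P) advantage tensor
# makes every path a channel whitened by its own statistics.  The hypothetical
# helper below checks that against a manual computation on toy tensors.
def _sketch_per_path_whitening():
    import torch
    import torch.nn.functional as F

    eps = 1e-8
    # Two paths of valid lengths 3 and 2, padded to P=4 steps.
    adv = torch.tensor([[1.0, 2.0, 3.0, 0.0],
                        [4.0, 6.0, 0.0, 0.0]])
    means = torch.stack([adv[0, :3].mean(), adv[1, :2].mean()])
    variances = torch.stack([adv[0, :3].var(unbiased=False),
                             adv[1, :2].var(unbiased=False)])
    whitened = F.batch_norm(adv.t(), means, variances, eps=eps).t()
    manual = (adv - means.unsqueeze(1)) / (variances.unsqueeze(1) + eps).sqrt()
    assert torch.allclose(whitened, manual)
    return whitened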
def _compute_loss(self, itr, paths, valids, obs, actions, rewards):
    """Compute mean value of loss.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.
        valids (list[int]): Numbers of valid steps in each path.
        obs (torch.Tensor): Observation from the environment.
        actions (torch.Tensor): Predicted action.
        rewards (torch.Tensor): Feedback from the environment.

    Returns:
        torch.Tensor: Calculated mean value of loss (negative mean of the
            valid objective values).

    """
    # pylint: disable=unused-argument
    policy_entropies = self._compute_policy_entropy(obs)

    # Stack per-path baselines into a single (N, P) tensor, padding each
    # path to the maximum path length.
    baselines = torch.stack([
        pad_to_last(self._get_baselines(path),
                    total_length=self.max_path_length) for path in paths
    ])

    # Maximum-entropy RL folds the entropy bonus directly into the rewards.
    if self._maximum_entropy:
        rewards += self._policy_ent_coeff * policy_entropies

    advantages = compute_advantages(self.discount, self._gae_lambda,
                                    self.max_path_length, baselines,
                                    rewards)

    if self._center_adv:
        # Whiten each path's advantages by its own mean and variance.
        means, variances = list(
            zip(*[(valid_adv.mean(), valid_adv.var())
                  for valid_adv in filter_valids(advantages, valids)]))
        advantages = F.batch_norm(advantages.t(),
                                  torch.Tensor(means),
                                  torch.Tensor(variances),
                                  eps=self._eps).t()

    if self._positive_adv:
        advantages -= advantages.min()

    objective = self._compute_objective(advantages, valids, obs, actions,
                                        rewards)

    # Entropy regularization adds the bonus to the objective instead of the
    # rewards.
    if self._entropy_regularzied:
        objective += self._policy_ent_coeff * policy_entropies

    valid_objectives = filter_valids(objective, valids)
    return -torch.cat(valid_objectives).mean()
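# Illustrative sketch, not part of the algorithm: stacking variable-length
# per-path baselines into one (N, P) tensor by right-padding with zeros,
# which is what the pad_to_last/torch.stack combination above is used for.
# F.pad and the toy tensors exist only for this hypothetical demo; they are
# not the library helpers.
def _sketch_stacked_baselines():
    import torch
    import torch.nn.functional as F

    max_path_length = 5
    per_path_baselines = [torch.tensor([0.1, 0.2, 0.3]),   # path of length 3
                          torch.tensor([0.5, 0.4])]        # path of length 2
    baselines = torch.stack([
        F.pad(b, (0, max_path_length - b.shape[0]))
        for b in per_path_baselines
    ])
    # baselines.shape == (2, 5); the padded entries are zero and are later
    # masked out through `valids`.
    return baselines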