Example #1
    def forward(self, distribution_old: Distribution, value_old: Tensor,
                distribution: Distribution, value: Tensor, action: Tensor,
                reward: Tensor, advantage: Tensor):
        # Value loss: PPO-style clipping. Take the pessimistic (larger) of the
        # clipped and unclipped squared errors so the value update stays bounded.
        value_old_clipped = value_old + (value - value_old).clamp(
            -self.v_clip_range, self.v_clip_range)
        v_old_loss_clipped = (reward - value_old_clipped).pow(2)
        v_loss = (reward - value).pow(2)
        value_loss = torch.max(v_old_loss_clipped, v_loss).mean()

        # Policy loss
        advantage = (advantage -
                     advantage.mean()) / (advantage.std(unbiased=False) + 1e-8)
        advantage.detach_()
        log_prob = distribution.log_prob(action)
        log_prob_old = distribution_old.log_prob(action)
        ratio = (log_prob - log_prob_old).exp().view(-1)

        surrogate = advantage * ratio
        surrogate_clipped = advantage * ratio.clamp(1 - self.clip_range,
                                                    1 + self.clip_range)
        policy_loss = torch.min(surrogate, surrogate_clipped).mean()

        # Entropy
        entropy = distribution.entropy().mean()

        # Total loss
        losses = policy_loss + self.c_entropy * entropy - self.c_value * value_loss
        total_loss = -losses
        self.reporter.scalar('ppo_loss/policy', -policy_loss.item())
        self.reporter.scalar('ppo_loss/entropy', -entropy.item())
        self.reporter.scalar('ppo_loss/value_loss', value_loss.item())
        self.reporter.scalar('ppo_loss/total', total_loss.item())
        return total_loss
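
For reference, a standalone sketch of the same two clipped objectives on toy tensors (the hyperparameter values and tensor shapes below are illustrative assumptions, not taken from the module above):

import torch
from torch.distributions import Normal

clip_range, v_clip_range = 0.2, 0.2
batch = 8

dist_old = Normal(torch.zeros(batch), torch.ones(batch))
dist = Normal(torch.full((batch,), 0.1), torch.ones(batch))

action = dist_old.sample()
advantage = torch.randn(batch)
value_old = torch.randn(batch)
value = value_old + 0.3 * torch.randn(batch)
returns = value_old + advantage  # crude stand-in for the value target

# Clipped policy surrogate: pessimistic minimum of the two surrogates.
ratio = (dist.log_prob(action) - dist_old.log_prob(action)).exp()
policy_objective = torch.min(
    advantage * ratio,
    advantage * ratio.clamp(1 - clip_range, 1 + clip_range)).mean()

# Clipped value loss: pessimistic maximum of clipped and unclipped errors.
value_clipped = value_old + (value - value_old).clamp(-v_clip_range, v_clip_range)
value_loss = torch.max((returns - value_clipped).pow(2),
                       (returns - value).pow(2)).mean()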
Example #2
def _compute_entropy(dist: td.Distribution):
    if isinstance(dist, td.TransformedDistribution):
        # TransformedDistribution is used by NormalProjectionNetwork with
        # scale_distribution=True, in which case we estimate with sampling.
        entropy, entropy_for_gradient = estimated_entropy(dist)
    else:
        entropy = dist.entropy()
        entropy_for_gradient = entropy
    return entropy, entropy_for_gradient
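
The helper estimated_entropy is defined elsewhere in the surrounding code base. Below is a minimal sketch of a single-sample Monte Carlo estimate, assuming the distribution supports rsample(); it illustrates the idea and is not the original helper:

import torch.distributions as td


def estimated_entropy_sketch(dist: td.Distribution):
    """Single-sample Monte Carlo estimate: H(p) ~ -log p(x), with x ~ p."""
    sample = dist.rsample()              # reparameterized sample so gradients can flow
    neg_log_prob = -dist.log_prob(sample)
    entropy = neg_log_prob.detach()      # estimate of the entropy value
    entropy_for_gradient = neg_log_prob  # differentiable surrogate for the gradient
    return entropy, entropy_for_gradient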
Example #3
from torch import Tensor
from torch.distributions import Distribution


def loss(distr: Distribution, actions: Tensor, critic_value: Tensor,
         c_entropy: float) -> Tensor:
    """Computes A2C actor loss, i.e. -C(s, a) log pi(a | s) - c_entropy H(pi(.|s)).

    :args distr: distribution on actions, accepting actions of the same size as actions
    :args actions: actions performed
    :args critic_value: advantage corresponding to the actions performed
    :args c_entropy: entropy loss weighting

    :return: loss
    """
    logp_action = distr.log_prob(actions)
    entropy = distr.entropy()

    loss_critic = (-logp_action * critic_value.detach()).mean()
    return loss_critic - c_entropy * entropy.mean()
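
A minimal call-site sketch with a categorical policy and toy tensors, assuming the loss function above is in scope (all names and values below are illustrative):

import torch
from torch.distributions import Categorical

logits = torch.randn(16, 4, requires_grad=True)  # 16 states, 4 discrete actions
distr = Categorical(logits=logits)
actions = distr.sample()                         # shape (16,)
advantage = torch.randn(16)                      # stand-in for the critic's advantage

actor_loss = loss(distr, actions, advantage, c_entropy=0.01)
actor_loss.backward()                            # gradients reach the policy logits only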
Example #4
from typing import Optional

import numpy as np
import torch
from torch import Tensor
from torch.distributions import Distribution, kl_divergence


def loss(distr: Distribution,
         actions: Tensor,
         critic_value: Tensor,
         c_entropy: float,
         eps_clamp: float,
         c_kl: float,
         old_logp: Tensor,
         old_distr: Optional[Distribution] = None) -> Tensor:
    """Computes PPO actor loss. See
    https://spinningup.openai.com/en/latest/algorithms/ppo.html
    for a detailed explanation.

    :args distr: current distribution of actions,
        accepting actions of the same size as actions
    :args actions: actions performed
    :args critic_value: advantage corresponding to the actions performed
    :args c_entropy: entropy loss weighting
    :args eps_clamp: clamping parameter
    :args c_kl: kl penalty coefficient
    :args old_logp: log probabilities of the old distribution of actions
    :args old_distr: old distribution of actions

    :return: loss
    """
    logp_action = distr.log_prob(actions)
    logr = (logp_action - old_logp)

    r_clipped = torch.where(critic_value.detach() > 0,
                            torch.clamp(logr, max=np.log(1 + eps_clamp)),
                            torch.clamp(logr,
                                        min=np.log(1 - eps_clamp))).exp()

    loss = -r_clipped * critic_value.detach()
    if c_entropy != 0.:
        loss -= c_entropy * distr.entropy()

    if c_kl != 0.:
        if old_distr is None:
            raise ValueError(
                "Optional argument old_distr is required if c_kl > 0")
        loss += c_kl * kl_divergence(old_distr, distr)

    return loss.mean()
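
And a similar call-site sketch for this PPO variant with a Gaussian policy, again assuming the loss function above is in scope and using toy tensors:

import torch
from torch.distributions import Normal

mean_new = torch.full((16,), 0.1, requires_grad=True)
old_distr = Normal(torch.zeros(16), torch.ones(16))
distr = Normal(mean_new, torch.ones(16))

actions = old_distr.sample()
old_logp = old_distr.log_prob(actions).detach()
advantage = torch.randn(16)  # stand-in for the critic's advantage

ppo_loss = loss(distr, actions, advantage, c_entropy=0.01,
                eps_clamp=0.2, c_kl=0.5, old_logp=old_logp, old_distr=old_distr)
ppo_loss.backward()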
Example #5
def _compute_entropy(dist: td.Distribution):
    entropy = dist.entropy()
    return entropy
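
Note that this last variant relies on the distribution providing a closed-form entropy(); unlike Example #2, it would raise NotImplementedError for a td.TransformedDistribution, which does not implement entropy().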