Example No. 1
    def train(self, training_batch: rlt.PolicyGradientInput) -> None:
        actions = training_batch.action
        rewards = training_batch.reward.detach()
        scores = self.scorer(training_batch.state)
        characteristic_eligibility = self.sampler.log_prob(scores,
                                                           actions).float()
        offset_reinforcement = discounted_returns(
            torch.clamp(rewards, max=self.params.reward_clip).clone(),
            self.params.gamma)
        if self.params.normalize:
            offset_reinforcement = whiten(
                offset_reinforcement, subtract_mean=self.params.subtract_mean)
        if self.params.offset_clamp_min:
            offset_reinforcement = offset_reinforcement.clamp(
                min=0)  # pyre-ignore
        if self.params.off_policy:
            target_propensity = self.sampler.log_prob(scores, actions).float()
            characteristic_eligibility = torch.exp(
                torch.clamp(
                    target_propensity - training_batch.log_prob.detach(),
                    max=math.log(float(self.params.clip_param)),
                ))
        self.losses.append(
            -(offset_reinforcement.float()) @ characteristic_eligibility)
        self.step += 1
        if self.step % self.params.update_freq == 0:
            self.update_model()
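
The helpers discounted_returns and whiten are used throughout these examples but never shown. Below is a minimal sketch of what they plausibly compute, assuming rewards arrive as a 1-D float tensor for a single trajectory; the names and signatures mirror the calls above, but the real utilities may handle batching and edge cases differently.

import torch

def discounted_returns(rewards: torch.Tensor, gamma: float) -> torch.Tensor:
    # Reward-to-go: G_t = r_t + gamma * G_{t+1}, accumulated backwards over the trajectory.
    returns = torch.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(rewards.shape[0])):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

def whiten(x: torch.Tensor, subtract_mean: bool = True, eps: float = 1e-8) -> torch.Tensor:
    # Rescale returns to unit variance (optionally zero mean) to stabilize gradient magnitudes.
    centered = x - x.mean() if subtract_mean else x
    return centered / (x.std() + eps)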
Example No. 2
    def _trajectory_to_losses(
            self,
            trajectory: rlt.PolicyGradientInput) -> Dict[str, torch.Tensor]:
        """
        Get a dict of losses for the trajectory. Dict always includes PPO loss.
        If a value baseline is trained, a loss for the value network is also included.
        """
        losses = {}
        actions = trajectory.action
        rewards = trajectory.reward.detach()
        scorer_inputs = []
        if inspect.getattr_static(trajectory, "graph", None) is not None:
            # GNN
            scorer_inputs.append(trajectory.graph)
        else:
            scorer_inputs.append(trajectory.state)
        if trajectory.possible_actions_mask is not None:
            scorer_inputs.append(trajectory.possible_actions_mask)
        scores = self.scorer(*scorer_inputs)
        offset_reinforcement = discounted_returns(
            torch.clamp(rewards, max=self.reward_clip).clone(), self.gamma)
        if self.normalize:
            offset_reinforcement = whiten(offset_reinforcement,
                                          subtract_mean=self.subtract_mean)
        if self.offset_clamp_min:
            offset_reinforcement = offset_reinforcement.clamp(
                min=0)  # pyre-ignore
        if self.value_net is not None:
            if self.normalize:
                raise RuntimeError(
                    "Can't apply a baseline and normalize rewards simultaneously"
                )
            # subtract learned value function baselines from rewards
            baselines = self.value_net(
                trajectory.state).squeeze()  # pyre-ignore
            # use reward-to-go as label for training the value function
            losses["value_net_loss"] = self.value_loss_fn(
                baselines, offset_reinforcement)
            # detach because we want PPO to tweak the policy, not the baseline
            offset_reinforcement = offset_reinforcement - baselines.detach()

        target_propensity = self.sampler.log_prob(scores, actions).float()
        characteristic_eligibility = torch.exp(
            target_propensity - trajectory.log_prob.detach()).float()

        losses["ppo_loss"] = -torch.min(
            offset_reinforcement.float() @ characteristic_eligibility,
            offset_reinforcement.float() @ torch.clamp(
                characteristic_eligibility,
                1 - self.ppo_epsilon,
                1 + self.ppo_epsilon,
            ),
        )
        if self.entropy_weight != 0:
            entropy = self.sampler.entropy(scores)
            # "-" because we are minimizing, not maximizing
            losses["ppo_loss"] = losses[
                "ppo_loss"] - self.entropy_weight * entropy
        return losses
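
For reference, the textbook PPO clipped surrogate takes the elementwise minimum of the unclipped and clipped ratio-weighted advantages before reducing; the example above instead reduces each term with a dot product over the trajectory and then takes the minimum of the two totals, which coincides with the elementwise form whenever every ratio stays inside the clip range. A standalone sketch of the elementwise version, with hypothetical names (advantages, ratio, epsilon standing in for offset_reinforcement, characteristic_eligibility, ppo_epsilon):

import torch

def clipped_surrogate_loss(advantages: torch.Tensor,
                           ratio: torch.Tensor,
                           epsilon: float = 0.2) -> torch.Tensor:
    # Elementwise min of the unclipped and clipped objectives, summed over the
    # trajectory and negated so it can be minimized.
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1 - epsilon, 1 + epsilon) * advantages
    return -torch.min(unclipped, clipped).sum()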
Example No. 3
    def train(self, training_batch: rlt.PolicyGradientInput) -> None:
        actions = training_batch.action
        rewards = training_batch.reward.detach()
        if training_batch.possible_actions_mask is not None:
            scores = self.scorer(
                training_batch.state, training_batch.possible_actions_mask
            )
        else:
            scores = self.scorer(training_batch.state)
        characteristic_eligibility = self.sampler.log_prob(scores, actions).float()
        offset_reinforcement = discounted_returns(
            torch.clamp(rewards, max=self.params.reward_clip).clone(), self.params.gamma
        )
        if self.params.normalize:
            offset_reinforcement = whiten(
                offset_reinforcement, subtract_mean=self.params.subtract_mean
            )
        if self.params.offset_clamp_min:
            offset_reinforcement = offset_reinforcement.clamp(min=0)  # pyre-ignore
        if self.value_net is not None:
            if self.params.normalize:
                raise RuntimeError(
                    "Can't apply a baseline and normalize rewards simultaneously"
                )
            # subtract learned value function baselines from rewards
            baselines = self.value_net(training_batch.state).squeeze()
            # use reward-to-go as label for training the value function
            self.value_net_losses.append(
                self.value_loss_fn(baselines, offset_reinforcement)
            )
            # detach because we want REINFORCE to tweak the policy, not the baseline
            offset_reinforcement = offset_reinforcement - baselines.detach()

        if self.params.off_policy:
            target_propensity = self.sampler.log_prob(scores, actions).float()
            characteristic_eligibility = torch.exp(
                torch.clamp(
                    target_propensity - training_batch.log_prob.detach(),
                    max=math.log(float(self.params.clip_param)),
                )
            ).float()
        self.losses.append(-(offset_reinforcement.float()) @ characteristic_eligibility)
        self.step += 1
        if self.step % self.params.update_freq == 0:
            self.update_model()
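
The off-policy branch clamps the log-ratio before exponentiating, which both caps the importance weight at clip_param and keeps the exponential numerically safe for large positive log-ratios. A standalone sketch of that correction, with hypothetical argument names:

import math
import torch

def clipped_importance_weight(target_log_prob: torch.Tensor,
                              behavior_log_prob: torch.Tensor,
                              clip_param: float) -> torch.Tensor:
    # exp(log pi_target(a|s) - log pi_behavior(a|s)), clamped in log space so the
    # resulting weight never exceeds clip_param.
    log_ratio = target_log_prob - behavior_log_prob.detach()
    return torch.exp(torch.clamp(log_ratio, max=math.log(float(clip_param))))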
Example No. 4
    def train_step_gen(self, training_batch: rlt.PolicyGradientInput,
                       batch_idx: int):
        actions = training_batch.action
        rewards = training_batch.reward.detach()
        scorer_inputs = []
        if inspect.getattr_static(training_batch, "graph", None) is not None:
            # GNN
            scorer_inputs.append(training_batch.graph)
        else:
            scorer_inputs.append(training_batch.state)
        if training_batch.possible_actions_mask is not None:
            scorer_inputs.append(training_batch.possible_actions_mask)
        scores = self.scorer(*scorer_inputs)
        characteristic_eligibility = self.sampler.log_prob(scores,
                                                           actions).float()
        offset_reinforcement = discounted_returns(
            torch.clamp(rewards, max=self.reward_clip).clone(), self.gamma)
        if self.normalize:
            offset_reinforcement = whiten(offset_reinforcement,
                                          subtract_mean=self.subtract_mean)
        if self.offset_clamp_min:
            offset_reinforcement = offset_reinforcement.clamp(
                min=0)  # pyre-ignore
        if self.value_net is not None:
            if self.normalize:
                raise RuntimeError(
                    "Can't apply a baseline and normalize rewards simultaneously"
                )
            baselines = self.value_net(training_batch.state).squeeze()
            yield self.value_loss_fn(baselines, offset_reinforcement)
            # subtract learned value function baselines from rewards
            offset_reinforcement = offset_reinforcement - baselines

        if self.off_policy:
            target_propensity = self.sampler.log_prob(scores, actions).float()
            characteristic_eligibility = torch.exp(
                torch.clamp(
                    target_propensity - training_batch.log_prob,
                    max=math.log(float(self.clip_param)),
                )).float()
        yield -(offset_reinforcement.float()
                ) @ characteristic_eligibility  # PG "loss"
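
inspect.getattr_static is used above to decide between the graph and state inputs; unlike plain getattr, it looks the attribute up without triggering __getattr__ or descriptor logic, so a lazily computed or proxied attribute is not accidentally materialized. A small standalone illustration with a made-up batch class:

import inspect

class FakeBatch:
    state = "state tensor"
    # no "graph" attribute defined

batch = FakeBatch()
if inspect.getattr_static(batch, "graph", None) is not None:
    scorer_input = batch.graph
else:
    scorer_input = batch.state  # falls back to the dense state input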
Example No. 5
    def train(self, training_batch: rlt.PolicyGradientInput) -> None:
        actions = training_batch.action
        rewards = training_batch.reward.detach()
        scores = self.scorer(training_batch.state)
        characteristic_eligibility = self.sampler.log_prob(scores,
                                                           actions).float()
        offset_reinforcement = discounted_returns(rewards, self.params.gamma)
        if self.params.normalize:
            offset_reinforcement = whiten(
                offset_reinforcement, subtract_mean=self.params.subtract_mean)
        if self.params.offset_clamp_min:
            offset_reinforcement = offset_reinforcement.clamp(
                min=0)  # pyre-ignore
        correction = 1.0
        if self.params.off_policy:
            correction = torch.exp(characteristic_eligibility -
                                   training_batch.log_prob)
            correction *= (correction < self.params.clip_param).float()
            characteristic_eligibility *= correction.detach()
        err = -(offset_reinforcement.float()) @ characteristic_eligibility
        self.optimizer.zero_grad()
        err.backward()
        self.optimizer.step()
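
Unlike Examples No. 1 and No. 3, the off-policy correction here does not cap large importance weights; it zeroes out any step whose weight exceeds clip_param. A toy comparison of the two styles (values chosen arbitrarily):

import math
import torch

log_ratio = torch.tensor([-0.5, 0.2, 1.5])  # log(pi_target / pi_behavior) for three steps
clip_param = 2.0

# Examples No. 1 and No. 3: clamp in log space, capping the weight at clip_param.
capped = torch.exp(torch.clamp(log_ratio, max=math.log(clip_param)))

# Example No. 5: keep the raw weight but zero it once it exceeds clip_param.
raw = torch.exp(log_ratio)
zeroed = raw * (raw < clip_param).float()

print(capped)  # tensor([0.6065, 1.2214, 2.0000])
print(zeroed)  # tensor([0.6065, 1.2214, 0.0000])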