def train(self, training_batch: rlt.PolicyGradientInput) -> None:
    actions = training_batch.action
    rewards = training_batch.reward.detach()
    scores = self.scorer(training_batch.state)
    characteristic_eligibility = self.sampler.log_prob(scores, actions).float()
    # reward-to-go computed on clipped rewards
    offset_reinforcement = discounted_returns(
        torch.clamp(rewards, max=self.params.reward_clip).clone(),
        self.params.gamma,
    )
    if self.params.normalize:
        offset_reinforcement = whiten(
            offset_reinforcement, subtract_mean=self.params.subtract_mean
        )
    if self.params.offset_clamp_min:
        offset_reinforcement = offset_reinforcement.clamp(min=0)  # pyre-ignore
    if self.params.off_policy:
        # clipped importance weights for off-policy correction
        target_propensity = self.sampler.log_prob(scores, actions).float()
        characteristic_eligibility = torch.exp(
            torch.clamp(
                target_propensity - training_batch.log_prob.detach(),
                max=math.log(float(self.params.clip_param)),
            )
        )
    self.losses.append(
        -(offset_reinforcement.float()) @ characteristic_eligibility
    )
    self.step += 1
    if self.step % self.params.update_freq == 0:
        self.update_model()
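# The helpers `discounted_returns` and `whiten` used above are not defined in these
# snippets. A minimal sketch of plausible implementations is given below; the exact
# signatures and the EPS constant are assumptions for illustration, not the library's
# verbatim code.

import torch

EPS = 1e-6  # assumed small constant to avoid division by zero


def discounted_returns(rewards: torch.Tensor, gamma: float = 0.0) -> torch.Tensor:
    """Reward-to-go: returns[t] = sum over k >= t of gamma^(k - t) * rewards[k]."""
    if gamma == 0.0:
        return rewards.float()
    running = 0.0
    returns = []
    for r in rewards.flip(0):  # walk the trajectory backwards
        running = r.item() + gamma * running
        returns.insert(0, running)
    return torch.tensor(returns, dtype=torch.float)


def whiten(x: torch.Tensor, subtract_mean: bool) -> torch.Tensor:
    """Scale to unit variance, optionally centering at zero mean first."""
    if subtract_mean:
        x = x - x.mean()
    return x / (x.std() + EPS)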
def _trajectory_to_losses(
    self, trajectory: rlt.PolicyGradientInput
) -> Dict[str, torch.Tensor]:
    """
    Get a dict of losses for the trajectory. Dict always includes PPO loss.
    If a value baseline is trained, a loss for the value network is also included.
    """
    losses = {}
    actions = trajectory.action
    rewards = trajectory.reward.detach()
    scorer_inputs = []
    if inspect.getattr_static(trajectory, "graph", None) is not None:
        # GNN
        scorer_inputs.append(trajectory.graph)
    else:
        scorer_inputs.append(trajectory.state)
    if trajectory.possible_actions_mask is not None:
        scorer_inputs.append(trajectory.possible_actions_mask)
    scores = self.scorer(*scorer_inputs)
    offset_reinforcement = discounted_returns(
        torch.clamp(rewards, max=self.reward_clip).clone(), self.gamma
    )
    if self.normalize:
        offset_reinforcement = whiten(
            offset_reinforcement, subtract_mean=self.subtract_mean
        )
    if self.offset_clamp_min:
        offset_reinforcement = offset_reinforcement.clamp(min=0)  # pyre-ignore
    if self.value_net is not None:
        if self.normalize:
            raise RuntimeError(
                "Can't apply a baseline and normalize rewards simultaneously"
            )
        # subtract learned value function baselines from rewards
        baselines = self.value_net(trajectory.state).squeeze()  # pyre-ignore
        # use reward-to-go as label for training the value function
        losses["value_net_loss"] = self.value_loss_fn(
            baselines, offset_reinforcement
        )
        # detach because we want PPO to tweak the policy, not the baseline
        offset_reinforcement = offset_reinforcement - baselines.detach()
    target_propensity = self.sampler.log_prob(scores, actions).float()
    characteristic_eligibility = torch.exp(
        target_propensity - trajectory.log_prob.detach()
    ).float()
    losses["ppo_loss"] = -torch.min(
        offset_reinforcement.float() @ characteristic_eligibility,
        offset_reinforcement.float()
        @ torch.clamp(
            characteristic_eligibility,
            1 - self.ppo_epsilon,
            1 + self.ppo_epsilon,
        ),
    )
    if self.entropy_weight != 0:
        entropy = self.sampler.entropy(scores)
        # "-" because we are minimizing, not maximizing
        losses["ppo_loss"] = losses["ppo_loss"] - self.entropy_weight * entropy
    return losses
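# A toy illustration (not library code) of the clipped surrogate computed above:
# the advantage-weighted returns are dotted with the raw and with the clipped
# importance ratios, and the smaller (less optimistic) surrogate is kept.
# All values below are made up for illustration.

import torch

advantages = torch.tensor([1.0, -2.0, 0.5])  # plays the role of offset_reinforcement
ratios = torch.tensor([1.6, 0.7, 1.0])       # exp(target log-prob - behavior log-prob)
ppo_epsilon = 0.2

unclipped = advantages @ ratios  # 1.6 - 1.4 + 0.5 = 0.7
clipped = advantages @ torch.clamp(ratios, 1 - ppo_epsilon, 1 + ppo_epsilon)
# clamp -> [1.2, 0.8, 1.0], so clipped = 1.2 - 1.6 + 0.5 = 0.1
loss = -torch.min(unclipped, clipped)  # -0.1; clipping damps the overly large ratio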
def train(self, training_batch: rlt.PolicyGradientInput) -> None:
    actions = training_batch.action
    rewards = training_batch.reward.detach()
    if training_batch.possible_actions_mask is not None:
        scores = self.scorer(
            training_batch.state, training_batch.possible_actions_mask
        )
    else:
        scores = self.scorer(training_batch.state)
    characteristic_eligibility = self.sampler.log_prob(scores, actions).float()
    offset_reinforcement = discounted_returns(
        torch.clamp(rewards, max=self.params.reward_clip).clone(), self.params.gamma
    )
    if self.params.normalize:
        offset_reinforcement = whiten(
            offset_reinforcement, subtract_mean=self.params.subtract_mean
        )
    if self.params.offset_clamp_min:
        offset_reinforcement = offset_reinforcement.clamp(min=0)  # pyre-ignore
    if self.value_net is not None:
        if self.params.normalize:
            raise RuntimeError(
                "Can't apply a baseline and normalize rewards simultaneously"
            )
        # subtract learned value function baselines from rewards
        baselines = self.value_net(training_batch.state).squeeze()
        # use reward-to-go as label for training the value function
        self.value_net_losses.append(
            self.value_loss_fn(baselines, offset_reinforcement)
        )
        # detach because we want REINFORCE to tweak the policy, not the baseline
        offset_reinforcement = offset_reinforcement - baselines.detach()
    if self.params.off_policy:
        target_propensity = self.sampler.log_prob(scores, actions).float()
        characteristic_eligibility = torch.exp(
            torch.clamp(
                target_propensity - training_batch.log_prob.detach(),
                max=math.log(float(self.params.clip_param)),
            )
        ).float()
    self.losses.append(
        -(offset_reinforcement.float()) @ characteristic_eligibility
    )
    self.step += 1
    if self.step % self.params.update_freq == 0:
        self.update_model()
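# `update_model` is referenced above but not shown. The following is a plausible
# sketch under the assumption that the trainer owns `self.optimizer` for the policy
# and `self.value_net_optimizer` for the baseline; those attribute names are
# assumptions, not confirmed by the snippets.

import torch


def update_model(self):
    if self.value_net is not None and self.value_net_losses:
        # average the accumulated baseline regression losses and step the value net
        value_loss = torch.stack(self.value_net_losses).mean()
        self.value_net_optimizer.zero_grad()
        value_loss.backward()
        self.value_net_optimizer.step()
        self.value_net_losses.clear()

    # average the accumulated policy-gradient losses and step the policy;
    # the baseline was detached, so this backward does not touch the value net
    loss = torch.stack(self.losses).mean()
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    self.losses.clear()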
def train_step_gen(self, training_batch: rlt.PolicyGradientInput, batch_idx: int):
    actions = training_batch.action
    rewards = training_batch.reward.detach()
    scorer_inputs = []
    if inspect.getattr_static(training_batch, "graph", None) is not None:
        # GNN
        scorer_inputs.append(training_batch.graph)
    else:
        scorer_inputs.append(training_batch.state)
    if training_batch.possible_actions_mask is not None:
        scorer_inputs.append(training_batch.possible_actions_mask)
    scores = self.scorer(*scorer_inputs)
    characteristic_eligibility = self.sampler.log_prob(scores, actions).float()
    offset_reinforcement = discounted_returns(
        torch.clamp(rewards, max=self.reward_clip).clone(), self.gamma
    )
    if self.normalize:
        offset_reinforcement = whiten(
            offset_reinforcement, subtract_mean=self.subtract_mean
        )
    if self.offset_clamp_min:
        offset_reinforcement = offset_reinforcement.clamp(min=0)  # pyre-ignore
    if self.value_net is not None:
        if self.normalize:
            raise RuntimeError(
                "Can't apply a baseline and normalize rewards simultaneously"
            )
        baselines = self.value_net(training_batch.state).squeeze()
        yield self.value_loss_fn(baselines, offset_reinforcement)
        # subtract learned value function baselines from rewards
        offset_reinforcement = offset_reinforcement - baselines
    if self.off_policy:
        target_propensity = self.sampler.log_prob(scores, actions).float()
        characteristic_eligibility = torch.exp(
            torch.clamp(
                target_propensity - training_batch.log_prob,
                max=math.log(float(self.clip_param)),
            )
        ).float()
    yield -(offset_reinforcement.float()) @ characteristic_eligibility  # PG "loss"
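# One possible way to drive the generator above, sketched under the assumption of
# one optimizer per yielded loss: the value-net optimizer first when a baseline is
# configured, then the policy optimizer. `run_train_step` and the `optimizers`
# ordering are assumptions for illustration, not the library's actual loop.

def run_train_step(trainer, training_batch, batch_idx, optimizers):
    pairs = list(zip(trainer.train_step_gen(training_batch, batch_idx), optimizers))
    for _, opt in pairs:
        opt.zero_grad()
    for i, (loss, _) in enumerate(pairs):
        # retain the graph for all but the last backward, since `baselines` is not
        # detached in this variant and the two losses share part of the forward pass
        # (which also means the PG loss contributes gradient to the value network)
        loss.backward(retain_graph=(i < len(pairs) - 1))
    for _, opt in pairs:
        opt.step()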
def train(self, training_batch: rlt.PolicyGradientInput) -> None:
    actions = training_batch.action
    rewards = training_batch.reward.detach()
    scores = self.scorer(training_batch.state)
    characteristic_eligibility = self.sampler.log_prob(scores, actions).float()
    offset_reinforcement = discounted_returns(rewards, self.params.gamma)
    if self.params.normalize:
        offset_reinforcement = whiten(
            offset_reinforcement, subtract_mean=self.params.subtract_mean
        )
    if self.params.offset_clamp_min:
        offset_reinforcement = offset_reinforcement.clamp(min=0)  # pyre-ignore
    correction = 1.0
    if self.params.off_policy:
        correction = torch.exp(characteristic_eligibility - training_batch.log_prob)
        correction *= (correction < self.params.clip_param).float()
        characteristic_eligibility *= correction.detach()
    err = -(offset_reinforcement.float()) @ characteristic_eligibility
    self.optimizer.zero_grad()
    err.backward()
    self.optimizer.step()
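# A toy illustration (not library code) of the importance-weight truncation used in
# the off-policy branch above: weights are exp(target log-prob - behavior log-prob),
# and any weight at or above clip_param is zeroed out rather than clipped.
# All values below are made up for illustration.

import torch

target_log_prob = torch.tensor([-0.1, -2.0, -0.5])
behavior_log_prob = torch.tensor([-1.5, -0.3, -0.5])
clip_param = 3.0

correction = torch.exp(target_log_prob - behavior_log_prob)  # [4.06, 0.18, 1.00]
correction *= (correction < clip_param).float()
# -> tensor([0.0000, 0.1827, 1.0000]); the first weight exceeded clip_param and was dropped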