def handle(self, tdp: TrainingDataPage) -> None:
    if not self.trainer.calc_cpe_in_training:
        return
    if isinstance(tdp, TrainingDataPage):
        if isinstance(self.trainer, DQNTrainer):
            # This is required until we get rid of TrainingDataPage
            if self.trainer.maxq_learning:
                edp = EvaluationDataPage.create_from_training_batch(
                    tdp.as_discrete_maxq_training_batch(), self.trainer
                )
            else:
                edp = EvaluationDataPage.create_from_training_batch(
                    tdp.as_discrete_sarsa_training_batch(), self.trainer
                )
        else:
            edp = EvaluationDataPage.create_from_tdp(tdp, self.trainer)
    elif isinstance(tdp, TrainingBatch):
        if isinstance(self.trainer, SACTrainer):
            # TODO: Implement CPE for continuous algos
            edp = None
        else:
            edp = EvaluationDataPage.create_from_training_batch(tdp, self.trainer)
    if self.evaluation_data is None:
        self.evaluation_data = edp
    else:
        self.evaluation_data = self.evaluation_data.append(edp)
def handle(self, tdp: TrainingBatch) -> None:
    if not self.trainer.calc_cpe_in_training:
        return
    # TODO: Perhaps we can make an RLTrainer param to check if continuous?
    if isinstance(self.trainer, SACTrainer):
        # TODO: Implement CPE for continuous algos
        edp = None
    else:
        # Max-Q and SARSA DQN batches are built identically from a
        # TrainingBatch, so no separate maxq_learning branch is needed.
        edp = EvaluationDataPage.create_from_training_batch(tdp, self.trainer)
    if self.evaluation_data is None:
        self.evaluation_data = edp
    else:
        self.evaluation_data = self.evaluation_data.append(edp)
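# Usage sketch for the page handlers above: a driver loop that feeds batches
# through a handler and then reads back the accumulated evaluation data. This
# is an illustration only, not ReAgent's actual workflow code; the names
# `run_cpe_pass`, `page_handler`, and `batches` are assumptions made here.
def run_cpe_pass(page_handler, batches):
    # Each handle() call appends an EvaluationDataPage (or skips, for
    # trainers without CPE support) onto page_handler.evaluation_data.
    for tdp in batches:
        page_handler.handle(tdp)
    return page_handler.evaluation_data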
def evaluate(self, eval_tdp: PreprocessedTrainingBatch):
    seq2slate_net = self.trainer.seq2slate_net
    baseline_net = self.trainer.baseline_net

    seq2slate_net_prev_mode = seq2slate_net.training
    baseline_net_prev_mode = baseline_net.training
    seq2slate_net.eval()
    baseline_net.eval()

    log_prob = (
        seq2slate_net(eval_tdp.training_input, mode=Seq2SlateMode.PER_SEQ_LOG_PROB_MODE)
        .log_probs.detach()
        .flatten()
        .cpu()
        .numpy()
    )
    b = baseline_net(eval_tdp.training_input).squeeze().detach()
    advantage = (eval_tdp.training_input.slate_reward - b).flatten().cpu().numpy()
    self.baseline_loss.append(
        F.mse_loss(b, eval_tdp.training_input.slate_reward).item()
    )
    self.advantages.append(advantage)
    self.log_probs.append(log_prob)

    seq2slate_net.train(seq2slate_net_prev_mode)
    baseline_net.train(baseline_net_prev_mode)

    if not self.calc_cpe:
        return

    edp = EvaluationDataPage.create_from_training_batch(
        eval_tdp, self.trainer, self.reward_network
    )
    if self.eval_data_pages is None:
        self.eval_data_pages = edp
    else:
        self.eval_data_pages = self.eval_data_pages.append(edp)
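# Sketch of how the pages accumulated by evaluate() can be scored once the
# evaluation pass is done. DoublyRobustEstimator.estimate(edp) returning a
# (direct_method, inverse_propensity, doubly_robust) triple matches its use
# in the test at the end of this section; wrapping it in a `compute_cpe`
# helper is an assumption made for illustration.
def compute_cpe(evaluator):
    if evaluator.eval_data_pages is None:
        return None
    estimator = DoublyRobustEstimator()
    direct_method, inverse_propensity, doubly_robust = estimator.estimate(
        evaluator.eval_data_pages
    )
    return direct_method, inverse_propensity, doubly_robust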
def handle(self, tdp: PreprocessedTrainingBatch) -> None:
    if not self.trainer.calc_cpe_in_training:
        return
    # TODO: Perhaps we can make an RLTrainer param to check if continuous?
    if isinstance(self.trainer, (SACTrainer, TD3Trainer)):
        # TODO: Implement CPE for continuous algos
        edp = None
    else:
        edp = EvaluationDataPage.create_from_training_batch(tdp, self.trainer)
    if self.evaluation_data is None:
        self.evaluation_data = edp
    else:
        self.evaluation_data = self.evaluation_data.append(edp)
def handle(self, tdp: TrainingDataPage) -> None:
    if not self.trainer.calc_cpe_in_training:
        return
    if isinstance(tdp, TrainingDataPage):
        edp = EvaluationDataPage.create_from_tdp(tdp, self.trainer)
    elif isinstance(tdp, TrainingBatch):
        if isinstance(self.trainer, (_DQNTrainer, SACTrainer)):
            # TODO: Implement CPE for modular DQNTrainer & continuous algos
            edp = None
        else:
            edp = EvaluationDataPage.create_from_training_batch(tdp, self.trainer)
    if self.evaluation_data is None:
        self.evaluation_data = edp
    else:
        self.evaluation_data = self.evaluation_data.append(edp)
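# All of the handle() variants above end with the same accumulate-or-append
# tail, and `edp` may be None when the trainer has no CPE support (e.g. SAC).
# A defensive version of that tail, as a sketch (the helper name
# `accumulate_edp` is an assumption, not ReAgent API):
def accumulate_edp(evaluation_data, edp):
    if edp is None:
        # Nothing to add; keep whatever has been accumulated so far.
        return evaluation_data
    if evaluation_data is None:
        return edp
    return evaluation_data.append(edp)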
def test_seq2slate_eval_data_page(self):
    """
    Create 3 slate ranking logs and evaluate them using the Direct Method,
    Inverse Propensity Scores, and the Doubly Robust estimator.

    The logs are as follows:
    state: [1, 0, 0], [0, 1, 0], [0, 0, 1]
    indices in logged slates: [3, 2], [3, 2], [3, 2]
    model output indices: [2, 3], [3, 2], [2, 3]
    logged reward: 4, 5, 7
    logged propensities: 0.2, 0.5, 0.4
    predicted rewards on logged slates: 2, 4, 6
    predicted rewards on model outputted slates: 1, 4, 5

    The Direct Method uses the predicted rewards on the model output slates,
    so its result is expected to be (1 + 4 + 5) / 3.

    Inverse Propensity Scores scale the logged reward by
    1.0 / logged propensity whenever the model output slate matches the
    logged slate. Since only the second log matches the model output, the
    IPS result is expected to be 5 / 0.5 / 3.

    Doubly Robust is the sum of the Direct Method result and the
    propensity-scaled reward difference; the latter is defined as
    1.0 / logged_propensity * (logged reward - predicted reward on logged
    slate) * Indicator(model slate == logged slate). Since only the second
    logged slate matches the model output slate, the DR result is expected
    to be (1 + 4 + 5) / 3 + 1.0 / 0.5 * (5 - 4) / 3.
    """
    batch_size = 3
    state_dim = 3
    src_seq_len = 2
    tgt_seq_len = 2
    candidate_dim = 2

    reward_net = FakeSeq2SlateRewardNetwork()
    seq2slate_net = FakeSeq2SlateTransformerNet()
    baseline_net = nn.Linear(1, 1)
    trainer = Seq2SlateTrainer(
        seq2slate_net,
        baseline_net,
        parameters=None,
        minibatch_size=3,
        use_gpu=False,
    )

    src_seq = torch.eye(candidate_dim).repeat(batch_size, 1, 1)
    tgt_out_idx = torch.LongTensor([[3, 2], [3, 2], [3, 2]])
    tgt_out_seq = src_seq[
        torch.arange(batch_size).repeat_interleave(tgt_seq_len),  # type: ignore
        tgt_out_idx.flatten() - 2,
    ].reshape(batch_size, tgt_seq_len, candidate_dim)

    ptb = rlt.PreprocessedTrainingBatch(
        training_input=rlt.PreprocessedRankingInput(
            state=rlt.PreprocessedFeatureVector(float_features=torch.eye(state_dim)),
            src_seq=rlt.PreprocessedFeatureVector(float_features=src_seq),
            tgt_out_seq=rlt.PreprocessedFeatureVector(float_features=tgt_out_seq),
            src_src_mask=torch.ones(batch_size, src_seq_len, src_seq_len),
            tgt_out_idx=tgt_out_idx,
            tgt_out_probs=torch.tensor([0.2, 0.5, 0.4]),
            slate_reward=torch.tensor([4.0, 5.0, 7.0]),
        ),
        extras=rlt.ExtraData(
            sequence_number=torch.tensor([0, 0, 0]),
            mdp_id=np.array(["0", "1", "2"]),
        ),
    )
    edp = EvaluationDataPage.create_from_training_batch(ptb, trainer, reward_net)
    doubly_robust_estimator = DoublyRobustEstimator()
    direct_method, inverse_propensity, doubly_robust = doubly_robust_estimator.estimate(
        edp
    )
    logger.info(f"{direct_method}, {inverse_propensity}, {doubly_robust}")

    avg_logged_reward = (4 + 5 + 7) / 3
    self.assertAlmostEqual(direct_method.raw, (1 + 4 + 5) / 3, delta=1e-6)
    self.assertAlmostEqual(
        direct_method.normalized, direct_method.raw / avg_logged_reward, delta=1e-6
    )
    self.assertAlmostEqual(inverse_propensity.raw, 5 / 0.5 / 3, delta=1e-6)
    self.assertAlmostEqual(
        inverse_propensity.normalized,
        inverse_propensity.raw / avg_logged_reward,
        delta=1e-6,
    )
    self.assertAlmostEqual(
        doubly_robust.raw, direct_method.raw + 1 / 0.5 * (5 - 4) / 3, delta=1e-6
    )
    self.assertAlmostEqual(
        doubly_robust.normalized, doubly_robust.raw / avg_logged_reward, delta=1e-6
    )
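# Hand check of the expected values asserted above, using only the numbers
# from the docstring (plain arithmetic, no ReAgent dependency).
rewards = [4.0, 5.0, 7.0]
logged_propensities = [0.2, 0.5, 0.4]
predicted_on_logged = [2.0, 4.0, 6.0]
predicted_on_model = [1.0, 4.0, 5.0]
matches = [False, True, False]  # only the second logged slate matches

n = len(rewards)
dm = sum(predicted_on_model) / n  # (1 + 4 + 5) / 3
ips = sum(m * r / p for m, r, p in zip(matches, rewards, logged_propensities)) / n
dr = dm + sum(
    m * (r - q) / p
    for m, r, q, p in zip(matches, rewards, predicted_on_logged, logged_propensities)
) / n
assert abs(dm - 10 / 3) < 1e-6
assert abs(ips - 10 / 3) < 1e-6
assert abs(dr - 4.0) < 1e-6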