Example #1
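    # Wrapper forward pass: packs the raw input tensors into a
    # PreprocessedRankingInput and returns the wrapped model's predicted reward.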
    def forward(
        self,
        state: torch.Tensor,
        src_seq: torch.Tensor,
        tgt_out_seq: torch.Tensor,
        src_src_mask: torch.Tensor,
        tgt_out_idx: torch.Tensor,
    ) -> torch.Tensor:
        return self.model(
            rlt.PreprocessedRankingInput(
                state=rlt.FeatureData(float_features=state),
                src_seq=rlt.FeatureData(float_features=src_seq),
                tgt_out_seq=rlt.FeatureData(float_features=tgt_out_seq),
                src_src_mask=src_src_mask,
                tgt_out_idx=tgt_out_idx,
            )
        ).predicted_reward
Example #2
    def test_seq2slate_eval_data_page(self):
        """
        Create 3 slate ranking logs and evaluate using Direct Method, Inverse
        Propensity Scores, and Doubly Robust.

        The logs are as follows:
        state: [1, 0, 0], [0, 1, 0], [0, 0, 1]
        indices in logged slates: [3, 2], [3, 2], [3, 2]
        model output indices: [2, 3], [3, 2], [2, 3]
        logged reward: 4, 5, 7
        logged propensities: 0.2, 0.5, 0.4
        predicted rewards on logged slates: 2, 4, 6
        predicted rewards on model outputted slates: 1, 4, 5
        predicted propensities: 0.4, 0.3, 0.7

        When eval_greedy=True:

        Direct Method uses the predicted rewards on model outputted slates.
        Thus the result is expected to be (1 + 4 + 5) / 3

        Inverse Propensity Scores scales the logged reward by 1.0 / logged propensity
        whenever the model output slate matches the logged slate.
        Since only the second logged slate matches the model output, the IPS result
        is expected to be 5 / 0.5 / 3

        Doubly Robust is the sum of the Direct Method result and the propensity-scaled
        reward difference; the latter is defined as:
        1.0 / logged_propensity * (logged reward - predicted reward on logged slate)
         * Indicator(model slate == logged slate)
        Since only the second logged slate matches the model outputted slate,
        the DR result is expected to be (1 + 4 + 5) / 3 + 1.0 / 0.5 * (5 - 4) / 3


        When eval_greedy=False:

        Only Inverse Propensity Scores would be accurate, because it would be too
        expensive to compute the propensities and predicted rewards of all possible
        slates, as the Direct Method would require.

        The expected IPS = (0.4 / 0.2 * 4 + 0.3 / 0.5 * 5 + 0.7 / 0.4 * 7) / 3
        """
        batch_size = 3
        state_dim = 3
        src_seq_len = 2
        tgt_seq_len = 2
        candidate_dim = 2

        reward_net = FakeSeq2SlateRewardNetwork()
        seq2slate_net = FakeSeq2SlateTransformerNet()

        src_seq = torch.eye(candidate_dim).repeat(batch_size, 1, 1)
        tgt_out_idx = torch.LongTensor([[3, 2], [3, 2], [3, 2]])
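        # tgt_out_idx is offset by 2 (padding and decoder-start symbols), so
        # subtracting 2 maps each index back to a row of src_seq.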
        tgt_out_seq = src_seq[
            torch.arange(batch_size).repeat_interleave(tgt_seq_len),
            tgt_out_idx.flatten() - 2,
        ].reshape(batch_size, tgt_seq_len, candidate_dim)

        ptb = rlt.PreprocessedTrainingBatch(
            training_input=rlt.PreprocessedRankingInput(
                state=rlt.FeatureData(float_features=torch.eye(state_dim)),
                src_seq=rlt.FeatureData(float_features=src_seq),
                tgt_out_seq=rlt.FeatureData(float_features=tgt_out_seq),
                src_src_mask=torch.ones(batch_size, src_seq_len, src_seq_len),
                tgt_out_idx=tgt_out_idx,
                tgt_out_probs=torch.tensor([0.2, 0.5, 0.4]),
                slate_reward=torch.tensor([4.0, 5.0, 7.0]),
            ),
            extras=rlt.ExtraData(
                sequence_number=torch.tensor([0, 0, 0]),
                mdp_id=np.array(["0", "1", "2"]),
            ),
        )

        edp = EvaluationDataPage.create_from_tensors_seq2slate(
            seq2slate_net, reward_net, ptb.training_input, eval_greedy=True)
        logger.info(
            "---------- Start evaluating eval_greedy=True -----------------")
        doubly_robust_estimator = OPEstimatorAdapter(DoublyRobustEstimator())
        dm_estimator = OPEstimatorAdapter(DMEstimator())
        ips_estimator = OPEstimatorAdapter(IPSEstimator())
        switch_estimator = OPEstimatorAdapter(SwitchEstimator())
        switch_dr_estimator = OPEstimatorAdapter(SwitchDREstimator())

        doubly_robust = doubly_robust_estimator.estimate(edp)
        inverse_propensity = ips_estimator.estimate(edp)
        direct_method = dm_estimator.estimate(edp)

        # Verify that Switch with low exponent is equivalent to IPS
        switch_ips = switch_estimator.estimate(edp, exp_base=1)
        # Verify that Switch with no candidates is equivalent to DM
        switch_dm = switch_estimator.estimate(edp, candidates=0)
        # Verify that SwitchDR with low exponent is equivalent to DR
        switch_dr_dr = switch_dr_estimator.estimate(edp, exp_base=1)
        # Verify that SwitchDR with no candidates is equivalent to DM
        switch_dr_dm = switch_dr_estimator.estimate(edp, candidates=0)

        logger.info(f"{direct_method}, {inverse_propensity}, {doubly_robust}")

        avg_logged_reward = (4 + 5 + 7) / 3
        self.assertAlmostEqual(direct_method.raw, (1 + 4 + 5) / 3, delta=1e-6)
        self.assertAlmostEqual(direct_method.normalized,
                               direct_method.raw / avg_logged_reward,
                               delta=1e-6)
        self.assertAlmostEqual(inverse_propensity.raw, 5 / 0.5 / 3, delta=1e-6)
        self.assertAlmostEqual(
            inverse_propensity.normalized,
            inverse_propensity.raw / avg_logged_reward,
            delta=1e-6,
        )
        self.assertAlmostEqual(doubly_robust.raw,
                               direct_method.raw + 1 / 0.5 * (5 - 4) / 3,
                               delta=1e-6)
        self.assertAlmostEqual(doubly_robust.normalized,
                               doubly_robust.raw / avg_logged_reward,
                               delta=1e-6)
        self.assertAlmostEqual(switch_ips.raw,
                               inverse_propensity.raw,
                               delta=1e-6)
        self.assertAlmostEqual(switch_dm.raw, direct_method.raw, delta=1e-6)
        self.assertAlmostEqual(switch_dr_dr.raw, doubly_robust.raw, delta=1e-6)
        self.assertAlmostEqual(switch_dr_dm.raw, direct_method.raw, delta=1e-6)
        logger.info(
            "---------- Finish evaluating eval_greedy=True -----------------")

        logger.info(
            "---------- Start evaluating eval_greedy=False -----------------")
        edp = EvaluationDataPage.create_from_tensors_seq2slate(
            seq2slate_net, reward_net, ptb.training_input, eval_greedy=False)
        doubly_robust_estimator = OPEstimatorAdapter(DoublyRobustEstimator())
        dm_estimator = OPEstimatorAdapter(DMEstimator())
        ips_estimator = OPEstimatorAdapter(IPSEstimator())

        doubly_robust = doubly_robust_estimator.estimate(edp)
        inverse_propensity = ips_estimator.estimate(edp)
        direct_method = dm_estimator.estimate(edp)
        self.assertAlmostEqual(
            inverse_propensity.raw,
            (0.4 / 0.2 * 4 + 0.3 / 0.5 * 5 + 0.7 / 0.4 * 7) / 3,
            delta=1e-6,
        )
        self.assertAlmostEqual(
            inverse_propensity.normalized,
            inverse_propensity.raw / avg_logged_reward,
            delta=1e-6,
        )
        logger.info(
            "---------- Finish evaluating eval_greedy=False -----------------")
Example #3
    def _simulated_training_input(self, training_input, sim_tgt_out_idx,
                                  sim_distance, device):
        batch_size, max_tgt_seq_len = sim_tgt_out_idx.shape
        _, max_src_seq_len, candidate_feat_dim = (
            training_input.src_seq.float_features.shape)

        # candidates + padding_symbol + decoder_start_symbol
        candidate_size = max_src_seq_len + 2
        src_seq_augment = torch.zeros(batch_size,
                                      candidate_size,
                                      candidate_feat_dim,
                                      device=device)
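        # Rows 0 and 1 stay zero for the padding and decoder-start symbols; real
        # candidate features are written into rows 2 and above.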
        src_seq_augment[:, 2:, :] = training_input.src_seq.float_features

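        # Decoder input indices: the start symbol followed by the simulated output
        # indices shifted right by one position.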
        sim_tgt_in_idx = torch.zeros_like(sim_tgt_out_idx).long()
        sim_tgt_in_idx[:, 0] = DECODER_START_SYMBOL
        sim_tgt_in_idx[:, 1:] = sim_tgt_out_idx[:, :-1]

        sim_tgt_in_seq = rlt.PreprocessedFeatureVector(
            float_features=src_seq_augment[
                torch.arange(batch_size, device=device).repeat_interleave(  # type: ignore
                    max_tgt_seq_len
                ),
                sim_tgt_in_idx.flatten(),
            ].view(batch_size, max_tgt_seq_len, candidate_feat_dim)
        )
        sim_tgt_out_seq = rlt.PreprocessedFeatureVector(
            float_features=src_seq_augment[
                torch.arange(batch_size, device=device).repeat_interleave(  # type: ignore
                    max_tgt_seq_len
                ),
                sim_tgt_out_idx.flatten(),
            ].view(batch_size, max_tgt_seq_len, candidate_feat_dim)
        )
        sim_tgt_out_probs = torch.tensor([1.0 / len(self.permutation_index)],
                                         device=self.device).repeat(batch_size)

        if self.reward_net is None:
            self.reward_net = _load_reward_net(self.reward_net_path,
                                               self.use_gpu)
        slate_reward = (self.reward_net(
            training_input.state.float_features,
            training_input.src_seq.float_features,
            sim_tgt_out_seq.float_features,
            training_input.src_src_mask,
            sim_tgt_out_idx,
        ).squeeze().detach())
        # guard-rail reward prediction range
        reward_clamp = self.parameters.simulation_reward_clamp
        if reward_clamp is not None:
            slate_reward = torch.clamp(slate_reward,
                                       min=reward_clamp.clamp_min,
                                       max=reward_clamp.clamp_max)
        # guard-rail sequence similarity
        distance_penalty = self.parameters.simulation_distance_penalty
        if distance_penalty is not None:
            slate_reward += distance_penalty * (self.MAX_DISTANCE -
                                                sim_distance)

        on_policy_input = rlt.PreprocessedRankingInput(
            state=training_input.state,
            src_seq=training_input.src_seq,
            src_src_mask=training_input.src_src_mask,
            tgt_in_seq=sim_tgt_in_seq,
            tgt_out_seq=sim_tgt_out_seq,
            tgt_tgt_mask=training_input.tgt_tgt_mask,
            slate_reward=slate_reward,
            src_in_idx=training_input.src_in_idx,
            tgt_in_idx=sim_tgt_in_idx,
            tgt_out_idx=sim_tgt_out_idx,
            tgt_out_probs=sim_tgt_out_probs,
        )
        return on_policy_input
Example #4
    def _simulated_training_input(
        self, training_input, sim_tgt_out_idx, sim_distance, device
    ):
        batch_size, max_tgt_seq_len = sim_tgt_out_idx.shape
        (
            _,
            max_src_seq_len,
            candidate_feat_dim,
        ) = training_input.src_seq.float_features.shape

        # candidates + padding_symbol + decoder_start_symbol
        candidate_size = max_src_seq_len + 2
        src_seq_augment = torch.zeros(
            batch_size, candidate_size, candidate_feat_dim, device=device
        )
        src_seq_augment[:, 2:, :] = training_input.src_seq.float_features

        sim_tgt_in_idx = torch.zeros_like(sim_tgt_out_idx).long()
        sim_tgt_in_idx[:, 0] = DECODER_START_SYMBOL
        sim_tgt_in_idx[:, 1:] = sim_tgt_out_idx[:, :-1]

        sim_tgt_in_seq = rlt.FeatureData(
            float_features=src_seq_augment[
                torch.arange(batch_size, device=device).repeat_interleave(
                    max_tgt_seq_len
                ),
                sim_tgt_in_idx.flatten(),
            ].view(batch_size, max_tgt_seq_len, candidate_feat_dim)
        )
        sim_tgt_out_seq = rlt.FeatureData(
            float_features=src_seq_augment[
                torch.arange(batch_size, device=device).repeat_interleave(
                    max_tgt_seq_len
                ),
                sim_tgt_out_idx.flatten(),
            ].view(batch_size, max_tgt_seq_len, candidate_feat_dim)
        )
        sim_tgt_out_probs = torch.tensor(
            [1.0 / len(self.permutation_index)], device=self.device
        ).repeat(batch_size)

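        # Load the reward network lazily on first use, then score the simulated
        # slates with it.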
        if self.reward_net is None:
            self.reward_net = _load_reward_net(self.reward_net_path, self.use_gpu)
        slate_reward = self.reward_net(
            training_input.state.float_features,
            training_input.src_seq.float_features,
            sim_tgt_out_seq.float_features,
            training_input.src_src_mask,
            sim_tgt_out_idx,
        ).detach()
        if slate_reward.ndim == 1:
            logger.warning(f"Slate reward should be 2-D tensor, unsqueezing")
            slate_reward = slate_reward.unsqueeze(1)
        elif slate_reward.ndim != 2:
            raise RuntimeError("Expect slate reward to be 2-D tensor")
        # guard-rail reward prediction range
        reward_clamp = self.parameters.simulation_reward_clamp
        if reward_clamp is not None:
            slate_reward = torch.clamp(
                slate_reward, min=reward_clamp.clamp_min, max=reward_clamp.clamp_max
            )
        # guard-rail sequence similarity
        distance_penalty = self.parameters.simulation_distance_penalty
        if distance_penalty is not None:
            slate_reward += distance_penalty * (self.MAX_DISTANCE - sim_distance)

        assert (
            len(slate_reward.shape) == 2 and slate_reward.shape[1] == 1
        ), f"{slate_reward.shape}"

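        # Repackage the batch with the simulated slates, their uniform propensities,
        # and the simulated rewards as an on-policy training input.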
        on_policy_input = rlt.PreprocessedRankingInput(
            state=training_input.state,
            src_seq=training_input.src_seq,
            src_src_mask=training_input.src_src_mask,
            tgt_in_seq=sim_tgt_in_seq,
            tgt_out_seq=sim_tgt_out_seq,
            tgt_tgt_mask=training_input.tgt_tgt_mask,
            slate_reward=slate_reward,
            src_in_idx=training_input.src_in_idx,
            tgt_in_idx=sim_tgt_in_idx,
            tgt_out_idx=sim_tgt_out_idx,
            tgt_out_probs=sim_tgt_out_probs,
        )
        return on_policy_input
Example #5
    def _simulated_training_input(self, training_input, sim_tgt_out_idx,
                                  sim_distance, device):
        batch_size, max_tgt_seq_len = sim_tgt_out_idx.shape
        (
            _,
            max_src_seq_len,
            candidate_feat_dim,
        ) = training_input.src_seq.float_features.shape

        # candidates + padding_symbol + decoder_start_symbol
        candidate_size = max_src_seq_len + 2
        src_seq_augment = torch.zeros(batch_size,
                                      candidate_size,
                                      candidate_feat_dim,
                                      device=device)
        src_seq_augment[:, 2:, :] = training_input.src_seq.float_features

        sim_tgt_in_idx = torch.zeros_like(sim_tgt_out_idx).long()
        sim_tgt_in_idx[:, 0] = DECODER_START_SYMBOL
        sim_tgt_in_idx[:, 1:] = sim_tgt_out_idx[:, :-1]

        sim_tgt_in_seq = rlt.FeatureData(
            float_features=src_seq_augment[
                torch.arange(batch_size, device=device).repeat_interleave(
                    max_tgt_seq_len
                ),
                sim_tgt_in_idx.flatten(),
            ].view(batch_size, max_tgt_seq_len, candidate_feat_dim)
        )
        sim_tgt_out_seq = rlt.FeatureData(
            float_features=src_seq_augment[
                torch.arange(batch_size, device=device).repeat_interleave(
                    max_tgt_seq_len
                ),
                sim_tgt_out_idx.flatten(),
            ].view(batch_size, max_tgt_seq_len, candidate_feat_dim)
        )
        sim_tgt_out_probs = torch.tensor([1.0 / len(self.permutation_index)],
                                         device=self.device).repeat(batch_size)

        if not self.reward_name_and_net:
            self.reward_name_and_net = _load_reward_net(
                self.sim_param.reward_name_path, self.use_gpu)

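        # The simulated slate reward is a weighted sum of the predictions from the
        # named reward networks.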
        sim_slate_reward = torch.zeros_like(training_input.slate_reward)
        for name, reward_net in self.reward_name_and_net.items():
            weight = self.sim_param.reward_name_weight[name]
            sr = reward_net(
                training_input.state.float_features,
                training_input.src_seq.float_features,
                sim_tgt_out_seq.float_features,
                training_input.src_src_mask,
                sim_tgt_out_idx,
            ).detach()
            assert sr.ndim == 2, f"Slate reward {name} output should be 2-D tensor"
            sim_slate_reward += weight * sr

        # guard-rail reward prediction range
        reward_clamp = self.sim_param.reward_clamp
        if reward_clamp is not None:
            sim_slate_reward = torch.clamp(sim_slate_reward,
                                           min=reward_clamp.clamp_min,
                                           max=reward_clamp.clamp_max)
        # guard-rail sequence similarity
        distance_penalty = self.sim_param.distance_penalty
        if distance_penalty is not None:
            sim_slate_reward += distance_penalty * (self.MAX_DISTANCE -
                                                    sim_distance)

        assert (
            len(sim_slate_reward.shape) == 2 and sim_slate_reward.shape[1] == 1
        ), f"{sim_slate_reward.shape}"

        on_policy_input = rlt.PreprocessedRankingInput(
            state=training_input.state,
            src_seq=training_input.src_seq,
            src_src_mask=training_input.src_src_mask,
            tgt_in_seq=sim_tgt_in_seq,
            tgt_out_seq=sim_tgt_out_seq,
            tgt_tgt_mask=training_input.tgt_tgt_mask,
            slate_reward=sim_slate_reward,
            src_in_idx=training_input.src_in_idx,
            tgt_in_idx=sim_tgt_in_idx,
            tgt_out_idx=sim_tgt_out_idx,
            tgt_out_probs=sim_tgt_out_probs,
        )
        return on_policy_input