Example #1
def test_ipw_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Test the format of ipw variants using synthetic bandit data and random evaluation policy
    """
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    # ipw estimators can be used without estimated_rewards_by_reg_model
    for estimator in [ipw, snipw]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"
    # remove necessary keys
    del input_dict["reward"]
    del input_dict["pscore"]
    del input_dict["action"]
    for estimator in [ipw, snipw]:
        with pytest.raises(
            TypeError,
            match=re.escape(
                "estimate_policy_value() missing 3 required positional arguments: 'reward', 'action', and 'pscore'"
            ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)

    input_tensor_dict = {
        k: v if v is None else torch.from_numpy(v)
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_tensor_dict["action_dist"] = torch.from_numpy(action_dist)
    for estimator in [ipw, snipw]:
        estimated_policy_value = estimator.estimate_policy_value_tensor(
            **input_tensor_dict
        )
        assert isinstance(
            estimated_policy_value, torch.Tensor
        ), f"invalid type response: {estimator}"
    # remove necessary keys
    del input_tensor_dict["reward"]
    del input_tensor_dict["pscore"]
    del input_tensor_dict["action"]
    for estimator in [ipw, snipw]:
        with pytest.raises(
            TypeError,
            match=re.escape(
                "estimate_policy_value_tensor() missing 3 required positional arguments: 'reward', 'action', and 'pscore'"
            ),
        ):
            _ = estimator.estimate_policy_value_tensor(**input_tensor_dict)
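
For reference, a sketch of what the two estimators compute (standard IPW/SNIPW definitions, not copied from obp's source): with importance weight $w_i = \pi_e(a_i \mid x_i) / \pi_b(a_i \mid x_i)$ (the denominator being the logged pscore),

$\hat{V}_{\mathrm{IPW}} = \frac{1}{n}\sum_{i=1}^{n} w_i r_i, \qquad \hat{V}_{\mathrm{SNIPW}} = \frac{\sum_{i=1}^{n} w_i r_i}{\sum_{i=1}^{n} w_i}.$

Neither uses a reward regression, which is why only reward, action, pscore, and position (plus action_dist) are required above.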
Example #2
def test_dr_shrinkage_using_random_evaluation_policy(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the dr shrinkage estimators using synthetic bandit data and random evaluation policy
    """
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :,
                                                                   np.newaxis]
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    dm_value = dm.estimate_policy_value(**input_dict)
    dr_value = dr.estimate_policy_value(**input_dict)
    dr_shrink_0_value = dr_shrink_0.estimate_policy_value(**input_dict)
    dr_shrink_max_value = dr_shrink_max.estimate_policy_value(**input_dict)
    assert (
        dm_value == dr_shrink_0_value
    ), "DoublyRobustWithShrinkage (lambda=0) should be the same as DirectMethod"
    assert (
        np.abs(dr_value - dr_shrink_max_value) < 1e-5
    ), "DoublyRobustWithShrinkage (lambda=inf) should be almost the same as DoublyRobust"

    # prepare input dict
    input_tensor_dict = {
        k: v if v is None else torch.from_numpy(v)
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_tensor_dict["action_dist"] = torch.from_numpy(action_dist)
    input_tensor_dict["estimated_rewards_by_reg_model"] = torch.from_numpy(
        expected_reward)
    dm_value = dm.estimate_policy_value_tensor(**input_tensor_dict)
    dr_value = dr.estimate_policy_value_tensor(**input_tensor_dict)
    dr_shrink_0_value = dr_shrink_0.estimate_policy_value_tensor(
        **input_tensor_dict)
    dr_shrink_max_value = dr_shrink_max.estimate_policy_value_tensor(
        **input_tensor_dict)
    assert (
        dm_value.item() == dr_shrink_0_value.item()
    ), "DoublyRobustWithShrinkage (lambda=0) should be the same as DirectMethod"
    assert (
        np.abs(dr_value.item() - dr_shrink_max_value.item()) < 1e-5
    ), "DoublyRobustWithShrinkage (lambda=inf) should be almost the same as DoublyRobust"
Example #3
def test_switch_dr_using_random_evaluation_policy(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the switch_dr using synthetic bandit data and random evaluation policy
    """
    expected_reward = np.expand_dims(
        synthetic_bandit_feedback["expected_reward"], axis=-1)
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    dm_value = dm.estimate_policy_value(**input_dict)
    dr_value = dr.estimate_policy_value(**input_dict)
    switch_dr_0_value = switch_dr_0.estimate_policy_value(**input_dict)
    switch_dr_max_value = switch_dr_max.estimate_policy_value(**input_dict)
    assert (dm_value == switch_dr_0_value
            ), "SwitchDR (tau=0) should be the same as DirectMethod"
    assert (dr_value == switch_dr_max_value
            ), "SwitchDR (tau=1e10) should be the same as DoublyRobust"
Example #4
def test_dr_using_random_evaluation_policy(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the format of dr variants using synthetic bandit data and random evaluation policy
    """
    expected_reward = np.expand_dims(
        synthetic_bandit_feedback["expected_reward"], axis=-1)
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    # dr estimators require all arguments
    for estimator in dr_estimators:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(estimated_policy_value,
                          float), f"invalid type response: {estimator}"
    # remove necessary keys
    del input_dict["reward"]
    del input_dict["pscore"]
    del input_dict["action"]
    del input_dict["estimated_rewards_by_reg_model"]
    for estimator in dr_estimators:
        with pytest.raises(
                TypeError,
                match=re.escape(
                    "estimate_policy_value() missing 4 required positional arguments: 'reward', 'action', 'pscore', and 'estimated_rewards_by_reg_model'"
                ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)
Example #5
def test_sg_dr_using_random_evaluation_policy(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test sg_dr (and dr with estimated pscore) using synthetic bandit data and random evaluation policy
    """
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :,
                                                                   np.newaxis]
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    dr_value = dr.estimate_policy_value(**input_dict)
    sg_dr_0_value = sg_dr_0.estimate_policy_value(**input_dict)
    assert (dr_value == sg_dr_0_value
            ), "SG-DR (lambda=0) should be the same as DoublyRobust"
    input_dict["estimated_pscore"] = input_dict["pscore"]
    del input_dict["pscore"]
    dr_value_estimated_pscore = dr_estimated_pscore.estimate_policy_value(
        **input_dict)
    assert (
        dr_value == dr_value_estimated_pscore
    ), "DoublyRobust with estimated_pscore (which is the same as pscore) should be the same as DoublyRobust"
Example #6
def test_boundedness_of_snipw_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Test the boundedness of snipw estimators using synthetic bandit data and random evaluation policy
    """
    action_dist = random_action_dist
    # prepare snipw
    snipw = SelfNormalizedInverseProbabilityWeighting()
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    # make pscore too small (to check the boundedness of snipw)
    input_dict["pscore"] = input_dict["pscore"] ** 3
    estimated_policy_value = snipw.estimate_policy_value(**input_dict)
    assert (
        estimated_policy_value <= 1
    ), f"estimated policy value of snipw should be smaller than or equal to 1 (because of its 1-boundedness), but the value is: {estimated_policy_value}"

    # snipw with estimated pscore
    snipw_estimated_pscore = SelfNormalizedInverseProbabilityWeighting(
        use_estimated_pscore=True
    )
    input_dict["estimated_pscore"] = input_dict["pscore"]
    del input_dict["pscore"]
    estimated_policy_value = snipw_estimated_pscore.estimate_policy_value(**input_dict)
    assert (
        estimated_policy_value <= 1
    ), f"estimated policy value of snipw should be smaller than or equal to 1 (because of its 1-boundedness), but the value is: {estimated_policy_value}"
Example #7
def test_boundedness_of_sndr_using_random_evaluation_policy(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the boundedness of sndr estimators using synthetic bandit data and random evaluation policy
    """
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :,
                                                                   np.newaxis]
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    # make pscore too small (to check the boundedness of sndr)
    input_dict["pscore"] = input_dict["pscore"]**3
    estimated_policy_value = sndr.estimate_policy_value(**input_dict)
    assert (
        estimated_policy_value <= 2
    ), f"estimated policy value of sndr should be smaller than or equal to 2 (because of its 2-boundedness), but the value is: {estimated_policy_value}"

    # prepare input dict
    input_tensor_dict = {
        k: v if v is None else torch.from_numpy(v)
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_tensor_dict["action_dist"] = torch.from_numpy(action_dist)
    input_tensor_dict["estimated_rewards_by_reg_model"] = torch.from_numpy(
        expected_reward)
    # make pscore too small (to check the boundedness of sndr)
    input_tensor_dict["pscore"] = input_tensor_dict["pscore"]**3
    estimated_policy_value = sndr.estimate_policy_value_tensor(
        **input_tensor_dict)
    assert (
        estimated_policy_value.item() <= 2
    ), f"estimated policy value of sndr should be smaller than or equal to 2 (because of its 2-boundedness), but the value is: {estimated_policy_value.item()}"
Example #8
def test_ipw_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Test the format of ipw variants using synthetic bandit data and random evaluation policy
    """
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    # ipw estimators can be used without estimated_rewards_by_reg_model
    for estimator in [ipw, snipw, ipw_tuning_mse, ipw_tuning_slope]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"

    # ipw with estimated pscore
    ipw_estimated_pscore = InverseProbabilityWeighting(use_estimated_pscore=True)
    snipw_estimated_pscore = SelfNormalizedInverseProbabilityWeighting(
        use_estimated_pscore=True
    )
    ipw_tuning_estimated_pscore = InverseProbabilityWeightingTuning(
        lambdas=[10, 1000], use_estimated_pscore=True
    )
    input_dict["estimated_pscore"] = input_dict["pscore"]
    del input_dict["pscore"]
    for estimator in [
        ipw_estimated_pscore,
        snipw_estimated_pscore,
        ipw_tuning_estimated_pscore,
    ]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"

    # remove necessary keys
    del input_dict["reward"]
    del input_dict["action"]
    for estimator in [ipw, snipw]:
        with pytest.raises(
            TypeError,
            match=re.escape(
                "estimate_policy_value() missing 2 required positional arguments: 'reward' and 'action'"
            ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)
Example #9
def test_dr_using_random_evaluation_policy(
        synthetic_multi_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the format of dr variants using synthetic bandit data and random evaluation policy
    """
    expected_reward = synthetic_multi_bandit_feedback[
        "expected_reward"][:, :, np.newaxis]
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_multi_bandit_feedback.items() if k in [
            "reward", "action", "pscore", "pscore_avg", "stratum_idx",
            "position"
        ]
    }
    input_dict["action_dist"] = action_dist
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    naive_dr = NaiveDR()
    bal_dr = BalDR()
    weighted_dr = WeightedDR()
    # dr estimators require all arguments
    for estimator in [naive_dr, bal_dr, weighted_dr]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(estimated_policy_value,
                          float), f"invalid type response: {estimator}"

    naive_dr = NaiveDR(use_estimated_pscore=True)
    bal_dr = BalDR(use_estimated_pscore=True)
    weighted_dr = WeightedDR(use_estimated_pscore=True)
    input_dict["estimated_pscore"] = input_dict["pscore"]
    input_dict["estimated_pscore_avg"] = input_dict["pscore"]
    # dr estimators require all arguments
    for estimator in [naive_dr, bal_dr, weighted_dr]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(estimated_policy_value,
                          float), f"invalid type response: {estimator}"

    # remove necessary keys
    del input_dict["reward"]
    del input_dict["action"]
    del input_dict["estimated_rewards_by_reg_model"]
    for estimator in [naive_dr, bal_dr, weighted_dr]:
        with pytest.raises(
                TypeError,
                match=re.escape(
                    "estimate_policy_value() missing 3 required positional arguments: 'reward', 'action', and 'estimated_rewards_by_reg_model'"
                ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)
Example #10
def test_fixture(
    synthetic_bandit_feedback: BanditFeedback,
    expected_reward_0: np.ndarray,
    feedback_key_set: Set[str],
    random_action_dist: np.ndarray,
) -> None:
    """
    Check the validity of the fixture data generated by conftest.py
    """
    np.testing.assert_array_almost_equal(
        expected_reward_0, synthetic_bandit_feedback["expected_reward"][0])
    assert feedback_key_set == set(
        synthetic_bandit_feedback.keys()
    ), f"Key set of bandit feedback should be {feedback_key_set}, but {synthetic_bandit_feedback.keys()}"
Example #11
def test_performance_of_binary_outcome_models(
        fixed_synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the performance of the importance weight estimator using synthetic bandit data and random evaluation policy
    when the importance weights are estimated by binary classification models
    """
    bandit_feedback = fixed_synthetic_bandit_feedback.copy()
    action_dist = random_action_dist
    random_state = 12345
    auc_scores: Dict[str, float] = {}
    fit_methods = ["sample", "raw"]
    for fit_method in fit_methods:
        for model_name, model in binary_model_dict.items():
            importance_weight_estimator = ImportanceWeightEstimator(
                n_actions=bandit_feedback["n_actions"],
                action_context=bandit_feedback["action_context"],
                base_model=model(**hyperparams[model_name]),
                fitting_method=fit_method,
                len_list=1,
            )
            # train importance weight estimator on logged bandit feedback data
            estimated_importance_weight = importance_weight_estimator.fit_predict(
                context=bandit_feedback["context"],
                action=bandit_feedback["action"],
                action_dist=action_dist,
                n_folds=2,  # 2-fold cross-fitting
                random_state=random_state,
                evaluate_model_performance=True,
            )
            assert np.all(estimated_importance_weight >= 0
                          ), "estimated_importance_weight must be non-negative"
            # extract predictions
            tmp_y = []
            tmp_pred = []
            for i in range(len(importance_weight_estimator.eval_result["y"])):
                tmp_y.append(importance_weight_estimator.eval_result["y"][i])
                tmp_pred.append(
                    importance_weight_estimator.eval_result["proba"][i])
            y_test = np.array(tmp_y).flatten()
            y_pred = np.array(tmp_pred).flatten()
            auc_scores[model_name + "_" + fit_method] = roc_auc_score(
                y_true=y_test,
                y_score=y_pred,
            )

    for model_name in auc_scores:
        print(f"AUC of {model_name} is {auc_scores[model_name]}")
        assert (auc_scores[model_name] >
                0.5), f"AUC of {model_name} should be greater than 0.5"
Example #12
def test_ipw_using_random_evaluation_policy(
        synthetic_multi_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the format of ipw variants using synthetic bandit data and random evaluation policy
    """
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_multi_bandit_feedback.items() if k in [
            "reward", "action", "pscore", "pscore_avg", "stratum_idx",
            "position"
        ]
    }
    input_dict["action_dist"] = action_dist
    naive_ipw = NaiveIPW()
    bal_ipw = BalIPW()
    weighted_ipw = WeightedIPW()
    # ipw estimators can be used without estimated_rewards_by_reg_model
    for estimator in [naive_ipw, bal_ipw, weighted_ipw]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(estimated_policy_value,
                          float), f"invalid type response: {estimator}"

    # ipw with estimated pscore
    naive_ipw = NaiveIPW(use_estimated_pscore=True)
    bal_ipw = BalIPW(use_estimated_pscore=True)
    weighted_ipw = WeightedIPW(use_estimated_pscore=True)
    input_dict["estimated_pscore"] = input_dict["pscore"]
    input_dict["estimated_pscore_avg"] = input_dict["pscore"]
    del input_dict["pscore"]
    del input_dict["pscore_avg"]
    for estimator in [naive_ipw, bal_ipw, weighted_ipw]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(estimated_policy_value,
                          float), f"invalid type response: {estimator}"

    # remove necessary keys
    del input_dict["reward"]
    del input_dict["action"]
    for estimator in [naive_ipw, weighted_ipw, bal_ipw]:
        with pytest.raises(
                TypeError,
                match=re.escape(
                    "estimate_policy_value() missing 2 required positional arguments: 'reward' and 'action'"
                ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)
Example #13
def test_dm_using_random_evaluation_policy(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the performance of the direct method using synthetic bandit data and random evaluation policy
    """
    expected_reward = np.expand_dims(
        synthetic_bandit_feedback["expected_reward"], axis=-1)
    action_dist = random_action_dist
    # compute ground truth policy value using expected reward
    q_pi_e = np.average(expected_reward[:, :, 0],
                        weights=action_dist[:, :, 0],
                        axis=1)
    # compute statistics of ground truth policy value
    gt_mean = q_pi_e.mean()
    # prepare dm
    dm = DirectMethod()
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    # estimated_rewards_by_reg_model is required
    with pytest.raises(
            TypeError,
            match=re.escape(
                "estimate_policy_value() missing 1 required positional argument: 'estimated_rewards_by_reg_model'"
            ),
    ):
        _ = dm.estimate_policy_value(**input_dict)
    # add estimated_rewards_by_reg_model
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    # check expectation
    estimated_policy_value = dm.estimate_policy_value(**input_dict)
    assert (gt_mean == estimated_policy_value
            ), "DM should be perfect when the regression model is perfect"
    # remove unnecessary keys
    del input_dict["reward"]
    del input_dict["pscore"]
    del input_dict["action"]
    estimated_policy_value = dm.estimate_policy_value(**input_dict)
    assert (gt_mean == estimated_policy_value
            ), "DM should be perfect when the regression model is perfect"
Example #14
def test_performance_of_binary_outcome_models(
    fixed_synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """
    Test the performance of the propensity score estimator using synthetic bandit data
    when the propensity scores are estimated by binary classification models
    """
    bandit_feedback = fixed_synthetic_bandit_feedback.copy()
    random_state = 12345
    auc_scores: Dict[str, float] = {}
    for model_name, model in binary_model_dict.items():
        propensity_score_estimator = PropensityScoreEstimator(
            n_actions=bandit_feedback["n_actions"],
            base_model=model(**hyperparams[model_name]),
            len_list=1,
        )
        # train propensity score estimator on logged bandit feedback data
        estimated_propensity_score = propensity_score_estimator.fit_predict(
            context=bandit_feedback["context"],
            action=bandit_feedback["action"],
            n_folds=2,  # 2-fold cross-fitting
            random_state=random_state,
            evaluate_model_performance=True,
        )
        assert np.all(
            estimated_propensity_score >= 0
        ), "estimated_propensity_score must be non-negative"
        # extract predictions
        tmp_y = []
        tmp_pred = []
        for i in range(len(propensity_score_estimator.eval_result["y"])):
            tmp_y.append(propensity_score_estimator.eval_result["y"][i])
            tmp_pred.append(propensity_score_estimator.eval_result["proba"][i])
        y_test = np.array(tmp_y).flatten()
        y_pred = np.array(tmp_pred).reshape(-1, tmp_pred[0].shape[1])
        auc_scores[model_name] = roc_auc_score(
            y_true=y_test, y_score=y_pred, multi_class="ovo"
        )

    for model_name in auc_scores:
        print(f"AUC (macro-ovo) of {model_name} is {auc_scores[model_name]}")
        assert (
            auc_scores[model_name] > 0.5
        ), f"AUC of {model_name} should be greater than 0.5"
Example #15
def test_ipw_using_random_evaluation_policy(
    synthetic_bandit_feedback_with_embed: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Test the format of ipw variants using synthetic bandit data and random evaluation policy
    """
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback_with_embed.items()
        if k in ["reward", "action", "pi_b", "action_embed", "context", "position"]
    }
    input_dict["action_dist"] = action_dist
    mipw = MIPW(n_actions=synthetic_bandit_feedback_with_embed["n_actions"])
    mipw_exact = MIPW(
        n_actions=synthetic_bandit_feedback_with_embed["n_actions"],
        embedding_selection_method="exact",
    )
    mipw_greedy = MIPW(
        n_actions=synthetic_bandit_feedback_with_embed["n_actions"],
        embedding_selection_method="greedy",
    )
    snmipw = SNMIPW(n_actions=synthetic_bandit_feedback_with_embed["n_actions"])
    # ipw estimators can be used without estimated_rewards_by_reg_model
    for estimator in [mipw, mipw_exact, mipw_greedy, snmipw]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"

    # remove necessary keys
    del input_dict["reward"]
    del input_dict["action"]
    for estimator in [mipw, snmipw]:
        with pytest.raises(
            TypeError,
            match=re.escape(
                "estimate_policy_value() missing 2 required positional arguments: 'reward' and 'action'"
            ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)
Example #16
def test_bipw_using_random_evaluation_policy(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the format of bipw variants using synthetic bandit data and random evaluation policy
    """
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    # insert dummy values
    input_dict["estimated_importance_weights"] = np.ones(action_dist.shape[0])
    # check response
    for estimator in [bipw]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(estimated_policy_value,
                          float), f"invalid type response: {estimator}"

    # make estimated_importance_weights too small (to check the boundedness of bipw)
    input_dict["estimated_importance_weights"] = input_dict["pscore"]**3
    estimated_policy_value = bipw.estimate_policy_value(**input_dict)
    assert (
        estimated_policy_value <= 1
    ), f"estimated policy value of bipw should be smaller than or equal to 1 (because of its 1-boundedness), but the value is: {estimated_policy_value}"

    # remove necessary keys
    del input_dict["reward"]
    del input_dict["action"]
    del input_dict["estimated_importance_weights"]
    for estimator in [bipw]:
        with pytest.raises(
                TypeError,
                match=re.escape(
                    "estimate_policy_value() missing 3 required positional arguments: 'reward', 'action', and 'estimated_importance_weights'"
                ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)
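
The 1-boundedness assertion above suggests that bipw, like SNIPW, self-normalizes by the supplied weights, i.e. something of the form $\sum_i \hat{w}_i r_i / \sum_i \hat{w}_i$ (an inference from this test rather than a statement about obp's implementation); such a weighted average of binary rewards stays within $[0, 1]$ regardless of how small estimated_importance_weights becomes.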
Example #17
def test_performance_of_binary_outcome_models(
    fixed_synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Test the performance of regression models using synthetic bandit data and random evaluation policy
    when the regression models are built on binary classification base models
    """
    bandit_feedback = fixed_synthetic_bandit_feedback.copy()
    expected_reward = np.expand_dims(bandit_feedback["expected_reward"], axis=-1)
    action_dist = random_action_dist
    # compute ground truth policy value using expected reward
    q_pi_e = np.average(expected_reward[:, :, 0], weights=action_dist[:, :, 0], axis=1)
    # compute statistics of ground truth policy value
    gt_mean = q_pi_e.mean()
    random_state = 12345
    auc_scores: Dict[str, float] = {}
    # check ground truth
    print(f"gt_mean: {gt_mean}")
    # check the performance of regression models using the doubly robust criterion (|\hat{q} - q| <= |q| should hold with high probability)
    dr_criteria_pass_rate = 0.8
    fit_methods = ["normal", "iw", "mrdr"]
    for fit_method in fit_methods:
        for model_name, model in binary_model_dict.items():
            regression_model = RegressionModel(
                n_actions=bandit_feedback["n_actions"],
                len_list=int(bandit_feedback["position"].max() + 1),
                action_context=bandit_feedback["action_context"],
                base_model=model(**hyperparams[model_name]),
                fitting_method=fit_method,
            )
            if fit_method == "normal":
                # train regression model on logged bandit feedback data
                estimated_rewards_by_reg_model = regression_model.fit_predict(
                    context=bandit_feedback["context"],
                    action=bandit_feedback["action"],
                    reward=bandit_feedback["reward"],
                    n_folds=3,  # 3-fold cross-fitting
                    random_state=random_state,
                )
            else:
                # train regression model on logged bandit feedback data
                estimated_rewards_by_reg_model = regression_model.fit_predict(
                    context=bandit_feedback["context"],
                    action=bandit_feedback["action"],
                    reward=bandit_feedback["reward"],
                    pscore=bandit_feedback["pscore"],
                    position=bandit_feedback["position"],
                    action_dist=action_dist,
                    n_folds=3,  # 3-fold cross-fitting
                    random_state=random_state,
                )
            auc_scores[model_name + "_" + fit_method] = roc_auc_score(
                y_true=bandit_feedback["reward"],
                y_score=estimated_rewards_by_reg_model[
                    np.arange(bandit_feedback["reward"].shape[0]),
                    bandit_feedback["action"],
                    bandit_feedback["position"],
                ],
            )
            # compare dr criteria
            dr_criteria = np.abs((gt_mean - estimated_rewards_by_reg_model)) - np.abs(
                gt_mean
            )
            print(
                f"DR criterion is satisfied with probability {np.mean(dr_criteria <= 0)} ------ model: {model_name} ({fit_method})"
            )
            assert (
                np.mean(dr_criteria <= 0) >= dr_criteria_pass_rate
            ), f"DR criterion should be satisfied with probability at least {dr_criteria_pass_rate}"

    for model_name in auc_scores:
        print(f"AUC of {model_name} is {auc_scores[model_name]}")
        assert (
            auc_scores[model_name] > 0.5
        ), f"AUC of {model_name} should be greater than 0.5"
Example #18
def test_dr_using_random_evaluation_policy(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the format of dr variants using synthetic bandit data and random evaluation policy
    """
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :,
                                                                   np.newaxis]
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    # dr estimators require all arguments
    for estimator in dr_estimators:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(estimated_policy_value,
                          float), f"invalid type response: {estimator}"
    # remove necessary keys
    del input_dict["reward"]
    del input_dict["pscore"]
    del input_dict["action"]
    del input_dict["estimated_rewards_by_reg_model"]
    for estimator in dr_estimators:
        with pytest.raises(
                TypeError,
                match=re.escape(
                    "estimate_policy_value() missing 4 required positional arguments: 'reward', 'action', 'pscore', and 'estimated_rewards_by_reg_model'"
                ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)

    # prepare input dict
    input_tensor_dict = {
        k: v if v is None else torch.from_numpy(v)
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_tensor_dict["action_dist"] = torch.from_numpy(action_dist)
    input_tensor_dict["estimated_rewards_by_reg_model"] = torch.from_numpy(
        expected_reward)
    # dr estimators require all arguments
    for estimator in dr_estimators:
        if estimator.estimator_name == "switch-dr":
            with pytest.raises(
                    NotImplementedError,
                    match=re.escape(
                        "This is not implemented for Swtich-DR because it is indifferentiable."
                    ),
            ):
                _ = estimator.estimate_policy_value_tensor(**input_tensor_dict)
        else:
            estimated_policy_value = estimator.estimate_policy_value_tensor(
                **input_tensor_dict)
            assert isinstance(
                estimated_policy_value,
                torch.Tensor), f"invalid type response: {estimator}"
    # remove necessary keys
    del input_tensor_dict["reward"]
    del input_tensor_dict["pscore"]
    del input_tensor_dict["action"]
    del input_tensor_dict["estimated_rewards_by_reg_model"]
    for estimator in dr_estimators:
        if estimator.estimator_name == "switch-dr":
            with pytest.raises(
                    NotImplementedError,
                    match=re.escape(
                        "This is not implemented for Swtich-DR because it is indifferentiable."
                    ),
            ):
                _ = estimator.estimate_policy_value_tensor(**input_tensor_dict)
        else:
            with pytest.raises(
                    TypeError,
                    match=re.escape(
                        "estimate_policy_value_tensor() missing 4 required positional arguments: 'reward', 'action', 'pscore', and 'estimated_rewards_by_reg_model'"
                    ),
            ):
                _ = estimator.estimate_policy_value_tensor(**input_tensor_dict)