Example #1
File: test_meta.py Project: jq/zr-obp
def test_meta_estimation_format(synthetic_bandit_feedback: BanditFeedback,
                                random_action_dist: np.ndarray) -> None:
    """
    Test the response format of OffPolicyEvaluation
    """
    # single ope estimator
    ope_ = OffPolicyEvaluation(bandit_feedback=synthetic_bandit_feedback,
                               ope_estimators=[dm])
    assert ope_.estimate_policy_values(random_action_dist) == {
        "dm": mock_policy_value
    }, "OffPolicyEvaluation.estimate_policy_values ([DirectMethod]) returns a wrong value"
    assert ope_.estimate_intervals(random_action_dist) == {
        "dm": mock_confidence_interval
    }, "OffPolicyEvaluation.estimate_intervals ([DirectMethod]) returns a wrong value"
    with pytest.raises(AssertionError,
                       match=r"action_dist must be 3-dimensional.*"):
        # a 2D slice must be rejected: action_dist has to be 3-dimensional
        ope_.estimate_policy_values(random_action_dist[:, :, 0])
    # multiple ope estimators
    ope_ = OffPolicyEvaluation(bandit_feedback=synthetic_bandit_feedback,
                               ope_estimators=[dm, ipw])
    assert ope_.estimate_policy_values(random_action_dist) == {
        "dm": mock_policy_value,
        "ipw": mock_policy_value + ipw.eps,
    }, "OffPolicyEvaluation.estimate_policy_values ([DirectMethod, IPW]) returns a wrong value"
    assert ope_.estimate_intervals(random_action_dist) == {
        "dm": mock_confidence_interval,
        "ipw": {k: v + ipw.eps
                for k, v in mock_confidence_interval.items()},
    }, "OffPolicyEvaluation.estimate_intervals ([DirectMethod]) returns a wrong value"
Example #2
File: test_meta.py Project: zwcdp/zr-obp
def test_meta_estimate_policy_values_using_valid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """
    Test the response of estimate_policy_values using valid data
    """
    # single ope estimator
    ope_ = OffPolicyEvaluation(bandit_feedback=synthetic_bandit_feedback,
                               ope_estimators=[dm])
    assert ope_.estimate_policy_values(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    ) == {
        "dm": mock_policy_value
    }, "OffPolicyEvaluation.estimate_policy_values ([DirectMethod]) returns a wrong value"
    # multiple ope estimators
    ope_ = OffPolicyEvaluation(bandit_feedback=synthetic_bandit_feedback,
                               ope_estimators=[dm, ipw])
    assert ope_.estimate_policy_values(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    ) == {
        "dm": mock_policy_value,
        "ipw": mock_policy_value + ipw.eps,
    }, "OffPolicyEvaluation.estimate_policy_values ([DirectMethod, IPW]) returns a wrong value"
Example #3
def test_meta_create_estimator_inputs_using_valid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """
    Test the _create_estimator_inputs using valid data
    """
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[ipw]
    )
    estimator_inputs = ope_._create_estimator_inputs(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    assert set(estimator_inputs.keys()) == set(["ipw"])
    assert set(estimator_inputs["ipw"].keys()) == set(
        [
            "reward",
            "action",
            "pscore",
            "position",
            "action_dist",
            "estimated_rewards_by_reg_model",
            "estimated_pscore",
            "estimated_importance_weights",
            "p_e_a",
            "pi_b",
            "context",
            "action_embed",
        ]
    ), f"Invalid response of _create_estimator_inputs (test case: {description})"
    # _create_estimator_inputs function is called in the following functions
    _ = ope_.estimate_policy_values(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    _ = ope_.estimate_intervals(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    _ = ope_.summarize_off_policy_estimates(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    _ = ope_.evaluate_performance_of_estimators(
        ground_truth_policy_value=0.1,
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    _ = ope_.summarize_estimators_comparison(
        ground_truth_policy_value=0.1,
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
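The action_dist, estimated_rewards_by_reg_model, and description arguments of this test come from a pytest.mark.parametrize list defined in the test module. A hedged sketch of what one valid entry could look like; the shape constants and the helper test below are illustrative and must match the shapes produced by the synthetic_bandit_feedback fixture:

# Illustrative parametrization of valid inputs (assumed shape constants).
import numpy as np
import pytest

n_rounds, n_actions, len_list = 10000, 10, 1

valid_input_of_estimation = [
    (
        np.ones((n_rounds, n_actions, len_list)) / n_actions,  # action_dist
        np.zeros((n_rounds, n_actions, len_list)),  # estimated_rewards_by_reg_model
        "both arrays have shape (n_rounds, n_actions, len_list)",
    ),
]


@pytest.mark.parametrize(
    "action_dist, estimated_rewards_by_reg_model, description",
    valid_input_of_estimation,
)
def test_valid_inputs_have_matching_shapes(
    action_dist, estimated_rewards_by_reg_model, description
):
    # Hypothetical sanity check on the parametrized inputs themselves.
    assert action_dist.shape == estimated_rewards_by_reg_model.shape, description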
Example #4
def test_meta_estimated_rewards_by_reg_model_inputs(
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """
    Test the estimate_policy_values/estimate_intervals functions wrt estimated_rewards_by_reg_model
    """
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[DirectMethod()]
    )

    action_dist = np.zeros(
        (synthetic_bandit_feedback["n_rounds"], synthetic_bandit_feedback["n_actions"])
    )
    with pytest.raises(ValueError):
        ope_.estimate_policy_values(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=None,
        )

    with pytest.raises(ValueError):
        ope_.estimate_intervals(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=None,
        )
Example #5
def test_response_format_of_ope_estimators_using_random_evaluation_policy(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the response format of ope estimators using synthetic bandit data and random evaluation policy
    """
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :,
                                                                   np.newaxis]
    action_dist = random_action_dist
    # test all estimators
    all_estimators = ope.__all_estimators__
    estimators = [
        getattr(ope.estimators, estimator_name)()
        for estimator_name in all_estimators
    ]
    # conduct OPE
    ope_instance = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=estimators)
    estimated_policy_value = ope_instance.estimate_policy_values(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=expected_reward)
    estimated_intervals = ope_instance.estimate_intervals(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=expected_reward,
        random_state=12345,
    )
    # check the format of OPE
    for key in estimated_policy_value:
        # check the keys of the output dictionary of the estimate_intervals method
        assert set(estimated_intervals[key].keys()) == set([
            "mean", "95.0% CI (lower)", "95.0% CI (upper)"
        ]), f"Confidence interval of {key} has invalid keys"
        # check the relationship between the means and the confidence bounds estimated by OPE estimators
        assert (
            estimated_intervals[key]["95.0% CI (lower)"] <=
            estimated_policy_value[key]
        ) and (
            estimated_intervals[key]["95.0% CI (upper)"] >=
            estimated_policy_value[key]
        ), f"Estimated policy value of {key} is not included in estimated intervals of that estimator"
        assert (estimated_intervals[key]["mean"] >=
                estimated_intervals[key]["95.0% CI (lower)"]
                ), f"Invalid confidence interval of {key}: lower bound > mean"
        assert (estimated_intervals[key]["mean"] <=
                estimated_intervals[key]["95.0% CI (upper)"]
                ), f"Invalid confidence interval of {key}: upper bound < mean"
Example #6
def test_meta_create_estimator_inputs_using_invalid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """
    Test the _create_estimator_inputs using invalid data
    """
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[ipw]
    )
    # raise ValueError when the shapes of the two arrays differ
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_._create_estimator_inputs(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    # _create_estimator_inputs function is called in the following functions
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.estimate_policy_values(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.estimate_intervals(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.summarize_off_policy_estimates(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.evaluate_performance_of_estimators(
            ground_truth_policy_value=0.1,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.summarize_estimators_comparison(
            ground_truth_policy_value=0.1,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
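As with the valid-input test, the invalid inputs and their description strings are parametrized elsewhere, and each description doubles as the regex passed to pytest.raises(match=...). A hedged sketch of one possible entry with deliberately mismatched shapes; the message text is an illustrative placeholder and would have to be a regex prefix of the library's actual ValueError message:

# Illustrative invalid-input parametrization (assumed shape constants).
import numpy as np

n_rounds, n_actions, len_list = 10000, 10, 1

invalid_input_of_estimation = [
    (
        np.ones((n_rounds, n_actions, len_list)) / n_actions,  # valid action_dist
        np.zeros((n_rounds, n_actions - 1, len_list)),  # mismatched second axis
        "Expected `estimated_rewards_by_reg_model.shape == action_dist.shape`",
    ),
]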
Example #7
def test_performance_of_ope_estimators_using_random_evaluation_policy(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the performance of ope estimators using synthetic bandit data and random evaluation policy
    """
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :,
                                                                   np.newaxis]
    action_dist = random_action_dist
    # compute ground truth policy value using expected reward
    q_pi_e = np.average(expected_reward[:, :, 0],
                        weights=action_dist[:, :, 0],
                        axis=1)
    # compute statistics of ground truth policy value
    gt_mean = q_pi_e.mean()
    gt_std = q_pi_e.std(ddof=1)
    # test most of the estimators (ReplayMethod is not tested because it is out of scope)
    all_estimators = ope.__all_estimators__
    estimators = [
        getattr(ope.estimators, estimator_name)()
        for estimator_name in all_estimators
        if estimator_name not in ["ReplayMethod"]
    ]
    # conduct OPE
    ope_instance = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=estimators)
    estimated_policy_value = ope_instance.estimate_policy_values(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=expected_reward)
    # check the performance of OPE
    ci_bound = gt_std * 3 / np.sqrt(q_pi_e.shape[0])
    print(f"gt_mean: {gt_mean}, 3 * gt_std / sqrt(n): {ci_bound}")
    for key in estimated_policy_value:
        print(
            f"estimated_value: {estimated_policy_value[key]} ------ estimator: {key}, "
        )
        # test the performance of each estimator
        assert (
            np.abs(gt_mean - estimated_policy_value[key]) <= ci_bound
        ), f"OPE of {key} did not work well (absolute error is greater than 3*sigma)"
Example #8
def test_response_format_of_ope_estimators_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Test the response format of ope estimators using synthetic bandit data and random evaluation policy
    """
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :, np.newaxis]
    action_dist = random_action_dist
    # test all estimators
    all_estimators = ope.__all_estimators__
    estimators_standard = [
        getattr(ope.estimators, estimator_name)() for estimator_name in all_estimators
    ]
    all_estimators_tuning = ope.__all_estimators_tuning__
    estimators_tuning = [
        getattr(ope.estimators_tuning, estimator_name)(
            lambdas=[1, 100, 10000, np.inf],
            tuning_method=tuning_method,
        )
        for estimator_name in all_estimators_tuning
        for tuning_method in ["slope", "mse"]
    ]
    all_estimators_tuning_sg = ope.__all_estimators_tuning_sg__
    estimators_tuning_sg = [
        getattr(ope.estimators_tuning, estimator_name)(
            lambdas=[0.001, 0.01, 0.1, 1.0],
            tuning_method=tuning_method,
        )
        for estimator_name in all_estimators_tuning_sg
        for tuning_method in ["slope", "mse"]
    ]
    estimators = estimators_standard + estimators_tuning + estimators_tuning_sg
    # skip pscore / importance-weight estimation: pass the true values directly
    estimated_pscore = None
    estimated_importance_weights = (
        random_action_dist[
            np.arange(synthetic_bandit_feedback["action"].shape[0]),
            synthetic_bandit_feedback["action"],
            np.zeros(
                synthetic_bandit_feedback["action"].shape[0], dtype=int
            ),  # position is None
        ]
        / synthetic_bandit_feedback["pscore"]
    )
    # conduct OPE
    ope_instance = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=estimators
    )
    estimated_policy_value = ope_instance.estimate_policy_values(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=expected_reward,
        estimated_pscore=estimated_pscore,
        estimated_importance_weights=estimated_importance_weights,
    )
    estimated_intervals = ope_instance.estimate_intervals(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=expected_reward,
        estimated_pscore=estimated_pscore,
        estimated_importance_weights=estimated_importance_weights,
        random_state=12345,
    )
    # check the format of OPE
    for key in estimated_policy_value:
        # check the keys of the output dictionary of the estimate_intervals method
        assert set(estimated_intervals[key].keys()) == set(
            ["mean", "95.0% CI (lower)", "95.0% CI (upper)"]
        ), f"Confidence interval of {key} has invalid keys"
        # check the relationship between the means and the confidence bounds estimated by OPE estimators
        assert (
            estimated_intervals[key]["95.0% CI (lower)"] <= estimated_policy_value[key]
        ) and (
            estimated_intervals[key]["95.0% CI (upper)"] >= estimated_policy_value[key]
        ), f"Estimated policy value of {key} is not included in estimated intervals of that estimator"
        assert (
            estimated_intervals[key]["mean"]
            >= estimated_intervals[key]["95.0% CI (lower)"]
        ), f"Invalid confidence interval of {key}: lower bound > mean"
        assert (
            estimated_intervals[key]["mean"]
            <= estimated_intervals[key]["95.0% CI (upper)"]
        ), f"Invalid confidence interval of {key}: upper bound < mean"
Example #9
def test_performance_of_ope_estimators_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Test the performance of ope estimators using synthetic bandit data and random evaluation policy
    """
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :, np.newaxis]
    action_dist = random_action_dist
    # compute ground truth policy value using expected reward
    q_pi_e = np.average(expected_reward[:, :, 0], weights=action_dist[:, :, 0], axis=1)
    # compute statistics of ground truth policy value
    gt_mean = q_pi_e.mean()
    # test most of the estimators (ReplayMethod is not tested because it is out of scope)
    all_estimators = ope.__all_estimators__
    estimators_standard = [
        getattr(ope.estimators, estimator_name)()
        for estimator_name in all_estimators
        if estimator_name not in ["ReplayMethod"]
    ]
    all_estimators_tuning = ope.__all_estimators_tuning__
    estimators_tuning = [
        getattr(ope.estimators_tuning, estimator_name)(
            lambdas=[1, 100, 10000, np.inf],
            tuning_method=tuning_method,
        )
        for estimator_name in all_estimators_tuning
        for tuning_method in ["slope", "mse"]
    ]
    all_estimators_tuning_sg = ope.__all_estimators_tuning_sg__
    estimators_tuning_sg = [
        getattr(ope.estimators_tuning, estimator_name)(
            lambdas=[0.001, 0.01, 0.1, 1.0],
            tuning_method=tuning_method,
        )
        for estimator_name in all_estimators_tuning_sg
        for tuning_method in ["slope", "mse"]
    ]
    estimators = estimators_standard + estimators_tuning + estimators_tuning_sg
    # skip pscore / importance-weight estimation: pass the true values directly
    estimated_pscore = None
    estimated_importance_weights = (
        random_action_dist[
            np.arange(synthetic_bandit_feedback["action"].shape[0]),
            synthetic_bandit_feedback["action"],
            np.zeros(
                synthetic_bandit_feedback["action"].shape[0], dtype=int
            ),  # position is None
        ]
        / synthetic_bandit_feedback["pscore"]
    )
    # conduct OPE
    ope_instance = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=estimators
    )
    estimated_policy_value = ope_instance.estimate_policy_values(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=expected_reward,
        estimated_pscore=estimated_pscore,
        estimated_importance_weights=estimated_importance_weights,
    )
    # check the performance of OPE
    print(f"gt_mean: {gt_mean}")
    for key in estimated_policy_value:
        print(
            f"estimated_value: {estimated_policy_value[key]} ------ estimator: {key}, "
        )
        # test the performance of each estimator
        assert (
            np.abs(gt_mean - estimated_policy_value[key]) / gt_mean <= 0.1
        ), f"OPE of {key} did not work well (relative absolute error is greater than 10%)"