def test_meta_estimate_intervals_using_invalid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description_1: str,
    alpha,
    n_bootstrap_samples,
    random_state,
    description_2: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """Test the response of estimate_intervals using invalid data."""
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[dm]
    )
    with pytest.raises(ValueError, match=f"{description_2}*"):
        _ = ope_.estimate_intervals(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
            alpha=alpha,
            n_bootstrap_samples=n_bootstrap_samples,
            random_state=random_state,
        )
    # estimate_intervals is also called inside summarize_off_policy_estimates
    with pytest.raises(ValueError, match=f"{description_2}*"):
        _ = ope_.summarize_off_policy_estimates(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
            alpha=alpha,
            n_bootstrap_samples=n_bootstrap_samples,
            random_state=random_state,
        )
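# --- Illustrative sketch (editorial addition, not part of the original suite) ---
# The parametrization that feeds `alpha`, `n_bootstrap_samples`, `random_state`, and
# the `description_*` strings above lives elsewhere in the test module. A minimal,
# hypothetical example of such invalid confidence-interval cases could look like the
# list below; the placeholder arrays and matched substrings are assumptions, and the
# real cases may differ.
_example_action_dist = np.ones((5, 4, 1)) / 4  # shape: (n_rounds, n_actions, len_list)
_example_estimated_rewards = np.zeros((5, 4, 1))
_example_invalid_interval_cases = [
    # (action_dist, estimated_rewards, desc_1, alpha, n_bootstrap_samples, random_state, desc_2)
    (_example_action_dist, _example_estimated_rewards, "valid arrays", 1.5, 100, 12345, "alpha"),
    (_example_action_dist, _example_estimated_rewards, "valid arrays", 0.05, -100, 12345, "n_bootstrap_samples"),
    (_example_action_dist, _example_estimated_rewards, "valid arrays", 0.05, 100, "s", "random_state"),
]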
def test_meta_create_estimator_inputs_using_valid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """Test _create_estimator_inputs using valid data."""
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[ipw]
    )
    estimator_inputs = ope_._create_estimator_inputs(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    assert set(estimator_inputs.keys()) == set(["ipw"])
    assert set(estimator_inputs["ipw"].keys()) == set(
        [
            "reward",
            "action",
            "pscore",
            "position",
            "action_dist",
            "estimated_rewards_by_reg_model",
            "estimated_pscore",
            "estimated_importance_weights",
            "p_e_a",
            "pi_b",
            "context",
            "action_embed",
        ]
    ), f"Invalid response of _create_estimator_inputs (test case: {description})"
    # _create_estimator_inputs is called in the following methods
    _ = ope_.estimate_policy_values(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    _ = ope_.estimate_intervals(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    _ = ope_.summarize_off_policy_estimates(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    _ = ope_.evaluate_performance_of_estimators(
        ground_truth_policy_value=0.1,
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    _ = ope_.summarize_estimators_comparison(
        ground_truth_policy_value=0.1,
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
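# --- Illustrative sketch (editorial addition, not part of the original suite) ---
# For the valid-input test above, `action_dist` and `estimated_rewards_by_reg_model`
# are expected to share the shape (n_rounds, n_actions, len_list); the placeholder
# arrays below are a hypothetical example of such a matching pair.
_n_rounds, _n_actions, _len_list = 10, 3, 1
_valid_action_dist = np.ones((_n_rounds, _n_actions, _len_list)) / _n_actions
_valid_estimated_rewards = np.random.uniform(size=(_n_rounds, _n_actions, _len_list))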
def test_meta_create_estimator_inputs_using_invalid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """Test _create_estimator_inputs using invalid data."""
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[ipw]
    )
    # raise ValueError when the shapes of the two arrays are different
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_._create_estimator_inputs(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    # _create_estimator_inputs is called in the following methods
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.estimate_policy_values(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.estimate_intervals(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.summarize_off_policy_estimates(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.evaluate_performance_of_estimators(
            ground_truth_policy_value=0.1,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.summarize_estimators_comparison(
            ground_truth_policy_value=0.1,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
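# --- Illustrative sketch (editorial addition, not part of the original suite) ---
# The invalid cases for the test above break the shape contract noted in the comment
# ("raise ValueError when the shapes of the two arrays are different"); hypothetical
# examples of such inputs are an estimated-rewards array whose shape disagrees with
# `action_dist`, or an `action_dist` that is not 3-dimensional.
_mismatched_estimated_rewards = np.zeros((10, 4, 1))  # vs. a (10, 3, 1) action_dist
_two_dimensional_action_dist = np.ones((10, 3)) / 3  # missing the len_list axis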
def test_meta_summarize_off_policy_estimates(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[ipw, ipw3]
    )
    value, interval = ope_.summarize_off_policy_estimates(random_action_dist)
    expected_value = pd.DataFrame(
        {
            "ipw": mock_policy_value + ipw.eps,
            "ipw3": mock_policy_value + ipw3.eps,
        },
        index=["estimated_policy_value"],
    ).T
    expected_interval = pd.DataFrame(
        {
            "ipw": {k: v + ipw.eps for k, v in mock_confidence_interval.items()},
            "ipw3": {k: v + ipw3.eps for k, v in mock_confidence_interval.items()},
        }
    ).T
    assert_frame_equal(value, expected_value), "Invalid summarization (policy value)"
    assert_frame_equal(interval, expected_interval), "Invalid summarization (interval)"
def test_meta_summarize_off_policy_estimates(
    action_dist,
    estimated_rewards_by_reg_model,
    description_1: str,
    alpha: float,
    n_bootstrap_samples: int,
    random_state: int,
    description_2: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """Test the response of summarize_off_policy_estimates using valid data."""
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[ipw, ipw3]
    )
    value, interval = ope_.summarize_off_policy_estimates(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        alpha=alpha,
        n_bootstrap_samples=n_bootstrap_samples,
        random_state=random_state,
    )
    expected_value = pd.DataFrame(
        {
            "ipw": mock_policy_value + ipw.eps,
            "ipw3": mock_policy_value + ipw3.eps,
        },
        index=["estimated_policy_value"],
    ).T
    expected_value["relative_estimated_policy_value"] = (
        expected_value["estimated_policy_value"]
        / synthetic_bandit_feedback["reward"].mean()
    )
    expected_interval = pd.DataFrame(
        {
            "ipw": {k: v + ipw.eps for k, v in mock_confidence_interval.items()},
            "ipw3": {k: v + ipw3.eps for k, v in mock_confidence_interval.items()},
        }
    ).T
    assert_frame_equal(value, expected_value), "Invalid summarization (policy value)"
    assert_frame_equal(interval, expected_interval), "Invalid summarization (interval)"
    # check the relative estimated policy value when the mean of bandit_feedback["reward"] is zero
    zero_reward_bandit_feedback = deepcopy(synthetic_bandit_feedback)
    zero_reward_bandit_feedback["reward"] = np.zeros(
        zero_reward_bandit_feedback["reward"].shape[0]
    )
    ope_ = OffPolicyEvaluation(
        bandit_feedback=zero_reward_bandit_feedback, ope_estimators=[ipw, ipw3]
    )
    value, _ = ope_.summarize_off_policy_estimates(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        alpha=alpha,
        n_bootstrap_samples=n_bootstrap_samples,
        random_state=random_state,
    )
    expected_value = pd.DataFrame(
        {
            "ipw": mock_policy_value + ipw.eps,
            "ipw3": mock_policy_value + ipw3.eps,
        },
        index=["estimated_policy_value"],
    ).T
    expected_value["relative_estimated_policy_value"] = np.nan
    assert_frame_equal(value, expected_value), "Invalid summarization (policy value)"
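# --- Illustrative sketch (editorial addition, not part of the original suite) ---
# `ipw`, `ipw3`, `mock_policy_value`, and `mock_confidence_interval` are mock objects
# defined elsewhere in the module. A minimal sketch of a mock estimator consistent with
# the expected frames above (policy value and interval shifted by `eps`) could look like
# the following; the real mocks may subclass obp's estimator base class and differ in detail.
from dataclasses import dataclass  # likely already imported at the top of the module


@dataclass
class ExampleInverseProbabilityWeightingMock:
    """Return a fixed policy value / confidence interval shifted by `eps`."""

    eps: float = 0.1
    estimator_name: str = "ipw"

    def estimate_policy_value(self, **kwargs) -> float:
        # shift the module-level mock value so each mock estimator is distinguishable
        return mock_policy_value + self.eps

    def estimate_interval(self, **kwargs) -> dict:
        return {k: v + self.eps for k, v in mock_confidence_interval.items()}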
HistGradientBoostingClassifier(**hyperparams))
# run a counterfactual bandit algorithm on logged bandit feedback data
selected_actions = run_bandit_simulation(
    bandit_feedback=bandit_feedback, policy=policy
)
# estimate the policy value of the counterfactual algorithm with the three OPE estimators
ope = OffPolicyEvaluation(
    bandit_feedback=bandit_feedback,
    regression_model=RegressionModel(base_model=base_model),
    action_context=obd.action_context,
    ope_estimators=[
        InverseProbabilityWeighting(),
        DirectMethod(),
        DoublyRobust(),
    ],
)
estimated_policy_value, estimated_interval = ope.summarize_off_policy_estimates(
    selected_actions=selected_actions
)
# calculate the estimated policy value relative to that of the behavior policy
print("=" * 70)
print(f"random_state={random_state}: counterfactual policy={policy_name}")
print("-" * 70)
estimated_policy_value["relative_estimated_policy_value"] = (
    estimated_policy_value.estimated_policy_value / ground_truth
)
print(estimated_policy_value)
print("=" * 70)
# save the counterfactual policy evaluation results in the `./logs` directory
save_path = Path("./logs") / behavior_policy / campaign / "cf_policy_selection"
save_path.mkdir(exist_ok=True, parents=True)
pd.DataFrame(estimated_policy_value).to_csv(save_path /