def test_meta_evaluate_performance_of_estimators_using_invalid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description_1: str,
    metric,
    ground_truth_policy_value,
    err,
    description_2: str,
    synthetic_multi_bandit_feedback: BanditFeedback,
) -> None:
    """Test the response of evaluate_performance_of_estimators using invalid data.

    Both the direct call and the summarizing wrapper must raise the same
    error (`err`) with a message matching `description_2`.
    """
    ope_ = MultiLoggersOffPolicyEvaluation(
        bandit_feedback=synthetic_multi_bandit_feedback, ope_estimators=[dm]
    )
    with pytest.raises(err, match=f"{description_2}*"):
        _ = ope_.evaluate_performance_of_estimators(
            ground_truth_policy_value=ground_truth_policy_value,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
            action_dist=action_dist,
            metric=metric,
        )
    # evaluate_performance_of_estimators is called inside
    # summarize_estimators_comparison, so the same error must propagate
    with pytest.raises(err, match=f"{description_2}*"):
        _ = ope_.summarize_estimators_comparison(
            ground_truth_policy_value=ground_truth_policy_value,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
            action_dist=action_dist,
            metric=metric,
        )
def test_meta_evaluate_performance_of_estimators_using_valid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description_1: str,
    metric,
    ground_truth_policy_value,
    description_2: str,
    synthetic_multi_bandit_feedback: BanditFeedback,
) -> None:
    """Test the response of evaluate_performance_of_estimators using valid data.

    Builds the expected per-estimator metric values (relative-ee or squared
    error) from the mocked estimates, then checks both the dict returned by
    evaluate_performance_of_estimators and the DataFrame returned by
    summarize_estimators_comparison against them.
    """
    if metric == "relative-ee":
        # expected relative estimation error for each mocked estimator
        eval_metric_ope_dict = {
            "ipw": np.abs(
                (mock_policy_value + ipw.eps - ground_truth_policy_value)
                / ground_truth_policy_value
            ),
            "ipw3": np.abs(
                (mock_policy_value + ipw3.eps - ground_truth_policy_value)
                / ground_truth_policy_value
            ),
        }
    else:
        # expected squared error for each mocked estimator
        eval_metric_ope_dict = {
            "ipw": (mock_policy_value + ipw.eps - ground_truth_policy_value) ** 2,
            "ipw3": (mock_policy_value + ipw3.eps - ground_truth_policy_value) ** 2,
        }
    # check performance of estimators
    ope_ = MultiLoggersOffPolicyEvaluation(
        bandit_feedback=synthetic_multi_bandit_feedback, ope_estimators=[ipw, ipw3]
    )
    performance = ope_.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth_policy_value,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        action_dist=action_dist,
        metric=metric,
    )
    for k, v in performance.items():
        assert k in eval_metric_ope_dict, "Invalid key of performance response"
        assert v == eval_metric_ope_dict[k], "Invalid value of performance response"
    performance_df = ope_.summarize_estimators_comparison(
        ground_truth_policy_value=ground_truth_policy_value,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        action_dist=action_dist,
        metric=metric,
    )
    # assert_frame_equal raises AssertionError itself on mismatch; the
    # original trailing `, "Invalid summarization (performance)"` only
    # built a dead tuple and never acted as an assert message, so it is
    # dropped here.
    assert_frame_equal(
        performance_df, pd.DataFrame(eval_metric_ope_dict, index=[metric]).T
    )
def test_meta_create_estimator_inputs_using_valid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description: str,
    synthetic_multi_bandit_feedback: BanditFeedback,
) -> None:
    """Test the _create_estimator_inputs using valid data.

    NOTE: the original docstring said "invalid data", contradicting the
    function name and body (no error is expected anywhere below); fixed.
    """
    ope_ = MultiLoggersOffPolicyEvaluation(
        bandit_feedback=synthetic_multi_bandit_feedback, ope_estimators=[ipw]
    )
    estimator_inputs = ope_._create_estimator_inputs(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    assert set(estimator_inputs.keys()) == set(["ipw"])
    assert set(estimator_inputs["ipw"].keys()) == set(
        [
            "reward",
            "action",
            "pscore",
            "position",
            "action_dist",
            "stratum_idx",
            "pscore_avg",
            "estimated_rewards_by_reg_model",
            "estimated_pscore",
            "estimated_pscore_avg",
        ]
    ), f"Invalid response of _create_estimator_inputs (test case: {description})"
    # _create_estimator_inputs function is called in the following functions
    _ = ope_.estimate_policy_values(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    _ = ope_.estimate_intervals(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    _ = ope_.summarize_off_policy_estimates(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    _ = ope_.evaluate_performance_of_estimators(
        ground_truth_policy_value=0.1,
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    _ = ope_.summarize_estimators_comparison(
        ground_truth_policy_value=0.1,
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
def test_meta_create_estimator_inputs_using_invalid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description: str,
    synthetic_multi_bandit_feedback: BanditFeedback,
) -> None:
    """Test the _create_estimator_inputs using invalid data.

    NOTE: the original docstring said "valid data", contradicting the
    function name and body (every call below expects a ValueError); fixed.
    """
    ope_ = MultiLoggersOffPolicyEvaluation(
        bandit_feedback=synthetic_multi_bandit_feedback, ope_estimators=[ipw]
    )
    # raise ValueError when the shape of two arrays are different
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_._create_estimator_inputs(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    # _create_estimator_inputs function is called in the following functions,
    # so each public entry point must surface the same ValueError
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.estimate_policy_values(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.estimate_intervals(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.summarize_off_policy_estimates(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.evaluate_performance_of_estimators(
            ground_truth_policy_value=0.1,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.summarize_estimators_comparison(
            ground_truth_policy_value=0.1,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )