def process(b: int):
    # sample bootstrap data from the batch logged bandit feedback
    boot_bandit_feedback = obd.sample_bootstrap_bandit_feedback(
        test_size=test_size, is_timeseries_split=True, random_state=b
    )
    # train the evaluation policy on the training set of the logged bandit feedback data
    counterfactual_policy.fit(
        context=boot_bandit_feedback["context"],
        action=boot_bandit_feedback["action"],
        reward=boot_bandit_feedback["reward"],
        pscore=boot_bandit_feedback["pscore"],
        position=boot_bandit_feedback["position"],
    )
    # make action selections (predictions) on the test set
    action_dist = counterfactual_policy.predict(
        context=boot_bandit_feedback["context_test"]
    )
    # estimate the policy value of the counterfactual policy with the IPW estimator
    ipw = InverseProbabilityWeighting()
    return ipw.estimate_policy_value(
        reward=boot_bandit_feedback["reward_test"],
        action=boot_bandit_feedback["action_test"],
        position=boot_bandit_feedback["position_test"],
        pscore=boot_bandit_feedback["pscore_test"],
        action_dist=action_dist,
    )
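# The script presumably runs `process` once per bootstrap sample; a minimal,
# illustrative driver under that assumption is sketched below. joblib is used
# here for parallelism, and `n_boot_samples` / `n_jobs` are placeholder values
# rather than settings taken from the original script.
import numpy as np
from joblib import Parallel, delayed

n_boot_samples, n_jobs = 10, 4
boot_ipw_estimates = np.array(
    Parallel(n_jobs=n_jobs)(
        delayed(process)(b) for b in np.arange(n_boot_samples)
    )
)  # one IPW policy-value estimate per bootstrap sample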
def test_ipw_using_invalid_input_data(
    action_dist: np.ndarray,
    action: np.ndarray,
    reward: np.ndarray,
    pscore: np.ndarray,
    position: np.ndarray,
    use_estimated_pscore: bool,
    estimated_pscore: np.ndarray,
    description: str,
) -> None:
    # prepare ipw instances
    ipw = InverseProbabilityWeighting(use_estimated_pscore=use_estimated_pscore)
    snipw = SelfNormalizedInverseProbabilityWeighting(
        use_estimated_pscore=use_estimated_pscore
    )
    sgipw = SubGaussianInverseProbabilityWeighting(
        use_estimated_pscore=use_estimated_pscore
    )
    ipw_tuning = InverseProbabilityWeightingTuning(
        lambdas=[10, 1000], use_estimated_pscore=use_estimated_pscore
    )
    sgipw_tuning = SubGaussianInverseProbabilityWeightingTuning(
        lambdas=[0.01, 0.1], use_estimated_pscore=use_estimated_pscore
    )
    # every estimator must reject the invalid inputs with a ValueError,
    # both when estimating the policy value and when estimating its interval
    for estimator in [ipw, snipw, ipw_tuning, sgipw, sgipw_tuning]:
        with pytest.raises(ValueError, match=f"{description}*"):
            _ = estimator.estimate_policy_value(
                action_dist=action_dist,
                action=action,
                reward=reward,
                pscore=pscore,
                position=position,
                estimated_pscore=estimated_pscore,
            )
        with pytest.raises(ValueError, match=f"{description}*"):
            _ = estimator.estimate_interval(
                action_dist=action_dist,
                action=action,
                reward=reward,
                pscore=pscore,
                position=position,
                estimated_pscore=estimated_pscore,
            )
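# For context, a test like the one above is typically driven by a
# `pytest.mark.parametrize` fixture that pairs each invalid input with the
# expected error-message fragment (`description`). The case below is a
# hypothetical illustration, not the project's actual fixture: it passes a
# 2D `action_dist` where a 3D array is expected, so the estimator should
# raise a ValueError whose message mentions "action_dist".
import numpy as np
import pytest

from obp.ope import InverseProbabilityWeighting

hypothetical_invalid_input = [
    (
        np.ones((5, 4)) / 4,           # action_dist: 2D instead of the required 3D
        np.array([0, 1, 2, 3, 0]),     # action
        np.zeros(5),                   # reward
        np.ones(5) * 0.25,             # pscore
        np.zeros(5, dtype=int),        # position
        False,                         # use_estimated_pscore
        None,                          # estimated_pscore
        "action_dist",                 # description: expected error-message fragment
    ),
]


@pytest.mark.parametrize(
    "action_dist, action, reward, pscore, position, use_estimated_pscore, estimated_pscore, description",
    hypothetical_invalid_input,
)
def test_ipw_raises_on_invalid_action_dist(
    action_dist, action, reward, pscore, position, use_estimated_pscore, estimated_pscore, description
):
    ipw = InverseProbabilityWeighting(use_estimated_pscore=use_estimated_pscore)
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ipw.estimate_policy_value(
            action_dist=action_dist,
            action=action,
            reward=reward,
            pscore=pscore,
            position=position,
            estimated_pscore=estimated_pscore,
        )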
# train the evaluation policy on the training set of the logged bandit feedback data
evaluation_policy.fit(
    context=boot_bandit_feedback["context"],
    action=boot_bandit_feedback["action"],
    reward=boot_bandit_feedback["reward"],
    pscore=boot_bandit_feedback["pscore"],
    position=boot_bandit_feedback["position"],
)
# make action selections (predictions) on the test set
action_dist = evaluation_policy.predict(
    context=boot_bandit_feedback["context_test"]
)
# estimate the policy value of the evaluation policy with the IPW estimator,
# relative to the ground-truth value of the behavior policy
ipw = InverseProbabilityWeighting()
ope_results[b] = (
    ipw.estimate_policy_value(
        reward=boot_bandit_feedback["reward_test"],
        action=boot_bandit_feedback["action_test"],
        position=boot_bandit_feedback["position_test"],
        pscore=boot_bandit_feedback["pscore_test"],
        action_dist=action_dist,
    )
    / ground_truth
)
print(f"{b+1}-th iteration: {np.round((time.time() - start) / 60, 2)}min")

# summarize the bootstrapped relative policy values
ope_results_dict = estimate_confidence_interval_by_bootstrap(
    samples=ope_results, random_state=random_state
)
ope_results_dict["mean(no-boot)"] = ope_results.mean()
ope_results_dict["std"] = np.std(ope_results, ddof=1)
ope_results_df = pd.DataFrame(ope_results_dict, index=["ipw"])
print("=" * 70)
print(f"random_state={random_state}: evaluation policy={policy_name}")
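# After the bootstrap loop, the summary DataFrame built above is typically
# printed and written to disk; a minimal sketch of that step follows. The
# ./logs directory layout and file name are illustrative assumptions, not
# taken from the original script.
from pathlib import Path

print(ope_results_df)  # relative policy value of the evaluation policy (IPW / ground truth)
log_path = Path("./logs") / policy_name
log_path.mkdir(parents=True, exist_ok=True)
ope_results_df.to_csv(log_path / "relative_ope_results.csv")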