from typing import Dict

import pytest

from obp.dataset import OpenBanditDataset


def test_sample_bootstrap_bandit_feedback():
    dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
    bandit_feedback = dataset.obtain_batch_bandit_feedback()
    bootstrap_bf = dataset.sample_bootstrap_bandit_feedback()
    # the bootstrap sample should have the same size as the original feedback
    bf_keys = {"action", "position", "reward", "pscore", "context"}
    for k in bf_keys:
        assert len(bandit_feedback[k]) == len(bootstrap_bf[k])

    # the same should hold for the time-series split
    bandit_feedback_timeseries: Dict = dataset.obtain_batch_bandit_feedback(
        is_timeseries_split=True
    )[0]
    bootstrap_bf_timeseries = dataset.sample_bootstrap_bandit_feedback(
        is_timeseries_split=True
    )
    for k in bf_keys:
        assert len(bandit_feedback_timeseries[k]) == len(bootstrap_bf_timeseries[k])
def test_sample_bootstrap_bandit_feedback():
    dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
    bandit_feedback = dataset.obtain_batch_bandit_feedback()
    bootstrap_bf = dataset.sample_bootstrap_bandit_feedback()
    assert len(bandit_feedback["action"]) == len(bootstrap_bf["action"])
    assert len(bandit_feedback["position"]) == len(bootstrap_bf["position"])
    assert len(bandit_feedback["reward"]) == len(bootstrap_bf["reward"])
    assert len(bandit_feedback["pscore"]) == len(bootstrap_bf["pscore"])
    assert len(bandit_feedback["context"]) == len(bootstrap_bf["context"])
def test_sample_bootstrap_bandit_feedback():
    # invalid test_size values should raise ValueError
    with pytest.raises(ValueError):
        dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
        dataset.sample_bootstrap_bandit_feedback(
            is_timeseries_split=True, test_size=1.3
        )

    with pytest.raises(ValueError):
        dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
        dataset.sample_bootstrap_bandit_feedback(
            is_timeseries_split=True, test_size=-0.5
        )

    # invalid sample_size values should raise ValueError or TypeError
    with pytest.raises(ValueError):
        dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
        dataset.sample_bootstrap_bandit_feedback(sample_size=-50)

    with pytest.raises(TypeError):
        dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
        dataset.sample_bootstrap_bandit_feedback(sample_size=50.0)

    with pytest.raises(ValueError):
        dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
        dataset.sample_bootstrap_bandit_feedback(sample_size=10000000)

    # with default arguments, the bootstrap sample has the same size as the original data
    dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
    bandit_feedback = dataset.obtain_batch_bandit_feedback()
    bootstrap_bf = dataset.sample_bootstrap_bandit_feedback()
    bf_keys = {"action", "position", "reward", "pscore", "context"}
    for k in bf_keys:
        assert len(bandit_feedback[k]) == len(bootstrap_bf[k])

    # the same should hold for the time-series split
    bandit_feedback_timeseries: Dict = dataset.obtain_batch_bandit_feedback(
        is_timeseries_split=True
    )[0]
    bootstrap_bf_timeseries = dataset.sample_bootstrap_bandit_feedback(
        is_timeseries_split=True
    )
    for k in bf_keys:
        assert len(bandit_feedback_timeseries[k]) == len(bootstrap_bf_timeseries[k])

    # the sample_size argument controls the size of the bootstrap sample
    sample_size = 1000
    dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
    bootstrap_bf = dataset.sample_bootstrap_bandit_feedback(sample_size=sample_size)
    assert bootstrap_bf["n_rounds"] == sample_size
    for k in bf_keys:
        assert len(bootstrap_bf[k]) == sample_size
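# Usage sketch (not part of the tests above): the bootstrap API is useful for
# attaching uncertainty to quantities computed from the logged feedback. This is
# a minimal, illustrative example that estimates a 95% bootstrap confidence
# interval for the behavior policy's mean reward; it relies only on the
# OpenBanditDataset methods and feedback keys exercised by the tests.
import numpy as np

from obp.dataset import OpenBanditDataset

dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
n_boot = 100
boot_mean_rewards = np.empty(n_boot)
for b in range(n_boot):
    # each bootstrap sample is drawn with a different random seed
    boot_bf = dataset.sample_bootstrap_bandit_feedback(random_state=b)
    boot_mean_rewards[b] = boot_bf["reward"].mean()
lower, upper = np.percentile(boot_mean_rewards, [2.5, 97.5])
print(f"95% bootstrap CI for the mean reward: [{lower:.4f}, {upper:.4f}]")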
# compared OPE estimators
ope_estimators = [DirectMethod(), InverseProbabilityWeighting(), DoublyRobust()]
# a base ML model for the regression model used in Direct Method and Doubly Robust
base_model = CalibratedClassifierCV(LogisticRegression(**hyperparams))
# ground-truth policy value of the counterfactual policy,
# estimated with factual (observed) rewards (on-policy estimation)
ground_truth_policy_value = OpenBanditDataset.calc_on_policy_policy_value_estimate(
    behavior_policy=counterfactual_policy, campaign=campaign, data_path=data_path
)

evaluation_of_ope_results = {
    est.estimator_name: np.zeros(n_boot_samples) for est in ope_estimators
}
for b in np.arange(n_boot_samples):
    # sample a bootstrap sample from the batch logged bandit feedback
    boot_bandit_feedback = obd.sample_bootstrap_bandit_feedback(random_state=b)
    # run the counterfactual bandit algorithm on the bootstrapped feedback
    selected_actions = run_bandit_simulation(
        bandit_feedback=boot_bandit_feedback, policy=policy
    )
    # evaluate the estimation performance of the OPE estimators
    ope = OffPolicyEvaluation(
        bandit_feedback=boot_bandit_feedback,
        action_context=obd.action_context,
        regression_model=RegressionModel(base_model=base_model),
        ope_estimators=ope_estimators,
    )
    relative_estimation_errors = ope.evaluate_performance_of_estimators(
        selected_actions=selected_actions,
        ground_truth_policy_value=ground_truth_policy_value,
    )
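    # a hedged continuation sketch (the fragment above stops here): record each
    # bootstrap iteration's errors in the pre-allocated arrays, assuming
    # evaluate_performance_of_estimators returns a dict mapping estimator_name
    # to its relative estimation error.
    for estimator_name, relative_ee_b in relative_estimation_errors.items():
        evaluation_of_ope_results[estimator_name][b] = relative_ee_b

# after the loop, summarize the errors over bootstrap samples
# (assumes pandas is imported as pd)
evaluation_of_ope_results_df = pd.DataFrame(evaluation_of_ope_results).describe().T.round(6)
print(evaluation_of_ope_results_df[["mean", "std"]])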
)
start = time.time()
relative_ee = {
    est.estimator_name: np.zeros(n_boot_samples) for est in ope_estimators
}
for b in np.arange(n_boot_samples):
    # load the pre-trained regression model for this bootstrap iteration
    with open(reg_model_path / f"reg_model_{b}.pkl", "rb") as f:
        reg_model = pickle.load(f)
    with open(reg_model_path / f"is_for_reg_model_{b}.pkl", "rb") as f:
        is_for_reg_model = pickle.load(f)
    # sample a bootstrap sample from the batch logged bandit feedback
    boot_bandit_feedback = obd.sample_bootstrap_bandit_feedback(
        test_size=test_size,
        is_timeseries_split=is_timeseries_split,
        random_state=b,
    )
    # exclude the rounds that were used to train the regression model
    for key_ in ["context", "action", "reward", "pscore", "position"]:
        boot_bandit_feedback[key_] = boot_bandit_feedback[key_][~is_for_reg_model]
    if evaluation_policy == "bts":
        policy = BernoulliTS(
            n_actions=obd.n_actions,
            len_list=obd.len_list,
            is_zozotown_prior=True,  # replicate the policy used in ZOZOTOWN production
            campaign=campaign,
            random_state=random_state,
        )
        action_dist = policy.compute_batch_action_dist(
            n_sim=100000,
            n_rounds=boot_bandit_feedback["n_rounds"],
        )
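    # a hedged continuation sketch (the fragment above stops inside the "bts"
    # branch; other evaluation policies are omitted): evaluate the OPE estimators
    # on this bootstrap sample with the pre-trained regression model and record
    # the relative estimation errors. It assumes a ground_truth_policy_value
    # computed beforehand by on-policy estimation, and that OffPolicyEvaluation /
    # evaluate_performance_of_estimators accept the arguments shown; exact
    # signatures may differ across obp versions.
    ope = OffPolicyEvaluation(
        bandit_feedback=boot_bandit_feedback,
        regression_model=reg_model,
        ope_estimators=ope_estimators,
    )
    relative_estimation_errors = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth_policy_value,
        action_dist=action_dist,
    )
    for estimator_name, relative_estimation_error in relative_estimation_errors.items():
        relative_ee[estimator_name][b] = relative_estimation_error

# summarize results over bootstrap samples and report the elapsed time
# (assumes pandas is imported as pd)
relative_ee_df = pd.DataFrame(relative_ee).describe().T.round(6)
print(relative_ee_df[["mean", "std"]])
print(f"elapsed time: {np.round(time.time() - start, 2)} seconds")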