def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )
    # define evaluation policy using IPWLearner
    evaluation_policy = IPWLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # sample new training and test sets of synthetic logged bandit feedback
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # train the evaluation policy on the training set of the synthetic logged bandit feedback
    evaluation_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # predict the action decisions for the test set of the synthetic logged bandit feedback
    action_dist = evaluation_policy.predict(
        context=bandit_feedback_test["context"],
    )
    # estimate the mean reward function of the test set of synthetic bandit feedback with ML model
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        action_context=dataset.action_context,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        ),
    )
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback_test["context"],
        action=bandit_feedback_test["action"],
        reward=bandit_feedback_test["reward"],
        n_folds=3,  # 3-fold cross-fitting
        random_state=random_state,
    )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback_test,
        ope_estimators=ope_estimators,
    )
    relative_ee_i = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=action_dist,
        ),
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    return relative_ee_i
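# A minimal sketch (not part of the original script) of how `process` might be run over
# many random seeds and summarized. It assumes `n_runs` and `n_jobs` are supplied via the
# script's argparse arguments and that `ope_estimators` is the estimator list defined
# elsewhere in the script.
from joblib import Parallel, delayed
from pandas import DataFrame

processed = Parallel(n_jobs=n_jobs, verbose=50)(
    delayed(process)(i) for i in range(n_runs)
)
# collect the relative-ee of each estimator across simulation runs
metric_dict = {est.estimator_name: dict() for est in ope_estimators}
for i, relative_ee_i in enumerate(processed):
    for estimator_name, relative_ee_ in relative_ee_i.items():
        metric_dict[estimator_name][i] = relative_ee_
results_df = DataFrame(metric_dict).describe().T.round(6)
print(results_df)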
n_jobs = args.n_jobs
random_state = args.random_state
np.random.seed(random_state)
data_path = Path("../open_bandit_dataset")
# define a dataset class
obd = OBDWithInteractionFeatures(
    behavior_policy=behavior_policy,
    campaign=campaign,
    data_path=data_path,
    context_set=context_set,
)
# define a counterfactual policy based on IPWLearner
counterfactual_policy = IPWLearner(
    base_model=base_model_dict[base_model](**hyperparams[base_model]),
    n_actions=obd.n_actions,
    len_list=obd.len_list,
)
policy_name = f"{base_model}_{context_set}"
# ground-truth policy value of the Bernoulli TS policy (the current best policy) in the test set,
# which is the empirical mean of the factual (observed) rewards (on-policy estimation)
ground_truth = obd.calc_on_policy_policy_value_estimate(
    behavior_policy="bts",
    campaign=campaign,
    data_path=data_path,
    test_size=test_size,
    is_timeseries_split=True,
)

def process(b: int):
def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )
    # define evaluation policy using IPWLearner
    ipw_policy = IPWLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # baseline method 1. RandomPolicy
    random_policy = RandomPolicy(n_actions=dataset.n_actions)
    # baseline method 2. UniformSampleWeightLearner
    uniform_sample_weight_policy = UniformSampleWeightLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # sample new training and test sets of synthetic logged bandit feedback
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # train the evaluation policies on the training set of the synthetic logged bandit feedback
    ipw_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    uniform_sample_weight_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # predict the action decisions for the test set of the synthetic logged bandit feedback
    ipw_action_dist = ipw_policy.predict(
        context=bandit_feedback_test["context"],
    )
    random_action_dist = random_policy.predict(
        context=bandit_feedback_test["context"],
    )
    uniform_sample_weight_action_dist = uniform_sample_weight_policy.predict(
        context=bandit_feedback_test["context"],
    )
    # get the ground-truth policy value of each learner
    gt_ipw_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=ipw_action_dist,
    )
    gt_random_policy = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=random_action_dist,
    )
    gt_uniform_sample_weight_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=uniform_sample_weight_action_dist,
    )
    return gt_ipw_learner, gt_random_policy, gt_uniform_sample_weight_learner
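# A minimal sketch (not part of the original script) of summarizing the ground-truth
# policy values returned by `process`. `n_runs` is assumed to come from argparse; the
# policy names below simply mirror the order of the returned tuple.
results = [process(i) for i in range(n_runs)]
gt_array = np.array(results)  # shape: (n_runs, 3)
for name, values in zip(
    ["ipw_learner", "random_policy", "uniform_sample_weight_learner"], gt_array.T
):
    print(f"{name}: mean={values.mean():.5f}, std={values.std():.5f}")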
n_jobs = args.n_jobs
random_state = args.random_state
np.random.seed(random_state)
# synthetic data generator
dataset = SyntheticBanditDataset(
    n_actions=n_actions,
    dim_context=dim_context,
    reward_function=logistic_reward_function,
    behavior_policy_function=linear_behavior_policy,
    random_state=random_state,
)
# define evaluation policy using IPWLearner
evaluation_policy = IPWLearner(
    n_actions=dataset.n_actions,
    len_list=dataset.len_list,
    base_classifier=base_model_dict[base_model_for_evaluation_policy](
        **hyperparams[base_model_for_evaluation_policy]
    ),
)

def process(i: int):
    # sample new training and test sets of synthetic logged bandit feedback
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # train the evaluation policy on the training set of the synthetic logged bandit feedback
    evaluation_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDatasetWithActionEmbeds(
        n_actions=n_actions,
        dim_context=dim_context,
        beta=3.0,
        n_cat_dim=3,
        n_cat_per_dim=5,
        reward_function=logistic_reward_function,
        random_state=i,
    )
    # define evaluation policy using IPWLearner
    evaluation_policy = IPWLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_iw_estimator](
            **hyperparams[base_model_for_iw_estimator]
        ),
    )
    # sample new training and test sets of synthetic logged bandit data
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # train the evaluation policy on the training set of the synthetic logged bandit data
    evaluation_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # predict the action decisions for the test set of the synthetic logged bandit data
    action_dist = evaluation_policy.predict_proba(
        context=bandit_feedback_test["context"],
    )
    # estimate the reward function of the test set of synthetic bandit feedback with ML model
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        action_context=dataset.action_context,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        ),
    )
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback_test["context"],
        action=bandit_feedback_test["action"],
        reward=bandit_feedback_test["reward"],
        n_folds=2,
        random_state=12345,
    )
    # fit the propensity score estimator
    pscore_estimator = PropensityScoreEstimator(
        len_list=1,
        n_actions=n_actions,
        base_model=base_model_dict[base_model_for_pscore_estimator](
            **hyperparams[base_model_for_pscore_estimator]
        ),
        calibration_cv=3,
    )
    estimated_pscore = pscore_estimator.fit_predict(
        action=bandit_feedback_test["action"],
        position=bandit_feedback_test["position"],
        context=bandit_feedback_test["context"],
        n_folds=3,
        random_state=12345,
    )
    # fit importance weight estimators
    estimated_importance_weights_dict = {}
    for clf_name, clf_arguments in bipw_model_configurations.items():
        clf = ImportanceWeightEstimator(
            len_list=1,
            n_actions=n_actions,
            fitting_method=clf_arguments["fitting_method"],
            base_model=clf_arguments["base_model"],
        )
        estimated_importance_weights_dict[clf_name] = clf.fit_predict(
            action=bandit_feedback_test["action"],
            context=bandit_feedback_test["context"],
            action_dist=action_dist,
            position=bandit_feedback_test["position"],
            n_folds=2,
            evaluate_model_performance=False,
            random_state=12345,
        )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback_test,
        ope_estimators=ope_estimators
        + [
            MarginalizedInverseProbabilityWeighting(
                n_actions=n_actions, estimator_name="mipw"
            ),
            MarginalizedInverseProbabilityWeighting(
                n_actions=n_actions,
                embedding_selection_method="greedy",
                estimator_name="mipw (greedy selection)",
            ),
            SelfNormalizedMarginalizedInverseProbabilityWeighting(
                n_actions=n_actions, estimator_name="snmipw"
            ),
        ],
    )
    relative_ee_i = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=action_dist,
        ),
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        estimated_pscore=estimated_pscore,
        estimated_importance_weights=estimated_importance_weights_dict,
        action_embed=bandit_feedback_test["action_embed"],
        pi_b=bandit_feedback_test["pi_b"],
        metric="relative-ee",
    )
    return relative_ee_i
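# A minimal sketch (not part of the original script): a quick single-seed sanity check of
# the experiment above, printing the relative-ee of each estimator for seed 0 before
# launching the full set of simulation runs.
relative_ee_0 = process(0)
for estimator_name, relative_ee_ in relative_ee_0.items():
    print(f"{estimator_name}: relative-ee={relative_ee_:.5f}")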
        **hyperparams[base_model_for_reg_model]),
)
estimated_rewards_by_reg_model = regression_model.fit_predict(
    context=bandit_feedback_train["context"],
    action=bandit_feedback_train["action"],
    reward=bandit_feedback_train["reward"],
    n_folds=3,  # 3-fold cross-fitting
    random_state=random_state,
)
# define random evaluation policy
random_policy = Random(n_actions=dataset.n_actions, random_state=random_state)
# define evaluation policy using IPWLearner
ipw_learner = IPWLearner(
    n_actions=dataset.n_actions,
    base_classifier=base_model_dict[base_model_for_evaluation_policy](
        **hyperparams[base_model_for_evaluation_policy]
    ),
)
# define evaluation policy using NNPolicyLearner
nn_policy_learner = NNPolicyLearner(
    n_actions=dataset.n_actions,
    dim_context=dim_context,
    off_policy_objective=ope_estimator_dict[ope_estimator].estimate_policy_value_tensor,
    hidden_layer_size=tuple(n_hidden for _ in range(n_layers)),
    activation=activation,
    solver=solver,
    batch_size=batch_size,
    early_stopping=early_stopping,
    random_state=random_state,
)
def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )
    # sample new training and test sets of synthetic logged bandit data
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # defining policy learners
    ipw_policy = IPWLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    q_policy = QLearner(
        n_actions=dataset.n_actions,
        base_model=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    nn_policy = NNPolicyLearner(
        n_actions=dataset.n_actions,
        dim_context=dim_context,
        off_policy_objective="ipw",
    )
    # baseline method 1. RandomPolicy
    random_policy = RandomPolicy(n_actions=dataset.n_actions)
    # baseline method 2. UniformSampleWeightLearner
    uniform_sample_weight_policy = UniformSampleWeightLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # policy training
    ipw_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    q_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    nn_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    uniform_sample_weight_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # prediction/making decisions
    ipw_action_dist = ipw_policy.predict(
        context=bandit_feedback_test["context"],
    )
    q_action_dist = q_policy.predict(
        context=bandit_feedback_test["context"],
    )
    nn_action_dist = nn_policy.predict(
        context=bandit_feedback_test["context"],
    )
    random_action_dist = random_policy.predict(
        context=bandit_feedback_test["context"],
    )
    uniform_sample_weight_action_dist = uniform_sample_weight_policy.predict(
        context=bandit_feedback_test["context"],
    )
    # evaluation
    gt_ipw_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=ipw_action_dist,
    )
    gt_q_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=q_action_dist,
    )
    gt_nn_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=nn_action_dist,
    )
    gt_random_policy = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=random_action_dist,
    )
    gt_uniform_sample_weight_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=uniform_sample_weight_action_dist,
    )
    return (
        gt_ipw_learner,
        gt_q_learner,
        gt_nn_learner,
        gt_random_policy,
        gt_uniform_sample_weight_learner,
    )
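# A minimal sketch (not part of the original script) of comparing the learned policies.
# It assumes `n_runs` comes from argparse, and the column names below simply follow the
# order of the tuple returned by `process`.
from pandas import DataFrame

policy_names = [
    "ipw_learner",
    "q_learner",
    "nn_policy_learner",
    "random_policy",
    "uniform_sample_weight_learner",
]
results = [process(i) for i in range(n_runs)]
policy_value_df = DataFrame(results, columns=policy_names)
# average policy value per learner, and the ratio to the random baseline
summary = policy_value_df.mean()
print(summary)
print((summary / summary["random_policy"]).round(3))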
test_size = args.test_size
random_state = args.random_state
np.random.seed(random_state)
data_path = Path("../open_bandit_dataset")
# define a dataset class
obd = OBDWithInteractionFeatures(
    behavior_policy=behavior_policy,
    campaign=campaign,
    data_path=data_path,
    context_set=context_set,
)
# define an evaluation policy
evaluation_policy = IPWLearner(
    base_model=base_model_dict[base_model](**hyperparams[base_model]),
    n_actions=obd.n_actions,
    len_list=obd.len_list,
)
policy_name = f"{base_model}_{context_set}"
# ground-truth policy value of the Bernoulli TS policy (the current best policy) in the test set,
# which is the empirical mean of the factual (observed) rewards (on-policy estimation)
ground_truth = obd.calc_on_policy_policy_value_estimate(
    behavior_policy="bts",
    campaign=campaign,
    data_path=data_path,
    test_size=test_size,
    is_timeseries_split=True,
)

start = time.time()
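# A minimal sketch (not part of the original script) of how the `start` timestamp above is
# typically used: the training/OPE loop that would run in between is omitted in this fragment.
elapsed = time.time() - start
print(f"finished in {elapsed:.2f} seconds (policy: {policy_name})")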