def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )
    # define evaluation policy using IPWLearner
    evaluation_policy = IPWLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # sample new training and test sets of synthetic logged bandit feedback
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # train the evaluation policy on the training set of the synthetic logged bandit feedback
    evaluation_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # predict the action decisions for the test set of the synthetic logged bandit feedback
    action_dist = evaluation_policy.predict(
        context=bandit_feedback_test["context"],
    )
    # estimate the mean reward function of the test set of synthetic bandit feedback with ML model
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        action_context=dataset.action_context,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        ),
    )
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback_test["context"],
        action=bandit_feedback_test["action"],
        reward=bandit_feedback_test["reward"],
        n_folds=3,  # 3-fold cross-fitting
        random_state=random_state,
    )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback_test,
        ope_estimators=ope_estimators,
    )
    relative_ee_i = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=action_dist,
        ),
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )

    return relative_ee_i
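# --- Hedged usage sketch (not part of the original script) ---
# Shows how `process` could be run over multiple random seeds and how the
# returned relative-ee dictionaries could be aggregated. It assumes the
# module-level names used above (n_actions, dim_context, n_rounds,
# base_model_dict, hyperparams, ope_estimators, random_state) are already
# defined, e.g. parsed from the command line; `n_runs` and `n_jobs` are
# hypothetical experiment settings.
from joblib import Parallel, delayed
import numpy as np
from pandas import DataFrame

n_runs, n_jobs = 10, -1  # assumed experiment settings
processed = Parallel(n_jobs=n_jobs)(delayed(process)(i) for i in np.arange(n_runs))
# collect the relative-ee of each estimator over the simulation runs
metric_dict = {est.estimator_name: {} for est in ope_estimators}
for i, relative_ee_i in enumerate(processed):
    for estimator_name, relative_ee_ in relative_ee_i.items():
        metric_dict[estimator_name][i] = relative_ee_
results_df = DataFrame(metric_dict).describe().T.round(6)
print(results_df[["mean", "std"]])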
def test_synthetic_calc_policy_value_using_invalid_inputs(
    expected_reward,
    action_dist,
    description,
):
    n_actions = 10
    dataset = SyntheticBanditDataset(n_actions=n_actions)

    with pytest.raises(ValueError, match=f"{description}*"):
        _ = dataset.calc_ground_truth_policy_value(
            expected_reward=expected_reward, action_dist=action_dist
        )
def test_synthetic_calc_policy_value_using_valid_inputs(
    expected_reward,
    action_dist,
    description,
):
    n_actions = 10
    dataset = SyntheticBanditDataset(n_actions=n_actions)

    policy_value = dataset.calc_ground_truth_policy_value(
        expected_reward=expected_reward, action_dist=action_dist
    )
    assert isinstance(
        policy_value, float
    ), "Invalid response of calc_ground_truth_policy_value"
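# --- Hedged input sketch (not part of the original tests) ---
# Illustrates the shapes calc_ground_truth_policy_value expects:
# expected_reward is 2D with shape (n_rounds, n_actions), and action_dist is
# 3D with shape (n_rounds, n_actions, len_list) summing to one over actions.
# The concrete sizes below are arbitrary placeholders.
import numpy as np

n_rounds_, n_actions_, len_list_ = 5, 10, 1
valid_expected_reward = np.random.uniform(size=(n_rounds_, n_actions_))
valid_action_dist = np.ones((n_rounds_, n_actions_, len_list_)) / n_actions_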
def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )
    # define evaluation policy using IPWLearner
    ipw_policy = IPWLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # baseline method 1. RandomPolicy
    random_policy = RandomPolicy(n_actions=dataset.n_actions)
    # baseline method 2. UniformSampleWeightLearner
    uniform_sample_weight_policy = UniformSampleWeightLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # sample new training and test sets of synthetic logged bandit feedback
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # train the evaluation policy on the training set of the synthetic logged bandit feedback
    ipw_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    uniform_sample_weight_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # predict the action decisions for the test set of the synthetic logged bandit feedback
    ipw_action_dist = ipw_policy.predict(
        context=bandit_feedback_test["context"],
    )
    random_action_dist = random_policy.predict(
        context=bandit_feedback_test["context"],
    )
    uniform_sample_weight_action_dist = uniform_sample_weight_policy.predict(
        context=bandit_feedback_test["context"],
    )
    # get the ground truth policy value for each learner
    gt_ipw_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=ipw_action_dist,
    )
    gt_random_policy = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=random_action_dist,
    )
    gt_uniform_sample_weight_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=uniform_sample_weight_action_dist,
    )

    return gt_ipw_learner, gt_random_policy, gt_uniform_sample_weight_learner
def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )
    # define a regression model used to estimate the mean reward function
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        action_context=dataset.action_context,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        ),
    )
    ope_estimator = DoublyRobust()
    # define evaluation policy using NNPolicyLearner
    nn_policy = NNPolicyLearner(
        n_actions=dataset.n_actions,
        dim_context=dim_context,
        off_policy_objective=ope_estimator.estimate_policy_value_tensor,
    )
    # baseline method 1. RandomPolicy
    random_policy = RandomPolicy(n_actions=dataset.n_actions)
    # baseline method 2. UniformSampleWeightLearner
    uniform_sample_weight_policy = UniformSampleWeightLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # sample new training and test sets of synthetic logged bandit feedback
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # estimate the mean reward function of the train set of synthetic bandit feedback with ML model
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        n_folds=3,  # 3-fold cross-fitting
        random_state=12345,
    )
    # train the evaluation policy on the training set of the synthetic logged bandit feedback
    nn_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    uniform_sample_weight_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # predict the action decisions for the test set of the synthetic logged bandit feedback
    nn_policy_action_dist = nn_policy.predict(
        context=bandit_feedback_test["context"],
    )
    random_action_dist = random_policy.predict(
        context=bandit_feedback_test["context"],
    )
    uniform_sample_weight_action_dist = uniform_sample_weight_policy.predict(
        context=bandit_feedback_test["context"],
    )
    # get the ground truth policy value for each learner
    gt_nn_policy_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=nn_policy_action_dist,
    )
    gt_random_policy = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=random_action_dist,
    )
    gt_uniform_sample_weight_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=uniform_sample_weight_action_dist,
    )

    return gt_nn_policy_learner, gt_random_policy, gt_uniform_sample_weight_learner
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    # predict the action decisions for the test set of the synthetic logged bandit feedback
    random_action_dist = random_policy.compute_batch_action_dist(n_rounds=n_rounds)
    ipw_learner_action_dist = ipw_learner.predict(
        context=bandit_feedback_test["context"],
    )
    nn_policy_learner_action_dist = nn_policy_learner.predict_proba(
        context=bandit_feedback_test["context"],
    )
    # evaluate learners' performances using ground-truth policy values
    random_policy_value = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=random_action_dist,
    )
    ipw_learner_policy_value = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=ipw_learner_action_dist,
    )
    nn_policy_learner_policy_value = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=nn_policy_learner_action_dist,
    )
    policy_value_df = DataFrame(
        [
            [random_policy_value],
            [ipw_learner_policy_value],
            [nn_policy_learner_policy_value],
def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )
    # sample new training and test sets of synthetic logged bandit data
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # defining policy learners
    ipw_policy = IPWLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    q_policy = QLearner(
        n_actions=dataset.n_actions,
        base_model=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    nn_policy = NNPolicyLearner(
        n_actions=dataset.n_actions,
        dim_context=dim_context,
        off_policy_objective="ipw",
    )
    # baseline method 1. RandomPolicy
    random_policy = RandomPolicy(n_actions=dataset.n_actions)
    # baseline method 2. UniformSampleWeightLearner
    uniform_sample_weight_policy = UniformSampleWeightLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # policy training
    ipw_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    q_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    nn_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    uniform_sample_weight_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # prediction / making decisions
    ipw_action_dist = ipw_policy.predict(
        context=bandit_feedback_test["context"],
    )
    q_action_dist = q_policy.predict(
        context=bandit_feedback_test["context"],
    )
    nn_action_dist = nn_policy.predict(
        context=bandit_feedback_test["context"],
    )
    random_action_dist = random_policy.predict(
        context=bandit_feedback_test["context"],
    )
    uniform_sample_weight_action_dist = uniform_sample_weight_policy.predict(
        context=bandit_feedback_test["context"],
    )
    # evaluation
    gt_ipw_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=ipw_action_dist,
    )
    gt_q_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=q_action_dist,
    )
    gt_nn_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=nn_action_dist,
    )
    gt_random_policy = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=random_action_dist,
    )
    gt_uniform_sample_weight_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=uniform_sample_weight_action_dist,
    )

    return (
        gt_ipw_learner,
        gt_q_learner,
        gt_nn_learner,
        gt_random_policy,
        gt_uniform_sample_weight_learner,
    )
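# --- Hedged usage sketch (not part of the original script) ---
# Runs the simulation over several seeds and summarizes the ground-truth value
# of each policy learner for comparison. The column labels and `n_runs` are
# assumptions; the column order follows the tuple returned by `process` above.
from joblib import Parallel, delayed
import numpy as np
from pandas import DataFrame

n_runs = 10  # assumed number of simulation runs
processed = Parallel(n_jobs=-1)(delayed(process)(i) for i in np.arange(n_runs))
policy_names = [
    "ipw_learner",
    "q_learner",
    "nn_policy_learner",
    "random_policy",
    "uniform_sample_weight_learner",
]
results_df = DataFrame(np.array(processed), columns=policy_names)
print(results_df.describe().T[["mean", "std"]])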