def process(i: int):
    # sample new data of synthetic logged bandit feedback
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # simulate the evaluation policy
    action_dist = run_bandit_simulation(
        bandit_feedback=bandit_feedback,
        policy=evaluation_policy,
    )
    # estimate the ground-truth policy values of the evaluation policy
    # by Monte-Carlo Simulation using p(r|x,a), the reward distribution
    ground_truth_policy_value = calc_ground_truth_policy_value(
        bandit_feedback=bandit_feedback,
        reward_sampler=dataset.sample_reward,  # p(r|x,a)
        policy=evaluation_policy,
        n_sim=n_sim,  # the number of simulations
    )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback,
        ope_estimators=ope_estimators,
    )
    relative_ee_i = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth_policy_value,
        action_dist=action_dist,
    )
    return relative_ee_i
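
# A typical driver for the `process` function above, following the pattern used
# in obp's example scripts: run it `n_runs` times in parallel and summarize the
# relative-ee of each estimator across runs. This is a minimal sketch assuming
# the surrounding script already defines `dataset`, `evaluation_policy`,
# `ope_estimators`, `n_rounds`, and `n_sim`, and imports
# `run_bandit_simulation`, `calc_ground_truth_policy_value`, and
# `OffPolicyEvaluation` from obp; `n_runs` and `n_jobs` are hypothetical values.
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

n_runs, n_jobs = 10, -1  # hypothetical experiment configuration
processed = Parallel(n_jobs=n_jobs, verbose=50)(
    [delayed(process)(i) for i in np.arange(n_runs)]
)
# collect the per-run relative-ee of each estimator into a summary table
relative_ee_dict = {estimator_name: {} for estimator_name in processed[0]}
for i, relative_ee_i in enumerate(processed):
    for estimator_name, relative_ee_ in relative_ee_i.items():
        relative_ee_dict[estimator_name][i] = relative_ee_
print(pd.DataFrame(relative_ee_dict).describe().T.round(6))
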
def test_meta_evaluate_performance_of_estimators(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    gt = 0.5
    # calculate relative-ee
    eval_metric_ope_dict = {
        "ipw": np.abs((mock_policy_value + ipw.eps - gt) / gt),
        "ipw3": np.abs((mock_policy_value + ipw3.eps - gt) / gt),
    }
    # check estimators' performances
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[ipw, ipw3]
    )
    performance = ope_.evaluate_performance_of_estimators(
        ground_truth_policy_value=gt,
        action_dist=random_action_dist,
        metric="relative-ee",
    )
    for k, v in performance.items():
        assert k in eval_metric_ope_dict, "Invalid key of performance response"
        assert v == eval_metric_ope_dict[k], "Invalid value of performance response"
    # zero division error when using relative-ee with a zero ground-truth value
    with pytest.raises(ZeroDivisionError, match=r"float division by zero"):
        _ = ope_.evaluate_performance_of_estimators(
            ground_truth_policy_value=0.0,
            action_dist=random_action_dist,
            metric="relative-ee",
        )
    # check summarization
    performance_df = ope_.summarize_estimators_comparison(
        ground_truth_policy_value=gt,
        action_dist=random_action_dist,
        metric="relative-ee",
    )
    # assert_frame_equal raises on mismatch, so the trailing assertion message
    # the original attached to this call was dead code and has been dropped
    assert_frame_equal(
        performance_df,
        pd.DataFrame(eval_metric_ope_dict, index=["relative-ee"]).T,
    )
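
# The test above relies on objects defined elsewhere in the test suite:
# `synthetic_bandit_feedback` and `random_action_dist` are presumably pytest
# fixtures from conftest, while `mock_policy_value`, `ipw`, and `ipw3` are mock
# estimators whose estimate_policy_value returns `mock_policy_value + eps`, so
# the expected relative-ee can be computed in closed form. Below is a minimal
# sketch of what such a mock might look like; the class name, eps values, and
# method bodies are illustrative assumptions, not obp's actual test code.
from dataclasses import dataclass
from typing import Dict

mock_policy_value = 0.5

@dataclass
class InverseProbabilityWeightingMock:
    """Mock IPW estimator returning a fixed policy value shifted by eps."""

    eps: float = 0.1
    estimator_name: str = "ipw"

    def estimate_policy_value(self, **kwargs) -> float:
        # deterministic output so the expected relative-ee is exactly |eps / gt|
        return mock_policy_value + self.eps

    def estimate_interval(self, **kwargs) -> Dict[str, float]:
        # trivial "confidence interval" collapsed onto the mocked point estimate
        point = mock_policy_value + self.eps
        return {
            "mean": point,
            "95.0% CI (lower)": point,
            "95.0% CI (upper)": point,
        }

ipw = InverseProbabilityWeightingMock()
ipw3 = InverseProbabilityWeightingMock(eps=0.02, estimator_name="ipw3")
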
def process(i: int):
    # split the original data into training and evaluation sets
    dataset.split_train_eval(eval_size=eval_size, random_state=i)
    # obtain logged bandit feedback generated by the behavior policy
    bandit_feedback = dataset.obtain_batch_bandit_feedback(random_state=i)
    # obtain action choice probabilities by an evaluation policy
    action_dist = dataset.obtain_action_dist_by_eval_policy(
        base_classifier_e=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
        alpha_e=alpha_e,
    )
    # calculate the ground-truth performance of the evaluation policy
    ground_truth_policy_value = dataset.calc_ground_truth_policy_value(
        action_dist=action_dist
    )
    # estimate the mean reward function of the evaluation set of the
    # multi-class classification data with an ML model
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        ),
    )
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback["context"],
        action=bandit_feedback["action"],
        reward=bandit_feedback["reward"],
        n_folds=3,  # 3-fold cross-fitting
        random_state=random_state,
    )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback,
        ope_estimators=ope_estimators,
    )
    relative_ee_i = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth_policy_value,
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    return relative_ee_i
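
# A hypothetical setup that the multiclass `process` above assumes, in the
# spirit of obp's multiclass example: a MultiClassToBanditReduction dataset
# built from a scikit-learn dataset, plus mappings from base-model names to
# classifier classes and hyperparameters. The dataset choice, `alpha_b`, and
# the entries of `base_model_dict` and `hyperparams` are illustrative
# assumptions; in the real script, names like `eval_size`, `alpha_e`,
# `random_state`, and the two `base_model_for_*` keys come from CLI arguments.
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from obp.dataset import MultiClassToBanditReduction

X, y = load_digits(return_X_y=True)
dataset = MultiClassToBanditReduction(
    X=X,
    y=y,
    base_classifier_b=LogisticRegression(max_iter=10000, random_state=12345),
    alpha_b=0.8,  # controls how close the behavior policy is to the base classifier
    dataset_name="digits",
)
base_model_dict = dict(
    logistic_regression=LogisticRegression,
    random_forest=RandomForestClassifier,
)
hyperparams = dict(
    logistic_regression={"max_iter": 10000, "random_state": 12345},
    random_forest={"n_estimators": 100, "random_state": 12345},
)
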