def test_synthetic_sample_reward_using_valid_inputs(context, action, description):
    n_actions = 10
    dataset = SyntheticBanditDataset(n_actions=n_actions, dim_context=3)
    reward = dataset.sample_reward(context=context, action=action)
    assert isinstance(reward, np.ndarray), "Invalid response of sample_reward"
    assert reward.shape == action.shape, "Invalid response of sample_reward"
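# A minimal sketch of the (context, action, description) triples this test
# presumably receives via @pytest.mark.parametrize; the values below are
# illustrative, not the suite's actual fixtures.
valid_input_of_sample_reward = [
    # context: (n_rounds, dim_context=3); action: ints in [0, n_actions=10)
    (np.ones((2, 3)), np.array([0, 9]), "valid shapes and dtypes"),
]
# @pytest.mark.parametrize("context, action, description", valid_input_of_sample_reward)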
def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )
    # define evaluation policy using IPWLearner
    evaluation_policy = IPWLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # sample new training and test sets of synthetic logged bandit feedback
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # train the evaluation policy on the training set of the synthetic logged bandit feedback
    evaluation_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # predict the action decisions for the test set of the synthetic logged bandit feedback
    action_dist = evaluation_policy.predict(
        context=bandit_feedback_test["context"],
    )
    # estimate the mean reward function of the test set of synthetic bandit feedback with ML model
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        action_context=dataset.action_context,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        ),
    )
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback_test["context"],
        action=bandit_feedback_test["action"],
        reward=bandit_feedback_test["reward"],
        n_folds=3,  # 3-fold cross-fitting
        random_state=random_state,
    )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback_test,
        ope_estimators=ope_estimators,
    )
    relative_ee_i = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=action_dist,
        ),
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    return relative_ee_i
def process(i: int):
    # synthetic data generator with uniformly random policy
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=None,  # uniformly random
        random_state=i,
    )
    # sample new data of synthetic logged bandit feedback
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # simulate the evaluation policy
    action_dist = run_bandit_simulation(
        bandit_feedback=bandit_feedback,
        policy=evaluation_policy,
    )
    # estimate the ground-truth policy values of the evaluation policy
    # by Monte Carlo simulation using p(r|x,a), the reward distribution
    ground_truth_policy_value = calc_ground_truth_policy_value(
        bandit_feedback=bandit_feedback,
        reward_sampler=dataset.sample_reward,  # p(r|x,a)
        policy=evaluation_policy,
        n_sim=n_sim,  # the number of simulations
    )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback,
        ope_estimators=ope_estimators,
    )
    metric_i = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth_policy_value,
        action_dist=action_dist,
    )
    return metric_i
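# How these scripts typically drive `process` (a sketch assuming joblib and the
# script's `n_runs`/`n_jobs` configuration, not part of the function above):
# run the experiment for `n_runs` independent random states in parallel and
# collect the per-run metrics.
from joblib import delayed, Parallel

processed = Parallel(backend="multiprocessing", n_jobs=n_jobs)(
    [delayed(process)(i) for i in np.arange(n_runs)]
)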
def test_synthetic_sample_reward_using_invalid_inputs(context, action, description):
    n_actions = 10
    dataset = SyntheticBanditDataset(n_actions=n_actions)
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = dataset.sample_reward(context=context, action=action)
def test_synthetic_calc_policy_value_using_invalid_inputs(
    expected_reward,
    action_dist,
    description,
):
    n_actions = 10
    dataset = SyntheticBanditDataset(n_actions=n_actions)
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = dataset.calc_ground_truth_policy_value(
            expected_reward=expected_reward, action_dist=action_dist
        )
def test_synthetic_calc_policy_value_using_valid_inputs(
    expected_reward,
    action_dist,
    description,
):
    n_actions = 10
    dataset = SyntheticBanditDataset(n_actions=n_actions)
    policy_value = dataset.calc_ground_truth_policy_value(
        expected_reward=expected_reward, action_dist=action_dist
    )
    assert isinstance(
        policy_value, float
    ), "Invalid response of calc_ground_truth_policy_value"
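# Illustrative valid inputs for the test above (hypothetical values, not the
# suite's actual parametrization). `expected_reward` has shape
# (n_rounds, n_actions) and `action_dist` has shape (n_rounds, n_actions, len_list),
# with each round's distribution summing to one; the ground-truth value is
# presumably the per-round average of the expected reward weighted by the policy.
import numpy as np

n_rounds, n_actions = 5, 10
expected_reward = np.random.default_rng(12345).uniform(size=(n_rounds, n_actions))
action_dist = np.ones((n_rounds, n_actions, 1)) / n_actions  # uniform policy
# under the uniform policy, this weighted average reduces to the grand mean
assert np.isclose(
    np.average(expected_reward, weights=action_dist[:, :, 0], axis=1).mean(),
    expected_reward.mean(),
)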
def synthetic_bandit_feedback() -> BanditFeedback:
    n_actions = 10
    dim_context = 5
    random_state = 12345
    n_rounds = 10000
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=random_state,
    )
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    return bandit_feedback
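# A minimal usage sketch (the test name is hypothetical): in the test suite this
# helper is presumably registered as a pytest fixture and injected by name.
def test_fixture_shape(synthetic_bandit_feedback: BanditFeedback) -> None:
    assert synthetic_bandit_feedback["n_rounds"] == 10000
    assert synthetic_bandit_feedback["n_actions"] == 10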
def test_synthetic_init():
    # when reward_function is None, expected_reward is randomly sampled in [0, 1]
    # this check includes the test of `sample_contextfree_expected_reward` function
    dataset = SyntheticBanditDataset(n_actions=2, beta=0)
    assert len(dataset.expected_reward) == 2
    assert np.all(0 <= dataset.expected_reward) and np.all(dataset.expected_reward <= 1)
    # one-hot action_context when None is given
    ohe = np.eye(2, dtype=int)
    assert np.allclose(dataset.action_context, ohe)
def test_synthetic_init():
    # n_actions
    with pytest.raises(ValueError):
        SyntheticBanditDataset(n_actions=1)
    with pytest.raises(ValueError):
        SyntheticBanditDataset(n_actions="3")
    # dim_context
    with pytest.raises(ValueError):
        SyntheticBanditDataset(n_actions=2, dim_context=0)
    with pytest.raises(ValueError):
        SyntheticBanditDataset(n_actions=2, dim_context="2")
    # reward_type
    with pytest.raises(ValueError):
        SyntheticBanditDataset(n_actions=2, reward_type="aaa")
    # when reward_function is None, expected_reward is randomly sampled in [0, 1]
    # this check includes the test of `sample_contextfree_expected_reward` function
    dataset = SyntheticBanditDataset(n_actions=2)
    assert len(dataset.expected_reward) == 2
    assert np.all(0 <= dataset.expected_reward) and np.all(dataset.expected_reward <= 1)
    # when behavior_policy_function is None, behavior_policy is set to the uniform one
    uniform_policy = np.array([0.5, 0.5])
    assert np.allclose(dataset.behavior_policy, uniform_policy)
    # action_context
    ohe = np.eye(2, dtype=int)
    assert np.allclose(dataset.action_context, ohe)
def test_synthetic_init_using_invalid_inputs(
    n_actions,
    dim_context,
    reward_type,
    reward_std,
    beta,
    n_deficient_actions,
    action_context,
    random_state,
    err,
    description,
):
    with pytest.raises(err, match=f"{description}*"):
        _ = SyntheticBanditDataset(
            n_actions=n_actions,
            dim_context=dim_context,
            reward_type=reward_type,
            reward_std=reward_std,
            beta=beta,
            n_deficient_actions=n_deficient_actions,
            action_context=action_context,
            random_state=random_state,
        )
def test_synthetic_obtain_batch_bandit_feedback():
    # n_rounds
    with pytest.raises(ValueError):
        dataset = SyntheticBanditDataset(n_actions=2)
        dataset.obtain_batch_bandit_feedback(n_rounds=0)
    with pytest.raises(ValueError):
        dataset = SyntheticBanditDataset(n_actions=2)
        dataset.obtain_batch_bandit_feedback(n_rounds="3")
    # bandit feedback
    n_rounds = 10
    n_actions = 5
    dataset = SyntheticBanditDataset(n_actions=n_actions)
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    assert bandit_feedback["n_rounds"] == n_rounds
    assert bandit_feedback["n_actions"] == n_actions
    assert (
        bandit_feedback["context"].shape[0] == n_rounds  # n_rounds
        and bandit_feedback["context"].shape[1] == 1  # default dim_context
    )
    assert (
        bandit_feedback["action_context"].shape[0] == n_actions
        and bandit_feedback["action_context"].shape[1] == n_actions
    )
    assert (
        bandit_feedback["action"].ndim == 1
        and len(bandit_feedback["action"]) == n_rounds
    )
    assert (
        bandit_feedback["position"].ndim == 1
        and len(bandit_feedback["position"]) == n_rounds
    )
    assert (
        bandit_feedback["reward"].ndim == 1
        and len(bandit_feedback["reward"]) == n_rounds
    )
    assert (
        bandit_feedback["expected_reward"].shape[0] == n_rounds
        and bandit_feedback["expected_reward"].shape[1] == n_actions
    )
    assert (
        bandit_feedback["pscore"].ndim == 1
        and len(bandit_feedback["pscore"]) == n_rounds
    )
# configurations
n_runs = args.n_runs
n_rounds = args.n_rounds
n_actions = args.n_actions
dim_context = args.dim_context
base_model_for_evaluation_policy = args.base_model_for_evaluation_policy
base_model_for_reg_model = args.base_model_for_reg_model
n_jobs = args.n_jobs
random_state = args.random_state
np.random.seed(random_state)

# synthetic data generator
dataset = SyntheticBanditDataset(
    n_actions=n_actions,
    dim_context=dim_context,
    reward_function=logistic_reward_function,
    behavior_policy_function=linear_behavior_policy,
    random_state=random_state,
)
# define evaluation policy using IPWLearner
evaluation_policy = IPWLearner(
    n_actions=dataset.n_actions,
    len_list=dataset.len_list,
    base_classifier=base_model_dict[base_model_for_evaluation_policy](
        **hyperparams[base_model_for_evaluation_policy]
    ),
)

def process(i: int):
    # sample new training and test sets of synthetic logged bandit feedback
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
args = parser.parse_args()
print(args)

n_runs = args.n_runs
n_rounds = args.n_rounds
n_actions = args.n_actions
dim_context = args.dim_context
dim_action_context = args.dim_action_context
counterfactual_policy = args.counterfactual_policy
random_state = args.random_state
np.random.seed(random_state)

dataset = SyntheticBanditDataset(
    n_actions=n_actions,
    dim_context=dim_context,
    dim_action_context=dim_action_context,
    reward_function=logistic_reward_function,
    behavior_policy_function=linear_behavior_policy,
    random_state=random_state,
)
# hyperparameters for counterfactual policies
kwargs = dict(
    n_actions=dataset.n_actions,
    len_list=dataset.len_list,
    random_state=random_state,
)
if ("logistic" in counterfactual_policy) or ("linear" in counterfactual_policy):
    kwargs["dim"] = dim_context
if counterfactual_policy in [
    "linear_ucb",
def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )
    # regression model used to estimate the mean reward function with an ML model
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        action_context=dataset.action_context,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        ),
    )
    ope_estimator = DoublyRobust()
    # define evaluation policy using NNPolicyLearner
    nn_policy = NNPolicyLearner(
        n_actions=dataset.n_actions,
        dim_context=dim_context,
        off_policy_objective=ope_estimator.estimate_policy_value_tensor,
    )
    # baseline method 1. RandomPolicy
    random_policy = RandomPolicy(n_actions=dataset.n_actions)
    # baseline method 2. UniformSampleWeightLearner
    uniform_sample_weight_policy = UniformSampleWeightLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # sample new training and test sets of synthetic logged bandit feedback
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # estimate the mean reward function of the train set of synthetic bandit feedback
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        n_folds=3,  # 3-fold cross-fitting
        random_state=12345,
    )
    # train the evaluation policy on the training set of the synthetic logged bandit feedback
    nn_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    uniform_sample_weight_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # predict the action decisions for the test set of the synthetic logged bandit feedback
    nn_policy_action_dist = nn_policy.predict(
        context=bandit_feedback_test["context"],
    )
    random_action_dist = random_policy.predict(
        context=bandit_feedback_test["context"],
    )
    uniform_sample_weight_action_dist = uniform_sample_weight_policy.predict(
        context=bandit_feedback_test["context"],
    )
    # get the ground-truth policy value for each learner
    gt_nn_policy_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=nn_policy_action_dist,
    )
    gt_random_policy = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=random_action_dist,
    )
    gt_uniform_sample_weight_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=uniform_sample_weight_action_dist,
    )
    return gt_nn_policy_learner, gt_random_policy, gt_uniform_sample_weight_learner
def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )
    # sample new training and test sets of synthetic logged bandit data
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # defining policy learners
    ipw_policy = IPWLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    q_policy = QLearner(
        n_actions=dataset.n_actions,
        base_model=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    nn_policy = NNPolicyLearner(
        n_actions=dataset.n_actions,
        dim_context=dim_context,
        off_policy_objective="ipw",
    )
    # baseline method 1. RandomPolicy
    random_policy = RandomPolicy(n_actions=dataset.n_actions)
    # baseline method 2. UniformSampleWeightLearner
    uniform_sample_weight_policy = UniformSampleWeightLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # policy training
    ipw_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    q_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    nn_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    uniform_sample_weight_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # prediction/making decisions
    ipw_action_dist = ipw_policy.predict(
        context=bandit_feedback_test["context"],
    )
    q_action_dist = q_policy.predict(
        context=bandit_feedback_test["context"],
    )
    nn_action_dist = nn_policy.predict(
        context=bandit_feedback_test["context"],
    )
    random_action_dist = random_policy.predict(
        context=bandit_feedback_test["context"],
    )
    uniform_sample_weight_action_dist = uniform_sample_weight_policy.predict(
        context=bandit_feedback_test["context"],
    )
    # evaluation
    gt_ipw_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=ipw_action_dist,
    )
    gt_q_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=q_action_dist,
    )
    gt_nn_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=nn_action_dist,
    )
    gt_random_policy = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=random_action_dist,
    )
    gt_uniform_sample_weight_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=uniform_sample_weight_action_dist,
    )
    return (
        gt_ipw_learner,
        gt_q_learner,
        gt_nn_learner,
        gt_random_policy,
        gt_uniform_sample_weight_learner,
    )
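# A sketch (assuming pandas; the column names are illustrative) of summarizing
# the per-run ground-truth policy values, where `processed` is the list of
# 5-tuples returned by `process` over `n_runs` runs (e.g., collected via joblib
# as sketched earlier).
import pandas as pd

df = pd.DataFrame(
    processed,
    columns=["ipw_learner", "q_learner", "nn_learner", "random", "uniform_weight"],
)
print(df.agg(["mean", "std"]))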
def test_synthetic_obtain_batch_bandit_feedback():
    # n_rounds
    with pytest.raises(ValueError):
        dataset = SyntheticBanditDataset(n_actions=2)
        dataset.obtain_batch_bandit_feedback(n_rounds=0)
    with pytest.raises(TypeError):
        dataset = SyntheticBanditDataset(n_actions=2)
        dataset.obtain_batch_bandit_feedback(n_rounds="3")
    # bandit feedback
    n_rounds = 10
    n_actions = 5
    for n_deficient_actions in [0, 2]:
        dataset = SyntheticBanditDataset(
            n_actions=n_actions, beta=0, n_deficient_actions=n_deficient_actions
        )
        bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
        assert bandit_feedback["n_rounds"] == n_rounds
        assert bandit_feedback["n_actions"] == n_actions
        assert (
            bandit_feedback["context"].shape[0] == n_rounds  # n_rounds
            and bandit_feedback["context"].shape[1] == 1  # default dim_context
        )
        assert (
            bandit_feedback["action_context"].shape[0] == n_actions
            and bandit_feedback["action_context"].shape[1] == n_actions
        )
        assert (
            bandit_feedback["action"].ndim == 1
            and len(bandit_feedback["action"]) == n_rounds
        )
        assert bandit_feedback["position"] is None
        assert (
            bandit_feedback["reward"].ndim == 1
            and len(bandit_feedback["reward"]) == n_rounds
        )
        assert (
            bandit_feedback["expected_reward"].shape[0] == n_rounds
            and bandit_feedback["expected_reward"].shape[1] == n_actions
        )
        assert (
            bandit_feedback["pi_b"].shape[0] == n_rounds
            and bandit_feedback["pi_b"].shape[1] == n_actions
        )
        # when `beta=0`, behavior_policy should be uniform
        if n_deficient_actions == 0:
            uniform_policy = np.ones_like(bandit_feedback["pi_b"]) / n_actions
            assert np.allclose(bandit_feedback["pi_b"], uniform_policy)
        assert np.allclose(bandit_feedback["pi_b"][:, :, 0].sum(1), np.ones(n_rounds))
        assert (
            bandit_feedback["pi_b"] == 0
        ).sum() == n_deficient_actions * n_rounds
        assert (
            bandit_feedback["pscore"].ndim == 1
            and len(bandit_feedback["pscore"]) == n_rounds
        )
def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )
    # define evaluation policy using IPWLearner
    ipw_policy = IPWLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # baseline method 1. RandomPolicy
    random_policy = RandomPolicy(n_actions=dataset.n_actions)
    # baseline method 2. UniformSampleWeightLearner
    uniform_sample_weight_policy = UniformSampleWeightLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # sample new training and test sets of synthetic logged bandit feedback
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # train the evaluation policy on the training set of the synthetic logged bandit feedback
    ipw_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    uniform_sample_weight_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # predict the action decisions for the test set of the synthetic logged bandit feedback
    ipw_action_dist = ipw_policy.predict(
        context=bandit_feedback_test["context"],
    )
    random_action_dist = random_policy.predict(
        context=bandit_feedback_test["context"],
    )
    uniform_sample_weight_action_dist = uniform_sample_weight_policy.predict(
        context=bandit_feedback_test["context"],
    )
    # get the ground-truth policy value for each learner
    gt_ipw_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=ipw_action_dist,
    )
    gt_random_policy = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=random_action_dist,
    )
    gt_uniform_sample_weight_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=uniform_sample_weight_action_dist,
    )
    return gt_ipw_learner, gt_random_policy, gt_uniform_sample_weight_learner
# configurations
n_runs = args.n_runs
n_rounds = args.n_rounds
n_actions = args.n_actions
dim_context = args.dim_context
n_sim = args.n_sim
evaluation_policy_name = args.evaluation_policy_name
n_jobs = args.n_jobs
random_state = args.random_state
np.random.seed(random_state)

# synthetic data generator with uniformly random policy
dataset = SyntheticBanditDataset(
    n_actions=n_actions,
    dim_context=dim_context,
    reward_function=logistic_reward_function,
    behavior_policy_function=None,  # uniformly random
    random_state=random_state,
)
# define evaluation policy
evaluation_policy_dict = dict(
    bernoulli_ts=BernoulliTS(n_actions=n_actions, random_state=random_state),
    epsilon_greedy=EpsilonGreedy(
        n_actions=n_actions, epsilon=0.1, random_state=random_state
    ),
    lin_epsilon_greedy=LinEpsilonGreedy(
        dim=dim_context, n_actions=n_actions, epsilon=0.1, random_state=random_state
    ),
    lin_ts=LinTS(
        dim=dim_context,
base_model_for_evaluation_policy = args.base_model_for_evaluation_policy
base_model_for_reg_model = args.base_model_for_reg_model
ope_estimator = args.ope_estimator
n_hidden = args.n_hidden
n_layers = args.n_layers
activation = args.activation
solver = args.solver
batch_size = args.batch_size if args.batch_size else "auto"
early_stopping = args.early_stopping
random_state = args.random_state

# synthetic data generator
dataset = SyntheticBanditDataset(
    n_actions=n_actions,
    dim_context=dim_context,
    reward_function=logistic_reward_function,
    behavior_policy_function=linear_behavior_policy,
    random_state=random_state,
)
# sample new training and test sets of synthetic logged bandit feedback
bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
# estimate the mean reward function of the train set of synthetic bandit feedback with ML model
regression_model = RegressionModel(
    n_actions=dataset.n_actions,
    action_context=dataset.action_context,
    base_model=base_model_dict[base_model_for_reg_model](
        **hyperparams[base_model_for_reg_model]
    ),
)