Example #1
    def process(i: int):
        # synthetic data generator
        dataset = SyntheticBanditDataset(
            n_actions=n_actions,
            dim_context=dim_context,
            reward_function=logistic_reward_function,
            behavior_policy_function=linear_behavior_policy,
            random_state=i,
        )
        # define evaluation policy using IPWLearner
        evaluation_policy = IPWLearner(
            n_actions=dataset.n_actions,
            base_classifier=base_model_dict[base_model_for_evaluation_policy](
                **hyperparams[base_model_for_evaluation_policy]
            ),
        )
        # sample new training and test sets of synthetic logged bandit feedback
        bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
        bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
        # train the evaluation policy on the training set of the synthetic logged bandit feedback
        evaluation_policy.fit(
            context=bandit_feedback_train["context"],
            action=bandit_feedback_train["action"],
            reward=bandit_feedback_train["reward"],
            pscore=bandit_feedback_train["pscore"],
        )
        # predict the action decisions for the test set of the synthetic logged bandit feedback
        action_dist = evaluation_policy.predict(
            context=bandit_feedback_test["context"],
        )
        # estimate the mean reward function of the test set of synthetic bandit feedback with ML model
        regression_model = RegressionModel(
            n_actions=dataset.n_actions,
            action_context=dataset.action_context,
            base_model=base_model_dict[base_model_for_reg_model](
                **hyperparams[base_model_for_reg_model]
            ),
        )
        estimated_rewards_by_reg_model = regression_model.fit_predict(
            context=bandit_feedback_test["context"],
            action=bandit_feedback_test["action"],
            reward=bandit_feedback_test["reward"],
            n_folds=3,  # 3-fold cross-fitting
            random_state=random_state,
        )
        # evaluate estimators' performances using relative estimation error (relative-ee)
        ope = OffPolicyEvaluation(
            bandit_feedback=bandit_feedback_test,
            ope_estimators=ope_estimators,
        )
        relative_ee_i = ope.evaluate_performance_of_estimators(
            ground_truth_policy_value=dataset.calc_ground_truth_policy_value(
                expected_reward=bandit_feedback_test["expected_reward"],
                action_dist=action_dist,
            ),
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )

        return relative_ee_i
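Example #1 references names defined elsewhere in the original script: n_actions, dim_context, n_rounds, random_state, base_model_dict, hyperparams, base_model_for_evaluation_policy, base_model_for_reg_model, and ope_estimators, plus the obp and scikit-learn imports. A minimal sketch of the setup it assumes; the concrete values, the logistic-regression base model, and the estimator list are illustrative stand-ins rather than the original configuration:

from sklearn.linear_model import LogisticRegression
from obp.dataset import (
    SyntheticBanditDataset,
    logistic_reward_function,
    linear_behavior_policy,
)
from obp.policy import IPWLearner
from obp.ope import (
    OffPolicyEvaluation,
    RegressionModel,
    InverseProbabilityWeighting,
    DirectMethod,
    DoublyRobust,
)

# experiment settings (illustrative values)
n_actions = 10
dim_context = 5
n_rounds = 10000
random_state = 12345

# candidate base ML models and their hyperparameters (illustrative)
base_model_dict = {"logistic_regression": LogisticRegression}
hyperparams = {"logistic_regression": {"C": 100, "max_iter": 10000, "random_state": 12345}}
base_model_for_evaluation_policy = "logistic_regression"
base_model_for_reg_model = "logistic_regression"

# OPE estimators whose accuracy is evaluated via relative-ee
ope_estimators = [InverseProbabilityWeighting(), DirectMethod(), DoublyRobust()]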
Example #2
def test_synthetic_calc_policy_value_using_invalid_inputs(
    expected_reward,
    action_dist,
    description,
):
    n_actions = 10
    dataset = SyntheticBanditDataset(n_actions=n_actions)

    with pytest.raises(ValueError, match=f"{description}*"):
        _ = dataset.calc_ground_truth_policy_value(
            expected_reward=expected_reward, action_dist=action_dist)
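The parameters expected_reward, action_dist, and description of the test above are presumably supplied by a @pytest.mark.parametrize decorator that is not shown in this excerpt. A hedged sketch of one invalid case such a parametrization might contain, assuming calc_ground_truth_policy_value validates that expected_reward is a 2D array of shape (n_rounds, n_actions) and raises ValueError otherwise:

import numpy as np
import pytest
from obp.dataset import SyntheticBanditDataset

dataset = SyntheticBanditDataset(n_actions=10)
expected_reward = np.ones(10)            # 1D array instead of (n_rounds, n_actions)
action_dist = np.ones((10, 10, 1)) / 10  # (n_rounds, n_actions, len_list)

with pytest.raises(ValueError):
    dataset.calc_ground_truth_policy_value(
        expected_reward=expected_reward, action_dist=action_dist
    )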
Example #3
def test_synthetic_calc_policy_value_using_valid_inputs(
    expected_reward,
    action_dist,
    description,
):
    n_actions = 10
    dataset = SyntheticBanditDataset(n_actions=n_actions)

    policy_value = dataset.calc_ground_truth_policy_value(
        expected_reward=expected_reward, action_dist=action_dist)
    assert isinstance(
        policy_value,
        float), "Invalid response of calc_ground_truth_policy_value"
Example #4
    def process(i: int):
        # synthetic data generator
        dataset = SyntheticBanditDataset(
            n_actions=n_actions,
            dim_context=dim_context,
            reward_function=logistic_reward_function,
            behavior_policy_function=linear_behavior_policy,
            random_state=i,
        )
        # define evaluation policy using IPWLearner
        ipw_policy = IPWLearner(
            n_actions=dataset.n_actions,
            base_classifier=base_model_dict[base_model_for_evaluation_policy](
                **hyperparams[base_model_for_evaluation_policy]),
        )
        # baseline method 1. RandomPolicy
        random_policy = RandomPolicy(n_actions=dataset.n_actions)
        # baseline method 2. UniformSampleWeightLearner
        uniform_sample_weight_policy = UniformSampleWeightLearner(
            n_actions=dataset.n_actions,
            base_classifier=base_model_dict[base_model_for_evaluation_policy](
                **hyperparams[base_model_for_evaluation_policy]),
        )
        # sample new training and test sets of synthetic logged bandit feedback
        bandit_feedback_train = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        bandit_feedback_test = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        # train the evaluation policy on the training set of the synthetic logged bandit feedback
        ipw_policy.fit(
            context=bandit_feedback_train["context"],
            action=bandit_feedback_train["action"],
            reward=bandit_feedback_train["reward"],
            pscore=bandit_feedback_train["pscore"],
        )
        uniform_sample_weight_policy.fit(
            context=bandit_feedback_train["context"],
            action=bandit_feedback_train["action"],
            reward=bandit_feedback_train["reward"],
            pscore=bandit_feedback_train["pscore"],
        )
        # predict the action decisions for the test set of the synthetic logged bandit feedback
        ipw_action_dist = ipw_policy.predict(
            context=bandit_feedback_test["context"], )
        random_action_dist = random_policy.predict(
            context=bandit_feedback_test["context"], )
        uniform_sample_weight_action_dist = uniform_sample_weight_policy.predict(
            context=bandit_feedback_test["context"], )
        # get the ground truth policy value for each learner
        gt_ipw_learner = dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=ipw_action_dist,
        )
        gt_random_policy = dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=random_action_dist,
        )
        gt_uniform_sample_weight_learner = dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=uniform_sample_weight_action_dist,
        )

        return gt_ipw_learner, gt_random_policy, gt_uniform_sample_weight_learner
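In the original experiment scripts, a process(i) function like the one above is typically run over many random seeds and the resulting ground-truth values are averaged. A minimal sketch of such a driver, assuming joblib is available; n_runs and n_jobs are hypothetical settings:

import numpy as np
from joblib import Parallel, delayed

n_runs, n_jobs = 10, -1  # hypothetical settings
processed = Parallel(n_jobs=n_jobs, verbose=10)(
    delayed(process)(i) for i in np.arange(n_runs)
)
# rows: runs; columns: (IPWLearner, RandomPolicy, UniformSampleWeightLearner)
mean_gt_values = np.array(processed).mean(axis=0)
print(mean_gt_values)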
Example #5
    def process(i: int):
        # synthetic data generator
        dataset = SyntheticBanditDataset(
            n_actions=n_actions,
            dim_context=dim_context,
            reward_function=logistic_reward_function,
            behavior_policy_function=linear_behavior_policy,
            random_state=i,
        )
        # estimate the mean reward function of the train set of synthetic bandit feedback with ML model
        regression_model = RegressionModel(
            n_actions=dataset.n_actions,
            action_context=dataset.action_context,
            base_model=base_model_dict[base_model_for_reg_model](
                **hyperparams[base_model_for_reg_model]),
        )
        ope_estimator = DoublyRobust()
        # define evaluation policy using NNPolicyLearner
        nn_policy = NNPolicyLearner(
            n_actions=dataset.n_actions,
            dim_context=dim_context,
            off_policy_objective=ope_estimator.estimate_policy_value_tensor,
        )
        # baseline method 1. RandomPolicy
        random_policy = RandomPolicy(n_actions=dataset.n_actions)
        # baseline method 2. UniformSampleWeightLearner
        uniform_sample_weight_policy = UniformSampleWeightLearner(
            n_actions=dataset.n_actions,
            base_classifier=base_model_dict[base_model_for_evaluation_policy](
                **hyperparams[base_model_for_evaluation_policy]),
        )
        # sample new training and test sets of synthetic logged bandit feedback
        bandit_feedback_train = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        bandit_feedback_test = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        # estimate the mean reward function of the train set of synthetic bandit feedback with ML model
        estimated_rewards_by_reg_model = regression_model.fit_predict(
            context=bandit_feedback_train["context"],
            action=bandit_feedback_train["action"],
            reward=bandit_feedback_train["reward"],
            n_folds=3,  # 3-fold cross-fitting
            random_state=12345,
        )
        # train the evaluation policy on the training set of the synthetic logged bandit feedback
        nn_policy.fit(
            context=bandit_feedback_train["context"],
            action=bandit_feedback_train["action"],
            reward=bandit_feedback_train["reward"],
            pscore=bandit_feedback_train["pscore"],
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
        uniform_sample_weight_policy.fit(
            context=bandit_feedback_train["context"],
            action=bandit_feedback_train["action"],
            reward=bandit_feedback_train["reward"],
            pscore=bandit_feedback_train["pscore"],
        )
        # predict the action decisions for the test set of the synthetic logged bandit feedback
        nn_policy_action_dist = nn_policy.predict(
            context=bandit_feedback_test["context"], )
        random_action_dist = random_policy.predict(
            context=bandit_feedback_test["context"], )
        uniform_sample_weight_action_dist = uniform_sample_weight_policy.predict(
            context=bandit_feedback_test["context"], )
        # get the ground truth policy value for each learner
        gt_nn_policy_learner = dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=nn_policy_action_dist,
        )
        gt_random_policy = dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=random_action_dist,
        )
        gt_uniform_sample_weight_learner = dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=uniform_sample_weight_action_dist,
        )

        return gt_nn_policy_learner, gt_random_policy, gt_uniform_sample_weight_learner
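The per-run tuples returned by process can then be tabulated to compare the learners. A sketch of one way to do that; processed and n_runs are hypothetical names, not part of the original snippet:

import numpy as np
from pandas import DataFrame

n_runs = 10  # hypothetical number of simulation runs
processed = [process(i) for i in range(n_runs)]

gt_policy_value_df = DataFrame(
    np.array(processed).mean(axis=0),
    index=["nn_policy_learner", "random_policy", "uniform_sample_weight_learner"],
    columns=["mean_ground_truth_policy_value"],
)
print(gt_policy_value_df)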
Example #6
    # train NNPolicyLearner on the training set of the synthetic logged bandit feedback
    nn_policy_learner.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    # predict the action decisions for the test set of the synthetic logged bandit feedback
    random_action_dist = random_policy.compute_batch_action_dist(
        n_rounds=n_rounds)
    ipw_learner_action_dist = ipw_learner.predict(
        context=bandit_feedback_test["context"], )
    nn_policy_learner_action_dist = nn_policy_learner.predict_proba(
        context=bandit_feedback_test["context"], )

    # evaluate learners' performances using ground-truth policy values
    random_policy_value = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=random_action_dist,
    )
    ipw_learner_policy_value = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=ipw_learner_action_dist,
    )
    nn_policy_learner_policy_value = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=nn_policy_learner_action_dist,
    )

    policy_value_df = DataFrame(
        [
            [random_policy_value],
            [ipw_learner_policy_value],
            [nn_policy_learner_policy_value],
        ]
    )
Example #7
    def process(i: int):
        # synthetic data generator
        dataset = SyntheticBanditDataset(
            n_actions=n_actions,
            dim_context=dim_context,
            reward_function=logistic_reward_function,
            behavior_policy_function=linear_behavior_policy,
            random_state=i,
        )
        # sample new training and test sets of synthetic logged bandit data
        bandit_feedback_train = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        bandit_feedback_test = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)

        # defining policy learners
        ipw_policy = IPWLearner(
            n_actions=dataset.n_actions,
            base_classifier=base_model_dict[base_model_for_evaluation_policy](
                **hyperparams[base_model_for_evaluation_policy]),
        )
        q_policy = QLearner(
            n_actions=dataset.n_actions,
            base_model=base_model_dict[base_model_for_evaluation_policy](
                **hyperparams[base_model_for_evaluation_policy]),
        )
        nn_policy = NNPolicyLearner(
            n_actions=dataset.n_actions,
            dim_context=dim_context,
            off_policy_objective="ipw",
        )
        # baseline method 1. RandomPolicy
        random_policy = RandomPolicy(n_actions=dataset.n_actions)
        # baseline method 2. UniformSampleWeightLearner
        uniform_sample_weight_policy = UniformSampleWeightLearner(
            n_actions=dataset.n_actions,
            base_classifier=base_model_dict[base_model_for_evaluation_policy](
                **hyperparams[base_model_for_evaluation_policy]),
        )

        # policy training
        ipw_policy.fit(
            context=bandit_feedback_train["context"],
            action=bandit_feedback_train["action"],
            reward=bandit_feedback_train["reward"],
            pscore=bandit_feedback_train["pscore"],
        )
        q_policy.fit(
            context=bandit_feedback_train["context"],
            action=bandit_feedback_train["action"],
            reward=bandit_feedback_train["reward"],
            pscore=bandit_feedback_train["pscore"],
        )
        nn_policy.fit(
            context=bandit_feedback_train["context"],
            action=bandit_feedback_train["action"],
            reward=bandit_feedback_train["reward"],
            pscore=bandit_feedback_train["pscore"],
        )
        uniform_sample_weight_policy.fit(
            context=bandit_feedback_train["context"],
            action=bandit_feedback_train["action"],
            reward=bandit_feedback_train["reward"],
            pscore=bandit_feedback_train["pscore"],
        )

        # prediction/making decisions
        ipw_action_dist = ipw_policy.predict(
            context=bandit_feedback_test["context"], )
        q_action_dist = q_policy.predict(
            context=bandit_feedback_test["context"], )
        nn_action_dist = nn_policy.predict(
            context=bandit_feedback_test["context"], )
        random_action_dist = random_policy.predict(
            context=bandit_feedback_test["context"], )
        uniform_sample_weight_action_dist = uniform_sample_weight_policy.predict(
            context=bandit_feedback_test["context"], )

        # evaluation
        gt_ipw_learner = dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=ipw_action_dist,
        )
        gt_q_learner = dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=q_action_dist,
        )
        gt_nn_learner = dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=nn_action_dist,
        )
        gt_random_policy = dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=random_action_dist,
        )
        gt_uniform_sample_weight_learner = dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=uniform_sample_weight_action_dist,
        )

        return (
            gt_ipw_learner,
            gt_q_learner,
            gt_nn_learner,
            gt_random_policy,
            gt_uniform_sample_weight_learner,
        )