Exemplo n.º 1
0
    def process(i: int):
        """Run one simulation trial and score the OPE estimators.

        A synthetic dataset seeded with ``i`` is created, an IPWLearner is
        trained on one sampled log, and the estimators in ``ope_estimators``
        are evaluated on a second sampled log.

        Experiment settings (``n_actions``, ``dim_context``, ``n_rounds``,
        ``hyperparams``, ``random_state``, ...) are read from the enclosing
        scope.

        Parameters
        ----------
        i: int
            Trial index; used as ``random_state`` of the synthetic dataset.

        Returns
        -------
        The value returned by
        ``OffPolicyEvaluation.evaluate_performance_of_estimators``
        (the relative estimation error of the estimators).
        """
        # synthetic data generator, seeded by the trial index
        dataset = SyntheticBanditDataset(
            n_actions=n_actions,
            dim_context=dim_context,
            reward_function=logistic_reward_function,
            behavior_policy_function=linear_behavior_policy,
            random_state=i,
        )

        # evaluation policy: IPWLearner wrapping the configured classifier
        policy_classifier = base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        )
        evaluation_policy = IPWLearner(
            n_actions=dataset.n_actions,
            base_classifier=policy_classifier,
        )

        # two independent draws of logged bandit feedback: train, then test
        train_log = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
        test_log = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)

        # fit the evaluation policy on the training log
        evaluation_policy.fit(
            **{
                key: train_log[key]
                for key in ("context", "action", "reward", "pscore")
            }
        )
        # action choices of the trained policy on the test log
        action_dist = evaluation_policy.predict(context=test_log["context"])

        # regression model of the mean reward, cross-fitted on the test log
        reward_regressor = base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        )
        regression_model = RegressionModel(
            n_actions=dataset.n_actions,
            action_context=dataset.action_context,
            base_model=reward_regressor,
        )
        estimated_rewards_by_reg_model = regression_model.fit_predict(
            n_folds=3,  # 3-fold cross-fitting
            random_state=random_state,
            **{key: test_log[key] for key in ("context", "action", "reward")},
        )

        # compare every estimator against the ground-truth policy value
        ope = OffPolicyEvaluation(
            bandit_feedback=test_log,
            ope_estimators=ope_estimators,
        )
        ground_truth = dataset.calc_ground_truth_policy_value(
            expected_reward=test_log["expected_reward"],
            action_dist=action_dist,
        )
        return ope.evaluate_performance_of_estimators(
            ground_truth_policy_value=ground_truth,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
Exemplo n.º 2
0
def test_synthetic_obtain_batch_bandit_feedback():
    """Check input validation and output layout of `obtain_batch_bandit_feedback`."""
    # invalid `n_rounds`: the dataset is built outside `pytest.raises` so that
    # an exception raised by the constructor cannot satisfy the expectation
    dataset = SyntheticBanditDataset(n_actions=2)
    with pytest.raises(ValueError):
        dataset.obtain_batch_bandit_feedback(n_rounds=0)

    dataset = SyntheticBanditDataset(n_actions=2)
    with pytest.raises(ValueError):
        dataset.obtain_batch_bandit_feedback(n_rounds="3")

    # bandit feedback
    n_rounds = 10
    n_actions = 5
    dataset = SyntheticBanditDataset(n_actions=n_actions)
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    assert bandit_feedback["n_rounds"] == n_rounds
    assert bandit_feedback["n_actions"] == n_actions

    # one condition per assert, so a failure names the exact check that broke
    assert bandit_feedback["context"].shape[0] == n_rounds  # n_rounds
    assert bandit_feedback["context"].shape[1] == 1  # default dim_context
    assert bandit_feedback["action_context"].shape[0] == n_actions
    assert bandit_feedback["action_context"].shape[1] == n_actions
    assert bandit_feedback["action"].ndim == 1
    assert len(bandit_feedback["action"]) == n_rounds
    assert bandit_feedback["position"].ndim == 1
    assert len(bandit_feedback["position"]) == n_rounds
    assert bandit_feedback["reward"].ndim == 1
    assert len(bandit_feedback["reward"]) == n_rounds
    assert bandit_feedback["expected_reward"].shape[0] == n_rounds
    assert bandit_feedback["expected_reward"].shape[1] == n_actions
    assert bandit_feedback["pscore"].ndim == 1
    assert len(bandit_feedback["pscore"]) == n_rounds
Exemplo n.º 3
0
    def process(i: int):
        """Run one simulation trial with a uniformly random behavior policy.

        Experiment settings (``n_actions``, ``dim_context``, ``n_rounds``,
        ``n_sim``, ``evaluation_policy``, ``ope_estimators``) are read from
        the enclosing scope.

        Parameters
        ----------
        i: int
            Trial index; used as ``random_state`` of the synthetic dataset.

        Returns
        -------
        The value returned by
        ``OffPolicyEvaluation.evaluate_performance_of_estimators``.
        """
        # `behavior_policy_function=None` selects the uniformly random policy
        dataset = SyntheticBanditDataset(
            n_actions=n_actions,
            dim_context=dim_context,
            reward_function=logistic_reward_function,
            behavior_policy_function=None,
            random_state=i,
        )
        # draw a fresh log of synthetic bandit feedback
        logged_data = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)

        # replay the evaluation policy on the log
        action_dist = run_bandit_simulation(
            bandit_feedback=logged_data,
            policy=evaluation_policy,
        )

        # Monte-Carlo estimate of the ground-truth policy value, drawing
        # rewards from p(r|x,a) for `n_sim` simulations
        true_value = calc_ground_truth_policy_value(
            bandit_feedback=logged_data,
            reward_sampler=dataset.sample_reward,
            policy=evaluation_policy,
            n_sim=n_sim,
        )

        # score the estimators against that ground truth
        ope = OffPolicyEvaluation(
            bandit_feedback=logged_data,
            ope_estimators=ope_estimators,
        )
        return ope.evaluate_performance_of_estimators(
            ground_truth_policy_value=true_value,
            action_dist=action_dist,
        )
Exemplo n.º 4
0
def test_synthetic_obtain_batch_bandit_feedback():
    """Check input validation and output layout of `obtain_batch_bandit_feedback`.

    The layout checks run once without deficient actions and once with two.
    """
    # invalid `n_rounds`: the dataset is built outside `pytest.raises` so that
    # an exception raised by the constructor cannot satisfy the expectation
    dataset = SyntheticBanditDataset(n_actions=2)
    with pytest.raises(ValueError):
        dataset.obtain_batch_bandit_feedback(n_rounds=0)

    dataset = SyntheticBanditDataset(n_actions=2)
    with pytest.raises(TypeError):
        dataset.obtain_batch_bandit_feedback(n_rounds="3")

    # bandit feedback
    n_rounds = 10
    n_actions = 5
    for n_deficient_actions in [0, 2]:
        dataset = SyntheticBanditDataset(
            n_actions=n_actions,
            beta=0,
            n_deficient_actions=n_deficient_actions,
        )
        bandit_feedback = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        assert bandit_feedback["n_rounds"] == n_rounds
        assert bandit_feedback["n_actions"] == n_actions

        # one condition per assert, so a failure names the exact check
        assert bandit_feedback["context"].shape[0] == n_rounds  # n_rounds
        assert bandit_feedback["context"].shape[1] == 1  # default dim_context
        assert bandit_feedback["action_context"].shape[0] == n_actions
        assert bandit_feedback["action_context"].shape[1] == n_actions
        assert bandit_feedback["action"].ndim == 1
        assert len(bandit_feedback["action"]) == n_rounds
        assert bandit_feedback["position"] is None
        assert bandit_feedback["reward"].ndim == 1
        assert len(bandit_feedback["reward"]) == n_rounds
        assert bandit_feedback["expected_reward"].shape[0] == n_rounds
        assert bandit_feedback["expected_reward"].shape[1] == n_actions
        assert bandit_feedback["pi_b"].shape[0] == n_rounds
        assert bandit_feedback["pi_b"].shape[1] == n_actions

        # when `beta=0`, behavior_policy should be uniform
        if n_deficient_actions == 0:
            uniform_policy = np.ones_like(bandit_feedback["pi_b"]) / n_actions
            assert np.allclose(bandit_feedback["pi_b"], uniform_policy)
        # action probabilities of every round sum to one
        assert np.allclose(bandit_feedback["pi_b"][:, :, 0].sum(1),
                           np.ones(n_rounds))
        # exactly `n_deficient_actions` zero-probability entries per round
        assert (bandit_feedback["pi_b"] == 0
                ).sum() == n_deficient_actions * n_rounds
        assert bandit_feedback["pscore"].ndim == 1
        assert len(bandit_feedback["pscore"]) == n_rounds
Exemplo n.º 5
0
def synthetic_bandit_feedback() -> BanditFeedback:
    """Return a fixed-seed batch of synthetic logged bandit feedback.

    The dataset has 10 actions and 5-dimensional contexts, uses
    ``logistic_reward_function`` and ``linear_behavior_policy``, is seeded
    with 12345, and 10000 rounds are sampled from it.
    """
    generator = SyntheticBanditDataset(
        n_actions=10,
        dim_context=5,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=12345,
    )
    return generator.obtain_batch_bandit_feedback(n_rounds=10000)
    def process(i: int):
        """Run one trial comparing IPWLearner against two baseline policies.

        Experiment settings (``n_actions``, ``dim_context``, ``n_rounds``,
        ``base_model_dict``, ``hyperparams``, ...) are read from the
        enclosing scope.

        Parameters
        ----------
        i: int
            Trial index; used as ``random_state`` of the synthetic dataset.

        Returns
        -------
        tuple
            Ground-truth policy values of, in order, the IPW learner, the
            random policy and the uniform-sample-weight learner.
        """
        # synthetic data generator, seeded by the trial index
        dataset = SyntheticBanditDataset(
            n_actions=n_actions,
            dim_context=dim_context,
            reward_function=logistic_reward_function,
            behavior_policy_function=linear_behavior_policy,
            random_state=i,
        )

        # policy under study: IPWLearner
        ipw_policy = IPWLearner(
            n_actions=dataset.n_actions,
            base_classifier=base_model_dict[base_model_for_evaluation_policy](
                **hyperparams[base_model_for_evaluation_policy]),
        )
        # baseline 1: RandomPolicy
        random_policy = RandomPolicy(n_actions=dataset.n_actions)
        # baseline 2: UniformSampleWeightLearner
        uniform_sample_weight_policy = UniformSampleWeightLearner(
            n_actions=dataset.n_actions,
            base_classifier=base_model_dict[base_model_for_evaluation_policy](
                **hyperparams[base_model_for_evaluation_policy]),
        )

        # two independent draws of logged bandit feedback: train, then test
        train_log = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
        test_log = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)

        # both trainable policies are fitted on the same training arrays
        fit_inputs = {
            key: train_log[key]
            for key in ("context", "action", "reward", "pscore")
        }
        for learner in (ipw_policy, uniform_sample_weight_policy):
            learner.fit(**fit_inputs)

        # action choices of every policy on the test log
        action_dists = [
            policy.predict(context=test_log["context"])
            for policy in (
                ipw_policy,
                random_policy,
                uniform_sample_weight_policy,
            )
        ]

        # ground-truth policy value for each policy, in the same order
        return tuple(
            dataset.calc_ground_truth_policy_value(
                expected_reward=test_log["expected_reward"],
                action_dist=dist,
            )
            for dist in action_dists
        )
Exemplo n.º 7
0
        InverseProbabilityWeighting(),
        SelfNormalizedInverseProbabilityWeighting(),
        DoublyRobust(),
        SelfNormalizedDoublyRobust(),
        SwitchDoublyRobust(),
    ]
    # a base ML model for regression model used in Direct Method and Doubly Robust
    base_model = CalibratedClassifierCV(RandomForest(**hyperparams))

    evaluation_of_ope_results = {
        est.estimator_name: np.zeros(n_runs)
        for est in ope_estimators
    }
    for i in np.arange(n_runs):
        # sample a new set of logged bandit feedback
        bandit_feedback = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        # run a counterfactual bandit algorithm on logged bandit feedback data
        selected_actions = run_bandit_simulation(
            bandit_feedback=bandit_feedback, policy=policy)
        # estimate the ground-truth policy values of the counterfactual policy
        # using the full expected reward contained in the bandit feedback dictionary
        ground_truth_policy_value = bandit_feedback["expected_reward"][
            np.arange(n_rounds),
            selected_actions.flatten()].mean()
        # evaluate the estimation performance of OPE estimators
        ope = OffPolicyEvaluation(
            bandit_feedback=bandit_feedback,
            action_context=dataset.action_context,
            regression_model=RegressionModel(base_model=base_model),
            ope_estimators=ope_estimators,
        )
Exemplo n.º 8
0
    def process(i: int):
        """Run one trial comparing NNPolicyLearner against two baseline policies.

        The NN policy is trained with ``DoublyRobust().estimate_policy_value_tensor``
        as its off-policy objective, using rewards estimated by a cross-fitted
        regression model on the training log.

        Experiment settings (``n_actions``, ``dim_context``, ``n_rounds``,
        ``base_model_dict``, ``hyperparams``, ...) are read from the
        enclosing scope.

        Parameters
        ----------
        i: int
            Trial index; used as ``random_state`` of the synthetic dataset.

        Returns
        -------
        tuple
            Ground-truth policy values of, in order, the NN policy learner,
            the random policy and the uniform-sample-weight learner.
        """
        # synthetic data generator, seeded by the trial index
        dataset = SyntheticBanditDataset(
            n_actions=n_actions,
            dim_context=dim_context,
            reward_function=logistic_reward_function,
            behavior_policy_function=linear_behavior_policy,
            random_state=i,
        )

        # regression model used to estimate the mean reward function
        reward_regressor = base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model])
        regression_model = RegressionModel(
            n_actions=dataset.n_actions,
            action_context=dataset.action_context,
            base_model=reward_regressor,
        )

        # policy under study: NNPolicyLearner with a doubly-robust objective
        ope_estimator = DoublyRobust()
        nn_policy = NNPolicyLearner(
            n_actions=dataset.n_actions,
            dim_context=dim_context,
            off_policy_objective=ope_estimator.estimate_policy_value_tensor,
        )
        # baseline 1: RandomPolicy
        random_policy = RandomPolicy(n_actions=dataset.n_actions)
        # baseline 2: UniformSampleWeightLearner
        uniform_sample_weight_policy = UniformSampleWeightLearner(
            n_actions=dataset.n_actions,
            base_classifier=base_model_dict[base_model_for_evaluation_policy](
                **hyperparams[base_model_for_evaluation_policy]),
        )

        # two independent draws of logged bandit feedback: train, then test
        train_log = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
        test_log = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)

        # estimated rewards on the training log (3-fold cross-fitting)
        estimated_rewards_by_reg_model = regression_model.fit_predict(
            n_folds=3,
            random_state=12345,
            **{key: train_log[key] for key in ("context", "action", "reward")},
        )

        # training: the NN policy additionally receives the estimated rewards
        fit_inputs = {
            key: train_log[key]
            for key in ("context", "action", "reward", "pscore")
        }
        nn_policy.fit(
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
            **fit_inputs,
        )
        uniform_sample_weight_policy.fit(**fit_inputs)

        # action choices of every policy on the test log
        action_dists = [
            policy.predict(context=test_log["context"])
            for policy in (
                nn_policy,
                random_policy,
                uniform_sample_weight_policy,
            )
        ]

        # ground-truth policy value for each policy, in the same order
        return tuple(
            dataset.calc_ground_truth_policy_value(
                expected_reward=test_log["expected_reward"],
                action_dist=dist,
            )
            for dist in action_dists
        )
Exemplo n.º 9
0
    activation = args.activation
    solver = args.solver
    batch_size = args.batch_size if args.batch_size else "auto"
    early_stopping = args.early_stopping
    random_state = args.random_state

    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=random_state,
    )
    # sample new training and test sets of synthetic logged bandit feedback
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(
        n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(
        n_rounds=n_rounds)
    # estimate the mean reward function of the train set of synthetic bandit feedback with ML model
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        action_context=dataset.action_context,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]),
    )
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        n_folds=3,  # 3-fold cross-fitting
        random_state=random_state,
    def process(i: int):
        """Run one trial comparing three policy learners against two baselines.

        Experiment settings (``n_actions``, ``dim_context``, ``n_rounds``,
        ``base_model_dict``, ``hyperparams``, ...) are read from the
        enclosing scope.

        Parameters
        ----------
        i: int
            Trial index; used as ``random_state`` of the synthetic dataset.

        Returns
        -------
        tuple
            Ground-truth policy values of, in order, the IPW learner, the
            Q learner, the NN policy learner, the random policy and the
            uniform-sample-weight learner.
        """
        # synthetic data generator, seeded by the trial index
        dataset = SyntheticBanditDataset(
            n_actions=n_actions,
            dim_context=dim_context,
            reward_function=logistic_reward_function,
            behavior_policy_function=linear_behavior_policy,
            random_state=i,
        )
        # two independent draws of logged bandit data: train, then test
        train_log = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
        test_log = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)

        # policy learners
        ipw_policy = IPWLearner(
            n_actions=dataset.n_actions,
            base_classifier=base_model_dict[base_model_for_evaluation_policy](
                **hyperparams[base_model_for_evaluation_policy]),
        )
        q_policy = QLearner(
            n_actions=dataset.n_actions,
            base_model=base_model_dict[base_model_for_evaluation_policy](
                **hyperparams[base_model_for_evaluation_policy]),
        )
        nn_policy = NNPolicyLearner(
            n_actions=dataset.n_actions,
            dim_context=dim_context,
            off_policy_objective="ipw",
        )
        # baseline 1: RandomPolicy
        random_policy = RandomPolicy(n_actions=dataset.n_actions)
        # baseline 2: UniformSampleWeightLearner
        uniform_sample_weight_policy = UniformSampleWeightLearner(
            n_actions=dataset.n_actions,
            base_classifier=base_model_dict[base_model_for_evaluation_policy](
                **hyperparams[base_model_for_evaluation_policy]),
        )

        # training: every trainable policy sees the same training arrays
        fit_inputs = {
            key: train_log[key]
            for key in ("context", "action", "reward", "pscore")
        }
        trainable_policies = (
            ipw_policy,
            q_policy,
            nn_policy,
            uniform_sample_weight_policy,
        )
        for learner in trainable_policies:
            learner.fit(**fit_inputs)

        # decisions of every policy on the test log
        all_policies = (
            ipw_policy,
            q_policy,
            nn_policy,
            random_policy,
            uniform_sample_weight_policy,
        )
        action_dists = [
            policy.predict(context=test_log["context"])
            for policy in all_policies
        ]

        # evaluation: ground-truth policy value per policy, in the same order
        return tuple(
            dataset.calc_ground_truth_policy_value(
                expected_reward=test_log["expected_reward"],
                action_dist=dist,
            )
            for dist in action_dists
        )