Пример #1
0
    def process(i: int):
        """Run one simulation trial and score the OPE estimators.

        A synthetic dataset seeded with ``i`` is created, an IPWLearner is
        fitted on a first sampled batch of logged feedback and applied to a
        second batch, a reward regression model is cross-fitted on that second
        batch, and ``ope_estimators`` are evaluated against the ground-truth
        value of the learned policy.

        Args:
            i: trial index, used as ``random_state`` of the synthetic dataset.

        Returns:
            Whatever ``evaluate_performance_of_estimators`` returns for this
            trial (relative estimation error, per the original author's note).
        """
        # fresh data generator for this trial
        generator = SyntheticBanditDataset(
            n_actions=n_actions,
            dim_context=dim_context,
            reward_function=logistic_reward_function,
            behavior_policy_function=linear_behavior_policy,
            random_state=i,
        )

        # the evaluation policy is an IPWLearner wrapping the configured classifier
        policy_classifier = base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        )
        learner = IPWLearner(
            n_actions=generator.n_actions,
            base_classifier=policy_classifier,
        )

        # the training batch is drawn first, the test batch second
        train_log = generator.obtain_batch_bandit_feedback(n_rounds=n_rounds)
        test_log = generator.obtain_batch_bandit_feedback(n_rounds=n_rounds)

        # fit the policy on the training batch only
        learner.fit(
            **{
                key: train_log[key]
                for key in ("context", "action", "reward", "pscore")
            }
        )

        # action choices of the learned policy on the test batch
        test_context = test_log["context"]
        action_dist = learner.predict(context=test_context)

        # reward regression on the test batch with 3-fold cross-fitting
        reward_regressor = RegressionModel(
            n_actions=generator.n_actions,
            action_context=generator.action_context,
            base_model=base_model_dict[base_model_for_reg_model](
                **hyperparams[base_model_for_reg_model]
            ),
        )
        reward_estimates = reward_regressor.fit_predict(
            context=test_context,
            action=test_log["action"],
            reward=test_log["reward"],
            n_folds=3,
            random_state=random_state,
        )

        # score every estimator against the true value of the learned policy
        evaluator = OffPolicyEvaluation(
            bandit_feedback=test_log,
            ope_estimators=ope_estimators,
        )
        true_value = generator.calc_ground_truth_policy_value(
            expected_reward=test_log["expected_reward"],
            action_dist=action_dist,
        )
        return evaluator.evaluate_performance_of_estimators(
            ground_truth_policy_value=true_value,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=reward_estimates,
        )
Пример #2
0
    # run-time settings read from `args` (presumably parsed CLI arguments — confirm against caller)
    n_jobs = args.n_jobs
    random_state = args.random_state
    # seed NumPy's global random number generator
    np.random.seed(random_state)
    # relative path: resolved against the current working directory
    data_path = Path("../open_bandit_dataset")

    # instantiate the dataset class for the chosen behavior policy, campaign and context set
    obd = OBDWithInteractionFeatures(
        behavior_policy=behavior_policy,
        campaign=campaign,
        data_path=data_path,
        context_set=context_set,
    )
    # define a counterfactual policy based on IPWLearner;
    # the underlying model is looked up by name and built with its configured hyperparameters
    counterfactual_policy = IPWLearner(
        base_model=base_model_dict[base_model](**hyperparams[base_model]),
        n_actions=obd.n_actions,
        len_list=obd.len_list,
    )
    # label combining the base model name and the context set
    policy_name = f"{base_model}_{context_set}"

    # ground-truth policy value of the Bernoulli TS policy (the current best policy) in the test set,
    # i.e. the empirical mean of the factual (observed) rewards (on-policy estimation)
    ground_truth = obd.calc_on_policy_policy_value_estimate(
        behavior_policy="bts",
        campaign=campaign,
        data_path=data_path,
        test_size=test_size,
        is_timeseries_split=True,
    )

    def process(b: int):
    def process(i: int):
        """Compare an IPWLearner with two baseline policies on one synthetic trial.

        Args:
            i: trial index, used as ``random_state`` of the synthetic dataset.

        Returns:
            A 3-tuple of ground-truth policy values computed on the test batch,
            ordered as (IPWLearner, RandomPolicy, UniformSampleWeightLearner).
        """
        # fresh data generator for this trial
        generator = SyntheticBanditDataset(
            n_actions=n_actions,
            dim_context=dim_context,
            reward_function=logistic_reward_function,
            behavior_policy_function=linear_behavior_policy,
            random_state=i,
        )

        # policy under study: IPWLearner
        ipw = IPWLearner(
            n_actions=generator.n_actions,
            base_classifier=base_model_dict[base_model_for_evaluation_policy](
                **hyperparams[base_model_for_evaluation_policy]
            ),
        )
        # baseline 1: RandomPolicy
        uniform_random = RandomPolicy(n_actions=generator.n_actions)
        # baseline 2: UniformSampleWeightLearner, built on its own classifier instance
        unweighted = UniformSampleWeightLearner(
            n_actions=generator.n_actions,
            base_classifier=base_model_dict[base_model_for_evaluation_policy](
                **hyperparams[base_model_for_evaluation_policy]
            ),
        )

        # the training batch is drawn first, the test batch second
        train_log = generator.obtain_batch_bandit_feedback(n_rounds=n_rounds)
        test_log = generator.obtain_batch_bandit_feedback(n_rounds=n_rounds)

        # both trainable policies are fitted on the same training inputs
        fit_inputs = {
            key: train_log[key]
            for key in ("context", "action", "reward", "pscore")
        }
        for learner in (ipw, unweighted):
            learner.fit(**fit_inputs)

        # every policy acts on the test contexts before any value is computed
        action_dists = [
            policy.predict(context=test_log["context"])
            for policy in (ipw, uniform_random, unweighted)
        ]

        # ground-truth policy value of each policy, in the same order
        return tuple(
            generator.calc_ground_truth_policy_value(
                expected_reward=test_log["expected_reward"],
                action_dist=dist,
            )
            for dist in action_dists
        )
    # run-time settings read from `args` (presumably parsed CLI arguments — confirm against caller)
    n_jobs = args.n_jobs
    random_state = args.random_state
    # seed NumPy's global random number generator
    np.random.seed(random_state)

    # synthetic data generator, seeded with the run-level random_state
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=random_state,
    )
    # define evaluation policy using IPWLearner;
    # the base classifier is looked up by name and built with its configured hyperparameters
    evaluation_policy = IPWLearner(
        n_actions=dataset.n_actions,
        len_list=dataset.len_list,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]),
    )

    def process(i: int):
        # sample new training and test sets of synthetic logged bandit feedback
        bandit_feedback_train = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        bandit_feedback_test = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        # train the evaluation policy on the training set of the synthetic logged bandit feedback
        evaluation_policy.fit(
            context=bandit_feedback_train["context"],
            action=bandit_feedback_train["action"],
            reward=bandit_feedback_train["reward"],
            pscore=bandit_feedback_train["pscore"],
Пример #5
0
    def process(i: int):
        """Run one trial of the action-embedding OPE benchmark.

        A synthetic dataset with action embeddings is seeded with ``i``; an
        IPWLearner is fitted on a first sampled batch and applied to a second
        one. On that second batch a reward regression model, a propensity
        score estimator and one importance weight estimator per entry of
        ``bipw_model_configurations`` are fitted. Finally ``ope_estimators``
        plus three marginalized IPW estimators are scored against the
        ground-truth value of the learned policy.

        Args:
            i: trial index, used as ``random_state`` of the synthetic dataset.

        Returns:
            Whatever ``evaluate_performance_of_estimators`` returns when
            called with ``metric="relative-ee"``.
        """
        # seed shared by all nuisance-model fits below
        nuisance_seed = 12345

        # fresh data generator for this trial
        generator = SyntheticBanditDatasetWithActionEmbeds(
            n_actions=n_actions,
            dim_context=dim_context,
            beta=3.0,
            n_cat_dim=3,
            n_cat_per_dim=5,
            reward_function=logistic_reward_function,
            random_state=i,
        )

        # the evaluation policy is an IPWLearner wrapping the configured classifier
        learner = IPWLearner(
            n_actions=generator.n_actions,
            base_classifier=base_model_dict[base_model_for_iw_estimator](
                **hyperparams[base_model_for_iw_estimator]
            ),
        )

        # the training batch is drawn first, the test batch second
        train_log = generator.obtain_batch_bandit_feedback(n_rounds=n_rounds)
        test_log = generator.obtain_batch_bandit_feedback(n_rounds=n_rounds)

        # fit the policy on the training batch only
        learner.fit(
            **{
                key: train_log[key]
                for key in ("context", "action", "reward", "pscore")
            }
        )

        # action choice probabilities of the learned policy on the test batch
        action_dist = learner.predict_proba(context=test_log["context"])

        # reward regression on the test batch with 2-fold cross-fitting
        reward_regressor = RegressionModel(
            n_actions=generator.n_actions,
            action_context=generator.action_context,
            base_model=base_model_dict[base_model_for_reg_model](
                **hyperparams[base_model_for_reg_model]
            ),
        )
        reward_estimates = reward_regressor.fit_predict(
            context=test_log["context"],
            action=test_log["action"],
            reward=test_log["reward"],
            n_folds=2,
            random_state=nuisance_seed,
        )

        # propensity scores estimated on the test batch
        propensity_model = PropensityScoreEstimator(
            len_list=1,
            n_actions=n_actions,
            base_model=base_model_dict[base_model_for_pscore_estimator](
                **hyperparams[base_model_for_pscore_estimator]
            ),
            calibration_cv=3,
        )
        pscore_estimates = propensity_model.fit_predict(
            action=test_log["action"],
            position=test_log["position"],
            context=test_log["context"],
            n_folds=3,
            random_state=nuisance_seed,
        )

        # one set of estimated importance weights per configured classifier
        importance_weight_estimates = {
            config_name: ImportanceWeightEstimator(
                len_list=1,
                n_actions=n_actions,
                fitting_method=config["fitting_method"],
                base_model=config["base_model"],
            ).fit_predict(
                action=test_log["action"],
                context=test_log["context"],
                action_dist=action_dist,
                position=test_log["position"],
                n_folds=2,
                evaluate_model_performance=False,
                random_state=nuisance_seed,
            )
            for config_name, config in bipw_model_configurations.items()
        }

        # estimators that rely on the action embeddings, appended to the shared list
        embedding_estimators = [
            MarginalizedInverseProbabilityWeighting(
                n_actions=n_actions,
                estimator_name="mipw",
            ),
            MarginalizedInverseProbabilityWeighting(
                n_actions=n_actions,
                embedding_selection_method="greedy",
                estimator_name="mipw (greedy selection)",
            ),
            SelfNormalizedMarginalizedInverseProbabilityWeighting(
                n_actions=n_actions,
                estimator_name="snmipw",
            ),
        ]
        evaluator = OffPolicyEvaluation(
            bandit_feedback=test_log,
            ope_estimators=ope_estimators + embedding_estimators,
        )

        # score every estimator against the true value of the learned policy
        true_value = generator.calc_ground_truth_policy_value(
            expected_reward=test_log["expected_reward"],
            action_dist=action_dist,
        )
        return evaluator.evaluate_performance_of_estimators(
            ground_truth_policy_value=true_value,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=reward_estimates,
            estimated_pscore=pscore_estimates,
            estimated_importance_weights=importance_weight_estimates,
            action_embed=test_log["action_embed"],
            pi_b=test_log["pi_b"],
            metric="relative-ee",
        )
Пример #6
0
         **hyperparams[base_model_for_reg_model]),
 )
 # reward predictions from the regression model, fitted on the training log
 estimated_rewards_by_reg_model = regression_model.fit_predict(
     context=bandit_feedback_train["context"],
     action=bandit_feedback_train["action"],
     reward=bandit_feedback_train["reward"],
     n_folds=3,  # 3-fold cross-fitting
     random_state=random_state,
 )
 # define random evaluation policy
 random_policy = Random(n_actions=dataset.n_actions,
                        random_state=random_state)
 # define evaluation policy using IPWLearner;
 # the base classifier is looked up by name and built with its configured hyperparameters
 ipw_learner = IPWLearner(
     n_actions=dataset.n_actions,
     base_classifier=base_model_dict[base_model_for_evaluation_policy](
         **hyperparams[base_model_for_evaluation_policy]),
 )
 # define evaluation policy using NNPolicyLearner
 nn_policy_learner = NNPolicyLearner(
     n_actions=dataset.n_actions,
     dim_context=dim_context,
     # training objective: the tensor-based value estimate of the selected OPE estimator
     off_policy_objective=ope_estimator_dict[ope_estimator].
     estimate_policy_value_tensor,
     # a tuple holding n_hidden repeated n_layers times
     hidden_layer_size=tuple((n_hidden for _ in range(n_layers))),
     activation=activation,
     solver=solver,
     batch_size=batch_size,
     early_stopping=early_stopping,
     random_state=random_state,
 )
    def process(i: int):
        """Compare three policy learners with two baselines on one synthetic trial.

        Args:
            i: trial index, used as ``random_state`` of the synthetic dataset.

        Returns:
            A 5-tuple of ground-truth policy values computed on the test batch,
            ordered as (IPWLearner, QLearner, NNPolicyLearner, RandomPolicy,
            UniformSampleWeightLearner).
        """
        # fresh data generator for this trial
        generator = SyntheticBanditDataset(
            n_actions=n_actions,
            dim_context=dim_context,
            reward_function=logistic_reward_function,
            behavior_policy_function=linear_behavior_policy,
            random_state=i,
        )

        # the training batch is drawn first, the test batch second
        train_log = generator.obtain_batch_bandit_feedback(n_rounds=n_rounds)
        test_log = generator.obtain_batch_bandit_feedback(n_rounds=n_rounds)

        # policy learners under study; each gets its own base model instance
        ipw = IPWLearner(
            n_actions=generator.n_actions,
            base_classifier=base_model_dict[base_model_for_evaluation_policy](
                **hyperparams[base_model_for_evaluation_policy]
            ),
        )
        q_learner = QLearner(
            n_actions=generator.n_actions,
            base_model=base_model_dict[base_model_for_evaluation_policy](
                **hyperparams[base_model_for_evaluation_policy]
            ),
        )
        neural = NNPolicyLearner(
            n_actions=generator.n_actions,
            dim_context=dim_context,
            off_policy_objective="ipw",
        )
        # baseline 1: RandomPolicy
        uniform_random = RandomPolicy(n_actions=generator.n_actions)
        # baseline 2: UniformSampleWeightLearner
        unweighted = UniformSampleWeightLearner(
            n_actions=generator.n_actions,
            base_classifier=base_model_dict[base_model_for_evaluation_policy](
                **hyperparams[base_model_for_evaluation_policy]
            ),
        )

        # every trainable policy is fitted on the same training inputs, in this order
        fit_inputs = {
            key: train_log[key]
            for key in ("context", "action", "reward", "pscore")
        }
        for learner in (ipw, q_learner, neural, unweighted):
            learner.fit(**fit_inputs)

        # every policy acts on the test contexts before any value is computed
        action_dists = [
            policy.predict(context=test_log["context"])
            for policy in (ipw, q_learner, neural, uniform_random, unweighted)
        ]

        # ground-truth policy value of each policy, in the same order
        return tuple(
            generator.calc_ground_truth_policy_value(
                expected_reward=test_log["expected_reward"],
                action_dist=dist,
            )
            for dist in action_dists
        )
Пример #8
0
    # run-time settings read from `args` (presumably parsed CLI arguments — confirm against caller)
    test_size = args.test_size
    random_state = args.random_state
    # seed NumPy's global random number generator
    np.random.seed(random_state)
    # relative path: resolved against the current working directory
    data_path = Path("../open_bandit_dataset")

    # instantiate the dataset class for the chosen behavior policy, campaign and context set
    obd = OBDWithInteractionFeatures(
        behavior_policy=behavior_policy,
        campaign=campaign,
        data_path=data_path,
        context_set=context_set,
    )
    # define an evaluation policy;
    # the underlying model is looked up by name and built with its configured hyperparameters
    evaluation_policy = IPWLearner(
        base_model=base_model_dict[base_model](**hyperparams[base_model]),
        n_actions=obd.n_actions,
        len_list=obd.len_list,
    )
    # label combining the base model name and the context set
    policy_name = f"{base_model}_{context_set}"

    # ground-truth policy value of the Bernoulli TS policy (the current best policy) in the test set,
    # i.e. the empirical mean of the factual (observed) rewards (on-policy estimation)
    ground_truth = obd.calc_on_policy_policy_value_estimate(
        behavior_policy="bts",
        campaign=campaign,
        data_path=data_path,
        test_size=test_size,
        is_timeseries_split=True,
    )

    # wall-clock timestamp taken before the work that follows
    # (presumably used to report elapsed time; the consumer is outside this view)
    start = time.time()