Example #1
def process(b: int):
    # sample a bootstrapped set of batch logged bandit feedback
    boot_bandit_feedback = obd.sample_bootstrap_bandit_feedback(
        test_size=test_size, is_timeseries_split=True, random_state=b
    )
    # train an evaluation policy on the training set of the logged bandit feedback data
    counterfactual_policy.fit(
        context=boot_bandit_feedback["context"],
        action=boot_bandit_feedback["action"],
        reward=boot_bandit_feedback["reward"],
        pscore=boot_bandit_feedback["pscore"],
        position=boot_bandit_feedback["position"],
    )
    # make action selections (predictions) on the test set
    action_dist = counterfactual_policy.predict(
        context=boot_bandit_feedback["context_test"]
    )
    # estimate the policy value of the counterfactual policy with the IPW estimator
    ipw = InverseProbabilityWeighting()
    return ipw.estimate_policy_value(
        reward=boot_bandit_feedback["reward_test"],
        action=boot_bandit_feedback["action_test"],
        position=boot_bandit_feedback["position_test"],
        pscore=boot_bandit_feedback["pscore_test"],
        action_dist=action_dist,
    )
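A minimal sketch of how `process` could be driven over bootstrap replicates, assuming joblib is installed and that `n_boot_samples` is defined alongside the other variables used above (the parallel driver itself is an assumption, not part of the original snippet):

import numpy as np
from joblib import Parallel, delayed

# run the bootstrap replicates in parallel; n_boot_samples is an assumed variable
ope_results = np.array(
    Parallel(n_jobs=-1, verbose=10)(
        delayed(process)(b) for b in np.arange(n_boot_samples)
    )
)
# summarize the bootstrapped IPW estimates
print(f"mean: {ope_results.mean():.5f}, std: {np.std(ope_results, ddof=1):.5f}")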
Example #2
def test_ipw_using_invalid_input_data(
    action_dist: np.ndarray,
    action: np.ndarray,
    reward: np.ndarray,
    pscore: np.ndarray,
    position: np.ndarray,
    use_estimated_pscore: bool,
    estimated_pscore: np.ndarray,
    description: str,
) -> None:
    # prepare the IPW estimators and their tuning variants
    ipw = InverseProbabilityWeighting(use_estimated_pscore=use_estimated_pscore)
    snipw = SelfNormalizedInverseProbabilityWeighting(
        use_estimated_pscore=use_estimated_pscore
    )
    sgipw = SubGaussianInverseProbabilityWeighting(
        use_estimated_pscore=use_estimated_pscore
    )
    ipw_tuning = InverseProbabilityWeightingTuning(
        lambdas=[10, 1000], use_estimated_pscore=use_estimated_pscore
    )
    sgipw_tuning = SubGaussianInverseProbabilityWeightingTuning(
        lambdas=[0.01, 0.1], use_estimated_pscore=use_estimated_pscore
    )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ipw.estimate_policy_value(
            action_dist=action_dist,
            action=action,
            reward=reward,
            pscore=pscore,
            position=position,
            estimated_pscore=estimated_pscore,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ipw.estimate_interval(
            action_dist=action_dist,
            action=action,
            reward=reward,
            pscore=pscore,
            position=position,
            estimated_pscore=estimated_pscore,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = snipw.estimate_policy_value(
            action_dist=action_dist,
            action=action,
            reward=reward,
            pscore=pscore,
            position=position,
            estimated_pscore=estimated_pscore,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = snipw.estimate_interval(
            action_dist=action_dist,
            action=action,
            reward=reward,
            pscore=pscore,
            position=position,
            estimated_pscore=estimated_pscore,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ipw_tuning.estimate_policy_value(
            action_dist=action_dist,
            action=action,
            reward=reward,
            pscore=pscore,
            position=position,
            estimated_pscore=estimated_pscore,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ipw_tuning.estimate_interval(
            action_dist=action_dist,
            action=action,
            reward=reward,
            pscore=pscore,
            position=position,
            estimated_pscore=estimated_pscore,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = sgipw.estimate_policy_value(
            action_dist=action_dist,
            action=action,
            reward=reward,
            pscore=pscore,
            position=position,
            estimated_pscore=estimated_pscore,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = sgipw.estimate_interval(
            action_dist=action_dist,
            action=action,
            reward=reward,
            pscore=pscore,
            position=position,
            estimated_pscore=estimated_pscore,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = sgipw_tuning.estimate_policy_value(
            action_dist=action_dist,
            action=action,
            reward=reward,
            pscore=pscore,
            position=position,
            estimated_pscore=estimated_pscore,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = sgipw_tuning.estimate_interval(
            action_dist=action_dist,
            action=action,
            reward=reward,
            pscore=pscore,
            position=position,
            estimated_pscore=estimated_pscore,
        )
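For reference, a test like the one above is typically driven by `pytest.mark.parametrize` over a list of invalid inputs. The sketch below builds one such case; the array shapes, values, and the expected-message fragment are illustrative assumptions, not the library's actual fixtures:

import numpy as np

# one illustrative invalid case: `reward` is shorter than `action`, so the
# estimators' input validation is expected to raise a ValueError
invalid_input_of_ipw = [
    (
        np.ones((4, 3, 1)) / 3,   # action_dist: uniform over 3 actions
        np.zeros(4, dtype=int),   # action
        np.zeros(3),              # reward: length mismatch (invalid)
        np.ones(4) * 0.5,         # pscore
        np.zeros(4, dtype=int),   # position
        False,                    # use_estimated_pscore
        None,                     # estimated_pscore
        "Expected",               # description: assumed prefix of the error message
    ),
]

# this list would be passed to the test in Example #2 via
# @pytest.mark.parametrize(
#     "action_dist, action, reward, pscore, position, use_estimated_pscore, estimated_pscore, description",
#     invalid_input_of_ipw,
# )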
Example #3
        # train an evaluation policy on the training set of the logged bandit feedback data
        evaluation_policy.fit(
            context=boot_bandit_feedback["context"],
            action=boot_bandit_feedback["action"],
            reward=boot_bandit_feedback["reward"],
            pscore=boot_bandit_feedback["pscore"],
            position=boot_bandit_feedback["position"],
        )
        # make action selections (predictions)
        action_dist = evaluation_policy.predict(
            context=boot_bandit_feedback["context_test"])
        # estimate the policy value of the evaluation policy with the IPW estimator,
        # relative to the ground-truth value of the behavior policy
        ipw = InverseProbabilityWeighting()
        ope_results[b] = (ipw.estimate_policy_value(
            reward=boot_bandit_feedback["reward_test"],
            action=boot_bandit_feedback["action_test"],
            position=boot_bandit_feedback["position_test"],
            pscore=boot_bandit_feedback["pscore_test"],
            action_dist=action_dist,
        ) / ground_truth)

        print(
            f"{b+1}th iteration: {np.round((time.time() - start) / 60, 2)}min")
    ope_results_dict = estimate_confidence_interval_by_bootstrap(
        samples=ope_results, random_state=random_state)
    ope_results_dict["mean(no-boot)"] = ope_results.mean()
    ope_results_dict["std"] = np.std(ope_results, ddof=1)
    ope_results_df = pd.DataFrame(ope_results_dict, index=["ipw"])

    # report the estimated policy value relative to that of the behavior policy
    print("=" * 70)
    print(f"random_state={random_state}: evaluation policy={policy_name}")