Example No. 1
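The snippets below are pytest-style tests of obp's SyntheticSlateBanditDataset and its slate OPE estimators. They are fragments of a larger test module, so shared imports and module-level objects are not shown. The following is a minimal sketch of what they assume (for a recent obp release that includes the slate OPE modules); the helper check_slate_bandit_feedback and the mappings model_dict / hyperparams are assumed to be defined in the test module itself, and the constructor arguments shown for the estimator instances sips / iips / rips / dr are assumptions rather than the module's actual setup.

# Minimal sketch of the shared setup assumed by the examples below (not part of the original fragments).
import numpy as np
import pandas as pd
import pytest
from sklearn.tree import DecisionTreeRegressor

from obp.dataset import (
    SyntheticSlateBanditDataset,
    linear_behavior_policy_logit,
    logistic_reward_function,
)
from obp.ope import (
    SlateCascadeDoublyRobust,
    SlateIndependentIPS,
    SlateRegressionModel,
    SlateRewardInteractionIPS,
    SlateStandardIPS,
)
from obp.types import BanditFeedback

# Module-level estimator instances used by the OPE examples (names and arguments are assumptions).
sips = SlateStandardIPS(len_list=3)
iips = SlateIndependentIPS(len_list=3)
rips = SlateRewardInteractionIPS(len_list=3)
dr = SlateCascadeDoublyRobust(len_list=3, n_unique_action=10)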
def test_synthetic_slate_obtain_batch_bandit_feedback_using_uniform_random_behavior_policy_largescale(
):
    # set parameters
    n_unique_action = 100
    len_list = 10
    dim_context = 2
    reward_type = "binary"
    random_state = 12345
    n_rounds = 10000
    dataset = SyntheticSlateBanditDataset(
        n_unique_action=n_unique_action,
        len_list=len_list,
        dim_context=dim_context,
        reward_type=reward_type,
        random_state=random_state,
    )
    # obtain feedback
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # check slate bandit feedback (common test)
    check_slate_bandit_feedback(bandit_feedback=bandit_feedback)
    # check pscore marginal
    pscore_item_position = 1 / n_unique_action
    assert np.allclose(
        np.unique(bandit_feedback["pscore_item_position"]),
        pscore_item_position
    ), f"pscore_item_position must be [{pscore_item_position}], but {np.unique(bandit_feedback['pscore_item_position'])}"
Example No. 2
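The arguments of this test, including the human-readable description, are presumably supplied by a pytest.mark.parametrize decorator that is not part of the fragment.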
def test_synthetic_slate_using_valid_inputs(
    n_unique_action,
    len_list,
    dim_context,
    reward_type,
    random_state,
    n_rounds,
    reward_structure,
    click_model,
    behavior_policy_function,
    reward_function,
    return_pscore_item_position,
    description,
):
    dataset = SyntheticSlateBanditDataset(
        n_unique_action=n_unique_action,
        len_list=len_list,
        dim_context=dim_context,
        reward_type=reward_type,
        reward_structure=reward_structure,
        click_model=click_model,
        random_state=random_state,
        behavior_policy_function=behavior_policy_function,
        base_reward_function=reward_function,
    )
    # obtain feedback
    bandit_feedback = dataset.obtain_batch_bandit_feedback(
        n_rounds=n_rounds,
        return_pscore_item_position=return_pscore_item_position)
    # check slate bandit feedback (common test)
    check_slate_bandit_feedback(bandit_feedback=bandit_feedback)
    pscore_columns = [
        "pscore_cascade",
        "pscore",
        "pscore_item_position",
    ]
    bandit_feedback_df = pd.DataFrame()
    for column in [
            "slate_id",
            "position",
            "action",
            "reward",
            "expected_reward_factual",
    ] + pscore_columns:
        bandit_feedback_df[column] = bandit_feedback[column]
    print(f"-------{description}--------")
    print(bandit_feedback_df.groupby("position")["reward"].describe())
    if reward_type == "binary":
        assert set(np.unique(bandit_feedback["reward"])) == set([0, 1])
Example No. 3
def test_synthetic_slate_obtain_batch_bandit_feedback_using_uniform_random_behavior_policy(
):
    # set parameters
    n_unique_action = 10
    len_list = 3
    dim_context = 2
    reward_type = "binary"
    random_state = 12345
    n_rounds = 100
    dataset = SyntheticSlateBanditDataset(
        n_unique_action=n_unique_action,
        len_list=len_list,
        dim_context=dim_context,
        reward_type=reward_type,
        random_state=random_state,
    )
    # obtain feedback
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # check slate bandit feedback (common test)
    check_slate_bandit_feedback(bandit_feedback=bandit_feedback)
    pscore_columns = [
        "pscore_cascade",
        "pscore",
        "pscore_item_position",
    ]
    bandit_feedback_df = pd.DataFrame()
    for column in ["slate_id", "position", "action"] + pscore_columns:
        bandit_feedback_df[column] = bandit_feedback[column]
    # check pscore marginal
    pscore_item_position = 1 / n_unique_action
    assert np.allclose(
        bandit_feedback_df["pscore_item_position"].unique(),
        pscore_item_position
    ), f"pscore_item_position must be [{pscore_item_position}], but {bandit_feedback_df['pscore_item_position'].unique()}"
    # check pscore joint
    pscore_cascade = []
    pscore_above = 1.0
    for position_ in np.arange(len_list):
        pscore_above = pscore_above * 1.0 / (n_unique_action - position_)
        pscore_cascade.append(pscore_above)
    assert np.allclose(
        bandit_feedback_df["pscore_cascade"], np.tile(pscore_cascade, n_rounds)
    ), f"pscore_cascade must be {pscore_cascade} for all impresessions"
    assert np.allclose(
        bandit_feedback_df["pscore"].unique(),
        [pscore_above]), f"pscore must be {pscore_above} for all slates"
Example No. 4
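In the source test suite this helper is presumably registered as a pytest fixture so that other tests can reuse the generated BanditFeedback.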
def synthetic_slate_bandit_feedback() -> BanditFeedback:
    # set parameters
    n_unique_action = 10
    len_list = 3
    dim_context = 2
    reward_type = "binary"
    random_state = 12345
    n_rounds = 100
    dataset = SyntheticSlateBanditDataset(
        n_unique_action=n_unique_action,
        len_list=len_list,
        dim_context=dim_context,
        reward_type=reward_type,
        random_state=random_state,
    )
    # obtain feedback
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    return bandit_feedback
Example No. 5
def test_synthetic_slate_obtain_batch_bandit_feedback_using_linear_behavior_policy(
):
    # set parameters
    n_unique_action = 10
    len_list = 3
    dim_context = 2
    reward_type = "binary"
    random_state = 12345
    n_rounds = 100
    dataset = SyntheticSlateBanditDataset(
        n_unique_action=n_unique_action,
        len_list=len_list,
        dim_context=dim_context,
        reward_type=reward_type,
        random_state=random_state,
        behavior_policy_function=linear_behavior_policy_logit,
    )
    with pytest.raises(ValueError):
        _ = dataset.obtain_batch_bandit_feedback(n_rounds=-1)
    with pytest.raises(ValueError):
        _ = dataset.obtain_batch_bandit_feedback(n_rounds="a")

    # obtain feedback
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # check slate bandit feedback (common test)
    check_slate_bandit_feedback(bandit_feedback=bandit_feedback)
    # print reward
    pscore_columns = [
        "pscore_cascade",
        "pscore",
        "pscore_item_position",
    ]
    bandit_feedback_df = pd.DataFrame()
    for column in ["slate_id", "position", "action", "reward"
                   ] + pscore_columns:
        bandit_feedback_df[column] = bandit_feedback[column]
    print(bandit_feedback_df.groupby("position")["reward"].describe())
    if reward_type == "binary":
        assert set(np.unique(bandit_feedback["reward"])) == set([0, 1])
Example No. 6
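As in Example No. 2, the arguments (including description, which is matched against the raised error message) are presumably supplied by a pytest.mark.parametrize decorator that is not shown.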
def test_synthetic_slate_init_using_invalid_inputs(
    n_unique_action,
    len_list,
    dim_context,
    reward_type,
    reward_structure,
    click_model,
    random_state,
    description,
):
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = SyntheticSlateBanditDataset(
            n_unique_action=n_unique_action,
            len_list=len_list,
            dim_context=dim_context,
            reward_type=reward_type,
            reward_structure=reward_structure,
            click_model=click_model,
            random_state=random_state,
        )
Example No. 7
def test_synthetic_slate_obtain_batch_bandit_feedback_using_linear_behavior_policy_without_pscore_item_position(
):
    # set parameters
    n_unique_action = 80
    len_list = 3
    dim_context = 2
    reward_type = "binary"
    random_state = 12345
    n_rounds = 100
    dataset = SyntheticSlateBanditDataset(
        n_unique_action=n_unique_action,
        len_list=len_list,
        dim_context=dim_context,
        reward_type=reward_type,
        random_state=random_state,
        behavior_policy_function=linear_behavior_policy_logit,
    )
    # obtain feedback
    bandit_feedback = dataset.obtain_batch_bandit_feedback(
        n_rounds=n_rounds, return_pscore_item_position=False)
    # check slate bandit feedback (common test)
    check_slate_bandit_feedback(bandit_feedback=bandit_feedback)
    assert (
        bandit_feedback["pscore_item_position"] is None
    ), f"pscore marginal must be None, but {bandit_feedback['pscore_item_position']}"

    # random seed should be fixed
    dataset2 = SyntheticSlateBanditDataset(
        n_unique_action=n_unique_action,
        len_list=len_list,
        dim_context=dim_context,
        reward_type=reward_type,
        random_state=random_state,
        behavior_policy_function=linear_behavior_policy_logit,
    )
    # obtain feedback
    bandit_feedback2 = dataset2.obtain_batch_bandit_feedback(
        n_rounds=n_rounds, return_pscore_item_position=False)
    # check slate bandit feedback (common test)
    check_slate_bandit_feedback(bandit_feedback=bandit_feedback2)
    # check random seed effect
    assert np.allclose(
        bandit_feedback["expected_reward_factual"],
        bandit_feedback2["expected_reward_factual"],
    )
    if reward_type == "binary":
        assert set(np.unique(bandit_feedback["reward"])) == set([0, 1])
Example No. 8
def test_slate_ope_performance_using_standard_additive_log():
    # set parameters
    n_unique_action = 10
    len_list = 3
    dim_context = 2
    reward_type = "binary"
    random_state = 12345
    n_rounds = 1000
    reward_structure = "standard_additive"
    click_model = None
    behavior_policy_function = linear_behavior_policy_logit
    reward_function = logistic_reward_function
    dataset = SyntheticSlateBanditDataset(
        n_unique_action=n_unique_action,
        len_list=len_list,
        dim_context=dim_context,
        reward_type=reward_type,
        reward_structure=reward_structure,
        click_model=click_model,
        random_state=random_state,
        behavior_policy_function=behavior_policy_function,
        base_reward_function=reward_function,
    )
    random_behavior_dataset = SyntheticSlateBanditDataset(
        n_unique_action=n_unique_action,
        len_list=len_list,
        dim_context=dim_context,
        reward_type=reward_type,
        reward_structure=reward_structure,
        click_model=click_model,
        random_state=random_state,
        behavior_policy_function=None,
        base_reward_function=reward_function,
    )
    # obtain feedback
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    slate_id = bandit_feedback["slate_id"]
    reward = bandit_feedback["reward"]
    pscore = bandit_feedback["pscore"]
    pscore_item_position = bandit_feedback["pscore_item_position"]
    pscore_cascade = bandit_feedback["pscore_cascade"]
    position = bandit_feedback["position"]

    # obtain random behavior feedback
    random_behavior_feedback = random_behavior_dataset.obtain_batch_bandit_feedback(
        n_rounds=n_rounds)

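    # sips, iips, and rips are assumed to be module-level instances of obp's
    # SlateStandardIPS, SlateIndependentIPS, and SlateRewardInteractionIPS estimators
    # (see the setup sketch at the top of this listing).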
    sips_estimated_policy_value = sips.estimate_policy_value(
        slate_id=slate_id,
        reward=reward,
        pscore=pscore,
        position=position,
        evaluation_policy_pscore=random_behavior_feedback["pscore"],
    )
    iips_estimated_policy_value = iips.estimate_policy_value(
        slate_id=slate_id,
        reward=reward,
        pscore_item_position=pscore_item_position,
        position=position,
        evaluation_policy_pscore_item_position=random_behavior_feedback[
            "pscore_item_position"],
    )
    rips_estimated_policy_value = rips.estimate_policy_value(
        slate_id=slate_id,
        reward=reward,
        pscore_cascade=pscore_cascade,
        position=position,
        evaluation_policy_pscore_cascade=random_behavior_feedback[
            "pscore_cascade"],
    )
    # compute statistics of ground truth policy value
    q_pi_e = (random_behavior_feedback["reward"].reshape(
        (n_rounds, dataset.len_list)).sum(axis=1))
    gt_mean = q_pi_e.mean()
    gt_std = q_pi_e.std(ddof=1)
    print("Standard additive")
    # check the performance of OPE
    ci_bound = gt_std * 3 / np.sqrt(q_pi_e.shape[0])
    print(f"gt_mean: {gt_mean}, 3 * gt_std / sqrt(n): {ci_bound}")
    estimated_policy_value = {
        "sips": sips_estimated_policy_value,
        "iips": iips_estimated_policy_value,
        "rips": rips_estimated_policy_value,
    }
    for key in estimated_policy_value:
        print(
            f"estimated_value: {estimated_policy_value[key]} ------ estimator: {key}, "
        )
        # test the performance of each estimator
        assert (
            np.abs(gt_mean - estimated_policy_value[key]) <= ci_bound
        ), f"OPE of {key} did not work well (absolute error is greater than 3*sigma)"
Example No. 9
def test_slate_ope_performance_using_standard_additive_log():
    # set parameters
    n_unique_action = 10
    len_list = 3
    dim_context = 2
    reward_type = "binary"
    random_state = 12345
    n_rounds = 1000
    reward_structure = "standard_additive"
    click_model = None
    behavior_policy_function = linear_behavior_policy_logit
    reward_function = logistic_reward_function
    dataset = SyntheticSlateBanditDataset(
        n_unique_action=n_unique_action,
        len_list=len_list,
        dim_context=dim_context,
        reward_type=reward_type,
        reward_structure=reward_structure,
        click_model=click_model,
        random_state=random_state,
        behavior_policy_function=behavior_policy_function,
        base_reward_function=reward_function,
    )
    random_behavior_dataset = SyntheticSlateBanditDataset(
        n_unique_action=n_unique_action,
        len_list=len_list,
        dim_context=dim_context,
        reward_type=reward_type,
        reward_structure=reward_structure,
        click_model=click_model,
        random_state=random_state,
        behavior_policy_function=None,
        base_reward_function=reward_function,
    )
    # obtain feedback
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    slate_id = bandit_feedback["slate_id"]
    context = bandit_feedback["context"]
    action = bandit_feedback["action"]
    reward = bandit_feedback["reward"]
    pscore = bandit_feedback["pscore_cascade"]
    position = bandit_feedback["position"]

    # obtain random behavior feedback
    random_behavior_feedback = random_behavior_dataset.obtain_batch_bandit_feedback(
        n_rounds=n_rounds)
    evaluation_policy_logit_ = np.ones(
        (n_rounds, n_unique_action)) / n_unique_action
    evaluation_policy_action_dist = (
        np.ones(n_rounds * len_list * n_unique_action) / n_unique_action)
    (
        _,
        _,
        evaluation_policy_pscore,
    ) = dataset.obtain_pscore_given_evaluation_policy_logit(
        action=action,
        evaluation_policy_logit_=evaluation_policy_logit_,
        return_pscore_item_position=False,
    )
    evaluation_policy_action_dist = dataset.calc_evaluation_policy_action_dist(
        action=action,
        evaluation_policy_logit_=evaluation_policy_logit_,
    )

    # obtain q_hat
    base_regression_model = SlateRegressionModel(
        base_model=DecisionTreeRegressor(max_depth=3, random_state=12345),
        len_list=len_list,
        n_unique_action=n_unique_action,
        fitting_method="iw",
    )
    q_hat = base_regression_model.fit_predict(
        context=context,
        action=action,
        reward=reward,
        pscore_cascade=pscore,
        evaluation_policy_pscore_cascade=evaluation_policy_pscore,
        evaluation_policy_action_dist=evaluation_policy_action_dist,
    )

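    # dr is assumed to be a module-level SlateCascadeDoublyRobust instance
    # (see the setup sketch at the top of this listing).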
    # estimate the policy value of the evaluation policy with cascade-dr using the fitted q_hat
    cascade_dr_estimated_policy_value = dr.estimate_policy_value(
        slate_id=slate_id,
        action=action,
        reward=reward,
        pscore_cascade=pscore,
        position=position,
        evaluation_policy_pscore_cascade=evaluation_policy_pscore,
        q_hat=q_hat,
        evaluation_policy_action_dist=evaluation_policy_action_dist,
    )
    # compute statistics of ground truth policy value
    q_pi_e = (random_behavior_feedback["reward"].reshape(
        (n_rounds, dataset.len_list)).sum(axis=1))
    gt_mean = q_pi_e.mean()
    gt_std = q_pi_e.std(ddof=1)
    print("Cascade additive")
    # check the performance of OPE
    ci_bound = gt_std * 3 / np.sqrt(q_pi_e.shape[0])
    print(f"gt_mean: {gt_mean}, 3 * gt_std / sqrt(n): {ci_bound}")
    estimated_policy_value = {
        "cascade-dr": cascade_dr_estimated_policy_value,
    }
    for key in estimated_policy_value:
        print(
            f"estimated_value: {estimated_policy_value[key]} ------ estimator: {key}, "
        )
        # test the performance of each estimator
        assert (
            np.abs(gt_mean - estimated_policy_value[key]) <= ci_bound
        ), f"OPE of {key} did not work well (absolute error is greater than 3*sigma)"

    # check if q_hat = 0 case of cascade-dr coincides with rips
    cascade_dr_estimated_policy_value_ = dr.estimate_policy_value(
        slate_id=slate_id,
        action=action,
        reward=reward,
        pscore_cascade=pscore,
        position=position,
        evaluation_policy_pscore_cascade=evaluation_policy_pscore,
        q_hat=np.zeros_like(q_hat),
        evaluation_policy_action_dist=evaluation_policy_action_dist,
    )
    rips_estimated_policy_value = rips.estimate_policy_value(
        slate_id=slate_id,
        reward=reward,
        pscore_cascade=pscore,
        position=position,
        evaluation_policy_pscore_cascade=evaluation_policy_pscore,
    )
    assert np.allclose(
        np.array([cascade_dr_estimated_policy_value_]),
        np.array([rips_estimated_policy_value]),
    )
Example No. 10
def test_cascade_dr_criterion_using_standard_additive_log():
    # set parameters
    n_unique_action = 3
    len_list = 3
    dim_context = 2
    reward_type = "binary"
    random_state = 12345
    n_rounds = 1000
    reward_structure = "standard_additive"
    click_model = None
    behavior_policy_function = linear_behavior_policy_logit
    reward_function = logistic_reward_function
    dataset = SyntheticSlateBanditDataset(
        n_unique_action=n_unique_action,
        len_list=len_list,
        dim_context=dim_context,
        reward_type=reward_type,
        reward_structure=reward_structure,
        click_model=click_model,
        random_state=random_state,
        behavior_policy_function=behavior_policy_function,
        base_reward_function=reward_function,
    )
    # obtain feedback
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    context = bandit_feedback["context"]
    action = bandit_feedback["action"]
    reward = bandit_feedback["reward"]
    pscore = bandit_feedback["pscore_cascade"]

    # random evaluation policy
    evaluation_policy_logit_ = np.ones(
        (n_rounds, n_unique_action)) / n_unique_action
    evaluation_policy_action_dist = (
        np.ones(n_rounds * len_list * n_unique_action) / n_unique_action)
    (
        _,
        _,
        evaluation_policy_pscore,
    ) = dataset.obtain_pscore_given_evaluation_policy_logit(
        action=action,
        evaluation_policy_logit_=evaluation_policy_logit_,
        return_pscore_item_position=False,
    )
    evaluation_policy_action_dist = dataset.calc_evaluation_policy_action_dist(
        action=action,
        evaluation_policy_logit_=evaluation_policy_logit_,
    )
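    # calc_ground_truth_mean_reward_function is assumed to be a test-module helper that
    # computes the true expected reward for each observed (context, action) under the
    # evaluation policy.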
    q_expected = calc_ground_truth_mean_reward_function(
        dataset=dataset,
        context=context,
        action=action,
        evaluation_policy_logit_=evaluation_policy_logit_,
    )

    # obtain q_hat and check if q_hat is effective
    cascade_dr_criterion_pass_rate = 0.7
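    # model_dict and hyperparams are assumed to be test-module mappings from a model name
    # to its regression model class and to its hyperparameters, respectively.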
    for fitting_method in ["normal", "iw"]:
        for model_name, model in model_dict.items():
            base_regression_model = SlateRegressionModel(
                base_model=model(**hyperparams[model_name]),
                len_list=len_list,
                n_unique_action=n_unique_action,
                fitting_method=fitting_method,
            )
            q_hat = base_regression_model.fit_predict(
                context=context,
                action=action,
                reward=reward,
                pscore_cascade=pscore,
                evaluation_policy_pscore_cascade=evaluation_policy_pscore,
                evaluation_policy_action_dist=evaluation_policy_action_dist,
            )
            # check the DR criterion: |q_expected - q_hat| <= |q_hat| should hold for most samples
            cascade_dr_criterion = np.abs(q_expected - q_hat) - np.abs(q_hat)
            print(
                f"DR criterion is satisfied with probability {np.mean(cascade_dr_criterion <= 0)} ------ model: {model_name} ({fitting_method}),"
            )
            assert (
                np.mean(cascade_dr_criterion <= 0) >=
                cascade_dr_criterion_pass_rate
            ), f"the DR criterion must be satisfied with probability at least {cascade_dr_criterion_pass_rate}"