"""Example #1: OPE performance and DR-criterion tests for slate (cascade) estimators."""

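import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor

from obp.dataset import (
    SyntheticSlateBanditDataset,
    linear_behavior_policy_logit,
    logistic_reward_function,
)
from obp.ope import (
    SlateCascadeDoubleRobust,
    SlateRegressionModel,
    SlateRewardInteractionIPS,
)

# NOTE: the fixtures below are a minimal sketch of what the surrounding test
# suite is assumed to provide. The estimator parameters match those of the
# first test; the base models and hyperparameters are illustrative stand-ins.
dr = SlateCascadeDoubleRobust(len_list=3, n_unique_action=10)
rips = SlateRewardInteractionIPS(len_list=3)
model_dict = dict(ridge=Ridge, gradient_boosting=GradientBoostingRegressor)
hyperparams = dict(
    ridge=dict(alpha=1.0),
    gradient_boosting=dict(n_estimators=30, max_depth=5, random_state=12345),
)
# `calc_ground_truth_mean_reward_function` is assumed to be a helper defined
# elsewhere in this test suite and is used as-is below.

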
def test_slate_ope_performance_using_standard_additive_log():
    # set parameters
    n_unique_action = 10
    len_list = 3
    dim_context = 2
    reward_type = "binary"
    random_state = 12345
    n_rounds = 1000
    reward_structure = "standard_additive"
    click_model = None
    behavior_policy_function = linear_behavior_policy_logit
    reward_function = logistic_reward_function
    dataset = SyntheticSlateBanditDataset(
        n_unique_action=n_unique_action,
        len_list=len_list,
        dim_context=dim_context,
        reward_type=reward_type,
        reward_structure=reward_structure,
        click_model=click_model,
        random_state=random_state,
        behavior_policy_function=behavior_policy_function,
        base_reward_function=reward_function,
    )
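    # identical dataset except that the behavior policy is uniform random
    # (behavior_policy_function=None); its on-policy rewards provide the
    # ground-truth value of the uniform random evaluation policy below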
    random_behavior_dataset = SyntheticSlateBanditDataset(
        n_unique_action=n_unique_action,
        len_list=len_list,
        dim_context=dim_context,
        reward_type=reward_type,
        reward_structure=reward_structure,
        click_model=click_model,
        random_state=random_state,
        behavior_policy_function=None,
        base_reward_function=reward_function,
    )
    # obtain feedback
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    slate_id = bandit_feedback["slate_id"]
    context = bandit_feedback["context"]
    action = bandit_feedback["action"]
    reward = bandit_feedback["reward"]
    pscore = bandit_feedback["pscore_cascade"]
    position = bandit_feedback["position"]

    # obtain random behavior feedback
    random_behavior_feedback = random_behavior_dataset.obtain_batch_bandit_feedback(
        n_rounds=n_rounds
    )
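    # random evaluation policy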
    evaluation_policy_logit_ = np.ones((n_rounds, n_unique_action)) / n_unique_action
    (
        _,
        _,
        evaluation_policy_pscore,
    ) = dataset.obtain_pscore_given_evaluation_policy_logit(
        action=action,
        evaluation_policy_logit_=evaluation_policy_logit_,
        return_pscore_item_position=False,
    )
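    # action choice probabilities of the evaluation policy for the logged actions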
    evaluation_policy_action_dist = dataset.calc_evaluation_policy_action_dist(
        action=action,
        evaluation_policy_logit_=evaluation_policy_logit_,
    )

    # obtain q_hat
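    # fitting_method="iw" trains the regression model with importance
    # weighting toward the evaluation policy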
    base_regression_model = SlateRegressionModel(
        base_model=DecisionTreeRegressor(max_depth=3, random_state=12345),
        len_list=len_list,
        n_unique_action=n_unique_action,
        fitting_method="iw",
    )
    q_hat = base_regression_model.fit_predict(
        context=context,
        action=action,
        reward=reward,
        pscore_cascade=pscore,
        evaluation_policy_pscore_cascade=evaluation_policy_pscore,
        evaluation_policy_action_dist=evaluation_policy_action_dist,
    )

    # estimate the policy value of the evaluation policy via cascade-dr
    cascade_dr_estimated_policy_value = dr.estimate_policy_value(
        slate_id=slate_id,
        action=action,
        reward=reward,
        pscore_cascade=pscore,
        position=position,
        evaluation_policy_pscore_cascade=evaluation_policy_pscore,
        q_hat=q_hat,
        evaluation_policy_action_dist=evaluation_policy_action_dist,
    )
    # compute statistics of ground truth policy value
    q_pi_e = (
        random_behavior_feedback["reward"]
        .reshape((n_rounds, dataset.len_list))
        .sum(axis=1)
    )
    gt_mean = q_pi_e.mean()
    gt_std = q_pi_e.std(ddof=1)
    print("Cascade additive")
    # check the performance of OPE
    ci_bound = gt_std * 3 / np.sqrt(q_pi_e.shape[0])
    print(f"gt_mean: {gt_mean}, 3 * gt_std / sqrt(n): {ci_bound}")
    estimated_policy_value = {
        "cascade-dr": cascade_dr_estimated_policy_value,
    }
    for key, value in estimated_policy_value.items():
        print(f"estimated_value: {value} ------ estimator: {key}")
        # test the performance of each estimator
        assert (
            np.abs(gt_mean - value) <= ci_bound
        ), f"OPE of {key} did not work well (absolute error is greater than 3*sigma)"

    # check if q_hat = 0 case of cascade-dr coincides with rips
    cascade_dr_estimated_policy_value_ = dr.estimate_policy_value(
        slate_id=slate_id,
        action=action,
        reward=reward,
        pscore_cascade=pscore,
        position=position,
        evaluation_policy_pscore_cascade=evaluation_policy_pscore,
        q_hat=np.zeros_like(q_hat),
        evaluation_policy_action_dist=evaluation_policy_action_dist,
    )
    rips_estimated_policy_value = rips.estimate_policy_value(
        slate_id=slate_id,
        reward=reward,
        pscore_cascade=pscore,
        position=position,
        evaluation_policy_pscore_cascade=evaluation_policy_pscore,
    )
    assert np.allclose(
        cascade_dr_estimated_policy_value_, rips_estimated_policy_value
    )


def test_cascade_dr_criterion_using_standard_additive_log():
    # set parameters
    n_unique_action = 3
    len_list = 3
    dim_context = 2
    reward_type = "binary"
    random_state = 12345
    n_rounds = 1000
    reward_structure = "standard_additive"
    click_model = None
    behavior_policy_function = linear_behavior_policy_logit
    reward_function = logistic_reward_function
    dataset = SyntheticSlateBanditDataset(
        n_unique_action=n_unique_action,
        len_list=len_list,
        dim_context=dim_context,
        reward_type=reward_type,
        reward_structure=reward_structure,
        click_model=click_model,
        random_state=random_state,
        behavior_policy_function=behavior_policy_function,
        base_reward_function=reward_function,
    )
    # obtain feedback
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    context = bandit_feedback["context"]
    action = bandit_feedback["action"]
    reward = bandit_feedback["reward"]
    pscore = bandit_feedback["pscore_cascade"]

    # random evaluation policy
    evaluation_policy_logit_ = np.ones((n_rounds, n_unique_action)) / n_unique_action
    (
        _,
        _,
        evaluation_policy_pscore,
    ) = dataset.obtain_pscore_given_evaluation_policy_logit(
        action=action,
        evaluation_policy_logit_=evaluation_policy_logit_,
        return_pscore_item_position=False,
    )
    evaluation_policy_action_dist = dataset.calc_evaluation_policy_action_dist(
        action=action,
        evaluation_policy_logit_=evaluation_policy_logit_,
    )
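    # ground-truth expected reward for each observed (context, action),
    # used as the reference when evaluating q_hat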
    q_expected = calc_ground_truth_mean_reward_function(
        dataset=dataset,
        context=context,
        action=action,
        evaluation_policy_logit_=evaluation_policy_logit_,
    )

    # obtain q_hat and check if q_hat is effective
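    # q_hat is considered effective if the DR criterion below holds
    # for at least 70% of samples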
    cascade_dr_criterion_pass_rate = 0.7
    for fitting_method in ["normal", "iw"]:
        for model_name, model in model_dict.items():
            base_regression_model = SlateRegressionModel(
                base_model=model(**hyperparams[model_name]),
                len_list=len_list,
                n_unique_action=n_unique_action,
                fitting_method=fitting_method,
            )
            q_hat = base_regression_model.fit_predict(
                context=context,
                action=action,
                reward=reward,
                pscore_cascade=pscore,
                evaluation_policy_pscore_cascade=evaluation_policy_pscore,
                evaluation_policy_action_dist=evaluation_policy_action_dist,
            )
            # compare dr criterion: |q_expected - q_hat| should not exceed |q_hat|
            cascade_dr_criterion = np.abs(q_expected - q_hat) - np.abs(q_hat)
            print(
                f"DR criterion is satisfied with probability {np.mean(cascade_dr_criterion <= 0)} ------ model: {model_name} ({fitting_method}),"
            )
            assert (
                np.mean(cascade_dr_criterion <= 0) >= cascade_dr_criterion_pass_rate
            ), f"DR criterion should be satisfied with probability of at least {cascade_dr_criterion_pass_rate}"