def test_synthetic_sample_reward_using_valid_inputs(context, action, description):
    n_actions = 10
    dataset = SyntheticBanditDataset(n_actions=n_actions, dim_context=3)
    reward = dataset.sample_reward(context=context, action=action)
    assert isinstance(reward, np.ndarray), "Invalid response of sample_reward"
    assert reward.shape == action.shape, "Invalid response of sample_reward"
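# A minimal sketch of the (context, action, description) triples this test
# presumably receives via @pytest.mark.parametrize; the values below are
# illustrative, not the suite's actual fixtures.
valid_input_of_sample_reward = [
    # context: (n_rounds, dim_context=3); action: ints in [0, n_actions=10)
    (np.ones((2, 3)), np.array([0, 9]), "valid shapes and dtypes"),
]
# @pytest.mark.parametrize("context, action, description", valid_input_of_sample_reward)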
def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )
    # define evaluation policy using IPWLearner
    evaluation_policy = IPWLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # sample new training and test sets of synthetic logged bandit feedback
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # train the evaluation policy on the training set of the synthetic logged bandit feedback
    evaluation_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # predict the action decisions for the test set of the synthetic logged bandit feedback
    action_dist = evaluation_policy.predict(
        context=bandit_feedback_test["context"],
    )
    # estimate the mean reward function of the test set of synthetic bandit feedback with ML model
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        action_context=dataset.action_context,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        ),
    )
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback_test["context"],
        action=bandit_feedback_test["action"],
        reward=bandit_feedback_test["reward"],
        n_folds=3,  # 3-fold cross-fitting
        random_state=random_state,
    )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback_test,
        ope_estimators=ope_estimators,
    )
    relative_ee_i = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=action_dist,
        ),
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    return relative_ee_i
def process(i: int):
    # synthetic data generator with uniformly random policy
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=None,  # uniformly random
        random_state=i,
    )
    # sample new data of synthetic logged bandit feedback
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # simulate the evaluation policy
    action_dist = run_bandit_simulation(
        bandit_feedback=bandit_feedback,
        policy=evaluation_policy,
    )
    # estimate the ground-truth policy values of the evaluation policy
    # by Monte Carlo simulation using p(r|x,a), the reward distribution
    ground_truth_policy_value = calc_ground_truth_policy_value(
        bandit_feedback=bandit_feedback,
        reward_sampler=dataset.sample_reward,  # p(r|x,a)
        policy=evaluation_policy,
        n_sim=n_sim,  # the number of simulations
    )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback,
        ope_estimators=ope_estimators,
    )
    metric_i = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth_policy_value,
        action_dist=action_dist,
    )
    return metric_i
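# How these scripts typically drive `process` (a sketch assuming joblib and the
# script's `n_runs`/`n_jobs` configuration, not part of the function above):
# run the experiment for `n_runs` independent random states in parallel and
# collect the per-run metrics.
from joblib import delayed, Parallel

processed = Parallel(backend="multiprocessing", n_jobs=n_jobs)(
    [delayed(process)(i) for i in np.arange(n_runs)]
)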
def test_synthetic_sample_reward_using_invalid_inputs(context, action, description):
    n_actions = 10
    dataset = SyntheticBanditDataset(n_actions=n_actions)
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = dataset.sample_reward(context=context, action=action)
def test_synthetic_calc_policy_value_using_invalid_inputs(
    expected_reward,
    action_dist,
    description,
):
    n_actions = 10
    dataset = SyntheticBanditDataset(n_actions=n_actions)
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = dataset.calc_ground_truth_policy_value(
            expected_reward=expected_reward, action_dist=action_dist
        )
def test_synthetic_calc_policy_value_using_valid_inputs(
    expected_reward,
    action_dist,
    description,
):
    n_actions = 10
    dataset = SyntheticBanditDataset(n_actions=n_actions)
    policy_value = dataset.calc_ground_truth_policy_value(
        expected_reward=expected_reward, action_dist=action_dist
    )
    assert isinstance(
        policy_value, float
    ), "Invalid response of calc_ground_truth_policy_value"
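# Illustrative valid inputs for the test above (hypothetical values, not the
# suite's actual parametrization). `expected_reward` has shape
# (n_rounds, n_actions) and `action_dist` has shape (n_rounds, n_actions, len_list),
# with each round's distribution summing to one; the ground-truth value is
# presumably the per-round average of the expected reward weighted by the policy.
import numpy as np

n_rounds, n_actions = 5, 10
expected_reward = np.random.default_rng(12345).uniform(size=(n_rounds, n_actions))
action_dist = np.ones((n_rounds, n_actions, 1)) / n_actions  # uniform policy
# under the uniform policy, this weighted average reduces to the grand mean
assert np.isclose(
    np.average(expected_reward, weights=action_dist[:, :, 0], axis=1).mean(),
    expected_reward.mean(),
)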
def synthetic_bandit_feedback() -> BanditFeedback:
    n_actions = 10
    dim_context = 5
    random_state = 12345
    n_rounds = 10000
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=random_state,
    )
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    return bandit_feedback
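# A minimal usage sketch (the test name is hypothetical): in the test suite this
# helper is presumably registered as a pytest fixture and injected by name.
def test_fixture_shape(synthetic_bandit_feedback: BanditFeedback) -> None:
    assert synthetic_bandit_feedback["n_rounds"] == 10000
    assert synthetic_bandit_feedback["n_actions"] == 10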
def test_synthetic_init():
    # when reward_function is None, expected_reward is randomly sampled in [0, 1]
    # this check includes the test of `sample_contextfree_expected_reward` function
    dataset = SyntheticBanditDataset(n_actions=2, beta=0)
    assert len(dataset.expected_reward) == 2
    assert np.all(0 <= dataset.expected_reward) and np.all(dataset.expected_reward <= 1)
    # one-hot action_context when None is given
    ohe = np.eye(2, dtype=int)
    assert np.allclose(dataset.action_context, ohe)
def test_synthetic_init():
    # n_actions
    with pytest.raises(ValueError):
        SyntheticBanditDataset(n_actions=1)
    with pytest.raises(ValueError):
        SyntheticBanditDataset(n_actions="3")
    # dim_context
    with pytest.raises(ValueError):
        SyntheticBanditDataset(n_actions=2, dim_context=0)
    with pytest.raises(ValueError):
        SyntheticBanditDataset(n_actions=2, dim_context="2")
    # reward_type
    with pytest.raises(ValueError):
        SyntheticBanditDataset(n_actions=2, reward_type="aaa")
    # when reward_function is None, expected_reward is randomly sampled in [0, 1]
    # this check includes the test of `sample_contextfree_expected_reward` function
    dataset = SyntheticBanditDataset(n_actions=2)
    assert len(dataset.expected_reward) == 2
    assert np.all(0 <= dataset.expected_reward) and np.all(dataset.expected_reward <= 1)
    # when behavior_policy_function is None, behavior_policy is set to the uniform one
    uniform_policy = np.array([0.5, 0.5])
    assert np.allclose(dataset.behavior_policy, uniform_policy)
    # action_context
    ohe = np.eye(2, dtype=int)
    assert np.allclose(dataset.action_context, ohe)
def test_synthetic_init_using_invalid_inputs(
    n_actions,
    dim_context,
    reward_type,
    reward_std,
    beta,
    n_deficient_actions,
    action_context,
    random_state,
    err,
    description,
):
    with pytest.raises(err, match=f"{description}*"):
        _ = SyntheticBanditDataset(
            n_actions=n_actions,
            dim_context=dim_context,
            reward_type=reward_type,
            reward_std=reward_std,
            beta=beta,
            n_deficient_actions=n_deficient_actions,
            action_context=action_context,
            random_state=random_state,
        )
def test_synthetic_obtain_batch_bandit_feedback():
    # n_rounds
    with pytest.raises(ValueError):
        dataset = SyntheticBanditDataset(n_actions=2)
        dataset.obtain_batch_bandit_feedback(n_rounds=0)
    with pytest.raises(ValueError):
        dataset = SyntheticBanditDataset(n_actions=2)
        dataset.obtain_batch_bandit_feedback(n_rounds="3")
    # bandit feedback
    n_rounds = 10
    n_actions = 5
    dataset = SyntheticBanditDataset(n_actions=n_actions)
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    assert bandit_feedback["n_rounds"] == n_rounds
    assert bandit_feedback["n_actions"] == n_actions
    assert (
        bandit_feedback["context"].shape[0] == n_rounds  # n_rounds
        and bandit_feedback["context"].shape[1] == 1  # default dim_context
    )
    assert (
        bandit_feedback["action_context"].shape[0] == n_actions
        and bandit_feedback["action_context"].shape[1] == n_actions
    )
    assert (
        bandit_feedback["action"].ndim == 1
        and len(bandit_feedback["action"]) == n_rounds
    )
    assert (
        bandit_feedback["position"].ndim == 1
        and len(bandit_feedback["position"]) == n_rounds
    )
    assert (
        bandit_feedback["reward"].ndim == 1
        and len(bandit_feedback["reward"]) == n_rounds
    )
    assert (
        bandit_feedback["expected_reward"].shape[0] == n_rounds
        and bandit_feedback["expected_reward"].shape[1] == n_actions
    )
    assert (
        bandit_feedback["pscore"].ndim == 1
        and len(bandit_feedback["pscore"]) == n_rounds
    )
# configurations
n_runs = args.n_runs
n_rounds = args.n_rounds
n_actions = args.n_actions
dim_context = args.dim_context
base_model_for_evaluation_policy = args.base_model_for_evaluation_policy
base_model_for_reg_model = args.base_model_for_reg_model
n_jobs = args.n_jobs
random_state = args.random_state
np.random.seed(random_state)

# synthetic data generator
dataset = SyntheticBanditDataset(
    n_actions=n_actions,
    dim_context=dim_context,
    reward_function=logistic_reward_function,
    behavior_policy_function=linear_behavior_policy,
    random_state=random_state,
)
# define evaluation policy using IPWLearner
evaluation_policy = IPWLearner(
    n_actions=dataset.n_actions,
    len_list=dataset.len_list,
    base_classifier=base_model_dict[base_model_for_evaluation_policy](
        **hyperparams[base_model_for_evaluation_policy]
    ),
)

def process(i: int):
    # sample new training and test sets of synthetic logged bandit feedback
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
args = parser.parse_args()
print(args)

n_runs = args.n_runs
n_rounds = args.n_rounds
n_actions = args.n_actions
dim_context = args.dim_context
dim_action_context = args.dim_action_context
counterfactual_policy = args.counterfactual_policy
random_state = args.random_state
np.random.seed(random_state)

dataset = SyntheticBanditDataset(
    n_actions=n_actions,
    dim_context=dim_context,
    dim_action_context=dim_action_context,
    reward_function=logistic_reward_function,
    behavior_policy_function=linear_behavior_policy,
    random_state=random_state,
)
# hyperparameters for counterfactual policies
kwargs = dict(
    n_actions=dataset.n_actions,
    len_list=dataset.len_list,
    random_state=random_state,
)
if ("logistic" in counterfactual_policy) or ("linear" in counterfactual_policy):
    kwargs["dim"] = dim_context
if counterfactual_policy in [
    "linear_ucb",
def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )
    # regression model used to estimate the mean reward function with an ML model
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        action_context=dataset.action_context,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        ),
    )
    ope_estimator = DoublyRobust()
    # define evaluation policy using NNPolicyLearner
    nn_policy = NNPolicyLearner(
        n_actions=dataset.n_actions,
        dim_context=dim_context,
        off_policy_objective=ope_estimator.estimate_policy_value_tensor,
    )
    # baseline method 1. RandomPolicy
    random_policy = RandomPolicy(n_actions=dataset.n_actions)
    # baseline method 2. UniformSampleWeightLearner
    uniform_sample_weight_policy = UniformSampleWeightLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # sample new training and test sets of synthetic logged bandit feedback
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # estimate the mean reward function of the train set of synthetic bandit feedback
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        n_folds=3,  # 3-fold cross-fitting
        random_state=12345,
    )
    # train the evaluation policy on the training set of the synthetic logged bandit feedback
    nn_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    uniform_sample_weight_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # predict the action decisions for the test set of the synthetic logged bandit feedback
    nn_policy_action_dist = nn_policy.predict(
        context=bandit_feedback_test["context"],
    )
    random_action_dist = random_policy.predict(
        context=bandit_feedback_test["context"],
    )
    uniform_sample_weight_action_dist = uniform_sample_weight_policy.predict(
        context=bandit_feedback_test["context"],
    )
    # get the ground-truth policy value for each learner
    gt_nn_policy_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=nn_policy_action_dist,
    )
    gt_random_policy = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=random_action_dist,
    )
    gt_uniform_sample_weight_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=uniform_sample_weight_action_dist,
    )
    return gt_nn_policy_learner, gt_random_policy, gt_uniform_sample_weight_learner
def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )
    # sample new training and test sets of synthetic logged bandit data
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # defining policy learners
    ipw_policy = IPWLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    q_policy = QLearner(
        n_actions=dataset.n_actions,
        base_model=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    nn_policy = NNPolicyLearner(
        n_actions=dataset.n_actions,
        dim_context=dim_context,
        off_policy_objective="ipw",
    )
    # baseline method 1. RandomPolicy
    random_policy = RandomPolicy(n_actions=dataset.n_actions)
    # baseline method 2. UniformSampleWeightLearner
    uniform_sample_weight_policy = UniformSampleWeightLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # policy training
    ipw_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    q_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    nn_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    uniform_sample_weight_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # prediction/making decisions
    ipw_action_dist = ipw_policy.predict(
        context=bandit_feedback_test["context"],
    )
    q_action_dist = q_policy.predict(
        context=bandit_feedback_test["context"],
    )
    nn_action_dist = nn_policy.predict(
        context=bandit_feedback_test["context"],
    )
    random_action_dist = random_policy.predict(
        context=bandit_feedback_test["context"],
    )
    uniform_sample_weight_action_dist = uniform_sample_weight_policy.predict(
        context=bandit_feedback_test["context"],
    )
    # evaluation
    gt_ipw_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=ipw_action_dist,
    )
    gt_q_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=q_action_dist,
    )
    gt_nn_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=nn_action_dist,
    )
    gt_random_policy = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=random_action_dist,
    )
    gt_uniform_sample_weight_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=uniform_sample_weight_action_dist,
    )
    return (
        gt_ipw_learner,
        gt_q_learner,
        gt_nn_learner,
        gt_random_policy,
        gt_uniform_sample_weight_learner,
    )
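# A sketch (assuming pandas; the column names are illustrative) of summarizing
# the per-run ground-truth policy values, where `processed` is the list of
# 5-tuples returned by `process` over `n_runs` runs (e.g., collected via joblib
# as sketched earlier).
import pandas as pd

df = pd.DataFrame(
    processed,
    columns=["ipw_learner", "q_learner", "nn_learner", "random", "uniform_weight"],
)
print(df.agg(["mean", "std"]))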
def test_synthetic_obtain_batch_bandit_feedback():
    # n_rounds
    with pytest.raises(ValueError):
        dataset = SyntheticBanditDataset(n_actions=2)
        dataset.obtain_batch_bandit_feedback(n_rounds=0)
    with pytest.raises(TypeError):
        dataset = SyntheticBanditDataset(n_actions=2)
        dataset.obtain_batch_bandit_feedback(n_rounds="3")
    # bandit feedback
    n_rounds = 10
    n_actions = 5
    for n_deficient_actions in [0, 2]:
        dataset = SyntheticBanditDataset(
            n_actions=n_actions, beta=0, n_deficient_actions=n_deficient_actions
        )
        bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
        assert bandit_feedback["n_rounds"] == n_rounds
        assert bandit_feedback["n_actions"] == n_actions
        assert (
            bandit_feedback["context"].shape[0] == n_rounds  # n_rounds
            and bandit_feedback["context"].shape[1] == 1  # default dim_context
        )
        assert (
            bandit_feedback["action_context"].shape[0] == n_actions
            and bandit_feedback["action_context"].shape[1] == n_actions
        )
        assert (
            bandit_feedback["action"].ndim == 1
            and len(bandit_feedback["action"]) == n_rounds
        )
        assert bandit_feedback["position"] is None
        assert (
            bandit_feedback["reward"].ndim == 1
            and len(bandit_feedback["reward"]) == n_rounds
        )
        assert (
            bandit_feedback["expected_reward"].shape[0] == n_rounds
            and bandit_feedback["expected_reward"].shape[1] == n_actions
        )
        assert (
            bandit_feedback["pi_b"].shape[0] == n_rounds
            and bandit_feedback["pi_b"].shape[1] == n_actions
        )
        # when `beta=0`, behavior_policy should be uniform
        if n_deficient_actions == 0:
            uniform_policy = np.ones_like(bandit_feedback["pi_b"]) / n_actions
            assert np.allclose(bandit_feedback["pi_b"], uniform_policy)
        assert np.allclose(bandit_feedback["pi_b"][:, :, 0].sum(1), np.ones(n_rounds))
        assert (
            bandit_feedback["pi_b"] == 0
        ).sum() == n_deficient_actions * n_rounds
        assert (
            bandit_feedback["pscore"].ndim == 1
            and len(bandit_feedback["pscore"]) == n_rounds
        )
def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )
    # define evaluation policy using IPWLearner
    ipw_policy = IPWLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # baseline method 1. RandomPolicy
    random_policy = RandomPolicy(n_actions=dataset.n_actions)
    # baseline method 2. UniformSampleWeightLearner
    uniform_sample_weight_policy = UniformSampleWeightLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # sample new training and test sets of synthetic logged bandit feedback
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # train the evaluation policy on the training set of the synthetic logged bandit feedback
    ipw_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    uniform_sample_weight_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # predict the action decisions for the test set of the synthetic logged bandit feedback
    ipw_action_dist = ipw_policy.predict(
        context=bandit_feedback_test["context"],
    )
    random_action_dist = random_policy.predict(
        context=bandit_feedback_test["context"],
    )
    uniform_sample_weight_action_dist = uniform_sample_weight_policy.predict(
        context=bandit_feedback_test["context"],
    )
    # get the ground-truth policy value for each learner
    gt_ipw_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=ipw_action_dist,
    )
    gt_random_policy = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=random_action_dist,
    )
    gt_uniform_sample_weight_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=uniform_sample_weight_action_dist,
    )
    return gt_ipw_learner, gt_random_policy, gt_uniform_sample_weight_learner
# configurations
n_runs = args.n_runs
n_rounds = args.n_rounds
n_actions = args.n_actions
dim_context = args.dim_context
n_sim = args.n_sim
evaluation_policy_name = args.evaluation_policy_name
n_jobs = args.n_jobs
random_state = args.random_state
np.random.seed(random_state)

# synthetic data generator with uniformly random policy
dataset = SyntheticBanditDataset(
    n_actions=n_actions,
    dim_context=dim_context,
    reward_function=logistic_reward_function,
    behavior_policy_function=None,  # uniformly random
    random_state=random_state,
)
# define evaluation policy
evaluation_policy_dict = dict(
    bernoulli_ts=BernoulliTS(n_actions=n_actions, random_state=random_state),
    epsilon_greedy=EpsilonGreedy(
        n_actions=n_actions, epsilon=0.1, random_state=random_state
    ),
    lin_epsilon_greedy=LinEpsilonGreedy(
        dim=dim_context, n_actions=n_actions, epsilon=0.1, random_state=random_state
    ),
    lin_ts=LinTS(
        dim=dim_context,
base_model_for_evaluation_policy = args.base_model_for_evaluation_policy
base_model_for_reg_model = args.base_model_for_reg_model
ope_estimator = args.ope_estimator
n_hidden = args.n_hidden
n_layers = args.n_layers
activation = args.activation
solver = args.solver
batch_size = args.batch_size if args.batch_size else "auto"
early_stopping = args.early_stopping
random_state = args.random_state

# synthetic data generator
dataset = SyntheticBanditDataset(
    n_actions=n_actions,
    dim_context=dim_context,
    reward_function=logistic_reward_function,
    behavior_policy_function=linear_behavior_policy,
    random_state=random_state,
)
# sample new training and test sets of synthetic logged bandit feedback
bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
# estimate the mean reward function of the train set of synthetic bandit feedback with ML model
regression_model = RegressionModel(
    n_actions=dataset.n_actions,
    action_context=dataset.action_context,
    base_model=base_model_dict[base_model_for_reg_model](
        **hyperparams[base_model_for_reg_model]
    ),
)