Example #1
def test_ipw_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Test the format of ipw variants using synthetic bandit data and random evaluation policy
    """
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    # ipw estimators can be used without estimated_rewards_by_reg_model
    for estimator in [ipw, snipw]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"
    # remove necessary keys
    del input_dict["reward"]
    del input_dict["pscore"]
    del input_dict["action"]
    for estimator in [ipw, snipw]:
        with pytest.raises(
            TypeError,
            match=re.escape(
                "estimate_policy_value() missing 3 required positional arguments: 'reward', 'action', and 'pscore'"
            ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)

    input_tensor_dict = {
        k: v if v is None else torch.from_numpy(v)
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_tensor_dict["action_dist"] = torch.from_numpy(action_dist)
    for estimator in [ipw, snipw]:
        estimated_policy_value = estimator.estimate_policy_value_tensor(
            **input_tensor_dict
        )
        assert isinstance(
            estimated_policy_value, torch.Tensor
        ), f"invalid type response: {estimator}"
    # remove necessary keys
    del input_tensor_dict["reward"]
    del input_tensor_dict["pscore"]
    del input_tensor_dict["action"]
    for estimator in [ipw, snipw]:
        with pytest.raises(
            TypeError,
            match=re.escape(
                "estimate_policy_value_tensor() missing 3 required positional arguments: 'reward', 'action', and 'pscore'"
            ),
        ):
            _ = estimator.estimate_policy_value_tensor(**input_tensor_dict)
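
For orientation, here is a minimal numpy sketch of the plain and self-normalized IPW estimates these tests exercise; the uniform behavior policy and the variable names are illustrative assumptions, not the obp API.

import numpy as np

# Minimal IPW / SNIPW sketch (illustrative only, not the obp implementation).
rng = np.random.default_rng(0)
n_rounds, n_actions = 1000, 4
action = rng.integers(n_actions, size=n_rounds)             # logged actions
reward = rng.binomial(1, 0.3, size=n_rounds)                # logged binary rewards
pscore = np.full(n_rounds, 1.0 / n_actions)                 # behavior-policy propensities (uniform)
pi_e = rng.dirichlet(np.ones(n_actions), size=n_rounds)     # evaluation policy pi_e(a|x)

iw = pi_e[np.arange(n_rounds), action] / pscore             # importance weights
ipw_value = np.mean(iw * reward)                            # IPW estimate
snipw_value = np.sum(iw * reward) / np.sum(iw)              # self-normalized variant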
Example #2
def test_dr_shrinkage_using_random_evaluation_policy(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the dr shrinkage estimators using synthetic bandit data and random evaluation policy
    """
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :,
                                                                   np.newaxis]
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    dm_value = dm.estimate_policy_value(**input_dict)
    dr_value = dr.estimate_policy_value(**input_dict)
    dr_shrink_0_value = dr_shrink_0.estimate_policy_value(**input_dict)
    dr_shrink_max_value = dr_shrink_max.estimate_policy_value(**input_dict)
    assert (
        dm_value == dr_shrink_0_value
    ), "DoublyRobustWithShrinkage (lambda=0) should be the same as DirectMethod"
    assert (
        np.abs(dr_value - dr_shrink_max_value) < 1e-5
    ), "DoublyRobustWithShrinkage (lambda=inf) should be almost the same as DoublyRobust"

    # prepare input dict
    input_tensor_dict = {
        k: v if v is None else torch.from_numpy(v)
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_tensor_dict["action_dist"] = torch.from_numpy(action_dist)
    input_tensor_dict["estimated_rewards_by_reg_model"] = torch.from_numpy(
        expected_reward)
    dm_value = dm.estimate_policy_value_tensor(**input_tensor_dict)
    dr_value = dr.estimate_policy_value_tensor(**input_tensor_dict)
    dr_shrink_0_value = dr_shrink_0.estimate_policy_value_tensor(
        **input_tensor_dict)
    dr_shrink_max_value = dr_shrink_max.estimate_policy_value_tensor(
        **input_tensor_dict)
    assert (
        dm_value.item() == dr_shrink_0_value.item()
    ), "DoublyRobustWithShrinkage (lambda=0) should be the same as DirectMethod"
    assert (
        np.abs(dr_value.item() - dr_shrink_max_value.item()) < 1e-5
    ), "DoublyRobustWithShrinkage (lambda=inf) should be almost the same as DoublyRobust"
Example #3
def test_boundedness_of_snipw_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Test the boundedness of snipw estimators using synthetic bandit data and random evaluation policy
    """
    action_dist = random_action_dist
    # prepare snipw
    snipw = SelfNormalizedInverseProbabilityWeighting()
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    # make pscore too small (to check the boundedness of snipw)
    input_dict["pscore"] = input_dict["pscore"] ** 3
    estimated_policy_value = snipw.estimate_policy_value(**input_dict)
    assert (
        estimated_policy_value <= 1
    ), f"estimated policy value of snipw should be smaller than or equal to 1 (because of its 1-boundedness), but the value is: {estimated_policy_value}"

    # ipw with estimated pscore
    snipw_estimated_pscore = SelfNormalizedInverseProbabilityWeighting(
        use_estimated_pscore=True
    )
    input_dict["estimated_pscore"] = input_dict["pscore"]
    del input_dict["pscore"]
    estimated_policy_value = snipw_estimated_pscore.estimate_policy_value(**input_dict)
    assert (
        estimated_policy_value <= 1
    ), f"estimated policy value of snipw should be smaller than or equal to 1 (because of its 1-boundedness), but the value is: {estimated_policy_value}"
Example #4
def test_switch_dr_using_random_evaluation_policy(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the switch_dr using synthetic bandit data and random evaluation policy
    """
    expected_reward = np.expand_dims(
        synthetic_bandit_feedback["expected_reward"], axis=-1)
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    dm_value = dm.estimate_policy_value(**input_dict)
    dr_value = dr.estimate_policy_value(**input_dict)
    switch_dr_0_value = switch_dr_0.estimate_policy_value(**input_dict)
    switch_dr_max_value = switch_dr_max.estimate_policy_value(**input_dict)
    assert (dm_value == switch_dr_0_value
            ), "SwitchDR (tau=0) should be the same as DirectMethod"
    assert (dr_value == switch_dr_max_value
            ), "SwitchDR (tau=1e10) should be the same as DoublyRobust"
Example #5
def test_dr_using_random_evaluation_policy(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the format of dr variants using synthetic bandit data and random evaluation policy
    """
    expected_reward = np.expand_dims(
        synthetic_bandit_feedback["expected_reward"], axis=-1)
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    # dr estimators require all arguments
    for estimator in dr_estimators:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(estimated_policy_value,
                          float), f"invalid type response: {estimator}"
    # remove necessary keys
    del input_dict["reward"]
    del input_dict["pscore"]
    del input_dict["action"]
    del input_dict["estimated_rewards_by_reg_model"]
    for estimator in dr_estimators:
        with pytest.raises(
                TypeError,
                match=re.escape(
                    "estimate_policy_value() missing 4 required positional arguments: 'reward', 'action', 'pscore', and 'estimated_rewards_by_reg_model'"
                ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)
Example #6
def test_sg_dr_using_random_evaluation_policy(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the sg_dr using synthetic bandit data and random evaluation policy
    """
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :,
                                                                   np.newaxis]
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    dr_value = dr.estimate_policy_value(**input_dict)
    sg_dr_0_value = sg_dr_0.estimate_policy_value(**input_dict)
    assert (dr_value == sg_dr_0_value
            ), "SG-DR (lambda=0) should be the same as DoublyRobust"
    input_dict["estimated_pscore"] = input_dict["pscore"]
    del input_dict["pscore"]
    dr_value_estimated_pscore = dr_estimated_pscore.estimate_policy_value(
        **input_dict)
    assert (
        dr_value == dr_value_estimated_pscore
    ), "DoublyRobust with estimated_pscore (which is the same as pscore) should be the same as DoublyRobust"
Example #7
def test_boundedness_of_sndr_using_random_evaluation_policy(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the boundedness of sndr estimators using synthetic bandit data and random evaluation policy
    """
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :,
                                                                   np.newaxis]
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    # make pscore too small (to check the boundedness of sndr)
    input_dict["pscore"] = input_dict["pscore"]**3
    estimated_policy_value = sndr.estimate_policy_value(**input_dict)
    assert (
        estimated_policy_value <= 2
    ), f"estimated policy value of sndr should be smaller than or equal to 2 (because of its 2-boundedness), but the value is: {estimated_policy_value}"

    # prepare input dict
    input_tensor_dict = {
        k: v if v is None else torch.from_numpy(v)
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_tensor_dict["action_dist"] = torch.from_numpy(action_dist)
    input_tensor_dict["estimated_rewards_by_reg_model"] = torch.from_numpy(
        expected_reward)
    # make pscore too small (to check the boundedness of sndr)
    input_tensor_dict["pscore"] = input_tensor_dict["pscore"]**3
    estimated_policy_value = sndr.estimate_policy_value_tensor(
        **input_tensor_dict)
    assert (
        estimated_policy_value.item() <= 2
    ), f"estimated policy value of sndr should be smaller than or equal to 2 (because of its 2-boundedness), but the value is: {estimated_policy_value.item()}"
Example #8
def test_ipw_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Test the format of ipw variants using synthetic bandit data and random evaluation policy
    """
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    # ipw estimators can be used without estimated_rewards_by_reg_model
    for estimator in [ipw, snipw, ipw_tuning_mse, ipw_tuning_slope]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"

    # ipw with estimated pscore
    ipw_estimated_pscore = InverseProbabilityWeighting(use_estimated_pscore=True)
    snipw_estimated_pscore = SelfNormalizedInverseProbabilityWeighting(
        use_estimated_pscore=True
    )
    ipw_tuning_estimated_pscore = InverseProbabilityWeightingTuning(
        lambdas=[10, 1000], use_estimated_pscore=True
    )
    input_dict["estimated_pscore"] = input_dict["pscore"]
    del input_dict["pscore"]
    for estimator in [
        ipw_estimated_pscore,
        snipw_estimated_pscore,
        ipw_tuning_estimated_pscore,
    ]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"

    # remove necessary keys
    del input_dict["reward"]
    del input_dict["action"]
    for estimator in [ipw, snipw]:
        with pytest.raises(
            TypeError,
            match=re.escape(
                "estimate_policy_value() missing 2 required positional arguments: 'reward' and 'action'"
            ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)
Example #9
def test_dr_using_random_evaluation_policy(
        synthetic_multi_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the format of dr variants using synthetic bandit data and random evaluation policy
    """
    expected_reward = synthetic_multi_bandit_feedback[
        "expected_reward"][:, :, np.newaxis]
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_multi_bandit_feedback.items() if k in [
            "reward", "action", "pscore", "pscore_avg", "stratum_idx",
            "position"
        ]
    }
    input_dict["action_dist"] = action_dist
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    naive_dr = NaiveDR()
    bal_dr = BalDR()
    weighted_dr = WeightedDR()
    # dr estimators require all arguments
    for estimator in [naive_dr, bal_dr, weighted_dr]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(estimated_policy_value,
                          float), f"invalid type response: {estimator}"

    naive_dr = NaiveDR(use_estimated_pscore=True)
    bal_dr = BalDR(use_estimated_pscore=True)
    weighted_dr = WeightedDR(use_estimated_pscore=True)
    input_dict["estimated_pscore"] = input_dict["pscore"]
    input_dict["estimated_pscore_avg"] = input_dict["pscore"]
    # dr estimators require all arguments
    for estimator in [naive_dr, bal_dr, weighted_dr]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(estimated_policy_value,
                          float), f"invalid type response: {estimator}"

    # remove necessary keys
    del input_dict["reward"]
    del input_dict["action"]
    del input_dict["estimated_rewards_by_reg_model"]
    for estimator in [naive_dr, bal_dr, weighted_dr]:
        with pytest.raises(
                TypeError,
                match=re.escape(
                    "estimate_policy_value() missing 3 required positional arguments: 'reward', 'action', and 'estimated_rewards_by_reg_model'"
                ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)
Example #10
def test_ipw_using_random_evaluation_policy(
        synthetic_multi_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the format of ipw variants using synthetic bandit data and random evaluation policy
    """
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_multi_bandit_feedback.items() if k in [
            "reward", "action", "pscore", "pscore_avg", "stratum_idx",
            "position"
        ]
    }
    input_dict["action_dist"] = action_dist
    naive_ipw = NaiveIPW()
    bal_ipw = BalIPW()
    weighted_ipw = WeightedIPW()
    # ipw estimators can be used without estimated_rewards_by_reg_model
    for estimator in [naive_ipw, bal_ipw, weighted_ipw]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(estimated_policy_value,
                          float), f"invalid type response: {estimator}"

    # ipw with estimated pscore
    naive_ipw = NaiveIPW(use_estimated_pscore=True)
    bal_ipw = BalIPW(use_estimated_pscore=True)
    weighted_ipw = WeightedIPW(use_estimated_pscore=True)
    input_dict["estimated_pscore"] = input_dict["pscore"]
    input_dict["estimated_pscore_avg"] = input_dict["pscore"]
    del input_dict["pscore"]
    del input_dict["pscore_avg"]
    for estimator in [naive_ipw, bal_ipw, weighted_ipw]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(estimated_policy_value,
                          float), f"invalid type response: {estimator}"

    # remove necessary keys
    del input_dict["reward"]
    del input_dict["action"]
    for estimator in [naive_ipw, weighted_ipw, bal_ipw]:
        with pytest.raises(
                TypeError,
                match=re.escape(
                    "estimate_policy_value() missing 2 required positional arguments: 'reward' and 'action'"
                ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)
Example #11
def test_dm_using_random_evaluation_policy(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the performance of the direct method using synthetic bandit data and random evaluation policy
    """
    expected_reward = np.expand_dims(
        synthetic_bandit_feedback["expected_reward"], axis=-1)
    action_dist = random_action_dist
    # compute ground truth policy value using expected reward
    q_pi_e = np.average(expected_reward[:, :, 0],
                        weights=action_dist[:, :, 0],
                        axis=1)
    # compute statistics of ground truth policy value
    gt_mean = q_pi_e.mean()
    # prepare dm
    dm = DirectMethod()
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    # estimated_rewards_by_reg_model is required
    with pytest.raises(
            TypeError,
            match=re.escape(
                "estimate_policy_value() missing 1 required positional argument: 'estimated_rewards_by_reg_model'"
            ),
    ):
        _ = dm.estimate_policy_value(**input_dict)
    # add estimated_rewards_by_reg_model
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    # check expectation
    estimated_policy_value = dm.estimate_policy_value(**input_dict)
    assert (gt_mean == estimated_policy_value
            ), "DM should be perfect when the regression model is perfect"
    # remove unnecessary keys
    del input_dict["reward"]
    del input_dict["pscore"]
    del input_dict["action"]
    estimated_policy_value = dm.estimate_policy_value(**input_dict)
    assert (gt_mean == estimated_policy_value
            ), "DM should be perfect when the regression model is perfect"
Example #12
def test_ipw_using_random_evaluation_policy(
    synthetic_bandit_feedback_with_embed: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Test the format of ipw variants using synthetic bandit data and random evaluation policy
    """
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback_with_embed.items()
        if k in ["reward", "action", "pi_b", "action_embed", "context", "position"]
    }
    input_dict["action_dist"] = action_dist
    mipw = MIPW(n_actions=synthetic_bandit_feedback_with_embed["n_actions"])
    mipw_exact = MIPW(
        n_actions=synthetic_bandit_feedback_with_embed["n_actions"],
        embedding_selection_method="exact",
    )
    mipw_greedy = MIPW(
        n_actions=synthetic_bandit_feedback_with_embed["n_actions"],
        embedding_selection_method="greedy",
    )
    snmipw = SNMIPW(n_actions=synthetic_bandit_feedback_with_embed["n_actions"])
    # ipw estimators can be used without estimated_rewards_by_reg_model
    for estimator in [mipw, mipw_exact, mipw_greedy, snmipw]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"

    # remove necessary keys
    del input_dict["reward"]
    del input_dict["action"]
    for estimator in [mipw, snmipw]:
        with pytest.raises(
            TypeError,
            match=re.escape(
                "estimate_policy_value() missing 2 required positional arguments: 'reward' and 'action'"
            ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)
Example #13
def test_bipw_using_random_evaluation_policy(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the format of bipw variants using synthetic bandit data and random evaluation policy
    """
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    # insert dummy values
    input_dict["estimated_importance_weights"] = np.ones(action_dist.shape[0])
    # check response
    for estimator in [bipw]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(estimated_policy_value,
                          float), f"invalid type response: {estimator}"

    # make estimated_importance_weights too small (to check the boundedness of bipw)
    input_dict["estimated_importance_weights"] = input_dict["pscore"]**3
    estimated_policy_value = bipw.estimate_policy_value(**input_dict)
    assert (
        estimated_policy_value <= 1
    ), f"estimated policy value of bipw should be smaller than or equal to 1 (because of its 1-boundedness), but the value is: {estimated_policy_value}"

    # remove necessary keys
    del input_dict["reward"]
    del input_dict["action"]
    del input_dict["estimated_importance_weights"]
    for estimator in [bipw]:
        with pytest.raises(
                TypeError,
                match=re.escape(
                    "estimate_policy_value() missing 3 required positional arguments: 'reward', 'action', and 'estimated_importance_weights'"
                ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)
Example #14
def test_dr_using_random_evaluation_policy(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the format of dr variants using synthetic bandit data and random evaluation policy
    """
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :,
                                                                   np.newaxis]
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    # dr estimators require all arguments
    for estimator in dr_estimators:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(estimated_policy_value,
                          float), f"invalid type response: {estimator}"
    # remove necessary keys
    del input_dict["reward"]
    del input_dict["pscore"]
    del input_dict["action"]
    del input_dict["estimated_rewards_by_reg_model"]
    for estimator in dr_estimators:
        with pytest.raises(
                TypeError,
                match=re.escape(
                    "estimate_policy_value() missing 4 required positional arguments: 'reward', 'action', 'pscore', and 'estimated_rewards_by_reg_model'"
                ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)

    # prepare input dict
    input_tensor_dict = {
        k: v if v is None else torch.from_numpy(v)
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_tensor_dict["action_dist"] = torch.from_numpy(action_dist)
    input_tensor_dict["estimated_rewards_by_reg_model"] = torch.from_numpy(
        expected_reward)
    # dr estimators require all arguments
    for estimator in dr_estimators:
        if estimator.estimator_name == "switch-dr":
            with pytest.raises(
                    NotImplementedError,
                    match=re.escape(
                        "This is not implemented for Swtich-DR because it is indifferentiable."
                    ),
            ):
                _ = estimator.estimate_policy_value_tensor(**input_tensor_dict)
        else:
            estimated_policy_value = estimator.estimate_policy_value_tensor(
                **input_tensor_dict)
            assert isinstance(
                estimated_policy_value,
                torch.Tensor), f"invalid type response: {estimator}"
    # remove necessary keys
    del input_tensor_dict["reward"]
    del input_tensor_dict["pscore"]
    del input_tensor_dict["action"]
    del input_tensor_dict["estimated_rewards_by_reg_model"]
    for estimator in dr_estimators:
        if estimator.estimator_name == "switch-dr":
            with pytest.raises(
                    NotImplementedError,
                    match=re.escape(
                        "This is not implemented for Swtich-DR because it is indifferentiable."
                    ),
            ):
                _ = estimator.estimate_policy_value_tensor(**input_tensor_dict)
        else:
            with pytest.raises(
                    TypeError,
                    match=re.escape(
                        "estimate_policy_value_tensor() missing 4 required positional arguments: 'reward', 'action', 'pscore', and 'estimated_rewards_by_reg_model'"
                    ),
            ):
                _ = estimator.estimate_policy_value_tensor(**input_tensor_dict)