示例#1
0
def test_standardization_matches_causallib(linear_data_pandas):
    """Check that ``StandardizationEstimator`` reproduces causallib's estimates.

    Both estimators are fitted on the same ``(w, t, y)`` triple unpacked from
    the ``linear_data_pandas`` fixture (presumably covariates, treatment and
    outcome -- confirm against the fixture definition).  The average treatment
    effect (ATE) and the per-row individual treatment effects (ITE) of the two
    implementations are then compared.
    """
    # Local import: the file-level import block is not guaranteed to provide it.
    import math

    w, t, y = linear_data_pandas

    # Reference implementation: causallib standardization around a linear model.
    causallib_standardization = Standardization(LinearRegression())
    causallib_standardization.fit(w, t, y)

    # Per-row potential outcomes, indexed by treatment value (0 / 1).
    individual_potential_outcomes = (
        causallib_standardization.estimate_individual_outcome(w, t))
    causallib_ite_estimates = (individual_potential_outcomes[1] -
                               individual_potential_outcomes[0])

    # Population-level potential outcomes, indexed by treatment value (0 / 1).
    mean_potential_outcomes = (
        causallib_standardization.estimate_population_outcome(w, t))
    causallib_ate_estimate = (mean_potential_outcomes[1] -
                              mean_potential_outcomes[0])

    # Implementation under test.
    standardization = StandardizationEstimator()
    standardization.fit(w, t, y)

    # Two independently fitted models may differ in the last few bits, so the
    # scalar ATE is compared with a tight tolerance instead of exact equality
    # (the ITE check below is tolerance-based as well).
    ate_estimate = standardization.estimate_ate()
    assert math.isclose(causallib_ate_estimate,
                        ate_estimate,
                        rel_tol=1e-9,
                        abs_tol=1e-12), (
        f"ATE mismatch: causallib={causallib_ate_estimate!r}, "
        f"estimator={ate_estimate!r}")
    pd.testing.assert_series_equal(causallib_ite_estimates,
                                   standardization.estimate_ite())
示例#2
0
class TestStandardizationClassification(TestStandardizationCommon):
    """Tests of ``Standardization`` when the outcome is a three-class label."""

    @classmethod
    def setUpClass(cls):
        """Build the three-class dataset shared by all tests in this class.

        The result is stored on ``cls.data_3cls`` as a dict with keys ``"X"``
        (DataFrame), ``"a"`` (binary Series) and ``"y"`` (Series of labels).
        """
        # Three-class outcome, since decision_function might return a vector when n_classes=2, and we wish to check the
        # matrix form of the output behaves as expected.
        # random_state is pinned so every run is exercised on the same data.
        X, y = make_classification(n_features=3,
                                   n_informative=2,
                                   n_redundant=0,
                                   n_repeated=0,
                                   n_classes=3,
                                   n_clusters_per_class=1,
                                   flip_y=0.0,
                                   class_sep=10.0,
                                   random_state=0)
        # The last generated column is split off and binarized at its median
        # to serve as the treatment indicator; the rest stay as features.
        X, a = X[:, :-1], X[:, -1]
        a = (a > np.median(a)).astype(int)
        cls.data_3cls = {
            "X": pd.DataFrame(X),
            "a": pd.Series(a),
            "y": pd.Series(y)
        }

        # NOTE(review): disabled two-class fixture, kept for reference only.
        # X, y = make_classification(n_features=2, n_informative=1, n_redundant=0, n_repeated=0, n_classes=2,
        #                            n_clusters_per_class=1, flip_y=0.0, class_sep=10.0)
        # X, a = X[:, :-1], X[:, -1]
        # a = (a > np.median(a)).astype(int)
        # cls.data_2cls = {"X": pd.DataFrame(X), "a": pd.Series(a), "y": pd.Series(y)}

    def verify_individual_multiclass_output(self):
        """Fit ``self.estimator`` on the 3-class data and check the output layout.

        ``self.estimator`` must be set by the calling test beforehand.  The
        individual-outcome frame is expected to have one row per sample and
        one column per (treatment value, outcome class) pair, with column
        levels named ``"a"`` and ``"y"``.

        Returns:
            The individual-outcome frame, so callers can run further checks.
        """
        self.estimator.fit(self.data_3cls["X"], self.data_3cls["a"],
                           self.data_3cls["y"])
        ind_outcome = self.estimator.estimate_individual_outcome(
            self.data_3cls["X"], self.data_3cls["a"])

        with self.subTest("Output size, # samples:"):
            self.assertEqual(self.data_3cls["X"].shape[0],
                             ind_outcome.shape[0])
        with self.subTest("Output size, # predictions:"):
            with self.subTest(
                    "Output's multiindex level names are describing treatment and outcome"
            ):
                self.assertEqual(["a", "y"], ind_outcome.columns.names)
            with self.subTest(
                    "Output's number of predictions is the same as number of outcome and treatment values"
            ):
                self.assertEqual(
                    self.data_3cls["a"].nunique() *
                    self.data_3cls["y"].nunique(), ind_outcome.shape[1])
                self.assertEqual(
                    self.data_3cls["a"].nunique(),
                    ind_outcome.columns.get_level_values("a").unique().size)
                self.assertEqual(
                    self.data_3cls["y"].nunique(),
                    ind_outcome.columns.get_level_values("y").unique().size)
        return ind_outcome

    def test_predict_proba(self):
        """With ``predict_proba=True`` each treatment's class scores sum to 1."""
        self.estimator = Standardization(LogisticRegression(C=1e6,
                                                            solver='lbfgs'),
                                         predict_proba=True)
        ind_outcome = self.verify_individual_multiclass_output()
        with self.subTest("Test results are probabilities - sum to 1:"):
            # Column-wise groupby (axis="columns") is deprecated in recent
            # pandas, so each treatment's block of columns is selected
            # explicitly from the "a" level instead.
            treatment_values = ind_outcome.columns.get_level_values(
                "a").unique()
            for treatment_value in treatment_values:
                y_pred = ind_outcome.xs(treatment_value,
                                        axis="columns",
                                        level="a")
                pd.testing.assert_series_equal(
                    pd.Series(1.0, index=y_pred.index),
                    y_pred.sum(axis="columns"))

    def test_decision_function(self):
        """Output layout check with an ``SVC`` learner and ``predict_proba=True``."""
        self.estimator = Standardization(SVC(decision_function_shape='ovr'),
                                         predict_proba=True)
        self.verify_individual_multiclass_output()

    def test_predict(self):
        """With ``predict_proba=False`` there is one output column per treatment."""
        self.estimator = Standardization(LogisticRegression(C=1e6,
                                                            solver='lbfgs'),
                                         predict_proba=False)
        self.estimator.fit(self.data_3cls["X"], self.data_3cls["a"],
                           self.data_3cls["y"])
        ind_outcome = self.estimator.estimate_individual_outcome(
            self.data_3cls["X"], self.data_3cls["a"])
        with self.subTest("Output size, # predictions:"):
            # One column per treatment value, not one per outcome class.
            self.assertEqual(self.data_3cls["a"].nunique(),
                             ind_outcome.shape[1])
            self.assertNotEqual(self.data_3cls["y"].nunique(),
                                ind_outcome.shape[1])