def test_warm_start(self):
        """Check that a fitted analysis can be refit with ``warm_start=True``.

        For each task type (classification first, then regression) the test
        fits on treatment columns 1, 3 and 5, queries global and local
        effects, then sets ``feature_inds`` to include column 2 and refits
        with ``warm_start=True`` before querying the effects again. It passes
        as long as none of these calls raises.
        """
        n_rows = 500
        for classification in (True, False):
            # dgp: five gaussian columns, then a binary and a three-level column
            gaussian_cols = np.random.normal(0, 1, size=(n_rows, 5))
            binary_col = np.random.choice([0, 1], size=(n_rows, 1))
            three_level_col = np.random.choice([0, 1, 2], size=(n_rows, 1))
            features = np.hstack((gaussian_cols, binary_col, three_level_col))
            column_names = [f"x{i} " for i in range(7)]
            X_df = pd.DataFrame(features, columns=column_names)
            y = np.random.choice([0, 1], size=(n_rows, ))
            # Series wrapper around the outcome; not passed to the fits below
            y_df = pd.Series(y)

            def query_effects():
                # only exercises the calls; the returned frames are not inspected
                ca.global_causal_effect(alpha=0.05)
                ca.local_causal_effect(X_df, alpha=0.05)

            # model
            ca = CausalAnalysis([1, 3, 5],
                                [5, 6],
                                heterogeneity_inds=[0, 1, 2],
                                classification=classification,
                                nuisance_models='linear',
                                heterogeneity_model="linear",
                                n_jobs=-1)
            ca.fit(X_df, y)
            query_effects()

            # widen the treatment set and refit with warm_start enabled
            ca.feature_inds = [1, 2, 3, 5]
            ca.fit(X_df, y, warm_start=True)
            query_effects()
示例#2
0
 def test_empty_hinds(self):
     """Fit and query the analysis when every heterogeneity list is empty.

     ``heterogeneity_inds`` is given as a list of three empty lists
     (presumably one per treatment feature -- confirm against the
     CausalAnalysis documentation). Both final-model types and both task
     types are exercised; the test passes if fitting, effect queries and
     the policy tree output for each treatment complete without raising.
     """
     n_rows = 500
     combos = [(model_name, is_clf)
               for model_name in ['linear', 'forest']
               for is_clf in [True, False]]
     for h_model, classification in combos:
         # five gaussian columns, then a binary and a three-level column
         gaussian_cols = np.random.normal(0, 1, size=(n_rows, 5))
         binary_col = np.random.choice([0, 1], size=(n_rows, 1))
         three_level_col = np.random.choice([0, 1, 2], size=(n_rows, 1))
         features = np.hstack((gaussian_cols, binary_col, three_level_col))
         column_names = [f"x{i} " for i in range(7)]
         X_df = pd.DataFrame(features, columns=column_names)
         y = np.random.choice([0, 1], size=(n_rows, ))
         # Series wrapper around the outcome; not passed to fit below
         y_df = pd.Series(y)

         # model
         feat_inds = [1, 3, 5]
         ca = CausalAnalysis(feat_inds,
                             [5, 6],
                             heterogeneity_inds=[[], [], []],
                             classification=classification,
                             nuisance_models='linear',
                             heterogeneity_model=h_model,
                             n_jobs=-1)
         ca.fit(X_df, y)
         ca.global_causal_effect(alpha=0.05)
         ca.local_causal_effect(X_df, alpha=0.05)
         for treatment in feat_inds:
             # unpacking verifies the output has exactly three components
             _tree, _value, _always_treat = ca._policy_tree_output(
                 X_df, treatment)
    def test_forest_with_pandas(self):
        """Exercise the forest heterogeneity model on a frame with named columns.

        Checks the index structure and point-estimate shapes of the global,
        cohort and local effect outputs, runs the tree helpers, and expects
        an ``AssertionError`` from the policy tree for the three-level
        categorical treatment ``'d'``.
        """
        n_rows = 500
        y = pd.Series(np.random.choice([0, 1], size=(n_rows, )))
        X = pd.DataFrame({
            'a': np.random.normal(size=n_rows),
            'b': np.random.normal(size=n_rows),
            'c': np.random.choice([0, 1], size=n_rows),
            'd': np.random.choice(['a', 'b', 'c'], size=n_rows)
        })
        inds = ['a', 'b', 'c', 'd']
        cats = ['c', 'd']
        hinds = ['a', 'd']
        first_two = X[:2]

        ca = CausalAnalysis(inds, cats, hinds, heterogeneity_model='forest')
        ca.fit(X, y)
        glo = ca.global_causal_effect()
        coh = ca.cohort_causal_effect(first_two)
        loc = ca.local_causal_effect(first_two)

        # cohort output must be indexed exactly like the global output
        assert glo.index.equals(coh.index)

        # local output has one copy of the global index per row passed in
        assert len(loc.index) == 2 * len(glo.index)

        assert glo.index.names == ['feature', 'feature_value']
        assert loc.index.names == ['sample'] + glo.index.names

        # expected feature label per row; a categorical with k levels
        # contributes k-1 rows
        fts = ['a', 'b', 'c', 'd', 'd']
        n_fts = len(fts)
        for pos, expected in enumerate(fts):
            global_label = glo.index[pos][0]
            assert expected == global_label
            first_sample_label = loc.index[pos][1]
            assert global_label == first_sample_label
            assert first_sample_label == loc.index[n_fts + pos][1]

        effect_dicts = (ca._global_causal_effect_dict(),
                        ca._cohort_causal_effect_dict(first_two),
                        ca._local_causal_effect_dict(first_two))
        glo_point_est, coh_point_est, loc_point_est = (
            np.array(d[_CausalInsightsConstants.PointEstimateKey])
            for d in effect_dicts)

        # global shape is (d_y, sum(d_t))
        assert glo_point_est.shape == coh_point_est.shape
        assert coh_point_est.shape == (1, 5)
        assert loc_point_est.shape == (2, ) + glo_point_est.shape

        ca._policy_tree_output(X, inds[1])
        for feature in (inds[1], inds[3]):
            ca._heterogeneity_tree_string(X, feature)

        # the policy tree rejects multi-dimensional treatments
        with self.assertRaises(AssertionError):
            ca._policy_tree_output(X, inds[3])
示例#4
0
    def test_final_models(self):
        """Run the analysis for every final-model / task-type combination.

        For each combination the effect queries and tree helpers are called,
        the policy tree output is checked for continuous, binary and
        three-level treatments, and ``whatif`` is expected to work for
        regression but raise for classification. Finally, an unknown
        heterogeneity model name must trigger an ``AssertionError``.
        """
        d_y = (1, )
        y = np.random.choice([0, 1], size=(500, ) + d_y)
        continuous_cols = np.random.normal(size=(500, 2))
        binary_col = np.random.choice([0, 1], size=(500, 1))
        three_level_col = np.random.choice([0, 1, 2], size=(500, 1))
        X = np.hstack((continuous_cols, binary_col, three_level_col))
        inds = [0, 1, 2, 3]
        cats = [2, 3]
        hinds = [0, 3]
        X_head = X[:2]
        y_head = y[:2]
        combos = [(model_name, is_clf)
                  for model_name in ['forest', 'linear']
                  for is_clf in [False, True]]
        for h_model, classification in combos:
            ca = CausalAnalysis(inds,
                                cats,
                                hinds,
                                classification=classification,
                                heterogeneity_model=h_model)
            ca.fit(X, y)
            # smoke-test the effect queries; results are not inspected
            ca.global_causal_effect()
            ca.cohort_causal_effect(X_head)
            ca.local_causal_effect(X_head)
            ca._global_causal_effect_dict()
            ca._cohort_causal_effect_dict(X_head)
            ca._local_causal_effect_dict(X_head)

            ca._policy_tree_output(X, 1)
            for feature in (1, 3):
                ca._heterogeneity_tree_output(X, feature)

            # Continuous, binary and multi-class treatments must all work.
            # A discrete treatment yields one "always treat" value per
            # non-default level, hence length 2 for the three-level column.
            for idx, n_always in ((0, 1), (1, 1), (2, 1), (3, 2)):
                _, policy_val, always_trt = ca._policy_tree_output(X, idx)
                assert isinstance(always_trt, list)
                assert np.array(policy_val).shape == ()
                assert np.array(always_trt).shape == (n_always, )

                # policy value should exceed always treating with any treatment
                assert_less_close(always_trt, policy_val)

            # ExitStack serves as a "do nothing" context manager for the
            # regression case; classification is expected to raise
            cm = self.assertRaises(Exception) if classification else ExitStack()
            with cm:
                ca.whatif(X_head, np.ones(shape=(2, )), 1, y_head)
                ca.whatif(X_head, np.ones(shape=(2, )), 2, y_head)
                ca._whatif_dict(X_head, np.ones(shape=(2, )), 1, y_head)

        # `classification` still holds its value from the last loop iteration
        with self.assertRaises(AssertionError):
            ca = CausalAnalysis(inds,
                                cats,
                                hinds,
                                classification=classification,
                                heterogeneity_model='other')
            ca.fit(X, y)
    def test_final_models(self):
        """Run the analysis for every final-model / task-type combination.

        For each combination the effect queries and tree helpers are called,
        the policy tree is expected to reject the three-level treatment, and
        ``whatif`` is expected to work for regression but raise for
        classification. Finally, an unknown heterogeneity model name must
        trigger an ``AssertionError``.
        """
        d_y = (1, )
        y = np.random.choice([0, 1], size=(500, ) + d_y)
        continuous_cols = np.random.normal(size=(500, 2))
        binary_col = np.random.choice([0, 1], size=(500, 1))
        three_level_col = np.random.choice([0, 1, 2], size=(500, 1))
        X = np.hstack((continuous_cols, binary_col, three_level_col))
        inds = [0, 1, 2, 3]
        cats = [2, 3]
        hinds = [0, 3]
        X_head = X[:2]
        y_head = y[:2]
        combos = [(model_name, is_clf)
                  for model_name in ['forest', 'linear']
                  for is_clf in [False, True]]
        for h_model, classification in combos:
            ca = CausalAnalysis(inds,
                                cats,
                                hinds,
                                classification=classification,
                                heterogeneity_model=h_model)
            ca.fit(X, y)
            # smoke-test the effect queries; results are not inspected
            ca.global_causal_effect()
            ca.cohort_causal_effect(X_head)
            ca.local_causal_effect(X_head)
            ca._global_causal_effect_dict()
            ca._cohort_causal_effect_dict(X_head)
            ca._local_causal_effect_dict(X_head)

            ca._policy_tree_output(X, 1)
            for feature in (1, 3):
                ca._heterogeneity_tree_string(X, feature)

            # the policy tree rejects multi-dimensional treatments
            with self.assertRaises(AssertionError):
                ca._policy_tree_output(X, 3)

            # ExitStack serves as a "do nothing" context manager for the
            # regression case; classification is expected to raise
            cm = self.assertRaises(Exception) if classification else ExitStack()
            with cm:
                for feature in (1, 2):
                    inf = ca.whatif(X_head, np.ones(shape=(2, )), feature,
                                    y_head)
                    inf.summary_frame()

                ca._whatif_dict(X_head, np.ones(shape=(2, )), 1, y_head)

        # `classification` still holds its value from the last loop iteration
        with self.assertRaises(AssertionError):
            ca = CausalAnalysis(inds,
                                cats,
                                hinds,
                                classification=classification,
                                heterogeneity_model='other')
            ca.fit(X, y)
    def test_one_feature(self):
        """Check the outputs when only a single treatment feature is analysed.

        Guards against problems that arise when every index level would be
        dropped: with one continuous treatment the global index is expected
        to be named ``['feature']`` and the local index ``['sample']``.
        """
        n_rows = 500
        y = pd.Series(np.random.choice([0, 1], size=(n_rows, )))
        X = pd.DataFrame({
            'a': np.random.normal(size=n_rows),
            'b': np.random.normal(size=n_rows),
            'c': np.random.choice([0, 1], size=n_rows),
            'd': np.random.choice(['a', 'b', 'c'], size=n_rows)
        })
        inds = ['a']
        cats = ['c', 'd']
        hinds = ['a', 'd']
        first_two = X[:2]

        ca = CausalAnalysis(inds, cats, hinds, classification=False)
        ca.fit(X, y)
        glo = ca.global_causal_effect()
        coh = ca.cohort_causal_effect(first_two)
        loc = ca.local_causal_effect(first_two)

        # cohort output must be indexed exactly like the global output
        assert glo.index.equals(coh.index)

        # local output has one copy of the global index per row passed in
        assert len(loc.index) == 2 * len(glo.index)

        assert glo.index.names == ['feature']
        assert loc.index.names == ['sample']

        effect_dicts = (ca._global_causal_effect_dict(),
                        ca._cohort_causal_effect_dict(first_two),
                        ca._local_causal_effect_dict(first_two))
        glo_point_est, coh_point_est, loc_point_est = (
            np.array(d[_CausalInsightsConstants.PointEstimateKey])
            for d in effect_dicts)

        # global shape is (d_y, sum(d_t))
        assert glo_point_est.shape == coh_point_est.shape
        assert coh_point_est.shape == (1, 1)
        assert loc_point_est.shape == (2, ) + glo_point_est.shape

        only_feature = inds[0]
        ca._policy_tree_output(X, only_feature)
        ca._heterogeneity_tree_string(X, only_feature)
    def test_basic_pandas(self):
        """Exercise the analysis on a pandas frame using positions and names.

        For both task types, the same frame is analysed once with integer
        column positions and once with column names. Index structure,
        point-estimate shapes, tree helpers and ``whatif`` behaviour are
        checked, and invalid heterogeneity indices must raise.
        """
        n_rows = 500
        for classification in (False, True):
            y = pd.Series(np.random.choice([0, 1], size=(n_rows, )))
            X = pd.DataFrame({
                'a': np.random.normal(size=n_rows),
                'b': np.random.normal(size=n_rows),
                'c': np.random.choice([0, 1], size=n_rows),
                'd': np.random.choice(['a', 'b', 'c'], size=n_rows)
            })
            n_inds = [0, 1, 2, 3]
            t_inds = ['a', 'b', 'c', 'd']
            n_cats = [2, 3]
            t_cats = ['c', 'd']
            n_hinds = [0, 3]
            t_hinds = ['a', 'd']
            X_head = X[:2]
            y_head = y[:2]
            positional_spec = (n_inds, n_cats, n_hinds)
            named_spec = (t_inds, t_cats, t_hinds)
            for inds, cats, hinds in (positional_spec, named_spec):
                ca = CausalAnalysis(inds,
                                    cats,
                                    hinds,
                                    classification=classification)
                ca.fit(X, y)
                glo = ca.global_causal_effect()
                coh = ca.cohort_causal_effect(X_head)
                loc = ca.local_causal_effect(X_head)

                # cohort output must be indexed exactly like the global output
                assert glo.index.equals(coh.index)

                # local output has one copy of the global index per row passed in
                assert len(loc.index) == 2 * len(glo.index)

                assert glo.index.names == ['feature', 'feature_value']
                assert loc.index.names == ['sample'] + glo.index.names

                # expected feature label per row; a categorical with k levels
                # contributes k-1 rows
                fts = ['a', 'b', 'c', 'd', 'd']
                n_fts = len(fts)
                for pos, expected in enumerate(fts):
                    global_label = glo.index[pos][0]
                    assert expected == global_label
                    first_sample_label = loc.index[pos][1]
                    assert global_label == first_sample_label
                    assert first_sample_label == loc.index[n_fts + pos][1]

                effect_dicts = (ca._global_causal_effect_dict(),
                                ca._cohort_causal_effect_dict(X_head),
                                ca._local_causal_effect_dict(X_head))
                glo_point_est, coh_point_est, loc_point_est = (
                    np.array(d[_CausalInsightsConstants.PointEstimateKey])
                    for d in effect_dicts)

                # global shape is (d_y, sum(d_t))
                assert glo_point_est.shape == coh_point_est.shape
                assert coh_point_est.shape == (1, 5)
                assert loc_point_est.shape == (2, ) + glo_point_est.shape

                ca._policy_tree_output(X, inds[1])
                for feature in (inds[1], inds[3]):
                    ca._heterogeneity_tree_string(X, feature)

                # the policy tree rejects multi-dimensional treatments
                with self.assertRaises(AssertionError):
                    ca._policy_tree_output(X, inds[3])

                # ExitStack serves as a "do nothing" context manager for the
                # regression case; classification is expected to raise
                if classification:
                    cm = self.assertRaises(Exception)
                else:
                    cm = ExitStack()
                with cm:
                    for feature in (inds[1], inds[2]):
                        inf = ca.whatif(X_head, np.ones(shape=(2, )), feature,
                                        y_head)
                        assert np.shape(inf.point_estimate) == np.shape(y[:2])
                        inf.summary_frame()

                    ca._whatif_dict(X_head, np.ones(shape=(2, )), inds[1],
                                    y_head)

            badargs = [
                (n_inds, n_cats, [4]),  # hinds position out of range
                (n_inds, n_cats, ["test"])  # hinds names a column X lacks
            ]

            for bad in badargs:
                with self.assertRaises(Exception):
                    ca = CausalAnalysis(*bad)
                    ca.fit(X, y)
    def test_automl_first_stage(self):
        """Exercise the analysis with ``nuisance_models='automl'`` on arrays.

        For both task types, checks index structure, point-estimate shapes,
        the tree helpers, ``whatif`` behaviour (works for regression, raises
        for classification), the generated feature labels, and that invalid
        heterogeneity indices raise.
        """
        d_y = (1, )
        for classification in (False, True):
            y = np.random.choice([0, 1], size=(500, ) + d_y)
            continuous_cols = np.random.normal(size=(500, 2))
            binary_col = np.random.choice([0, 1], size=(500, 1))
            three_level_col = np.random.choice([0, 1, 2], size=(500, 1))
            X = np.hstack((continuous_cols, binary_col, three_level_col))
            inds = [0, 1, 2, 3]
            cats = [2, 3]
            hinds = [0, 3]
            X_head = X[:2]
            y_head = y[:2]
            ca = CausalAnalysis(inds,
                                cats,
                                hinds,
                                classification=classification,
                                nuisance_models='automl')
            ca.fit(X, y)
            glo = ca.global_causal_effect()
            coh = ca.cohort_causal_effect(X_head)
            loc = ca.local_causal_effect(X_head)

            # cohort output must be indexed exactly like the global output
            assert glo.index.equals(coh.index)

            # local output has one copy of the global index per row passed in
            assert len(loc.index) == 2 * len(glo.index)

            assert glo.index.names == ['feature', 'feature_value']
            assert loc.index.names == ['sample'] + glo.index.names

            effect_dicts = (ca._global_causal_effect_dict(),
                            ca._cohort_causal_effect_dict(X_head),
                            ca._local_causal_effect_dict(X_head))
            glo_point_est, coh_point_est, loc_point_est = (
                np.array(d[_CausalInsightsConstants.PointEstimateKey])
                for d in effect_dicts)

            ca._policy_tree_output(X, 1)
            for feature in (1, 3):
                ca._heterogeneity_tree_string(X, feature)

            # the policy tree rejects multi-dimensional treatments
            with self.assertRaises(AssertionError):
                ca._policy_tree_output(X, 3)

            # global shape is (d_y, sum(d_t))
            assert glo_point_est.shape == coh_point_est.shape
            assert coh_point_est.shape == (1, 5)
            assert loc_point_est.shape == (2, ) + glo_point_est.shape

            # ExitStack serves as a "do nothing" context manager for the
            # regression case; classification is expected to raise
            cm = self.assertRaises(Exception) if classification else ExitStack()
            with cm:
                for feature in (1, 2):
                    inf = ca.whatif(X_head, np.ones(shape=(2, )), feature,
                                    y_head)
                    assert np.shape(inf.point_estimate) == np.shape(y[:2])
                    inf.summary_frame()

                ca._whatif_dict(X_head, np.ones(shape=(2, )), 1, y_head)

            # expected feature label per row; a categorical with k levels
            # contributes k-1 rows
            fts = ['x0', 'x1', 'x2', 'x3', 'x3']
            n_fts = len(fts)
            for pos, expected in enumerate(fts):
                global_label = glo.index[pos][0]
                assert expected == global_label
                first_sample_label = loc.index[pos][1]
                assert global_label == first_sample_label
                assert first_sample_label == loc.index[n_fts + pos][1]

            badargs = [
                (inds, cats, [4]),  # hinds position out of range
                (inds, cats, ["test"])  # hinds is not a column position
            ]

            for bad in badargs:
                with self.assertRaises(Exception):
                    ca = CausalAnalysis(*bad)
                    ca.fit(X, y)
示例#9
0
    def test_forest_with_pandas(self):
        """Exercise the forest heterogeneity model on a frame with named columns.

        Checks the index structure and point-estimate shapes of the global,
        cohort and local effect outputs, runs the tree helpers, and verifies
        the policy tree output for continuous, binary and three-level
        treatments.
        """
        n_rows = 500
        y = pd.Series(np.random.choice([0, 1], size=(n_rows, )))
        X = pd.DataFrame({
            'a': np.random.normal(size=n_rows),
            'b': np.random.normal(size=n_rows),
            'c': np.random.choice([0, 1], size=n_rows),
            'd': np.random.choice(['a', 'b', 'c'], size=n_rows)
        })
        inds = ['a', 'b', 'c', 'd']
        cats = ['c', 'd']
        hinds = ['a', 'd']
        first_two = X[:2]

        ca = CausalAnalysis(inds, cats, hinds, heterogeneity_model='forest')
        ca.fit(X, y)
        glo = ca.global_causal_effect()
        coh = ca.cohort_causal_effect(first_two)
        loc = ca.local_causal_effect(first_two)

        # cohort output must be indexed exactly like the global output
        assert glo.index.equals(coh.index)

        # local output has one copy of the global index per row passed in
        assert len(loc.index) == 2 * len(glo.index)

        assert glo.index.names == ['feature', 'feature_value']
        assert loc.index.names == ['sample'] + glo.index.names

        # expected feature label per row; a categorical with k levels
        # contributes k-1 rows
        fts = ['a', 'b', 'c', 'd', 'd']
        n_fts = len(fts)
        for pos, expected in enumerate(fts):
            global_label = glo.index[pos][0]
            assert expected == global_label
            first_sample_label = loc.index[pos][1]
            assert global_label == first_sample_label
            assert first_sample_label == loc.index[n_fts + pos][1]

        effect_dicts = (ca._global_causal_effect_dict(),
                        ca._cohort_causal_effect_dict(first_two),
                        ca._local_causal_effect_dict(first_two))
        glo_point_est, coh_point_est, loc_point_est = (
            np.array(d[_CausalInsightsConstants.PointEstimateKey])
            for d in effect_dicts)

        # global shape is (d_y, sum(d_t))
        assert glo_point_est.shape == coh_point_est.shape
        assert coh_point_est.shape == (1, 5)
        assert loc_point_est.shape == (2, ) + glo_point_est.shape

        ca._policy_tree_output(X, inds[1])
        for feature in (inds[1], inds[3]):
            ca._heterogeneity_tree_output(X, feature)

        # Continuous, binary and multi-class treatments must all work.
        # A discrete treatment yields one "always treat" value per
        # non-default level, hence length 2 for the three-level column.
        for idx, n_always in ((0, 1), (1, 1), (2, 1), (3, 2)):
            _, policy_val, always_trt = ca._policy_tree_output(X, inds[idx])
            assert isinstance(always_trt, list)
            assert np.array(policy_val).shape == ()
            assert np.array(always_trt).shape == (n_always, )

            # policy value should exceed always treating with any treatment
            assert_less_close(always_trt, policy_val)
示例#10
0
    def test_basic_array(self):
        """Exercise the analysis on plain numpy input for 1-d and 2-d outcomes.

        For each outcome shape and task type, checks index structure,
        point-estimate shapes, the heterogeneity and policy tree outputs,
        ``whatif`` behaviour (works for regression, raises for
        classification), the generated feature labels, and that invalid
        heterogeneity indices raise.
        """
        combos = [(outcome_dims, is_clf)
                  for outcome_dims in [(), (1, )]
                  for is_clf in [False, True]]
        for d_y, classification in combos:
            y = np.random.choice([0, 1], size=(500, ) + d_y)
            continuous_cols = np.random.normal(size=(500, 2))
            binary_col = np.random.choice([0, 1], size=(500, 1))
            three_level_col = np.random.choice([0, 1, 2], size=(500, 1))
            X = np.hstack((continuous_cols, binary_col, three_level_col))
            inds = [0, 1, 2, 3]
            cats = [2, 3]
            hinds = [0, 3]
            X_head = X[:2]
            y_head = y[:2]
            ca = CausalAnalysis(inds,
                                cats,
                                hinds,
                                classification=classification)
            ca.fit(X, y)
            glo = ca.global_causal_effect()
            coh = ca.cohort_causal_effect(X_head)
            loc = ca.local_causal_effect(X_head)

            # cohort output must be indexed exactly like the global output
            assert glo.index.equals(coh.index)

            # local output has one copy of the global index per row passed in
            assert len(loc.index) == 2 * len(glo.index)

            assert glo.index.names == ['feature', 'feature_value']
            assert loc.index.names == ['sample'] + glo.index.names

            effect_dicts = (ca._global_causal_effect_dict(),
                            ca._cohort_causal_effect_dict(X_head),
                            ca._local_causal_effect_dict(X_head))
            glo_point_est, coh_point_est, loc_point_est = (
                np.array(d[_CausalInsightsConstants.PointEstimateKey])
                for d in effect_dicts)

            for feature in (1, 3):
                ca._heterogeneity_tree_output(X, feature)

            # Continuous, binary and multi-class treatments must all work.
            # A discrete treatment yields one "always treat" value per
            # non-default level, hence length 2 for the three-level column.
            for idx, n_always in ((0, 1), (1, 1), (2, 1), (3, 2)):
                _, policy_val, always_trt = ca._policy_tree_output(X, idx)
                assert isinstance(always_trt, list)
                assert np.array(policy_val).shape == ()
                assert np.array(always_trt).shape == (n_always, )

                # policy value should exceed always treating with any treatment
                assert_less_close(always_trt, policy_val)

            # global shape is (d_y, sum(d_t))
            assert glo_point_est.shape == coh_point_est.shape
            assert coh_point_est.shape == (1, 5)
            assert loc_point_est.shape == (2, ) + glo_point_est.shape

            # ExitStack serves as a "do nothing" context manager for the
            # regression case; classification is expected to raise
            cm = self.assertRaises(Exception) if classification else ExitStack()
            with cm:
                for feature in (1, 2):
                    inf = ca.whatif(X_head, np.ones(shape=(2, )), feature,
                                    y_head)
                    assert np.shape(inf.point_estimate) == (2, )

                ca._whatif_dict(X_head, np.ones(shape=(2, )), 1, y_head)

            # expected feature label per row; a categorical with k levels
            # contributes k-1 rows
            fts = ['x0', 'x1', 'x2', 'x3', 'x3']
            n_fts = len(fts)
            for pos, expected in enumerate(fts):
                global_label = glo.index[pos][0]
                assert expected == global_label
                first_sample_label = loc.index[pos][1]
                assert global_label == first_sample_label
                assert first_sample_label == loc.index[n_fts + pos][1]

            badargs = [
                (inds, cats, [4]),  # hinds position out of range
                (inds, cats, ["test"])  # hinds is not a column position
            ]

            for bad in badargs:
                with self.assertRaises(Exception):
                    ca = CausalAnalysis(*bad)
                    ca.fit(X, y)
    def compute(self):
        """Computes the causal insights by running the causal configuration.

        Every configuration in ``self._causal_config_list`` that is not yet
        marked as computed is validated, fitted with ``CausalAnalysis`` on
        the concatenated train and test data, and populated with the fitted
        analysis, the global effects, the local effects on the test data and
        one policy entry per treatment feature.

        A configuration is flagged ``is_computed`` only after all of its
        results have been stored, so a configuration whose computation
        raised is retried on the next call instead of being skipped.

        :raises UserConfigValidationException: if a configuration's
            ``nuisance_model`` is neither ``CausalConstants.AUTOML`` nor
            ``CausalConstants.LINEAR``.
        """
        for config in self._causal_config_list:
            if config.is_computed:
                continue

            if config.nuisance_model not in [CausalConstants.AUTOML,
                                             CausalConstants.LINEAR]:
                message = (f"nuisance_model should be one of "
                           f"['{CausalConstants.AUTOML}', "
                           f"'{CausalConstants.LINEAR}'], "
                           f"got {config.nuisance_model}")
                raise UserConfigValidationException(message)

            is_classification = self._task_type == ModelTask.CLASSIFICATION
            # Concatenate train and test once; features and target are both
            # derived from the same combined frame.
            full_data = pd.concat([self._train, self._test],
                                  ignore_index=True)
            X = full_data.drop([self._target_column], axis=1)
            y = full_data[self._target_column].values.ravel()

            categoricals = self._categorical_features
            if categoricals is None:
                categoricals = []

            analysis = CausalAnalysis(
                config.treatment_features,
                categoricals,
                heterogeneity_inds=config.heterogeneity_features,
                classification=is_classification,
                nuisance_models=config.nuisance_model,
                upper_bound_on_cat_expansion=config.max_cat_expansion,
                skip_cat_limit_checks=config.skip_cat_limit_checks,
                n_jobs=-1)
            analysis.fit(X, y)

            config.causal_analysis = analysis

            # Effects and policies are reported on the test rows only
            X_test = self._test.drop([self._target_column], axis=1)

            config.global_effects = analysis.global_causal_effect(
                alpha=config.alpha, keep_all_levels=True)
            config.local_effects = analysis.local_causal_effect(
                X_test, alpha=config.alpha, keep_all_levels=True)

            config.policies = []
            for treatment_feature in config.treatment_features:
                local_policies = analysis.individualized_policy(
                    X_test, treatment_feature,
                    treatment_costs=config.treatment_cost,
                    alpha=config.alpha)

                tree = analysis._policy_tree_output(
                    X_test, treatment_feature,
                    treatment_costs=config.treatment_cost,
                    max_depth=config.max_tree_depth,
                    min_samples_leaf=config.min_tree_leaf_samples,
                    alpha=config.alpha)

                policy = {
                    self.TREATMENT_FEATURE: treatment_feature,
                    self.CONTROL_TREATMENT: tree.control_name,
                    self.LOCAL_POLICIES: local_policies,
                    self.POLICY_GAINS: {
                        self.RECOMMENDED_POLICY_GAINS: tree.policy_value,
                        self.TREATMENT_GAINS: tree.always_treat,
                    },
                    self.POLICY_TREE: tree.tree_dictionary
                }
                config.policies.append(policy)

            # Mark as done only once every result above has been stored
            config.is_computed = True
示例#12
0
    def test_scaling_transforms(self):
        """Check that rescaling X by 1000 leaves the estimated effects unchanged.

        It shouldn't matter whether X is scaled much larger or much smaller
        than W; the estimates should still be good. The first scenario uses a
        constant treatment effect and also compares the global estimate with
        the true coefficient. The second uses one-hot X with a per-column
        effect and compares local effects between the two scalings.
        """
        effect_columns = ('point', 'ci_lower', 'ci_upper')

        def assert_same_global_effects(eff_a, eff_b):
            # point estimates and both interval bounds must agree closely
            for column in effect_columns:
                np.testing.assert_allclose(getattr(eff_a, column).values,
                                           getattr(eff_b, column).values,
                                           rtol=1e-5)

        n = 2000
        X = np.random.normal(size=(n, 5))
        W = np.random.normal(size=(n, 5))
        W[:, 0] = 1  # make one of the columns a constant
        xt = np.random.normal(size=(5, 1))
        wt = np.random.normal(size=(5, 1))
        xy = np.random.normal(size=(5, 1))
        wy = np.random.normal(size=(5, 1))
        theta = np.random.normal(size=(1, 1))
        treatment_noise = np.random.normal(size=(n, 1))
        T = X @ xt + W @ wt + treatment_noise
        Y = X @ xy + W @ wy + T @ theta
        arr1 = np.hstack([X, W, T])
        # rescaling X shouldn't affect the first stage models because they
        # normalize the inputs
        arr2 = np.hstack([1000 * X, W, T])
        for hmodel in ('linear', 'forest'):
            # treatment is the last column; heterogeneity over the X columns
            ca = CausalAnalysis([-1],
                                [],
                                list(range(X.shape[1])),
                                heterogeneity_model=hmodel,
                                random_state=123)
            ca.fit(arr1, Y)
            eff1 = ca.global_causal_effect()

            ca.fit(arr2, Y)
            eff2 = ca.global_causal_effect()

            assert_same_global_effects(eff1, eff2)

            np.testing.assert_allclose(eff1.point.values,
                                       theta.flatten(),
                                       rtol=1e-2)

        # to recover individual coefficients with linear models, we need to
        # be more careful in how we set up X to avoid cross terms
        X = np.zeros(shape=(n, 5))
        X[range(X.shape[0]), np.random.choice(5, size=n)] = 1
        xt = np.random.normal(size=(5, 1))
        wt = np.random.normal(size=(5, 1))
        xy = np.random.normal(size=(5, 1))
        wy = np.random.normal(size=(5, 1))
        theta = np.random.normal(size=(5, 1))
        treatment_noise = np.random.normal(size=(n, 1))
        T = X @ xt + W @ wt + treatment_noise
        Y = X @ xy + W @ wy + T * (X @ theta)
        arr1 = np.hstack([X, W, T])
        arr2 = np.hstack([1000 * X, W, T])
        n_x = X.shape[1]

        def unit_rows():
            # one row per X column: identity in the X block, zeros elsewhere
            return np.hstack([
                np.eye(n_x),
                np.zeros((n_x, arr1.shape[1] - n_x))
            ])

        for hmodel in ('linear', 'forest'):
            ca = CausalAnalysis([-1],
                                [],
                                list(range(n_x)),
                                heterogeneity_model=hmodel,
                                random_state=123)
            ca.fit(arr1, Y)
            eff1 = ca.global_causal_effect()
            loc1 = ca.local_causal_effect(unit_rows())
            ca.fit(arr2, Y)
            eff2 = ca.global_causal_effect()
            # scale by 1000 to match the input to this model: the scale of X
            # does matter for the final model, which keeps results in
            # user-denominated units
            loc2 = ca.local_causal_effect(1000 * unit_rows())

            # rescaling X still shouldn't affect the first stage models
            assert_same_global_effects(eff1, eff2)

            np.testing.assert_allclose(loc1.point.values,
                                       loc2.point.values,
                                       rtol=1e-2)
示例#13
0
    def test_one_feature(self):
        """Check CausalAnalysis outputs when a single feature is analyzed.

        Fits on a four-column frame with ``'a'`` as the only analyzed feature,
        then verifies the index layout of the global, cohort and local
        effects, the shapes of the dictionary outputs, and finally calls the
        policy-tree and heterogeneity-tree helpers for that feature.
        """
        # make sure we don't run into problems dropping every index
        n_samples = 500
        y = pd.Series(np.random.choice([0, 1], size=(n_samples, )))
        X = pd.DataFrame({
            'a': np.random.normal(size=n_samples),
            'b': np.random.normal(size=n_samples),
            'c': np.random.choice([0, 1], size=n_samples),
            'd': np.random.choice(['a', 'b', 'c'], size=n_samples)
        })
        feature = 'a'
        two_rows = X[:2]
        point_key = _CausalInsightsConstants.PointEstimateKey
        rows_key = _CausalInsightsConstants.RowData

        def level_names(frame):
            # names of every level of the frame's (multi-)index
            return {level.name for level in frame.index.levels}

        ca = CausalAnalysis([feature], ['c', 'd'], ['a', 'd'],
                            classification=False)
        ca.fit(X, y)
        global_df = ca.global_causal_effect()
        cohort_df = ca.cohort_causal_effect(two_rows)
        local_df = ca.local_causal_effect(two_rows)

        # global and cohort results share one structure (values may differ)
        assert global_df.index.equals(cohort_df.index)

        # the local result holds one copy of the global index per row passed in
        assert len(local_df.index) == 2 * len(global_df.index)

        assert global_df.index.names == ['feature']
        assert local_df.index.names == ['sample']

        global_dict = ca._global_causal_effect_dict()
        global_rows = ca._global_causal_effect_dict(row_wise=True)

        cohort_dict = ca._cohort_causal_effect_dict(two_rows)
        cohort_rows = ca._cohort_causal_effect_dict(two_rows, row_wise=True)

        local_dict = ca._local_causal_effect_dict(two_rows)
        local_rows = ca._local_causal_effect_dict(two_rows, row_wise=True)

        global_shape = np.array(global_dict[point_key]).shape
        cohort_shape = np.array(cohort_dict[point_key]).shape
        local_shape = np.array(local_dict[point_key]).shape

        # global shape is (d_y, sum(d_t))
        assert global_shape == (1, 1)
        assert cohort_shape == global_shape
        assert local_shape == (2, ) + global_shape

        full_global = ca.global_causal_effect(keep_all_levels=True)
        full_cohort = ca.cohort_causal_effect(two_rows, keep_all_levels=True)
        full_local = ca.local_causal_effect(two_rows, keep_all_levels=True)
        summary_levels = {"outcome", "feature", "feature_value"}
        assert level_names(full_global) == summary_levels
        assert level_names(full_cohort) == summary_levels
        assert level_names(full_local) == summary_levels | {"sample"}

        # global and cohort row-wise dicts have d_y * d_t entries
        assert len(global_rows[rows_key]) == 1
        assert len(cohort_rows[rows_key]) == 1
        # local dictionary is flattened to n_rows * d_y * d_t
        assert len(local_rows[rows_key]) == 2

        ca._policy_tree_output(X, feature)
        ca._heterogeneity_tree_output(X, feature)
示例#14
0
    def test_basic_pandas(self):
        """Exercise CausalAnalysis on a DataFrame, by position and by name.

        For ``classification`` both off and on, the analyzed features,
        categoricals and heterogeneity features are supplied once as integer
        positions and once as column names. Each fitted analysis is checked
        for index layout, dictionary output shapes, typical treatment values,
        policy-tree output and what-if calls (which are expected to raise
        when ``classification`` is on). Afterwards, invalid heterogeneity
        indices are expected to raise.
        """
        n_samples = 500
        point_key = _CausalInsightsConstants.PointEstimateKey
        rows_key = _CausalInsightsConstants.RowData
        # features; for categoricals they should appear #cats-1 times each
        fts = ['a', 'b', 'c', 'd', 'd']
        n_fts = len(fts)

        for classification in (False, True):
            y = pd.Series(np.random.choice([0, 1], size=(n_samples, )))
            X = pd.DataFrame({
                'a': np.random.normal(size=n_samples),
                'b': np.random.normal(size=n_samples),
                'c': np.random.choice([0, 1], size=n_samples),
                'd': np.random.choice(['a', 'b', 'c'], size=n_samples)
            })
            X_head = X[:2]
            y_head = y[:2]

            # the same specification, expressed positionally and by column name
            by_position = ([0, 1, 2, 3], [2, 3], [0, 3])
            by_name = (['a', 'b', 'c', 'd'], ['c', 'd'], ['a', 'd'])

            for inds, cats, hinds in (by_position, by_name):
                ca = CausalAnalysis(inds,
                                    cats,
                                    hinds,
                                    classification=classification)
                ca.fit(X, y)
                glo = ca.global_causal_effect()
                coh = ca.cohort_causal_effect(X_head)
                loc = ca.local_causal_effect(X_head)

                # global and cohort results share one structure (values may differ)
                assert glo.index.equals(coh.index)

                # the local result holds one copy of the global index per row passed in
                assert len(loc.index) == 2 * len(glo.index)

                assert glo.index.names == ['feature', 'feature_value']
                assert loc.index.names == ['sample'] + glo.index.names

                # feature labels line up in the global index and in both
                # per-sample sections of the local index
                for pos, name in enumerate(fts):
                    assert name == glo.index[pos][0]
                    assert name == loc.index[pos][1]
                    assert name == loc.index[n_fts + pos][1]

                glo_dict = ca._global_causal_effect_dict()
                glo_rows = ca._global_causal_effect_dict(row_wise=True)

                coh_dict = ca._cohort_causal_effect_dict(X_head)
                coh_rows = ca._cohort_causal_effect_dict(X_head, row_wise=True)

                loc_dict = ca._local_causal_effect_dict(X_head)
                loc_rows = ca._local_causal_effect_dict(X_head, row_wise=True)

                glo_shape = np.array(glo_dict[point_key]).shape
                coh_shape = np.array(coh_dict[point_key]).shape
                loc_shape = np.array(loc_dict[point_key]).shape

                # global shape is (d_y, sum(d_t))
                assert glo_shape == (1, 5)
                assert coh_shape == glo_shape
                assert loc_shape == (2, ) + glo_shape

                # global and cohort row-wise dicts have d_y * d_t entries
                assert len(glo_rows[rows_key]) == 5
                assert len(coh_rows[rows_key]) == 5
                # local dictionary is flattened to n_rows * d_y * d_t
                assert len(loc_rows[rows_key]) == 10

                ca._policy_tree_output(X, inds[1])
                ca._heterogeneity_tree_output(X, inds[1])
                ca._heterogeneity_tree_output(X, inds[3])

                # continuous treatments have typical treatment values equal to
                # the mean of the absolute value of non-zero entries
                for feat, column in ((inds[0], 'a'), (inds[1], 'b')):
                    np.testing.assert_allclose(
                        ca.typical_treatment_value(feat),
                        np.mean(np.abs(X[column])))
                # discrete treatments have typical treatment value 1
                for feat in (inds[2], inds[3]):
                    assert ca.typical_treatment_value(feat) == 1

                # Make sure we handle continuous, binary, and multi-class treatments
                # For multiple discrete treatments, one "always treat" value per non-default treatment
                for idx, n_always in [(0, 1), (1, 1), (2, 1), (3, 2)]:
                    pto = ca._policy_tree_output(X, inds[idx])
                    policy_val = pto.policy_value
                    always_trt = pto.always_treat
                    assert isinstance(pto.control_name, str)
                    assert isinstance(always_trt, dict)
                    assert np.array(policy_val).shape == ()
                    assert len(always_trt) == n_always
                    always_vals = list(always_trt.values())
                    for val in always_vals:
                        assert np.array(val).shape == ()

                    # policy value should exceed always treating with any treatment
                    assert_less_close(np.array(always_vals), policy_val)

                # ExitStack can be used as a "do nothing" ContextManager;
                # with classification on, the what-if calls must raise instead
                cm = (self.assertRaises(Exception)
                      if classification else ExitStack())
                with cm:
                    for feat in (inds[1], inds[2]):
                        inf = ca.whatif(X_head, np.ones(shape=(2, )), feat,
                                        y_head)
                        assert np.shape(inf.point_estimate) == np.shape(y_head)

                    ca._whatif_dict(X_head, np.ones(shape=(2, )), inds[1],
                                    y_head)
                    ca._whatif_dict(X_head,
                                    np.ones(shape=(2, )),
                                    inds[1],
                                    y_head,
                                    row_wise=True)

            # heterogeneity indices that do not refer to a column must be
            # rejected: a position past the last column, and an unknown name
            n_inds, n_cats, _ = by_position
            for bad_hinds in ([4], ["test"]):
                with self.assertRaises(Exception):
                    ca = CausalAnalysis(n_inds, n_cats, bad_hinds)
                    ca.fit(X, y)
    def compute(self):
        """Compute causal effects for every pending causal configuration.

        Iterates over ``self._results`` and, for each result whose
        ``is_computed`` flag is false:

        1. builds a ``CausalAnalysis`` from the result's config,
        2. fits it on the training data (target column removed) through
           ``self._fit_causal_analysis``,
        3. stores the analysis, the global effects, and the local effects
           evaluated on the test data on the result,
        4. validates ``treatment_cost`` and creates one policy per treatment
           feature via ``self._create_policy``,
        5. calls ``result._validate_schema()``.

        :raises UserConfigValidationException: if ``treatment_cost`` is not a
            list (other than the integer ``0`` shorthand) or its length
            differs from the number of treatment features.
        """
        # The task type decides whether CausalAnalysis runs in classification mode.
        is_classification = self._task_type == ModelTask.CLASSIFICATION
        for result in self._results:
            causal_config = result.config
            # Results that are already computed are left untouched.
            if not result.is_computed:
                analysis = CausalAnalysis(
                    causal_config.treatment_features,
                    self._categorical_features,
                    heterogeneity_inds=causal_config.heterogeneity_features,
                    classification=is_classification,
                    nuisance_models=causal_config.nuisance_model,
                    heterogeneity_model=causal_config.heterogeneity_model,
                    upper_bound_on_cat_expansion=causal_config.
                    upper_bound_on_cat_expansion,
                    skip_cat_limit_checks=causal_config.skip_cat_limit_checks,
                    n_jobs=causal_config.n_jobs,
                    categories=causal_config.categories,
                    verbose=causal_config.verbose,
                    random_state=causal_config.random_state,
                )

                # Separate the features from the target column; only the
                # training target is needed for fitting.
                X_train = self._train.drop([self._target_column], axis=1)
                X_test = self._test.drop([self._target_column], axis=1)
                y_train = self._train[self._target_column].values.ravel()

                self._fit_causal_analysis(
                    analysis, X_train, y_train,
                    causal_config.upper_bound_on_cat_expansion)
                result.causal_analysis = analysis

                # Global effects come from the fitted analysis alone; local
                # effects are evaluated on the test features.
                result.global_effects = analysis.global_causal_effect(
                    alpha=causal_config.alpha, keep_all_levels=True)
                result.local_effects = analysis.local_causal_effect(
                    X_test, alpha=causal_config.alpha, keep_all_levels=True)

                result.policies = []

                # Check treatment_cost is valid
                # An integer 0 is shorthand for "zero cost for every treatment
                # feature" and is expanded to a list of matching length.
                if isinstance(causal_config.treatment_cost, int) and \
                        causal_config.treatment_cost == 0:
                    revised_treatment_cost = [0] * len(
                        causal_config.treatment_features)
                else:
                    revised_treatment_cost = causal_config.treatment_cost

                # Anything else must be a list with one entry per treatment feature.
                if not isinstance(revised_treatment_cost, list):
                    message = (
                        "treatment_cost must be a list with "
                        "the same number of elements as "
                        "treatment_features where each element "
                        "is either a constant cost of treatment "
                        "or an array specifying the cost of "
                        "treatment per sample. "
                        "Found treatment_cost of type "
                        f"{type(revised_treatment_cost)}, expected list.")
                    raise UserConfigValidationException(message)
                elif len(revised_treatment_cost) != \
                        len(causal_config.treatment_features):
                    message = ("treatment_cost must be a list with "
                               "the same number of elements as "
                               "treatment_features. "
                               "Length of treatment_cost was "
                               f"{len(revised_treatment_cost)}, expected "
                               f"{len(causal_config.treatment_features)}.")
                    raise UserConfigValidationException(message)

                # Build one policy per treatment feature, pairing each feature
                # with the cost entry at the same position.
                for i in range(len(causal_config.treatment_features)):
                    policy = self._create_policy(
                        result, X_test,
                        causal_config.treatment_features[i],
                        revised_treatment_cost[i],
                        causal_config.alpha, causal_config.max_tree_depth,
                        causal_config.min_tree_leaf_samples)
                    result.policies.append(policy)

                result._validate_schema()