예제 #1
0
 def test_drlearner_clipping(self):
     """Check that an extreme covariate value does not yield NaN effects.

     A single sample is moved far outside the [0, 1] range of the other
     covariates.  One split will then contain only X values between 0 and 1,
     so the predicted propensity for the outlying point will be extremely
     low; the estimated effect must still be free of NaNs.
     """
     features = np.linspace(0, 1, 200).reshape(-1, 1)
     treatment = np.random.binomial(1, features)
     outcome = np.random.normal(size=treatment.shape)
     # Plant the outlier only after the treatment draw, which uses the
     # covariate values as binomial probabilities.
     features[0] = -1000
     est = DRLearner()
     est.fit(outcome, treatment, features)
     point_effect = est.const_marginal_effect(np.array([[0.5]]))
     assert not np.isnan(point_effect).any()
예제 #2
0
 def test_DRLearner(self):
     """Check DRLearner on constant and heterogeneous treatment effects.

     Each scenario is delegated to the ``_test_te`` / ``_test_with_W``
     helpers of this test class, always with a tolerance of 0.5.
     """
     tolerance = 0.5

     # Constant treatment effect: linear outcome model, linear final model.
     linear_learner = DRLearner(model_regression=LinearRegression(),
                                model_final=LinearRegression())
     # The input check is currently disabled:
     # self._test_inputs(linear_learner)
     self._test_te(linear_learner, tol=tolerance, te_type="const")

     # Heterogeneous treatment effect: polynomial features feed the outcome
     # regression.
     poly_outcome_model = Pipeline([('poly', PolynomialFeatures()),
                                    ('model', LinearRegression())])
     poly_learner = DRLearner(model_regression=poly_outcome_model,
                              model_final=LinearRegression())
     self._test_te(poly_learner, tol=tolerance, te_type="heterogeneous")

     # Heterogeneous treatment effect when W is not None.
     self._test_with_W(poly_learner, tol=tolerance)
예제 #3
0
 def __init__(self,
              outcome_model=None,
              prop_score_model=None,
              final_model=None,
              trim_eps=1e-6):
     """Wrap an econml ``DRLearner`` built from the given component models.

     Parameters
     ----------
     outcome_model : estimator, optional
         Forwarded to ``DRLearner`` as ``model_regression``.  A new
         ``LinearRegression()`` is created when omitted.
     prop_score_model : estimator, optional
         Forwarded to ``DRLearner`` as ``model_propensity``.  A new
         ``LogisticRegression()`` is created when omitted.
     final_model : estimator, optional
         Forwarded to ``DRLearner`` as ``model_final``.  A new
         ``LinearRegression()`` is created when omitted.
     trim_eps : float, default 1e-6
         Forwarded to ``DRLearner`` as ``min_propensity``.
     """
     # Defaults are instantiated per call.  Estimator instances used as
     # default argument values are evaluated once at definition time and
     # would be shared by every wrapper constructed with the defaults.
     if outcome_model is None:
         outcome_model = LinearRegression()
     if prop_score_model is None:
         prop_score_model = LogisticRegression()
     if final_model is None:
         final_model = LinearRegression()

     # TODO: add other options that DRLearner allows?
     drlearner = DRLearner(model_propensity=prop_score_model,
                           model_regression=outcome_model,
                           model_final=final_model,
                           min_propensity=trim_eps)
     super().__init__(econml_estimator=drlearner)
예제 #4
0
 def test_dr_random_state(self):
     """Run the shared random-state check over the DR learner variants.

     Every estimator, and every forest model nested inside one, is seeded
     with 123 and then handed to ``TestRandomState._test_random_state``
     together with the data from ``self._make_data(500, 2)``.
     """
     seed = 123
     Y, T, X, W, X_test = self._make_data(500, 2)

     final_forest = RandomForestRegressor(max_depth=3,
                                          n_estimators=10,
                                          min_samples_leaf=100,
                                          bootstrap=True,
                                          random_state=seed)
     outcome_forest = RandomForestRegressor(n_estimators=10,
                                            max_depth=4,
                                            random_state=seed)
     propensity_forest = RandomForestClassifier(n_estimators=10,
                                                max_depth=4,
                                                random_state=seed)
     estimators = (
         DRLearner(model_final=final_forest, cv=2, random_state=seed),
         LinearDRLearner(random_state=seed),
         SparseLinearDRLearner(cv=2, random_state=seed),
         ForestDRLearner(model_regression=outcome_forest,
                         model_propensity=propensity_forest,
                         cv=2,
                         random_state=seed),
     )
     for estimator in estimators:
         TestRandomState._test_random_state(estimator, X_test, Y, T, X=X, W=W)
예제 #5
0
    def test_inference_with_none_stderr(self):
        """Smoke-test the inference summaries of DML, NonParamDML and DRLearner.

        Each estimator is fitted on the ``TestInference`` class data with
        Lasso / LinearRegression final models and its summary methods are
        called.  Nothing is asserted about the results; the test passes as
        long as no call raises.  (Per the test name these final models
        presumably provide no standard errors -- confirm against econml.)
        """
        Y = TestInference.Y
        T = TestInference.T
        X = TestInference.X
        W = TestInference.W

        def summarize_effects(fitted):
            # Inference calls shared by all three estimators, in fixed order.
            fitted.effect_inference(X).summary_frame()
            fitted.effect_inference(X).population_summary()
            fitted.const_marginal_effect_inference(X).summary_frame()
            fitted.marginal_effect_inference(T, X).summary_frame()

        # Parametric DML with a Lasso final stage: also exercise the
        # coefficient and intercept summaries.
        dml = DML(model_y=LinearRegression(),
                  model_t=LinearRegression(),
                  model_final=Lasso(alpha=0.1, fit_intercept=False),
                  featurizer=PolynomialFeatures(degree=1, include_bias=False),
                  random_state=123)
        dml.fit(Y, T, X=X, W=W)
        dml.summary()
        dml.coef__inference().summary_frame()
        dml.intercept__inference().summary_frame()
        summarize_effects(dml)

        # Non-parametric DML with a plain linear final stage.
        nonparam = NonParamDML(model_y=LinearRegression(),
                               model_t=LinearRegression(),
                               model_final=LinearRegression(fit_intercept=False),
                               featurizer=PolynomialFeatures(degree=1,
                                                             include_bias=False),
                               random_state=123)
        nonparam.fit(Y, T, X=X, W=W)
        summarize_effects(nonparam)

        # DRLearner with a plain linear final stage.
        dr = DRLearner(model_regression=LinearRegression(),
                       model_propensity=LogisticRegression(),
                       model_final=LinearRegression())
        dr.fit(Y, T, X=X, W=W)
        summarize_effects(dr)
예제 #6
0
    def test_cate_api(self):
        """Test that we correctly implement the CATE API.

        For every combination of outcome / treatment / feature / control
        dimensionality, fits a LinearDRLearner and a multitask DRLearner on
        random data with a three-level discrete treatment and checks the
        shapes returned by the effect, marginal-effect, interval and
        inference methods.
        """
        n = 20

        def make_random(is_discrete, d):
            """Return random data with ``n`` rows, or None when ``d`` is None.

            The shape is ``(n,)`` for ``d == 0`` and ``(n, d)`` otherwise.
            Discrete data is drawn from 'a', 'b', 'c' and redrawn until all
            three values appear at least twice; continuous data is standard
            normal.
            """
            if d is None:
                return None
            sz = (n, d) if d > 0 else (n,)
            if is_discrete:
                while True:
                    arr = np.random.choice(['a', 'b', 'c'], size=sz)
                    # ensure that we've got at least two of every element
                    _, counts = np.unique(arr, return_counts=True)
                    if len(counts) == 3 and counts.min() > 1:
                        return arr
            else:
                return np.random.normal(size=sz)

        for d_y in [0, 1]:
            # the treatment is always discrete in this test
            is_discrete = True
            for d_t in [0, 1]:
                for d_x in [2, None]:
                    for d_w in [2, None]:
                        # only T is discrete; W, X and Y are continuous
                        W, X, Y, T = [make_random(is_discrete, d)
                                      for is_discrete, d in [(False, d_w),
                                                             (False, d_x),
                                                             (False, d_y),
                                                             (is_discrete, d_t)]]

                        # combinations with neither X nor W are not exercised
                        if (X is None) and (W is None):
                            continue
                        # presumably the two non-baseline treatment levels
                        # ('b' and 'c' relative to 'a') -- TODO confirm
                        d_t_final = 2 if is_discrete else d_t

                        effect_shape = (n,) + ((d_y,) if d_y > 0 else ())
                        # summary frames are expected to have 6 columns per treatment
                        effect_summaryframe_shape = (
                            n * (d_y if d_y > 0 else 1), 6)
                        marginal_effect_shape = ((n,) +
                                                 ((d_y,) if d_y > 0 else ()) +
                                                 ((d_t_final,) if d_t_final > 0 else ()))
                        marginal_effect_summaryframe_shape = (n * (d_y if d_y > 0 else 1),
                                                              6 * (d_t_final if d_t_final > 0 else 1))

                        # since T isn't passed to const_marginal_effect, defaults to one row if X is None
                        const_marginal_effect_shape = ((n if d_x else 1,) +
                                                       ((d_y,) if d_y > 0 else ()) +
                                                       ((d_t_final,) if d_t_final > 0 else()))
                        const_marginal_effect_summaryframe_shape = (
                            (n if d_x else 1) * (d_y if d_y > 0 else 1),
                            6 * (d_t_final if d_t_final > 0 else 1))

                        for est in [LinearDRLearner(model_propensity=LogisticRegression(C=1000, solver='lbfgs',
                                                                                        multi_class='auto')),
                                    DRLearner(model_propensity=LogisticRegression(multi_class='auto'),
                                              model_regression=LinearRegression(),
                                              model_final=StatsModelsLinearRegression(),
                                              multitask_model_final=True)]:

                            # TODO: add stratification to bootstrap so that we can use it even with discrete treatments
                            # only LinearDRLearner is additionally fitted with 'statsmodels' inference
                            infs = [None]
                            if isinstance(est, LinearDRLearner):
                                infs.append('statsmodels')

                            for inf in infs:
                                with self.subTest(d_w=d_w, d_x=d_x, d_y=d_y, d_t=d_t,
                                                  is_discrete=is_discrete, est=est, inf=inf):
                                    est.fit(Y, T, X, W, inference=inf)
                                    # make sure we can call the marginal_effect and effect methods
                                    const_marg_eff = est.const_marginal_effect(
                                        X)
                                    marg_eff = est.marginal_effect(T, X)
                                    self.assertEqual(
                                        shape(marg_eff), marginal_effect_shape)
                                    self.assertEqual(
                                        shape(const_marg_eff), const_marginal_effect_shape)

                                    # without X the constant marginal effect has a single row,
                                    # so compare it against the first marginal-effect row only
                                    np.testing.assert_array_equal(
                                        marg_eff if d_x else marg_eff[0:1], const_marg_eff)

                                    # effect of moving every sample from 'a' to its observed treatment
                                    T0 = np.full_like(T, 'a')
                                    eff = est.effect(X, T0=T0, T1=T)
                                    self.assertEqual(shape(eff), effect_shape)
                                    if inf is not None:
                                        const_marg_eff_int = est.const_marginal_effect_interval(
                                            X)
                                        marg_eff_int = est.marginal_effect_interval(
                                            T, X)
                                        const_marg_effect_inf = est.const_marginal_effect_inference(
                                            X)
                                        T1 = np.full_like(T, 'b')
                                        effect_inf = est.effect_inference(
                                            X, T0=T0, T1=T1)
                                        marg_effect_inf = est.marginal_effect_inference(
                                            T, X)
                                        # intervals add a leading axis of length 2
                                        self.assertEqual(shape(marg_eff_int),
                                                         (2,) + marginal_effect_shape)
                                        self.assertEqual(shape(const_marg_eff_int),
                                                         (2,) + const_marginal_effect_shape)
                                        self.assertEqual(shape(est.effect_interval(X, T0=T0, T1=T)),
                                                         (2,) + effect_shape)

                                        # test const marginal inference
                                        self.assertEqual(shape(const_marg_effect_inf.summary_frame()),
                                                         const_marginal_effect_summaryframe_shape)
                                        self.assertEqual(shape(const_marg_effect_inf.point_estimate),
                                                         const_marginal_effect_shape)
                                        self.assertEqual(shape(const_marg_effect_inf.stderr),
                                                         const_marginal_effect_shape)
                                        self.assertEqual(shape(const_marg_effect_inf.var),
                                                         const_marginal_effect_shape)
                                        self.assertEqual(shape(const_marg_effect_inf.pvalue()),
                                                         const_marginal_effect_shape)
                                        self.assertEqual(shape(const_marg_effect_inf.zstat()),
                                                         const_marginal_effect_shape)
                                        self.assertEqual(shape(const_marg_effect_inf.conf_int()),
                                                         (2,) + const_marginal_effect_shape)
                                        np.testing.assert_array_almost_equal(const_marg_effect_inf.conf_int()
                                                                             [0], const_marg_eff_int[0], decimal=5)
                                        const_marg_effect_inf.population_summary()._repr_html_()

                                        # test effect inference
                                        self.assertEqual(shape(effect_inf.summary_frame()),
                                                         effect_summaryframe_shape)
                                        self.assertEqual(shape(effect_inf.point_estimate),
                                                         effect_shape)
                                        self.assertEqual(shape(effect_inf.stderr),
                                                         effect_shape)
                                        self.assertEqual(shape(effect_inf.var),
                                                         effect_shape)
                                        self.assertEqual(shape(effect_inf.pvalue()),
                                                         effect_shape)
                                        self.assertEqual(shape(effect_inf.zstat()),
                                                         effect_shape)
                                        self.assertEqual(shape(effect_inf.conf_int()),
                                                         (2,) + effect_shape)
                                        np.testing.assert_array_almost_equal(effect_inf.conf_int()
                                                                             [0], est.effect_interval(
                                                                                 X, T0=T0, T1=T1)
                                                                             [0], decimal=5)
                                        effect_inf.population_summary()._repr_html_()

                                        # test marginal effect inference
                                        self.assertEqual(shape(marg_effect_inf.summary_frame()),
                                                         marginal_effect_summaryframe_shape)
                                        self.assertEqual(shape(marg_effect_inf.point_estimate),
                                                         marginal_effect_shape)
                                        self.assertEqual(shape(marg_effect_inf.stderr),
                                                         marginal_effect_shape)
                                        self.assertEqual(shape(marg_effect_inf.var),
                                                         marginal_effect_shape)
                                        self.assertEqual(shape(marg_effect_inf.pvalue()),
                                                         marginal_effect_shape)
                                        self.assertEqual(shape(marg_effect_inf.zstat()),
                                                         marginal_effect_shape)
                                        self.assertEqual(shape(marg_effect_inf.conf_int()),
                                                         (2,) + marginal_effect_shape)
                                        np.testing.assert_array_almost_equal(marg_effect_inf.conf_int()
                                                                             [0], marg_eff_int[0], decimal=5)
                                        marg_effect_inf.population_summary()._repr_html_()

                                    est.score(Y, T, X, W)

                                    # make sure we can call effect with implied scalar treatments, no matter the
                                    # dimensions of T, and also that we warn when there are multiple treatments
                                    # (d_t only takes the values 0 and 1 here, so the warning branch is never taken)
                                    if d_t > 1:
                                        cm = self.assertWarns(Warning)
                                    else:
                                        cm = ExitStack()  # ExitStack can be used as a "do nothing" ContextManager
                                    with cm:
                                        effect_shape2 = (
                                            n if d_x else 1,) + ((d_y,) if d_y > 0 else())
                                        eff = est.effect(X, T0='a', T1='b')
                                        self.assertEqual(
                                            shape(eff), effect_shape2)
예제 #7
0
    def test_drlearner_all_attributes(self):
        """Fit DRLearner over many configurations and check its fitted attributes.

        The data has a three-level treatment and a known effect of
        ``(1 + .5 * x0) * T``.  For each combination of W, sample weights,
        featurizer, first-stage / final models and ``multitask_model_final``
        the test checks effect estimates, scores, CATE feature names, the
        shapes of the first-stage model attributes and the final model's
        coefficients or feature importances.
        """
        from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, RandomForestRegressor
        from sklearn.linear_model import LinearRegression, LogisticRegression
        from econml.utilities import StatsModelsLinearRegression
        import scipy.special
        np.random.seed(123)
        controls = np.random.uniform(-1, 1, size=(5000, 3))
        # treatment takes values in {0, 1, 2}, with probability driven by the first control
        T = np.random.binomial(2, scipy.special.expit(controls[:, 0]))
        sigma = 0.01
        # outcome: effect (1 + .5 * x0) per unit of treatment, plus x0 and small noise
        y = (1 + .5 * controls[:, 0]) * T + controls[:,
                                                     0] + np.random.normal(0, sigma, size=(5000,))
        for X in [controls]:
            for W in [None, controls]:
                for sample_weight in [None, 1 + np.random.randint(10, size=X.shape[0])]:
                    for sample_var in [None, 1 + np.random.randint(10, size=X.shape[0])]:
                        for featurizer in [None, PolynomialFeatures(degree=2, include_bias=False)]:
                            # each entry is (propensity model, regression model, final model);
                            # NOTE(review): the first two entries are identical -- confirm
                            # whether the duplicate is intentional
                            for models in [(GradientBoostingClassifier(), GradientBoostingRegressor(),
                                            RandomForestRegressor(n_estimators=100,
                                                                  max_depth=5, min_samples_leaf=50)),
                                           (GradientBoostingClassifier(), GradientBoostingRegressor(),
                                            RandomForestRegressor(n_estimators=100,
                                                                  max_depth=5, min_samples_leaf=50)),
                                           (LogisticRegression(solver='lbfgs', multi_class='auto'),
                                            LinearRegression(), StatsModelsLinearRegression())]:
                                for multitask_model_final in [False, True]:
                                    # `models` is a tuple, so the isinstance check below is
                                    # never true and every non-None sample_var case is skipped.
                                    # NOTE(review): probably meant models[2]; changing it would
                                    # run combinations that are currently skipped -- confirm
                                    # they pass before fixing.
                                    if (not isinstance(models, StatsModelsLinearRegression))\
                                            and (sample_var is not None):
                                        continue
                                    with self.subTest(X=X, W=W, sample_weight=sample_weight, sample_var=sample_var,
                                                      featurizer=featurizer, models=models,
                                                      multitask_model_final=multitask_model_final):
                                        est = DRLearner(model_propensity=models[0],
                                                        model_regression=models[1],
                                                        model_final=models[2],
                                                        featurizer=featurizer,
                                                        multitask_model_final=multitask_model_final)
                                        # X is always `controls` in this test, so this branch is not reached
                                        if (X is None) and (W is None):
                                            with pytest.raises(AttributeError) as e_info:
                                                est.fit(y, T, X=X, W=W,
                                                        sample_weight=sample_weight, sample_var=sample_var)
                                            continue
                                        est.fit(
                                            y, T, X=X, W=W, sample_weight=sample_weight, sample_var=sample_var)
                                        # effect of T=1 vs T=0 should match 1 + .5 * x0
                                        np.testing.assert_allclose(est.effect(X[:3], T0=0, T1=1), 1 + .5 * X[:3, 0],
                                                                   rtol=0, atol=.15)
                                        # one column per non-zero treatment level: T=1 and T=2 (twice the effect)
                                        np.testing.assert_allclose(est.const_marginal_effect(X[:3]),
                                                                   np.hstack(
                                                                       [1 + .5 * X[:3, [0]],
                                                                        2 * (1 + .5 * X[:3, [0]])]),
                                                                   rtol=0, atol=.15)
                                        # the same values are expected whichever treatment is passed
                                        for t in [1, 2]:
                                            np.testing.assert_allclose(est.marginal_effect(t, X[:3]),
                                                                       np.hstack([1 + .5 * X[:3, [0]],
                                                                                  2 * (1 + .5 * X[:3, [0]])]),
                                                                       rtol=0, atol=.15)
                                        assert isinstance(est.score_, float)
                                        assert isinstance(
                                            est.score(y, T, X=X, W=W), float)

                                        feat_names = ['A', 'B', 'C']
                                        out_feat_names = feat_names
                                        if featurizer is not None:
                                            out_feat_names = featurizer.fit(
                                                X).get_feature_names(feat_names)
                                            np.testing.assert_array_equal(
                                                est.featurizer.n_input_features_, 3)
                                        np.testing.assert_array_equal(est.cate_feature_names(feat_names),
                                                                      out_feat_names)

                                        # first-stage models: expected attribute shapes depend on
                                        # whether the gradient-boosting or the linear models were used
                                        if isinstance(models[0], GradientBoostingClassifier):
                                            np.testing.assert_array_equal(np.array([mdl.feature_importances_
                                                                                    for mdl
                                                                                    in est.models_regression]).shape,
                                                                          [2, 2 + X.shape[1] +
                                                                           (W.shape[1] if W is not None else 0)])
                                            np.testing.assert_array_equal(np.array([mdl.feature_importances_
                                                                                    for mdl
                                                                                    in est.models_propensity]).shape,
                                                                          [2, X.shape[1] +
                                                                           (W.shape[1] if W is not None else 0)])
                                        else:
                                            np.testing.assert_array_equal(np.array([mdl.coef_
                                                                                    for mdl
                                                                                    in est.models_regression]).shape,
                                                                          [2, 2 + X.shape[1] +
                                                                           (W.shape[1] if W is not None else 0)])
                                            np.testing.assert_array_equal(np.array([mdl.coef_
                                                                                    for mdl
                                                                                    in est.models_propensity]).shape,
                                                                          [2, 3, X.shape[1] +
                                                                           (W.shape[1] if W is not None else 0)])
                                        # final model: the first feature should dominate (forest) or
                                        # carry the only non-zero coefficient (linear)
                                        if multitask_model_final:
                                            if isinstance(models[2], RandomForestRegressor):
                                                np.testing.assert_equal(np.argsort(
                                                    est.multitask_model_cate.feature_importances_)[-1], 0)
                                            else:
                                                true_coef = np.zeros(
                                                    (2, len(out_feat_names)))
                                                true_coef[:, 0] = [.5, 1]
                                                np.testing.assert_allclose(
                                                    est.multitask_model_cate.coef_, true_coef, rtol=0, atol=.15)
                                                np.testing.assert_allclose(
                                                    est.multitask_model_cate.intercept_, [1, 2], rtol=0, atol=.15)
                                        else:
                                            for t in [1, 2]:
                                                if isinstance(models[2], RandomForestRegressor):
                                                    np.testing.assert_equal(np.argsort(
                                                        est.model_cate(T=t).feature_importances_)[-1], 0)
                                                else:
                                                    true_coef = np.zeros(
                                                        len(out_feat_names))
                                                    true_coef[0] = .5 * t
                                                    np.testing.assert_allclose(
                                                        est.model_cate(T=t).coef_, true_coef, rtol=0, atol=.15)
                                                    np.testing.assert_allclose(
                                                        est.model_cate(T=t).intercept_, t, rtol=0, atol=.15)
예제 #8
0
    def test_comparison(self):
        """Compare several CATE estimators through RScorer on held-out data.

        The estimators are fitted in parallel on a 60% training split and
        scored on the remaining 40%.  The test asserts that:

        * regressing each model's root mean squared effect error on its
          R-score gives a slope below 0.5;
        * the model picked by ``scorer.best_model`` and the one built by
          ``scorer.ensemble`` each have an error below 1.2 times the
          smallest individual error.
        """
        def reg():
            return LinearRegression()

        def clf():
            return LogisticRegression()

        y, T, X, true_eff = self._get_data()
        split = train_test_split(X, T, y, true_eff, test_size=.4)
        X_train, X_val, T_train, T_val, Y_train, Y_val, _, true_eff_val = split

        candidates = [
            ('ldml', LinearDML(model_y=reg(),
                               model_t=clf(),
                               discrete_treatment=True,
                               linear_first_stages=False,
                               cv=3)),
            ('sldml', SparseLinearDML(model_y=reg(),
                                      model_t=clf(),
                                      discrete_treatment=True,
                                      featurizer=PolynomialFeatures(degree=2,
                                                                    include_bias=False),
                                      linear_first_stages=False,
                                      cv=3)),
            ('xlearner', XLearner(models=reg(),
                                  cate_models=reg(),
                                  propensity_model=clf())),
            ('dalearner', DomainAdaptationLearner(models=reg(),
                                                  final_models=reg(),
                                                  propensity_model=clf())),
            ('slearner', SLearner(overall_model=reg())),
            ('tlearner', TLearner(models=reg())),
            ('drlearner', DRLearner(model_propensity=clf(),
                                    model_regression=reg(),
                                    model_final=reg(),
                                    cv=3)),
            ('rlearner', NonParamDML(model_y=reg(),
                                     model_t=clf(),
                                     model_final=reg(),
                                     discrete_treatment=True,
                                     cv=3)),
            ('dml3dlasso', DML(model_y=reg(),
                               model_t=clf(),
                               model_final=reg(),
                               discrete_treatment=True,
                               featurizer=PolynomialFeatures(degree=3),
                               linear_first_stages=False,
                               cv=3)),
        ]

        # Fit every candidate in parallel; each job yields a (name, model) pair.
        fit_jobs = (delayed(_fit_model)(name, mdl, Y_train, T_train, X_train)
                    for name, mdl in candidates)
        fitted_pairs = Parallel(n_jobs=-1, verbose=1)(fit_jobs)
        fitted_models = [mdl for _, mdl in fitted_pairs]

        def root_effect_error(model):
            # Root mean squared gap between the true and the estimated
            # effect on the validation split.
            gap = true_eff_val.flatten() - model.effect(X_val).flatten()
            return np.sqrt(np.mean(gap**2))

        scorer = RScorer(model_y=reg(),
                         model_t=clf(),
                         discrete_treatment=True,
                         cv=3,
                         mc_iters=2,
                         mc_agg='median')
        scorer.fit(Y_val, T_val, X=X_val)

        rscore = [scorer.score(mdl) for mdl in fitted_models]
        rootpehe_score = [root_effect_error(mdl) for mdl in fitted_models]
        slope_fit = LinearRegression().fit(np.array(rscore).reshape(-1, 1),
                                           np.array(rootpehe_score))
        assert slope_fit.coef_ < 0.5

        best_mdl, _ = scorer.best_model(fitted_models)
        rootpehe_best = root_effect_error(best_mdl)
        assert rootpehe_best < 1.2 * np.min(rootpehe_score)

        ensemble_mdl, _ = scorer.ensemble(fitted_models)
        rootpehe_ensemble = root_effect_error(ensemble_mdl)
        assert rootpehe_ensemble < 1.2 * np.min(rootpehe_score)
예제 #9
0
    def test_auto_inference(self):
        """Exercise the inference methods after fitting without an explicit method.

        Covers four cases on the ``TestInference`` class data:

        * DRLearner with a StatsModelsLinearRegression final model: the
          inference summaries can be produced.
        * DRLearner with a LinearRegression final model and
          ``multitask_model_final=True``: ``effect_inference`` raises
          AttributeError.
        * DML with a StatsModelsLinearRegression final model and
          NonParamDML with a DebiasedLasso final model: summaries can be
          produced and every checked ``stderr`` is not None.
        """
        Y = TestInference.Y
        T = TestInference.T
        X = TestInference.X
        W = TestInference.W

        def summarize_effects(fitted):
            # Summaries only; nothing is asserted about their contents.
            fitted.effect_inference(X).summary_frame()
            fitted.effect_inference(X).population_summary()
            fitted.const_marginal_effect_inference(X).summary_frame()
            fitted.marginal_effect_inference(T, X).summary_frame()

        def check_effect_stderr(fitted):
            # Summaries interleaved with checks that standard errors exist.
            fitted.effect_inference(X).summary_frame()
            assert fitted.effect_inference(X).stderr is not None
            fitted.effect_inference(X).population_summary()
            fitted.const_marginal_effect_inference(X).summary_frame()
            assert fitted.const_marginal_effect_inference(X).stderr is not None
            fitted.marginal_effect_inference(T, X).summary_frame()
            assert fitted.marginal_effect_inference(T, X).stderr is not None

        # DRLearner with a statsmodels-style final model.
        dr_statsmodels = DRLearner(model_regression=LinearRegression(),
                                   model_propensity=LogisticRegression(),
                                   model_final=StatsModelsLinearRegression())
        dr_statsmodels.fit(Y, T, X=X, W=W)
        summarize_effects(dr_statsmodels)

        # Multitask DRLearner with a plain LinearRegression final model:
        # requesting effect inference is expected to fail.
        dr_multitask = DRLearner(model_regression=LinearRegression(),
                                 model_propensity=LogisticRegression(),
                                 model_final=LinearRegression(),
                                 multitask_model_final=True)
        dr_multitask.fit(Y, T, X=X, W=W)
        with pytest.raises(AttributeError):
            dr_multitask.effect_inference(X)

        # Parametric DML: coefficient and intercept inference as well.
        dml = DML(model_y=LinearRegression(),
                  model_t=LinearRegression(),
                  model_final=StatsModelsLinearRegression(fit_intercept=False),
                  random_state=123)
        dml.fit(Y, T, X=X, W=W)
        dml.summary()
        dml.coef__inference().summary_frame()
        assert dml.coef__inference().stderr is not None
        dml.intercept__inference().summary_frame()
        assert dml.intercept__inference().stderr is not None
        check_effect_stderr(dml)

        # Non-parametric DML with a debiased lasso final stage.
        nonparam = NonParamDML(model_y=LinearRegression(),
                               model_t=LinearRegression(),
                               model_final=DebiasedLasso(),
                               random_state=123)
        nonparam.fit(Y, T, X=X, W=W)
        check_effect_stderr(nonparam)