def test_final_models(self):
    d_y = (1,)
    y = np.random.choice([0, 1], size=(500,) + d_y)
    X = np.hstack((np.random.normal(size=(500, 2)),
                   np.random.choice([0, 1], size=(500, 1)),
                   np.random.choice([0, 1, 2], size=(500, 1))))
    inds = [0, 1, 2, 3]
    cats = [2, 3]
    hinds = [0, 3]
    for h_model in ['forest', 'linear']:
        for classification in [False, True]:
            ca = CausalAnalysis(inds, cats, hinds, classification=classification,
                                heterogeneity_model=h_model)
            ca.fit(X, y)
            glo = ca.global_causal_effect()
            coh = ca.cohort_causal_effect(X[:2])
            loc = ca.local_causal_effect(X[:2])
            glo_dict = ca._global_causal_effect_dict()
            coh_dict = ca._cohort_causal_effect_dict(X[:2])
            loc_dict = ca._local_causal_effect_dict(X[:2])
            ca._policy_tree_output(X, 1)
            ca._heterogeneity_tree_output(X, 1)
            ca._heterogeneity_tree_output(X, 3)

            # Make sure we handle continuous, binary, and multi-class treatments
            # For multiple discrete treatments, one "always treat" value per non-default treatment
            for (idx, length) in [(0, 1), (1, 1), (2, 1), (3, 2)]:
                _, policy_val, always_trt = ca._policy_tree_output(X, idx)
                assert isinstance(always_trt, list)
                assert np.array(policy_val).shape == ()
                assert np.array(always_trt).shape == (length,)

                # policy value should exceed always treating with any treatment
                assert_less_close(always_trt, policy_val)

            if not classification:
                # ExitStack can be used as a "do nothing" ContextManager
                cm = ExitStack()
            else:
                cm = self.assertRaises(Exception)
            with cm:
                inf = ca.whatif(X[:2], np.ones(shape=(2,)), 1, y[:2])
                inf = ca.whatif(X[:2], np.ones(shape=(2,)), 2, y[:2])
                ca._whatif_dict(X[:2], np.ones(shape=(2,)), 1, y[:2])

            with self.assertRaises(AssertionError):
                ca = CausalAnalysis(inds, cats, hinds, classification=classification,
                                    heterogeneity_model='other')
                ca.fit(X, y)
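# The assert_less_close helper used throughout these tests is defined elsewhere in the
# file. A minimal sketch of its presumed behavior (an assumption, not the verbatim
# definition): every element of the array should be less than, or numerically close to,
# the given limit.
def assert_less_close(arr, lim):
    """Assert that every element of arr is less than, or approximately equal to, lim."""
    arr = np.asarray(arr)
    assert np.all(np.logical_or(arr < lim, np.isclose(arr, lim)))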
def test_forest_with_pandas(self):
    y = pd.Series(np.random.choice([0, 1], size=(500,)))
    X = pd.DataFrame({'a': np.random.normal(size=500),
                      'b': np.random.normal(size=500),
                      'c': np.random.choice([0, 1], size=500),
                      'd': np.random.choice(['a', 'b', 'c'], size=500)})
    inds = ['a', 'b', 'c', 'd']
    cats = ['c', 'd']
    hinds = ['a', 'd']
    ca = CausalAnalysis(inds, cats, hinds, heterogeneity_model='forest')
    ca.fit(X, y)
    glo = ca.global_causal_effect()
    coh = ca.cohort_causal_effect(X[:2])
    loc = ca.local_causal_effect(X[:2])

    # global and cohort data should have exactly the same structure, but different values
    assert glo.index.equals(coh.index)

    # the local index should have (number of rows passed in) times as many entries as the global index
    assert len(loc.index) == 2 * len(glo.index)

    assert glo.index.names == ['feature', 'feature_value']
    assert loc.index.names == ['sample'] + glo.index.names

    # features; for categoricals they should appear #cats-1 times each
    fts = ['a', 'b', 'c', 'd', 'd']
    for i in range(len(fts)):
        assert fts[i] == glo.index[i][0] == loc.index[i][1] == loc.index[len(fts) + i][1]

    glo_dict = ca._global_causal_effect_dict()
    coh_dict = ca._cohort_causal_effect_dict(X[:2])
    loc_dict = ca._local_causal_effect_dict(X[:2])

    glo_point_est = np.array(glo_dict[_CausalInsightsConstants.PointEstimateKey])
    coh_point_est = np.array(coh_dict[_CausalInsightsConstants.PointEstimateKey])
    loc_point_est = np.array(loc_dict[_CausalInsightsConstants.PointEstimateKey])

    # global shape is (d_y, sum(d_t))
    assert glo_point_est.shape == coh_point_est.shape == (1, 5)
    assert loc_point_est.shape == (2,) + glo_point_est.shape

    ca._policy_tree_output(X, inds[1])
    ca._heterogeneity_tree_string(X, inds[1])
    ca._heterogeneity_tree_string(X, inds[3])

    # Can't handle multi-dimensional treatments
    with self.assertRaises(AssertionError):
        ca._policy_tree_output(X, inds[3])
def test_final_models(self):
    d_y = (1,)
    y = np.random.choice([0, 1], size=(500,) + d_y)
    X = np.hstack((np.random.normal(size=(500, 2)),
                   np.random.choice([0, 1], size=(500, 1)),
                   np.random.choice([0, 1, 2], size=(500, 1))))
    inds = [0, 1, 2, 3]
    cats = [2, 3]
    hinds = [0, 3]
    for h_model in ['forest', 'linear']:
        for classification in [False, True]:
            ca = CausalAnalysis(inds, cats, hinds, classification=classification,
                                heterogeneity_model=h_model)
            ca.fit(X, y)
            glo = ca.global_causal_effect()
            coh = ca.cohort_causal_effect(X[:2])
            loc = ca.local_causal_effect(X[:2])
            glo_dict = ca._global_causal_effect_dict()
            coh_dict = ca._cohort_causal_effect_dict(X[:2])
            loc_dict = ca._local_causal_effect_dict(X[:2])
            ca._policy_tree_output(X, 1)
            ca._heterogeneity_tree_string(X, 1)
            ca._heterogeneity_tree_string(X, 3)

            # Can't handle multi-dimensional treatments
            with self.assertRaises(AssertionError):
                ca._policy_tree_output(X, 3)

            if not classification:
                # ExitStack can be used as a "do nothing" ContextManager
                cm = ExitStack()
            else:
                cm = self.assertRaises(Exception)
            with cm:
                inf = ca.whatif(X[:2], np.ones(shape=(2,)), 1, y[:2])
                inf.summary_frame()
                inf = ca.whatif(X[:2], np.ones(shape=(2,)), 2, y[:2])
                inf.summary_frame()
                ca._whatif_dict(X[:2], np.ones(shape=(2,)), 1, y[:2])

            with self.assertRaises(AssertionError):
                ca = CausalAnalysis(inds, cats, hinds, classification=classification,
                                    heterogeneity_model='other')
                ca.fit(X, y)
def test_empty_hinds(self):
    for h_model in ['linear', 'forest']:
        for classification in [True, False]:
            X1 = np.random.normal(0, 1, size=(500, 5))
            X2 = np.random.choice([0, 1], size=(500, 1))
            X3 = np.random.choice([0, 1, 2], size=(500, 1))
            X = np.hstack((X1, X2, X3))
            X_df = pd.DataFrame(X, columns=[f"x{i} " for i in range(7)])
            y = np.random.choice([0, 1], size=(500,))
            y_df = pd.Series(y)
            # model
            hetero_inds = [[], [], []]
            feat_inds = [1, 3, 5]
            categorical = [5, 6]
            ca = CausalAnalysis(feat_inds, categorical, heterogeneity_inds=hetero_inds,
                                classification=classification,
                                nuisance_models='linear', heterogeneity_model=h_model,
                                n_jobs=-1)
            ca.fit(X_df, y)
            eff = ca.global_causal_effect(alpha=0.05)
            eff = ca.local_causal_effect(X_df, alpha=0.05)
            for ind in feat_inds:
                tree, val, always_trt = ca._policy_tree_output(X_df, ind)
def test_one_feature(self):
    # make sure we don't run into problems dropping every index
    y = pd.Series(np.random.choice([0, 1], size=(500,)))
    X = pd.DataFrame({'a': np.random.normal(size=500),
                      'b': np.random.normal(size=500),
                      'c': np.random.choice([0, 1], size=500),
                      'd': np.random.choice(['a', 'b', 'c'], size=500)})
    inds = ['a']
    cats = ['c', 'd']
    hinds = ['a', 'd']
    ca = CausalAnalysis(inds, cats, hinds, classification=False)
    ca.fit(X, y)
    glo = ca.global_causal_effect()
    coh = ca.cohort_causal_effect(X[:2])
    loc = ca.local_causal_effect(X[:2])

    # global and cohort data should have exactly the same structure, but different values
    assert glo.index.equals(coh.index)

    # the local index should have (number of rows passed in) times as many entries as the global index
    assert len(loc.index) == 2 * len(glo.index)

    assert glo.index.names == ['feature']
    assert loc.index.names == ['sample']

    glo_dict = ca._global_causal_effect_dict()
    coh_dict = ca._cohort_causal_effect_dict(X[:2])
    loc_dict = ca._local_causal_effect_dict(X[:2])

    glo_point_est = np.array(glo_dict[_CausalInsightsConstants.PointEstimateKey])
    coh_point_est = np.array(coh_dict[_CausalInsightsConstants.PointEstimateKey])
    loc_point_est = np.array(loc_dict[_CausalInsightsConstants.PointEstimateKey])

    # global shape is (d_y, sum(d_t))
    assert glo_point_est.shape == coh_point_est.shape == (1, 1)
    assert loc_point_est.shape == (2,) + glo_point_est.shape

    ca._policy_tree_output(X, inds[0])
    ca._heterogeneity_tree_string(X, inds[0])
def test_basic_pandas(self):
    for classification in [False, True]:
        y = pd.Series(np.random.choice([0, 1], size=(500,)))
        X = pd.DataFrame({'a': np.random.normal(size=500),
                          'b': np.random.normal(size=500),
                          'c': np.random.choice([0, 1], size=500),
                          'd': np.random.choice(['a', 'b', 'c'], size=500)})
        n_inds = [0, 1, 2, 3]
        t_inds = ['a', 'b', 'c', 'd']
        n_cats = [2, 3]
        t_cats = ['c', 'd']
        n_hinds = [0, 3]
        t_hinds = ['a', 'd']
        for (inds, cats, hinds) in [(n_inds, n_cats, n_hinds), (t_inds, t_cats, t_hinds)]:
            ca = CausalAnalysis(inds, cats, hinds, classification=classification)
            ca.fit(X, y)
            glo = ca.global_causal_effect()
            coh = ca.cohort_causal_effect(X[:2])
            loc = ca.local_causal_effect(X[:2])

            # global and cohort data should have exactly the same structure, but different values
            assert glo.index.equals(coh.index)

            # the local index should have (number of rows passed in) times as many entries as the global index
            assert len(loc.index) == 2 * len(glo.index)

            assert glo.index.names == ['feature', 'feature_value']
            assert loc.index.names == ['sample'] + glo.index.names

            # features; for categoricals they should appear #cats-1 times each
            fts = ['a', 'b', 'c', 'd', 'd']
            for i in range(len(fts)):
                assert fts[i] == glo.index[i][0] == loc.index[i][1] == loc.index[len(fts) + i][1]

            glo_dict = ca._global_causal_effect_dict()
            coh_dict = ca._cohort_causal_effect_dict(X[:2])
            loc_dict = ca._local_causal_effect_dict(X[:2])

            glo_point_est = np.array(glo_dict[_CausalInsightsConstants.PointEstimateKey])
            coh_point_est = np.array(coh_dict[_CausalInsightsConstants.PointEstimateKey])
            loc_point_est = np.array(loc_dict[_CausalInsightsConstants.PointEstimateKey])

            # global shape is (d_y, sum(d_t))
            assert glo_point_est.shape == coh_point_est.shape == (1, 5)
            assert loc_point_est.shape == (2,) + glo_point_est.shape

            ca._policy_tree_output(X, inds[1])
            ca._heterogeneity_tree_string(X, inds[1])
            ca._heterogeneity_tree_string(X, inds[3])

            # Can't handle multi-dimensional treatments
            with self.assertRaises(AssertionError):
                ca._policy_tree_output(X, inds[3])

            if not classification:
                # ExitStack can be used as a "do nothing" ContextManager
                cm = ExitStack()
            else:
                cm = self.assertRaises(Exception)
            with cm:
                inf = ca.whatif(X[:2], np.ones(shape=(2,)), inds[1], y[:2])
                assert np.shape(inf.point_estimate) == np.shape(y[:2])
                inf.summary_frame()
                inf = ca.whatif(X[:2], np.ones(shape=(2,)), inds[2], y[:2])
                assert np.shape(inf.point_estimate) == np.shape(y[:2])
                inf.summary_frame()
                ca._whatif_dict(X[:2], np.ones(shape=(2,)), inds[1], y[:2])

        badargs = [
            (n_inds, n_cats, [4]),      # hinds out of range
            (n_inds, n_cats, ["test"])  # hinds out of range
        ]

        for args in badargs:
            with self.assertRaises(Exception):
                ca = CausalAnalysis(*args)
                ca.fit(X, y)
def test_automl_first_stage(self):
    d_y = (1,)
    for classification in [False, True]:
        y = np.random.choice([0, 1], size=(500,) + d_y)
        X = np.hstack((np.random.normal(size=(500, 2)),
                       np.random.choice([0, 1], size=(500, 1)),
                       np.random.choice([0, 1, 2], size=(500, 1))))
        inds = [0, 1, 2, 3]
        cats = [2, 3]
        hinds = [0, 3]
        ca = CausalAnalysis(inds, cats, hinds, classification=classification,
                            nuisance_models='automl')
        ca.fit(X, y)
        glo = ca.global_causal_effect()
        coh = ca.cohort_causal_effect(X[:2])
        loc = ca.local_causal_effect(X[:2])

        # global and cohort data should have exactly the same structure, but different values
        assert glo.index.equals(coh.index)

        # the local index should have (number of rows passed in) times as many entries as the global index
        assert len(loc.index) == 2 * len(glo.index)

        assert glo.index.names == ['feature', 'feature_value']
        assert loc.index.names == ['sample'] + glo.index.names

        glo_dict = ca._global_causal_effect_dict()
        coh_dict = ca._cohort_causal_effect_dict(X[:2])
        loc_dict = ca._local_causal_effect_dict(X[:2])

        glo_point_est = np.array(glo_dict[_CausalInsightsConstants.PointEstimateKey])
        coh_point_est = np.array(coh_dict[_CausalInsightsConstants.PointEstimateKey])
        loc_point_est = np.array(loc_dict[_CausalInsightsConstants.PointEstimateKey])

        ca._policy_tree_output(X, 1)
        ca._heterogeneity_tree_string(X, 1)
        ca._heterogeneity_tree_string(X, 3)

        # Can't handle multi-dimensional treatments
        with self.assertRaises(AssertionError):
            ca._policy_tree_output(X, 3)

        # global shape is (d_y, sum(d_t))
        assert glo_point_est.shape == coh_point_est.shape == (1, 5)
        assert loc_point_est.shape == (2,) + glo_point_est.shape

        if not classification:
            # ExitStack can be used as a "do nothing" ContextManager
            cm = ExitStack()
        else:
            cm = self.assertRaises(Exception)
        with cm:
            inf = ca.whatif(X[:2], np.ones(shape=(2,)), 1, y[:2])
            assert np.shape(inf.point_estimate) == np.shape(y[:2])
            inf.summary_frame()
            inf = ca.whatif(X[:2], np.ones(shape=(2,)), 2, y[:2])
            assert np.shape(inf.point_estimate) == np.shape(y[:2])
            inf.summary_frame()
            ca._whatif_dict(X[:2], np.ones(shape=(2,)), 1, y[:2])

        # features; for categoricals they should appear #cats-1 times each
        fts = ['x0', 'x1', 'x2', 'x3', 'x3']
        for i in range(len(fts)):
            assert fts[i] == glo.index[i][0] == loc.index[i][1] == loc.index[len(fts) + i][1]

        badargs = [
            (inds, cats, [4]),      # hinds out of range
            (inds, cats, ["test"])  # hinds out of range
        ]

        for args in badargs:
            with self.assertRaises(Exception):
                ca = CausalAnalysis(*args)
                ca.fit(X, y)
def test_forest_with_pandas(self):
    y = pd.Series(np.random.choice([0, 1], size=(500,)))
    X = pd.DataFrame({'a': np.random.normal(size=500),
                      'b': np.random.normal(size=500),
                      'c': np.random.choice([0, 1], size=500),
                      'd': np.random.choice(['a', 'b', 'c'], size=500)})
    inds = ['a', 'b', 'c', 'd']
    cats = ['c', 'd']
    hinds = ['a', 'd']
    ca = CausalAnalysis(inds, cats, hinds, heterogeneity_model='forest')
    ca.fit(X, y)
    glo = ca.global_causal_effect()
    coh = ca.cohort_causal_effect(X[:2])
    loc = ca.local_causal_effect(X[:2])

    # global and cohort data should have exactly the same structure, but different values
    assert glo.index.equals(coh.index)

    # the local index should have (number of rows passed in) times as many entries as the global index
    assert len(loc.index) == 2 * len(glo.index)

    assert glo.index.names == ['feature', 'feature_value']
    assert loc.index.names == ['sample'] + glo.index.names

    # features; for categoricals they should appear #cats-1 times each
    fts = ['a', 'b', 'c', 'd', 'd']
    for i in range(len(fts)):
        assert fts[i] == glo.index[i][0] == loc.index[i][1] == loc.index[len(fts) + i][1]

    glo_dict = ca._global_causal_effect_dict()
    coh_dict = ca._cohort_causal_effect_dict(X[:2])
    loc_dict = ca._local_causal_effect_dict(X[:2])

    glo_point_est = np.array(glo_dict[_CausalInsightsConstants.PointEstimateKey])
    coh_point_est = np.array(coh_dict[_CausalInsightsConstants.PointEstimateKey])
    loc_point_est = np.array(loc_dict[_CausalInsightsConstants.PointEstimateKey])

    # global shape is (d_y, sum(d_t))
    assert glo_point_est.shape == coh_point_est.shape == (1, 5)
    assert loc_point_est.shape == (2,) + glo_point_est.shape

    ca._policy_tree_output(X, inds[1])
    ca._heterogeneity_tree_output(X, inds[1])
    ca._heterogeneity_tree_output(X, inds[3])

    # Make sure we handle continuous, binary, and multi-class treatments
    # For multiple discrete treatments, one "always treat" value per non-default treatment
    for (idx, length) in [(0, 1), (1, 1), (2, 1), (3, 2)]:
        _, policy_val, always_trt = ca._policy_tree_output(X, inds[idx])
        assert isinstance(always_trt, list)
        assert np.array(policy_val).shape == ()
        assert np.array(always_trt).shape == (length,)

        # policy value should exceed always treating with any treatment
        assert_less_close(always_trt, policy_val)
def test_basic_array(self):
    for d_y in [(), (1,)]:
        for classification in [False, True]:
            y = np.random.choice([0, 1], size=(500,) + d_y)
            X = np.hstack((np.random.normal(size=(500, 2)),
                           np.random.choice([0, 1], size=(500, 1)),
                           np.random.choice([0, 1, 2], size=(500, 1))))
            inds = [0, 1, 2, 3]
            cats = [2, 3]
            hinds = [0, 3]
            ca = CausalAnalysis(inds, cats, hinds, classification=classification)
            ca.fit(X, y)
            glo = ca.global_causal_effect()
            coh = ca.cohort_causal_effect(X[:2])
            loc = ca.local_causal_effect(X[:2])

            # global and cohort data should have exactly the same structure, but different values
            assert glo.index.equals(coh.index)

            # the local index should have (number of rows passed in) times as many entries as the global index
            assert len(loc.index) == 2 * len(glo.index)

            assert glo.index.names == ['feature', 'feature_value']
            assert loc.index.names == ['sample'] + glo.index.names

            glo_dict = ca._global_causal_effect_dict()
            coh_dict = ca._cohort_causal_effect_dict(X[:2])
            loc_dict = ca._local_causal_effect_dict(X[:2])

            glo_point_est = np.array(glo_dict[_CausalInsightsConstants.PointEstimateKey])
            coh_point_est = np.array(coh_dict[_CausalInsightsConstants.PointEstimateKey])
            loc_point_est = np.array(loc_dict[_CausalInsightsConstants.PointEstimateKey])

            ca._heterogeneity_tree_output(X, 1)
            ca._heterogeneity_tree_output(X, 3)

            # Make sure we handle continuous, binary, and multi-class treatments
            # For multiple discrete treatments, one "always treat" value per non-default treatment
            for (idx, length) in [(0, 1), (1, 1), (2, 1), (3, 2)]:
                _, policy_val, always_trt = ca._policy_tree_output(X, idx)
                assert isinstance(always_trt, list)
                assert np.array(policy_val).shape == ()
                assert np.array(always_trt).shape == (length,)

                # policy value should exceed always treating with any treatment
                assert_less_close(always_trt, policy_val)

            # global shape is (d_y, sum(d_t))
            assert glo_point_est.shape == coh_point_est.shape == (1, 5)
            assert loc_point_est.shape == (2,) + glo_point_est.shape

            if not classification:
                # ExitStack can be used as a "do nothing" ContextManager
                cm = ExitStack()
            else:
                cm = self.assertRaises(Exception)
            with cm:
                inf = ca.whatif(X[:2], np.ones(shape=(2,)), 1, y[:2])
                assert np.shape(inf.point_estimate) == (2,)
                inf = ca.whatif(X[:2], np.ones(shape=(2,)), 2, y[:2])
                assert np.shape(inf.point_estimate) == (2,)
                ca._whatif_dict(X[:2], np.ones(shape=(2,)), 1, y[:2])

            # features; for categoricals they should appear #cats-1 times each
            fts = ['x0', 'x1', 'x2', 'x3', 'x3']
            for i in range(len(fts)):
                assert fts[i] == glo.index[i][0] == loc.index[i][1] == loc.index[len(fts) + i][1]

            badargs = [
                (inds, cats, [4]),      # hinds out of range
                (inds, cats, ["test"])  # hinds out of range
            ]

            for args in badargs:
                with self.assertRaises(Exception):
                    ca = CausalAnalysis(*args)
                    ca.fit(X, y)
def compute(self):
    """Computes the causal insights by running the causal configuration."""
    for config in self._causal_config_list:
        if config.is_computed:
            continue

        config.is_computed = True
        if config.nuisance_model not in [CausalConstants.AUTOML,
                                         CausalConstants.LINEAR]:
            message = (f"nuisance_model should be one of "
                       f"['{CausalConstants.AUTOML}', "
                       f"'{CausalConstants.LINEAR}'], "
                       f"got {config.nuisance_model}")
            raise UserConfigValidationException(message)

        is_classification = self._task_type == ModelTask.CLASSIFICATION
        X = pd.concat([self._train, self._test], ignore_index=True)\
            .drop([self._target_column], axis=1)
        y = pd.concat([self._train, self._test], ignore_index=True)[
            self._target_column].values.ravel()

        categoricals = self._categorical_features
        if categoricals is None:
            categoricals = []

        analysis = CausalAnalysis(
            config.treatment_features,
            categoricals,
            heterogeneity_inds=config.heterogeneity_features,
            classification=is_classification,
            nuisance_models=config.nuisance_model,
            upper_bound_on_cat_expansion=config.max_cat_expansion,
            skip_cat_limit_checks=config.skip_cat_limit_checks,
            n_jobs=-1)
        analysis.fit(X, y)

        config.causal_analysis = analysis

        X_test = self._test.drop([self._target_column], axis=1)

        config.global_effects = analysis.global_causal_effect(
            alpha=config.alpha, keep_all_levels=True)
        config.local_effects = analysis.local_causal_effect(
            X_test, alpha=config.alpha, keep_all_levels=True)

        config.policies = []
        for treatment_feature in config.treatment_features:
            local_policies = analysis.individualized_policy(
                X_test, treatment_feature,
                treatment_costs=config.treatment_cost,
                alpha=config.alpha)

            tree = analysis._policy_tree_output(
                X_test, treatment_feature,
                treatment_costs=config.treatment_cost,
                max_depth=config.max_tree_depth,
                min_samples_leaf=config.min_tree_leaf_samples,
                alpha=config.alpha)

            policy = {
                self.TREATMENT_FEATURE: treatment_feature,
                self.CONTROL_TREATMENT: tree.control_name,
                self.LOCAL_POLICIES: local_policies,
                self.POLICY_GAINS: {
                    self.RECOMMENDED_POLICY_GAINS: tree.policy_value,
                    self.TREATMENT_GAINS: tree.always_treat,
                },
                self.POLICY_TREE: tree.tree_dictionary
            }
            config.policies.append(policy)
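# A minimal, self-contained sketch of the same workflow compute() runs above, outside
# the manager class. The plain string dictionary keys stand in for the class constants
# (TREATMENT_FEATURE, CONTROL_TREATMENT, etc.), the econml import path is assumed, and
# the named attributes on the _policy_tree_output result follow the newer API used in
# compute(); treat this as an illustration, not the manager's implementation.
import numpy as np
import pandas as pd
from econml.solutions.causal_analysis import CausalAnalysis  # assumed import path

X = pd.DataFrame({'a': np.random.normal(size=500),
                  'c': np.random.choice([0, 1], size=500)})
y = np.random.choice([0, 1], size=(500,))

analysis = CausalAnalysis(['a', 'c'], ['c'], classification=True,
                          nuisance_models='linear', n_jobs=-1)
analysis.fit(X, y)

# Aggregate and per-row effect estimates, keeping all index levels as compute() does
global_effects = analysis.global_causal_effect(alpha=0.05, keep_all_levels=True)
local_effects = analysis.local_causal_effect(X, alpha=0.05, keep_all_levels=True)

# Per-treatment policy output, mirroring the nested dict compute() assembles
policies = []
for treatment_feature in ['a', 'c']:
    local_policies = analysis.individualized_policy(X, treatment_feature, alpha=0.05)
    tree = analysis._policy_tree_output(X, treatment_feature, alpha=0.05)
    policies.append({
        'treatment_feature': treatment_feature,
        'control_treatment': tree.control_name,
        'local_policies': local_policies,
        'policy_gains': {
            'recommended_policy_gains': tree.policy_value,
            'treatment_gains': tree.always_treat,
        },
        'policy_tree': tree.tree_dictionary,
    })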
def test_one_feature(self):
    # make sure we don't run into problems dropping every index
    y = pd.Series(np.random.choice([0, 1], size=(500,)))
    X = pd.DataFrame({'a': np.random.normal(size=500),
                      'b': np.random.normal(size=500),
                      'c': np.random.choice([0, 1], size=500),
                      'd': np.random.choice(['a', 'b', 'c'], size=500)})
    inds = ['a']
    cats = ['c', 'd']
    hinds = ['a', 'd']
    ca = CausalAnalysis(inds, cats, hinds, classification=False)
    ca.fit(X, y)
    glo = ca.global_causal_effect()
    coh = ca.cohort_causal_effect(X[:2])
    loc = ca.local_causal_effect(X[:2])

    # global and cohort data should have exactly the same structure, but different values
    assert glo.index.equals(coh.index)

    # the local index should have (number of rows passed in) times as many entries as the global index
    assert len(loc.index) == 2 * len(glo.index)

    assert glo.index.names == ['feature']
    assert loc.index.names == ['sample']

    glo_dict = ca._global_causal_effect_dict()
    glo_dict2 = ca._global_causal_effect_dict(row_wise=True)
    coh_dict = ca._cohort_causal_effect_dict(X[:2])
    coh_dict2 = ca._cohort_causal_effect_dict(X[:2], row_wise=True)
    loc_dict = ca._local_causal_effect_dict(X[:2])
    loc_dict2 = ca._local_causal_effect_dict(X[:2], row_wise=True)

    glo_point_est = np.array(glo_dict[_CausalInsightsConstants.PointEstimateKey])
    coh_point_est = np.array(coh_dict[_CausalInsightsConstants.PointEstimateKey])
    loc_point_est = np.array(loc_dict[_CausalInsightsConstants.PointEstimateKey])

    # global shape is (d_y, sum(d_t))
    assert glo_point_est.shape == coh_point_est.shape == (1, 1)
    assert loc_point_est.shape == (2,) + glo_point_est.shape

    glo2 = ca.global_causal_effect(keep_all_levels=True)
    coh2 = ca.cohort_causal_effect(X[:2], keep_all_levels=True)
    loc2 = ca.local_causal_effect(X[:2], keep_all_levels=True)
    assert ({ind.name for ind in glo2.index.levels} ==
            {ind.name for ind in coh2.index.levels} ==
            {"outcome", "feature", "feature_value"})
    assert ({ind.name for ind in loc2.index.levels} ==
            {"sample", "outcome", "feature", "feature_value"})

    # global and cohort row-wise dicts have d_y * d_t entries
    assert (len(glo_dict2[_CausalInsightsConstants.RowData]) ==
            len(coh_dict2[_CausalInsightsConstants.RowData]) == 1)

    # local dictionary is flattened to n_rows * d_y * d_t
    assert len(loc_dict2[_CausalInsightsConstants.RowData]) == 2

    ca._policy_tree_output(X, inds[0])
    ca._heterogeneity_tree_output(X, inds[0])
def test_basic_pandas(self):
    for classification in [False, True]:
        y = pd.Series(np.random.choice([0, 1], size=(500,)))
        X = pd.DataFrame({'a': np.random.normal(size=500),
                          'b': np.random.normal(size=500),
                          'c': np.random.choice([0, 1], size=500),
                          'd': np.random.choice(['a', 'b', 'c'], size=500)})
        n_inds = [0, 1, 2, 3]
        t_inds = ['a', 'b', 'c', 'd']
        n_cats = [2, 3]
        t_cats = ['c', 'd']
        n_hinds = [0, 3]
        t_hinds = ['a', 'd']
        for (inds, cats, hinds) in [(n_inds, n_cats, n_hinds), (t_inds, t_cats, t_hinds)]:
            ca = CausalAnalysis(inds, cats, hinds, classification=classification)
            ca.fit(X, y)
            glo = ca.global_causal_effect()
            coh = ca.cohort_causal_effect(X[:2])
            loc = ca.local_causal_effect(X[:2])

            # global and cohort data should have exactly the same structure, but different values
            assert glo.index.equals(coh.index)

            # the local index should have (number of rows passed in) times as many entries as the global index
            assert len(loc.index) == 2 * len(glo.index)

            assert glo.index.names == ['feature', 'feature_value']
            assert loc.index.names == ['sample'] + glo.index.names

            # features; for categoricals they should appear #cats-1 times each
            fts = ['a', 'b', 'c', 'd', 'd']
            for i in range(len(fts)):
                assert fts[i] == glo.index[i][0] == loc.index[i][1] == loc.index[len(fts) + i][1]

            glo_dict = ca._global_causal_effect_dict()
            glo_dict2 = ca._global_causal_effect_dict(row_wise=True)
            coh_dict = ca._cohort_causal_effect_dict(X[:2])
            coh_dict2 = ca._cohort_causal_effect_dict(X[:2], row_wise=True)
            loc_dict = ca._local_causal_effect_dict(X[:2])
            loc_dict2 = ca._local_causal_effect_dict(X[:2], row_wise=True)

            glo_point_est = np.array(glo_dict[_CausalInsightsConstants.PointEstimateKey])
            coh_point_est = np.array(coh_dict[_CausalInsightsConstants.PointEstimateKey])
            loc_point_est = np.array(loc_dict[_CausalInsightsConstants.PointEstimateKey])

            # global shape is (d_y, sum(d_t))
            assert glo_point_est.shape == coh_point_est.shape == (1, 5)
            assert loc_point_est.shape == (2,) + glo_point_est.shape

            # global and cohort row-wise dicts have d_y * d_t entries
            assert (len(glo_dict2[_CausalInsightsConstants.RowData]) ==
                    len(coh_dict2[_CausalInsightsConstants.RowData]) == 5)

            # local dictionary is flattened to n_rows * d_y * d_t
            assert len(loc_dict2[_CausalInsightsConstants.RowData]) == 10

            pto = ca._policy_tree_output(X, inds[1])
            ca._heterogeneity_tree_output(X, inds[1])
            ca._heterogeneity_tree_output(X, inds[3])

            # continuous treatments have typical treatment values equal to
            # the mean of the absolute value of non-zero entries
            np.testing.assert_allclose(ca.typical_treatment_value(inds[0]),
                                       np.mean(np.abs(X['a'])))
            np.testing.assert_allclose(ca.typical_treatment_value(inds[1]),
                                       np.mean(np.abs(X['b'])))

            # discrete treatments have typical treatment value 1
            assert ca.typical_treatment_value(inds[2]) == ca.typical_treatment_value(inds[3]) == 1

            # Make sure we handle continuous, binary, and multi-class treatments
            # For multiple discrete treatments, one "always treat" value per non-default treatment
            for (idx, length) in [(0, 1), (1, 1), (2, 1), (3, 2)]:
                pto = ca._policy_tree_output(X, inds[idx])
                policy_val = pto.policy_value
                always_trt = pto.always_treat
                assert isinstance(pto.control_name, str)
                assert isinstance(always_trt, dict)
                assert np.array(policy_val).shape == ()
                assert len(always_trt) == length
                for val in always_trt.values():
                    assert np.array(val).shape == ()

                # policy value should exceed always treating with any treatment
                assert_less_close(np.array(list(always_trt.values())), policy_val)

            if not classification:
                # ExitStack can be used as a "do nothing" ContextManager
                cm = ExitStack()
            else:
                cm = self.assertRaises(Exception)
            with cm:
                inf = ca.whatif(X[:2], np.ones(shape=(2,)), inds[1], y[:2])
                assert np.shape(inf.point_estimate) == np.shape(y[:2])
                inf = ca.whatif(X[:2], np.ones(shape=(2,)), inds[2], y[:2])
                assert np.shape(inf.point_estimate) == np.shape(y[:2])
                ca._whatif_dict(X[:2], np.ones(shape=(2,)), inds[1], y[:2])
                ca._whatif_dict(X[:2], np.ones(shape=(2,)), inds[1], y[:2], row_wise=True)

        badargs = [
            (n_inds, n_cats, [4]),      # hinds out of range
            (n_inds, n_cats, ["test"])  # hinds out of range
        ]

        for args in badargs:
            with self.assertRaises(Exception):
                ca = CausalAnalysis(*args)
                ca.fit(X, y)
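# The tests above cover two _policy_tree_output return conventions: some unpack a
# (tree, policy_value, always_treat_list) tuple, while the newer ones (and compute())
# read named attributes with an always_treat dict. A hedged sketch of a helper that
# tolerates either convention; policy_summary is a hypothetical name introduced here
# for illustration, and the attribute names follow the usage seen in this file.
def policy_summary(ca, X, feature):
    out = ca._policy_tree_output(X, feature)
    if hasattr(out, 'policy_value'):
        # newer convention: object with policy_value and a per-treatment always_treat dict
        return out.policy_value, list(out.always_treat.values())
    # older convention: a (tree, policy_value, always_treat_list) tuple
    _, policy_value, always_treat = out
    return policy_value, list(always_treat)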