def custom_data_average_treatment_effect_test(self, data):
    """Check the estimator's ATE against the ground truth carried by ``data``.

    :param data: mapping read with the keys ``"df"``, ``"treatment_name"``,
        ``"outcome_name"``, ``"gml_graph"`` and ``"ate"`` (the true effect).
    :raises AssertionError: if the absolute estimation error is not below
        ``abs(true_ate) * self._error_tolerance``.
    """
    model = CausalModel(
        data=data["df"],
        treatment=data["treatment_name"],
        outcome=data["outcome_name"],
        graph=data["gml_graph"],
        proceed_when_unidentifiable=True,
        test_significance=None,
    )
    target_estimand = model.identify_effect()
    estimator_ate = self._Estimator(
        data["df"],
        identified_estimand=target_estimand,
        treatment=data["treatment_name"],
        outcome=data["outcome_name"],
        test_significance=None,
    )
    true_ate = data["ate"]
    ate_estimate = estimator_ate.estimate_effect()

    # Compare magnitudes: a signed error would let every underestimate pass
    # and would make the check impossible to satisfy for a negative true ATE.
    error = abs(ate_estimate.value - true_ate)
    print("Error in ATE estimate = {0} with tolerance {1}%. Estimated={2},True={3}".format(
        error, self._error_tolerance * 100, ate_estimate.value, true_ate)
    )
    assert error < abs(true_ate) * self._error_tolerance
def predict(self, dataset: DatasetInterface):
    """Estimate a causal effect for ``dataset`` and refute it.

    :param dataset: object providing ``get_data()``, ``get_outcome()`` and
        ``get_causes()``.
    :returns: the pair ``(estimated_effect, new_effect)`` read from the
        ``random_common_cause`` refutation result.
    """
    frame = dataset.get_data()

    # Add a constant treatment column (every row is marked as treated).
    # NOTE: the column is written into the object returned by get_data()
    # and is not removed afterwards.
    treatment_column = 'treatment'
    frame[treatment_column] = True

    outcome_column = dataset.get_outcome()
    confounders = dataset.get_causes()

    causal_model = CausalModel(
        frame,
        treatment_column,
        outcome_column,
        common_causes=confounders,
        proceed_when_unidentifiable=True,
    )

    # Identify the causal effect.
    estimand = causal_model.identify_effect()

    # Estimate the causal effect.
    effect = causal_model.estimate_effect(
        estimand,
        method_name="backdoor.linear_regression",
        test_significance=True,
    )

    # Refute the obtained estimate.
    refutation = causal_model.refute_estimate(
        estimand, effect, method_name="random_common_cause"
    )
    return refutation.estimated_effect, refutation.new_effect
def test_graph_input(self, beta, num_instruments, num_samples, num_treatments):
    """Build a model from a hand-written GML string and check its common causes.

    The dataset is generated with five common causes while the GML graph
    only names ``X0``..``X2``; the model is created with
    ``missing_nodes_as_confounders=True``.
    """
    num_common_causes = 5
    data = dowhy.datasets.linear_dataset(
        beta=beta,
        num_common_causes=num_common_causes,
        num_instruments=num_instruments,
        num_samples=num_samples,
        num_treatments=num_treatments,
        treatment_is_binary=True,
    )
    frame = data['df']
    treatment_names = data["treatment_name"]
    outcome_name = data["outcome_name"]

    # First build the model from the graph generated with the dataset.
    model = CausalModel(
        data=frame,
        treatment=treatment_names,
        outcome=outcome_name,
        graph=data["gml_graph"],
        proceed_when_unidentifiable=True,
        test_significance=None,
    )

    # Graph with two of the common causes removed.
    gml_template = (
        'graph[directed 1 node[ id "{0}" label "{0}"]'
        'node[ id "{1}" label "{1}"]'
        'node[ id "Unobserved Confounders" label "Unobserved Confounders"]'
        'edge[source "{0}" target "{1}"]'
        'edge[source "Unobserved Confounders" target "{0}"]'
        'edge[source "Unobserved Confounders" target "{1}"]'
        'node[ id "X0" label "X0"] edge[ source "X0" target "{0}"] '
        'node[ id "X1" label "X1"] edge[ source "X1" target "{0}"] '
        'node[ id "X2" label "X2"] edge[ source "X2" target "{0}"] '
        'edge[ source "X0" target "{1}"] '
        'edge[ source "X1" target "{1}"] '
        'edge[ source "X2" target "{1}"] '
        'node[ id "Z0" label "Z0"] edge[ source "Z0" target "{0}"]]'
    )
    gml_str = gml_template.format(treatment_names[0], outcome_name)
    print(gml_str)

    model = CausalModel(
        data=frame,
        treatment=treatment_names,
        outcome=outcome_name,
        graph=gml_str,
        proceed_when_unidentifiable=True,
        test_significance=None,
        missing_nodes_as_confounders=True,
    )
    common_causes = model.get_common_causes()
    for node_name in ("X1", "X2"):
        assert node_name in common_causes
def test_causalml_XGBTRegressor(self, init_data):
    """Run the CausalML ``XGBTRegressor`` estimator and print its estimate.

    :param init_data: dataset mapping read with the keys ``'df'``,
        ``'treatment_name'``, ``'outcome_name'``, ``'effect_modifier_names'``
        and ``'gml_graph'``.
    """
    dataset = init_data

    # Causal model built from the dataset's own graph.
    causal_model = CausalModel(
        data=dataset['df'],
        treatment=dataset['treatment_name'],
        outcome=dataset['outcome_name'],
        effect_modifiers=dataset['effect_modifier_names'],
        graph=dataset['gml_graph'],
    )

    # Identification step.
    estimand = causal_model.identify_effect(proceed_when_unidentifiable=True)

    # Estimation step with default learner parameters.
    estimator_name = "backdoor.causalml.inference.meta.XGBTRegressor"
    xgbt_estimate = causal_model.estimate_effect(
        estimand,
        method_name=estimator_name,
        method_params={"init_params": {}},
    )

    print("The XGBT estimate obtained:")
    print(xgbt_estimate)
def test_causalml_RLearner(self, init_data):
    """Run the CausalML ``BaseRRegressor`` estimator and print its estimate.

    A ``ValueError`` raised during estimation is reported on stdout and the
    printed estimate is then ``None``.

    :param init_data: dataset mapping read with the keys ``'df'``,
        ``'treatment_name'``, ``'outcome_name'``, ``'effect_modifier_names'``
        and ``'gml_graph'``.
    """
    dataset = init_data

    # Causal model built from the dataset's own graph.
    causal_model = CausalModel(
        data=dataset['df'],
        treatment=dataset['treatment_name'],
        outcome=dataset['outcome_name'],
        effect_modifiers=dataset['effect_modifier_names'],
        graph=dataset['gml_graph'],
    )

    # Identification step.
    estimand = causal_model.identify_effect(proceed_when_unidentifiable=True)

    rl_estimate = None
    try:
        learner_params = {"init_params": {'learner': XGBRegressor()}}
        rl_estimate = causal_model.estimate_effect(
            estimand,
            method_name="backdoor.causalml.inference.meta.BaseRRegressor",
            method_params=learner_params,
        )
    except ValueError:
        print("Error with respect to the number of samples")

    print("The R Learner estimate obtained:")
    print(rl_estimate)
def test_5(self):
    """Check the ID-algorithm estimand for ``T->Y; X1->T; X1->Y; X2->T``.

    The expected structure is a sum over ``X1`` of two factors whose outcome
    and condition variable sets are compared against the ground truth below.
    """
    treatment = "T"
    outcome = "Y"
    variables = ["X1", "X2"]
    causal_graph = "digraph{T->Y;X1->T;X1->Y;X2->T;}"

    # Build the column list from whole names; list("T") would split a
    # multi-character name into single letters.
    columns = [treatment, outcome] + list(variables)
    # The frame only carries column names; it has no rows.
    df = pd.DataFrame(columns=columns)

    causal_model = CausalModel(df, treatment, outcome, graph=causal_graph)
    identified_estimand = causal_model.identify_effect(
        method_name="id-algorithm")

    # Compare with ground truth
    summed_term = identified_estimand._product[0]
    first_factor = summed_term._product[0]._product[0]
    second_factor = summed_term._product[1]._product[0]
    set_a = set(first_factor['outcome_vars']._set)
    set_b = set(first_factor['condition_vars']._set)
    set_c = set(second_factor['outcome_vars']._set)
    set_d = set(second_factor['condition_vars']._set)

    assert summed_term._sum == ['X1']
    assert len(set_a.difference({'Y'})) == 0
    assert len(set_b.difference({'X1', 'X2', 'T'})) == 0
    assert len(set_c.difference({'X1'})) == 0
    assert len(set_d) == 0
def test_graph_input4(self, beta, num_instruments, num_samples, num_treatments):
    """Build a model from the graph file ``tests/sample_dag.txt`` and check its nodes."""
    num_common_causes = 5
    data = dowhy.datasets.linear_dataset(
        beta=beta,
        num_common_causes=num_common_causes,
        num_instruments=num_instruments,
        num_samples=num_samples,
        num_treatments=num_treatments,
        treatment_is_binary=True,
    )
    frame = data['df']
    treatment_names = data["treatment_name"]
    outcome_name = data["outcome_name"]

    # First build the model from the graph generated with the dataset.
    model = CausalModel(
        data=frame,
        treatment=treatment_names,
        outcome=outcome_name,
        graph=data["gml_graph"],
        proceed_when_unidentifiable=True,
        test_significance=None,
    )

    # Then rebuild it from a graph stored on disk (path passed as the graph).
    gml_str = "tests/sample_dag.txt"
    print(gml_str)
    model = CausalModel(
        data=frame,
        treatment=treatment_names,
        outcome=outcome_name,
        graph=gml_str,
        proceed_when_unidentifiable=True,
        test_significance=None,
        missing_nodes_as_confounders=True,
    )

    common_causes = model.get_common_causes()
    for node_name in ("X1", "X2"):
        assert node_name in common_causes

    expected_nodes = ["Unobserved Confounders", "X0", "X1", "X2", "Z0", "v0", "y"]
    all_nodes = model._graph.get_all_nodes(include_unobserved=True)
    for node_name in expected_nodes:
        assert node_name in all_nodes

    all_nodes = model._graph.get_all_nodes(include_unobserved=False)
    assert "Unobserved Confounders" not in all_nodes
def average_treatment_effect_test_continuous(self,
                                             dataset="linear",
                                             beta=1,
                                             num_common_causes=3,
                                             num_instruments=2,
                                             num_samples=100000,
                                             treatment_is_binary=False):
    """Check the estimator's ATE on a generated linear dataset.

    ``dataset`` is accepted but not read by this method; a linear dataset is
    always generated.

    :raises AssertionError: if the absolute error is not below
        ``abs(true_ate) * self._error_tolerance``.
    """
    generated = dowhy.datasets.linear_dataset(
        beta=beta,
        num_common_causes=num_common_causes,
        num_instruments=num_instruments,
        num_samples=num_samples,
        treatment_is_binary=treatment_is_binary,
    )
    causal_model = CausalModel(
        data=generated['df'],
        treatment=generated["treatment_name"],
        outcome=generated["outcome_name"],
        graph=generated["gml_graph"],
        proceed_when_unidentifiable=True,
        test_significance=None,
    )
    estimand = causal_model.identify_effect()
    estimator = self._Estimator(
        generated['df'],
        identified_estimand=estimand,
        treatment=generated["treatment_name"],
        outcome=generated["outcome_name"],
        test_significance=None,
    )

    true_ate = generated["ate"]
    ate_estimate = estimator.estimate_effect()
    error = abs(ate_estimate.value - true_ate)

    message = "Error in ATE estimate = {0} with tolerance {1}%. Estimated={2},True={3}"
    print(message.format(error, self._error_tolerance * 100,
                         ate_estimate.value, true_ate))

    allowed_error = abs(true_ate) * self._error_tolerance
    assert error < allowed_error
def test_causalml_MLPTRegressor(self, init_data):
    """Run the CausalML ``MLPTRegressor`` estimator and print its estimate.

    :param init_data: dataset mapping read with the keys ``'df'``,
        ``'treatment_name'``, ``'outcome_name'``, ``'effect_modifier_names'``
        and ``'gml_graph'``.
    """
    dataset = init_data

    # Causal model built from the dataset's own graph.
    causal_model = CausalModel(
        data=dataset['df'],
        treatment=dataset['treatment_name'],
        outcome=dataset['outcome_name'],
        effect_modifiers=dataset['effect_modifier_names'],
        graph=dataset['gml_graph'],
    )

    # Identification step.
    estimand = causal_model.identify_effect(proceed_when_unidentifiable=True)

    # Constructor arguments forwarded to the learner.
    learner_init = {
        'hidden_layer_sizes': (10, 10),
        'learning_rate_init': 0.1,
        'early_stopping': True,
    }
    mlpt_estimate = causal_model.estimate_effect(
        estimand,
        method_name="backdoor.causalml.inference.meta.MLPTRegressor",
        method_params={"init_params": learner_init},
    )

    print("The MLPT estimate obtained:")
    print(mlpt_estimate)
def simulate_dag_violations(
    methods,
    beta,
    num_w_affected,
    effect_on_w,
    num_z_affected,
    effect_on_z,
    num_t_affected,
    effect_on_t,
    effect_on_y,
    times,
):
    """Repeatedly simulate data with an extra variable U and estimate the effect.

    :param methods: estimators to use; each item is indexed as
        ``item[0]`` (method name) and ``item[1]`` (method params)
    :param beta: true treatment effect
    :param num_w_affected: number of common causes affected
    :param effect_on_w: effect of U on common causes
    :param num_z_affected: number of instruments affected
    :param effect_on_z: effect of U on instruments
    :param num_t_affected: number of treatments affected
    :param effect_on_t: effect of U on treatment
    :param effect_on_y: effect of U on outcomes
    :param times: number of simulations
    :returns: list of ``(estimate value, method name)`` tuples, one per
        method per simulation
    """
    results = []
    for _ in range(times):
        # Fixed dataset settings (sizes, binary treatment) are as in the tutorial.
        simulated = modified_linear_dataset(
            beta=beta,
            # u -> common causes
            num_w_affected=num_w_affected,
            effect_on_w=effect_on_w,
            # u -> instruments
            num_z_affected=num_z_affected,
            effect_on_z=effect_on_z,
            # u -> treatment
            num_t_affected=num_t_affected,
            effect_on_t=effect_on_t,
            # u -> outcome
            effect_on_y=effect_on_y,
            num_common_causes=5,
            num_instruments=2,
            num_samples=10000,
            treatment_is_binary=True,
        )
        causal_model = CausalModel(
            data=simulated["df"],
            treatment=simulated["treatment_name"],
            outcome=simulated["outcome_name"],
            graph=simulated["gml_graph"],
            instruments=simulated["instrument_names"],
            proceed_when_unidentifiable=True,
        )
        estimand = causal_model.identify_effect()

        values = []
        for spec in methods:
            estimate = causal_model.estimate_effect(
                estimand, method_name=spec[0], method_params=spec[1]
            )
            values.append(estimate.value)
        names = [spec[0] for spec in methods]

        results.extend(zip(values, names))
    return results
def test_external_estimator(self, beta, num_samples, num_treatments):
    """Check that ``method_params`` reach an estimator loaded by dotted path.

    The estimator is named by its module path and the propensity model
    passed in ``method_params`` is read back from the resulting estimate.
    """
    num_common_causes = 5
    dataset = dowhy.datasets.linear_dataset(
        beta=beta,
        num_common_causes=num_common_causes,
        num_samples=num_samples,
        num_treatments=num_treatments,
        treatment_is_binary=True,
    )
    causal_model = CausalModel(
        data=dataset["df"],
        treatment=dataset["treatment_name"],
        outcome=dataset["outcome_name"],
        graph=dataset["gml_graph"],
        proceed_when_unidentifiable=True,
        test_significance=None,
    )
    estimand = causal_model.identify_effect(proceed_when_unidentifiable=True)

    external_estimator = (
        "backdoor.tests.causal_estimators.mock_external_estimator."
        "PropensityScoreWeightingEstimator"
    )
    propensity_model = linear_model.LogisticRegression(max_iter=1000)
    estimate = causal_model.estimate_effect(
        estimand,
        method_name=external_estimator,
        control_value=0,
        treatment_value=1,
        target_units="ate",  # condition used for CATE
        confidence_intervals=True,
        method_params={"propensity_score_model": propensity_model},
    )

    assert estimate.estimator.propensity_score_model.max_iter == 1000
def test_graph_input3(self, beta, num_instruments, num_samples, num_treatments):
    """Build a model from a DAGitty-style graph string and check its nodes."""
    num_common_causes = 5
    data = dowhy.datasets.linear_dataset(
        beta=beta,
        num_common_causes=num_common_causes,
        num_instruments=num_instruments,
        num_samples=num_samples,
        num_treatments=num_treatments,
        treatment_is_binary=True,
    )
    frame = data['df']
    treatment_names = data["treatment_name"]
    outcome_name = data["outcome_name"]

    # First build the model from the graph generated with the dataset.
    model = CausalModel(
        data=frame,
        treatment=treatment_names,
        outcome=outcome_name,
        graph=data["gml_graph"],
        proceed_when_unidentifiable=True,
        test_significance=None,
    )

    # Graph with two of the common causes removed.
    # NOTE(review): the literal is reproduced character-for-character as it
    # appears in this file (single-space separated) -- confirm the graph
    # parser does not depend on line breaks between statements.
    gml_str = (
        'dag { '
        '"Unobserved Confounders" [pos="0.491,-1.056"] '
        'X0 [pos="-2.109,0.057"] '
        'X1 [adjusted, pos="-0.453,-1.562"] '
        'X2 [pos="-2.268,-1.210"] '
        'Z0 [pos="-1.918,-1.735"] '
        'v0 [latent, pos="-1.525,-1.293"] '
        'y [outcome, pos="-1.164,-0.116"] '
        '"Unobserved Confounders" -> v0 '
        '"Unobserved Confounders" -> y '
        'X0 -> v0 '
        'X0 -> y '
        'X1 -> v0 '
        'X1 -> y '
        'X2 -> v0 '
        'X2 -> y '
        'Z0 -> v0 '
        'v0 -> y '
        '} '
    )
    print(gml_str)

    model = CausalModel(
        data=frame,
        treatment=treatment_names,
        outcome=outcome_name,
        graph=gml_str,
        proceed_when_unidentifiable=True,
        test_significance=None,
        missing_nodes_as_confounders=True,
    )

    common_causes = model.get_common_causes()
    for node_name in ("X1", "X2"):
        assert node_name in common_causes

    expected_nodes = ["Unobserved Confounders", "X0", "X1", "X2", "Z0", "v0", "y"]
    all_nodes = model._graph.get_all_nodes(include_unobserved=True)
    for node_name in expected_nodes:
        assert node_name in all_nodes

    all_nodes = model._graph.get_all_nodes(include_unobserved=False)
    assert "Unobserved Confounders" not in all_nodes
def test_graph_refutation(self, num_variables, num_samples):
    """Refute the graph that generated the data and expect the check to pass."""
    generated = dowhy.datasets.dataset_from_random_graph(
        num_vars=num_variables, num_samples=num_samples
    )
    causal_model = CausalModel(
        data=generated["df"],
        treatment=generated["treatment_name"],
        outcome=generated["outcome_name"],
        graph=generated["gml_graph"],
    )

    # Independence tests to use per variable type.
    tests_by_type = {
        'test_for_continuous': 'partial_correlation',
        'test_for_discrete': 'conditional_mutual_information',
    }
    graph_refutation_object = causal_model.refute_graph(
        k=1, independence_test=tests_by_type
    )
    assert graph_refutation_object.refutation_result == True
def test_1(self):
    """Check the ID-algorithm estimand for the single-edge graph ``T->Y``."""
    treatment = "T"
    outcome = "Y"
    causal_graph = "digraph{T->Y;}"

    # Use the whole names as columns; list("T") would split a
    # multi-character name into single letters.
    columns = [treatment, outcome]
    # The frame only carries column names; it has no rows.
    df = pd.DataFrame(columns=columns)

    causal_model = CausalModel(df, treatment, outcome, graph=causal_graph)
    identified_estimand = causal_model.identify_effect(
        method_name="id-algorithm")

    # Only P(Y|T) should be present for test to succeed.
    identified_str = str(identified_estimand)
    gt_str = "Predictor: P(Y|T)"
    assert identified_str == gt_str
def test_2(self):
    """Expect no backdoor sets for ``T->X1; T->X2; X1->X2; X2->Y; T->Y``."""
    treatment = "T"
    outcome = "Y"
    variables = ["X1", "X2"]
    causal_graph = "digraph{T->X1;T->X2;X1->X2;X2->Y;T->Y}"

    # Whole names as columns (list("T") would split longer names into
    # letters); the local is named so that it does not shadow builtin vars().
    column_names = [treatment, outcome] + list(variables)
    df = pd.DataFrame(columns=column_names)

    treatment_name = parse_state(treatment)
    outcome_name = parse_state(outcome)

    # Causal model initialization
    causal_model = CausalModel(df, treatment, outcome, graph=causal_graph)

    # Causal identifier identification
    identifier = CausalIdentifier(causal_model._graph,
                                  estimand_type=None,
                                  method_name="default",
                                  proceed_when_unidentifiable=None)

    # Obtain backdoor sets
    path = Backdoor(identifier._graph._graph, treatment_name, outcome_name)
    backdoor_sets = path.get_backdoor_vars()

    assert len(backdoor_sets) == 0
def test_4(self):
    """Check the ID-algorithm estimand for ``T->Y; T->X1; X1->Y``."""
    treatment = "T"
    outcome = "Y"
    variables = ["X1"]
    causal_graph = "digraph{T->Y;T->X1;X1->Y;}"

    # Use the whole names as columns; list("T") would split a
    # multi-character name into single letters.
    columns = [treatment, outcome] + list(variables)
    # The frame only carries column names; it has no rows.
    df = pd.DataFrame(columns=columns)

    causal_model = CausalModel(df, treatment, outcome, graph=causal_graph)
    identified_estimand = causal_model.identify_effect(
        method_name="id-algorithm")

    # Compare with ground truth
    identified_str = str(identified_estimand)
    gt_str = "Sum over {X1}:\n\tPredictor: P(Y|T,X1)\n\tPredictor: P(X1|T)"
    assert identified_str == gt_str
def test_2(self):
    '''
    Test undirected edge between treatment and outcome.

    The edge is written as the two directed edges ``T->Y`` and ``Y->T``;
    identification with the ID algorithm is expected to raise.
    '''
    treatment = "T"
    outcome = "Y"
    causal_graph = "digraph{T->Y; Y->T;}"

    # Use the whole names as columns; list("T") would split a
    # multi-character name into single letters.
    columns = [treatment, outcome]
    # The frame only carries column names; it has no rows.
    df = pd.DataFrame(columns=columns)

    causal_model = CausalModel(df, treatment, outcome, graph=causal_graph)

    # Since undirected graph, identify effect must throw an error.
    # The result is intentionally not bound: the call is expected to raise.
    with pytest.raises(Exception):
        causal_model.identify_effect(method_name="id-algorithm")
def predict_tutorial(self, data: pd.DataFrame):
    """Run the IHDP tutorial: identify, estimate and refute a causal effect.

    Tutorial source:
    https://towardsdatascience.com/implementing-causal-inference-a-key-step-towards-agi-de2cde8ea599

    :param data: not used -- the IHDP CSV is always downloaded and replaces
        whatever is passed in.
    :returns: ``None``; the estimand, estimate and refutation are printed.
    """
    data = pd.read_csv(
        'https://raw.githubusercontent.com/AMLab-Amsterdam/CEVAE/master/datasets/IHDP/csv/ihdp_npci_1.csv',
        header=None)

    # Covariate column names x1..x25, reused below as the common causes.
    covariates = ['x' + str(i) for i in range(1, 26)]
    data.columns = [
        'treatment',
        'y_factual',
        'y_cfactual',
        'mu0',
        'mu1',
    ] + covariates
    data = data.astype({'treatment': 'bool'}, copy=False)

    # Create a causal model from the data and given common causes.
    # The names are passed as a list directly: joining them with '+' and
    # splitting again left a trailing empty string among the common causes.
    model = CausalModel(data=data,
                        treatment='treatment',
                        outcome='y_factual',
                        common_causes=covariates)

    # Identify the causal effect
    identified_estimand = model.identify_effect()
    print(identified_estimand)

    # Estimate the causal effect and compare it with Average Treatment Effect
    estimate = model.estimate_effect(
        identified_estimand,
        method_name="backdoor.linear_regression",
        test_significance=True)
    print(estimate)
    print("Causal Estimate is " + str(estimate.value))

    refute_results = model.refute_estimate(
        identified_estimand, estimate, method_name="random_common_cause")
    print(refute_results)
def dowhy_quick_backdoor_estimator(dataframe,
                                   outcome,
                                   treatment,
                                   cofounders_list,
                                   method_name,
                                   populaton_of_interest='ate',
                                   view_model=False):
    """
    Build a causal model from common causes and return the estimated effect value

    :param dataframe: data passed to ``CausalModel``
    :param outcome: outcome passed to ``CausalModel``
    :param treatment: treatment passed to ``CausalModel``
    :param cofounders_list: passed to ``CausalModel`` as ``common_causes``
    :param method_name: estimation method passed to ``estimate_effect``
    :param populaton_of_interest: passed to ``estimate_effect`` as ``target_units``
    :param view_model: when truthy, ``view_model(layout="dot")`` is called on the model
    :returns: the ``value`` attribute of the resulting estimate
    """
    model = CausalModel(
        data=dataframe,
        treatment=treatment,
        outcome=outcome,
        common_causes=cofounders_list,
    )
    if view_model:
        model.view_model(layout="dot")

    estimand = model.identify_effect(proceed_when_unidentifiable=True)
    estimate = model.estimate_effect(
        estimand,
        method_name=method_name,
        target_units=populaton_of_interest,
        # confidence_intervals=True is left out: not in this release
    )
    return estimate.value
def predict_example(self, data: pd.DataFrame):
    """Identify, estimate and refute the effect of ``E1``/``E2`` on ``E3``.

    References:
    https://github.com/Microsoft/dowhy
    https://ntanmayee.github.io/articles/2018/11/16/tools-for-causality.html

    :param data: frame passed to ``CausalModel``.
    :returns: ``None``; the refutation result is not returned.
    """
    outcome_column = 'E3'
    treatment_columns = ['E1', 'E2']

    causal_model = CausalModel(
        data=data,
        treatment=treatment_columns,
        outcome=outcome_column,
        proceed_when_unidentifiable=True,
    )

    # Identify causal effect and return target estimands
    estimand = causal_model.identify_effect()

    # Estimate the target estimand using a statistical method.
    effect = causal_model.estimate_effect(
        estimand, method_name="backdoor.propensity_score_matching"
    )

    # Refute the obtained estimate using multiple robustness checks.
    causal_model.refute_estimate(
        estimand, effect, method_name="random_common_cause"
    )
def null_refutation_test(self, data=None, dataset="linear", beta=10,
                         num_common_causes=1, num_instruments=1,
                         num_samples=100000, treatment_is_binary=True):
    """Check that a refutation with zero effect strengths keeps the estimate.

    ``dataset`` is accepted but not read by this method.

    :param data: optional dataset mapping; a linear dataset is generated
        from the remaining arguments when it is ``None``.
    :raises AssertionError: if the refuted effect differs from the estimate
        by ``abs(estimate) * self._error_tolerance`` or more.
    """
    # Supports user-provided dataset object
    if data is None:
        data = dowhy.datasets.linear_dataset(
            beta=beta,
            num_common_causes=num_common_causes,
            num_instruments=num_instruments,
            num_samples=num_samples,
            treatment_is_binary=treatment_is_binary,
        )

    model = CausalModel(
        data=data['df'],
        treatment=data["treatment_name"],
        outcome=data["outcome_name"],
        graph=data["gml_graph"],
        proceed_when_unidentifiable=True,
        test_significance=None,
    )
    target_estimand = model.identify_effect()
    ate_estimate = model.estimate_effect(
        identified_estimand=target_estimand,
        method_name=self.estimator_method,
        test_significance=None,
    )
    true_ate = data["ate"]
    self.logger.debug(true_ate)

    # To test if there are any exceptions
    ref = model.refute_estimate(
        target_estimand,
        ate_estimate,
        method_name=self.refuter_method,
        confounders_effect_on_treatment=self.confounders_effect_on_t,
        confounders_effect_on_outcome=self.confounders_effect_on_y,
        effect_strength_on_treatment=self.effect_strength_on_t,
        effect_strength_on_outcome=self.effect_strength_on_y,
    )
    self.logger.debug(ref.new_effect)

    # To test if the estimate is identical if refutation parameters are zero
    refute = model.refute_estimate(
        target_estimand,
        ate_estimate,
        method_name=self.refuter_method,
        confounders_effect_on_treatment=self.confounders_effect_on_t,
        confounders_effect_on_outcome=self.confounders_effect_on_y,
        effect_strength_on_treatment=0,
        effect_strength_on_outcome=0,
    )
    error = abs(refute.new_effect - ate_estimate.value)

    message = "Error in refuted estimate = {0} with tolerance {1}%. Estimated={2},After Refutation={3}"
    print(message.format(error, self._error_tolerance * 100,
                         ate_estimate.value, refute.new_effect))

    allowed_error = abs(ate_estimate.value) * self._error_tolerance
    assert error < allowed_error
def test_1(self):
    """Check that the first backdoor set found d-separates ``T`` and ``Y``.

    Graph under test: ``X1->T; X2->T; X1->X2; X2->Y; T->Y``.
    """
    treatment = "T"
    outcome = "Y"
    variables = ["X1", "X2"]
    causal_graph = "digraph{X1->T;X2->T;X1->X2;X2->Y;T->Y}"

    # Whole names as columns (list("T") would split longer names into
    # letters); the local is named so that it does not shadow builtin vars().
    column_names = [treatment, outcome] + list(variables)
    df = pd.DataFrame(columns=column_names)

    treatment_name = parse_state(treatment)
    outcome_name = parse_state(outcome)

    # Causal model initialization
    causal_model = CausalModel(df, treatment, outcome, graph=causal_graph)

    # Causal identifier identification
    identifier = CausalIdentifier(causal_model._graph,
                                  estimand_type=None,
                                  method_name="default",
                                  proceed_when_unidentifiable=None)

    # Obtain backdoor sets
    path = Backdoor(identifier._graph._graph, treatment_name, outcome_name)
    backdoor_sets = path.get_backdoor_vars()
    print(backdoor_sets)

    # Check if backdoor sets are valid i.e. if they block all paths between
    # the treatment and the outcome
    backdoor_paths = identifier._graph.get_backdoor_paths(
        treatment_name, outcome_name)
    check_set = set(backdoor_sets[0]['backdoor_set'])
    check = identifier._graph.check_valid_backdoor_set(
        treatment_name,
        outcome_name,
        check_set,
        backdoor_paths=backdoor_paths,
        dseparation_algo="naive")
    print(check)

    assert check["is_dseparated"]
def att_causal_estimator(df,
                         outcome,
                         treatment,
                         cofounders_list,
                         method_name,
                         view_model=False):
    """
    Build a causal model from common causes and return the effect value for ``target_units='att'``

    :param df: data passed to ``CausalModel``
    :param outcome: outcome passed to ``CausalModel``
    :param treatment: treatment passed to ``CausalModel``
    :param cofounders_list: passed to ``CausalModel`` as ``common_causes``
    :param method_name: estimation method passed to ``estimate_effect``
    :param view_model: when truthy, ``view_model(layout="dot")`` is called on the model
    :returns: the ``value`` attribute of the resulting estimate
    """
    model = CausalModel(
        data=df,
        treatment=treatment,
        outcome=outcome,
        common_causes=cofounders_list,
    )
    if view_model:
        model.view_model(layout="dot")

    estimand = model.identify_effect(proceed_when_unidentifiable=True)
    estimate = model.estimate_effect(
        estimand,
        method_name=method_name,
        target_units='att',
        # confidence_intervals=True is left out
    )
    return estimate.value
def test_backdoor_estimators(self):
    """Smoke-test three EconML backdoor estimators through ``estimate_effect``.

    LinearDML and ContinuousTreatmentOrthoForest run on a continuous-treatment
    dataset; LinearDRLearner runs on a separate binary-treatment,
    binary-outcome dataset. No values are asserted; the test passes when no
    call raises.
    """
    # Setup data
    data = datasets.linear_dataset(10,
                                   num_common_causes=4,
                                   num_samples=10000,
                                   num_instruments=2,
                                   num_effect_modifiers=2,
                                   num_treatments=1,
                                   treatment_is_binary=False)
    model = CausalModel(data=data["df"],
                        treatment=data["treatment_name"],
                        outcome=data["outcome_name"],
                        effect_modifiers=data["effect_modifier_names"],
                        graph=data["gml_graph"])
    identified_estimand = model.identify_effect(
        proceed_when_unidentifiable=True)

    # Test LinearDML
    dml_estimate = model.estimate_effect(
        identified_estimand,
        method_name="backdoor.econml.dml.LinearDML",
        control_value=0,
        treatment_value=1,
        target_units=lambda df: df["X0"] > 1,  # condition used for CATE
        method_params={
            "init_params": {
                'model_y': GradientBoostingRegressor(),
                'model_t': GradientBoostingRegressor(),
                'featurizer': PolynomialFeatures(degree=1,
                                                 include_bias=True)
            },
            "fit_params": {}
        })

    # Test ContinuousTreatmentOrthoForest
    orthoforest_estimate = model.estimate_effect(
        identified_estimand,
        method_name="backdoor.econml.ortho_forest.ContinuousTreatmentOrthoForest",
        target_units=lambda df: df["X0"] > 2,
        method_params={
            "init_params": {
                'n_trees': 10
            },
            "fit_params": {}
        })

    # Test LinearDRLearner
    data_binary = datasets.linear_dataset(10,
                                          num_common_causes=4,
                                          num_samples=10000,
                                          num_instruments=2,
                                          num_effect_modifiers=2,
                                          treatment_is_binary=True,
                                          outcome_is_binary=True)
    # Every argument comes from data_binary, including the effect modifiers
    # (previously read from the continuous dataset above).
    model_binary = CausalModel(
        data=data_binary["df"],
        treatment=data_binary["treatment_name"],
        outcome=data_binary["outcome_name"],
        effect_modifiers=data_binary["effect_modifier_names"],
        graph=data_binary["gml_graph"])
    identified_estimand_binary = model_binary.identify_effect(
        proceed_when_unidentifiable=True)
    drlearner_estimate = model_binary.estimate_effect(
        identified_estimand_binary,
        method_name="backdoor.econml.drlearner.LinearDRLearner",
        target_units=lambda df: df["X0"] > 1,
        confidence_intervals=False,
        method_params={
            "init_params": {
                'model_propensity': LogisticRegressionCV(cv=3,
                                                         solver='lbfgs',
                                                         multi_class='auto')
            },
            "fit_params": {}
        })
def test_iv_estimators(self):
    """Smoke-test the EconML DeepIV estimator through ``estimate_effect``.

    No values are asserted; the test passes when no call raises.
    """
    # Setup data
    dataset = datasets.linear_dataset(10,
                                      num_common_causes=4,
                                      num_samples=10000,
                                      num_instruments=2,
                                      num_effect_modifiers=2,
                                      num_treatments=1,
                                      treatment_is_binary=False)
    model = CausalModel(data=dataset["df"],
                        treatment=dataset["treatment_name"],
                        outcome=dataset["outcome_name"],
                        effect_modifiers=dataset["effect_modifier_names"],
                        graph=dataset["gml_graph"])
    identified_estimand = model.identify_effect(
        proceed_when_unidentifiable=True)

    # Test DeepIV
    num_modifiers = len(model._effect_modifiers)
    dims_zx = len(model._instruments) + num_modifiers  # sum of dims of Z and X
    dims_tx = len(model._treatment) + num_modifiers  # sum of dims of T and X

    treatment_model = keras.Sequential([
        keras.layers.Dense(128, activation='relu', input_shape=(dims_zx, )),
        keras.layers.Dropout(0.17),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.17),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dropout(0.17),
    ])
    response_model = keras.Sequential([
        keras.layers.Dense(128, activation='relu', input_shape=(dims_tx, )),
        keras.layers.Dropout(0.17),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.17),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dropout(0.17),
        keras.layers.Dense(1),
    ])

    deepiv_init_params = {
        # Number of gaussians in the mixture density networks
        'n_components': 10,
        # Treatment model
        'm': lambda z, x: treatment_model(keras.layers.concatenate([z, x])),
        # Response model
        "h": lambda t, x: response_model(keras.layers.concatenate([t, x])),
        # Number of samples used to estimate the response
        'n_samples': 1,
        'first_stage_options': {'epochs': 25},
        'second_stage_options': {'epochs': 25},
    }
    deepiv_estimate = model.estimate_effect(
        identified_estimand,
        method_name="iv.econml.deepiv.DeepIVEstimator",
        target_units=lambda df: df["X0"] > -1,
        confidence_intervals=False,
        method_params={
            "init_params": deepiv_init_params,
            "fit_params": {},
        })
def average_treatment_effect_test(self,
                                  dataset="linear",
                                  beta=10,
                                  num_common_causes=1,
                                  num_instruments=1,
                                  num_effect_modifiers=0,
                                  num_treatments=1,
                                  num_frontdoor_variables=0,
                                  num_samples=100000,
                                  treatment_is_binary=True,
                                  outcome_is_binary=False,
                                  confidence_intervals=False,
                                  test_significance=False,
                                  method_params=None):
    """Check the estimator's ATE on a generated dataset.

    :param dataset: ``"linear"`` or ``"simple-iv"``; selects the generator.
    :param confidence_intervals: when truthy, confidence-interval and
        standard-error methods are also called on the estimate.
    :param test_significance: when truthy, significance tests are also
        called on the estimate.
    :param method_params: passed to the estimator as ``params``.
    :raises ValueError: for any other ``dataset`` value.
    :raises AssertionError: if the absolute error is not below
        ``abs(true_ate) * self._error_tolerance``.
    """
    if dataset not in ("linear", "simple-iv"):
        raise ValueError("Dataset type not supported.")

    if dataset == "linear":
        data = dowhy.datasets.linear_dataset(
            beta=beta,
            num_common_causes=num_common_causes,
            num_instruments=num_instruments,
            num_effect_modifiers=num_effect_modifiers,
            num_treatments=num_treatments,
            num_frontdoor_variables=num_frontdoor_variables,
            num_samples=num_samples,
            treatment_is_binary=treatment_is_binary,
            outcome_is_binary=outcome_is_binary,
        )
    else:
        data = dowhy.datasets.simple_iv_dataset(
            beta=beta,
            num_treatments=num_treatments,
            num_samples=num_samples,
            treatment_is_binary=treatment_is_binary,
            outcome_is_binary=outcome_is_binary,
        )

    model = CausalModel(
        data=data['df'],
        treatment=data["treatment_name"],
        outcome=data["outcome_name"],
        graph=data["gml_graph"],
        proceed_when_unidentifiable=True,
        test_significance=test_significance,
    )
    target_estimand = model.identify_effect()
    target_estimand.set_identifier_method(self._identifier_method)

    estimator_ate = self._Estimator(
        data['df'],
        identified_estimand=target_estimand,
        treatment=data["treatment_name"],
        outcome=data["outcome_name"],
        control_value=0,
        treatment_value=1,
        test_significance=test_significance,
        evaluate_effect_strength=False,
        confidence_intervals=confidence_intervals,
        target_units="ate",
        effect_modifiers=data["effect_modifier_names"],
        params=method_params,
    )
    true_ate = data["ate"]
    ate_estimate = estimator_ate.estimate_effect()
    str(ate_estimate)  # checking if str output is correctly created

    error = abs(ate_estimate.value - true_ate)
    message = "Error in ATE estimate = {0} with tolerance {1}%. Estimated={2},True={3}"
    print(message.format(error, self._error_tolerance * 100,
                         ate_estimate.value, true_ate))
    allowed_error = abs(true_ate) * self._error_tolerance
    assert error < allowed_error

    # Compute confidence intervals, standard error and significance tests
    if confidence_intervals:
        ate_estimate.get_confidence_intervals()
        ate_estimate.get_confidence_intervals(confidence_level=0.99)
        ate_estimate.get_confidence_intervals(method="bootstrap")
        ate_estimate.get_standard_error()
        ate_estimate.get_standard_error(method="bootstrap")
    if test_significance:
        ate_estimate.test_stat_significance()
        ate_estimate.test_stat_significance(method="bootstrap")
import pandas as pd import dowhy.datasets from datasets import * from dowhy import CausalModel credit_data = get_credit() model = CausalModel( data=credit_data["df"], treatment=["YearsEmployed"], outcome=["Approved"], graph=credit_data["dot_graph"], ) from sklearn.linear_model import LogisticRegressionCV # Saves the model as "causal_model.png" model.view_model(layout="dot") identified_estimand_binary = model.identify_effect( proceed_when_unidentifiable=True) # estimate = model.estimate_effect(identified_estimand, method_name="backdoor.econml.drlearner.LinearDRLearner") orthoforest_estimate = model.estimate_effect( identified_estimand_binary, method_name="backdoor.econml.ortho_forest.ContinuousTreatmentOrthoForest", target_units=lambda df: df["Male"] == 1, confidence_intervals=False, method_params={ "init_params": { 'n_trees': 2, # not ideal, just as an example to speed up computation
def test_graph_refutation2(self, num_variables, num_samples):
    """Refute a fixed ten-node graph against random-graph data and expect failure."""
    generated = dowhy.datasets.dataset_from_random_graph(
        num_vars=num_variables, num_samples=num_samples
    )

    # Fixed graph over the nodes "a".."j", supplied in place of the graph
    # that comes with the dataset.
    # NOTE(review): the literal is reproduced character-for-character as it
    # appears in this file (single-space separated).
    gml_str = (
        ' graph [ directed 1 '
        'node [ id 0 label "a" ] '
        'node [ id 1 label "b" ] '
        'node [ id 2 label "c" ] '
        'node [ id 3 label "d" ] '
        'node [ id 4 label "e" ] '
        'node [ id 5 label "f" ] '
        'node [ id 6 label "g" ] '
        'node [ id 7 label "h" ] '
        'node [ id 8 label "i" ] '
        'node [ id 9 label "j" ] '
        'edge [ source 0 target 1 ] '
        'edge [ source 0 target 3 ] '
        'edge [ source 3 target 2 ] '
        'edge [ source 7 target 4 ] '
        'edge [ source 6 target 5 ] '
        'edge [ source 7 target 8 ] '
        'edge [ source 9 target 2 ] '
        'edge [ source 9 target 8 ] '
        '] '
    )

    causal_model = CausalModel(
        data=generated["df"],
        treatment=generated["treatment_name"],
        outcome=generated["outcome_name"],
        graph=gml_str,
    )

    # Independence tests to use per variable type.
    tests_by_type = {
        'test_for_continuous': 'partial_correlation',
        'test_for_discrete': 'conditional_mutual_information',
    }
    graph_refutation_object = causal_model.refute_graph(
        k=2, independence_test=tests_by_type
    )
    assert graph_refutation_object.refutation_result == False
def null_refutation_test(self, data=None, dataset="linear", beta=10,
                         num_common_causes=1, num_instruments=1,
                         num_samples=100000, treatment_is_binary=True):
    """Estimate an effect, run ``self.refuter_method`` on it and check the result.

    The effect is estimated with ``self.estimator_method``. What is asserted
    depends on ``self.refuter_method``:

    * ``add_unobserved_common_cause``: the refuter is run once with the
      configured effect strengths (only to surface exceptions) and once with
      both strengths set to zero; the second result must stay within
      ``self._error_tolerance`` (relative to the original estimate) of the
      original estimate.
    * ``placebo_treatment_refuter`` and ``dummy_outcome_refuter``: the
      refuted effect must be within ``self._error_tolerance`` of zero.
    * ``data_subset_refuter`` and ``bootstrap_refuter``: the refuted effect
      must stay within ``self._error_tolerance`` (relative to the original
      estimate) of the original estimate.

    Any other ``self.refuter_method`` value runs no refuter and asserts
    nothing.

    :param data: optional dataset dict providing ``df``, ``treatment_name``,
        ``outcome_name``, ``gml_graph`` and ``ate``. When ``None`` a dataset
        is generated with ``dowhy.datasets.linear_dataset``.
    :param dataset: not read by this method; kept so existing callers that
        pass it keep working.
    :param beta: forwarded to ``linear_dataset`` when ``data`` is ``None``.
    :param num_common_causes: forwarded to ``linear_dataset`` when ``data``
        is ``None``.
    :param num_instruments: forwarded to ``linear_dataset`` when ``data`` is
        ``None``.
    :param num_samples: forwarded to ``linear_dataset`` when ``data`` is
        ``None``.
    :param treatment_is_binary: forwarded to ``linear_dataset`` when ``data``
        is ``None``. Independently of that, when it is exactly ``True`` the
        placebo, data-subset and bootstrap refuters are called with an
        explicit small ``num_simulations``.
    :raises AssertionError: when the refuted effect is outside the tolerance.
    """
    # Supports user-provided dataset object
    if data is None:
        data = dowhy.datasets.linear_dataset(
            beta=beta,
            num_common_causes=num_common_causes,
            num_instruments=num_instruments,
            num_samples=num_samples,
            treatment_is_binary=treatment_is_binary)

    print(data['df'])
    print("")

    model = CausalModel(data=data['df'],
                        treatment=data["treatment_name"],
                        outcome=data["outcome_name"],
                        graph=data["gml_graph"],
                        proceed_when_unidentifiable=True,
                        test_significance=None)
    target_estimand = model.identify_effect()
    ate_estimate = model.estimate_effect(
        identified_estimand=target_estimand,
        method_name=self.estimator_method,
        test_significance=None)
    true_ate = data["ate"]
    self.logger.debug(true_ate)

    def run_refuter(**refuter_kwargs):
        # Single call site for the configured refuter on this estimate.
        return model.refute_estimate(target_estimand,
                                     ate_estimate,
                                     method_name=self.refuter_method,
                                     **refuter_kwargs)

    def run_refuter_with_simulations(num_simulations):
        # Only an explicitly binary treatment gets the explicit simulation
        # count; otherwise the refuter runs with its own defaults.
        if treatment_is_binary is True:
            return run_refuter(num_simulations=num_simulations)
        return run_refuter()

    if self.refuter_method == "add_unobserved_common_cause":
        # To test if there are any exceptions
        ref = run_refuter(
            confounders_effect_on_treatment=self.confounders_effect_on_t,
            confounders_effect_on_outcome=self.confounders_effect_on_y,
            effect_strength_on_treatment=self.effect_strength_on_t,
            effect_strength_on_outcome=self.effect_strength_on_y)
        self.logger.debug(ref.new_effect)

        # To test if the estimate is identical if refutation parameters are zero
        refute = run_refuter(
            confounders_effect_on_treatment=self.confounders_effect_on_t,
            confounders_effect_on_outcome=self.confounders_effect_on_y,
            effect_strength_on_treatment=0,
            effect_strength_on_outcome=0)
        error = abs(refute.new_effect - ate_estimate.value)

        print(
            "Error in refuted estimate = {0} with tolerance {1}%. Estimated={2},After Refutation={3}"
            .format(error, self._error_tolerance * 100,
                    ate_estimate.value, refute.new_effect))
        assert error < abs(ate_estimate.value) * self._error_tolerance

    elif self.refuter_method == "placebo_treatment_refuter":
        ref = run_refuter_with_simulations(10)

        # This value is hardcoded to be zero as we are running this on a linear dataset.
        # Ordinarily, we should expect this value to be zero.
        EXPECTED_PLACEBO_VALUE = 0
        error = abs(ref.new_effect - EXPECTED_PLACEBO_VALUE)

        print(
            "Error in the refuted estimate = {0} with tolerence {1}%. Expected Value={2}, After Refutation={3}"
            .format(error, self._error_tolerance * 100,
                    EXPECTED_PLACEBO_VALUE, ref.new_effect))
        print(ref)
        # The expected value is zero, so the tolerance is applied as an
        # absolute bound rather than relative to the expectation.
        assert error < self._error_tolerance

    elif self.refuter_method in ("data_subset_refuter", "bootstrap_refuter"):
        # Both refuters re-estimate on perturbed samples and are checked
        # identically against the original estimate.
        ref = run_refuter_with_simulations(5)
        error = abs(ref.new_effect - ate_estimate.value)

        print(
            "Error in the refuted estimate = {0} with tolerence {1}%. Estimated={2}, After Refutation={3}"
            .format(error, self._error_tolerance * 100,
                    ate_estimate.value, ref.new_effect))
        print(ref)
        assert error < abs(ate_estimate.value) * self._error_tolerance

    elif self.refuter_method == "dummy_outcome_refuter":
        if self.transformations is None:
            ref = run_refuter(num_simulations=2)
        else:
            ref = run_refuter(transformations=self.transformations,
                              params=self.params,
                              num_simulations=2)

        # This value is hardcoded to be zero as we are running this on a linear dataset.
        # Ordinarily, we should expect this value to be zero.
        EXPECTED_DUMMY_OUTCOME_VALUE = 0
        error = abs(ref.new_effect - EXPECTED_DUMMY_OUTCOME_VALUE)

        print(
            "Error in the refuted estimate = {0} with tolerence {1}%. Expected Value={2}, After Refutation={3}"
            .format(error, self._error_tolerance * 100,
                    EXPECTED_DUMMY_OUTCOME_VALUE, ref.new_effect))
        print(ref)
        # Validate the computed error, as the placebo branch does; asserting
        # on the refutation object itself never checked the tolerance.
        assert error < self._error_tolerance
def test_iv_estimators(self):
    """Smoke-test two EconML IV estimators via ``CausalModel.estimate_effect``.

    First DeepIV is run on a dataset generated with a non-binary treatment
    and two instruments, then ``LinearIntentToTreatDRIV`` on a dataset with a
    binary treatment and one discrete instrument. Nothing is asserted about
    the returned estimates; the calls only need to complete. The test is
    skipped when ``keras`` cannot be imported.
    """
    keras = pytest.importorskip("keras")

    # Setup data
    data = datasets.linear_dataset(
        10,
        num_common_causes=4,
        num_samples=10000,
        num_instruments=2,
        num_effect_modifiers=2,
        num_treatments=1,
        treatment_is_binary=False,
    )
    frame = data['df']
    model = CausalModel(
        data=frame,
        treatment=data["treatment_name"],
        outcome=data["outcome_name"],
        effect_modifiers=data["effect_modifier_names"],
        graph=data["gml_graph"],
    )
    identified_estimand = model.identify_effect(
        proceed_when_unidentifiable=True)

    # Test DeepIV
    dims_zx = len(model._instruments) + len(model._effect_modifiers)
    dims_tx = len(model._treatment) + len(model._effect_modifiers)

    def hidden_stack(input_dim):
        # Dense 128 -> 64 -> 32 with ReLU, each followed by dropout.
        return [
            keras.layers.Dense(
                128, activation='relu', input_shape=(input_dim, )),
            keras.layers.Dropout(0.17),
            keras.layers.Dense(64, activation='relu'),
            keras.layers.Dropout(0.17),
            keras.layers.Dense(32, activation='relu'),
            keras.layers.Dropout(0.17),
        ]

    # Input width: sum of dims of Z and X
    treatment_model = keras.Sequential(hidden_stack(dims_zx))
    # Input width: sum of dims of T and X; ends in a single output unit
    response_model = keras.Sequential(
        hidden_stack(dims_tx) + [keras.layers.Dense(1)])

    def treatment_net(z, x):
        # Treatment model
        return treatment_model(keras.layers.concatenate([z, x]))

    def response_net(t, x):
        # Response model
        return response_model(keras.layers.concatenate([t, x]))

    deepiv_init_params = {
        'n_components': 10,  # Number of gaussians in the mixture density networks
        'm': treatment_net,
        "h": response_net,
        'n_samples': 1,  # Number of samples used to estimate the response
        'first_stage_options': {
            'epochs': 25
        },
        'second_stage_options': {
            'epochs': 25
        },
    }
    deepiv_estimate = model.estimate_effect(
        identified_estimand,
        method_name="iv.econml.deepiv.DeepIVEstimator",
        target_units=lambda units: units["X0"] > -1,
        confidence_intervals=False,
        method_params={
            "init_params": deepiv_init_params,
            "fit_params": {},
        })

    # Test IntentToTreatDRIV
    data = datasets.linear_dataset(
        10,
        num_common_causes=4,
        num_samples=10000,
        num_instruments=1,
        num_effect_modifiers=2,
        num_treatments=1,
        treatment_is_binary=True,
        num_discrete_instruments=1,
    )
    frame = data['df']
    model = CausalModel(
        data=frame,
        treatment=data["treatment_name"],
        outcome=data["outcome_name"],
        effect_modifiers=data["effect_modifier_names"],
        graph=data["gml_graph"],
    )
    identified_estimand = model.identify_effect(
        proceed_when_unidentifiable=True)

    driv_init_params = {
        'model_T_XZ': GradientBoostingClassifier(),
        'model_Y_X': GradientBoostingRegressor(),
        'flexible_model_effect': GradientBoostingRegressor(),
        'featurizer': PolynomialFeatures(degree=1, include_bias=False),
    }
    driv_estimate = model.estimate_effect(
        identified_estimand,
        method_name="iv.econml.ortho_iv.LinearIntentToTreatDRIV",
        target_units=lambda units: units["X0"] > 1,
        confidence_intervals=False,
        method_params={
            "init_params": driv_init_params,
            "fit_params": {},
        })