def plot(self, *args, **kwargs):
    """Run a causal estimate on the wrapped DataFrame, then plot the DataFrame.

    Keyword arguments consumed by this method (they are NOT forwarded to the
    underlying ``self._obj.plot`` call):

    :param method_name: estimation method handed to ``estimate_effect``;
        defaults to ``"backdoor.propensity_score_matching"`` when missing or
        falsy.
    :param common_causes: passed to ``CausalModel`` as ``common_causes``.
        Takes precedence over ``dot_graph`` when both are given.
    :param dot_graph: passed to ``CausalModel`` as ``graph``.
    :param treatment_name: column of the DataFrame used as the treatment.
    :param outcome_name: column of the DataFrame used as the outcome.

    ``kind`` is read as well (only ``'bar'``, ``'line'`` or unset/falsy are
    accepted) and is still forwarded to the plot call together with every
    other argument.

    :raises Exception: if neither ``common_causes`` nor ``dot_graph`` is
        given, or if ``kind`` is not a supported plot type.
    :raises KeyError: if ``treatment_name`` or ``outcome_name`` is missing.
    """
    causal_only_keys = ('method_name', 'common_causes', 'dot_graph',
                        'treatment_name', 'outcome_name')

    method_name = kwargs.get('method_name') or "backdoor.propensity_score_matching"
    logging.info("Using {} for estimation.".format(method_name))

    if kwargs.get('common_causes'):
        self.use_graph = False
    elif kwargs.get('dot_graph'):
        self.use_graph = True
    else:
        raise Exception("You must specify a method for determining a backdoor set.")

    # Validate the plot type before doing any modelling work.  A missing
    # 'kind' is treated like 'line'.
    kind = kwargs.get('kind')
    if kind and kind not in ('bar', 'line'):
        raise Exception("Plot type {} not supported for causal plots!".format(kind))

    # NOTE(review): the treatment/outcome *columns* (not their names) are
    # handed to CausalModel, as in the original code -- confirm this is what
    # CausalModel expects.
    treatment = self._obj[kwargs["treatment_name"]]
    outcome = self._obj[kwargs["outcome_name"]]
    if self.use_graph:
        # The graph lives in kwargs; ``args`` is a tuple and cannot be
        # indexed by name.
        model = CausalModel(data=self._obj,
                            treatment=treatment,
                            outcome=outcome,
                            graph=kwargs["dot_graph"])
    else:
        model = CausalModel(data=self._obj,
                            treatment=treatment,
                            outcome=outcome,
                            common_causes=kwargs["common_causes"])

    # Both supported plot kinds run the same identification + estimation.
    # The estimate is not used by the plot below; the call is kept so that
    # estimation problems still surface here.
    identified_estimand = model.identify_effect()
    model.estimate_effect(identified_estimand, method_name=method_name)

    # Strip the keywords consumed above so the plotting backend does not
    # receive options it does not know about.
    plot_kwargs = {key: value for key, value in kwargs.items()
                   if key not in causal_only_keys}
    self._obj.plot(*args, **plot_kwargs)
def average_treatment_effect_test(self, dataset="linear", beta=10,
                                  num_common_causes=1, num_instruments=1,
                                  num_samples=10000, treatment_is_binary=True):
    """Check that the estimator recovers the true ATE of a generated dataset.

    A dataset is generated with ``dowhy.datasets.linear_dataset``, the effect
    is identified with ``CausalModel`` and estimated with ``self._Estimator``.
    The test passes when the absolute estimation error is below
    ``self._error_tolerance`` times the magnitude of the true ATE.

    :param dataset: accepted for interface compatibility; not used here.
    :param beta: forwarded to ``linear_dataset``.
    :param num_common_causes: forwarded to ``linear_dataset``.
    :param num_instruments: forwarded to ``linear_dataset``.
    :param num_samples: forwarded to ``linear_dataset``.
    :param treatment_is_binary: forwarded to ``linear_dataset``.
    :raises AssertionError: if the estimate is outside the tolerance.
    """
    data = dowhy.datasets.linear_dataset(
        beta=beta,
        num_common_causes=num_common_causes,
        num_instruments=num_instruments,
        num_samples=num_samples,
        treatment_is_binary=treatment_is_binary)

    model = CausalModel(data=data['df'],
                        treatment=data["treatment_name"],
                        outcome=data["outcome_name"],
                        graph=data["gml_graph"],
                        proceed_when_unidentifiable=True,
                        test_significance=None)
    target_estimand = model.identify_effect()
    estimator_ate = self._Estimator(data['df'],
                                    identified_estimand=target_estimand,
                                    treatment=data["treatment_name"],
                                    outcome=data["outcome_name"],
                                    test_significance=None)
    true_ate = data["ate"]
    ate_estimate = estimator_ate.estimate_effect()

    # The signed error is reported, but the check must use magnitudes:
    # otherwise any underestimate (negative error) would trivially pass, and
    # a negative true ATE would make the bound negative.
    error = ate_estimate.value - true_ate
    print(
        "Error in ATE estimate = {0} with tolerance {1}%. Estimated={2},True={3}"
        .format(error, self._error_tolerance * 100, ate_estimate.value,
                true_ate))
    assert abs(error) < abs(true_ate) * self._error_tolerance
def null_refutation_test(self, data=None, dataset="linear", beta=10,
                         num_common_causes=1, num_instruments=1,
                         num_samples=100000, treatment_is_binary=True):
    """Check that a refuter with zero effect strength leaves the estimate unchanged.

    Uses the caller's ``data`` object when one is given; otherwise a dataset
    is generated with ``dowhy.datasets.linear_dataset`` from the remaining
    arguments (``dataset`` itself is not used).  The refuter named by
    ``self.refuter_method`` is run twice: once with the configured effect
    strengths (only to surface exceptions) and once with both strengths set
    to zero, whose ``new_effect`` must stay within ``self._error_tolerance``
    (relative) of the original estimate.

    :raises AssertionError: if the zero-strength refutation moves the
        estimate by more than the tolerance.
    """
    # Supports user-provided dataset object
    if data is None:
        data = dowhy.datasets.linear_dataset(
            beta=beta,
            num_common_causes=num_common_causes,
            num_instruments=num_instruments,
            num_samples=num_samples,
            treatment_is_binary=treatment_is_binary)

    causal_model = CausalModel(data=data['df'],
                               treatment=data["treatment_name"],
                               outcome=data["outcome_name"],
                               graph=data["gml_graph"],
                               proceed_when_unidentifiable=True,
                               test_significance=None)
    estimand = causal_model.identify_effect()
    original = causal_model.estimate_effect(
        identified_estimand=estimand,
        method_name=self.estimator_method,
        test_significance=None)
    # Not used below; the lookup is kept so a dataset without an "ate"
    # entry fails exactly as before.
    true_ate = data["ate"]

    # First pass: configured strengths, run only to test for exceptions.
    causal_model.refute_estimate(
        estimand,
        original,
        method_name=self.refuter_method,
        confounders_effect_on_treatment=self.confounders_effect_on_t,
        confounders_effect_on_outcome=self.confounders_effect_on_y,
        effect_strength_on_treatment=self.effect_strength_on_t,
        effect_strength_on_outcome=self.effect_strength_on_y)

    # Second pass: with zero strengths the estimate should be identical.
    zero_strength = causal_model.refute_estimate(
        estimand,
        original,
        method_name=self.refuter_method,
        confounders_effect_on_treatment=self.confounders_effect_on_t,
        confounders_effect_on_outcome=self.confounders_effect_on_y,
        effect_strength_on_treatment=0,
        effect_strength_on_outcome=0)

    error = abs(zero_strength.new_effect - original.value)
    message = "Error in refuted estimate = {0} with tolerance {1}%. Estimated={2},After Refutation={3}"
    print(message.format(error, self._error_tolerance * 100,
                         original.value, zero_strength.new_effect))
    within_tolerance = error < abs(original.value) * self._error_tolerance
    assert within_tolerance
class CausalAccessor(object):
    """Causal-inference helper bound to a pandas object.

    Keeps an optional cached ``CausalModel``, identified estimand and sampler
    so that repeated stateful calls to :meth:`do` can reuse them.
    """

    def __init__(self, pandas_obj):
        """Wrap ``pandas_obj`` and start with no cached state."""
        self._obj = pandas_obj
        self._causal_model = None
        self._sampler = None
        self._identified_estimand = None
        self._method = None

    def reset(self):
        """Drop the cached model, estimand, sampler and method name."""
        self._causal_model = None
        self._identified_estimand = None
        self._sampler = None
        self._method = None

    def do(self, x, method='weighting', num_cores=1, variable_types=None, outcome=None, params=None, dot_graph=None,
           common_causes=None, instruments=None, estimand_type='ate', proceed_when_unidentifiable=False,
           stateful=False):
        """Sample from the wrapped data under an intervention on ``x``.

        :param x: str, list or dict naming the variable(s) to intervene on;
            see :meth:`parse_x`.
        :param method: sampler name; the class ``method + "_sampler"`` is
            looked up through ``do_samplers.get_class_object``.
        :param num_cores: forwarded to the sampler.
        :param variable_types: dict forwarded to the sampler; ``None`` (the
            default) is replaced by a fresh empty dict on every call.
        :param outcome: forwarded to ``CausalModel`` as the outcome.
        :param params: forwarded to the sampler.
        :param dot_graph: forwarded to ``CausalModel`` as ``graph``.
        :param common_causes: forwarded to ``CausalModel``.
        :param instruments: forwarded to ``CausalModel``.
        :param estimand_type: forwarded to ``CausalModel``.
        :param proceed_when_unidentifiable: forwarded to ``CausalModel``.
        :param stateful: when true (and ``method`` is unchanged since the
            previous call) the cached model and sampler are reused and kept
            afterwards; otherwise state is reset before and after the call.
        :return: whatever the sampler's ``do_sample(x)`` returns.
        """
        # A fresh dict per call: a shared ``{}`` default would leak
        # mutations between calls.
        if variable_types is None:
            variable_types = {}

        x, keep_original_treatment = self.parse_x(x)
        if not stateful or method != self._method:
            self.reset()
        if not self._causal_model:
            # Only the first key of ``x`` is passed to CausalModel as the
            # treatment.
            self._causal_model = CausalModel(self._obj,
                                             list(x)[0],
                                             outcome,
                                             graph=dot_graph,
                                             common_causes=common_causes,
                                             instruments=instruments,
                                             estimand_type=estimand_type,
                                             proceed_when_unidentifiable=proceed_when_unidentifiable)
            self._identified_estimand = self._causal_model.identify_effect()
        if not self._sampler:
            self._method = method
            do_sampler_class = do_samplers.get_class_object(method + "_sampler")
            self._sampler = do_sampler_class(self._obj,
                                             self._identified_estimand,
                                             self._causal_model._treatment,
                                             self._causal_model._outcome,
                                             params=params,
                                             variable_types=variable_types,
                                             num_cores=num_cores,
                                             causal_model=self._causal_model,
                                             keep_original_treatment=keep_original_treatment)
        result = self._sampler.do_sample(x)
        if not stateful:
            self.reset()
        return result

    def parse_x(self, x):
        """Normalise ``x`` to ``(dict, keep_original_treatment)``.

        A str or list names variables without values: each name maps to
        ``None`` and the flag is ``True``.  A dict is returned unchanged with
        the flag ``False``.  Subclasses of these types are accepted.

        :raises TypeError: (an ``Exception`` subclass) for any other type.
        """
        if isinstance(x, str):
            return {x: None}, True
        if isinstance(x, list):
            return {xi: None for xi in x}, True
        if isinstance(x, dict):
            return x, False
        raise TypeError('x format not recognized: {}'.format(type(x)))
def CalDoWhy(dat):
    """Estimate a causal effect for ``dat`` with backdoor linear regression.

    :param dat: mapping providing the ``"df"``, ``"treatment_name"``,
        ``"outcome_name"`` and ``"gml_graph"`` entries used to build the
        ``CausalModel``.
    :return: the result of ``estimate_effect`` for the identified estimand.
    """
    causal_model = CausalModel(
        data=dat["df"],
        treatment=dat["treatment_name"],
        outcome=dat["outcome_name"],
        graph=dat["gml_graph"],
    )
    # Identification feeds straight into estimation.
    return causal_model.estimate_effect(
        causal_model.identify_effect(),
        method_name="backdoor.linear_regression",
    )
def custom_data_average_treatment_effect_test(self, data):
    """Check that the estimator recovers the true ATE of a caller-supplied dataset.

    The effect is identified with ``CausalModel`` and estimated with
    ``self._Estimator``.  The test passes when the absolute estimation error
    is below ``self._error_tolerance`` times the magnitude of the true ATE.

    :param data: mapping providing the ``'df'``, ``"treatment_name"``,
        ``"outcome_name"``, ``"gml_graph"`` and ``"ate"`` entries.
    :raises AssertionError: if the estimate is outside the tolerance.
    """
    model = CausalModel(data=data['df'],
                        treatment=data["treatment_name"],
                        outcome=data["outcome_name"],
                        graph=data["gml_graph"],
                        proceed_when_unidentifiable=True,
                        test_significance=None)
    target_estimand = model.identify_effect()
    estimator_ate = self._Estimator(data['df'],
                                    identified_estimand=target_estimand,
                                    treatment=data["treatment_name"],
                                    outcome=data["outcome_name"],
                                    test_significance=None)
    true_ate = data["ate"]
    ate_estimate = estimator_ate.estimate_effect()

    # The signed error is reported, but the check must use magnitudes:
    # otherwise any underestimate (negative error) would trivially pass, and
    # a negative true ATE would make the bound negative.
    error = ate_estimate.value - true_ate
    print(
        "Error in ATE estimate = {0} with tolerance {1}%. Estimated={2},True={3}"
        .format(error, self._error_tolerance * 100, ate_estimate.value,
                true_ate))
    assert abs(error) < abs(true_ate) * self._error_tolerance
def register_graph():
    """Estimate a causal effect from request parameters and return it as JSON.

    Reads ``digraph``, ``dataset``, ``treatment`` and ``outcome`` from the
    request's query arguments, loads the named Dataiku dataset as a
    DataFrame, and runs a backdoor linear-regression estimate with
    significance testing enabled.

    :return: JSON string ``{"results": "<str() of the estimate>"}``.
    """
    query = request.args
    digraph = query.get('digraph')
    dataset = query.get('dataset')
    treatment_name = query.get('treatment')
    outcome_name = query.get('outcome')

    frame = dataiku.Dataset(dataset).get_dataframe()
    causal_model = CausalModel(data=frame,
                               treatment=treatment_name,
                               outcome=outcome_name,
                               graph=digraph)

    estimand = causal_model.identify_effect()
    regression_estimate = causal_model.estimate_effect(
        estimand,
        method_name="backdoor.linear_regression",
        test_significance=True)

    return json.dumps({'results': str(regression_estimate)})
method_name="iv.regression_discontinuity", method_params={ 'rd_variable_name': 'Z1', 'rd_threshold_value': 0.5, 'rd_bandwidth': 0.1 }) print(causal_estimate_regdist) print("Causal Estimate is " + str(causal_estimate_regdist.value)) if __name__ == "__main__": data = dowhy.datasets.linear_dataset(beta=10, num_common_causes=5, num_instruments=2, num_samples=10000, treatment_is_binary=True) # With graph model = CausalModel(data=data['df'], treatment=data["treatment_name"], outcome=data["outcome_name"], graph=data["dot_graph"], instruments=data["instrument_names"], logging_level=logging.INFO) model.view_model() identified_estimand = model.identify_effect() print(identified_estimand) regression(model, identified_estimand)
class CausalAccessor(object):
    def __init__(self, pandas_obj):
        """
        An accessor for the pandas.DataFrame under the `causal` namespace.

        :param pandas_obj: the pandas object this accessor wraps.
        """
        self._obj = pandas_obj
        self._causal_model = None
        self._sampler = None
        self._identified_estimand = None
        self._method = None

    def reset(self):
        """
        If a `causal` namespace method (especially `do`) was run statefully, this resets the namespace.

        :return: None
        """
        self._causal_model = None
        self._identified_estimand = None
        self._sampler = None
        self._method = None

    def do(self, x, method='weighting', num_cores=1, variable_types=None, outcome=None, params=None, dot_graph=None,
           common_causes=None, estimand_type='ate', proceed_when_unidentifiable=False, stateful=False):
        """
        The do-operation implemented with sampling. This will return a pandas.DataFrame with the outcome
        variable(s) replaced with samples from P(Y|do(X=x)).

        If the value of `x` is left unspecified (e.g. as a string or list), then the original values of `x` are left in
        the DataFrame, and Y is sampled from its respective P(Y|do(x)). If the value of `x` is specified (passed with a
        `dict`, where variable names are keys, and values are specified) then the new `DataFrame` will contain the
        specified values of `x`.

        For some methods, the `variable_types` field must be specified. It should be a `dict`, where the keys are
        variable names, and values are 'o' for ordered discrete, 'u' for un-ordered discrete, 'd' for discrete, or 'c'
        for continuous.

        Inference requires a set of control variables. These can be provided explicitly using `common_causes`, which
        contains a list of variable names to control for. These can be provided implicitly by specifying a causal graph
        with `dot_graph`, from which they will be chosen using the default identification method.

        When the set of control variables can't be identified with the provided assumptions, a prompt will raise to the
        user asking whether to proceed. To automatically over-ride the prompt, you can set the flag
        `proceed_when_unidentifiable` to `True`.

        Some methods build components during inference which are expensive. To retain those components for later
        inference (e.g. successive calls to `do` with different values of `x`), you can set the `stateful` flag to
        `True`. Be cautious about using the `do` operation statefully. State is set on the namespace, rather than the
        method, so can behave unpredictably. To reset the namespace and run statelessly again, you can call the `reset`
        method.

        :param x: str, list, dict: The causal state on which to intervene, and (optional) its interventional value(s).
        :param method: The inference method to use with the sampler. Currently, `'mcmc'`, `'weighting'`, and
            `'kernel_density'` are supported.
        :param num_cores: int: if the inference method only supports sampling a point at a time, this will parallelize
            sampling.
        :param variable_types: dict: The dictionary containing the variable types. Must contain the union of the causal
            state, control variables, and the outcome. `None` (the default) is treated as an empty dict.
        :param outcome: str: The outcome variable.
        :param params: dict: extra parameters to set as attributes on the sampler object
        :param dot_graph: str: A string specifying the causal graph.
        :param common_causes: list: A list of strings containing the variable names to control for.
        :param estimand_type: str: 'ate' is the only one currently supported. Others may be added later, to allow for
            CATE estimation.
        :param proceed_when_unidentifiable: bool: A flag to over-ride user prompts to proceed when effects aren't
            identifiable with the assumptions provided.
        :param stateful: bool: Whether to retain state. By default, the do operation is stateless.
        :return: pandas.DataFrame: A DataFrame containing the sampled outcome
        """
        # A fresh dict per call: a shared ``{}`` default would leak
        # mutations between calls.
        if variable_types is None:
            variable_types = {}

        x, keep_original_treatment = self.parse_x(x)
        if not stateful or method != self._method:
            self.reset()
        if not self._causal_model:
            # Only the first key of ``x`` is passed to CausalModel as the
            # treatment.
            self._causal_model = CausalModel(
                self._obj,
                list(x)[0],
                outcome,
                graph=dot_graph,
                common_causes=common_causes,
                instruments=None,
                estimand_type=estimand_type,
                proceed_when_unidentifiable=proceed_when_unidentifiable)
            self._identified_estimand = self._causal_model.identify_effect()
        if not self._sampler:
            self._method = method
            do_sampler_class = do_samplers.get_class_object(method + "_sampler")
            self._sampler = do_sampler_class(
                self._obj,
                self._identified_estimand,
                self._causal_model._treatment,
                self._causal_model._outcome,
                params=params,
                variable_types=variable_types,
                num_cores=num_cores,
                causal_model=self._causal_model,
                keep_original_treatment=keep_original_treatment)
        result = self._sampler.do_sample(x)
        if not stateful:
            self.reset()
        return result

    def parse_x(self, x):
        """
        Normalise `x` to a `(dict, keep_original_treatment)` pair.

        A str or list names variables without values: each name maps to `None` and the flag is `True`. A dict is
        returned unchanged with the flag `False`. Subclasses of these types are accepted.

        :param x: str, list, dict: the intervention specification passed to `do`.
        :return: tuple: the normalised dict and the keep-original-treatment flag.
        :raises TypeError: (an `Exception` subclass) if `x` is none of the supported types.
        """
        if isinstance(x, str):
            return {x: None}, True
        if isinstance(x, list):
            return {xi: None for xi in x}, True
        if isinstance(x, dict):
            return x, False
        raise TypeError('x format not recognized: {}'.format(type(x)))
class DoWhyExample:
    """Small worked example that lazily builds and caches a DoWhy pipeline.

    The model, identified estimand and estimate are cached in ``t_model``,
    ``t_identify`` and ``t_estimate``; each step builds the steps it depends
    on when they are missing.  Pass ``force_again=True`` to rebuild.
    """

    # Generated at class-definition time; not used by the methods below.
    data_old = ds.linear_dataset(beta=10,
                                 num_common_causes=5,
                                 num_instruments=5,
                                 num_samples=10000,
                                 treatment_is_binary=True)

    gml_graph = ('graph[directed 1'
                 'node[ id "TOJ" label "TOJ"]'
                 'node[ id "IntCur" label "IntCur"]'
                 'node[ id "U" label "Unobserved Confounders"]'
                 'edge[source "TOJ" target "IntCur"]'
                 'edge[source "U" target "TOJ"]'
                 'edge[source "U" target "IntCur"]')

    gml_graph = add_node(gml_graph, "YeshivaAdults", "IntCur")
    gml_graph = add_node(gml_graph, "Sex", "IntCur")
    gml_graph = add_node(gml_graph, "Age", "IntCur")

    gml_graph = connect_node(gml_graph, "Age", "TOJ")
    gml_graph = connect_node(gml_graph, "Age", "YeshivaAdults")
    gml_graph = connect_node(gml_graph, "Sex", "YeshivaAdults")
    gml_graph = connect_node(gml_graph, "TOJ", "YeshivaAdults")
    gml_graph = gml_graph + ']'

    # table
    # ID Age Sex TOJ (Orthodox)? (Treatment?) Yeshiva? Intell. Curios? (Outcome)
    data = pd.DataFrame(
        np.array([[30.0, 1.0, 1.0, 1.0, 0.0], [40.0, 1.0, 0.0, 0.0, 1.0]]),
        columns=['Age', 'Sex', 'TOJ', 'YeshivaAdults', 'IntCur'])

    # Cached pipeline stages (None until first built).
    t_model = None
    t_identify = None
    t_estimate = None

    def model(self, force_again=False):
        """Return the cached ``CausalModel``, building it if missing or forced."""
        if self.t_model is None or force_again:
            self.t_model = CausalModel(data=self.data,
                                       treatment='TOJ',
                                       outcome='IntCur',
                                       graph=self.gml_graph)
        return self.t_model

    def identify(self, force_again=False):
        """Return the cached estimand, identifying it if missing or forced.

        Forcing also rebuilds the model.
        """
        if self.t_identify is None or force_again:
            # model() builds the model only when it is missing or forced.
            self.t_identify = self.model(
                force_again=force_again).identify_effect()
        return self.t_identify

    def estimate(self,
                 method_name="backdoor.propensity_score_matching",
                 force_again=False):
        """Return the cached estimate, computing it if missing or forced.

        The cache is not keyed on ``method_name``: once an estimate exists it
        is returned as-is unless ``force_again`` is true.
        """
        if self.t_estimate is None or force_again:
            # Resolve the estimand first so the model is guaranteed to exist
            # (and is the freshly rebuilt one when forced) before it is used;
            # dereferencing ``self.t_model`` directly would fail with None
            # when estimate() is the first method called.
            estimand = self.identify(force_again)
            self.t_estimate = self.model().estimate_effect(
                estimand, method_name)
        return self.t_estimate

    def refute(self, method_name="random_common_cause", force_again=False):
        """Run the refuter ``method_name`` against the (cached) estimate.

        With ``force_again`` the whole pipeline is rebuilt once, and the
        refutation runs on that same rebuilt model.
        """
        # estimate() rebuilds model and estimand as needed; afterwards the
        # cached model/estimand are the ones the estimate was computed from.
        estimate = self.estimate(force_again=force_again)
        return self.model().refute_estimate(self.identify(),
                                            estimate,
                                            method_name=method_name)