def __init__(self, X, causes, effects, admissable_set=None, variable_types=None, expectation=False, density=True):
    """
    We want to calculate the causal effect of X and Y through
    back-door adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z)
    for some admissable set of control variables, Z.  First we
    calculate the conditional density P(Y|X,Z), then the density P(Z).
    We find the support of Z so we can properly sum over it later.
    variable_types are a dictionary with the column name pointing to an
    element of set(['o', 'u', 'c']), for 'ordered', 'unordered discrete',
    or 'continuous'.

    Raises ValueError if variable_types is not given (inference of types
    is not implemented yet).  NOTE(review): the `density` parameter is
    currently unused by this constructor.
    """
    # A mutable default ([]) is one shared list across every call; use a
    # None sentinel instead.  Passing an explicit list behaves as before.
    if admissable_set is None:
        admissable_set = []
    conditional_density_vars = causes + admissable_set
    self.causes = causes
    self.effects = effects
    self.admissable_set = admissable_set
    self.conditional_density_vars = conditional_density_vars
    # Use subsampled ("efficient") bandwidth estimation for larger problems.
    if len(X) > 300 or max(len(causes + admissable_set), len(effects + admissable_set)) >= 3:
        self.defaults = EstimatorSettings(n_jobs=4, efficient=True)
    else:
        self.defaults = EstimatorSettings(n_jobs=-1, efficient=False)
    if variable_types:
        self.variable_types = variable_types
    else:
        # __infer_variable_types is unimplemented upstream; previously this
        # path fell through to `variable_types.values()` and crashed with an
        # AttributeError and undefined dep_type/indep_type locals.
        self.variable_types = self.__infer_variable_types(X)
        if not self.variable_types:
            raise ValueError("variable_types could not be inferred; "
                             "pass variable_types explicitly.")
    # Derive the kernel type strings only after variable_types is resolved.
    dep_type = [self.variable_types[var] for var in effects]
    indep_type = [self.variable_types[var] for var in conditional_density_vars]
    density_types = [self.variable_types[var] for var in admissable_set]
    # Cross-validated ML bandwidth for all-discrete data; normal-reference
    # otherwise.
    if 'c' not in self.variable_types.values():
        bw = 'cv_ml'
    else:
        bw = 'normal_reference'
    if admissable_set:
        self.density = KDEMultivariate(X[admissable_set],
                                       var_type=''.join(density_types),
                                       bw=bw,
                                       defaults=self.defaults)
    self.conditional_density = KDEMultivariateConditional(endog=X[effects],
                                                          exog=X[conditional_density_vars],
                                                          dep_type=''.join(dep_type),
                                                          indep_type=''.join(indep_type),
                                                          bw=bw,
                                                          defaults=self.defaults)
    if expectation:
        self.conditional_expectation = KernelReg(X[effects].values,
                                                 X[conditional_density_vars].values,
                                                 ''.join(indep_type),
                                                 bw='cv_ls')
    self.support = self.__get_support(X)
    self.discrete_variables = [variable for variable, var_type in self.variable_types.items()
                               if var_type in ['o', 'u']]
    self.discrete_Z = list(set(self.discrete_variables).intersection(set(admissable_set)))
    self.continuous_variables = [variable for variable, var_type in self.variable_types.items()
                                 if var_type == 'c']
    self.continuous_Z = list(set(self.continuous_variables).intersection(set(admissable_set)))
def _fit_conditional(self):
    """Fit the conditional KDE of the outcomes given treatments plus back-door variables."""
    exog_columns = self._treatment_names + self._target_estimand.backdoor_variables
    self.conditional_density = KDEMultivariateConditional(
        endog=self._data[self._outcome_names],
        exog=self._data[exog_columns],
        dep_type="".join(self.dep_type),
        indep_type="".join(self.indep_type),
        bw=self.bw,
        defaults=self.defaults,
    )
def __init__(self, outcome_upper_support, outcome_lower_support, outcome_names, treatment_names, backdoor_variables, data, dep_type, indep_type, bw, defaults):
    """Store sampler configuration and fit the conditional KDE of outcomes given treatments and back-door variables."""
    # Data and column bookkeeping.
    self._data = data
    self._outcome_names = outcome_names
    self._treatment_names = treatment_names
    self._backdoor_variables = backdoor_variables
    # Kernel configuration.
    self.dep_type = dep_type
    self.indep_type = indep_type
    self.bw = bw
    self.defaults = defaults
    # Outcome support bounds, used later when inverting the conditional CDF.
    self.outcome_lower_support = outcome_lower_support
    self.outcome_upper_support = outcome_upper_support
    conditioning_columns = self._treatment_names + self._backdoor_variables
    self.conditional_density = KDEMultivariateConditional(
        endog=self._data[self._outcome_names],
        exog=self._data[conditioning_columns],
        dep_type="".join(self.dep_type),
        indep_type="".join(self.indep_type),
        bw=self.bw,
        defaults=self.defaults,
    )
class KernelSampler(object):
    """Inverse-CDF sampler over a conditional KDE of outcomes given treatments and back-door variables."""

    def __init__(self, outcome_upper_support, outcome_lower_support,
                 outcome_names, treatment_names, backdoor_variables, data,
                 dep_type, indep_type, bw, defaults):
        """Store configuration and fit the conditional KDE P(outcome | treatment, back-door)."""
        self._data = data
        self._outcome_names = outcome_names
        self._treatment_names = treatment_names
        self._backdoor_variables = backdoor_variables
        self.dep_type = dep_type
        self.indep_type = indep_type
        self.bw = bw
        self.defaults = defaults
        # Outcome support bounds used to lay out the CDF evaluation grid.
        self.outcome_lower_support = outcome_lower_support
        self.outcome_upper_support = outcome_upper_support
        self.conditional_density = KDEMultivariateConditional(
            endog=self._data[self._outcome_names],
            exog=self._data[self._treatment_names + self._backdoor_variables],
            dep_type=''.join(self.dep_type),
            indep_type=''.join(self.indep_type),
            bw=self.bw,
            defaults=self.defaults)

    def sample_point(self, x_z):
        """Draw one outcome sample conditional on the exogenous point x_z.

        Evaluates the conditional CDF on a grid over the outcome support,
        inverts it with linear interpolation, and plugs in a uniform draw.
        Retries recursively if the interpolator rejects the draw.
        """
        # Silverman's rule-of-thumb bandwidth per outcome column.
        y_bw = 1.06 * self._data[self._outcome_names].std() * (
            self._data[self._outcome_names].count())**(-1. / 5.)
        # Roughly 5 grid points per bandwidth across the support.
        n = 5 * np.ceil(
            (self.outcome_upper_support - self.outcome_lower_support) / y_bw)
        cum_ranges = [
            # np.linspace requires an integer sample count on modern NumPy;
            # n is a float array, so cast explicitly.
            np.linspace(self.outcome_lower_support[i],
                        self.outcome_upper_support[i], int(n[i]))
            for i in range(len(self._outcome_names))
        ]
        res = np.meshgrid(*cum_ranges)
        # np.int was removed in NumPy 1.24; the builtin int is the correct
        # spelling and behaves identically here.
        points = np.array(res).reshape(len(self._outcome_names),
                                       int(n.cumprod()[-1])).T
        x_z_repeated = np.repeat(x_z, len(points)).reshape(len(points),
                                                           len(x_z))
        cdf_vals = self._evaluate_cdf(points, x_z_repeated)
        # Pin the CDF to [0, 1] just outside the support so the inverse is
        # defined for every uniform draw.
        cdf_vals = np.hstack([[0.], cdf_vals, [1.]])
        points = np.vstack([[self.outcome_lower_support - 3. * y_bw], points,
                            [self.outcome_upper_support + 3. * y_bw]])
        inv_cdf = interp1d(cdf_vals.flatten(),
                           points.flatten(),
                           fill_value=0.,
                           axis=0)
        r = np.random.rand()
        try:
            return inv_cdf(r)
        except ValueError:
            # Degenerate grid / non-monotone CDF for this draw: retry.
            return self.sample_point(x_z)

    def _evaluate_cdf(self, y, x_z):
        """Evaluate the fitted conditional CDF at outcome points y given exogenous points x_z."""
        return self.conditional_density.cdf(endog_predict=[y],
                                            exog_predict=x_z)
def main(args):
    """Fit a conditional KDE of scaled observation columns given hour-of-day and pickle it.

    Reads two CSVs (a main and an "extra" dataset), splits the main one by
    day, standardizes the training observations, fits a statsmodels
    KDEMultivariateConditional with hour as the (ordered-discrete)
    conditioning variable, and writes the model to
    ``<results_folder>/<experiment_name>.pickle``.
    """
    # Notebook experiment settings
    experiment_name = args.experiment_name
    experiment_results_folder = args.results_folder
    results_path = os.path.join("../", experiment_results_folder)
    data_folder = args.data_folder
    data_file = args.data_file
    extra_data_file = args.extra_data_file
    # Write a throwaway pickle first to fail fast if the results path is
    # not writable, before spending time on the (expensive) model fit.
    file_name_test = f"{experiment_name}_test.pickle"
    file_path_test = os.path.join(results_path, file_name_test)
    print(f"Saving: {file_name_test}")
    with open(file_path_test, 'wb') as f:
        pickle.dump("test", f)
    # Data settings
    obs_cols = args.obs_cols
    # Load data.  NOTE(review): the two files parse different date columns
    # ([4, 11] vs [4, 12]) — presumably their schemas differ; verify.
    csv_path = os.path.join(data_folder, data_file)
    donkey_df = pd.read_csv(csv_path, parse_dates=[4, 11])
    csv_path = os.path.join(data_folder, extra_data_file)
    extra_df = pd.read_csv(csv_path, parse_dates=[4, 12])
    # Data prep: split the main dataset on day boundaries.
    train_idx, test_idx = get_split_idx_on_day(donkey_df)
    # Create full data: the extra dataset is training-only.
    test_data = donkey_df.loc[test_idx, obs_cols]
    train_data = pd.concat((donkey_df.loc[train_idx, obs_cols],
                            extra_df.loc[:, obs_cols]))
    # Normalize data: the scaler is fit on training data only.
    obs_scaler = StandardScaler().fit(train_data)
    scaled_train_data = obs_scaler.transform(train_data)
    # Create conditional variable: hour of day from the merge_date column,
    # shaped (n, 1) for the KDE's exog argument.
    hours = pd.concat((donkey_df.loc[train_idx, :],
                       extra_df.loc[:, :])).merge_date.dt.hour.values
    hours = np.expand_dims(hours, 1)
    # NOTE(review): scaled_test_data is computed but never used below —
    # confirm whether an evaluation step was dropped.
    scaled_test_data = obs_scaler.transform(test_data)
    # dep_type='cc' implies exactly two continuous observation columns;
    # indep_type='o' treats hour as ordered discrete.
    statsmods = KDEMultivariateConditional(endog=scaled_train_data,
                                           exog=hours,
                                           indep_type='o',
                                           dep_type='cc',
                                           bw='cv_ml')
    results_dict = {'model': statsmods, }
    file_name = f"{experiment_name}.pickle"
    file_path = os.path.join(results_path, file_name)
    print(f"Saving: {file_name}")
    with open(file_path, 'wb') as f:
        pickle.dump(results_dict, f)
def _compute_conditional_kde(self, dep, inds, normref=True):
    """Fit and cache a conditional KDE of node `dep` given nodes `inds` (all continuous).

    With normref=True the fast normal-reference bandwidth is used; otherwise
    cross-validated ML with efficient (subsampled) estimation.
    """
    endog = self.node_data.info[dep]['data']
    exog = [self.node_data.info[node]['data'] for node in inds]
    started = time.time()
    kde_kwargs = dict(endog=endog,
                      exog=exog,
                      dep_type='c',
                      indep_type='c' * len(exog))
    if normref:
        kde_kwargs['bw'] = 'normal_reference'
    else:
        kde_kwargs['bw'] = 'cv_ml'
        kde_kwargs['defaults'] = EstimatorSettings(efficient=True)
    kde = KDEMultivariateConditional(**kde_kwargs)
    print("Fit conditional KDE for %s wrt %s in %s seconds" %
          (dep, inds, time.time() - started))
    self.kdes_conditional[dep][inds] = kde
def continuous_treatment_model(data, covariates, treatment, variable_types):
    """Score each row with the estimated conditional density of the treatment given the covariates."""
    data, covariates = binarize_discrete(data, covariates, variable_types)
    # Efficient (subsampled) bandwidth estimation for larger problems.
    big_problem = len(data) > 300 or len(treatment + covariates) >= 3
    if big_problem:
        defaults = EstimatorSettings(n_jobs=4, efficient=True)
    else:
        defaults = EstimatorSettings(n_jobs=-1, efficient=False)
    # Cross-validated ML bandwidth when everything is discrete; the
    # normal-reference rule otherwise.
    bw = 'cv_ml' if 'c' not in variable_types.values() else 'normal_reference'
    indep_type = get_type_string(covariates, variable_types)
    dep_type = get_type_string([treatment], variable_types)
    model = KDEMultivariateConditional(
        endog=data[treatment],
        exog=data[covariates],
        dep_type=''.join(dep_type),
        indep_type=''.join(indep_type),
        bw=bw,
        defaults=defaults,
    )
    return model.pdf(endog_predict=data[treatment],
                     exog_predict=data[covariates])
def estimate_cond_pdf(self, x, z, X):
    """Estimate p(x | z) from data X; when z is empty, return the marginal density of x instead."""
    types_of = self.variable_types
    # normal_reference works better with mixed types
    all_discrete = 'c' not in [types_of[v] for v in x + z]
    bw = 'cv_ml' if all_discrete else 'cv_ls'  # 'normal_reference'
    # if conditioning on the empty set, return a pdf instead of cond pdf
    if not z:
        return KDEMultivariate(X[x],
                               var_type=''.join(types_of[v] for v in x),
                               bw=bw,
                               defaults=self.defaults)
    return KDEMultivariateConditional(endog=X[x],
                                      exog=X[z],
                                      dep_type=''.join(types_of[v] for v in x),
                                      indep_type=''.join(types_of[v] for v in z),
                                      bw=bw,
                                      defaults=self.defaults)
class CausalEffect(object):
    """Back-door adjustment estimator: P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z )."""

    def __init__(self, X, causes, effects, admissable_set=None, variable_types=None, expectation=False, density=True):
        """
        We want to calculate the causal effect of X and Y through
        back-door adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z)
        for some admissable set of control variables, Z.  First we
        calculate the conditional density P(Y|X,Z), then the density P(Z).
        We find the support of Z so we can properly sum over it later.
        variable_types are a dictionary with the column name pointing to an
        element of set(['o', 'u', 'c']), for 'ordered', 'unordered discrete',
        or 'continuous'.

        Raises ValueError when variable_types is not given (type inference
        is not implemented yet).
        """
        # [] as a default argument is one shared list across every call;
        # use a None sentinel instead.
        if admissable_set is None:
            admissable_set = []
        conditional_density_vars = causes + admissable_set
        self.causes = causes
        self.effects = effects
        self.admissable_set = admissable_set
        self.conditional_density_vars = conditional_density_vars
        if variable_types:
            self.variable_types = variable_types
        else:
            # __infer_variable_types is unimplemented; previously this path
            # crashed later on `variable_types.values()` with undefined
            # dep_type/indep_type locals.  Fail fast with a clear message.
            self.variable_types = self.__infer_variable_types(X)
            if not self.variable_types:
                raise ValueError("variable_types could not be inferred; "
                                 "pass variable_types explicitly.")
        # Derive kernel type strings after variable_types is resolved.
        dep_type = [self.variable_types[var] for var in effects]
        indep_type = [self.variable_types[var] for var in conditional_density_vars]
        density_types = [self.variable_types[var] for var in admissable_set]
        if 'c' not in self.variable_types.values():
            bw = 'cv_ml'
        else:
            bw = 'normal_reference'
        if admissable_set:
            self.density = KDEMultivariate(X[admissable_set],
                                           var_type=''.join(density_types),
                                           bw=bw)
        self.conditional_density = KDEMultivariateConditional(endog=X[effects],
                                                              exog=X[conditional_density_vars],
                                                              dep_type=''.join(dep_type),
                                                              indep_type=''.join(indep_type),
                                                              bw=bw)
        if expectation:
            self.conditional_expectation = KernelReg(X[effects].values,
                                                     X[conditional_density_vars].values,
                                                     ''.join(indep_type),
                                                     bw='cv_ls')
        self.support = self.__get_support(X)
        self.discrete_variables = [variable for variable, var_type in self.variable_types.items()
                                   if var_type in ['o', 'u']]
        self.discrete_Z = list(set(self.discrete_variables).intersection(set(admissable_set)))
        self.continuous_variables = [variable for variable, var_type in self.variable_types.items()
                                     if var_type == 'c']
        self.continuous_Z = list(set(self.continuous_variables).intersection(set(admissable_set)))

    def __infer_variable_types(self, X):
        """
        fill this in later.
        """
        pass

    def __get_support(self, X):
        """
        find the smallest cube around which the densities are supported,
        allowing a little flexibility for variables with larger bandwidths.
        """
        data_support = {variable: (X[variable].min(), X[variable].max())
                        for variable in X.columns}
        variable_bandwidths = {variable: bw for variable, bw in
                               zip(self.effects + self.conditional_density_vars,
                                   self.conditional_density.bw)}
        support = {}
        for variable in self.effects + self.conditional_density_vars:
            if self.variable_types[variable] == 'c':
                # Pad continuous supports by 10 bandwidths on each side.
                lower_support = data_support[variable][0] - 10. * variable_bandwidths[variable]
                upper_support = data_support[variable][1] + 10. * variable_bandwidths[variable]
                support[variable] = (lower_support, upper_support)
            else:
                support[variable] = data_support[variable]
        return support

    def integration_function(self, *args):
        """Integrand P(Y|X,Z)P(Z); takes continuous z, discrete z, then x, then y values."""
        # takes continuous z, discrete z, then x
        data = pd.DataFrame({k: [v] for k, v in
                             zip(self.continuous_Z + self.discrete_Z + self.causes + self.effects, args)})
        conditional = self.conditional_density.pdf(exog_predict=data[self.conditional_density_vars].values[0],
                                                   endog_predict=data[self.effects].values[0])
        density = self.density.pdf(data_predict=data[self.admissable_set])
        return conditional * density

    def expectation_integration_function(self, *args):
        """Integrand E[Y|X,Z]P(Z); takes continuous z, discrete z, then x values."""
        data = pd.DataFrame({k: [v] for k, v in
                             zip(self.continuous_Z + self.discrete_Z + self.causes, args)})
        conditional = self.conditional_expectation.fit(data_predict=data[self.conditional_density_vars].values)[0]
        density = self.density.pdf(data_predict=data[self.admissable_set])
        return conditional * density

    def pdf(self, x):
        """
        Currently, this does the whole sum/integral over the cube support of Z.
        We may be able to improve this by taking into account how the joint
        and conditionals factorize, and/or finding a more efficient support.

        This should be reasonably fast for |Z| <= 2 or 3, and small enough
        discrete variable cardinalities.  It runs in O(n_1 n_2 ... n_k) in
        the cardinality of the discrete variables, |Z_1| = n_1, etc.  It
        likewise runs in O(V^n) for n continuous Z variables.  Factorizing
        the joint/conditional distributions in the sum could linearize the
        runtime.
        """
        causal_effect = 0.
        x = x[self.causes + self.effects]
        if self.discrete_Z:
            # range() replaces the Python-2-only xrange(), which is a
            # NameError on Python 3.
            discrete_variable_ranges = [range(int(self.support[variable][0]),
                                              int(self.support[variable][1]) + 1)
                                        for variable in self.discrete_Z]
            for z_vals in itertools.product(*discrete_variable_ranges):
                z_discrete = pd.DataFrame({k: [v] for k, v in zip(self.discrete_Z, z_vals)})
                if self.continuous_Z:
                    continuous_Z_ranges = [self.support[variable] for variable in self.continuous_Z]
                    args = z_discrete.join(x).values[0]
                    causal_effect += nquad(self.integration_function,
                                           continuous_Z_ranges,
                                           args=args)[0]
                else:
                    z_discrete = z_discrete[self.admissable_set]
                    exog_predictors = x.join(z_discrete)[self.conditional_density_vars]
                    conditional = self.conditional_density.pdf(exog_predict=exog_predictors,
                                                               endog_predict=x[self.effects])
                    density = self.density.pdf(data_predict=z_discrete)
                    dc = conditional * density
                    causal_effect += dc
            return causal_effect
        elif self.continuous_Z:
            continuous_Z_ranges = [self.support[var] for var in self.continuous_Z]
            causal_effect, error = nquad(self.integration_function,
                                         continuous_Z_ranges,
                                         args=tuple(x.values[0]))
            return causal_effect
        else:
            # No adjustment set: P(Y|do(X)) reduces to P(Y|X).
            return self.conditional_density.pdf(exog_predict=x[self.causes],
                                                endog_predict=x[self.effects])

    def expected_value(self, x):
        """
        Currently, this does the whole sum/integral over the cube support of Z.
        We may be able to improve this by taking into account how the joint
        and conditionals factorize, and/or finding a more efficient support.

        This should be reasonably fast for |Z| <= 2 or 3, and small enough
        discrete variable cardinalities.  It runs in O(n_1 n_2 ... n_k) in
        the cardinality of the discrete variables, |Z_1| = n_1, etc.  It
        likewise runs in O(V^n) for n continuous Z variables.  Factorizing
        the joint/conditional distributions in the sum could linearize the
        runtime.
        """
        causal_effect = 0.
        x = x[self.causes]
        if self.discrete_Z:
            # range() replaces the Python-2-only xrange().
            discrete_variable_ranges = [range(int(self.support[variable][0]),
                                              int(self.support[variable][1]) + 1)
                                        for variable in self.discrete_Z]
            for z_vals in itertools.product(*discrete_variable_ranges):
                z_discrete = pd.DataFrame({k: [v] for k, v in zip(self.discrete_Z, z_vals)})
                if self.continuous_Z:
                    continuous_Z_ranges = [self.support[variable] for variable in self.continuous_Z]
                    args = z_discrete.join(x).values[0]
                    causal_effect += nquad(self.expectation_integration_function,
                                           continuous_Z_ranges,
                                           args=args)[0]
                else:
                    z_discrete = z_discrete[self.admissable_set]
                    exog_predictors = x.join(z_discrete)[self.conditional_density_vars]
                    causal_effect += (self.conditional_expectation.fit(data_predict=exog_predictors.values)[0]
                                      * self.density.pdf(data_predict=z_discrete.values))
            return causal_effect
        elif self.continuous_Z:
            continuous_Z_ranges = [self.support[var] for var in self.continuous_Z]
            causal_effect, error = nquad(self.expectation_integration_function,
                                         continuous_Z_ranges,
                                         args=tuple(x.values[0]))
            return causal_effect
        else:
            return self.conditional_expectation.fit(data_predict=x[self.causes])[0]
def __init__(self, X, causes, effects, admissable_set=None, variable_types=None, expectation=False, density=True):
    """
    We want to calculate the causal effect of X and Y through
    back-door adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z)
    for some admissable set of control variables, Z.  First we
    calculate the conditional density P(Y|X,Z), then the density P(Z).
    We find the support of Z so we can properly sum over it later.
    variable_types are a dictionary with the column name pointing to an
    element of set(['o', 'u', 'c']), for 'ordered', 'unordered discrete',
    or 'continuous'.

    Raises ValueError when variable_types is not given (type inference is
    not implemented yet).  NOTE(review): the `density` parameter is unused.
    """
    # [] as a default argument would be one shared list across calls.
    if admissable_set is None:
        admissable_set = []
    conditional_density_vars = causes + admissable_set
    self.causes = causes
    self.effects = effects
    self.admissable_set = admissable_set
    self.conditional_density_vars = conditional_density_vars
    if variable_types:
        self.variable_types = variable_types
    else:
        # Previously this path left dep_type/indep_type undefined and then
        # crashed on `variable_types.values()`; fail fast instead.
        self.variable_types = self.__infer_variable_types(X)
        if not self.variable_types:
            raise ValueError("variable_types could not be inferred; "
                             "pass variable_types explicitly.")
    # Kernel type strings, derived after variable_types is resolved.
    dep_type = [self.variable_types[var] for var in effects]
    indep_type = [
        self.variable_types[var] for var in conditional_density_vars
    ]
    density_types = [self.variable_types[var] for var in admissable_set]
    if 'c' not in self.variable_types.values():
        bw = 'cv_ml'
    else:
        bw = 'normal_reference'
    if admissable_set:
        self.density = KDEMultivariate(X[admissable_set],
                                       var_type=''.join(density_types),
                                       bw=bw)
    self.conditional_density = KDEMultivariateConditional(
        endog=X[effects],
        exog=X[conditional_density_vars],
        dep_type=''.join(dep_type),
        indep_type=''.join(indep_type),
        bw=bw)
    if expectation:
        self.conditional_expectation = KernelReg(
            X[effects].values,
            X[conditional_density_vars].values,
            ''.join(indep_type),
            bw='cv_ls')
    self.support = self.__get_support(X)
    self.discrete_variables = [
        variable for variable, var_type in self.variable_types.items()
        if var_type in ['o', 'u']
    ]
    self.discrete_Z = list(
        set(self.discrete_variables).intersection(set(admissable_set)))
    self.continuous_variables = [
        variable for variable, var_type in self.variable_types.items()
        if var_type == 'c'
    ]
    self.continuous_Z = list(
        set(self.continuous_variables).intersection(set(admissable_set)))
class CausalEffect(object):
    """Back-door adjustment estimator: P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z )."""

    def __init__(self,
                 X,
                 causes,
                 effects,
                 admissable_set=None,
                 variable_types=None,
                 expectation=False,
                 density=True):
        """
        We want to calculate the causal effect of X and Y through
        back-door adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z)
        for some admissable set of control variables, Z.  First we
        calculate the conditional density P(Y|X,Z), then the density P(Z).
        We find the support of Z so we can properly sum over it later.
        variable_types are a dictionary with the column name pointing to an
        element of set(['o', 'u', 'c']), for 'ordered', 'unordered
        discrete', or 'continuous'.

        Raises ValueError when variable_types is not given (type inference
        is not implemented yet).
        """
        # [] as a default argument is one shared list across every call.
        if admissable_set is None:
            admissable_set = []
        conditional_density_vars = causes + admissable_set
        self.causes = causes
        self.effects = effects
        self.admissable_set = admissable_set
        self.conditional_density_vars = conditional_density_vars
        if variable_types:
            self.variable_types = variable_types
        else:
            # Previously this path crashed later on
            # `variable_types.values()` with undefined type-string locals.
            self.variable_types = self.__infer_variable_types(X)
            if not self.variable_types:
                raise ValueError("variable_types could not be inferred; "
                                 "pass variable_types explicitly.")
        dep_type = [self.variable_types[var] for var in effects]
        indep_type = [
            self.variable_types[var] for var in conditional_density_vars
        ]
        density_types = [self.variable_types[var] for var in admissable_set]
        if 'c' not in self.variable_types.values():
            bw = 'cv_ml'
        else:
            bw = 'normal_reference'
        if admissable_set:
            self.density = KDEMultivariate(X[admissable_set],
                                           var_type=''.join(density_types),
                                           bw=bw)
        self.conditional_density = KDEMultivariateConditional(
            endog=X[effects],
            exog=X[conditional_density_vars],
            dep_type=''.join(dep_type),
            indep_type=''.join(indep_type),
            bw=bw)
        if expectation:
            self.conditional_expectation = KernelReg(
                X[effects].values, X[conditional_density_vars].values,
                ''.join(indep_type), bw='cv_ls')
        self.support = self.__get_support(X)
        self.discrete_variables = [
            variable for variable, var_type in self.variable_types.items()
            if var_type in ['o', 'u']
        ]
        self.discrete_Z = list(
            set(self.discrete_variables).intersection(set(admissable_set)))
        self.continuous_variables = [
            variable for variable, var_type in self.variable_types.items()
            if var_type == 'c'
        ]
        self.continuous_Z = list(
            set(self.continuous_variables).intersection(set(admissable_set)))

    def __infer_variable_types(self, X):
        """
        fill this in later.
        """
        pass

    def __get_support(self, X):
        """
        find the smallest cube around which the densities are supported,
        allowing a little flexibility for variables with larger bandwidths.
        """
        data_support = {
            variable: (X[variable].min(), X[variable].max())
            for variable in X.columns
        }
        variable_bandwidths = {
            variable: bw
            for variable, bw in zip(
                self.effects + self.conditional_density_vars,
                self.conditional_density.bw)
        }
        support = {}
        for variable in self.effects + self.conditional_density_vars:
            if self.variable_types[variable] == 'c':
                # Pad continuous supports by 10 bandwidths each side.
                lower_support = data_support[variable][
                    0] - 10. * variable_bandwidths[variable]
                upper_support = data_support[variable][
                    1] + 10. * variable_bandwidths[variable]
                support[variable] = (lower_support, upper_support)
            else:
                support[variable] = data_support[variable]
        return support

    def integration_function(self, *args):
        """Integrand P(Y|X,Z)P(Z); takes continuous z, discrete z, then x, then y values."""
        # takes continuous z, discrete z, then x
        data = pd.DataFrame({
            k: [v]
            for k, v in zip(
                self.continuous_Z + self.discrete_Z + self.causes +
                self.effects, args)
        })
        conditional = self.conditional_density.pdf(
            exog_predict=data[self.conditional_density_vars].values[0],
            endog_predict=data[self.effects].values[0])
        density = self.density.pdf(data_predict=data[self.admissable_set])
        return conditional * density

    def expectation_integration_function(self, *args):
        """Integrand E[Y|X,Z]P(Z); takes continuous z, discrete z, then x values."""
        data = pd.DataFrame({
            k: [v]
            for k, v in zip(self.continuous_Z + self.discrete_Z + self.causes,
                            args)
        })
        conditional = self.conditional_expectation.fit(
            data_predict=data[self.conditional_density_vars].values)[0]
        density = self.density.pdf(data_predict=data[self.admissable_set])
        return conditional * density

    def pdf(self, x):
        """
        Currently, this does the whole sum/integral over the cube support
        of Z.  We may be able to improve this by taking into account how
        the joint and conditionals factorize, and/or finding a more
        efficient support.

        This should be reasonably fast for |Z| <= 2 or 3, and small enough
        discrete variable cardinalities.  It runs in O(n_1 n_2 ... n_k) in
        the cardinality of the discrete variables, |Z_1| = n_1, etc.  It
        likewise runs in O(V^n) for n continuous Z variables.  Factorizing
        the joint/conditional distributions in the sum could linearize the
        runtime.
        """
        causal_effect = 0.
        x = x[self.causes + self.effects]
        if self.discrete_Z:
            # range() replaces the Python-2-only xrange(), a NameError on
            # Python 3.
            discrete_variable_ranges = [
                range(int(self.support[variable][0]),
                      int(self.support[variable][1]) + 1)
                for variable in self.discrete_Z
            ]
            for z_vals in itertools.product(*discrete_variable_ranges):
                z_discrete = pd.DataFrame(
                    {k: [v]
                     for k, v in zip(self.discrete_Z, z_vals)})
                if self.continuous_Z:
                    continuous_Z_ranges = [
                        self.support[variable]
                        for variable in self.continuous_Z
                    ]
                    args = z_discrete.join(x).values[0]
                    causal_effect += nquad(self.integration_function,
                                           continuous_Z_ranges,
                                           args=args)[0]
                else:
                    z_discrete = z_discrete[self.admissable_set]
                    exog_predictors = x.join(z_discrete)[
                        self.conditional_density_vars]
                    conditional = self.conditional_density.pdf(
                        exog_predict=exog_predictors,
                        endog_predict=x[self.effects])
                    density = self.density.pdf(data_predict=z_discrete)
                    dc = conditional * density
                    causal_effect += dc
            return causal_effect
        elif self.continuous_Z:
            continuous_Z_ranges = [
                self.support[var] for var in self.continuous_Z
            ]
            causal_effect, error = nquad(self.integration_function,
                                         continuous_Z_ranges,
                                         args=tuple(x.values[0]))
            return causal_effect
        else:
            # No adjustment set: P(Y|do(X)) reduces to P(Y|X).
            return self.conditional_density.pdf(exog_predict=x[self.causes],
                                                endog_predict=x[self.effects])

    def expected_value(self, x):
        """
        Currently, this does the whole sum/integral over the cube support
        of Z.  We may be able to improve this by taking into account how
        the joint and conditionals factorize, and/or finding a more
        efficient support.

        This should be reasonably fast for |Z| <= 2 or 3, and small enough
        discrete variable cardinalities.  It runs in O(n_1 n_2 ... n_k) in
        the cardinality of the discrete variables, |Z_1| = n_1, etc.  It
        likewise runs in O(V^n) for n continuous Z variables.  Factorizing
        the joint/conditional distributions in the sum could linearize the
        runtime.
        """
        causal_effect = 0.
        x = x[self.causes]
        if self.discrete_Z:
            # range() replaces the Python-2-only xrange().
            discrete_variable_ranges = [
                range(int(self.support[variable][0]),
                      int(self.support[variable][1]) + 1)
                for variable in self.discrete_Z
            ]
            for z_vals in itertools.product(*discrete_variable_ranges):
                z_discrete = pd.DataFrame(
                    {k: [v]
                     for k, v in zip(self.discrete_Z, z_vals)})
                if self.continuous_Z:
                    continuous_Z_ranges = [
                        self.support[variable]
                        for variable in self.continuous_Z
                    ]
                    args = z_discrete.join(x).values[0]
                    causal_effect += nquad(
                        self.expectation_integration_function,
                        continuous_Z_ranges,
                        args=args)[0]
                else:
                    z_discrete = z_discrete[self.admissable_set]
                    exog_predictors = x.join(z_discrete)[
                        self.conditional_density_vars]
                    causal_effect += self.conditional_expectation.fit(
                        data_predict=exog_predictors.values
                    )[0] * self.density.pdf(data_predict=z_discrete.values)
            return causal_effect
        elif self.continuous_Z:
            continuous_Z_ranges = [
                self.support[var] for var in self.continuous_Z
            ]
            causal_effect, error = nquad(self.expectation_integration_function,
                                         continuous_Z_ranges,
                                         args=tuple(x.values[0]))
            return causal_effect
        else:
            return self.conditional_expectation.fit(
                data_predict=x[self.causes])[0]
def __init__(
    self,
    X,
    causes,
    effects,
    admissable_set=None,
    variable_types=None,
    expectation=False,
    density=True,
):
    """
    We want to calculate the causal effect of X and Y through back-door
    adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z), for some admissable
    set of control variables, Z.  First we calculate the conditional
    density P(Y|X,Z), then the density P(Z).  We find the support of Z so
    we can properly sum over it later.  variable_types are a dictionary
    with the column name pointing to an element of set(['o', 'u', 'c']),
    for 'ordered', 'unordered discrete', or 'continuous'.

    Raises ValueError when variable_types is not given (type inference is
    not implemented yet).
    """
    # [] as a default argument would be one shared list across calls.
    if admissable_set is None:
        admissable_set = []
    # Convert up front: AdjustForDirectCauses.admissable_set returns a set,
    # and `causes + admissable_set` below needs a list.  (Previously the
    # conversion happened after that concatenation.)
    admissable_set = list(admissable_set)
    conditional_density_vars = causes + admissable_set
    self.causes = causes
    self.effects = effects
    self.admissable_set = admissable_set
    self.conditional_density_vars = conditional_density_vars
    # Efficient (subsampled) bandwidth estimation for larger problems.
    if (
        len(X) > 300
        or max(len(causes + admissable_set), len(effects + admissable_set)) >= 3
    ):
        self.defaults = EstimatorSettings(n_jobs=4, efficient=True)
    else:
        self.defaults = EstimatorSettings(n_jobs=-1, efficient=False)
    if variable_types:
        self.variable_types = variable_types
    else:
        # Previously this path crashed later on `variable_types.values()`
        # with undefined dep_type/indep_type locals; fail fast instead.
        self.variable_types = self.__infer_variable_types(X)
        if not self.variable_types:
            raise ValueError(
                "variable_types could not be inferred; "
                "pass variable_types explicitly."
            )
    # Kernel type strings, derived once variable_types is resolved.
    dep_type = [self.variable_types[var] for var in effects]
    indep_type = [self.variable_types[var] for var in conditional_density_vars]
    density_types = [self.variable_types[var] for var in admissable_set]
    if "c" not in self.variable_types.values():
        bw = "cv_ml"
    else:
        bw = "normal_reference"
    if admissable_set:
        self.density = KDEMultivariate(
            X[admissable_set],
            var_type="".join(density_types),
            bw=bw,
            defaults=self.defaults,
        )
    self.conditional_density = KDEMultivariateConditional(
        endog=X[effects],
        exog=X[conditional_density_vars],
        dep_type="".join(dep_type),
        indep_type="".join(indep_type),
        bw=bw,
        defaults=self.defaults,
    )
    if expectation:
        self.conditional_expectation = KernelReg(
            X[effects].values,
            X[conditional_density_vars].values,
            "".join(indep_type),
            bw="cv_ls",
        )
    self.support = self.__get_support(X)
    self.discrete_variables = [
        variable
        for variable, var_type in self.variable_types.items()
        if var_type in ["o", "u"]
    ]
    self.discrete_Z = list(
        set(self.discrete_variables).intersection(set(admissable_set))
    )
    self.continuous_variables = [
        variable
        for variable, var_type in self.variable_types.items()
        if var_type == "c"
    ]
    self.continuous_Z = list(
        set(self.continuous_variables).intersection(set(admissable_set))
    )