Example #1
    def __init__(self, X, causes, effects, admissable_set=[], variable_types=None, expectation=False, density=True):
        """
        We want to calculate the causal effect of X on Y through
        back-door adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z),
        for some admissible set of control variables, Z.  First we
        calculate the conditional density P(Y|X,Z), then the density
        P(Z).  We find the support of Z so we can properly sum over
        it later.  variable_types is a dictionary mapping each column
        name to an element of set(['o', 'u', 'c']), for 'ordered',
        'unordered discrete', or 'continuous'.
        """
        conditional_density_vars = causes + admissable_set
        self.causes = causes
        self.effects = effects
        self.admissable_set = admissable_set
        self.conditional_density_vars = conditional_density_vars

        if len(X) > 300 or max(len(causes+admissable_set),len(effects+admissable_set)) >= 3:
            self.defaults=EstimatorSettings(n_jobs=4, efficient=True)
        else:
            self.defaults=EstimatorSettings(n_jobs=-1, efficient=False)
        
        if variable_types:
            self.variable_types = variable_types
            dep_type      = [variable_types[var] for var in effects]
            indep_type    = [variable_types[var] for var in conditional_density_vars]
            density_types = [variable_types[var] for var in admissable_set]
        else:
            self.variable_types = self.__infer_variable_types(X)

        if 'c' not in variable_types.values():
            bw = 'cv_ml'
        else:
            bw = 'normal_reference'


        if admissable_set:            
            self.density = KDEMultivariate(X[admissable_set], 
                                  var_type=''.join(density_types),
                                  bw=bw,
                                  defaults=self.defaults)
        
        self.conditional_density = KDEMultivariateConditional(endog=X[effects],
                                                         exog=X[conditional_density_vars],
                                                         dep_type=''.join(dep_type),
                                                         indep_type=''.join(indep_type),
                                                         bw=bw,
                                                         defaults=self.defaults)
        if expectation:
            self.conditional_expectation = KernelReg(X[effects].values,
                                                 X[conditional_density_vars].values,
                                                 ''.join(indep_type),
                                                 bw='cv_ls')

        self.support = self.__get_support(X)
        
        self.discrete_variables = [ variable for variable, var_type in self.variable_types.items() if var_type in ['o', 'u']]
        self.discrete_Z = list(set(self.discrete_variables).intersection(set(admissable_set)))
        self.continuous_variables = [ variable for variable, var_type in self.variable_types.items() if var_type == 'c' ]
        self.continuous_Z = list(set(self.continuous_variables).intersection(set(admissable_set)))
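
The constructor above only fits the two kernel density estimators; the back-door sum itself is done elsewhere in the class. As a minimal sketch of what that sum looks like with the same statsmodels objects, here is P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z) evaluated at a single point on toy data with one binary confounder (the data, variable names, and bandwidth choice below are illustrative, not taken from the original project):

import numpy as np
import pandas as pd
from statsmodels.nonparametric.kernel_density import (
    KDEMultivariate, KDEMultivariateConditional)

# Toy data: a binary confounder z drives both the cause x and the effect y.
rng = np.random.default_rng(0)
z = rng.integers(0, 2, 500)
x = z + rng.normal(size=500)
y = 2. * z + x + rng.normal(size=500)
df = pd.DataFrame({'x': x, 'y': y, 'z': z})

# P(y | x, z): continuous effect and cause, unordered discrete confounder.
cond = KDEMultivariateConditional(endog=df[['y']], exog=df[['x', 'z']],
                                  dep_type='c', indep_type='cu',
                                  bw='normal_reference')
# P(z)
dens = KDEMultivariate(df[['z']], var_type='u', bw='normal_reference')

# Back-door adjustment at one point (x0, y0): sum over the support of z.
x0, y0 = 1.0, 2.0
p_do = sum(cond.pdf(endog_predict=[y0], exog_predict=[x0, zv]) *
           dens.pdf(data_predict=[zv])
           for zv in (0, 1))
print(float(p_do))  # density of y0 under do(x = x0)
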
Example #2
 def _fit_conditional(self):
     self.conditional_density = KDEMultivariateConditional(endog=self._data[self._outcome_names],
                                                           exog=self._data[self._treatment_names + self._target_estimand.backdoor_variables],
                                                           dep_type=''.join(self.dep_type),
                                                           indep_type=''.join(self.indep_type),
                                                           bw=self.bw,
                                                           defaults=self.defaults)
Example #3
 def __init__(self, outcome_upper_support, outcome_lower_support, outcome_names, treatment_names, backdoor_variables,
              data, dep_type, indep_type, bw, defaults):
     self._data = data
     self._outcome_names = outcome_names
     self._treatment_names = treatment_names
     self._backdoor_variables = backdoor_variables
     self.dep_type = dep_type
     self.indep_type = indep_type
     self.bw = bw
     self.defaults = defaults
     self.outcome_lower_support = outcome_lower_support
     self.outcome_upper_support = outcome_upper_support
     self.conditional_density = KDEMultivariateConditional(endog=self._data[self._outcome_names],
                                                           exog=self._data[self._treatment_names + self._backdoor_variables],
                                                           dep_type=''.join(self.dep_type),
                                                           indep_type=''.join(self.indep_type),
                                                           bw=self.bw,
                                                           defaults=self.defaults)
Example #4
class KernelSampler(object):
    def __init__(self, outcome_upper_support, outcome_lower_support,
                 outcome_names, treatment_names, backdoor_variables, data,
                 dep_type, indep_type, bw, defaults):
        self._data = data
        self._outcome_names = outcome_names
        self._treatment_names = treatment_names
        self._backdoor_variables = backdoor_variables
        self.dep_type = dep_type
        self.indep_type = indep_type
        self.bw = bw
        self.defaults = defaults
        self.outcome_lower_support = outcome_lower_support
        self.outcome_upper_support = outcome_upper_support
        self.conditional_density = KDEMultivariateConditional(
            endog=self._data[self._outcome_names],
            exog=self._data[self._treatment_names + self._backdoor_variables],
            dep_type=''.join(self.dep_type),
            indep_type=''.join(self.indep_type),
            bw=self.bw,
            defaults=self.defaults)

    def sample_point(self, x_z):
        y_bw = 1.06 * self._data[self._outcome_names].std() * (
            self._data[self._outcome_names].count())**(-1. / 5.)
        n = 5 * np.ceil(
            (self.outcome_upper_support - self.outcome_lower_support) / y_bw)
        cum_ranges = [
            np.linspace(self.outcome_lower_support[i],
                        self.outcome_upper_support[i], int(n[i]))
            for i in range(len(self._outcome_names))
        ]

        res = np.meshgrid(*cum_ranges)
        points = np.array(res).reshape(len(self._outcome_names),
                                       int(n.cumprod()[-1])).T

        x_z_repeated = np.repeat(x_z,
                                 len(points)).reshape(len(points), len(x_z))
        cdf_vals = self._evaluate_cdf(points, x_z_repeated)
        cdf_vals = np.hstack([[0.], cdf_vals, [1.]])
        points = np.vstack([[self.outcome_lower_support - 3. * y_bw], points,
                            [self.outcome_upper_support + 3. * y_bw]])
        inv_cdf = interp1d(cdf_vals.flatten(),
                           points.flatten(),
                           fill_value=0.,
                           axis=0)
        r = np.random.rand()
        try:
            return inv_cdf(r)
        except ValueError:
            return self.sample_point(x_z)

    def _evaluate_cdf(self, y, x_z):
        return self.conditional_density.cdf(endog_predict=[y],
                                            exog_predict=x_z)
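
The sampler above draws from the fitted conditional by tabulating its CDF on a grid of candidate outcomes and inverting that table with interp1d. The same inverse-CDF trick in isolation, on a known one-dimensional distribution (a stand-alone sketch; norm.cdf stands in for conditional_density.cdf):

import numpy as np
from scipy.interpolate import interp1d
from scipy.stats import norm

# Tabulate the CDF on a grid, pad it to cover [0, 1] exactly (as the
# sampler above does with hstack/vstack), invert by interpolation, and
# push uniform draws through the inverse.
grid = np.linspace(-4., 4., 200)
cdf_vals = np.hstack([[0.], norm.cdf(grid), [1.]])
points = np.hstack([[grid[0] - 1.], grid, [grid[-1] + 1.]])
inv_cdf = interp1d(cdf_vals, points)
samples = inv_cdf(np.random.rand(1000))  # approximately N(0, 1) draws
print(samples.mean(), samples.std())
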
Example #5
def main(args):
    # Notebook experiment settings
    experiment_name = args.experiment_name
    experiment_results_folder = args.results_folder
    results_path = os.path.join("../", experiment_results_folder)
    data_folder = args.data_folder
    data_file = args.data_file
    extra_data_file = args.extra_data_file

    file_name_test = f"{experiment_name}_test.pickle"
    file_path_test = os.path.join(results_path, file_name_test)

    print(f"Saving: {file_name_test}")
    with open(file_path_test, 'wb') as f:
        pickle.dump("test", f)

    # Data settings
    obs_cols = args.obs_cols

    # Load data
    csv_path = os.path.join(data_folder, data_file)
    donkey_df = pd.read_csv(csv_path, parse_dates=[4, 11])

    csv_path = os.path.join(data_folder, extra_data_file)
    extra_df = pd.read_csv(csv_path, parse_dates=[4, 12])

    # Data prep
    train_idx, test_idx = get_split_idx_on_day(donkey_df)

    # Create full data
    test_data = donkey_df.loc[test_idx, obs_cols]
    train_data = pd.concat((donkey_df.loc[train_idx, obs_cols], extra_df.loc[:, obs_cols]))

    # Normalize data
    obs_scaler = StandardScaler().fit(train_data)
    scaled_train_data = obs_scaler.transform(train_data)

    # Create conditional variable
    hours = pd.concat((donkey_df.loc[train_idx, :], extra_df.loc[:, :])).merge_date.dt.hour.values
    hours = np.expand_dims(hours, 1)

    scaled_test_data = obs_scaler.transform(test_data)

    statsmods = KDEMultivariateConditional(endog=scaled_train_data, exog=hours, indep_type='o', dep_type='cc',
                                           bw='cv_ml')

    results_dict = {'model': statsmods, }

    file_name = f"{experiment_name}.pickle"
    file_path = os.path.join(results_path, file_name)

    print(f"Saving: {file_name}")
    with open(file_path, 'wb') as f:
        pickle.dump(results_dict, f)
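
A hedged follow-up to the script above: once statsmods is fitted, the held-out split can be scored under the model. The snippet is written as if it sat inside main() right after the fit (test_hours is a name introduced here; it mirrors how the training hours are built):

    # Sketch, continuing inside main() after the fit above.
    test_hours = np.expand_dims(
        donkey_df.loc[test_idx, :].merge_date.dt.hour.values, 1)
    test_log_density = np.log(
        statsmods.pdf(endog_predict=scaled_test_data, exog_predict=test_hours))
    print(f"Mean held-out log-density: {test_log_density.mean():.3f}")
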
Example #6
File: sampler.py  Project: q1park/spacetime
 def _compute_conditional_kde(self, dep, inds, normref=True):
     endog = self.node_data.info[dep]['data']
     exog = [self.node_data.info[node]['data'] for node in inds]
     t = time.time()
     if normref:
         kde = KDEMultivariateConditional(endog=endog,
                                          exog=exog,
                                          dep_type='c',
                                          indep_type='c' * len(exog),
                                          bw='normal_reference')
     else:
         kde = KDEMultivariateConditional(
             endog=endog,
             exog=exog,
             dep_type='c',
             indep_type='c' * len(exog),
             bw='cv_ml',
             defaults=EstimatorSettings(efficient=True))
     print("Fit conditional KDE for %s wrt %s in %s seconds" %
           (dep, inds, time.time() - t))
     self.kdes_conditional[dep][inds] = kde
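
The normref flag above trades accuracy for speed: 'normal_reference' is a closed-form rule of thumb, while 'cv_ml' cross-validates the bandwidth and is only practical on larger data with EstimatorSettings(efficient=True) subsampling. A rough stand-alone comparison on toy data (sizes and variables are illustrative):

import time
import numpy as np
from statsmodels.nonparametric.kernel_density import (
    KDEMultivariateConditional, EstimatorSettings)

rng = np.random.default_rng(1)
x = rng.normal(size=2000)
y = x + 0.5 * rng.normal(size=2000)

# Fit the same conditional KDE with both bandwidth strategies and time them.
for bw, settings in [('normal_reference', EstimatorSettings()),
                     ('cv_ml', EstimatorSettings(efficient=True))]:
    t = time.time()
    KDEMultivariateConditional(endog=y, exog=x, dep_type='c',
                               indep_type='c', bw=bw, defaults=settings)
    print(bw, round(time.time() - t, 2), 'seconds')
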
Example #7
def continuous_treatment_model(data, covariates, treatment, variable_types):
    data, covariates = binarize_discrete(data, covariates, variable_types)
    if len(data) > 300 or len(treatment + covariates) >= 3:
        defaults = EstimatorSettings(n_jobs=4, efficient=True)
    else:
        defaults = EstimatorSettings(n_jobs=-1, efficient=False)

    if 'c' not in variable_types.values():
        bw = 'cv_ml'
    else:
        bw = 'normal_reference'

    indep_type = get_type_string(covariates, variable_types)
    dep_type = get_type_string([treatment], variable_types)

    model = KDEMultivariateConditional(endog=data[treatment],
                                       exog=data[covariates],
                                       dep_type=''.join(dep_type),
                                       indep_type=''.join(indep_type),
                                       bw=bw,
                                       defaults=defaults)
    scores = model.pdf(endog_predict=data[treatment], exog_predict=data[covariates])
    return scores
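
The scores returned above are the estimated conditional density of the observed treatment given covariates, i.e. a generalized propensity score. One common way such scores are used, sketched with illustrative names ('y' is a hypothetical outcome column in data; scores comes from the call above):

import numpy as np

# Inverse-density weighting with the generalized propensity scores.
weights = 1.0 / np.clip(scores, 1e-3, None)   # clip to keep weights bounded
weights = weights / weights.mean()            # stabilize the weights
ipw_outcome_mean = np.average(data['y'], weights=weights)
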
Example #8
    def estimate_cond_pdf(self, x, z, X):
        # normal_reference works better with mixed types
        if 'c' not in [self.variable_types[xi] for xi in x+z]:
            bw = 'cv_ml'
        else:
            bw = 'cv_ls'#'normal_reference'

        # if conditioning on the empty set, return a pdf instead of cond pdf
        if len(z) == 0:
            return KDEMultivariate(X[x],
                                  var_type=''.join([self.variable_types[xi] for xi in x]),
                                  bw=bw,
                                  defaults=self.defaults)
        else:
            return KDEMultivariateConditional(endog=X[x],
                                              exog=X[z],
                                              dep_type=''.join([self.variable_types[xi] for xi in x]),
                                              indep_type=''.join([self.variable_types[zi] for zi in z]),
                                              bw=bw,
                                              defaults=self.defaults)
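
Note that the two possible return types expose different evaluation methods: KDEMultivariate takes data_predict while KDEMultivariateConditional takes endog_predict/exog_predict, so a caller has to branch. A small sketch with illustrative names (dist is whatever estimate_cond_pdf returned, x_vals and z_vals are the points to score):

from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional

# dist = self.estimate_cond_pdf(x, z, X)
if isinstance(dist, KDEMultivariateConditional):
    p = dist.pdf(endog_predict=x_vals, exog_predict=z_vals)
else:
    p = dist.pdf(data_predict=x_vals)
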
Example #9
class CausalEffect(object):
    def __init__(self, X, causes, effects, admissable_set=[], variable_types=None, expectation=False, density=True):
        """
        We want to calculate the causal effect of X on Y through
        back-door adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z),
        for some admissible set of control variables, Z.  First we
        calculate the conditional density P(Y|X,Z), then the density
        P(Z).  We find the support of Z so we can properly sum over
        it later.  variable_types is a dictionary mapping each column
        name to an element of set(['o', 'u', 'c']), for 'ordered',
        'unordered discrete', or 'continuous'.
        """
        conditional_density_vars = causes + admissable_set
        self.causes = causes
        self.effects = effects
        self.admissable_set = admissable_set
        self.conditional_density_vars = conditional_density_vars
        
        if variable_types:
            self.variable_types = variable_types
            dep_type      = [variable_types[var] for var in effects]
            indep_type    = [variable_types[var] for var in conditional_density_vars]
            density_types = [variable_types[var] for var in admissable_set]
        else:
            self.variable_types = self.__infer_variable_types(X)

        if 'c' not in variable_types.values():
            bw = 'cv_ml'
        else:
            bw = 'normal_reference'


        if admissable_set:            
            self.density = KDEMultivariate(X[admissable_set], 
                                  var_type=''.join(density_types),
                                  bw=bw)
        
        self.conditional_density = KDEMultivariateConditional(endog=X[effects],
                                                         exog=X[conditional_density_vars],
                                                         dep_type=''.join(dep_type),
                                                         indep_type=''.join(indep_type),
                                                         bw=bw)
        if expectation:
            self.conditional_expectation = KernelReg(X[effects].values,
                                                 X[conditional_density_vars].values,
                                                 ''.join(indep_type),
                                                 bw='cv_ls')

        self.support = self.__get_support(X)
        
        self.discrete_variables = [ variable for variable, var_type in self.variable_types.items() if var_type in ['o', 'u']]
        self.discrete_Z = list(set(self.discrete_variables).intersection(set(admissable_set)))
        self.continuous_variables = [ variable for variable, var_type in self.variable_types.items() if var_type == 'c' ]
        self.continuous_Z = list(set(self.continuous_variables).intersection(set(admissable_set)))
       
 
    def __infer_variable_types(self,X):
        """
        fill this in later.
        """
        pass
       
 
    def __get_support(self, X):
        """
        find the smallest cube around which the densities are supported,
        allowing a little flexibility for variables with larger bandwidths.
        """
        data_support = { variable : (X[variable].min(), X[variable].max()) for variable in X.columns}
        variable_bandwidths = { variable : bw for variable, bw in zip(self.effects + self.conditional_density_vars, self.conditional_density.bw)}
        support = {}
        for variable in self.effects + self.conditional_density_vars:
            if self.variable_types[variable] == 'c':
                lower_support = data_support[variable][0] - 10. * variable_bandwidths[variable]
                upper_support = data_support[variable][1] + 10. * variable_bandwidths[variable]
                support[variable] = (lower_support, upper_support)
            else:
                support[variable] = data_support[variable]
        return support

        
    def integration_function(self,*args):
        # takes continuous z, discrete z, then x
        data = pd.DataFrame({ k : [v] for k, v in zip(self.continuous_Z + self.discrete_Z + self.causes + self.effects, args)})
        conditional = self.conditional_density.pdf(exog_predict=data[self.conditional_density_vars].values[0], 
                                                   endog_predict=data[self.effects].values[0]) 
        density = self.density.pdf(data_predict=data[self.admissable_set])
        return conditional * density

    
    def expectation_integration_function(self, *args):
        data = pd.DataFrame({ k : [v] for k, v in zip(self.continuous_Z + self.discrete_Z + self.causes, args)})
        conditional = self.conditional_expectation.fit(data_predict=data[self.conditional_density_vars].values)[0]
        density = self.density.pdf(data_predict=data[self.admissable_set])
        return conditional * density

    
    def pdf(self, x):
        """
        Currently, this does the whole sum/integral over the cube support of Z.
        We may be able to improve this by taking into account how the joint
        and conditionals factorize, and/or finding a more efficient support.
        
        This should be reasonably fast for |Z| <= 2 or 3, and small enough discrete
        variable cardinalities.  It runs in O(n_1 n_2 ... n_k) in the cardinality of
        the discrete variables, |Z_1| = n_1, etc.  It likewise runs in O(V^n) for n
        continuous Z variables.  Factorizing the joint/conditional distributions in
        the sum could linearize the runtime.
        """
        causal_effect = 0.
        x = x[self.causes + self.effects]
        if self.discrete_Z:
            discrete_variable_ranges = [ range(*(int(self.support[variable][0]), int(self.support[variable][1])+1)) for variable in self.discrete_Z]
            for z_vals in itertools.product(*discrete_variable_ranges):
                z_discrete = pd.DataFrame({k : [v] for k, v in zip(self.discrete_Z, z_vals)})
                if self.continuous_Z:
                    continuous_Z_ranges = [self.support[variable] for variable in self.continuous_Z]
                    args = z_discrete.join(x).values[0]
                    causal_effect += nquad(self.integration_function,continuous_Z_ranges,args=args)[0]
                else:
                    z_discrete = z_discrete[self.admissable_set]
                    exog_predictors = x.join(z_discrete)[self.conditional_density_vars]
                    conditional = self.conditional_density.pdf(exog_predict=exog_predictors, 
                                                               endog_predict=x[self.effects]) 
                    density = self.density.pdf(data_predict=z_discrete)
                    dc = conditional * density
                    causal_effect += dc
            return causal_effect
        elif self.continuous_Z:
            continuous_Z_ranges = [self.support[var] for var in self.continuous_Z]
            causal_effect, error = nquad(self.integration_function,continuous_Z_ranges,args=tuple(x.values[0]))
            return causal_effect
        else:
            return self.conditional_density.pdf(exog_predict=x[self.causes],endog_predict=x[self.effects])

       
 
    def expected_value( self, x):
        """
        Currently, this does the whole sum/integral over the cube support of Z.
        We may be able to improve this by taking into account how the joint
        and conditionals factorize, and/or finding a more efficient support.
        
        This should be reasonably fast for |Z| <= 2 or 3, and small enough discrete
        variable cardinalities.  It runs in O(n_1 n_2 ... n_k) in the cardinality of
        the discrete variables, |Z_1| = n_1, etc.  It likewise runs in O(V^n) for n
        continuous Z variables.  Factorizing the joint/conditional distributions in
        the sum could linearize the runtime.
        """
        causal_effect = 0.
        x = x[self.causes]
        if self.discrete_Z:
            discrete_variable_ranges = [ range(*(int(self.support[variable][0]), int(self.support[variable][1])+1)) for variable in self.discrete_Z]
            for z_vals in itertools.product(*discrete_variable_ranges):
                z_discrete = pd.DataFrame({k : [v] for k, v in zip(self.discrete_Z, z_vals)})
                if self.continuous_Z:
                    continuous_Z_ranges = [self.support[variable] for variable in self.continuous_Z]
                    args = z_discrete.join(x).values[0]
                    causal_effect += nquad(self.expectation_integration_function,continuous_Z_ranges,args=args)[0]
                else:
                    z_discrete = z_discrete[self.admissable_set]
                    exog_predictors = x.join(z_discrete)[self.conditional_density_vars]
                    causal_effect += self.conditional_expectation.fit(data_predict=exog_predictors.values)[0] * self.density.pdf(data_predict=z_discrete.values)
            return causal_effect
        elif self.continuous_Z:
            continuous_Z_ranges = [self.support[var] for var in self.continuous_Z]
            causal_effect, error = nquad(self.expectation_integration_function,continuous_Z_ranges,args=tuple(x.values[0]))
            return causal_effect
        else:
            return self.conditional_expectation.fit(data_predict=x[self.causes])[0]
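
A hedged end-to-end sketch of driving the class above: fit it on a DataFrame, then query pdf and expected_value at a point supplied as a one-row DataFrame. The toy data and variable names are illustrative, and the snippet assumes the module-level imports the class relies on (pandas, itertools, scipy's nquad, and the statsmodels kernel classes) are in place:

import numpy as np
import pandas as pd

# Toy data with a single continuous confounder z.
rng = np.random.default_rng(0)
n = 300
z = rng.normal(size=n)
x = z + rng.normal(size=n)
y = x + 2. * z + rng.normal(size=n)
X = pd.DataFrame({'x': x, 'y': y, 'z': z})

effect = CausalEffect(X, causes=['x'], effects=['y'],
                      admissable_set=['z'],
                      variable_types={'x': 'c', 'y': 'c', 'z': 'c'},
                      expectation=True)

point = pd.DataFrame({'x': [1.0], 'y': [1.0]})
print(effect.pdf(point))             # estimate of P(y = 1 | do(x = 1))
print(effect.expected_value(point))  # estimate of E[y | do(x = 1)]
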
Example #10
    def __init__(self,
                 X,
                 causes,
                 effects,
                 admissable_set=[],
                 variable_types=None,
                 expectation=False,
                 density=True):
        """
        We want to calculate the causal effect of X on Y through
        back-door adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z),
        for some admissible set of control variables, Z.  First we
        calculate the conditional density P(Y|X,Z), then the density
        P(Z).  We find the support of Z so we can properly sum over
        it later.  variable_types is a dictionary mapping each column
        name to an element of set(['o', 'u', 'c']), for 'ordered',
        'unordered discrete', or 'continuous'.
        """
        conditional_density_vars = causes + admissable_set
        self.causes = causes
        self.effects = effects
        self.admissable_set = admissable_set
        self.conditional_density_vars = conditional_density_vars

        if variable_types:
            self.variable_types = variable_types
            dep_type = [variable_types[var] for var in effects]
            indep_type = [
                variable_types[var] for var in conditional_density_vars
            ]
            density_types = [variable_types[var] for var in admissable_set]
        else:
            self.variable_types = self.__infer_variable_types(X)

        if 'c' not in variable_types.values():
            bw = 'cv_ml'
        else:
            bw = 'normal_reference'

        if admissable_set:
            self.density = KDEMultivariate(X[admissable_set],
                                           var_type=''.join(density_types),
                                           bw=bw)

        self.conditional_density = KDEMultivariateConditional(
            endog=X[effects],
            exog=X[conditional_density_vars],
            dep_type=''.join(dep_type),
            indep_type=''.join(indep_type),
            bw=bw)
        if expectation:
            self.conditional_expectation = KernelReg(
                X[effects].values,
                X[conditional_density_vars].values,
                ''.join(indep_type),
                bw='cv_ls')

        self.support = self.__get_support(X)

        self.discrete_variables = [
            variable for variable, var_type in self.variable_types.items()
            if var_type in ['o', 'u']
        ]
        self.discrete_Z = list(
            set(self.discrete_variables).intersection(set(admissable_set)))
        self.continuous_variables = [
            variable for variable, var_type in self.variable_types.items()
            if var_type == 'c'
        ]
        self.continuous_Z = list(
            set(self.continuous_variables).intersection(set(admissable_set)))
Example #11
class CausalEffect(object):
    def __init__(self,
                 X,
                 causes,
                 effects,
                 admissable_set=[],
                 variable_types=None,
                 expectation=False,
                 density=True):
        """
        We want to calculate the causal effect of X on Y through
        back-door adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z),
        for some admissible set of control variables, Z.  First we
        calculate the conditional density P(Y|X,Z), then the density
        P(Z).  We find the support of Z so we can properly sum over
        it later.  variable_types is a dictionary mapping each column
        name to an element of set(['o', 'u', 'c']), for 'ordered',
        'unordered discrete', or 'continuous'.
        """
        conditional_density_vars = causes + admissable_set
        self.causes = causes
        self.effects = effects
        self.admissable_set = admissable_set
        self.conditional_density_vars = conditional_density_vars

        if variable_types:
            self.variable_types = variable_types
            dep_type = [variable_types[var] for var in effects]
            indep_type = [
                variable_types[var] for var in conditional_density_vars
            ]
            density_types = [variable_types[var] for var in admissable_set]
        else:
            self.variable_types = self.__infer_variable_types(X)

        if 'c' not in variable_types.values():
            bw = 'cv_ml'
        else:
            bw = 'normal_reference'

        if admissable_set:
            self.density = KDEMultivariate(X[admissable_set],
                                           var_type=''.join(density_types),
                                           bw=bw)

        self.conditional_density = KDEMultivariateConditional(
            endog=X[effects],
            exog=X[conditional_density_vars],
            dep_type=''.join(dep_type),
            indep_type=''.join(indep_type),
            bw=bw)
        if expectation:
            self.conditional_expectation = KernelReg(
                X[effects].values,
                X[conditional_density_vars].values,
                ''.join(indep_type),
                bw='cv_ls')

        self.support = self.__get_support(X)

        self.discrete_variables = [
            variable for variable, var_type in self.variable_types.items()
            if var_type in ['o', 'u']
        ]
        self.discrete_Z = list(
            set(self.discrete_variables).intersection(set(admissable_set)))
        self.continuous_variables = [
            variable for variable, var_type in self.variable_types.items()
            if var_type == 'c'
        ]
        self.continuous_Z = list(
            set(self.continuous_variables).intersection(set(admissable_set)))

    def __infer_variable_types(self, X):
        """
        fill this in later.
        """
        pass

    def __get_support(self, X):
        """
        find the smallest cube around which the densities are supported,
        allowing a little flexibility for variables with larger bandwidths.
        """
        data_support = {
            variable: (X[variable].min(), X[variable].max())
            for variable in X.columns
        }
        variable_bandwidths = {
            variable: bw
            for variable, bw in zip(
                self.effects +
                self.conditional_density_vars, self.conditional_density.bw)
        }
        support = {}
        for variable in self.effects + self.conditional_density_vars:
            if self.variable_types[variable] == 'c':
                lower_support = data_support[variable][
                    0] - 10. * variable_bandwidths[variable]
                upper_support = data_support[variable][
                    1] + 10. * variable_bandwidths[variable]
                support[variable] = (lower_support, upper_support)
            else:
                support[variable] = data_support[variable]
        return support

    def integration_function(self, *args):
        # takes continuous z, discrete z, then x
        data = pd.DataFrame({
            k: [v]
            for k, v in zip(
                self.continuous_Z + self.discrete_Z + self.causes +
                self.effects, args)
        })
        conditional = self.conditional_density.pdf(
            exog_predict=data[self.conditional_density_vars].values[0],
            endog_predict=data[self.effects].values[0])
        density = self.density.pdf(data_predict=data[self.admissable_set])
        return conditional * density

    def expectation_integration_function(self, *args):
        data = pd.DataFrame({
            k: [v]
            for k, v in zip(self.continuous_Z + self.discrete_Z +
                            self.causes, args)
        })
        conditional = self.conditional_expectation.fit(
            data_predict=data[self.conditional_density_vars].values)[0]
        density = self.density.pdf(data_predict=data[self.admissable_set])
        return conditional * density

    def pdf(self, x):
        """
        Currently, this does the whole sum/integral over the cube support of Z.
        We may be able to improve this by taking into account how the joint
        and conditionals factorize, and/or finding a more efficient support.
        
        This should be reasonably fast for |Z| <= 2 or 3, and small enough discrete
        variable cardinalities.  It runs in O(n_1 n_2 ... n_k) in the cardinality of
        the discrete variables, |Z_1| = n_1, etc.  It likewise runs in O(V^n) for n
        continuous Z variables.  Factorizing the joint/conditional distributions in
        the sum could linearize the runtime.
        """
        causal_effect = 0.
        x = x[self.causes + self.effects]
        if self.discrete_Z:
            discrete_variable_ranges = [
                range(*(int(self.support[variable][0]),
                        int(self.support[variable][1]) + 1))
                for variable in self.discrete_Z
            ]
            for z_vals in itertools.product(*discrete_variable_ranges):
                z_discrete = pd.DataFrame(
                    {k: [v]
                     for k, v in zip(self.discrete_Z, z_vals)})
                if self.continuous_Z:
                    continuous_Z_ranges = [
                        self.support[variable]
                        for variable in self.continuous_Z
                    ]
                    args = z_discrete.join(x).values[0]
                    causal_effect += nquad(self.integration_function,
                                           continuous_Z_ranges,
                                           args=args)[0]
                else:
                    z_discrete = z_discrete[self.admissable_set]
                    exog_predictors = x.join(z_discrete)[
                        self.conditional_density_vars]
                    conditional = self.conditional_density.pdf(
                        exog_predict=exog_predictors,
                        endog_predict=x[self.effects])
                    density = self.density.pdf(data_predict=z_discrete)
                    dc = conditional * density
                    causal_effect += dc
            return causal_effect
        elif self.continuous_Z:
            continuous_Z_ranges = [
                self.support[var] for var in self.continuous_Z
            ]
            causal_effect, error = nquad(self.integration_function,
                                         continuous_Z_ranges,
                                         args=tuple(x.values[0]))
            return causal_effect
        else:
            return self.conditional_density.pdf(exog_predict=x[self.causes],
                                                endog_predict=x[self.effects])

    def expected_value(self, x):
        """
        Currently, this does the whole sum/integral over the cube support of Z.
        We may be able to improve this by taking into account how the joint
        and conditionals factorize, and/or finding a more efficient support.
        
        This should be reasonably fast for |Z| <= 2 or 3, and small enough discrete
        variable cardinalities.  It runs in O(n_1 n_2 ... n_k) in the cardinality of
        the discrete variables, |Z_1| = n_1, etc.  It likewise runs in O(V^n) for n
        continuous Z variables.  Factorizing the joint/conditional distributions in
        the sum could linearize the runtime.
        """
        causal_effect = 0.
        x = x[self.causes]
        if self.discrete_Z:
            discrete_variable_ranges = [
                range(*(int(self.support[variable][0]),
                        int(self.support[variable][1]) + 1))
                for variable in self.discrete_Z
            ]
            for z_vals in itertools.product(*discrete_variable_ranges):
                z_discrete = pd.DataFrame(
                    {k: [v]
                     for k, v in zip(self.discrete_Z, z_vals)})
                if self.continuous_Z:
                    continuous_Z_ranges = [
                        self.support[variable]
                        for variable in self.continuous_Z
                    ]
                    args = z_discrete.join(x).values[0]
                    causal_effect += nquad(
                        self.expectation_integration_function,
                        continuous_Z_ranges,
                        args=args)[0]
                else:
                    z_discrete = z_discrete[self.admissable_set]
                    exog_predictors = x.join(z_discrete)[
                        self.conditional_density_vars]
                    causal_effect += self.conditional_expectation.fit(
                        data_predict=exog_predictors.values
                    )[0] * self.density.pdf(data_predict=z_discrete.values)
            return causal_effect
        elif self.continuous_Z:
            continuous_Z_ranges = [
                self.support[var] for var in self.continuous_Z
            ]
            causal_effect, error = nquad(self.expectation_integration_function,
                                         continuous_Z_ranges,
                                         args=tuple(x.values[0]))
            return causal_effect
        else:
            return self.conditional_expectation.fit(
                data_predict=x[self.causes])[0]
Example #12
    def __init__(
        self,
        X,
        causes,
        effects,
        admissable_set=[],
        variable_types=None,
        expectation=False,
        density=True,
    ):
        """
        We want to calculate the causal effect of X on Y through
        back-door adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z),
        for some admissible set of control variables, Z.  First we
        calculate the conditional density P(Y|X,Z), then the density
        P(Z).  We find the support of Z so we can properly sum over
        it later.  variable_types is a dictionary mapping each column
        name to an element of set(['o', 'u', 'c']), for 'ordered',
        'unordered discrete', or 'continuous'.
        """
        conditional_density_vars = causes + admissable_set
        self.causes = causes
        self.effects = effects
        self.admissable_set = list(
            admissable_set
        )  # uses a list internally; AdjustForDirectCauses.admissable_set returns a set
        self.conditional_density_vars = conditional_density_vars

        if (
            len(X) > 300
            or max(len(causes + admissable_set), len(effects + admissable_set)) >= 3
        ):
            self.defaults = EstimatorSettings(n_jobs=4, efficient=True)
        else:
            self.defaults = EstimatorSettings(n_jobs=-1, efficient=False)

        if variable_types:
            self.variable_types = variable_types
            dep_type = [variable_types[var] for var in effects]
            indep_type = [variable_types[var] for var in conditional_density_vars]
            density_types = [variable_types[var] for var in admissable_set]
        else:
            self.variable_types = self.__infer_variable_types(X)

        if "c" not in variable_types.values():
            bw = "cv_ml"
        else:
            bw = "normal_reference"

        if admissable_set:
            self.density = KDEMultivariate(
                X[admissable_set],
                var_type="".join(density_types),
                bw=bw,
                defaults=self.defaults,
            )

        self.conditional_density = KDEMultivariateConditional(
            endog=X[effects],
            exog=X[conditional_density_vars],
            dep_type="".join(dep_type),
            indep_type="".join(indep_type),
            bw=bw,
            defaults=self.defaults,
        )
        if expectation:
            self.conditional_expectation = KernelReg(
                X[effects].values,
                X[conditional_density_vars].values,
                "".join(indep_type),
                bw="cv_ls",
            )

        self.support = self.__get_support(X)

        self.discrete_variables = [
            variable
            for variable, var_type in self.variable_types.items()
            if var_type in ["o", "u"]
        ]
        self.discrete_Z = list(
            set(self.discrete_variables).intersection(set(admissable_set))
        )
        self.continuous_variables = [
            variable
            for variable, var_type in self.variable_types.items()
            if var_type == "c"
        ]
        self.continuous_Z = list(
            set(self.continuous_variables).intersection(set(admissable_set))
        )