예제 #1
0
 def _fit_conditional(self):
     """Fit the conditional density estimator and keep it on the instance.

     The dependent data are the ``self._outcome_names`` columns of
     ``self._data``; the conditioning data are the treatment columns
     followed by ``self._target_estimand.backdoor_variables``.  The
     resulting estimator is stored in ``self.conditional_density``.
     """
     outcome_frame = self._data[self._outcome_names]
     # Treatment columns come first, then the backdoor variables, so
     # ``indep_type`` is expected to list its type codes in that order.
     conditioning_columns = self._treatment_names + self._target_estimand.backdoor_variables
     conditioning_frame = self._data[conditioning_columns]
     self.conditional_density = KDEMultivariateConditional(
         endog=outcome_frame,
         exog=conditioning_frame,
         dep_type=''.join(self.dep_type),
         indep_type=''.join(self.indep_type),
         bw=self.bw,
         defaults=self.defaults,
     )
예제 #2
0
def main(args):
    """Fit a conditional KDE on scaled training observations and pickle it.

    Reads two CSV files from ``args.data_folder``, builds a training set
    from the train split of the first file plus all rows of the second,
    standardizes the ``args.obs_cols`` columns, fits a
    ``KDEMultivariateConditional`` conditioned on the hour of
    ``merge_date`` and writes ``{'model': <estimator>}`` to
    ``../<args.results_folder>/<args.experiment_name>.pickle``.

    Args:
        args: object exposing ``experiment_name``, ``results_folder``,
            ``data_folder``, ``data_file``, ``extra_data_file`` and
            ``obs_cols`` attributes.
    """
    # --- Experiment settings -------------------------------------------
    experiment_name = args.experiment_name
    results_path = os.path.join("../", args.results_folder)
    data_folder = args.data_folder
    data_file = args.data_file
    extra_data_file = args.extra_data_file

    # Write a small placeholder pickle before doing any real work.
    # NOTE(review): presumably a fail-fast check that the results folder
    # is writable before the expensive bandwidth search -- confirm.
    file_name_test = f"{experiment_name}_test.pickle"
    print(f"Saving: {file_name_test}")
    with open(os.path.join(results_path, file_name_test), 'wb') as probe_handle:
        pickle.dump("test", probe_handle)

    # --- Data settings -------------------------------------------------
    obs_cols = args.obs_cols

    # --- Load data (date columns are selected by position) -------------
    donkey_df = pd.read_csv(os.path.join(data_folder, data_file),
                            parse_dates=[4, 11])
    extra_df = pd.read_csv(os.path.join(data_folder, extra_data_file),
                           parse_dates=[4, 12])

    # --- Train / test split --------------------------------------------
    train_idx, test_idx = get_split_idx_on_day(donkey_df)
    test_data = donkey_df.loc[test_idx, obs_cols]
    train_data = pd.concat((donkey_df.loc[train_idx, obs_cols],
                            extra_df.loc[:, obs_cols]))

    # --- Standardize using statistics of the training data only --------
    obs_scaler = StandardScaler().fit(train_data)
    scaled_train_data = obs_scaler.transform(train_data)

    # --- Conditioning variable: hour of day, shaped as a column --------
    # Rows are stacked in the same order as ``train_data`` above.
    train_rows = pd.concat((donkey_df.loc[train_idx], extra_df))
    hours = np.expand_dims(train_rows.merge_date.dt.hour.values, 1)

    # Computed for parity with the training data; not used further below.
    scaled_test_data = obs_scaler.transform(test_data)

    # 'o' marks the hour as an ordered discrete regressor; 'cc' declares
    # two continuous dependent variables (so ``obs_cols`` is presumably
    # two columns wide -- confirm).  Bandwidths come from 'cv_ml'.
    statsmods = KDEMultivariateConditional(endog=scaled_train_data,
                                           exog=hours,
                                           indep_type='o',
                                           dep_type='cc',
                                           bw='cv_ml')

    # --- Persist the fitted model --------------------------------------
    file_name = f"{experiment_name}.pickle"
    print(f"Saving: {file_name}")
    with open(os.path.join(results_path, file_name), 'wb') as out_handle:
        pickle.dump({'model': statsmods, }, out_handle)
예제 #3
0
파일: sampler.py 프로젝트: q1park/spacetime
 def _compute_conditional_kde(self, dep, inds, normref=True):
     """Fit a conditional KDE of node ``dep`` given nodes ``inds`` and cache it.

     Every variable is declared continuous.  The fitted estimator is
     stored at ``self.kdes_conditional[dep][inds]`` and the elapsed
     fitting time is printed.

     Args:
         dep: key of the dependent node in ``self.node_data.info``.
         inds: iterable of keys of the conditioning nodes; it is also used
             as the cache key, so it must be hashable.
         normref: when true use the ``'normal_reference'`` bandwidth rule;
             otherwise use ``'cv_ml'`` with ``EstimatorSettings(efficient=True)``.
     """
     endog = self.node_data.info[dep]['data']
     exog = [self.node_data.info[node]['data'] for node in inds]

     started = time.time()
     # Arguments shared by both bandwidth strategies.
     kde_kwargs = dict(endog=endog,
                       exog=exog,
                       dep_type='c',
                       indep_type='c' * len(exog))
     if normref:
         kde_kwargs['bw'] = 'normal_reference'
     else:
         kde_kwargs['bw'] = 'cv_ml'
         kde_kwargs['defaults'] = EstimatorSettings(efficient=True)
     kde = KDEMultivariateConditional(**kde_kwargs)
     elapsed = time.time() - started

     print("Fit conditional KDE for %s wrt %s in %s seconds" %
           (dep, inds, elapsed))
     self.kdes_conditional[dep][inds] = kde
예제 #4
0
 def __init__(self, outcome_upper_support, outcome_lower_support, outcome_names, treatment_names, backdoor_variables,
              data, dep_type, indep_type, bw, defaults):
     """Record the configuration and fit the conditional density estimator.

     Args:
         outcome_upper_support: stored as ``self.outcome_upper_support``.
         outcome_lower_support: stored as ``self.outcome_lower_support``.
         outcome_names: columns of ``data`` used as the dependent variables.
         treatment_names: columns of ``data`` placed first in the
             conditioning set.
         backdoor_variables: columns of ``data`` appended after the
             treatment columns in the conditioning set.
         data: object indexable by a list of column names.
         dep_type: iterable of type codes for the dependent variables;
             joined into a single string.
         indep_type: iterable of type codes for the conditioning
             variables; joined into a single string.
         bw: bandwidth specification forwarded to the estimator.
         defaults: settings object forwarded to the estimator.
     """
     # Plain bookkeeping: keep every argument on the instance.
     self.outcome_upper_support = outcome_upper_support
     self.outcome_lower_support = outcome_lower_support
     self._outcome_names = outcome_names
     self._treatment_names = treatment_names
     self._backdoor_variables = backdoor_variables
     self._data = data
     self.dep_type = dep_type
     self.indep_type = indep_type
     self.bw = bw
     self.defaults = defaults

     # Fit P(outcome | treatment, backdoor variables).
     outcome_frame = self._data[self._outcome_names]
     conditioning_columns = self._treatment_names + self._backdoor_variables
     conditioning_frame = self._data[conditioning_columns]
     self.conditional_density = KDEMultivariateConditional(
         endog=outcome_frame,
         exog=conditioning_frame,
         dep_type=''.join(self.dep_type),
         indep_type=''.join(self.indep_type),
         bw=self.bw,
         defaults=self.defaults,
     )
예제 #5
0
    def estimate_cond_pdf(self, x, z, X):
        """Build a kernel density estimate of ``x``, optionally conditioned on ``z``.

        Args:
            x: column names of the dependent variables.
            z: column names of the conditioning variables; may be empty.
            X: data indexable by a list of column names.

        Returns:
            A ``KDEMultivariate`` over ``X[x]`` when ``z`` is empty,
            otherwise a ``KDEMultivariateConditional`` of ``X[x]`` given
            ``X[z]``.
        """
        # Bandwidth rule: 'cv_ls' as soon as any involved variable is
        # continuous, 'cv_ml' when all of them are discrete.
        # NOTE(review): the original carried a commented-out
        # 'normal_reference' alternative for the continuous case.
        involved_types = [self.variable_types[name] for name in x + z]
        bw = 'cv_ls' if 'c' in involved_types else 'cv_ml'

        dep_type = ''.join([self.variable_types[xi] for xi in x])

        # Nothing to condition on: an ordinary density is returned instead.
        if len(z) == 0:
            return KDEMultivariate(X[x],
                                   var_type=dep_type,
                                   bw=bw,
                                   defaults=self.defaults)

        indep_type = ''.join([self.variable_types[zi] for zi in z])
        return KDEMultivariateConditional(endog=X[x],
                                          exog=X[z],
                                          dep_type=dep_type,
                                          indep_type=indep_type,
                                          bw=bw,
                                          defaults=self.defaults)
예제 #6
0
def continuous_treatment_model(data, covariates, treatment, variable_types):
    """Score each row by the estimated conditional density of its treatment.

    Fits a ``KDEMultivariateConditional`` of the treatment given the
    covariates and evaluates it at the training rows themselves.

    Args:
        data: data indexable by column name(s); its length drives the
            choice of estimator settings.
        covariates: covariate column names (passed through
            ``binarize_discrete`` together with ``data`` first).
        treatment: treatment column name.  A list/tuple of names is also
            tolerated when counting variables.
        variable_types: mapping from column name to a type code
            ('c', 'o' or 'u').

    Returns:
        The value of ``model.pdf`` evaluated at ``data[treatment]`` given
        ``data[covariates]``.
    """
    data, covariates = binarize_discrete(data, covariates, variable_types)

    # Count the variables without concatenating: ``treatment`` is used as a
    # single column name everywhere else in this function, and the previous
    # ``len(treatment + covariates)`` raised TypeError (str + list) whenever
    # the data set had 300 rows or fewer.
    if isinstance(treatment, (list, tuple)):
        n_treatment_columns = len(treatment)
    else:
        n_treatment_columns = 1
    n_variables = n_treatment_columns + len(covariates)

    # Larger problems use the "efficient" bandwidth estimation with a fixed
    # number of jobs; small ones use the full data set on all cores.
    if len(data) > 300 or n_variables >= 3:
        defaults = EstimatorSettings(n_jobs=4, efficient=True)
    else:
        defaults = EstimatorSettings(n_jobs=-1, efficient=False)

    # Cross-validated likelihood only when nothing is continuous.
    if 'c' not in variable_types.values():
        bw = 'cv_ml'
    else:
        bw = 'normal_reference'

    indep_type = get_type_string(covariates, variable_types)
    dep_type = get_type_string([treatment], variable_types)

    model = KDEMultivariateConditional(endog=data[treatment],
                                       exog=data[covariates],
                                       dep_type=''.join(dep_type),
                                       indep_type=''.join(indep_type),
                                       bw=bw,
                                       defaults=defaults)
    return model.pdf(endog_predict=data[treatment], exog_predict=data[covariates])
예제 #7
0
    def __init__(self,
                 X,
                 causes,
                 effects,
                 admissable_set=[],
                 variable_types=None,
                 expectation=False,
                 density=True):
        """
        We want to calculate the causal effect of X and Y through
        back-door adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z)
        for some admissable set of control variables, Z.  First we
        calculate the conditional density P(Y|X,Z), then the density
        P(Z).  We find the support of Z so we can properly sum over
        it later.

        Args:
            X: data indexable by a list of column names.
            causes: list of cause column names.
            effects: list of effect column names.
            admissable_set: control variables Z; copied into a fresh list.
            variable_types: dictionary with the column name pointing to an
                element of set(['o', 'u', 'c']), for 'ordered',
                'unordered discrete', or 'continuous'.  When empty or
                None the types are inferred from ``X``.
            expectation: when true, also fit a ``KernelReg`` and store it
                as ``self.conditional_expectation``.
            density: not used by this constructor.

        ``self.density`` is only set when the admissable set is non-empty,
        and ``self.conditional_expectation`` only when ``expectation`` is
        true.
        """
        # Copy so the shared default list is never aliased onto an instance.
        admissable_set = list(admissable_set)
        conditional_density_vars = causes + admissable_set
        self.causes = causes
        self.effects = effects
        self.admissable_set = admissable_set
        self.conditional_density_vars = conditional_density_vars

        if variable_types:
            self.variable_types = variable_types
        else:
            # NOTE(review): assumes the inferred result is the same kind of
            # {column: type code} mapping as a user-supplied one -- confirm.
            self.variable_types = self.__infer_variable_types(X)

        # Always derive the type strings from the resolved mapping.  They
        # used to be bound only when ``variable_types`` was supplied, so the
        # inference path failed on ``None.values()`` / unbound locals.
        resolved_types = self.variable_types
        dep_type = [resolved_types[var] for var in effects]
        indep_type = [resolved_types[var] for var in conditional_density_vars]
        density_types = [resolved_types[var] for var in admissable_set]

        # Cross-validated likelihood only when nothing is continuous.
        if 'c' not in resolved_types.values():
            bw = 'cv_ml'
        else:
            bw = 'normal_reference'

        if admissable_set:
            self.density = KDEMultivariate(X[admissable_set],
                                           var_type=''.join(density_types),
                                           bw=bw)

        self.conditional_density = KDEMultivariateConditional(
            endog=X[effects],
            exog=X[conditional_density_vars],
            dep_type=''.join(dep_type),
            indep_type=''.join(indep_type),
            bw=bw)
        if expectation:
            self.conditional_expectation = KernelReg(
                X[effects].values,
                X[conditional_density_vars].values,
                ''.join(indep_type),
                bw='cv_ls')

        self.support = self.__get_support(X)

        # Split the known variables by kind and note which ones belong to Z.
        self.discrete_variables = [
            variable for variable, var_type in self.variable_types.items()
            if var_type in ['o', 'u']
        ]
        self.discrete_Z = list(
            set(self.discrete_variables).intersection(set(admissable_set)))
        self.continuous_variables = [
            variable for variable, var_type in self.variable_types.items()
            if var_type == 'c'
        ]
        self.continuous_Z = list(
            set(self.continuous_variables).intersection(set(admissable_set)))
예제 #8
0
    def __init__(
        self,
        X,
        causes,
        effects,
        admissable_set=[],
        variable_types=None,
        expectation=False,
        density=True,
    ):
        """
        We want to calculate the causal effect of X and Y through
        back-door adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z)
        for some admissable set of control variables, Z.  First we
        calculate the conditional density P(Y|X,Z), then the density
        P(Z).  We find the support of Z so we can properly sum over
        it later.

        Args:
            X: data indexable by a list of column names; ``len(X)`` helps
                choose the estimator settings.
            causes: list of cause column names.
            effects: list of effect column names.
            admissable_set: control variables Z, as a list or a set; it is
                converted to a list before any use.
            variable_types: dictionary with the column name pointing to an
                element of set(['o', 'u', 'c']), for 'ordered',
                'unordered discrete', or 'continuous'.  When empty or
                None the types are inferred from ``X``.
            expectation: when true, also fit a ``KernelReg`` and store it
                as ``self.conditional_expectation``.
            density: not used by this constructor.

        ``self.density`` is only set when the admissable set is non-empty,
        and ``self.conditional_expectation`` only when ``expectation`` is
        true.
        """
        # Uses a list internally; AdjustForDirectCauses.admissable_set
        # returns a set.  Convert *before* the first concatenation: the
        # previous code ran ``causes + admissable_set`` on the raw argument,
        # which raises TypeError for a set.
        admissable_set = list(admissable_set)
        conditional_density_vars = causes + admissable_set
        self.causes = causes
        self.effects = effects
        self.admissable_set = admissable_set
        self.conditional_density_vars = conditional_density_vars

        # Larger problems use the "efficient" bandwidth estimation with a
        # fixed number of jobs; small ones use the full data on all cores.
        widest_model = max(
            len(conditional_density_vars), len(effects + admissable_set)
        )
        if len(X) > 300 or widest_model >= 3:
            self.defaults = EstimatorSettings(n_jobs=4, efficient=True)
        else:
            self.defaults = EstimatorSettings(n_jobs=-1, efficient=False)

        if variable_types:
            self.variable_types = variable_types
        else:
            # NOTE(review): assumes the inferred result is the same kind of
            # {column: type code} mapping as a user-supplied one -- confirm.
            self.variable_types = self.__infer_variable_types(X)

        # Always derive the type strings from the resolved mapping.  They
        # used to be bound only when ``variable_types`` was supplied, so the
        # inference path failed on ``None.values()`` / unbound locals.
        resolved_types = self.variable_types
        dep_type = [resolved_types[var] for var in effects]
        indep_type = [resolved_types[var] for var in conditional_density_vars]
        density_types = [resolved_types[var] for var in admissable_set]

        # Cross-validated likelihood only when nothing is continuous.
        if "c" not in resolved_types.values():
            bw = "cv_ml"
        else:
            bw = "normal_reference"

        if admissable_set:
            self.density = KDEMultivariate(
                X[admissable_set],
                var_type="".join(density_types),
                bw=bw,
                defaults=self.defaults,
            )

        self.conditional_density = KDEMultivariateConditional(
            endog=X[effects],
            exog=X[conditional_density_vars],
            dep_type="".join(dep_type),
            indep_type="".join(indep_type),
            bw=bw,
            defaults=self.defaults,
        )
        if expectation:
            self.conditional_expectation = KernelReg(
                X[effects].values,
                X[conditional_density_vars].values,
                "".join(indep_type),
                bw="cv_ls",
            )

        self.support = self.__get_support(X)

        # Split the known variables by kind and note which ones belong to Z.
        self.discrete_variables = [
            variable
            for variable, var_type in self.variable_types.items()
            if var_type in ["o", "u"]
        ]
        self.discrete_Z = list(
            set(self.discrete_variables).intersection(set(admissable_set))
        )
        self.continuous_variables = [
            variable
            for variable, var_type in self.variable_types.items()
            if var_type == "c"
        ]
        self.continuous_Z = list(
            set(self.continuous_variables).intersection(set(admissable_set))
        )