예제 #1
0
    def qc_prop_matching(self, rel_cols, label):
        """
        Evaluates the need for a propensity score matching and can be used to quality control a propensity score matched
        population. Will train classifiers and create a plot.

        :param rel_cols: relevant columns
        :param label: Label or class which should be regressed. \
        (cohort1/cohort2, case/control, treatment/untreated etc.)
        """
        cols = list(rel_cols)

        # reduced copies of each dataframe, restricted to the relevant columns
        qc_dfs = [frame[cols] for frame in self]

        # the label itself must not be used as a regressor
        if label in cols:
            cols.remove(label)

        # build the matcher and assess how predictable the label is
        matcher = Matcher(*qc_dfs, yvar=label,
                          formula=construct_formula(label, cols))
        matcher.fit_scores(balance=True, nmodels=10)

        # calculate and visualize propensity scores
        matcher.predict_scores()
        matcher.plot_scores()
예제 #2
0
def calc_propensity_scores(file_name):
    """Fit propensity scores for ``datasets/<file_name>.csv`` and persist the results.

    Reads the dataset (restricted to the module-level ``fields``), one-hot
    encodes categorical columns, fits a single propensity model for the
    treatment indicator ``T`` (excluding the outcome ``Y``), saves a score
    plot to ``output/`` and the scored data to ``datasets/<file_name>_p.csv``.

    :param file_name: dataset name without the ``.csv`` extension
    :return: the ``scores`` column (propensity scores) of the matcher's data
    """
    data = pd.read_csv("datasets/{}.csv".format(file_name), index_col=0)[fields]

    # A column whose first value cannot be parsed as a number is treated as
    # categorical and one-hot encoded below.
    categorical_c = []
    for col in data.columns:
        try:
            float(data.iloc[0].loc[col])
        except (TypeError, ValueError):
            # Only conversion failures mark a categorical column; the original
            # bare `except:` also swallowed KeyboardInterrupt/SystemExit.
            categorical_c.append(col)

    data_dummy = pd.get_dummies(data, columns=categorical_c, drop_first=True)

    control = data_dummy[data_dummy["T"] == 0]
    test = data_dummy[data_dummy["T"] == 1]

    # Exclude the outcome "Y" so the propensity model only sees covariates.
    m = Matcher(test, control, yvar="T", exclude=["Y"])
    np.random.seed(20170925)
    m.fit_scores(balance=False, nmodels=1)
    m.predict_scores()
    m.plot_scores()
    plt.savefig("output/pm_results_{}.png".format(file_name))
    m.data.to_csv("datasets/{}_p.csv".format(file_name))
    return m.data["scores"]
예제 #3
0
def propensity_match(exposure,
                     control,
                     covariates=None,
                     outcome_var='aki',
                     seed=389202,
                     balance=False,
                     n_models=100,
                     verbose=False):
    """Propensity-score match *exposure* rows against *control* rows.

    Builds a binary ``status`` exposure indicator, imputes missing covariate
    values with the pooled mode (object columns) or mean (numeric columns),
    fits propensity models and performs nearest (``min``) matching.

    :param exposure: DataFrame of exposed subjects
    :param control: DataFrame of unexposed subjects
    :param covariates: columns used to model exposure; defaults to
        ``['age', 'apache_prob', 'sepsis', 'infection_skin_soft_tissue',
        'immunocompromised']``
    :param outcome_var: kept for interface compatibility (currently unused)
    :param seed: numpy random seed
    :param balance: if True, fit ``n_models`` models on balanced resamples
    :param n_models: number of models when ``balance`` is True
    :param verbose: if True, plot scores and print comparison tables
    :return: the fitted ``Matcher`` (matched rows on ``m.matched_data``)
    :raises ValueError: if no covariate appears in the exposure dataframe
    """
    # Avoid the mutable-default-argument pitfall: a list literal in the
    # signature would be shared across every call.
    if covariates is None:
        covariates = [
            'age', 'apache_prob', 'sepsis',
            'infection_skin_soft_tissue', 'immunocompromised'
        ]

    np.random.seed(seed)

    exposure = exposure.copy()
    control = control.copy()

    # make sure we don't overwrite the legit column status
    if 'status' in exposure.columns:
        exposure['status_original'] = exposure['status']
        # NOTE(review): assumes 'status' also exists in control — confirm.
        control['status_original'] = control['status']
    exposure_var = 'status'
    exposure.loc[:, exposure_var] = 1
    control.loc[:, exposure_var] = 0

    # vars we exclude from the propensity model
    cols_exclude, cols_include = [], []
    for c in exposure.columns:
        if c == exposure_var:
            continue
        if c not in covariates:
            cols_exclude.append(c)
        else:
            cols_include.append(c)

    if len(cols_include) == 0:
        raise ValueError(
            'None of the covariates appear in the exposure dataframe.')
    logger.info((f'Columns included: {cols_include}'))

    # warn about missing data and missing columns
    for c in exposure.columns:
        if c not in control:
            # Warn before touching control[c]: the original computed the
            # imputation value from both frames first, so a missing column
            # raised KeyError and this warning could never fire.
            logger.warning(f'Did not find column {c} in control dataframe.')
            pooled = exposure[c]
        else:
            pooled = pd.concat([exposure[c], control[c]], axis=0)

        # impute with the pooled mode (object dtype) or mean (numeric)
        if str(exposure[c].dtype) == 'object':
            mu = pooled.value_counts().index[0]
        else:
            mu = pooled.mean()

        n = exposure[c].isnull().sum()
        if (n > 0) and (c not in cols_exclude):
            logger.warning(
                f'Column {c} missing {n} observations in exposure dataframe.')
            exposure[c].fillna(mu, inplace=True)

        if c in control:
            n = control[c].isnull().sum()
            if (n > 0) and (c not in cols_exclude):
                logger.warning(
                    f'Column {c} missing {n} observations in control dataframe.'
                )
                control[c].fillna(mu, inplace=True)

    m = Matcher(exposure, control, yvar=exposure_var, exclude=cols_exclude)

    # predict the y outcome balancing the classes
    # repeat n_models times to be sure we use a lot of majority class data
    if balance:
        m.fit_scores(balance=balance, nmodels=n_models)
    else:
        m.fit_scores(balance=False)

    m.predict_scores()

    if verbose:
        m.plot_scores()

    # finds the closest match for each minority record
    m.match(method="min", nmatches=1, threshold=0.0005)

    # no categorical variables -> this errors
    if verbose:
        cc = m.compare_categorical(return_table=True)
        display(cc)
        cc = m.compare_continuous(return_table=True)
        display(cc)

    return m
#4 Explain how these techniques are able to create a new control group based on causal inference methods
#  - Exercise proposed by Dehejia and Wahba: they claimed that most modern techniques, such as propensity score matching,
#    were capable of generating better results
#5 Show the results and conclusion
# %%
# Build the cohorts: treated units taken from the RCT data, controls from the
# observational data; 'data_id' only identifies the source dataset and is
# dropped before matching.
treated = rct_data[rct_data.treat == 1].copy().drop(columns=['data_id'])
observational_control = observational_data.copy().drop(columns=['data_id'])
# %%
# Matcher models P(treat == 1); 're78' is excluded from the propensity model
# (presumably the outcome variable — TODO confirm).
m = Matcher(treated, observational_control, yvar="treat", exclude=['re78'])
# %%
np.random.seed(666)
# Fit 100 models on balanced resamples to estimate the propensity score.
m.fit_scores(balance=True, nmodels=100)
# %%
m.predict_scores()
# %%
m.plot_scores()
# %%
# Pick a matching threshold via random search.
m.tune_threshold(method='random')
# %%
# One closest match per minority record, within the chosen threshold.
m.match(method="min", nmatches=1, threshold=0.0004)
m.record_frequency()
# %%
m.assign_weight_vector()

# %%
# Inspect the first few matched pairs.
m.matched_data.sort_values("match_id").head(6)
# %%
# Pull propensity scores, outcome and treatment indicator for later analysis.
df = m.data
ps = df['scores']
y = df['re78']
z = df['treat']