Example No. 1
    def qc_prop_matching(self, rel_cols, label):
        """
        Evaluates the need for propensity score matching and can be used to quality-control a
        propensity-score-matched population. Trains classifiers and creates a plot.

        :param rel_cols: relevant columns
        :param label: label or class which should be regressed
            (cohort1/cohort2, case/control, treatment/untreated, etc.)
        """

        cols = rel_cols[::]

        # create reduced copies of the dataframes for propensity score quality control
        qc_dfs = []
        for df in self:
            qc_dfs.append(df[cols])

        # exclude label if included into columns
        if label in cols:
            cols.remove(label)

        # construct formula
        formula = construct_formula(label, cols)

        # create Matcher
        m = Matcher(*qc_dfs, yvar=label, formula=formula)
        # train classifiers to assess predictability
        m.fit_scores(balance=True, nmodels=10)
        # calculate and visualize propensity scores
        m.predict_scores()
        m.plot_scores()
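The snippet above relies on a construct_formula helper that is not shown here. A minimal sketch, assuming it simply joins the covariates into a patsy-style formula string:

def construct_formula(label, cols):
    # Hypothetical helper: build a patsy-style formula such as
    # "treatment ~ age + bmi + sex" from the label and covariate names.
    return "{} ~ {}".format(label, " + ".join(cols))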
Example No. 2
def pymatch_matching(patients, controls, max_age_diff=3):
    raise Exception("This doesn't seem to work")
    patients = {p: patients[p] for p in patients}
    controls = {c: controls[c] for c in controls}

    p_names = list(patients)
    p_ages = [p['age'] for p in patients.values()]
    p_genders = [p['gender'] for p in patients.values()]
    p_group = [1 for _ in patients]
    patients_df = pd.DataFrame(list(zip(p_names, p_genders, p_ages, p_group)),
                               columns=['Name', 'Gender', 'Age', 'Group'])

    c_names = list(controls)
    c_ages = [c['age'] for c in controls.values()]
    c_genders = [c['gender'] for c in controls.values()]
    c_group = [0 for _ in controls]
    controls_df = pd.DataFrame(list(zip(c_names, c_genders, c_ages, c_group)),
                               columns=['Name', 'Gender', 'Age', 'Group'])

    matches = [[] for _ in range(max_age_diff+1)]
    not_matched = []
    
    for gender in ['male', 'female']:
        test_group = patients_df.loc[patients_df['Gender'] == gender]
        control_group = controls_df.loc[controls_df['Gender'] == gender]
        m = Matcher(test_group, control_group, yvar='Group', exclude=['Name', 'Gender'])
        m.fit_scores(balance=True, nmodels=100, formula='')
        m.match(with_replacement=False, nmatches=1, threshold=10)
        for match in m.matched_data.loc[m.matched_data['Group'] == 0].itertuples():
            case = test_group.iloc[match.match_id]
            diff = abs(case.Age - match.Age)
            if diff <= max_age_diff:
                matches[diff].append([case.Name, match.Name])
            else:
                not_matched.append(case.Name)
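For reference, the function above expects patients and controls to be dicts keyed by subject name, each value carrying 'age' and 'gender'. A sketch of that input shape with made-up data (note the function still raises immediately, per the author's note that it does not seem to work):

# Hypothetical input shape for pymatch_matching (illustrative values only).
patients = {
    'P001': {'age': 54, 'gender': 'female'},
    'P002': {'age': 61, 'gender': 'male'},
}
controls = {
    'C001': {'age': 55, 'gender': 'female'},
    'C002': {'age': 60, 'gender': 'male'},
}
# pymatch_matching(patients, controls, max_age_diff=3)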
Example No. 3
    def PSM(self, HC_modal_pheno, ND_modal_pheno):
        '''Propensity-score match HC and ND data.

        Parameters
        ----------
        HC_modal_pheno: DataFrame
            Phenotype information DataFrame of HC in one modality

        ND_modal_pheno: DataFrame
            Phenotype information DataFrame of ND in one modality
        '''
        # select the relevant phenotype columns and label the samples (0 = HC, 1 = ND);
        # .copy() avoids pandas' SettingWithCopyWarning when adding the label column
        HC_Match = HC_modal_pheno[["eid", "31-0.0", "34-0.0", "file_path"]].copy()
        HC_Match[["34-0.0"]] = HC_Match[["34-0.0"]].astype(float)
        # HC_Match = HC_Match.fillna(method='ffill')
        HC_Match['label'] = 0

        ND_Match = ND_modal_pheno[["eid", "31-0.0", "34-0.0", "file_path"]].copy()
        ND_Match[["34-0.0"]] = ND_Match[["34-0.0"]].astype(float)
        # ND_Match = ND_Match.fillna(method='ffill')
        ND_Match['label'] = 1

        # calculate propensity scores and match on them
        match_PSM = Matcher(ND_Match, HC_Match, yvar="label", exclude=['eid', 'file_path'])
        np.random.seed(20200624)
        match_PSM.fit_scores(balance=True, nmodels=1000)
        match_PSM.match(method='min', nmatches=1, threshold=0.001)

        # get the matched, balanced data
        HC_ND_matched = match_PSM.matched_data[['eid', 'file_path', 'label']].sort_values('label', ascending=False)
        HC_ND_matched.reset_index(drop=True, inplace=True)

        return HC_ND_matched
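A hypothetical call of the method above, assuming phenotype tables that carry the columns it selects (eid, 31-0.0, 34-0.0, file_path). The values below are purely illustrative and 'pipeline' stands in for an instance of the enclosing class:

import pandas as pd

# Toy phenotype frames with the columns PSM expects (values are made up).
hc_pheno = pd.DataFrame({
    'eid': [1, 2, 3],
    '31-0.0': [0, 1, 0],
    '34-0.0': [1950, 1962, 1947],
    'file_path': ['hc_1.nii', 'hc_2.nii', 'hc_3.nii'],
})
nd_pheno = pd.DataFrame({
    'eid': [4, 5],
    '31-0.0': [1, 0],
    '34-0.0': [1955, 1960],
    'file_path': ['nd_1.nii', 'nd_2.nii'],
})

# matched = pipeline.PSM(hc_pheno, nd_pheno)  # 'pipeline' is a hypothetical instance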
Example No. 4
def calc_propensity_scores(file_name):
    data = pd.read_csv("datasets/{}.csv".format(file_name), index_col=0)[fields]

    # treat every column whose first value cannot be parsed as a float as categorical
    categorical_c = []
    for a in data.columns:
        try:
            float(data.iloc[0].loc[a])
        except (ValueError, TypeError):
            categorical_c.append(a)

    data_dummy = pd.get_dummies(data, columns=categorical_c, drop_first=True)

    control = data_dummy[data_dummy["T"] == 0]
    test = data_dummy[data_dummy["T"] == 1]

    m = Matcher(test, control, yvar="T", exclude=["Y"])
    np.random.seed(20170925)
    m.fit_scores(balance=False, nmodels=1)
    m.predict_scores()
    m.plot_scores()
    plt.savefig("output/pm_results_{}.png".format(file_name))
    m.data.to_csv("datasets/{}_p.csv".format(file_name))
    return m.data["scores"]
Example No. 5
            treated = data3[data3['treatment'] == 1][[
                'Unique_Index', 'state', 'city_ID', 'year',
                'logprice_adjusted', 'ImportParcelID', 'timeid', 'treatment',
                'YearBuilt', 'NoOfStories', 'TotalRooms', 'TotalBedrooms',
                'area', 'LandAssessedValue_persqft'
            ]]
            treated = treated.fillna(treated.mean())
            control = data3[data3['treatment'] == 0][[
                'Unique_Index', 'state', 'city_ID', 'year',
                'logprice_adjusted', 'ImportParcelID', 'timeid', 'treatment',
                'YearBuilt', 'NoOfStories', 'TotalRooms', 'TotalBedrooms',
                'area', 'LandAssessedValue_persqft'
            ]]
            control = control.fillna(control.mean())
            m = Matcher(treated,
                        control,
                        yvar="treatment",
                        exclude=[
                            'Unique_Index', 'state', 'city_ID', 'year',
                            'ImportParcelID', 'timeid', 'logprice_adjusted'
                        ])
            m.fit_scores(balance=True, nmodels=50)
            m.predict_scores()
            m.match(method="min", nmatches=3, threshold=0.0001)
            m.assign_weight_vector()
            Matched = pd.concat([Matched, m.matched_data], sort=False)
        except Exception:
            # skip groups where matching fails
            pass

#%% sort out cities that have both control and treatment observations
Matched = pd.read_csv('Matched4-1to3-add landvaluepersqft-balance false.csv')
treatment_city = Matched.groupby('city')['treatment'].value_counts().to_frame()
treatment_city.rename(columns={'treatment': 'count'}, inplace=True)
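A minimal sketch of how the "cities with both groups" filter could be completed from the counts built above; counts_wide, cities_with_both, and Matched_both are hypothetical names:

# Pivot the (city, treatment) counts wide and keep cities seen in both groups.
counts_wide = treatment_city['count'].unstack('treatment', fill_value=0)
cities_with_both = counts_wide.index[(counts_wide > 0).all(axis=1)]
Matched_both = Matched[Matched['city'].isin(cities_with_both)]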
Example No. 6
def propensity_match(exposure,
                     control,
                     covariates=[
                         'age', 'apache_prob', 'sepsis',
                         'infection_skin_soft_tissue', 'immunocompromised'
                     ],
                     outcome_var='aki',
                     seed=389202,
                     balance=False,
                     n_models=100,
                     verbose=False):

    np.random.seed(seed)

    exposure = exposure.copy()
    control = control.copy()

    # make sure we don't overwrite a legitimate existing 'status' column
    if 'status' in exposure.columns:
        exposure['status_original'] = exposure['status']
        control['status_original'] = control['status']
    exposure_var = 'status'
    exposure.loc[:, exposure_var] = 1
    control.loc[:, exposure_var] = 0

    # split columns into vars to exclude and covariates to include in the model
    cols_exclude, cols_include = [], []
    for c in exposure.columns:
        if c == exposure_var:
            continue
        if c not in covariates:
            cols_exclude.append(c)
        else:
            cols_include.append(c)

    if len(cols_include) == 0:
        raise ValueError(
            'None of the covariates appear in the exposure dataframe.')
    logger.info(f'Columns included: {cols_include}')

    # warn about missing data and missing columns
    for c in exposure.columns:
        if str(exposure[c].dtype) == 'object':
            mu = pd.concat([exposure[c], control[c]],
                           axis=0).value_counts().index[0]
        else:
            mu = pd.concat([exposure[c], control[c]], axis=0).mean()

        n = exposure[c].isnull().sum()
        if (n > 0) & (c not in cols_exclude):
            logger.warning(
                f'Column {c} missing {n} observations in exposure dataframe.')
            exposure[c].fillna(mu, inplace=True)

        if c not in control:
            logger.warning(f'Did not find column {c} in control dataframe.')
        else:
            n = control[c].isnull().sum()
            if (n > 0) & (c not in cols_exclude):
                logger.warning(
                    f'Column {c} missing {n} observations in control dataframe.'
                )
                control[c].fillna(mu, inplace=True)

    # print('Dataframe being used:')
    # display(exposure[cols].head())
    m = Matcher(exposure, control, yvar=exposure_var, exclude=cols_exclude)

    # predict the y outcome balancing the classes
    # repeat 100 times to be sure we use a lot of majority class data
    if balance:
        m.fit_scores(balance=balance, nmodels=n_models)
    else:
        m.fit_scores(balance=False)

    m.predict_scores()

    if verbose:
        m.plot_scores()

    # m.tune_threshold(method='random')
    m.match(
        method="min", nmatches=1,
        threshold=0.0005)  # finds the closest match for each minority record
    # m.record_frequency()

    # note: if there are no categorical variables, compare_categorical errors
    if verbose:
        cc = m.compare_categorical(return_table=True)
        display(cc)
        cc = m.compare_continuous(return_table=True)
        display(cc)

    return m
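A hypothetical call of propensity_match, assuming exposure and control DataFrames that contain the default covariates plus the aki outcome; the toy generator below is purely illustrative:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)

def toy_cohort(n):
    # Made-up cohort with the default covariate columns and the outcome.
    return pd.DataFrame({
        'age': rng.integers(40, 90, n),
        'apache_prob': rng.random(n),
        'sepsis': rng.integers(0, 2, n),
        'infection_skin_soft_tissue': rng.integers(0, 2, n),
        'immunocompromised': rng.integers(0, 2, n),
        'aki': rng.integers(0, 2, n),
    })

# m = propensity_match(toy_cohort(100), toy_cohort(300))
# matched = m.matched_data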
#  - La Londe's main goal: none of the techniques available at his time were capable of reproducing
#    results similar to the experimental design, hence his claim that the experimental design was the
#    only reasonable tool for inferring treatment impact
#  - Try to simulate some of the alternative frameworks used by La Londe (fixed effects, two-stage estimator)
#      - Though perhaps not even the authors of the second paper did so. More important is to show how
#        naive the dummy non-experimental estimation is
#2 Demonstrate how the treatment effect is scored by a simple t-test and by a regression-adjusted result
#3 Present the external data and how it differs from the original one
#  Simulates La Londe's original exercise to demonstrate how applying a simple OLS to the new data generates biased results
#4 Explain that these techniques can create a new control group based on causal inference methods
#  - Exercise proposed by Dehejia and Wahba: they claimed that most modern techniques, such as propensity
#    score matching, were capable of generating better results
#5 Show the results and conclusion
# %%
treated = rct_data[rct_data.treat == 1].copy().drop(columns=['data_id'])
observational_control = observational_data.copy().drop(columns=['data_id'])
# %%
m = Matcher(treated, observational_control, yvar="treat", exclude=['re78'])
# %%
np.random.seed(666)
m.fit_scores(balance=True, nmodels=100)
# %%
m.predict_scores()
# %%
m.plot_scores()
# %%
m.tune_threshold(method='random')
# %%
m.match(method="min", nmatches=1, threshold=0.0004)
m.record_frequency()
# %%
m.assign_weight_vector()
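The cells stop after assigning the weight vector. A sketch of the scoring step the outline above describes (a simple t-test plus a weighted regression on the matched sample), assuming matched_data keeps the 'treat' and 're78' columns and that assign_weight_vector added a 'weight' column:

# %%
# Sketch only: unadjusted t-test of re78 between matched treated and controls,
# then a weighted least-squares difference in means using the match weights.
from scipy import stats
import statsmodels.formula.api as smf

md = m.matched_data
t_stat, p_val = stats.ttest_ind(md.loc[md.treat == 1, 're78'],
                                md.loc[md.treat == 0, 're78'])
print(f't-test on matched sample: t={t_stat:.2f}, p={p_val:.3f}')

wls_fit = smf.wls('re78 ~ treat', data=md, weights=md['weight']).fit()
print(wls_fit.summary())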