def qc_prop_matching(self, rel_cols, label):
    """
    Evaluates the need for propensity score matching and can be used to
    quality control a propensity-score-matched population. Will train
    classifiers and create a plot.

    :param rel_cols: relevant columns
    :param label: Label or class which should be regressed. \
    (cohort1/cohort2, case/control, treatment/untreated etc.)
    """
    cols = rel_cols[::]

    # create reduced copies of the dataframes for propensity score quality control
    qc_dfs = []
    for df in self:
        qc_dfs.append(df[cols])

    # exclude the label if it is included in the columns
    if label in cols:
        cols.remove(label)

    # construct the model formula
    formula = construct_formula(label, cols)

    # create the Matcher
    m = Matcher(*qc_dfs, yvar=label, formula=formula)
    # train classifiers to assess predictability
    m.fit_scores(balance=True, nmodels=10)
    # calculate and visualize propensity scores
    m.predict_scores()
    m.plot_scores()
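# `construct_formula` is not defined in the snippet above; pymatch's Matcher
# accepts a patsy-style formula string. A minimal sketch of what such a helper
# might look like (an assumption, not the original implementation), given the
# usual imports (pandas as pd, numpy as np, pymatch.Matcher.Matcher):
def construct_formula(label, cols):
    # e.g. construct_formula('treated', ['age', 'sex']) -> 'treated ~ age + sex'
    return '{} ~ {}'.format(label, ' + '.join(cols))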
def pymatch_matching(patients, controls, max_age_diff=3):
    # the original author flagged this function as broken; note that `formula`
    # is not a fit_scores() parameter in released pymatch, and
    # `with_replacement` is not part of its match() signature, which is likely
    # part of the problem
    raise Exception("This doesn't seem to work")
    patients = {p: patients[p] for p in patients}
    controls = {c: controls[c] for c in controls}

    p_names = list(patients)
    p_ages = [p['age'] for p in patients.values()]
    p_genders = [p['gender'] for p in patients.values()]
    p_group = [1 for _ in patients]
    patients_df = pd.DataFrame(list(zip(p_names, p_genders, p_ages, p_group)),
                               columns=['Name', 'Gender', 'Age', 'Group'])

    c_names = list(controls)
    c_ages = [c['age'] for c in controls.values()]
    c_genders = [c['gender'] for c in controls.values()]
    c_group = [0 for _ in controls]
    controls_df = pd.DataFrame(list(zip(c_names, c_genders, c_ages, c_group)),
                               columns=['Name', 'Gender', 'Age', 'Group'])

    matches = [[] for _ in range(max_age_diff + 1)]
    not_matched = []
    for gender in ['male', 'female']:
        test_group = patients_df.loc[patients_df['Gender'] == gender]
        control_group = controls_df.loc[controls_df['Gender'] == gender]
        m = Matcher(test_group, control_group, yvar='Group',
                    exclude=['Name', 'Gender'])
        m.fit_scores(balance=True, nmodels=100, formula='')
        m.match(with_replacement=False, nmatches=1, threshold=10)
        for match in m.matched_data.loc[m.matched_data['Group'] == 0].itertuples():
            case = test_group.iloc[match.match_id]
            diff = abs(case.Age - match.Age)
            if diff <= max_age_diff:
                matches[diff].append([case.Name, match.Name])
            else:
                not_matched.append(case.Name)
    return matches, not_matched
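# In released pymatch, `m.matched_data` contains both groups and pairs them
# through a shared `match_id` column. A sketch (under that assumption) of how
# the case/control pairing above could be recovered without positional
# indexing into `test_group`:
def extract_pairs(matched_data, max_age_diff=3):
    matches = [[] for _ in range(max_age_diff + 1)]
    not_matched = []
    for _, grp in matched_data.groupby('match_id'):
        case = grp.loc[grp['Group'] == 1].iloc[0]
        ctrl = grp.loc[grp['Group'] == 0].iloc[0]
        diff = int(abs(case.Age - ctrl.Age))
        if diff <= max_age_diff:
            matches[diff].append([case.Name, ctrl.Name])
        else:
            not_matched.append(case.Name)
    return matches, not_matched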
def PSM(self, HC_modal_pheno, ND_modal_pheno):
    '''Propensity score matching of HC and ND data.

    Parameters
    ----------
    HC_modal_pheno: DataFrame
        Phenotype information DataFrame of HC in one modality
    ND_modal_pheno: DataFrame
        Phenotype information DataFrame of ND in one modality
    '''
    # select the relevant phenotype columns (31-0.0 = sex, 34-0.0 = year of
    # birth in UK Biobank field notation) and label the samples
    HC_Match = HC_modal_pheno[["eid", "31-0.0", "34-0.0", "file_path"]].copy()
    HC_Match[["34-0.0"]] = HC_Match[["34-0.0"]].astype(float)
    # HC_Match = HC_Match.fillna(method='ffill')
    HC_Match['label'] = 0

    ND_Match = ND_modal_pheno[["eid", "31-0.0", "34-0.0", "file_path"]].copy()
    ND_Match[["34-0.0"]] = ND_Match[["34-0.0"]].astype(float)
    # ND_Match = ND_Match.fillna(method='ffill')
    ND_Match['label'] = 1

    # calculate propensity scores and match on them
    match_PSM = Matcher(ND_Match, HC_Match, yvar="label",
                        exclude=['eid', 'file_path'])
    np.random.seed(20200624)
    match_PSM.fit_scores(balance=True, nmodels=1000)
    match_PSM.match(method='min', nmatches=1, threshold=0.001)

    # get the matched, balanced data
    HC_ND_matched = match_PSM.matched_data[['eid', 'file_path', 'label']] \
        .sort_values('label', ascending=False)
    HC_ND_matched.reset_index(drop=True, inplace=True)
    return HC_ND_matched
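# Hypothetical usage of PSM (the instance name and file paths are assumptions,
# not part of the original snippet):
hc_pheno = pd.read_csv('hc_modal_pheno.csv')    # healthy-control phenotypes
nd_pheno = pd.read_csv('nd_modal_pheno.csv')    # disease-group phenotypes
matched = pipeline.PSM(hc_pheno, nd_pheno)      # `pipeline` owns the PSM method
print(matched['label'].value_counts())          # groups should be balanced after matching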
def calc_propensity_scores(file_name):
    data = pd.read_csv("datasets/{}.csv".format(file_name), index_col=0)[fields]

    # treat any column whose first value cannot be parsed as a float as categorical
    categorical_c = []
    for a in data.columns:
        try:
            float(data.iloc[0].loc[a])
        except (ValueError, TypeError):
            categorical_c.append(a)
    data_dummy = pd.get_dummies(data, columns=categorical_c, drop_first=True)

    control = data_dummy[data_dummy["T"] == 0]
    test = data_dummy[data_dummy["T"] == 1]
    m = Matcher(test, control, yvar="T", exclude=["Y"])
    np.random.seed(20170925)
    m.fit_scores(balance=False, nmodels=1)
    m.predict_scores()
    m.plot_scores()
    plt.savefig("output/pm_results_{}.png".format(file_name))
    m.data.to_csv("datasets/{}_p.csv".format(file_name))
    return m.data["scores"]
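# `fields` is module-level state not shown above. A hypothetical definition and
# call, assuming the CSV follows the T (treatment) / Y (outcome) column
# convention used in the function:
fields = ['T', 'Y', 'age', 'income']            # assumed covariate list
scores = calc_propensity_scores('example')      # reads datasets/example.csv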
        'logprice_adjusted', 'ImportParcelID', 'timeid', 'treatment',
        'YearBuilt', 'NoOfStories', 'TotalRooms', 'TotalBedrooms', 'area',
        'LandAssessedValue_persqft'
    ]]
    treated = treated.fillna(treated.mean())
    control = data3[data3['treatment'] == 0][[
        'Unique_Index', 'state', 'city_ID', 'year', 'logprice_adjusted',
        'ImportParcelID', 'timeid', 'treatment', 'YearBuilt', 'NoOfStories',
        'TotalRooms', 'TotalBedrooms', 'area', 'LandAssessedValue_persqft'
    ]]
    control = control.fillna(control.mean())
    m = Matcher(treated, control, yvar="treatment", exclude=[
        'Unique_Index', 'state', 'city_ID', 'year', 'ImportParcelID',
        'timeid', 'logprice_adjusted'
    ])
    m.fit_scores(balance=True, nmodels=50)
    m.predict_scores()
    m.match(method="min", nmatches=3, threshold=0.0001)
    m.assign_weight_vector()
    Matched = pd.concat([Matched, m.matched_data], sort=False)
except Exception:
    # skip groups where matching fails (e.g., too few records on one side)
    pass

# %% sort out cities that have both control and treatment observations
Matched = pd.read_csv('Matched4-1to3-add landvaluepersqft-balance false.csv')
treatment_city = Matched.groupby('city')['treatment'].value_counts().to_frame()
treatment_city.rename(columns={'treatment': 'count'}, inplace=True)
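# A sketch of the filtering the last cell sets up -- keeping only cities that
# contain both treated and control observations (the exact continuation is an
# assumption; column names follow the snippet):
n_statuses = Matched.groupby('city')['treatment'].nunique()
cities_with_both = n_statuses[n_statuses == 2].index
Matched = Matched[Matched['city'].isin(cities_with_both)]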
def propensity_match(exposure, control,
                     covariates=['age', 'apache_prob', 'sepsis',
                                 'infection_skin_soft_tissue',
                                 'immunocompromised'],
                     outcome_var='aki',
                     seed=389202,
                     balance=False,
                     n_models=100,
                     verbose=False):
    np.random.seed(seed)

    exposure = exposure.copy()
    control = control.copy()

    # make sure we don't overwrite a legitimate `status` column
    if 'status' in exposure.columns:
        exposure['status_original'] = exposure['status']
        control['status_original'] = control['status']

    exposure_var = 'status'
    exposure.loc[:, exposure_var] = 1
    control.loc[:, exposure_var] = 0

    # split columns into covariates we match on and columns we exclude
    cols_exclude, cols_include = [], []
    for c in exposure.columns:
        if c == exposure_var:
            continue
        if c not in covariates:
            cols_exclude.append(c)
        else:
            cols_include.append(c)

    if len(cols_include) == 0:
        raise ValueError(
            'None of the covariates appear in the exposure dataframe.')
    logger.info(f'Columns included: {cols_include}')

    # warn about missing data and missing columns; impute with the pooled
    # mode (categorical) or mean (numeric)
    for c in exposure.columns:
        # check the control dataframe first so the pooled statistics below
        # cannot raise a KeyError on a missing column
        if c not in control:
            logger.warning(f'Did not find column {c} in control dataframe.')
            continue

        if str(exposure[c].dtype) == 'object':
            mu = pd.concat([exposure[c], control[c]], axis=0).value_counts().index[0]
        else:
            mu = pd.concat([exposure[c], control[c]], axis=0).mean()

        n = exposure[c].isnull().sum()
        if (n > 0) & (c not in cols_exclude):
            logger.warning(
                f'Column {c} missing {n} observations in exposure dataframe.')
            exposure[c].fillna(mu, inplace=True)

        n = control[c].isnull().sum()
        if (n > 0) & (c not in cols_exclude):
            logger.warning(
                f'Column {c} missing {n} observations in control dataframe.')
            control[c].fillna(mu, inplace=True)

    # print('Dataframe being used:')
    # display(exposure[cols].head())

    m = Matcher(exposure, control, yvar=exposure_var, exclude=cols_exclude)

    # predict the y outcome balancing the classes;
    # repeat n_models times to be sure we use a lot of majority class data
    if balance:
        m.fit_scores(balance=balance, nmodels=n_models)
    else:
        m.fit_scores(balance=False)

    m.predict_scores()

    if verbose:
        m.plot_scores()

    # m.tune_threshold(method='random')
    # finds the closest match for each minority record
    m.match(method="min", nmatches=1, threshold=0.0005)
    # m.record_frequency()  # no categorical variables -> this errors

    if verbose:
        cc = m.compare_categorical(return_table=True)
        display(cc)
        cc = m.compare_continuous(return_table=True)
        display(cc)

    return m
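# Hypothetical call, assuming `exposure_df` and `control_df` carry the
# covariate columns listed in the defaults plus an `aki` outcome column:
m = propensity_match(exposure_df, control_df, balance=True, verbose=True)
matched = m.matched_data                    # pymatch keeps the matched records here
print(matched['status'].value_counts())    # exposure (1) vs. control (0) counts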
# - LaLonde's main goal: none of the techniques available at his time were
#   capable of reproducing the results of the experimental design, leading him
#   to claim that experimental design was the only reasonable tool to infer
#   treatment impact
# - Try to simulate some of the proposed alternative frameworks used by
#   LaLonde (fixed effects, two-stage estimator)
# - But perhaps not even the authors of the second paper did this. More
#   important is to show how the dummy non-experimental estimation is naive
# 2 Demonstrate how the treatment effect is scored by a simple t-test and an
#   adjusted result by regression
# 3 Present the external data and how it differs from the original one
#   - Simulates LaLonde's original exercise to demonstrate how applying a
#     simple OLS to new data generates biased results
# 4 Explain the techniques that are able to create a new control group based
#   on causal inference methods
#   - Exercise proposed by Dehejia and Wahba: they claimed that most modern
#     techniques, such as propensity score matching, were capable of
#     generating better results
# 5 Show the results and conclusion

# %%
treated = rct_data[rct_data.treat == 1].copy().drop(columns=['data_id'])
observational_control = observational_data.copy().drop(columns=['data_id'])

# %%
m = Matcher(treated, observational_control, yvar="treat", exclude=['re78'])

# %%
np.random.seed(666)
m.fit_scores(balance=True, nmodels=100)

# %%
m.predict_scores()

# %%
m.plot_scores()

# %%
m.tune_threshold(method='random')

# %%
m.match(method="min", nmatches=1, threshold=0.0004)
m.record_frequency()

# %%
m.assign_weight_vector()
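# A possible final step (not part of the original cells): estimate the average
# treatment effect on `re78` from the matched sample, using the frequency
# weights that assign_weight_vector() added to m.matched_data; `re78` was
# excluded from the model above but is still carried in the matched data.
# %%
md = m.matched_data
treated_mean = np.average(md.loc[md['treat'] == 1, 're78'],
                          weights=md.loc[md['treat'] == 1, 'weight'])
control_mean = np.average(md.loc[md['treat'] == 0, 're78'],
                          weights=md.loc[md['treat'] == 0, 'weight'])
print('Weighted ATT estimate on re78: {:.2f}'.format(treated_mean - control_mean))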