def generate_df(metaroi_vals, outdir):
    """Build and save a table of FDG meta-ROI values, one row per scan.

    Parameters
    ----------
    metaroi_vals : dict
        Maps each filename to its mean FDG value
    outdir : string
        Full path of the directory the FDG output files are saved to

    Returns
    -------
    metaroi_df : pandas DataFrame
        One row per FDG scan, with the value in 'FDG_val' and the id
        derived from the filename in 'codea'
    """
    # One row per dict entry; the filenames start out as the index.
    scans = pd.DataFrame.from_dict(metaroi_vals, orient="index")
    scans = scans.reset_index(level=0)
    scans = scans.rename(columns={0: "roi_vals", "index": "path"})

    scans["roi_vals"] = [float(val) for val in scans["roi_vals"]]
    scans["codea"] = [cf.get_id(fname) for fname in scans["path"]]

    # The filename is only needed to derive the id, so it is dropped here.
    metaroi_df = scans.rename(columns={"roi_vals": "FDG_val"}).drop("path", axis=1)

    cf.save_xls_and_pkl(metaroi_df, "fdg_metaroi", outdir)
    return metaroi_df
def codetranslator_run(codetblpath, outdir):
    """Build a lookup table of matched codea/codeb pairs from an excel file.

    Parameters
    ----------
    codetblpath : string
        Hard path to the excel file holding code data. Expects two
        columns named 'codeaGRAB' and 'codeb'.
    outdir : string
        Hard path to the directory to save the output file

    Returns
    -------
    codetbl : pandas DataFrame
        DataFrame with columns 'codea' and 'codeb', restricted to rows
        where both codes are present

    Saves xls and pkl files named 'codetranslator' to outdir.
    """
    codetblin = pd.read_excel(codetblpath)
    codetbl = codetblin[['codeaGRAB', 'codeb']]
    # 'codeaGRAB' must become 'codea'. Renaming it to 'codeb' would leave
    # the table with two 'codeb' columns and no 'codea' column.
    codetbl = codetbl.rename(columns={'codeaGRAB': 'codea'})
    # keep only rows where both codes are present (matched pairs)
    codetbl = codetbl.dropna()
    cf.save_xls_and_pkl(codetbl, 'codetranslator', outdir)
    return codetbl
def datamerge_run(filenames, outdir, roc_cols):
    """Main function to merge all data

    Parameters
    ----------
    filenames : list
        List of strings corresponding to filename prefixes of pickle
        format in outdir
    outdir : string
        Full path of directory containing files in filenames
    roc_cols : list
        List of strings corresponding to column names for which rate of
        change should be calculated

    Returns
    -------
    tbldict : dict
        Dictionary of DataFrames each containing a different type of
        BACS data
    NPtbl : pandas DataFrame
        DataFrame where each row is a single subject's single cognitive
        testing session
    subjtbl : pandas DataFrame
        DataFrame where each row is a single subject
    """
    tbldict = collect2dict(filenames, outdir)
    tbldict = cogtest_manipulation(tbldict, roc_cols)

    # count number of timepoints in each longitudinal table
    tbldict['cogtests'] = count_instances(tbldict['cogtests'], 'codeb', 'NP_NoTps')
    tbldict['aseg_change'] = count_instances(tbldict['aseg_change'], 'codea', 'MRI_NoTps')
    tbldict['pibparams'] = count_instances(tbldict['pibparams'], 'codea', 'PIB_NoTps')

    # Flatten every table that has a timepoint column (name containing
    # '_Tp'). The flattened tables are collected separately and added after
    # the loop so the dict is not resized while it is being walked.
    # list(...items()) works on both Python 2 and 3; iteritems() is
    # Python 2 only.
    new_tbldict = {}
    for key, tbl in list(tbldict.items()):
        tpcols = [s for s in tbl.columns if '_Tp' in s]
        if tpcols:
            # [1, '1'] are the timepoint values handed to flatten
            # (presumably the baseline timepoint as int or str; verify
            # against flatten)
            tblflat, tblflatnm = flatten(tbl, tpcols[0], key, [1, '1'])
            new_tbldict[tblflatnm] = tblflat
    tbldict.update(new_tbldict)

    # Pass every table through addcodes together with the codetranslator
    # table (presumably so each table carries both 'codea' and 'codeb';
    # verify against addcodes). Iterate over a snapshot of the keys because
    # entries are rebound inside the loop.
    for key in list(tbldict):
        tbldict[key] = addcodes(tbldict[key], tbldict['codetranslator'])

    joincol = ['codea', 'codeb']

    # merge tables: subject-level table, built from the flattened tables
    tblstojoin = ['cogtests_flat', 'pibparams_flat', 'aseg_change_flat',
                  'fdg_metaroi_flat', 'subjinfo']
    subjtbl = mergelots(tbldict, tblstojoin, joincol)

    # merge tables: session-level table, built on the unflattened cogtests
    tblstojoin = ['cogtests', 'subjinfo', 'pibparams_flat', 'aseg_change_flat',
                  'fdg_metaroi_flat']
    NPtbl = mergelots(tbldict, tblstojoin, joincol)

    cf.save_xls_and_pkl(subjtbl, 'subjtbl', outdir)
    cf.save_xls_and_pkl(NPtbl, 'NPtbl', outdir)
    return tbldict, NPtbl, subjtbl
def mri_run(datadir, outdir, rois):
    """Main function to collect MRI volume data

    Parameters
    ----------
    datadir : string
        Full path to root directory of freesurfer processed data.
        Expect file tree to be datadir/subcode/stats/aseg.stats
    outdir : string
        Full path to directory where data will be saved. The aseg stats
        text file name is appended to this string directly, so it must
        end with a path separator.
    rois : list of strings
        List of freesurfer rois of interest. The volumes of these rois
        will be inserted in aseg_change along with their rates of change

    Returns
    -------
    aseg_stats : pandas DataFrame
        DataFrame where each row is a scan, and columns are volumes of
        all freesurfer processed regions
    aseg_change : pandas DataFrame
        DataFrame where each row is a scan, and columns are the volumes
        of interest, their rates of change, and icv correction
    """
    # get aseg_stats data from freesurfer processed data
    outfile = '%sFS_aseg_stats.txt' % outdir
    subs, _asegout, _output = extractFSasegstats(datadir, outfile)
    aseg_stats = pd.read_csv(outfile, header=0, delim_whitespace=True)

    # add columns for subject id and MRI timepoint, derived from each
    # subject string returned by extractFSasegstats
    aseg_stats['codea'] = [cf.get_id(sub) for sub in subs]
    aseg_stats['MRI_Tp'] = [cf.get_tp(sub) for sub in subs]
    aseg_stats.drop('Measure:volume', axis=1, inplace=True)

    # get dates of MRI scans that were processed with freesurfer
    mridates = bacs_pet_mri_date_batch(datadir)

    # aseg_change was previously used before it was ever assigned. Build it
    # from aseg_stats first: the merge keys, the intracranial volume that
    # icvcorr needs, and the requested rois.
    # NOTE(review): this column selection is reconstructed from the
    # docstring and the calls below; confirm it matches the intended table.
    changecols = ['codea', 'MRI_Tp', 'IntraCranialVol']
    changecols += [roi for roi in rois if roi not in changecols]
    aseg_change = aseg_stats[changecols]
    aseg_change = pd.merge(aseg_change, mridates, on=['codea', 'MRI_Tp'])

    # map each roi to the name of its icv-corrected column
    rois_icvcorr = dict((roi, '%s_icvcorr' % roi) for roi in rois)
    aseg_change = icvcorr(aseg_change, rois_icvcorr, 'IntraCranialVol')

    # calculate rate of change in years; 'MRI_Scandate' is expected to be
    # supplied by mridates (verify against bacs_pet_mri_date_batch)
    for roi in rois:
        aseg_change = cf.rate_of_change(aseg_change, 'codea', 'MRI_Tp',
                                        'MRI_Scandate', roi, '%s_sl' % roi)

    cf.save_xls_and_pkl(aseg_stats, 'aseg_stats', outdir)
    cf.save_xls_and_pkl(aseg_change, 'aseg_change', outdir)
    return aseg_stats, aseg_change
def pibparams_run(path_pib, pibrename, outdir, pibcutoff):
    """Assemble the PIB table from the PIB spreadsheet and save it.

    Parameters
    ----------
    path_pib : string
        Full path to the PIB *.xls file; sheets 'i' and 'j' are read
    pibrename : dict
        name:rename pairs. Keys are the spreadsheet columns to keep,
        values are the names those columns are given
    outdir : string
        Full path where the final dataframe will be saved
    pibcutoff : float
        PIB_Index values at or above this cutoff are flagged as positive

    Returns
    -------
    pib_df : pandas dataframe
        Result of applying ``f`` to each 'codea' group of the PIB table
    """
    def _flag_positive(index_val):
        # binary PIB status: 1 at or above the cutoff, otherwise 0
        if index_val >= pibcutoff:
            return 1
        return 0

    # sheet 'i' holds the old pib data, sheet 'j' the longitudinal timepoints
    old_sheet = pd.read_excel(path_pib, sheetname='i')
    long_sheet = pd.read_excel(path_pib, sheetname='j')

    # stack both sheets (longitudinal rows first), keep only the columns
    # named in pibrename, and give them their new names
    stacked = pd.concat([long_sheet, old_sheet])
    pib_df = stacked[pibrename.keys()]
    pib_df = pib_df.rename(columns=pibrename)

    pib_df['PIB_Pos'] = pib_df['PIB_Index'].apply(_flag_positive)

    # rate of change of PIB_Index in years
    pib_df = cf.rate_of_change(pib_df, 'codea', 'PIB_Tp', 'PIB_Scandate',
                               'PIB_Index', 'PIB_sl')

    # order scans by timepoint within each subject, then let f fill in the
    # age-of-positivity column one subject at a time
    pib_df = pib_df.sort(columns=['codea', 'PIB_Tp'])
    pib_df['PIB_agepos'] = float('nan')
    pib_df = pib_df.groupby(by='codea').apply(f)

    cf.save_xls_and_pkl(pib_df, 'pibparams', outdir)
    return pib_df
def factoranalysis_run(cogpth, blpth, wpth, outdir, rowind, cogtests_master, **kwargs):
    """Run the cognitive factor-analysis pipeline and save the result.

    The cognitive data sheets are collected, z-scored against a
    reference population, and weighted by the factor-analysis weights.

    Parameters
    ----------
    cogpth : string
        Glob pattern matching the excel sheets of cognitive data; the
        matches are processed in sorted order
    blpth : string
        Full path to excel sheet holding cognitive data for a reference
        population
    wpth : string
        Full path to excel file holding factor weights. Each column in
        this file is a cognitive test, and each row is a factor.
    outdir : string
        Full path where output files should be saved
    rowind : string
        Name of column representing the index value. Currently not used
        by this function, and it has no default value.
    cogtests_master : list
        List of strings naming all cognitive tests that should be
        included in the factor analysis
    **kwargs
        Accepted but currently not used by this function

    Returns
    -------
    subjdata : dict
        Output of cogprep: keys are 'sessX' where X is the session
        number, values are DataFrames of that session's cognitive data
    subjdata_z : pandas DataFrame
        Output of zscore: z-scored subject data, one row per subject
        per cognitive testing session
    cogdata : pandas DataFrame
        Output of factorscores: one row per subject per cognitive
        testing session, with test scores and factor scores as columns
    """
    session_files = sorted(glob(cogpth))

    raw_sessions = cogprep(session_files, cogtests_master)
    zscored = zscore(blpth, raw_sessions, cogtests_master)
    scored = factorscores(zscored, wpth, cogtests_master)

    # only the final factor-score table is written to disk
    cf.save_xls_and_pkl(scored, 'cogdata', outdir)
    return raw_sessions, zscored, scored
def cogtestdates_run(path_cogdates, staticrename, outdir):
    """Reads cognitive testing dates into a dataframe

    Parameters
    ----------
    path_cogdates : string
        Path to the excel sheets holding the dates of cognitive testing
    staticrename : dict
        Dictionary where keys are existing names of columns in the
        cogdates spreadsheet, and values are what to rename the keys
    outdir : string
        Full path where output files should be saved

    Returns
    -------
    testing_out : pandas DataFrame
        DataFrame holding the dates of the cognitive tests for each
        subject at each timepoint
    subjinfo : pandas DataFrame
        DataFrame holding basic subject information
    """
    # import data with neuropsych test dates
    cogdates = pd.read_excel(path_cogdates)
    cogdates.rename(columns=staticrename, inplace=True)
    # list() so the concatenation below also works with Python 3, where
    # dict.values() is a view that cannot be added to a list
    staticcols = list(staticrename.values())

    # split table into basic subject variables, and ones that change with
    # testing session; copy so the new columns are written to an
    # independent frame rather than to a slice of cogdates
    subjinfo = cogdates[['codea'] + staticcols].copy()
    testing = cogdates.drop(staticcols, axis=1)

    # make columns for APOE presence and dose
    subjinfo['APOE_presence'] = subjinfo.apply(APOE_presence, axis=1)
    subjinfo['APOE_dose'] = subjinfo.apply(APOE_dose, axis=1)

    # reconfigure testing table to put tp as row values
    testing_melted = pd.melt(testing, id_vars='codea', var_name='NP_Exam')

    # regex statements to draw tp and test type from column names of the
    # form '<digit>::<type>'
    tp_regex = re.compile(r'(\d)::')
    type_regex = re.compile(r'::(.*)')
    refcol = testing_melted['NP_Exam'].tolist()

    # fill NP_Tp and NP_Type from the regex matches; names that do not
    # match are skipped, so every melted column name is expected to match
    # (otherwise the list would not line up with the rows)
    testing_melted['NP_Tp'] = [sm.group(1) for s in refcol
                               for sm in [tp_regex.search(s)] if sm]
    testing_melted['NP_Type'] = [sm.group(1) for s in refcol
                                 for sm in [type_regex.search(s)] if sm]
    # key identifying one subject at one timepoint
    testing_melted['subtp'] = testing_melted['codea'] + testing_melted['NP_Tp'].map(str)
    # strip digits, whitespace and underscores from the test type names
    pattern = re.compile(r'[\d\s_]+')
    testing_melted['NP_Type'] = [pattern.sub('', s) for s in testing_melted['NP_Type']]

    # reconfigure table to put tests in columns
    testing_piv = testing_melted.pivot(index='subtp', columns='NP_Type',
                                       values='value')
    testing_out = pd.merge(testing_piv.reset_index(), testing_melted, on='subtp')
    testing_out.drop(['NP_Type', 'subtp', 'value', 'NP_Exam'], axis=1, inplace=True)
    testing_out.drop_duplicates(inplace=True)
    testing_out.rename(columns={'AgeatSession': 'NP_Age',
                                'NeuropsychExamTestDate': 'NP_Date'},
                       inplace=True)
    testing_out.dropna(axis=0, subset=['NP_Date'], inplace=True)

    # add column for years relative to baseline (timepoint '1'); copy so
    # the in-place rename/drop below do not act on a slice of testing_out
    timecalc = testing_out[testing_out['NP_Tp'] == '1'].copy()
    timecalc.rename(columns={'NP_Age': 'NP_AgeBL', 'NP_Date': 'NP_DateBL'},
                    inplace=True)
    timecalc.drop(['NP_Tp'], axis=1, inplace=True)
    testing_out = pd.merge(testing_out, timecalc, on='codea')
    testing_out['NP_YrsRelBL'] = (pd.to_datetime(testing_out['NP_Date'])
                                  - pd.to_datetime(testing_out['NP_DateBL']))
    testing_out['NP_YrsRelBL'] = (testing_out['NP_YrsRelBL'].astype('timedelta64[D]')) / 365.25

    cf.save_xls_and_pkl(testing_out, 'cogtestdates', outdir)
    cf.save_xls_and_pkl(subjinfo, 'subjinfo', outdir)
    return testing_out, subjinfo