import os
import json

# NOTE: besides the standard-library imports above, the code below relies on
# project-local helpers that are assumed to be imported at module level
# (their exact module paths depend on the package layout), e.g.:
#   from stats.db_processing import Table
#   fs_definitions, utilities, save_json, get_yes_no, get_stats, DEFAULT,
#   PrepareForGLM, save_2xlsx_with_colors, save_2xlsx_with_colors_Emmanuelle


def save_df_Emmanuelle(df, groups, stats_dic, cols2color_sig, path2save,
                       make_with_colors, extensions=('xlsx', 'csv', 'json')):
    if 'xlsx' in extensions:
        import openpyxl
        import string
        df.to_excel('stats_new.xlsx')

        ########## MERGE MEAN/STD SUB-INDEXES ################
        file = openpyxl.load_workbook('stats_new.xlsx')
        sheet = file['Sheet1']
        alpha = string.ascii_uppercase
        # merge each pair of sub-index header cells (mean/std) on row 2,
        # one pair per group, starting at column B
        # (assumes <= 12 groups, so the merged columns fit within A..Z)
        for ltr in range(1, 2 * len(groups) + 1, 2):
            cell1, cell2 = alpha[ltr] + str(2), alpha[ltr + 1] + str(2)
            sheet.merge_cells(f'{cell1}:{cell2}')
        file.save('stats_new.xlsx')
    if 'json' in extensions:
        utilities.save_json(stats_dic, os.path.join(path2save, 'stats.json'))
    if 'csv' in extensions:
        tab = Table()
        tab.save_df(df, os.path.join(path2save, 'stats_new.csv'),
                    sheet_name='stats')
    if make_with_colors:
        save_2xlsx_with_colors_Emmanuelle(df, 'stats_new.xlsx', path2save,
                                          'stats_wcolors.xlsx',
                                          cols2color_sig=cols2color_sig)
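# Usage sketch (hypothetical arguments; `df`, `stats_dic` and `cols2color_sig`
# are the kind of outputs produced by mkstatisticsf() below, and the target
# folder is assumed to exist):
#
#   save_df_Emmanuelle(df, groups=('group1', 'group2'),
#                      stats_dic=stats_dic,
#                      cols2color_sig=cols2color_sig,
#                      path2save='/tmp/stats',
#                      make_with_colors=True)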
class ClusterFile2CSV():

    def __init__(self, file_abspath, result_abspath):
        from stats.db_processing import Table
        self.contrasts = fs_definitions.GLMcontrasts['contrasts']
        self.get_explanations()

        self.col_4constrasts = "Contrast"
        self.header = ("ClusterNo", "Max", "VtxMax", "Size(mm^2)",
                       "TalX", "TalY", "TalZ", "CWP", "CWPLow", "CWPHi",
                       "NVtxs", "WghtVtx", "Annot", self.col_4constrasts,
                       "Explanation")
        self.length_matrix = len(self.header)
        self.content = open(file_abspath, 'r').readlines()
        self.result_abspath = result_abspath
        self.tab = Table()
        self.ls_vals_2chk = self.contrasts.keys()
        self.run()

    def run(self):
        d = dict()
        i = 0
        while i < len(self.content):
            line = self.content[i].replace('\n', '')
            if self.chk_if_vals_in_line(line):
                expl = self.content[i + 1].replace('\n', '').replace(';', '.')
                d[i] = ['', '', '', '', '', '', '', '', '', '', '', '', '',
                        line, expl]
                i += 2
            else:
                line = self.clean_nans_from_list(line.split(' '))
                i += 1
                if len(line) != 0:
                    d[i] = line + ['', '']
        self.save_2table(d)

    def save_2table(self, d):
        df = self.tab.create_df_from_dict(d).T
        column_names = {i[0]: i[1] for i in zip(df.columns, self.header)}
        df = df.rename(columns=column_names)
        df = df.set_index(df[self.col_4constrasts])
        df = df.drop(columns=[self.col_4constrasts])
        self.tab.save_df(df, self.result_abspath)

    def chk_if_vals_in_line(self, line):
        '''will use each value from self.ls_vals_2chk
            if present in the line: will return True and break
            else: return False
        '''
        exists = False
        for val_2chk in self.ls_vals_2chk:
            if val_2chk in line:
                exists = True
                break
        return exists

    def clean_nans_from_list(self, ls):
        for i in ls[::-1]:
            if i == '':
                ls.remove(i)
        return ls

    def get_explanations(self):
        self.explanations = list()
        for key in self.contrasts:
            for file_name in self.contrasts[key]:
                self.explanations.append(self.contrasts[key][file_name][1])
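# Usage sketch (hypothetical paths; file_abspath points to a FreeSurfer
# cluster summary file, e.g. from mri_glmfit-sim, whose contrast lines contain
# the keys of fs_definitions.GLMcontrasts['contrasts']; the constructor runs
# the conversion immediately):
#
#   ClusterFile2CSV('/path/to/cluster_stats.summary',
#                   '/path/to/cluster_stats.csv')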
class CheckIfReady4GLM():

    def __init__(self, nimb_vars, fs_vars, proj_vars, f_ids_processed,
                 f_GLM_group, FS_GLM_dir):
        self.proj_vars = proj_vars
        self.vars_fs = fs_vars
        self.FS_SUBJECTS_DIR = fs_vars['SUBJECTS_DIR']
        self.NIMB_PROCESSED_FS = fs_vars['NIMB_PROCESSED']
        self.f_ids_processed = f_ids_processed
        self.f_GLM_group = f_GLM_group
        self.FS_GLM_dir = FS_GLM_dir
        self.archive_type = '.zip'
        self.tab = Table()
        self.miss = dict()
        self.ids_4fs_glm = dict()
        self.df = self.tab.get_df(self.f_GLM_group)
        self.bids_ids = self.df[self.proj_vars['id_col']].tolist()
        self.ids_exclude_glm = os.path.join(self.FS_GLM_dir,
                                            'excluded_from_glm.json')

    def chk_if_subjects_ready(self):
        fs_proc_ids = self.get_ids_processed()
        miss_bids_ids = [i for i in self.bids_ids
                         if i not in fs_proc_ids.keys()]
        if miss_bids_ids:
            print(f'    {len(miss_bids_ids)} IDs are missing from file: '
                  f'{self.f_ids_processed}')
            # show a sample of the missing IDs
            print(f'    first 5 IDs are: {miss_bids_ids[:5]}')
            for bids_id in miss_bids_ids:
                self.add_to_miss(bids_id, 'id_missing')
        if len(miss_bids_ids) < len(fs_proc_ids.keys()):
            for bids_id in [i for i in self.bids_ids
                            if i not in miss_bids_ids]:
                fs_proc_id = fs_proc_ids[bids_id].replace(self.archive_type, '')
                if os.path.exists(os.path.join(self.FS_SUBJECTS_DIR, bids_id)):
                    self.ids_4fs_glm[bids_id] = bids_id
                    self.chk_glm_files(bids_id)
                elif os.path.exists(os.path.join(self.FS_SUBJECTS_DIR,
                                                 fs_proc_id)):
                    self.ids_4fs_glm[bids_id] = fs_proc_id
                    self.chk_glm_files(fs_proc_id)
                else:
                    print(f'    id {bids_id} or freesurfer id {fs_proc_id} '
                          f'is missing from the {self.FS_SUBJECTS_DIR} folder')
                    self.add_to_miss(bids_id, 'id_missing')
            if self.miss.keys():
                print("    missing files and ids: ", self.miss)
                save_json(self.miss, self.ids_exclude_glm, print_space=8)
                subjs_missing = len(self.miss.keys())
                subjs_present = len(self.ids_4fs_glm.keys())
                print('    Number of participants ready for FreeSurfer GLM:')
                print(f'        in the folder: {self.FS_SUBJECTS_DIR}')
                print(f'        {subjs_present} present')
                print(f'        {subjs_missing} missing')
                not_ready = [i for i in self.miss
                             if "id_missing" not in self.miss[i]]
                maybe_archived = [i for i in self.miss if i not in not_ready]
                if maybe_archived:
                    print("    MAYBE archived: ", maybe_archived)
                    q = ("    EXCEPTION! Some IDs are missing, but they could be archived.\n"
                         "    Do you want to do the glm analysis with the current subjects (y) "
                         "or try to check the archive (n)? (y/n)\n"
                         "    (note: if you answer NO, you will be asked to unarchive the\n"
                         "    processed folders of IDs if they are present in FREESURFER_PROCESSED)")
                    if get_yes_no(q) == 1:
                        self.create_fs_glm_df()
                        return True, list()
                    else:
                        return False, maybe_archived
                if not_ready:
                    print("    MISSING FILES: these participants CANNOT be "
                          "included in the GLM analysis: ", not_ready)
                    q = ("    EXCEPTION! Some IDs have missing files and they MUST be "
                         "excluded from the analysis.\n"
                         "    Do you want to continue without the excluded IDs? (y/n)")
                    if get_yes_no(q) == 1:
                        self.create_fs_glm_df()
                        return True, list()
                    else:
                        return False, not_ready
            else:
                self.create_fs_glm_df()
                return True, list()
        else:
            print('    no ids found')
            return False, list()

    def chk_glm_files(self, bids_id):
        '''it is expected that the BIDS IDs are located in
            FREESURFER -> SUBJECTS_DIR;
            checks that the files expected for GLM are present
        Args:
            bids_id: ID of the subject to check
        Return:
            populates the dict of missing subjects (self.miss)
            populates the dict of usable ids (self.ids_4fs_glm)
        '''
        files_not_ok = fs_definitions.ChkFSQcache(self.FS_SUBJECTS_DIR,
                                                  bids_id, self.vars_fs).miss
        if files_not_ok:
            for file in files_not_ok[bids_id]:
                self.add_to_miss(bids_id, file)
            return False
        else:
            return True

    def create_fs_glm_df(self):
        self.rm_missing_ids()
        tmp_id = 'fs_id'
        print('    creating the glm file for the FreeSurfer GLM analysis')
        d_ids = {self.proj_vars['id_col']: list(self.ids_4fs_glm.keys()),
                 tmp_id: list(self.ids_4fs_glm.values())}
        fs_proc_df = self.tab.create_df_from_dict(d_ids)
        fs_proc_df = self.tab.change_index(fs_proc_df,
                                           self.proj_vars['id_col'])
        grid_fs_df_pre = self.tab.change_index(self.df,
                                               self.proj_vars['id_col'])
        self.df_ids = self.tab.join_dfs(grid_fs_df_pre, fs_proc_df,
                                        how='outer')
        self.df_ids.rename(columns={tmp_id: self.proj_vars['id_col']},
                           inplace=True)
        self.df_ids = self.tab.change_index(self.df_ids,
                                            self.proj_vars['id_col'])
        self.tab.save_df(self.df_ids, self.f_GLM_group)
        PrepareForGLM(self.FS_SUBJECTS_DIR, self.FS_GLM_dir,
                      self.f_GLM_group, self.proj_vars, self.vars_fs)

    def rm_missing_ids(self):
        ls_ix_2rm = list()
        for ix in self.df.index:
            bids_id = self.df.at[ix, self.proj_vars['id_col']]
            if bids_id not in self.ids_4fs_glm.keys():
                ls_ix_2rm.append(ix)
        len_miss = len(ls_ix_2rm)
        if len_miss == 0:
            print('    ALL subjects are present')
        else:
            print(f'    {len_miss} subjects are missing')
            print('    they will be removed from further analysis')
        self.df = self.df.drop(ls_ix_2rm)

    def get_ids_processed(self):
        '''retrieves the BIDS names of the IDs provided in the GLM file.
            It is expected that each project has a group of subjects that
            are present in the dataset, and that the BIDS names are the ones
            used in the groups_glm file for the ids.
            The f_ids.json file has the BIDS names of the subjects and, for
            each BIDS name, the corresponding names of the source
            file/freesurfer/nilearn/dipy processed zipped files;
            see nimb/example/f_ids.json
        '''
        print('    extracting the list of ids that were processed with FreeSurfer')
        print(f'    in the file: {self.f_ids_processed}')
        self.ids_bids_proc_all = self.read_json(self.f_ids_processed)
        return {i: self.ids_bids_proc_all[i][DEFAULT.freesurfer_key]
                for i in self.ids_bids_proc_all}

    def add_to_miss(self, bids_id, file):
        '''add to the dict of missing subjects
        '''
        if bids_id not in self.miss:
            self.miss[bids_id] = list()
        self.miss[bids_id].append(file)
        if bids_id in self.ids_4fs_glm:
            self.ids_4fs_glm.pop(bids_id, None)

    def read_json(self, f):
        '''read a json file
        '''
        with open(f, 'r') as jf:
            return json.load(jf)
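# Usage sketch (hypothetical values; the dict layouts follow the nimb config
# conventions used above, e.g. fs_vars must contain 'SUBJECTS_DIR' and
# 'NIMB_PROCESSED', and proj_vars must contain 'id_col'):
#
#   checker = CheckIfReady4GLM(nimb_vars, fs_vars, proj_vars,
#                              f_ids_processed='f_ids.json',
#                              f_GLM_group='glm_group.csv',
#                              FS_GLM_dir='/path/to/fs_glm')
#   ready, excluded_ids = checker.chk_if_subjects_ready()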
def mkstatisticsf(df_4stats, groups, group_col, path2save,
                  make_with_colors=True):
    '''Creates a descriptive statistics file for publication,
        based on the provided pandas.DataFrame.
        Works only on 2 groups.
    Args:
        df_4stats: pandas.DataFrame
        groups: list/tuple of groups as str/int
        group_col: str() column name in df_4stats that holds the group names
            from groups
        path2save: abspath to save the descriptive files
        make_with_colors: will create an additional .xlsx file with colored
            significant results, provided xlwt is installed
    Return:
        json file with results
        .csv file with results
        .xlsx file with results, with the significant ones colored in red
    '''
    tab = Table()
    ls_tests = ('mean', 'std', 'kurtosis', 'skewness',
                'TTest', 'Welch', 'ANOVA', 'Bartlett',
                'MannWhitneyu', 'Kruskal')
    groups_df = dict()
    for group in groups:
        groups_df[group] = tab.get_df_per_parameter(df_4stats,
                                                    group_col, group)
    stats_dic = dict()
    vals2chk = df_4stats.columns.tolist()
    if group_col in vals2chk:
        vals2chk.remove(group_col)
    cols2color_sig = list()
    groups = list(groups_df.keys())
    group1 = groups_df[groups[0]]
    group2 = groups_df[groups[1]]
    for test in ls_tests:
        for val in vals2chk:
            results, params = get_stats(test, group1[val], group2[val])
            if test in ('mean', 'std', 'kurtosis', 'skewness'):
                key1 = f'{groups[0]}, {params[0]}'
                key2 = f'{groups[1]}, {params[0]}'
            else:
                key1 = f'{test}, {params[0]}'
                key2 = f'{test}, {params[1]}'
                # only the statistical-test columns are candidates
                # for significance coloring
                cols2color_sig.append(key2)
            for key in (key1, key2):
                if key not in stats_dic:
                    stats_dic[key] = dict()
            stats_dic[key1][val] = f'{results[0]}'
            stats_dic[key2][val] = f'{results[1]}'
    df = tab.create_df_from_dict(stats_dic)
    tab.save_df(df, os.path.join(path2save, 'stats_general.csv'),
                sheet_name='stats')
    utilities.save_json(stats_dic,
                        os.path.join(path2save, 'stats_general.json'))
    if make_with_colors:
        save_2xlsx_with_colors(df, path2save=path2save,
                               cols2color_sig=cols2color_sig)
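# A minimal smoke-test sketch (assumptions: the project-local helpers used by
# mkstatisticsf (Table, get_stats, utilities) resolve at import time, the
# current folder is writable, and 'age'/'group' are made-up toy columns):
if __name__ == '__main__':
    import pandas as pd

    _toy = pd.DataFrame({'age':   [20, 22, 21, 30, 31, 29],
                         'group': ['A', 'A', 'A', 'B', 'B', 'B']})
    # writes stats_general.csv and stats_general.json to the current folder
    mkstatisticsf(_toy, groups=('A', 'B'), group_col='group',
                  path2save='.', make_with_colors=False)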