class CheckIfReady4GLM():

    def __init__(self, nimb_vars, fs_vars, proj_vars, f_ids_processed,
                 f_GLM_group, FS_GLM_dir):
        self.proj_vars = proj_vars
        self.vars_fs = fs_vars
        self.FS_SUBJECTS_DIR = fs_vars['SUBJECTS_DIR']
        self.NIMB_PROCESSED_FS = fs_vars['NIMB_PROCESSED']
        self.f_ids_processed = f_ids_processed
        self.f_GLM_group = f_GLM_group
        self.FS_GLM_dir = FS_GLM_dir
        self.archive_type = '.zip'
        self.tab = Table()
        self.miss = dict()
        self.ids_4fs_glm = dict()
        self.df = self.tab.get_df(self.f_GLM_group)
        self.bids_ids = self.df[self.proj_vars['id_col']].tolist()
        self.ids_exclude_glm = os.path.join(self.FS_GLM_dir,
                                            'excluded_from_glm.json')

    def chk_if_subjects_ready(self):
        fs_proc_ids = self.get_ids_processed()
        miss_bids_ids = [i for i in self.bids_ids
                         if i not in fs_proc_ids.keys()]
        if miss_bids_ids:
            print(f'    {len(miss_bids_ids)} IDs are missing from file: {self.f_ids_processed}')
            print(f'    first 5 missing IDs are: {miss_bids_ids[:5]}')
            for bids_id in miss_bids_ids:
                self.add_to_miss(bids_id, 'id_missing')
        if len(miss_bids_ids) < len(fs_proc_ids.keys()):
            for bids_id in [i for i in self.bids_ids
                            if i not in miss_bids_ids]:
                fs_proc_id = fs_proc_ids[bids_id].replace(self.archive_type, '')
                if os.path.exists(os.path.join(self.FS_SUBJECTS_DIR, bids_id)):
                    self.ids_4fs_glm[bids_id] = bids_id
                    self.chk_glm_files(bids_id)
                elif os.path.exists(os.path.join(self.FS_SUBJECTS_DIR, fs_proc_id)):
                    self.ids_4fs_glm[bids_id] = fs_proc_id
                    self.chk_glm_files(fs_proc_id)
                else:
                    print(f'id {bids_id} or freesurfer id {fs_proc_id} '
                          f'is missing from the {self.FS_SUBJECTS_DIR} folder')
                    self.add_to_miss(bids_id, 'id_missing')
            if self.miss.keys():
                print("    missing files and ids: ", self.miss)
                save_json(self.miss, self.ids_exclude_glm, print_space=8)
                subjs_missing = len(self.miss.keys())
                subjs_present = len(self.ids_4fs_glm.keys())
                print('    Number of participants ready for FreeSurfer GLM:')
                print(f'        in the folder: {self.FS_SUBJECTS_DIR}')
                print(f'        {subjs_present} present')
                print(f'        {subjs_missing} missing')
                not_ready = [i for i in self.miss
                             if "id_missing" not in self.miss[i]]
                maybe_archived = [i for i in self.miss if i not in not_ready]
                if maybe_archived:
                    print("    MAYBE archived: ", maybe_archived)
                    q = ("    EXCEPTION! Some IDs are missing, but they could be archived.\n"
                         "    Do you want to do the GLM analysis with the current subjects (y)"
                         " or try to check the archive (n)? (y/n)\n"
                         "    (note: if you answer NO, you will be asked to unarchive the\n"
                         "    processed folders of IDs if they are present in FREESURFER_PROCESSED)")
                    if get_yes_no(q) == 1:
                        self.create_fs_glm_df()
                        return True, list()
                    else:
                        return False, maybe_archived
                if not_ready:
                    print("    MISSING FILES: these participants CANNOT be included"
                          " in the GLM analysis: ", not_ready)
                    q = ("    EXCEPTION! Some IDs have missing files and MUST be"
                         " excluded from the analysis.\n"
                         "    Do you want to continue without the excluded IDs? (y/n)")
                    if get_yes_no(q) == 1:
                        self.create_fs_glm_df()
                        return True, list()
                    else:
                        return False, not_ready
            else:
                self.create_fs_glm_df()
                return True, list()
        else:
            print('    no ids found')
            return False, list()

    def chk_glm_files(self, bids_id):
        '''it is expected that the BIDS IDs are located in FREESURFER -> SUBJECTS_DIR
            the script checks if the subjects are present
        Args:
            bids_id: ID of the subject to check
        Return:
            populates the list of missing subjects
            populates the dict with ids
        '''
        files_not_ok = fs_definitions.ChkFSQcache(self.FS_SUBJECTS_DIR,
                                                  bids_id,
                                                  self.vars_fs).miss
        if files_not_ok:
            for file in files_not_ok[bids_id]:
                self.add_to_miss(bids_id, file)
            return False
        else:
            return True

    def create_fs_glm_df(self):
        self.rm_missing_ids()
        tmp_id = 'fs_id'
        print('    creating the glm file for the FreeSurfer GLM analysis')
        d_ids = {self.proj_vars['id_col']: list(self.ids_4fs_glm.keys()),
                 tmp_id: list(self.ids_4fs_glm.values())}
        fs_proc_df = self.tab.create_df_from_dict(d_ids)
        fs_proc_df = self.tab.change_index(fs_proc_df,
                                           self.proj_vars['id_col'])
        grid_fs_df_pre = self.tab.change_index(self.df,
                                               self.proj_vars['id_col'])
        self.df_ids = self.tab.join_dfs(grid_fs_df_pre, fs_proc_df, how='outer')
        self.df_ids.rename(columns={tmp_id: self.proj_vars['id_col']},
                           inplace=True)
        self.df_ids = self.tab.change_index(self.df_ids,
                                            self.proj_vars['id_col'])
        self.tab.save_df(self.df_ids, self.f_GLM_group)
        PrepareForGLM(self.FS_SUBJECTS_DIR,
                      self.FS_GLM_dir,
                      self.f_GLM_group,
                      self.proj_vars,
                      self.vars_fs)

    def rm_missing_ids(self):
        ls_ix_2rm = list()
        for ix in self.df.index:
            bids_id = self.df.at[ix, self.proj_vars['id_col']]
            if bids_id not in self.ids_4fs_glm.keys():
                ls_ix_2rm.append(ix)
        len_miss = len(ls_ix_2rm)
        if len_miss == 0:
            print('    ALL subjects are present')
        else:
            print(f'    {len_miss} subjects are missing')
            print('    they will be removed from further analysis')
        self.df = self.df.drop(ls_ix_2rm)

    def get_ids_processed(self):
        '''retrieves the BIDS names of the IDs provided in the GLM file.
            It is expected that each project has a group of subjects that
            are present in the dataset.
            It is expected that the BIDS names are the ones used in the
            groups_glm file for the ids.
            The f_ids.json file has the BIDS names of the subjects and,
            for each BIDS name, the corresponding names of the source
            file / freesurfer / nilearn / dipy processed zipped files.
            see nimb/example/f_ids.json
        '''
        print('    extracting the list of ids that were processed with FreeSurfer')
        print(f'    in the file: {self.f_ids_processed}')
        self.ids_bids_proc_all = self.read_json(self.f_ids_processed)
        return {i: self.ids_bids_proc_all[i][DEFAULT.freesurfer_key]
                for i in self.ids_bids_proc_all}
        # return {i: 'path' for i in self.ids_bids_proc_all if self.ids_bids_proc_all[i]['source'] in ids_src_glm_file}  # old version

    def add_to_miss(self, bids_id, file):
        '''add to the list of missing subjects
        '''
        if bids_id not in self.miss:
            self.miss[bids_id] = list()
        self.miss[bids_id].append(file)
        # a subject with any missing file cannot be used for the GLM
        self.ids_4fs_glm.pop(bids_id, None)

    def read_json(self, f):
        '''read a json file
        '''
        with open(f, 'r') as jf:
            return json.load(jf)
class RUN_stats():
    """will run the statistical analysis for the provided groups file"""

    def __init__(self, all_vars):
        self.project = all_vars.params.project
        self.project_vars = all_vars.projects[self.project]
        self.stats_paths = self.project_vars['STATS_PATHS']
        self.stats_params = self.project_vars['STATS_PARAMS']
        self.group_col = self.project_vars['group_col']
        self.dir_stats_home = self.stats_paths["STATS_HOME"]
        self.atlas = ('DK', 'DS', 'DKDS')[1]  # currently set to 'DS'
        self.get_steps(all_vars)
        print(f'    Performing statistical analysis in folder: {self.dir_stats_home}')
        print('    materials located at: {:<50}'.format(self.project_vars['materials_DIR'][1]))
        print('    file for analysis: {:<50}'.format(self.project_vars['fname_groups']))
        print('    id column: {:<50}'.format(str(self.project_vars['id_col'])))
        print('    group column: {:<50}'.format(str(self.project_vars['group_col'])))
        # print('    variables to analyse: {:<50}'.format(str(self.project_vars['variables_for_glm'])))
        self.tab = Table()
        self.preproc = preprocessing.Preprocess()
        (self.df_user_stats,
         self.df_final_grid,
         self.df_adjusted,
         self.cols_X,
         self.groups) = MakeGrid(self.project_vars).grid()

    def run(self):
        print("running")
        for step in self.steps:
            step2run = self.steps[step]['name']
            if self.steps[step]["run"]:
                print(f"    running step: {step2run}")
                self.run_step(step2run)

    def run_step(self, step2run):
        self.use_features = False
        self.feature_algo = 'PCA'  # 'RFE'
        for group in ['all', ] + self.groups:  # 'all' stands for all groups
            df_X, y_labeled, X_scaled, df_clin_group = \
                self.get_X_data_per_group_all_groups(group)
            df_with_features, features, features_rfe_and_rank_df = \
                self.get_features_df_per_group(group, X_scaled, y_labeled, df_X)
            if group == 'all':
                self.params_y = self.project_vars['variables_for_glm']

                # STEP run general stats
                if step2run == "STEP_stats_ttest":
                    from stats.stats_stats import ttest_do
                    variables = self.params_y + df_X.columns.tolist()
                    dir_2save = varia.get_dir(path.join(self.dir_stats_home, group))
                    ttest_res = ttest_do(self.tab.join_dfs(df_clin_group, df_X),
                                         self.group_col,
                                         variables,
                                         self.groups,
                                         dir_2save,
                                         p_thresh=0.05).res_ttest

                # STEP run ANOVA and Simple Linear Regression
                if step2run == "STEP_Anova":
                    from stats.stats_models import ANOVA_do
                    print('performing ANOVA')
                    sig_cols = self.run_anova(features, 0.05, 0.05)

                if step2run == "STEP_SimpLinReg":
                    print('performing Simple Linear Regression on all columns')
                    from stats.plotting import Make_Plot_Regression, Make_plot_group_difference
                    dir_2save = varia.get_dir(self.stats_paths['simp_lin_reg_dir'])
                    param_features = self.run_anova(features, 1.0, 1.0)
                    Make_Plot_Regression(self.df_final_grid,
                                         param_features,
                                         self.group_col,
                                         dir_2save)
                    dir_2save = varia.get_dir(self.stats_paths['anova'])
                    Make_plot_group_difference(self.df_final_grid,
                                               param_features,
                                               self.group_col,
                                               self.groups,
                                               dir_2save)
                    # from stats.stats_groups_anova import RUN_GroupAnalysis_ANOVA_SimpleLinearRegression
                    # dir_2save = varia.get_dir(path.join(self.dir_stats_home,
                    #                                     self.stats_paths['anova'] + "_" + group))
                    # RUN_GroupAnalysis_ANOVA_SimpleLinearRegression(self.df_final_grid,
                    #                                                groups,
                    #                                                self.params_y,
                    #                                                self.project_vars['other_params'],
                    #                                                dir_2save,
                    #                                                self.group_col,
                    #                                                features)

                # STEP run ANOVA and Simple Logistic Regression
                if step2run == "STEP_LogisticRegression":
                    from stats import stats_LogisticRegression
                    print('performing Logistic Regression for all groups')
                    dir_2save = varia.get_dir(
                        path.join(self.dir_stats_home,
                                  self.stats_paths['logistic_regression_dir'] + "_" + group))
                    stats_LogisticRegression.Logistic_Regression(X_scaled,
                                                                 y_labeled,
                                                                 self.group_col,
                                                                 dir_2save)

                # STEP run Prediction RF SKF
                if step2run == "STEP_Predict_RF_SKF":
                    print('    performing RF SKF Prediction for all groups')
                    df_X_scaled = self.tab.create_df(X_scaled,
                                                     index_col=range(X_scaled.shape[0]),
                                                     cols=self.cols_X)
                    accuracy, best_estimator, average_score_list, _ = \
                        predict.SKF_algorithm(features,
                                              df_X_scaled[features].values,
                                              y_labeled)
                    print("    prediction accuracy computed with RF and SKF"
                          " based on PCA features is: ", accuracy)
                    # accuracy, best_estimator, average_score_list, _ = \
                    #     predict.SKF_algorithm(features_rfe_and_rank_df.feature,
                    #                           df_X_scaled[features_rfe_and_rank_df.feature].values,
                    #                           y_labeled)
                    # print("prediction accuracy computed with RF and SKF based on RFE features is: ", accuracy)

                # STEP run Prediction RF LOO
                if step2run == "STEP_Predict_RF_LOO":
                    print('performing RF Leave-One-Out Prediction for all groups')
                    df_X_scaled = self.tab.create_df(X_scaled,
                                                     index_col=range(X_scaled.shape[0]),
                                                     cols=self.cols_X)
                    accuracy, best_estimator, average_score_list, _ = \
                        predict.LOO_algorithm(features,
                                              df_X_scaled[features].values,
                                              y_labeled)
                    print("    prediction accuracy computed with RF and LOO"
                          " based on PCA features is: ", accuracy)
                    accuracy, best_estimator, average_score_list, _ = \
                        predict.LOO_algorithm(features_rfe_and_rank_df.feature,
                                              df_X_scaled[features_rfe_and_rank_df.feature].values,
                                              y_labeled)
                    print("    prediction accuracy computed with RF and LOO"
                          " based on RFE features is: ", accuracy)
            else:
                # run Descriptive Statistics
                dir_2save = varia.get_dir(path.join(self.dir_stats_home,
                                                    'description'))
                self.run_descriptive_stats(df_clin_group, features, dir_2save)

            # STEP run Linear Regression Moderation
            if step2run == "STEP_LinRegModeration":
                from stats import stats_models
                print('performing Linear Regression Moderation analysis')
                stats_models.linreg_moderation_results(
                    self.df_final_grid,
                    features,
                    self.project_vars['group_param'],
                    self.project_vars['regression_param'],
                    varia.get_dir(path.join(self.dir_stats_home,
                                            self.stats_paths['linreg_moderation_dir'])),
                    group)

            # STEP run Laterality
            if step2run == "STEP_Laterality":
                from processing.atlases.atlas_definitions import RReplace
                from stats import stats_laterality
                print('performing Laterality analysis')
                lhrh_feat_d = RReplace(features).contralateral_features
                lhrh_features_list = list(lhrh_feat_d.keys()) + \
                                     list(lhrh_feat_d.values())
                df_with_features_lhrh = self.tab.get_df_from_df(
                    df_X, usecols=sorted(lhrh_features_list))
                stats_laterality.LateralityAnalysis(
                    df_with_features_lhrh,
                    lhrh_feat_d,
                    group,
                    varia.get_dir(path.join(self.dir_stats_home,
                                            self.stats_paths['laterality_dir']))).run()

    def run_descriptive_stats(self, df_clin_group, features, dir_2save):
        print('running descriptive statistics')

    def run_anova(self, features, p_thresh, intercept_thresh):
        from stats.stats_models import ANOVA_do
        dir_2save = varia.get_dir(self.stats_paths['anova'])
        return ANOVA_do(self.df_final_grid,
                        self.params_y,
                        features,
                        dir_2save,
                        p_thresh=p_thresh,
                        intercept_thresh=intercept_thresh).sig_cols

    def get_X_data_per_group_all_groups(self, group):
        # extract the X_scaled values for the brain parameters
        predicted_target = self.project_vars["prediction_target"]
        print(f"    predicted target column is: {predicted_target}")
        if not predicted_target:
            predicted_target = self.group_col
        if group == 'all':
            df_clin_group = self.df_user_stats
            df_X = self.df_adjusted
            y_labeled = preprocessing.label_y(self.df_user_stats,
                                              predicted_target)
            X_scaled = preprocessing.scale_X(df_X)
        else:
            df_group = self.tab.get_df_per_parameter(self.df_final_grid,
                                                     self.group_col, group)
            df_clin_group = self.tab.rm_cols_from_df(df_group, self.cols_X)
            df_X = self.tab.rm_cols_from_df(
                df_group,
                [i for i in df_group.columns.tolist() if i not in self.cols_X])
            y_labeled = preprocessing.label_y(df_group, predicted_target)
            X_scaled = preprocessing.scale_X(df_X)
        return df_X, y_labeled, X_scaled, df_clin_group

    def log(self):
        stats = predict.get_stats_df(
            len(self.cols_X),
            self.atlas,
            self.stats_params["prediction_vars"]['nr_threads'],
            definitions.sys.platform,
            time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))

    def get_features_df_per_group(self, group, X_scaled, y_labeled, df_X):
        features_rfe_and_rank_df = 'none'
        if self.use_features:
            if self.feature_algo == 'PCA':  # using PCA
                dir_2save = varia.get_dir(path.join(self.dir_stats_home,
                                                    self.stats_paths['features']))
                pca_threshold = self.stats_params["prediction_vars"]['pca_threshold']
                features = predict.get_features_based_on_pca(dir_2save,
                                                             pca_threshold,
                                                             X_scaled,
                                                             self.cols_X,
                                                             group,
                                                             self.atlas)
            elif self.feature_algo == 'RFE':  # using RFE
                features, features_rfe_and_rank_df = predict.feature_ranking(
                    X_scaled, y_labeled, self.cols_X)
                print("    number of features extracted by RFE: ",
                      len(features_rfe_and_rank_df.feature))
            df_with_features = self.tab.get_df_from_df(df_X, usecols=features)
        else:
            df_with_features = self.tab.get_df_from_df(df_X, usecols=self.cols_X)
            features = self.cols_X
        return df_with_features, features, features_rfe_and_rank_df

    def get_steps(self, all_vars):
        self.steps = {
            "groups":     {"name": "STEP0_make_groups",       "run": False},
            "ttest":      {"name": "STEP_stats_ttest",        "run": False},
            "anova":      {"name": "STEP_Anova",              "run": False},
            "simplinreg": {"name": "STEP_SimpLinReg",         "run": False},
            "logreg":     {"name": "STEP_LogisticRegression", "run": False},
            "predskf":    {"name": "STEP_Predict_RF_SKF",     "run": False},
            "predloo":    {"name": "STEP_Predict_RF_LOO",     "run": False},
            "linregmod":  {"name": "STEP_LinRegModeration",   "run": False},
            "laterality": {"name": "STEP_Laterality",         "run": False},
        }
        if all_vars.params.step == 0:
            # step 0 runs every step
            for step in self.steps:
                self.steps[step]["run"] = True
        else:
            self.steps[all_vars.params.step]["run"] = True
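# Usage sketch (hypothetical names): `all_vars` is assumed to be the NIMB
# configuration object built from the CLI, exposing params.project,
# params.step and the projects dict used above; `Get_Vars` is a hypothetical
# entry point standing in for however the caller builds that object.
#
#     all_vars = Get_Vars(args)      # hypothetical
#     runner = RUN_stats(all_vars)
#     runner.run()                   # executes every step flagged "run": True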