import pandas as pd
import pingouin as pg
from scipy.stats import pearsonr


def cronbach_alpha_scale_if_deleted(df):
    """Return the overall Cronbach's alpha plus SPSS-style
    'if item deleted' statistics for each item column in ``df``."""
    gca = pg.cronbach_alpha(df)
    rows = []
    for column in df:
        sub_df = df.drop([column], axis=1)
        ac = pg.cronbach_alpha(sub_df)
        scale_mean = sub_df.mean().sum()
        variance = sub_df.sum(axis=1).var()
        pr = pearsonr(sub_df.mean(axis=1), df[column])
        rows.append({
            'Item': column,
            "Scale Mean if Item Deleted": scale_mean,
            "Scale Variance if Item Deleted": variance,
            "Corrected Item-Total Correlation": pr[0],
            "Cronbach's Alpha if Item Deleted": ac[0]
        })
    # Build the result frame in one pass (pandas' DataFrame.append is
    # deprecated).
    result = pd.DataFrame(rows, columns=[
        "Item", "Scale Mean if Item Deleted",
        "Scale Variance if Item Deleted",
        "Corrected Item-Total Correlation",
        "Cronbach's Alpha if Item Deleted"
    ])
    return [gca, result]
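# A minimal usage sketch (illustrative only): the item names and scores below
# are fabricated, but they show the expected wide layout -- one row per
# respondent, one column per item -- and what the helper returns.
def _alpha_if_deleted_example():
    toy = pd.DataFrame({
        "q1": [3, 4, 2, 5, 4],
        "q2": [3, 5, 2, 4, 4],
        "q3": [2, 4, 3, 5, 3],
    })
    overall_alpha, item_table = cronbach_alpha_scale_if_deleted(toy)
    print(overall_alpha)  # (alpha, 95% CI) tuple from pingouin
    print(item_table)     # SPSS-style "if item deleted" table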
def benchmark_reproducibility(comb, modality, alg, sub_dict_clean, disc,
                              int_consist, final_missingness_summary):
    # NOTE: `mets`, `ids`, `icc`, `gen_sub_vec`, `discr_stat`, `flatten`,
    # the sklearn imputers/scaler, `np`, and `pd` are resolved from the
    # enclosing module in the original script.
    df_summary = pd.DataFrame(
        columns=['grid', 'modality', 'embedding', 'discriminability'])
    print(comb)
    df_summary.at[0, "modality"] = modality
    df_summary.at[0, "embedding"] = alg

    if modality == 'func':
        try:
            extract, hpass, model, res, atlas, smooth = comb
        except ValueError:
            print(f"Missing {comb}...")
            extract, hpass, model, res, atlas = comb
            smooth = '0'
        comb_tuple = (atlas, extract, hpass, model, res, smooth)
    else:
        directget, minlength, model, res, atlas, tol = comb
        comb_tuple = (atlas, directget, minlength, model, res, tol)
    df_summary.at[0, "grid"] = comb_tuple

    missing_sub_seshes = \
        final_missingness_summary.loc[
            (final_missingness_summary['alg'] == alg)
            & (final_missingness_summary['modality'] == modality)
            & (final_missingness_summary['grid'] == comb_tuple)
        ].drop_duplicates(subset='id')

    # Internal consistency (Cronbach's alpha across visits)
    if int_consist is True and alg == 'topology':
        try:
            import pingouin as pg
        except ImportError:
            print("Cannot evaluate test-retest int_consist. pingouin"
                  " must be installed!")
        for met in mets:
            id_dict = {}
            for ID in ids:
                id_dict[ID] = {}
                for ses in sub_dict_clean[ID].keys():
                    if comb_tuple in sub_dict_clean[ID][ses][modality][
                            alg].keys():
                        id_dict[ID][ses] = \
                            sub_dict_clean[ID][ses][modality][alg][comb_tuple][
                                mets.index(met)][0]
            df_wide = pd.DataFrame(id_dict).T
            if df_wide.empty:
                del df_wide
                return pd.Series()
            df_wide = df_wide.add_prefix(f"{met}_visit_")
            df_wide.replace(0, np.nan, inplace=True)
            try:
                c_alpha = pg.cronbach_alpha(data=df_wide)
            except BaseException:
                print('FAILED...')
                print(df_wide)
                del df_wide
                return pd.Series()
            df_summary.at[0, f"cronbach_alpha_{met}"] = c_alpha[0]
            del df_wide

    # ICC across sessions
    if icc is True and alg == 'topology':
        try:
            import pingouin as pg
        except ImportError:
            print("Cannot evaluate ICC. pingouin"
                  " must be installed!")
        for met in mets:
            id_dict = {}
            dfs = []
            for ses in [str(i) for i in range(1, 11)]:
                for ID in ids:
                    id_dict[ID] = {}
                    if comb_tuple in sub_dict_clean[ID][ses][modality][
                            alg].keys():
                        id_dict[ID][ses] = \
                            sub_dict_clean[ID][ses][modality][alg][comb_tuple][
                                mets.index(met)][0]
                df = pd.DataFrame(id_dict).T
                if df.empty:
                    del df
                    return pd.Series()
                df.columns.values[0] = f"{met}"
                df.replace(0, np.nan, inplace=True)
                df['id'] = df.index
                df['ses'] = ses
                df.reset_index(drop=True, inplace=True)
                dfs.append(df)
            df_long = pd.concat(dfs, names=['id', 'ses', f"{met}"]).drop(
                columns=[str(i) for i in range(1, 10)])
            try:
                c_icc = pg.intraclass_corr(data=df_long, targets='id',
                                           raters='ses', ratings=f"{met}",
                                           nan_policy='omit').round(3)
                c_icc = c_icc.set_index("Type")
                df_summary.at[0, f"icc_{met}"] = pd.DataFrame(
                    c_icc.drop(
                        index=['ICC1', 'ICC2', 'ICC3'])['ICC']).mean()[0]
            except BaseException:
                print('FAILED...')
                print(df_long)
                del df_long
                return pd.Series()
            del df_long

    # Discriminability
    if disc is True:
        vect_all = []
        for ID in ids:
            try:
                out = gen_sub_vec(sub_dict_clean, ID, modality, alg,
                                  comb_tuple)
            except BaseException:
                print(f"{ID} {modality} {alg} {comb_tuple} failed...")
                continue
            # print(out)
            vect_all.append(out)
        vect_all = [
            i for i in vect_all if i is not None and not np.isnan(i).all()
        ]
        if len(vect_all) > 0:
            if alg == 'topology':
                X_top = np.swapaxes(np.hstack(vect_all), 0, 1)
                bad_ixs = [i[1] for i in np.argwhere(np.isnan(X_top))]
                for m in set(bad_ixs):
                    if (X_top.shape[0] - bad_ixs.count(m)) / \
                            X_top.shape[0] < 0.50:
                        X_top = np.delete(X_top, m, axis=1)
            else:
                if len(vect_all) > 0:
                    X_top = np.array(pd.concat(vect_all, axis=0))
                else:
                    return pd.Series()
            shapes = []
            for ix, i in enumerate(vect_all):
                shapes.append(i.shape[0] * [list(ids)[ix]])
            Y = np.array(list(flatten(shapes)))
            if alg == 'topology':
                imp = IterativeImputer(max_iter=50, random_state=42)
            else:
                imp = SimpleImputer()
            X_top = imp.fit_transform(X_top)
            scaler = StandardScaler()
            X_top = scaler.fit_transform(X_top)
            try:
                discr_stat_val, rdf = discr_stat(X_top, Y)
            except BaseException:
                return pd.Series()
            df_summary.at[0, "discriminability"] = discr_stat_val
            print(discr_stat_val)
            print("\n")
            # print(rdf)
            del discr_stat_val
        del vect_all
    return df_summary
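# Stand-alone sketch of the ICC call above, on a toy long-format frame. The
# column names mirror the ones used in benchmark_reproducibility(); the values
# are fabricated purely to illustrate the expected layout.
def _icc_example():
    import numpy as np
    import pandas as pd
    import pingouin as pg

    rng = np.random.default_rng(0)
    df_long = pd.DataFrame({
        "id": np.repeat([f"sub-{i}" for i in range(1, 6)], 2),
        "ses": ["1", "2"] * 5,
        "global_efficiency": rng.normal(0.5, 0.05, 10),
    })
    c_icc = pg.intraclass_corr(data=df_long, targets="id", raters="ses",
                               ratings="global_efficiency",
                               nan_policy="omit").set_index("Type")
    # Average the average-rater ICC forms, as the benchmark does after
    # dropping the single-rater forms.
    return c_icc.drop(index=["ICC1", "ICC2", "ICC3"])["ICC"].mean()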
def main():
    # Initialize project
    # NOTE: `outfile`, `f_getFilePath`, `f_powerTest`, `f_correlation`,
    # `f_scatterplot`, `datetime`, `pingouin`, and `moses` (presumably
    # sklearn.model_selection) are imported/defined at module level in the
    # original script.
    print('Project: Iris Classification', file=outfile)
    print('Author: Aakriti Sinha', file=outfile)
    print('Last run on ', datetime.now(), file=outfile)

    # --------------------------------------------------------------------------
    # Raw Data
    # Get raw dataframe
    from data.make_dataset import df_iris

    # Describe raw data
    print('\nRaw Dataset Snapshot', file=outfile)
    print(df_iris.head(), '\n', file=outfile)
    print('\nRaw Data Description', file=outfile)
    print(df_iris.describe(), '\n', file=outfile)
    print('List of categories in categorical variable', file=outfile)
    print(df_iris['species'].unique(), '\n', file=outfile)

    # --------------------------------------------------------------------------
    # Data Cleaning
    # Get tidy dataframe
    from data.clean_data import df_iris, missing_message
    print(missing_message)

    # Describe clean data
    print('\n\nClean Dataset Snapshot', file=outfile)
    print(df_iris.head(), '\n', file=outfile)
    print('\nClean Data Description', file=outfile)
    data_desc = df_iris.describe()
    print(data_desc, '\n', file=outfile)
    print('List of categories in categorical variable', file=outfile)
    cat_list = df_iris['species'].unique()
    print(cat_list, '\n', file=outfile)
    print('Distribution of categories', file=outfile)
    cat_dist = df_iris.groupby('species').count()
    print(cat_dist, file=outfile)

    # Save clean data description report
    abs_file_path = f_getFilePath("reports\\iris_clean_description.txt")
    cleandescfile = open(abs_file_path, 'w')
    print('\nClean Data Description', file=cleandescfile)
    print(data_desc, '\n', file=cleandescfile)
    print('List of categories in categorical variable', file=cleandescfile)
    print(cat_list, '\n', file=cleandescfile)
    print('Distribution of categories', file=cleandescfile)
    print(cat_dist, file=cleandescfile)
    cleandescfile.close()

    # --------------------------------------------------------------------------
    # Test power of dataset
    f_powerTest(df_iris)

    # --------------------------------------------------------------------------
    # Feature Scaling
    print('\n\nFeature Scaling: Centering, Standardizing and Normalizing',
          file=outfile)
    from data.scale_data import df_iris

    # Describe scaled data
    print('\nScaled Dataset Snapshot', file=outfile)
    print(df_iris.head(), '\n', file=outfile)
    print('\nScaled Data Description', file=outfile)
    data_desc = df_iris.describe()
    print(data_desc, '\n', file=outfile)
    print('List of categories in categorical variable', file=outfile)
    cat_list = df_iris['species'].unique()
    print(cat_list, '\n', file=outfile)
    print('Distribution of categories', file=outfile)
    cat_dist = df_iris.groupby('species').count()
    print(cat_dist, file=outfile)

    # Save scaled data description report
    abs_file_path = f_getFilePath("reports\\iris_scaled_description.txt")
    scaledescfile = open(abs_file_path, 'w')
    print('\nScaled Data Description', file=scaledescfile)
    print(data_desc, '\n', file=scaledescfile)
    print('List of categories in categorical variable', file=scaledescfile)
    print(cat_list, '\n', file=scaledescfile)
    print('Distribution of categories', file=scaledescfile)
    print(cat_dist, file=scaledescfile)
    scaledescfile.close()

    # --------------------------------------------------------------------------
    # Check Correlation
    corr_csv_name = 'reports\\correlation.csv'
    corr_image_name = 'reports\\figures\\Correlation_Heatmap.png'
    correlation = f_correlation(df_iris, corr_csv_name, corr_image_name)

    # Scatterplot Matrix
    scplt_image_name = 'reports\\figures\\Scatterplot_Matrix.png'
    f_scatterplot(df_iris, scplt_image_name)
    print('\n**MULTICOLLINEARITY FOUND**', file=outfile)

    # --------------------------------------------------------------------------
    # Factor Analysis
    print('\nFACTOR ANALYSIS\n', file=outfile)
    # Testing factorability
    # f_testFactorability(df_iris, correlation)
    from features.factor_analysis import df_iris_scores
    df_iris_scores['species'] = df_iris['species']

    # Check Correlation
    corr_csv_name = 'reports\\correlation_factors.csv'
    corr_image_name = 'reports\\figures\\Correlation_Heatmap_Factors.png'
    correlation = f_correlation(df_iris_scores, corr_csv_name, corr_image_name)

    # Scatterplot Matrix
    scplt_image_name = 'reports\\figures\\Scatterplot_Matrix_Factors.png'
    f_scatterplot(df_iris_scores, scplt_image_name)

    print('\n**Factor 2 has low correlation with Species. '
          'So dropping Factor 2**\n', file=outfile)
    df_iris_scores = df_iris_scores.drop('Factor2', axis=1)

    # Save selected feature scores
    abs_file_path = f_getFilePath("data\\processed\\iris_scores.csv")
    df_iris_scores.to_csv(abs_file_path, index=False, encoding='utf-8')

    # Describe selected features
    print('\nSelected Features Snapshot', file=outfile)
    print(df_iris_scores.head(), '\n', file=outfile)
    print('\nSelected Features Description', file=outfile)
    data_desc = df_iris_scores.describe()
    print(data_desc, file=outfile)
    print('List of categories in categorical variable', file=outfile)
    cat_list = df_iris['species'].unique()
    print(cat_list, '\n', file=outfile)
    print('Distribution of categories', file=outfile)
    cat_dist = df_iris.groupby('species').count()
    print(cat_dist, file=outfile)

    # Save selected factors description report
    abs_file_path = f_getFilePath('reports\\iris_factors_description.txt')
    fadescfile = open(abs_file_path, 'w')
    print(data_desc, file=fadescfile)
    print('\nList of categories in categorical variable\n', cat_list,
          file=fadescfile)
    print('\nDistribution of categories\n', cat_dist, file=fadescfile)
    print('\nCronbach Alpha: ', pingouin.cronbach_alpha(df_iris_scores),
          file=fadescfile)
    fadescfile.close()

    # --------------------------------------------------------------------------
    # Model Development
    # Train-Test Split
    print(cat_list)
    print(df_iris_scores.iloc[:, :-1].head())
    print(df_iris_scores.iloc[:, -1].head())
    # train_x, test_x, train_y, test_y = moses.train_test_split(
    #     df_iris_scores.iloc[:, :-1], df_iris_scores.iloc[:, -1],
    #     train_size=0.7, test_size=0.3, random_state=42)
    train_x, test_x, train_y, test_y = moses.train_test_split(
        df_iris_scores.iloc[:, :-1], df_iris_scores.iloc[:, -1],
        train_size=0.7, test_size=0.3, random_state=42,
        stratify=df_iris_scores.iloc[:, -1])
    abs_file_path = f_getFilePath("data\\processed\\iris_train_x.csv")
    train_x.to_csv(abs_file_path, index=False, encoding='utf-8')
    abs_file_path = f_getFilePath("data\\processed\\iris_train_y.csv")
    train_y.to_csv(abs_file_path, header=['species'], index=False,
                   encoding='utf-8')
    abs_file_path = f_getFilePath("data\\processed\\iris_test_x.csv")
    test_x.to_csv(abs_file_path, index=False, encoding='utf-8')
    abs_file_path = f_getFilePath("data\\processed\\iris_test_y.csv")
    test_y.to_csv(abs_file_path, header=['species'], index=False,
                  encoding='utf-8')

    # Train model
    import models.train_model
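# The `moses` alias above is assumed to be sklearn.model_selection imported at
# module level. A self-contained sketch of the same stratified 70/30 split,
# using sklearn's bundled iris data instead of the project's factor scores:
def _split_example():
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    iris = load_iris(as_frame=True)
    X, y = iris.data, iris.target
    # Stratify on the label so class proportions match in train and test.
    return train_test_split(X, y, train_size=0.7, test_size=0.3,
                            random_state=42, stratify=y)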
def main():
    import sys
    import os
    from datetime import datetime
    from joblib import Parallel, delayed
    import tempfile
    import dill
    from pynets.statistics.utils import make_subject_dict, cleanNullTerms, \
        get_ensembles_top, get_ensembles_embedding, \
        build_grid
    from colorama import Fore, Style
    try:
        import pynets
    except ImportError:
        print(
            "PyNets not installed! Ensure that you are referencing the correct"
            " site-packages and using Python3.6+")

    if len(sys.argv) < 1:
        print("\nMissing command-line inputs! See help options with the -h"
              " flag.\n")
        sys.exit(1)

    # Parse inputs
    #base_dir = '/scratch/04171/dpisner/HNU/HNU_outs/triple'
    #base_dir = '/scratch/04171/dpisner/HNU/HNU_outs/outputs_language'
    #base_dir = '/working/hcp_test_retest'
    base_dir = '/media/dpys/data/HCP_trt'
    thr_type = "MST"
    icc = False
    disc = True
    int_consist = False
    modality = 'func'

    embedding_types = ['ASE', 'OMNI', 'betweenness', 'eigenvector']
    parcellations = ['intersection', 'language', 'ventral', 'union']
    # template = 'CN200'
    template = 'MNI152_T1'
    mets = []

    metaparams_func = [
        "parcellation", "granularity", "model", 'hpass', 'signal', 'tol'
    ]
    metaparams_dwi = [
        "parcellation", "granularity", "model", 'traversal', 'minlength', 'tol'
    ]

    #sessions = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
    sessions = ['1', '2']
    ####

    print(f"{Fore.LIGHTBLUE_EX}\nBenchmarking API\n")
    print(Style.RESET_ALL)
    print(f"{Fore.LIGHTGREEN_EX}Gathering sampled data...")
    print(Style.RESET_ALL)

    for embedding_type in embedding_types:
        subject_dict_file_path = (
            f"{base_dir}/pynets_subject_dict_{modality}_"
            f"{embedding_type}_{template}_{parcellations}.pkl")
        subject_mod_grids_file_path = (
            f"{base_dir}/pynets_modality_grids_{modality}_"
            f"{embedding_type}_{template}_{parcellations}.pkl")
        missingness_summary = (
            f"{base_dir}/pynets_missingness_summary_{modality}_"
            f"{embedding_type}_{template}_{parcellations}.csv")
        icc_tmps_dir = f"{base_dir}/icc_tmps/{parcellations}_{modality}_" \
                       f"{embedding_type}"
        os.makedirs(icc_tmps_dir, exist_ok=True)

        if not os.path.isfile(subject_dict_file_path):
            subject_dict, modality_grids, missingness_frames = \
                make_subject_dict(
                    [modality], base_dir, thr_type, mets, [embedding_type],
                    template, sessions, parcellations
                )
            sub_dict_clean = cleanNullTerms(subject_dict)
            missingness_frames = [
                i for i in missingness_frames if isinstance(i, pd.DataFrame)
            ]
            if len(missingness_frames) > 1:
                final_missingness_summary = pd.concat(missingness_frames)
                final_missingness_summary.to_csv(missingness_summary,
                                                 index=False)
                final_missingness_summary.id = \
                    final_missingness_summary.id.astype(
                        'str').str.split('_', expand=True)[0]
            elif len(missingness_frames) == 1:
                final_missingness_summary = missingness_frames[0]
                final_missingness_summary.to_csv(missingness_summary,
                                                 index=False)
                final_missingness_summary.id = \
                    final_missingness_summary.id.astype(
                        'str').str.split('_', expand=True)[0]
            else:
                final_missingness_summary = pd.Series()
            with open(subject_dict_file_path, "wb") as f:
                dill.dump(sub_dict_clean, f)
            with open(subject_mod_grids_file_path, "wb") as f:
                dill.dump(modality_grids, f)
        else:
            with open(subject_dict_file_path, 'rb') as f:
                sub_dict_clean = dill.load(f)
            with open(subject_mod_grids_file_path, "rb") as f:
                modality_grids = dill.load(f)
            if os.path.isfile(missingness_summary):
                final_missingness_summary = pd.read_csv(missingness_summary)
                final_missingness_summary.id = \
                    final_missingness_summary.id.astype('str').str.split(
                        '_', expand=True)[0]
            else:
                final_missingness_summary = pd.Series()

        ids = sub_dict_clean.keys()

        # print(f"MODALITY: {modality}")
        metaparams = eval(f"metaparams_{modality}")
        metaparam_dict = {}

        # print(f"EMBEDDING TYPE: {embedding_type}")
        # if os.path.isfile(f"{base_dir}/grid_clean_{modality}_{alg}.csv"):
        #     continue

        if embedding_type == 'topology':
            ensembles, df_top = get_ensembles_top(modality, thr_type,
                                                  f"{base_dir}/pynets")
        else:
            ensembles = get_ensembles_embedding(modality, embedding_type,
                                                base_dir)

        grid = build_grid(modality, metaparam_dict,
                          sorted(list(set(metaparams))), ensembles)[1]

        grid = [i for i in grid if any(n in i for n in parcellations)]

        good_grids = []
        for grid_param in grid:
            grid_finds = []
            for ID in ids:
                if ID not in sub_dict_clean.keys():
                    print(f"ID: {ID} not found...")
                    continue
                if str(sessions[0]) not in sub_dict_clean[ID].keys():
                    print(f"Session: {sessions[0]} not found for ID {ID}...")
                    continue
                if modality not in sub_dict_clean[ID][str(sessions[0])].keys():
                    print(f"Modality: {modality} not found for ID {ID}, "
                          f"ses-{sessions[0]}...")
                    continue
                if embedding_type not in \
                        sub_dict_clean[ID][str(sessions[0])][modality].keys():
                    print(f"Modality: {modality} not found for ID {ID}, "
                          f"ses-{sessions[0]}, {embedding_type}...")
                    continue
                if grid_param in \
                        list(sub_dict_clean[ID][str(sessions[0])][modality][
                            embedding_type].keys()):
                    grid_finds.append(grid_param)
            if len(grid_finds) < 0.75 * len(ids):
                print(f"Less than 75% of {grid_param} found. Removing from "
                      f"grid...")
                continue
            else:
                good_grids.append(grid_param)
        modality_grids[modality] = good_grids

        cache_dir = tempfile.mkdtemp()

        with Parallel(n_jobs=-1, backend='loky', verbose=10,
                      max_nbytes='200000M',
                      temp_folder=cache_dir) as parallel:
            outs = parallel(
                delayed(benchmark_reproducibility)(
                    base_dir, comb, modality, embedding_type, sub_dict_clean,
                    disc, final_missingness_summary, icc_tmps_dir, icc,
                    mets, ids, template) for comb in grid)
        # outs = []
        # for comb in grid:
        #     outs.append(benchmark_reproducibility(
        #         base_dir, comb, modality, embedding_type, sub_dict_clean,
        #         disc, final_missingness_summary, icc_tmps_dir, icc,
        #         mets, ids, template
        #     ))

        df_summary = pd.concat(
            [i for i in outs if i is not None and not i.empty], axis=0)
        df_summary = df_summary.dropna(axis=0, how='all')
        print(f"Saving to {base_dir}/grid_clean_{modality}_{embedding_type}_"
              f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}.csv...")
        df_summary.to_csv(
            f"{base_dir}"
            f"/grid_clean_{modality}_{embedding_type}_{parcellations}_"
            f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}"
            f".csv", index=False)

        # Internal consistency
        if int_consist is True and embedding_type == 'topology':
            try:
                import pingouin as pg
            except ImportError:
                print("Cannot evaluate test-retest int_consist. pingouin"
                      " must be installed!")

            df_summary_cronbach = pd.DataFrame(
                columns=['modality', 'embedding', 'cronbach'])
            df_summary_cronbach.at[0, "modality"] = modality
            df_summary_cronbach.at[0, "embedding"] = embedding_type
            for met in mets:
                cronbach_ses_list = []
                for ses in range(1, len(sessions)):
                    id_dict = {}
                    for ID in ids:
                        id_dict[ID] = {}
                        for comb in grid:
                            if modality == 'func':
                                try:
                                    signal, hpass, model, granularity, atlas, \
                                        tol = comb
                                except BaseException:
                                    print(f"Missing {comb}...")
                                    signal, hpass, model, granularity, \
                                        atlas = comb
                                    tol = '0'
                                comb_tuple = (atlas, signal, hpass, model,
                                              granularity, tol)
                            else:
                                traversal, minlength, model, granularity, \
                                    atlas, tol = comb
                                comb_tuple = (atlas, traversal, minlength,
                                              model, granularity, tol)
                            if comb_tuple in sub_dict_clean[ID][str(
                                    ses)][modality][embedding_type].keys():
                                if isinstance(
                                        sub_dict_clean[ID][str(ses)][modality]
                                        [embedding_type][comb_tuple],
                                        np.ndarray):
                                    id_dict[ID][comb] = sub_dict_clean[ID][str(
                                        ses)][modality][embedding_type][
                                            comb_tuple][mets.index(met)][0]
                                else:
                                    continue
                            else:
                                continue
                    df_wide = pd.DataFrame(id_dict)
                    if df_wide.empty:
                        continue
                    df_wide = df_wide.add_prefix(f"{met}_comb_")
                    df_wide.replace(0, np.nan, inplace=True)
                    print(df_wide)
                    try:
                        c_alpha = pg.cronbach_alpha(data=df_wide.dropna(
                            axis=1, how='all'), nan_policy='listwise')
                        cronbach_ses_list.append(c_alpha[0])
                    except BaseException:
                        print('FAILED...')
                        print(df_wide)
                    del df_wide
                df_summary_cronbach.at[0, f"average_cronbach_{met}"] = \
                    np.nanmean(cronbach_ses_list)
            print(f"Saving to {base_dir}/grid_clean_{modality}_"
                  f"{embedding_type}_cronbach_"
                  f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}.csv...")
            df_summary_cronbach.to_csv(
                f"{base_dir}/grid_clean_{modality}_"
                f"{embedding_type}_cronbach_"
                f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}"
                f".csv", index=False)
    return
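# Stand-alone sketch of the dill-based caching pattern used above: serialize
# the cleaned subject dictionary once, then reload it on later runs instead of
# rebuilding it. The path below is a placeholder, not one from the pipeline.
def _cache_subject_dict(obj, path="/tmp/pynets_subject_dict_example.pkl"):
    import os
    import dill

    if not os.path.isfile(path):
        with open(path, "wb") as f:
            dill.dump(obj, f)
    with open(path, "rb") as f:
        return dill.load(f)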
def main():
    import sys
    import os
    from datetime import datetime
    from joblib import Parallel, delayed
    import tempfile
    import dill
    from pynets.stats.utils import make_subject_dict, cleanNullTerms, \
        get_ensembles_top, get_ensembles_embedding, \
        build_grid
    from colorama import Fore, Style
    try:
        import pynets
    except ImportError:
        print(
            "PyNets not installed! Ensure that you are referencing the correct"
            " site-packages and using Python3.6+")

    if len(sys.argv) < 1:
        print("\nMissing command-line inputs! See help options with the -h"
              " flag.\n")
        sys.exit(1)

    #### Parse inputs
    base_dir = '/scratch/04171/dpisner/HNU/HNU_outs/archives/triple_network'
    #base_dir = '/scratch/04171/dpisner/HNU/HNU_outs/outputs_language'
    thr_type = "MST"
    icc = True
    disc = True
    int_consist = False
    modality = 'func'

    embedding_types = ['betweenness']
    #rsns = ['language']
    rsns = ['triple', 'kmeans']
    template = 'CN200'
    # template = 'MNI152_T1'
    mets = [
        "global_efficiency", "average_shortest_path_length",
        "degree_assortativity_coefficient",
        "average_betweenness_centrality",
        "average_eigenvector_centrality", "smallworldness", "modularity"
    ]

    metaparams_func = ["rsn", "res", "model", 'hpass', 'extract', 'smooth']
    metaparams_dwi = ["rsn", "res", "model", 'directget', 'minlength', 'tol']

    sessions = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
    ####

    print(f"{Fore.LIGHTBLUE_EX}\nBenchmarking API\n")
    print(Style.RESET_ALL)
    print(f"{Fore.LIGHTGREEN_EX}Gathering sampled data...")
    print(Style.RESET_ALL)

    for embedding_type in embedding_types:
        subject_dict_file_path = (
            f"{base_dir}/pynets_subject_dict_{modality}_"
            f"{embedding_type}_{template}.pkl")
        subject_mod_grids_file_path = (
            f"{base_dir}/pynets_modality_grids_{modality}_"
            f"{embedding_type}_{template}.pkl")
        missingness_summary = (
            f"{base_dir}/pynets_missingness_summary_{modality}_"
            f"{embedding_type}_{template}.csv")
        icc_tmps_dir = f"{base_dir}/icc_tmps/{modality}_" \
                       f"{embedding_type}"
        os.makedirs(icc_tmps_dir, exist_ok=True)

        if not os.path.isfile(subject_dict_file_path):
            subject_dict, modality_grids, missingness_frames = \
                make_subject_dict(
                    [modality], base_dir, thr_type, mets, [embedding_type],
                    template, sessions, rsns)
            sub_dict_clean = cleanNullTerms(subject_dict)
            missingness_frames = [
                i for i in missingness_frames if isinstance(i, pd.DataFrame)
            ]
            if len(missingness_frames) > 1:
                final_missingness_summary = pd.concat(missingness_frames)
                final_missingness_summary.to_csv(missingness_summary,
                                                 index=False)
                final_missingness_summary.id = \
                    final_missingness_summary.id.astype(
                        'str').str.split('_', expand=True)[0]
            elif len(missingness_frames) == 1:
                final_missingness_summary = missingness_frames[0]
                final_missingness_summary.to_csv(missingness_summary,
                                                 index=False)
                final_missingness_summary.id = \
                    final_missingness_summary.id.astype(
                        'str').str.split('_', expand=True)[0]
            else:
                final_missingness_summary = pd.Series()
            with open(subject_dict_file_path, "wb") as f:
                dill.dump(sub_dict_clean, f)
            with open(subject_mod_grids_file_path, "wb") as f:
                dill.dump(modality_grids, f)
        else:
            with open(subject_dict_file_path, 'rb') as f:
                sub_dict_clean = dill.load(f)
            with open(subject_mod_grids_file_path, "rb") as f:
                modality_grids = dill.load(f)
            if os.path.isfile(missingness_summary):
                final_missingness_summary = pd.read_csv(missingness_summary)
                final_missingness_summary.id = \
                    final_missingness_summary.id.astype('str').str.split(
                        '_', expand=True)[0]
            else:
                final_missingness_summary = pd.Series()

        ids = sub_dict_clean.keys()

        # print(f"MODALITY: {modality}")
        metaparams = eval(f"metaparams_{modality}")
        metaparam_dict = {}

        # print(f"EMBEDDING TYPE: {embedding_type}")
        # if os.path.isfile(f"{base_dir}/grid_clean_{modality}_{alg}.csv"):
        #     continue

        if embedding_type == 'topology':
            ensembles, df_top = get_ensembles_top(modality, thr_type,
                                                  f"{base_dir}/pynets")
        else:
            ensembles = get_ensembles_embedding(modality, embedding_type,
                                                base_dir)

        grid = build_grid(modality, metaparam_dict,
                          sorted(list(set(metaparams))), ensembles)[1]

        grid = [
            i for i in grid
            if '200' not in i and '400' not in i and '600' not in i
            and '800' not in i
        ]

        modality_grids[modality] = grid

        cache_dir = tempfile.mkdtemp()

        with Parallel(n_jobs=-1, require="sharedmem", backend='threading',
                      verbose=10, max_nbytes='200000M',
                      temp_folder=cache_dir) as parallel:
            outs = parallel(
                delayed(benchmark_reproducibility)(
                    base_dir, comb, modality, embedding_type, sub_dict_clean,
                    disc, final_missingness_summary, icc_tmps_dir, icc,
                    mets, ids, template) for comb in grid)
        # outs = []
        # for comb in grid:
        #     outs.append(benchmark_reproducibility(
        #         base_dir, comb, modality, embedding_type, sub_dict_clean,
        #         disc, final_missingness_summary, icc_tmps_dir, icc,
        #         mets, ids))

        df_summary = pd.concat(
            [i for i in outs if i is not None and not i.empty], axis=0)
        df_summary = df_summary.dropna(axis=0, how='all')
        print(f"Saving to {base_dir}/grid_clean_{modality}_{embedding_type}_"
              f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}.csv...")
        df_summary.to_csv(
            f"{base_dir}"
            f"/grid_clean_{modality}_{embedding_type}_"
            f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}.csv",
            index=False)

        # Internal consistency
        if int_consist is True and embedding_type == 'topology':
            try:
                import pingouin as pg
            except ImportError:
                print("Cannot evaluate test-retest int_consist. pingouin"
                      " must be installed!")

            df_summary_cronbach = pd.DataFrame(
                columns=['modality', 'embedding', 'cronbach'])
            df_summary_cronbach.at[0, "modality"] = modality
            df_summary_cronbach.at[0, "embedding"] = embedding_type
            for met in mets:
                cronbach_ses_list = []
                for ses in range(1, 10):
                    id_dict = {}
                    for ID in ids:
                        id_dict[ID] = {}
                        for comb in grid:
                            if modality == 'func':
                                try:
                                    extract, hpass, model, res, atlas, \
                                        smooth = comb
                                except BaseException:
                                    print(f"Missing {comb}...")
                                    extract, hpass, model, res, atlas = comb
                                    smooth = '0'
                                comb_tuple = (atlas, extract, hpass, model,
                                              res, smooth)
                            else:
                                directget, minlength, model, res, atlas, \
                                    tol = comb
                                comb_tuple = (atlas, directget, minlength,
                                              model, res, tol)
                            if comb_tuple in sub_dict_clean[ID][str(
                                    ses)][modality][embedding_type].keys():
                                if isinstance(
                                        sub_dict_clean[ID][str(ses)][modality]
                                        [embedding_type][comb_tuple],
                                        np.ndarray):
                                    id_dict[ID][comb] = sub_dict_clean[ID][str(
                                        ses)][modality][embedding_type][
                                            comb_tuple][mets.index(met)][0]
                                else:
                                    continue
                            else:
                                continue
                    df_wide = pd.DataFrame(id_dict)
                    if df_wide.empty:
                        continue
                    df_wide = df_wide.add_prefix(f"{met}_comb_")
                    df_wide.replace(0, np.nan, inplace=True)
                    print(df_wide)
                    try:
                        c_alpha = pg.cronbach_alpha(data=df_wide.dropna(
                            axis=1, how='all'), nan_policy='listwise')
                        cronbach_ses_list.append(c_alpha[0])
                    except BaseException:
                        print('FAILED...')
                        print(df_wide)
                    del df_wide
                df_summary_cronbach.at[0, f"average_cronbach_{met}"] = \
                    np.nanmean(cronbach_ses_list)
            print(f"Saving to {base_dir}/grid_clean_{modality}_"
                  f"{embedding_type}_cronbach_"
                  f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}.csv...")
            df_summary_cronbach.to_csv(
                f"{base_dir}/grid_clean_{modality}_{embedding_type}_cronbach_"
                f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}.csv",
                index=False)
    return
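# Self-contained sketch of the per-session alpha averaging used in both
# drivers above: build one wide frame per session, compute alpha with listwise
# NaN handling, skip failures, then nanmean across sessions. All values here
# are fabricated for illustration.
def _average_alpha_example():
    import numpy as np
    import pandas as pd
    import pingouin as pg

    rng = np.random.default_rng(1)
    alphas = []
    for ses in ("1", "2"):
        df_wide = pd.DataFrame(rng.normal(size=(10, 4)),
                               columns=[f"met_comb_{i}" for i in range(4)])
        df_wide.iloc[0, 0] = np.nan  # simulate a missing observation
        try:
            alpha, _ = pg.cronbach_alpha(data=df_wide, nan_policy="listwise")
            alphas.append(alpha)
        except BaseException:
            continue
    return np.nanmean(alphas) if alphas else np.nan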
fa.fit(x)
loads = fa.loadings_
pd.DataFrame.from_records(loads)
# A, P, BI, O


# In[ ]:


# Cronbach's alpha -- new method
# Create the factors (one DataFrame of item columns per subscale)
factor1 = x[['A1', 'A2', 'A3', 'A4']]
factor2 = x[['P1', 'P2', 'P3', 'P4']]
factor3 = x[['BI1', 'BI2', 'BI3', 'BI4']]
factor4 = x[['O1', 'O2', 'O3']]

# Get Cronbach's alpha for each subscale
factor1_alpha = pg.cronbach_alpha(factor1)
factor2_alpha = pg.cronbach_alpha(factor2)
factor3_alpha = pg.cronbach_alpha(factor3)
factor4_alpha = pg.cronbach_alpha(factor4)
print(factor1_alpha, factor2_alpha, factor3_alpha, factor4_alpha)
# The alphas evaluated are 0.84, 0.68, 0.86, and 0.65


# # Covariance: ANCOVA
# https://www.statology.org/ancova-python/

# # Hypothesis Testing using Pearson

# In[ ]:
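# Equivalent loop form of the per-factor alphas above (illustrative): assumes
# `x` and `pg` from the preceding cells, and compares each alpha against the
# conventional 0.70 rule of thumb for internal consistency.
subscales = {
    "A": ['A1', 'A2', 'A3', 'A4'],
    "P": ['P1', 'P2', 'P3', 'P4'],
    "BI": ['BI1', 'BI2', 'BI3', 'BI4'],
    "O": ['O1', 'O2', 'O3'],
}
for name, items in subscales.items():
    alpha, ci = pg.cronbach_alpha(x[items])
    flag = "acceptable" if alpha >= 0.70 else "below 0.70"
    print(f"{name}: alpha={alpha:.2f}, 95% CI={ci} ({flag})")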