def collect_nm(processing_dir, collect=False, binary=False):
    """This function checks and collects all batches.

    ** Input:
        * processing_dir -> Full path to the processing directory
        * collect        -> If True, data is checked for failed batches
                            and collected; if False, data is just checked
        * binary         -> If True, read/write binary (.pkl) files instead
                            of text (.txt) files

    ** Output:
        * Text files containing all results across all batches
          the combined output

    written by (primarily) T Wolfers, (adapted) SM Kia
    """

    # import of necessary modules
    import os
    import sys
    import glob
    import numpy as np
    import pandas as pd
    try:
        import nispat.fileio as fileio
    except ImportError:
        # fall back to a local fileio module next to this file
        path = os.path.abspath(os.path.dirname(__file__))
        if path not in sys.path:
            sys.path.append(path)
        del path
        import fileio

    if binary:
        file_extentions = '.pkl'
    else:
        file_extentions = '.txt'

    # detect number of subjects, batches, hyperparameters and CV
    file_example = glob.glob(processing_dir + 'batch_1/' + 'yhat' +
                             file_extentions)
    if binary is False:
        file_example = fileio.load(file_example[0])
    else:
        file_example = pd.read_pickle(file_example[0])
    numsubjects = file_example.shape[0]
    batch_size = file_example.shape[1]

    # defaults in case no hyperparameter files are present
    nHyp = 0
    n_crossval = 0
    all_Hyptxt = glob.glob(processing_dir + 'batch_*/' + 'Hyp*')
    if all_Hyptxt != []:
        first_Hyptxt = fileio.load(all_Hyptxt[0])
        first_Hyptxt = first_Hyptxt.transpose()
        nHyp = len(first_Hyptxt)
        dir_first_Hyptxt = os.path.dirname(all_Hyptxt[0])
        all_crossval = glob.glob(dir_first_Hyptxt + '/' + 'Hyp*')
        n_crossval = len(all_crossval)

    # artificially creates files for batches that were not executed
    count = 0
    batch_fail = []
    for batch in glob.glob(processing_dir + 'batch_*/'):
        filepath = glob.glob(batch + 'yhat*')
        if filepath == []:
            count = count + 1
            batch1 = glob.glob(batch + '/*.sh')
            print(batch1)
            batch_fail.append(batch1)
            if collect is True:
                pRho = np.ones(batch_size)
                pRho = pRho.transpose()
                pRho = pd.Series(pRho)
                fileio.save(pRho, batch + 'pRho' + file_extentions)

                Rho = np.zeros(batch_size)
                Rho = Rho.transpose()
                Rho = pd.Series(Rho)
                fileio.save(Rho, batch + 'Rho' + file_extentions)

                rmse = np.zeros(batch_size)
                rmse = rmse.transpose()
                rmse = pd.Series(rmse)
                fileio.save(rmse, batch + 'rmse' + file_extentions)

                smse = np.zeros(batch_size)
                smse = smse.transpose()
                smse = pd.Series(smse)
                fileio.save(smse, batch + 'smse' + file_extentions)

                expv = np.zeros(batch_size)
                expv = expv.transpose()
                expv = pd.Series(expv)
                fileio.save(expv, batch + 'expv' + file_extentions)

                msll = np.zeros(batch_size)
                msll = msll.transpose()
                msll = pd.Series(msll)
                fileio.save(msll, batch + 'msll' + file_extentions)

                yhat = np.zeros([batch_size, numsubjects])
                yhat = pd.DataFrame(yhat)
                fileio.save(yhat, batch + 'yhat' + file_extentions)

                ys2 = np.zeros([batch_size, numsubjects])
                ys2 = pd.DataFrame(ys2)
                fileio.save(ys2, batch + 'ys2' + file_extentions)

                Z = np.zeros([batch_size, numsubjects])
                Z = pd.DataFrame(Z)
                fileio.save(Z, batch + 'Z' + file_extentions)

                for n in range(1, n_crossval + 1):
                    hyp = np.zeros([batch_size, nHyp])
                    hyp = pd.DataFrame(hyp)
                    # one dummy hyperparameter file per cross-validation fold,
                    # named so that the collection step below can find it
                    fileio.save(hyp, batch + 'Hyp_' + str(n) +
                                file_extentions)
        else:
            # if more than 10% of yhat is nan then consider the batch as
            # a failed batch
            yhat = fileio.load(filepath[0])
            if np.count_nonzero(~np.isnan(yhat)) / (np.prod(yhat.shape)) < 0.9:
                count = count + 1
                batch1 = glob.glob(batch + '/*.sh')
                print('More than 10% nans in ' + batch1[0])
                batch_fail.append(batch1)

    # list batches that were not executed
    print('Number of batches that failed: ' + str(count))
    batch_fail_df = pd.DataFrame(batch_fail)
    if file_extentions == '.txt':
        fileio.save_pd(batch_fail_df, processing_dir + 'failed_batches' +
                       file_extentions)
    else:
        fileio.save(batch_fail_df, processing_dir + 'failed_batches' +
                    file_extentions)

    # combines all output files across batches
    if collect is True:
        pRho_filenames = glob.glob(processing_dir + 'batch_*/' + 'pRho*')
        if pRho_filenames:
            pRho_filenames = fileio.sort_nicely(pRho_filenames)
            pRho_dfs = []
            for pRho_filename in pRho_filenames:
                pRho_dfs.append(pd.DataFrame(fileio.load(pRho_filename)))
            pRho_combined = pd.concat(pRho_dfs, ignore_index=True)
            fileio.save(pRho_combined, processing_dir + 'pRho' +
                        file_extentions)

        Rho_filenames = glob.glob(processing_dir + 'batch_*/' + 'Rho*')
        if Rho_filenames:
            Rho_filenames = fileio.sort_nicely(Rho_filenames)
            Rho_dfs = []
            for Rho_filename in Rho_filenames:
                Rho_dfs.append(pd.DataFrame(fileio.load(Rho_filename)))
            Rho_combined = pd.concat(Rho_dfs, ignore_index=True)
            fileio.save(Rho_combined, processing_dir + 'Rho' +
                        file_extentions)

        Z_filenames = glob.glob(processing_dir + 'batch_*/' + 'Z*')
        if Z_filenames:
            Z_filenames = fileio.sort_nicely(Z_filenames)
            Z_dfs = []
            for Z_filename in Z_filenames:
                Z_dfs.append(pd.DataFrame(fileio.load(Z_filename)))
            Z_combined = pd.concat(Z_dfs, ignore_index=True)
            fileio.save(Z_combined, processing_dir + 'Z' + file_extentions)

        yhat_filenames = glob.glob(processing_dir + 'batch_*/' + 'yhat*')
        if yhat_filenames:
            yhat_filenames = fileio.sort_nicely(yhat_filenames)
            yhat_dfs = []
            for yhat_filename in yhat_filenames:
                yhat_dfs.append(pd.DataFrame(fileio.load(yhat_filename)))
            yhat_combined = pd.concat(yhat_dfs, ignore_index=True)
            fileio.save(yhat_combined, processing_dir + 'yhat' +
                        file_extentions)

        ys2_filenames = glob.glob(processing_dir + 'batch_*/' + 'ys2*')
        if ys2_filenames:
            ys2_filenames = fileio.sort_nicely(ys2_filenames)
            ys2_dfs = []
            for ys2_filename in ys2_filenames:
                ys2_dfs.append(pd.DataFrame(fileio.load(ys2_filename)))
            ys2_combined = pd.concat(ys2_dfs, ignore_index=True)
            fileio.save(ys2_combined, processing_dir + 'ys2' +
                        file_extentions)

        rmse_filenames = glob.glob(processing_dir + 'batch_*/' + 'rmse*')
        if rmse_filenames:
            rmse_filenames = fileio.sort_nicely(rmse_filenames)
            rmse_dfs = []
            for rmse_filename in rmse_filenames:
                rmse_dfs.append(pd.DataFrame(fileio.load(rmse_filename)))
            rmse_combined = pd.concat(rmse_dfs, ignore_index=True)
            fileio.save(rmse_combined, processing_dir + 'rmse' +
                        file_extentions)

        smse_filenames = glob.glob(processing_dir + 'batch_*/' + 'smse*')
        if smse_filenames:
            smse_filenames = fileio.sort_nicely(smse_filenames)
            smse_dfs = []
            for smse_filename in smse_filenames:
                smse_dfs.append(pd.DataFrame(fileio.load(smse_filename)))
            smse_combined = pd.concat(smse_dfs, ignore_index=True)
            fileio.save(smse_combined, processing_dir + 'smse' +
                        file_extentions)

        expv_filenames = glob.glob(processing_dir + 'batch_*/' + 'expv*')
        if expv_filenames:
            expv_filenames = fileio.sort_nicely(expv_filenames)
            expv_dfs = []
            for expv_filename in expv_filenames:
                expv_dfs.append(pd.DataFrame(fileio.load(expv_filename)))
            expv_combined = pd.concat(expv_dfs, ignore_index=True)
            fileio.save(expv_combined, processing_dir + 'expv' +
                        file_extentions)

        msll_filenames = glob.glob(processing_dir + 'batch_*/' + 'msll*')
        if msll_filenames:
            msll_filenames = fileio.sort_nicely(msll_filenames)
            msll_dfs = []
            for msll_filename in msll_filenames:
                msll_dfs.append(pd.DataFrame(fileio.load(msll_filename)))
            msll_combined = pd.concat(msll_dfs, ignore_index=True)
            fileio.save(msll_combined, processing_dir + 'msll' +
                        file_extentions)

        for n in range(1, n_crossval + 1):
            Hyp_filenames = glob.glob(processing_dir + 'batch_*/' + 'Hyp_' +
                                      str(n) + '.*')
            if Hyp_filenames:
                Hyp_filenames = fileio.sort_nicely(Hyp_filenames)
                Hyp_dfs = []
                for Hyp_filename in Hyp_filenames:
                    Hyp_dfs.append(pd.DataFrame(fileio.load(Hyp_filename)))
                Hyp_combined = pd.concat(Hyp_dfs, ignore_index=True)
                fileio.save(Hyp_combined, processing_dir + 'Hyp_' + str(n) +
                            file_extentions)
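
# Example usage (illustrative sketch, not part of the module): check all
# batches first, then fill in failed batches and combine the results. The
# path below is hypothetical; processing_dir must end with a trailing '/'.
#
#     collect_nm('/project/normative/processing/', collect=False,
#                binary=False)   # dry run: only report failed batches
#     collect_nm('/project/normative/processing/', collect=True,
#                binary=False)   # create dummy outputs and combine results
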
def split_nm(processing_dir, respfile_path, batch_size, binary,
             testrespfile_path=None):
    """ This function prepares the input files for normative_parallel.

    ** Input:
        * processing_dir    -> Full path to the folder of processing
        * respfile_path     -> Full path to the responsefile.txt
                               (subjects x features)
        * batch_size        -> Number of features in each batch
        * testrespfile_path -> Full path to the test responsefile.txt
                               (subjects x features)
        * binary            -> If True, binary (.pkl) files are used

    ** Output:
        * The creation of a folder structure for batch-wise processing

    written by (primarily) T Wolfers, (adapted) SM Kia
    """

    # import of necessary modules
    import os
    import sys
    import numpy as np
    import pandas as pd
    try:
        import nispat.fileio as fileio
    except ImportError:
        # fall back to a local fileio module next to this file
        path = os.path.abspath(os.path.dirname(__file__))
        if path not in sys.path:
            sys.path.append(path)
        del path
        import fileio

    dummy, respfile_extension = os.path.splitext(respfile_path)
    if (binary and respfile_extension != '.pkl'):
        raise ValueError("If binary is True the file format for the "
                         "response file must be .pkl")
    elif (binary == False and respfile_extension != '.txt'):
        raise ValueError("If binary is False the file format for the "
                         "response file must be .txt")

    # splits response into batches
    if testrespfile_path is None:
        if (binary == False):
            respfile = fileio.load_ascii(respfile_path)
        else:
            respfile = pd.read_pickle(respfile_path)

        respfile = pd.DataFrame(respfile)

        numcol = len(respfile.iloc[0, :])
        batch_vec = np.arange(0, numcol, batch_size)
        batch_vec = np.append(batch_vec, numcol)
        for n in range(0, (len(batch_vec) - 1)):
            resp_batch = respfile.iloc[:, (batch_vec[n]):batch_vec[n + 1]]
            os.chdir(processing_dir)
            resp = str('resp_batch_' + str(n + 1))
            batch = str('batch_' + str(n + 1))
            if not os.path.exists(processing_dir + batch):
                os.makedirs(processing_dir + batch)
            if (binary == False):
                fileio.save_pd(resp_batch,
                               processing_dir + batch + '/' + resp + '.txt')
            else:
                resp_batch.to_pickle(processing_dir + batch + '/' +
                                     resp + '.pkl')

    # splits response and test responsefile into batches
    else:
        dummy, testrespfile_extension = os.path.splitext(testrespfile_path)
        if (binary and testrespfile_extension != '.pkl'):
            raise ValueError("If binary is True the file format for the "
                             "testrespfile file must be .pkl")
        elif (binary == False and testrespfile_extension != '.txt'):
            raise ValueError("If binary is False the file format for the "
                             "testrespfile file must be .txt")

        if (binary == False):
            respfile = fileio.load_ascii(respfile_path)
            testrespfile = fileio.load_ascii(testrespfile_path)
        else:
            respfile = pd.read_pickle(respfile_path)
            testrespfile = pd.read_pickle(testrespfile_path)

        respfile = pd.DataFrame(respfile)
        testrespfile = pd.DataFrame(testrespfile)

        numcol = len(respfile.iloc[0, :])
        batch_vec = np.arange(0, numcol, batch_size)
        batch_vec = np.append(batch_vec, numcol)
        for n in range(0, (len(batch_vec) - 1)):
            resp_batch = respfile.iloc[:, (batch_vec[n]):batch_vec[n + 1]]
            testresp_batch = testrespfile.iloc[:,
                                               (batch_vec[n]):batch_vec[n + 1]]
            os.chdir(processing_dir)
            resp = str('resp_batch_' + str(n + 1))
            testresp = str('testresp_batch_' + str(n + 1))
            batch = str('batch_' + str(n + 1))
            if not os.path.exists(processing_dir + batch):
                os.makedirs(processing_dir + batch)
            if (binary == False):
                fileio.save_pd(resp_batch,
                               processing_dir + batch + '/' + resp + '.txt')
                fileio.save_pd(testresp_batch,
                               processing_dir + batch + '/' + testresp +
                               '.txt')
            else:
                resp_batch.to_pickle(processing_dir + batch + '/' +
                                     resp + '.pkl')
                testresp_batch.to_pickle(processing_dir + batch + '/' +
                                         testresp + '.pkl')
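
# Example usage (illustrative sketch): split a subjects-x-features response
# file into batches of 10 features each. Paths are hypothetical; with
# binary=False the response files must be .txt, with binary=True .pkl.
#
#     split_nm('/project/normative/processing/',
#              '/project/normative/responses.txt',
#              batch_size=10, binary=False,
#              testrespfile_path='/project/normative/test_responses.txt')
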
def collect_nm(processing_dir, job_name, func='estimate', collect=False,
               binary=False, batch_size=None, outputsuffix=''):
    """This function checks and collects all batches.

    ** Input:
        * processing_dir -> Full path to the processing directory
        * job_name       -> Name of the submitted jobs (used to locate the
                            batch .sh files)
        * func           -> Function that was parallelised
                            ('estimate' or 'predict')
        * collect        -> If True, data is checked for failed batches
                            and collected; if False, data is just checked
        * binary         -> If True, read/write binary (.pkl) files
        * batch_size     -> Number of features per batch (detected from the
                            output files if not given)
        * outputsuffix   -> Suffix appended to the output file names

    ** Output:
        * Text files containing all results across all batches
          the combined output

    written by (primarily) T Wolfers, (adapted) SM Kia
    """

    if binary:
        file_extentions = '.pkl'
    else:
        file_extentions = '.txt'

    # detect number of subjects, batches, hyperparameters and CV
    batches = glob.glob(processing_dir + 'batch_*/')
    file_example = []
    for batch in batches:
        if file_example == []:
            file_example = glob.glob(batch + 'yhat' + outputsuffix +
                                     file_extentions)
        else:
            break
    if binary is False:
        file_example = fileio.load(file_example[0])
    else:
        file_example = pd.read_pickle(file_example[0])
    numsubjects = file_example.shape[0]
    batch_size = file_example.shape[1]

    # artificially creates files for batches that were not executed
    count = 0
    batch_fail = []
    batch_dirs = glob.glob(processing_dir + 'batch_*/')
    batch_dirs = fileio.sort_nicely(batch_dirs)
    for batch in batch_dirs:
        filepath = glob.glob(batch + 'yhat' + outputsuffix + '*')
        if filepath == []:
            count = count + 1
            batch1 = glob.glob(batch + '/' + job_name + '*.sh')
            print(batch1)
            batch_fail.append(batch1)
            if collect is True:
                pRho = np.ones(batch_size)
                pRho = pRho.transpose()
                pRho = pd.Series(pRho)
                fileio.save(pRho, batch + 'pRho' + outputsuffix +
                            file_extentions)

                Rho = np.zeros(batch_size)
                Rho = Rho.transpose()
                Rho = pd.Series(Rho)
                fileio.save(Rho, batch + 'Rho' + outputsuffix +
                            file_extentions)

                rmse = np.zeros(batch_size)
                rmse = rmse.transpose()
                rmse = pd.Series(rmse)
                fileio.save(rmse, batch + 'RMSE' + outputsuffix +
                            file_extentions)

                smse = np.zeros(batch_size)
                smse = smse.transpose()
                smse = pd.Series(smse)
                fileio.save(smse, batch + 'SMSE' + outputsuffix +
                            file_extentions)

                expv = np.zeros(batch_size)
                expv = expv.transpose()
                expv = pd.Series(expv)
                fileio.save(expv, batch + 'EXPV' + outputsuffix +
                            file_extentions)

                msll = np.zeros(batch_size)
                msll = msll.transpose()
                msll = pd.Series(msll)
                fileio.save(msll, batch + 'MSLL' + outputsuffix +
                            file_extentions)

                yhat = np.zeros([numsubjects, batch_size])
                yhat = pd.DataFrame(yhat)
                fileio.save(yhat, batch + 'yhat' + outputsuffix +
                            file_extentions)

                ys2 = np.zeros([numsubjects, batch_size])
                ys2 = pd.DataFrame(ys2)
                fileio.save(ys2, batch + 'ys2' + outputsuffix +
                            file_extentions)

                Z = np.zeros([numsubjects, batch_size])
                Z = pd.DataFrame(Z)
                fileio.save(Z, batch + 'Z' + outputsuffix +
                            file_extentions)

                if not os.path.isdir(batch + 'Models'):
                    os.mkdir(batch + 'Models')
        else:
            # if more than 10% of yhat is nan then consider the batch as
            # a failed batch
            yhat = fileio.load(filepath[0])
            if np.count_nonzero(~np.isnan(yhat)) / (np.prod(yhat.shape)) < 0.9:
                count = count + 1
                batch1 = glob.glob(batch + '/' + job_name + '*.sh')
                print('More than 10% nans in ' + batch1[0])
                batch_fail.append(batch1)

    # list batches that were not executed
    print('Number of batches that failed: ' + str(count))
    batch_fail_df = pd.DataFrame(batch_fail)
    if file_extentions == '.txt':
        fileio.save_pd(batch_fail_df, processing_dir + 'failed_batches' +
                       file_extentions)
    else:
        fileio.save(batch_fail_df, processing_dir + 'failed_batches' +
                    file_extentions)

    # combines all output files across batches
    if collect is True:
        pRho_filenames = glob.glob(processing_dir + 'batch_*/' + 'pRho' +
                                   outputsuffix + '*')
        if pRho_filenames:
            pRho_filenames = fileio.sort_nicely(pRho_filenames)
            pRho_dfs = []
            for pRho_filename in pRho_filenames:
                pRho_dfs.append(pd.DataFrame(fileio.load(pRho_filename)))
            pRho_dfs = pd.concat(pRho_dfs, ignore_index=True, axis=0)
            fileio.save(pRho_dfs, processing_dir + 'pRho' + outputsuffix +
                        file_extentions)
            del pRho_dfs

        Rho_filenames = glob.glob(processing_dir + 'batch_*/' + 'Rho' +
                                  outputsuffix + '*')
        if Rho_filenames:
            Rho_filenames = fileio.sort_nicely(Rho_filenames)
            Rho_dfs = []
            for Rho_filename in Rho_filenames:
                Rho_dfs.append(pd.DataFrame(fileio.load(Rho_filename)))
            Rho_dfs = pd.concat(Rho_dfs, ignore_index=True, axis=0)
            fileio.save(Rho_dfs, processing_dir + 'Rho' + outputsuffix +
                        file_extentions)
            del Rho_dfs

        Z_filenames = glob.glob(processing_dir + 'batch_*/' + 'Z' +
                                outputsuffix + '*')
        if Z_filenames:
            Z_filenames = fileio.sort_nicely(Z_filenames)
            Z_dfs = []
            for Z_filename in Z_filenames:
                Z_dfs.append(pd.DataFrame(fileio.load(Z_filename)))
            Z_dfs = pd.concat(Z_dfs, ignore_index=True, axis=1)
            fileio.save(Z_dfs, processing_dir + 'Z' + outputsuffix +
                        file_extentions)
            del Z_dfs

        yhat_filenames = glob.glob(processing_dir + 'batch_*/' + 'yhat' +
                                   outputsuffix + '*')
        if yhat_filenames:
            yhat_filenames = fileio.sort_nicely(yhat_filenames)
            yhat_dfs = []
            for yhat_filename in yhat_filenames:
                yhat_dfs.append(pd.DataFrame(fileio.load(yhat_filename)))
            yhat_dfs = pd.concat(yhat_dfs, ignore_index=True, axis=1)
            fileio.save(yhat_dfs, processing_dir + 'yhat' + outputsuffix +
                        file_extentions)
            del yhat_dfs

        ys2_filenames = glob.glob(processing_dir + 'batch_*/' + 'ys2' +
                                  outputsuffix + '*')
        if ys2_filenames:
            ys2_filenames = fileio.sort_nicely(ys2_filenames)
            ys2_dfs = []
            for ys2_filename in ys2_filenames:
                ys2_dfs.append(pd.DataFrame(fileio.load(ys2_filename)))
            ys2_dfs = pd.concat(ys2_dfs, ignore_index=True, axis=1)
            fileio.save(ys2_dfs, processing_dir + 'ys2' + outputsuffix +
                        file_extentions)
            del ys2_dfs

        rmse_filenames = glob.glob(processing_dir + 'batch_*/' + 'RMSE' +
                                   outputsuffix + '*')
        if rmse_filenames:
            rmse_filenames = fileio.sort_nicely(rmse_filenames)
            rmse_dfs = []
            for rmse_filename in rmse_filenames:
                rmse_dfs.append(pd.DataFrame(fileio.load(rmse_filename)))
            rmse_dfs = pd.concat(rmse_dfs, ignore_index=True, axis=0)
            fileio.save(rmse_dfs, processing_dir + 'RMSE' + outputsuffix +
                        file_extentions)
            del rmse_dfs

        smse_filenames = glob.glob(processing_dir + 'batch_*/' + 'SMSE' +
                                   outputsuffix + '*')
        if smse_filenames:
            smse_filenames = fileio.sort_nicely(smse_filenames)
            smse_dfs = []
            for smse_filename in smse_filenames:
                smse_dfs.append(pd.DataFrame(fileio.load(smse_filename)))
            smse_dfs = pd.concat(smse_dfs, ignore_index=True, axis=0)
            fileio.save(smse_dfs, processing_dir + 'SMSE' + outputsuffix +
                        file_extentions)
            del smse_dfs

        expv_filenames = glob.glob(processing_dir + 'batch_*/' + 'EXPV' +
                                   outputsuffix + '*')
        if expv_filenames:
            expv_filenames = fileio.sort_nicely(expv_filenames)
            expv_dfs = []
            for expv_filename in expv_filenames:
                expv_dfs.append(pd.DataFrame(fileio.load(expv_filename)))
            expv_dfs = pd.concat(expv_dfs, ignore_index=True, axis=0)
            fileio.save(expv_dfs, processing_dir + 'EXPV' + outputsuffix +
                        file_extentions)
            del expv_dfs

        msll_filenames = glob.glob(processing_dir + 'batch_*/' + 'MSLL' +
                                   outputsuffix + '*')
        if msll_filenames:
            msll_filenames = fileio.sort_nicely(msll_filenames)
            msll_dfs = []
            for msll_filename in msll_filenames:
                msll_dfs.append(pd.DataFrame(fileio.load(msll_filename)))
            msll_dfs = pd.concat(msll_dfs, ignore_index=True, axis=0)
            fileio.save(msll_dfs, processing_dir + 'MSLL' + outputsuffix +
                        file_extentions)
            del msll_dfs

        if func != 'predict':
            if not os.path.isdir(processing_dir + 'Models') and \
               os.path.exists(os.path.join(batches[0], 'Models')):
                os.mkdir(processing_dir + 'Models')

            meta_filenames = glob.glob(processing_dir + 'batch_*/Models/' +
                                       'meta_data.md')
            mY = []
            sY = []
            mX = []
            sX = []
            if meta_filenames:
                meta_filenames = fileio.sort_nicely(meta_filenames)
                with open(meta_filenames[0], 'rb') as file:
                    meta_data = pickle.load(file)
                if meta_data['standardize']:
                    for meta_filename in meta_filenames:
                        # load the meta data of every batch before stacking
                        with open(meta_filename, 'rb') as file:
                            meta_data = pickle.load(file)
                        mY.append(meta_data['mean_resp'])
                        sY.append(meta_data['std_resp'])
                        mX.append(meta_data['mean_cov'])
                        sX.append(meta_data['std_cov'])
                    meta_data['mean_resp'] = np.stack(mY)
                    meta_data['std_resp'] = np.stack(sY)
                    meta_data['mean_cov'] = np.stack(mX)
                    meta_data['std_cov'] = np.stack(sX)

                with open(os.path.join(processing_dir, 'Models',
                                       'meta_data.md'), 'wb') as file:
                    pickle.dump(meta_data, file)

            batch_dirs = glob.glob(processing_dir + 'batch_*/')
            if batch_dirs:
                batch_dirs = fileio.sort_nicely(batch_dirs)
                for b, batch_dir in enumerate(batch_dirs):
                    src_files = glob.glob(batch_dir + 'Models/*.pkl')
                    src_files = fileio.sort_nicely(src_files)
                    for f, full_file_name in enumerate(src_files):
                        if os.path.isfile(full_file_name):
                            file_name = full_file_name.split('/')[-1]
                            n = file_name.split('_')
                            # renumber the model so it is unique across batches
                            n[-1] = str(b * batch_size + f) + '.pkl'
                            n = '_'.join(n)
                            shutil.copy(full_file_name,
                                        processing_dir + 'Models/' + n)

    if not batch_fail:
        return 1
    else:
        return 0
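
# Example usage (illustrative sketch): this newer variant additionally needs
# the job_name that was used when the batch scripts were submitted, and it
# relies on module-level imports of glob, os, pickle, shutil, numpy (np),
# pandas (pd) and fileio. Paths, job name and suffix below are hypothetical.
#
#     status = collect_nm('/project/normative/processing/', 'nm_job',
#                         func='estimate', collect=True, binary=True,
#                         outputsuffix='_estimate')
#     if status == 0:
#         print('some batches failed; see the failed_batches file')
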
def split_nm(processing_dir, respfile_path, batch_size, binary, **kwargs):
    """ This function prepares the input files for normative_parallel.

    ** Input:
        * processing_dir    -> Full path to the folder of processing
        * respfile_path     -> Full path to the responsefile.txt
                               (subjects x features)
        * batch_size        -> Number of features in each batch
        * testrespfile_path -> Full path to the test responsefile.txt
                               (subjects x features)
        * binary            -> If True, binary (.pkl) files are used

    ** Output:
        * The creation of a folder structure for batch-wise processing

    written by (primarily) T Wolfers, (adapted) SM Kia
    """

    testrespfile_path = kwargs.pop('testrespfile_path', None)

    dummy, respfile_extension = os.path.splitext(respfile_path)
    if (binary and respfile_extension != '.pkl'):
        raise ValueError("If binary is True the file format for the "
                         "response file must be .pkl")
    elif (binary == False and respfile_extension != '.txt'):
        raise ValueError("If binary is False the file format for the "
                         "response file must be .txt")

    # splits response into batches
    if testrespfile_path is None:
        if (binary == False):
            respfile = fileio.load_ascii(respfile_path)
        else:
            respfile = pd.read_pickle(respfile_path)

        respfile = pd.DataFrame(respfile)

        numsub = respfile.shape[1]
        batch_vec = np.arange(0, numsub, batch_size)
        batch_vec = np.append(batch_vec, numsub)
        for n in range(0, (len(batch_vec) - 1)):
            resp_batch = respfile.iloc[:, (batch_vec[n]):batch_vec[n + 1]]
            os.chdir(processing_dir)
            resp = str('resp_batch_' + str(n + 1))
            batch = str('batch_' + str(n + 1))
            if not os.path.exists(processing_dir + batch):
                os.makedirs(processing_dir + batch)
            if (binary == False):
                fileio.save_pd(resp_batch,
                               processing_dir + batch + '/' + resp + '.txt')
            else:
                resp_batch.to_pickle(processing_dir + batch + '/' +
                                     resp + '.pkl')

    # splits response and test responsefile into batches
    else:
        dummy, testrespfile_extension = os.path.splitext(testrespfile_path)
        if (binary and testrespfile_extension != '.pkl'):
            raise ValueError("If binary is True the file format for the "
                             "testrespfile file must be .pkl")
        elif (binary == False and testrespfile_extension != '.txt'):
            raise ValueError("If binary is False the file format for the "
                             "testrespfile file must be .txt")

        if (binary == False):
            respfile = fileio.load_ascii(respfile_path)
            testrespfile = fileio.load_ascii(testrespfile_path)
        else:
            respfile = pd.read_pickle(respfile_path)
            testrespfile = pd.read_pickle(testrespfile_path)

        respfile = pd.DataFrame(respfile)
        testrespfile = pd.DataFrame(testrespfile)

        numsub = respfile.shape[1]
        batch_vec = np.arange(0, numsub, batch_size)
        batch_vec = np.append(batch_vec, numsub)
        for n in range(0, (len(batch_vec) - 1)):
            resp_batch = respfile.iloc[:, (batch_vec[n]):batch_vec[n + 1]]
            testresp_batch = testrespfile.iloc[:,
                                               (batch_vec[n]):batch_vec[n + 1]]
            os.chdir(processing_dir)
            resp = str('resp_batch_' + str(n + 1))
            testresp = str('testresp_batch_' + str(n + 1))
            batch = str('batch_' + str(n + 1))
            if not os.path.exists(processing_dir + batch):
                os.makedirs(processing_dir + batch)
            if (binary == False):
                fileio.save_pd(resp_batch,
                               processing_dir + batch + '/' + resp + '.txt')
                fileio.save_pd(testresp_batch,
                               processing_dir + batch + '/' + testresp +
                               '.txt')
            else:
                resp_batch.to_pickle(processing_dir + batch + '/' +
                                     resp + '.pkl')
                testresp_batch.to_pickle(processing_dir + batch + '/' +
                                         testresp + '.pkl')
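
# Example usage (illustrative sketch): identical to the earlier variant
# except that the test response file is passed as a keyword argument. Paths
# are hypothetical; module-level imports of os, numpy (np), pandas (pd) and
# fileio are assumed.
#
#     split_nm('/project/normative/processing/',
#              '/project/normative/responses.pkl',
#              batch_size=10, binary=True,
#              testrespfile_path='/project/normative/test_responses.pkl')
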
def collect_nm(processing_dir):
    """This function checks and collects all batches.

    ** Input:
        * processing_dir -> Full path to the processing directory

    ** Output:
        * Text files containing all results across all batches
          the combined output

    written by Thomas Wolfers
    """

    # import of necessary modules
    import glob
    import numpy as np
    import os
    import pandas as pd
    import nispat.fileio as fileio

    # detect number of subjects, batches, hyperparameters and CV
    file_example = glob.glob(processing_dir + 'batch_1/' + 'resp*.txt')
    file_example = fileio.load_pd(file_example[0])
    numsubjects = file_example.shape[0]
    batch_size = file_example.shape[1]

    all_Hyptxt = glob.glob(processing_dir + 'batch_*/' + 'Hyp_*')
    first_Hyptxt = fileio.load_pd(all_Hyptxt[0])
    first_Hyptxt = first_Hyptxt.transpose()
    nHyp = len(first_Hyptxt)

    dir_first_Hyptxt = os.path.dirname(all_Hyptxt[0])
    all_crossval = glob.glob(dir_first_Hyptxt + '/' + 'Hyp_*')
    n_crossval = len(all_crossval)

    # artificially creates files for batches that were not executed
    count = 0
    batch_fail = []
    for batch in glob.glob(processing_dir + 'batch_*/'):
        filepath = glob.glob(batch + 'pRho*')
        if filepath == []:
            pRho = np.ones(batch_size)
            pRho = pRho.transpose()
            pRho = pd.Series(pRho)
            fileio.save_pd(pRho, batch + 'pRho.txt')

            Rho = np.zeros(batch_size)
            Rho = Rho.transpose()
            Rho = pd.Series(Rho)
            fileio.save_pd(Rho, batch + 'Rho.txt')

            rmse = np.zeros(batch_size)
            rmse = rmse.transpose()
            rmse = pd.Series(rmse)
            fileio.save_pd(rmse, batch + 'rmse.txt')

            smse = np.zeros(batch_size)
            smse = smse.transpose()
            smse = pd.Series(smse)
            fileio.save_pd(smse, batch + 'smse.txt')

            Z = np.zeros([batch_size, numsubjects])
            Z = pd.DataFrame(Z)
            fileio.save_pd(Z, batch + 'Z.txt')

            yhat = np.zeros([batch_size, numsubjects])
            yhat = pd.DataFrame(yhat)
            fileio.save_pd(yhat, batch + 'yhat.txt')

            ys2 = np.zeros([batch_size, numsubjects])
            ys2 = pd.DataFrame(ys2)
            fileio.save_pd(ys2, batch + 'ys2.txt')

            for n in range(1, n_crossval + 1):
                hyp = np.zeros([batch_size, nHyp])
                hyp = pd.DataFrame(hyp)
                fileio.save_pd(hyp, batch + 'Hyp_' + str(n) + '.txt')

            count = count + 1
            print(batch)
            batch_fail.append(batch)

    # list batches that were not executed
    print('Number of batches that failed: ' + str(count))
    batch_fail_df = pd.DataFrame(batch_fail)
    fileio.save_pd(batch_fail_df, processing_dir + 'failed_batches.txt')

    # combines all output files across batches
    pRho_filenames = glob.glob(processing_dir + 'batch_*/' + 'pRho*')
    pRho_filenames = fileio.sort_nicely(pRho_filenames)
    pRho_dfs = []
    for pRho_filename in pRho_filenames:
        pRho_dfs.append(fileio.load_pd(pRho_filename))
    pRho_combined = pd.concat(pRho_dfs, ignore_index=True)
    fileio.save_pd(pRho_combined, processing_dir + 'pRho.txt')

    Rho_filenames = glob.glob(processing_dir + 'batch_*/' + 'Rho*')
    Rho_filenames = fileio.sort_nicely(Rho_filenames)
    Rho_dfs = []
    for Rho_filename in Rho_filenames:
        Rho_dfs.append(fileio.load_pd(Rho_filename))
    Rho_combined = pd.concat(Rho_dfs, ignore_index=True)
    fileio.save_pd(Rho_combined, processing_dir + 'Rho.txt')

    Z_filenames = glob.glob(processing_dir + 'batch_*/' + 'Z*')
    Z_filenames = fileio.sort_nicely(Z_filenames)
    Z_dfs = []
    for Z_filename in Z_filenames:
        Z_dfs.append(fileio.load_pd(Z_filename))
    Z_combined = pd.concat(Z_dfs, ignore_index=True)
    fileio.save_pd(Z_combined, processing_dir + 'Z.txt')

    yhat_filenames = glob.glob(processing_dir + 'batch_*/' + 'yhat*')
    yhat_filenames = fileio.sort_nicely(yhat_filenames)
    yhat_dfs = []
    for yhat_filename in yhat_filenames:
        yhat_dfs.append(fileio.load_pd(yhat_filename))
    yhat_combined = pd.concat(yhat_dfs, ignore_index=True)
    fileio.save_pd(yhat_combined, processing_dir + 'yhat.txt')

    ys2_filenames = glob.glob(processing_dir + 'batch_*/' + 'ys2*')
    ys2_filenames = fileio.sort_nicely(ys2_filenames)
    ys2_dfs = []
    for ys2_filename in ys2_filenames:
        ys2_dfs.append(fileio.load_pd(ys2_filename))
    ys2_combined = pd.concat(ys2_dfs, ignore_index=True)
    fileio.save_pd(ys2_combined, processing_dir + 'ys2.txt')

    rmse_filenames = glob.glob(processing_dir + 'batch_*/' + 'rmse*')
    rmse_filenames = fileio.sort_nicely(rmse_filenames)
    rmse_dfs = []
    for rmse_filename in rmse_filenames:
        rmse_dfs.append(fileio.load_pd(rmse_filename))
    rmse_combined = pd.concat(rmse_dfs, ignore_index=True)
    fileio.save_pd(rmse_combined, processing_dir + 'rmse.txt')

    smse_filenames = glob.glob(processing_dir + 'batch_*/' + 'smse*')
    smse_filenames = fileio.sort_nicely(smse_filenames)
    smse_dfs = []
    for smse_filename in smse_filenames:
        smse_dfs.append(fileio.load_pd(smse_filename))
    smse_combined = pd.concat(smse_dfs, ignore_index=True)
    fileio.save_pd(smse_combined, processing_dir + 'smse.txt')

    for n in range(1, n_crossval + 1):
        Hyp_filenames = glob.glob(processing_dir + 'batch_*/' + 'Hyp_' +
                                  str(n) + '.*')
        Hyp_filenames = fileio.sort_nicely(Hyp_filenames)
        Hyp_dfs = []
        for Hyp_filename in Hyp_filenames:
            Hyp_dfs.append(fileio.load_pd(Hyp_filename))
        Hyp_combined = pd.concat(Hyp_dfs, ignore_index=True)
        fileio.save_pd(Hyp_combined, processing_dir + 'Hyp_' + str(n) +
                       '.txt')
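
# Example usage (illustrative sketch) for this legacy text-only variant,
# which always fills in failed batches and combines the results. The path
# is hypothetical and must end with a trailing '/'.
#
#     collect_nm('/project/normative/processing/')
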
def split_nm(processing_dir, respfile_path, batch_size,
             testrespfile_path=None):
    """ This function prepares the input files for parallel normative
    modelling.

    ** Input:
        * processing_dir    -> Full path to the folder of processing
        * respfile_path     -> Full path to the responsefile.txt
                               (subjects x features)
        * batch_size        -> Number of features in each batch
        * testrespfile_path -> Full path to the test responsefile.txt
                               (subjects x features)

    ** Output:
        * The creation of a folder structure for batch-wise processing

    written by Thomas Wolfers
    """

    # import of necessary modules
    import numpy as np
    import os
    import nispat.fileio as fileio

    # splits response into batches
    if testrespfile_path is None:
        respfile = fileio.load_pd(respfile_path)

        numsub = len(respfile.ix[0, :])
        batch_vec = np.arange(0, numsub, batch_size)
        batch_vec = np.append(batch_vec, numsub)
        batch_vec = batch_vec - 1
        for n in range(0, (len(batch_vec) - 1)):
            # NOTE: DataFrame.ix is only available in older pandas (< 1.0)
            resp_batch = respfile.ix[:, (batch_vec[n] + 1):batch_vec[n + 1]]
            os.chdir(processing_dir)
            resp = str('resp_batch_' + str(n + 1))
            batch = str('batch_' + str(n + 1))
            if not os.path.exists(processing_dir + batch):
                os.makedirs(processing_dir + batch)
            fileio.save_pd(resp_batch,
                           processing_dir + batch + '/' + resp + '.txt')

    # splits response and test responsefile into batches
    else:
        respfile = fileio.load_pd(respfile_path)
        testrespfile = fileio.load_pd(testrespfile_path)

        numsub = len(respfile.ix[0, :])
        batch_vec = np.arange(0, numsub, batch_size)
        batch_vec = np.append(batch_vec, numsub)
        batch_vec = batch_vec - 1
        for n in range(0, (len(batch_vec) - 1)):
            resp_batch = respfile.ix[:, (batch_vec[n] + 1):batch_vec[n + 1]]
            testresp_batch = testrespfile.ix[:, (batch_vec[n] + 1):
                                             batch_vec[n + 1]]
            os.chdir(processing_dir)
            resp = str('resp_batch_' + str(n + 1))
            testresp = str('testresp_batch_' + str(n + 1))
            batch = str('batch_' + str(n + 1))
            if not os.path.exists(processing_dir + batch):
                os.makedirs(processing_dir + batch)
            fileio.save_pd(resp_batch,
                           processing_dir + batch + '/' + resp + '.txt')
            fileio.save_pd(testresp_batch,
                           processing_dir + batch + '/' + testresp + '.txt')
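
# Example usage (illustrative sketch) for the legacy variant, which only
# handles text response files. Paths are hypothetical.
#
#     split_nm('/project/normative/processing/',
#              '/project/normative/responses.txt', batch_size=10)
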