def collect_nm(processing_dir, job_name, func='estimate', collect=False,
               binary=False, batch_size=None, outputsuffix='_estimate'):
    """This function checks and collects all batches.

    ** Input:
        * processing_dir -> Full path to the processing directory
        * job_name       -> Name of the job (used to locate the batch
                            submission scripts batch_*/<job_name>*.sh)
        * func           -> Function that was executed (e.g. 'estimate',
                            'fit', 'predict' or 'transfer')
        * collect        -> If True, data is checked for failed batches and
                            collected; if False, data is just checked
        * binary         -> If True, outputs are read/written as .pkl files,
                            otherwise as .txt files
        * batch_size     -> Number of features per batch (only required for
                            func='fit'; otherwise inferred from the outputs)
        * outputsuffix   -> Suffix of the output files to collect

    ** Output:
        * Files in the processing directory containing the combined results
          across all batches

    written by (primarily) T Wolfers, (adapted) SM Kia
    """

    if binary:
        file_extensions = '.pkl'
    else:
        file_extensions = '.txt'

    # detect number of subjects, batches, hyperparameters and CV
    batches = glob.glob(processing_dir + 'batch_*/')

    count = 0
    batch_fail = []

    if func != 'fit':
        # use the first available output file to infer the output dimensions
        file_example = []
        for batch in batches:
            if file_example == []:
                file_example = glob.glob(batch + 'yhat' + outputsuffix +
                                         file_extensions)
            else:
                break
        if binary is False:
            file_example = fileio.load(file_example[0])
        else:
            file_example = pd.read_pickle(file_example[0])
        numsubjects = file_example.shape[0]
        batch_size = file_example.shape[1]

        # artificially creates files for batches that were not executed
        batch_dirs = glob.glob(processing_dir + 'batch_*/')
        batch_dirs = fileio.sort_nicely(batch_dirs)
        for batch in batch_dirs:
            filepath = glob.glob(batch + 'yhat' + outputsuffix + '*')
            if filepath == []:
                count = count + 1
                batch1 = glob.glob(batch + '/' + job_name + '*.sh')
                print(batch1)
                batch_fail.append(batch1)
                if collect is True:
                    # write dummy outputs so that collection does not break
                    pRho = np.ones(batch_size)
                    pRho = pRho.transpose()
                    pRho = pd.Series(pRho)
                    fileio.save(pRho, batch + 'pRho' + outputsuffix +
                                file_extensions)

                    Rho = np.zeros(batch_size)
                    Rho = Rho.transpose()
                    Rho = pd.Series(Rho)
                    fileio.save(Rho, batch + 'Rho' + outputsuffix +
                                file_extensions)

                    rmse = np.zeros(batch_size)
                    rmse = rmse.transpose()
                    rmse = pd.Series(rmse)
                    fileio.save(rmse, batch + 'RMSE' + outputsuffix +
                                file_extensions)

                    smse = np.zeros(batch_size)
                    smse = smse.transpose()
                    smse = pd.Series(smse)
                    fileio.save(smse, batch + 'SMSE' + outputsuffix +
                                file_extensions)

                    expv = np.zeros(batch_size)
                    expv = expv.transpose()
                    expv = pd.Series(expv)
                    fileio.save(expv, batch + 'EXPV' + outputsuffix +
                                file_extensions)

                    msll = np.zeros(batch_size)
                    msll = msll.transpose()
                    msll = pd.Series(msll)
                    fileio.save(msll, batch + 'MSLL' + outputsuffix +
                                file_extensions)

                    yhat = np.zeros([numsubjects, batch_size])
                    yhat = pd.DataFrame(yhat)
                    fileio.save(yhat, batch + 'yhat' + outputsuffix +
                                file_extensions)

                    ys2 = np.zeros([numsubjects, batch_size])
                    ys2 = pd.DataFrame(ys2)
                    fileio.save(ys2, batch + 'ys2' + outputsuffix +
                                file_extensions)

                    Z = np.zeros([numsubjects, batch_size])
                    Z = pd.DataFrame(Z)
                    fileio.save(Z, batch + 'Z' + outputsuffix +
                                file_extensions)

                    if not os.path.isdir(batch + 'Models'):
                        os.mkdir(batch + 'Models')
            else:
                # if more than 10% of yhat is nan then consider the batch as
                # a failed batch
                yhat = fileio.load(filepath[0])
                if (np.count_nonzero(~np.isnan(yhat)) /
                        np.prod(yhat.shape)) < 0.9:
                    count = count + 1
                    batch1 = glob.glob(batch + '/' + job_name + '*.sh')
                    print('More than 10% nans in ' + batch1[0])
                    batch_fail.append(batch1)

    # combines all output files across batches
    if collect is True:
        pRho_filenames = glob.glob(processing_dir + 'batch_*/' + 'pRho' +
                                   outputsuffix + '*')
        if pRho_filenames:
            pRho_filenames = fileio.sort_nicely(pRho_filenames)
            pRho_dfs = []
            for pRho_filename in pRho_filenames:
                pRho_dfs.append(pd.DataFrame(fileio.load(pRho_filename)))
            pRho_dfs = pd.concat(pRho_dfs, ignore_index=True, axis=0)
            fileio.save(pRho_dfs, processing_dir + 'pRho' + outputsuffix +
                        file_extensions)
            del pRho_dfs

        Rho_filenames = glob.glob(processing_dir + 'batch_*/' + 'Rho' +
                                  outputsuffix + '*')
        if Rho_filenames:
            Rho_filenames = fileio.sort_nicely(Rho_filenames)
            Rho_dfs = []
            for Rho_filename in Rho_filenames:
                Rho_dfs.append(pd.DataFrame(fileio.load(Rho_filename)))
            Rho_dfs = pd.concat(Rho_dfs, ignore_index=True, axis=0)
            fileio.save(Rho_dfs, processing_dir + 'Rho' + outputsuffix +
                        file_extensions)
            del Rho_dfs

        Z_filenames = glob.glob(processing_dir + 'batch_*/' + 'Z' +
                                outputsuffix + '*')
        if Z_filenames:
            Z_filenames = fileio.sort_nicely(Z_filenames)
            Z_dfs = []
            for Z_filename in Z_filenames:
                Z_dfs.append(pd.DataFrame(fileio.load(Z_filename)))
            Z_dfs = pd.concat(Z_dfs, ignore_index=True, axis=1)
            fileio.save(Z_dfs, processing_dir + 'Z' + outputsuffix +
                        file_extensions)
            del Z_dfs

        yhat_filenames = glob.glob(processing_dir + 'batch_*/' + 'yhat' +
                                   outputsuffix + '*')
        if yhat_filenames:
            yhat_filenames = fileio.sort_nicely(yhat_filenames)
            yhat_dfs = []
            for yhat_filename in yhat_filenames:
                yhat_dfs.append(pd.DataFrame(fileio.load(yhat_filename)))
            yhat_dfs = pd.concat(yhat_dfs, ignore_index=True, axis=1)
            fileio.save(yhat_dfs, processing_dir + 'yhat' + outputsuffix +
                        file_extensions)
            del yhat_dfs

        ys2_filenames = glob.glob(processing_dir + 'batch_*/' + 'ys2' +
                                  outputsuffix + '*')
        if ys2_filenames:
            ys2_filenames = fileio.sort_nicely(ys2_filenames)
            ys2_dfs = []
            for ys2_filename in ys2_filenames:
                ys2_dfs.append(pd.DataFrame(fileio.load(ys2_filename)))
            ys2_dfs = pd.concat(ys2_dfs, ignore_index=True, axis=1)
            fileio.save(ys2_dfs, processing_dir + 'ys2' + outputsuffix +
                        file_extensions)
            del ys2_dfs

        rmse_filenames = glob.glob(processing_dir + 'batch_*/' + 'RMSE' +
                                   outputsuffix + '*')
        if rmse_filenames:
            rmse_filenames = fileio.sort_nicely(rmse_filenames)
            rmse_dfs = []
            for rmse_filename in rmse_filenames:
                rmse_dfs.append(pd.DataFrame(fileio.load(rmse_filename)))
            rmse_dfs = pd.concat(rmse_dfs, ignore_index=True, axis=0)
            fileio.save(rmse_dfs, processing_dir + 'RMSE' + outputsuffix +
                        file_extensions)
            del rmse_dfs

        smse_filenames = glob.glob(processing_dir + 'batch_*/' + 'SMSE' +
                                   outputsuffix + '*')
        if smse_filenames:
            smse_filenames = fileio.sort_nicely(smse_filenames)
            smse_dfs = []
            for smse_filename in smse_filenames:
                smse_dfs.append(pd.DataFrame(fileio.load(smse_filename)))
            smse_dfs = pd.concat(smse_dfs, ignore_index=True, axis=0)
            fileio.save(smse_dfs, processing_dir + 'SMSE' + outputsuffix +
                        file_extensions)
            del smse_dfs

        expv_filenames = glob.glob(processing_dir + 'batch_*/' + 'EXPV' +
                                   outputsuffix + '*')
        if expv_filenames:
            expv_filenames = fileio.sort_nicely(expv_filenames)
            expv_dfs = []
            for expv_filename in expv_filenames:
                expv_dfs.append(pd.DataFrame(fileio.load(expv_filename)))
            expv_dfs = pd.concat(expv_dfs, ignore_index=True, axis=0)
            fileio.save(expv_dfs, processing_dir + 'EXPV' + outputsuffix +
                        file_extensions)
            del expv_dfs

        msll_filenames = glob.glob(processing_dir + 'batch_*/' + 'MSLL' +
                                   outputsuffix + '*')
        if msll_filenames:
            msll_filenames = fileio.sort_nicely(msll_filenames)
            msll_dfs = []
            for msll_filename in msll_filenames:
                msll_dfs.append(pd.DataFrame(fileio.load(msll_filename)))
            msll_dfs = pd.concat(msll_dfs, ignore_index=True, axis=0)
            fileio.save(msll_dfs, processing_dir + 'MSLL' + outputsuffix +
                        file_extensions)
            del msll_dfs

        if func != 'predict' and func != 'transfer':
            # collect the trained models and their meta data
            if (not os.path.isdir(processing_dir + 'Models') and
                    os.path.exists(os.path.join(batches[0], 'Models'))):
                os.mkdir(processing_dir + 'Models')

            meta_filenames = glob.glob(processing_dir + 'batch_*/Models/' +
                                       'meta_data.md')
            mY = []
            sY = []
            mX = []
            sX = []
            if meta_filenames:
                meta_filenames = fileio.sort_nicely(meta_filenames)
                with open(meta_filenames[0], 'rb') as file:
                    meta_data = pickle.load(file)
                if meta_data['standardize']:
                    # stack the standardization parameters of every batch
                    for meta_filename in meta_filenames:
                        with open(meta_filename, 'rb') as file:
                            batch_meta_data = pickle.load(file)
                        mY.append(batch_meta_data['mean_resp'])
                        sY.append(batch_meta_data['std_resp'])
                        mX.append(batch_meta_data['mean_cov'])
                        sX.append(batch_meta_data['std_cov'])
                    meta_data['mean_resp'] = np.stack(mY)
                    meta_data['std_resp'] = np.stack(sY)
                    meta_data['mean_cov'] = np.stack(mX)
                    meta_data['std_cov'] = np.stack(sX)
                with open(os.path.join(processing_dir, 'Models',
                                       'meta_data.md'), 'wb') as file:
                    pickle.dump(meta_data, file)

            # copy the per-batch models into a single Models directory,
            # renumbering them so that model indices run across all batches
            batch_dirs = glob.glob(processing_dir + 'batch_*/')
            if batch_dirs:
                batch_dirs = fileio.sort_nicely(batch_dirs)
                for b, batch_dir in enumerate(batch_dirs):
                    src_files = glob.glob(batch_dir + 'Models/*.pkl')
                    if src_files:
                        src_files = fileio.sort_nicely(src_files)
                        for f, full_file_name in enumerate(src_files):
                            if os.path.isfile(full_file_name):
                                file_name = full_file_name.split('/')[-1]
                                n = file_name.split('_')
                                n[-1] = str(b * batch_size + f) + '.pkl'
                                n = '_'.join(n)
                                shutil.copy(full_file_name,
                                            processing_dir + 'Models/' + n)
                    elif func == 'fit':
                        count = count + 1
                        batch1 = glob.glob(batch_dir + '/' + job_name +
                                           '*.sh')
                        print('Failed batch: ' + batch1[0])
                        batch_fail.append(batch1)

    # list batches that were not executed
    print('Number of batches that failed: ' + str(count))
    batch_fail_df = pd.DataFrame(batch_fail)
    if file_extensions == '.txt':
        fileio.save_pd(batch_fail_df, processing_dir + 'failed_batches' +
                       file_extensions)
    else:
        fileio.save(batch_fail_df, processing_dir + 'failed_batches' +
                    file_extensions)

    if not batch_fail:
        return 1
    else:
        return 0
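
# Example: a minimal usage sketch of collect_nm (the processing directory,
# job name and batch size below are hypothetical and only illustrate the
# expected call; note the trailing slash on processing_dir, since paths are
# built by string concatenation). Per the docstring, collect_nm can first be
# called with collect=False to only check for failed batches, and then with
# collect=True to merge the per-batch outputs (yhat, ys2, Z, Rho, pRho,
# RMSE, SMSE, EXPV, MSLL) into single files in the processing directory:
#
#     status = collect_nm('/path/to/processing_dir/', 'nm_job',
#                         func='estimate', collect=True, binary=True,
#                         batch_size=10, outputsuffix='_estimate')
#     # status is 1 if no batch failed, 0 otherwise; the job scripts of
#     # failed batches are written to failed_batches.pkl (or .txt).
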
def split_nm(processing_dir, respfile_path, batch_size, binary, **kwargs):
    """ This function prepares the input files for normative_parallel.

    ** Input:
        * processing_dir    -> Full path to the folder of processing
        * respfile_path     -> Full path to the responsefile.txt
                               (subjects x features)
        * batch_size        -> Number of features in each batch
        * testrespfile_path -> Full path to the test responsefile.txt
                               (subjects x features), passed as an optional
                               keyword argument
        * binary            -> If True, use binary (.pkl) files,
                               otherwise .txt files

    ** Output:
        * The creation of a folder structure for batch-wise processing

    written by (primarily) T Wolfers, (adapted) SM Kia
    """

    testrespfile_path = kwargs.pop('testrespfile_path', None)

    dummy, respfile_extension = os.path.splitext(respfile_path)
    if binary and respfile_extension != '.pkl':
        raise ValueError("If binary is True the file format for the "
                         "respfile must be .pkl")
    elif binary is False and respfile_extension != '.txt':
        raise ValueError("If binary is False the file format for the "
                         "respfile must be .txt")

    # splits response into batches
    if testrespfile_path is None:
        if binary is False:
            respfile = fileio.load_ascii(respfile_path)
        else:
            respfile = pd.read_pickle(respfile_path)

        respfile = pd.DataFrame(respfile)

        numsub = respfile.shape[1]
        batch_vec = np.arange(0, numsub, batch_size)
        batch_vec = np.append(batch_vec, numsub)

        for n in range(0, (len(batch_vec) - 1)):
            resp_batch = respfile.iloc[:, (batch_vec[n]): batch_vec[n + 1]]
            os.chdir(processing_dir)
            resp = str('resp_batch_' + str(n + 1))
            batch = str('batch_' + str(n + 1))
            if not os.path.exists(processing_dir + batch):
                os.makedirs(processing_dir + batch)
                if binary is False:
                    fileio.save_pd(resp_batch,
                                   processing_dir + batch + '/' +
                                   resp + '.txt')
                else:
                    resp_batch.to_pickle(processing_dir + batch + '/' +
                                         resp + '.pkl')

    # splits response and test responsefile into batches
    else:
        dummy, testrespfile_extension = os.path.splitext(testrespfile_path)
        if binary and testrespfile_extension != '.pkl':
            raise ValueError("If binary is True the file format for the "
                             "testrespfile must be .pkl")
        elif binary is False and testrespfile_extension != '.txt':
            raise ValueError("If binary is False the file format for the "
                             "testrespfile must be .txt")

        if binary is False:
            respfile = fileio.load_ascii(respfile_path)
            testrespfile = fileio.load_ascii(testrespfile_path)
        else:
            respfile = pd.read_pickle(respfile_path)
            testrespfile = pd.read_pickle(testrespfile_path)

        respfile = pd.DataFrame(respfile)
        testrespfile = pd.DataFrame(testrespfile)

        numsub = respfile.shape[1]
        batch_vec = np.arange(0, numsub, batch_size)
        batch_vec = np.append(batch_vec, numsub)

        for n in range(0, (len(batch_vec) - 1)):
            resp_batch = respfile.iloc[:, (batch_vec[n]): batch_vec[n + 1]]
            testresp_batch = testrespfile.iloc[:, (batch_vec[n]):
                                               batch_vec[n + 1]]
            os.chdir(processing_dir)
            resp = str('resp_batch_' + str(n + 1))
            testresp = str('testresp_batch_' + str(n + 1))
            batch = str('batch_' + str(n + 1))
            if not os.path.exists(processing_dir + batch):
                os.makedirs(processing_dir + batch)
                if binary is False:
                    fileio.save_pd(resp_batch,
                                   processing_dir + batch + '/' +
                                   resp + '.txt')
                    fileio.save_pd(testresp_batch,
                                   processing_dir + batch + '/' +
                                   testresp + '.txt')
                else:
                    resp_batch.to_pickle(processing_dir + batch + '/' +
                                         resp + '.pkl')
                    testresp_batch.to_pickle(processing_dir + batch + '/' +
                                             testresp + '.pkl')
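
# Example: a minimal usage sketch of split_nm (paths and batch size are
# hypothetical). It cuts the response matrix (subjects x features) into
# chunks of batch_size features and writes each chunk to
# processing_dir/batch_<n>/, from where the batches can be processed in
# parallel and later merged with collect_nm; note the trailing slash on
# processing_dir, since paths are built by string concatenation:
#
#     split_nm('/path/to/processing_dir/', '/path/to/resp.pkl',
#              batch_size=10, binary=True,
#              testrespfile_path='/path/to/testresp.pkl')
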