def upl_qap_output(cfg_file):
    '''
    Upload QAP output files from a local output directory to an S3
    bucket, using the bucket and credential parameters from a YAML
    config file
    '''

    # Import packages
    from CPAC.AWS import aws_utils, fetch_creds
    import os
    import yaml

    # Load config file
    with open(cfg_file, 'r') as f:
        cfg_dict = yaml.load(f)

    # Init variables
    bucket_name = cfg_dict["bucket_name"]
    bucket_out_prefix = cfg_dict["bucket_out_prefix"]
    creds_path = cfg_dict["creds_path"]
    bucket = fetch_creds.return_bucket(creds_path, bucket_name)
    output_dir = cfg_dict['output_directory']

    # And upload data
    upl_files = []
    for root, dirs, files in os.walk(output_dir):
        if files:
            upl_files.extend([os.path.join(root, fil) for fil in files])

    # Using CPAC AWS utils
    s3_upl_files = [ufile.replace(output_dir, bucket_out_prefix)
                    for ufile in upl_files]
    aws_utils.s3_upload(bucket, upl_files, s3_upl_files)
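# Example use of upl_qap_output -- a minimal sketch. The config filepath
# and YAML values below are hypothetical; the keys themselves
# (bucket_name, bucket_out_prefix, creds_path, output_directory) are the
# ones the function reads.
#
#     # qap_upload_config.yml
#     bucket_name: fcp-indi
#     bucket_out_prefix: data/Projects/my_study/Outputs/qap
#     creds_path: /path/to/aws-keys.csv
#     output_directory: /path/to/local/qap/outputs
#
# upl_qap_output('/path/to/qap_upload_config.yml')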
def download_outputs(path_prefix, creds_path, bucket_name, qap_type,
                     download_to):
    '''
    Download QAP result CSVs of a given type from an S3 bucket prefix
    to a local directory
    '''

    # Import packages
    from CPAC.AWS import fetch_creds
    from CPAC.AWS.aws_utils import s3_download

    # Init variables
    src_list = []
    bucket = fetch_creds.return_bucket(creds_path, bucket_name)

    # Map the QAP type argument to the substring found in the key names
    if qap_type == "anat_spatial":
        search_for = "anatomical_spatial"
    elif qap_type == "func_spatial":
        search_for = "functional_spatial"
    elif qap_type == "func_temporal":
        search_for = "functional_temporal"

    # Gather the matching CSV keys and download them
    for k in bucket.list(prefix=path_prefix):
        k_name = str(k.name)
        if (search_for in k_name) and (".csv" in k_name):
            src_list.append(k_name)

    s3_download(bucket, src_list, download_to)
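# Example use of download_outputs -- a minimal sketch with hypothetical
# paths. qap_type must be one of 'anat_spatial', 'func_spatial', or
# 'func_temporal', per the mapping above.
#
# download_outputs('data/Projects/my_study/Outputs/qap',
#                  '/path/to/aws-keys.csv', 'fcp-indi',
#                  'anat_spatial', '/path/to/local/csvs')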
def test_return_bucket(self):
    '''
    Method to test the fetch_creds.return_bucket() function

    Parameters
    ----------
    self : FetchCredsTestCase
        a unittest.TestCase-inherited class

    Returns
    -------
    None
        this function does not return any values, but tests to make
        sure the fetch_creds.return_bucket() function returns a bucket
        object
    '''

    # Import packages
    import boto.s3

    # Init variables
    err_msg = 'Unable to get the S3 bucket because of faulty AWS '\
              'credentials or boto package not found'

    # Grab the AWS bucket
    bucket = fetch_creds.return_bucket(self.creds_path, self.bucket_name)

    # Assert that it is a boto bucket object
    self.assertIsInstance(bucket, boto.s3.bucket.Bucket, msg=err_msg)
def dl_subj_from_s3(subj_idx, cfg_file, s3_dict_yaml):
    '''
    Download one subject's data from S3 to a local prefix, given a
    1-based subject index, a YAML config file, and a YAML dictionary of
    S3 filepaths; returns a single-subject dictionary of local paths
    '''

    # Import packages
    from CPAC.AWS import fetch_creds, aws_utils
    import yaml

    # Load config file
    with open(cfg_file, 'r') as f:
        cfg_dict = yaml.load(f)

    # Init variables
    bucket_prefix = cfg_dict["bucket_prefix"]
    local_prefix = cfg_dict["local_prefix"]
    bucket_name = cfg_dict["bucket_name"]
    creds_path = cfg_dict["creds_path"]
    bucket = fetch_creds.return_bucket(creds_path, bucket_name)
    s3_dict = {}

    # Pull in S3 dict yaml
    with open(s3_dict_yaml, 'r') as f:
        s3_dict = yaml.load(f)

    if len(s3_dict) == 0:
        err = "\n[!] Filepaths have not been successfully gathered from " \
              "the filepath YAML dictionary!\n"
        raise Exception(err)

    # Get list of subject keys for indexing
    sd_keys = s3_dict.keys()
    sd_keys.sort()

    # Grab subject dictionary of interest
    subj_key = sd_keys[subj_idx - 1]
    sub_dict = s3_dict[subj_key]

    # Download subject data to local prefix
    s3_dl = []
    for s3_key, s3_path in sub_dict.items():
        s3_dl.append(s3_path)
        sub_dict[s3_key] = s3_path.replace(bucket_prefix, local_prefix)

    aws_utils.s3_download(bucket, s3_dl, local_prefix=local_prefix,
                          bucket_prefix=bucket_prefix)

    sub_dict = {subj_key: sub_dict}

    # Return single subject dictionary
    return sub_dict
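# Example use of dl_subj_from_s3 -- a minimal sketch. Note that subj_idx
# is 1-based (the function indexes sd_keys[subj_idx - 1]), which fits a
# cluster array-task ID; the SGE_TASK_ID environment variable is an
# assumption about the scheduler, and the file paths are hypothetical.
#
# import os
# sub_dict = dl_subj_from_s3(int(os.environ['SGE_TASK_ID']),
#                            '/path/to/config.yml',
#                            '/path/to/s3dict.yml')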
def test_bucket_access(creds_path, output_directory, subject_id):
    '''
    Test write access to an S3 output directory by uploading and then
    deleting a small test file; returns True if the write succeeded
    '''

    # Import packages
    import os
    import botocore.exceptions as bexc
    from CPAC.AWS import fetch_creds

    # Init variables
    s3_str = 's3://'
    test_file = '/tmp/test-output.txt'

    # Explicitly lower-case the "s3"
    if output_directory.lower().startswith(s3_str):
        out_dir_sp = output_directory.split('/')
        out_dir_sp[0] = out_dir_sp[0].lower()
        output_directory = '/'.join(out_dir_sp)

    # Get bucket name
    bucket_name = output_directory.replace(s3_str, '').split('/')[0]

    # Get bucket
    bucket = fetch_creds.return_bucket(creds_path, bucket_name)

    # Create local file
    with open(test_file, 'w') as f:
        f.write('test123')

    # Formulate test output key in bucket path output directory
    rel_key_path = output_directory.replace(
        os.path.join(s3_str, bucket_name), '').lstrip('/')
    write_test_key = os.path.join(rel_key_path,
                                  'test-output_%s.txt' % subject_id)

    # Attempt a write to bucket
    try:
        bucket.upload_file(test_file, write_test_key)
        print 'Confirmed S3 write access for CPAC output!'
        test_key = bucket.Object(key=write_test_key)
        test_key.delete()
        s3_write_access = True
    # Otherwise we set the access flag to false
    except bexc.ClientError:
        s3_write_access = False

    # Return the access flag
    return s3_write_access
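# Example use of test_bucket_access -- a minimal sketch with
# hypothetical paths; the mixed-case "S3://" prefix exercises the
# lower-casing branch above.
#
# if not test_bucket_access('/path/to/aws-keys.csv',
#                           'S3://fcp-indi/data/Projects/my_study/Outputs',
#                           '0050002'):
#     raise Exception('Cannot write to the S3 output directory!')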
def s3_download(files_list, local_dir):
    '''
    Download a list of ABIDE output file keys from the fcp-indi bucket
    to a local directory, flattening the S3 key paths into dashed
    filenames; returns the list of local filepaths
    '''

    # Import packages
    import boto
    import os
    from CPAC.AWS import fetch_creds

    # Init variables
    local_list = []
    # Note: the original passed these arguments in the reverse order,
    # but return_bucket expects the credentials path first
    bucket = fetch_creds.return_bucket(
        '/home/ubuntu/secure-creds/aws-keys/fcp-indi-keys2.csv',
        'fcp-indi')

    # Pull file keys
    for img_file in files_list:
        # Get file key on S3
        s3_key = bucket.get_key(img_file)
        key_name = str(s3_key.name)

        # Get local name
        key_name_dash = key_name.replace('/', '-')
        local_name = key_name_dash.replace(
            'data-Projects-ABIDE_Initiative-Outputs-', local_dir)

        # Check dirs and make dirs
        dirs_name = os.path.dirname(local_name)
        if not os.path.exists(dirs_name):
            os.makedirs(dirs_name)

        # Download data
        print 'Saving %s to %s...' % (key_name, local_name)
        s3_key.get_contents_to_filename(local_name)

        # Append local list
        local_list.append(local_name)

    # Return local list
    return local_list
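# Example use of s3_download -- a minimal sketch. The key below is
# hypothetical; keys are expected to start with the hard-coded
# 'data/Projects/ABIDE_Initiative/Outputs/' prefix so that, after the
# slash-to-dash substitution, the local-path rewrite above applies.
#
# local_files = s3_download(
#     ['data/Projects/ABIDE_Initiative/Outputs/cpac/sub-0050002/alff.nii.gz'],
#     '/path/to/local/outputs/')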
def pull_S3_sublist(yaml_outpath, img_type, bucket_name, bucket_prefix,
                    creds_path):
    '''
    Build a subject dictionary of raw anatomical or functional scan
    filepaths from an S3 bucket prefix and write it to a YAML file
    '''

    # Import packages
    import os
    from CPAC.AWS import fetch_creds
    import yaml

    # Init variables
    s3_list = []
    s3_dict = {}
    bucket = fetch_creds.return_bucket(creds_path, bucket_name)

    # Filter for anat/rest
    if img_type == 'anat':
        subkey_type = 'anatomical_scan'
    elif img_type == 'rest':
        subkey_type = 'functional_scan'

    # Build S3-subjects to download
    for bk in bucket.list(prefix=bucket_prefix):
        s3_list.append(str(bk.name))

    # Build dictionary of filepaths
    for sfile in s3_list:
        ssplit = sfile.split('/')
        sub_id = ssplit[-4]
        session_id = ssplit[-3]
        scan_id = ssplit[-2]

        if img_type in scan_id:
            # this ONLY handles raw data inputs, not CPAC-generated
            # outputs!
            # Note: the original built resource_dict only when the key
            # was new, so existing keys were updated with a stale dict;
            # building it fresh each iteration matches the intent
            resource_dict = {subkey_type: sfile}
            if (sub_id, session_id, scan_id) not in s3_dict:
                s3_dict[(sub_id, session_id, scan_id)] = {}
            s3_dict[(sub_id, session_id, scan_id)].update(resource_dict)
        else:
            continue

    if len(s3_dict) == 0:
        err = "\n[!] Filepaths have not been successfully gathered from " \
              "the S3 bucket!\n"
        raise Exception(err)

    dict_len = len(s3_dict)

    # Write yaml file
    with open(yaml_outpath, "wt") as f:
        f.write(yaml.dump(s3_dict))

    if os.path.isfile(yaml_outpath):
        print "\nS3 dictionary file successfully created: %s\n" % yaml_outpath
        print "Total number of subject-session-scans: %d\n" % dict_len
    else:
        err = "\n[!] Filepaths from the S3 bucket have not been " \
              "successfully saved to the YAML file!\nOutput filepath: %s\n" \
              % yaml_outpath
        raise Exception(err)
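# Example use of pull_S3_sublist -- a minimal sketch with hypothetical
# bucket prefix and paths; img_type must be 'anat' or 'rest'.
#
# import os
# yamlpath = os.path.join(os.getcwd(), 's3dict.yml')
# pull_S3_sublist(yamlpath, 'anat', 'fcp-indi',
#                 'data/Projects/my_study/RawData',
#                 '/path/to/aws-keys.csv')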
import sys, os, glob
from CPAC.AWS import aws_utils, fetch_creds

# For checking file integrity between local and uploaded files.
# Currently only unix-compatible; local and uploaded directories must
# have the same file structure.
# Example:
#     python s3md5sumcheck.py ~/keys-format.csv fcp-indi \
#         data/Projects/ABIDE2/RawData/ \
#         /home/data/Incoming/abide2/bids_conv/bidsorg/

awscreds = sys.argv[1]
bucketname = sys.argv[2]
bucketpath = sys.argv[3]
localpath = sys.argv[4]
if len(sys.argv) >= 6:
    replace = sys.argv[5]
else:
    replace = None

bucket = fetch_creds.return_bucket(awscreds, bucketname)

for k in bucket.list(prefix=bucketpath):
    buckname = k.name
    print k.name
    localname = k.name.replace(bucketpath, localpath)
    if os.path.isfile(localname):
        localname = os.path.abspath(localname)
        # Resolve symlinks so md5sum runs on the actual file
        while os.path.islink(localname):
            localname = os.readlink(localname)
        x = os.popen('md5sum ' + localname).read()
        localmd5 = str(x.split(' ')[0])
        etag = str(k.etag).replace('"', '')
        # Multipart uploads have etags of the form '<md5>-<numparts>'
        if '-' in etag:
            numparts = int(etag.split('-')[-1])
            #print (os.stat(localname).st_size/(1024.0*1024.0))/numparts
def main(index, local_dir):
    '''
    Function to download an anatomical dataset from S3 and process it
    through the ANTs antsCorticalThickness.sh script, then upload the
    data back to S3

    Parameters
    ----------
    index : integer
        the index of the subject to process
    local_dir : string
        filepath to the local directory to store the input and
        processed outputs
    '''

    # Import packages
    import boto
    import logging
    import os
    import subprocess
    import time
    from CPAC.AWS import aws_utils, fetch_creds

    # Init variables
    creds_path = '/home/ubuntu/secure-creds/aws-keys/fcp-indi-keys2.csv'
    # Oasis template paths
    oasis_path = '/home/ubuntu/OASIS-30_Atropos_template/'
    # Bucket and S3 dataset prefix
    bucket = fetch_creds.return_bucket(creds_path, 'fcp-indi')
    prefix = 'data/Projects/CORR/RawData/IBA_TRT/'
    # Local dirs for working and download
    dl_dir = os.path.join(local_dir, 'inputs')

    # Setup logger
    act_log_path = '/home/ubuntu/run_act_%d.log' % index
    act_log = setup_logger('act_log', act_log_path, logging.INFO,
                           to_screen=True)

    # Make input and workdirs
    if not os.path.exists(dl_dir):
        os.makedirs(dl_dir)

    # Get S3 anatomical paths dictionary
    anat_dict = return_anat_dict(bucket, prefix)

    # Get list of unique subject ids to download
    key_list = sorted(anat_dict.keys())

    # Extract subject of interest
    subj_id = key_list[index]
    s3_path = anat_dict[subj_id]

    # Init working dir
    working_dir = os.path.join(local_dir, '%s_act_workdir' % subj_id)
    if not os.path.exists(working_dir):
        os.makedirs(working_dir)

    # Download data
    act_log.info('Downloading %s...' % s3_path)
    s3_key = bucket.get_key(s3_path)
    s3_filename = os.path.basename(s3_path)
    dl_filename = os.path.join(dl_dir, subj_id, s3_filename)

    # Make folders if need be
    dl_dirs = os.path.dirname(dl_filename)
    if not os.path.exists(dl_dirs):
        os.makedirs(dl_dirs)
    s3_key.get_contents_to_filename(dl_filename)

    # Create the nipype workflow
    act_wf = create_workflow(working_dir, dl_filename, oasis_path)

    # Run the workflow
    act_log.info('Running the workflow...')
    # Start timing
    start = time.time()
    act_wf.run()
    # Finish timing
    fin = time.time()
    act_log.info('Completed workflow!')

    # Log finish and total computation time
    elapsed = (fin - start) / 60.0
    act_log.info('Total time running is: %f minutes' % elapsed)

    # Gather processed data
    act_log.info('Gathering outputs for upload to S3...')
    upl_list = []
    for root, dirs, files in os.walk(working_dir):
        if files:
            upl_list.extend([os.path.join(root, fl) for fl in files])

    # Update log with upload info
    act_log.info('Gathered %d files for upload to S3' % len(upl_list))

    # Build upload list
    upl_prefix = os.path.join(prefix.replace('RawData', 'Outputs'),
                              'ants', subj_id)
    s3_upl_list = [upl.replace(working_dir, upl_prefix) for upl in upl_list]

    # Upload to S3
    aws_utils.s3_upload(bucket, upl_list, s3_upl_list)
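# A sketch of how main() might be invoked as a per-subject cluster job,
# with the subject index and local directory passed on the command line;
# the __main__ guard and argument layout are assumptions, not part of
# the original script.
#
# if __name__ == '__main__':
#     import sys
#     main(int(sys.argv[1]), sys.argv[2])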
def main():
    '''
    This function runs the main routine: it inventories one example
    subject's CPAC derivatives per strategy, then builds a mapping of
    source-to-destination S3 paths for every subject
    '''

    # Import packages
    from CPAC.AWS import fetch_creds
    import os
    import yaml

    # Init variables
    creds_path = '/home/ubuntu/secure-creds/aws-keys/fcp-indi-keys2.csv'
    # Note: the original passed these arguments in the reverse order,
    # but return_bucket expects the credentials path first
    bucket = fetch_creds.return_bucket(creds_path, 'fcp-indi')
    bucket_prefix = 'data/Projects/ABIDE_Initiative/Outputs/cpac/raw_outputs_rerun'
    sub_fp = '/home/ubuntu/abide/preprocessing/yamls/subs_list.yml'
    sub_list = yaml.load(open(sub_fp, 'r'))
    example_subid = '0050002_session_1'

    # Populate list of files to link to
    #src_list = []
    #src_list = gather_files_tosort(src_list, bucket, bucket_prefix)

    # Strategies dictionary {name: [pipeline_folder, global_signal_flag]}
    strat_dict = {'nofilt_noglobal': ['pipeline_abide_rerun', 'global0'],
                  'nofilt_global': ['pipeline_abide_rerun', 'global1'],
                  'filt_noglobal': ['pipeline_abide_rerun__freq-filter',
                                    'global0'],
                  'filt_global': ['pipeline_abide_rerun__freq-filter',
                                  'global1']}

    # Derivatives dictionary {name: (num_files, deriv_folder, name_filter)}
    derivs_dict = {'alff': (1, 'alff_to_standard_smooth', 'nii.gz'),
                   'degree_binarize': (1, 'centrality_outputs_smoothed',
                                       'degree_centrality_binarize'),
                   'degree_weighted': (1, 'centrality_outputs_smoothed',
                                       'degree_centrality_weighted'),
                   'dual_regression': (1, 'dr_tempreg_maps_zstat_stack_to_standard_smooth',
                                       'nii.gz'),
                   'eigenvector_binarize': (1, 'centrality_outputs_smoothed',
                                            'eigenvector_centrality_binarize'),
                   'eigenvector_weighted': (1, 'centrality_outputs_smoothed',
                                            'eigenvector_centrality_weighted'),
                   'falff': (1, 'falff_to_standard_smooth', 'nii.gz'),
                   'func_mask': (1, 'functional_brain_mask_to_standard',
                                 'nii.gz'),
                   'func_mean': (1, 'mean_functional_in_mni', 'nii.gz'),
                   'func_preproc': (1, 'functional_mni', '.nii.gz'),
                   'lfcd': (1, 'centrality_outputs_smoothed',
                            'lfcd_binarize'),
                   'reho': (1, 'reho_to_standard_smooth', 'nii.gz'),
                   'rois_aal': (4, 'roi_timeseries', 'aal'),
                   'rois_cc200': (4, 'roi_timeseries', 'CC200'),
                   'rois_cc400': (4, 'roi_timeseries', 'CC400'),
                   'rois_dosenbach160': (4, 'roi_timeseries', 'rois_3mm'),
                   'rois_ez': (4, 'roi_timeseries', 'ez'),
                   'rois_ho': (4, 'roi_timeseries', 'ho_'),
                   'rois_tt': (4, 'roi_timeseries', 'tt'),
                   'vmhc': (1, 'vmhc_fisher_zstd_zstat_map', 'nii.gz')}

    # Create error and output dictionaries
    out_dict = {k: {kk: [] for kk in derivs_dict.keys()}
                for k in strat_dict.keys()}
    err_dict = {k: {kk: [] for kk in derivs_dict.keys()}
                for k in strat_dict.keys()}

    # Iterate through strategies
    for strat, filts in strat_dict.items():
        print 'building %s...' % strat
        filt = filts[0]
        g_sig = filts[1]
        strat_prefix = os.path.join(bucket_prefix, filt, example_subid)

        # Iterate through derivatives
        for deriv, v in derivs_dict.items():
            num_files = v[0]
            deriv_folder = v[1]
            name_filter = v[2]
            deriv_prefix = os.path.join(strat_prefix, deriv_folder)
            keys_list = []
            for key in bucket.list(prefix=deriv_prefix):
                k_name = str(key.name)
                # If global signal regression was used or didn't need to be
                if (g_sig in k_name or 'global' not in k_name) and \
                   name_filter in k_name:
                    keys_list.append(k_name)

            # Grab only wanted results from keys
            if len(keys_list) == num_files:
                out_dict[strat][deriv] = [k for k in keys_list
                                          if '.nii.gz' in k or '.1D' in k][0]
            else:
                err_dict[strat][deriv] = keys_list
                print 'error in number of files!'

    # Go through dictionary and build paths
    mapping_dict = {}
    s = 1
    # For each subject
    for sub in sub_list:
        subid = sub.split('_')[-1] + '_session_1'
        print 'populating %s...%d' % (subid, s)
        # For each strategy
        for strat, deriv_dict in out_dict.items():
            strat_prefix = os.path.join(bucket_prefix, strat)
            # For each derivative, generate src and dst filepaths
            d = 0
            for deriv, filepath in deriv_dict.items():
                deriv_prefix = os.path.join(strat_prefix, deriv,
                                            sub + '_' + deriv)
                # Check extensions
                if filepath.endswith('.nii.gz'):
                    dst_path = deriv_prefix + '.nii.gz'
                elif filepath.endswith('.1D'):
                    dst_path = deriv_prefix + '.1D'
                else:
                    raise Exception('Bad extension type')
                # Get sub id from filepath
                src_path = filepath.replace(example_subid, subid)
                mapping_dict[src_path] = dst_path
                d += 1
            if d != 20:
                print d
                raw_input('not enough derivs')
        s += 1

    # Return
    return out_dict, err_dict, mapping_dict
from CPAC.AWS import aws_utils, fetch_creds
import os
import re

bucket = fetch_creds.return_bucket(
    '/home/jpellman/jpellman-fcp-indi-keys_oldfmt.csv', 'fcp-indi')

srclist = []
for i, k in enumerate(bucket.list(prefix='data/Projects/ADHD200/RawData')):
    srclist.append(k.name)
    print k.name
srclist = sorted(srclist)
#niis = [os.path.basename(src) for src in srclist if '.nii.gz' in src]
#print set(niis)

# Regex patterns mapping the raw layout to BIDS-style filenames
matchdct = {"anat": [r"(.+)/([0-9]+)/session_([0-9]+)/anat_([0-9]{1,2})/mprage.nii.gz",
                     r"\1/sub-\2/ses-\3/anat/sub-\2_ses-\3_run-\4_T1w.nii.gz"],
            "func": [r"(.+)/([0-9]+)/session_([0-9]+)/rest_([0-9]{1,2})/rest.nii.gz",
                     r"\1/sub-\2/ses-\3/func/sub-\2_ses-\3_task-rest_run-\4_bold.nii.gz"]}

srclist_filt = []
destlist = []
for sl in sorted(srclist):
    if re.match(matchdct['anat'][0], sl):
        subbed = re.sub(matchdct['anat'][0], matchdct['anat'][1], sl)
    elif re.match(matchdct['func'][0], sl):
        subbed = re.sub(matchdct['func'][0], matchdct['func'][1], sl)
    else:
        continue
def cpac_sge_logstats(s3_prefix, str_filt, creds_path, bucket_name):
    '''
    Parse CPAC SGE log files stored in S3 and report run/upload time
    statistics for completed runs; returns the per-subject CPAC run
    times and upload times
    '''

    # Import packages
    from CPAC.AWS import fetch_creds, aws_utils
    import os
    import numpy as np
    import yaml

    # Init variables
    bucket = fetch_creds.return_bucket(creds_path, bucket_name)
    log_keys = []
    log_pass = {}
    log_fail = []

    # Get the log file keys
    print 'Finding log S3 keys...'
    for key in bucket.list(prefix=s3_prefix):
        if str_filt in str(key.name):
            log_keys.append(key)

    # Get only tasks that finished
    print 'Searching for complete CPAC runs and getting runtimes...'
    for idx, key in enumerate(log_keys):
        kname = str(key.name)

        # Get log contents as a string in memory
        log_str = key.get_contents_as_string()

        # If it passed cpac running without crashing
        if 'CPAC run complete' in log_str:
            cpac_pass = True
        else:
            cpac_pass = False

        # Split log strings into list
        log_str = log_str.split('\n')

        # If it has 'End' at the end, it ran without crashing
        if 'End' in log_str[-2] and cpac_pass:
            # Get runtimes
            cpac_time, upl_time, num_files, subj = get_cpac_runtimes(log_str)
            log_pass[subj] = (cpac_time, upl_time, num_files)
        else:
            log_fail.append(kname)

        # Update status
        print '%.3f%% complete' % (100 * (float(idx) / len(log_keys)))

    # Get stats
    num_subs_pass = len(log_pass)
    num_subs_fail = len(log_fail)
    cpac_times = {sub: times[0] for sub, times in log_pass.items()}
    cpac_mean = np.mean(cpac_times.values())
    upl_times = {sub: times[1] for sub, times in log_pass.items()}
    upl_mean = np.mean(upl_times.values())

    # Save times as yamls
    with open(os.path.join(os.getcwd(), 'cpac_times.yml'), 'w') as f:
        f.write(yaml.dump(cpac_times))
    with open(os.path.join(os.getcwd(), 'upl_times.yml'), 'w') as f:
        f.write(yaml.dump(upl_times))
    with open(os.path.join(os.getcwd(), 'fail_logs.yml'), 'w') as f:
        f.write(yaml.dump(log_fail))

    # Print report
    print 'Number of subjects passed: %d' % num_subs_pass
    print 'Number of subjects failed: %d' % num_subs_fail
    print 'Average CPAC run time: %.3f minutes' % cpac_mean
    print 'Average upload time: %.3f minutes' % upl_mean

    # Return variables
    return cpac_times, upl_times
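# Example use of cpac_sge_logstats -- a minimal sketch with hypothetical
# prefix and filter; str_filt is matched as a substring of each log key
# name under s3_prefix.
#
# cpac_times, upl_times = cpac_sge_logstats(
#     'data/Projects/my_study/logs', 'cpac_run.log',
#     '/path/to/aws-keys.csv', 'fcp-indi')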
def upload_dir_contents(ipdir, s3path, bucketname, creds):
    '''
    Upload the top-level contents of a local directory to an S3 path
    '''

    # Import packages
    import glob
    import os
    from CPAC.AWS import aws_utils, fetch_creds

    srclist = [os.path.abspath(g) for g in glob.glob(ipdir + '/*')]
    destlist = [s3path + '/' + s.split('/')[-1] for s in srclist]
    bucket = fetch_creds.return_bucket(creds, bucketname)
    aws_utils.s3_upload(bucket, srclist, destlist)
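# Example use of upload_dir_contents -- a minimal sketch with
# hypothetical paths; only the top level of ipdir is uploaded, since
# the glob above is non-recursive.
#
# upload_dir_contents('/path/to/local/results',
#                     'data/Projects/my_study/Outputs',
#                     'fcp-indi', '/path/to/aws-keys.csv')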
def main(index, local_dir):
    '''
    Function to download an anatomical dataset from S3 and process it
    through Freesurfer's recon-all command, then upload the data back
    to S3

    Parameters
    ----------
    index : integer
        the index of the subject to process
    local_dir : string
        filepath to the local directory to store the input and
        processed outputs
    '''

    # Import packages
    import boto
    import logging
    import os
    import subprocess
    from CPAC.AWS import aws_utils, fetch_creds
    import pycuda.autoinit
    import pycuda.driver as cuda
    from multiprocessing import Process

    # Init variables
    creds_path = '/home/ubuntu/secure-creds/aws-keys/fcp-indi-keys2.csv'
    bucket = fetch_creds.return_bucket(creds_path, 'fcp-indi')
    prefix = 'data/Projects/CORR/RawData/IBA_TRT/'
    dl_dir = os.path.join(local_dir, 'inputs')
    subjects_dir = os.path.join(local_dir, 'subjects')

    # Setup logger
    fs_log_path = os.path.join(local_dir, 'download_run_fs_%d.log' % index)
    fs_log = setup_logger('fs_log', fs_log_path, logging.INFO,
                          to_screen=True)

    # Make input and subject dirs
    if not os.path.exists(dl_dir):
        os.makedirs(dl_dir)
    if not os.path.exists(subjects_dir):
        os.makedirs(subjects_dir)

    # Get S3 anatomical paths dictionary
    anat_dict = return_anat_dict(bucket, prefix)

    # Get list of unique subject ids to download
    key_list = sorted(anat_dict.keys())

    # Determine number of GPUs
    num_gpus = cuda.Device.count()

    subj_id = []
    proc = []
    upload_proc = []

    # Launch one recon-all process per GPU
    for inst in range(num_gpus):
        # Set environment variable
        os.environ['FREESURFER_CUDA_DEVICE'] = str(inst)

        # Get the index of the subject to be run
        subj_index = num_gpus * index + inst

        # Extract subject of interest
        subj_id.append(key_list[subj_index])
        s3_path = anat_dict[subj_id[inst]]

        # Download data
        fs_log.info('Downloading %s...' % s3_path)
        s3_key = bucket.get_key(s3_path)
        s3_filename = os.path.basename(s3_path)
        dl_filename = os.path.join(dl_dir, subj_id[inst], s3_filename)

        # Make folders if need be
        dl_dirs = os.path.dirname(dl_filename)
        if not os.path.exists(dl_dirs):
            os.makedirs(dl_dirs)
        s3_key.get_contents_to_filename(dl_filename)

        # Execute recon-all
        cmd_list = ['recon-all', '-use_gpu', '-openmp', '8', '-time',
                    '-qcache', '-i', dl_filename, '-subjid',
                    subj_id[inst], '-all']
        cmd_str = ' '.join(cmd_list)
        fs_log.info('Executing %s...' % cmd_str)
        # Use subprocess to send command and communicate outputs
        proc.append(subprocess.Popen(cmd_list))

    # Run uploads with multiprocessing's Process
    for inst in range(0, num_gpus):
        proc[inst].wait()

        # Gather processed data
        fs_log.info('Gathering outputs for upload to S3...')
        upl_list = []
        subj_dir = os.path.join(subjects_dir, subj_id[inst])
        for root, dirs, files in os.walk(subj_dir):
            if files:
                upl_list.extend([os.path.join(root, fl) for fl in files])

        # Update log with upload info
        fs_log.info('Gathered %d files for upload to S3' % len(upl_list))

        # Build upload list
        upl_prefix = os.path.join(prefix.replace('RawData', 'Outputs'),
                                  'freesurfer_gpu', subj_id[inst])
        s3_upl_list = [upl.replace(subj_dir, upl_prefix)
                       for upl in upl_list]

        # Upload to S3
        upload_proc.append(Process(target=aws_utils.s3_upload,
                                   args=(bucket, upl_list, s3_upl_list)))
        upload_proc[inst].start()

    # Check that uploading has finished
    for inst in range(0, num_gpus):
        upload_proc[inst].join()
import os
import sys
from CPAC.AWS import fetch_creds

# Argument-count guard (reconstructed from the usage message; the
# original fragment began at the print statement)
if len(sys.argv) < 4:
    print 'Usage: %s <path to AWS creds> <temporary directory> ' \
          '<S3 prefix to BIDS base>' % sys.argv[0]
    sys.exit(1)

creds = sys.argv[1]
tmp = sys.argv[2]
s3_prefix = sys.argv[3]

# Ensure the last character in s3_prefix is a slash
if s3_prefix[-1] != '/':
    s3_prefix += '/'

tmp = os.path.join(tmp, s3_prefix.split('/')[-2])

# Fetch 4 participants from the BIDS dataset and download to a
# temporary directory. Start by fetching all keys.
bucket = fetch_creds.return_bucket(creds, 'fcp-indi')
key_list = []
for i, k in enumerate(bucket.list(prefix=s3_prefix)):
    key_list.append(str(k.name).replace(s3_prefix, ''))

# Fetch all unique participant codes
participants = [k.split('/')[0] for k in key_list if 'sub-' in k]
participants = sorted(list(set(participants)))
participants = participants[0:4]

downloads_list = [os.path.join(s3_prefix, k) for k in key_list
                  if ('sub-' in k and k.split('/')[0] in participants)
                  or ('sub-' not in k)]

# Download the files.
from CPAC.AWS import aws_utils, fetch_creds
import tarfile
import os
import shutil
import re
import sys

keyspath = sys.argv[1]
bucket = fetch_creds.return_bucket(keyspath, 'fcp-indi')

# Be sure to put in the last forward slash, as the prefix may act as a
# wildcard otherwise
ipdir = 'data/Projects/CORR/RawData/'
opdir = 'data/Projects/CORR/RawDataBIDs/'

srclist = []
for i, k in enumerate(bucket.list(prefix=ipdir)):
    srclist.append(k.name)
    print k.name
srclist = sorted(srclist)

# (closing brace added; the original fragment was cut off after the
# commented-out entry)
matchdct = {'anat': [r"(.+)/([0-9]+)/session_([0-9]{1,2})/anat_([0-9]{1,2})/anat.nii.gz",
                     r"\1/sub-\2/ses-\3/anat/sub-\2_ses-\3_run-\4_T1w.nii.gz"],
            #'mpi_anat_comp':
            #[r"(.+)/([0-9]+)/session_([0-9]{1,2})/anat_([0-9]{1,2})/anat_([a-z12\_]+).nii.gz",
            #r"\1/sub-\2/ses-\3/anat/sub-\2_ses-\3_acq-\5_run-\4_T1w.nii.gz"],
            }
def main(sub_idx):
    '''
    Download a subject's functional_mni inputs from S3, run eigenvector
    centrality workflows for each strategy, and upload the results back
    to S3
    '''

    # Import packages (the original snippet relied on module-level
    # imports; the Configuration import path is assumed)
    import glob
    import os
    import shutil
    from multiprocessing import Process
    import yaml
    from CPAC.AWS import aws_utils, fetch_creds
    from CPAC.utils.configuration import Configuration

    # Init variables
    bucket_name = 'fcp-indi'
    bucket_prefix = 'data/Projects/ABIDE_Initiative/Outputs/cpac/raw_outputs_rerun'
    config_file = '/home/ubuntu/abide_run/settings/pipeline_config_abide_rerun.yml'
    creds_path = '/home/ubuntu/secure-creds/aws-keys/fcp-indi-keys2.csv'
    local_prefix = '/mnt/eigen_run'
    sublist_file = '/home/ubuntu/abide_run/eig-subs1.yml'

    # Pull in bucket, config, and subject
    sublist = yaml.load(open(sublist_file, 'r'))
    subject = sublist[sub_idx]
    sub_id = subject.split('_')[-1]
    bucket = fetch_creds.return_bucket(creds_path, bucket_name)
    c = Configuration(yaml.load(open(config_file, 'r')))

    # Test to see if they're already uploaded
    to_do = True
    if to_do:
        # Collect functional_mni list from S3 bucket
        filt_global = 'pipeline_abide_rerun__freq-filter/%s_session_1/functional_mni/_scan_rest_1_rest/_csf_threshold_0.96/_gm_threshold_0.7/_wm_threshold_0.96/_compcor_ncomponents_5_selector_pc10.linear1.wm0.global1.motion1.quadratic1.gm0.compcor1.csf0/_bandpass_freqs_0.01.0.1/bandpassed_demeaned_filtered_antswarp.nii.gz' % sub_id
        filt_noglobal = filt_global.replace('global1', 'global0')
        nofilt_global = 'pipeline_abide_rerun/%s_session_1/functional_mni/_scan_rest_1_rest/_csf_threshold_0.96/_gm_threshold_0.7/_wm_threshold_0.96/_compcor_ncomponents_5_selector_pc10.linear1.wm0.global1.motion1.quadratic1.gm0.compcor1.csf0/residual_antswarp.nii.gz' % sub_id
        nofilt_noglobal = nofilt_global.replace('global1', 'global0')
        s3_functional_mni_list = [filt_global, filt_noglobal,
                                  nofilt_global, nofilt_noglobal]
        s3_functional_mni_list = [os.path.join(bucket_prefix, s)
                                  for s in s3_functional_mni_list]

        # Download contents to local inputs directory
        try:
            aws_utils.s3_download(bucket, s3_functional_mni_list,
                                  local_prefix=os.path.join(
                                      local_prefix, 'centrality_inputs'),
                                  bucket_prefix=bucket_prefix)
        except Exception as e:
            print 'Unable to find eigenvector centrality inputs for ' \
                  'subject %s, skipping...' % sub_id
            print 'Error: %s' % e
            return

        # Build strat dict (dictionary of strategies and local input paths)
        strat_dict = {'filt_global': os.path.join(local_prefix,
                                                  'centrality_inputs',
                                                  filt_global),
                      'filt_noglobal': os.path.join(local_prefix,
                                                    'centrality_inputs',
                                                    filt_noglobal),
                      'nofilt_noglobal': os.path.join(local_prefix,
                                                      'centrality_inputs',
                                                      nofilt_noglobal),
                      'nofilt_global': os.path.join(local_prefix,
                                                    'centrality_inputs',
                                                    nofilt_global)}

        # Create list of processes
        proc_list = [Process(target=make_workflow,
                             args=(in_name, strat, sub_id, c, local_prefix))
                     for strat, in_name in strat_dict.items()]

        # Iterate through processes and fire off
        for p in proc_list:
            p.start()

        for p in proc_list:
            if p.is_alive():
                p.join()

        # Gather outputs
        wfs = glob.glob(os.path.join(local_prefix, 'eigen_wf_%s_*' % sub_id))
        local_list = []
        for wf in wfs:
            for root, dirs, files in os.walk(wf):
                if files:
                    local_list.extend([os.path.join(root, f) for f in files])
        s3_list = [loc.replace(local_prefix,
                               'data/Projects/ABIDE_Initiative/Outputs/cpac/raw_outputs_eigen')
                   for loc in local_list]
        aws_utils.s3_upload(bucket, local_list, s3_list)

        # And delete working directories
        try:
            for input_file in strat_dict.values():
                print 'removing input file %s...' % input_file
                # Note: the original called os.remove(input_file % sub_id),
                # but the paths here are already fully substituted
                os.remove(input_file)
        except Exception as e:
            print 'Unable to remove input files'
            print 'Error: %s' % e

        work_dirs = glob.glob(os.path.join(local_prefix,
                                           'eigen_wf_%s_*' % sub_id))
        for work_dir in work_dirs:
            print 'removing %s...' % work_dir
            shutil.rmtree(work_dir)
    else:
        print 'subject %s already processed and uploaded, skipping...' % sub_id
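# A sketch of how main(sub_idx) might be dispatched per subject; the
# 0-based index into the subject-list YAML comes from the command line
# here, which is an assumption about how the script was launched.
#
# if __name__ == '__main__':
#     import sys
#     main(int(sys.argv[1]))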
def pull_S3_sublist(yaml_outpath, img_type, cfg_file):
    # function example use:
    #
    #     yamlpath = os.path.join(os.getcwd(), "s3dict.yml")
    #
    #     # Build entire filepath dictionary from S3
    #     s3_dict_yml = pull_S3_sublist(yamlpath, 'anat', args.config)

    # Import packages
    import os
    from CPAC.AWS import fetch_creds
    import yaml

    # Init variables
    s3_list = []
    s3_dict = {}

    # Load config file
    with open(cfg_file, 'r') as f:
        cfg_dict = yaml.load(f)

    bucket_name = cfg_dict["bucket_name"]
    bucket_prefix = cfg_dict["bucket_prefix"]
    creds_path = cfg_dict["creds_path"]
    bucket = fetch_creds.return_bucket(creds_path, bucket_name)

    # Filter for anat/rest
    if img_type == 'anat':
        subkey_type = 'anatomical_scan'
    elif img_type == 'rest':
        subkey_type = 'functional_scan'

    # Build S3-subjects to download
    for bk in bucket.list(prefix=bucket_prefix):
        s3_list.append(str(bk.name))

    # Build dictionary of filepaths
    for sfile in s3_list:
        ssplit = sfile.split('/')
        sub_id = ssplit[-4]
        session_id = ssplit[-3]
        scan_id = ssplit[-2]

        if img_type in scan_id:
            # this ONLY handles raw data inputs, not CPAC-generated
            # outputs!
            # Note: build resource_dict fresh each iteration so existing
            # keys are updated with the current file, not a stale dict
            resource_dict = {subkey_type: sfile}
            if (sub_id, session_id, scan_id) not in s3_dict:
                s3_dict[(sub_id, session_id, scan_id)] = {}
            s3_dict[(sub_id, session_id, scan_id)].update(resource_dict)
        else:
            continue

    if len(s3_dict) == 0:
        err = "\n[!] Filepaths have not been successfully gathered from " \
              "the S3 bucket!\n"
        raise Exception(err)

    # Write yaml file
    with open(yaml_outpath, "wt") as f:
        f.write(yaml.dump(s3_dict))

    if os.path.isfile(yaml_outpath):
        return yaml_outpath
    else:
        err = "\n[!] Filepaths from the S3 bucket have not been " \
              "successfully saved to the YAML file!\nOutput filepath: %s\n" \
              % yaml_outpath
        raise Exception(err)
def s3_match_and_move(keyspath, matchdct, ipdir, opdir, dryrun):
    '''
    A function to match and rename or move keys in an S3 bucket using
    regular expressions
    '''

    bucket = fetch_creds.return_bucket(keyspath, 'fcp-indi')

    fo = open('wrongetags.csv', 'a')
    fo.write('src,dest\n')
    fo.close()

    srclist = []
    files_converted = []
    destlist_tot = []
    for i, k in enumerate(bucket.list(prefix=ipdir)):
        srclist.append(k.name)
        #print k.name
    srclist = sorted(srclist)

    for mk in sorted(matchdct.keys()):
        print mk
        print matchdct[mk]['match']
        fo = open('wrongetags.csv', 'a')
        srclist_filt = []
        destlist = []
        for sl in srclist:
            if 'include' in matchdct[mk].keys():
                if re.search(matchdct[mk]['match'][0], sl) and any(
                        m in sl for m in matchdct[mk]['include']):
                    #print sl, re.sub(matchdct[mk][0], matchdct[mk][1], sl)
                    srclist_filt.append(sl)
                    destlist.append(re.sub(matchdct[mk]['match'][0],
                                           matchdct[mk]['match'][1],
                                           sl).replace(ipdir, opdir))
            else:
                if re.search(matchdct[mk]['match'][0], sl):
                    #print sl, re.sub(matchdct[mk][0], matchdct[mk][1], sl)
                    srclist_filt.append(sl)
                    destlist.append(re.sub(matchdct[mk]['match'][0],
                                           matchdct[mk]['match'][1],
                                           sl).replace(ipdir, opdir))

        if len(destlist) != len(set(destlist)):
            raise Exception('Duplicate Destination Filepaths exist')

        files_converted = files_converted + srclist_filt
        destlist_tot = destlist_tot + destlist

        if dryrun == 'yes':
            for j, slf in enumerate(srclist_filt):
                if bucket.get_key(destlist[j]):
                    dx = bucket.get_key(destlist[j])
                    sx = bucket.get_key(srclist_filt[j])
                    if dx.etag != sx.etag:
                        print '###### wrong etag ##### changing: ', \
                            srclist_filt[j], destlist[j]
                        fo.write(srclist_filt[j] + ',' + destlist[j] + '\n')
                    else:
                        pass
                        #print 'Already Exists and same etag: ', srclist_filt[j], destlist[j]
                #else:
                #    print 'copying ', srclist_filt[j], destlist[j]
        else:
            # Note: might error with make_public=True; removing it stops
            # the error, unsure why the error occurs
            aws_utils.s3_rename(bucket, srclist_filt, destlist,
                                keep_old=True, make_public=True,
                                overwrite=True)
        fo.close()

    print 'num files pulled in:', len(files_converted), \
        'num files produced', len(destlist_tot)
    if len(files_converted) != len(destlist_tot):
        raise Exception('There is a mismatch in the total files read in, '
                        'and total files produced')
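# A sketch of the matchdct structure s3_match_and_move expects, inferred
# from the lookups above: each entry has a 'match' pair of (pattern,
# replacement) regexes and an optional 'include' list of required
# substrings. The patterns below are illustrative, not from any real
# conversion.
#
# matchdct = {
#     'anat': {'match': [r"(.+)/([0-9]+)/session_([0-9]+)/anat_([0-9]+)/anat.nii.gz",
#                        r"\1/sub-\2/ses-\3/anat/sub-\2_ses-\3_run-\4_T1w.nii.gz"],
#              'include': ['session_1']},
# }
# s3_match_and_move('/path/to/aws-keys.csv', matchdct,
#                   'data/Projects/my_study/RawData/',
#                   'data/Projects/my_study/RawDataBIDS/', dryrun='yes')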
from CPAC.AWS import aws_utils, fetch_creds
import tarfile
import os
import shutil

bucket = fetch_creds.return_bucket('/home/ubuntu/doconnor-fcp-indi-keys.csv',
                                   'fcp-indi')

src_list = []
for i, k in enumerate(bucket.list(prefix='data/Projects/ACPI/Outputs/')):
    src_list.append([str(k.name), k.size])

subids = sorted(set([sl[0].split('/')[5].split('-')[0] for sl in src_list]))
strats = sorted(set([sl[0].split('/')[4] for sl in src_list]))
strats = strats[3:]

stratdict = {}
for strat in strats:
    stratdict[strat] = {}
    subdict = {}
    for subid in subids:
        subdict[subid] = {}
        for i, src_file in enumerate(sorted(src_list)):
            if (subid in src_file[0]) and (strat in src_file[0]):
                nme = src_file[0]
                sze = src_file[1]
                propdict = {}
                bits = str(nme).split('/')
                filename = bits[-1]
                propdict['name'] = nme
                propdict['size'] = sze
def main(index, local_dir):
    '''
    Function to download an anatomical dataset from S3 and process it
    through Freesurfer's recon-all command, then upload the data back
    to S3

    Parameters
    ----------
    index : integer
        the index of the subject to process
    local_dir : string
        filepath to the local directory to store the input and
        processed outputs
    '''

    # Import packages
    import boto
    import logging
    import os
    import subprocess
    from CPAC.AWS import aws_utils, fetch_creds

    # Init variables
    creds_path = '/home/ubuntu/secure-creds/aws-keys/fcp-indi-keys2.csv'
    bucket = fetch_creds.return_bucket(creds_path, 'fcp-indi')
    prefix = 'data/Projects/CORR/RawData/IBA_TRT/'
    dl_dir = os.path.join(local_dir, 'inputs')
    subjects_dir = os.path.join(local_dir, 'subjects')

    # Setup logger
    fs_log_path = os.path.join(local_dir, 'download_run_fs_%d.log' % index)
    fs_log = setup_logger('fs_log', fs_log_path, logging.INFO,
                          to_screen=True)

    # Make input and subject dirs
    if not os.path.exists(dl_dir):
        os.makedirs(dl_dir)
    if not os.path.exists(subjects_dir):
        os.makedirs(subjects_dir)

    # Get S3 anatomical paths dictionary
    anat_dict = return_anat_dict(bucket, prefix)

    # Get list of unique subject ids to download
    key_list = sorted(anat_dict.keys())

    # Extract subject of interest
    subj_id = key_list[index]
    s3_path = anat_dict[subj_id]

    # Download data
    fs_log.info('Downloading %s...' % s3_path)
    s3_key = bucket.get_key(s3_path)
    s3_filename = os.path.basename(s3_path)
    dl_filename = os.path.join(dl_dir, subj_id, s3_filename)

    # Make folders if need be
    dl_dirs = os.path.dirname(dl_filename)
    if not os.path.exists(dl_dirs):
        os.makedirs(dl_dirs)
    s3_key.get_contents_to_filename(dl_filename)

    # Execute recon-all
    cmd_list = ['recon-all', '-openmp', '4', '-i', dl_filename,
                '-subjid', subj_id, '-qcache', '-all']
    cmd_str = ' '.join(cmd_list)
    fs_log.info('Executing %s...' % cmd_str)

    # Use subprocess to send command and communicate outputs
    proc = subprocess.Popen(cmd_list, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)

    # Stream output
    while proc.poll() is None:
        stdout_line = proc.stdout.readline()
        fs_log.info(stdout_line)
    proc.wait()

    # Gather processed data
    fs_log.info('Gathering outputs for upload to S3...')
    upl_list = []
    subj_dir = os.path.join(subjects_dir, subj_id)
    for root, dirs, files in os.walk(subj_dir):
        if files:
            upl_list.extend([os.path.join(root, fl) for fl in files])

    # Update log with upload info
    fs_log.info('Gathered %d files for upload to S3' % len(upl_list))

    # Build upload list
    upl_prefix = os.path.join(prefix.replace('RawData', 'Outputs'),
                              'freesurfer', subj_id)
    s3_upl_list = [upl.replace(subj_dir, upl_prefix) for upl in upl_list]

    # Upload to S3
    aws_utils.s3_upload(bucket, upl_list, s3_upl_list, overwrite=True,
                        make_public=True)