def parse_methylation(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
    Transform the file: includes data cleaning
    Add Metadata information
    """
    # setup logging
    configure_logging('methylation', "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name,
                                                filename, skiprows=1)
    data_df.columns = ['Probe_Id', 'Beta_Value', 'Gene_Symbol',
                       'Chromosome', 'Genomic_Coordinate']
    data_df = add_metadata(data_df, metadata)
    data_df = additional_changes(data_df)

    # upload the contents of the dataframe as CSV (no header, two-decimal floats)
    df_string = data_df.to_csv(index=False, header=False, float_format='%.2f')
    status = gcs.upload_blob_from_string(outfilename, df_string)

    return status
def parse_protein(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
    Transform the file: includes data cleaning
    Add Metadata information
    """
    # setup logging
    configure_logging('protein', "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name,
                                                filename, skiprows=1)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # validation
    tests.assert_notnull_property(data_df, columns_list=['Protein_Name'])

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename, metadata=metadata)

    return status
def load(config):
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
    source_format, write_disposition, poll_interval, num_retries
    """
    log = configure_logging('data_load', 'logs/data_load.log')
    log.info('begin load of data into bigquery')
    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    log.info("\tLoading data into BigQuery...")
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['data']['bq_table'],
        schemas_dir + config['data']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +
        config['data']['output_dir'] + '*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )
    log.info('finished load of data into bigquery')
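# --- Hedged example (not from the source): a minimal config dict carrying the
# --- keys that load() above reads. Only the key names come from the code; every
# --- value below is a placeholder.
EXAMPLE_DATA_LOAD_CONFIG = {
    'project_id': 'my-gcp-project',              # placeholder project id
    'bq_dataset': 'tcga_etl',                    # placeholder BigQuery dataset
    'buckets': {'open': 'my-open-bucket'},       # GCS bucket holding the output
    'data': {
        'bq_table': 'metadata_data',             # placeholder table name
        'schema_file': 'data.json',              # resolved against SCHEMA_DIR
        'output_dir': 'tcga/intermediary/data/'  # GCS prefix globbed with '*'
    }
}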
def parse_isoform(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
    Transform the file: includes data cleaning
    Add Metadata information
    """
    # setup logging
    log = configure_logging('mirna.isoform.transform',
                            "logs/mirna_isoform_transform_" + metadata['AliquotBarcode'] + '.log')
    try:
        log.info('start transform of %s' % (metadata['AliquotBarcode']))
        # connect to the cloud bucket
        gcs = GcsConnector(project_id, bucket_name)

        # main steps: download, convert to df, cleanup, transform, add metadata
        log.info('\tadd changes and metadata for %s' % (metadata['AliquotBarcode']))
        data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name,
                                                    filename, log=log)
        data_df = additional_changes(data_df)
        data_df = add_metadata(data_df, metadata)

        # upload the contents of the dataframe in njson format
        status = gcs.convert_df_to_njson_and_upload(data_df, outfilename)
        log.info('finished transform of %s' % (metadata['AliquotBarcode']))
    except Exception as e:
        log.exception('problem transforming %s' % (metadata['AliquotBarcode']))
        raise e
    return status
def identify_data(config):
    """Gets the metadata info from database
    """
    # setup logging
    log = configure_logging('methylation_extract', "logs/" + 'methylation_extract.log')
    log.info('start methylation extract')

    # cloudSql connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']

    sqlquery = """
        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, Pipeline,
               Platform, SampleType, SampleTypeCode, Study, DatafileName,
               DatafileNameKey, Datatype, DatafileUploaded, IncludeForAnalysis
        FROM metadata_data
        WHERE DataType='DNA Methylation'
              AND DatafileUploaded='true'
              AND IncludeForAnalysis='yes'
        """

    # connect to db and get results in a dataframe
    log.info("\tselect file names from db")
    metadata_df = read_mysql_query(host, database, user, passwd, sqlquery)

    # build the output file key for each row
    for i, row in metadata_df.iterrows():
        metadata = row.to_dict()
        platform = row['Platform']
        metadata_df.loc[i, 'OutDatafileNameKey'] = config['methylation']['output_dir']\
            + platform + '/' + metadata['DatafileName'] + '.csv'

    metadata_df.loc[:, 'SampleTypeLetterCode'] = metadata_df['SampleTypeCode']\
        .map(lambda code: config['sample_code2letter'][code])
    metadata_df.loc[:, 'DatafileNameKey'] = metadata_df['DatafileNameKey']\
        .map(lambda inputfile: re.sub(r"^/", "", inputfile))

    # tag CELLC samples
    metadata_df['is_tumor'] = (metadata_df['SampleTypeLetterCode'] != "CELLC")
    metadata_df['transform_function'] = 'methylation.transform.parse_methylation'

    log.info("\tFound {0} rows, columns.".format(str(metadata_df.shape)))

    # Filter - check all "is_" fields - remember all 'is_' fields must be boolean
    all_flag_columns = [key for key in metadata_df.columns.values if key.startswith("is_")]
    flag_df = metadata_df[all_flag_columns]
    metadata_df = metadata_df[flag_df.all(axis=1)]

    log.info("\tAfter filtering: Found {0} rows, columns.".format(str(metadata_df.shape)))
    log.info('finished methylation extract')
    return metadata_df
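# --- Illustrative sketch (not part of the original module) of the "is_" flag
# --- filter used above: a row survives only if every boolean column whose name
# --- starts with "is_" is True. The toy data below is invented for the example.
import pandas as pd

toy_df = pd.DataFrame({
    'AliquotBarcode': ['AQ-1', 'AQ-2', 'AQ-3'],
    'is_tumor': [True, True, False],   # CELLC samples would carry False here
})
flag_columns = [col for col in toy_df.columns.values if col.startswith('is_')]
filtered_df = toy_df[toy_df[flag_columns].all(axis=1)]   # keeps AQ-1 and AQ-2 only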
def identify_data(config):
    """Gets the metadata info from database
    """
    log = configure_logging('cnv', 'logs/cnv_extract.log')
    log.info('start cnv extract')

    # cloudSql connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']

    log.info("\tselect file names from db")
    sqlquery = """
        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, Pipeline,
               Platform, SampleType, SampleTypeCode, Study, DatafileName,
               DatafileNameKey, Datatype, DatafileUploaded, IncludeForAnalysis,
               DataCenterName
        FROM metadata_data
        WHERE DataType='Copy Number Results-SNP'
              AND DatafileUploaded='true'
              AND DatafileNameKey LIKE '%.nocnv_hg19.seg.txt'
              AND IncludeForAnalysis='yes'
        """

    # connect to db and get results in a dataframe
    metadata_df = read_mysql_query(host, database, user, passwd, sqlquery)

    # the number of rows exceeds the GCS-to-BigQuery import limitation, so
    # rotate the output files across multiple virtual directories
    dir_prefix = config['cnv']['output_dir_prefix']
    dir_suffixes = config['cnv']['output_dir_suffixes']
    num_suffix = len(dir_suffixes)
    for i, row in metadata_df.iterrows():
        metadata = row.to_dict()
        metadata_df.loc[i, 'OutDatafileNameKey'] = dir_prefix + dir_suffixes[i % num_suffix] \
            + metadata['DatafileName'] + '.json'

    metadata_df.loc[:, 'SampleTypeLetterCode'] = metadata_df['SampleTypeCode']\
        .map(lambda code: config['sample_code2letter'][code])
    metadata_df.loc[:, 'DatafileNameKey'] = metadata_df['DatafileNameKey']\
        .map(lambda inputfile: re.sub(r"^/", "", inputfile))

    # tag CELLC samples
    metadata_df['is_tumor'] = (metadata_df['SampleTypeLetterCode'] != "CELLC")
    metadata_df['transform_function'] = 'cnv.transform.parse_cnv'

    log.info("\tFound {0} rows, columns.".format(str(metadata_df.shape)))

    # Filter - check all "is_" fields - remember all 'is_' fields must be boolean
    all_flag_columns = [key for key in metadata_df.columns.values if key.startswith("is_")]
    flag_df = metadata_df[all_flag_columns]
    metadata_df = metadata_df[flag_df.all(axis=1)]

    log.info("\tAfter filtering: Found {0} rows, columns.".format(str(metadata_df.shape)))
    log.info('finished cnv extract')
    return metadata_df
def parse_cnv(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
    Transform the file: includes data cleaning
    Add Metadata information
    """
    # setup logging
    configure_logging('cnv', "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename)

    return status
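# --- Hedged sketch of how an extract row is expected to be wired to its
# --- transform; the ETL driver itself is not shown in this listing, and
# --- run_transform_for_row is a hypothetical helper. It relies only on columns
# --- that identify_data() actually sets: 'transform_function'
# --- (e.g. 'cnv.transform.parse_cnv'), 'DatafileNameKey' and 'OutDatafileNameKey'.
import importlib

def run_transform_for_row(project_id, bucket_name, row):
    metadata = row.to_dict()
    module_name, func_name = metadata['transform_function'].rsplit('.', 1)
    transform = getattr(importlib.import_module(module_name), func_name)
    return transform(project_id, bucket_name,
                     metadata['DatafileNameKey'],     # input blob in GCS
                     metadata['OutDatafileNameKey'],  # output blob to write
                     metadata)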
def parse_isoform(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
    Transform the file: includes data cleaning
    Add Metadata information
    """
    # setup logging
    configure_logging('mirna.isoform', "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename)

    return status
def main(configfilename, hgnc_df_filename):
    # go through the antibody annotation (AA) files and create corrected files
    try:
        log_filename = 'etl_process_antibody_files.log'
        log_name = 'etl_process_antibody_files'
        log = configure_logging(log_name, log_filename)
        log.info('start processing antibody annotation files')

        with open(configfilename) as configfile:
            config = json.load(configfile)

        download_antibody_annotation_files(config, log)
        process_antibody_annotation_files(config, log)
        fix_gene_protein_inconsistencies(config, hgnc_df_filename, log)
        log.info('done processing antibody annotation files')
    except Exception as e:
        log.exception('fatal problem processing antibody files')
        raise e
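# --- Hedged sketch of a command-line entry point for main() above; the original
# --- __main__ block is not part of this listing, so the argument order is taken
# --- from the main(configfilename, hgnc_df_filename) signature, and the script
# --- name in the comment is hypothetical.
if __name__ == '__main__':
    import sys
    # e.g. python process_antibody_files.py config.json hgnc.tsv
    main(sys.argv[1], sys.argv[2])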
def load(config):
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
    source_format, write_disposition, poll_interval, num_retries
    """
    log = configure_logging('cnv_load', 'logs/cnv_load.log')
    log.info('begin load of cnv into bigquery')
    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    dir_prefix = config['cnv']['output_dir_prefix']
    dir_suffixes = config['cnv']['output_dir_suffixes']
    for dir_suffix in dir_suffixes:
        log.info("\tLoading CNV data into BigQuery from %s..." % (dir_prefix + dir_suffix))
        load_data_from_file.run(
            config['project_id'],
            config['bq_dataset'],
            config['cnv']['bq_table'],
            schemas_dir + config['cnv']['schema_file'],
            'gs://' + config['buckets']['open'] + '/' +
            dir_prefix + dir_suffix + '*',
            'NEWLINE_DELIMITED_JSON',
            'WRITE_APPEND'
        )
        log.info("*" * 30)

    log.info('finished load of CNV into bigquery')
def load(config):
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
    source_format, write_disposition, poll_interval, num_retries
    """
    log = configure_logging('mirna_isoform_matrix_load', 'logs/mirna_isoform_matrix_load.log')
    log.info('begin load of mirna isoform matrix into bigquery')
    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    log.info("\tLoading Isoform HiSeq matrix data into BigQuery..")
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['mirna_isoform_matrix']['bq_table_hiseq'],
        schemas_dir + config['mirna_isoform_matrix']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +
        config['mirna_isoform_matrix']['IlluminaHiSeq']['output_dir'] + '*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )
    log.info("*" * 30)

    log.info("\tLoading Isoform GA matrix data into BigQuery..")
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['mirna_isoform_matrix']['bq_table_ga'],
        schemas_dir + config['mirna_isoform_matrix']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +
        config['mirna_isoform_matrix']['IlluminaGA']['output_dir'] + '*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_APPEND'
    )
    log.info('done load of mirna isoform matrix into bigquery')
def load(config):
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
    source_format, write_disposition, poll_interval, num_retries
    """
    log = configure_logging('methylation_split', 'logs/methylation_load.log')
    log.info('begin load of methylation into bigquery')
    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    # print "Loading Methylation 450K data into BigQuery.."
    # load_data_from_file.run(
    #     config['project_id'],
    #     config['bq_dataset'],
    #     config['methylation']['bq_table'],
    #     schemas_dir + config['methylation']['schema_file'],
    #     'gs://' + config['buckets']['open'] + '/' +
    #     config['methylation']['output_dir'] + 'HumanMethylation450/*',
    #     'CSV',
    #     'WRITE_EMPTY'
    # )

    log.info("\tLoading Methylation data into BigQuery...")
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['methylation']['bq_table'],
        schemas_dir + config['methylation']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +
        config['methylation']['output_dir'] + '*',
        'CSV',
        'WRITE_APPEND'
    )

    # split the loaded methylation table by chromosome (see main() in the split module)
    main(config, log)
    log.info('finished load of methylation into bigquery')
        action='store_true',
        default=False)
    parser.add_argument(
        '--create_new',
        help='Set create disposition to True. This deletes the old task queue, db etc. '
             'WARNING: This deletes old files, database, task queue etc.',
        action='store_true',
        default=False)
    parser.add_argument(
        '--debug',
        help='Runs the pipeline on the first 30 records and then exits. Useful to run '
             'after the dry run to test the workflow. (default: False)',
        action='store_true',
        default=False)

    args = parser.parse_args()

    log_filename = 'etl_{0}.log'.format(args.datatype)
    log_name = 'etl_{0}'.format(args.datatype)
    log = configure_logging(log_name, log_filename)

    results = main(
        args.datatype,
        args.config_file,
        args.max_workers,
        args.dry_run,
        args.create_new,
        args.debug
    )
def identify_data(config):
    """Gets the metadata info from database
    """
    log = configure_logging('mirna.isoform', 'logs/mirna_isoform_extract_.log')
    log.info('start mirna isoform extract')

    # cloudSql connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']

    log.info("\tselect file names from db")
    sqlquery = """
        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, Pipeline,
               Platform, SampleType, SampleTypeCode, Study, DatafileName,
               DatafileNameKey, Datatype, DatafileUploaded, IncludeForAnalysis,
               DataCenterName
        FROM metadata_data
        WHERE DataType='Expression-miRNA Isoform'
              AND DatafileUploaded='true'
              AND DatafileNameKey LIKE '%hg19.mirbase20.isoform.quantification.txt'
              AND IncludeForAnalysis='yes'
        """

    # connect to db and get results in a dataframe
    metadata_df = read_mysql_query(host, database, user, passwd, sqlquery)

    # rename platforms in rows and build the output file key
    for i, row in metadata_df.iterrows():
        metadata = row.to_dict()
        if 'IlluminaHiSeq' in metadata['Platform']:
            metadata_df.loc[i, 'Platform'] = 'IlluminaHiSeq'
            platform = 'IlluminaHiSeq'
        elif 'IlluminaGA' in metadata['Platform']:
            metadata_df.loc[i, 'Platform'] = 'IlluminaGA'
            platform = 'IlluminaGA'
        else:
            raise Exception('Did not match any given platform')
        metadata_df.loc[i, 'OutDatafileNameKey'] = config['mirna']['isoform']['output_dir']\
            + platform + '/' + metadata['DatafileName'] + '.json'

    metadata_df.loc[:, 'SampleTypeLetterCode'] = metadata_df['SampleTypeCode']\
        .map(lambda code: config['sample_code2letter'][code])
    metadata_df.loc[:, 'DatafileNameKey'] = metadata_df['DatafileNameKey']\
        .map(lambda inputfile: re.sub(r"^/", "", inputfile))

    # tag CELLC samples
    metadata_df['is_tumor'] = (metadata_df['SampleTypeLetterCode'] != "CELLC")
    metadata_df['transform_function'] = 'mirna.isoform.transform.parse_isoform'

    log.info("\tFound {0} rows, columns.".format(str(metadata_df.shape)))

    # Filter - check all "is_" fields - remember all 'is_' fields must be boolean
    all_flag_columns = [key for key in metadata_df.columns.values if key.startswith("is_")]
    flag_df = metadata_df[all_flag_columns]
    metadata_df = metadata_df[flag_df.all(axis=1)]

    log.info("\tAfter filtering: Found {0} rows, columns.".format(str(metadata_df.shape)))
    log.info('finished mirna isoform extract')
    return metadata_df
                 (chromosome))
        time.sleep(20)
    except HttpError as err:
        print 'Error:'
        pprint.pprint(err.content)
    except AccessTokenRefreshError:
        print("Credentials have been revoked or expired, please re-run "
              "the application to re-authorize")


def main(config, log):
    log.info('start splitting methylation data by chromosome')
    project_id = config['project_id']
    dataset_id = config['bq_dataset']
    chromosomes = map(str, range(1, 23)) + ['X', 'Y']
    # chromosomes = map(lambda orig_string: 'chr' + orig_string, chr_nums)
    for chromosome in chromosomes:
        split_table_by_chr(chromosome, project_id, dataset_id, log)
    log.info('done splitting methylation data by chromosome')


if __name__ == '__main__':
    # setup logging
    with open(sys.argv[1]) as configfile:
        config = json.load(configfile)
    log = configure_logging('methylation_split', "logs/methylation_transform_split" + '.log')
    main(config, log)
def identify_data(config):
    """Gets the metadata info from database
    """
    log = configure_logging('data_et', 'logs/data_et.log')
    log.info('start data extract and transform')

    # cloudSql connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']

    log.info("\tselect data records from db")
    sqlquery = """
        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, AliquotUUID,
               DataArchiveName, DataArchiveURL, DataArchiveVersion, DataCenterCode,
               DataCenterName, DataCenterType, DatafileMD5, DatafileName,
               DatafileNameKey, DatafileUploaded, DataLevel, Datatype,
               GenomeReference, IncludeForAnalysis, MAGETabArchiveName,
               MAGETabArchiveURL, Pipeline, Platform, Project, Repository,
               SampleType, SampleTypeCode, SDRFFileName, SDRFFileNameKey,
               SecurityProtocol, Species, Study, wasDerivedFrom, library_strategy,
               state, reason_for_state, analysis_id, analyte_code, last_modified,
               platform_full_name, GG_dataset_id, GG_readgroupset_id
        FROM metadata_data
        WHERE Project = 'TCGA'
              AND study = '%s'
              AND DatafileUploaded='true'
              AND DatafileNameKey is not null
              AND IncludeForAnalysis='yes'
        """

    studies = config['all_tumor_types']
    for study in studies:
        # connect to db and get results in a dataframe
        log.info("\tselect data records from db for %s" % (study))
        metadata_df = read_mysql_query(host, database, user, passwd,
                                       sqlquery % (study))
        log.info("\tFound {0} rows, columns.".format(str(metadata_df.shape)))

        project_id = config['project_id']
        bucket_name = config['buckets']['open']
        gcs = GcsConnector(project_id, bucket_name)
        log.info("\tupload data to GCS.")
        gcs.convert_df_to_njson_and_upload(
            metadata_df,
            config['data']['output_dir'] + config['data']['bq_table'] + '_%s.json' % (study))

    log.info('finished extract and transform')
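# --- Illustrative only (values are placeholders): the blob name written per
# --- study by the upload above, which is the same prefix that load(config)
# --- later globs with '*' when loading into BigQuery.
example_config = {'data': {'output_dir': 'tcga/intermediary/data/',
                           'bq_table': 'metadata_data'}}
study = 'BRCA'   # one example entry of config['all_tumor_types']
blob_name = (example_config['data']['output_dir']
             + example_config['data']['bq_table'] + '_%s.json' % study)
# blob_name == 'tcga/intermediary/data/metadata_data_BRCA.json'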
def main(config):
    log_filename = 'etl_download_isoform.log'
    log_name = 'etl_download_isoform'
    log = configure_logging(log_name, log_filename)
    log.info('begin downloading isoform files')

    # etl = util.DataETL("isb-cgc", "isb-cgc-open")  # this starts a new connection
    project_id = config['project_id']
    bucket_name = config['buckets']['open']

    # connect to bucket to get files
    gcs = GcsConnector(project_id, bucket_name)
    isoform_file = re.compile("^.*.isoform.quantification.txt.json$")
    data_library = gcs.search_files(
        search_patterns=['.isoform.quantification.txt'],
        regex_search_pattern=isoform_file,
        prefixes=[config['mirna_isoform_matrix']['isoform_gcs_dir']])

    # eliminate bad files - size 0; could be hg18 etc
    data_library.loc[:, 'basefilename'] = data_library['filename'].map(
        lambda x: os.path.splitext(os.path.basename(x))[0].replace('.json', ''))
    data_library = data_library.query('size > 0')

    log.info('\tbegin selecting isoform files from sqlite isoform db')
    conn = sqlite3.connect(config['mirna_isoform_matrix']['isoform_file_db'])
    sql = 'SELECT * from {0}'.format('task_queue')
    all_files_df = pd.read_sql_query(sql, conn)
    conn.close()
    log.info('\tfinished selecting isoform files')

    log.info('\tbegin reading from downloaded files')
    with open(config['mirna_isoform_matrix']['isoform_download_prev_files']) as f:
        lines = f.read().splitlines()
    log.info('\tfinished reading from downloaded files')

    log.info(
        'filter files.\n\tfiles in cloud storage: %s\n\tfiles previously marked to download: %s\n%s\n'
        % (len(data_library), len(all_files_df), data_library))
    all_files_df = all_files_df[(all_files_df.DatafileName.isin(data_library.basefilename))]
    all_files_df = all_files_df[~(all_files_df.DatafileName.isin(lines))]
    data_library = all_files_df
    log.info('finished filter files: %s\n%s\n' % (len(data_library), data_library))

    conn = sqlite3.connect(config['mirna_isoform_matrix']['isoform_file_db'])
    submit_to_queue(data_library, conn, 'task_queue', log)
    queue_df = data_library

    # restart ETL; this gets the diff; also takes care of errors
    try:
        conn = sqlite3.connect('isoform_download.db')
        sql = 'SELECT * from task_queue_status where errors="None"'
        queue_df2 = pd.read_sql_query(sql, conn)
        log.info('\tso far completed: %s' % (len(queue_df2)))
        queue_df = queue_df[~(queue_df.DatafileNameKey.isin(queue_df2.DatafileNameKey))]
        log.info('\tso far not completed: %s' % (len(queue_df)))
    except Exception:
        log.exception(
            '\n++++++++++++++++++++++\n\tproblem filtering completed jobs, ignoring\n++++++++++++++++++++++\n')

    # -----------------------------------------------------
    # thread this with concurrent futures
    # -----------------------------------------------------
    log.info('\tsubmit jobs to process manager')
    pm = process_manager.ProcessManager(max_workers=200,
                                        db='isoform_download.db',
                                        table='task_queue_status',
                                        log=log)
    for count, df in data_library.iterrows():
        row = df.to_dict()
        if 0 == count % 512:
            time.sleep(10)
        if 0 == count % 2048:
            log.info('\t\tsubmitting %s file: %s' % (count, row['DatafileName']))
        if not os.path.isdir(config['mirna_isoform_matrix']['isoform_download_dir'] + row['Platform']):
            os.makedirs(config['mirna_isoform_matrix']['isoform_download_dir'] + row['Platform'])
        outfilename = config['mirna_isoform_matrix']['isoform_download_dir'] \
            + row['Platform'] + "/" + row['DatafileName']
        pm.submit(download_file, project_id, bucket_name,
                  row['DatafileNameKey'], outfilename, '')
        time.sleep(0.2)

    log.info('\tsubmitted %s total jobs to process manager' % (count))
    log.info('\tstart process manager completion check')
    pm.start()
    log.info('finished downloading isoform files')