def main(datatype, config_file, max_workers, dry_run, create_new, debug):
    """ Pipeline """
    # config params
    config = json.load(open(config_file))
    project_id = config['project_id']
    bucket_name = config['buckets']['open']

    table_task_queue = 'task_queue'
    table_task_queue_status = 'task_queue_status'
    db_filename = 'etl-{0}.db'.format(datatype)
    log_filename = 'etl_{0}.log'.format(datatype)
    log_name = 'etl_{0}'.format(datatype)
    log = configure_logging(log_name, log_filename)

    log.info('start pipeline for %s' % (datatype))

    # check if the database file already exists and issue a warning
    if os.path.exists(db_filename):
        log.warning('Using the already available database file - {0}'.format(db_filename))
        time.sleep(2)

    # connect to the database
    conn = sqlite3.connect(db_filename, check_same_thread=False)

    #------------------------------------------------------
    # Submit to task queue
    #------------------------------------------------------
    print "=" * 30 + "\nQuerying Google Cloud SQL metadata_data table"
    queue_df = extract_functions[datatype](config)
    submit_to_queue(queue_df, conn, table_task_queue)

    #------------------------------------------------------
    # Tests
    #------------------------------------------------------
    tests.assert_notnull_property(queue_df, columns_list=[
        'SampleTypeCode', 'SampleTypeLetterCode', 'Study', 'Platform',
        'SampleBarcode', 'OutDatafileNameKey', 'ParticipantBarcode',
        'DatafileNameKey', 'AliquotBarcode'])

    if create_new:
        # delete the old queue (task_queue_status) and re-run
        conn.execute('DROP TABLE IF EXISTS {0}'.format(table_task_queue_status))

    # validate and get the diff; restart the ETL; also takes care of errors
    queue_df = validate_and_get_diff(conn, queue_df, table_task_queue_status)

    if debug:
        # debug mode runs only the first 30 rows
        log.debug('Running in debug mode (first 30 records)')
        queue_df = queue_df.head(30)

    if dry_run:
        log.info('finished dry run for %s' % (datatype))
        sys.exit()

    #------------------------------------------------------
    # Execution
    #------------------------------------------------------
    pmr = process_manager.ProcessManager(max_workers=max_workers, db=db_filename,
                                         table=table_task_queue_status, log=log)
    for index, row in queue_df.iterrows():
        metadata = row.to_dict()
        inputfilename = metadata['DatafileNameKey']
        outputfilename = metadata['OutDatafileNameKey']

        # transform
        # transform_functions[datatype](project_id, bucket_name,
        #                               inputfilename, outputfilename, metadata)
        future = pmr.submit(transform_functions[datatype], project_id, bucket_name,
                            inputfilename, outputfilename, metadata)

        time.sleep(0.1 + 0.5 * random.random())
        if index % 100 == 0:
            time.sleep(5)

    pmr.start()
    log.info('finished pipeline for %s' % (datatype))
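

# ---------------------------------------------------------------------------
# Hypothetical sketch (not part of the original pipeline code): one possible
# shape for the submit_to_queue / validate_and_get_diff helpers used above,
# inferred only from their call sites. The real implementations live
# elsewhere in the codebase and may differ.
# ---------------------------------------------------------------------------
import pandas as pd


def submit_to_queue_sketch(queue_df, conn, table_name, log=None):
    """Write the extracted records into the sqlite task-queue table."""
    queue_df.to_sql(table_name, conn, if_exists='replace', index=False)
    if log:
        log.info('submitted %s records to %s' % (len(queue_df), table_name))


def validate_and_get_diff_sketch(conn, queue_df, status_table):
    """Return only the rows that have not already completed without errors."""
    try:
        done_df = pd.read_sql_query(
            'SELECT * from {0} where errors="None"'.format(status_table), conn)
        queue_df = queue_df[~queue_df.DatafileNameKey.isin(done_df.DatafileNameKey)]
    except Exception:
        pass  # the status table may not exist yet on a fresh run
    return queue_df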
    # Throughout the script we use lowercase heading names
    oncotator_columns = [line.rstrip('\n').lower()
                         for line in config['maf']['oncotator_input_columns']]
    oncotator_input_files_dest = config['maf']['oncotator_input_files_dest']

    #-----------------------------------
    # Extract
    #-----------------------------------
    data_library = extract.search_files(config)

    # log all files found
    writer = ExcelWriter('maf_part1.log.xlsx')
    data_library.to_excel(writer, "maf_files")
    writer.save()

    #------------------------------------------------------
    # Execution
    #------------------------------------------------------
    pm = process_manager.ProcessManager(max_workers=20)
    for index, row in data_library.iterrows():
        inputfilename = row['filename']
        outputfilename = oncotator_input_files_dest + row['unique_filename'].replace(".maf", ".txt")

        # transform
        future = pm.submit(transform.generate_oncotator_inputfiles, project_id,
                           bucket_name, inputfilename, outputfilename,
                           oncotator_columns)
        time.sleep(0.2)

    pm.start()
    return True


if __name__ == '__main__':
    config = json.load(open(sys.argv[1]))
    project_id = config['project_id']
    bucket_name = config['buckets']['open']
    sample_code2letter = config['sample_code2letter']

    # get disease codes/studies (TODO: this must be changed to get the
    # disease code from the file name)
    df = convert_file_to_dataframe(open(sys.argv[2]))
    df = cleanup_dataframe(df)
    studies = list(set(df['Study'].tolist()))

    # get bq columns; this allows the user to select the columns
    # without worrying about the index, case-sensitivity etc.
    selected_columns = pd.read_table(sys.argv[3], names=['bq_columns'])
    transposed = selected_columns.T
    transposed.columns = transposed.loc['bq_columns']
    transposed = cleanup_dataframe(transposed)
    bq_columns = transposed.columns.values

    # submit threads by disease code
    pm = process_manager.ProcessManager(max_workers=33, db='maf.db',
                                        table='task_queue_status')
    for idx, df_group in df.groupby(['Study']):
        future = pm.submit(process_oncotator_output, project_id, bucket_name,
                           df_group, bq_columns, sample_code2letter)
        # process_oncotator_output(project_id, bucket_name, df_group,
        #                          bq_columns, sample_code2letter)
        time.sleep(0.2)

    pm.start()
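

# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original script): the file passed
# as sys.argv[3] is read with names=['bq_columns'], i.e. it is expected to
# hold one BigQuery column name per line. The column names below are
# illustrative only, and the normalization applied by cleanup_dataframe is
# left out here.
# ---------------------------------------------------------------------------
import io

import pandas as pd

example = u"Hugo_Symbol\nChromosome\nStart_Position\n"
selected = pd.read_table(io.StringIO(example), names=['bq_columns'])
bq_columns_example = [c.strip() for c in selected['bq_columns'].tolist()]
print(bq_columns_example)  # ['Hugo_Symbol', 'Chromosome', 'Start_Position']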
def main(config):
    log_filename = 'etl_download_isoform.log'
    log_name = 'etl_download_isoform'
    log = configure_logging(log_name, log_filename)
    log.info('begin downloading isoform files')

    # etl = util.DataETL("isb-cgc", "isb-cgc-open")  # this starts a new connection
    project_id = config['project_id']
    bucket_name = config['buckets']['open']

    # connect to the bucket to get the files
    gcs = GcsConnector(project_id, bucket_name)
    isoform_file = re.compile("^.*.isoform.quantification.txt.json$")
    data_library = gcs.search_files(
        search_patterns=['.isoform.quantification.txt'],
        regex_search_pattern=isoform_file,
        prefixes=[config['mirna_isoform_matrix']['isoform_gcs_dir']])

    # we are eliminating bad files - size 0; could be hg18 etc
    data_library.loc[:, 'basefilename'] = data_library['filename'].map(
        lambda x: os.path.splitext(os.path.basename(x))[0].replace('.json', ''))
    data_library = data_library.query('size > 0')

    log.info('\tbegin selecting isoform files from sqlite isoform db')
    conn = sqlite3.connect(config['mirna_isoform_matrix']['isoform_file_db'])
    sql = 'SELECT * from {0}'.format('task_queue')
    all_files_df = pd.read_sql_query(sql, conn)
    conn.close()
    log.info('\tfinished selecting isoform files')

    log.info('\tbegin reading from previously downloaded files')
    with open(config['mirna_isoform_matrix']['isoform_download_prev_files']) as f:
        lines = f.read().splitlines()
    log.info('\tfinished reading from previously downloaded files')

    log.info('filter files.\n\tfiles in cloud storage: %s\n\tfiles previously marked to download: %s\n%s\n'
             % (len(data_library), len(all_files_df), data_library))
    all_files_df = all_files_df[(all_files_df.DatafileName.isin(data_library.basefilename))]
    all_files_df = all_files_df[~(all_files_df.DatafileName.isin(lines))]
    data_library = all_files_df
    log.info('finished filter files: %s\n%s\n' % (len(data_library), data_library))

    conn = sqlite3.connect(config['mirna_isoform_matrix']['isoform_file_db'])
    submit_to_queue(data_library, conn, 'task_queue', log)
    queue_df = data_library

    # restart ETL; this gets the diff; also takes care of errors
    try:
        conn = sqlite3.connect('isoform_download.db')
        sql = 'SELECT * from task_queue_status where errors="None"'
        queue_df2 = pd.read_sql_query(sql, conn)
        log.info('\tso far completed: %s' % (len(queue_df2)))
        queue_df = queue_df[~(queue_df.DatafileNameKey.isin(queue_df2.DatafileNameKey))]
        log.info('\tso far not completed: %s' % (len(queue_df)))
    except Exception:
        log.exception('\n++++++++++++++++++++++\n\tproblem filtering completed jobs, ignoring'
                      '\n++++++++++++++++++++++\n')

    #------------------------------------------------------
    # thread this with concurrent futures
    #------------------------------------------------------
    log.info('\tsubmit jobs to process manager')
    pm = process_manager.ProcessManager(max_workers=200, db='isoform_download.db',
                                        table='task_queue_status', log=log)
    for count, df in data_library.iterrows():
        row = df.to_dict()
        if 0 == count % 512:
            time.sleep(10)
        if 0 == count % 2048:
            log.info('\t\tsubmitting %s file: %s' % (count, row['DatafileName']))
        if not os.path.isdir(config['mirna_isoform_matrix']['isoform_download_dir'] + row['Platform']):
            os.makedirs(config['mirna_isoform_matrix']['isoform_download_dir'] + row['Platform'])
        outfilename = (config['mirna_isoform_matrix']['isoform_download_dir']
                       + row['Platform'] + "/" + row['DatafileName'])
        pm.submit(download_file, project_id, bucket_name,
                  row['DatafileNameKey'], outfilename, '')
        time.sleep(0.2)
    log.info('\tsubmitted %s total jobs to process manager' % (count))

    log.info('\tstart process manager completion check')
    pm.start()
    log.info('finished downloading isoform files')
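

# ---------------------------------------------------------------------------
# Hypothetical sketch (not the project's download_file): a stand-in showing
# the shape of the worker submitted above, written against the public
# google-cloud-storage client instead of the project's GcsConnector. The
# empty trailing argument seen at the call sites is accepted as an opaque
# placeholder here.
# ---------------------------------------------------------------------------
from google.cloud import storage


def download_file_sketch(project_id, bucket_name, blob_key, outfilename, _extra=''):
    """Download gs://<bucket_name>/<blob_key> to a local file."""
    client = storage.Client(project=project_id)
    bucket = client.bucket(bucket_name)
    bucket.blob(blob_key).download_to_filename(outfilename)
    return outfilename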
    # restart ETL; this gets the diff; also takes care of errors
    try:
        sql = 'SELECT * from task_queue_status where errors="None"'
        queue_df2 = pd.read_sql_query(sql, conn)
        print 'completed: ', len(queue_df2)
        queue_df = queue_df[~(queue_df.DatafileNameKey.isin(queue_df2.DatafileNameKey))]
        print 'Not completed: ', len(queue_df)
    except Exception:
        # ignore problems reading the status table (e.g. on a fresh run)
        # and process the full queue
        pass

    #------------------------------------------------------
    # thread this with concurrent futures
    #------------------------------------------------------
    pm = process_manager.ProcessManager(max_workers=200, db='isoform_download',
                                        table='task_queue_status')
    for i, df in data_library.iterrows():
        row = df.to_dict()
        print row['DatafileName']
        outfilename = ("/mnt/datadisk-3/isoform_files/" + row['Platform']
                       + "/" + row['DatafileName'])
        future = pm.submit(download_file, project_id, bucket_name,
                           row['DatafileNameKey'], outfilename, '')
        time.sleep(0.2)

    pm.start()


if __name__ == "__main__":
    config = json.load(open(sys.argv[1]))
    #-----------------------------------
    # Extract
    #-----------------------------------
    data_library = extract.search_files(config)

    # log all files found
    writer = ExcelWriter('maf_part1.log.xlsx')
    data_library.to_excel(writer, "maf_files")
    writer.save()

    #------------------------------------------------------
    # Execution
    #------------------------------------------------------
    log_filename = 'etl_maf_part1.log'
    log_name = 'etl_maf_part1'
    log = configure_logging(log_name, log_filename)
    log.info('start maf part1 pipeline')

    pm = process_manager.ProcessManager(max_workers=20, db='maf1.db',
                                        table='task_queue_status', log=log)
    for index, row in data_library.iterrows():
        inputfilename = row['filename']
        outputfilename = oncotator_input_files_dest + row['unique_filename'].replace(".maf", ".txt")

        # transform
        future = pm.submit(transform.generate_oncotator_inputfiles, project_id,
                           bucket_name, inputfilename, outputfilename,
                           oncotator_columns)
        time.sleep(0.2)

    pm.start()
    log.info('finished maf part1 pipeline')
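

# ---------------------------------------------------------------------------
# Hypothetical sketch (not the project's process_manager.ProcessManager):
# a minimal stand-in built on concurrent.futures, assuming only the usage
# visible above - submit() schedules a callable and returns its future,
# start() blocks until all submitted work has finished. The sqlite
# task_queue_status bookkeeping of the real class is omitted.
# ---------------------------------------------------------------------------
from concurrent.futures import ThreadPoolExecutor, as_completed


class MiniProcessManager(object):
    def __init__(self, max_workers=10):
        self._pool = ThreadPoolExecutor(max_workers=max_workers)
        self._futures = []

    def submit(self, func, *args):
        # schedule the callable immediately and remember its future
        future = self._pool.submit(func, *args)
        self._futures.append(future)
        return future

    def start(self):
        try:
            # wait for every submitted job; .result() surfaces worker exceptions
            for future in as_completed(self._futures):
                future.result()
        finally:
            self._pool.shutdown()


# Example usage mirroring the pipelines above:
#   pm = MiniProcessManager(max_workers=20)
#   pm.submit(some_worker, arg1, arg2)
#   pm.start()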