def parse_protein(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    log = configure_logging(
        'protein', "logs/protein_transform_" + metadata['AliquotBarcode'] + '.log')

    try:
        log.info('start transform of %s' % (metadata['AliquotBarcode']))

        # connect to the cloud bucket
        gcs = GcsConnector(project_id, bucket_name)

        # main steps: download, convert to df, cleanup, transform, add metadata
        data_df = gcutils.convert_blob_to_dataframe(
            gcs, project_id, bucket_name, filename, skiprows=1, log=log)
        log.info('\tadd changes and metadata for %s' % (metadata['AliquotBarcode']))
        data_df = additional_changes(data_df)
        data_df = add_metadata(data_df, metadata)

        # validation
        tests.assert_notnull_property(data_df, columns_list=['Protein_Name'])

        # upload the contents of the dataframe in njson format
        status = gcs.convert_df_to_njson_and_upload(data_df, outfilename, metadata=metadata)
        log.info('finished transform of %s' % (metadata['AliquotBarcode']))
    except Exception as e:
        log.exception('problem transforming %s' % (metadata['AliquotBarcode']))
        raise e

    return status
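
# --- Illustrative usage (assumption, not part of the original module) -------
# A minimal sketch of how parse_protein might be called on its own, assuming a
# metadata dict that carries the same keys the pipeline later validates in
# main(). The project id, bucket name, file paths, and barcode/platform values
# below are placeholders for illustration only.
def _example_parse_protein_call():
    example_metadata = {
        'AliquotBarcode': 'TCGA-XX-XXXX-01A-11D-XXXX-XX',         # placeholder
        'SampleBarcode': 'TCGA-XX-XXXX-01A',                       # placeholder
        'ParticipantBarcode': 'TCGA-XX-XXXX',                      # placeholder
        'Study': 'BRCA',                                           # placeholder
        'Platform': 'MDA_RPPA_Core',                               # placeholder
        'SampleTypeCode': '01',
        'SampleTypeLetterCode': 'TP',
        'DatafileNameKey': 'path/to/raw/protein_file.txt',         # placeholder
        'OutDatafileNameKey': 'path/to/transformed/protein.json',  # placeholder
    }
    # returns the upload status reported by convert_df_to_njson_and_upload
    return parse_protein('my-project-id', 'my-open-bucket',
                         example_metadata['DatafileNameKey'],
                         example_metadata['OutDatafileNameKey'],
                         example_metadata)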
def main(datatype, config_file, max_workers, dry_run, create_new, debug):
    """
    Pipeline
    """
    # config params
    config = json.load(open(config_file))
    project_id = config['project_id']
    bucket_name = config['buckets']['open']
    table_task_queue = 'task_queue'
    table_task_queue_status = 'task_queue_status'
    db_filename = 'etl-{0}.db'.format(datatype)
    log_filename = 'etl_{0}.log'.format(datatype)
    log_name = 'etl_{0}'.format(datatype)

    log.info('start pipeline for %s' % (datatype))

    # check if the database file already exists and issue a warning
    if os.path.exists(db_filename):
        log.warning('Using the already available database file - {0}'.format(db_filename))
        time.sleep(2)

    # connect to the database
    conn = sqlite3.connect(db_filename, check_same_thread=False)

    #-------------------------------
    # Submit to task queue
    #-------------------------------
    print "=" * 30 + "\nQuerying Google Cloud SQL metadata_data table"
    queue_df = extract_functions[datatype](config)
    submit_to_queue(queue_df, conn, table_task_queue)

    #-------------------------------
    # Tests
    #-------------------------------
    tests.assert_notnull_property(queue_df, columns_list=[
        'SampleTypeCode', 'SampleTypeLetterCode', 'Study', 'Platform',
        'SampleBarcode', 'OutDatafileNameKey', 'ParticipantBarcode',
        'DatafileNameKey', 'AliquotBarcode'])

    if create_new:
        # delete the old queue (task_queue_status) and re-run
        conn.execute('DROP TABLE IF EXISTS {0}'.format(table_task_queue_status))

    # validate and get diff; restart ETL; also takes care of errors
    queue_df = validate_and_get_diff(conn, queue_df, table_task_queue_status)

    if debug:
        # debug mode runs only the first 30 rows
        log.debug('Running in debug mode (first 30 records)')
        queue_df = queue_df.head(30)

    if dry_run:
        log.info('finished dry run for %s' % (datatype))
        sys.exit()

    #-------------------------------
    # Execution
    #-------------------------------
    pmr = process_manager.ProcessManager(max_workers=max_workers, db=db_filename,
                                         table=table_task_queue_status, log=log)
    for index, row in queue_df.iterrows():
        metadata = row.to_dict()
        inputfilename = metadata['DatafileNameKey']
        outputfilename = metadata['OutDatafileNameKey']

        # transform (the commented call below is the synchronous equivalent)
        # transform_functions[datatype](project_id, bucket_name,
        #                               inputfilename, outputfilename, metadata)
        future = pmr.submit(transform_functions[datatype], project_id, bucket_name,
                            inputfilename, outputfilename, metadata)
        time.sleep(0.1 + 0.5 * random.random())
        if index % 100 == 0:
            time.sleep(5)

    pmr.start()
    log.info('finished pipeline for %s' % (datatype))
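
# --- Illustrative wiring (assumption, not part of the original module) ------
# main() expects the config_file to be JSON with at least:
#   {"project_id": "...", "buckets": {"open": "..."}}
# (keys taken from main(); values are deployment-specific.)
# It also looks up extract_functions and transform_functions, module-level
# dispatch dicts keyed by datatype whose real contents are not shown in this
# section. The sketch below shows one plausible wiring for 'protein' plus a
# simple CLI entry point; extract_protein_metadata and the flag names are
# assumptions for illustration only.
def extract_protein_metadata(config):
    """Stub: query the Cloud SQL metadata_data table and return a pandas
    DataFrame with one row per file to transform (real query not shown)."""
    raise NotImplementedError

extract_functions = {
    'protein': extract_protein_metadata,
}
transform_functions = {
    'protein': parse_protein,   # defined earlier in this module
}

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='run the ETL pipeline for a datatype')
    parser.add_argument('datatype', choices=sorted(transform_functions))
    parser.add_argument('config_file')
    parser.add_argument('--max_workers', type=int, default=10)
    parser.add_argument('--dry_run', action='store_true')
    parser.add_argument('--create_new', action='store_true')
    parser.add_argument('--debug', action='store_true')
    args = parser.parse_args()
    main(args.datatype, args.config_file, args.max_workers,
         args.dry_run, args.create_new, args.debug)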