Example #1
def parse_protein(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('protein', "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs,
                                                project_id,
                                                bucket_name,
                                                filename,
                                                skiprows=1)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # validation
    tests.assert_notnull_property(data_df, columns_list=['Protein_Name'])

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df,
                                                outfilename,
                                                metadata=metadata)
    return status
Example #2
def parse_protein(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging("protein", "logs/" + metadata["AliquotBarcode"] + ".log")

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=1)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # validation
    tests.assert_notnull_property(data_df, columns_list=["Protein_Name"])

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename, metadata=metadata)
    return status
Example #3
def parse_protein(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    log = configure_logging(
        'protein',
        "logs/protein_transform_" + metadata['AliquotBarcode'] + '.log')
    try:
        log.info('start transform of %s' % (metadata['AliquotBarcode']))
        # connect to the cloud bucket
        gcs = GcsConnector(project_id, bucket_name)

        # main steps: download, convert to df, cleanup, transform, add metadata
        data_df = gcutils.convert_blob_to_dataframe(gcs,
                                                    project_id,
                                                    bucket_name,
                                                    filename,
                                                    skiprows=1,
                                                    log=log)
        log.info('\tadd changes and metadata for %s' %
                 (metadata['AliquotBarcode']))
        data_df = additional_changes(data_df)
        data_df = add_metadata(data_df, metadata)

        # validation
        tests.assert_notnull_property(data_df, columns_list=['Protein_Name'])

        # upload the contents of the dataframe in njson format
        status = gcs.convert_df_to_njson_and_upload(data_df,
                                                    outfilename,
                                                    metadata=metadata)
        log.info('finished transform of %s' % (metadata['AliquotBarcode']))
    except Exception:
        log.exception('problem transforming %s' % (metadata['AliquotBarcode']))
        # re-raise with the original traceback intact
        raise
    return status
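
A minimal usage sketch for parse_protein follows. Every literal value in it is a placeholder; only the function signature and the metadata keys (AliquotBarcode plus a few of the columns validated later in the pipeline examples) come from the code above, so treat it as an illustration rather than real pipeline input.

# Hypothetical call to parse_protein; all values below are placeholders.
metadata = {
    'AliquotBarcode': 'AAAA-0000-XXXX',   # placeholder; used to name the log file
    'SampleBarcode': 'AAAA-0000',         # placeholder
    'Study': 'EXAMPLE_STUDY',             # placeholder
    'Platform': 'EXAMPLE_PLATFORM',       # placeholder
}
status = parse_protein(
    'example-project-id',                        # project_id (placeholder)
    'example-open-bucket',                       # bucket_name (placeholder)
    'protein/raw/example_input.txt',             # filename in the bucket (placeholder)
    'protein/transformed/example_output.json',   # outfilename (placeholder)
    metadata)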
Example #4
def main(datatype, config_file, max_workers, dry_run, create_new, debug):
    """
    Pipeline
    """
    # config params
    with open(config_file) as config_fh:
        config = json.load(config_fh)
    project_id = config['project_id']
    bucket_name = config['buckets']['open']
    table_task_queue = 'task_queue'
    table_task_queue_status = 'task_queue_status' 
    db_filename = 'etl-{0}.db'.format(datatype)
    log_filename = 'etl_{0}.log'.format(datatype)
    log_name = 'etl_{0}'.format(datatype)
    # set up the logger used below (configure_logging as in parse_protein)
    log = configure_logging(log_name, log_filename)

    log.info('start pipeline for %s' % (datatype))
    # check if the table exists and issue warning
    if os.path.exists(db_filename):
        log.warning('Using the already available database file - {0}'.format(db_filename))
        time.sleep(2)
    
    # connect to the database
    conn = sqlite3.connect(db_filename, check_same_thread=False)
   
    #-------------------------------
    # Submit to task queue
    #-------------------------------
    print "="*30 + "\nQuerying Google Cloud SQL metadata_data table"
    queue_df = extract_functions[datatype](config)
    submit_to_queue(queue_df, conn, table_task_queue)

    #--------------
    # Tests
    #--------------
    tests.assert_notnull_property(queue_df, columns_list=[
        'SampleTypeCode', 'SampleTypeLetterCode', 'Study', 'Platform',
        'SampleBarcode', 'OutDatafileNameKey', 'ParticipantBarcode',
        'DatafileNameKey', 'AliquotBarcode'])

    if create_new:
        # delete the old status queue (task_queue_status) and re-run
        conn.execute('DROP TABLE IF EXISTS {0}'.format(table_task_queue_status))

    # validate and get the diff against the status table; lets the ETL restart and handles errored rows
    queue_df = validate_and_get_diff(conn, queue_df, table_task_queue_status)

    if debug:
        # debug mode runs top 30 rows
        log.debug('Running in debug mode (first 30 records)')
        queue_df = queue_df.head(30)

    if dry_run:
        log.info('finished dry run for %s' % (datatype))
        sys.exit()


    #--------------------------------------------
    # Execution
    #--------------------------------------------
    pmr = process_manager.ProcessManager(max_workers=max_workers, db=db_filename, table=table_task_queue_status, log=log)
    for index, row in queue_df.iterrows():
        metadata = row.to_dict()
        inputfilename = metadata['DatafileNameKey']
        outputfilename = metadata['OutDatafileNameKey']
        # transform
        #transform_functions[datatype]( project_id, bucket_name,\
        #                inputfilename, outputfilename, metadata)
        future = pmr.submit(transform_functions[datatype], project_id, bucket_name,
                            inputfilename, outputfilename, metadata)

        time.sleep(0.1 + 0.5 * random.random())
        if index % 100 == 0:
            time.sleep(5)

    pmr.start()
    log.info('finished pipeline for %s' % (datatype))
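
A minimal sketch of the config file that main() expects is shown below. Only project_id and buckets.open are read in these examples; any other keys would belong to the datatype-specific extract/transform functions, and every value here is a placeholder.

import json

# Shape of the config consumed by main(); all values are placeholders.
example_config = {
    'project_id': 'example-project-id',           # read as config['project_id']
    'buckets': {'open': 'example-open-bucket'},   # read as config['buckets']['open']
}
with open('config.json', 'w') as config_fh:
    json.dump(example_config, config_fh, indent=2)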
Example #5
def main(datatype, config_file, max_workers, dry_run, create_new, debug):
    """
    Pipeline
    """
    # config params
    with open(config_file) as config_fh:
        config = json.load(config_fh)
    project_id = config['project_id']
    bucket_name = config['buckets']['open']
    table_task_queue = 'task_queue'
    table_task_queue_status = 'task_queue_status' 
    db_filename = 'etl-{0}.db'.format(datatype)
    log_filename = 'etl_{0}.log'.format(datatype)
    log_name = 'etl_{0}'.format(datatype)
    # set up the logger used below (configure_logging as in parse_protein)
    log = configure_logging(log_name, log_filename)

    # check if the table exists and issue warning
    if os.path.exists(db_filename):
        log.warning('Using the already available database file - {0}'.format(db_filename))
        time.sleep(2)
    
    # connect to the database
    conn = sqlite3.connect(db_filename, check_same_thread=False)
   
    #-------------------------------
    # Submit to task queue
    #-------------------------------
    print "="*30 + "\nQuerying Google Cloud SQL metadata_data table"
    queue_df = extract_functions[datatype](config)
    submit_to_queue(queue_df, conn, table_task_queue)

    #--------------
    # Tests
    #--------------
    tests.assert_notnull_property(queue_df, columns_list=[
        'SampleTypeCode', 'SampleTypeLetterCode', 'Study', 'Platform',
        'SampleBarcode', 'OutDatafileNameKey', 'ParticipantBarcode',
        'DatafileNameKey', 'AliquotBarcode'])

    if create_new:
        # delete the old status queue (task_queue_status) and re-run
        conn.execute('DROP TABLE IF EXISTS {0}'.format(table_task_queue_status))

    # validate and get the diff against the status table; lets the ETL restart and handles errored rows
    queue_df = validate_and_get_diff(conn, queue_df, table_task_queue_status)

    if debug:
        # debug mode runs top 30 rows
        log.debug('Running in debug mode (first 30 records)')
        queue_df = queue_df.head(30)

    if dry_run:
        sys.exit()


    #--------------------------------------------
    # Execution
    #--------------------------------------------
    pmr = process_manager.ProcessManager(max_workers=max_workers, db=db_filename, table=table_task_queue_status)
    for index, row in queue_df.iterrows():
        metadata = row.to_dict()
        inputfilename = metadata['DatafileNameKey']
        outputfilename = metadata['OutDatafileNameKey']
        # transform
        #transform_functions[datatype]( project_id, bucket_name,\
        #                inputfilename, outputfilename, metadata)
        future = pmr.submit(transform_functions[datatype], project_id, bucket_name,
                            inputfilename, outputfilename, metadata)

        time.sleep(0.1 + 0.5 * random.random())
        if index % 100 == 0:
            time.sleep(5)

    pmr.start()
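
To round out the pipeline examples, here is a minimal command-line wrapper for main(), assuming argparse. The flag names, the default worker count, and the 'protein' example value are assumptions for illustration, not part of the pipeline code itself; only the main() signature comes from the examples above.

import argparse

if __name__ == '__main__':
    # Hypothetical CLI wrapper; adjust flag names and defaults to taste.
    parser = argparse.ArgumentParser(description='Run the ETL pipeline for one datatype')
    parser.add_argument('datatype', help="key into extract_functions/transform_functions, e.g. 'protein'")
    parser.add_argument('config_file', help='path to the JSON config file')
    parser.add_argument('--max-workers', type=int, default=10, help='parallel worker processes (assumed default)')
    parser.add_argument('--dry-run', action='store_true', help='stop after building the task queue')
    parser.add_argument('--create-new', action='store_true', help='drop and rebuild the status table')
    parser.add_argument('--debug', action='store_true', help='process only the first 30 records')
    args = parser.parse_args()

    main(args.datatype, args.config_file, args.max_workers,
         args.dry_run, args.create_new, args.debug)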