Example #1
def parse_methylation(project_id, bucket_name, filename, outfilename,
                      metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('mirna.isoform',
                      "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs,
                                                project_id,
                                                bucket_name,
                                                filename,
                                                skiprows=1)
    data_df.columns = [
        'Probe_Id', "Beta_Value", "Gene_Symbol", "Chromosome",
        "Genomic_Coordinate"
    ]

    data_df = add_metadata(data_df, metadata)
    data_df = additional_changes(data_df)

    # upload the contents of the dataframe as CSV
    df_string = data_df.to_csv(index=False, header=False, float_format='%.2f')
    status = gcs.upload_blob_from_string(outfilename, df_string)

    return status
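
A minimal, hypothetical call of the function above; every value is invented,
and only the 'AliquotBarcode' key of metadata is referenced directly in this
snippet (add_metadata presumably consumes further fields).

# Illustrative only: project, bucket, paths and barcode are placeholders.
status = parse_methylation(
    project_id='my-gcp-project',
    bucket_name='my-open-bucket',
    filename='dna_methylation/HumanMethylation450/sample_file.txt',
    outfilename='methylation/HumanMethylation450/sample_file.txt.csv',
    metadata={'AliquotBarcode': 'TCGA-00-0000-01A-00D-0000-05'})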
Example #2
def parse_protein(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('protein', "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs,
                                                project_id,
                                                bucket_name,
                                                filename,
                                                skiprows=1)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # validation
    tests.assert_notnull_property(data_df, columns_list=['Protein_Name'])

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df,
                                                outfilename,
                                                metadata=metadata)
    return status
Example #3
def load(config):
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
          source_format, write_disposition, poll_interval, num_retries
    """
    log = configure_logging('data_load', 'logs/data_load.log')
    log.info('begin load of data into bigquery')

    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    log.info("\tLoading Data data into BigQuery...")
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['data']['bq_table'],
        schemas_dir + config['data']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +\
            config['data']['output_dir'] + '*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )

    log.info('finished load of data into bigquery')
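
For orientation, a sketch of the config fields this load step looks up; the
key names come from the lookups above, all values are invented.

# Placeholder config matching the config[...] accesses in load().
config = {
    'project_id': 'my-gcp-project',
    'bq_dataset': 'my_dataset',
    'buckets': {'open': 'my-open-bucket'},
    'data': {
        'bq_table': 'my_table',
        'schema_file': 'my_table_schema.json',   # resolved under SCHEMA_DIR
        'output_dir': 'data/output/',
    },
}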
Example #4
def parse_isoform(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    log = configure_logging(
        'mirna.isoform.transform',
        "logs/mirna_isoform_transform_" + metadata['AliquotBarcode'] + '.log')
    try:
        log.info('start transform of %s' % (metadata['AliquotBarcode']))
        # connect to the cloud bucket
        gcs = GcsConnector(project_id, bucket_name)
    
        #main steps: download, convert to df, cleanup, transform, add metadata
        log.info('\tadd changes and metadata for %s' % (metadata['AliquotBarcode']))
        data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, log=log)
        data_df = additional_changes(data_df)
        data_df = add_metadata(data_df, metadata)
    
        # upload the contents of the dataframe in njson format
        status = gcs.convert_df_to_njson_and_upload(data_df, outfilename)
        log.info('finished transform of %s' % (metadata['AliquotBarcode']))
    except Exception as e:
        log.exception('problem transforming %s' % (metadata['AliquotBarcode']))
        raise e
    return status
Example #5
def identify_data(config):
    """Gets the metadata info from database
    """
    # setup logging
    log = configure_logging('methylation_extract',
                            "logs/" + 'methylation_extract.log')
    log.info('start methylation extract')

    # cloudSql connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']

    sqlquery = """
        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, Pipeline, Platform,
              SampleType, SampleTypeCode, Study, DatafileName, DatafileNameKey, Datatype,
              DatafileUploaded, IncludeForAnalysis
        FROM metadata_data
        WHERE DataType='DNA Methylation'
        AND DatafileUploaded='true'
        AND IncludeForAnalysis='yes'
        """

    # connect to db and get results in a dataframe
    log.info("\tselect file names from db")
    metadata_df = read_mysql_query(host, database, user, passwd, sqlquery)

    # rename platforms in rows
    for i, row in metadata_df.iterrows():
        metadata = row.to_dict()
        platform = row['Platform']
        metadata_df.loc[i, 'OutDatafileNameKey'] = config['methylation']['output_dir']\
                            + platform + '/' + metadata['DatafileName'] + '.csv'

    metadata_df.loc[:, 'SampleTypeLetterCode'] = metadata_df['SampleTypeCode']\
                                               .map(lambda code: config['sample_code2letter'][code])

    metadata_df.loc[:, 'DatafileNameKey'] = metadata_df['DatafileNameKey']\
                                               .map(lambda inputfile: re.sub(r"^/", "", inputfile))

    # tag CELLC samples
    metadata_df['is_tumor'] = (metadata_df['SampleTypeLetterCode'] != "CELLC")
    metadata_df[
        'transform_function'] = 'methylation.transform.parse_methylation'

    log.info("\tFound {0} rows, columns.".format(str(metadata_df.shape)))

    # Filter - check all "is_" fields - remember all 'is_' fields must be boolean
    all_flag_columns = [
        key for key in metadata_df.columns.values if key.startswith("is_")
    ]
    flag_df = metadata_df[all_flag_columns]
    metadata_df = metadata_df[flag_df.all(axis=1)]

    log.info("\tAfter filtering: Found {0} rows, columns.".format(
        str(metadata_df.shape)))

    log.info('finished methylation extract')
    return metadata_df
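
The trailing filter keeps only rows whose boolean "is_" flag columns are all
True; a self-contained toy illustration of that pattern (pandas assumed):

import pandas as pd

# Toy version of the flag filter above: a row survives only if every
# column whose name starts with "is_" is True.
df = pd.DataFrame({'AliquotBarcode': ['A', 'B'],
                   'is_tumor': [True, False]})
flag_columns = [c for c in df.columns.values if c.startswith('is_')]
df = df[df[flag_columns].all(axis=1)]   # only row 'A' remains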
Example #6
def identify_data(config):
    """Gets the metadata info from database
    """
    log = configure_logging('cnv', 'logs/cnv_extract.log')
    log.info('start cnv extract')
    # cloudSql connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']

    log.info("\tselect file names from db")
    sqlquery = """
        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, Pipeline, Platform,
              SampleType, SampleTypeCode, Study, DatafileName, DatafileNameKey, Datatype,
              DatafileUploaded, IncludeForAnalysis, DataCenterName
        FROM metadata_data
        WHERE DataType='Copy Number Results-SNP'
        AND DatafileUploaded='true'
        AND DatafileNameKey LIKE '%.nocnv_hg19.seg.txt'
        AND IncludeForAnalysis='yes'
        """

    # connect to db and get results in a dataframe
    metadata_df = read_mysql_query(host, database, user, passwd, sqlquery)

    # rename platforms in rows.  because number of rows exceed import limitation for GCS to BigQuery, rotate into multiple
    # virtual directories
    dir_prefix = config['cnv']['output_dir_prefix']
    dir_suffixes = config['cnv']['output_dir_suffixes']
    num_suffix = len(dir_suffixes)
    for i, row in metadata_df.iterrows():
        metadata = row.to_dict()
        metadata_df.loc[i, 'OutDatafileNameKey'] = dir_prefix + dir_suffixes[i % num_suffix] \
                             + metadata['DatafileName'] + '.json'

    metadata_df.loc[:, 'SampleTypeLetterCode'] = metadata_df['SampleTypeCode']\
                                               .map(lambda code: config['sample_code2letter'][code])

    metadata_df.loc[:, 'DatafileNameKey'] = metadata_df['DatafileNameKey']\
                                               .map(lambda inputfile: re.sub(r"^/", "", inputfile))

    # tag CELLC samples
    metadata_df['is_tumor'] = (metadata_df['SampleTypeLetterCode'] != "CELLC")
    metadata_df['transform_function'] = 'cnv.transform.parse_cnv'

    log.info("\tFound {0} rows, columns.".format(str(metadata_df.shape)))

    # Filter - check all "is_" fields - remember all 'is_' fields must be boolean
    all_flag_columns = [
        key for key in metadata_df.columns.values if key.startswith("is_")
    ]
    flag_df = metadata_df[all_flag_columns]
    metadata_df = metadata_df[flag_df.all(axis=1)]

    log.info("\tAfter filtering: Found {0} rows, columns.".format(
        str(metadata_df.shape)))

    log.info('finished cnv extract')
    return metadata_df
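
The output keys are rotated across several virtual directories so that no
single GCS prefix grows past the BigQuery import limits; a small sketch of
that round-robin with invented values:

# Round-robin assignment of output directories, as in the loop above.
dir_prefix = 'cnv/'                              # hypothetical
dir_suffixes = ['part0/', 'part1/', 'part2/']    # hypothetical
for i, name in enumerate(['a.seg.txt', 'b.seg.txt', 'c.seg.txt', 'd.seg.txt']):
    out_key = dir_prefix + dir_suffixes[i % len(dir_suffixes)] + name + '.json'
    print(out_key)   # cnv/part0/a.seg.txt.json, cnv/part1/b.seg.txt.json, ...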
Example #7
def parse_cnv(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('cnv', "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename)
    return status
Example #8
def parse_isoform(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('mirna.isoform',
                      "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name,
                                                filename)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename)
    return status
Example #9
def parse_protein(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging("protein", "logs/" + metadata["AliquotBarcode"] + ".log")

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=1)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # validation
    tests.assert_notnull_property(data_df, columns_list=["Protein_Name"])

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename, metadata=metadata)
    return status
Example #10
def parse_methylation(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('mirna.isoform', "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=1)
    data_df.columns = ['Probe_Id', "Beta_Value", "Gene_Symbol", "Chromosome", "Genomic_Coordinate"]

    data_df = add_metadata(data_df, metadata)
    data_df = additional_changes(data_df)

    # upload the contents of the dataframe as CSV
    df_string = data_df.to_csv(index=False, header=False, float_format='%.2f')
    status = gcs.upload_blob_from_string(outfilename, df_string)

    return status
Example #11
def main(configfilename, hgnc_df_filename):
    # go through the AA files and create corrected files.
    try:
        log_filename = 'etl_process_antibody_files.log'
        log_name = 'etl_process_antibody_files'
        log = configure_logging(log_name, log_filename)

        log.info('start processing antibody annotation files')
        with open(configfilename) as configfile:
            config = json.load(configfile)
        download_antibody_annotation_files(config, log)
        process_antibody_annotation_files(config, log)
        fix_gene_protein_inconsistencies(config, hgnc_df_filename, log)
        log.info('done processing antibody annotation files')
    except Exception as e:
        log.exception('fatal problem processing antibody files')
        raise e
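
A hypothetical invocation; both file names below are illustrative, not taken
from the source.

# Placeholders only: config path and HGNC table file are invented.
main('etl_antibody_config.json', 'hgnc_approved_symbols.tsv')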
Example #12
def load(config):
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
          source_format, write_disposition, poll_interval, num_retries
    """
    log = configure_logging('cnv_load', 'logs/cnv_load.log')
    log.info('begin load of cnv into bigquery')
    
    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    #print "Loading Methylation 450K data into BigQuery.."
    #load_data_from_file.run(
    #    config['project_id'],
    #    config['bq_dataset'],
    #    config['methylation']['bq_table'],
    #    schemas_dir + config['methylation']['schema_file'],
    #    'gs://' + config['buckets']['open'] + '/' +\
    #        config['methylation']['output_dir'] + 'HumanMethylation450/*',
    #    'CSV',
    #    'WRITE_EMPTY'
    #)
    dir_prefix = config['cnv']['output_dir_prefix']
    dir_suffixes = config['cnv']['output_dir_suffixes']
    for dir_suffix in dir_suffixes:
        log.info("\tLoading CNV data into BigQuery from %s..." % (dir_prefix + dir_suffix))
        load_data_from_file.run(
            config['project_id'],
            config['bq_dataset'],
            config['cnv']['bq_table'],
            schemas_dir + config['cnv']['schema_file'],
            'gs://' + config['buckets']['open'] + '/' +\
                dir_prefix + dir_suffix + '*',
            'NEWLINE_DELIMITED_JSON',
            'WRITE_APPEND'
        )
        log.info("*"*30)

    log.info('finished load of CNV into bigquery')
Example #13
def load(config):
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
          source_format, write_disposition, poll_interval, num_retries
    """
    log = configure_logging('mirna_isoform_matrix_load',
                            'logs/mirna_isoform_matrix_load.log')
    log.info('begin load of mirna isoform matrix into bigquery')

    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    log.info("\tLoading Isoform HiSeq matrix data into BigQuery..")
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['mirna_isoform_matrix']['bq_table_hiseq'],
        schemas_dir + config['mirna_isoform_matrix']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +\
            config['mirna_isoform_matrix']['IlluminaHiSeq']['output_dir'] + '*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )
    log.info("*" * 30)
    log.info("\tLoading Isoform GA matrix data into BigQuery..")
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['mirna_isoform_matrix']['bq_table_ga'],
        schemas_dir + config['mirna_isoform_matrix']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +\
            config['mirna_isoform_matrix']['IlluminaGA']['output_dir'] + '*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_APPEND'
    )

    log.info('done load of mirna isoform matrix into bigquery')
Example #14
def load(config):
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
          source_format, write_disposition, poll_interval, num_retries
    """
    log = configure_logging('methylation_split', 'logs/methylation_load.log')
    log.info('begin load of methylation into bigquery')

    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    #print "Loading Methylation 450K data into BigQuery.."
    #load_data_from_file.run(
    #    config['project_id'],
    #    config['bq_dataset'],
    #    config['methylation']['bq_table'],
    #    schemas_dir + config['methylation']['schema_file'],
    #    'gs://' + config['buckets']['open'] + '/' +\
    #        config['methylation']['output_dir'] + 'HumanMethylation450/*',
    #    'CSV',
    #    'WRITE_EMPTY'
    #)
    log.info("\tLoading Methylation data into BigQuery...")
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['methylation']['bq_table'],
        schemas_dir + config['methylation']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +\
            config['methylation']['output_dir'] + '*',
        'CSV',
        'WRITE_APPEND'
    )

    main(config, log)

    log.info('finished load of methylation into bigquery')
Example #15
        action='store_true',
        default=False)
    parser.add_argument(
        '--create_new',
        help='Set create disposition to True. WARNING: this deletes the old '
             'files, database, task queue, etc.',
        action='store_true',
        default=False)
    parser.add_argument(
        '--debug',
        help='Runs the pipeline on the first 30 records and then exits. '
             'Useful to run after the dry run to test the workflow. '
             '(default: False)',
        action='store_true',
        default=False)

    args = parser.parse_args()

    log_filename = 'etl_{0}.log'.format(args.datatype)
    log_name = 'etl_{0}'.format(args.datatype)
    log = configure_logging(log_name, log_filename)


    results = main(
        args.datatype,
        args.config_file,
        args.max_workers,
        args.dry_run,
        args.create_new,
        args.debug
    )
Example #16
def identify_data(config):
    """Gets the metadata info from database
    """
    log = configure_logging('mirna.isoform', 'logs/mirna_isoform_extract_.log')
    log.info('start mirna isoform extract')
    # cloudSql connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']

    log.info("\tselect file names from db")
    sqlquery = """
        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, Pipeline, Platform,
              SampleType, SampleTypeCode, Study, DatafileName, DatafileNameKey, Datatype,
              DatafileUploaded, IncludeForAnalysis, DataCenterName
        FROM metadata_data
        WHERE DataType='Expression-miRNA Isoform'
        AND DatafileUploaded='true'
        AND DatafileNameKey LIKE '%hg19.mirbase20.isoform.quantification.txt'
        AND IncludeForAnalysis='yes'
        """

    # connect to db and get results in a dataframe
    metadata_df = read_mysql_query(host, database, user, passwd, sqlquery)

    # rename platforms in rows
    for i, row in metadata_df.iterrows():
        metadata = row.to_dict()
        if 'IlluminaHiSeq' in metadata['Platform']:
            metadata_df.loc[i, 'Platform'] = 'IlluminaHiSeq'
            platform = 'IlluminaHiSeq'
        elif 'IlluminaGA' in metadata['Platform']:
            metadata_df.loc[i, 'Platform'] = 'IlluminaGA'
            platform = 'IlluminaGA'
        else:
            raise Exception('Did not match any given platform')

        metadata_df.loc[i, 'OutDatafileNameKey'] = config['mirna']['isoform']['output_dir']\
                            + platform + '/' + metadata['DatafileName'] + '.json'

    metadata_df.loc[:, 'SampleTypeLetterCode'] = metadata_df['SampleTypeCode']\
                                               .map(lambda code: config['sample_code2letter'][code])

    metadata_df.loc[:, 'DatafileNameKey'] = metadata_df['DatafileNameKey']\
                                               .map(lambda inputfile: re.sub(r"^/", "", inputfile))

    # tag CELLC samples
    metadata_df['is_tumor'] = (metadata_df['SampleTypeLetterCode'] != "CELLC")
    metadata_df['transform_function'] = 'mirna.isoform.transform.parse_isoform'

    log.info("\tFound {0} rows, columns.".format(str(metadata_df.shape)))

    # Filter - check all "is_" fields - remember all 'is_' fields must be boolean
    all_flag_columns = [
        key for key in metadata_df.columns.values if key.startswith("is_")
    ]
    flag_df = metadata_df[all_flag_columns]
    metadata_df = metadata_df[flag_df.all(axis=1)]

    log.info("\tAfter filtering: Found {0} rows, columns.".format(
        str(metadata_df.shape)))

    log.info('finished mirna isoform extract')
    return metadata_df
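
The loop above collapses the raw platform strings to either IlluminaHiSeq or
IlluminaGA by substring match; a toy illustration with invented raw values:

# Toy version of the substring-based platform normalization above;
# unrecognized platforms would raise, exactly as in the snippet.
for raw in ['IlluminaHiSeq_miRNASeq', 'IlluminaGA_miRNASeq']:
    if 'IlluminaHiSeq' in raw:
        platform = 'IlluminaHiSeq'
    elif 'IlluminaGA' in raw:
        platform = 'IlluminaGA'
    else:
        raise Exception('Did not match any given platform')
    print(raw + ' -> ' + platform)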
Example #17
        action='store_true',
        default=False)
    parser.add_argument(
        '--create_new',
        help='Set create disposition to True. WARNING: this deletes the old '
             'files, database, task queue, etc.',
        action='store_true',
        default=False)
    parser.add_argument(
        '--debug',
        help='Runs the pipeline on the first 30 records and then exits. '
             'Useful to run after the dry run to test the workflow. '
             '(default: False)',
        action='store_true',
        default=False)

    args = parser.parse_args()

    log_filename = 'etl_{0}.log'.format(args.datatype)
    log_name = 'etl_{0}'.format(args.datatype)
    log = configure_logging(log_name, log_filename)


    results = main(
        args.datatype,
        args.config_file,
        args.max_workers,
        args.dry_run,
        args.create_new,
        args.debug
    )
Example #18
                (chromosome))
            time.sleep(20)

    except HttpError as err:
        print 'Error:', pprint.pformat(err.content)

    except AccessTokenRefreshError:
        print(
            "Credentials have been revoked or expired, please re-run "
            "the application to re-authorize")


def main(config, log):
    log.info('start splitting methylation data by chromosome')
    project_id = config['project_id']
    dataset_id = config['bq_dataset']
    chromosomes = map(str, range(1, 23)) + ['X', 'Y']
    #    chromosomes = map(lambda orig_string: 'chr' + orig_string, chr_nums)
    for chromosome in chromosomes:
        split_table_by_chr(chromosome, project_id, dataset_id, log)
    log.info('done splitting methylation data by chromosome')


if __name__ == '__main__':
    # setup logging
    with open(sys.argv[1]) as configfile:
        config = json.load(configfile)
    log = configure_logging('methylation_split',
                            "logs/methylation_transform_split" + '.log')
    main(config, log)
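
This snippet is Python 2 (print statement, map returning a list); on Python 3
the chromosome list would have to be materialized explicitly, for example:

# Python 3 equivalent of the chromosome list above (map is lazy there, so
# the direct list concatenation would fail).
chromosomes = [str(n) for n in range(1, 23)] + ['X', 'Y']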
Example #19
def identify_data(config):
    """Gets the metadata info from database
    """
    log = configure_logging('data_et', 'logs/data_et.log')
    log.info('start data extract and transform')
    # cloudSql connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']

    log.info("\tselect data records from db")
    sqlquery = """
        SELECT 
            ParticipantBarcode,
            SampleBarcode,
            AliquotBarcode,
            AliquotUUID,
            DataArchiveName,
            DataArchiveURL,
            DataArchiveVersion,
            DataCenterCode,
            DataCenterName,
            DataCenterType,
            DatafileMD5,
            DatafileName,
            DatafileNameKey,
            DatafileUploaded,
            DataLevel,
            Datatype,
            GenomeReference,
            IncludeForAnalysis,
            MAGETabArchiveName,
            MAGETabArchiveURL,
            Pipeline,
            Platform,
            Project,
            Repository,
            SampleType,
            SampleTypeCode,
            SDRFFileName,
            SDRFFileNameKey,
            SecurityProtocol,
            Species,
            Study,
            wasDerivedFrom,
            library_strategy,
            state,
            reason_for_state,
            analysis_id,
            analyte_code,
            last_modified,
            platform_full_name,
            GG_dataset_id,
            GG_readgroupset_id
        FROM metadata_data
        WHERE Project = 'TCGA'
            AND study = '%s'
            AND DatafileUploaded='true'
            AND DatafileNameKey is not null
            AND IncludeForAnalysis='yes'
        """

    studies = config['all_tumor_types']
    for study in studies:
        # connect to db and get results in a dataframe
        log.info("\tselect data records from db for %s" % (study))
        metadata_df = read_mysql_query(host, database, user, passwd, sqlquery % (study))
        log.info("\tFound {0} rows, columns." .format(str(metadata_df.shape)))
    
        project_id = config['project_id']
        bucket_name = config['buckets']['open']
        gcs = GcsConnector(project_id, bucket_name)
        log.info("\tupload data to GCS.")
        gcs.convert_df_to_njson_and_upload(metadata_df, config['data']['output_dir'] + config['data']['bq_table'] + '_%s.json' % (study))
    
    log.info('finished extract and transform')


def main(config):

    log_filename = 'etl_download_isoform.log'
    log_name = 'etl_download_isoform'
    log = configure_logging(log_name, log_filename)
    log.info('begin downloading isoform files')
    #    etl = util.DataETL("isb-cgc", "isb-cgc-open") # this starts a new connection
    project_id = config['project_id']
    bucket_name = config['buckets']['open']

    # connect to bucket to get files
    gcs = GcsConnector(project_id, bucket_name)
    isoform_file = re.compile(r"^.*\.isoform\.quantification\.txt\.json$")
    data_library = gcs.search_files(
        search_patterns=['.isoform.quantification.txt'],
        regex_search_pattern=isoform_file,
        prefixes=[config['mirna_isoform_matrix']['isoform_gcs_dir']])
    # we are eliminating bad files - size 0; could be hg18 etc
    data_library.loc[:, 'basefilename'] = data_library['filename'].map(
        lambda x: os.path.splitext(os.path.basename(x))[0].replace(
            '.json', ''))
    data_library = data_library.query('size > 0')

    log.info('\tbegin selecting isoform files from the SQLite isoform db')
    conn = sqlite3.connect(config['mirna_isoform_matrix']['isoform_file_db'])
    sql = 'SELECT * from {0}'.format('task_queue')
    all_files_df = pd.read_sql_query(sql, conn)
    conn.close()
    log.info('\tfinished selecting isoform files')

    log.info('\tbegin reading from downloaded files')
    with open(config['mirna_isoform_matrix']
              ['isoform_download_prev_files']) as f:
        lines = f.read().splitlines()
    log.info('\tfinished reading from downloaded files')

    log.info(
        'filter files.\n\tfiles in cloud storage: %s\n\tfiles previously marked to download: %s\n%s\n'
        % (len(data_library), len(all_files_df), data_library))
    all_files_df = all_files_df[(all_files_df.DatafileName.isin(
        data_library.basefilename))]
    all_files_df = all_files_df[~(all_files_df.DatafileName.isin(lines))]
    data_library = all_files_df
    log.info('finished filter files: %s\n%s\n' %
             (len(data_library), data_library))

    conn = sqlite3.connect(config['mirna_isoform_matrix']['isoform_file_db'])
    submit_to_queue(data_library, conn, 'task_queue', log)
    queue_df = data_library

    # restart ETL; this gets the diff; also takes care of errors
    try:
        conn = sqlite3.connect('isoform_download.db')
        sql = 'SELECT * from task_queue_status where errors="None"'
        queue_df2 = pd.read_sql_query(sql, conn)
        log.info('\tso far completed: %s' % (len(queue_df2)))
        queue_df = queue_df[~(
            queue_df.DatafileNameKey.isin(queue_df2.DatafileNameKey))]
        log.info('\tso far not completed: %s' % (len(queue_df)))
    except Exception:
        log.exception(
            '\n++++++++++++++++++++++\n\tproblem filtering completed jobs, ignoring\n++++++++++++++++++++++\n'
        )

    # -----------------------------------------------------
    # thread this with concurrent futures
    #------------------------------------------------------
    log.info('\tsubmit jobs to process manager')
    pm = process_manager.ProcessManager(max_workers=200,
                                        db='isoform_download.db',
                                        table='task_queue_status',
                                        log=log)
    for count, df in data_library.iterrows():
        row = df.to_dict()
        if 0 == count % 512:
            time.sleep(10)
        if 0 == count % 2048:
            log.info('\t\tsubmitting %s file: %s' %
                     (count, row['DatafileName']))
        if not os.path.isdir(
                config['mirna_isoform_matrix']['isoform_download_dir'] +
                row['Platform']):
            os.makedirs(
                config['mirna_isoform_matrix']['isoform_download_dir'] +
                row['Platform'])
        outfilename = config['mirna_isoform_matrix'][
            'isoform_download_dir'] + row['Platform'] + "/" + row[
                'DatafileName']
        pm.submit(download_file, project_id, bucket_name,
                  row['DatafileNameKey'], outfilename, '')
        time.sleep(0.2)
    log.info('\tsubmitted %s total jobs to process manager' % (count))

    log.info('\tstart process manager completion check')
    pm.start()

    log.info('finished downloading isoform files')
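
The submission loop above throttles itself (a pause every 512 rows, a progress
line every 2048 rows); a self-contained toy version of that pattern, with
submit() standing in for pm.submit(download_file, ...):

import time

def submit(item):
    # stand-in for pm.submit(download_file, ...); does nothing here
    pass

# The 512/2048 constants come from the snippet; the sleep is shortened
# from 10 seconds to keep the toy quick.
for count, item in enumerate(range(5000)):
    if count % 512 == 0:
        time.sleep(0.01)
    if count % 2048 == 0:
        print('submitting item #%d' % count)
    submit(item)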