Example #1
def parse_methylation(project_id, bucket_name, filename, outfilename,
                      metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('mirna.isoform',
                      "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs,
                                                project_id,
                                                bucket_name,
                                                filename,
                                                skiprows=1)
    data_df.columns = [
        'Probe_Id', "Beta_Value", "Gene_Symbol", "Chromosome",
        "Genomic_Coordinate"
    ]

    data_df = add_metadata(data_df, metadata)
    data_df = additional_changes(data_df)

    # upload the contents of the dataframe as a CSV string
    df_string = data_df.to_csv(index=False, header=False, float_format='%.2f')
    status = gcs.upload_blob_from_string(outfilename, df_string)

    return status
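
The parsers above lean on two repo-level helpers, add_metadata and additional_changes, that are not shown in these examples. A minimal sketch of what they might look like for the methylation frame, assuming metadata is a flat dict of scalar values; the column handling is illustrative rather than the actual implementation:

def add_metadata(data_df, metadata):
    # hypothetical sketch: broadcast each scalar metadata value
    # (AliquotBarcode, Study, ...) into its own column on every row
    for key, value in metadata.items():
        data_df[key] = value
    return data_df


def additional_changes(data_df):
    # hypothetical sketch: typical cleanup for a methylation frame,
    # e.g. drop probes with no beta value and coerce the type
    data_df = data_df[data_df['Beta_Value'].notnull()]
    data_df['Beta_Value'] = data_df['Beta_Value'].astype(float)
    return data_df

The real versions almost certainly do more (renames, per-datatype coercion), but this is the shape of the step.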
Example #2
def parse_isoform(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    log = configure_logging(
        'mirna.isoform.transform',
        "logs/mirna_isoform_transform_" + metadata['AliquotBarcode'] + '.log')
    try:
        log.info('start transform of %s' % (metadata['AliquotBarcode']))
        # connect to the cloud bucket
        gcs = GcsConnector(project_id, bucket_name)
    
        # main steps: download, convert to df, cleanup, transform, add metadata
        log.info('\tadd changes and metadata for %s' % (metadata['AliquotBarcode']))
        data_df = gcutils.convert_blob_to_dataframe(
            gcs, project_id, bucket_name, filename, log=log)
        data_df = additional_changes(data_df)
        data_df = add_metadata(data_df, metadata)
    
        # upload the contents of the dataframe in njson format
        status = gcs.convert_df_to_njson_and_upload(data_df, outfilename)
        log.info('finished transform of %s' % (metadata['AliquotBarcode']))
    except Exception:
        log.exception('problem transforming %s' % (metadata['AliquotBarcode']))
        raise
    return status
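
configure_logging is another shared helper; example #1 discards its return value while example #2 keeps the logger it returns and logs progress through it. A minimal sketch under the assumption that it simply wires a named logger to a per-aliquot log file (purely illustrative, not the repo's actual code):

import logging


def configure_logging(name, log_path):
    # hypothetical sketch: create/fetch a named logger and attach a
    # file handler so each aliquot gets its own transform log
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    handler = logging.FileHandler(log_path)
    handler.setFormatter(
        logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(handler)
    return logger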
Example #3
def main():
    """Example to download a file from the Google Storage, transform,
        and load to Google Storage and BigQuery
    """

    project_id = ''
    bucket_name = ''
    # example file in bucket
    filename = ''
    outfilename = ''

    # connect to the google cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=1)

    #---------------------------------------------------------
    # get required information
    # get chromosome 1 and Genomic_Coordinate > 20000000
    #---------------------------------------------------------
    data_df = (data_df.query("Chromosome == '1' and Genomic_Coordinate > 20000000")
               .query("Beta_value > 0.2"))
    # we can assign this query to a new dataframe and have new data

    # upload the contents of the dataframe in njson format to google storage
    # set metadata on the blob/object
    metadata = {'info': 'etl-test'}
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename, metadata=metadata)

    print(status)
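
The chained .query() filter above can be checked in isolation. A self-contained pandas snippet with made-up rows, showing the same predicate both as a single query string and as the equivalent boolean mask:

import pandas as pd

data_df = pd.DataFrame({
    'Chromosome': ['1', '1', '2'],
    'Genomic_Coordinate': [25000000, 10000000, 30000000],
    'Beta_value': [0.5, 0.9, 0.7],
})

# same filter as above, written as one query string
filtered = data_df.query(
    "Chromosome == '1' and Genomic_Coordinate > 20000000 and Beta_value > 0.2")

# equivalent boolean-mask form
mask = ((data_df['Chromosome'] == '1')
        & (data_df['Genomic_Coordinate'] > 20000000)
        & (data_df['Beta_value'] > 0.2))
assert filtered.equals(data_df[mask])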
Example #4
def parse_protein(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('protein', "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs,
                                                project_id,
                                                bucket_name,
                                                filename,
                                                skiprows=1)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # validation
    tests.assert_notnull_property(data_df, columns_list=['Protein_Name'])

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df,
                                                outfilename,
                                                metadata=metadata)
    return status
Example #5
def parse_cnv(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('cnv', "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename)
    return status
Example #6
def parse_isoform(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('mirna.isoform',
                      "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name,
                                                filename)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename)
    return status
Example #7
def parse_protein(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging("protein", "logs/" + metadata["AliquotBarcode"] + ".log")

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=1)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # validation
    tests.assert_notnull_property(data_df, columns_list=["Protein_Name"])

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename, metadata=metadata)
    return status
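
tests.assert_notnull_property is the repo's validation hook; judging from the call site it asserts that the listed columns contain no nulls before the upload step. A hypothetical sketch of such a helper:

def assert_notnull_property(data_df, columns_list):
    # hypothetical sketch: fail fast if any required column has nulls,
    # so bad rows never reach the BigQuery load
    for column in columns_list:
        null_count = data_df[column].isnull().sum()
        assert null_count == 0, (
            '%s null values found in column %s' % (null_count, column))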
Example #8
def main():
    """Example to download a file from the Google Storage, transform,
        and load to Google Storage and BigQuery
    """

    project_id = ''
    bucket_name = ''
    # example file in bucket
    filename = 'TCGA-OR-A5J1-01A-11D-A29J-05.txt'
    outfilename = ''

    # alternative: read a local copy of the file directly into a pandas dataframe
    data_df = pandas.read_table(filename, sep="\t", skiprows=1,
                                lineterminator='\n', comment='#')

    # clean up the dataframe for upload to BigQuery
    data_df = cleanup_dataframe(data_df)

    # connect to the google cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download the blob and convert to a df (this replaces the locally read data_df)
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=1)

    #---------------------------------------------------------
    # get required information
    # get chromosome 1 and Genomic_Coordinate > 20000000
    #---------------------------------------------------------
    data_df = (data_df.query("Chromosome == '1' and Genomic_Coordinate > 20000000")
               .query("Beta_value > 0.2"))
    # we can assign this query to a new dataframe and have new data

    # upload the contents of the dataframe in njson format to google storage
    # set metadata on the blob/object
    metadata = {'info': 'etl-test'}
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename, metadata=metadata)

    print(status)
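
cleanup_dataframe is referenced but not defined in this example; its job is to make the locally read frame safe for BigQuery. A minimal sketch, assuming the main concerns are BigQuery-legal column names (letters, digits, underscores) and dropping fully empty rows and columns; the actual utility may do more:

import re


def cleanup_dataframe(data_df):
    # hypothetical sketch: BigQuery column names may only contain
    # letters, digits and underscores, so rewrite anything else
    data_df.columns = [
        re.sub(r'[^A-Za-z0-9_]', '_', str(col)).strip('_')
        for col in data_df.columns
    ]
    # drop columns and rows that are entirely empty
    data_df = data_df.dropna(axis=1, how='all').dropna(axis=0, how='all')
    return data_df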
Example #9
def parse_methylation(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('mirna.isoform', "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=1)
    data_df.columns = ['Probe_Id', "Beta_Value", "Gene_Symbol", "Chromosome", "Genomic_Coordinate"]

    data_df = add_metadata(data_df, metadata)
    data_df = additional_changes(data_df)

    # upload the contents of the dataframe as a CSV string
    df_string = data_df.to_csv(index=False, header=False, float_format='%.2f')
    status = gcs.upload_blob_from_string(outfilename, df_string)

    return status
Example #10
def main():
    """Example to download a file from the Google Storage, transform,
        and load to Google Storage and BigQuery
    """

    project_id = ''
    bucket_name = ''
    # example file in bucket
    filename = ''
    outfilename = ''

    # connect to the google cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df
    data_df = gcutils.convert_blob_to_dataframe(gcs,
                                                project_id,
                                                bucket_name,
                                                filename,
                                                skiprows=1)

    #---------------------------------------------------------
    # get required information
    # get chromosome 1 and Genomic_Coordinate > 20000000
    #---------------------------------------------------------
    data_df = (data_df.query("Chromosome == '1' and Genomic_Coordinate > 20000000")
               .query("Beta_value > 0.2"))
    # we can assign this query to a new dataframe and have new data

    # upload the contents of the dataframe in njson format to google storage
    # set metadata on the blob/object
    metadata = {'info': 'etl-test'}
    status = gcs.convert_df_to_njson_and_upload(data_df,
                                                outfilename,
                                                metadata=metadata)

    print(status)
Example #11
def process_oncotator_output(project_id, bucket_name, data_library, bq_columns, sample_code2letter):
    study = data_library['Study'].iloc[0]

    # this needed to stop pandas from converting them to FLOAT
    dtype = {
        "Transcript_Exon": "object",
        "NCBI_Build": "object",
        "COSMIC_Total_Alterations_In_Gene": "object",
        "CCLE_ONCOMAP_Total_Mutations_In_Gene": "object",
        "HGNC_HGNC_ID": "object",
        "UniProt_AApos": "object",
        "Transcript_Position": "object",
        "HGNC_OMIM_ID_Supplied_By_NCBI": "object"
    }


    file_count = 0
   
    # create an empty dataframe. we use this to merge dataframe
    disease_bigdata_df = pd.DataFrame()

    # iterate over the selected files
    for oncotator_file in data_library['filename']:
        file_count += 1

        log.info('-'*10 + "{0}: Processing file {1}".format(file_count, oncotator_file) + '-'*10)

        try:
            gcs = GcsConnector(project_id, bucket_name)
            # convert the file to a dataframe
            filename = 'tcga/intermediary/MAF/oncotator_output_files/' + oncotator_file
            df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename)
        except Exception as e:
            print(e)
            raise

        if df.empty:
            log.debug('empty dataframe for file: ' + str(oncotator_file))
            continue

        #------------------------------
        # different operations on the frame
        #------------------------------
        # get only the required BigQuery columns
        df = df[bq_columns]
       
        # format oncotator columns; name changes etc 
        df = format_oncotator_columns(df)

        # add new columns
        df = add_columns(df, sample_code2letter, study)

        disease_bigdata_df = disease_bigdata_df.append(df, ignore_index=True)
            

    # this is a merged dataframe
    if not disease_bigdata_df.empty:

        # remove duplicates; various rules; see check_duplicates
        disease_bigdata_df = check_duplicates.remove_maf_duplicates(
            disease_bigdata_df, sample_code2letter)

        # enforce unique mutation
        unique_mutation = ['Chromosome', 'Start_Position', 'End_Position',
                           'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2',
                           'Tumor_AliquotBarcode']

        # merge mutations from multiple centers
        concat_df = []
        for idx, df_group in disease_bigdata_df.groupby(unique_mutation):
            if len(df_group) > 1:
                # tolist; unique list; sort; concat
                df_group.loc[:, 'Center'] = ";".join(
                    map(str, sorted(set(df_group['Center'].tolist()))))
            concat_df.append(df_group)
        disease_bigdata_df = pd.concat(concat_df)

        # enforce unique mutation
        disease_bigdata_df = remove_duplicates(disease_bigdata_df, unique_mutation)

        # convert the df to new-line JSON and then upload the file
        gcs.convert_df_to_njson_and_upload(
            disease_bigdata_df,
            "tcga/intermediary/MAF/bigquery_data_files/{0}.json".format(study))

    else:
        raise Exception('Empty dataframe!')
    return True
Example #12
def process_oncotator_output(project_id, bucket_name, data_library, bq_columns,
                             sample_code2letter, oncotator_object_path,
                             oncotator_object_output_path):
    study = data_library['Study'].iloc[0]

    # this needed to stop pandas from converting them to FLOAT
    dtype = {
        "Transcript_Exon": "object",
        "NCBI_Build": "object",
        "COSMIC_Total_Alterations_In_Gene": "object",
        "CCLE_ONCOMAP_Total_Mutations_In_Gene": "object",
        "HGNC_HGNC_ID": "object",
        "UniProt_AApos": "object",
        "Transcript_Position": "object",
        "HGNC_OMIM_ID_Supplied_By_NCBI": "object"
    }

    file_count = 0

    # create an empty dataframe. we use this to merge dataframe
    disease_bigdata_df = pd.DataFrame()

    # iterate over the selected files
    for oncotator_file in data_library['filename']:
        file_count += 1

        log.info(
            '-' * 10 +
            "{0}: Processing file {1}".format(file_count, oncotator_file) +
            '-' * 10)

        try:
            # convert the file to a dataframe
            filename = oncotator_object_path + oncotator_file
            gcs = GcsConnector(project_id, bucket_name)
            log.info('%s: converting %s to dataframe' % (study, filename))
            df = gcutils.convert_blob_to_dataframe(gcs,
                                                   project_id,
                                                   bucket_name,
                                                   filename,
                                                   log=log)
            log.info('%s: done converting %s to dataframe' % (study, filename))
        except RuntimeError as re:
            log.warning('%s: problem cleaning dataframe for %s: %s' %
                        (study, filename, re))
            continue
        except Exception as e:
            log.exception('%s: problem converting to dataframe for %s: %s' %
                          (study, filename, e))
            raise e

        if df.empty:
            log.warning('%s: empty dataframe for file: %s' %
                        (study, oncotator_file))
            continue

        #------------------------------
        # different operations on the frame
        #------------------------------
        # get only the required BigQuery columns
        df = df[bq_columns]

        # format oncotator columns; name changes etc
        df = format_oncotator_columns(df)

        # add new columns
        df = add_columns(df, sample_code2letter, study)

        disease_bigdata_df = disease_bigdata_df.append(df, ignore_index=True)

        log.info('-' * 10 + "{0}: Finished file({3}) {1}. rows: {2}".format(
            file_count, oncotator_file, len(df), study) + '-' * 10)

    # this is a merged dataframe
    if not disease_bigdata_df.empty:
        # remove duplicates; various rules; see check_duplicates

        log.info(
            '\tcalling check_duplicates to collapse aliquots with %s rows' %
            (len(disease_bigdata_df)))
        disease_bigdata_df = check_duplicates.remove_maf_duplicates(
            disease_bigdata_df, sample_code2letter, log)
        log.info(
            '\tfinished check_duplicates to collapse aliquots with %s rows' %
            (len(disease_bigdata_df)))

        # enforce unique mutation--previous
        # unique_mutation = ['Chromosome', 'Start_Position', 'End_Position', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'Tumor_AliquotBarcode']
        # enforce unique mutation
        unique_mutation = [
            'Hugo_Symbol', 'Entrez_Gene_Id', 'Chromosome', 'Start_Position',
            'End_Position', 'Reference_Allele', 'Tumor_Seq_Allele1',
            'Tumor_Seq_Allele2', 'Tumor_AliquotBarcode'
        ]
        # merge mutations from multiple centers
        log.info(
            '\tconsolidate the centers for duplicate mutations into list for %s'
            % (study))
        seencenters = set()

        def concatcenters(df_group):
            if len(df_group) > 1:
                centers = list(set(df_group['Center'].tolist()))
                uniquecenters = set()
                delim = config['maf']['center_delim']
                for center in centers:
                    fields = center.split(delim)
                    for field in fields:
                        uniquecenters.add(field)
                sortedunique = delim.join(sorted(list(uniquecenters)))
                df_group.loc[:, 'Center'] = sortedunique
                if sortedunique not in seencenters:
                    log.info('unique centers: %s' % sortedunique)
                    seencenters.add(sortedunique)
            return df_group

        disease_bigdata_df = disease_bigdata_df.groupby(unique_mutation).apply(
            concatcenters)
        log.info(
            '\tfinished consolidating centers for duplicate mutations for %s' %
            (study))

        # enforce unique mutation
        log.info(
            '\tcalling remove_duplicates to collapse mutations with %s rows for %s'
            % (len(disease_bigdata_df), study))
        disease_bigdata_df = remove_duplicates(disease_bigdata_df,
                                               unique_mutation)
        log.info(
            '\tfinished remove_duplicates to collapse mutations with %s rows for %s'
            % (len(disease_bigdata_df), study))

        # convert the disease_bigdata_df to new-line JSON and upload the file
        uploadpath = oncotator_object_output_path + "{0}.json".format(study)
        log.info('%s: uploading %s to GCS' % (study, uploadpath))
        gcs.convert_df_to_njson_and_upload(disease_bigdata_df, uploadpath)
        log.info('%s: done uploading %s to GCS' % (study, uploadpath))

    else:
        log.warning('Empty merged dataframe for study %s!' % study)
    return True
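
remove_duplicates is not shown in either version of process_oncotator_output. Assuming it is a thin wrapper over DataFrame.drop_duplicates keyed on the unique-mutation columns, the consolidate-then-dedup step can be exercised on toy data like this (the rows are made up, and groupby/transform stands in for the per-group loop/apply used above):

import pandas as pd


def remove_duplicates(df, key_columns):
    # hypothetical sketch: keep one row per unique mutation key
    return df.drop_duplicates(subset=key_columns).reset_index(drop=True)


unique_mutation = ['Chromosome', 'Start_Position', 'Tumor_AliquotBarcode']
df = pd.DataFrame({
    'Chromosome': ['1', '1', '2'],
    'Start_Position': [100, 100, 200],
    'Tumor_AliquotBarcode': ['TCGA-XX-0001', 'TCGA-XX-0001', 'TCGA-XX-0002'],
    'Center': ['broad.mit.edu', 'bcgsc.ca', 'broad.mit.edu'],
})

# collapse the reporting centers for rows that share a mutation key
df['Center'] = df.groupby(unique_mutation)['Center'].transform(
    lambda centers: ';'.join(sorted(set(centers))))

# then enforce one row per mutation
df = remove_duplicates(df, unique_mutation)
print(df)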