示例#1
0
def save_db(maf_path, db_path, hypermutator_count):
    """Load one or more MAF files into the ``maf_mutation`` sqlite table.

    The files are merged in the order given. A sample (``Tumor_Sample``)
    that already appeared in an earlier file is dropped from every later
    file, so earlier files take priority. After the merged table is
    written, ``filter_hypermutators`` is run against it.

    **Parameters**

    maf_path : str
        comma-separated list of paths to tab-delimited MAF files
    db_path : str
        path to sqlite db
    hypermutator_count : int
        threshold forwarded to ``filter_hypermutators``
    """
    df_cols = ['Gene_Symbol', 'Tumor_Sample', 'Tumor_Type', 'Chromosome',
               'Start_Position', 'End_Position', 'Variant_Classification',
               'Reference_Allele', 'Tumor_Allele', 'Protein_Change']
    # start from an empty frame so the first file is kept in full
    df = pd.DataFrame(columns=df_cols)
    for single_maf in maf_path.split(','):
        tmp_df = pd.read_csv(single_maf, sep='\t')
        # samples already contributed by earlier (higher priority) files
        samp_names = set(df['Tumor_Sample'].tolist())
        # vectorized membership test; always yields a boolean mask
        is_new_sample = ~tmp_df['Tumor_Sample'].isin(samp_names)
        df = pd.concat([df, tmp_df[is_new_sample]])

    _utils.drop_table('maf_mutation', db_path, kind='sqlite')
    conn = sqlite3.connect(db_path)  # open connection
    try:
        # save tsv to sqlite3 database
        psql.write_frame(df,  # pandas dataframe
                         'maf_mutation',  # table name
                         con=conn,  # connection
                         flavor='sqlite',  # use sqlite
                         if_exists='replace')  # drop table if exists

        # filter hypermutator samples
        filter_hypermutators(hypermutator_count, conn, db_path)
    finally:
        # the connection is local to this function, so always release it
        conn.close()
示例#2
0
def save_db(df, genedb_path):
    """Saves the data into the gene_features table.

    If the table already exists, the table is dropped and then
    re-inserted. The sqlite connection opened here is closed even when
    the write fails.

    **Parameters**

    df : pd.DataFrame
        data to insert into DB table
    genedb_path : str
        path to sqlite db
    """
    logger.debug('Dropping gene_features table IF EXISTS.')
    _utils.drop_table('gene_features', genes_db_path=genedb_path, kind='sqlite')  # drop table if exists
    logger.debug('After dropping gene_features table IF EXISTS.')

    logger.info('Saving gene_features table ...')
    conn = sqlite3.connect(genedb_path)  # open connection
    try:
        # save to sqlite3 database
        psql.write_frame(df,  # pandas dataframe
                         'gene_features',  # table name
                         con=conn,  # connection
                         flavor='sqlite',  # use sqlite
                         if_exists='replace')  # drop table if exists
    finally:
        # release the connection on both the success and the error path
        conn.close()
    logger.info('Finished saving gene_features table.')
示例#3
0
def save_db(maf_path, db_path, hypermutator_count):
    """Merge MAF files and store them in the ``maf_mutation`` sqlite table.

    Files are read in the order listed in ``maf_path``. Rows of a later
    file are discarded when their ``Tumor_Sample`` was already seen in an
    earlier file, giving earlier files priority. Once the merged data is
    written, ``filter_hypermutators`` is applied to the table.

    **Parameters**

    maf_path : str
        comma-separated list of paths to tab-delimited MAF files
    db_path : str
        path to sqlite db
    hypermutator_count : int
        threshold forwarded to ``filter_hypermutators``
    """
    df_cols = [
        'Gene_Symbol', 'Tumor_Sample', 'Tumor_Type', 'Chromosome',
        'Start_Position', 'End_Position', 'Variant_Classification',
        'Reference_Allele', 'Tumor_Allele', 'Protein_Change'
    ]
    # empty seed frame: nothing is filtered out of the first file
    df = pd.DataFrame(columns=df_cols)
    for single_maf in maf_path.split(','):
        tmp_df = pd.read_csv(single_maf, sep='\t')
        # samples that earlier files have already provided
        samp_names = set(df['Tumor_Sample'].tolist())
        # boolean mask of rows whose sample has not been seen yet
        unseen = ~tmp_df['Tumor_Sample'].isin(samp_names)
        df = pd.concat([df, tmp_df[unseen]])

    _utils.drop_table('maf_mutation', db_path, kind='sqlite')
    conn = sqlite3.connect(db_path)  # open connection
    try:
        # save tsv to sqlite3 database
        psql.write_frame(
            df,  # pandas dataframe
            'maf_mutation',  # table name
            con=conn,  # connection
            flavor='sqlite',  # use sqlite
            if_exists='replace')  # drop table if exists

        # filter hypermutator samples
        filter_hypermutators(hypermutator_count, conn, db_path)
    finally:
        # this function owns the connection, so it must not leak it
        conn.close()
示例#4
0
def save_db(df, genedb_path):
    """Saves the data into the gene_features table.

    If the table already exists, the table is dropped and then
    re-inserted. The connection is closed whether or not the write
    succeeds.

    **Parameters**

    df : pd.DataFrame
        data to insert into DB table
    genedb_path : str
        path to sqlite db
    """
    logger.debug('Dropping gene_features table IF EXISTS.')
    _utils.drop_table('gene_features',
                      genes_db_path=genedb_path,
                      kind='sqlite')  # drop table if exists
    logger.debug('After dropping gene_features table IF EXISTS.')

    logger.info('Saving gene_features table ...')
    conn = sqlite3.connect(genedb_path)  # open connection
    try:
        # save to sqlite3 database
        psql.write_frame(
            df,  # pandas dataframe
            'gene_features',  # table name
            con=conn,  # connection
            flavor='sqlite',  # use sqlite
            if_exists='replace')  # drop table if exists
    finally:
        # avoid leaking the handle when write_frame raises
        conn.close()
    logger.info('Finished saving gene_features table.')
示例#5
0
def filter_hypermutators(hypermutator_count, conn, db_path=''):
    """Rewrite ``maf_mutation`` so it only holds non-hypermutated samples.

    Counts the rows per ``Tumor_Sample`` in ``maf_mutation``, selects
    every row whose sample has fewer than ``hypermutator_count`` rows,
    and writes that selection back under the same table name.

    **Parameters**

    hypermutator_count : int
        samples with mutation counts below this number are allowed
    conn : db connection
        database connection
    db_path : str
        if using non-config defined db, specify the db path
    """
    # The innermost query tags each row with a 1, the middle query sums
    # those tags per sample, and the outer query keeps samples under the
    # threshold.
    query_template = ''.join([
        "SELECT *",
        " FROM maf_mutation",
        " WHERE Tumor_Sample in (",
        "     SELECT y.Tumor_Sample",
        "     FROM (",
        "         SELECT x.Tumor_Sample, SUM(x.mut_indicator) as MutationCounts",
        "         FROM ( ",
        "             SELECT Tumor_Sample, 1 as mut_indicator",
        "             FROM maf_mutation",
        "         ) x ",
        "         GROUP BY Tumor_Sample",
        "     ) y",
        "     WHERE y.MutationCounts<%d",
        " )",
    ])
    query = query_template % hypermutator_count

    # mutations belonging to samples below the threshold
    retained = psql.frame_query(query, conn)

    _utils.drop_table('maf_mutation', db_path, kind='sqlite')

    # put the filtered rows back under the original table name
    psql.write_frame(retained, 'maf_mutation', conn,
                     flavor='sqlite', if_exists='replace')
示例#6
0
def filter_hypermutators(hypermutator_count, conn, db_path=''):
    """Keep only mutations from samples below the hypermutator threshold.

    Reads from ``maf_mutation`` the rows of every ``Tumor_Sample`` that
    has fewer than ``hypermutator_count`` rows in that table, then
    replaces the table contents with those rows.

    **Parameters**

    hypermutator_count : int
        samples with mutation counts below this number are allowed
    conn : db connection
        database connection
    db_path : str
        if using non-config defined db, specify the db path
    """
    # x: one indicator per mutation row; y: per-sample totals
    select_template = (
        "SELECT *"
        " FROM maf_mutation"
        " WHERE Tumor_Sample in ("
        "     SELECT y.Tumor_Sample"
        "     FROM ("
        "         SELECT x.Tumor_Sample, SUM(x.mut_indicator) as MutationCounts"
        "         FROM ( "
        "             SELECT Tumor_Sample, 1 as mut_indicator"
        "             FROM maf_mutation"
        "         ) x "
        "         GROUP BY Tumor_Sample"
        "     ) y"
        "     WHERE y.MutationCounts<%d"
        " )"
    )
    select_stmt = select_template % hypermutator_count

    # get non hypermutator mutations
    non_hyper_df = psql.frame_query(select_stmt, conn)

    _utils.drop_table('maf_mutation', db_path, kind='sqlite')

    # store the filtered mutations under the same table name
    psql.write_frame(
        non_hyper_df,
        'maf_mutation',
        con=conn,
        flavor='sqlite',
        if_exists='replace')
示例#7
0
def save_db(hypermutator_ct,
            gene_tsv_path,
            genedb_path,
            is_genes_tgz=False,
            only_genome_wide=True,
            use_unknown_status=False):
    """Saves tab delim gene mutation file to a sqlite3 db.

    The file is loaded, cleaned up according to ``is_genes_tgz``, written
    to the ``cosmic_mutation`` table, and then passed through
    ``filter_hypermutators``.

    NOTE: Uses pandas to store all contents in memory and then
    saves to sqlite db. This may cause large memory usage.

    NOTE: CNV import (``cosmic_cnv`` table) and cell-line sample filtering
    are not performed by this function.

    Parameters
    ----------
    hypermutator_ct : int
        filter for overly mutated samples
    gene_tsv_path : str
        path to tab delim file containing all gene mutations
    genedb_path : str
        path to sqlite3 db
    is_genes_tgz : bool
        if True, sample names are normalized with ``parse_sample_name``
        and the hg18/hg19 coordinate columns are converted to int;
        otherwise the frame goes through
        ``handle_cosmic_mutation_export``
    only_genome_wide : bool
        forwarded to ``handle_cosmic_mutation_export``
    use_unknown_status : bool
        forwarded to ``handle_cosmic_mutation_export``
    """
    # read data
    df = pd.read_csv(gene_tsv_path, sep='\t')

    if is_genes_tgz:
        # fix sample names so they match with external data
        df['SampleName'] = df['SampleName'].apply(parse_sample_name)

        # fix types that pandas gets wrong
        # see http://pandas.pydata.org/pandas-docs/dev/gotchas.html
        # for details on missing NA support for integers;
        # missing values become -1 so the column can be cast to int
        coord_cols = ('hg18chrom', 'hg19chrom',
                      'hg18start', 'hg19start',
                      'hg18end', 'hg19end')
        for col in coord_cols:
            df[col] = df[col].fillna(-1).astype(int)
    else:
        df = handle_cosmic_mutation_export(df, only_genome_wide,
                                           use_unknown_status)

    # drop table if already exists
    _utils.drop_table('cosmic_mutation', genedb_path, kind='sqlite')

    conn = sqlite3.connect(genedb_path)  # open connection
    try:
        # save tsv to sqlite3 database
        psql.write_frame(
            df,  # pandas dataframe
            'cosmic_mutation',  # table name
            con=conn,  # connection
            flavor='sqlite',  # use sqlite
            if_exists='replace')  # drop table if exists

        # drop table and re-insert data without hypermutators
        filter_hypermutators(hypermutator_ct, conn, genedb_path)
    finally:
        # close even if the write or the filter step raised
        conn.close()