def save_db(maf_path, db_path, hypermutator_count):
    """Load one or more MAF files into the ``maf_mutation`` sqlite table.

    The files are merged in the order given. Rows of a later file are
    dropped when their ``Tumor_Sample`` already appeared in an earlier file,
    so earlier files take priority. Once the table is written,
    ``filter_hypermutators`` is run against it.

    **Parameters**

    maf_path : str
        comma-separated list of paths to tab-delimited MAF files
    db_path : str
        path to sqlite db
    hypermutator_count : int
        threshold forwarded to ``filter_hypermutators``
    """
    # merge all data frames together with the first
    # data frames given priority over later data frames
    df_cols = ['Gene_Symbol', 'Tumor_Sample', 'Tumor_Type',
               'Chromosome', 'Start_Position', 'End_Position',
               'Variant_Classification', 'Reference_Allele',
               'Tumor_Allele', 'Protein_Change']
    df = pd.DataFrame(columns=df_cols)
    for single_maf in maf_path.split(','):
        tmp_df = pd.read_csv(single_maf, sep='\t')
        # samples already collected from earlier (higher priority) files
        samp_names = set(df['Tumor_Sample'].tolist())
        tmp_df = tmp_df[tmp_df['Tumor_Sample'].apply(
            lambda x: x not in samp_names)]
        df = pd.concat([df, tmp_df])

    _utils.drop_table('maf_mutation', db_path, kind='sqlite')
    conn = sqlite3.connect(db_path)  # open connection
    try:
        # save tsv to sqlite3 database
        psql.write_frame(df,  # pandas dataframe
                         'maf_mutation',  # table name
                         con=conn,  # connection
                         flavor='sqlite',  # use sqlite
                         if_exists='replace')  # drop table if exists

        # filter hypermutator samples
        filter_hypermutators(hypermutator_count, conn, db_path)
    finally:
        # always release the connection, even if writing/filtering fails
        conn.close()
def save_db(df, genedb_path):
    """Saves the data into the gene_features table.

    If the table already exists, the table is dropped and then re-inserted.

    **Parameters**

    df : pd.DataFrame
        data to insert into DB table
    genedb_path : str
        path to sqlite db
    """
    logger.debug('Dropping gene_features table IF EXISTS.')
    _utils.drop_table('gene_features', genes_db_path=genedb_path, kind='sqlite')  # drop table if exists
    logger.debug('After dropping gene_features table IF EXISTS.')

    logger.info('Saving gene_features table ...')
    conn = sqlite3.connect(genedb_path)  # open connection
    try:
        # save to sqlite3 database
        psql.write_frame(df,  # pandas dataframe
                         'gene_features',  # table name
                         con=conn,  # connection
                         flavor='sqlite',  # use sqlite
                         if_exists='replace')  # drop table if exists
    finally:
        # close the connection even when the write raises
        conn.close()
    logger.info('Finished saving gene_features table.')
def save_db(maf_path, db_path, hypermutator_count):
    """Merge MAF files and store them in the ``maf_mutation`` sqlite table.

    ``maf_path`` may name several files separated by commas. They are read
    in order, and a sample (``Tumor_Sample``) that was already loaded from an
    earlier file is skipped in every later file. After the merged data is
    written, ``filter_hypermutators`` is applied to the table.

    **Parameters**

    maf_path : str
        comma-separated list of paths to tab-delimited MAF files
    db_path : str
        path to sqlite db
    hypermutator_count : int
        threshold forwarded to ``filter_hypermutators``
    """
    # merge all data frames together with the first
    # data frames given priority over later data frames
    df_cols = [
        'Gene_Symbol', 'Tumor_Sample', 'Tumor_Type', 'Chromosome',
        'Start_Position', 'End_Position', 'Variant_Classification',
        'Reference_Allele', 'Tumor_Allele', 'Protein_Change'
    ]
    df = pd.DataFrame(columns=df_cols)
    for single_maf in maf_path.split(','):
        tmp_df = pd.read_csv(single_maf, sep='\t')
        # sample names contributed by the files processed so far
        samp_names = set(df['Tumor_Sample'].tolist())
        tmp_df = tmp_df[tmp_df['Tumor_Sample'].apply(
            lambda x: x not in samp_names)]
        df = pd.concat([df, tmp_df])

    _utils.drop_table('maf_mutation', db_path, kind='sqlite')
    conn = sqlite3.connect(db_path)  # open connection
    try:
        # save tsv to sqlite3 database
        psql.write_frame(
            df,  # pandas dataframe
            'maf_mutation',  # table name
            con=conn,  # connection
            flavor='sqlite',  # use sqlite
            if_exists='replace')  # drop table if exists

        # filter hypermutator samples
        filter_hypermutators(hypermutator_count, conn, db_path)
    finally:
        # the connection was previously left open; release it on every path
        conn.close()
def save_db(df, genedb_path):
    """Saves the data into the gene_features table.

    If the table already exists, the table is dropped and then re-inserted.

    **Parameters**

    df : pd.DataFrame
        data to insert into DB table
    genedb_path : str
        path to sqlite db
    """
    logger.debug('Dropping gene_features table IF EXISTS.')
    _utils.drop_table('gene_features', genes_db_path=genedb_path, kind='sqlite')  # drop table if exists
    logger.debug('After dropping gene_features table IF EXISTS.')

    logger.info('Saving gene_features table ...')
    conn = sqlite3.connect(genedb_path)  # open connection
    try:
        # save to sqlite3 database
        psql.write_frame(
            df,  # pandas dataframe
            'gene_features',  # table name
            con=conn,  # connection
            flavor='sqlite',  # use sqlite
            if_exists='replace')  # drop table if exists
    finally:
        # make sure the handle is released if the insert fails
        conn.close()
    logger.info('Finished saving gene_features table.')
def filter_hypermutators(hypermutator_count, conn, db_path=''):
    """Rewrite the maf_mutation table without hypermutator samples.

    Selects every row of ``maf_mutation`` whose ``Tumor_Sample`` has fewer
    than ``hypermutator_count`` rows in the table, then drops the table and
    writes the selected rows back under the same name.

    **Parameters**

    hypermutator_count : int
        samples with mutation counts below this number are allowed
    conn : db connection
        database connection
    db_path : str
        if using non-config defined db, specify the db path
    """
    # The innermost query tags each mutation with a 1, the middle one sums
    # the tags per sample, and the outer one keeps samples under the cutoff.
    query_template = (
        "SELECT *"
        " FROM maf_mutation"
        " WHERE Tumor_Sample in ("
        " SELECT y.Tumor_Sample"
        " FROM ("
        " SELECT x.Tumor_Sample, SUM(x.mut_indicator) as MutationCounts"
        " FROM ( "
        " SELECT Tumor_Sample, 1 as mut_indicator"
        " FROM maf_mutation"
        " ) x "
        " GROUP BY Tumor_Sample"
        " ) y"
        " WHERE y.MutationCounts<%d"
        " )"
    )
    query = query_template % hypermutator_count

    # mutations belonging to non-hypermutator samples
    kept_mutations = psql.frame_query(query, conn)

    # replace the table contents with the filtered rows
    _utils.drop_table('maf_mutation', db_path, kind='sqlite')
    psql.write_frame(kept_mutations,
                     'maf_mutation',
                     conn,
                     flavor='sqlite',
                     if_exists='replace')
def filter_hypermutators(hypermutator_count, conn, db_path=''):
    """Remove hypermutator samples from the maf_mutation table.

    Queries ``maf_mutation`` for the rows of all samples whose number of
    rows is below ``hypermutator_count`` and replaces the table with
    exactly those rows.

    **Parameters**

    hypermutator_count : int
        samples with mutation counts below this number are allowed
    conn : db connection
        database connection
    db_path : str
        if using non-config defined db, specify the db path
    """
    # SQL fragments, joined without a separator so the resulting text is
    # the same as writing them as adjacent string literals.
    fragments = (
        "SELECT *",
        " FROM maf_mutation",
        " WHERE Tumor_Sample in (",
        " SELECT y.Tumor_Sample",
        " FROM (",
        " SELECT x.Tumor_Sample, SUM(x.mut_indicator) as MutationCounts",
        " FROM ( ",
        " SELECT Tumor_Sample, 1 as mut_indicator",
        " FROM maf_mutation",
        " ) x ",
        " GROUP BY Tumor_Sample",
        " ) y",
        " WHERE y.MutationCounts<%d",
        " )",
    )
    statement = ''.join(fragments) % hypermutator_count

    non_hyper_df = psql.frame_query(statement, conn)  # get non hypermutator mutations

    # swap the old table for the filtered one
    _utils.drop_table('maf_mutation', db_path, kind='sqlite')
    psql.write_frame(non_hyper_df, 'maf_mutation', conn,
                     flavor='sqlite', if_exists='replace')
def save_db(hypermutator_ct, gene_tsv_path, genedb_path,
            is_genes_tgz=False, only_genome_wide=True,
            use_unknown_status=False):
    """Saves tab delim gene mutation file to a sqlite3 db.

    NOTE: Uses pandas to store all contents in memory and then saves to
    sqlite db. This may cause large memory usage.

    Parameters
    ----------
    hypermutator_ct : int
        filter for overly mutated samples
    gene_tsv_path : str
        path to tab delim file containing all gene mutations
    genedb_path : str
        path to sqlite3 db
    is_genes_tgz : bool
        if True, sample names are rewritten with ``parse_sample_name`` and
        the hg18/hg19 chrom/start/end columns are cast to int (missing
        values become -1). If False, the data frame is instead passed
        through ``handle_cosmic_mutation_export``.
    only_genome_wide : bool
        forwarded to ``handle_cosmic_mutation_export``
    use_unknown_status : bool
        forwarded to ``handle_cosmic_mutation_export``
    """
    # read data
    df = pd.read_csv(gene_tsv_path, sep='\t')

    # NOTE: COSMIC CNV import and cell-line sample filtering are currently
    # disabled, so only the mutation table is written.

    if is_genes_tgz:
        # fix sample names so they match with external data
        df['SampleName'] = df['SampleName'].apply(parse_sample_name)

        # fix types that pandas gets wrong
        # see http://pandas.pydata.org/pandas-docs/dev/gotchas.html
        # for details on missing NA support for integers
        int_columns = ('hg18chrom', 'hg19chrom',
                       'hg18start', 'hg19start',
                       'hg18end', 'hg19end')
        for col in int_columns:
            # -1 marks a missing value so the column can be stored as int
            df[col] = df[col].fillna(-1).astype(int)
    else:
        df = handle_cosmic_mutation_export(df,
                                           only_genome_wide,
                                           use_unknown_status)

    # drop table if already exists
    _utils.drop_table('cosmic_mutation', genedb_path, kind='sqlite')

    conn = sqlite3.connect(genedb_path)  # open connection
    try:
        # save tsv to sqlite3 database
        psql.write_frame(
            df,  # pandas dataframe
            'cosmic_mutation',  # table name
            con=conn,  # connection
            flavor='sqlite',  # use sqlite
            if_exists='replace')  # drop table if exists

        # drop table and re-insert data without hypermutators
        filter_hypermutators(hypermutator_ct, conn, genedb_path)
    finally:
        # close the connection even if writing or filtering fails
        conn.close()