def main(opts):
    """Detect the coordinate convention of a mutation file.

    Loads mutations, normalizes chromosome names, drops rows missing
    required fields, flags mutations that do not map onto a reference
    transcript (written to opts['unmapped']), and logs the detected
    coordinate base (0- or 1-based) and reporting strand.

    Parameters
    ----------
    opts : dict
        Uses the 'mutations', 'fasta', 'bed', and 'unmapped' keys.
    """
    # read in mutations
    mut_df = pd.read_csv(opts['mutations'], sep='\t')
    orig_num_mut = len(mut_df)

    # correct chromosome names
    mut_df['Chromosome'] = correct_chrom_names(mut_df['Chromosome'])

    # fix additional issues
    mut_df = mut_df.dropna(subset=['Tumor_Allele', 'Start_Position', 'Chromosome'])
    logger.info('Kept {0} mutations after dropping mutations with missing '
                'information (Dropped: {1})'.format(len(mut_df),
                                                    orig_num_mut - len(mut_df)))
    mut_df = utils._fix_mutation_df(mut_df)

    # read genome fasta file
    genome_fa = pysam.Fastafile(opts['fasta'])

    # read BED file for transcripts
    bed_dict = utils.read_bed(opts['bed'], [])
    gene2bed = {item.gene_name: item
                for bed_list in bed_dict.values()
                for item in bed_list}

    # group mutations by gene
    mut_grpby = mut_df.groupby('Gene')
    unmapped_mut_list = []
    for i, mut_info in mut_grpby:
        gene_name = mut_info['Gene'].iloc[0]

        # try to find tx annotation for gene
        bed = None
        try:
            bed = gene2bed[gene_name]
        except KeyError:
            pass

        if bed:
            # get coding positions; mutations unmapped to the reference tx
            # are assumed to yield None from query_position -- TODO confirm
            for ix, row in mut_info.iterrows():
                coding_pos = bed.query_position(bed.strand,
                                                row['Chromosome'],
                                                row['Start_Position'])
                # BUG FIX: explicit None test. The original "not coding_pos"
                # also classified the valid coding position 0 as unmapped.
                if coding_pos is None:
                    unmapped_mut_list.append(row.tolist())
        else:
            # no transcript annotation for this gene: every mutation unmapped
            unmapped_mut_list += mut_info.values.tolist()

    # save the unmapped mutations to a file
    unmapped_mut_df = pd.DataFrame(unmapped_mut_list, columns=mut_df.columns)
    logger.info('{0} mutations were unmappable to a '
                'reference transcript'.format(len(unmapped_mut_df)))
    unmapped_mut_df.to_csv(opts['unmapped'], sep='\t', index=False)

    coord_base, coord_strand = detect_coordinates(mut_df, genome_fa)
    logger.info('RESULT: {0}-based coordinates, positions reported on '
                '{1} strand'.format(coord_base, coord_strand))
    genome_fa.close()
def main(opts):
    """Run the indel permutation analysis and write results to opts['output'].

    Indexes the gene FASTA, loads and normalizes mutations, separates
    indels, runs the SNV permutation, and (when opts['maf'] is set)
    appends simulated indel MAF lines to the output handle.

    Parameters
    ----------
    opts : dict
        Uses 'input', 'mutations', 'restrict_genes', 'unique', 'bed',
        'output', 'maf', 'num_iterations', and 'seed' keys. Side effect:
        stores the open output file object under opts['handle'].
    """
    # hack to index the FASTA file
    gene_fa = pysam.Fastafile(opts['input'])
    gene_fa.close()

    # Get Mutations
    mut_df = pd.read_csv(opts['mutations'], sep='\t')
    orig_num_mut = len(mut_df)

    # rename columns to fit my internal column names
    rename_dict = {'Hugo_Symbol': 'Gene',
                   'Tumor_Sample_Barcode': 'Tumor_Sample',
                   'Tumor_Seq_Allele2': 'Tumor_Allele'}
    mut_df.rename(columns=rename_dict, inplace=True)

    # restrict to only observed genes if flag present
    restricted_genes = None
    if opts['restrict_genes']:
        restricted_genes = set(mut_df['Gene'].unique())

    # process indels
    indel_df = indel.keep_indels(mut_df)  # return indels only
    indel_df.loc[:, 'Start_Position'] = indel_df['Start_Position'] - 1  # convert to 0-based
    indel_df.loc[:, 'indel len'] = indel_df['indel len'] + 1
    logger.info('There were {0} indels identified.'.format(len(indel_df)))

    mut_df = mut_df.dropna(subset=['Tumor_Allele', 'Start_Position', 'Chromosome'])
    logger.info('Kept {0} mutations after dropping mutations with missing '
                'information (Dropped: {1})'.format(len(mut_df),
                                                    orig_num_mut - len(mut_df)))

    # select valid single nucleotide variants only
    mut_df = utils._fix_mutation_df(mut_df, opts['unique'])

    # read in bed info
    bed_dict = utils.read_bed(opts['bed'], restricted_genes)

    # perform permutation
    # BUG FIX: close the output handle even when the permutation or the
    # indel simulation raises (the original leaked the handle on error)
    opts['handle'] = open(opts['output'], 'w')
    try:
        multiprocess_permutation(bed_dict, mut_df, opts, indel_df)

        # save indels
        if opts['maf']:
            mywriter = csv.writer(opts['handle'], delimiter='\t',
                                  lineterminator='\n')
            for maf_lines in indel.simulate_indel_maf(indel_df, bed_dict,
                                                      opts['num_iterations'],
                                                      opts['seed']):
                mywriter.writerows(maf_lines)
    finally:
        opts['handle'].close()
def main(opts, mut_df=None, frameshift_df=None):
    """Run the permutation test selected by opts['kind'] and return results.

    Parameters
    ----------
    opts : dict
        Uses 'input', 'mutations', 'kind', 'bed', 'use_unmapped',
        'unique', 'seed', 'num_iterations', and 'output' keys.
    mut_df : pd.DataFrame or None
        Pre-loaded mutations; read from opts['mutations'] when None.
    frameshift_df : pd.DataFrame or None
        Pre-computed frameshift counts (tsg only); computed when None.

    Returns
    -------
    pd.DataFrame
        Tidied permutation results; also written to opts['output'] if set.
    """
    # hack to index the FASTA file
    gene_fa = pysam.Fastafile(opts['input'])
    gene_fa.close()

    # Get Mutations
    if mut_df is None:
        mut_df = pd.read_csv(opts['mutations'], sep='\t')
    orig_num_mut = len(mut_df)

    # rename columns to fit my internal column names
    rename_dict = {'Hugo_Symbol': 'Gene',
                   'Tumor_Sample_Barcode': 'Tumor_Sample',
                   'Tumor_Seq_Allele2': 'Tumor_Allele'}
    mut_df.rename(columns=rename_dict, inplace=True)

    # drop rows with missing info
    na_cols = ['Gene', 'Tumor_Allele', 'Start_Position', 'Chromosome']
    mut_df = mut_df.dropna(subset=na_cols)
    logger.info('Kept {0} mutations after dropping mutations with missing '
                'information (Dropped: {1})'.format(len(mut_df),
                                                    orig_num_mut - len(mut_df)))

    # count frameshifts
    if opts['kind'] == 'tsg':
        if frameshift_df is None:
            # NOTE: a dead re-read of opts['mutations'] was removed here;
            # mut_df is always loaded above and cannot be None at this point.
            # count number of frameshifts
            frameshift_df = cf.count_frameshift_total(mut_df, opts['bed'],
                                                      opts['use_unmapped'])
        # calculate the proportion of inactivating (frameshift) mutations
        num_fs = len(mut_df[mut_df['Variant_Classification'].isin(utils.variant_frameshift)])
        num_all = len(mut_df[mut_df['Variant_Classification'].isin(utils.all_variants)])
        p_inactivating = float(num_fs) / num_all

    # select valid single nucleotide variants only
    mut_df = utils._fix_mutation_df(mut_df, opts['unique'])

    # log random number seed choice if provided
    if opts['seed'] is not None:
        logger.info('Pseudo Random Number Generator Seed: {0}'.format(opts['seed']))

    # read BED file
    bed_dict = utils.read_bed(opts['bed'])

    # Perform BH p-value adjustment and tidy up data for output
    if opts['kind'] == 'oncogene':
        permutation_result = multiprocess_permutation(bed_dict, mut_df, opts)
        permutation_df = pr.handle_oncogene_results(permutation_result,
                                                    opts['num_iterations'])
    elif opts['kind'] == 'tsg':
        permutation_result = multiprocess_permutation(bed_dict, mut_df, opts,
                                                      frameshift_df,
                                                      p_inactivating)
        permutation_df = pr.handle_tsg_results(permutation_result)
    elif opts['kind'] == 'hotmaps1d':
        permutation_result = multiprocess_permutation(bed_dict, mut_df, opts)
        permutation_df = pr.handle_hotmaps_results(permutation_result)
    elif opts['kind'] == 'protein':
        permutation_result = multiprocess_permutation(bed_dict, mut_df, opts)
        permutation_df = pr.handle_protein_results(permutation_result)
    elif opts['kind'] == 'effect':
        permutation_result = multiprocess_permutation(bed_dict, mut_df, opts)
        permutation_df = pr.handle_effect_results(permutation_result)

    # save output
    if opts['output']:
        permutation_df.to_csv(opts['output'], sep='\t', index=False)

    return permutation_df
def main(opts):
    """Summarize observed vs. simulated silent/non-silent mutation counts.

    Runs the permutation, logs observed SNV category totals (unless
    opts['by_sample']), writes the simulation table to opts['output'],
    and optionally writes observed values to opts['observed_output'].

    Returns
    -------
    pd.DataFrame
        Simulated counts, one row per iteration, columns from `cols`.
    """
    global cols
    if opts['score_dir']:
        # BUG FIX: guard the extend so repeated main() calls in the same
        # process do not append duplicate columns to module-level `cols`
        if 'Total MGAEntropy' not in cols:
            cols.extend(['Total MGAEntropy', 'Total Missense VEST'])

    # hack to index the FASTA file
    gene_fa = pysam.Fastafile(opts['input'])
    gene_fa.close()

    # Get Mutations
    mut_df = pd.read_csv(opts['mutations'], sep='\t')
    orig_num_mut = len(mut_df)
    mut_df = mut_df.dropna(subset=['Tumor_Allele', 'Start_Position', 'Chromosome'])
    logger.info('Kept {0} mutations after dropping mutations with missing '
                'information (Dropped: {1})'.format(len(mut_df),
                                                    orig_num_mut - len(mut_df)))

    # select valid single nucleotide variants only
    mut_df = utils._fix_mutation_df(mut_df)

    # read in bed info
    bed_dict = utils.read_bed(opts['bed'], [])

    # perform permutation test
    sim_result, obs_result = multiprocess_permutation(bed_dict, mut_df, opts)

    # report number of observed non-silent and silent mutations
    if not opts['by_sample']:
        total_non_silent = sum(o[0] for o in obs_result)
        total_silent = sum(o[1] for o in obs_result)
        total_nonsense = sum(o[2] for o in obs_result)
        total_loststop = sum(o[3] for o in obs_result)
        total_splice_site = sum(o[4] for o in obs_result)
        total_loststart = sum(o[5] for o in obs_result)
        total_missense = sum(o[6] for o in obs_result)
        if opts['score_dir']:
            total_mgaentropy = sum(o[7] for o in obs_result)
            total_vest = sum(o[8] for o in obs_result)
        logger.info('There were {0} non-silent SNVs and {1} silent SNVs actually '
                    'observed from the provided mutations.'.format(total_non_silent,
                                                                   total_silent))
        # BUG FIX: removed the stray duplicated comma the original format
        # string produced ("lost stop SNVs, , {3} lost start")
        logger.info('There were {0} missense SNVs, {1} nonsense SNVs, {2} lost stop SNVs, '
                    '{3} lost start, and {4} splice site SNVs'.format(total_missense,
                                                                      total_nonsense,
                                                                      total_loststop,
                                                                      total_loststart,
                                                                      total_splice_site))
    else:
        obs_non_silent_df = obs_result

    # convert to dataframe to save to file
    non_silent_ratio_df = pd.DataFrame(sim_result, columns=cols)

    # save simulation output
    non_silent_ratio_df.to_csv(opts['output'], sep='\t', index=False)

    # save observed values if file provided
    if opts['observed_output']:
        if not opts['by_sample']:
            obs_result = [total_non_silent, total_silent, total_nonsense,
                          total_loststop, total_splice_site, total_loststart,
                          total_missense]
            if opts['score_dir']:
                obs_result.extend([total_mgaentropy, total_vest])
            obs_non_silent_df = pd.DataFrame([obs_result], columns=cols)
            obs_non_silent_df.to_csv(opts['observed_output'], sep='\t', index=False)
        else:
            obs_non_silent_df.to_csv(opts['observed_output'], sep='\t')

    return non_silent_ratio_df
def main(opts):
    """Detect the coordinate convention of a mutation file.

    Loads mutations, normalizes chromosome names, drops rows missing
    required fields, flags mutations that do not map onto a reference
    transcript (written to opts['unmapped']), and logs the detected
    coordinate base (0- or 1-based) and reporting strand.

    Parameters
    ----------
    opts : dict
        Uses the 'mutations', 'fasta', 'bed', and 'unmapped' keys.
    """
    # read in mutations
    mut_df = pd.read_csv(opts['mutations'], sep='\t')
    orig_num_mut = len(mut_df)

    # correct chromosome names
    mut_df['Chromosome'] = correct_chrom_names(mut_df['Chromosome'])

    # fix additional issues
    mut_df = mut_df.dropna(
        subset=['Tumor_Allele', 'Start_Position', 'Chromosome'])
    logger.info('Kept {0} mutations after dropping mutations with missing '
                'information (Dropped: {1})'.format(len(mut_df),
                                                    orig_num_mut - len(mut_df)))
    mut_df = utils._fix_mutation_df(mut_df)

    # read genome fasta file
    genome_fa = pysam.Fastafile(opts['fasta'])

    # read BED file for transcripts
    bed_dict = utils.read_bed(opts['bed'], [])
    gene2bed = {item.gene_name: item
                for bed_list in bed_dict.values()
                for item in bed_list}

    # group mutations by gene
    mut_grpby = mut_df.groupby('Gene')
    unmapped_mut_list = []
    for i, mut_info in mut_grpby:
        gene_name = mut_info['Gene'].iloc[0]

        # try to find tx annotation for gene
        bed = None
        try:
            bed = gene2bed[gene_name]
        except KeyError:
            pass

        if bed:
            # get coding positions; mutations unmapped to the reference tx
            # are assumed to yield None from query_position -- TODO confirm
            for ix, row in mut_info.iterrows():
                coding_pos = bed.query_position(bed.strand,
                                                row['Chromosome'],
                                                row['Start_Position'])
                # BUG FIX: explicit None test. The original "not coding_pos"
                # also classified the valid coding position 0 as unmapped.
                if coding_pos is None:
                    unmapped_mut_list.append(row.tolist())
        else:
            # no transcript annotation for this gene: every mutation unmapped
            unmapped_mut_list += mut_info.values.tolist()

    # save the unmapped mutations to a file
    unmapped_mut_df = pd.DataFrame(unmapped_mut_list, columns=mut_df.columns)
    logger.info('{0} mutations were unmappable to a '
                'reference transcript'.format(len(unmapped_mut_df)))
    unmapped_mut_df.to_csv(opts['unmapped'], sep='\t', index=False)

    coord_base, coord_strand = detect_coordinates(mut_df, genome_fa)
    logger.info('RESULT: {0}-based coordinates, positions reported on '
                '{1} strand'.format(coord_base, coord_strand))
    genome_fa.close()
def main(opts):
    """Summarize observed vs. simulated silent/non-silent mutation counts.

    Runs the permutation, logs observed SNV category totals (unless
    opts['by_sample']), writes the simulation table to opts['output'],
    and optionally writes observed values to opts['observed_output'].

    Returns
    -------
    pd.DataFrame
        Simulated counts, one row per iteration, columns from `cols`.
    """
    global cols
    if opts['score_dir']:
        # BUG FIX: guard the extend so repeated main() calls in the same
        # process do not append duplicate columns to module-level `cols`
        if 'Total MGAEntropy' not in cols:
            cols.extend(['Total MGAEntropy', 'Total Missense VEST'])

    # hack to index the FASTA file
    gene_fa = pysam.Fastafile(opts['input'])
    gene_fa.close()

    # Get Mutations
    mut_df = pd.read_csv(opts['mutations'], sep='\t')
    orig_num_mut = len(mut_df)
    mut_df = mut_df.dropna(subset=['Tumor_Allele', 'Start_Position', 'Chromosome'])
    logger.info('Kept {0} mutations after dropping mutations with missing '
                'information (Dropped: {1})'.format(len(mut_df),
                                                    orig_num_mut - len(mut_df)))

    # select valid single nucleotide variants only
    mut_df = utils._fix_mutation_df(mut_df)

    # read in bed info
    bed_dict = utils.read_bed(opts['bed'])

    # perform permutation test
    sim_result, obs_result = multiprocess_permutation(bed_dict, mut_df, opts)

    # report number of observed non-silent and silent mutations
    if not opts['by_sample']:
        total_non_silent = sum(o[0] for o in obs_result)
        total_silent = sum(o[1] for o in obs_result)
        total_nonsense = sum(o[2] for o in obs_result)
        total_loststop = sum(o[3] for o in obs_result)
        total_splice_site = sum(o[4] for o in obs_result)
        total_loststart = sum(o[5] for o in obs_result)
        total_missense = sum(o[6] for o in obs_result)
        if opts['score_dir']:
            total_mgaentropy = sum(o[7] for o in obs_result)
            total_vest = sum(o[8] for o in obs_result)
        logger.info('There were {0} non-silent SNVs and {1} silent SNVs actually '
                    'observed from the provided mutations.'.format(total_non_silent,
                                                                   total_silent))
        # BUG FIX: removed the stray duplicated comma the original format
        # string produced ("lost stop SNVs, , {3} lost start")
        logger.info('There were {0} missense SNVs, {1} nonsense SNVs, {2} lost stop SNVs, '
                    '{3} lost start, and {4} splice site SNVs'.format(total_missense,
                                                                      total_nonsense,
                                                                      total_loststop,
                                                                      total_loststart,
                                                                      total_splice_site))
    else:
        obs_non_silent_df = obs_result

    # convert to dataframe to save to file
    non_silent_ratio_df = pd.DataFrame(sim_result, columns=cols)

    # save simulation output
    non_silent_ratio_df.to_csv(opts['output'], sep='\t', index=False)

    # save observed values if file provided
    if opts['observed_output']:
        if not opts['by_sample']:
            obs_result = [total_non_silent, total_silent, total_nonsense,
                          total_loststop, total_splice_site, total_loststart,
                          total_missense]
            if opts['score_dir']:
                obs_result.extend([total_mgaentropy, total_vest])
            obs_non_silent_df = pd.DataFrame([obs_result], columns=cols)
            obs_non_silent_df.to_csv(opts['observed_output'], sep='\t', index=False)
        else:
            obs_non_silent_df.to_csv(opts['observed_output'], sep='\t')

    return non_silent_ratio_df
def main(opts, mut_df=None, frameshift_df=None):
    """Run the permutation test selected by opts['kind'] and return results.

    Parameters
    ----------
    opts : dict
        Uses 'input', 'mutations', 'kind', 'bed', 'use_unmapped',
        'unique', 'seed', 'num_iterations', and 'output' keys.
    mut_df : pd.DataFrame or None
        Pre-loaded mutations; read from opts['mutations'] when None.
    frameshift_df : pd.DataFrame or None
        Pre-computed frameshift counts (tsg only); computed when None.

    Returns
    -------
    pd.DataFrame
        Tidied permutation results; also written to opts['output'] if set.
    """
    # hack to index the FASTA file
    gene_fa = pysam.Fastafile(opts['input'])
    gene_fa.close()

    # Get Mutations
    if mut_df is None:
        mut_df = pd.read_csv(opts['mutations'], sep='\t')
    orig_num_mut = len(mut_df)

    # rename columns to fit my internal column names
    rename_dict = {'Hugo_Symbol': 'Gene',
                   'Tumor_Sample_Barcode': 'Tumor_Sample',
                   'Tumor_Seq_Allele2': 'Tumor_Allele'}
    mut_df.rename(columns=rename_dict, inplace=True)

    # drop rows with missing info
    na_cols = ['Gene', 'Tumor_Allele', 'Start_Position', 'Chromosome']
    mut_df = mut_df.dropna(subset=na_cols)
    logger.info('Kept {0} mutations after dropping mutations with missing '
                'information (Dropped: {1})'.format(len(mut_df),
                                                    orig_num_mut - len(mut_df)))

    # count frameshifts
    if opts['kind'] == 'tsg':
        if frameshift_df is None:
            # NOTE: a dead re-read of opts['mutations'] was removed here;
            # mut_df is always loaded above and cannot be None at this point.
            # count number of frameshifts
            frameshift_df = cf.count_frameshift_total(mut_df, opts['bed'],
                                                      opts['use_unmapped'])
        # calculate the proportion of inactivating (frameshift) mutations
        num_fs = len(mut_df[mut_df['Variant_Classification'].isin(utils.variant_frameshift)])
        num_all = len(mut_df[mut_df['Variant_Classification'].isin(utils.all_variants)])
        p_inactivating = float(num_fs) / num_all

    # select valid single nucleotide variants only
    mut_df = utils._fix_mutation_df(mut_df, opts['unique'])

    # log random number seed choice if provided
    if opts['seed'] is not None:
        logger.info('Pseudo Random Number Generator Seed: {0}'.format(opts['seed']))

    # read BED file
    bed_dict = utils.read_bed(opts['bed'])

    # Perform BH p-value adjustment and tidy up data for output
    if opts['kind'] == 'oncogene':
        permutation_result = multiprocess_permutation(bed_dict, mut_df, opts)
        permutation_df = pr.handle_oncogene_results(permutation_result,
                                                    opts['num_iterations'])
    elif opts['kind'] == 'tsg':
        permutation_result = multiprocess_permutation(bed_dict, mut_df, opts,
                                                      frameshift_df,
                                                      p_inactivating)
        permutation_df = pr.handle_tsg_results(permutation_result)
    elif opts['kind'] == 'hotmaps1d':
        permutation_result = multiprocess_permutation(bed_dict, mut_df, opts)
        permutation_df = pr.handle_hotmaps_results(permutation_result)
    elif opts['kind'] == 'protein':
        permutation_result = multiprocess_permutation(bed_dict, mut_df, opts)
        permutation_df = pr.handle_protein_results(permutation_result)
    elif opts['kind'] == 'effect':
        permutation_result = multiprocess_permutation(bed_dict, mut_df, opts)
        permutation_df = pr.handle_effect_results(permutation_result)

    # save output
    if opts['output']:
        permutation_df.to_csv(opts['output'], sep='\t', index=False)

    return permutation_df