def main(opts, mut_df=None, frameshift_df=None):
    """Run the permutation test selected by ``opts['kind']``.

    NOTE(review): another ``main`` with the same signature is defined later
    in this file; when the module is imported the later definition is the one
    that takes effect.

    Parameters
    ----------
    opts : mapping
        Run options. Keys read directly here: ``'kind'``, ``'input'``,
        ``'mutations'`` (only when `mut_df` is None), ``'bed'``,
        ``'use_unmapped'`` (only for kind ``'tsg'`` when `frameshift_df` is
        None), ``'unique'``, ``'seed'``, ``'num_iterations'`` (only for kind
        ``'oncogene'``) and ``'output'``. The whole mapping is also handed
        to ``multiprocess_permutation``.
    mut_df : pandas.DataFrame, optional
        Mutations. When None they are read from the tab-separated file at
        ``opts['mutations']``. The frame passed in is not modified.
    frameshift_df : optional
        Frameshift counts, only used for kind ``'tsg'``. When None they are
        computed with ``cf.count_frameshift_total``.

    Returns
    -------
    permutation_df
        Whatever the ``pr.handle_*_results`` function for the chosen kind
        returns (it is written with ``.to_csv``, so presumably a
        pandas.DataFrame).

    Raises
    ------
    ValueError
        If ``opts['kind']`` is not a supported kind, or if kind is ``'tsg'``
        and no mutation has a ``Variant_Classification`` listed in
        ``utils.all_variants``.
    """
    kind = opts['kind']
    supported_kinds = ('oncogene', 'tsg', 'hotmaps1d', 'protein', 'effect')
    # Fail fast: otherwise an unknown kind only surfaces at the very end as
    # an UnboundLocalError on permutation_df.
    if kind not in supported_kinds:
        raise ValueError(
            'Unsupported kind {0!r}; expected one of: {1}'.format(
                kind, ', '.join(supported_kinds)))

    # hack to index the FASTA file
    gene_fa = pysam.Fastafile(opts['input'])
    gene_fa.close()

    # Get Mutations
    if mut_df is None:
        mut_df = pd.read_csv(opts['mutations'], sep='\t')
    orig_num_mut = len(mut_df)

    # rename columns to fit the internal column names; a new frame is
    # produced so a caller-supplied mut_df keeps its original columns
    rename_dict = {
        'Hugo_Symbol': 'Gene',
        'Tumor_Sample_Barcode': 'Tumor_Sample',
        'Tumor_Seq_Allele2': 'Tumor_Allele',
    }
    mut_df = mut_df.rename(columns=rename_dict)

    # drop rows with missing info
    na_cols = ['Gene', 'Tumor_Allele', 'Start_Position', 'Chromosome']
    mut_df = mut_df.dropna(subset=na_cols)
    logger.info('Kept {0} mutations after droping mutations with missing '
                'information (Droped: {1})'.format(len(mut_df),
                                                   orig_num_mut - len(mut_df)))

    # frameshift information is only needed by the 'tsg' test
    p_inactivating = None
    if kind == 'tsg':
        if frameshift_df is None:
            # count number of frameshifts
            frameshift_df = cf.count_frameshift_total(mut_df,
                                                      opts['bed'],
                                                      opts['use_unmapped'])

        # proportion of frameshift mutations among all recognised variants
        variant_class = mut_df['Variant_Classification']
        num_fs = int(variant_class.isin(utils.variant_frameshift).sum())
        num_all = int(variant_class.isin(utils.all_variants).sum())
        if num_all == 0:
            raise ValueError(
                'Cannot compute the frameshift proportion: no mutation has '
                'a Variant_Classification listed in utils.all_variants')
        p_inactivating = float(num_fs) / num_all

    # select valid single nucleotide variants only
    mut_df = utils._fix_mutation_df(mut_df, opts['unique'])

    # log random number seed choice if provided
    if opts['seed'] is not None:
        logger.info('Pseudo Random Number Generator Seed: {0}'.format(
            opts['seed']))

    # read BED file
    bed_dict = utils.read_bed(opts['bed'])

    # Perform BH p-value adjustment and tidy up data for output
    if kind == 'oncogene':
        permutation_result = multiprocess_permutation(bed_dict, mut_df, opts)
        permutation_df = pr.handle_oncogene_results(permutation_result,
                                                    opts['num_iterations'])
    elif kind == 'tsg':
        permutation_result = multiprocess_permutation(bed_dict, mut_df, opts,
                                                      frameshift_df,
                                                      p_inactivating)
        permutation_df = pr.handle_tsg_results(permutation_result)
    elif kind == 'hotmaps1d':
        permutation_result = multiprocess_permutation(bed_dict, mut_df, opts)
        permutation_df = pr.handle_hotmaps_results(permutation_result)
    elif kind == 'protein':
        permutation_result = multiprocess_permutation(bed_dict, mut_df, opts)
        permutation_df = pr.handle_protein_results(permutation_result)
    else:  # 'effect' -- the only kind left after the validation above
        permutation_result = multiprocess_permutation(bed_dict, mut_df, opts)
        permutation_df = pr.handle_effect_results(permutation_result)

    # save output
    if opts['output']:
        permutation_df.to_csv(opts['output'], sep='\t', index=False)

    return permutation_df
def main(opts, mut_df=None, frameshift_df=None):
    """Run the permutation test named by ``opts['kind']`` and return its results.

    NOTE(review): an earlier ``main`` with the same signature precedes this
    one in the file; being the later definition, this is the one that is
    bound to the name after import.

    Parameters
    ----------
    opts : mapping
        Run options. Keys read directly here: ``'kind'``, ``'input'``,
        ``'mutations'`` (only when `mut_df` is None), ``'bed'``,
        ``'use_unmapped'`` (only for kind ``'tsg'`` when `frameshift_df` is
        None), ``'unique'``, ``'seed'``, ``'num_iterations'`` (only for kind
        ``'oncogene'``) and ``'output'``. The whole mapping is also passed
        on to ``multiprocess_permutation``.
    mut_df : pandas.DataFrame, optional
        Mutations. Read from the tab-separated file ``opts['mutations']``
        when None. A frame supplied by the caller is left unmodified.
    frameshift_df : optional
        Frameshift counts for kind ``'tsg'``; computed with
        ``cf.count_frameshift_total`` when None. Ignored for other kinds.

    Returns
    -------
    permutation_df
        The value returned by the ``pr.handle_*_results`` function matching
        the kind (``.to_csv`` is called on it, so presumably a
        pandas.DataFrame).

    Raises
    ------
    ValueError
        If ``opts['kind']`` is not supported, or if kind is ``'tsg'`` and no
        row has a ``Variant_Classification`` found in ``utils.all_variants``.
    """
    kind = opts['kind']
    supported_kinds = ('oncogene', 'tsg', 'hotmaps1d', 'protein', 'effect')
    # Reject unknown kinds before doing any work; previously they only
    # failed at the end with an UnboundLocalError on permutation_df.
    if kind not in supported_kinds:
        raise ValueError(
            'Unsupported kind {0!r}; expected one of: {1}'.format(
                kind, ', '.join(supported_kinds)))

    # hack to index the FASTA file
    gene_fa = pysam.Fastafile(opts['input'])
    gene_fa.close()

    # Get Mutations
    if mut_df is None:
        mut_df = pd.read_csv(opts['mutations'], sep='\t')
    orig_num_mut = len(mut_df)

    # rename columns to fit the internal column names (returns a new frame,
    # so the caller's DataFrame is not altered)
    mut_df = mut_df.rename(columns={
        'Hugo_Symbol': 'Gene',
        'Tumor_Sample_Barcode': 'Tumor_Sample',
        'Tumor_Seq_Allele2': 'Tumor_Allele',
    })

    # drop rows with missing info
    mut_df = mut_df.dropna(
        subset=['Gene', 'Tumor_Allele', 'Start_Position', 'Chromosome'])
    num_kept = len(mut_df)
    logger.info('Kept {0} mutations after droping mutations with missing '
                'information (Droped: {1})'.format(num_kept,
                                                   orig_num_mut - num_kept))

    # frameshift counts and their overall proportion feed only the tsg test
    p_inactivating = None
    if kind == 'tsg':
        if frameshift_df is None:
            # count number of frameshifts
            frameshift_df = cf.count_frameshift_total(mut_df,
                                                      opts['bed'],
                                                      opts['use_unmapped'])

        # proportion of frameshift mutations among all recognised variants
        is_frameshift = mut_df['Variant_Classification'].isin(
            utils.variant_frameshift)
        is_known_variant = mut_df['Variant_Classification'].isin(
            utils.all_variants)
        num_fs = len(mut_df[is_frameshift])
        num_all = len(mut_df[is_known_variant])
        if num_all == 0:
            raise ValueError(
                'Cannot compute the frameshift proportion: no mutation has '
                'a Variant_Classification listed in utils.all_variants')
        p_inactivating = float(num_fs) / num_all

    # select valid single nucleotide variants only
    mut_df = utils._fix_mutation_df(mut_df, opts['unique'])

    # log random number seed choice if provided
    if opts['seed'] is not None:
        logger.info('Pseudo Random Number Generator Seed: {0}'.format(
            opts['seed']))

    # read BED file
    bed_dict = utils.read_bed(opts['bed'])

    # run the permutations; only the tsg test takes the frameshift inputs
    if kind == 'tsg':
        permutation_result = multiprocess_permutation(bed_dict, mut_df, opts,
                                                      frameshift_df,
                                                      p_inactivating)
    else:
        permutation_result = multiprocess_permutation(bed_dict, mut_df, opts)

    # Perform BH p-value adjustment and tidy up data for output
    if kind == 'oncogene':
        permutation_df = pr.handle_oncogene_results(permutation_result,
                                                    opts['num_iterations'])
    elif kind == 'tsg':
        permutation_df = pr.handle_tsg_results(permutation_result)
    elif kind == 'hotmaps1d':
        permutation_df = pr.handle_hotmaps_results(permutation_result)
    elif kind == 'protein':
        permutation_df = pr.handle_protein_results(permutation_result)
    else:  # 'effect' -- guaranteed by the validation at the top
        permutation_df = pr.handle_effect_results(permutation_result)

    # save output
    if opts['output']:
        permutation_df.to_csv(opts['output'], sep='\t', index=False)

    return permutation_df