def make_zscores(copy_number, mutation, clinical, outdir, genes):
    """Compute a Cox result per gene from copy number plus TP53 mutation.

    The copy number table is transposed, inner-joined with the clinical data
    and then with the ``'TP53`` mutation column (renamed ``TP53_mutation``).
    For each name in ``genes['Gene']`` the columns
    ``[gene, 'TP53_mutation', 'time', 'censor']`` are handed to
    ``calculate_cox`` and also written to
    ``<cancer_type>_<gene[1:]>_p53_and_cna_data.csv`` under ``outdir``.

    Args:
        copy_number: Path to the copy number CSV (first column is the index).
        mutation: Mutation input passed to
            ``mutation_base.prep_mutation_data``.
        clinical: Clinical input passed to ``util.get_clinical_data``.
        outdir: Directory that receives one CSV per gene.
        genes: Container whose ``'Gene'`` entry yields copy number column
            names. The first character of every name is dropped for output
            names (presumably a leading quote, as in ``'TP53`` -- confirm).

    Returns:
        dict keyed by ``gene[1:]`` holding the ``calculate_cox`` result for
        that gene, extended with a ``'mutation_count'`` entry (the sum of the
        ``TP53_mutation`` column).
    """
    cancer_type = util.get_cancer_type(copy_number)
    clinical_data = util.get_clinical_data(clinical)
    cnv = pd.read_csv(copy_number, index_col=0)

    mutation_data = mutation_base.prep_mutation_data(mutation, clinical_data)
    tp53_status = mutation_data['\'TP53'].rename('TP53_mutation')

    # Transpose the copy number table so it can be joined on the clinical
    # index, then attach the TP53 status.
    joined = cnv.transpose().join(clinical_data, how='inner')
    joined = joined.join(tp53_status, how='inner')

    results_by_gene = {}
    for gene in genes['Gene']:
        short_name = gene[1:]
        per_gene = joined[[gene, 'TP53_mutation', 'time', 'censor']]

        gene_result = calculate_cox(per_gene, gene)
        gene_result['mutation_count'] = per_gene['TP53_mutation'].sum()

        file_name = cancer_type + '_' + short_name + '_p53_and_cna_data.csv'
        per_gene.to_csv(os.path.join(outdir, file_name))

        results_by_gene[short_name] = gene_result
    return results_by_gene
def make_zscores(mutation, clinical, breaks, outdir): clinical_data = util.get_clinical_data(clinical) mut = mutation_base.prep_mutation_data(mutation, clinical_data) cancer_type = util.get_cancer_type(mutation) print cancer_type structural_breaks = pd.read_csv(breaks, index_col=0) structural_breaks = structural_breaks.astype(int) mut_and_breaks = mut.join(structural_breaks, how='inner') num_patients = len(mut_and_breaks) results = [] for gene in mut_and_breaks: if gene in ('time', 'censor', 'breaks'): # skip metadata continue num_mutations = mut_and_breaks[gene].sum() if num_mutations >= MUTATION_PERCENT * num_patients: cox_dict = analysis.do_multivariate_cox(mut_and_breaks.time, mut_and_breaks.censor, mut_and_breaks[gene], mut_and_breaks[['breaks']]) cox_dict['gene'] = gene results.append(cox_dict) results_df = pd.DataFrame(results) results_df = results_df.set_index('gene') results_df.to_csv(os.path.join(outdir, cancer_type + '_mut_cox.csv'))
def make_mutation_zscores(mutation, clinical, gene_list): cancer_type = util.get_cancer_type(mutation) # get mutation patients clinical_data = util.get_clinical_data(clinical) mutation = mutation_base.prep_mutation_data(mutation, clinical_data) present_gene_list = list( set(gene_list.values) & set(mutation.columns.values)) mutation_gene_list_only = mutation[present_gene_list] mutation_and_clinical = mutation_gene_list_only.join(clinical_data, how='inner') num_patients = len(mutation_and_clinical.index) results = pd.DataFrame() for gene in mutation_and_clinical: if gene in ['time', 'censor']: continue num_mutations = mutation_and_clinical[gene].sum() if num_mutations >= MUTATION_PERCENT * num_patients: cox_dict = analysis.do_cox(mutation_and_clinical.time, mutation_and_clinical.censor, mutation_and_clinical[gene]) cox_dict['cancer_type'] = cancer_type cox_dict['gene'] = gene cox_dict['num_mutations'] = num_mutations results = results.append(cox_dict, ignore_index=True) print results return results
def make_corrs(copy_number, rnaseq, mutation, clinical, outdir, genes): cancer_type = util.get_cancer_type(copy_number) clinical_data = util.get_clinical_data(clinical) cnv = pd.read_csv(copy_number, index_col=0) cnv_by_patient = cnv.transpose() rnaseq = pd.read_csv(rnaseq, low_memory=False, sep='\t') rnaseq = rnaseq.drop([0]) rnaseq = rnaseq.set_index('Hybridization REF').astype(np.float) rnaseq = rnaseq.transpose().reset_index() rnaseq = util.maybe_clear_non_01s(rnaseq, 'index', cancer_type) rnaseq = util.add_identifier_column(rnaseq, 'index') rnaseq_clean = rnaseq.set_index('identifier').drop('index', 1).astype(np.float) rnaseq_log2 = rnaseq_clean.apply(np.log2) rnaseq_clipped_log2 = np.clip(rnaseq_log2, 0, np.inf) rna_cnv = cnv_by_patient[genes['Gene']].join(rnaseq_clipped_log2, how='inner') mutation = mutation_base.prep_mutation_data(mutation, clinical_data) print mutation.index included_patients = set(list(mutation.index)) & set(list(rna_cnv.index)) rna_cnv = rna_cnv.loc[included_patients] rna_cnv.T.to_csv(os.path.join(outdir, cancer_type + '_cnv_rnaseq_data.csv')) corr_dict = {} for gene in genes['Gene']: corr = rna_cnv.corrwith(rna_cnv[gene]).drop(genes['Gene']) corr_dict[cancer_type + '_' + gene] = corr return pd.DataFrame(corr_dict)
def make_zscores(data, clinical, outdir): subtype = clinical.split('.')[1] clinical_data = pd.read_csv(clinical, index_col=0, header=0) clinical_data = clinical_data.dropna(subset=['time', 'censor'], how='any') subtype_col = clinical_data.columns[-1] cancer_type = util.get_cancer_type(data) df = mb.prep_mutation_data(data, clinical_data) print cancer_type num_patients = len(set(clinical_data.index) & set(df.index)) print 'Number of patients present in both:', num_patients clinical_and_data = df.join(clinical_data, how='inner') print 'Num patients, other count:', len(df.index) outfile = os.path.join(outdir, cancer_type + '_' + subtype + '_zscores.csv') formatstring = '{0}, {1}, {2}, {3}, {4}\n' zscore_count = 0 zscore_skipped = 0 with open(outfile, 'w') as out: out.write('gene,zscore,pvalue,num patients,num mutations\n') for gene in clinical_and_data: if gene not in ('time', 'censor', 'index', subtype_col): # skip metadata num_mutations = clinical_and_data[gene].sum() # print gene, num_mutations if num_mutations >= MUTATION_PERCENT * num_patients: try: cox_dict = analysis.do_cox(clinical_and_data.time, clinical_and_data.censor, clinical_and_data[gene]) out.write( formatstring.format(gene, cox_dict['z'], cox_dict['p'], cox_dict['n'], num_mutations)) zscore_count += 1 except rpy2.rinterface.RRuntimeError as e: print 'WARN: skipped ', gene, ' due to R error' zscore_skipped += 1 continue else: zscore_skipped += 1 continue print 'Total:', clinical_and_data.shape[ 1] - 3 # minus time, censor, index print 'Output length:', zscore_count print 'Skipped:', zscore_skipped
def make_zscores(copy_number, mutation, clinical, outdir, genes): cancer_type = util.get_cancer_type(copy_number) clinical_data = util.get_clinical_data(clinical) cnv = pd.read_csv(copy_number, index_col=0) mutation = mutation_base.prep_mutation_data(mutation, clinical_data) for g in genes['Gene']: if g not in mutation.columns: mutation[g] = 0 print mutation[g] mutations = mutation[genes['Gene']] # cox multivariate won't work if there's a quote in the multivar name, so remove it gene_names = [x[1:] + '_mutations' for x in genes['Gene']] mutations.columns = gene_names cnv_by_patient = cnv.transpose() clinical_and_cnv = cnv_by_patient.join(clinical_data, how='inner') clinical_mutations_and_cnv = clinical_and_cnv.join(mutations, how='inner') cox_dicts = {} for gene in gene_names: plain_gene_name = gene.split('_')[0] # little shenanigans to make the names work. CNAs still have a quote, and # mutations have a suffix clinical_gene = clinical_mutations_and_cnv[[ '\'' + plain_gene_name, gene, 'time', 'censor' ]] cox_dict = calculate_cox(clinical_gene, gene) cox_dict['mutation_count'] = clinical_gene[gene].sum() clinical_gene.to_csv( os.path.join( outdir, cancer_type + '_' + plain_gene_name + '_mutation_and_cna_data.csv')) cox_dicts[plain_gene_name] = cox_dict return cox_dicts
def make_zscores(data, clinical, hypermutated_patients, outdir): clinical_data = util.get_clinical_data(clinical) hypermutated = set(clinical_data.index).intersection(hypermutated_patients['patients']) print 'Hypermutated in clinical file:', len(hypermutated) clinical_data = clinical_data.drop(hypermutated) cancer_type = util.get_cancer_type(data) df = mb.prep_mutation_data(data, clinical_data) print 'Remaining hypermutated:', set(df.index).intersection(hypermutated) num_patients = len(set(clinical_data.index) & set(df.index)) print 'Number of patients present in both:', num_patients clinical_and_data = df.join(clinical_data, how='inner') print 'Num patients, other count:', len(df.index) outfile = os.path.join(outdir, cancer_type + '_non-hypermutated_zscores.csv') formatstring = '{0}, {1}, {2}, {3}, {4}\n' zscore_count = 0 zscore_skipped = 0 with open(outfile, 'w') as out: out.write('gene,zscore,pvalue,num patients,num mutations\n') for gene in clinical_and_data: if gene not in ('time', 'censor', 'index'): # skip metadata num_mutations = clinical_and_data[gene].sum() # print gene, num_mutations if num_mutations >= MUTATION_PERCENT * num_patients: try: cox_dict = analysis.do_cox(clinical_and_data.time, clinical_and_data.censor, clinical_and_data[gene]) out.write(formatstring.format(gene, cox_dict['z'], cox_dict['p'], cox_dict['n'], num_mutations)) zscore_count += 1 except rpy2.rinterface.RRuntimeError as e: print 'WARN: skipped ', gene, ' due to R error' zscore_skipped += 1 continue else: zscore_skipped += 1 continue
def do_cox_models(clinical, cn_file, mut_file, outdir):
    """Fit multivariate Cox models for MYC copy number and TP53 mutation.

    Three models are fitted with ``analysis.do_multivariate_cox``: MYC copy
    number with the clinical covariates, MYC copy number with the clinical
    covariates plus TP53 mutation, and TP53 mutation with the clinical
    covariates. The results are written, transposed, to
    ``breast_analysis.csv`` in ``outdir``.

    Args:
        clinical: Clinical table that is joined on its index and must provide
            ``time``, ``censor`` and the covariate columns listed below
            (presumably a pandas DataFrame -- confirm against caller).
        cn_file: Path to the copy number CSV with ``Chromosome``,
            ``Location`` and ``Symbol`` columns.
        mut_file: Mutation input passed to
            ``mutation_base.prep_mutation_data``.
        outdir: Directory that receives ``breast_analysis.csv``.

    Returns:
        None.
    """
    # Re-orient the copy number table and label its columns by gene symbol.
    cn_by_patient = pd.read_csv(cn_file).transpose()
    cn_by_patient = cn_by_patient.drop(['Chromosome', 'Location'])
    cn_by_patient.columns = cn_by_patient.loc['Symbol']
    cn = cn_by_patient[['\'MYC']]

    mut = mutation_base.prep_mutation_data(mut_file, clinical)
    p53_mut = mut[['\'TP53']]
    p53_mut.columns = ['TP53']

    data = cn.join(clinical, how='inner').join(p53_mut, how='inner')

    analyses = {
        'CNA only': [age_r, 'her2_0', 'combined_er_pr', 'stage_0', 'stage_1'],
        'CNA + P53': ['TP53', age_r, 'her2_0', 'combined_er_pr', 'stage_0',
                      'stage_1']
    }

    results = pd.DataFrame()
    # NOTE(review): this printer is never used below.
    pp = pprint.PrettyPrinter(indent=2)

    for cn_gene in cn:
        for label, covariates in analyses.iteritems():
            model = analysis.do_multivariate_cox(data.time, data.censor,
                                                 data[cn_gene],
                                                 data[covariates],
                                                 float_vars=True)
            model['gene'] = label + ' ' + cn_gene
            results = results.append(model, ignore_index=True)

    # TP53 mutation as the variable of interest, without copy number.
    tp53_model = analysis.do_multivariate_cox(data.time, data.censor,
                                              data['TP53'],
                                              data[analyses['CNA only']],
                                              float_vars=True)
    tp53_model['gene'] = 'TP53 mut'
    results = results.append(tp53_model, ignore_index=True)

    results = results.set_index('gene')
    results.T.to_csv(os.path.join(outdir, 'breast_analysis.csv'))
def main(): mutation_dir, clinical_dir, outdir = get_options() mutation_files = os.listdir(mutation_dir) mutation_files = util.remove_extraneous_files(mutation_files) results = pd.DataFrame() for mut in mutation_files: if '_' in mut: continue cancer_type = util.get_cancer_type(mut) print cancer_type clinical = glob.glob( os.path.join(clinical_dir, '*' + cancer_type + '*'))[0] clinical_data = pd.read_csv(clinical, index_col=0) mutation = mutation_base.prep_mutation_data( os.path.join(mutation_dir, mut), clinical_data) data = mutation[['\'TP53']].join(clinical_data, how='inner') print data wt_as = data[data['\'TP53'] == 0]['breaks'] mut_as = data[data['\'TP53'] != 0]['breaks'] wt_q = wt_as.quantile([0.10, 0.25, 0.50, 0.75, 0.90]) mut_q = mut_as.quantile([0.10, 0.25, 0.50, 0.75, 0.90]) statistic, p = stats.mannwhitneyu(wt_as, mut_as) wt_q['cancer_type'] = cancer_type wt_q['mut?'] = 'wt' mut_q['cancer_type'] = cancer_type mut_q['mut?'] = 'mut' wt_q['mann-whitney-p'] = p results = results.append(wt_q) results = results.append(mut_q) results = results.set_index(['cancer_type', 'mut?']) results.to_csv(os.path.join(outdir, 'breaks_and_p53_quantiles.csv'))
def calculate_cox(mutation, clinical, outdir, metagene_file=None, make_km=False): clinical_data = util.get_clinical_data(clinical) df = mutation_base.prep_mutation_data(mutation, clinical_data) clinical_and_data = df.join(clinical_data, how='inner') num_patients = len(clinical_and_data) #prep output file cancer_type = os.path.basename(mutation).split('_')[0].split('.')[0] if metagene_file: formatstring = '{0}, {1}, {2}, {3}, {4}, {5}\n' outfile = os.path.join( outdir, cancer_type + '_mutation-fraction-' + str(MUTATION_PERCENT) + '_metagene_zscores.csv') print "Processing metagene..." metagene = metagene_lib.get_metagene_data(metagene_file, cancer_type) print "Complete" else: outfile = os.path.join( outdir, cancer_type + '_mutation-fraction-' + str(MUTATION_PERCENT) + '.zscores.out.csv') formatstring = '{0}, {1}, {2}, {3}, {4}\n' with open(outfile, 'w') as out: if metagene_file: out.write( 'gene,zscore,pvalue,metagene-zscore,metagene-pvalue,num patients\n' ) else: out.write('gene,zscore,pvalue,num mutations,num patients\n') for gene in clinical_and_data: if gene in ['time', 'censor']: continue num_mutations = int(clinical_and_data[gene].sum()) if num_mutations >= MUTATION_PERCENT * num_patients: time = clinical_and_data['time'] censor = clinical_and_data['censor'] data = clinical_and_data[gene] if metagene_file: cox_dict = analysis.do_metagene_cox( time, censor, data, metagene) out.write( formatstring.format(gene, cox_dict['z'], cox_dict['p'], cox_dict['metagene-z'], cox_dict['metagene-p'], cox_dict['n'])) else: name = cancer_type + '_' + gene if make_km: analysis.do_km(name, time, censor, data, outdir) clinical_and_data['time', 'censor', gene].to_csv( os.path.join(outdir, name + '_data.csv'), columns=['time', 'censor', 'mutated']) cox_dict = analysis.do_cox(time, censor, data) out.write( formatstring.format(gene, cox_dict['z'], cox_dict['p'], num_mutations, cox_dict['n']))