def make_zscores(copy_number, mutation, clinical, outdir, genes):
    """Fit one Cox model per gene on copy number plus TP53 mutation status.

    For every entry of ``genes['Gene']`` the gene's copy-number column is
    combined with the TP53 mutation column and the ``time``/``censor``
    columns, handed to ``calculate_cox`` and also written to
    ``<cancer_type>_<gene>_p53_and_cna_data.csv`` inside ``outdir``.

    Args:
        copy_number: path of the copy-number CSV (first column is the index);
            the table is transposed before it is joined with clinical data.
        mutation: mutation input forwarded to
            ``mutation_base.prep_mutation_data``.
        clinical: clinical input forwarded to ``util.get_clinical_data``.
        outdir: directory receiving the per-gene CSV files.
        genes: object whose ``'Gene'`` entry lists the copy-number column
            names to analyse.

    Returns:
        dict mapping each gene name, with its first character removed, to
        the dict returned by ``calculate_cox`` extended with a
        ``'mutation_count'`` entry (sum of the TP53 mutation column).
    """
    cancer_type = util.get_cancer_type(copy_number)
    clinical_data = util.get_clinical_data(clinical)
    cnv_by_patient = pd.read_csv(copy_number, index_col=0).transpose()

    mutation_data = mutation_base.prep_mutation_data(mutation, clinical_data)
    tp53_status = mutation_data['\'TP53'].rename('TP53_mutation')

    # Inner joins keep only rows present in all three tables.
    merged = cnv_by_patient.join(clinical_data, how='inner')
    merged = merged.join(tp53_status, how='inner')

    models = {}
    for column in genes['Gene']:
        # The first character is dropped for file names and result keys
        # (presumably the leading quote seen in names such as 'TP53).
        short_name = column[1:]
        per_gene = merged[[column, 'TP53_mutation', 'time', 'censor']]

        model = calculate_cox(per_gene, column)
        model['mutation_count'] = per_gene['TP53_mutation'].sum()

        csv_name = cancer_type + '_' + short_name + '_p53_and_cna_data.csv'
        per_gene.to_csv(os.path.join(outdir, csv_name))
        models[short_name] = model
    return models
def make_zscores(mutation, clinical, breaks, outdir):
    """Fit a multivariate Cox model (gene mutation + break count) per gene.

    The prepared mutation table is inner-joined with the table read from
    ``breaks``.  Every column other than ``time``, ``censor`` and ``breaks``
    is treated as a gene; genes whose column sum is at least
    ``MUTATION_PERCENT * <number of joined rows>`` are passed to
    ``analysis.do_multivariate_cox`` with the ``breaks`` column as the extra
    covariate.  The collected results are written to
    ``<cancer_type>_mut_cox.csv`` in ``outdir``, indexed by gene.

    Args:
        mutation: mutation input; forwarded to
            ``mutation_base.prep_mutation_data`` and to
            ``util.get_cancer_type``.
        clinical: clinical input forwarded to ``util.get_clinical_data``.
        breaks: path of a CSV (first column is the index) whose values are
            cast to ``int`` before joining.
        outdir: directory receiving the result CSV.

    Returns:
        None.  The only outputs are the CSV file and the printed cancer type.
    """
    clinical_data = util.get_clinical_data(clinical)
    mut = mutation_base.prep_mutation_data(mutation, clinical_data)

    cancer_type = util.get_cancer_type(mutation)
    print(cancer_type)

    structural_breaks = pd.read_csv(breaks, index_col=0).astype(int)
    mut_and_breaks = mut.join(structural_breaks, how='inner')
    min_mutations = MUTATION_PERCENT * len(mut_and_breaks)

    results = []
    for gene in mut_and_breaks:
        if gene in ('time', 'censor', 'breaks'):  # skip metadata
            continue
        if mut_and_breaks[gene].sum() >= min_mutations:
            cox_dict = analysis.do_multivariate_cox(mut_and_breaks.time,
                                                    mut_and_breaks.censor,
                                                    mut_and_breaks[gene],
                                                    mut_and_breaks[['breaks']])
            cox_dict['gene'] = gene
            results.append(cox_dict)

    if results:
        results_df = pd.DataFrame(results)
    else:
        # No gene reached the threshold: an empty frame built from [] has no
        # 'gene' column, so set_index would raise KeyError.  Declare the
        # column explicitly so an empty table is still written.
        results_df = pd.DataFrame(columns=['gene'])
    results_df = results_df.set_index('gene')
    results_df.to_csv(os.path.join(outdir, cancer_type + '_mut_cox.csv'))
# Example #3
0
def make_mutation_zscores(mutation, clinical, gene_list):
    """Run a univariate Cox model for each listed gene that is mutated enough.

    Only genes present both in ``gene_list`` and in the prepared mutation
    table are considered.  A gene is analysed when its column sum is at least
    ``MUTATION_PERCENT * <number of joined rows>``.

    Args:
        mutation: mutation input; forwarded to
            ``mutation_base.prep_mutation_data`` and to
            ``util.get_cancer_type``.
        clinical: clinical input forwarded to ``util.get_clinical_data``.
        gene_list: object exposing ``.values`` with the gene column names of
            interest.

    Returns:
        pandas.DataFrame with one row per analysed gene: the entries of the
        dict returned by ``analysis.do_cox`` plus ``cancer_type``, ``gene``
        and ``num_mutations``.  Empty when no gene qualifies.  The frame is
        also printed.
    """
    cancer_type = util.get_cancer_type(mutation)

    # get mutation patients
    clinical_data = util.get_clinical_data(clinical)
    mutation = mutation_base.prep_mutation_data(mutation, clinical_data)

    present_gene_list = list(
        set(gene_list.values) & set(mutation.columns.values))
    mutation_gene_list_only = mutation[present_gene_list]

    mutation_and_clinical = mutation_gene_list_only.join(clinical_data,
                                                         how='inner')
    min_mutations = MUTATION_PERCENT * len(mutation_and_clinical.index)

    # Collect plain dicts and build the frame once: DataFrame.append copied
    # the whole frame on every call and no longer exists in pandas >= 2.0.
    rows = []
    for gene in mutation_and_clinical:
        if gene in ['time', 'censor']:
            continue
        num_mutations = mutation_and_clinical[gene].sum()
        if num_mutations >= min_mutations:
            cox_dict = analysis.do_cox(mutation_and_clinical.time,
                                       mutation_and_clinical.censor,
                                       mutation_and_clinical[gene])
            cox_dict['cancer_type'] = cancer_type
            cox_dict['gene'] = gene
            cox_dict['num_mutations'] = num_mutations
            rows.append(cox_dict)
    results = pd.DataFrame(rows)
    print(results)
    return results
def make_corrs(copy_number, rnaseq, mutation, clinical, outdir, genes):
    """Correlate each listed gene's copy number with every RNA-seq column.

    The RNA-seq table is log2-transformed (values below 0 after the
    transform are clipped to 0), joined with the copy-number columns named in
    ``genes['Gene']`` and restricted to rows that also appear in the prepared
    mutation table.  The joined table is written, transposed, to
    ``<cancer_type>_cnv_rnaseq_data.csv`` in ``outdir``.

    Args:
        copy_number: path of the copy-number CSV (first column is the index).
        rnaseq: path of a tab-separated RNA-seq file with a
            ``'Hybridization REF'`` column.
        mutation: mutation input forwarded to
            ``mutation_base.prep_mutation_data``.
        clinical: clinical input forwarded to ``util.get_clinical_data``.
        outdir: directory receiving the joined-data CSV.
        genes: object whose ``'Gene'`` entry lists the copy-number columns.

    Returns:
        pandas.DataFrame with one column per gene, named
        ``<cancer_type>_<gene>``, holding the correlation of that gene's
        copy-number column with every remaining (non-``genes['Gene']``)
        column of the joined table.
    """
    cancer_type = util.get_cancer_type(copy_number)

    clinical_data = util.get_clinical_data(clinical)
    cnv = pd.read_csv(copy_number, index_col=0)
    cnv_by_patient = cnv.transpose()

    rnaseq = pd.read_csv(rnaseq, low_memory=False, sep='\t')
    # Row 0 is removed before the numeric cast (presumably a second header
    # row -- confirm against the input format).
    rnaseq = rnaseq.drop([0])
    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from NumPy; the builtin gives the same dtype.
    rnaseq = rnaseq.set_index('Hybridization REF').astype(float)
    rnaseq = rnaseq.transpose().reset_index()
    rnaseq = util.maybe_clear_non_01s(rnaseq, 'index', cancer_type)
    rnaseq = util.add_identifier_column(rnaseq, 'index')
    rnaseq_clean = rnaseq.set_index('identifier')
    # ``axis`` is passed by keyword: the positional form was dropped by
    # newer pandas releases.
    rnaseq_clean = rnaseq_clean.drop('index', axis=1).astype(float)
    rnaseq_log2 = rnaseq_clean.apply(np.log2)
    rnaseq_clipped_log2 = np.clip(rnaseq_log2, 0, np.inf)
    rna_cnv = cnv_by_patient[genes['Gene']].join(rnaseq_clipped_log2,
                                                 how='inner')

    mutation = mutation_base.prep_mutation_data(mutation, clinical_data)
    print(mutation.index)

    included_patients = set(mutation.index) & set(rna_cnv.index)
    # .loc needs a list-like; sets are rejected as indexers by newer pandas.
    rna_cnv = rna_cnv.loc[list(included_patients)]

    rna_cnv.T.to_csv(
        os.path.join(outdir, cancer_type + '_cnv_rnaseq_data.csv'))

    corr_dict = {}
    for gene in genes['Gene']:
        corr = rna_cnv.corrwith(rna_cnv[gene]).drop(genes['Gene'])
        corr_dict[cancer_type + '_' + gene] = corr

    return pd.DataFrame(corr_dict)
def make_zscores(data, clinical, outdir):
    subtype = clinical.split('.')[1]
    clinical_data = pd.read_csv(clinical, index_col=0, header=0)
    clinical_data = clinical_data.dropna(subset=['time', 'censor'], how='any')
    subtype_col = clinical_data.columns[-1]

    cancer_type = util.get_cancer_type(data)
    df = mb.prep_mutation_data(data, clinical_data)

    print cancer_type
    num_patients = len(set(clinical_data.index) & set(df.index))
    print 'Number of patients present in both:', num_patients

    clinical_and_data = df.join(clinical_data, how='inner')
    print 'Num patients, other count:', len(df.index)

    outfile = os.path.join(outdir,
                           cancer_type + '_' + subtype + '_zscores.csv')
    formatstring = '{0}, {1}, {2}, {3}, {4}\n'

    zscore_count = 0
    zscore_skipped = 0
    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num patients,num mutations\n')
        for gene in clinical_and_data:
            if gene not in ('time', 'censor', 'index',
                            subtype_col):  # skip metadata
                num_mutations = clinical_and_data[gene].sum()
                # print gene, num_mutations
                if num_mutations >= MUTATION_PERCENT * num_patients:
                    try:
                        cox_dict = analysis.do_cox(clinical_and_data.time,
                                                   clinical_and_data.censor,
                                                   clinical_and_data[gene])
                        out.write(
                            formatstring.format(gene, cox_dict['z'],
                                                cox_dict['p'], cox_dict['n'],
                                                num_mutations))
                        zscore_count += 1
                    except rpy2.rinterface.RRuntimeError as e:
                        print 'WARN: skipped ', gene, ' due to R error'
                        zscore_skipped += 1
                        continue
                else:
                    zscore_skipped += 1
                    continue

        print 'Total:', clinical_and_data.shape[
            1] - 3  # minus time, censor, index
        print 'Output length:', zscore_count
        print 'Skipped:', zscore_skipped
def make_zscores(copy_number, mutation, clinical, outdir, genes):
    """Fit one Cox model per gene on its copy number plus its own mutations.

    For every entry of ``genes['Gene']`` the gene's copy-number column and
    its mutation column (renamed ``<gene>_mutations``) are combined with
    ``time``/``censor``, handed to ``calculate_cox`` and written to
    ``<cancer_type>_<gene>_mutation_and_cna_data.csv`` in ``outdir``.  Genes
    missing from the mutation table get an all-zero mutation column.

    Args:
        copy_number: path of the copy-number CSV (first column is the index).
        mutation: mutation input forwarded to
            ``mutation_base.prep_mutation_data``.
        clinical: clinical input forwarded to ``util.get_clinical_data``.
        outdir: directory receiving the per-gene CSV files.
        genes: object whose ``'Gene'`` entry lists the gene column names; the
            first character of each name is stripped to form the plain name.

    Returns:
        dict mapping each plain gene name to the dict returned by
        ``calculate_cox`` extended with a ``'mutation_count'`` entry.
    """
    cancer_type = util.get_cancer_type(copy_number)

    clinical_data = util.get_clinical_data(clinical)
    cnv = pd.read_csv(copy_number, index_col=0)

    mutation = mutation_base.prep_mutation_data(mutation, clinical_data)

    for g in genes['Gene']:
        if g not in mutation.columns:
            mutation[g] = 0
            print(mutation[g])

    mutations = mutation[genes['Gene']]

    # cox multivariate won't work if there's a quote in the multivar name, so
    # remove it
    plain_gene_names = [x[1:] for x in genes['Gene']]
    gene_names = [name + '_mutations' for name in plain_gene_names]
    mutations.columns = gene_names

    cnv_by_patient = cnv.transpose()
    clinical_and_cnv = cnv_by_patient.join(clinical_data, how='inner')

    clinical_mutations_and_cnv = clinical_and_cnv.join(mutations, how='inner')

    cox_dicts = {}
    # Walk the plain names alongside the suffixed ones instead of recovering
    # them with split('_'), which truncated any gene name that itself
    # contains an underscore.
    for plain_gene_name, gene in zip(plain_gene_names, gene_names):
        # CNA columns still carry the leading quote, mutation columns carry
        # the '_mutations' suffix.
        clinical_gene = clinical_mutations_and_cnv[[
            '\'' + plain_gene_name, gene, 'time', 'censor'
        ]]
        cox_dict = calculate_cox(clinical_gene, gene)
        cox_dict['mutation_count'] = clinical_gene[gene].sum()

        clinical_gene.to_csv(
            os.path.join(
                outdir, cancer_type + '_' + plain_gene_name +
                '_mutation_and_cna_data.csv'))
        cox_dicts[plain_gene_name] = cox_dict
    return cox_dicts
def make_zscores(data, clinical, hypermutated_patients, outdir):
  clinical_data = util.get_clinical_data(clinical)
  hypermutated = set(clinical_data.index).intersection(hypermutated_patients['patients'])
  print 'Hypermutated in clinical file:', len(hypermutated)
  clinical_data = clinical_data.drop(hypermutated)

  cancer_type = util.get_cancer_type(data)
  df = mb.prep_mutation_data(data, clinical_data)

  print 'Remaining hypermutated:', set(df.index).intersection(hypermutated)
  num_patients = len(set(clinical_data.index) & set(df.index))
  print 'Number of patients present in both:', num_patients

  clinical_and_data = df.join(clinical_data, how='inner')
  print 'Num patients, other count:', len(df.index)

  outfile = os.path.join(outdir, cancer_type + '_non-hypermutated_zscores.csv')
  formatstring = '{0}, {1}, {2}, {3}, {4}\n'

  zscore_count = 0
  zscore_skipped = 0
  with open(outfile, 'w') as out:
    out.write('gene,zscore,pvalue,num patients,num mutations\n')
    for gene in clinical_and_data:
      if gene not in ('time', 'censor', 'index'): # skip metadata
        num_mutations = clinical_and_data[gene].sum()
        # print gene, num_mutations
        if num_mutations >= MUTATION_PERCENT * num_patients:
          try:
            cox_dict = analysis.do_cox(clinical_and_data.time,
                                       clinical_and_data.censor,
                                       clinical_and_data[gene])
            out.write(formatstring.format(gene, cox_dict['z'], cox_dict['p'], cox_dict['n'], num_mutations))
            zscore_count += 1
          except rpy2.rinterface.RRuntimeError as e:
            print 'WARN: skipped ', gene, ' due to R error'
            zscore_skipped += 1
            continue
        else:
          zscore_skipped += 1
          continue
# Example #8
0
def do_cox_models(clinical, cn_file, mut_file, outdir):
    """Run the multivariate Cox analyses and write them to a CSV.

    Three kinds of model are fitted with ``analysis.do_multivariate_cox``:
    the selected copy-number column with clinical covariates ("CNA only"),
    the same with TP53 mutation status added ("CNA + P53"), and TP53
    mutation status alone with the clinical covariates ("TP53 mut").  The
    results are written, one column per model, to ``breast_analysis.csv`` in
    ``outdir``.

    Args:
        clinical: clinical table; it is joined directly and passed to
            ``mutation_base.prep_mutation_data``, and must provide ``time``,
            ``censor`` and the covariate columns used below.
        cn_file: path of the copy-number CSV containing ``Chromosome``,
            ``Location`` and ``Symbol`` columns.
        mut_file: mutation input forwarded to
            ``mutation_base.prep_mutation_data``.
        outdir: directory receiving ``breast_analysis.csv``.
    """
    cn = pd.read_csv(cn_file)
    cn_by_patient = cn.transpose()
    cn_by_patient = cn_by_patient.drop(['Chromosome', 'Location'])
    # After the transpose the 'Symbol' row holds the gene names; use it as
    # the column header.
    cn_by_patient.columns = cn_by_patient.loc['Symbol']
    cn = cn_by_patient[['\'MYC']]

    mut = mutation_base.prep_mutation_data(mut_file, clinical)
    p53_mut = mut[['\'TP53']]
    p53_mut.columns = ['TP53']

    data = cn.join(clinical, how='inner')
    data = data.join(p53_mut, how='inner')

    analyses = {
        'CNA only': [age_r, 'her2_0', 'combined_er_pr', 'stage_0', 'stage_1'],
        'CNA + P53':
        ['TP53', age_r, 'her2_0', 'combined_er_pr', 'stage_0', 'stage_1']
    }

    # Rows are gathered in a list and turned into a frame once;
    # DataFrame.append copied the frame on every call and no longer exists
    # in pandas >= 2.0.
    rows = []
    for g in cn:
        for name, a in analyses.items():
            cox_dict = analysis.do_multivariate_cox(data.time,
                                                    data.censor,
                                                    data[g],
                                                    data[a],
                                                    float_vars=True)
            cox_dict['gene'] = name + ' ' + g
            rows.append(cox_dict)

    cox_dict = analysis.do_multivariate_cox(data.time,
                                            data.censor,
                                            data['TP53'],
                                            data[analyses['CNA only']],
                                            float_vars=True)
    cox_dict['gene'] = 'TP53 mut'
    rows.append(cox_dict)

    results = pd.DataFrame(rows).set_index('gene')
    results.T.to_csv(os.path.join(outdir, 'breast_analysis.csv'))
# Example #9
0
def main():
    """Compare ``breaks`` between TP53 wild-type and mutated patients.

    For each mutation file whose name contains no underscore, the matching
    clinical file is located by cancer type, the TP53 mutation column is
    joined with the clinical table, and the 10/25/50/75/90 % quantiles of
    ``breaks`` are computed separately for the two groups.  A Mann-Whitney
    p-value for the two groups is stored on the wild-type row.  All rows are
    written to ``breaks_and_p53_quantiles.csv`` in the output directory.
    """
    mutation_dir, clinical_dir, outdir = get_options()
    mutation_files = util.remove_extraneous_files(os.listdir(mutation_dir))
    quantile_levels = [0.10, 0.25, 0.50, 0.75, 0.90]

    results = pd.DataFrame()
    for mut in mutation_files:
        if '_' in mut:
            continue
        cancer_type = util.get_cancer_type(mut)
        print(cancer_type)

        pattern = os.path.join(clinical_dir, '*' + cancer_type + '*')
        clinical = glob.glob(pattern)[0]
        clinical_data = pd.read_csv(clinical, index_col=0)

        mutation = mutation_base.prep_mutation_data(
            os.path.join(mutation_dir, mut), clinical_data)
        data = mutation[['\'TP53']].join(clinical_data, how='inner')
        print(data)

        tp53 = data['\'TP53']
        wt_as = data[tp53 == 0]['breaks']
        mut_as = data[tp53 != 0]['breaks']

        wt_q = wt_as.quantile(quantile_levels)
        mut_q = mut_as.quantile(quantile_levels)
        _, p_value = stats.mannwhitneyu(wt_as, mut_as)

        # Label each quantile row; only the wild-type row carries the p-value.
        wt_q['cancer_type'] = cancer_type
        wt_q['mut?'] = 'wt'
        wt_q['mann-whitney-p'] = p_value
        mut_q['cancer_type'] = cancer_type
        mut_q['mut?'] = 'mut'

        for row in (wt_q, mut_q):
            results = results.append(row)

    results = results.set_index(['cancer_type', 'mut?'])
    results.to_csv(os.path.join(outdir, 'breaks_and_p53_quantiles.csv'))
# Example #10
0
def calculate_cox(mutation,
                  clinical,
                  outdir,
                  metagene_file=None,
                  make_km=False):
    """Write per-gene Cox z-scores for one mutation file.

    A gene is analysed when its column sum (as an int) is at least
    ``MUTATION_PERCENT`` times the number of rows of the joined
    mutation/clinical table.  The cancer type is taken from the part of the
    mutation file's base name before the first ``_`` or ``.``.

    Args:
        mutation: path of the mutation file; also forwarded to
            ``mutation_base.prep_mutation_data``.
        clinical: clinical input forwarded to ``util.get_clinical_data``.
        outdir: directory receiving all output files.
        metagene_file: when given, each model is fitted with
            ``analysis.do_metagene_cox`` and the metagene z-score/p-value
            are written as well; the output file name ends in
            ``_metagene_zscores.csv``.  Otherwise ``analysis.do_cox`` is
            used and the file name ends in ``.zscores.out.csv``.
        make_km: only used without ``metagene_file``.  When true,
            ``analysis.do_km`` is called for every analysed gene and the
            gene's ``time``/``censor``/mutation columns are saved to
            ``<cancer_type>_<gene>_data.csv``.
    """
    clinical_data = util.get_clinical_data(clinical)
    df = mutation_base.prep_mutation_data(mutation, clinical_data)
    clinical_and_data = df.join(clinical_data, how='inner')
    num_patients = len(clinical_and_data)

    # prep output file
    cancer_type = os.path.basename(mutation).split('_')[0].split('.')[0]
    file_prefix = (cancer_type + '_mutation-fraction-' +
                   str(MUTATION_PERCENT))
    if metagene_file:
        formatstring = '{0}, {1}, {2}, {3}, {4}, {5}\n'
        header = ('gene,zscore,pvalue,metagene-zscore,metagene-pvalue,'
                  'num patients\n')
        outfile = os.path.join(outdir, file_prefix + '_metagene_zscores.csv')

        print("Processing metagene...")
        metagene = metagene_lib.get_metagene_data(metagene_file, cancer_type)
        print("Complete")
    else:
        formatstring = '{0}, {1}, {2}, {3}, {4}\n'
        header = 'gene,zscore,pvalue,num mutations,num patients\n'
        outfile = os.path.join(outdir, file_prefix + '.zscores.out.csv')

    min_mutations = MUTATION_PERCENT * num_patients
    times = clinical_and_data['time']
    censor = clinical_and_data['censor']

    with open(outfile, 'w') as out:
        out.write(header)

        for gene in clinical_and_data:
            if gene in ['time', 'censor']:
                continue

            num_mutations = int(clinical_and_data[gene].sum())
            if num_mutations < min_mutations:
                continue
            data = clinical_and_data[gene]

            if metagene_file:
                cox_dict = analysis.do_metagene_cox(times, censor, data,
                                                    metagene)
                out.write(
                    formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                        cox_dict['metagene-z'],
                                        cox_dict['metagene-p'],
                                        cox_dict['n']))
                continue

            if make_km:
                name = cancer_type + '_' + gene
                analysis.do_km(name, times, censor, data, outdir)
                # Select the three columns with a list (a bare tuple key
                # raised KeyError) and relabel the gene column as 'mutated'
                # via ``header``; ``columns=`` can only pick existing
                # columns, and 'mutated' is not one.
                clinical_and_data[['time', 'censor', gene]].to_csv(
                    os.path.join(outdir, name + '_data.csv'),
                    header=['time', 'censor', 'mutated'])

            cox_dict = analysis.do_cox(times, censor, data)
            out.write(
                formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                    num_mutations, cox_dict['n']))