def main():
    """Run a purity Cox regression and KM split for every clinical file.

    Directories come from ``get_options()``. For each clinical file the
    table returned by ``prep_extra_data`` is joined onto the clinical
    table, ``analysis.do_cox`` is run on the 'Purity_InfiniumPurify'
    column, and ``analysis.do_km`` is called with the patients split at the
    mean purity. The Cox results (one row per cancer type) are written to
    'add_data_simple_purity_zscores.csv' in the output directory.
    """
    clinical_dir, output_dir, extra_data_dir = get_options()
    filenames = util.remove_extraneous_files(os.listdir(clinical_dir))
    purity_header = 'Purity_InfiniumPurify'

    zscore_data = {}
    for filename in filenames:
        clinical_path = os.path.join(clinical_dir, filename)
        cancer_type = util.get_cancer_type(filename)
        # COADREAD is looked up under the 'COAD' key in the extra data.
        extra_key = 'COAD' if cancer_type == 'COADREAD' else cancer_type
        extra_data = prep_extra_data(extra_data_dir, extra_key)

        clinical = util.get_clinical_data(clinical_path).join(extra_data)

        zscore_data[cancer_type] = analysis.do_cox(
            clinical.time, clinical.censor, clinical[purity_header])

        purity_mean = clinical[purity_header].mean()
        print(purity_mean)
        # 0 = purity at or below the mean, 1 = purity above the mean.
        split = np.where(clinical[purity_header] <= purity_mean, 0, 1)
        clinical['km_split'] = split
        analysis.do_km(cancer_type, clinical.time, clinical.censor,
                       clinical.km_split, output_dir)

    out_path = os.path.join(output_dir, 'add_data_simple_purity_zscores.csv')
    pd.DataFrame(zscore_data).transpose().to_csv(out_path)
def make_zscores(copy_number, clinical, gene_list):
    """Run a Cox model per gene on a copy-number file, for selected genes.

    Args:
        copy_number: path to a copy-number table with a 'Hugo_Symbol'
            column; every other column is joined against patient IDs.
        clinical: path to a clinical table with 'PATIENT_ID', 'Time' and
            'Censor' columns.
        gene_list: column labels (gene symbols) to keep.

    Returns:
        A list of the dicts returned by ``analysis.do_cox``, each with an
        added 'gene' key. Genes with 10 or fewer non-null values are
        skipped.
    """
    clinical_sep = util.get_sep_from_filename(clinical)
    survival = pd.read_csv(clinical, sep=clinical_sep).set_index('PATIENT_ID')
    survival = survival[[u'Time', u'Censor']].astype(float).dropna()

    cnv_sep = util.get_sep_from_filename(copy_number)
    cnv = pd.read_csv(copy_number, sep=cnv_sep)
    # Keep the first row per symbol, then discard rows without a symbol.
    cnv = cnv.drop_duplicates(subset=['Hugo_Symbol'], keep='first')
    cnv = cnv.dropna(subset=['Hugo_Symbol'])

    # Transpose so that rows are the original columns and columns are genes.
    by_patient = cnv.transpose()
    by_patient.columns = by_patient.loc['Hugo_Symbol']
    by_patient = by_patient[gene_list]
    print(by_patient)
    merged = by_patient.join(survival, how='inner')

    # The next two values are computed but never used below.
    num_patients = merged.shape[0]
    cancer_type = util.get_cancer_type(copy_number)

    results = []
    for gene in merged:
        if gene in ('Time', 'Censor'):  # survival columns, not genes
            continue
        if merged[gene].count() <= 10:
            continue
        # Computed but not used in the output.
        num_with_copy_number = (merged[gene] != 0).sum()
        cox_dict = analysis.do_cox(merged.Time, merged.Censor, merged[gene],
                                   float_time=True)
        cox_dict['gene'] = gene
        results.append(cox_dict)
    return results
# Example #3
# 0
def calculate_cox_for_cancer_type(requested_data, mutation_data, outdir):
    """Run Cox models for requested gene/position sets in one cancer type.

    Args:
        requested_data: mapping (must support ``iteritems()``) from a key
            whose first character is stripped to get the gene symbol, to
            the start positions of interest for that gene.
        mutation_data: path to the mutation file. For 'COADREAD' and 'OV'
            the '<type>_hg36_hg37.txt' file in the sibling 'HG36_HG37'
            folder is used instead.
        outdir: unused in this function.

    Returns:
        A DataFrame with one row per analysed request, indexed by
        ('gene', 'positions') when it is non-empty.
    """
    cancer_type = util.get_cancer_type(mutation_data)
    clinical_path = os.path.join('.', 'clinical',
                                 cancer_type + '.clin.merged.txt')
    clinical_data = util.get_clinical_data(clinical_path)

    start_pos = None
    if cancer_type in ['COADREAD', 'OV']:
        mutation_data = os.path.join(os.path.dirname(mutation_data),
                                     'HG36_HG37',
                                     cancer_type + '_hg36_hg37.txt')
        start_pos = u'hg37_start'

    prepped = zscores_for_mutants.prep_data(mutation_data, clinical_data)
    df, clinical_with_sequenced_patients, num_patients = prepped
    if not start_pos:
        # Find the start-position column regardless of its letter case.
        uppercased = [column.upper() for column in df.columns]
        start_pos = df.columns[uppercased.index('START_POSITION')]

    by_gene = df.groupby(level=u'Hugo_Symbol')
    rows = []
    for key, positions in requested_data.iteritems():
        gene = key[1:]
        if gene not in by_gene.groups.keys():
            continue
        gene_rows = by_gene.get_group(gene)
        at_positions = gene_rows[gene_rows[start_pos].isin(positions)]
        mutant_ids = at_positions.index.get_level_values('identifier')
        threshold = MUTATION_PERCENT * clinical_with_sequenced_patients.shape[0]
        if not (len(mutant_ids) >= threshold):
            continue

        # Mutated patients get 1; the right join adds every other sequenced
        # patient, whose missing flag is then filled with 0.
        analysis_data = pd.DataFrame({'mutated': np.ones(len(mutant_ids))},
                                     index=mutant_ids)
        analysis_data = analysis_data.join(clinical_with_sequenced_patients,
                                           how='right')
        analysis_data['mutated'].fillna(0, inplace=True)
        cox_dict = analysis.do_cox(analysis_data['time'],
                                   analysis_data['censor'],
                                   analysis_data['mutated'])

        outdict = {}
        outdict[cancer_type + ' p'] = cox_dict['p']
        outdict[cancer_type + ' z'] = cox_dict['z']
        outdict[cancer_type + ' mutants'] = len(mutant_ids)
        outdict[cancer_type + ' n'] = cox_dict['n']
        outdict['gene'] = key
        outdict['positions'] = ':'.join(positions)
        rows.append(outdict)

    outdata = pd.DataFrame(rows)
    print(outdata)
    if len(outdata):
        outdata = outdata.set_index(['gene', 'positions'])
    return outdata
def calculate_cox(mutation, clinical_data, outdir, univariate_file=None):
    """Run a Cox model on double mutation for each pair of COMMONLY_MUTATED.

    A pair is analysed when both genes are columns of the prepared data and
    the number of patients mutated in both is at least
    MUTATION_PERCENT * number of patients. When ``univariate_file`` is
    given, a pair is skipped if either gene's 'zscore' in that file lies
    outside [-1.96, 1.96].

    Writes a summary CSV to ``outdir`` and, for each analysed pair, a
    '<cancer_type>_<pair>_data.csv' file with the time/censor/mutated table.
    """
    df = prep_data(mutation, clinical_data).join(clinical_data, how='inner')
    num_patients = len(df.index)

    # The cancer type is the file name up to its first '_' or '.'.
    cancer_type = os.path.basename(mutation).split('_')[0].split('.')[0]
    print(cancer_type)

    prefix = cancer_type + '_mutation-fraction-' + str(MUTATION_PERCENT)
    if univariate_file:
        univariate_data = pd.read_csv(univariate_file, index_col=0)
        outfile = os.path.join(
            outdir, prefix + '.notalonesignificant.zscores.out.csv')
    else:
        outfile = os.path.join(outdir, prefix + '.zscores.out.csv')
    row_format = '{0}, {1}, {2}, {3}, {4}\n'

    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num mutations,num patients\n')

        for genes in itertools.combinations(COMMONLY_MUTATED, 2):
            gene_pair = pd.Series(genes)
            gene_pair_str = '-'.join(gene_pair)
            if gene_pair.isin(df.columns.values).sum() < 2:
                continue

            paired = df[list(gene_pair)]
            # A row sum of 2 means both genes are flagged for that patient.
            double_mutated_patients = paired[paired.sum(axis=1) == 2].index
            num_mutations = len(double_mutated_patients)
            print('%s %s' % (gene_pair_str, num_mutations))

            if not (num_mutations >= MUTATION_PERCENT * num_patients):
                continue

            if univariate_file:
                # Skip pairs where either gene is significant on its own;
                # gene 0 is checked first and gene 1 only if gene 0 passes.
                significant = None
                for position in (0, 1):
                    zscore = univariate_data.loc[gene_pair[position]].zscore
                    if zscore < -1.96 or zscore > 1.96:
                        significant = position
                        break
                if significant is not None:
                    print('Skipping pair %s for gene %d'
                          % (gene_pair_str, significant))
                    continue

            analysis_data = pd.DataFrame()
            analysis_data['time'] = df['time']
            analysis_data['censor'] = df['censor']
            analysis_data['mutated'] = 0
            analysis_data.loc[double_mutated_patients, 'mutated'] = 1

            print('Doing analysis for %s with %s double mutations of %s'
                  % (gene_pair_str, num_mutations, num_patients))

            name = cancer_type + '_' + gene_pair_str.replace('\'', '')
            print(name)
            cox_dict = analysis.do_cox(analysis_data['time'],
                                       analysis_data['censor'],
                                       analysis_data['mutated'])
            out.write(row_format.format(gene_pair_str, cox_dict['z'],
                                        cox_dict['p'], num_mutations,
                                        cox_dict['n']))
            analysis_data.to_csv(os.path.join(outdir, name + '_data.csv'),
                                 columns=['time', 'censor', 'mutated'])
def make_zscores(copy_number, clinical, outdir):
    """Write per-gene Cox results for a copy-number file to a CSV.

    Args:
        copy_number: path to a copy-number table with a 'Hugo_Symbol'
            column.
        clinical: path to a clinical table with 'PATIENT_ID', 'Time' and
            'Censor' columns.
        outdir: directory for '<cancer_type>.cbioportal_zscores.csv'.

    Genes with 10 or fewer non-null values are skipped. Gene names are
    written with a leading single quote.
    """
    clinical_sep = util.get_sep_from_filename(clinical)
    survival = pd.read_csv(clinical, sep=clinical_sep).set_index('PATIENT_ID')
    survival = survival[[u'Time', u'Censor']].astype(float).dropna()

    cnv_sep = util.get_sep_from_filename(copy_number)
    cnv = pd.read_csv(copy_number, sep=cnv_sep)
    cnv = cnv.drop_duplicates(subset=['Hugo_Symbol'], keep='first')
    cnv = cnv.dropna(subset=['Hugo_Symbol'])

    by_patient = cnv.transpose()
    by_patient.columns = by_patient.loc['Hugo_Symbol']
    merged = by_patient.join(survival, how='inner')
    num_patients = merged.shape[0]  # computed but not used below

    cancer_type = util.get_cancer_type(copy_number)
    outfile = os.path.join(outdir, cancer_type + '.cbioportal_zscores.csv')
    row_format = '{0}, {1}, {2}, {3}\n'

    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num patients\n')
        for gene in merged:
            if gene in ('Time', 'Censor'):  # survival columns, not genes
                continue
            if merged[gene].count() <= 10:
                continue

            # Computed but not written to the output.
            num_with_copy_number = (merged[gene] != 0).sum()
            cox_dict = analysis.do_cox(merged.Time, merged.Censor,
                                       merged[gene], float_time=True)
            # Prefix the name with a quote unless it already has one.
            quoted = gene if gene[0] == '\'' else '\'' + gene
            out.write(row_format.format(quoted, cox_dict['z'], cox_dict['p'],
                                        cox_dict['n']))
# Example #6
# 0
def make_mutation_zscores(mutation, clinical, gene_list):
    """Run a Cox model per listed gene on mutation status.

    Args:
        mutation: path to the mutation file.
        clinical: path to the clinical file.
        gene_list: object with a ``.values`` attribute holding gene names.

    Returns:
        A DataFrame with one row per gene whose mutation count is at least
        MUTATION_PERCENT * number of patients. Each row holds the
        ``analysis.do_cox`` output plus 'cancer_type', 'gene' and
        'num_mutations'.
    """
    cancer_type = util.get_cancer_type(mutation)

    clinical_data = util.get_clinical_data(clinical)
    mutation_table = mutation_base.prep_mutation_data(mutation, clinical_data)

    # Only keep the requested genes that are actually columns of the table.
    present_gene_list = list(
        set(gene_list.values) & set(mutation_table.columns.values))
    merged = mutation_table[present_gene_list].join(clinical_data,
                                                    how='inner')
    num_patients = len(merged.index)

    results = pd.DataFrame()
    for gene in merged:
        if gene in ['time', 'censor']:
            continue
        num_mutations = merged[gene].sum()
        if not (num_mutations >= MUTATION_PERCENT * num_patients):
            continue
        cox_dict = analysis.do_cox(merged.time, merged.censor, merged[gene])
        cox_dict['cancer_type'] = cancer_type
        cox_dict['gene'] = gene
        cox_dict['num_mutations'] = num_mutations
        results = results.append(cox_dict, ignore_index=True)
    print(results)
    return results
# Example #7
# 0
def main():
    """Run a Cox model on the per-patient break count for each CNA file.

    For every file in the copy-number directory, the first clinical file
    matching '*<cancer_type>*.txt' is loaded, the output of
    ``count_breaks`` is joined to it, the joined table is written to
    '<cancer_type>_breaks.csv', and ``analysis.do_cox`` is run on the
    'breaks' column. All results are written to 'cox_results.csv'.
    """
    copy_number_loc, clinical, outdir = get_options()
    cna_files = util.remove_extraneous_files(os.listdir(copy_number_loc))

    results = pd.DataFrame()
    for cna_file in cna_files:
        cancer_type = util.get_cancer_type(cna_file)
        print(cancer_type)

        pattern = os.path.join(clinical, '*' + cancer_type + '*.txt')
        clin = util.get_clinical_data(glob.glob(pattern)[0])

        breaks = count_breaks(os.path.join(copy_number_loc, cna_file))
        breaks = breaks.reset_index()
        breaks = util.maybe_clear_non_01s(breaks, 'Sample', cancer_type)
        breaks = util.add_identifier_column(breaks, 'Sample')
        # Re-key on the identifier; the raw 'Sample' column is then dropped.
        breaks = breaks.set_index('identifier').drop('Sample', axis=1)

        breaks_and_clin = breaks.join(clin, how='inner')
        breaks_and_clin.to_csv(
            os.path.join(outdir, cancer_type + '_breaks.csv'))
        cox = analysis.do_cox(breaks_and_clin.time, breaks_and_clin.censor,
                              breaks_and_clin.breaks)
        cox['cancer_type'] = cancer_type
        results = results.append(cox, ignore_index=True)

    results.to_csv(os.path.join(outdir, 'cox_results.csv'))
# Example #8
# 0
def make_cox(g, clinical_data, cancer_type, cutoff_percent, seq_patients,
             outdir):
    """Run a Cox model on "any mutation" for one gene-codon group.

    Args:
        g: DataFrame for one group, with 'Gene-Codon' and 'mutated'
            columns; resetting its index must yield 'level_3' and 'index'
            columns, which become the pivot rows and columns.
        clinical_data: DataFrame with 'time' and 'censor' columns.
        cancer_type: label used in the result and the output file name.
        cutoff_percent: fraction of sequenced patients that must be
            exceeded for the group to be analysed.
        seq_patients: labels of sequenced patients; must have ``.size``.
        outdir: directory for the per-group clinical CSV.

    Returns:
        None when the pivoted row count is at most
        ``cutoff_percent * seq_patients.size``; otherwise a Series of the
        ``analysis.do_cox`` result plus bookkeeping fields.
    """
    gene_codon = g['Gene-Codon'].iloc[0]

    flat = g.reset_index()
    by_patient = flat.pivot(index='level_3', columns='index',
                            values='mutated')
    clinical_data = clinical_data.loc[seq_patients]

    num_seq_mutated = len(by_patient)
    cutoff = cutoff_percent * seq_patients.size
    if num_seq_mutated <= cutoff:
        return None

    print(gene_codon)
    print('%s %s %s' % (num_seq_mutated, seq_patients.size,
                        cutoff_percent * seq_patients.size))
    print('num patients w mut in more than 1 codon: %s'
          % ((by_patient.sum(axis=1) > 1).sum(),))

    by_patient['any_mut'] = by_patient.sum(axis=1) >= 1
    # The outer join adds clinical patients absent from the pivot; their
    # missing 'any_mut' flag is filled with 0.
    by_patient = by_patient.join(clinical_data, how='outer')
    by_patient['any_mut'] = by_patient[['any_mut']].fillna(0).astype(int)
    by_patient = by_patient.dropna(subset=['time', 'censor'], how='any')

    cox_dict = analysis.do_cox(by_patient.time, by_patient.censor,
                               by_patient.any_mut)
    cox_dict['cancer_type'] = cancer_type
    cox_dict['num_mutated_w_clinical'] = by_patient['any_mut'].sum()
    cox_dict['num_sequence_mutated'] = num_seq_mutated

    csv_name = (cancer_type + '_' + gene_codon + '_' + str(cutoff_percent) +
                'cutoff_clinical.csv')
    by_patient.to_csv(os.path.join(outdir, csv_name))
    return pd.Series(cox_dict)
# Example #9
# 0
def main():
    """Run a purity Cox regression and KM split for every clinical file.

    The purity column name for each cancer type is read from column 1 of
    the header file (indexed by cancer type). For each clinical file,
    ``analysis.do_cox`` is run on that column and ``analysis.do_km`` is
    called with the patients split at the mean purity. The Cox results are
    written to 'simple_purity_zscores.csv' in the output directory.
    """
    clinical_dir, output_dir, header_file = get_options()
    headers = pd.read_csv(header_file, index_col=0, header=None)
    filenames = util.remove_extraneous_files(os.listdir(clinical_dir))

    zscore_data = {}
    for filename in filenames:
        clinical_path = os.path.join(clinical_dir, filename)
        cancer_type = util.get_cancer_type(filename)
        purity_header = headers.get_value(cancer_type, 1)

        clinical = util.get_clinical_data(clinical_path,
                                          extra_rows=[purity_header])

        zscore_data[cancer_type] = analysis.do_cox(
            clinical.time, clinical.censor, clinical[purity_header])

        purity_mean = clinical[purity_header].mean()
        print(purity_mean)
        # 0 = purity at or below the mean, 1 = purity above the mean.
        split = np.where(clinical[purity_header] <= purity_mean, 0, 1)
        clinical['km_split'] = split
        analysis.do_km(cancer_type, clinical.time, clinical.censor,
                       clinical.km_split, output_dir)

    out_path = os.path.join(output_dir, 'simple_purity_zscores.csv')
    pd.DataFrame(zscore_data).transpose().to_csv(out_path)
def make_cn_zscores(copy_number, clinical, interesting_genes=None, outdir='.'):
    """Run a Cox model per gene on trichotomized copy-number values.

    Args:
        copy_number: path to a CSV whose first column is the row index;
            the table is transposed before joining with clinical data.
        clinical: path to the clinical file.
        interesting_genes: optional object whose ``.index`` holds gene
            names; each name is prefixed with a single quote and used to
            select columns. When None (the default), every column of the
            transposed table is analysed. Previously the default raised
            AttributeError.
        outdir: directory for '<cancer_type>_trichotomized.csv'.

    Returns:
        A list of the dicts returned by ``analysis.do_cox``, each with
        added 'gene' and 'cancer_type' keys. Columns with 10 or fewer
        non-null values are skipped.
    """
    clinical_data = util.get_clinical_data(clinical)
    cnv = pd.read_csv(copy_number, index_col=0)
    cnv_by_patient = cnv.transpose()

    cancer_type = util.get_cancer_type(copy_number)

    if interesting_genes is None:
        # No filter requested: keep every gene column.
        cnv = cnv_by_patient
    else:
        relevant_genes = list('\'' + interesting_genes.index)
        cnv = cnv_by_patient[relevant_genes]

    cnv = cnv.join(clinical_data, how='inner')

    results = []
    # Snapshot the columns: '<gene>_split' columns are added inside the
    # loop and must not be treated as genes themselves.
    for gene in list(cnv.columns):
        if gene in ('time', 'censor'):  # skip metadata
            continue
        if cnv[gene].count() <= 10:
            continue

        split_col = gene + '_split'
        cnv[split_col] = np.nan
        # The assignments run in order and later ones overwrite earlier
        # ones at the shared boundaries, so the net result is:
        #   value < -0.3 -> -1, -0.3 <= value < 0.3 -> 0, value >= 0.3 -> 1.
        # Missing values stay NaN.
        cnv.loc[cnv[gene] <= -0.3, split_col] = -1
        cnv.loc[cnv[gene].between(-0.3, 0.3), split_col] = 0
        cnv.loc[cnv[gene] >= 0.3, split_col] = 1

        cox_dict = analysis.do_cox(cnv.time, cnv.censor, cnv[split_col])
        cox_dict['gene'] = gene
        cox_dict['cancer_type'] = cancer_type
        results.append(cox_dict)

    cnv.to_csv(os.path.join(outdir, cancer_type + '_trichotomized.csv'))
    return results
# Example #11
# 0
def do_single_cancer_type_cna(name, clinical, cna, outdir):
    """Write per-gene Cox results for one cancer type's CNA table.

    Args:
        name: cancer type name; spaces become '-' in the output file name.
        clinical: DataFrame with 'Time' and 'Censor' columns, indexed by
            the labels that ``cna``'s columns use.
        cna: DataFrame that is transposed before joining with ``clinical``
            on the index. It may carry 'Chromosome' and 'Location' columns.
        outdir: directory for '<name>.cnas.out.csv'.

    Genes for which ``analysis.do_cox`` raises an R runtime error are
    reported on stdout and left out of the file.
    """
    cna = cna.T
    # After the transpose, 'Chromosome' and 'Location' are row labels, so
    # they must be looked up in the index. The previous check looked at the
    # columns, which meant the annotation rows were never dropped.
    annotation_rows = [label for label in ('Chromosome', 'Location')
                       if label in cna.index]
    if annotation_rows:
        cna = cna.drop(annotation_rows)
    print('Patient count for CNAs: %s' % (cna.shape,))

    cnas_and_clinical = cna.join(clinical, how='inner')
    num_patients = cnas_and_clinical.shape[0]
    print('num patients: %s' % (num_patients,))
    formatstring = '{0}, {1}, {2}, {3}\n'

    outfile = os.path.join(outdir, name.replace(' ', '-') + '.cnas.out.csv')
    print(outfile)
    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num patients\n')
        for gene in cnas_and_clinical:
            if gene in ['Time', 'Censor']:
                continue
            try:
                cox_dict = analysis.do_cox(cnas_and_clinical.Time,
                                           cnas_and_clinical.Censor,
                                           cnas_and_clinical[gene])
                out.write(formatstring.format(
                    gene, cox_dict['z'], cox_dict['p'], cox_dict['n']))
            except rpy2.rinterface.RRuntimeError:
                print('Skipped  %s due to R error.' % (gene,))
# Example #12
# 0
def calculate_cox(data, gene):
    """Run ``analysis.do_cox`` for one gene column, tolerating failures.

    Args:
        data: DataFrame with 'time' and 'censor' columns plus ``gene``.
        gene: label of the column to analyse.

    Returns:
        The ``analysis.do_cox`` result. If the column has 10 or fewer
        non-null values, or R raises a runtime error, a defaultdict that
        yields NaN for every key is returned instead.
    """
    fallback = collections.defaultdict(lambda: np.nan)
    if data[gene].count() <= 10:
        return fallback
    try:
        return analysis.do_cox(data.time, data.censor, data[gene])
    except rpy2.rinterface.RRuntimeError:
        print('WARN: skipped %s due to R error' % (gene,))
    return fallback
def calculate_any_change_zscores(input_file):
    """Run a Cox model on whether 'continuous_len' is present.

    Reads ``input_file`` as CSV (first column as index), drops rows missing
    'time', 'censor' or 'copy number', and flags each remaining row as
    changed when its 'continuous_len' is not NaN.

    Returns:
        The ``analysis.do_cox`` result with an added 'any_change_count'.
    """
    table = pd.read_csv(input_file, index_col=0)
    table = table.dropna(subset=['time', 'censor', 'copy number'], how='any')
    print(table.shape)

    table['any_change'] = ~np.isnan(table.continuous_len)
    result = analysis.do_cox(table.time, table.censor, table['any_change'])
    result['any_change_count'] = table.any_change.sum()
    print(result)
    return result
def calculate_broad_change_zscores(input_file):
    """Run a Cox model on whether 'continuous_len' exceeds FOCAL_CUTOFF.

    Reads ``input_file`` as CSV (first column as index) and drops rows
    missing 'time', 'censor' or 'copy number' before the comparison.

    Returns:
        The ``analysis.do_cox`` result with an added 'broad_count'.
    """
    table = pd.read_csv(input_file, index_col=0)
    table = table.dropna(subset=['time', 'censor', 'copy number'], how='any')

    table['broad'] = table.continuous_len > FOCAL_CUTOFF
    result = analysis.do_cox(table.time, table.censor, table['broad'])
    result['broad_count'] = table.broad.sum()
    print(result)

    return result
def make_zscores(data, clinical, outdir):
    """Write per-gene mutation Cox results for one clinical subtype file.

    Args:
        data: path to the mutation file.
        clinical: path to a clinical CSV with 'time' and 'censor' columns.
            The second '.'-separated piece of the path is used as the
            subtype label, and the last column is treated as metadata.
        outdir: directory for '<cancer_type>_<subtype>_zscores.csv'.

    A gene is analysed when its mutation count is at least
    MUTATION_PERCENT * number of patients present in both inputs.
    """
    subtype = clinical.split('.')[1]
    clinical_data = pd.read_csv(clinical, index_col=0, header=0)
    clinical_data = clinical_data.dropna(subset=['time', 'censor'], how='any')
    subtype_col = clinical_data.columns[-1]

    cancer_type = util.get_cancer_type(data)
    df = mb.prep_mutation_data(data, clinical_data)

    print(cancer_type)
    num_patients = len(set(clinical_data.index) & set(df.index))
    print('Number of patients present in both: %s' % (num_patients,))

    merged = df.join(clinical_data, how='inner')
    print('Num patients, other count: %s' % (len(df.index),))

    outfile = os.path.join(outdir,
                           cancer_type + '_' + subtype + '_zscores.csv')
    row_format = '{0}, {1}, {2}, {3}, {4}\n'
    metadata_columns = ('time', 'censor', 'index', subtype_col)

    zscore_count = 0
    zscore_skipped = 0
    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num patients,num mutations\n')
        for gene in merged:
            if gene in metadata_columns:
                continue
            num_mutations = merged[gene].sum()
            if not (num_mutations >= MUTATION_PERCENT * num_patients):
                zscore_skipped += 1
                continue
            try:
                cox_dict = analysis.do_cox(merged.time, merged.censor,
                                           merged[gene])
                out.write(row_format.format(gene, cox_dict['z'],
                                            cox_dict['p'], cox_dict['n'],
                                            num_mutations))
                zscore_count += 1
            except rpy2.rinterface.RRuntimeError:
                print('WARN: skipped  %s  due to R error' % (gene,))
                zscore_skipped += 1

        # The total subtracts 3 for time, censor and index.
        print('Total: %s' % (merged.shape[1] - 3,))
        print('Output length: %s' % (zscore_count,))
        print('Skipped: %s' % (zscore_skipped,))
# Example #16
# 0
def calculate_zscores_for_file(mutation_file, clinical_file, outdir, hgnc):
    """Write per-gene mutation Cox results for one ICGC mutation file.

    The data from ``prep_data`` is grouped by the 'gene_affected' index
    level. A gene is analysed when its number of unique 'icgc_donor_id'
    values is at least MUTATION_PERCENT * ``num_patients``. For each
    analysed gene a row is written to the summary CSV and the
    Time/Censor/mutated table is saved as '<gene>_data.csv' in ``outdir``.
    """
    prepped = prep_data(mutation_file, clinical_file, hgnc)
    df, clinical_data_with_sequenced_patients, num_patients = prepped

    cancer_type = get_icgc_cancer_type(mutation_file)
    print(cancer_type)
    row_format = '{0}, {1}, {2}, {3}, {4}\n'
    outfile = os.path.join(
        outdir, cancer_type + '_mutation_percent_' + str(MUTATION_PERCENT) +
        '.icgc_zscores.out.csv')
    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num mutations,num patients\n')

        for gene, gene_df in df.groupby(level=u'gene_affected'):
            donors = gene_df.index.get_level_values('icgc_donor_id').unique()
            num_mutations = len(donors)
            if not (num_mutations >= MUTATION_PERCENT * num_patients):
                continue

            # Mutated donors get 1; the right join adds every other
            # sequenced patient, whose missing flag is filled with 0.
            analysis_data = pd.DataFrame({'mutated': np.ones(num_mutations)},
                                         index=donors)
            analysis_data = analysis_data.join(
                clinical_data_with_sequenced_patients, how='right')
            analysis_data['mutated'].fillna(0, inplace=True)

            print('Doing analysis for %s: mutated %d of %d' % (
                gene, num_mutations, num_patients))

            cox_dict = analysis.do_cox(analysis_data['Time'],
                                       analysis_data['Censor'],
                                       analysis_data['mutated'])
            if cox_dict['n'] != len(analysis_data['Time']):
                print('ERROR')

            # Prefix the name with a quote unless it already has one; the
            # data file name uses the name without that first character.
            quoted = gene if gene[0] == '\'' else '\'' + gene
            out.write(row_format.format(quoted, cox_dict['z'], cox_dict['p'],
                                        num_mutations, cox_dict['n']))
            analysis_data.to_csv(
                os.path.join(outdir, quoted[1:] + '_data.csv'),
                columns=['Time', 'Censor', 'mutated'],
                index_label='patient')
# Example #17
# 0
def make_zscores(copy_number, clinical, outdir, metagene_file=None):
    """Write per-gene copy-number Cox results, optionally with a metagene.

    Args:
        copy_number: path to a CSV with 'Symbol', 'Chromosome' and
            'Location' columns; the remaining columns are joined against
            the clinical index after transposing.
        clinical: path to the clinical file.
        outdir: output directory.
        metagene_file: when given, ``analysis.do_metagene_cox`` is used and
            '<cancer_type>_metagene_zscores.csv' is written with two extra
            metagene columns; otherwise ``analysis.do_cox`` is used and
            '<cancer_type>_zscores.csv' is written.

    Genes with 10 or fewer non-null values are skipped.
    """
    clinical_data = util.get_clinical_data(clinical)

    cnv = pd.read_csv(copy_number).drop(['Chromosome', 'Location'], axis=1)
    by_patient = cnv.transpose()
    by_patient.columns = by_patient.loc['Symbol']
    merged = by_patient.join(clinical_data, how='inner')

    cancer_type = util.get_cancer_type(copy_number)
    if metagene_file:
        row_format = '{0}, {1}, {2}, {3}, {4}, {5}\n'
        header = ('gene,zscore,pvalue,metagene-zscore,metagene-pvalue,'
                  'num patients\n')
        outfile = os.path.join(outdir, cancer_type + '_metagene_zscores.csv')

        print("Processing metagene...")
        metagene = metagene_lib.get_metagene_data(metagene_file, cancer_type)
        print("Complete")
    else:
        row_format = '{0}, {1}, {2}, {3}\n'
        header = 'gene,zscore,pvalue,num patients\n'
        outfile = os.path.join(outdir, cancer_type + '_zscores.csv')

    with open(outfile, 'w') as out:
        out.write(header)
        for gene in merged:
            if gene in ('time', 'censor'):  # clinical columns, not genes
                continue
            if merged[gene].count() <= 10:
                continue
            if metagene_file:
                cox_dict = analysis.do_metagene_cox(
                    merged.time, merged.censor, merged[gene], metagene)
                fields = (gene, cox_dict['z'], cox_dict['p'],
                          cox_dict['metagene-z'], cox_dict['metagene-p'],
                          cox_dict['n'])
            else:
                cox_dict = analysis.do_cox(merged.time, merged.censor,
                                           merged[gene])
                fields = (gene, cox_dict['z'], cox_dict['p'], cox_dict['n'])
            out.write(row_format.format(*fields))
def calculate_broad_change_restricted_zscores(input_file):
    """Run the broad-change Cox model after removing focal-change rows.

    Reads ``input_file`` as CSV (first column as index), drops rows missing
    'time', 'censor' or 'copy number', then drops every row whose
    'continuous_len' is at or below FOCAL_CUTOFF. The remaining rows are
    flagged as broad when 'continuous_len' exceeds FOCAL_CUTOFF.

    Returns:
        The ``analysis.do_cox`` result with an added 'broad_count'.
    """
    table = pd.read_csv(input_file, index_col=0)
    table = table.dropna(subset=['time', 'censor', 'copy number'], how='any')
    print(table.shape)

    # Ignore patients that have a focal change.
    focal_rows = table[table.continuous_len <= FOCAL_CUTOFF].index
    table = table.drop(focal_rows)

    table['broad'] = table.continuous_len > FOCAL_CUTOFF
    result = analysis.do_cox(table.time, table.censor, table['broad'])
    result['broad_count'] = table.broad.sum()
    print(result)
    return result
def do_single_cancer_type_mutation(cancer_type, cancer_type_clinical,
                                   mutation_file, name_conversions, outdir):
    """Write per-gene mutation Cox results for one cancer type.

    Args:
        cancer_type: name used in output file names (spaces become '-' in
            the summary file name only).
        cancer_type_clinical: DataFrame with 'Time' and 'Censor' columns,
            indexed by patient.
        mutation_file: DataFrame whose columns include patient labels; it
            is restricted to patients also in the clinical index and then
            transposed.
        name_conversions: DataFrame indexed by gene name with a 'TCGA'
            column; genes found in its index are written under that name.
        outdir: base directory; files go into its 'mutations' folder.

    A gene is analysed when its mutation count is at least
    MUTATION_PERCENT * number of joined patients.
    """
    patients_in_both = list(
        set(mutation_file.columns).intersection(set(
            cancer_type_clinical.index)))
    mutations = mutation_file[patients_in_both].T
    print('Patient count for Mutations: %s' % (mutations.shape,))
    merged = mutations.join(cancer_type_clinical)
    row_format = '{0}, {1}, {2}, {3}, {4}\n'
    num_patients = merged.shape[0]
    outfile = os.path.join(outdir, 'mutations',
                           cancer_type.replace(' ', '-') + '.out.csv')
    print(outfile)
    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num mutations,num patients\n')
        for orig_gene in merged:
            if orig_gene in ['Time', 'Censor']:
                continue
            num_mutations = merged[orig_gene].sum()
            if not (num_mutations >= MUTATION_PERCENT * num_patients):
                continue
            # Name reported in the output; replaced if a conversion exists.
            label = orig_gene
            try:
                cox_dict = analysis.do_cox(merged.Time, merged.Censor,
                                           merged[orig_gene])
                if orig_gene in name_conversions.index:
                    converted = name_conversions['TCGA'].loc[orig_gene]
                    print('Converting gene %s to %s' % (orig_gene, converted))
                    label = converted
                out.write(row_format.format(label, cox_dict['z'],
                                            cox_dict['p'], num_mutations,
                                            cox_dict['n']))
                merged.to_csv(
                    os.path.join(outdir, 'mutations/',
                                 cancer_type + '_' + label + '_mutations.csv'),
                    columns=['Time', 'Censor', orig_gene])
            except rpy2.rinterface.RRuntimeError:
                print('Skipped  %s due to R error.' % (label,))
def main():
    """Re-run the Cox model on every saved '*_TP53_data.csv' file.

    Files are searched one directory level below the input directory; the
    name of each file's parent directory is used as its cancer type. The
    results, indexed by cancer type, are written to
    'tcga_p53_mutation_zscores.csv' in the output directory.
    """
    indir, outdir = get_options()

    print(os.path.join(indir, '*' + 'TP53' + '*'))
    pattern = os.path.join(indir, '*', '*' + '_TP53_data.csv')

    results = []
    for path in glob.glob(pattern):
        print(path)
        table = pd.read_csv(path, index_col=0)
        cox_dict = analysis.do_cox(table.time, table.censor, table.mutated)
        cox_dict['cancer_type'] = os.path.basename(os.path.dirname(path))
        results.append(cox_dict)

    results_df = pd.DataFrame(results)
    print(results_df)
    indexed = results_df.set_index('cancer_type')
    indexed.to_csv(os.path.join(outdir, 'tcga_p53_mutation_zscores.csv'))
def make_zscores(data, clinical, hypermutated_patients, outdir):
    """Write per-gene mutation Cox results, excluding hypermutated patients.

    Args:
        data: path to the mutation file.
        clinical: path to the clinical file.
        hypermutated_patients: object whose 'patients' entry lists the
            patients to exclude; those present in the clinical index are
            dropped before the mutation data is prepared.
        outdir: directory for '<cancer_type>_non-hypermutated_zscores.csv'.

    A gene is analysed when its mutation count is at least
    MUTATION_PERCENT * number of patients present in both inputs.
    """
    clinical_data = util.get_clinical_data(clinical)
    hypermutated = set(clinical_data.index).intersection(
        hypermutated_patients['patients'])
    print('Hypermutated in clinical file: %s' % (len(hypermutated),))
    clinical_data = clinical_data.drop(hypermutated)

    cancer_type = util.get_cancer_type(data)
    df = mb.prep_mutation_data(data, clinical_data)

    print('Remaining hypermutated: %s'
          % (set(df.index).intersection(hypermutated),))
    num_patients = len(set(clinical_data.index) & set(df.index))
    print('Number of patients present in both: %s' % (num_patients,))

    merged = df.join(clinical_data, how='inner')
    print('Num patients, other count: %s' % (len(df.index),))

    outfile = os.path.join(outdir,
                           cancer_type + '_non-hypermutated_zscores.csv')
    row_format = '{0}, {1}, {2}, {3}, {4}\n'

    # These counters are maintained but not reported by this function.
    zscore_count = 0
    zscore_skipped = 0
    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num patients,num mutations\n')
        for gene in merged:
            if gene in ('time', 'censor', 'index'):  # skip metadata
                continue
            num_mutations = merged[gene].sum()
            if not (num_mutations >= MUTATION_PERCENT * num_patients):
                zscore_skipped += 1
                continue
            try:
                cox_dict = analysis.do_cox(merged.time, merged.censor,
                                           merged[gene])
                out.write(row_format.format(gene, cox_dict['z'],
                                            cox_dict['p'], cox_dict['n'],
                                            num_mutations))
                zscore_count += 1
            except rpy2.rinterface.RRuntimeError:
                print('WARN: skipped  %s  due to R error' % (gene,))
                zscore_skipped += 1
# Example #22
# 0
def do_single_cancer_type_mutation(name, clinical, mutations, outdir):
  mutations_and_clinical = mutations.join(clinical, how='inner')
  formatstring = '{0}, {1}, {2}, {3}, {4}\n'
  num_patients = mutations_and_clinical.shape[0]
  print 'Number of patients in both:', num_patients
  outfile = os.path.join(outdir, name.replace(' ', '-') + '.mutations.out.csv')
  print outfile
  with open(outfile, 'w') as out:
    out.write('gene,zscore,pvalue,num mutations,num patients\n')
    for gene in mutations_and_clinical:
        if gene in ['Time', 'Censor']:
          continue
        if mutations_and_clinical[gene].sum() >= MUTATION_PERCENT*num_patients:
          try:
            cox_dict = analysis.do_cox(mutations_and_clinical.Time,
                                       mutations_and_clinical.Censor,
                                       mutations_and_clinical[gene])
            out.write(formatstring.format(
                          gene, cox_dict['z'], cox_dict['p'], mutations_and_clinical[gene].sum(), cox_dict['n']))
            mutations_and_clinical.to_csv(os.path.join(outdir, 'raw_mutations/', name + '_' +  gene + '_mutations.csv'), columns=['Time', 'Censor', gene])
          except rpy2.rinterface.RRuntimeError as e:
            print 'Skipped ', gene, 'due to R error.'
Пример #23
0
def make_cnv_zscores(copy_number, clinical, gene_list):
    """Run a Cox analysis on each listed gene's copy number values.

    The copy number CSV is transposed and its columns are relabelled from the
    'Symbol' row; only the genes in ``gene_list`` are kept before an inner
    join with the clinical data.

    Args:
        copy_number: path to the copy number CSV; also passed to
            ``util.get_cancer_type``.
        clinical: clinical input handed to ``util.get_clinical_data``.
        gene_list: gene symbols to select from the copy number data.

    Returns:
        A DataFrame with one row per analysed column, holding the entries of
        the dict returned by ``analysis.do_cox`` plus 'cancer_type' and
        'gene'.
    """
    cancer_type = util.get_cancer_type(copy_number)

    cna = pd.read_csv(copy_number)
    cna_by_patient = cna.transpose()
    cna_by_patient.columns = cna_by_patient.loc['Symbol']
    cna_by_patient_gene_list_only = cna_by_patient[gene_list]

    clinical_data = util.get_clinical_data(clinical)
    clinical_and_cnv = cna_by_patient_gene_list_only.join(clinical_data,
                                                          how='inner')

    # Collect plain dicts and build the frame once; calling DataFrame.append
    # inside the loop recopies the whole frame on every iteration.
    rows = []
    for gene in clinical_and_cnv:
        if gene in ('time', 'censor'):
            continue
        cox_dict = analysis.do_cox(clinical_and_cnv.time,
                                   clinical_and_cnv.censor,
                                   clinical_and_cnv[gene])
        cox_dict['cancer_type'] = cancer_type
        cox_dict['gene'] = gene
        rows.append(cox_dict)
    return pd.DataFrame(rows)
def calculate_zscores_for_file(mutation_file, clinical_data, gene_list,
                               cancer_type):
    df, clinical_data_with_sequenced_patients, num_patients = prep_data(
        mutation_file, clinical_data)
    df = df[df.index.get_level_values(0).isin(gene_list)]

    #for every gene, collect the clinical data with the mutation data.
    patients_with_gene = df.groupby(level=u'Hugo_Symbol')
    results = []
    for gene, gene_df in patients_with_gene:
        mutated_patient_list = gene_df.index.get_level_values(
            'Tumor_Sample_Barcode').unique()
        num_mutations = len(mutated_patient_list)

        # take the patients with mutations and without, and build an analysis dataframe with time and censor.
        analysis_data = pd.DataFrame({'mutated': np.ones(num_mutations)},
                                     index=mutated_patient_list)
        analysis_data = analysis_data.join(
            clinical_data_with_sequenced_patients, how='right')
        analysis_data['mutated'].fillna(0, inplace=True)

        #Do analysis!
        print 'Doing analysis for %s: mutated %d of %d' % (gene, num_mutations,
                                                           num_patients)
        time = analysis_data['Time']
        censor = analysis_data['Censor']
        split = analysis_data['mutated']

        cox_dict = analysis.do_cox(time, censor, split)
        cox_dict['gene'] = gene
        cox_dict['num_mutations'] = num_mutations
        if cox_dict['n'] != len(analysis_data['Time']):
            print 'ERROR'
        if gene[0] != '\'':
            gene = '\'' + gene
        results.append(cox_dict)
    return results
def calculate_cox(mutation, clinical_data, outdir):
    df = prep_data(mutation, clinical_data)
    df = df.join(clinical_data, how='inner')
    num_patients = len(df.index)

    #prep output file
    cancer_type = os.path.basename(mutation).split('_')[0].split('.')[0]
    print cancer_type
    outfile = os.path.join(
        outdir, cancer_type + '.driver_mutation_count.zscores.out.csv')

    print 'Missing driver genes:', set(COMMONLY_MUTATED) - set(df.columns)

    present_driver_genes = list(
        set(df.columns).intersection(set(COMMONLY_MUTATED)))
    print present_driver_genes
    driver_mutations = df[present_driver_genes]
    print driver_mutations
    driver_mutations['driver_mutation_count'] = driver_mutations.sum(
        axis=1, skipna=True)
    driver_mutations['time'] = df['time']
    driver_mutations['censor'] = df['censor']

    analysis_data = pd.DataFrame()
    analysis_data['time'] = driver_mutations['time']
    analysis_data['censor'] = driver_mutations['censor']
    analysis_data['driver_mutation_count'] = driver_mutations[
        'driver_mutation_count']

    #Do analysis!
    cox_dict = analysis.do_cox(analysis_data['time'], analysis_data['censor'],
                               analysis_data['driver_mutation_count'])
    with open(outfile, 'w') as out:
        out.write('Z: ' + str(cox_dict['z']) + ', P: ' + str(cox_dict['p']) +
                  ', n: ' + str(cox_dict['n']) + '\n')
        driver_mutations.to_csv(out)
    return cox_dict
def do_single_cancer_type_cna(cancer_type, cancer_type_clinical, cna_file,
                              name_conversions, outdir):
    print 'Duplicate count:', cancer_type_clinical.index.duplicated(
        keep='first').sum()
    cancer_type_cnas = cna_file[cancer_type_clinical.index].T
    print 'Patient count for CNAs:', cancer_type_cnas.shape
    cancer_type_cnas_and_clinical = cancer_type_cnas.join(cancer_type_clinical)
    print cancer_type_cnas_and_clinical.shape
    num_patients = cancer_type_cnas_and_clinical.shape[0]
    print 'num patients:', num_patients
    formatstring = '{0}, {1}, {2}, {3}\n'

    outfile = os.path.join(outdir, 'cnas',
                           cancer_type.replace(' ', '-') + '.out.csv')
    print outfile
    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num patients\n')
        for gene in cancer_type_cnas_and_clinical:
            if gene in ['Time', 'Censor']:
                continue
            number_non_zero = cancer_type_cnas_and_clinical[
                cancer_type_cnas_and_clinical[gene] != 0][gene].shape[0]
            try:
                # print cancer_type_cnas_and_clinical[['Time', 'Censor', gene]]
                cox_dict = analysis.do_cox(
                    cancer_type_cnas_and_clinical.Time,
                    cancer_type_cnas_and_clinical.Censor,
                    cancer_type_cnas_and_clinical[gene])
                if gene in name_conversions.index:
                    print 'Converting gene', gene, 'to', name_conversions[
                        'TCGA'].loc[gene]
                    gene = name_conversions['TCGA'].loc[gene]
                out.write(
                    formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                        cox_dict['n']))
            except rpy2.rinterface.RRuntimeError as e:
                print 'Skipped ', gene, 'due to R error.'
def make_zscores(copy_number, clinical_data, outdir):
    """Write per-gene Cox results for one copy number file.

    The CSV is transposed, its columns are relabelled from the 'Symbol' row
    and values are clipped at an upper bound of 10 before an inner join with
    ``clinical_data``. Every column other than 'Time' and 'Censor' is passed
    to ``analysis.do_cox`` and one row per column is written to
    ``<cancer_type>_zscores.csv`` in ``outdir``.

    Args:
        copy_number: path to the comma-separated copy number file; also
            passed to ``util.get_cancer_type``.
        clinical_data: frame providing the 'Time' and 'Censor' columns.
        outdir: directory that receives the output CSV.
    """
    raw = pd.read_csv(copy_number, sep=',')
    by_patient = raw.transpose()
    by_patient.columns = by_patient.loc['Symbol']
    by_patient = by_patient.clip(upper=10)
    # NOTE: not used below.
    num_patients = by_patient.shape[0]
    clinical_and_cnv = by_patient.join(clinical_data, how='inner')

    cancer_type = util.get_cancer_type(copy_number)

    row_template = '{0}, {1}, {2}, {3}\n'
    outfile = os.path.join(outdir, cancer_type + '_zscores.csv')

    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num patients\n')
        for gene in clinical_and_cnv:
            if gene in ('Time', 'Censor'):
                # skip metadata
                continue
            gene_values = clinical_and_cnv[gene]
            # NOTE: tallied for every gene but not written to the output.
            num_with_copy_number = (gene_values != 0).sum()
            cox_dict = analysis.do_cox(clinical_and_cnv.Time,
                                       clinical_and_cnv.Censor,
                                       clinical_and_cnv[gene])
            out.write(row_template.format(gene, cox_dict['z'], cox_dict['p'],
                                          cox_dict['n']))
Пример #28
0
def calculate_cox(mutation,
                  clinical,
                  outdir,
                  metagene_file=None,
                  make_km=False):
    clinical_data = util.get_clinical_data(clinical)
    df = mutation_base.prep_mutation_data(mutation, clinical_data)
    clinical_and_data = df.join(clinical_data, how='inner')
    num_patients = len(clinical_and_data)

    #prep output file
    cancer_type = os.path.basename(mutation).split('_')[0].split('.')[0]
    if metagene_file:
        formatstring = '{0}, {1}, {2}, {3}, {4}, {5}\n'
        outfile = os.path.join(
            outdir, cancer_type + '_mutation-fraction-' +
            str(MUTATION_PERCENT) + '_metagene_zscores.csv')

        print "Processing metagene..."
        metagene = metagene_lib.get_metagene_data(metagene_file, cancer_type)
        print "Complete"
    else:
        outfile = os.path.join(
            outdir, cancer_type + '_mutation-fraction-' +
            str(MUTATION_PERCENT) + '.zscores.out.csv')
        formatstring = '{0}, {1}, {2}, {3}, {4}\n'

    with open(outfile, 'w') as out:
        if metagene_file:
            out.write(
                'gene,zscore,pvalue,metagene-zscore,metagene-pvalue,num patients\n'
            )
        else:
            out.write('gene,zscore,pvalue,num mutations,num patients\n')

        for gene in clinical_and_data:
            if gene in ['time', 'censor']:
                continue

            num_mutations = int(clinical_and_data[gene].sum())
            if num_mutations >= MUTATION_PERCENT * num_patients:
                time = clinical_and_data['time']
                censor = clinical_and_data['censor']
                data = clinical_and_data[gene]

                if metagene_file:
                    cox_dict = analysis.do_metagene_cox(
                        time, censor, data, metagene)
                    out.write(
                        formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                            cox_dict['metagene-z'],
                                            cox_dict['metagene-p'],
                                            cox_dict['n']))
                else:
                    name = cancer_type + '_' + gene
                    if make_km:
                        analysis.do_km(name, time, censor, data, outdir)
                        clinical_and_data['time', 'censor', gene].to_csv(
                            os.path.join(outdir, name + '_data.csv'),
                            columns=['time', 'censor', 'mutated'])

                    cox_dict = analysis.do_cox(time, censor, data)
                    out.write(
                        formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                            num_mutations, cox_dict['n']))
def calculate_cox(mutation, clinical_data, key, outdir):
    df, clinical_data_with_sequenced_patients, num_patients = prep_data(
        mutation, clinical_data, key)

    #prep output file
    cancer_type = os.path.basename(mutation).split('_')[0].split('.')[0]
    print cancer_type
    outfile = os.path.join(
        outdir, (cancer_type + '_mutation-fraction-' + str(MUTATION_PERCENT) +
                 '_vaf_cutoff-' + str(VARIANT_ALLELE_FREQ_CUTOFF) +
                 '.zscores.out.csv'))
    formatstring = '\'{0}, {1}, {2}, {3}, {4}\n'

    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num mutations,num patients\n')

        #for every gene, collect the clinical data with the mutation data.
        #  only for non-silent mutations
        patients_with_gene = df.groupby(level=u'Hugo_Symbol')
        for gene, gene_df in patients_with_gene:
            # Remove silent mutations
            non_silent = gene_df.where(
                gene_df[u'Variant_Classification'] != 'Silent')
            non_silent = non_silent.dropna(subset=[u'Variant_Classification'])
            mutated_patient_list = non_silent.index.get_level_values(
                'identifier').unique()

            num_mutations = len(mutated_patient_list)

            if num_mutations >= MUTATION_PERCENT * num_patients:
                # Get "effectively mutated" patients: those who's VAF >= median
                median_vaf = non_silent['VAF'].median()
                greater_than_median = non_silent[
                    non_silent['VAF'] >= median_vaf]
                effectively_mutated_patients = greater_than_median.index.get_level_values(
                    'identifier').unique()
                num_effective_mutations = len(effectively_mutated_patients)

                # take the patients with mutations and without, and build an analysis dataframe with time and censor.
                analysis_data = pd.DataFrame(
                    {'mutated': np.ones(num_effective_mutations)},
                    index=effectively_mutated_patients)
                analysis_data = analysis_data.join(
                    clinical_data_with_sequenced_patients, how='right')
                analysis_data['mutated'].fillna(0, inplace=True)

                #Do analysis!
                print 'Doing analysis for ', gene, num_mutations
                time = analysis_data['time']
                censor = analysis_data['censor']
                split = analysis_data['mutated']

                name = cancer_type + '_' + gene
                analysis.do_km(name, time, censor, split, outdir)
                cox_dict = analysis.do_cox(time, censor, split)
                if cox_dict['n'] != len(analysis_data['time']):
                    print 'ERROR'
                out.write(
                    formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                        num_mutations, cox_dict['n']))
                analysis_data.to_csv(os.path.join(outdir, name + '_data.csv'),
                                     columns=['time', 'censor', 'mutated'])