Exemplo n.º 1
def get_gistic_genes(data_path, cancer, filter_with_rna=True,
                     collapse_on_bands=True, min_patients=5):
    """
    Get a matrix of events for high grade amplifications and homozygous
    deletions.

    We filter down this list by asserting that a copy number event corresponds
    with a resultant expression change.
    The final matrix merges gene-level events on the same band to combine
    redundant events and reduce the test space.

    Parameters:
        data_path, cancer: forwarded to FH.get_gistic_gene_matrix and
            FH.read_rnaSeq.
        filter_with_rna: when true, both event tables are passed through
            rna_filter together with the RNA-seq matrix.
        collapse_on_bands: when false, the un-merged gene-level table is
            returned.
        min_patients: a row is kept only if strictly more than this many of
            its entries equal the event code (2 or -2).

    Returns:
        DataFrame of amplification rows followed by deletion rows.  Without
        collapsing, the index is (event label, s[0], s[2]) taken from each
        original index entry s (presumably cytoband and gene -- TODO
        confirm).  With collapsing, rows sharing the first two levels are
        replaced by their rounded column-wise mean and the third level
        becomes the tuple of merged level-2 labels.
    """
    gistic = FH.get_gistic_gene_matrix(data_path, cancer, '01')

    def _select_events(code, label):
        # Keep rows with more than min_patients entries equal to `code` and
        # re-key them as (label, s[0], s[2]).
        hits = gistic[(gistic == code).sum(1) > min_patients]
        hits.index = pd.MultiIndex.from_tuples(
            [(label, s[0], s[2]) for s in hits.index])
        return hits

    deletion = _select_events(-2, 'Deletion')
    amp = _select_events(2, 'Amplification')

    if filter_with_rna:
        rna = FH.read_rnaSeq(data_path, cancer)
        deletion = rna_filter(deletion, -2, rna)
        amp = rna_filter(amp, 2, rna)

    # pd.concat replaces DataFrame.append, which newer pandas no longer has.
    cna_genes = pd.concat([amp, deletion])
    if not collapse_on_bands:
        return cna_genes

    # One output row per (event label, level-1 value); the merged level-2
    # labels are kept as a tuple in the third index level.
    collapsed = {}
    for (event, band), rows in cna_genes.groupby(level=[0, 1]):
        merged = tuple(rows.index.get_level_values(2))
        collapsed[(event, band, merged)] = rows.mean().round()
    cna_genes = pd.DataFrame(collapsed).T
    cna_genes.index = pd.MultiIndex.from_tuples(cna_genes.index)
    return cna_genes
Exemplo n.º 2
def get_gistic_genes(data_path, cancer, filter_with_rna=True,
                     collapse_on_bands=True, min_patients=5):
    """
    Get a matrix of events for high grade amplifications and homozygous
    deletions.

    We filter down this list by asserting that a copy number event corresponds
    with a resultant expression change.
    The final matrix merges gene-level events on the same band to combine
    redundant events and reduce the test space.

    Parameters:
        data_path, cancer: forwarded to FH.get_gistic_gene_matrix and
            FH.read_rnaSeq.
        filter_with_rna: when true, both event tables are passed through
            rna_filter together with the RNA-seq matrix.
        collapse_on_bands: when false, the un-merged gene-level table is
            returned.
        min_patients: a row is kept only if strictly more than this many of
            its entries equal the event code (2 or -2).

    Returns:
        DataFrame of amplification rows followed by deletion rows, indexed
        by (event label, s[0], s[2]) of each original index entry s.  When
        collapsing, rows sharing the first two levels are replaced by their
        rounded column-wise mean and the third level becomes the tuple of
        merged level-2 labels.

    NOTE(review): the parameter list and the value expression of the
    collapsing step were missing from this copy; both were restored on the
    assumption that defaults are filter_with_rna=True,
    collapse_on_bands=True, min_patients=5 and that the merged value is the
    rounded mean -- confirm against the upstream file.
    """
    gistic = FH.get_gistic_gene_matrix(data_path, cancer, '01')
    deletion = gistic[(gistic == -2).sum(1) > min_patients]
    amp = gistic[(gistic == 2).sum(1) > min_patients]

    # Re-key each table as (event label, s[0], s[2]).
    as_index = pd.MultiIndex.from_tuples
    deletion.index = as_index(
        [('Deletion', s[0], s[2]) for s in deletion.index])
    amp.index = as_index(
        [('Amplification', s[0], s[2]) for s in amp.index])

    if filter_with_rna:
        rna = FH.read_rnaSeq(data_path, cancer)
        deletion = rna_filter(deletion, -2, rna)
        amp = rna_filter(amp, 2, rna)

    # pd.concat replaces DataFrame.append, which newer pandas no longer has.
    cna_genes = pd.concat([amp, deletion])
    if not collapse_on_bands:
        return cna_genes

    cna_genes = pd.DataFrame(
        {(key[0], key[1], tuple(rows.index.get_level_values(2))):
         rows.mean().round()
         for key, rows in cna_genes.groupby(level=[0, 1])}).T
    cna_genes.index = pd.MultiIndex.from_tuples(cna_genes.index)
    return cna_genes
Exemplo n.º 3
def get_cna_rates(data_path, cancer, patients=None):
    """
    Get copy-number aberration rates from GISTIC processing pipeline.

    This function depends on the current Firehose output of this program
    as of July 2013.

    Parameters:
        data_path, cancer: forwarded to the FH.get_gistic_* readers.
        patients: optional collection of row labels; when given, the result
            is re-indexed to these labels and rows containing any NaN are
            dropped.

    Returns:
        DataFrame with one column per rate: gene-level and lesion-level
        counts of amplifications (>= 1), high amplifications (== 2),
        deletions (<= -1) and homozygous deletions, plus
        'chrom_instability', the mean absolute arm-level value.
    """
    gistic = FH.get_gistic_gene_matrix(data_path, cancer)
    lesions = FH.get_gistic_lesions(data_path, cancer)
    arm_cn = FH.get_gistic_arm_values(data_path, cancer)

    # .loc replaces the .ix indexer, which newer pandas no longer provides;
    # both select by label here.
    amp_lesions = lesions.loc['Amplification']
    del_lesions = lesions.loc['Deletion']

    cna_df = pd.DataFrame({
        'gene_amp': (gistic >= 1).astype(int).sum(),
        'gene_amp_high': (gistic == 2).astype(int).sum(),
        'gene_del': (gistic <= -1).astype(int).sum(),
        'gene_del_homo': (gistic <= -2).astype(int).sum(),
        'lesion_amp': (amp_lesions >= 1).sum(),
        'lesion_amp_high': (amp_lesions == 2).sum(),
        'lesion_del': (del_lesions <= -1).sum(),
        'lesion_del_homo': (del_lesions == -2).sum(),
        'chrom_instability': arm_cn.abs().mean(),
    })
    if patients is not None:
        # reindex yields NaN rows for labels that are absent, which dropna
        # then removes.
        # NOTE(review): assumes `patients` holds index labels, not positions.
        cna_df = cna_df.reindex(patients).dropna()
    return cna_df
Exemplo n.º 4
def get_global_vars(data_path, cancer, patients=None):
    """
    Get compiled DataFrame of global molecular variables from Firehose
    data.  Returns a feature by patient DataFrame with (data-type, variable)
    on the columns and patient barcodes on the index.

    The expression and methylation blocks each contribute the first two
    rows of the third array returned by frame_svd as 'pc1' and 'pc2'; copy
    number and mutation rates come from get_cna_rates and
    get_mutation_rates.  Columns that are entirely NaN are dropped.

    NOTE(review): the try/except headers were missing from this copy of the
    function (only their indented bodies survived).  They are reconstructed
    here as "fall back to an empty frame on failure"; the exception type
    and the final handler are assumptions -- confirm against upstream.
    """
    try:
        data_matrix = FH.read_rnaSeq(data_path, cancer, patients)
        U, S, vH = frame_svd(data_matrix)
        exp_pc = pd.DataFrame({'pc1': vH[0], 'pc2': vH[1]})
    except Exception:
        exp_pc = pd.DataFrame()

    try:
        data_matrix = read_methylation(data_path, cancer, patients)
        U, S, vH = frame_svd(data_matrix)
        meth_pc = pd.DataFrame({'pc1': vH[0], 'pc2': vH[1]})
    except Exception:
        meth_pc = pd.DataFrame()

    try:
        # The age-signal lookup is disabled; the placeholders below stand in
        # for its two results.
        meth_age, amar = 'FAIL', 'FAIL'
        # meth_age, amar = get_age_signal(data_path, cancer)
        meth_pc = meth_pc.join(meth_age).join(amar)
        # Parenthesised single-argument form prints identically under
        # Python 2 and Python 3.
        print('Should probably check this out')
    except Exception:
        # NOTE(review): presumed best-effort step; failure leaves meth_pc
        # unchanged.
        pass

    cna_rates = get_cna_rates(data_path, cancer, patients)
    mutation_rates = get_mutation_rates(data_path, cancer, patients)
    gv = pd.concat([exp_pc, meth_pc, cna_rates, mutation_rates],
                   keys=['mRNASeq', 'methylation', 'cna', 'mutation'],
                   axis=1)
    gv = gv.dropna(how='all', axis=1)
    return gv
Exemplo n.º 5
def get_gistic(data_path, cancer, filter_with_rna=True,
               collapse_on_bands=True, min_patients=5):
    """
    Get the combined GISTIC feature matrix for testing.

    Parameters:
        data_path, cancer: forwarded to FH.get_gistic_lesions and
            get_gistic_genes.
        filter_with_rna, collapse_on_bands, min_patients: forwarded
            unchanged to get_gistic_genes.

    Returns:
        DataFrame with the gene-level rows from get_gistic_genes followed by
        the lesion rows from FH.get_gistic_lesions.
    """
    # NOTE(review): this call previously passed (cancer, data_path).  Every
    # other FH.* call in this file passes data_path first, so the order was
    # aligned -- confirm against FH.get_gistic_lesions' signature.
    lesions = FH.get_gistic_lesions(data_path, cancer)
    cna_genes = get_gistic_genes(data_path, cancer, filter_with_rna,
                                 collapse_on_bands, min_patients)
    # pd.concat replaces DataFrame.append, which newer pandas no longer has.
    return pd.concat([cna_genes, lesions])
Exemplo n.º 6
 def __init__(self, run, cancer, cn_type, patients=None):
     # Initialise the Dataset base with cancer.path and cn_type, then, for
     # cn_type == 'CN_broad', load the GISTIC matrix into self.df and expose
     # it as self.features.  Reads run.parameters['min_patients'],
     # run.data_path and cancer.name.
     Dataset.__init__(self, cancer.path, cn_type, compressed=False)
     min_pat = run.parameters['min_patients']
     if cn_type == 'CN_broad':
         # NOTE(review): the call below is cut off in this copy -- its
         # remaining arguments and closing parenthesis are missing.
         # Presumably min_pat is passed on here (it is otherwise unused);
         # restore from the upstream file.
         self.df = FH.get_gistic(run.data_path, cancer.name, 
         if patients is not None:
             # Keep only the requested patient columns, then drop columns
             # (axis 1) that are entirely NaN.
             # NOTE(review): .ix is unavailable in newer pandas releases.
             self.df = self.df.ix[:, patients].dropna(1, how='all')
         self.features = self.df
Exemplo n.º 7
 def __init__(self, run, cancer, cn_type, patients=None):
     # Set up the Dataset base (cancer.path, cn_type, uncompressed); when
     # cn_type is 'CN_broad', self.df is filled from FH.get_gistic and also
     # assigned to self.features.
     Dataset.__init__(self, cancer.path, cn_type, compressed=False)
     min_pat = run.parameters['min_patients']
     if cn_type == 'CN_broad':
         # NOTE(review): truncated statement -- the argument list of this
         # call is incomplete in this copy and must be restored from the
         # upstream file (min_pat is presumably one of the missing
         # arguments).
         self.df = FH.get_gistic(run.data_path, cancer.name,
         if patients is not None:
             # Restrict to the given patient columns and discard columns
             # (axis 1) with no data at all.
             # NOTE(review): .ix is unavailable in newer pandas releases.
             self.df = self.df.ix[:, patients].dropna(1, how='all')
         self.features = self.df
Exemplo n.º 8
def get_gistic(data_path, cancer, filter_with_rna=True,
               collapse_on_bands=True, min_patients=5):
    """
    Get the combined GISTIC feature matrix for testing.

    The gene-level table from get_gistic_genes is stacked on top of the
    lesion table from FH.get_gistic_lesions; filter_with_rna,
    collapse_on_bands and min_patients are handed to get_gistic_genes
    untouched.

    NOTE(review): the parameter list was cut off after `data_path,` in this
    copy.  The remaining parameters are the names the body uses; the
    defaults (True, True, 5) are assumed -- confirm against upstream.
    """
    # NOTE(review): previously called with (cancer, data_path); swapped to
    # match the data_path-first order used by the other FH.* calls in this
    # file -- confirm against FH.get_gistic_lesions' signature.
    lesions = FH.get_gistic_lesions(data_path, cancer)
    cna_genes = get_gistic_genes(data_path, cancer, filter_with_rna,
                                 collapse_on_bands, min_patients)
    # pd.concat replaces DataFrame.append, which newer pandas no longer has.
    cna = pd.concat([cna_genes, lesions])
    return cna
Exemplo n.º 9
    def __init__(self, run, cancer, patients=None,
                 create_features=True, draw_figures=False):
        """
        Load the mutation matrix for one cancer and optionally build features.

        Reads run.data_path, run.parameters['min_patients'] and
        run.gene_sets, plus cancer.path and cancer.name.  When `patients` is
        given, self.df is restricted to those columns.
        """
        Dataset.__init__(self, cancer.path, 'Mutation', compressed=False)
        self.df = FH.get_mutation_matrix(run.data_path, cancer.name)
        if patients is not None:
            # Keep the requested patient columns; drop columns (axis 1) that
            # are entirely NaN.
            # NOTE(review): .ix is unavailable in newer pandas releases.
            self.df = self.df.ix[:, patients].dropna(1, how='all')

        if create_features is True:
            min_pat = run.parameters['min_patients']
            self._create_feature_matrix(run.gene_sets, min_pat)
        # NOTE(review): the body of the branch below is missing from this
        # copy; what is drawn cannot be determined from here.
        if draw_figures is True:
Exemplo n.º 10
def get_cna_rates(data_path, cancer, patients=None):
    """
    Get copy-number aberration rates from GISTIC processing pipeline.

    This function depends on the current Firehose output of this program
    as of July 2013.

    Counts are taken column-wise over the gene matrix and over the
    'Amplification' / 'Deletion' sections of the lesion table;
    'chrom_instability' is the mean absolute arm-level value.  When
    `patients` is given, the result is re-indexed to those labels and rows
    containing any NaN are dropped.
    """
    gistic = FH.get_gistic_gene_matrix(data_path, cancer)
    amp_gene_all = (gistic >= 1).astype(int).sum()
    amp_gene_high = (gistic == 2).astype(int).sum()
    del_gene_all = (gistic <= -1).astype(int).sum()
    del_gene_homo = (gistic <= -2).astype(int).sum()

    lesions = FH.get_gistic_lesions(data_path, cancer)
    # .loc replaces the .ix indexer, which newer pandas no longer provides;
    # both select by label here.
    amp_lesion_all = (lesions.loc['Amplification'] >= 1).sum()
    amp_lesion_high = (lesions.loc['Amplification'] == 2).sum()
    del_lesion_all = (lesions.loc['Deletion'] <= -1).sum()
    del_lesion_homo = (lesions.loc['Deletion'] == -2).sum()

    arm_cn = FH.get_gistic_arm_values(data_path, cancer)
    chromosomal_instability = arm_cn.abs().mean()

    cna_df = {
        'gene_amp': amp_gene_all,
        'gene_amp_high': amp_gene_high,
        'gene_del': del_gene_all,
        'gene_del_homo': del_gene_homo,
        'lesion_amp': amp_lesion_all,
        'lesion_amp_high': amp_lesion_high,
        'lesion_del': del_lesion_all,
        'lesion_del_homo': del_lesion_homo,
        'chrom_instability': chromosomal_instability,
    }
    cna_df = pd.DataFrame(cna_df)
    if patients is not None:
        # Absent labels become NaN rows under reindex and are then removed
        # by dropna.
        # NOTE(review): assumes `patients` holds index labels, not positions.
        cna_df = cna_df.reindex(patients).dropna()
    return cna_df
Exemplo n.º 11
def get_beta_values(data_path, cancer, patients=None, tissue_code='All'):
    """
    Retrieve methylation beta-values from my pre-processed file.

    TCGA has a lot more columns that eat up memory, so I parse out the
    beta-values in preprocess_methylation.py.
    This file still has all of the probes by patients, so it can still
    kill a computer without a lot of memory (takes ~2GB for HNSC).

    Parameters:
        data_path, cancer: used to build the path
            '<data_path>/ucsd_processing/<cancer>/methylation450/'.
        patients, tissue_code: forwarded to FH.fix_barcode_columns.

    Returns:
        Whatever FH.fix_barcode_columns returns for the loaded table, whose
        row index is ('symbol', original first-column index), sorted.
    """
    path = '{}/ucsd_processing/{}/methylation450/'.format(data_path, cancer)
    # The second line of the file (row 1) is skipped; the first file column
    # becomes the index.
    t = pd.read_table(path + 'beta_values.txt', skiprows=[1], index_col=[0])
    # Relabel the first remaining column as 'symbol' and move it into the
    # index as the outer level.
    t = t.rename(columns={t.columns[0]: 'symbol'})
    t = t.set_index('symbol', append=True).swaplevel(0, 1)
    t = t.sort_index()  # think this is a bug that it needs to be sorted
    return FH.fix_barcode_columns(t, patients, tissue_code)
Exemplo n.º 12
def get_beta_values(data_path, cancer, patients=None, tissue_code='All'):
    """
    Retrieve methylation beta-values from my pre-processed file.

    TCGA has a lot more columns that eat up memory, so I parse out the
    beta-values in preprocess_methylation.py.
    This file still has all of the probes by patients, so it can still
    kill a computer without a lot of memory (takes ~2GB for HNSC).

    Reads 'beta_values.txt' under
    '<data_path>/ucsd_processing/<cancer>/methylation450/', re-keys the rows
    by ('symbol', original index) and hands the sorted table to
    FH.fix_barcode_columns along with `patients` and `tissue_code`.
    """
    folder = '{}/ucsd_processing/{}/methylation450/'.format(data_path, cancer)
    # Row 1 of the file is skipped and the first file column is the index.
    table = pd.read_table(folder + 'beta_values.txt', skiprows=[1],
                          index_col=[0])
    # The first remaining column is relabelled 'symbol' before being
    # promoted to the outer index level.
    table = table.rename(columns={table.columns[0]: 'symbol'})
    table = table.set_index('symbol', append=True)
    table = table.swaplevel(0, 1)
    table = table.sort_index()  # think this is a bug that it needs to be sorted
    table = FH.fix_barcode_columns(table, patients, tissue_code)
    return table
Exemplo n.º 13
    # Constructor for the mutation dataset: loads the mutation matrix into
    # self.df, optionally restricts it to `patients`, and optionally builds
    # the feature matrix.
    # NOTE(review): the parameter list is cut off after `self,` in this
    # copy; the body reads run, cancer, patients, create_features and
    # draw_figures, so those are presumably the missing parameters.
    def __init__(self,
        Dataset.__init__(self, cancer.path, 'Mutation', compressed=False)
        self.df = FH.get_mutation_matrix(run.data_path, cancer.name)
        if patients is not None:
            # Select the requested patient columns, then drop columns
            # (axis 1) that are entirely NaN.
            # NOTE(review): .ix is unavailable in newer pandas releases.
            self.df = self.df.ix[:, patients].dropna(1, how='all')

        if create_features is True:
            min_pat = run.parameters['min_patients']
            self._create_feature_matrix(run.gene_sets, min_pat)

        # NOTE(review): the body of this branch is missing from this copy.
        if draw_figures is True:
Exemplo n.º 14
def get_global_vars(data_path, cancer, patients=None):
    """
    Get compiled DataFrame of global molecular variables from Firehose
    data.  Returns a feature by patient DataFrame with (data-type, variable)
    on the columns and patient barcodes on the index.

    Four blocks are concatenated column-wise under the keys 'mRNASeq',
    'methylation', 'cna' and 'mutation'; columns that are entirely NaN are
    dropped from the result.

    NOTE(review): this copy of the function had lost its try/except headers
    and the trailing `axis=1)` of the concat call.  The handlers below are a
    reconstruction ("use an empty frame if loading fails"); the exception
    type and the last handler are assumptions -- confirm against upstream.
    """
    try:
        data_matrix = FH.read_rnaSeq(data_path, cancer, patients)
        U, S, vH = frame_svd(data_matrix)
        exp_pc = pd.DataFrame({'pc1': vH[0], 'pc2': vH[1]})
    except Exception:
        exp_pc = pd.DataFrame()

    try:
        data_matrix = read_methylation(data_path, cancer, patients)
        U, S, vH = frame_svd(data_matrix)
        meth_pc = pd.DataFrame({'pc1': vH[0], 'pc2': vH[1]})
    except Exception:
        meth_pc = pd.DataFrame()

    try:
        # Placeholders for the two results of the disabled age-signal call.
        meth_age, amar = 'FAIL', 'FAIL'
        # meth_age, amar = get_age_signal(data_path, cancer)
        meth_pc = meth_pc.join(meth_age).join(amar)
        # Single-argument parenthesised print behaves the same on Python 2
        # and Python 3.
        print('Should probably check this out')
    except Exception:
        # NOTE(review): presumed best-effort step; meth_pc is left as-is on
        # failure.
        pass

    cna_rates = get_cna_rates(data_path, cancer, patients)
    mutation_rates = get_mutation_rates(data_path, cancer, patients)

    gv = pd.concat([exp_pc, meth_pc, cna_rates, mutation_rates],
                   keys=['mRNASeq', 'methylation', 'cna', 'mutation'],
                   axis=1)
    gv = gv.dropna(how='all', axis=1)
    return gv