def get_gistic_genes(data_path, cancer, filter_with_rna=True, collapse_on_bands=True, min_patients=5): """ Gets a matrix of events for high grade amplifications and homozygous deletions. We filter down this list by asserting that a copy number event corresponds with a resultant expression change. The final matrix merges gene-level events on the same band to combine redundant events and reduce the test space. """ gistic = FH.get_gistic_gene_matrix(data_path, cancer, '01') deletion = gistic[(gistic == -2).sum(1) > min_patients] amp = gistic[(gistic == 2).sum(1) > min_patients] ft = pd.MultiIndex.from_tuples # rediculously long pandas names deletion.index = ft([('Deletion', s[0], s[2]) for s in deletion.index]) amp.index = ft([('Amplification', s[0], s[2]) for s in amp.index]) if filter_with_rna: rna = FH.read_rnaSeq(data_path, cancer) deletion = rna_filter(deletion, -2, rna) amp = rna_filter(amp, 2, rna) cna_genes = amp.append(deletion) if collapse_on_bands == False: return cna_genes cna_genes = pd.DataFrame({(a[0], a[1], tuple(b.index.get_level_values(2))): b.mean().round() for a, b in cna_genes.groupby(level=[0, 1])}).T cna_genes.index = pd.MultiIndex.from_tuples(cna_genes.index) return cna_genes
def get_gistic_genes(data_path, cancer, filter_with_rna=True, collapse_on_bands=True, min_patients=5): ''' Gets a matrix of events for high grade amplifications and homozygous deletions. We filter down this list by asserting that a copy number event corresponds with a resultant expression change. The final matrix merges gene-level events on the same band to combine redundant events and reduce the test space. ''' gistic = FH.get_gistic_gene_matrix(data_path, cancer, '01') deletion = gistic[(gistic == -2).sum(1) > min_patients] amp = gistic[(gistic == 2).sum(1) > min_patients] ft = pd.MultiIndex.from_tuples # rediculously long pandas names deletion.index = ft([('Deletion', s[0], s[2]) for s in deletion.index]) amp.index = ft([('Amplification', s[0], s[2]) for s in amp.index]) if filter_with_rna: rna = FH.read_rnaSeq(data_path, cancer) deletion = rna_filter(deletion, -2, rna) amp = rna_filter(amp, 2, rna) cna_genes = amp.append(deletion) if collapse_on_bands == False: return cna_genes cna_genes = pd.DataFrame({(a[0], a[1], tuple(b.index.get_level_values(2))): b.mean().round() for a, b in cna_genes.groupby(level=[0, 1])}).T cna_genes.index = pd.MultiIndex.from_tuples(cna_genes.index) return cna_genes
def get_cna_rates(data_path, cancer, patients=None): """ Get copy-number aberration rates from GISTIC processing pipeline. This function depends on the current Firehose output of this program as of July 2013. """ gistic = FH.get_gistic_gene_matrix(data_path, cancer) amp_gene_all = (gistic >= 1).astype(int).sum() amp_gene_high = (gistic == 2).astype(int).sum() del_gene_all = (gistic <= -1).astype(int).sum() del_gene_homo = (gistic <= -2).astype(int).sum() lesions = FH.get_gistic_lesions(data_path, cancer) amp_lesion_all = (lesions.ix['Amplification'] >= 1).sum() amp_lesion_high = (lesions.ix['Amplification'] == 2).sum() del_lesion_all = (lesions.ix['Deletion'] <= -1).sum() del_lesion_homo = (lesions.ix['Deletion'] == -2).sum() arm_cn = FH.get_gistic_arm_values(data_path, cancer) chromosomal_instability = arm_cn.abs().mean() cna_df = {'gene_amp': amp_gene_all, 'gene_amp_high': amp_gene_high, 'gene_del': del_gene_all, 'gene_del_homo': del_gene_homo, 'lesion_amp': amp_lesion_all, 'lesion_amp_high': amp_lesion_high, 'lesion_del': del_lesion_all, 'lesion_del_homo': del_lesion_homo, 'chrom_instability': chromosomal_instability} cna_df = pd.DataFrame(cna_df) if patients is not None: cna_df = cna_df.ix[patients].dropna() return cna_df
def get_global_vars(data_path, cancer, patients=None): """ Get compiled DataFrame of global molecular variables from Firehose data. Returns a feature by patient DataFrame with (data-type, variable) on the columns and patient barcodes on the index. """ try: data_matrix = FH.read_rnaSeq(data_path, cancer, patients) U, S, vH = frame_svd(data_matrix) exp_pc = pd.DataFrame({'pc1': vH[0], 'pc2': vH[1]}) except: exp_pc = pd.DataFrame() try: data_matrix = read_methylation(data_path, cancer, patients) U, S, vH = frame_svd(data_matrix) meth_pc = pd.DataFrame({'pc1': vH[0], 'pc2': vH[1]}) except: meth_pc = pd.DataFrame() try: meth_age, amar = 'FAIL', 'FAIL' # meth_age, amar = get_age_signal(data_path, cancer) meth_pc = meth_pc.join(meth_age).join(amar) print 'Should probably check this out' except: pass cna_rates = get_cna_rates(data_path, cancer, patients) mutation_rates = get_mutation_rates(data_path, cancer, patients) gv = pd.concat([exp_pc, meth_pc, cna_rates, mutation_rates], keys=['mRNASeq', 'methylation', 'cna', 'mutation'], axis=1) gv = gv.dropna(how='all', axis=1) return gv
def get_gistic(data_path, cancer, filter_with_rna=True, collapse_on_bands=True, min_patients=5): """ Get the combined GISTIC feature matrix for testing. """ lesions = FH.get_gistic_lesions(cancer, data_path) cna_genes = get_gistic_genes(data_path, cancer, filter_with_rna, collapse_on_bands, min_patients) cna = cna_genes.append(lesions) return cna
def __init__(self, run, cancer, cn_type, patients=None): ''' ''' Dataset.__init__(self, cancer.path, cn_type, compressed=False) min_pat = run.parameters['min_patients'] if cn_type == 'CN_broad': self.df = FH.get_gistic(run.data_path, cancer.name, min_patients=min_pat) if patients is not None: self.df = self.df.ix[:, patients].dropna(1, how='all') self.features = self.df
def get_gistic(data_path, cancer, filter_with_rna=True, collapse_on_bands=True, min_patients=5): ''' Get the combined GISTIC feature matrix for testing. ''' lesions = FH.get_gistic_lesions(cancer, data_path) cna_genes = get_gistic_genes(data_path, cancer, filter_with_rna, collapse_on_bands, min_patients) cna = cna_genes.append(lesions) return cna
def __init__(self, run, cancer, patients=None, create_features=True, draw_figures=False): """ """ Dataset.__init__(self, cancer.path, 'Mutation', compressed=False) self.df = FH.get_mutation_matrix(run.data_path, cancer.name) if patients is not None: self.df = self.df.ix[:, patients].dropna(1, how='all') if create_features is True: min_pat = run.parameters['min_patients'] self._create_feature_matrix(run.gene_sets, min_pat) if draw_figures is True: self._create_pathway_figures(run.gene_sets)
def get_cna_rates(data_path, cancer, patients=None): ''' Get copy-number aberration rates from GISTIC processing pipeline. This function depends on the current Firehose output of this program as of July 2013. ''' gistic = FH.get_gistic_gene_matrix(data_path, cancer) amp_gene_all = (gistic >= 1).astype(int).sum() amp_gene_high = (gistic == 2).astype(int).sum() del_gene_all = (gistic <= -1).astype(int).sum() del_gene_homo = (gistic <= -2).astype(int).sum() lesions = FH.get_gistic_lesions(data_path, cancer) amp_lesion_all = (lesions.ix['Amplification'] >= 1).sum() amp_lesion_high = (lesions.ix['Amplification'] == 2).sum() del_lesion_all = (lesions.ix['Deletion'] <= -1).sum() del_lesion_homo = (lesions.ix['Deletion'] == -2).sum() arm_cn = FH.get_gistic_arm_values(data_path, cancer) chromosomal_instability = arm_cn.abs().mean() cna_df = { 'gene_amp': amp_gene_all, 'gene_amp_high': amp_gene_high, 'gene_del': del_gene_all, 'gene_del_homo': del_gene_homo, 'lesion_amp': amp_lesion_all, 'lesion_amp_high': amp_lesion_high, 'lesion_del': del_lesion_all, 'lesion_del_homo': del_lesion_homo, 'chrom_instability': chromosomal_instability } cna_df = pd.DataFrame(cna_df) if patients is not None: cna_df = cna_df.ix[patients].dropna() return cna_df
def get_beta_values(data_path, cancer, patients=None, tissue_code='All'): """ Retrieve methylation beta-values from my pre-processed file. TCGA has a lot more columns that eat up memory, so I parse out the beta-values in preprocess_methylation.py. This file still has all of the probes by pateints so it still eats my kill a computer without a lot of memory (takes ~2GB for HNSC). """ path = '{}/ucsd_processing/{}/methylation450/'.format(data_path, cancer) t = pd.read_table(path + 'beta_values.txt', skiprows=[1], index_col=[0]) t = t.rename(columns=lambda s: s if s != t.columns[0] else 'symbol') t = t.set_index('symbol', append=True) t = t.swaplevel(0, 1) t = t.sort_index() # think this is a bug that it needs to be sorted t = FH.fix_barcode_columns(t, patients, tissue_code) return t
def get_beta_values(data_path, cancer, patients=None, tissue_code='All'): ''' Retrieve methylation beta-values from my pre-processed file. TCGA has a lot more columns that eat up memory, so I parse out the beta-values in preprocess_methylation.py. This file still has all of the probes by pateints so it still eats my kill a computer without a lot of memory (takes ~2GB for HNSC). ''' path = '{}/ucsd_processing/{}/methylation450/'.format(data_path, cancer) t = pd.read_table(path + 'beta_values.txt', skiprows=[1], index_col=[0]) t = t.rename(columns=lambda s: s if s != t.columns[0] else 'symbol') t = t.set_index('symbol', append=True) t = t.swaplevel(0, 1) t = t.sort_index() # think this is a bug that it needs to be sorted t = FH.fix_barcode_columns(t, patients, tissue_code) return t
def get_global_vars(data_path, cancer, patients=None): ''' Get compiled DataFrame of global molecular variables from Firehose data. Returns a feature by patient DataFrame with (data-type, variable) on the columns and patient barcodes on the index. ''' try: data_matrix = FH.read_rnaSeq(data_path, cancer, patients) U, S, vH = frame_svd(data_matrix) exp_pc = pd.DataFrame({'pc1': vH[0], 'pc2': vH[1]}) except: exp_pc = pd.DataFrame() try: data_matrix = read_methylation(data_path, cancer, patients) U, S, vH = frame_svd(data_matrix) meth_pc = pd.DataFrame({'pc1': vH[0], 'pc2': vH[1]}) except: meth_pc = pd.DataFrame() try: meth_age, amar = 'FAIL', 'FAIL' # meth_age, amar = get_age_signal(data_path, cancer) meth_pc = meth_pc.join(meth_age).join(amar) print 'Should probably check this out' except: pass cna_rates = get_cna_rates(data_path, cancer, patients) mutation_rates = get_mutation_rates(data_path, cancer, patients) gv = pd.concat([exp_pc, meth_pc, cna_rates, mutation_rates], keys=['mRNASeq', 'methylation', 'cna', 'mutation'], axis=1) gv = gv.dropna(how='all', axis=1) return gv