def get_global_vars(data_path, cancer, patients=None):
    '''
    Get compiled DataFrame of global molecular variables from Firehose data.

    Returns a feature by patient DataFrame with (data-type, variable) on the
    columns and patient barcodes on the index.

    The expression and methylation components are best-effort: if reading or
    decomposing either matrix raises, an empty DataFrame is used for that
    data type. Copy-number and mutation rates are not guarded, so errors
    from those readers propagate to the caller. Columns that are entirely
    missing are dropped from the result.
    '''
    # Expression: keep the first two entries of frame_svd's third output.
    try:
        data_matrix = FH.read_rnaSeq(data_path, cancer, patients)
        U, S, vH = frame_svd(data_matrix)
        exp_pc = pd.DataFrame({'pc1': vH[0], 'pc2': vH[1]})
    except Exception:
        # Catch Exception rather than everything so Ctrl-C / SystemExit
        # still interrupt a long-running read.
        exp_pc = pd.DataFrame()

    # Methylation: same treatment as expression.
    try:
        data_matrix = read_methylation(data_path, cancer, patients)
        U, S, vH = frame_svd(data_matrix)
        meth_pc = pd.DataFrame({'pc1': vH[0], 'pc2': vH[1]})
    except Exception:
        meth_pc = pd.DataFrame()

    # Age-signal step is currently a placeholder: the real call
    # (get_age_signal(data_path, cancer)) is disabled and string sentinels
    # stand in for its outputs.
    # NOTE(review): joining the sentinels is expected to raise, in which case
    # meth_pc is left untouched -- confirm before re-enabling.
    try:
        meth_age, amar = 'FAIL', 'FAIL'
        meth_pc = meth_pc.join(meth_age).join(amar)
        print('Should probably check this out')
    except Exception:
        pass

    cna_rates = get_cna_rates(data_path, cancer, patients)
    mutation_rates = get_mutation_rates(data_path, cancer, patients)

    gv = pd.concat([exp_pc, meth_pc, cna_rates, mutation_rates],
                   keys=['mRNASeq', 'methylation', 'cna', 'mutation'],
                   axis=1)
    gv = gv.dropna(how='all', axis=1)
    return gv
def pathway_mutation_section_exp(cancer, gene_sets, cutoff=.25):
    '''
    Build the report sub-section on pathway-level expression associations.

    Writes the full pathway table to <report_folder>/pathway_table.csv
    (sorted on 'survival' when that column is present), then keeps the
    pathways that have at least one value below .25 in cancer.q_pathways,
    limits the table to its first 20 rows, and attaches an age scatter plot
    or survival figures to every 'age' / 'survival' cell below cutoff.

    cancer: object providing report_folder, q_pathways, data_matrix and
        clinical.
    gene_sets: mapping from pathway name to the rows of cancer.data_matrix
        that belong to the pathway.
    cutoff: threshold below which an 'age' or 'survival' cell gets figures.

    Returns the sub-section object produced by nz.addTo. When no pathway
    passes the filter, an 'Expressed Pathways' sub-section holding an empty
    paragraph is returned instead.
    '''
    # Format data for report
    path = cancer.report_folder + '/'
    pathway_table_file = path + 'pathway_table.csv'
    pathway_table = format_pathway_table_exp(cancer, gene_sets)
    if 'survival' in pathway_table:
        # sort() returns a new frame; keep it so the CSV is written sorted.
        pathway_table = pathway_table.sort(columns='survival')
    pathway_table.to_csv(pathway_table_file)

    # NOTE(review): this filter uses a literal .25 rather than `cutoff`;
    # left as-is because callers may rely on it -- confirm intent.
    keepers = cancer.q_pathways[(cancer.q_pathways < .25).sum(1) > 0].index
    pathway_table = pathway_table.ix[keepers]
    if 'survival' in pathway_table:
        pathway_table = pathway_table.sort(columns='survival')
    pathway_table = pathway_table.head(20)

    # Bail out before touching the R conversion when there is nothing to show.
    if len(pathway_table) == 0:
        return nz.addTo(nz.newSubSection('Expressed Pathways'),
                        nz.newParagraph(''))

    # Missing values are replaced by 1.23 before the table is handed to R.
    pathway_table_r = com.convert_to_r_dataframe(
        pathway_table.replace(nan, 1.23))  # @UndefinedVariable

    # Overview
    tableCaption1 = ('Association of pathway level expression patterns with '
                     'patient clinical features.')
    table1 = nz.newTable(pathway_table_r, tableCaption1,
                         file=pathway_table_file, significantDigits=2)

    # Fill in the details: 0-based positions, shifted by one for nz.addTo.
    pathway_pos = {p: i for i, p in enumerate(pathway_table.index)}
    col_pos = {c: i for i, c in enumerate(pathway_table.columns)}

    # age scatter plots
    if 'age' in pathway_table:
        age_col = pathway_table['age']
        for p in age_col[age_col < cutoff].index:
            fig_file = path + FIG_EXT + p + '_age.png'
            draw_pathway_age_scatter(p, cancer, fig_file)
            age_fig1 = nz.newFigure(fig_file,
                                    'Age of patients with or without '
                                    'mutation to pathway.')
            result1 = nz.addTo(nz.newResult('', isSignificant='TRUE'),
                               nz.addTo(nz.newSection(p), age_fig1))
            table1 = nz.addTo(table1, result1, row=pathway_pos[p] + 1,
                              column=col_pos['age'] + 1)

    # survival curves
    if 'survival' in pathway_table:
        surv_col = pathway_table['survival']
        for p in surv_col[surv_col < cutoff].index:
            fig_file = path + FIG_EXT + p + '_survival.png'
            data_frame = cancer.data_matrix.ix[gene_sets[p]].dropna()
            # Standardize each row before the decomposition.
            norm = ((data_frame.T - data_frame.mean(1)) /
                    data_frame.std(1)).T
            U, S, vH = frame_svd(norm)
            # 0 / 1 / 2 = more than one std below, within, or more than one
            # std above, matching the 'low' / 'mid' / 'high' labels.
            strat = ((vH[0] > vH[0].std()).astype(int) -
                     (vH[0] < -vH[0].std()) + 1)
            draw_survival_curves(cancer.clinical, Series(strat, name='pc'),
                                 labels=['low', 'mid', 'high'],
                                 filename=fig_file)
            sv_fig1 = nz.newFigure(fig_file,
                                   'Survival of patients with ' +
                                   'varying levels of pathway expression.')
            fig_file2 = path + FIG_EXT + p + '.svg'
            draw_pathway_eig_bar(U, fig_file2)
            sv_fig_2 = nz.newFigure(fig_file2,
                                    'Loading for first eigen-patient.')
            result1 = nz.addTo(nz.newResult('', isSignificant='TRUE'),
                               nz.addTo(nz.newSection(p), sv_fig1, sv_fig_2))
            table1 = nz.addTo(table1, result1, row=pathway_pos[p] + 1,
                              column=col_pos['survival'] + 1)

    # NOTE(review): the empty case above is titled 'Expressed Pathways' while
    # this one is 'Pathway Mutations' -- confirm which title is intended.
    section = nz.addTo(nz.newSubSection('Pathway Mutations'), table1)
    return section
def add_eig_bar(pathway, cancer, table, pos, fig_path):
    '''
    Attach the eigen-loading bar figure for a pathway to a report table cell.

    pathway: pathway name; used to build the figure file name and to look up
        the pathway's genes in cancer.gene_sets.
    cancer: object providing report_folder, data_matrix and gene_sets.
    table: report table the figure is attached to.
    pos: (row, column) pair forwarded to nz.addTo.
    fig_path: currently unused; the figure location is derived from
        cancer.report_folder.

    Returns the table object produced by nz.addTo, so the caller can keep
    the updated table.
    '''
    # The original expression had a stray unary '+' in front of '.svg',
    # which raises TypeError on a str.
    fig_file = cancer.report_folder + '/' + FIG_EXT + pathway + '.svg'

    # NOTE(review): the figure is (re)drawn only when the file already
    # exists; this may have been meant as `not os.path.isfile(...)` --
    # confirm before relying on it.
    if os.path.isfile(fig_file):
        data_frame = cancer.data_matrix.ix[cancer.gene_sets[pathway]].dropna()
        # Standardize each row before the decomposition.
        norm = ((data_frame.T - data_frame.mean(1)) / data_frame.std(1)).T
        U, S, vH = frame_svd(norm)
        draw_pathway_eig_bar(U, fig_file)

    sv_fig = nz.newFigure(fig_file, 'Loading for first eigen-patient.')
    result1 = nz.addTo(nz.newResult('', isSignificant='TRUE'),
                       nz.addTo(nz.newSection(pathway), sv_fig))
    table = nz.addTo(table, result1, row=pos[0], column=pos[1])
    return table
def _get_meta_features(self, gene_sets, filter_down):
    '''
    Extract gene-set level features and their leading components.

    Stores the loadings and percent-variance outputs of extract_geneset_pcs
    on the instance. When global_vars has a 'background' attribute, only the
    pathways whose screen p-value against it exceeds 10e-5 are kept. The
    remaining pathways are standardized row-wise and decomposed, and the
    first two components are recorded in global_vars / global_loadings.
    '''
    self.loadings, self.pct_var, pathways = extract_geneset_pcs(
        self.df, gene_sets, filter_down)

    if hasattr(self.global_vars, 'background'):
        screen = screen_feature(self.global_vars.background, pearson_pandas,
                                pathways)
        pathways = pathways.ix[screen.p > 10e-5]

    # Row-wise standardization: subtract the row mean, divide by the row std.
    row_mean = pathways.mean(1)
    row_std = pathways.std(1)
    pathways = ((pathways.T - row_mean) / row_std).T

    gene_vecs, S, pat_vecs = frame_svd(pathways)

    self.pathways = pathways
    self.features['pathways'] = pathways
    for idx, label in enumerate(('pathway_pc1', 'pathway_pc2')):
        self.global_vars[label] = pat_vecs[idx]
    for idx, label in enumerate(('pathway_pc1', 'pathway_pc2')):
        self.global_loadings[label] = gene_vecs[idx]
def create_figure_real(cancer, fig_type, vec, file_name):
    '''
    Draw the figure matching fig_type for a real-valued feature vector.

    fig_type is checked, in order, against cancer.survival_tests,
    cancer.real_variables, cancer.binary_variables and the literal
    'pathway_bar'; the first match decides which plot is written to
    file_name. Nothing is drawn when no case matches.
    '''
    if fig_type in cancer.survival_tests:
        # -1 where vec < -1, +1 where vec > 1, 0 otherwise.
        below = vec < -1
        above = vec > 1
        hit_vec = -1 * below + above
        draw_survival_curves(cancer.clinical, hit_vec, filename=file_name,
                             labels=['low', 'normal', 'high'],
                             **cancer.survival_tests[fig_type])
        return

    if fig_type in cancer.real_variables:
        clinical_values = cancer.clinical[fig_type].astype(float)
        series_scatter(vec, clinical_values, filename=file_name)
        return

    if fig_type in cancer.binary_variables:
        violin_plot_pandas(cancer.clinical[fig_type], vec, filename=file_name)
        return

    if fig_type == 'pathway_bar':
        # The pathway is identified by the name of the supplied vector.
        genes = cancer.gene_sets[vec.name]
        pathway_data = cancer.data_matrix.ix[genes].dropna()
        U, S, vH = frame_svd(pathway_data)
        draw_pathway_eig_bar(U, file_name)
def _get_real_features(self):
    '''
    Split self.df into binary and real features and record global components.

    Rows of the real output of extract_features that are not in singles form
    a background set, from which extract_pc(..., 0) is taken. Singles whose
    screen p-value against the background patient vector exceeds 10e-5 are
    kept, standardized row-wise and decomposed. The features, background
    vectors and first two components are stored on the instance.
    '''
    binary, singles, real = extract_features(self.df)

    background_rows = real.index.diff(singles.index)
    background = extract_pc(real.ix[background_rows].dropna(), 0)

    screen = screen_feature(background['pat_vec'], pearson_pandas, singles)
    singles = singles.ix[screen.p > 10e-5]

    # Row-wise standardization: subtract the row mean, divide by the row std.
    row_mean = singles.mean(1)
    row_std = singles.std(1)
    singles = ((singles.T - row_mean) / row_std).T

    gene_vecs, S, pat_vecs = frame_svd(singles)

    self.features['binary'] = binary
    self.features['real'] = singles

    self.global_vars['background'] = background['pat_vec']
    for idx, label in enumerate(('filtered_pc1', 'filtered_pc2')):
        self.global_vars[label] = pat_vecs[idx]

    self.global_loadings['background'] = background['gene_vec']
    for idx, label in enumerate(('filtered_pc1', 'filtered_pc2')):
        self.global_loadings[label] = gene_vecs[idx]
def _calc_global_pcs(self, drop_pc1=False):
    '''
    Normalize data and calculate principal components.

    The '01' cross-section of level 1 of the columns of self.df is
    standardized row-wise and decomposed; the first two components are
    stored in global_vars / global_loadings as 'pc1' and 'pc2'.

    If drop_pc1 is set to True, also reconstructs the normalized data
    without the first PC and returns that instead of the normalized data.
    '''
    selected = self.df.xs('01', axis=1, level=1)
    norm = ((selected.T - selected.mean(1)) / selected.std(1)).T
    U, S, vH = frame_svd(norm)

    for idx, label in enumerate(('pc1', 'pc2')):
        self.global_vars[label] = vH[idx]
    for idx, label in enumerate(('pc1', 'pc2')):
        self.global_loadings[label] = U[idx]

    if drop_pc1 is not True:
        return norm

    # Zero the leading singular value, then multiply the factors back.
    S_n = S.copy()
    S_n[0] = 0
    singular = pd.DataFrame(diag(S_n))
    return U.dot(singular.dot(vH.T))
def _calc_global_pcs(self, drop_pc1=False):
    '''
    Normalize data and calculate principal components.

    Works on the '01' cross-section (level 1 of the columns) of self.df:
    each row is centered and scaled, the result is decomposed with
    frame_svd, and the first two components are written to global_vars and
    global_loadings under 'pc1' and 'pc2'.

    If drop_pc1 is set to True, also reconstructs the normalized data
    without the first PC. The (possibly reconstructed) normalized data is
    returned.
    '''
    subset = self.df.xs('01', axis=1, level=1)
    centered = subset.T - subset.mean(1)
    norm = (centered / subset.std(1)).T

    U, S, vH = frame_svd(norm)

    self.global_vars['pc1'], self.global_vars['pc2'] = vH[0], vH[1]
    self.global_loadings['pc1'], self.global_loadings['pc2'] = U[0], U[1]

    if drop_pc1 is True:
        # Rebuild the matrix with the first singular value set to zero.
        S_n = S.copy()
        S_n[0] = 0
        scaled_components = pd.DataFrame(diag(S_n)).dot(vH.T)
        norm = U.dot(scaled_components)

    return norm