def extract_pc_filtered(df, pc_threshold=.2, filter_down=True):
    '''
    Scale the tumor ('01') samples row-wise and pass them to extract_pc.

    If the second column level contains normal ('11') samples and
    filter_down is true, every row of the tumor samples is centered and
    scaled with the mean and standard deviation of the normals for that
    row, and only the rows whose exp_change p-value is below .05 are kept.
    Otherwise every row of the tumor samples is scaled with its own mean
    and standard deviation.

    The scaled frame goes to extract_pc together with pc_threshold and
    standardize=False; whatever extract_pc returns is returned unchanged.
    '''
    use_normals = ('11' in df.columns.levels[1]) and filter_down
    if use_normals:
        normals = df.xs('11', axis=1, level=1)
        # Row-wise test over the full frame, ordered by its 'p' column.
        change = df.apply(exp_change, 1).sort('p')
        normal_mean = normals.mean(1)
        normal_std = normals.std(1)
        tumors = df.xs('01', axis=1, level=1)
        # Transpose so the per-row statistics align on the columns, then
        # transpose back to the original orientation.
        centered = tumors.T - normal_mean
        scaled = (centered / normal_std).T
        keep = true_index(change.p < .05)
        scaled = scaled.ix[keep]
    else:
        # No matched normals (or filtering was switched off).
        tumors = df.xs('01', axis=1, level=1)
        centered = tumors.T - tumors.mean(1)
        scaled = (centered / tumors.std(1)).T
    return extract_pc(scaled, pc_threshold, standardize=False)
def extract_pc_filtered(df, pc_threshold=.2, filter_down=True):
    '''
    Scale the tumor ('01') samples row-wise and pass them to extract_pc.

    If the second column level contains normal ('11') samples and
    filter_down is true, every row of the tumor samples is centered and
    scaled with the mean and standard deviation of the normals for that
    row, and only the rows whose exp_change p-value is below .05 are kept.
    Otherwise every row of the tumor samples is scaled with its own mean
    and standard deviation.

    The scaled frame goes to extract_pc together with pc_threshold and
    standardize=False; whatever extract_pc returns is returned unchanged.
    '''
    use_normals = ('11' in df.columns.levels[1]) and filter_down
    if use_normals:
        normals = df.xs('11', axis=1, level=1)
        # Row-wise test over the full frame, ordered by its 'p' column.
        change = df.apply(exp_change, 1).sort('p')
        normal_mean = normals.mean(1)
        normal_std = normals.std(1)
        tumors = df.xs('01', axis=1, level=1)
        # Transpose so the per-row statistics align on the columns, then
        # transpose back to the original orientation.
        centered = tumors.T - normal_mean
        scaled = (centered / normal_std).T
        keep = true_index(change.p < .05)
        scaled = scaled.ix[keep]
    else:
        # No matched normals (or filtering was switched off).
        tumors = df.xs('01', axis=1, level=1)
        centered = tumors.T - tumors.mean(1)
        scaled = (centered / tumors.std(1)).T
    return extract_pc(scaled, pc_threshold, standardize=False)
def _get_real_features(self):
    '''
    Fill self.features, self.global_vars and self.global_loadings from
    the feature tables that extract_features derives from self.df.

    Stores the binary table as-is, the screened and row-standardized
    "singles" table as the real features, the extract_pc result for the
    remaining real-valued rows as the 'background', and entries 0 and 1 of
    the frame_svd output as 'filtered_pc1' / 'filtered_pc2'.
    '''
    binary, singles, real = extract_features(self.df)

    # Background: real-valued rows that are not among the singles, with
    # incomplete rows dropped, summarized by extract_pc at threshold 0.
    other_rows = real.index.diff(singles.index)
    background_df = real.ix[other_rows].dropna()
    background = extract_pc(background_df, 0)

    # Screen the singles against the background patient vector and keep
    # the rows whose p-value is above 10e-5.
    # NOTE(review): 10e-5 equals 1e-4 -- confirm this is the intended cutoff.
    screen = screen_feature(background['pat_vec'], pearson_pandas, singles)
    singles = singles.ix[screen.p > 10e-5]

    # Standardize each remaining row, then decompose.
    row_mean = singles.mean(1)
    row_std = singles.std(1)
    singles = ((singles.T - row_mean) / row_std).T
    loadings, _, components = frame_svd(singles)

    self.features['binary'] = binary
    self.features['real'] = singles

    self.global_vars['background'] = background['pat_vec']
    self.global_vars['filtered_pc1'] = components[0]
    self.global_vars['filtered_pc2'] = components[1]

    self.global_loadings['background'] = background['gene_vec']
    self.global_loadings['filtered_pc1'] = loadings[0]
    self.global_loadings['filtered_pc2'] = loadings[1]
def peel_pc(df):
    '''
    Wrapper around extract_pc that orients the component along the row means.

    Calls extract_pc on df shifted by .5 and flips the sign of both the gene
    vector and the patient vector when the gene vector correlates negatively
    with the row means of df.

    The data is shifted rather than standardized here because of the
    underlying distribution of beta values.
    NOTE(review): extract_pc is called with its default `standardize`
    setting -- confirm that default does not standardize.

    Returns
    -------
    tuple
        (gene_vec, pat_vec, pct_var) taken from the extract_pc result.  If
        the computation raises an Exception, the fallback
        (nan, df.mean(), nan) is returned instead.
    '''
    try:
        result = extract_pc(df - .5)
        gene_vec = result['gene_vec']
        pat_vec = result['pat_vec']
        pct_var = result['pct_var']
        row_mean = df.mean(1)
        # Resolve the sign ambiguity of the component so that the gene
        # loadings point the same way as the per-row mean.
        if gene_vec.corr(row_mean) < 0:
            gene_vec = gene_vec * -1
            pat_vec = pat_vec * -1
        return gene_vec, pat_vec, pct_var
    except Exception:
        # Deliberate best-effort fallback.  Catch Exception rather than
        # using a bare except so that KeyboardInterrupt and SystemExit
        # still propagate.
        return np.nan, df.mean(), np.nan
def peel_pc(df):
    '''
    Wrapper around extract_pc that orients the component along the row means.

    Calls extract_pc on df shifted by .5 and flips the sign of both the gene
    vector and the patient vector when the gene vector correlates negatively
    with the row means of df.

    The data is shifted rather than standardized here because of the
    underlying distribution of beta values.
    NOTE(review): extract_pc is called with its default `standardize`
    setting -- confirm that default does not standardize.

    Returns
    -------
    tuple
        (gene_vec, pat_vec, pct_var) taken from the extract_pc result.  If
        the computation raises an Exception, the fallback
        (nan, df.mean(), nan) is returned instead.
    '''
    try:
        result = extract_pc(df - .5)
        gene_vec = result['gene_vec']
        pat_vec = result['pat_vec']
        pct_var = result['pct_var']
        row_mean = df.mean(1)
        # Resolve the sign ambiguity of the component so that the gene
        # loadings point the same way as the per-row mean.
        if gene_vec.corr(row_mean) < 0:
            gene_vec = gene_vec * -1
            pat_vec = pat_vec * -1
        return gene_vec, pat_vec, pct_var
    except Exception:
        # Deliberate best-effort fallback.  Catch Exception rather than
        # using a bare except so that KeyboardInterrupt and SystemExit
        # still propagate.
        return np.nan, df.mean(), np.nan
def run_clinical_real(cancer, clinical, data_path, gene_sets, survival_tests,
                      real_variables, binary_variables,
                      data_type='expression', drop_pc=False):
    '''
    Load a data matrix, summarize each gene set with extract_pc and run the
    clinical tests against those summaries.

    Parameters
    ----------
    cancer, data_path
        Passed straight to the reader selected by data_type.
    clinical, survival_tests, real_variables, binary_variables
        Passed straight to get_tests (with var_type='real').
    gene_sets
        Mapping of gene-set name to the row labels used to slice the data
        matrix for that set.
    data_type
        One of 'expression' (read_rnaSeq, then rows whose label shares the
        text before the first '|' are averaged), 'expression_array'
        (read_mrna) or 'methylation' (read_methylation).
    drop_pc
        When true, the matrix is passed through drop_first_norm_pc first.

    Returns
    -------
    dict
        locals() at the end of the function: every argument plus
        data_matrix, pc, tests, p_pathways and q_pathways.  The local
        variable names are therefore part of the return contract and must
        not be renamed.

    Raises
    ------
    ValueError
        If data_type is not one of the supported values.
    '''
    if data_type == 'expression':
        data_matrix = read_rnaSeq(cancer, data_path)
        data_matrix = data_matrix.groupby(by=lambda n: n.split('|')[0]).mean()
    elif data_type == 'expression_array':
        data_matrix = read_mrna(cancer, data_path)
    elif data_type == 'methylation':
        data_matrix = read_methylation(cancer, data_path)
    else:
        # Fail early with a clear message instead of hitting an unbound
        # data_matrix further down.
        raise ValueError('Unknown data_type: %r' % (data_type,))

    if drop_pc:
        data_matrix = drop_first_norm_pc(data_matrix)

    pc = {p: extract_pc(data_matrix.ix[g]) for p, g in gene_sets.items()}
    # Gene sets for which extract_pc gave None are skipped; the remaining
    # vectors are z-scored and stacked one gene set per row.
    pc = DataFrame({p: (v - v.mean()) / v.std()
                    for p, v in pc.items() if v is not None}).T

    tests = get_tests(clinical, survival_tests, real_variables,
                      binary_variables, var_type='real')
    p_pathways, q_pathways = run_tests(tests, pc)
    return locals()