def main(args):
    feat, case_ids = load_features(args.src)
    lab = load_labels(args.labsrc)

    feat = drop_high_cor(feat, cor_thresh=0.8)
    print('Features after high cor drop')
    print(feat.head())

    run_tsne(feat, lab)
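# `drop_high_cor` is called throughout these scripts but defined in a
# shared utility module that is not shown. A minimal sketch, assuming it
# drops one column of every pair whose absolute Pearson correlation
# exceeds `cor_thresh`:
import numpy as np

def drop_high_cor(df, cor_thresh=0.8):
    """Drop one column from every pair correlated above `cor_thresh`."""
    corr = df.corr().abs()
    # Upper triangle only, so each pair is examined once
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    to_drop = [c for c in upper.columns if (upper[c] > cor_thresh).any()]
    return df.drop(columns=to_drop)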
def main(args):
    scores = pd.read_csv(args.scores_src, index_col=None, header=0)
    scores.drop(scores.columns[-1], inplace=True, axis=1)
    scores_caseids = scores['Surgical Number']
    scores_caseids = np.array([translate_sn2hash(x) for x in scores_caseids])
    drop_rows = np.squeeze(scores.index.values[scores_caseids == 'drop_me'])
    print('Dropping: ', drop_rows)
    scores['case_id'] = scores_caseids
    scores.drop(drop_rows, inplace=True)
    print(scores.head())
    print(scores.shape)

    features = pd.read_csv(args.feature_src, index_col=0)
    print('Features')
    print(features.head())
    print(features.shape)
    caseids = features['case_id'].values
    features.drop('case_id', axis=1, inplace=True)
    features = drop_high_cor(features, 0.8)
    remaining_features = features.columns

    # Average the tile-level features within each case
    indices = []
    feature_case_mean = []
    for cid in np.unique(caseids):
        cid_idx = caseids == cid
        f = features.loc[cid_idx, :].values
        fmean = np.mean(f, axis=0)
        print('{}:'.format(cid), fmean.shape)
        feature_case_mean.append(np.expand_dims(fmean, axis=0))
        indices.append(cid)

    features = pd.DataFrame(np.concatenate(feature_case_mean, axis=0),
                            columns=remaining_features)
    features['case_id'] = indices
    print('Features grouped by case')
    print(features.head())
    print(features.shape)

    matching_indices = np.intersect1d(features['case_id'], scores['case_id'])
    print('Matched indices:', matching_indices, len(matching_indices))

    # Drop unmatched rows from features and scores -- then sort them
    drop_rows = [x for x, c in zip(features.index.values, features['case_id'])
                 if c not in matching_indices]
    features.drop(drop_rows, axis=0, inplace=True)
    print('FEATURES BEFORE SORTING\n', features.head())
    features.sort_values(by='case_id', inplace=True)
    sorted_caseids_features = features['case_id'].values
    features.drop('case_id', axis=1, inplace=True)
    print('FEATURES AFTER SORTING\n', features.head())
    features = features.transform(lambda x: (x - np.mean(x)) / np.std(x))
    print(features.shape)

    drop_rows = [x for x, c in zip(scores.index.values, scores['case_id'].values)
                 if c not in matching_indices]
    scores.drop(drop_rows, axis=0, inplace=True)

    # Sort scores into the same case order as the features
    print('SCORES BEFORE SORTING\n', scores.head())
    scores.sort_values(by='case_id', inplace=True)
    sorted_caseids_scores = scores['case_id'].values
    to_drop = ['case_id', 'caseid', 'Disease Stage', 'sample name',
               'Surgical Number']
    scores.drop(to_drop, axis=1, inplace=True)
    print('SCORES AFTER SORTING\n', scores.head())
    print(scores.shape)

    # The two tables must now be row-aligned by case
    for fid, sid in zip(sorted_caseids_features, sorted_caseids_scores):
        print(fid, sid)
        assert fid == sid

    fig = plt.figure(figsize=(2, 2), dpi=300)
    logfile = os.path.join(args.dst, 'qvalues.csv')
    comparison_ids = []
    pvalues = []
    for c in features.columns:
        cx = features[c].values
        for s in scores.columns:
            sy = scores[s].values
            try:
                corr = spearmanr(cx, sy)
                pcorr = pearsonr(cx, sy)
            except Exception:
                print('Failed at {} x {}'.format(c, s))
                print('cx: {} sy: {}'.format(cx.shape, sy.shape))
                continue  # skip: `corr`/`pcorr` would be stale or undefined
            comparison_ids.append('{}_{}'.format(c, s))
            pvalues.append(corr.pvalue)
            if corr.pvalue < 0.001:
                outstr = '*{}\t{}\tr={:3.3f}\tp={:3.3f}\tpr={:3.3f}\tpp={:3.3f}'.format(
                    c, s, corr.correlation, corr.pvalue, pcorr[0], pcorr[1])
                plt.clf()
                plt.scatter(cx, sy)
                plt.title('sr={:3.3f} sp={:3.3f}\npr={:3.3f} pp={:3.3f}'.format(
                    corr.correlation, corr.pvalue, pcorr[0], pcorr[1]))
                plt.xlabel(c)
                plt.ylabel(s)
                plt.savefig(os.path.join(args.dst, '{}_{}.png'.format(c, s)),
                            bbox_inches='tight')
            else:
                outstr = ' {}\t{}\tr={:3.3f}\tp={:3.3f}\tpr={:3.3f}\tpp={:3.3f}'.format(
                    c, s, corr.correlation, corr.pvalue, pcorr[0], pcorr[1])
            print(outstr)

    # Correct for multiple comparisons with Benjamini-Hochberg FDR
    _, qvalues, _, _ = multipletests(pvalues, alpha=0.01, method='fdr_bh')
    qdf = pd.DataFrame({'q': qvalues, 'p': pvalues}, index=comparison_ids)
    qdf.sort_values('q', inplace=True)
    qdf.to_csv(logfile)
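# `translate_sn2hash` is also defined elsewhere. Judging from its use
# above (and from the md5 hashing of case_ids in the NEPC-vs-adeno
# script below), it maps a Surgical Number to the hashed case_id used by
# the feature table, returning 'drop_me' when there is no match. A
# hypothetical sketch, assuming a module-level `sn2case` lookup table:
import hashlib

def translate_sn2hash(sn):
    """Return the md5 case hash for a Surgical Number, or 'drop_me'."""
    case_id = sn2case.get(sn)  # hypothetical Surgical Number -> case_id map
    if case_id is None:
        return 'drop_me'
    return hashlib.md5(case_id.encode()).hexdigest()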
def main(args):
    feat = pd.read_csv(args.feature_src, index_col=None)
    case_ids = feat['case_id']
    tile_ids = feat['tile_id']
    stages = feat['stage_str']
    feat.drop(['case_id', 'tile_id', 'stage_str'], axis=1, inplace=True)
    feat.drop([c for c in feat.columns if 'Unnamed' in c], axis=1, inplace=True)

    # Subsample tiles, then realign the id columns to the sampled index
    feat = feat.sample(frac=args.pct)
    case_ids = case_ids.loc[feat.index]
    tile_ids = tile_ids.loc[feat.index]
    stages = stages.loc[feat.index]
    print(feat.shape)
    print(case_ids.shape)
    print(tile_ids.shape)

    feat = feat.loc[:, usecols]  # `usecols` is a module-level feature mask

    print('Dropping nan, inf and high corr')
    feat = drop_high_cor(feat, 0.8)
    feat = feat.transform(lambda x: (x - np.mean(x)) / np.std(x))
    feat = drop_nan_inf(feat)
    feat = drop_var(feat, 0.5)
    print(feat.shape)
    print(feat.head())

    if args.average == 'tile':
        print('Average by tile')
        feat = feat.groupby(by=tile_ids).mean()
        stages = stages.groupby(by=tile_ids).max()
        print(feat.shape)
    elif args.average == 'case':
        print('Average by case')
        feat = feat.groupby(by=case_ids).mean()
        stages = stages.groupby(by=case_ids).max()
        print(feat.shape)

    # Column colors split autoencoder ('ae') features from the rest;
    # row colors encode disease stage
    col_p = sns.color_palette('deep', 2)
    col_colors = [col_p[int('ae' in x)] for x in feat.columns]

    row_p = sns.color_palette('muted', 4)
    row_colors = []
    print(np.unique(stages.values))
    for s in stages.values:
        if s in m0_strs:
            row_colors.append(row_p[0])
        elif s in m1_strs:
            row_colors.append(row_p[1])
        elif 'NEPC' in s:
            row_colors.append(row_p[2])
        else:
            row_colors.append(row_p[3])
    print('col_colors', len(col_colors))
    print('row_colors', len(row_colors))

    # projected = TruncatedSVD(n_components=10).fit_transform(feat.values)
    # projected = PCA(n_components=10).fit_transform(feat.values)
    sns.clustermap(feat.values, metric=args.metric, standard_scale=1,
                   col_colors=col_colors, row_colors=row_colors)
    plt.show()
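# `drop_nan_inf` and `drop_var` are shared utilities not shown here.
# Plausible sketches, assuming they drop columns containing NaN/inf
# values and columns with variance below a threshold, respectively:
import numpy as np

def drop_nan_inf(df):
    """Drop columns that contain any NaN or infinite value."""
    return df.loc[:, np.isfinite(df.values).all(axis=0)]

def drop_var(df, var_thresh=0.5):
    """Drop columns whose variance falls below `var_thresh`."""
    return df.loc[:, df.var(axis=0) >= var_thresh]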
def main(args):
    feat = pd.read_csv(args.src, index_col=0, header=0)
    labels = pd.read_csv(args.labsrc, index_col=0, header=0, sep='\t')
    # print(feat.head())
    # print(labels.head())

    case_ids = labels['case_id'].values
    tile_ids = labels.index.values
    stages = labels['stage_str'].values

    feat = drop_high_cor(feat, 0.8)
    print('Features after high cor drop')
    # print(feat.shape)
    # print(feat.head())

    feat = feat.transform(lambda x: (x - np.mean(x)) / np.std(x))
    print('Features after zscore')
    # print(feat.shape)
    # print(feat.head())

    feat = feat.fillna(value=0)
    # feat = drop_nan_inf(feat)
    # print('Features after dropping nan and infs')
    # print(feat.shape)
    # print(feat.head())

    ((nepc_f, nepc_lab), (m0_f, m0_lab),
     (m0p_f, m0p_lab), (m1_f, m1_lab)) = split_sets(feat, labels)
    del feat

    if args.filter_stats:
        remove_cols = filter_stats(nepc_f, m0_f)
        nepc_f.drop(remove_cols, inplace=True, axis=1)
        m0_f.drop(remove_cols, inplace=True, axis=1)
        m0p_f.drop(remove_cols, inplace=True, axis=1)
        m1_f.drop(remove_cols, inplace=True, axis=1)

    train_x, train_y = make_training(m0_f, nepc_f)
    print('train_x', train_x.shape)
    print('train_y', train_y.shape)
    print('m1_f', m1_f.shape)

    # model = ElasticNet(alpha=1e-3, max_iter=10000).fit(train_x, train_y)
    # model = ElasticNetCV(cv=25).fit(train_x, train_y)
    # model = ElasticNetCV(alphas=np.arange(1e-5, 1e-1, 20),
    #                      cv=10, max_iter=20000, n_jobs=-1).fit(train_x, train_y)
    model = RandomForestRegressor(oob_score=True,
                                  max_features='sqrt',
                                  max_depth=20,
                                  n_estimators=50,
                                  n_jobs=-1).fit(train_x, train_y)

    with open('feature_importance.txt', 'w+') as f:
        for v, coef in zip(train_x.columns, model.feature_importances_):
            f.write('{}\t{}\n'.format(v, coef))

    if args.aggr_fn == 'max':
        aggr_fn = np.max
    elif args.aggr_fn == 'mean':
        aggr_fn = np.mean

    """ Predict the M1 cases and gather by max or mean """
    yhat_m1 = model.predict(m1_f)
    case_aggr = []
    m1_case_numbers = []
    m1_case_vect = m1_lab['case_id'].values
    for uc in np.unique(m1_case_vect):
        yx = yhat_m1[m1_case_vect == uc]
        case_aggr.append(aggr_fn(yx))
        case_num = int(uc.split('-')[1])
        m1_case_numbers.append(case_num)
    m1_case_aggr = np.array(case_aggr)
    m1_case_numbers = np.array(m1_case_numbers)

    """ Predict M0P cases """
    yhat_m0p = model.predict(m0p_f)
    case_aggr = []
    m0p_case_numbers = []
    m0p_case_vect = m0p_lab['case_id'].values
    for uc in np.unique(m0p_case_vect):
        yx = yhat_m0p[m0p_case_vect == uc]
        case_aggr.append(aggr_fn(yx))
        case_num = int(uc.split('-')[1])
        m0p_case_numbers.append(case_num)
    m0p_case_aggr = np.array(case_aggr)
    m0p_case_numbers = np.array(m0p_case_numbers)

    """ Check on the training data """
    # yhat_train = []
    # # Just do m0 and nepc separately
    # for cid in np.unique(m0_lab['case_id'].values):
    #     feat_case, feat_other = split_case(m0_f, m0_lab, cid)
    #     feat_split = pd.concat([feat_other, nepc_f])
    #     y_split = [0]*feat_other.shape[0] + [1]*nepc_f.shape[0]
    #     model = RandomForestRegressor(n_estimators=100, n_jobs=-1).fit(feat_split, y_split)
    #     yh = model.predict(feat_case)
    #     print(cid, yh)
    #     yhat_train += list(yh)
    # for cid in np.unique(nepc_lab['case_id'].values):
    #     feat_case, feat_other = split_case(nepc_f, nepc_lab, cid)
    #     feat_split = pd.concat([m0_f, feat_other])
    #     y_split = [0]*m0_f.shape[0] + [1]*feat_other.shape[0]
    #     model = RandomForestRegressor(n_estimators=100, n_jobs=-1).fit(feat_split, y_split)
    #     yh = model.predict(feat_case)
    #     print(cid, yh)
    #     yhat_train += list(yh)
    # yhat_train = np.asarray(yhat_train)
    # print(yhat_train.shape)

    m0_cases = m0_lab['case_id'].values
    nepc_cases = nepc_lab['case_id'].values
    train_case_vect = np.concatenate([m0_cases, nepc_cases])
    # yhat_train = model.predict(train_x)
    yhat_train = model.oob_prediction_
    train_aggr, train_case_y = [], []
    for uc in np.unique(train_case_vect):
        idx = train_case_vect == uc
        train_aggr.append(aggr_fn(yhat_train[idx]))
        train_case_y.append(train_y[idx][0])
    train_aggr = np.array(train_aggr)
    train_case_y = np.array(train_case_y)

    """ Do some statistical tests """
    dotest = mannwhitneyu
    # test_args = {'equal_var': True}
    test_args = {}
    test_m0_m1 = dotest(yhat_train[train_y == 0], yhat_m1, **test_args)
    test_m0_m0p = dotest(yhat_train[train_y == 0], yhat_m0p, **test_args)
    test_m0_nepc = dotest(yhat_train[train_y == 0], yhat_train[train_y == 1],
                          **test_args)
    test_nepc_m1 = dotest(yhat_train[train_y == 1], yhat_m1, **test_args)
    print('Tiles M0 vs M1', test_m0_m1)
    print('Tiles M0 vs M0P', test_m0_m0p)
    print('Tiles M0 vs NEPC', test_m0_nepc)
    print('Tiles NEPC vs M1', test_nepc_m1)

    test_m0_m1 = dotest(train_aggr[train_case_y == 0], m1_case_aggr, **test_args)
    test_m0_m0p = dotest(train_aggr[train_case_y == 0], m0p_case_aggr, **test_args)
    test_m0_nepc = dotest(train_aggr[train_case_y == 0],
                          train_aggr[train_case_y == 1], **test_args)
    test_nepc_m1 = dotest(train_aggr[train_case_y == 1], m1_case_aggr, **test_args)
    print('aggr M0 vs M1', test_m0_m1)
    print('aggr M0 vs M0P', test_m0_m0p)
    print('aggr M0 vs NEPC', test_m0_nepc)
    print('aggr NEPC vs M1', test_nepc_m1)
    print('------------------------------------------------------------------------------------')

    # Match case scores against the gene signature table by Surgical Number
    gene_scores = pd.read_csv('../data/signature_scores_beltram.csv',
                              index_col=None, header=0, sep=',')
    gene_score_caseid = []
    drop_rows = []
    matching_scores = []
    matching_indices = []
    for i, (idx, sn) in enumerate(
            zip(gene_scores.index.values,
                gene_scores['Surgical Number'].values)):
        try:
            x = int(sn.split(' ')[-1])
            if x in m1_case_numbers:
                gene_score_caseid.append(x)
                matching_indices.append(idx)
                matching_scores.append(m1_case_aggr[m1_case_numbers == x][0])
            elif x in m0p_case_numbers:
                gene_score_caseid.append(x)
                matching_indices.append(idx)
                matching_scores.append(m0p_case_aggr[m0p_case_numbers == x][0])
            else:
                drop_rows.append(idx)
        except Exception:
            drop_rows.append(idx)

    gene_scores.drop(drop_rows, inplace=True)
    gene_scores['NEPC Score'] = pd.Series(matching_scores,
                                          index=matching_indices)
    # if args.save_scores:
    #     gene_scores.to_csv('../signature_scores_nepc_scores_nuclei_mean.csv')

    label_cols = ['caseid', 'Disease Stage', 'sample name', 'Surgical Number']
    gene_scores.drop(label_cols, inplace=True, axis=1)

    plt.figure(figsize=(5, 5), dpi=300)
    sns.pairplot(gene_scores, kind='reg')
    if not args.dry_run:
        plt.savefig('gene_scores_nepc_score_{}_tile.png'.format(args.aggr_fn),
                    bbox_inches='tight')

    test_cols = [x for x in gene_scores.columns if x != 'NEPC Score']
    scores = gene_scores['NEPC Score'].values
    print('------------------------------------------------------------------------------------')
    for c in test_cols:
        ctest = spearmanr(scores, gene_scores[c].values)
        print('spearman {:40}: {:3.5f} p={:3.5f}'.format(
            c, ctest.correlation, ctest.pvalue))
        ctest = pearsonr(scores, gene_scores[c].values)
        print('pearson  {:40}: {:3.5f} p={:3.5f}'.format(
            c, ctest[0], ctest[1]))
    print('------------------------------------------------------------------------------------')

    if args.boxplot:
        f, (ax_box, ax_hist) = plt.subplots(
            2, sharex=True, gridspec_kw={"height_ratios": (.35, .65)})
        plt_m0 = train_aggr[train_case_y == 0]
        plt_nepc = train_aggr[train_case_y == 1]
        plt_m1 = m1_case_aggr
        plt_m0p = m0p_case_aggr
        sns.distplot(plt_m0, bins=25, norm_hist=True, kde=True,
                     label='M0', ax=ax_hist)
        sns.distplot(plt_nepc, bins=25, norm_hist=True, kde=True,
                     label='NEPC', ax=ax_hist)
        sns.distplot(plt_m1, bins=25, norm_hist=True, kde=True,
                     label='M1', ax=ax_hist)
        sns.distplot(plt_m0p, bins=25, norm_hist=True, kde=True,
                     label='M0-P', ax=ax_hist)
        ax_hist.set_xlabel('Score')
        ax_hist.set_ylabel('Frequency')

        concat_scores = np.concatenate([plt_m0, plt_nepc, plt_m1, plt_m0p])
        concat_labels = np.array(['M0'] * len(plt_m0) +
                                 ['NEPC'] * len(plt_nepc) +
                                 ['M1'] * len(plt_m1) +
                                 ['M0P'] * len(plt_m0p))
        plt_df = pd.DataFrame({'Set': concat_labels, 'Score': concat_scores})

        # fig = plt.figure(figsize=(2,2), dpi=300)
        sns.boxplot(y='Set', x='Score', data=plt_df, ax=ax_box)
        sns.stripplot(y='Set', x='Score', data=plt_df, size=2.5, jitter=True,
                      linewidth=0.5, ax=ax_box)
        # ax_box.set_ylabel('')
        # ax_box.set_xlabel('')
        # plt.show()
        if not args.dry_run:
            plt.savefig('NEPC_score_{}_tile.png'.format(args.aggr_fn),
                        bbox_inches='tight')
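# `split_sets` and `make_training` recur in several of these scripts but
# are defined elsewhere. Sketches inferred from the call sites (the
# stage-string sets such as `m0_strs` are assumed module-level constants):
import numpy as np
import pandas as pd

def split_sets(feat, labels):
    """Partition features/labels into NEPC, M0, M0-P, and M1 subsets."""
    out = []
    for strs in (nepc_strs, m0_strs, m0p_strs, m1_strs):
        idx = np.array([x in strs for x in labels['stage_str'].values])
        out.append((feat.loc[idx, :], labels.loc[idx, :]))
    return tuple(out)

def make_training(m0_f, nepc_f):
    """Stack M0 (label 0) and NEPC (label 1) tiles into a training set."""
    train_x = pd.concat([m0_f, nepc_f], axis=0)
    train_y = np.array([0] * m0_f.shape[0] + [1] * nepc_f.shape[0])
    return train_x, train_y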
def main(args):
    data = pd.read_csv(args.src, index_col=0, memory_map=True)
    lab = pd.read_csv(args.lab, index_col=0)
    print('DATA')
    print(data.shape)
    print('LAB')
    print(lab.shape)
    print(lab.head())

    data = data.sample(frac=args.pct)
    print(data.shape)
    # print(data.head())

    # Grab the id columns
    case_ids = data['case_id'].values
    tile_ids = data['tile_id'].values
    data.drop(['case_id', 'tile_id'], inplace=True, axis=1)
    print(data.shape)
    # print(data.head())

    data = drop_high_cor(data, cor_thresh=0.7)
    print('Features after high cor drop')
    print(data.head())

    # case_ids in the feature table are md5 hashes of the label case_ids
    lab_case_uid = np.array(
        [hashlib.md5(x.encode()).hexdigest() for x in lab['case_id'].values])

    is_nepc = np.zeros_like(case_ids, dtype=bool)   # np.bool is deprecated
    not_nepc = np.zeros_like(case_ids, dtype=bool)
    for t_id in np.unique(case_ids):
        t_idx = case_ids == t_id
        print('{}: {} {}'.format(t_id, t_idx.shape, t_idx.sum()))
        assert t_id in lab_case_uid
        t_label = lab.loc[lab_case_uid == t_id].values
        t_label = t_label[0, -3]
        if t_label == 'NEPC':
            is_nepc[t_idx] = 1
        else:
            not_nepc[t_idx] = 1

    nepc_case_feat = data.loc[is_nepc, :].values
    adeno_case_feat = data.loc[not_nepc, :].values
    # nepc_case_feat = nepc_case_feat.sample(n=args.nsample).values
    # adeno_case_feat = adeno_case_feat.sample(n=args.nsample).values
    print('NEPC features:')
    print(nepc_case_feat.shape)
    print('Adeno features:')
    print(adeno_case_feat.shape)

    # Per-feature t-test; plot the distributions that pass the threshold
    for c in range(nepc_case_feat.shape[1]):
        nepc_ = nepc_case_feat[:, c]
        adeno_ = adeno_case_feat[:, c]
        tt = ttest_ind(nepc_, adeno_)
        print('{}\t{:3.3f}\t{:3.3f}'.format(c, tt[0], tt[1]))
        if tt[1] < args.thresh:
            plt.clf()
            # df = pd.DataFrame({'NEPC': nepc_,
            #                    'Adeno': adeno_})
            sns.distplot(nepc_, label='NEPC')
            sns.distplot(adeno_, label='Adeno')
            plt.legend(frameon=True)
            plt.title('{}\np={}'.format(c, tt[1]))
            saveto = os.path.join(args.dst, '{}.png'.format(c))
            plt.savefig(saveto, bbox_inches='tight')
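# Hypothetical CLI wiring for this script. The flag names mirror the
# `args` attributes read above; types and defaults are assumptions, not
# taken from the original repo.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--src', type=str, help='tile feature table (csv)')
    parser.add_argument('--lab', type=str, help='case label table (csv)')
    parser.add_argument('--dst', type=str, help='output directory for plots')
    parser.add_argument('--pct', type=float, default=1.0,
                        help='fraction of tiles to sample')
    parser.add_argument('--thresh', type=float, default=0.01,
                        help='p-value threshold for plotting a feature')
    main(parser.parse_args())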
def main(args):
    feat, case_ids = load_features(args.src, zscore=True)
    lab = load_labels(args.labsrc)

    feat = drop_high_cor(feat, cor_thresh=0.8)
    print('Features after high cor drop')

    # train_x, train_y, test_x, test_y = holdout_cases(feat, lab)
    ((nepc_f, nepc_lab), (m0_f, m0_lab),
     (m0p_f, m0p_lab), (m1_f, m1_lab)) = split_sets(feat, lab)
    del feat

    # Split out non-small-cell NEPC (`scnepc` is a module-level list of
    # small-cell NEPC case ids)
    nepc_is_sc = np.array([x in scnepc for x in nepc_lab['case_id'].values])
    nepc_not_sc = np.array([x not in scnepc for x in nepc_lab['case_id'].values])
    nepc_f_sc = nepc_f.loc[nepc_is_sc, :]
    nepc_lab_sc = nepc_lab.loc[nepc_is_sc, :]
    nepc_f_not_sc = nepc_f.loc[nepc_not_sc, :]
    nepc_lab_not_sc = nepc_lab.loc[nepc_not_sc, :]
    del nepc_f, nepc_lab

    print('NEPC SC lab')
    print(nepc_lab_sc.head())
    print(nepc_lab_sc.shape)
    print('NEPC not SC lab')
    print(nepc_lab_not_sc.head())
    print(nepc_lab_not_sc.shape)

    if args.filter_stats:
        remove_cols = filter_stats(nepc_f_sc, m0_f)
        nepc_f_sc.drop(remove_cols, inplace=True, axis=1)
        nepc_f_not_sc.drop(remove_cols, inplace=True, axis=1)
        m0_f.drop(remove_cols, inplace=True, axis=1)
        m0p_f.drop(remove_cols, inplace=True, axis=1)
        m1_f.drop(remove_cols, inplace=True, axis=1)

    train_x, train_y = make_training(m0_f, nepc_f_sc)
    train_lab = pd.concat([m0_lab, nepc_lab_sc], axis=0)
    print('train lab')
    print(train_lab.head())
    print(train_lab.shape)

    # model = ElasticNet(alpha=1e-3, max_iter=50000).fit(train_x, train_y)
    # model = ElasticNetCV(cv=25).fit(train_x, train_y)
    # model = ElasticNetCV(alphas=np.arange(1e-5, 1e-1, 20),
    #                      cv=10, max_iter=10000, n_jobs=-1).fit(train_x, train_y)
    model = RandomForestRegressor(oob_score=True,
                                  max_depth=25,
                                  n_estimators=100,
                                  n_jobs=-1).fit(train_x, train_y)

    with open('feature_importance.txt', 'w+') as f:
        for v, coef in zip(train_x.columns, model.feature_importances_):
            f.write('{}\t{}\n'.format(v, coef))

    if args.aggr_fn == 'max':
        aggr_fn = np.max
    elif args.aggr_fn == 'mean':
        aggr_fn = np.mean

    # """ Get M0 case numbers """
    # m0_case_numbers = []
    # m0_case_vect = m1_lab['case_id'].values
    # print('M0 Cases:')
    # for uc in np.unique(m0_case_vect):
    #     case_num = int(uc.split('-')[1])
    #     m0_case_numbers.append(case_num)

    """ Predict the M1 cases and gather by mean """
    yhat_m1 = model.predict(m1_f)
    case_aggr = []
    m1_case_numbers = []
    m1_case_vect = m1_lab['case_id'].values
    for uc in np.unique(m1_case_vect):
        yx = yhat_m1[m1_case_vect == uc]
        case_aggr.append(aggr_fn(yx))
        case_num = int(uc.split('-')[1])
        m1_case_numbers.append(case_num)
    m1_case_aggr = np.array(case_aggr)
    m1_case_numbers = np.array(m1_case_numbers)

    # Print out
    m1_lab['NEPC_score'] = yhat_m1
    print('m1 lab')
    print(m1_lab.head())

    """ Predict M0P cases """
    yhat_m0p = model.predict(m0p_f)
    case_aggr = []
    m0p_case_numbers = []
    m0p_case_vect = m0p_lab['case_id'].values
    for uc in np.unique(m0p_case_vect):
        yx = yhat_m0p[m0p_case_vect == uc]
        case_aggr.append(aggr_fn(yx))
        case_num = int(uc.split('-')[1])
        m0p_case_numbers.append(case_num)
    m0p_case_aggr = np.array(case_aggr)
    m0p_case_numbers = np.array(m0p_case_numbers)

    # Print out
    m0p_lab['NEPC_score'] = yhat_m0p
    print('m0p lab')
    print(m0p_lab.head())

    """ Predict NEPC not SC cases """
    yhat_nepc_not_sc = model.predict(nepc_f_not_sc)
    case_aggr = []
    nepc_not_sc_case_numbers = []   # NOTE: never populated below
    nepc_not_sc_case_vect = nepc_lab_not_sc['case_id'].values
    for uc in np.unique(nepc_not_sc_case_vect):
        yx = yhat_nepc_not_sc[nepc_not_sc_case_vect == uc]
        case_aggr.append(aggr_fn(yx))
    nepc_not_sc_case_aggr = np.array(case_aggr)
    nepc_not_sc_case_numbers = np.array(nepc_not_sc_case_numbers)

    # Print out
    nepc_lab_not_sc['NEPC_score'] = yhat_nepc_not_sc
    print('NEPC not sc lab')
    print(nepc_lab_not_sc.head())

    """ Check on training data: run a LOOCV on the training data """
    # yhat_train = []
    # # Just do m0 and nepc separately
    # for cid in np.unique(m0_lab['case_id'].values):
    #     feat_case, feat_other = split_case(m0_f, m0_lab, cid)
    #     feat_split = pd.concat([feat_other, nepc_f])
    #     y_split = [0]*feat_other.shape[0] + [1]*nepc_f.shape[0]
    #     model = RandomForestRegressor(n_estimators=100, n_jobs=-1).fit(feat_split, y_split)
    #     yh = model.predict(feat_case)
    #     print(cid, yh)
    #     yhat_train += list(yh)
    # for cid in np.unique(nepc_lab['case_id'].values):
    #     feat_case, feat_other = split_case(nepc_f, nepc_lab, cid)
    #     feat_split = pd.concat([m0_f, feat_other])
    #     y_split = [0]*m0_f.shape[0] + [1]*feat_other.shape[0]
    #     model = RandomForestRegressor(n_estimators=100, n_jobs=-1).fit(feat_split, y_split)
    #     yh = model.predict(feat_case)
    #     print(cid, yh)
    #     yhat_train += list(yh)
    # yhat_train = np.asarray(yhat_train)
    # print(yhat_train.shape)

    m0_cases = m0_lab['case_id'].values
    nepc_cases = nepc_lab_sc['case_id'].values
    train_case_vect = np.concatenate([m0_cases, nepc_cases])
    # yhat_train = model.predict(train_x)
    yhat_train = model.oob_prediction_
    train_aggr, train_case_y = [], []
    for uc in np.unique(train_case_vect):
        idx = train_case_vect == uc
        train_aggr.append(aggr_fn(yhat_train[idx]))
        train_case_y.append(train_y[idx][0])
    train_aggr = np.array(train_aggr)
    train_case_y = np.array(train_case_y)

    # Print out
    train_lab['NEPC_score'] = yhat_train
    print('train lab')
    print(train_lab.head())

    score_lab = pd.concat([m1_lab, m0p_lab, train_lab], axis=0)
    print(score_lab.shape)
    score_lab.to_csv('tile_paths_with_NEPC_score.csv')

    """ Write out case scores """
    with open('nepc_case_scores.txt', 'w+') as f:
        for case, score in zip(np.unique(m0p_case_vect), m0p_case_aggr):
            f.write('{}\t{}\n'.format(case, score))
        for case, score in zip(np.unique(m1_case_vect), m1_case_aggr):
            f.write('{}\t{}\n'.format(case, score))
        for case, score in zip(np.unique(train_case_vect), train_aggr):
            f.write('{}\t{}\n'.format(case, score))

    """ Do some statistical tests """
    dotest = mannwhitneyu
    # test_args = {'equal_var': True}
    test_args = {}
    test_m0_m1 = dotest(yhat_train[train_y == 0], yhat_m1, **test_args)
    test_m0_m0p = dotest(yhat_train[train_y == 0], yhat_m0p, **test_args)
    test_m0_nepc = dotest(yhat_train[train_y == 0], yhat_train[train_y == 1],
                          **test_args)
    test_nepc_m1 = dotest(yhat_train[train_y == 1], yhat_m1, **test_args)
    test_m0_nepc_not_sc = dotest(yhat_train[train_y == 0], yhat_nepc_not_sc,
                                 **test_args)
    test_nepc_sc_nepc_not_sc = dotest(yhat_train[train_y == 1],
                                      yhat_nepc_not_sc, **test_args)
    print('Tiles M0 vs M1', test_m0_m1)
    print('Tiles M0 vs M0P', test_m0_m0p)
    print('Tiles M0 vs NEPC SC', test_m0_nepc)
    print('Tiles M0 vs NEPC NOT SC', test_m0_nepc_not_sc)
    print('Tiles NEPC vs M1', test_nepc_m1)
    print('Tiles NEPC SC vs NEPC NOT SC', test_nepc_sc_nepc_not_sc)

    test_m0_m1 = dotest(train_aggr[train_case_y == 0], m1_case_aggr, **test_args)
    test_m0_m0p = dotest(train_aggr[train_case_y == 0], m0p_case_aggr, **test_args)
    test_m0_nepc = dotest(train_aggr[train_case_y == 0],
                          train_aggr[train_case_y == 1], **test_args)
    test_nepc_m1 = dotest(train_aggr[train_case_y == 1], m1_case_aggr, **test_args)
    test_m0_nepc_not_sc = dotest(train_aggr[train_case_y == 0],
                                 nepc_not_sc_case_aggr, **test_args)
    test_nepc_sc_nepc_not_sc = dotest(train_aggr[train_case_y == 1],
                                      nepc_not_sc_case_aggr, **test_args)
    print('aggr M0 vs M1', test_m0_m1)
    print('aggr M0 vs M0P', test_m0_m0p)
    print('aggr M0 vs NEPC SC', test_m0_nepc)
    print('aggr M0 vs NEPC NOT SC', test_m0_nepc_not_sc)
    print('aggr NEPC vs M1', test_nepc_m1)
    print('aggr NEPC SC vs NEPC NOT SC', test_nepc_sc_nepc_not_sc)
    print('------------------------------------------------------------------------------------')

    if args.genescore:
        gene_scores = pd.read_csv('../data/signature_scores_beltram.csv',
                                  index_col=None, header=0, sep=',')
        gene_score_caseid = []
        drop_rows = []
        matching_scores = []
        matching_indices = []
        for i, (idx, sn) in enumerate(
                zip(gene_scores.index.values,
                    gene_scores['Surgical Number'].values)):
            try:
                x = int(sn.split(' ')[-1])
                if x in m1_case_numbers:
                    # print('M1 matched SN {}'.format(x))
                    gene_score_caseid.append(x)
                    matching_indices.append(idx)
                    matching_scores.append(m1_case_aggr[m1_case_numbers == x][0])
                # if x in m0_case_numbers:
                #     print('M0 matched SN {}'.format(x))
                #     gene_score_caseid.append(x)
                #     matching_indices.append(idx)
                #     matching_scores.append(m1_case_mean[m1_case_numbers==x][0])
                elif x in m0p_case_numbers:
                    # print('M0P matched SN {}'.format(x))
                    gene_score_caseid.append(x)
                    matching_indices.append(idx)
                    matching_scores.append(m0p_case_aggr[m0p_case_numbers == x][0])
                else:
                    drop_rows.append(idx)
            except Exception:
                drop_rows.append(idx)
                print(sn)

        gene_scores.drop(drop_rows, inplace=True)
        print(gene_scores.shape)
        gene_scores['NEPC Score'] = pd.Series(matching_scores,
                                              index=matching_indices)
        # if args.save_scores:
        #     gene_scores.to_csv('../data/signature_scores_nepc_scores_mean.csv')

        label_cols = ['caseid', 'Disease Stage', 'sample name',
                      'Surgical Number']
        gene_scores.drop(label_cols, inplace=True, axis=1)

        # plt.figure(figsize=(5,5), dpi=300)
        # sns.pairplot(gene_scores, kind='reg')
        # plt.savefig('gene_scores_nepc_score_{}.png'.format(args.aggr_fn), bbox_inches='tight')

        test_cols = [x for x in gene_scores.columns if x != 'NEPC Score']
        scores = gene_scores['NEPC Score'].values
        for c in test_cols:
            try:
                ctest = spearmanr(scores, gene_scores[c].values)
                print('spearman {:40}: {:3.5f} p={:3.5f}'.format(
                    c, ctest.correlation, ctest.pvalue))
                ctest = pearsonr(scores, gene_scores[c].values)
                print('pearson  {:40}: {:3.5f} p={:3.5f}'.format(
                    c, ctest[0], ctest[1]))
            except Exception:
                print('Test column {} failed'.format(c))
        print('------------------------------------------------------------------------------------')

    if args.boxplot:
        f, (ax_box, ax_hist) = plt.subplots(
            2, sharex=True, gridspec_kw={"height_ratios": (.35, .65)})
        plt_m0 = train_aggr[train_case_y == 0]
        plt_nepc_sc = train_aggr[train_case_y == 1]
        plt_nepc_not_sc = nepc_not_sc_case_aggr
        plt_m1 = m1_case_aggr
        plt_m0p = m0p_case_aggr

        auc_ = roc_auc_score(y_true=train_case_y, y_score=train_aggr)
        print('M0 NEPC SC AUC = ', auc_)

        m0m1 = np.concatenate([plt_m0, plt_m1])
        m0m1_y = np.array([0] * len(plt_m0) + [1] * len(plt_m1))
        auc_ = roc_auc_score(y_true=m0m1_y, y_score=m0m1)
        print('M0 M1 AUC = ', auc_)

        m0m0p = np.concatenate([plt_m0, plt_m0p])
        m0m0p_y = np.array([0] * len(plt_m0) + [1] * len(plt_m0p))
        auc_ = roc_auc_score(y_true=m0m0p_y, y_score=m0m0p)
        print('M0 M0P AUC = ', auc_)

        m0nepc_not_sc = np.concatenate([plt_m0, plt_nepc_not_sc])
        m0nepc_not_sc_y = np.array([0] * len(plt_m0) +
                                   [1] * len(plt_nepc_not_sc))
        auc_ = roc_auc_score(y_true=m0nepc_not_sc_y, y_score=m0nepc_not_sc)
        print('M0 NEPC not SC AUC = ', auc_)

        sns.distplot(plt_m0, bins=25, norm_hist=True, kde=True,
                     label='M0', ax=ax_hist)
        sns.distplot(plt_nepc_sc, bins=25, norm_hist=True, kde=True,
                     label='NEPC SC', ax=ax_hist)
        sns.distplot(plt_m1, bins=25, norm_hist=True, kde=True,
                     label='M1', ax=ax_hist)
        sns.distplot(plt_m0p, bins=25, norm_hist=True, kde=True,
                     label='M0-P', ax=ax_hist)
        sns.distplot(plt_nepc_not_sc, bins=25, norm_hist=True, kde=True,
                     label='NEPC not SC', ax=ax_hist)
        ax_hist.set_xlabel('Score')
        ax_hist.set_ylabel('Frequency')

        concat_scores = np.concatenate(
            [plt_m0, plt_nepc_sc, plt_m1, plt_m0p, plt_nepc_not_sc])
        concat_labels = np.array(['M0'] * len(plt_m0) +
                                 ['NEPC SC'] * len(plt_nepc_sc) +
                                 ['M1'] * len(plt_m1) +
                                 ['M0P'] * len(plt_m0p) +
                                 ['NEPC not SC'] * len(plt_nepc_not_sc))
        plt_df = pd.DataFrame({'Set': concat_labels, 'Score': concat_scores})

        # fig = plt.figure(figsize=(2,2), dpi=300)
        sns.boxplot(y='Set', x='Score', data=plt_df, ax=ax_box)
        sns.stripplot(y='Set', x='Score', data=plt_df, size=2.5, jitter=True,
                      linewidth=0.5, ax=ax_box)
        # ax_box.set_ylabel('')
        # ax_box.set_xlabel('')
        # plt.show()
        plt.savefig('NEPC_score_{}.png'.format(args.aggr_fn),
                    bbox_inches='tight')
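# The per-case aggregation loop above is repeated verbatim for the M1,
# M0-P, NEPC, and training sets, here and in the sibling scripts. A
# hedged refactor of that shared pattern (not part of the original code):
import numpy as np

def aggregate_by_case(yhat, case_vect, aggr_fn=np.mean):
    """Collapse tile-level scores to one score per unique case."""
    cases = np.unique(case_vect)
    aggr = np.array([aggr_fn(yhat[case_vect == c]) for c in cases])
    return cases, aggr

# e.g.: m1_cases, m1_case_aggr = aggregate_by_case(yhat_m1, m1_case_vect, aggr_fn)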
def main(args):
    data = pd.read_csv(args.src, index_col=0, memory_map=True)
    lab = pd.read_csv(args.lab)
    print(data.shape)
    print(lab.shape)
    print(lab.head())

    data = data.sample(frac=args.pct)
    print(data.shape)
    print(data.head())

    # Grab the id columns
    case_id = data['case_id']
    tile_id = data['tile_id']
    data.drop(['case_id', 'tile_id'], inplace=True, axis=1)
    print(data.shape)
    print(data.head())

    if args.ae_only:
        to_drop = [x for x in data.columns if 'ae' not in x]
        data.drop(to_drop, axis=1, inplace=True)
    if args.hc_only:
        to_drop = [x for x in data.columns if 'hc' not in x]
        data.drop(to_drop, axis=1, inplace=True)

    data = data.transform(lambda x: (x - np.mean(x)) / np.std(x))

    # Find and drop columns containing inf or nan after the z-score
    isinfs = np.sum(np.isinf(data.values), axis=0)
    print('isinfs', isinfs.shape)
    isnans = np.sum(np.isnan(data.values), axis=0)
    print('isnans', isnans.shape)
    print(np.argwhere(isinfs))
    print(np.argwhere(isnans))
    # data = data.dropna(axis='index')
    inf_cols = data.columns.values[np.squeeze(np.argwhere(isinfs))]
    nan_cols = data.columns.values[np.squeeze(np.argwhere(isnans))]
    print('inf_cols', inf_cols)
    print('nan_cols', nan_cols)
    data.drop(inf_cols, axis=1, inplace=True)
    data.drop(nan_cols, axis=1, inplace=True)
    print(data.shape)

    # Drop correlated columns
    data = drop_high_cor(data, 0.7)

    if args.average:
        print('Averaging features')
        if args.average_by == 'case':
            print('by: case')
            data = data.groupby(by=case_id, group_keys=True).mean()
            lab = lab.groupby('case_id').max()
        elif args.average_by == 'tile':
            print('by: tile')
            data = data.groupby(by=tile_id, group_keys=True).mean()
            lab = lab.groupby('tile_id').max()
        else:
            pass
        print(data.shape)
        print(data.head())
        print(lab.head())

        is_nepc = []
        for x, t in zip(lab['stage_str'].values, lab.index.values):
            if t in data.index:
                is_nepc.append(x == 'NEPC')
        is_nepc = np.array(is_nepc)
        print(is_nepc.shape)
    else:
        # NOTE: `is_nepc` is only built in the averaged branch; running
        # without averaging leaves it undefined for plot_embedded below.
        pass

    emb = MulticoreTSNE(n_jobs=-1).fit_transform(data)
    # emb = umap.UMAP().fit_transform(data)
    plot_embedded(emb, is_nepc)
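# `plot_embedded` is a small helper defined elsewhere. A minimal sketch,
# assuming it scatters the 2-D t-SNE embedding colored by the NEPC flag:
import numpy as np
import matplotlib.pyplot as plt

def plot_embedded(emb, is_nepc):
    """Scatter a 2-D embedding, NEPC vs. the rest."""
    is_nepc = np.asarray(is_nepc, dtype=bool)
    plt.figure(figsize=(4, 4), dpi=300)
    plt.scatter(emb[~is_nepc, 0], emb[~is_nepc, 1], s=3, label='Adeno')
    plt.scatter(emb[is_nepc, 0], emb[is_nepc, 1], s=3, label='NEPC')
    plt.legend(frameon=True)
    plt.show()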
def main(args):
    feat = pd.read_csv(args.feature_src, index_col=0)
    lab = pd.read_csv(args.label_src)
    case_ids = feat['case_id']
    tile_ids = feat.index
    stages = lab['stage_str']
    feat.drop(['case_id'], axis=1, inplace=True)
    # feat.drop([c for c in feat.columns if 'Unnamed' in c], axis=1, inplace=True)
    # case_ids = case_ids.loc[feat.index]
    # tile_ids = tile_ids.loc[feat.index]
    # stages = stages.loc[feat.index]
    print(feat.shape)
    print(case_ids.shape)
    print(tile_ids.shape)
    print(stages.shape)

    print('Dropping nan, inf and high corr')
    feat = drop_high_cor(feat, 0.8)
    feat = feat.transform(lambda x: (x - np.mean(x)) / np.std(x))

    if os.path.exists(args.reject_feats):
        usecols = np.invert(np.load(args.reject_feats))
        print('Rejecting features', args.reject_feats, usecols.shape,
              np.sum(usecols))
        feat = feat.loc[:, usecols]
    # (the original `else` branch referenced an undefined `usecols`;
    # with no reject list, keep all features)

    feat = drop_nan_inf(feat)
    feat = drop_var(feat, 0.5)
    print(feat.shape)
    print(feat.head())

    if args.average:
        print('Average by case')
        feat = feat.groupby(by=case_ids.values).mean()
        stages = stages.groupby(by=case_ids.values).max()
        print(feat.shape)
        print(stages.shape)

    row_p = sns.color_palette('muted', 3)
    row_colors = []
    print(np.unique(stages.values))
    for s in stages.values:
        if s in m0_strs:
            row_colors.append(row_p[0])
        elif s in m1_strs:
            row_colors.append(row_p[1])
        elif 'NEPC' in s:
            row_colors.append(row_p[2])
        else:
            row_colors.append(row_p[1])
    print('row_colors', len(row_colors))

    # projected = TruncatedSVD(n_components=10).fit_transform(feat.values)
    # projected = PCA(n_components=10).fit_transform(feat.values)
    sns.clustermap(feat.values, metric=args.metric, standard_scale=1,
                   row_colors=row_colors)
    plt.savefig(args.dst)
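# Hypothetical CLI wiring for the clustermap script; the flag names
# mirror the `args` attributes above, defaults are assumptions:
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--feature_src', type=str)
    parser.add_argument('--label_src', type=str)
    parser.add_argument('--reject_feats', type=str, default='reject.npy')
    parser.add_argument('--average', action='store_true')
    parser.add_argument('--metric', type=str, default='correlation')
    parser.add_argument('--dst', type=str, default='clustermap.png')
    main(parser.parse_args())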
def train(args):
    feat = pd.read_csv(args.src, index_col=0, header=0)
    print(feat.head())
    print(feat.shape)
    labels = pd.read_csv(args.labsrc, sep='\t')
    print(labels.shape)

    yvect = get_y(feat['case_id'], labels)
    print(yvect.shape)

    # Drop rows that come from cases we want to exclude (code 4)
    usable_data = yvect != 4
    yvect = yvect[usable_data]
    print(yvect.shape)
    feat = feat.loc[usable_data, :]

    nuclei_case_ids = feat['case_id']
    nuclei_tile_ids = feat['tile_id']
    feat.drop(['case_id', 'tile_id'], axis=1, inplace=True)
    print('dropped label cols', feat.shape)

    # drop_cols = [x for x in feat.columns if 'hc' not in x]
    # feat.drop(drop_cols, inplace=True, axis=1)
    # print('dropped chosen cols', feat.shape)

    # Drop columns of features
    feat = drop_var(feat)
    print('dropped low var', feat.shape)
    feat = drop_high_cor(feat, 0.8)
    print('dropped corr', feat.shape)
    feat = drop_nan_inf(feat)
    print('dropped nan inf', feat.shape)
    feat = feat.transform(lambda x: (x - np.mean(x)) / np.std(x))
    print(feat.head())
    print(feat.shape)

    # Split off M1 and non-small-cell NEPC; train on M0 vs NEPC (y < 2)
    m1rows = yvect == 2
    nepc_not_sc_rows = yvect == 5
    m0nepc_rows = yvect < 2
    yvect_m0nepc = yvect[m0nepc_rows]
    feat_m0nepc = feat.loc[m0nepc_rows, :]
    feat_m1 = feat.loc[m1rows, :]
    feat_nepc_not_sc = feat.loc[nepc_not_sc_rows, :]
    del feat, yvect

    train_idx, test_idx = train_test_split(np.arange(len(yvect_m0nepc)))
    train_x = feat_m0nepc.iloc[train_idx, :]
    train_y = yvect_m0nepc[train_idx]
    test_x = feat_m0nepc.iloc[test_idx, :]
    test_y = yvect_m0nepc[test_idx]
    print(train_x.shape)
    print(test_x.shape)

    model = RandomForestRegressor(max_depth=35,
                                  max_features='sqrt',
                                  n_estimators=200,
                                  n_jobs=-1).fit(train_x, train_y)

    # ypred = model.predict(test_x)
    # print(ypred.shape)
    # print(ypred.mean())
    # print(ypred)
    # m1pred = model.predict(feat_m1)
    # nepc_not_sc_pred = model.predict(feat_nepc_not_sc)
    # plt_m0 = ypred[test_y == 0]
    # plt_nepc = ypred[test_y == 1]
    # plt_m1 = m1pred
    # plt_nepc_not_sc = nepc_not_sc_pred
    # do_boxplot(plt_m0, plt_nepc, plt_m1, plt_m0p, plt_nepc_not_sc, args.figout)

    dump(model, args.save)  # joblib dump of the fitted model
    np.save('nucleus_classifier_features.npy', train_x.columns.values)
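# `get_y` is defined elsewhere. From how `yvect` is consumed above, it
# appears to map each tile's case_id to an integer group code; the
# encoding below is inferred from this script, not confirmed:
# 0 = M0, 1 = NEPC small cell, 2 = M1, 4 = excluded, 5 = NEPC not small cell.
import numpy as np

def get_y(case_ids, labels):
    """Look up an integer group code for every tile's case_id."""
    # `group_code` is a hypothetical column in the label table
    lut = dict(zip(labels['case_id'].values, labels['group_code'].values))
    return np.array([lut.get(c, 4) for c in case_ids.values])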
def main(args):
    feat = pd.read_csv(args.src, index_col=0, header=0)
    labels = pd.read_csv(args.labsrc, index_col=0, header=0, sep='\t')
    feat.drop('case_id', axis=1, inplace=True)

    # Keep only rows with a nonzero nucleus score
    use_rows = feat['n_score'].values != 0
    feat = feat.iloc[use_rows, :]
    labels = labels.iloc[use_rows, :]
    print('using tables:', feat.shape, labels.shape)

    case_ids = labels['case_id'].values
    tile_ids = labels.index.values
    stages = labels['stage_str'].values

    feat = drop_high_cor(feat, 0.8)
    print('Features after high cor drop')
    feat = feat.transform(lambda x: (x - np.mean(x)) / np.std(x))
    print('Features after zscore')
    feat = drop_nan_inf(feat)
    print('Features after dropping nan and infs')

    ((nepc_f, nepc_lab), (m0_f, m0_lab),
     (m0p_f, m0p_lab), (m1_f, m1_lab)) = split_sets(feat, labels)
    del feat

    if args.filter_stats:
        remove_cols = filter_stats(nepc_f, m0_f)
        nepc_f.drop(remove_cols, inplace=True, axis=1)
        m0_f.drop(remove_cols, inplace=True, axis=1)
        m0p_f.drop(remove_cols, inplace=True, axis=1)
        m1_f.drop(remove_cols, inplace=True, axis=1)

    train_x, train_y = make_training(m0_f, nepc_f)
    print('train_x', train_x.shape)
    print('train_y', train_y.shape)
    print('m1_f', m1_f.shape)

    model = RandomForestRegressor(oob_score=True,
                                  max_depth=20,
                                  max_features='sqrt',
                                  n_estimators=150,
                                  n_jobs=-1).fit(train_x, train_y)

    with open('feature_importance.txt', 'w+') as f:
        for v, coef in zip(train_x.columns, model.feature_importances_):
            f.write('{}\t{}\n'.format(v, coef))

    if args.aggr_fn == 'max':
        aggr_fn = np.max
    elif args.aggr_fn == 'mean':
        aggr_fn = np.mean

    """ Predict the M1 cases and gather by max or mean """
    yhat_m1 = model.predict(m1_f)
    case_aggr = []
    m1_case_numbers = []
    m1_case_vect = m1_lab['case_id'].values
    for uc in np.unique(m1_case_vect):
        yx = yhat_m1[m1_case_vect == uc]
        case_aggr.append(aggr_fn(yx))
        case_num = int(uc.split('-')[1])
        m1_case_numbers.append(case_num)
    m1_case_aggr = np.array(case_aggr)
    m1_case_numbers = np.array(m1_case_numbers)

    """ Predict M0P cases """
    yhat_m0p = model.predict(m0p_f)
    case_aggr = []
    m0p_case_numbers = []
    m0p_case_vect = m0p_lab['case_id'].values
    for uc in np.unique(m0p_case_vect):
        yx = yhat_m0p[m0p_case_vect == uc]
        case_aggr.append(aggr_fn(yx))
        case_num = int(uc.split('-')[1])
        m0p_case_numbers.append(case_num)
    m0p_case_aggr = np.array(case_aggr)
    m0p_case_numbers = np.array(m0p_case_numbers)

    """ Check on the training data """
    m0_cases = m0_lab['case_id'].values
    nepc_cases = nepc_lab['case_id'].values
    train_case_vect = np.concatenate([m0_cases, nepc_cases])
    # yhat_train = model.predict(train_x)
    yhat_train = model.oob_prediction_
    train_aggr, train_case_y = [], []
    for uc in np.unique(train_case_vect):
        idx = train_case_vect == uc
        train_aggr.append(aggr_fn(yhat_train[idx]))
        train_case_y.append(train_y[idx][0])
    train_aggr = np.array(train_aggr)
    train_case_y = np.array(train_case_y)

    """ Write out case scores """
    with open('nepc_case_scores.txt', 'w+') as f:
        for case, score in zip(np.unique(m0p_case_vect), m0p_case_aggr):
            f.write('{}\t{}\n'.format(case, score))
        for case, score in zip(np.unique(m1_case_vect), m1_case_aggr):
            f.write('{}\t{}\n'.format(case, score))
        for case, score in zip(np.unique(train_case_vect), train_aggr):
            f.write('{}\t{}\n'.format(case, score))

    """ Do some statistical tests """
    dotest = mannwhitneyu
    # test_args = {'equal_var': True}
    test_args = {}
    test_m0_m1 = dotest(yhat_train[train_y == 0], yhat_m1, **test_args)
    test_m0_m0p = dotest(yhat_train[train_y == 0], yhat_m0p, **test_args)
    test_m0_nepc = dotest(yhat_train[train_y == 0], yhat_train[train_y == 1],
                          **test_args)
    test_nepc_m1 = dotest(yhat_train[train_y == 1], yhat_m1, **test_args)
    print('Tiles M0 vs M1', test_m0_m1)
    print('Tiles M0 vs M0P', test_m0_m0p)
    print('Tiles M0 vs NEPC', test_m0_nepc)
    print('Tiles NEPC vs M1', test_nepc_m1)

    test_m0_m1 = dotest(train_aggr[train_case_y == 0], m1_case_aggr, **test_args)
    test_m0_m0p = dotest(train_aggr[train_case_y == 0], m0p_case_aggr, **test_args)
    test_m0_nepc = dotest(train_aggr[train_case_y == 0],
                          train_aggr[train_case_y == 1], **test_args)
    test_nepc_m1 = dotest(train_aggr[train_case_y == 1], m1_case_aggr, **test_args)
    print('aggr M0 vs M1', test_m0_m1)
    print('aggr M0 vs M0P', test_m0_m0p)
    print('aggr M0 vs NEPC', test_m0_nepc)
    print('aggr NEPC vs M1', test_nepc_m1)

    """ ROC - AUC """
    print('------------------------------------------------------------------------------------')
    m0nepc_ypred = np.concatenate(
        [train_aggr[train_case_y == 0], train_aggr[train_case_y == 1]])
    m0nepc_ytrue = np.array([0] * np.sum(train_case_y == 0) +
                            [1] * np.sum(train_case_y == 1))
    m0m1_ypred = np.concatenate([train_aggr[train_case_y == 0], m1_case_aggr])
    m0m1_ytrue = np.array([0] * np.sum(train_case_y == 0) +
                          [1] * len(m1_case_aggr))
    m0m0p_ypred = np.concatenate([train_aggr[train_case_y == 0], m0p_case_aggr])
    m0m0p_ytrue = np.array([0] * np.sum(train_case_y == 0) +
                           [1] * len(m0p_case_aggr))
    print('m0nepc_ypred', m0nepc_ypred.shape, m0nepc_ytrue.shape)
    print('m0m1_ypred', m0m1_ypred.shape, m0m1_ytrue.shape)
    print('m0m0p_ypred', m0m0p_ypred.shape, m0m0p_ytrue.shape)
    auc_ = roc_auc_score(y_true=m0nepc_ytrue, y_score=m0nepc_ypred)
    print('M0 - NEPC AUC = ', auc_)
    auc_ = roc_auc_score(y_true=m0m1_ytrue, y_score=m0m1_ypred)
    print('M0 - M1 AUC = ', auc_)
    auc_ = roc_auc_score(y_true=m0m0p_ytrue, y_score=m0m0p_ypred)
    print('M0 - M0P AUC = ', auc_)
    print('------------------------------------------------------------------------------------')

    # Match case scores against the gene signature table by Surgical Number
    gene_scores = pd.read_csv('../data/signature_scores_beltram.csv',
                              index_col=None, header=0, sep=',')
    gene_score_caseid = []
    drop_rows = []
    matching_scores = []
    matching_indices = []
    for i, (idx, sn) in enumerate(
            zip(gene_scores.index.values,
                gene_scores['Surgical Number'].values)):
        try:
            x = int(sn.split(' ')[-1])
            if x in m1_case_numbers:
                gene_score_caseid.append(x)
                matching_indices.append(idx)
                matching_scores.append(m1_case_aggr[m1_case_numbers == x][0])
            elif x in m0p_case_numbers:
                gene_score_caseid.append(x)
                matching_indices.append(idx)
                matching_scores.append(m0p_case_aggr[m0p_case_numbers == x][0])
            else:
                drop_rows.append(idx)
        except Exception:
            drop_rows.append(idx)

    gene_scores.drop(drop_rows, inplace=True)
    label_cols = ['caseid', 'Disease Stage', 'sample name', 'Surgical Number']
    gene_scores.drop(label_cols, inplace=True, axis=1)
    gene_scores['NEPC Score'] = pd.Series(matching_scores,
                                          index=matching_indices)

    plt.figure(figsize=(5, 5), dpi=300)
    sns.pairplot(gene_scores, kind='reg')
    plt.savefig('gene_scores_nepc_score_{}_tile.png'.format(args.aggr_fn),
                bbox_inches='tight')

    test_cols = [x for x in gene_scores.columns if x != 'NEPC Score']
    scores = gene_scores['NEPC Score'].values
    print('------------------------------------------------------------------------------------')
    for c in test_cols:
        ctest = spearmanr(scores, gene_scores[c].values)
        print('spearman {:40}: {:3.5f} p={:3.5f}'.format(
            c, ctest.correlation, ctest.pvalue))
        ctest = pearsonr(scores, gene_scores[c].values)
        print('pearson  {:40}: {:3.5f} p={:3.5f}'.format(
            c, ctest[0], ctest[1]))
    print('------------------------------------------------------------------------------------')

    if args.boxplot:
        f, (ax_box, ax_hist) = plt.subplots(
            2, sharex=True, gridspec_kw={"height_ratios": (.35, .65)})
        plt_m0 = train_aggr[train_case_y == 0]
        plt_nepc = train_aggr[train_case_y == 1]
        plt_m1 = m1_case_aggr
        plt_m0p = m0p_case_aggr
        sns.distplot(plt_m0, bins=25, norm_hist=True, kde=True,
                     label='M0', ax=ax_hist)
        sns.distplot(plt_nepc, bins=25, norm_hist=True, kde=True,
                     label='NEPC', ax=ax_hist)
        sns.distplot(plt_m1, bins=25, norm_hist=True, kde=True,
                     label='M1', ax=ax_hist)
        sns.distplot(plt_m0p, bins=25, norm_hist=True, kde=True,
                     label='M0-P', ax=ax_hist)
        ax_hist.set_xlabel('Score')
        ax_hist.set_ylabel('Frequency')

        concat_scores = np.concatenate([plt_m0, plt_nepc, plt_m1, plt_m0p])
        concat_labels = np.array(['M0'] * len(plt_m0) +
                                 ['NEPC'] * len(plt_nepc) +
                                 ['M1'] * len(plt_m1) +
                                 ['M0P'] * len(plt_m0p))
        plt_df = pd.DataFrame({'Set': concat_labels, 'Score': concat_scores})
        sns.boxplot(y='Set', x='Score', data=plt_df, ax=ax_box)
        sns.stripplot(y='Set', x='Score', data=plt_df, size=2.5, jitter=True,
                      linewidth=0.5, ax=ax_box)
        plt.savefig('NEPC_score_{}_tile.png'.format(args.aggr_fn),
                    bbox_inches='tight')
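# `filter_stats` is shared across the regression scripts and defined
# elsewhere. A plausible sketch, assuming it flags feature columns that
# fail to separate the NEPC and M0 tiles (the test and threshold here
# are assumptions):
from scipy.stats import mannwhitneyu

def filter_stats(nepc_f, m0_f, p_thresh=0.05):
    """Return the columns with no significant NEPC-vs-M0 difference."""
    remove_cols = []
    for c in nepc_f.columns:
        _, p = mannwhitneyu(nepc_f[c].values, m0_f[c].values)
        if p > p_thresh:
            remove_cols.append(c)
    return remove_cols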
def main(args):
    feat, case_ids = load_features(args.src, zscore=True)
    lab = load_labels(args.labsrc)

    feat = drop_high_cor(feat, cor_thresh=0.8)
    print('Features after high cor drop')
    print(feat.head())

    is_nepc = np.array([x in nepc_strs for x in lab['stage_str']])
    is_adeno = np.array([x in adeno_strs for x in lab['stage_str']])
    is_m0 = np.array([x in m0_strs for x in lab['stage_str']])
    is_m0p = np.array([x in m0p_strs for x in lab['stage_str']])
    is_m1 = np.array([x in m1_strs for x in lab['stage_str']])

    nepc_case_feat = feat.loc[is_nepc, :]
    nepc_lab = lab.loc[is_nepc, :]
    adeno_case_feat = feat.loc[is_adeno, :]
    adeno_lab = lab.loc[is_adeno, :]
    m0_case_feat = feat.loc[is_m0, :]
    m0_lab = lab.loc[is_m0, :]
    m0p_case_feat = feat.loc[is_m0p, :]
    m0p_lab = lab.loc[is_m0p, :]
    m1_case_feat = feat.loc[is_m1, :]
    m1_lab = lab.loc[is_m1, :]

    if args.reduce_case:
        nepc_case_feat = nepc_case_feat.groupby(nepc_lab['case_id']).mean()
        adeno_case_feat = adeno_case_feat.groupby(adeno_lab['case_id']).mean()
        m0_case_feat = m0_case_feat.groupby(m0_lab['case_id']).mean()
        m0p_case_feat = m0p_case_feat.groupby(m0p_lab['case_id']).mean()
        m1_case_feat = m1_case_feat.groupby(m1_lab['case_id']).mean()

    print('NEPC features:', nepc_case_feat.shape)
    print('Adeno features:', adeno_case_feat.shape)
    print('M0 features:', m0_case_feat.shape)
    print('M0p features:', m0p_case_feat.shape)
    print('M1 features:', m1_case_feat.shape)

    # Per-feature t-tests for three contrasts
    nepc_adeno_p = []
    m0_m1_p = []
    m0_m0p_p = []
    for c in nepc_case_feat.columns:
        nepc_ = nepc_case_feat[c].values
        adeno_ = adeno_case_feat[c].values
        m0_ = m0_case_feat[c].values
        m0p_ = m0p_case_feat[c].values
        m1_ = m1_case_feat[c].values
        tt_nepc_adeno = ttest_ind(nepc_, adeno_)
        tt_m0_m1 = ttest_ind(m0_, m1_)
        tt_m0_m0p = ttest_ind(m0_, m0p_)
        nepc_adeno_p.append(tt_nepc_adeno[1])
        m0_m1_p.append(tt_m0_m1[1])
        m0_m0p_p.append(tt_m0_m0p[1])

    # Benjamini-Hochberg FDR correction per contrast
    nepc_adeno_reject, nepc_adeno_q, _, _ = multipletests(
        nepc_adeno_p, alpha=0.01, method='fdr_bh')
    m0_m1_reject, m0_m1_q, _, _ = multipletests(
        m0_m1_p, alpha=0.01, method='fdr_bh')
    m0_m0p_reject, m0_m0p_q, _, _ = multipletests(
        m0_m0p_p, alpha=0.01, method='fdr_bh')
    print('Rejecting {} '.format(np.sum(nepc_adeno_reject)))
    print('Rejecting {} '.format(np.sum(m0_m1_reject)))
    print('Rejecting {} '.format(np.sum(m0_m0p_reject)))
    np.save('nepc_adeno_reject.npy', np.array(nepc_adeno_reject))
    np.save('m0_m1_reject.npy', np.array(m0_m1_reject))
    np.save('m0_m0p_reject.npy', np.array(m0_m0p_reject))

    for i, c in enumerate(nepc_case_feat.columns):
        # Plot unless the feature is rejected (significant) in all three tests
        plot_it = not (nepc_adeno_reject[i] and m0_m1_reject[i]
                       and m0_m0p_reject[i])
        if plot_it:
            print('plotting feature ', c)
            nepc_ = nepc_case_feat[c].values
            adeno_ = adeno_case_feat[c].values
            m0_ = m0_case_feat[c].values
            m0p_ = m0p_case_feat[c].values
            m1_ = m1_case_feat[c].values
            plt.clf()
            sns.distplot(nepc_, label='NEPC')
            sns.distplot(adeno_, label='Adeno')
            sns.distplot(m0_, label='M0')
            sns.distplot(m0p_, label='M0-P')
            sns.distplot(m1_, label='M1')
            plt.legend(frameon=True)
            plt.title('{}\nnepc q={:.3E}\nm1 q={:.3E}\nm0p q={:.3E}'.format(
                c, nepc_adeno_q[i], m0_m1_q[i], m0_m0p_q[i]))
            saveto = os.path.join(args.dst, '{}.png'.format(c))
            plt.savefig(saveto, bbox_inches='tight')
        else:
            print('skipping feature ', c)
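# Hypothetical CLI wiring for this script; flags mirror the `args`
# attributes above, defaults are assumptions:
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--src', type=str, help='feature table')
    parser.add_argument('--labsrc', type=str, help='label table')
    parser.add_argument('--dst', type=str, help='output directory for plots')
    parser.add_argument('--reduce_case', action='store_true',
                        help='average tile features within each case first')
    main(parser.parse_args())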