def train_seq_model(train, dev, params):
    '''
    Fits a SeqModel on encoded sequences and scores it on train and dev.

    The train dataframe is first passed through downsample_negatives with
    p=0.25 and the shape of the result is printed. The 'seq' column of each
    dataframe is run through encode_seq and the 'Label' column is used as
    the target. The model is built with a fixed input shape of (19, 4).

    Returns a (results, model) tuple, where results holds the AUPR and AUROC
    values for the train split, the dev split, and the benchmark scores
    reported by get_benchmark_score for dev.
    '''
    sampled = downsample_negatives(train, p=0.25)
    print(sampled.shape)

    labels_train = sampled.Label.values
    labels_dev = dev.Label.values
    enc_train = encode_seq(sampled['seq'])
    enc_dev = encode_seq(dev['seq'])

    model = SeqModel(input_shape=(19, 4), params=params).build()
    model.train(enc_train, labels_train, enc_dev, labels_dev)
    scores_train = model.predict(enc_train)
    scores_dev = model.predict(enc_dev)

    # Metrics are evaluated in a fixed order: AUPR first, then AUROC,
    # then the benchmark scores on the untouched dev dataframe.
    aupr_train = avgPR(labels_train, scores_train)
    aupr_dev = avgPR(labels_dev, scores_dev)
    roc_train = auroc(labels_train, scores_train)
    roc_dev = auroc(labels_dev, scores_dev)
    aupr_bench, roc_bench = get_benchmark_score(dev)

    summary = {
        'AUPR_train': aupr_train,
        'AUROC_train': roc_train,
        'AUPR_dev': aupr_dev,
        'AUROC_dev': roc_dev,
        'AUPR_bench': aupr_bench,
        'AUROC_bench': roc_bench,
    }
    return summary, model
def get_benchmark_score(data, benchmark='GNET'):
    '''
    Scores a benchmark column against the labels for the rows of data.

    Loads the 'E116' benchmark table, restricts it to the given dataframe
    via get_benchmarks_for_slice, and returns the (AUPR, AUROC) pair computed
    from the 'Label' column and the column named by benchmark.
    '''
    matched = get_benchmarks_for_slice(data, load_benchmark(dataset='E116'))
    labels = matched['Label']
    scores = matched[benchmark]
    return avgPR(labels, scores), auroc(labels, scores)
def train_model(train, dev, params, return_probs=False):
    '''
    Trains a model on input train and dev dataframes, with parameter
    dictionary params.

    The 'chr', 'pos', 'rs' and 'Label' columns are removed to form the
    feature matrix; 'Label' is the target. Features are log-transformed and
    then standardized with a scaler fitted on the train split only.

    params must contain 'use_pc' and may need 'use_clust': a value of 1 for
    'use_pc' selects PartialModel, otherwise a value of 1 for 'use_clust'
    selects ClusterModel, otherwise BaseModel is used. The whole dictionary
    is also forwarded to the model constructor.

    If return_probs is True, only the dev-set predictions are returned and
    no metrics are computed. Otherwise returns a (results, model) tuple,
    where results holds AUPR/AUROC for train, dev and the benchmark.
    '''
    non_feature_cols = ['chr', 'pos', 'rs', 'Label']

    X_train = train.drop(non_feature_cols, axis=1)
    y_train = train.Label.values
    X_dev = dev.drop(non_feature_cols, axis=1)
    y_dev = dev.Label.values

    # NOTE(review): np.log yields -inf/nan for non-positive values; this
    # assumes all feature values are strictly positive -- confirm.
    X_train = np.log(X_train)
    X_dev = np.log(X_dev)

    # Fit the scaler on train only so no dev statistics leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_dev = scaler.transform(X_dev)

    X_shape = (X_train.shape[1], )

    if params['use_pc'] == 1:
        mod = PartialModel(input_shape=X_shape, params=params).build()
    elif params['use_clust'] == 1:
        # ClusterModel is the only variant whose build() takes the data.
        mod = ClusterModel(input_shape=X_shape, params=params).build(X_train)
    else:
        mod = BaseModel(input_shape=X_shape, params=params).build()

    mod.train(X_train, y_train, X_dev, y_dev)

    # When only probabilities are requested, skip the train-set prediction,
    # the metric computations and the benchmark load: none of them are
    # part of the returned value.
    if return_probs:
        return mod.predict(X_dev)

    probs_train = mod.predict(X_train)
    probs_dev = mod.predict(X_dev)

    train_avgpr = avgPR(y_train, probs_train)
    dev_avgpr = avgPR(y_dev, probs_dev)
    train_auroc = auroc(y_train, probs_train)
    dev_auroc = auroc(y_dev, probs_dev)
    bench_avgpr, bench_auroc = get_benchmark_score(dev)

    results = {
        'AUPR_train': train_avgpr,
        'AUROC_train': train_auroc,
        'AUPR_dev': dev_avgpr,
        'AUROC_dev': dev_auroc,
        'AUPR_bench': bench_avgpr,
        'AUROC_bench': bench_auroc,
    }
    return results, mod
def official_benchmark_splits(train, dev, test, benchmark='GNET'):
    '''
    Reports benchmark-model metrics for the train, dev and test dataframes.

    Loads the 'E116' benchmark predictions, matches them to each dataframe
    with get_benchmarks_for_slice, and then, always in train/dev/test order:
    calls metric_report on the labels and the bin_probs output of the
    benchmark column, prints the AUROC values, and prints the AUPR values.
    Nothing is returned.
    '''
    bench = load_benchmark(dataset='E116')
    slices = [get_benchmarks_for_slice(part, bench)
              for part in (train, dev, test)]

    # Three separate passes keep the output grouped by metric, not by split.
    for matched in slices:
        metric_report(matched['Label'], bin_probs(matched[benchmark]))

    for matched in slices:
        print(auroc(matched['Label'], matched[benchmark]))

    for matched in slices:
        print(avgPR(matched['Label'], matched[benchmark]))
def logistic_benchmark(train, dev):
    '''
    Fits regularized logistic regression benchmark on input train and dev
    dataframes.

    Features are every column from position 4 onwards; 'Label' is the
    target. Features are log-transformed and then standardized with a scaler
    fitted on the train split only. The classifier is a LogisticRegressionCV
    with Cs=5, cv=3 and max_iter=1000.

    Returns a (results, clf) tuple. results holds AUPR/AUROC for train and
    dev, plus 'AUPR_bench' and 'AUROC_bench' entries fixed at 0 so the keys
    match those produced by train_model.
    '''
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler

    # NOTE(review): the positional slice presumably skips the same four
    # columns ('chr', 'pos', 'rs', 'Label') that train_model drops by name;
    # confirm the column order of the input dataframes.
    X_train = train.iloc[:, 4:]
    y_train = train.Label.values
    X_dev = dev.iloc[:, 4:]
    y_dev = dev.Label.values

    # NOTE(review): np.log yields -inf/nan for non-positive values; this
    # assumes all feature values are strictly positive -- confirm.
    X_train = np.log(X_train)
    X_dev = np.log(X_dev)

    # Fit the scaler on train only so no dev statistics leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_dev = scaler.transform(X_dev)

    clf = LogisticRegressionCV(Cs=5, cv=3, max_iter=1000)
    clf.fit(X_train, y_train)

    # Only probabilities feed the metrics, so hard class predictions are
    # not computed. Column 1 is presumably the positive class (Label == 1).
    pos_train = clf.predict_proba(X_train)[:, 1]
    pos_dev = clf.predict_proba(X_dev)[:, 1]

    train_avgpr = avgPR(y_train, pos_train)
    dev_avgpr = avgPR(y_dev, pos_dev)
    train_auroc = auroc(y_train, pos_train)
    dev_auroc = auroc(y_dev, pos_dev)

    results = {
        'AUPR_train': train_avgpr,
        'AUROC_train': train_auroc,
        'AUPR_dev': dev_avgpr,
        'AUROC_dev': dev_auroc,
        'AUPR_bench': 0,
        'AUROC_bench': 0,
    }
    return results, clf
fit_reg=False, palette='Set2') plt.title('t-SNE components by predicted label, threshold=0.03') fig = tsne2.fig fig.savefig(join(cfg.OUTPUT_DIR, 'f2.png'), dpi=150, bbox_inches='tight') # error analysis res = pd.read_csv(join(cfg.OUTPUT_DIR, 'scores.csv')) train = load_train_set('E116') train_counts = train.groupby('chr').sum().Label errs = np.zeros(22) for c in range(1, 23): tmp = res[res['chr'] == c] met = avgPR(tmp['Label'], tmp['Score']) errs[c - 1] = met chrs = np.arange(1, 23) col1 = '#66c2a5' col2 = '#fc8d62' fig, axes = plt.subplots(nrows=2, sharex=True) axes[0].bar(chrs, errs, color=col1) axes[1].bar(chrs, train_counts, color=col2) axes[1].invert_yaxis() plt.rcParams['xtick.bottom'] = plt.rcParams['xtick.labelbottom'] = False plt.rcParams['xtick.top'] = plt.rcParams['xtick.labeltop'] = True axes[0].spines['top'].set_visible(False)