def train_seq_model(train, dev, params):
    ''' Trains a sequence model on the input train and dev datasets, with parameter dictionary params.
    '''
    # downsample negative examples to reduce class imbalance before training
    train = downsample_negatives(train, p=0.25)
    print(train.shape)
    X_train = train['seq']
    y_train = train.Label.values
    X_dev = dev['seq']
    y_dev = dev.Label.values

    # encode raw sequence strings into numeric arrays
    X_train = encode_seq(X_train)
    X_dev = encode_seq(X_dev)

    # input shape: 19-bp sequence window with 4 channels (presumably one-hot nucleotides)
    X_shape = (19, 4)
    mod = SeqModel(input_shape=X_shape, params=params).build()

    mod.train(X_train, y_train, X_dev, y_dev)

    probs_train = mod.predict(X_train)
    probs_dev = mod.predict(X_dev)

    results = {}
    train_avgpr = avgPR(y_train, probs_train)
    dev_avgpr = avgPR(y_dev, probs_dev)
    train_auroc = auroc(y_train, probs_train)
    dev_auroc = auroc(y_dev, probs_dev)
    bench_avgpr, bench_auroc = get_benchmark_score(dev)

    results['AUPR_train'] = train_avgpr
    results['AUROC_train'] = train_auroc
    results['AUPR_dev'] = dev_avgpr
    results['AUROC_dev'] = dev_auroc
    results['AUPR_bench'] = bench_avgpr
    results['AUROC_bench'] = bench_auroc

    return results, mod
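# `downsample_negatives` is used above but not defined in this snippet. A minimal
# sketch of what it presumably does, assuming `p` is the fraction of negative
# (Label == 0) rows kept and that pandas is imported as `pd`; the exact sampling
# scheme is an assumption, not the author's implementation.
def downsample_negatives(df, p=0.25, seed=None):
    ''' Keeps all positive rows and a random fraction p of the negative rows. '''
    pos = df[df['Label'] == 1]
    neg = df[df['Label'] == 0].sample(frac=p, random_state=seed)
    return pd.concat([pos, neg]).sample(frac=1, random_state=seed)  # shuffle rows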
def get_benchmark_score(data, benchmark='GNET'):
    ''' Computes average precision and AUROC for the benchmark predictions
        matching the rows of the input dataframe.
    '''
    bdf = load_benchmark(dataset='E116')
    bench = get_benchmarks_for_slice(data, bdf)

    bench_avgpr = avgPR(bench['Label'], bench[benchmark])
    bench_auroc = auroc(bench['Label'], bench[benchmark])

    return bench_avgpr, bench_auroc
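# `avgPR` and `auroc` are used throughout but not shown in this snippet. They are
# presumably thin wrappers around scikit-learn's metrics; a minimal sketch under
# that assumption:
from sklearn.metrics import average_precision_score, roc_auc_score

def avgPR(y_true, y_score):
    ''' Area under the precision-recall curve (average precision). '''
    return average_precision_score(y_true, y_score)

def auroc(y_true, y_score):
    ''' Area under the ROC curve. '''
    return roc_auc_score(y_true, y_score)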
def train_model(train, dev, params, return_probs=False):
    ''' Trains a model on the input train and dev datasets, with parameter dictionary params.
    '''
    X_train = train.drop(['chr', 'pos', 'rs', 'Label'], axis=1)
    y_train = train.Label.values
    X_dev = dev.drop(['chr', 'pos', 'rs', 'Label'], axis=1)
    y_dev = dev.Label.values

    # log-transform the features, then standardize using train-set statistics only
    X_train = np.log(X_train)
    X_dev = np.log(X_dev)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_dev = scaler.transform(X_dev)

    X_shape = (X_train.shape[1], )
    if params['use_pc'] == 1:
        mod = PartialModel(input_shape=X_shape, params=params).build()
    elif params['use_clust'] == 1:
        mod = ClusterModel(input_shape=X_shape, params=params).build(X_train)
    else:
        mod = BaseModel(input_shape=X_shape, params=params).build()

    mod.train(X_train, y_train, X_dev, y_dev)

    probs_train = mod.predict(X_train)
    probs_dev = mod.predict(X_dev)

    results = {}
    train_avgpr = avgPR(y_train, probs_train)
    dev_avgpr = avgPR(y_dev, probs_dev)
    train_auroc = auroc(y_train, probs_train)
    dev_auroc = auroc(y_dev, probs_dev)
    bench_avgpr, bench_auroc = get_benchmark_score(dev)

    results['AUPR_train'] = train_avgpr
    results['AUROC_train'] = train_auroc
    results['AUPR_dev'] = dev_avgpr
    results['AUROC_dev'] = dev_auroc
    results['AUPR_bench'] = bench_avgpr
    results['AUROC_bench'] = bench_auroc

    if return_probs:
        return probs_dev

    return results, mod
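# A minimal usage sketch for `train_model`. `load_train_set` appears later in this
# file; `load_dev_set` is a hypothetical counterpart, and the model builders likely
# expect additional `params` keys beyond the `use_pc`/`use_clust` flags shown above.
train_df = load_train_set('E116')
dev_df = load_dev_set('E116')                     # hypothetical loader
params = {'use_pc': 0, 'use_clust': 0}            # falls through to BaseModel
results, mod = train_model(train_df, dev_df, params)
print(results)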
def official_benchmark_splits(train, dev, test, benchmark='GNET'):
    ''' Loads benchmark predictions corresponding to the given train, dev, and test dataframes
        and reports metrics on the benchmark model.
    '''
    bench = load_benchmark(dataset='E116')
    train_bench = get_benchmarks_for_slice(train, bench)
    dev_bench = get_benchmarks_for_slice(dev, bench)
    test_bench = get_benchmarks_for_slice(test, bench)

    metric_report(train_bench['Label'], bin_probs(train_bench[benchmark]))
    metric_report(dev_bench['Label'], bin_probs(dev_bench[benchmark]))
    metric_report(test_bench['Label'], bin_probs(test_bench[benchmark]))

    # AUROC on the train / dev / test benchmark scores
    print(auroc(train_bench['Label'], train_bench[benchmark]))
    print(auroc(dev_bench['Label'], dev_bench[benchmark]))
    print(auroc(test_bench['Label'], test_bench[benchmark]))

    # average precision on the train / dev / test benchmark scores
    print(avgPR(train_bench['Label'], train_bench[benchmark]))
    print(avgPR(dev_bench['Label'], dev_bench[benchmark]))
    print(avgPR(test_bench['Label'], test_bench[benchmark]))
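# `bin_probs` is used above to turn benchmark scores into hard labels before
# `metric_report`. A minimal sketch, assuming it simply thresholds the scores;
# the default threshold here is an assumption (the plotting code further down
# mentions a threshold of 0.03).
def bin_probs(probs, threshold=0.5):
    ''' Binarizes probability scores at the given threshold. '''
    return (np.asarray(probs) >= threshold).astype(int)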
def logistic_benchmark(train, dev):
    ''' Fits a regularized logistic regression benchmark on the input train and dev dataframes.
    '''
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler

    # skip the first four metadata columns (chr, pos, rs, Label) and keep the features
    X_train = train.iloc[:, 4:]
    y_train = train.Label.values
    X_dev = dev.iloc[:, 4:]
    y_dev = dev.Label.values

    X_train = np.log(X_train)
    X_dev = np.log(X_dev)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_dev = scaler.transform(X_dev)

    clf = LogisticRegressionCV(Cs=5, cv=3, max_iter=1000)
    clf.fit(X_train, y_train)

    # hard-label predictions are computed here, but only the probabilities are scored below
    train_pred = clf.predict(X_train)
    probs_train = clf.predict_proba(X_train)

    dev_pred = clf.predict(X_dev)
    probs_dev = clf.predict_proba(X_dev)

    results = {}
    train_avgpr = avgPR(y_train, probs_train[:, 1])
    dev_avgpr = avgPR(y_dev, probs_dev[:, 1])
    train_auroc = auroc(y_train, probs_train[:, 1])
    dev_auroc = auroc(y_dev, probs_dev[:, 1])

    results['AUPR_train'] = train_avgpr
    results['AUROC_train'] = train_auroc
    results['AUPR_dev'] = dev_avgpr
    results['AUROC_dev'] = dev_auroc
    # no benchmark model comparison for this baseline
    results['AUPR_bench'] = 0
    results['AUROC_bench'] = 0

    return results, clf
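# The logistic baseline can be run on the same splits as `train_model`; a sketch
# reusing the `train_df`/`dev_df` frames from the usage example above.
lr_results, lr_clf = logistic_benchmark(train_df, dev_df)
print('logistic baseline dev AUPR:', lr_results['AUPR_dev'])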
# (continuation of a truncated seaborn plotting call that produced the `tsne2` grid)
                   fit_reg=False,
                   palette='Set2')
plt.title('t-SNE components by predicted label, threshold=0.03')
fig = tsne2.fig
fig.savefig(join(cfg.OUTPUT_DIR, 'f2.png'), dpi=150, bbox_inches='tight')

# error analysis
res = pd.read_csv(join(cfg.OUTPUT_DIR, 'scores.csv'))

train = load_train_set('E116')
# positives per chromosome (Label is binary, so the sum counts positive examples)
train_counts = train.groupby('chr')['Label'].sum()

# per-chromosome average precision of the saved scores
errs = np.zeros(22)
for c in range(1, 23):
    tmp = res[res['chr'] == c]
    errs[c - 1] = avgPR(tmp['Label'], tmp['Score'])

chrs = np.arange(1, 23)
col1 = '#66c2a5'
col2 = '#fc8d62'

# rcParams must be set before the figure is created for the tick placement to apply
plt.rcParams['xtick.bottom'] = plt.rcParams['xtick.labelbottom'] = False
plt.rcParams['xtick.top'] = plt.rcParams['xtick.labeltop'] = True

# per-chromosome AUPR on top, with the count of training positives mirrored below
fig, axes = plt.subplots(nrows=2, sharex=True)
axes[0].bar(chrs, errs, color=col1)
axes[1].bar(chrs, train_counts, color=col2)
axes[1].invert_yaxis()

axes[0].spines['top'].set_visible(False)