Exemplo n.º 1
0
def eval_profile(yt, yp,
                 pos_min_threshold=0.05,
                 neg_max_threshold=0.01,
                 required_min_pos_counts=2.5,
                 binsizes=(1, 2, 4, 10)):
    """
    Evaluate the predicted profile in terms of auPR at several bin sizes.

    Each position is labelled from the observed fraction of counts
    (`yt` normalized along axis 1): 1 (positive), 0 (negative) or
    -1 (ambiguous). The labels and the predictions are then binned with
    `bin_counts_amb` / `bin_counts_max` and scored with `auprc`. A randomly
    permuted version of the predictions is scored as a baseline.

    Args:
      yt: true profile (counts). Must have at least three dimensions;
         axis 1 is the one the counts are summed over.
         NOTE(review): presumably (regions, positions, channels) - confirm.
      yp: predicted profile (fractions). Re-normalized to sum to one
         along axis 1 before scoring.
      pos_min_threshold: fraction threshold at or above which the position
         is considered to be a positive
      neg_max_threshold: fraction threshold below which the position is
         considered to be a negative. Positions with a fraction in
         [neg_max_threshold, pos_min_threshold) are labelled ambiguous (-1)
      required_min_pos_counts: smallest number of reads the peak should be
         supported by. All regions where pos_min_threshold of the total
         reads would be less than required_min_pos_counts are excluded
      binsizes: iterable of bin sizes to evaluate

    Returns:
      pd.DataFrame with one row per bin size and the columns `binsize`,
      `auprc`, `random_auprc`, `n_positives`, `frac_ambigous` and
      `imbalance`. `auprc` and `random_auprc` are NaN when `auprc` raises
      a ValueError.
    """
    # The filtering
    # criterion assures that each position in the positive class is
    # supported by at least required_min_pos_counts of reads
    do_eval = yt.sum(axis=1).mean(axis=1) > required_min_pos_counts / pos_min_threshold

    # make sure everything sums to one
    yp = yp / yp.sum(axis=1, keepdims=True)
    fracs = yt / yt.sum(axis=1, keepdims=True)

    # Baseline: predictions of the evaluated regions, shuffled along both axes
    yp_random = permute_array(permute_array(yp[do_eval], axis=1), axis=0)

    # The per-position labels do not depend on the bin size, so compute them
    # once instead of on every loop iteration.
    is_peak = (fracs >= pos_min_threshold).astype(float)
    ambigous = (fracs < pos_min_threshold) & (fracs >= neg_max_threshold)
    is_peak[ambigous] = -1

    out = []
    for binsize in binsizes:
        # Boolean-mask indexing yields a fresh copy for every bin size
        y_true = np.ravel(bin_counts_amb(is_peak[do_eval], binsize))

        # Fraction of positives among the non-ambiguous bins
        imbalance = np.sum(y_true == 1) / np.sum(y_true >= 0)
        n_positives = np.sum(y_true == 1)
        n_ambigous = np.sum(y_true == -1)
        frac_ambigous = n_ambigous / y_true.size

        # NOTE(review): an earlier TODO here mentioned switching between
        # bin_counts_max and bin_counts_sum; the code currently uses
        # bin_counts_max - confirm this is the intended aggregation.
        try:
            res = auprc(y_true,
                        np.ravel(bin_counts_max(yp[do_eval], binsize)))
            res_random = auprc(y_true,
                               np.ravel(bin_counts_max(yp_random, binsize)))
        except ValueError:
            # Report NaN for this bin size rather than aborting the
            # whole evaluation
            res = np.nan
            res_random = np.nan

        out.append({"binsize": binsize,
                    "auprc": res,
                    "random_auprc": res_random,
                    "n_positives": n_positives,
                    "frac_ambigous": frac_ambigous,
                    "imbalance": imbalance
                    })

    return pd.DataFrame.from_dict(out)
Exemplo n.º 2
0
def train_glmnet(train,
                 test,
                 save_path_pred,
                 save_path_model,
                 save_path_json,
                 n_cores=5):
    """Fit a LogitNet model, evaluate it on the test set and save the results.

    Args:
      train: pair `(x, y)` used for fitting; `x` is converted to a sparse
         CSC matrix, `y` is passed to `fit` unchanged
      test: pair `(x, y)` used for evaluation; `x` is converted to a sparse
         CSC matrix
      save_path_pred: path of the CSV file receiving the columns `y_true`
         and `y_pred`
      save_path_model: path the fitted model is pickled to
      save_path_json: path of the JSON file receiving the `auprc` and
         `auc` metrics
      n_cores: value passed as `n_jobs` to LogitNet
    """
    ln = LogitNet(alpha=0.5, n_splits=10, n_jobs=n_cores)
    # to sparse (only the features; the labels are used as given)
    x_train = csc_matrix(train[0])
    x_test = csc_matrix(test[0])

    print("train the model")
    ln.fit(x_train, train[1])

    print("get predictions")
    # second column of predict_proba is taken as the predicted score
    y_pred = ln.predict_proba(x_test)[:, 1]
    auprc_score = cem.auprc(test[1], y_pred)
    auc_score = cem.auc(test[1], y_pred)

    # csv
    print("save csv")
    dt = pd.DataFrame({"y_true": test[1], "y_pred": y_pred})
    dt.to_csv(save_path_pred)

    # json
    print("save json")
    write_json({"auprc": auprc_score, "auc": auc_score}, save_path_json)
    # model
    print("save model")
    # Context manager guarantees the file is flushed and closed even if
    # pickling fails
    with open(save_path_model, "wb") as model_file:
        pickle.dump(ln, model_file)
Exemplo n.º 3
0
def test_metrics():
    """Check the Keras metrics in `cm` against fixed values and against `cem`.

    The contingency table is computed for four variants of the same data:
    raw 1-D arrays, 1-D arrays with the MASK_VALUE entries removed, and
    2-D reshaped versions of both. All four are expected to give the same
    table.
    """
    expected_table = [0.0, 2.0, 1.0, 2.0]
    predictions = np.array([0, 0.2, 0.6, 0.4, 1, 0])
    labels = np.array([1, 0, -1, 1, 0, 0])

    # Variants with the masked entries dropped up front
    labels_kept = labels[labels != MASK_VALUE]
    predictions_kept = predictions[labels != MASK_VALUE]

    cases = [
        (labels, predictions),
        (labels_kept, predictions_kept),
        (labels.reshape((-1, 2)), predictions.reshape((-1, 2))),
        (labels_kept.reshape((-1, 1)), predictions_kept.reshape((-1, 1))),
    ]
    tables = [
        [K.eval(cell) for cell in cm.contingency_table(truth, pred)]
        for truth, pred in cases
    ]

    # Every table accounts for exactly the five unmasked entries ...
    for table in tables:
        assert sum(table) == 5
    # ... and matches the expected counts
    for table in tables:
        assert table == expected_table

    # Keras implementations agree with the reference implementations
    keras_tpr = K.eval(cm.tpr(labels, predictions))
    assert np.allclose(keras_tpr, cem.tpr(labels, predictions))
    keras_accuracy = K.eval(cm.accuracy(labels, predictions))
    assert np.allclose(keras_accuracy, cem.accuracy(labels, predictions))

    # other metrics
    assert cem.auprc(labels, predictions) > 0
    assert cem.recall_at_precision(labels, predictions, .5) > 0

    # test serialization: a round trip returns the same function object
    serialized = serialize_keras_object(cm.accuracy)
    restored = deserialize_keras_object(serialized)
    assert restored == cm.accuracy