예제 #1
0
 def test_one_row_predict_proba(self):
     """A single-row input to predict_proba must still yield a 2D array
     of shape (1, n_classes)."""
     model = LogitNet(random_state=42)
     all_problems = itertools.chain(self.binomial, self.multinomial)
     for features, labels in all_problems:
         model.fit(features, labels)
         single_row = features[0].reshape((1, -1))
         probabilities = model.predict_proba(single_row)
         n_classes = len(np.unique(labels))
         assert probabilities.shape == (1, n_classes)
예제 #2
0
 def test_one_row_predict_proba(self):
     """Predicting probabilities for exactly one row should keep the
     output two-dimensional: (1, number of classes)."""
     net = LogitNet(random_state=42)
     for data, target in itertools.chain(self.binomial, self.multinomial):
         net.fit(data, target)
         row = data[0].reshape((1, -1))
         proba = net.predict_proba(row)
         assert proba.shape == (1, len(np.unique(target)))
예제 #3
0
def train_glmnet(train,
                 test,
                 save_path_pred,
                 save_path_model,
                 save_path_json,
                 n_cores=5):
    """Fit an elastic-net logistic regression on *train*, evaluate on *test*,
    and persist predictions (csv), metrics (json) and the model (pickle).

    Parameters
    ----------
    train, test : tuple
        ``(features, labels)`` pairs; features are converted to CSC sparse.
    save_path_pred : path for the per-sample prediction csv.
    save_path_model : path for the pickled fitted model.
    save_path_json : path for the auprc/auc metrics json.
    n_cores : number of parallel jobs for cross-validation.
    """
    ln = LogitNet(alpha=0.5, n_splits=10, n_jobs=n_cores)

    # Only the feature matrices need to be sparse; labels are passed dense.
    # (The original also built sparse label matrices that were never used.)
    X_train = csc_matrix(train[0])
    X_test = csc_matrix(test[0])

    print("train the model")
    ln.fit(X_train, train[1])

    print("get predictions")
    # probability of the positive class
    y_pred = ln.predict_proba(X_test)[:, 1]
    auprc = cem.auprc(test[1], y_pred)
    auc = cem.auc(test[1], y_pred)

    # csv
    print("save csv")
    dt = pd.DataFrame({"y_true": test[1], "y_pred": y_pred})
    dt.to_csv(save_path_pred)

    # json
    print("save json")
    write_json({"auprc": auprc, "auc": auc}, save_path_json)

    # model — use a context manager so the file handle is closed
    # (the original passed an open() result directly to pickle.dump
    # and never closed it)
    print("save model")
    with open(save_path_model, "wb") as f:
        pickle.dump(ln, f)
예제 #4
0
 def test_one_row_predict_proba_with_lambda(self):
     """With an explicit lambda list, a one-row predict_proba should
     return a 3D array: (1, n_classes, n_lambdas)."""
     model = LogitNet(random_state=42)
     lambda_values = [0.01, 0.02, 0.04, 0.1]
     for features, labels in itertools.chain(self.binomial, self.multinomial):
         model.fit(features, labels)
         row = features[0].reshape((1, -1))
         proba = model.predict_proba(row, lamb=lambda_values)
         expected_shape = (1, len(np.unique(labels)), len(lambda_values))
         assert proba.shape == expected_shape
예제 #5
0
 def test_one_row_predict_proba_with_lambda(self):
     """Passing several lambdas along with a single row must produce a
     three-dimensional probability array."""
     net = LogitNet(random_state=42)
     lamb = [0.01, 0.02, 0.04, 0.1]
     for data, target in itertools.chain(self.binomial, self.multinomial):
         net.fit(data, target)
         out = net.predict_proba(data[0].reshape((1, -1)), lamb=lamb)
         n_classes = len(np.unique(target))
         assert out.shape == (1, n_classes, len(lamb))
예제 #6
0
    def test_coef_interpolation(self):
        """Predictions at a lambda strictly between two path knots must be
        interpolated, i.e. differ from the predictions at either knot."""
        X, y = self.binomial[0]
        model = LogitNet(n_splits=0, random_state=561).fit(X, y)

        # Two adjacent lambdas from the fitted path ...
        low = model.lambda_path_[1]
        high = model.lambda_path_[2]

        # ... and their midpoint, which does not lie on the path itself.
        mid = (low + high) / 2.0

        pred_low = model.predict_proba(X, lamb=low)
        pred_high = model.predict_proba(X, lamb=high)
        pred_mid = model.predict_proba(X, lamb=mid)

        self.assertFalse(np.allclose(pred_low, pred_mid))
        self.assertFalse(np.allclose(pred_high, pred_mid))
예제 #7
0
    def test_coef_interpolation(self):
        """Predicting at a lambda between two computed path values should
        interpolate rather than reuse a neighboring path prediction."""
        X, y = self.binomial[0]
        # NOTE(review): this example uses n_folds=0 while a sibling example
        # uses n_splits=0 — presumably different glmnet versions; confirm
        # against the installed library's LogitNet signature.
        model = LogitNet(n_folds=0, random_state=561)
        model = model.fit(X, y)

        # Adjacent lambdas on the computed path and their midpoint,
        # which is guaranteed not to equal either path value.
        low, high = model.lambda_path_[1], model.lambda_path_[2]
        mid = (low + high) / 2.0

        at_low = model.predict_proba(X, lamb=low)
        at_high = model.predict_proba(X, lamb=high)
        at_mid = model.predict_proba(X, lamb=mid)

        self.assertFalse(np.allclose(at_low, at_mid))
        self.assertFalse(np.allclose(at_high, at_mid))
예제 #8
0
def fit_glm(args, X, y):
    """Fit an elastic-net GLM on the full data, report in-sample AUC/APR,
    then produce out-of-sample CV predictions at the selected lambda and
    save both the model (pickle) and the CV scores (npy)."""
    print('GLM')

    # Fit on the full dataset with a fixed seed, then persist the model.
    np.random.seed(1000)
    full_model = LogitNet(alpha=0.5, n_lambda=20, n_jobs=5)
    full_model.fit(X, y)

    with open(MODEL_DIR / f'glm_{args.dataset}.pkl', 'wb') as f:
        pickle.dump(full_model, f)

    print('In-sample: ')
    # positive-class probabilities on the training data
    in_sample = full_model.predict_proba(X)[:, 1]
    AUC = roc_auc_score(y, in_sample)
    APR = average_precision_score(y, in_sample)
    print('\tAUC ', np.round(AUC, 4))
    print('\tAPR ', np.round(APR, 4))

    print('Out-of-sample: ')
    print(full_model.lambda_best_)

    # Cross-validated predictions, refitting at the previously chosen lambda.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1111)
    np.random.seed(1000)
    cv_model = LogitNet(alpha=0.5, n_lambda=1,
                        lambda_path=[full_model.lambda_best_])
    cv_scores = cross_val_predict(cv_model, X, y,
                                  cv=kf,
                                  method='predict_proba',
                                  n_jobs=-1)
    AUC = roc_auc_score(y, cv_scores[:, 1])
    APR = average_precision_score(y, cv_scores[:, 1])
    print('\tAUC ', np.round(AUC, 4))
    print('\tAPR ', np.round(APR, 4))

    np.save(MODEL_DIR / f'glm_{args.dataset}.npy', cv_scores[:, 1])
예제 #9
0
# Loop over number of iterations
# Refit the model N_ITERATIONS times with varying random seeds and collect
# positive-class scores for the train and test sets on each pass.
for i in tqdm(range(N_ITERATIONS)):
    # Elastic-net logistic regression seeded by the iteration index
    model = LogitNet(alpha=ALPHA,
                     n_lambda=N_LAMBDA,
                     standardize=False,
                     n_folds=N_FOLDS,
                     max_iter=MAX_ITER,
                     random_state=i).fit(X_train, y_train)

    # Score both splits at the lambda that maximized CV performance
    best_lambda = model.lambda_max_
    all_y_train_scores.append(model.predict_proba(X_train, lamb=best_lambda)[:, 1])
    all_y_test_scores.append(model.predict_proba(X_test, lamb=best_lambda)[:, 1])

# Average the per-iteration scores across runs
all_y_train_scores = np.array(all_y_train_scores)
y_train_scores_mean = all_y_train_scores.mean(axis=0)
all_y_test_scores = np.array(all_y_test_scores)
y_test_scores_mean = all_y_test_scores.mean(axis=0)

# Compute ROC curve and ROC area for each class
n_classes = 2
fpr = dict()
예제 #10
0
y_preds = []
y_tests = []
# Common FPR grid onto which every split's TPR curve is interpolated
fpr_interp = np.linspace(0, 1, 100)
tpr_interps = []
aocs = []
for i in range(n_splits):
    print(i)
    #(cv=n_cv, n_jobs=min(cpus,n_cv), selection='random')
    model = LogitNet(fit_intercept=False, n_jobs=cpus)
    X_train = person_biomarker[train_idxs[i], :]
    y_train = sample_data.phenotype[train_idxs[i]]
    X_test = person_biomarker[test_idxs[i], :]
    y_test = sample_data.phenotype[test_idxs[i]]
    # densify features; 1 * y coerces booleans to ints
    model.fit(X_train.todense(), 1 * y_train)
    y_pred = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = sklearn.metrics.roc_curve(y_test, y_pred)
    # resample this split's ROC curve onto the shared FPR grid
    tpr_interp = scipy.interpolate.interp1d(fpr, tpr)(fpr_interp)
    tpr_interps.append(list(tpr_interp))
    aocs.append(sklearn.metrics.roc_auc_score(y_test, y_pred))

# Long-format fpr/tpr table across splits, written as tsv
tpr_df = pd.DataFrame(tpr_interps)
tpr_df.columns = fpr_interp
tpr_df = tpr_df.melt()
tpr_df.columns = ['fpr', 'tpr']
out_path = (BIOMARKER_DIR +
            'results/ML_phenotype_prediction/fpr_vs_tpr_%s_%s.tsv' %
            (biomarker, dataset))
tpr_df.to_csv(out_path, sep='\t')
print(np.mean(aocs))
# Elastic-net logistic regression over a dense lambda path; the seed mixes
# the network year with the run seed so each (year, seed) pair is distinct.
run_seed = (int(network_year) + 1) * (seed + 1)
classifier = LogitNet(alpha=alpha, n_jobs=n_jobs, min_lambda_ratio=1e-8,
                      n_lambda=150, standardize=True,
                      random_state=run_seed, scoring=scoring)
classifier.fit(X_train, y_train)

# Just get the results on the test set
print('Obtainig Test Set results')
X_test = feature_df.loc[test, features].copy()
# apply the transformers fitted on the training data to the test features
X_test.loc[:, degree_features] = dt.transform(X_test.loc[:, degree_features])
X_test.loc[:, dwpc_features] = dwpct.transform(X_test.loc[:, dwpc_features])
# positive-class probability for each test row
test_probas = classifier.predict_proba(X_test)[:, 1]
cd_df.loc[test, 'test_probas'] = test_probas


def glmnet_coefs(glmnet_obj, X, f_names):
    """Helper Function to quickly return the model coefs and correspoding fetaure names"""
    l = glmnet_obj.lambda_best_[0]

    coef = glmnet_obj.coef_[0]
    coef = np.insert(coef, 0, glmnet_obj.intercept_)

    names = np.insert(f_names, 0, 'intercept')

    z_intercept = coef[0] + sum(coef[1:] * X.mean(axis=0))
    z_coef = coef[1:] * X.values.std(axis=0)
    z_coef = np.insert(z_coef, 0, z_intercept)