def test_one_row_predict_proba(self):
    """A single-row input to predict_proba must yield a 2D (1, n_classes) array."""
    model = LogitNet(random_state=42)
    for features, labels in itertools.chain(self.binomial, self.multinomial):
        model.fit(features, labels)
        single_row = features[0].reshape((1, -1))
        probabilities = model.predict_proba(single_row)
        n_classes = len(np.unique(labels))
        assert probabilities.shape == (1, n_classes)
def train_glmnet(train, test, save_path_pred, save_path_model, save_path_json, n_cores=5):
    """Fit an elastic-net logistic regression (glmnet) and persist predictions,
    metrics, and the fitted model.

    Parameters
    ----------
    train, test : tuple
        ``(X, y)`` pairs; ``X`` is converted to CSC sparse before use.
    save_path_pred : path-like
        Destination CSV for per-sample ``y_true`` / ``y_pred``.
    save_path_model : path-like
        Destination for the pickled fitted model.
    save_path_json : path-like
        Destination JSON for the AUPRC/AUC summary.
    n_cores : int, default 5
        Parallel jobs passed to LogitNet.
    """
    ln = LogitNet(alpha=0.5, n_splits=10, n_jobs=n_cores)

    # Only the feature matrices need to be sparse; the original also built
    # sparse copies of the label vectors, but they were never used (fit and
    # the metrics below consume the dense labels directly).
    X_train = csc_matrix(train[0])
    X_test = csc_matrix(test[0])

    print("train the model")
    ln.fit(X_train, train[1])

    print("get predictions")
    y_pred = ln.predict_proba(X_test)[:, 1]
    auprc = cem.auprc(test[1], y_pred)
    auc = cem.auc(test[1], y_pred)

    # csv
    print("save csv")
    dt = pd.DataFrame({"y_true": test[1], "y_pred": y_pred})
    dt.to_csv(save_path_pred)

    # json
    print("save json")
    write_json({"auprc": auprc, "auc": auc}, save_path_json)

    # model — use a context manager so the handle is closed even if pickling
    # raises (the original `pickle.dump(ln, open(path, "wb"))` leaked it).
    print("save model")
    with open(save_path_model, "wb") as f:
        pickle.dump(ln, f)
def test_one_row_predict_proba_with_lambda(self):
    """One row combined with an explicit lambda path must give 3D output."""
    model = LogitNet(random_state=42)
    lambda_path = [0.01, 0.02, 0.04, 0.1]
    for features, labels in itertools.chain(self.binomial, self.multinomial):
        model.fit(features, labels)
        probabilities = model.predict_proba(features[0].reshape((1, -1)), lamb=lambda_path)
        expected_shape = (1, len(np.unique(labels)), len(lambda_path))
        assert probabilities.shape == expected_shape
def test_coef_interpolation(self):
    """Predictions at a lambda strictly between two path values should be
    interpolated, hence differ from predictions at either endpoint."""
    features, labels = self.binomial[0]
    model = LogitNet(n_splits=0, random_state=561).fit(features, labels)

    # Two adjacent lambdas on the computed path, and their midpoint — a
    # value that is not itself on the path.
    lam_low = model.lambda_path_[1]
    lam_high = model.lambda_path_[2]
    lam_mid = (lam_low + lam_high) / 2.0

    proba_low = model.predict_proba(features, lamb=lam_low)
    proba_high = model.predict_proba(features, lamb=lam_high)
    proba_mid = model.predict_proba(features, lamb=lam_mid)

    self.assertFalse(np.allclose(proba_low, proba_mid))
    self.assertFalse(np.allclose(proba_high, proba_mid))
def test_coef_interpolation(self):
    """Mid-path lambdas must produce interpolated (distinct) predictions.

    NOTE(review): this variant uses ``n_folds=0`` where a sibling test uses
    ``n_splits=0`` — presumably an older LogitNet API; kept as-is.
    """
    features, labels = self.binomial[0]
    model = LogitNet(n_folds=0, random_state=561)
    model = model.fit(features, labels)

    # Pick two consecutive path lambdas and a midpoint off the path.
    lam_low, lam_high = model.lambda_path_[1], model.lambda_path_[2]
    lam_mid = 0.5 * (lam_low + lam_high)

    proba_at = lambda lam: model.predict_proba(features, lamb=lam)
    proba_low, proba_high, proba_mid = proba_at(lam_low), proba_at(lam_high), proba_at(lam_mid)

    self.assertFalse(np.allclose(proba_low, proba_mid))
    self.assertFalse(np.allclose(proba_high, proba_mid))
def fit_glm(args, X, y):
    """Fit an elastic-net GLM on the full data, report in-sample metrics,
    then generate cross-validated out-of-sample predictions at the selected
    lambda and save both the model and the CV scores."""
    print('GLM')

    # --- full-data fit, persisted to disk ---
    np.random.seed(1000)
    glm = LogitNet(alpha=0.5, n_lambda=20, n_jobs=5)
    glm.fit(X, y)
    with open(MODEL_DIR / f'glm_{args.dataset}.pkl', 'wb') as f:
        pickle.dump(glm, f)

    # --- in-sample diagnostics ---
    print('In-sample: ')
    scores = glm.predict_proba(X)
    auc_score = roc_auc_score(y, scores[:, 1])
    apr_score = average_precision_score(y, scores[:, 1])
    print('\tAUC ', np.round(auc_score, 4))
    print('\tAPR ', np.round(apr_score, 4))

    print('Out-of-sample: ')
    print(glm.lambda_best_)

    # --- cross-validated predictions, refitting at the chosen lambda only ---
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1111)
    np.random.seed(1000)
    glm = LogitNet(alpha=0.5, n_lambda=1, lambda_path=[glm.lambda_best_])
    cv_scores = cross_val_predict(glm, X, y, cv=kf, method='predict_proba', n_jobs=-1)
    auc_score = roc_auc_score(y, cv_scores[:, 1])
    apr_score = average_precision_score(y, cv_scores[:, 1])
    print('\tAUC ', np.round(auc_score, 4))
    print('\tAPR ', np.round(apr_score, 4))

    np.save(MODEL_DIR / f'glm_{args.dataset}.npy', cv_scores[:, 1])
# Repeat the fit once per random seed and collect per-iteration scores.
# (Binding names are kept unchanged — this is a flat script and later code
# may reference them.)
for i in tqdm(range(N_ITERATIONS)):
    # Elastic-net logistic regression fitted on the training split; fit()
    # returns the fitted estimator, so the construction can be chained.
    lr = LogitNet(
        alpha=ALPHA,
        n_lambda=N_LAMBDA,
        standardize=False,
        n_folds=N_FOLDS,
        max_iter=MAX_ITER,
        random_state=i,
    ).fit(X_train, y_train)

    # Lambda chosen by the model's internal cross-validation.
    lamb = lr.lambda_max_

    # Positive-class probabilities for each split at that lambda.
    y_train_scores = lr.predict_proba(X_train, lamb=lamb)[:, 1]
    y_test_scores = lr.predict_proba(X_test, lamb=lamb)[:, 1]

    all_y_train_scores.append(y_train_scores)
    all_y_test_scores.append(y_test_scores)

# Stack the per-iteration scores and average across iterations.
all_y_train_scores = np.array(all_y_train_scores)
y_train_scores_mean = all_y_train_scores.mean(axis=0)
all_y_test_scores = np.array(all_y_test_scores)
y_test_scores_mean = all_y_test_scores.mean(axis=0)

# Compute ROC curve and ROC area for each class
n_classes = 2
fpr = dict()
# Per-split ROC evaluation: fit on each train split, interpolate every ROC
# curve onto a shared FPR grid so the curves can be averaged, and record AUCs.
y_preds = []
y_tests = []
fpr_interp = np.linspace(0, 1, 100)
tpr_interps = []
aocs = []

for i in range(n_splits):
    print(i)
    model = LogitNet(fit_intercept=False, n_jobs=cpus)  # (cv=n_cv, n_jobs=min(cpus,n_cv), selection='random')

    X_train = person_biomarker[train_idxs[i], :]
    y_train = sample_data.phenotype[train_idxs[i]]
    X_test = person_biomarker[test_idxs[i], :]
    y_test = sample_data.phenotype[test_idxs[i]]

    # `1 * y_train` coerces the phenotype values to integers before fitting.
    model.fit(X_train.todense(), 1 * y_train)
    y_pred = model.predict_proba(X_test)[:, 1]

    # Map this split's ROC curve onto the common FPR grid.
    fpr, tpr, _ = sklearn.metrics.roc_curve(y_test, y_pred)
    interp_func = scipy.interpolate.interp1d(fpr, tpr)
    tpr_interp = interp_func(fpr_interp)
    tpr_interps.append(list(tpr_interp))

    aoc = sklearn.metrics.roc_auc_score(y_test, y_pred)
    aocs.append(aoc)

# Long-format (fpr, tpr) table across splits, written out as TSV.
tpr_df = pd.DataFrame(tpr_interps)
tpr_df.columns = fpr_interp
tpr_df = tpr_df.melt()
tpr_df.columns = ['fpr', 'tpr']
tpr_df.to_csv(
    BIOMARKER_DIR + 'results/ML_phenotype_prediction/fpr_vs_tpr_%s_%s.tsv' % (biomarker, dataset),
    sep='\t',
)

print(np.mean(aocs))
# Fit the elastic-net classifier on the training fold; the random state is
# derived from the network year and the fold seed.
classifier = LogitNet(alpha=alpha, n_jobs=n_jobs, min_lambda_ratio=1e-8, n_lambda=150, standardize=True,
                      random_state=(int(network_year) + 1) * (seed + 1), scoring=scoring)
classifier.fit(X_train, y_train)

# Just get the results on the test set
print('Obtainig Test Set results')
X_test = feature_df.loc[test, features].copy()
# Apply the transformers (presumably fitted on the training split — confirm
# against where `dt`/`dwpct` are created) to the held-out features.
X_test.loc[:, degree_features] = dt.transform(X_test.loc[:, degree_features])
X_test.loc[:, dwpc_features] = dwpct.transform(X_test.loc[:, dwpc_features])
# Positive-class probabilities, stored back into the results frame for the
# test rows.
test_probas = classifier.predict_proba(X_test)[:, 1]
cd_df.loc[test, 'test_probas'] = test_probas


def glmnet_coefs(glmnet_obj, X, f_names):
    """Return the model coefficients paired with their feature names.

    Builds both the raw coefficient vector (intercept prepended) and a
    rescaled version computed from the feature means/standard deviations.
    """
    # NOTE(review): `l` is assigned but not used in the visible portion of
    # this function — presumably consumed further down; verify.
    l = glmnet_obj.lambda_best_[0]
    coef = glmnet_obj.coef_[0]
    # Prepend the intercept so `coef` lines up with `names` below.
    coef = np.insert(coef, 0, glmnet_obj.intercept_)
    names = np.insert(f_names, 0, 'intercept')
    # Rescaled coefficients: intercept shifted by the feature means, slopes
    # multiplied by each feature's std — looks like standardized-scale
    # coefficients; TODO confirm intended convention.
    z_intercept = coef[0] + sum(coef[1:] * X.mean(axis=0))
    z_coef = coef[1:] * X.values.std(axis=0)
    z_coef = np.insert(z_coef, 0, z_intercept)
    # NOTE(review): function body appears truncated here — the return
    # statement lies outside the visible chunk.