def fit_gam(X, y, comment, use_x_normalization):
    """Report 10-fold cross-validated accuracy of a default ``LogisticGAM``.

    Prints a banner containing ``comment`` followed by the mean training and
    validation accuracy (in percent) over the folds.

    Args:
        X: Feature array; indexed with integer index arrays (``X[idx]``).
        y: Label array, indexed the same way as ``X``.
        comment: Text printed in the banner above the results.
        use_x_normalization: When true, features are standardized with
            ``StandardScaler`` before fitting.

    Returns:
        None. Results are only printed.
    """
    print("------------------------------")
    print(comment)
    print("------------------------------")

    # KFold is created without random_state, so its shuffle draws from the
    # global NumPy RNG seeded here.
    np.random.seed(0)

    train_scores = []
    val_scores = []
    kf = KFold(n_splits=10, shuffle=True)
    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        if use_x_normalization:
            # Fit the scaler on the training fold only. Scaling the whole of X
            # before splitting would let validation-fold statistics leak into
            # the model and bias the validation score.
            scaler = StandardScaler().fit(X_train)
            X_train = scaler.transform(X_train)
            X_val = scaler.transform(X_val)

        clf = LogisticGAM()
        clf.fit(X_train, y_train)
        train_scores.append(clf.accuracy(X_train, y_train) * 100)
        val_scores.append(clf.accuracy(X_val, y_val) * 100)

    print('Training accuracy: {:.2f}%'.format(np.mean(train_scores)))
    print('Validation accuracy: {:.2f}%'.format(np.mean(val_scores)))
    print()
def main():
    """Compare GAM accuracy with and without feature scaling, then test.

    Loads the preprocessed feature and label CSV files, holds out 10% of the
    rows as a test set, prints cross-validated accuracy for both
    normalization settings via ``fit_gam``, and finally fits a
    ``LogisticGAM`` on the standardized training split and prints its
    training and testing accuracy.
    """
    # Column index 5 is not loaded (it is absent from usecols).
    X = pd.read_csv(
        './dataset/gradcafe/cs_preprocessed_X.csv',
        usecols=[0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]).values
    # Alternative feature file:
    # X = pd.read_csv('./dataset/gradcafe/pnp_x.csv', header=None).values
    y = pd.read_csv('./dataset/gradcafe/cs_preprocessed_Y.csv').values.reshape(-1)

    np.random.seed(0)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=0)

    fit_gam(X_train, y_train, "Without normalization on X", False)
    fit_gam(X_train, y_train, "With normalization on X", True)

    # Normalization is better.
    # The scaler is fitted on the training split only and then reused for the
    # test split. Fitting a second scaler on X_test would standardize it with
    # its own mean/variance, putting it in a different feature space than the
    # one the model was trained on.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    np.random.seed(0)
    clf = LogisticGAM()
    clf.fit(X_train, y_train)
    training_accuracy = clf.accuracy(X_train, y_train) * 100
    testing_accuracy = clf.accuracy(X_test, y_test) * 100

    print("------------------------------")
    print("Results with normalization on testing set")
    print("------------------------------")
    print('Training accuracy: {:.2f}%'.format(training_accuracy))
    print('Testing accuracy: {:.2f}%'.format(testing_accuracy))
    print()
def spline_classification_plot(ax, X, y, X_eval, y_eval, gam_ref):
    """Fit a monotone spline ``LogisticGAM``, plot it on ``ax`` and score it.

    A single monotonically increasing spline term (5 splines) on feature 0
    is fitted to ``(X, y)`` with ``gridsearch``. The fitted probability curve
    and its 95% confidence intervals are drawn over 100 points on [0, 1],
    and the metrics computed on ``(X_eval, y_eval)`` are written into a text
    box on the axes.

    Args:
        ax: Matplotlib axes to draw on.
        X: Training inputs for the GAM.
        y: Training labels for the GAM.
        X_eval: Inputs on which the metrics are computed.
        y_eval: Labels matching ``X_eval``.
        gam_ref: Reference model handed to ``MseEval`` together with the
            fitted GAM.

    Returns:
        Tuple ``(ece, mce, brier, acc, mse, ax, confi)`` where ``confi`` is
        the 95% confidence interval of the GAM evaluated at ``X_eval``.
    """
    # documentation of LogisticGAM: https://pygam.readthedocs.io/en/latest/api/logisticgam.html
    gam = LogisticGAM(s(0, constraints='monotonic_inc', n_splines=5)).gridsearch(X, y)

    # Fitted curve (green) and its confidence band (red, dashed).
    XX = np.linspace(0, 1, 100)
    ax.plot(XX, gam.predict_proba(XX), c='g')
    ax.plot(XX, gam.confidence_intervals(XX, width=0.95), c='r', ls='--')

    # compute ece and acc after calibration
    y_ = gam.predict_proba(X_eval)
    # Two columns per sample: (1 - p, p). Built once and shared by the
    # three evaluators below.
    probs = np.array([1 - y_, y_]).T
    ece = EceEval(probs, y_eval, num_bins=100)
    mce = MceEval(probs, y_eval, num_bins=100)
    brier = BrierEval(probs, y_eval)
    mse = MseEval(gam, gam_ref, num_bins=100)
    acc = gam.accuracy(X_eval, y_eval)

    ax.text(0.05,
            0.75,
            'ECE=%.4f\nMCE=%.4f\nBrier=%.4f\nACC=%.4f\nMSE=%.4f' %
            (ece, mce, brier, acc, mse),
            size=6,
            ha='left',
            va='center',
            bbox={
                'facecolor': 'green',
                'alpha': 0.5,
                'pad': 4
            })
    ax.set_xlim(0.0, 1.0)
    ax.set_ylim(0.0, 1.0)

    confi = gam.confidence_intervals(X_eval, width=0.95)
    # summary() prints the report itself, so it is called directly rather
    # than through the Python 2-only `print` statement, which is a syntax
    # error on Python 3.
    gam.summary()
    return ece, mce, brier, acc, mse, ax, confi
def spline_classification(X, y, X_eval, y_eval, gam_ref):
    """Fit a monotone spline ``LogisticGAM`` and score it on held-out data.

    Same model and metrics as the plotting variant, without any drawing: one
    monotonically increasing spline term (5 splines) on feature 0, tuned
    with ``gridsearch`` on ``(X, y)``.

    Args:
        X: Training inputs for the GAM.
        y: Training labels for the GAM.
        X_eval: Inputs on which the metrics are computed.
        y_eval: Labels matching ``X_eval``.
        gam_ref: Reference model handed to ``MseEval`` together with the
            fitted GAM.

    Returns:
        Tuple ``(ece, mce, brier, acc, mse, confi)`` where ``confi`` is the
        95% confidence interval of the GAM evaluated at ``X_eval``.
    """
    # documentation of LogisticGAM: https://pygam.readthedocs.io/en/latest/api/logisticgam.html
    spline_term = s(0, constraints='monotonic_inc', n_splines=5)
    calibrator = LogisticGAM(spline_term).gridsearch(X, y)

    # compute ece and acc after calibration
    pos_prob = calibrator.predict_proba(X_eval)

    def class_probs():
        # Fresh two-column array (1 - p, p) for each evaluator call.
        return np.array([1 - pos_prob, pos_prob]).T

    ece = EceEval(class_probs(), y_eval, num_bins=100)
    mce = MceEval(class_probs(), y_eval, num_bins=100)
    brier = BrierEval(class_probs(), y_eval)
    mse = MseEval(calibrator, gam_ref, num_bins=100)
    acc = calibrator.accuracy(X_eval, y_eval)

    # compute the confidence on datapoints of X_eval
    intervals = calibrator.confidence_intervals(X_eval, width=0.95)

    return ece, mce, brier, acc, mse, intervals
# Spline counts tried by the first grid search below.
n_splines = [5, 10, 15, 20, 25]

# Scale by 6 and shift by -3 (maps onto -3..3 if lams starts on 0..1 --
# TODO confirm where lams is created), then exponentiate.
lams = np.exp(lams * 6 - 3)

# Constraint options tried by the second grid search.
cons = [
    'convex',
    'concave',
    'monotonic_inc',
    'monotonic_dec',
    'circular',
    'none',
]

# Two searches in sequence: first over lam / n_splines, then over the
# constraints, starting from the model returned by the first search.
random = LogisticGAM(aa).gridsearch(
    trainX, trainy, weights=w, lam=lams, n_splines=n_splines)
random = random.gridsearch(trainX, trainy, constraints=cons)

# Selected hyper-parameters and accuracy on the test split.
print(random.lam)
print(random.n_splines)
print(random.constraints)
print(random.accuracy(testX, testy))

from sklearn.metrics import confusion_matrix

preds = random.predict(testX)
print(confusion_matrix(testy, preds))

# One figure per non-intercept term: partial dependence plus its 95%
# confidence band (red, dashed).
for i, term in enumerate(random.terms):
    if not term.isintercept:
        XX = random.generate_X_grid(term=i)
        pdep, confi = random.partial_dependence(term=i, X=XX, width=0.95)
        plt.figure()
        plt.plot(XX[:, term.feature], pdep)
        plt.plot(XX[:, term.feature], confi, c='r', ls='--')
        plt.title(names1[i])
        plt.show()
ds = load_breast_cancer()

# select first 6 features only (columns and their names)
X = ds.data[:, 0:6]
y = ds.target
selected_features = ds.feature_names[0:6]

#-----------------------------------------------------
# Fit a model with the default parameters
gam = LogisticGAM().fit(X, y)
gam.summary()

# The two expressions below are evaluated and their results discarded; the
# trailing numbers are the values recorded next to them in the original.
roc_auc_score(y, gam.predict_proba(X))  # 0.994173140954495
gam.accuracy(X, y)  # 0.9560632688927944

#-----------------------------------------------------
# Explore and interpret individual features
plt.ion()
plt.rcParams['figure.figsize'] = (28, 8)

# One subplot per selected feature, laid out in a single row.
fig, axs = plt.subplots(nrows=1, ncols=X.shape[1])
for i, ax in enumerate(axs):
    # NOTE(review): `ax` is not used in the visible part of this loop; the
    # plotting calls presumably follow.
    XX = gam.generate_X_grid(term=i, meshgrid=True)
    pdep, confi = gam.partial_dependence(
        term=i, X=XX, meshgrid=True, width=0.95)
# 10-fold cross-validated accuracy of logreg on the scaled polynomial
# features.
acc_log3 = cross_val_score(logreg, X_train_scaled_poly, Y_train, cv=10, scoring='accuracy').mean()
# Report the score computed on the line above (previously printed acc_log2,
# a value belonging to a different model).
print("\n calculate cross-validated accurancy (M2. X_train_scaled_poly):", acc_log3)

# Out-of-fold predictions for the same model/features.
acc_logs3 = cross_validation.cross_val_predict(logreg, X_train_scaled_poly, Y_train, cv=10)
# Score the predictions just computed (previously scored acc_logs2).
print(metrics.accuracy_score(Y_train, acc_logs3))
print(metrics.classification_report(Y_train, acc_logs3))
# NOTE(review): coef_ and the predict() call further down need `logreg` to
# have been fitted elsewhere; the cross-validation helpers above are not
# expected to fit it in place -- confirm.
print(logreg.coef_)
print('\n ------------------------------------------------------------------')
# call predict_proba() to get the list of probabilities that the classifier assigned to each instance for each class:

###############################################################################################################################
# GAM
import pandas as pd
from pygam import LogisticGAM

# Fit a model with the default parameters
gam = LogisticGAM().fit(X_train_scaled, Y_train)
gam.summary()
print('gam.accuracy(X_train_scaled, Y_train):', gam.accuracy(X_train_scaled, Y_train))
print('gam.accuracy(X_test_scaled, Y_test):', gam.accuracy(X_test_scaled, Y_test))

acc_loggamc = cross_val_score(gam, X_train_scaled, Y_train, cv=10, scoring='accuracy').mean()
print('acc_loggam_cross-validation, train_scaled', acc_loggamc)

# make predictions for testing set (with the logistic regression model)
Y_scaler_pred_class = logreg.predict(X_test_scaled)

# calculate testing accuracy
from sklearn import metrics
print('\n ------------------------------------------------------------------')
print("\n calculate testing accuracy (M1. X_train_scaled):", metrics.accuracy_score(Y_test, Y_scaler_pred_class))
print('\n ------------------------------------------------------------------')

# ROC curves and AUC
# https://www.medcalc.org/manual/roc-curves.php