Exemplo n.º 1
0
def fit_gam(X, y, comment, use_x_normalization):
    """Print the 10-fold cross-validated accuracy of a default ``LogisticGAM``.

    A banner containing ``comment`` is printed first, followed by the mean
    training and mean validation accuracy (in percent) over the folds.
    Nothing is returned.

    Args:
        X: Feature array; must support indexing with an integer index array
            (``X[train_index]``).
        y: Label array indexed the same way as ``X``.
        comment: Text printed inside the banner to identify this run.
        use_x_normalization: When true, features are standardized with
            ``StandardScaler`` inside every fold.
    """
    print("------------------------------")
    print(comment)
    print("------------------------------")
    # KFold below is created without a random_state; seeding the global NumPy
    # RNG here is presumably what keeps the shuffled folds repeatable.
    np.random.seed(0)

    train_scores = []
    val_scores = []

    kf = KFold(n_splits=10, shuffle=True)
    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        if use_x_normalization:
            # Fit the scaler on the training rows only and reuse it for the
            # validation rows.  Scaling the full X before splitting would let
            # validation statistics leak into training and bias the score.
            scaler = StandardScaler().fit(X_train)
            X_train = scaler.transform(X_train)
            X_val = scaler.transform(X_val)

        clf = LogisticGAM()
        clf.fit(X_train, y_train)

        train_scores.append(clf.accuracy(X_train, y_train) * 100)
        val_scores.append(clf.accuracy(X_val, y_val) * 100)

    print('Training accuracy: {:.2f}%'.format(np.mean(train_scores)))
    print('Validation accuracy: {:.2f}%'.format(np.mean(val_scores)))
    print()
Exemplo n.º 2
0
def main():
    """Compare GAM accuracy with and without feature scaling, then test.

    Reads the features and labels from CSV files, holds out 10% of the rows
    as a test set, prints cross-validated accuracy for both scaling settings
    via ``fit_gam``, and finally fits a ``LogisticGAM`` on the standardized
    training set and prints its training and testing accuracy.
    """
    # Column index 5 is deliberately absent from usecols, so it is not loaded.
    X = pd.read_csv(
        './dataset/gradcafe/cs_preprocessed_X.csv',
        usecols=[0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]).values
    # X = pd.read_csv('./dataset/gradcafe/pnp_x.csv', header=None).values
    y = pd.read_csv('./dataset/gradcafe/cs_preprocessed_Y.csv').values.reshape(
        -1)

    np.random.seed(0)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=0)

    fit_gam(X_train, y_train, "Without normalization on X", False)
    fit_gam(X_train, y_train, "With normalization on X", True)

    # Normalization is better.
    # Fit the scaler on the training data only and apply the same transform
    # to the test data; fitting a second scaler on X_test would put the two
    # sets on different scales and use test-set statistics.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    np.random.seed(0)
    clf = LogisticGAM()
    clf.fit(X_train, y_train)

    training_accuracy = clf.accuracy(X_train, y_train) * 100
    testing_accuracy = clf.accuracy(X_test, y_test) * 100

    print("------------------------------")
    print("Results with normalization on testing set")
    print("------------------------------")
    print('Training accuracy: {:.2f}%'.format(training_accuracy))
    print('Testing accuracy: {:.2f}%'.format(testing_accuracy))
    print()
Exemplo n.º 3
0
def spline_classification_plot(ax, X, y, X_eval, y_eval, gam_ref):
    """Fit a monotone spline ``LogisticGAM``, plot it on ``ax`` and score it.

    The fitted probability curve and its 95% confidence band are drawn over
    a 100-point grid on [0, 1], and a text box with the metric values is
    added to the axes.  The model summary is printed as a side effect.

    Args:
        ax: Matplotlib axes to draw on.
        X: Training inputs passed to ``gridsearch``.
        y: Training labels passed to ``gridsearch``.
        X_eval: Inputs the metrics are computed on.
        y_eval: Labels matching ``X_eval``.
        gam_ref: Reference model forwarded to ``MseEval``.

    Returns:
        Tuple ``(ece, mce, brier, acc, mse, ax, confi)`` where ``confi`` is
        the 95% confidence interval output for ``X_eval``.
    """
    # gam = LogisticGAM(s(0)).gridsearch(X, y)
    # documentation of LogisticGAM: https://pygam.readthedocs.io/en/latest/api/logisticgam.html
    # Single spline term on feature 0, constrained to be monotonically
    # increasing, with 5 splines.
    gam = LogisticGAM(s(0, constraints='monotonic_inc',
                        n_splines=5)).gridsearch(X, y)
    #XX = gam.generate_X_grid(term=0)
    XX = np.linspace(0, 1, 100)
    ax.plot(XX, gam.predict_proba(XX), c='g')
    ax.plot(XX, gam.confidence_intervals(XX, width=0.95), c='r', ls='--')
    # compute ece and acc after calibration
    y_ = gam.predict_proba(X_eval)
    # The metric helpers receive both class columns: [1 - p, p].
    ece = EceEval(np.array([1 - y_, y_]).T, y_eval, num_bins=100)
    mce = MceEval(np.array([1 - y_, y_]).T, y_eval, num_bins=100)
    brier = BrierEval(np.array([1 - y_, y_]).T, y_eval)
    mse = MseEval(gam, gam_ref, num_bins=100)
    acc = gam.accuracy(X_eval, y_eval)
    ax.text(0.05,
            0.75,
            'ECE=%.4f\nMCE=%.4f\nBrier=%.4f\nACC=%.4f\nMSE=%.4f' %
            (ece, mce, brier, acc, mse),
            size=6,
            ha='left',
            va='center',
            bbox={
                'facecolor': 'green',
                'alpha': 0.5,
                'pad': 4
            })
    ax.set_xlim(0.0, 1.0)
    ax.set_ylim(0.0, 1.0)
    confi = gam.confidence_intervals(X_eval, width=0.95)
    # Called as a plain statement: the Python 2 ``print gam.summary()`` form
    # is a syntax error on Python 3.
    gam.summary()
    return ece, mce, brier, acc, mse, ax, confi
Exemplo n.º 4
0
def spline_classification(X, y, X_eval, y_eval, gam_ref):
    """Fit a monotone spline ``LogisticGAM`` and score it on the eval split.

    Args:
        X: Training inputs passed to ``gridsearch``.
        y: Training labels passed to ``gridsearch``.
        X_eval: Inputs the metrics are computed on.
        y_eval: Labels matching ``X_eval``.
        gam_ref: Reference model forwarded to ``MseEval``.

    Returns:
        Tuple ``(ece, mce, brier, acc, mse, confi)`` where ``confi`` is the
        95% confidence interval output for ``X_eval``.
    """
    # LogisticGAM reference:
    # https://pygam.readthedocs.io/en/latest/api/logisticgam.html
    monotone_term = s(0, constraints='monotonic_inc', n_splines=5)
    gam = LogisticGAM(monotone_term).gridsearch(X, y)

    positive_prob = gam.predict_proba(X_eval)

    def class_probs():
        # Build a fresh array for every metric call; for a 1-D probability
        # vector p the columns are [1 - p, p].
        return np.array([1 - positive_prob, positive_prob]).T

    # Calibration metrics on the evaluation split.
    ece = EceEval(class_probs(), y_eval, num_bins=100)
    mce = MceEval(class_probs(), y_eval, num_bins=100)
    brier = BrierEval(class_probs(), y_eval)
    mse = MseEval(gam, gam_ref, num_bins=100)
    acc = gam.accuracy(X_eval, y_eval)

    # Confidence intervals at the evaluation points.
    confi = gam.confidence_intervals(X_eval, width=0.95)
    return ece, mce, brier, acc, mse, confi
Exemplo n.º 5
0
# Hyperparameter search for a LogisticGAM followed by per-term
# partial-dependence plots.
# NOTE(review): `lams`, `aa`, `w`, `trainX`, `trainy`, `testX`, `testy`,
# `names1`, `np`, `plt` and `LogisticGAM` are not defined in this snippet and
# must already exist when it runs.

# Candidate spline counts tried by the first grid search.
n_splines = [5, 10, 15, 20, 25]
# Map the incoming `lams` to exp(6 * lams - 3); if `lams` starts in [0, 1)
# the exponent spans [-3, 3) -- TODO confirm how `lams` is initialised.
lams = lams * 6  # scale by 6
lams = lams - 3  # shift by -3
lams = np.exp(lams)
# Candidate constraint names tried by the second grid search.
cons = [
    'convex', 'concave', 'monotonic_inc', 'monotonic_dec', 'circular', 'none'
]
# NOTE(review): the name `random` would shadow the stdlib `random` module if
# that module is imported in this file.
# First search: over `lam` and `n_splines`, with sample weights `w`.
random = LogisticGAM(aa).gridsearch(trainX,
                                    trainy,
                                    weights=w,
                                    lam=lams,
                                    n_splines=n_splines)
# Second search: over constraints, starting from the model returned above.
random = random.gridsearch(trainX, trainy, constraints=cons)
print(random.lam)
print(random.n_splines)
print(random.constraints)
print(random.accuracy(testX, testy))

from sklearn.metrics import confusion_matrix
# Confusion matrix of the test-set predictions.
preds = random.predict(testX)
print(confusion_matrix(testy, preds))
# One figure per non-intercept term: partial dependence plus its 95% band.
for i, term in enumerate(random.terms):
    if term.isintercept:
        continue
    XX = random.generate_X_grid(term=i)
    pdep, confi = random.partial_dependence(term=i, X=XX, width=0.95)
    plt.figure()
    plt.plot(XX[:, term.feature], pdep)
    plt.plot(XX[:, term.feature], confi, c='r', ls='--')
    # NOTE(review): the title is looked up by term index `i`, while the
    # x-axis uses `term.feature`; verify the two always coincide.
    plt.title(names1[i])
    plt.show()
Exemplo n.º 6
0
# Fit a default LogisticGAM on the first six breast-cancer features and
# compute per-feature partial dependence.
ds = load_breast_cancer()

X, y = ds.data, ds.target

# Select the first 6 features only.
X = X[:, 0:6]

# Names of the six retained features.
selected_features = ds.feature_names[0:6]

#-----------------------------------------------------
# Fit a model with the default parameters.
gam = LogisticGAM().fit(X, y)
gam.summary()

# NOTE(review): the next two results are discarded when this runs as a
# script; the trailing numbers are the values recorded by the original author.
roc_auc_score(y, gam.predict_proba(X))  #0.994173140954495
gam.accuracy(X, y)  #0.9560632688927944

#-----------------------------------------------------
# Explore and interpret individual features

plt.ion()
plt.rcParams['figure.figsize'] = (28, 8)

# One subplot per retained feature, laid out in a single row.
fig, axs = plt.subplots(1, X.shape[1])

# NOTE(review): the loop computes the partial dependence and its 95% band for
# each term but nothing is drawn on `ax` in the visible lines -- the plotting
# calls appear to be missing.
for i, ax in enumerate(axs):
    XX = gam.generate_X_grid(term=i, meshgrid=True)
    pdep, confi = gam.partial_dependence(term=i,
                                         X=XX,
                                         meshgrid=True,
                                         width=.95)
Exemplo n.º 7
0
# Model M2: 10-fold cross-validated accuracy of `logreg` on the scaled
# polynomial features.
# NOTE(review): `logreg`, `cross_val_score`, `cross_validation`, `metrics` and
# the X_*/Y_* arrays are defined outside this snippet.
acc_log3 = cross_val_score(logreg, X_train_scaled_poly, Y_train, cv=10, scoring='accuracy').mean()
# Report the score just computed for M2 (previously printed `acc_log2`, the
# result of a different model).
print("\n calculate cross-validated accurancy  (M2. X_train_scaled_poly):", acc_log3)
# NOTE(review): this relies on a `cross_validation` module being imported
# elsewhere in the file; confirm it is available in the installed
# scikit-learn version.
acc_logs3 = cross_validation.cross_val_predict(logreg, X_train_scaled_poly, Y_train, cv=10)
# Score the out-of-fold predictions for M2 (previously used `acc_logs2`).
print(metrics.accuracy_score(Y_train, acc_logs3))
print(metrics.classification_report(Y_train, acc_logs3))
print(logreg.coef_)
print('\n ------------------------------------------------------------------')
# call predict_proba() to get the list of probabilities that the classifier assigned to each instance for each class:
###############################################################################################################################
# GAM
import pandas as pd
from pygam import LogisticGAM
# Fit a model with the default parameters
gam = LogisticGAM().fit(X_train_scaled, Y_train)
gam.summary()
print('gam.accuracy(X_train_scaled, Y_train):',gam.accuracy(X_train_scaled, Y_train))
print('gam.accuracy(X_test_scaled, Y_test):',gam.accuracy(X_test_scaled, Y_test))
# 10-fold cross-validated accuracy of the GAM on the scaled training data.
acc_loggamc = cross_val_score(gam, X_train_scaled, Y_train, cv=10, scoring='accuracy').mean()
print('acc_loggam_cross-validation, train_scaled',acc_loggamc)


# make predictions for testing set
# NOTE(review): this predicts with `logreg`, not `gam`; it assumes `logreg`
# has been fitted on the scaled (non-polynomial) features -- verify.
Y_scaler_pred_class = logreg.predict(X_test_scaled)
# calculate testing accuracy
from sklearn import metrics

print('\n ------------------------------------------------------------------')
print("\n calculate testing accuracy (M1. X_train_scaled):", metrics.accuracy_score(Y_test, Y_scaler_pred_class))
print('\n ------------------------------------------------------------------')
#  ROC curves and AUC
# https://www.medcalc.org/manual/roc-curves.php