import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


def fit_pygam_model(X_train: pd.DataFrame,
                    X_test: pd.DataFrame,
                    y_train: pd.DataFrame,
                    y_test: pd.DataFrame):
    '''
    Fits a generalized additive model (LinearGAM, normally distributed errors)
    with a grid search over the n_splines and lam regularization
    hyperparameters. Returns the best model found.
    '''
    from pygam import LinearGAM
    gam = LinearGAM().gridsearch(X_train.values,
                                 y_train.values.ravel(),
                                 n_splines=np.arange(3, 20),
                                 lam=np.logspace(-3, 3, 11))
    gam.summary()  # summary() prints to stdout and returns None

    y_train_predicted = gam.predict(X_train)
    y_test_predicted = np.floor(gam.predict(X_test))  # round test predictions down to integers

    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_predicted))
    mae_train = mean_absolute_error(y_train, y_train_predicted)
    r2_train = r2_score(y_train, y_train_predicted)
    print("RMSE of training set is {}".format(rmse_train))
    print("MAE of testing set is {}".format(mae_train))
    print("R2 score of training set is {}\n".format(r2_train))

    if len(y_test) > 0:
        rmse_test = np.sqrt(mean_squared_error(y_test, y_test_predicted))
        mae_test = mean_absolute_error(y_test, y_test_predicted)
        r2_test = r2_score(y_test, y_test_predicted)
        print("RMSE of testing set is {}".format(rmse_test))
        print("MAE of testing set is {}".format(mae_test))
        print("R2 score of testing set is {}\n".format(r2_test))
    # Visualize feature significance and confidence intervals
    num_features = len(X_train.columns)
    fig = plt.figure(figsize=(18, 12))
    fig.subplots_adjust(hspace=0.4)

    p_values = gam.statistics_['p_values']

    for i in range(num_features):
        axs = fig.add_subplot(num_features, 1, i + 1)
        m = gam.generate_X_grid(term=i)
        axs.plot(m[:, i],
                 gam.partial_dependence(term=i,
                                        X=m))  # partial dependence of term i
        axs.plot(m[:, i],
                 gam.partial_dependence(term=i, X=m, width=.95)[1],
                 c='r',
                 ls='--')  # 95% confidence intervals
        # star the feature name when its term's p-value is below 0.05
        axs.set_title(X_train.columns[i] +
                      ('*' if p_values[i] < 0.05 else ''))

    return gam
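# A minimal, self-contained usage sketch with synthetic data; the frame and
# column names below are illustrative, not from the original.
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(500, 3)), columns=['x0', 'x1', 'x2'])
df['y'] = np.sin(df['x0']) + 0.5 * df['x1'] + rng.normal(scale=0.1, size=500)

X_train, X_test, y_train, y_test = train_test_split(
    df[['x0', 'x1', 'x2']], df[['y']], test_size=0.2, random_state=0)
best_gam = fit_pygam_model(X_train, X_test, y_train, y_test)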
Example #2
def GAM_linear(X, y):
    import pandas as pd
    from pygam import LinearGAM, s, f
    X = X.to_numpy()
    y = y.to_numpy()
    # spline terms on the first two features, a factor term on the third
    gam = LinearGAM(s(0) + s(1) + f(2))
    gam.gridsearch(X, y)
    gam.summary()  # summary() prints to stdout and returns None
    y_pred = pd.DataFrame(gam.predict(X))
    y_pred['actual'] = y
    y_pred['residual'] = y_pred.actual - y_pred[0]
    return gam, y_pred
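# A hedged usage sketch with pygam's bundled wage data (three features; the
# third is categorical, matching the f(2) term above).
import pandas as pd
from pygam.datasets import wage

X_wage, y_wage = wage(return_X_y=True)
gam, y_pred = GAM_linear(pd.DataFrame(X_wage), pd.Series(y_wage))
print(y_pred.head())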
Example #3
from pygam import LinearGAM, s


def fit_gam_plot_dependencies(df=None,
                              features=None,
                              target=None,
                              basis_1=s,
                              basis_2=False,
                              summary=False):
    X = df[features]
    y = df[target]

    if basis_1 and basis_2:
        gam = LinearGAM(basis_1(0, lam=60) + basis_2(1, lam=60),
                        fit_intercept=True).fit(X, y)

    elif basis_1:
        gam = LinearGAM(basis_1(0, lam=60), fit_intercept=True).fit(X, y)

    else:
        raise ValueError('no basis provided for the features')

    if summary:
        gam.summary()  # prints to stdout; returns None
    plot_gam_partial_dependencies(gam, features, target)
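# `plot_gam_partial_dependencies` is not defined in this snippet; a minimal
# sketch of what such a helper might look like, assuming one panel per
# feature with a 95% confidence band (the helper body here is hypothetical).
import numpy as np
import matplotlib.pyplot as plt


def plot_gam_partial_dependencies(gam, features, target):
    fig, axs = plt.subplots(1, len(features), figsize=(6 * len(features), 4))
    for i, (ax, name) in enumerate(zip(np.atleast_1d(axs), features)):
        XX = gam.generate_X_grid(term=i)
        pdep, confi = gam.partial_dependence(term=i, X=XX, width=0.95)
        ax.plot(XX[:, i], pdep)
        ax.plot(XX[:, i], confi, c='r', ls='--')
        ax.set_title(name)
    fig.suptitle(target)
    plt.show()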
Example #4

plt.tight_layout()
plt.show()

### Generalized Additive Models

predictors = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms', 'BldgGrade']
outcome = 'AdjSalePrice'
X = house_98105[predictors].values
y = house_98105[outcome]

## model
from pygam import LinearGAM, s, l

gam = LinearGAM(s(0, n_splines=12) + l(1) + l(2) + l(3) + l(4))
gam.gridsearch(X, y)
gam.summary()  # prints to stdout; returns None

fig, axes = plt.subplots(figsize=(8, 8), ncols=2, nrows=3)

titles = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms', 'BldgGrade']
for i, title in enumerate(titles):
    ax = axes[i // 2, i % 2]
    XX = gam.generate_X_grid(term=i)
    ax.plot(XX[:, i], gam.partial_dependence(term=i, X=XX))
    ax.plot(XX[:, i],
            gam.partial_dependence(term=i, X=XX, width=.95)[1],
            c='r',
            ls='--')
    ax.set_title(titles[i])

axes[2][1].set_visible(False)
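# A short follow-up sketch, assuming the fitted model above, to check the
# in-sample fit (house_98105 is loaded earlier in the original source).
import numpy as np

predictions = gam.predict(X)
print('RMSE:', np.sqrt(np.mean((predictions - y) ** 2)))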
Example #5
import sys
import pickle
import numpy as np
import matplotlib.pyplot as plt
from pygam import LinearGAM, s

# X, y and `timestamp` are assumed to be defined earlier in the original script
print('Read data.')

# 150,000 candidate lambda vectors, log-uniform on roughly [e^-3, e^5]
lams = np.random.rand(150000, 4) * 8 - 3
lams = np.exp(lams)

# randomized grid search
print('Initialized Linear GAM.')
gam_grid = LinearGAM(s(0) + s(1) + s(2) + s(3))
print("Grid searching Linear GAM's lambdas.")
gam_grid.gridsearch(X, y, lam=lams)

with open(f"models/{timestamp} {sys.argv[1]}.pickle", "wb") as handle:
    pickle.dump(gam_grid, handle)
print('Serialized GAM as pickle.')

gam_grid.summary()  # prints to stdout; returns None
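# To reuse the serialized model later, a sketch of the matching load step
# (same path as the dump above).
with open(f"models/{timestamp} {sys.argv[1]}.pickle", "rb") as handle:
    gam_loaded = pickle.load(handle)
gam_loaded.summary()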

# plotting
fig, axs = plt.subplots(1, 3, figsize=(16, 16 / 1.618))

titles = ["pm10median", "time", "tmpd"]
for i, ax in enumerate(axs):
    XX = gam_grid.generate_X_grid(term=i)
    ax.plot(XX[:, i], gam_grid.partial_dependence(term=i, X=XX))
    ax.plot(
        XX[:, i],
        gam_grid.partial_dependence(term=i, X=XX, width=0.95)[1],
        c="r",
        ls="--",
    )
Example #6
plt.xlabel('-1*disp')
plt.ylabel('mpg')
plt.title('LOESS Smoothing')
plt.show()
'''
-------------------------------------------------------------------------------
------------------------Generalized Additive Models----------------------------
-------------------------------------------------------------------------------
'''

#GAMs
#https://github.com/dswah/pyGAM
#https://codeburst.io/pygam-getting-started-with-generalized-additive-models-in-python-457df5b4705f
from pygam import LinearGAM, LogisticGAM
gam_model = LinearGAM().fit(d[['disp', 'wt']], d['mpg'])
gam_model.summary()  # prints to stdout; returns None
gam_predictions = gam_model.predict(d[['disp', 'wt']])
gam_mse = np.mean((gam_predictions - d['mpg'])**2)
print('MSE:', gam_mse)

#Plot the predictions with confidence intervals
plt.plot(list(d.index), gam_predictions, 'r--')
plt.plot(list(d.index),
         gam_model.prediction_intervals(d[['disp', 'wt']], width=.95),
         color='b',
         ls='--')
plt.scatter(list(d.index), d['mpg'], facecolor='gray', edgecolors='none')
plt.xlabel('Row Index')
plt.ylabel('mpg')
plt.title('GAM Prediction with 95% Confidence Interval')
plt.show()
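# LogisticGAM is imported above but never used; a hedged sketch of the
# analogous classification fit on pygam's bundled credit-default dataset.
from pygam.datasets import default

X_bin, y_bin = default(return_X_y=True)
clf = LogisticGAM().gridsearch(X_bin, y_bin)
print('training accuracy:', clf.accuracy(X_bin, y_bin))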
Example #7
# XX and Z come from an earlier (truncated) tensor-product surface example
ax = plt.axes(projection='3d')
ax.plot_surface(XX[0], XX[1], Z, cmap='viridis')

# Simple interactions via by=, compared with te()

from pygam import LinearGAM, s, te
from pygam.datasets import toy_interaction

X, y = toy_interaction(return_X_y=True)

gam = LinearGAM(s(0, by=1)).fit(X, y)
gam.summary()

gam1 = LinearGAM(s(0) + s(1)).fit(X, y)
gam1.summary()

gam2 = LinearGAM(te(0, 1)).fit(X, y)
gam2.summary()

import pandas as pd

pd.DataFrame(X).corr()
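# To compare the three interaction specifications beyond their summaries, a
# sketch using each fit's AIC from statistics_ (lower is better).
for name, model in [('s(0, by=1)', gam), ('s(0) + s(1)', gam1), ('te(0, 1)', gam2)]:
    print(name, model.statistics_['AIC'])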

######################################################
# regression

from pygam import LinearGAM, s, f
from pygam.datasets import wage

X, y = wage(return_X_y=True)
Example #8
from pygam.datasets import wage
from pygam import LinearGAM, s, f
import numpy as np
import matplotlib.pyplot as plt

X, y = wage()

gam = LinearGAM(s(0, n_splines=5) + s(1) + f(2)).fit(X, y)

gam.summary()

lam = np.logspace(-3, 5, 5)
lams = [lam] * 3

gam.gridsearch(X, y, lam=lams)
gam.summary()

lams = np.random.rand(100, 3)  # random points on [0, 1], with shape (100, 3)
lams = lams * 8 - 3  # shift values to the interval [-3, 5]
lams = np.exp(lams)  # exponentiate: lambdas roughly in [0.05, 150]

random_gam = LinearGAM(s(0) + s(1) + f(2)).gridsearch(X, y, lam=lams)
random_gam.summary()

print(gam.statistics_['GCV'] < random_gam.statistics_['GCV'])

for i, term in enumerate(gam.terms):
    if term.isintercept:
        continue
    # plot each term's partial dependence with a 95% confidence band
    XX = gam.generate_X_grid(term=i)
    pdep, confi = gam.partial_dependence(term=i, X=XX, width=0.95)
    plt.figure()
    plt.plot(XX[:, term.feature], pdep)
    plt.plot(XX[:, term.feature], confi, c='r', ls='--')
    plt.title(repr(term))
Example #9
#https://pygam.readthedocs.io/en/latest/notebooks/quick_start.html#
import pygam

from pygam.datasets import wage

X, y = wage()

from pygam import LinearGAM, s, f

#Let’s fit a spline term to the first 2 features, and a factor term to the 3rd feature.

gam = LinearGAM(s(0) + s(1) + f(2)).fit(X, y)

gam.summary()
Example #10
# https://pygam.readthedocs.io/en/latest/notebooks/quick_start.html

from pygam.datasets import wage

X, y = wage()

X.shape, y.shape

from pygam import LinearGAM, s, f

gam = LinearGAM(s(0) + s(1) + f(2)).fit(X, y)
gam.summary()

# by default, s() uses 20 basis functions
gam2 = LinearGAM(s(0, n_splines=5) + s(1) + f(2)).fit(X, y)
gam2.summary()

# by default every term has a lambda penalty of 0.6;
# run a grid search to optimize lambda using GCV
# (the generalized cross-validation score)

import numpy as np

lam = np.logspace(-3, 5, 5)
lams = [lam] * 3
lams

gam3 = LinearGAM(s(0) + s(1) + f(2))
gam3.gridsearch(X, y, lam=lams)
gam3.summary()
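# A short sketch to inspect what the grid search on gam3 actually selected.
print(gam3.lam)                 # smoothing parameters chosen per term
print(gam3.statistics_['GCV'])  # resulting GCV score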
# red wine quality data; the `redwine` DataFrame is assumed loaded earlier
X = redwine.drop('quality', axis=1).values
y = redwine['quality']
feature_names = redwine.columns[:-1]

# build linear and Poisson GAMs

from pygam import PoissonGAM, LinearGAM

lams = np.logspace(-10, 10, 10)

poiss = PoissonGAM().gridsearch(X, y, lam=lams)
poiss.summary()

lin = LinearGAM().gridsearch(X, y, lam=lams)
lin.summary()
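# Since quality is an integer score, it is natural to compare the two fits;
# a sketch using in-sample mean absolute error.
print('Poisson MAE:', np.mean(np.abs(poiss.predict(X) - y)))
print('Linear  MAE:', np.mean(np.abs(lin.predict(X) - y)))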

fig, axs = plt.subplots(1, 11, figsize=(40, 8))
for i, ax in enumerate(axs):
    XX = poiss.generate_X_grid(term=i)
    ax.plot(XX[:, i], poiss.partial_dependence(term=i, X=XX))
    ax.plot(XX[:, i],
            poiss.partial_dependence(term=i, X=XX, width=.95)[1],
            c='r',
            ls='--')
    if i == 0:
        ax.set_ylim(-30, 30)
    ax.set_title(feature_names[i])

plt.figure()