def fit_pygam_model(X_train: pandas.core.frame.DataFrame, X_test: pandas.core.frame.DataFrame, y_train: pandas.core.frame.DataFrame, y_test: pandas.core.frame.DataFrame):
    '''
    Creates a general additive model LinearGAM (normally distributed errors)
    with grid search over the hyperparameters n_splines and the lam
    regularization parameter, prints train/test error metrics, plots each
    feature's partial dependence with 95% confidence intervals, and returns
    the best model.
    '''
    from pygam import LinearGAM

    gam = LinearGAM().gridsearch(
        X_train.values, y_train,
        n_splines=np.arange(3, 20),
        lam=np.logspace(-3, 3, 11),
    )
    # summary() prints directly and returns None; wrapping it in print()
    # emitted a stray "None" line.
    gam.summary()

    y_train_predicted = gam.predict(X_train)
    # NOTE(review): only the test predictions are floored — presumably the
    # target is integer-valued; confirm this asymmetry is intentional.
    y_test_predicted = np.floor(gam.predict(X_test))

    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_predicted))
    mae_train = mean_absolute_error(y_train, y_train_predicted)
    r2_train = r2_score(y_train, y_train_predicted)
    print("RMSE of training set is {}".format(rmse_train))
    # Fixed label: this is the TRAINING MAE (was mislabeled "testing set").
    print("MAE of training set is {}".format(mae_train))
    print("R2 score of training set is {}\n".format(r2_train))

    if len(y_test) > 0:
        rmse_test = np.sqrt(mean_squared_error(y_test, y_test_predicted))
        mae_test = mean_absolute_error(y_test, y_test_predicted)
        r2_test = r2_score(y_test, y_test_predicted)
        print("RMSE of testing set is {}".format(rmse_test))
        print("MAE of testing set is {}".format(mae_test))
        print("R2 score of testing set is {}\n".format(r2_test))

    ''' Visualize the feature significance and confidence intervals '''
    num_features = len(X_train.columns)
    fig = plt.figure(figsize=(18, 12))
    fig.subplots_adjust(hspace=0.4)
    p_values = gam.statistics_['p_values']
    for i in range(num_features):
        axs = fig.add_subplot(num_features, 1, i + 1)
        m = gam.generate_X_grid(term=i)
        axs.plot(m[:, i], gam.partial_dependence(term=i, X=m))  # this is the actual coefficents
        axs.plot(m[:, i], gam.partial_dependence(term=i, X=m, width=.95)[1], c='r', ls='--')  # this plots the confidence intervals
        # p_values[i] corresponds to term i (the intercept's p-value is
        # appended last); the old p_values[cnt] was off by one.
        axs.set_title(X_train.columns[i] + ('*' if p_values[i] < 0.05 else ''))

    return gam
def GAM_linear(X, y):
    """Fit a LinearGAM (splines on features 0-1, factor on 2) via grid search.

    Returns a 3-tuple: the fitted model, the result of gam.summary()
    (which prints to stdout and returns None), and a DataFrame holding
    predictions (column 0), the actuals, and the residuals.
    """
    from pygam import LinearGAM, s, f, te

    features = X.to_numpy()
    target = y.to_numpy()

    gam = LinearGAM(s(0) + s(1) + f(2))
    gam.gridsearch(features, target)

    preds = pd.DataFrame(gam.predict(features))
    preds['actual'] = target
    preds['residual'] = preds['actual'] - preds[0]

    # summary() prints as a side effect; kept in the return for interface parity.
    return gam, gam.summary(), preds
def fit_gam_plot_dependencies(df=None, features=None, target=None, basis_1=s, basis_2=False, summary=False):
    """Fit a LinearGAM on one or two features of df and plot partial dependencies.

    Parameters:
        df: DataFrame holding the data.
        features: column names used as predictors.
        target: column name of the response.
        basis_1: pygam term constructor for feature 0 (e.g. s or f).
        basis_2: optional term constructor for feature 1; falsy to skip.
        summary: if True, print the model summary.

    Raises:
        ValueError: if no basis is supplied for the features.
    """
    X = df[features]
    y = df[target]
    if basis_1 and basis_2:
        gam = LinearGAM(basis_1(0, lam=60) + basis_2(1, lam=60), fit_intercept=True).fit(X, y)
    elif basis_1:
        gam = LinearGAM(basis_1(0, lam=60), fit_intercept=True).fit(X, y)
    else:
        # Was a bare print() that fell through to a NameError on `gam`;
        # fail fast with an explicit error instead.
        raise ValueError('no basis called for features.. error')
    if summary:
        # summary() prints directly and returns None; print() around it
        # emitted a stray "None".
        gam.summary()
    plot_gam_partial_dependencies(gam, features, target)
'SqFtTotLiving', ax)  # NOTE(review): fragment — closes a call that starts before this chunk
plt.tight_layout()
plt.show()

### Generalized Additive Models
# Model adjusted sale price for zip 98105: a 12-spline term on living area
# and linear terms on the remaining predictors.
# NOTE(review): assumes LinearGAM, s and l are imported from pygam and
# house_98105 is defined earlier in the file — TODO confirm.
predictors = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms', 'BldgGrade']
outcome = 'AdjSalePrice'
X = house_98105[predictors].values
y = house_98105[outcome]

## model
gam = LinearGAM(s(0, n_splines=12) + l(1) + l(2) + l(3) + l(4))
gam.gridsearch(X, y)
print(gam.summary())

# One partial-dependence panel per predictor on a 3x2 grid; the unused
# sixth panel is hidden.
fig, axes = plt.subplots(figsize=(8, 8), ncols=2, nrows=3)
titles = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms', 'BldgGrade']
for i, title in enumerate(titles):
    ax = axes[i // 2, i % 2]
    XX = gam.generate_X_grid(term=i)
    ax.plot(XX[:, i], gam.partial_dependence(term=i, X=XX))
    # dashed red line: 95% confidence band around the partial dependence
    ax.plot(XX[:, i], gam.partial_dependence(term=i, X=XX, width=.95)[1], c='r', ls='--')
    ax.set_title(titles[i])
axes[2][1].set_visible(False)
# Randomized lambda search for a 4-term LinearGAM, then serialize the
# fitted model to disk and plot per-term partial dependencies.
# NOTE(review): assumes X, y, timestamp, and the np/plt/pickle/sys/pygam
# imports are defined earlier in the file — TODO confirm.
print('Read data.')
# 150k candidate rows, one lambda per smooth term; exp of uniform [-3, 5)
# gives log-uniform candidates in roughly (0.05, 148).
lams = np.random.rand(150000, 4) * 8 - 3
lams = np.exp(lams)  # randomized grid search
print('Initialized Linear GAM.')
gam_grid = LinearGAM(s(0) + s(1) + s(2) + s(3))
print("Grid searching Linear GAM's lambdas.")
gam_grid.gridsearch(X, y, lam=lams)
# Persist the best model; filename carries a timestamp plus the script's
# first CLI argument.
with open(f"models/{timestamp} {sys.argv[1]}.pickle", "wb") as handle:
    pickle.dump(gam_grid, handle)
print('Serialized GAM as pickle.')
print(gam_grid.summary())
# plotting
plt.figure(figsize=(16, 16 / 1.618))
fig, axs = plt.subplots(1, 3)
titles = ["pm10median", "time", "tmpd"]
for i, ax in enumerate(axs):
    XX = gam_grid.generate_X_grid(term=i)
    ax.plot(XX[:, i], gam_grid.partial_dependence(term=i, X=XX))
    # dashed red line: 95% confidence band
    ax.plot(
        XX[:, i],
        gam_grid.partial_dependence(term=i, X=XX, width=0.95)[1],
        c="r",
        ls="--",
    )
plt.xlabel('-1*disp')
plt.ylabel('mpg')
plt.title('LOESS Smoothing')
plt.show()

'''
-------------------------------------------------------------------------------
------------------------Generalized Additive Models----------------------------
-------------------------------------------------------------------------------
'''
#GAMs
#https://github.com/dswah/pyGAM
#https://codeburst.io/pygam-getting-started-with-generalized-additive-models-in-python-457df5b4705f
from pygam import LinearGAM, LogisticGAM

# Fit mpg on disp and wt with default spline terms.
# NOTE(review): assumes `d` is the mtcars-style DataFrame loaded earlier.
gam_model = LinearGAM().fit(d[['disp', 'wt']], d['mpg'])
# summary() prints directly and returns None; the old print() around it
# emitted a stray "None" line.
gam_model.summary()

gam_predictions = gam_model.predict(d[['disp', 'wt']])
gam_mse = np.mean((gam_predictions - d['mpg'])**2)
print('MSE:', gam_mse)

#Plot the predictions with confidence intervals
plt.plot(list(d.index), gam_predictions, 'r--')
plt.plot(list(d.index),
         gam_model.prediction_intervals(d[['disp', 'wt']], width=.95),
         color='b', ls='--')
plt.scatter(list(d.index), d['mpg'], facecolor='gray', edgecolors='none')
plt.xlabel('Row Index')
plt.ylabel('mpg')
# Fixed typo in the title: 'Condidence' -> 'Confidence'.
plt.title('GAM Prediction with 95% Confidence Interval')
plt.show()
# NOTE(review): XX and Z come from code before this chunk — surface plot of
# a fitted interaction.
ax = plt.axes(projection='3d')
ax.plot_surface(XX[0], XX[1], Z, cmap='viridis')

#Simple interactions, compare with te()
# Bug fix: te was used below but never imported -> NameError at gam2.
from pygam import LinearGAM, s, te
from pygam.datasets import toy_interaction

X, y = toy_interaction(return_X_y=True)
# s(0, by=1): spline on feature 0 scaled by feature 1 (a "by" interaction).
gam = LinearGAM(s(0, by=1)).fit(X, y)
gam.summary()
# Additive model without interaction, for comparison.
gam1 = LinearGAM(s(0) + s(1)).fit(X, y)
gam1.summary()
# Full tensor-product interaction term.
gam2 = LinearGAM(te(0, 1)).fit(X, y)
gam2.summary()

import pandas as pd
pd.DataFrame(X).corr()

######################################################
# regression
from pygam import LinearGAM, s, f
from pygam.datasets import wage

X, y = wage(return_X_y=True)
from pygam.datasets import wage
from pygam import LinearGAM, s, f
import numpy as np
import matplotlib.pyplot as plt

# Load the pyGAM wage demo dataset.
X, y = wage()

# Splines on features 0-1 (feature 0 limited to 5 basis functions),
# factor term on feature 2.
gam = LinearGAM(s(0, n_splines=5) + s(1) + f(2)).fit(X, y)
gam.summary()

# Deterministic grid search: the same 5 log-spaced lambdas per term.
lam = np.logspace(-3, 5, 5)
lams = [lam] * 3
gam.gridsearch(X, y, lam=lams)
gam.summary()

# Randomized search: 100 lambda triples, log-uniform.
lams = np.random.rand(100, 3)  # random points on [0, 1], with shape (100, 3)
lams = lams * 8 - 3  # shift values to [-3, 5)
lams = np.exp(lams)  # transforms values to roughly e**-3 .. e**5 (~0.05 to ~148)
random_gam = LinearGAM(s(0) + s(1) + f(2)).gridsearch(X, y, lam=lams)
random_gam.summary()

# True if the deterministic search found a lower (better) GCV score.
print(gam.statistics_['GCV'] < random_gam.statistics_['GCV'])

# Per-term plotting loop; the body appears to continue past this chunk.
for i, term in enumerate(gam.terms):
    if term.isintercept:
        continue
    XX = gam.generate_X_grid(term=i)
# Quick-start example from the pyGAM docs:
#https://pygam.readthedocs.io/en/latest/notebooks/quick_start.html#
import pygam
from pygam import LinearGAM, s, f
from pygam.datasets import wage

# Load the wage demo dataset.
X, y = wage()

# Spline terms on the first two features, a factor term on the third.
gam = LinearGAM(s(0) + s(1) + f(2)).fit(X, y)
gam.summary()
# Quick-start walkthrough from the pyGAM docs:
# https://pygam.readthedocs.io/en/latest/notebooks/quick_start.html
import numpy as np

from pygam.datasets import wage
from pygam import LinearGAM, s, f

X, y = wage()
X.shape, y.shape  # notebook-style display; a no-op when run as a script

# Default fit: each spline term gets 20 basis functions.
gam = LinearGAM(s(0) + s(1) + f(2)).fit(X, y)
gam.summary()

# Same model with the first spline restricted to 5 basis functions.
gam2 = LinearGAM(s(0, n_splines=5) + s(1) + f(2)).fit(X, y)
gam2.summary()

# Every term defaults to a lambda penalty of 0.6; grid-search lambda
# instead, scored by GCV (generalized cross-validation).
lam = np.logspace(-3, 5, 5)
lams = [lam] * 3
lams  # notebook-style display; a no-op when run as a script

gam3 = LinearGAM(s(0) + s(1) + f(2))
gam3.gridsearch(X, y, lam=lams)
gam3.summary()
# Model red-wine quality with both a Poisson and a Gaussian GAM, then plot
# per-feature partial dependencies for the Poisson fit.
# NOTE(review): assumes `redwine` (a DataFrame with a 'quality' column),
# np and plt are defined earlier in the file — TODO confirm.
X = redwine.drop('quality', axis=1).values
y = redwine['quality']
feature_names = redwine.columns[:-1]

#build linear and poisson gam
from pygam import PoissonGAM, LinearGAM

# 10 log-spaced lambda candidates spanning 1e-10 .. 1e10.
lams = np.logspace(-10, 10, 10)
poiss = PoissonGAM().gridsearch(X, y, lam=lams)
poiss.summary()
lin = LinearGAM().gridsearch(X, y, lam=lams)
lin.summary()

plt.figure()
# One partial-dependence panel per feature (11 predictors).
fig, axs = plt.subplots(1, 11, figsize=(40, 8))
for i, ax in enumerate(axs):
    XX = poiss.generate_X_grid(term=i)
    ax.plot(XX[:, i], poiss.partial_dependence(term=i, X=XX))
    # dashed red line: 95% confidence band
    ax.plot(XX[:, i], poiss.partial_dependence(term=i, X=XX, width=.95)[1], c='r', ls='--')
    if i == 0:
        # clamp the first panel's y-range for readability
        ax.set_ylim(-30, 30)
    ax.set_title(feature_names[i])
plt.figure()