def fit_pygam_model(X_train: pandas.core.frame.DataFrame,
                    X_test: pandas.core.frame.DataFrame,
                    y_train: pandas.core.frame.DataFrame,
                    y_test: pandas.core.frame.DataFrame):
    """Fit a LinearGAM by grid search, print fit metrics and plot every term.

    The grid search runs over ``n_splines`` (3..19) and the ``lam``
    regularization parameter (11 log-spaced values between 1e-3 and 1e3).
    RMSE, MAE and R2 are printed for the training set and, when ``y_test``
    is not empty, for the test set. Test predictions are floored with
    ``np.floor`` before scoring.

    Args:
        X_train: training features; the column names are used as plot titles.
        X_test: test features.
        y_train: training target.
        y_test: test target; may be empty, in which case test metrics are
            skipped.

    Returns:
        The LinearGAM returned by the grid search.
    """
    from pygam import LinearGAM

    gam = LinearGAM().gridsearch(X_train.values, y_train,
                                 n_splines=np.arange(3, 20),
                                 lam=np.logspace(-3, 3, 11))
    print(gam.summary())

    y_train_predicted = gam.predict(X_train)
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_predicted))
    mae_train = mean_absolute_error(y_train, y_train_predicted)
    r2_train = r2_score(y_train, y_train_predicted)
    print("RMSE of training set is {}".format(rmse_train))
    # The label used to say "testing set" although the value is the training MAE.
    print("MAE of training set is {}".format(mae_train))
    print("R2 score of training set is {}\n".format(r2_train))

    if len(y_test) > 0:
        # Predict only when there is something to score, so an empty test
        # split never reaches gam.predict.
        y_test_predicted = np.floor(gam.predict(X_test))
        rmse_test = np.sqrt(mean_squared_error(y_test, y_test_predicted))
        mae_test = mean_absolute_error(y_test, y_test_predicted)
        r2_test = r2_score(y_test, y_test_predicted)
        print("RMSE of testing set is {}".format(rmse_test))
        print("MAE of testing set is {}".format(mae_test))
        print("R2 score of testing set is {}\n".format(r2_test))

    # Visualize the feature significance and confidence intervals.
    num_features = len(X_train.columns)
    fig = plt.figure(figsize=(18, 12))
    fig.subplots_adjust(hspace=0.4)
    p_values = gam.statistics_['p_values']
    for i, column in enumerate(X_train.columns):
        axs = fig.add_subplot(num_features, 1, i + 1)  # subplot slots are 1-based
        grid = gam.generate_X_grid(term=i)
        # One call yields both the partial dependence and its 95% interval.
        pdep, conf_interval = gam.partial_dependence(term=i, X=grid, width=.95)
        axs.plot(grid[:, i], pdep)
        axs.plot(grid[:, i], conf_interval, c='r', ls='--')
        # p_values is indexed by term, i.e. by i. The previous 1-based
        # counter marked every feature with the following term's p-value.
        axs.set_title(str(column) + ('*' if p_values[i] < 0.05 else ''))

    return gam
# Single-predictor GAM of one brain metric against age at scan.
# `df_pheno`, `df_system` and `metric` are defined in earlier notebook cells.
X = df_pheno.loc[:, 'ageAtScan1_Years']
Y = df_system.loc[:, metric]

# Estimate GAM with spline

# In[5]:

# One spline term on the only predictor (column 0).
gam = LinearGAM(s(0)).fit(X, Y)
# The return value of gridsearch is discarded here.
# NOTE(review): presumably gridsearch updates `gam` in place — confirm against the pyGAM docs.
gam.gridsearch(X, Y)

# Plot

# In[6]:

# Evaluation grid for term 0, then its partial dependence with a 95% interval.
XX = gam.generate_X_grid(term=0)
pdep, confi = gam.partial_dependence(term=0, X=XX, width=0.95)

plt.figure()
plt.plot(XX, pdep)  # fit
plt.plot(XX, confi, c='r', ls='--')  # confidence interval
plt.plot(XX, gam.prediction_intervals(XX, width=.95), color='b', ls='--')  # 95% prediction interval
plt.scatter(X, Y, facecolor='gray', edgecolors='none', alpha=0.5)  # data
plt.xlabel('Age')
plt.ylabel('Brain feature')
plt.show()

# In[7]:

# Switch to the next metric and a two-column predictor frame for the cells that follow.
metric = 'jd'
X = df_pheno.loc[:, ['ageAtScan1_Years', 'mprage_antsCT_vol_TBV']]
# GAM for the adjusted sale price: a spline on the first predictor and
# linear terms on the remaining four. `predictors` and `house_98105` are
# defined earlier in the script.
outcome = 'AdjSalePrice'
X = house_98105[predictors].values
y = house_98105[outcome]

## model
gam = LinearGAM(s(0, n_splines=12) + l(1) + l(2) + l(3) + l(4))
gam.gridsearch(X, y)
print(gam.summary())

# 3x2 grid of panels, one per term; the sixth panel is hidden below.
fig, axes = plt.subplots(figsize=(8, 8), ncols=2, nrows=3)

titles = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms', 'BldgGrade']
for i, title in enumerate(titles):
    ax = axes[i // 2, i % 2]
    XX = gam.generate_X_grid(term=i)
    # A single call returns the partial dependence together with its 95%
    # interval, so the term is evaluated once instead of twice.
    pdep, confi = gam.partial_dependence(term=i, X=XX, width=.95)
    ax.plot(XX[:, i], pdep)
    ax.plot(XX[:, i], confi, c='r', ls='--')
    ax.set_title(title)

axes[2][1].set_visible(False)

plt.tight_layout()
plt.show()

## Additional material - not in book
## Regularization
### Lasso
def explain_instance_with_data(self,
                               neighborhood_data,
                               neighborhood_labels,
                               distances,
                               label,
                               num_features,
                               feature_selection='auto',
                               model_regressor=None,
                               gam_type=None):
    """Fit three local surrogates on perturbed data and return their explanations.

    A Ridge regression, a LinearGAM and a decision tree are trained on a
    kernel-weighted 80% split of the neighborhood and scored on the
    remaining 20% with the weighted explained-variance score.

    Args:
        neighborhood_data: perturbed data, 2d array. first element is
            assumed to be the original data point.
        neighborhood_labels: corresponding perturbed labels. should have as
            many columns as the number of possible labels.
        distances: distances to original data point.
        label: label for which we want an explanation
        num_features: maximum number of features in explanation
        feature_selection: how to select num_features. options are:
            'forward_selection': iteratively add features to the model.
                This is costly when num_features is high
            'highest_weights': selects the features that have the highest
                product of absolute weight * original data point when
                learning with all the features
            'lasso_path': chooses features based on the lasso
                regularization path
            'none': uses all features, ignores num_features
            'auto': uses forward_selection if num_features <= 6, and
                'highest_weights' otherwise.
        model_regressor: accepted for interface compatibility; this
            implementation does not read it and always fits
            Ridge(alpha=1).
        gam_type: accepted but not read by this implementation.

    Returns:
        (metrics, linear_exp, gam_exp):
        metrics is a tuple of held-out explained-variance scores in the
        order (ridge, gam, decision tree).
        linear_exp is a list of (feature id, ridge coefficient) tuples
        sorted by decreasing absolute coefficient.
        gam_exp is a list with one (feature id, grid values,
        partial dependence) tuple per non-intercept GAM term.
    """
    weights = self.kernel_fn(distances)
    labels_column = neighborhood_labels[:, label]
    used_features = self.feature_selection(neighborhood_data,
                                           labels_column,
                                           weights,
                                           num_features,
                                           feature_selection)
    X = neighborhood_data[:, used_features]

    # NOTE(review): the split is not seeded, unlike Ridge below, so the
    # returned metrics vary between calls — confirm whether that is intended.
    (X_train, X_test, y_train, y_test,
     train_weights, test_weights) = train_test_split(X, labels_column, weights,
                                                     test_size=0.2)

    linear_model = Ridge(alpha=1, fit_intercept=True,
                         random_state=self.random_state)
    gam = LinearGAM()
    dt = DecisionTreeRegressor()

    linear_model.fit(X_train, y_train, sample_weight=train_weights)
    gam.fit(X_train, y_train, weights=train_weights)  # pyGAM names the argument `weights`
    dt.fit(X_train, y_train, sample_weight=train_weights)

    # Held-out, sample-weighted explained variance of each surrogate.
    metrics = tuple(
        explained_variance_score(y_test, model.predict(X_test),
                                 sample_weight=test_weights)
        for model in (linear_model, gam, dt))

    linear_exp = sorted(zip(used_features, linear_model.coef_),
                        key=lambda x: np.abs(x[1]),
                        reverse=True)

    gam_exp = []
    for i, term in enumerate(gam.terms):
        if term.isintercept:
            continue
        XX = gam.generate_X_grid(term=i)
        pdep = gam.partial_dependence(term=i, X=XX)
        # Index by the feature the term is built on rather than by the
        # term's position in the term list.
        gam_exp.append((used_features[term.feature],
                        XX[:, term.feature],
                        pdep))

    if self.verbose:
        # Prediction of the linear surrogate at the original data point (row 0).
        local_pred = linear_model.predict(
            neighborhood_data[0, used_features].reshape(1, -1))
        print('Intercept', linear_model.intercept_)
        print('Prediction_local', local_pred,)
        print('Right:', neighborhood_labels[0, label])

    return (metrics, linear_exp, gam_exp)
# NOTE(review): the dashed band drawn here comes from prediction_intervals,
# while the title calls it a confidence interval — confirm the intended wording.
plt.plot(list(d.index),
         gam_model.prediction_intervals(d[['disp', 'wt']], width=.95),
         color='b', ls='--')
plt.xlabel('Row Index')
plt.ylabel('mpg')
plt.title('GAM Prediction with 95% Confidence Interval')
plt.show()

# Plot the partial dependencies of the predictors with confidence intervals
plt.rcParams['figure.figsize'] = (12, 8)

predictor_frame = d[['disp', 'wt']]  # built once instead of on every use
titles = list(predictor_frame.columns)
fig, axs = plt.subplots(1, len(titles))

for i, ax in enumerate(axs):
    partial_dep, confidence = gam_model.partial_dependence(predictor_frame,
                                                           feature=i + 1,
                                                           width=0.95)
    print(partial_dep)
    x_values = predictor_frame[titles[i]].values
    # Positional sort order. The previous code indexed the arrays with index
    # *labels*, which is only correct for a default 0..n-1 index.
    order = x_values.argsort()
    ax.plot(x_values[order], partial_dep[order])
    ax.plot(x_values[order], confidence[0][:, 0][order], c='grey', ls='--')
    ax.plot(x_values[order], confidence[0][:, 1][order], c='grey', ls='--')
    ax.set_title(titles[i])

plt.show()

# The strength & direction of the relationship corresponds to the slope of the line
def _add_fit_band(fig, x, y, spl, color, lncolor, upper_opacity=None):
    """Fit a one-term spline GAM to (x, y) and add its UPR/Mean/LWR traces to fig."""
    gam = LinearGAM(s(0, n_splines=spl)).fit(x, y)
    hover = "Age: %{x} <br>Volume: %{y}"
    dashed = dict(dash='dash', color=lncolor)
    # Only set opacity on the upper bound when the caller asked for it.
    upper_extra = {} if upper_opacity is None else {'opacity': upper_opacity}

    for i, term in enumerate(gam.terms):
        if term.isintercept:
            continue
        XX = gam.generate_X_grid(term=i)
        pdep, confi = gam.partial_dependence(term=i, X=XX, width=0.95)
        grid = XX[:, term.feature]

        # Order matters: 'tonexty' fills towards the trace added just before.
        fig.add_traces(
            go.Scatter(x=grid,
                       y=confi.T[1],
                       name='UPR',
                       line=dashed,
                       hovertemplate=hover,
                       **upper_extra))
        fig.add_traces(
            go.Scatter(x=grid,
                       y=pdep,
                       name='Mean',
                       line=dict(color='black', width=3),
                       fill='tonexty',
                       fillcolor=color,
                       hovertemplate=hover))
        fig.add_traces(
            go.Scatter(x=grid,
                       y=confi.T[0],
                       name='LWR',
                       line=dashed,
                       fill='tonexty',
                       fillcolor=color,
                       hovertemplate=hover))


def get_fig(sex, value, spl, title):
    """Build a figure of the GAM fit of column `value` against Age in FS_DATA.

    Args:
        sex: 'T' uses every row of FS_DATA, 'F' only rows with Sex == 'F',
            and any other value only rows with Sex == 'M'. 'B' additionally
            overlays the fit for Sex == 'F' on top of the male fit.
        value: name of the FS_DATA column to model; its last four
            characters are dropped for the y-axis title.
        spl: number of splines of the single spline term.
        title: figure title.

    Returns:
        The figure holding the fitted curve(s) and their 95% interval bands.
    """
    female_style = ('rgba(255,0,0,0.1)', 'rgb(255,100,0)')

    if sex == 'T':
        data = FS_DATA
        color, lncolor = 'rgba(123,0,123,0.2)', 'rgb(123,50,123)'
    elif sex == 'F':
        data = FS_DATA[FS_DATA['Sex'] == 'F']
        color, lncolor = female_style
    else:
        # 'M', and also 'B', whose male curve is drawn first.
        data = FS_DATA[FS_DATA['Sex'] == 'M']
        color, lncolor = 'rgba(0,0,255,0.1)', 'rgb(0,100,255)'

    fig = go.Figure()
    _add_fit_band(fig, data.Age.array, data[value].array, spl, color, lncolor)

    if sex == 'B':
        female = FS_DATA[FS_DATA['Sex'] == 'F']
        # The female upper bound is drawn at reduced opacity.
        _add_fit_band(fig, female.Age.array, female[value].array, spl,
                      female_style[0], female_style[1], upper_opacity=0.1)

    fig.update_layout(
        xaxis_title="Age",
        yaxis_title=value[:-4],
        title=title,
        width=600,
        height=400,
        template="simple_white")
    return fig
# Build the response and design matrices from the formula.
# NOTE(review): if `eqn` keeps the default intercept, column 0 of `x` would be
# the constant column and the s(i)/title pairing below would be shifted — confirm.
y, x = pt.dmatrices(eqn, data=data)

# Initialize and fit the model
gam = LinearGAM(s(0) + s(1) + s(2) + s(3) + s(4) + s(5))
gam = gam.gridsearch(np.asarray(x), y)

# Specify plot shape
titles = ['freedom', 'family', 'year', 'economy', 'health', 'trust']
n_cols = 3

fig = tools.make_subplots(rows=2, cols=n_cols, subplot_titles=titles)
fig['layout'].update(height=800, width=1200, title='pyGAM', showlegend=False)

for i, _ in enumerate(titles):
    XX = gam.generate_X_grid(term=i)
    # Evaluate on the same grid that supplies the x values so that the x and
    # y arrays of every trace are guaranteed to line up.
    pdep, confi = gam.partial_dependence(term=i, X=XX, width=.95)

    trace = go.Scatter(x=XX[:, i], y=pdep, mode='lines', name='Effect')
    ci1 = go.Scatter(x=XX[:, i], y=confi[:, 0],
                     line=dict(dash='dash', color='grey'), name='95% CI')
    ci2 = go.Scatter(x=XX[:, i], y=confi[:, 1],
                     line=dict(dash='dash', color='grey'), name='95% CI')

    # Terms 0-2 go to row 1 and terms 3-5 to row 2; subplot indices are 1-based.
    grid_row, grid_col = divmod(i, n_cols)
    for subplot_trace in (trace, ci1, ci2):
        fig.append_trace(subplot_trace, grid_row + 1, grid_col + 1)

py.plot(fig)
# Response and its mean-centred copy (the centred values are what gets scattered).
y = data['area']
adjy = y - np.mean(y)

# Grid-searched GAM and the grid on which it is evaluated.
gam = LinearGAM(n_splines=10).gridsearch(X, y)
XX = generate_X_grid(gam)

titles = ['peri', 'shape', 'perm']
pdep, confi = gam.partial_dependence(XX, width=.95)

# One bokeh panel per predictor: data points, fitted effect, and both
# interval bounds as dashed red lines.
p = []
for i, title in enumerate(titles):
    panel = figure(title=title, plot_width=250, toolbar_location=None)
    grid = XX[:, i]
    panel.scatter(X[X.columns[i]], adjy, color='gray', size=5, alpha=0.5)
    panel.line(grid, pdep[:, i], color='blue', line_width=3, alpha=0.5)
    for bound in (0, 1):
        panel.line(grid, confi[i][:, bound], color='red', line_width=3,
                   alpha=0.5, line_dash='dashed')
    p.append(panel)

show(row(p[:3]))
#X[0] es el año X[0] = 0 es 2000?... #X[1] es la edad de la persona #X[2] es su nivel de estudios, 0 = basica, 1=media superior, 2 = universidad, 3= posgrado #y ingresos $$ ## model gam1 = LinearGAM(s(0) + s(1) + f(2), fit_intercept=False) gam1.gridsearch(X, y) ## plotting plt.figure(figsize=(10, 7.5)) fig, axs = plt.subplots(1, 3) titles = ['year', 'age', 'education'] for i, ax in enumerate(axs): XX = gam1.generate_X_grid(term=i) ax.plot(XX[:, i], gam1.partial_dependence(term=i, X=XX)) ax.plot(XX[:, i], gam1.partial_dependence(term=i, X=XX, width=.95)[1], c='r', ls='--') ax.set_title(titles[i]) plt.rcParams['figure.figsize'] = [10, 7.5] XX = [[2021, 29, 4]] print(gam1.predict(XX)) for i in range(3): print(gam1.partial_dependence(term=i, X=XX)) ## model gam2 = LinearGAM(s(0, constraints='monotonic_inc') + s(1, constraints='concave') + f(2),
def _add_group_band(fig, x, y, spl, label, legendgroup, color, lncolor,
                    visible=None):
    """Fit a one-term spline GAM to (x, y) and add its labelled UPR/mean/LWR traces to fig."""
    gam = LinearGAM(s(0, n_splines=spl)).fit(x, y)
    dashed = dict(dash='dash', color=lncolor)
    shared = dict(legendgroup=legendgroup,
                  hovertemplate="Age: %{x} <br>Volume: %{y}")
    if visible is not None:
        # Left out entirely for groups that are shown by default.
        shared['visible'] = visible

    for i, term in enumerate(gam.terms):
        if term.isintercept:
            continue
        XX = gam.generate_X_grid(term=i)
        pdep, confi = gam.partial_dependence(term=i, X=XX, width=0.95)
        grid = XX[:, term.feature]

        # Order matters: 'tonexty' fills towards the trace added just before.
        # Only the mean trace keeps its legend entry.
        fig.add_traces(
            go.Scatter(x=grid,
                       y=confi.T[1],
                       name='UPR of ' + label,
                       line=dashed,
                       showlegend=False,
                       **shared))
        fig.add_traces(
            go.Scatter(x=grid,
                       y=pdep,
                       name=label,
                       line=dict(color='black', width=3),
                       fill='tonexty',
                       fillcolor=color,
                       **shared))
        fig.add_traces(
            go.Scatter(x=grid,
                       y=confi.T[0],
                       name='LWR of ' + label,
                       line=dashed,
                       showlegend=False,
                       fill='tonexty',
                       fillcolor=color,
                       **shared))


def get_graph_figure(data, atlas, spl):
    """Build a figure with GAM fits of column `atlas` against Age for all rows, males and females.

    Args:
        data: table with 'Age' and 'Sex' columns plus the `atlas` column.
        atlas: name of the column to model. Its first character selects the
            y-axis title: 'B' gives 'Brain Volume', 'R' gives 'Right ' and
            anything else 'Left ', followed by atlas[2:-5].
        spl: number of splines of the single spline term.

    Returns:
        The figure. The 'Total' group is visible; the 'Male' and 'Female'
        groups start as legend-only.
    """
    fig = go.Figure()

    # (Sex value or None for all rows, label, legend group, fill colour,
    #  line colour, initial visibility)
    groups = (
        (None, 'Total', 'total', 'rgba(123,0,123,0.2)', 'rgb(123,50,123)', None),
        ('M', 'Male', 'male', 'rgba(0,0,255,0.1)', 'rgb(0,100,255)', 'legendonly'),
        ('F', 'Female', 'female', 'rgba(255,0,0,0.1)', 'rgb(255,100,0)', 'legendonly'),
    )
    for sex, label, legendgroup, color, lncolor, visible in groups:
        subset = data if sex is None else data[data['Sex'] == sex]
        _add_group_band(fig, subset.Age.array, subset[atlas].array, spl,
                        label, legendgroup, color, lncolor, visible=visible)

    if atlas.startswith('B'):
        ytitle = 'Brain Volume'
    elif atlas.startswith('R'):
        ytitle = 'Right ' + atlas[2:-5]
    else:
        ytitle = 'Left ' + atlas[2:-5]

    fig.update_layout(
        xaxis_title="Age",
        yaxis_title=ytitle,
    )
    return fig
gam = LinearGAM(n_splines=10).gridsearch(X, y) XX = generate_X_grid(gam) # fig, axs = plt.subplots(1, 3) titles = ['peri', 'shape', 'perm'] # for i, ax in enumerate(axs): # pdep, confi = gam.partial_dependence(XX, feature=i+1, width=.95) # ax.scatter(X[X.columns[i]], adjy, color='gray', edgecolors='none') # ax.plot(XX[:, i], pdep) # ax.plot(XX[:, i], confi[0], c='r', ls='--') # ax.set_title(titles[i]) pdep, confi = gam.partial_dependence(XX, width=.95) p = list() for i in range(3): p.append(figure(title=titles[i], plot_width=250, toolbar_location=None)) p[i].scatter(X[X.columns[i]], adjy, color='gray', size=5, alpha=0.5) p[i].line(XX[:, i], pdep[:, i], color='blue', line_width=3, alpha=0.5) p[i].line(XX[:, i], confi[i][:, 0], color='red', line_width=3, alpha=0.5, line_dash='dashed') p[i].line(XX[:, i], confi[i][:, 1], color='red',