import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pygam import LinearGAM
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


def fit_pygam_model(X_train: pd.DataFrame,
                    X_test: pd.DataFrame,
                    y_train: pd.DataFrame,
                    y_test: pd.DataFrame):
    '''
    Fits a generalized additive model (LinearGAM, normally distributed
    errors) via grid search over the n_splines and lam regularization
    hyperparameters. Returns the best model found.
    '''
    gam = LinearGAM().gridsearch(X_train.values,
                                 y_train.values,
                                 n_splines=np.arange(3, 20),
                                 lam=np.logspace(-3, 3, 11))
    gam.summary()  # summary() prints directly; wrapping it in print() prints None

    y_train_predicted = gam.predict(X_train)
    y_test_predicted = np.floor(gam.predict(X_test))  # floored, presumably for a count-valued target

    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_predicted))
    mae_train = mean_absolute_error(y_train, y_train_predicted)
    r2_train = r2_score(y_train, y_train_predicted)
    print("RMSE of training set is {}".format(rmse_train))
    print("MAE of training set is {}".format(mae_train))
    print("R2 score of training set is {}\n".format(r2_train))

    if len(y_test) > 0:
        rmse_test = np.sqrt(mean_squared_error(y_test, y_test_predicted))
        mae_test = mean_absolute_error(y_test, y_test_predicted)
        r2_test = r2_score(y_test, y_test_predicted)
        print("RMSE of testing set is {}".format(rmse_test))
        print("MAE of testing set is {}".format(mae_test))
        print("R2 score of testing set is {}\n".format(r2_test))
    # Visualize the feature significance and confidence intervals
    num_features = len(X_train.columns)
    fig = plt.figure(figsize=(18, 12))
    fig.subplots_adjust(hspace=0.4)

    p_values = gam.statistics_['p_values']

    for i in range(num_features):
        axs = fig.add_subplot(num_features, 1, i + 1)
        m = gam.generate_X_grid(term=i)
        axs.plot(m[:, i],
                 gam.partial_dependence(term=i,
                                        X=m))  # the fitted partial dependence
        axs.plot(m[:, i],
                 gam.partial_dependence(term=i, X=m, width=.95)[1],
                 c='r',
                 ls='--')  # the 95% confidence intervals
        axs.set_title(X_train.columns[i] +
                      ('*' if p_values[i] < 0.05 else ''))  # star significant terms

    return gam
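
# A minimal usage sketch (assumes a DataFrame `df` with numeric feature
# columns and a 'target' column; these names are illustrative):
#
#     from sklearn.model_selection import train_test_split
#     X_tr, X_te, y_tr, y_te = train_test_split(
#         df.drop(columns=['target']), df[['target']], test_size=0.2)
#     best_gam = fit_pygam_model(X_tr, X_te, y_tr, y_te)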
Example #2
X = df_pheno.loc[:, 'ageAtScan1_Years']
Y = df_system.loc[:, metric]

# Estimate GAM with spline

# In[5]:

from pygam import LinearGAM, s

# gridsearch() fits the model itself, so a separate fit() call is redundant
gam = LinearGAM(s(0)).gridsearch(X, Y)

# Plot

# In[6]:

XX = gam.generate_X_grid(term=0)
pdep, confi = gam.partial_dependence(term=0, X=XX, width=0.95)

plt.figure()
plt.plot(XX, pdep)  # fit
plt.plot(XX, confi, c='r', ls='--')  # confidence interval
plt.plot(XX, gam.prediction_intervals(XX, width=.95), color='b',
         ls='--')  # 95% prediction interval
plt.scatter(X, Y, facecolor='gray', edgecolors='none', alpha=0.5)  # data
plt.xlabel('Age')
plt.ylabel('Brain feature')
plt.show()

# In[7]:

predictors = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms', 'BldgGrade']
outcome = 'AdjSalePrice'
X = house_98105[predictors].values
y = house_98105[outcome]

## model
from pygam import LinearGAM, l, s

gam = LinearGAM(s(0, n_splines=12) + l(1) + l(2) + l(3) + l(4))
gam.gridsearch(X, y)
gam.summary()  # summary() prints directly; wrapping it in print() prints None

fig, axes = plt.subplots(figsize=(8, 8), ncols=2, nrows=3)

titles = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms', 'BldgGrade']
for i, title in enumerate(titles):
    ax = axes[i // 2, i % 2]
    XX = gam.generate_X_grid(term=i)
    ax.plot(XX[:, i], gam.partial_dependence(term=i, X=XX))
    ax.plot(XX[:, i],
            gam.partial_dependence(term=i, X=XX, width=.95)[1],
            c='r',
            ls='--')
    ax.set_title(titles[i])

axes[2][1].set_visible(False)

plt.tight_layout()
plt.show()

Example #4
    def explain_instance_with_data(self,
                                   neighborhood_data,
                                   neighborhood_labels,
                                   distances,
                                   label,
                                   num_features,
                                   feature_selection='auto',
                                   model_regressor=None,
                                   gam_type=None):
        """Takes perturbed data, labels and distances, returns explanation.

        Args:
            neighborhood_data: perturbed data, 2d array. first element is
                               assumed to be the original data point.
            neighborhood_labels: corresponding perturbed labels. should have as
                                 many columns as the number of possible labels.
            distances: distances to original data point.
            label: label for which we want an explanation
            num_features: maximum number of features in explanation
            feature_selection: how to select num_features. options are:
                'forward_selection': iteratively add features to the model.
                    This is costly when num_features is high
                'highest_weights': selects the features that have the highest
                    product of absolute weight * original data point when
                    learning with all the features
                'lasso_path': chooses features based on the lasso
                    regularization path
                'none': uses all features, ignores num_features
                'auto': uses forward_selection if num_features <= 6, and
                    'highest_weights' otherwise.
            model_regressor: sklearn regressor to use in explanation.
                Defaults to Ridge regression if None. Must have
                model_regressor.coef_ and 'sample_weight' as a parameter
                to model_regressor.fit()
            gam_type: unused in this implementation.

        Returns:
            (metrics, linear_exp, gam_exp):
            metrics is a tuple of explained variance scores for the ridge,
            GAM, and decision tree surrogates on held-out perturbed data.
            linear_exp is a list of (feature id, weight) tuples sorted by
            decreasing absolute weight.
            gam_exp is a list of (feature id, grid, partial dependence)
            tuples, one per GAM term.
        """

        weights = self.kernel_fn(distances)
        labels_column = neighborhood_labels[:, label]
        used_features = self.feature_selection(neighborhood_data,
                                               labels_column, weights,
                                               num_features, feature_selection)

        X = neighborhood_data[:, used_features]
        y = neighborhood_labels[:, label]
        (X_train, X_test, y_train, y_test, train_weights,
         test_weights) = train_test_split(X, y, weights, test_size=0.2)

        linear_model = Ridge(alpha=1,
                             fit_intercept=True,
                             random_state=self.random_state)

        gam = LinearGAM()
        dt = DecisionTreeRegressor()

        linear_model.fit(X_train, y_train, sample_weight=train_weights)
        gam.fit(X_train, y_train, weights=train_weights)
        dt.fit(X_train, y_train, sample_weight=train_weights)

        # # plot
        # for i, term in enumerate(gam.terms):
        #     if term.isintercept:
        #         continue
        #     XX = gam.generate_X_grid(term=i)
        #     # pdep = gam.predict(XX)
        #     pdep = gam.partial_dependence(term=i, X=XX) + linear_model.intercept_
        #     # line = XX[:, term.feature] * linear_model.coef_[term.feature]
        #     line = linear_model.predict(XX)
        #     dect = dt.predict(XX)
        #     plt.figure()
        #     plt.plot(XX[:, term.feature], pdep)
        #     plt.plot(XX[:, term.feature], line)
        #     plt.plot(XX[:, term.feature], dect)
        #     plt.title(repr(term))
        #     plt.show()
        # exit()

        y_lr = linear_model.predict(X_test)
        y_gam = gam.predict(X_test)
        y_dt = dt.predict(X_test)

        # y_lr = linear_model.predict(X_train)
        # y_gam = gam.predict(X_train)
        # y_dt = dt.predict(X_train)

        # mse_lr = mean_squared_error(y_test, y_lr, sample_weight=test_weights)
        # mse_gam = mean_squared_error(y_test, y_gam, sample_weight=test_weights)
        # mse_dt = mean_squared_error(y_test, y_dt, sample_weight=test_weights)

        # despite the mse_* names in the commented-out code above, these are
        # explained variance scores (higher is better), not mean squared errors
        score_lr = explained_variance_score(y_test,
                                            y_lr,
                                            sample_weight=test_weights)
        score_gam = explained_variance_score(y_test,
                                             y_gam,
                                             sample_weight=test_weights)
        score_dt = explained_variance_score(y_test,
                                            y_dt,
                                            sample_weight=test_weights)

        # score_lr = explained_variance_score(y_train, y_lr, sample_weight=train_weights)
        # score_gam = explained_variance_score(y_train, y_gam, sample_weight=train_weights)
        # score_dt = explained_variance_score(y_train, y_dt, sample_weight=train_weights)

        metrics = (score_lr, score_gam, score_dt)

        prediction_score = linear_model.score(neighborhood_data[:,
                                                                used_features],
                                              labels_column,
                                              sample_weight=weights)

        local_pred = linear_model.predict(
            neighborhood_data[0, used_features].reshape(1, -1))

        linear_exp = sorted(zip(used_features, linear_model.coef_),
                            key=lambda x: np.abs(x[1]),
                            reverse=True)
        gam_exp = []
        for i, term in enumerate(gam.terms):
            if term.isintercept:
                continue
            XX = gam.generate_X_grid(term=i)
            pdep = gam.partial_dependence(term=i, X=XX)  # renamed from y to avoid shadowing
            grid = XX[:, term.feature]  # index by term.feature, not the loop counter
            gam_exp.append((used_features[term.feature], grid, pdep))

        if self.verbose:
            print('Intercept', linear_model.intercept_)
            print(
                'Prediction_local',
                local_pred,
            )
            print('Right:', neighborhood_labels[0, label])
        # return (linear_model.intercept_,
        #         sorted(zip(used_features, linear_model.coef_),
        #                key=lambda x: np.abs(x[1]), reverse=True),
        #         prediction_score, local_pred)
        return (metrics, linear_exp, gam_exp)
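
# A sketch of consuming the returned triple (hypothetical caller code; the
# `explainer` object below is an assumption, not defined in this snippet):
#
#     metrics, linear_exp, gam_exp = explainer.explain_instance_with_data(
#         data, labels, distances, label, num_features)
#     for feature_id, grid, pdep in gam_exp:  # one smooth per GAM term
#         plt.plot(grid, pdep, label='feature {}'.format(feature_id))
#     plt.legend(); plt.show()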
Example #5
plt.plot(list(d.index),
         gam_model.prediction_intervals(d[['disp', 'wt']], width=.95),
         color='b',
         ls='--')
plt.xlabel('Row Index')
plt.ylabel('mpg')
plt.title('GAM Prediction with 95% Prediction Interval')
plt.show()

# Plot the partial dependencies of the predictors with confidence intervals
plt.rcParams['figure.figsize'] = (12, 8)
fig, axs = plt.subplots(1, len(list(d[['disp', 'wt']].columns)))
titles = list(d[['disp', 'wt']].columns)
for i, ax in enumerate(axs):
    # pyGAM >= 0.6 API: partial_dependence takes term=i (0-based) and X=;
    # with width= it also returns an (n, 2) array of confidence bounds
    partial_dep, confidence = gam_model.partial_dependence(term=i,
                                                           X=d[['disp', 'wt']],
                                                           width=0.95)
    print(partial_dep)
    order = d[['disp', 'wt']][titles[i]].sort_values().index.tolist()
    ax.plot(d[['disp', 'wt']][titles[i]].values[order], partial_dep[order])
    ax.plot(d[['disp', 'wt']][titles[i]].values[order],
            confidence[order, 0],
            c='grey',
            ls='--')
    ax.plot(d[['disp', 'wt']][titles[i]].values[order],
            confidence[order, 1],
            c='grey',
            ls='--')
    ax.set_title(titles[i])
plt.show()
#The strength & direction of the relationship corresponds to the slope of the line
Example #6
import plotly.graph_objects as go
from pygam import LinearGAM, s


def get_fig(sex, value, spl, title):
    if sex == 'T':
        data = FS_DATA
        color = 'rgba(123,0,123,0.2)'
        lncolor = 'rgb(123,50,123)'
    elif sex == 'F':
        data = FS_DATA[FS_DATA['Sex'] == 'F']
        color = 'rgba(255,0,0,0.1)'
        lncolor = 'rgb(255,100,0)'
    else:
        data = FS_DATA[FS_DATA['Sex'] == 'M']
        color = 'rgba(0,0,255,0.1)'
        lncolor = 'rgb(0,100,255)'
    x = data.Age.array
    y = data[value].array

    gam = LinearGAM(s(0, n_splines=spl)).fit(x, y)

    fig = go.Figure()

    for i, term in enumerate(gam.terms):
        if term.isintercept:
            continue

        XX = gam.generate_X_grid(term=i)
        pdep, confi = gam.partial_dependence(term=i, X=XX, width=0.95)

        fig.add_traces(
            go.Scatter(x=XX[:, term.feature],
                       y=confi.T[1],
                       name='UPR',
                       line=dict(dash='dash', color=lncolor),
                       hovertemplate="Age: %{x} <br>Volume: %{y}"))

        fig.add_traces(
            go.Scatter(x=XX[:, term.feature],
                       y=pdep,
                       name='Mean',
                       line=dict(color='black', width=3),
                       fill='tonexty',
                       fillcolor=color,
                       hovertemplate="Age: %{x} <br>Volume: %{y}"))

        fig.add_traces(
            go.Scatter(x=XX[:, term.feature],
                       y=confi.T[0],
                       name='LWR',
                       line=dict(dash='dash', color=lncolor),
                       fill='tonexty',
                       fillcolor=color,
                       hovertemplate="Age: %{x} <br>Volume: %{y}"))

    if sex == 'B':
        data = FS_DATA[FS_DATA['Sex'] == 'F']
        x = data.Age.array
        y = data[value].array
        gam = LinearGAM(s(0, n_splines=spl)).fit(x, y)
        color = 'rgba(255,0,0,0.1)'
        lncolor = 'rgb(255,100,0)'
        for i, term in enumerate(gam.terms):
            if term.isintercept:
                continue

            XX = gam.generate_X_grid(term=i)
            pdep, confi = gam.partial_dependence(term=i, X=XX, width=0.95)

            fig.add_traces(
                go.Scatter(
                    x=XX[:, term.feature],
                    y=confi.T[1],
                    name='UPR',
                    line=dict(dash='dash', color=lncolor),
                    hovertemplate="Age: %{x} <br>Volume: %{y}",
                    opacity=0.1,
                ))

            fig.add_traces(
                go.Scatter(x=XX[:, term.feature],
                           y=pdep,
                           name='Mean',
                           line=dict(color='black', width=3),
                           fill='tonexty',
                           fillcolor=color,
                           hovertemplate="Age: %{x} <br>Volume: %{y}"))

            fig.add_traces(
                go.Scatter(x=XX[:, term.feature],
                           y=confi.T[0],
                           name='LWR',
                           line=dict(dash='dash', color=lncolor),
                           fill='tonexty',
                           fillcolor=color,
                           hovertemplate="Age: %{x} <br>Volume: %{y}"))

    fig.update_layout(
        xaxis_title="Age",
        yaxis_title=value[:-4],
        title=title,
        width=600,
        height=400,
        template="simple_white")

    return fig
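
# Hypothetical usage (the column name and spline count are assumptions about
# FS_DATA, not taken from this snippet):
#
#     fig = get_fig('F', 'Hippocampus_mm3', spl=15, title='Hippocampal volume vs. age')
#     fig.show()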
Example #7
import numpy as np
import patsy as pt
import plotly.graph_objs as go
import plotly.offline as py
from plotly import tools  # deprecated; plotly.subplots.make_subplots is the modern equivalent
from pygam import LinearGAM, s

y, x = pt.dmatrices(eqn, data=data)

# Initialize and fit the model
gam = LinearGAM(s(0) + s(1) + s(2) + s(3) + s(4) + s(5))
gam = gam.gridsearch(np.asarray(x), y)

# Specify plot shape
titles = ['freedom', 'family', 'year', 'economy',
          'health', 'trust']

fig = tools.make_subplots(rows=2, cols=3, subplot_titles=titles)
fig['layout'].update(height=800, width=1200, title='pyGAM', showlegend=False)

for i, title in enumerate(titles):
    XX = gam.generate_X_grid(term=i)
    # pass the same grid explicitly so the x and y values are guaranteed to align
    pdep, confi = gam.partial_dependence(term=i, X=XX, width=.95)
    trace = go.Scatter(x=XX[:, i], y=pdep, mode='lines', name='Effect')
    ci1 = go.Scatter(x=XX[:, i], y=confi[:, 0], line=dict(dash='dash', color='grey'), name='95% CI')
    ci2 = go.Scatter(x=XX[:, i], y=confi[:, 1], line=dict(dash='dash', color='grey'), name='95% CI')
    if i < 3:
        fig.append_trace(trace, 1, i + 1)
        fig.append_trace(ci1, 1, i + 1)
        fig.append_trace(ci2, 1, i + 1)
    else:
        fig.append_trace(trace, 2, i - 2)
        fig.append_trace(ci1, 2, i - 2)
        fig.append_trace(ci2, 2, i - 2)

py.plot(fig)

Example #8
y = data['area']

adjy = y - np.mean(y)

from bokeh.layouts import row
from bokeh.plotting import figure, show
from pygam import LinearGAM, s

# pyGAM >= 0.6 API: n_splines is set per term, X grids are generated per term
# with gam.generate_X_grid, and partial_dependence takes term= and X=
gam = LinearGAM(s(0, n_splines=10) + s(1, n_splines=10) +
                s(2, n_splines=10)).gridsearch(X, y)

titles = ['peri', 'shape', 'perm']
p = list()

for i in range(3):
    XX = gam.generate_X_grid(term=i)
    pdep, confi = gam.partial_dependence(term=i, X=XX, width=.95)
    p.append(figure(title=titles[i], plot_width=250, toolbar_location=None))
    p[i].scatter(X[X.columns[i]], adjy, color='gray', size=5, alpha=0.5)
    p[i].line(XX[:, i], pdep, color='blue', line_width=3, alpha=0.5)
    p[i].line(XX[:, i], confi[:, 0], color='red', line_width=3, alpha=0.5,
              line_dash='dashed')
    p[i].line(XX[:, i], confi[:, 1], color='red', line_width=3, alpha=0.5,
              line_dash='dashed')

show(row([p[0], p[1], p[2]]))
# X[0] is the year; is X[0] = 0 the year 2000?...
# X[1] is the person's age
# X[2] is their education level: 0 = basic, 1 = upper secondary, 2 = university, 3 = postgraduate
# y is income $$

## model
from pygam import LinearGAM, f, s

gam1 = LinearGAM(s(0) + s(1) + f(2), fit_intercept=False)
gam1.gridsearch(X, y)

## plotting
fig, axs = plt.subplots(1, 3, figsize=(10, 7.5))
titles = ['year', 'age', 'education']
for i, ax in enumerate(axs):
    XX = gam1.generate_X_grid(term=i)
    ax.plot(XX[:, i], gam1.partial_dependence(term=i, X=XX))
    ax.plot(XX[:, i],
            gam1.partial_dependence(term=i, X=XX, width=.95)[1],
            c='r',
            ls='--')
    ax.set_title(titles[i])
plt.rcParams['figure.figsize'] = [10, 7.5]

XX = [[2021, 29, 4]]
print(gam1.predict(XX))
for i in range(3):
    print(gam1.partial_dependence(term=i, X=XX))

## model
gam2 = LinearGAM(s(0, constraints='monotonic_inc') +
                 s(1, constraints='concave') + f(2),
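
# For reference, a minimal self-contained sketch of shape-constrained smooths
# on synthetic data (every name below is assumed, not taken from the snippet
# above):
import numpy as np
from pygam import LinearGAM, s

rng = np.random.default_rng(0)
X_demo = rng.uniform(0, 10, size=(200, 2))
y_demo = 2 * X_demo[:, 0] - 0.1 * (X_demo[:, 1] - 5) ** 2 + rng.normal(size=200)

# 'monotonic_inc' forces a non-decreasing smooth; 'concave' bounds curvature
gam_demo = LinearGAM(s(0, constraints='monotonic_inc') +
                     s(1, constraints='concave')).fit(X_demo, y_demo)
gam_demo.summary()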
Example #10
import plotly.graph_objects as go
from pygam import LinearGAM, s


def get_graph_figure(data, atlas, spl):
    fig = go.Figure()
    x = data.Age.array
    y = data[atlas].array
    color = 'rgba(123,0,123,0.2)'
    lncolor = 'rgb(123,50,123)'
    gam = LinearGAM(s(0, n_splines=spl)).fit(x, y)
    for i, term in enumerate(gam.terms):
        if term.isintercept:
            continue

        XX = gam.generate_X_grid(term=i)
        pdep, confi = gam.partial_dependence(term=i, X=XX, width=0.95)

        fig.add_traces(
            go.Scatter(x=XX[:, term.feature],
                       y=confi.T[1],
                       name='UPR of Total',
                       line=dict(dash='dash', color=lncolor),
                       legendgroup="total",
                       showlegend=False,
                       hovertemplate="Age: %{x} <br>Volume: %{y}"))

        fig.add_traces(
            go.Scatter(x=XX[:, term.feature],
                       y=pdep,
                       name='Total',
                       line=dict(color='black', width=3),
                       legendgroup="total",
                       fill='tonexty',
                       fillcolor=color,
                       hovertemplate="Age: %{x} <br>Volume: %{y}"))

        fig.add_traces(
            go.Scatter(x=XX[:, term.feature],
                       y=confi.T[0],
                       name='LWR of Total',
                       line=dict(dash='dash', color=lncolor),
                       legendgroup="total",
                       showlegend=False,
                       fill='tonexty',
                       fillcolor=color,
                       hovertemplate="Age: %{x} <br>Volume: %{y}"))

    x = data[data['Sex'] == 'M'].Age.array
    y = data[data['Sex'] == 'M'][atlas].array
    color = 'rgba(0,0,255,0.1)'
    lncolor = 'rgb(0,100,255)'
    gam = LinearGAM(s(0, n_splines=spl)).fit(x, y)
    for i, term in enumerate(gam.terms):
        if term.isintercept:
            continue

        XX = gam.generate_X_grid(term=i)
        pdep, confi = gam.partial_dependence(term=i, X=XX, width=0.95)

        fig.add_traces(
            go.Scatter(x=XX[:, term.feature],
                       y=confi.T[1],
                       name='UPR of Male',
                       line=dict(dash='dash', color=lncolor),
                       legendgroup="male",
                       showlegend=False,
                       visible='legendonly',
                       hovertemplate="Age: %{x} <br>Volume: %{y}"))

        fig.add_traces(
            go.Scatter(x=XX[:, term.feature],
                       y=pdep,
                       name='Male',
                       line=dict(color='black', width=3),
                       legendgroup="male",
                       visible='legendonly',
                       fill='tonexty',
                       fillcolor=color,
                       hovertemplate="Age: %{x} <br>Volume: %{y}"))

        fig.add_traces(
            go.Scatter(x=XX[:, term.feature],
                       y=confi.T[0],
                       name='LWR of Male',
                       line=dict(dash='dash', color=lncolor),
                       legendgroup="male",
                       showlegend=False,
                       visible='legendonly',
                       fill='tonexty',
                       fillcolor=color,
                       hovertemplate="Age: %{x} <br>Volume: %{y}"))

    x = data[data['Sex'] == 'F'].Age.array
    y = data[data['Sex'] == 'F'][atlas].array
    color = 'rgba(255,0,0,0.1)'
    lncolor = 'rgb(255,100,0)'
    gam = LinearGAM(s(0, n_splines=spl)).fit(x, y)
    for i, term in enumerate(gam.terms):
        if term.isintercept:
            continue

        XX = gam.generate_X_grid(term=i)
        pdep, confi = gam.partial_dependence(term=i, X=XX, width=0.95)

        fig.add_traces(
            go.Scatter(x=XX[:, term.feature],
                       y=confi.T[1],
                       name='UPR of Female',
                       line=dict(dash='dash', color=lncolor),
                       legendgroup="female",
                       showlegend=False,
                       visible='legendonly',
                       hovertemplate="Age: %{x} <br>Volume: %{y}"))

        fig.add_traces(
            go.Scatter(x=XX[:, term.feature],
                       y=pdep,
                       name='Female',
                       line=dict(color='black', width=3),
                       legendgroup="female",
                       visible='legendonly',
                       fill='tonexty',
                       fillcolor=color,
                       hovertemplate="Age: %{x} <br>Volume: %{y}"))

        fig.add_traces(
            go.Scatter(x=XX[:, term.feature],
                       y=confi.T[0],
                       name='LWR of Female',
                       line=dict(dash='dash', color=lncolor),
                       legendgroup="female",
                       showlegend=False,
                       visible='legendonly',
                       fill='tonexty',
                       fillcolor=color,
                       hovertemplate="Age: %{x} <br>Volume: %{y}"))

    if atlas[0] == 'B':
        ytitle = 'Brain Volume'
    elif atlas[0] == 'R':
        ytitle = 'Right ' + atlas[2:-5]
    else:
        ytitle = 'Left ' + atlas[2:-5]
    fig.update_layout(
        xaxis_title="Age",
        yaxis_title=ytitle,
    )
    return fig