Example #1
def _plot_logodd(self):
    # Handle missing values: keep only the rows with no NaN in the continuous predictors
    lignes_completes = np.invert(
        np.isnan(self.predictors_cont).sum(axis=1).astype(bool))

    # Fit the GAM on all complete rows
    gam = LogisticGAM(
        dtype=['numerical'] * self.d_cont + ['categorical'] * self.d_qual
    ).fit(
        pd.concat(
            [pd.DataFrame(self.predictors_cont[lignes_completes, :]).apply(
                lambda x: x.astype('float')),
             pd.DataFrame(self.predictors_qual[lignes_completes, :]).apply(
                 lambda x: x.astype('category'))],
            axis=1),
        self.labels[lignes_completes])

    # Regardless of the values of predictors_cont_number and
    # predictors_qual_number, plot everything for now
    plt.rcParams['figure.figsize'] = (28, 8)
    fig, axs = plt.subplots(1, self.d_cont + self.d_qual)
    # np.atleast_1d guards against the single-subplot case, where axs is not an array
    for i, ax in enumerate(np.atleast_1d(axs)):
        try:
            XX = gam.generate_X_grid(term=i)
            ax.plot(XX[:, i], gam.partial_dependence(term=i, X=XX))
            ax.plot(XX[:, i], gam.partial_dependence(term=i, X=XX, width=.95)[1], c='r', ls='--')
        except ValueError:  # pragma: no cover
            continue
    plt.show(block=False)
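The same model can be expressed with pygam's term API, where s() declares a spline term on a numerical column and f() a factor term on a categorical one. A minimal sketch, assuming two continuous columns followed by one categorical column (X_train and y_train are placeholders, not names from the example above):

from pygam import LogisticGAM, s, f

# s(i) builds a spline term on column i; f(i) builds a factor term on column i
gam = LogisticGAM(s(0) + s(1) + f(2)).fit(X_train, y_train)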
Example #2
n_splines = [5, 10, 15, 20, 25]
# lams is assumed to already hold random draws on [0, 1] (e.g. from
# np.random.rand); rescale to [-3, 3] and exponentiate so the smoothing
# parameter is searched on a log scale
lams = lams * 6 - 3
lams = np.exp(lams)
cons = [
    'convex', 'concave', 'monotonic_inc', 'monotonic_dec', 'circular', 'none'
]
# 'aa' (the model's term specification) is assumed to be defined earlier
random = LogisticGAM(aa).gridsearch(trainX,
                                    trainy,
                                    weights=w,
                                    lam=lams,
                                    n_splines=n_splines)
# Second search pass: grid over the shape constraints
random = random.gridsearch(trainX, trainy, constraints=cons)
print(random.lam)
print(random.n_splines)
print(random.constraints)
print(random.accuracy(testX, testy))

from sklearn.metrics import confusion_matrix
preds = random.predict(testX)
print(confusion_matrix(testy, preds))
# names1 is assumed to be a list of feature names defined earlier
for i, term in enumerate(random.terms):
    if term.isintercept:
        continue
    XX = random.generate_X_grid(term=i)
    pdep, confi = random.partial_dependence(term=i, X=XX, width=0.95)
    plt.figure()
    plt.plot(XX[:, term.feature], pdep)
    plt.plot(XX[:, term.feature], confi, c='r', ls='--')
    plt.title(names1[i])
    plt.show()
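The two gridsearch calls above form a staged search: lam and n_splines first, then constraints starting from the refit model. A joint one-pass search is also possible, but pygam builds the cartesian product of all the supplied grids, so it is far more expensive; a sketch with the same placeholder names:

# One-pass joint search; grid size = |lam grid| x |n_splines| x |cons|
random = LogisticGAM(aa).gridsearch(trainX, trainy, weights=w,
                                    lam=lams, n_splines=n_splines,
                                    constraints=cons)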
Example #3
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):

        orig_cols = list(X.names)

        import os
        import pandas as pd
        import numpy as np
        from sklearn.preprocessing import LabelEncoder, OneHotEncoder
        from collections import Counter
        import pygam
        from pygam import LinearGAM, LogisticGAM
        import matplotlib.pyplot as plt

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        # Set up temp folder
        tmp_folder = self._create_tmp_folder(logger)

        # Set up model
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)

            clf = LogisticGAM(terms="auto",
                              lam=self.params["lam"],
                              max_iter=self.params["max_iter"])
            self.is_classifier = True

        else:
            clf = LinearGAM(terms="auto",
                            lam=self.params["lam"],
                            max_iter=self.params["max_iter"])
            self.is_classifier = False

        X = self.basic_impute(X)
        X = X.to_pandas()
        X.columns = orig_cols

        # Record the datatype of each column
        X_datatypes = [str(item) for item in list(X.dtypes)]

        # Change all float32 values to float64
        for ii in range(len(X_datatypes)):
            if X_datatypes[ii] == 'float32':
                X = X.astype({orig_cols[ii]: np.float64})

        X_datatypes = [str(item) for item in list(X.dtypes)]

        # List the categorical and numerical features
        self.X_categorical = [
            orig_cols[col_count] for col_count in range(len(orig_cols))
            if (X_datatypes[col_count] == 'category') or (
                X_datatypes[col_count] == 'object')
        ]
        self.X_numeric = [
            item for item in orig_cols if item not in self.X_categorical
        ]

        # Find the levels and mode for each categorical feature
        # for use in the test set
        self.train_levels = {}
        self.train_mode = {}
        for item in self.X_categorical:
            self.train_levels[item] = list(set(X[item]))
            self.train_mode[item] = Counter(X[item]).most_common(1)[0][0]

        # One-hot encode the categorical features
        # and replace missing values with a "Missing" category
        if len(self.X_categorical) > 0:
            X.loc[:, self.X_categorical] = X[self.X_categorical].fillna(
                "Missing").copy()
            self.enc = OneHotEncoder(handle_unknown='ignore')

            self.enc.fit(X[self.X_categorical])
            # (renamed get_feature_names_out in scikit-learn >= 1.0)
            self.encoded_categories = list(
                self.enc.get_feature_names(input_features=self.X_categorical))

            X_enc = self.enc.transform(X[self.X_categorical]).toarray()

            X = pd.concat([
                X[self.X_numeric],
                pd.DataFrame(X_enc, columns=self.encoded_categories)
            ],
                          axis=1)

        # Impute missing numeric values with the training median
        self.median_train = {}

        if len(self.X_numeric) > 0:
            for colname in self.X_numeric:
                self.median_train[colname] = X[colname].quantile(0.5)
                X.loc[:, colname] = X[colname].fillna(
                    self.median_train[colname]).copy()

        try:
            clf.fit(X, y)
        except np.linalg.LinAlgError as e:
            raise IgnoreError("np.linalg.LinAlgError") from e
        except pygam.utils.OptimizationError as e:
            raise IgnoreError("pygam.utils.OptimizationError") from e
        except ValueError as e:
            if 'On entry to DLASCL parameter number' in str(e):
                raise IgnoreError('On entry to DLASCL parameter number') from e
            raise

        p_values = np.array(clf.statistics_['p_values'])

        # Plot the partial dependence of each feature and save it to the temp folder
        for ii in range(X.shape[1]):
            XX = clf.generate_X_grid(term=ii)
            plt.figure()
            plt.plot(XX[:, ii], clf.partial_dependence(term=ii, X=XX))
            plt.plot(XX[:, ii],
                     clf.partial_dependence(term=ii, X=XX, width=.95)[1],
                     c='r',
                     ls='--')
            plt.title("Partial Dependence " + str(ii),
                      fontdict={'fontsize': 10})
            # Save before show: show() can flush the figure, leaving savefig a blank canvas
            plt.savefig(os.path.join(
                tmp_folder, 'Feature_partial_dependence_' + str(ii) + '.png'),
                        bbox_inches="tight")
            plt.show()

        # Derive importances from the term p-values (smaller p-value => larger
        # importance); the final entry is the intercept's and is excluded
        if max(p_values[0:(len(p_values) - 1)]) > 0:
            importances = -np.log(p_values[0:(len(p_values) - 1)] + 10**(-16))

            importances = list(importances / max(importances))
        else:
            importances = [1] * (len(p_values) - 1)

        self.mean_target = np.array(sum(y) / len(y))

        self.set_model_properties(model=clf,
                                  features=list(X.columns),
                                  importances=importances,
                                  iterations=self.params['n_estimators'])
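fit() stores everything needed to replay this preprocessing at scoring time: train_levels and train_mode for the categoricals, median_train for the numerics, and the fitted OneHotEncoder. A minimal sketch of a matching transform helper, assuming those attribute names; _transform_test is illustrative, not part of the original recipe:

    def _transform_test(self, X):
        import pandas as pd

        X = X.copy()
        for col in self.X_categorical:
            # Unseen levels fall back to the training mode...
            unseen = X[col].notna() & ~X[col].isin(self.train_levels[col])
            X.loc[unseen, col] = self.train_mode[col]
            # ...and missing values become their own category, as in fit()
            X.loc[:, col] = X[col].fillna("Missing")
        for col in self.X_numeric:
            # Missing numerics get the training median
            X.loc[:, col] = X[col].fillna(self.median_train[col])
        if len(self.X_categorical) > 0:
            X_enc = self.enc.transform(X[self.X_categorical]).toarray()
            X = pd.concat([
                X[self.X_numeric].reset_index(drop=True),
                pd.DataFrame(X_enc, columns=self.encoded_categories)
            ], axis=1)
        return X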
Example #4
from sklearn.metrics import roc_auc_score

roc_auc_score(y, gam.predict_proba(X))  # 0.994173140954495
gam.accuracy(X, y)  # 0.9560632688927944

#-----------------------------------------------------
# Explore and interpret individual features

plt.ion()
plt.rcParams['figure.figsize'] = (28, 8)

fig, axs = plt.subplots(1, X.shape[1])

for i, ax in enumerate(axs):
    XX = gam.generate_X_grid(term=i, meshgrid=True)
    pdep, confi = gam.partial_dependence(term=i,
                                         X=XX,
                                         meshgrid=True,
                                         width=.95)
    ax.plot(XX[0], pdep)
    ax.plot(XX[0], confi[:, 0], c='grey', ls='--')
    ax.plot(XX[0], confi[:, 1], c='grey', ls='--')
    ax.set_title(selected_features[i])

plt.show()

#-----------------------------------------------------
# Tuning Smoothness and Penalties

n_splines = [25, 6, 25, 25, 6, 4]
lambda_ = 0.6
constraints = None
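These settings would typically be passed straight to the model constructor, which (with the default terms='auto') distributes the per-feature n_splines list over the automatically built terms. A minimal sketch, assuming the X and y from above with six features:

gam = LogisticGAM(n_splines=n_splines,
                  lam=lambda_,
                  constraints=constraints).fit(X, y)
gam.summary()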