Example #1
import numpy as np
from pygam import LogisticGAM
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler


# Report 10-fold cross-validated accuracy of a LogisticGAM,
# optionally standardizing X first
def fit_gam(X, y, comment, use_x_normalization):
    print("------------------------------")
    print(comment)
    print("------------------------------")
    np.random.seed(0)
    if use_x_normalization:
        X = StandardScaler().fit_transform(X)

    train_scores = np.array([])
    val_scores = np.array([])

    kf = KFold(n_splits=10, shuffle=True)
    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        clf = LogisticGAM()
        clf.fit(X_train, y_train)

        train_scores = np.append(train_scores,
                                 clf.accuracy(X_train, y_train) * 100)
        val_scores = np.append(val_scores, clf.accuracy(X_val, y_val) * 100)

    print('Training accuracy: {:.2f}%'.format(np.mean(train_scores)))
    print('Validation accuracy: {:.2f}%'.format(np.mean(val_scores)))
    print()
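A quick way to smoke-test fit_gam is with synthetic data in place of the real feature matrix; a minimal sketch, assuming scikit-learn's make_classification (the demo names are hypothetical):

from sklearn.datasets import make_classification

# Hypothetical smoke test: 500 synthetic rows, 15 features, binary target
X_demo, y_demo = make_classification(n_samples=500, n_features=15, random_state=0)
fit_gam(X_demo, y_demo, "Synthetic sanity check", use_x_normalization=True)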
Example #2
import numpy as np
import pandas as pd
from pygam import LogisticGAM
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def main():
    X = pd.read_csv(
        './dataset/gradcafe/cs_preprocessed_X.csv',
        usecols=[0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]).values
    # X = pd.read_csv('./dataset/gradcafe/pnp_x.csv', header=None).values
    y = pd.read_csv('./dataset/gradcafe/cs_preprocessed_Y.csv').values.reshape(
        -1)

    np.random.seed(0)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=0)

    fit_gam(X_train, y_train, "Without normalization on X", False)
    fit_gam(X_train, y_train, "With normalization on X", True)

    # Normalization performed better, so standardize for the final model.
    # Fit the scaler on the training data only and reuse it on the test
    # set to avoid test-set leakage.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    np.random.seed(0)
    clf = LogisticGAM()
    clf.fit(X_train, y_train)

    training_accuracy = clf.accuracy(X_train, y_train) * 100
    testing_accuracy = clf.accuracy(X_test, y_test) * 100

    print("------------------------------")
    print("Results with normalization on testing set")
    print("------------------------------")
    print('Training accuracy: {:.2f}%'.format(training_accuracy))
    print('Testing accuracy: {:.2f}%'.format(testing_accuracy))
    print()
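After the final fit, pygam also exposes a per-term statistical summary and class probabilities. A brief sketch that would slot in at the end of main(), reusing the clf and X_test defined above:

    clf.summary()                      # per-term effective DoF and p-values
    proba = clf.predict_proba(X_test)  # P(y=1) for each test row
    print(proba[:5])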
Example #3
from pygam import LogisticGAM
from sklearn import metrics


def logistic_GAM(x_tr, y_tr, x_tst, y_tst):
    classifier = LogisticGAM()
    classifier.fit(x_tr, y_tr)
    tr_pred = classifier.predict(x_tr)
    y_pred = classifier.predict(x_tst)
    confusion_matrix = metrics.confusion_matrix(y_tst, y_pred)
    print(confusion_matrix)
    print('Accuracy of the GAM classifier on test set: {:.2f}'.format(
        metrics.accuracy_score(y_tst, y_pred)))
    print('Accuracy of the GAM classifier on train set: {:.2f}'.format(
        metrics.accuracy_score(y_tr, tr_pred)))
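A hedged usage sketch for logistic_GAM, assuming a synthetic binary problem since the snippet does not show how the splits are produced (the demo names are hypothetical):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Any binary-target matrix split into train/test works here
X_demo, y_demo = make_classification(n_samples=400, n_features=8, random_state=1)
x_tr, x_tst, y_tr, y_tst = train_test_split(X_demo, y_demo, test_size=0.2, random_state=1)
logistic_GAM(x_tr, y_tr, x_tst, y_tst)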
Example #4
import numpy as np
from pygam import LogisticGAM, s

# Build the term list and the matching feature names, skipping features
# 4, 6, 10 and 11. `aa` is assumed to start from the spline term for
# feature 0 (names[0] == "Silent"), matching the names1 initialization.
aa = s(0)
names1 = ["Silent"]
for i in range(1, 37):
    if i not in (4, 6, 10, 11):
        aa += s(i)
        names1.append(names[i])

gam1 = LogisticGAM(aa)

# Weight positive examples 10:1 to counter class imbalance
w = [1 if y == 0 else 10 for y in trainy]
gam1 = gam1.fit(trainX, trainy, weights=w)

# Random grid search: 10 candidate smoothing vectors, one value per term
# (33 terms), drawn log-uniformly from [e^-3, e^3]
lams = np.exp(np.random.rand(10, 33) * 6 - 3)
n_splines = [5, 10, 15, 20, 25]
cons = [
    'convex', 'concave', 'monotonic_inc', 'monotonic_dec', 'circular', 'none'
]
gam_search = LogisticGAM(aa).gridsearch(trainX,
                                        trainy,
                                        weights=w,
                                        lam=lams,
                                        n_splines=n_splines)
gam_search = gam_search.gridsearch(trainX, trainy, constraints=cons)
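For reference, pygam's gridsearch treats a 2-D lam array as a list of candidate smoothing vectors (one row per candidate, one column per term) rather than a full cartesian grid. A minimal sketch on synthetic data, with all demo names hypothetical:

import numpy as np
from pygam import LogisticGAM
from sklearn.datasets import make_classification

# 5 features -> 5 default spline terms, so each candidate lam vector has 5 entries
X_demo, y_demo = make_classification(n_samples=300, n_features=5, random_state=0)
lam_candidates = np.exp(np.random.rand(10, 5) * 6 - 3)  # 10 points, log-uniform in [e^-3, e^3]
best = LogisticGAM().gridsearch(X_demo, y_demo, lam=lam_candidates)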
Example #5
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):

        orig_cols = list(X.names)

        import os
        import pandas as pd
        import numpy as np
        from collections import Counter
        from sklearn.preprocessing import LabelEncoder, OneHotEncoder
        import pygam
        from pygam import LinearGAM, LogisticGAM
        import matplotlib.pyplot as plt

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        # Set up temp folder
        tmp_folder = self._create_tmp_folder(logger)

        # Set up model
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)

            clf = LogisticGAM(terms="auto",
                              lam=self.params["lam"],
                              max_iter=self.params["max_iter"])
            self.is_classifier = True

        else:
            clf = LinearGAM(terms="auto",
                            lam=self.params["lam"],
                            max_iter=self.params["max_iter"])
            self.is_classifier = False

        X = self.basic_impute(X)
        # Convert the datatable Frame to pandas and restore column names
        X = X.to_pandas()
        X.columns = orig_cols

        # Record each column's datatype
        X_datatypes = [str(item) for item in list(X.dtypes)]

        # Change all float32 values to float64
        for ii in range(len(X_datatypes)):
            if X_datatypes[ii] == 'float32':
                X = X.astype({orig_cols[ii]: np.float64})

        X_datatypes = [str(item) for item in list(X.dtypes)]

        # List the categorical and numerical features
        self.X_categorical = [
            orig_cols[col_count] for col_count in range(len(orig_cols))
            if (X_datatypes[col_count] == 'category') or (
                X_datatypes[col_count] == 'object')
        ]
        self.X_numeric = [
            item for item in orig_cols if item not in self.X_categorical
        ]

        # Find the levels and mode for each categorical feature
        # for use in the test set
        self.train_levels = {}
        self.train_mode = {}
        for item in self.X_categorical:
            self.train_levels[item] = list(set(X[item]))
            self.train_mode[item] = Counter(X[item]).most_common(1)[0][0]

        # One-hot encode the categorical features
        # and replace missing values with a "Missing" category
        if len(self.X_categorical) > 0:
            X.loc[:, self.X_categorical] = X[self.X_categorical].fillna(
                "Missing").copy()
            self.enc = OneHotEncoder(handle_unknown='ignore')

            self.enc.fit(X[self.X_categorical])
            self.encoded_categories = list(
                self.enc.get_feature_names(input_features=self.X_categorical))

            X_enc = self.enc.transform(X[self.X_categorical]).toarray()

            X = pd.concat([
                X[self.X_numeric],
                pd.DataFrame(X_enc, columns=self.encoded_categories)
            ],
                          axis=1)

        # Impute missing numeric values with the training-set median
        self.median_train = {}

        if len(self.X_numeric) > 0:
            for colname in self.X_numeric:
                self.median_train[colname] = X[colname].quantile(0.5)
                X.loc[:, colname] = X[colname].fillna(
                    self.median_train[colname]).copy()

        try:
            clf.fit(X, y)
        except np.linalg.LinAlgError as e:
            raise IgnoreError("np.linalg.LinAlgError") from e
        except pygam.utils.OptimizationError as e:
            raise IgnoreError("pygam.utils.OptimizationError") from e
        except ValueError as e:
            if 'On entry to DLASCL parameter number' in str(e):
                raise IgnoreError('On entry to DLASCL parameter number') from e
            raise

        p_values = np.array(clf.statistics_['p_values'])

        # Plot the partial dependence plots for each feature
        for ii in range(X.shape[1]):
            XX = clf.generate_X_grid(term=ii)
            plt.figure()
            plt.plot(XX[:, ii], clf.partial_dependence(term=ii, X=XX))
            plt.plot(XX[:, ii],
                     clf.partial_dependence(term=ii, X=XX, width=.95)[1],
                     c='r',
                     ls='--')
            plt.title("Partial Dependence " + str(ii),
                      fontdict={'fontsize': 10})
            plt.savefig(os.path.join(
                tmp_folder, 'Feature_partial_dependence_' + str(ii) + '.png'),
                        bbox_inches="tight")
            plt.close()

        # Convert the term p-values (excluding the intercept) into
        # normalized importances via a negative log transform
        if max(p_values[0:(len(p_values) - 1)]) > 0:
            importances = -np.log(p_values[0:(len(p_values) - 1)] + 10**(-16))

            importances = list(importances / max(importances))
        else:
            importances = [1] * (len(p_values) - 1)

        self.mean_target = np.array(sum(y) / len(y))

        self.set_model_properties(model=clf,
                                  features=list(X.columns),
                                  importances=importances,
                                  iterations=self.params['n_estimators'])
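The importance computation above maps each term's p-value to -log(p), then normalizes so the strongest term scores 1. A tiny worked sketch of that transform:

import numpy as np

p_values = np.array([1e-6, 0.04, 0.5])
imp = -np.log(p_values + 1e-16)  # smaller p-value -> larger importance
imp = imp / imp.max()            # normalize so the top term scores 1.0
print(imp.round(3))              # -> [1.    0.233 0.05 ]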
Example #6
            player_names=current_dat['Player'],
            binary_prediction=rfc__2020,
            probability_predictions=rfc_probs_2020[:, 1])

rfc_predictions_2020
rfc_predictions_2020.to_csv("rfc_predictions.csv")

##### ----- ##### ----- ##### ----- ##### ----- ##### ----- ##### ----- ##### ----- ##### ----- #####
# Model 1.3 - Generalized Additive Models

from pygam import LogisticGAM


# Fit a GAM with the default parameters
gam_model = LogisticGAM()
gam_model.fit(X_train, y_train)

gam_pred_prob = gam_model.predict_proba(X_test)

gam_preds, complete_gam_dat = top_15_predictions(entire_test_data, gam_pred_prob)


gam_performance = all_nba_test_report(complete_gam_dat)


players_missed(complete_gam_dat)
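top_15_predictions, all_nba_test_report and players_missed are the author's own helpers and are not shown here. As a hypothetical sketch under that assumption, a top-15 selector could look like this:

import pandas as pd

def top_15_sketch(test_df, pred_prob):
    # Hypothetical stand-in for top_15_predictions: flag the 15 rows with
    # the highest predicted probability (the 15 All-NBA slots) as positives
    out = test_df.copy()
    out["prob"] = pred_prob
    out["pred"] = 0
    out.loc[out["prob"].nlargest(15).index, "pred"] = 1
    return out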


Example #7
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, LSTM
from pygam import LogisticGAM
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score


class EpidemicModels:

    # Sequential 6 layer neural network
    def returnSequential6(self):
        model = Sequential()
        model.add(Dense(50, input_dim=20, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    # Sequential 9-layer neural network
    def returnSequential9(self):
        model = Sequential()
        model.add(Dense(80, input_dim=20, activation='relu'))
        model.add(Dense(70, activation='relu'))
        model.add(Dense(60, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))  # sigmoid to match binary_crossentropy
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    def RNN(self):
        model = Sequential()
        model.add(SimpleRNN(2, input_dim=20))
        model.add(Dense(1, activation='sigmoid'))  # sigmoid to match binary_crossentropy
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    def multi_RNN(self):
        model = Sequential()
        model.add(SimpleRNN(2, input_dim=20))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))  # sigmoid to match binary_crossentropy
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    def baseline(self):
        # Create model
        model = Sequential()
        model.add(Dense(20, input_dim=20, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        # Compile model
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    def lstm(self):
        model = Sequential()
        model.add(LSTM(10, input_dim=20))
        model.add(Dense(1, activation='linear'))
        model.compile(loss='mean_absolute_error', optimizer='adam')
        return model

    def multi_lstm(self):
        model = Sequential()
        model.add(LSTM(4, input_dim=20, return_sequences=True))
        model.add(LSTM(4, input_dim=20))
        model.add(Dense(1, activation='sigmoid'))  # sigmoid to match binary_crossentropy
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    # Sequential 3-layer neural network
    def returnSequential2(self):
        model = Sequential()
        model.add(Dense(14, activation='relu', input_dim=20))
        model.add(Dense(units=7, activation='relu'))
        model.add(Dense(units=1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        return model

    def __init__(self, m=1):
        if m == 0:
            self.model = self.baseline()
            self.type = 0
        elif m == 1:
            self.model = self.returnSequential2()
            self.type = 2
        elif m == 2:
            self.model = self.returnSequential6()
            self.type = 2
        elif m == 3:
            self.model = self.RNN()
            self.type = 1
        elif m == 4:
            self.model = self.multi_RNN()
            self.type = 1
        elif m == 5:
            self.model = self.lstm()
            self.type = 1
        elif m == 6:
            self.model = self.multi_lstm()
            self.type = 1
        elif m == 7:
            self.model = LogisticGAM()
            self.type = 3
        elif m == 8:
            self.model = self.returnSequential9()
            self.type = 2

    def returnModel(self):
        return self.model

    def train(self, X, y, bs=10, epochs=100):
        if self.type == 1:
            X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
        if self.type == 3:
            self.model.gridsearch(X, y)
        else:
            self.model.fit(X, y, batch_size=bs, epochs=epochs, shuffle=True)

    def prediction(self, X):
        if self.type == 1:
            X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
        return self.model.predict(X)

    def cross_eval(self, X, y, bs=10, ep=100, k=5):
        scores = []
        if self.type == 0:
            kf = KFold(n_splits=k, shuffle=True, random_state=0)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.fit(X_train, y_train, batch_size=bs, epochs=ep, verbose=0)
                loss, score = self.model.evaluate(X_test, y_test, verbose=0)
                scores.append(score)
            return sum(scores) / len(scores)

        elif self.type == 1:
            # shuffle=False with a random_state raises a ValueError in
            # recent scikit-learn, so shuffle here as in the other branches
            kf = KFold(n_splits=k, shuffle=True, random_state=0)
            scores = []
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
                X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))
                self.model.fit(X_train, y_train, batch_size=bs, epochs=ep, verbose=0)
                results = self.model.evaluate(X_test, y_test, verbose=0)
                # evaluate() returns [loss, metric] when the model was
                # compiled with metrics, else a single loss value
                scores.append(results[-1] if isinstance(results, list) else results)
            return sum(scores) / len(scores)

        elif self.type == 2:
            kf = KFold(n_splits=k, shuffle=True, random_state=0)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.fit(X_train, y_train, batch_size=bs, epochs=ep, verbose=0)
                loss, score = self.model.evaluate(X_test, y_test, verbose=0)
                print(score)
                scores.append(score)
            return sum(scores) / len(scores)

        elif self.type == 3:
            kf = KFold(n_splits=k, shuffle=True, random_state=0)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.gridsearch(X_train, y_train)
                y_pre = self.model.predict(X_test)
                print(y_pre)
                scores.append(f1_score(y_test, y_pre))
            return sum(scores) / len(scores)
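A usage sketch, assuming synthetic data with 20 features to match the input_dim used throughout the class (the demo names are hypothetical):

from sklearn.datasets import make_classification

# Smoke-test the GAM variant (m=7 -> type 3, scored with F1)
X_demo, y_demo = make_classification(n_samples=300, n_features=20, random_state=0)
em = EpidemicModels(m=7)
print(em.cross_eval(X_demo, y_demo, k=5))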