示例#1
0
def logreg_gridsearch():
    """
    Grid-search a logistic regression classifier, selecting on recall.

    The train/test split is obtained from ``retrieve_data``. A liblinear
    ``LogisticRegression`` is tuned over ``C`` and ``penalty`` with 5-fold
    cross-validation. Precision, recall and accuracy are all recorded, but
    the estimator that gets refit is the one with the best recall. The
    test-set predictions and the fitted search object are passed to ``scores``.
    """

    X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                     ratio=1.0,
                                                     random_state=3)

    # Seven logarithmically spaced values of C (1e-3 .. 1e3), each tried
    # with both penalties supported by the liblinear solver.
    search_space = {"C": np.logspace(-3, 3, 7), "penalty": ["l1", "l2"]}

    # Metrics tracked for every candidate during the search.
    metric_table = {
        label: make_scorer(metric)
        for label, metric in (
            ("precision_score", precision_score),
            ("recall_score", recall_score),
            ("accuracy_score", accuracy_score),
        )
    }

    # refit="recall_score" makes recall the criterion for the final model.
    search = GridSearchCV(
        LogisticRegression(random_state=4, solver="liblinear"),
        search_space,
        cv=5,
        scoring=metric_table,
        refit="recall_score",
        return_train_score=True,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    test_predictions = search.predict(X_test)
    scores(test_predictions, y_test, X_train, y_train, search)
def neuralnet_tuned():
    """
    Fit a multilayer perceptron with a fixed set of hyperparameters.

    The train/test split is obtained from ``retrieve_data``. The classifier
    is trained on the training part, predicts the test part, and the
    predictions are passed to ``scores``.
    """

    X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                     ratio=1.0,
                                                     random_state=3)

    # Fixed configuration (presumably taken from an earlier grid search).
    settings = {
        "learning_rate": "adaptive",
        "learning_rate_init": 0.001,
        "activation": "logistic",
        "alpha": 0.1,
        "hidden_layer_sizes": (30, 30, 30, 30),
        "max_iter": 500,
        "solver": "lbfgs",
        "tol": 1e-4,
        "verbose": False,
    }
    network = sklearn.neural_network.MLPClassifier(**settings)
    network.fit(X_train, y_train)

    test_predictions = network.predict(X_test)
    scores(test_predictions, y_test, X_train, y_train)
示例#3
0
def decisiontree_tuned():
    """
    Fit a decision tree with a fixed set of hyperparameters and draw it.

    The train/test split is obtained from ``retrieve_data``. The tree is
    trained, its test-set predictions are passed to ``scores``, and the
    fitted tree is exported through graphviz and rendered as a PNG
    using the path ``plots/tree``.
    """

    X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                     ratio=1,
                                                     random_state=2)

    # Fixed configuration (presumably taken from an earlier grid search).
    settings = {
        "criterion": "gini",
        "max_depth": 20,
        "max_features": 30,
        "min_samples_leaf": 1,
        "min_samples_split": 2,
    }
    decision_tree = tree.DecisionTreeClassifier(**settings)
    decision_tree.fit(X_train, y_train)

    test_predictions = decision_tree.predict(X_test)
    scores(test_predictions, y_test, X_train, y_train)

    # out_file=None makes export_graphviz return the DOT source as a string.
    dot_source = tree.export_graphviz(
        decision_tree,
        out_file=None,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    rendering = graphviz.Source(dot_source)
    rendering.format = "png"
    rendering.render("plots/tree")
示例#4
0
def decisiontree_gridsearch():
    """
    Grid-search a decision tree classifier, selecting on recall.

    The train/test split is obtained from ``retrieve_data``. A
    ``DecisionTreeClassifier`` is tuned with 5-fold cross-validation over
    the grid below. Precision, recall and accuracy are recorded for every
    candidate, and the estimator that gets refit is the one with the best
    recall. The test-set predictions and the fitted search object are passed
    to ``scores``, and the best tree is rendered as a PNG using the path
    ``plots/tree``.
    """

    X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                     ratio=1.0,
                                                     random_state=2)

    ## Our decision tree model
    clf = tree.DecisionTreeClassifier()

    # Grid search parameter grid to search through.
    # "auto" is intentionally absent from max_features: for a decision tree
    # classifier it was only an alias of "sqrt" (already listed), and recent
    # scikit-learn releases reject it, which makes those candidates fail.
    param_grid = {
        "criterion": ["gini", "entropy"],
        "min_samples_split": [2, 3, 5, 8, 10],
        "max_depth": [3, 5, 10, 15, 20, 25],
        "max_features": [5, 20, 25, 30, "sqrt", "log2"],
        "min_samples_leaf": [1, 5, 10, 20, 50]
    }

    ## Different scorers for the grid search
    scorers = {
        "precision_score": make_scorer(precision_score),
        "recall_score": make_scorer(recall_score),
        "accuracy_score": make_scorer(accuracy_score)
    }

    ## Creating the grid search object. Using refit="recall_score" to optimize using this score
    grid_search = GridSearchCV(clf,
                               param_grid,
                               cv=5,
                               scoring=scorers,
                               refit="recall_score",
                               return_train_score=True,
                               n_jobs=-1)

    grid_search.fit(X_train, y_train)
    prediction = grid_search.predict(X_test)

    scores(prediction, y_test, X_train, y_train, grid_search)

    ## Using the graphviz package to produce a PNG image to display the decision tree.
    ## out_file=None makes export_graphviz return the DOT source as a string.
    dot_data = tree.export_graphviz(grid_search.best_estimator_,
                                    out_file=None,
                                    filled=True,
                                    rounded=True,
                                    special_characters=True)
    graph = graphviz.Source(dot_data)
    graph.format = "png"
    graph.render("plots/tree")
示例#5
0
def logreg_tuned():
    """
    Fit a logistic regression with a fixed set of hyperparameters.

    The train/test split is obtained from ``retrieve_data``. A liblinear
    ``LogisticRegression`` with ``C=0.01`` and an L1 penalty is trained,
    predicts the test part, and the predictions are passed to ``scores``.
    """

    X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                     ratio=1.0,
                                                     random_state=3)

    # Fixed configuration (presumably taken from an earlier grid search).
    model = LogisticRegression(
        random_state=4,
        solver="liblinear",
        C=0.01,
        penalty="l1",
    )
    model.fit(X_train, y_train)

    test_predictions = model.predict(X_test)
    scores(test_predictions, y_test, X_train, y_train)
示例#6
0
    def score(self, model, modeltype="original", n='all', **kwargs):
        """
        Score ``model`` against every entry of ``self.dataset``.

        Each entry is evaluated with ``scores(entry, model, modeltype, kwargs)``;
        the extra keyword arguments are forwarded as one dict, not unpacked.
        On Windows (``os.name == 'nt'``) the calls run one after another in
        this process. Elsewhere they are submitted to a
        ``ProcessPoolExecutor`` and the futures are kept on ``self.results``.

        In both modes the results are ordered like ``self.dataset``, so
        ``n`` always refers to the same entry regardless of platform.

        Args:
            model: Passed through to ``scores``.
            modeltype: Passed through to ``scores``.
            n: ``'all'`` to return every result, otherwise an index into
                the list of results.
            **kwargs: Collected into a dict and passed through to ``scores``.

        Returns:
            The list of all results when ``n == 'all'``, else ``result[n]``.
        """
        import os

        # TODO: this could be optimized to run only for the requested entry.
        indices = range(len(self.dataset))

        if os.name == 'nt':
            # Single processing for Windows systems.
            result = [scores(self.dataset[c], model, modeltype, kwargs)
                      for c in indices]
        else:
            # Multiprocessing for other systems beside Windows.
            import concurrent.futures
            with concurrent.futures.ProcessPoolExecutor() as executor:
                self.results = [
                    executor.submit(scores, self.dataset[c], model, modeltype,
                                    kwargs)
                    for c in indices
                ]
                # Collect in submission order. Gathering in completion order
                # would shuffle the results relative to self.dataset and make
                # result[n] point at an arbitrary entry.
                result = [future.result() for future in self.results]

        if n == 'all':
            return result
        return result[n]
def neuralnet_gridsearch():
    """
    Grid-search a multilayer perceptron classifier, selecting on recall.

    The train/test split is obtained from ``retrieve_data``. An
    ``MLPClassifier`` is tuned with 5-fold cross-validation over the grid
    below. Precision, recall and accuracy are recorded for every candidate,
    and the estimator that gets refit is the one with the best recall. The
    test-set predictions and the fitted search object are passed to
    ``scores``.
    """

    X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                     ratio=1.0,
                                                     random_state=3)

    ## We decided on using the adaptive learning rate and an initial rate of 0.001.
    ## NOTE(review): scikit-learn documents learning_rate as only used when
    ## solver="sgd"; the grid below searches "lbfgs" and "adam" only, so the
    ## "adaptive" setting looks like it has no effect here -- confirm intent.
    clf = sklearn.neural_network.MLPClassifier(learning_rate="adaptive",
                                               learning_rate_init=0.001,
                                               tol=1e-4,
                                               verbose=False)

    ## Grid search parameter grid to search through.
    ## Every architecture is written as a tuple; note the trailing comma in
    ## (30,) -- a bare (30) is just the integer 30.
    param_grid = {
        "hidden_layer_sizes": [(30,), (40, 40), (50, 50, 50), (30, 30, 30, 30)],
        "activation": ["logistic"],
        "solver": ["lbfgs", "adam"],
        "alpha": [0.1, 0.01, 0.001],
        "max_iter": [500, 1000]
    }

    ## Different scorers for the grid search
    scorers = {
        "precision_score": make_scorer(precision_score),
        "recall_score": make_scorer(recall_score),
        "accuracy_score": make_scorer(accuracy_score)
    }

    ## Creating the grid search object. Using refit="recall_score" to optimize using this score
    grid_search = GridSearchCV(clf,
                               param_grid,
                               cv=5,
                               scoring=scorers,
                               refit="recall_score",
                               return_train_score=True,
                               n_jobs=-1)

    grid_search.fit(X_train, y_train)
    prediction = grid_search.predict(X_test)

    scores(prediction, y_test, X_train, y_train, grid_search)
def randomforest_gridsearch():
    """
    Grid-search a random forest classifier, selecting on recall.

    The train/test split is obtained from ``retrieve_data``. A
    ``RandomForestClassifier`` is tuned with 5-fold cross-validation over
    the grid below. Precision, recall and accuracy are recorded for every
    candidate, and the estimator that gets refit is the one with the best
    recall. The test-set predictions and the fitted search object are passed
    to ``scores``.
    """

    X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                     ratio=1.0,
                                                     random_state=None)

    # Hyperparameter values explored by the search.
    search_space = {
        "criterion": ["gini", "entropy"],
        "n_estimators": [10, 100, 200],
        "min_samples_split": [3, 5, 10],
        "max_depth": [5, 15, 25],
        "max_features": [5, 10, 30],
        "min_samples_leaf": [1, 10, 20],
    }

    # Metrics tracked for every candidate during the search.
    metric_table = {
        label: make_scorer(metric)
        for label, metric in (
            ("precision_score", precision_score),
            ("recall_score", recall_score),
            ("accuracy_score", accuracy_score),
        )
    }

    # refit="recall_score" makes recall the criterion for the final model.
    search = GridSearchCV(
        RandomForestClassifier(random_state=4),
        search_space,
        cv=5,
        scoring=metric_table,
        refit="recall_score",
        return_train_score=True,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    test_predictions = search.predict(X_test)
    scores(test_predictions, y_test, X_train, y_train, search)
def randomforest_tuned():
    """
    Fit a random forest and score its test-set predictions.

    The train/test split is obtained from ``retrieve_data`` and the shape of
    each of its four parts is printed. A ``RandomForestClassifier`` with
    default settings is then trained, predicts the test part, and the
    predictions are passed to ``scores``.
    """

    X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                     ratio=1,
                                                     random_state=None)

    # Report the dimensions of every part of the split.
    labelled_parts = (
        ("X_train", X_train),
        ("Y_train", y_train),
        ("X_test", X_test),
        ("Y_test", y_test),
    )
    for label, part in labelled_parts:
        print("shape of " + label + " " + str(np.shape(part)))

    forest = RandomForestClassifier()
    forest.fit(X_train, y_train)

    test_predictions = forest.predict(X_test)
    scores(test_predictions, y_test, X_train, y_train)
示例#10
0
    def supervised_learning(self):
        """Train a small dense network on the training split and score it.

        ``self.y_train`` and ``self.y_val`` are one-hot encoded with
        ``pd.get_dummies``. A sequential model (two 100-unit ReLU layers with
        dropout, then a 2-unit softmax layer) is trained with SGD for up to
        20 epochs, using early stopping with a patience of 5.

        Side effects:
            Sets ``self.preds_prob`` to column 1 of the model's output on
            ``self.x_val`` and ``self.preds`` to those values thresholded
            at 0.5.

        Returns:
            ``(self.preds_prob, acc, recall, auc)``, where the last three
            values are whatever ``scoring.scores`` returns for the
            validation labels, hard predictions and probabilities.
        """

        y = pd.get_dummies(pd.Series(self.y_train).astype('category'))
        y_val = pd.get_dummies(pd.Series(self.y_val).astype('category'))

        callback = keras.callbacks.EarlyStopping(patience=5)
        initializer = initializers.RandomNormal(seed=constants.RANDOM_STATE)

        # Sequential is exposed under keras.models (and as keras.Sequential),
        # not under keras.layers.
        model = keras.models.Sequential()
        model.add(
            keras.layers.Dense(units=100,
                               input_shape=(self.x_train.shape[1], ),
                               activation='relu',
                               kernel_initializer=initializer))
        model.add(keras.layers.Dropout(0.4))
        model.add(
            keras.layers.Dense(100,
                               activation='relu',
                               kernel_initializer=initializer))
        model.add(keras.layers.Dropout(0.2))
        model.add(
            keras.layers.Dense(2,
                               activation='softmax',
                               kernel_initializer=initializer))

        model.compile(optimizer='SGD',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
        model.fit(self.x_train,
                  y,
                  epochs=20,
                  callbacks=[callback],
                  verbose=0,
                  validation_data=(self.x_val, y_val))

        # Keep only the second output column as the score.
        self.preds_prob = model.predict(self.x_val)[:, 1]
        self.preds = np.where(self.preds_prob < 0.5, 0, 1)

        # The returned metrics all come from scoring.scores.
        acc, recall, auc = scoring.scores(self.y_val, self.preds,
                                          self.preds_prob)
        return self.preds_prob, acc, recall, auc
# %%
# Defining an instance of the classification class
classifiers = classification.Classification(list_models, x_train, x_val,
                                            y_train, y_val)

# %% [markdown]
# ## Supervised learning

# %%
# Run the supervised pipeline and unpack the two values it returns.
df_scores_supLearning, prediction_supLearning = classifiers.run_supervised_learning(
)

# %%
# Bare expression: displayed as the cell output when run interactively.
df_scores_supLearning

# %% [markdown]
# ## Semi supervised learning

# %%
# NOTE(review): the semi-supervised results are bound to the same
# `*_supLearning` names as the supervised run above, so they overwrite
# those results -- confirm this is intended.
df_scores_supLearning, prediction_supLearning = classifiers.run_semi_supervised_learning(
)

# %%
# Bare expression: displayed as the cell output when run interactively.
df_scores_supLearning

# %%
# `clf_lgb`, `df_targ_encoded` and `target_2_classes` are not defined in
# these cells; they must already exist in the session.
model, oof, = utils.CV(clf_lgb, df_targ_encoded, target_2_classes)

# %%
# Threshold `oof` at 0.5 to get hard 0/1 labels, and score them together
# with the raw `oof` values against the targets.
scoring.scores(target_2_classes, np.where(oof < 0.5, 0, 1), oof)
示例#12
0
 def supervised_learning(self):
     """Run ``fit_predict`` and score the resulting predictions.

     After ``self.fit_predict()`` has run, ``self.preds`` and
     ``self.preds_prob`` are scored against ``self.y_val`` with
     ``scoring.scores``.

     Returns:
         ``(self.preds_prob, acc, recall, auc)``, the last three being the
         values returned by ``scoring.scores``.
     """
     self.fit_predict()

     accuracy, recall_value, auc_value = scoring.scores(
         self.y_val, self.preds, self.preds_prob)
     return self.preds_prob, accuracy, recall_value, auc_value