def logreg_gridsearch():
    """
    Grid-search a logistic regression classifier, refitting on recall.

    Loads the undersampled train/test split through ``retrieve_data``, runs a
    5-fold ``GridSearchCV`` over ``C`` and ``penalty`` while tracking
    precision, recall and accuracy, refits the best candidate by recall,
    predicts on the test split and passes everything to ``scores``.
    """
    X_train, X_test, y_train, y_test = retrieve_data(
        undersampling=True, ratio=1.0, random_state=3
    )

    # Metrics recorded for every candidate; "recall_score" drives the refit.
    scorers = {
        "precision_score": make_scorer(precision_score),
        "recall_score": make_scorer(recall_score),
        "accuracy_score": make_scorer(accuracy_score),
    }

    # Candidate hyper-parameters: seven log-spaced C values from 1e-3 to 1e3,
    # each combined with both penalties.
    param_grid = {
        "C": np.logspace(-3, 3, 7),
        "penalty": ["l1", "l2"],
    }

    base_model = LogisticRegression(random_state=4, solver="liblinear")
    grid_search = GridSearchCV(
        base_model,
        param_grid,
        cv=5,
        scoring=scorers,
        refit="recall_score",
        return_train_score=True,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # predict() uses the estimator refit with the best recall.
    prediction = grid_search.predict(X_test)
    scores(prediction, y_test, X_train, y_train, grid_search)
def neuralnet_tuned():
    """
    Fit a multilayer perceptron with fixed hyper-parameters and score it.

    Loads the undersampled train/test split through ``retrieve_data``, trains
    an ``MLPClassifier`` with the hard-coded settings below (presumably the
    ones selected by the neural-network grid search -- confirm), predicts on
    the test split and passes the result to ``scores``.
    """
    X_train, X_test, y_train, y_test = retrieve_data(
        undersampling=True, ratio=1.0, random_state=3
    )

    # Fixed configuration for the network.
    mlp_settings = {
        "learning_rate": "adaptive",
        "learning_rate_init": 0.001,
        "activation": "logistic",
        "alpha": 0.1,
        "hidden_layer_sizes": (30, 30, 30, 30),
        "max_iter": 500,
        "solver": "lbfgs",
        "tol": 1e-4,
        "verbose": False,
    }
    network = sklearn.neural_network.MLPClassifier(**mlp_settings)
    network.fit(X_train, y_train)

    prediction = network.predict(X_test)
    scores(prediction, y_test, X_train, y_train)
def decisiontree_tuned():
    """
    Fit a decision tree with fixed hyper-parameters, score it and plot it.

    Loads the undersampled train/test split through ``retrieve_data``, trains
    a ``DecisionTreeClassifier`` with the hard-coded settings below
    (presumably the ones selected by the decision-tree grid search --
    confirm), passes the test predictions to ``scores`` and renders the
    fitted tree with graphviz as a PNG under the path "plots/tree".
    """
    X_train, X_test, y_train, y_test = retrieve_data(
        undersampling=True, ratio=1, random_state=2
    )

    tree_settings = {
        "criterion": "gini",
        "max_depth": 20,
        "max_features": 30,
        "min_samples_leaf": 1,
        "min_samples_split": 2,
    }
    model = tree.DecisionTreeClassifier(**tree_settings)
    model.fit(X_train, y_train)

    prediction = model.predict(X_test)
    scores(prediction, y_test, X_train, y_train)

    # Export the fitted tree as DOT text (out_file=None returns a string)
    # and let graphviz render it to an image.
    dot_source = tree.export_graphviz(
        model,
        out_file=None,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    rendering = graphviz.Source(dot_source)
    rendering.format = "png"
    rendering.render("plots/tree")
def decisiontree_gridsearch():
    """
    Grid-search a decision tree classifier, refitting on recall.

    Loads the undersampled train/test split through ``retrieve_data``, runs a
    5-fold ``GridSearchCV`` over the tree hyper-parameters while tracking
    precision, recall and accuracy, refits the best candidate by recall,
    passes the test predictions and the search object to ``scores``, and
    renders the best tree with graphviz as a PNG under the path "plots/tree".
    """
    X_train, X_test, y_train, y_test = retrieve_data(
        undersampling=True, ratio=1.0, random_state=2
    )

    ## Our decision tree model
    clf = tree.DecisionTreeClassifier()

    ## Grid search parameter grid to search through.
    ## "auto" is intentionally not listed for max_features: for a
    ## DecisionTreeClassifier it was an alias of "sqrt" (already in the grid),
    ## and it is rejected by scikit-learn >= 1.3, which would make every such
    ## candidate fail to fit.
    param_grid = {
        "criterion": ["gini", "entropy"],
        "min_samples_split": [2, 3, 5, 8, 10],
        "max_depth": [3, 5, 10, 15, 20, 25],
        "max_features": [5, 20, 25, 30, "sqrt", "log2"],
        "min_samples_leaf": [1, 5, 10, 20, 50],
    }

    ## Different scorers for the grid search
    scorers = {
        "precision_score": make_scorer(precision_score),
        "recall_score": make_scorer(recall_score),
        "accuracy_score": make_scorer(accuracy_score),
    }

    ## Creating the grid search object.
    ## Using refit="recall_score" to optimize using this score
    grid_search = GridSearchCV(
        clf,
        param_grid,
        cv=5,
        scoring=scorers,
        refit="recall_score",
        return_train_score=True,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    prediction = grid_search.predict(X_test)
    scores(prediction, y_test, X_train, y_train, grid_search)

    ## Using the graphviz package to produce a PNG image to display the
    ## decision tree (the estimator refit with the best recall).
    dot_data = tree.export_graphviz(
        grid_search.best_estimator_,
        out_file=None,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    graph = graphviz.Source(dot_data)
    graph.format = "png"
    graph.render("plots/tree")
def logreg_tuned():
    """
    Fit a logistic regression with fixed hyper-parameters and score it.

    Loads the undersampled train/test split through ``retrieve_data``, trains
    a liblinear ``LogisticRegression`` with ``C=0.01`` and an L1 penalty
    (presumably the values selected by the logistic-regression grid search --
    confirm), predicts on the test split and passes the result to ``scores``.
    """
    X_train, X_test, y_train, y_test = retrieve_data(
        undersampling=True, ratio=1.0, random_state=3
    )

    model = LogisticRegression(
        random_state=4,
        solver="liblinear",
        C=0.01,
        penalty="l1",
    )
    model.fit(X_train, y_train)

    prediction = model.predict(X_test)
    scores(prediction, y_test, X_train, y_train)
def score(self, model, modeltype="original", n='all', **kwargs):
    """
    Score ``model`` against every entry of ``self.dataset``.

    ``scores`` is called once per dataset entry as
    ``scores(self.dataset[c], model, modeltype, kwargs)``; the extra keyword
    arguments are forwarded as a single dict, not unpacked. On Windows the
    calls run sequentially in this process. On other systems they are
    submitted to a ``ProcessPoolExecutor`` and the futures are stored on
    ``self.results``.

    The collected results are always ordered like ``self.dataset``, so ``n``
    selects the score of ``self.dataset[n]`` on every platform.

    Parameters
    ----------
    model
        Passed through to ``scores`` unchanged.
    modeltype
        Passed through to ``scores`` unchanged (default ``"original"``).
    n
        ``'all'`` to return the full list of results, otherwise an index
        applied to that list.
    **kwargs
        Collected into a dict and passed to ``scores`` as its fourth
        positional argument.

    Returns
    -------
    The list of per-entry results when ``n == 'all'``, else ``results[n]``.
    """
    import os

    if os.name == 'nt':
        # Single processing for Windows systems.
        # TODO: this should be optimized to run only for the requested
        # time series.
        result = [
            scores(self.dataset[c], model, modeltype, kwargs)
            for c in range(len(self.dataset))
        ]
    else:
        # Multiprocessing for other systems beside Windows.
        import concurrent.futures

        with concurrent.futures.ProcessPoolExecutor() as executor:
            # TODO: same here, can be optimized to score only entry ``n``.
            self.results = [
                executor.submit(scores, self.dataset[c], model, modeltype, kwargs)
                for c in range(len(self.dataset))
            ]
            # Collect in submission order rather than completion order so
            # that result[i] belongs to self.dataset[i]. All tasks are
            # already running, so this does not serialize the work.
            result = [future.result() for future in self.results]

    if n == 'all':
        return result
    return result[n]
def neuralnet_gridsearch():
    """
    Grid-search a multilayer perceptron classifier, refitting on recall.

    Loads the undersampled train/test split through ``retrieve_data``, runs a
    5-fold ``GridSearchCV`` over the network hyper-parameters while tracking
    precision, recall and accuracy, refits the best candidate by recall,
    predicts on the test split and passes everything to ``scores``.
    """
    X_train, X_test, y_train, y_test = retrieve_data(
        undersampling=True, ratio=1.0, random_state=3
    )

    ## We decided on using the adaptive learning rate and an initial rate
    ## of 0.001.
    clf = sklearn.neural_network.MLPClassifier(
        learning_rate="adaptive",
        learning_rate_init=0.001,
        tol=1e-4,
        verbose=False,
    )

    ## Grid search parameter grid to search through.
    ## Every hidden_layer_sizes candidate is a tuple with one entry per
    ## hidden layer; note the trailing comma in (30,) -- a bare (30) is just
    ## the integer 30.
    param_grid = {
        "hidden_layer_sizes": [(30,), (40, 40), (50, 50, 50), (30, 30, 30, 30)],
        "activation": ["logistic"],
        "solver": ["lbfgs", "adam"],
        "alpha": [0.1, 0.01, 0.001],
        "max_iter": [500, 1000],
    }

    ## Different scorers for the grid search
    scorers = {
        "precision_score": make_scorer(precision_score),
        "recall_score": make_scorer(recall_score),
        "accuracy_score": make_scorer(accuracy_score),
    }

    ## Creating the grid search object.
    ## Using refit="recall_score" to optimize using this score
    grid_search = GridSearchCV(
        clf,
        param_grid,
        cv=5,
        scoring=scorers,
        refit="recall_score",
        return_train_score=True,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    prediction = grid_search.predict(X_test)
    scores(prediction, y_test, X_train, y_train, grid_search)
def randomforest_gridsearch():
    """
    Grid-search a random forest classifier, refitting on recall.

    Loads the undersampled train/test split through ``retrieve_data`` (called
    with ``random_state=None``), runs a 5-fold ``GridSearchCV`` over the
    forest hyper-parameters while tracking precision, recall and accuracy,
    refits the best candidate by recall, predicts on the test split and
    passes everything to ``scores``.
    """
    X_train, X_test, y_train, y_test = retrieve_data(
        undersampling=True, ratio=1.0, random_state=None
    )

    # Metrics recorded for every candidate; "recall_score" drives the refit.
    scorers = {
        "precision_score": make_scorer(precision_score),
        "recall_score": make_scorer(recall_score),
        "accuracy_score": make_scorer(accuracy_score),
    }

    # Hyper-parameter candidates explored exhaustively by the search.
    param_grid = {
        "criterion": ["gini", "entropy"],
        "n_estimators": [10, 100, 200],
        "min_samples_split": [3, 5, 10],
        "max_depth": [5, 15, 25],
        "max_features": [5, 10, 30],
        "min_samples_leaf": [1, 10, 20],
    }

    forest = RandomForestClassifier(random_state=4)
    grid_search = GridSearchCV(
        forest,
        param_grid,
        cv=5,
        scoring=scorers,
        refit="recall_score",
        return_train_score=True,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # predict() uses the estimator refit with the best recall.
    prediction = grid_search.predict(X_test)
    scores(prediction, y_test, X_train, y_train, grid_search)
def randomforest_tuned():
    """
    Fit a random forest classifier and score it on the test split.

    Loads the undersampled train/test split through ``retrieve_data`` (called
    with ``random_state=None``), prints the shape of each of the four arrays,
    trains a ``RandomForestClassifier``, predicts on the test split and
    passes the result to ``scores``.
    """
    X_train, X_test, y_train, y_test = retrieve_data(
        undersampling=True, ratio=1, random_state=None
    )

    # Report the dimensions of the split before training.
    shape_report = (
        ("shape of X_train ", X_train),
        ("shape of Y_train ", y_train),
        ("shape of X_test ", X_test),
        ("shape of Y_test ", y_test),
    )
    for label, array in shape_report:
        print(label + str(np.shape(array)))

    # NOTE(review): the classifier is built with library defaults; no tuned
    # hyper-parameters are passed here despite the function name -- confirm.
    forest = RandomForestClassifier()
    forest.fit(X_train, y_train)

    prediction = forest.predict(X_test)
    scores(prediction, y_test, X_train, y_train)
def supervised_learning(self):
    """Train a small dense network and score it on the validation split.

    The labels in ``self.y_train`` / ``self.y_val`` are one-hot encoded, a
    network with two 100-unit ReLU layers (each followed by dropout) and a
    2-unit softmax output is fit on ``self.x_train`` for up to 20 epochs with
    early stopping, and the validation predictions are scored with
    ``scoring.scores``.

    Side effects: sets ``self.preds_prob`` (second softmax column of the
    validation predictions) and ``self.preds`` (those values thresholded at
    0.5).

    Returns
    -------
    tuple
        ``(self.preds_prob, acc, recall, auc)`` where the last three values
        are what ``scoring.scores`` returns.
    """
    # One-hot encode the labels to match the softmax output and the
    # categorical cross-entropy loss.
    y = pd.get_dummies(pd.Series(self.y_train).astype('category'))
    y_val = pd.get_dummies(pd.Series(self.y_val).astype('category'))

    callback = keras.callbacks.EarlyStopping(patience=5)
    initializer = initializers.RandomNormal(seed=constants.RANDOM_STATE)

    # The Sequential model class lives at keras.Sequential (also
    # keras.models.Sequential); it is not part of keras.layers.
    model = keras.Sequential()
    model.add(
        keras.layers.Dense(units=100,
                           input_shape=(self.x_train.shape[1], ),
                           activation='relu',
                           kernel_initializer=initializer))
    model.add(keras.layers.Dropout(0.4))
    model.add(
        keras.layers.Dense(100,
                           activation='relu',
                           kernel_initializer=initializer))
    model.add(keras.layers.Dropout(0.2))
    model.add(
        keras.layers.Dense(2,
                           activation='softmax',
                           kernel_initializer=initializer))
    model.compile(optimizer='SGD',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    model.fit(self.x_train,
              y,
              epochs=20,
              callbacks=[callback],
              verbose=0,
              validation_data=(self.x_val, y_val))

    # Keep only the second output column as the score for each sample.
    self.preds_prob = model.predict(self.x_val)[:, 1]
    # Scores below 0.5 become label 0, everything else label 1.
    self.preds = np.where(self.preds_prob < 0.5, 0, 1)

    # scoring.scores supplies all returned metrics, including the AUC, so no
    # separate AUC computation is needed here.
    acc, recall, auc = scoring.scores(self.y_val, self.preds,
                                      self.preds_prob)
    return self.preds_prob, acc, recall, auc
# %%
# Build the classification helper from the model list and the
# train/validation split.
classifiers = classification.Classification(
    list_models,
    x_train,
    x_val,
    y_train,
    y_val,
)

# %% [markdown]
# ## Supervised learning

# %%
# Run every model in supervised mode; keeps the score table and predictions.
(
    df_scores_supLearning,
    prediction_supLearning,
) = classifiers.run_supervised_learning()

# %%
df_scores_supLearning

# %% [markdown]
# ## Semi supervised learning

# %%
# NOTE(review): the semi-supervised results are stored under the same names
# as the supervised ones above and therefore overwrite them -- confirm this
# is intended before relying on either table later in the notebook.
(
    df_scores_supLearning,
    prediction_supLearning,
) = classifiers.run_semi_supervised_learning()

# %%
df_scores_supLearning

# %%
# Cross-validate the LightGBM classifier; `oof` holds the second returned
# value (presumably out-of-fold predictions -- confirm against utils.CV).
model, oof = utils.CV(clf_lgb, df_targ_encoded, target_2_classes)

# %%
# Threshold the scores at 0.5 (below -> 0, otherwise -> 1) and score them.
scoring.scores(
    target_2_classes,
    np.where(oof < 0.5, 0, 1),
    oof,
)
def supervised_learning(self):
    """Fit, predict and score the model on the validation split.

    Calls ``self.fit_predict()`` and then scores ``self.preds`` and
    ``self.preds_prob`` against ``self.y_val`` with ``scoring.scores``
    (presumably ``fit_predict`` is what populates those two attributes --
    confirm).

    Returns
    -------
    tuple
        ``(self.preds_prob, acc, recall, auc)`` where the last three values
        are what ``scoring.scores`` returns.
    """
    self.fit_predict()

    accuracy, recall_value, auc_value = scoring.scores(
        self.y_val,
        self.preds,
        self.preds_prob,
    )
    return self.preds_prob, accuracy, recall_value, auc_value