def test_undersampling(self):
    """ Checks that the ratio property of the function is working properly """
    with self.assertRaises(ValueError):
        retrieve_data(undersampling=True, ratio=1.1)
    X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                     ratio=0.5)
    self.assertEqual(1476, len(X_train) + len(X_test))
    self.assertEqual(1476, len(y_train) + len(y_test))
    self.assertEqual(988, len(y_train))
    self.assertEqual(988, len(X_train))
    self.assertEqual(488, len(y_test))
    self.assertEqual(488, len(X_test))
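## The tests in this file exercise retrieve_data, which is defined elsewhere
## in the repository. Below is a minimal sketch of the interface these
## assertions imply, not the project's actual implementation: the CSV path,
## the "Class" column, and the exact sampling call are assumptions. The
## numbers in test_undersampling imply 492 fraud rows, so ratio appears to
## mean n_fraud / n_nonfraud (ratio=0.5 gives 492 + 984 = 1476 samples).
def _retrieve_data_sketch(undersampling=False, ratio=1.0, random_state=None):
    import pandas as pd
    from sklearn.model_selection import train_test_split

    df = pd.read_csv("data/creditcard.csv")  ## path is an assumption
    if undersampling:
        if ratio > 1.0:
            raise ValueError("ratio must be <= 1")
        fraud = df[df["Class"] == 1]
        nonfraud = df[df["Class"] == 0].sample(n=int(len(fraud) / ratio),
                                               random_state=random_state)
        df = pd.concat([fraud, nonfraud])
    X = df.drop(columns="Class").to_numpy()
    y = df["Class"].to_numpy()
    ## test_size=0.33 matches the train/test sizes asserted in test_arrays
    return train_test_split(X, y, test_size=0.33, random_state=random_state)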
def neuralnet_tuned():
    """
    This function reads the dataset and uses the neural network
    to test the optimized parameters found by the grid search function.
    """
    X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                     ratio=1.0,
                                                     random_state=3)
    clf = sklearn.neural_network.MLPClassifier(learning_rate="adaptive",
                                               learning_rate_init=0.001,
                                               activation="logistic",
                                               alpha=0.1,
                                               hidden_layer_sizes=(30, 30, 30, 30),
                                               max_iter=500,
                                               solver="lbfgs",
                                               tol=1e-4,
                                               verbose=False)
    clf.fit(X_train, y_train)
    prediction = clf.predict(X_test)
    scores(prediction, y_test, X_train, y_train)
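## Every function in this module ends with a call to scores(), whose
## implementation also lives elsewhere in the repository. This is a minimal
## sketch of the behaviour its call sites suggest; the optional grid_search
## argument handling and the exact printout are assumptions, not the
## project's actual helper.
def _scores_sketch(prediction, y_test, X_train, y_train, grid_search=None):
    ## Test-set metrics
    print("Accuracy: ", accuracy_score(y_test, prediction))
    print("Precision:", precision_score(y_test, prediction))
    print("Recall:   ", recall_score(y_test, prediction))
    if grid_search is not None:
        ## With refit="recall_score", score() reports recall on the given data
        print("Training recall:", grid_search.score(X_train, y_train))
        print("Best parameters:", grid_search.best_params_)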
def logreg_gridsearch():
    """
    This function reads in the dataset and performs a logistic regression,
    using grid search to find the optimum parameters that maximize the
    recall score.
    """
    X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                     ratio=1.0,
                                                     random_state=3)
    clf = LogisticRegression(random_state=4, solver="liblinear")

    ## Grid search parameter grid
    param_grid = {
        "C": np.logspace(-3, 3, 7),
        "penalty": ["l1", "l2"]
    }

    ## Different scorers for the grid search
    scorers = {
        "precision_score": make_scorer(precision_score),
        "recall_score": make_scorer(recall_score),
        "accuracy_score": make_scorer(accuracy_score)
    }

    ## Creating the grid search object. Using refit="recall_score" to optimize using this score
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring=scorers,
                               refit="recall_score", return_train_score=True,
                               n_jobs=-1)
    grid_search.fit(X_train, y_train)
    prediction = grid_search.predict(X_test)
    scores(prediction, y_test, X_train, y_train, grid_search)
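## With multiple scorers and refit="recall_score", the fitted GridSearchCV
## object records all three metrics per parameter setting in cv_results_.
## A minimal sketch of how they could be compared side by side; the column
## names follow scikit-learn's "mean_test_<scorer>" convention, while the
## helper itself (and its pandas usage) is our illustration:
def compare_scorers(grid_search):
    import pandas as pd

    results = pd.DataFrame(grid_search.cv_results_)
    cols = ["params", "mean_test_recall_score",
            "mean_test_precision_score", "mean_test_accuracy_score"]
    ## Sort by the refit metric, i.e. the one the best estimator is chosen by
    print(results[cols].sort_values("mean_test_recall_score",
                                    ascending=False).head())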
def decisiontree_tuned():
    """
    This function reads the dataset and uses the decision tree
    to test the optimized parameters found by the grid search function.
    """
    X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                     ratio=1.0,
                                                     random_state=2)
    clf = tree.DecisionTreeClassifier(criterion="gini",
                                      max_depth=20,
                                      max_features=30,
                                      min_samples_leaf=1,
                                      min_samples_split=2)
    clf.fit(X_train, y_train)
    prediction = clf.predict(X_test)
    scores(prediction, y_test, X_train, y_train)

    ## Using the graphviz package to produce a PNG image of the decision tree
    dot_data = tree.export_graphviz(clf, out_file=None, filled=True,
                                    rounded=True, special_characters=True)
    graph = graphviz.Source(dot_data)
    graph.format = "png"
    graph.render("plots/tree")
def test_arrays(self):
    """ Testing that the output has the correct dimensions """
    total_size = 284807
    X_train, X_test, y_train, y_test = retrieve_data()
    self.assertEqual(total_size, len(X_train) + len(X_test))
    self.assertEqual(total_size, len(y_train) + len(y_test))
    self.assertEqual(int(total_size * 0.67), len(y_train))
    self.assertEqual(int(total_size * 0.67), len(X_train))
    self.assertEqual(int(total_size * 0.33 + 1), len(y_test))
    self.assertEqual(int(total_size * 0.33 + 1), len(X_test))
def decisiontree_gridsearch():
    """
    This function retrieves the dataset and uses grid search to find the
    optimum parameters that maximize the recall score of a Decision Tree
    classifier.
    """
    X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                     ratio=1.0,
                                                     random_state=2)
    ## Our decision tree model
    clf = tree.DecisionTreeClassifier()

    ## Grid search parameter grid to search through
    param_grid = {
        "criterion": ["gini", "entropy"],
        "min_samples_split": [2, 3, 5, 8, 10],
        "max_depth": [3, 5, 10, 15, 20, 25],
        "max_features": [5, 20, 25, 30, "auto", "sqrt", "log2"],
        "min_samples_leaf": [1, 5, 10, 20, 50]
    }

    ## Different scorers for the grid search
    scorers = {
        "precision_score": make_scorer(precision_score),
        "recall_score": make_scorer(recall_score),
        "accuracy_score": make_scorer(accuracy_score)
    }

    ## Creating the grid search object. Using refit="recall_score" to optimize using this score
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring=scorers,
                               refit="recall_score", return_train_score=True,
                               n_jobs=-1)
    grid_search.fit(X_train, y_train)
    prediction = grid_search.predict(X_test)
    scores(prediction, y_test, X_train, y_train, grid_search)

    ## Using the graphviz package to produce a PNG image of the best decision tree
    dot_data = tree.export_graphviz(grid_search.best_estimator_, out_file=None,
                                    filled=True, rounded=True,
                                    special_characters=True)
    graph = graphviz.Source(dot_data)
    graph.format = "png"
    graph.render("plots/tree")
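## Note: decisiontree_tuned above renders to the same "plots/tree" path, so
## whichever of the two decision tree functions runs last overwrites the
## saved image; distinct filenames would be needed to keep both outputs.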
def logreg_tuned():
    """
    This function reads the dataset and uses logistic regression
    to test the optimized parameters found by the grid search function above.
    """
    X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                     ratio=1.0,
                                                     random_state=3)
    clf = LogisticRegression(random_state=4, solver="liblinear",
                             C=0.01, penalty="l1")
    clf.fit(X_train, y_train)
    prediction = clf.predict(X_test)
    scores(prediction, y_test, X_train, y_train)
def neuralnet_learningrate():
    """
    This function tests a neural network with different initial learning
    rates, then plots and prints the results.
    """
    ratio_ = 0.1
    X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                     ratio=ratio_)
    ## 70 learning rates, logarithmically spaced between 1e-3 and 1e-1
    learning_rate = 10**(-np.linspace(3, 1, 70))
    n = len(learning_rate)
    acc_score = np.zeros(n)
    rec_score = np.zeros(n)
    prec_score = np.zeros(n)

    for i in range(n):
        print(int(100 * i / n), "%", end="\r")
        clf = sklearn.neural_network.MLPClassifier(
            hidden_layer_sizes=(30, 30, 30, 30),
            learning_rate="adaptive",
            learning_rate_init=learning_rate[i],
            max_iter=1000000,
            tol=1e-10,
            verbose=False,
        )
        clf = clf.fit(X_train, y_train.ravel())
        predict = clf.predict(X_test)
        acc_score[i] = accuracy_score(y_test.ravel(), predict)
        prec_score[i] = precision_score(y_test.ravel(), predict)
        rec_score[i] = recall_score(y_test.ravel(), predict)

    plt.semilogx(learning_rate, acc_score)
    plt.semilogx(learning_rate, prec_score)
    plt.semilogx(learning_rate, rec_score)
    plt.legend(["Accuracy", "Precision", "Recall"], prop={'size': 12})
    plt.xlabel(r"Learning rate $\eta$", size=14)
    plt.ylabel("Scores", size=14)
    plt.title("Scikit-Learn NeuralNet score for different learning rates",
              size=16)
    plt.show()

    print("Ratio: ", ratio_)
    print("Accuracy score", acc_score)
    print("Precision score", prec_score)
    print("Recall score", rec_score)
def neuralnet_gridsearch():
    """
    This function retrieves the dataset, creates a neural network
    multilayer perceptron, and uses a grid search method to find the
    optimum parameters for maximizing the recall score.
    """
    X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                     ratio=1.0,
                                                     random_state=3)
    ## We decided on using the adaptive learning rate and an initial rate of 0.001
    clf = sklearn.neural_network.MLPClassifier(learning_rate="adaptive",
                                               learning_rate_init=0.001,
                                               tol=1e-4,
                                               verbose=False)

    ## Grid search parameter grid to search through. Note the trailing comma
    ## in (30,): without it, (30) is just the integer 30, not a one-layer tuple.
    param_grid = {
        "hidden_layer_sizes": [(30,), (40, 40), (50, 50, 50), (30, 30, 30, 30)],
        "activation": ["logistic"],
        "solver": ["lbfgs", "adam"],
        "alpha": [0.1, 0.01, 0.001],
        "max_iter": [500, 1000]
    }

    ## Different scorers for the grid search
    scorers = {
        "precision_score": make_scorer(precision_score),
        "recall_score": make_scorer(recall_score),
        "accuracy_score": make_scorer(accuracy_score)
    }

    ## Creating the grid search object. Using refit="recall_score" to optimize using this score
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring=scorers,
                               refit="recall_score", return_train_score=True,
                               n_jobs=-1)
    grid_search.fit(X_train, y_train)
    prediction = grid_search.predict(X_test)
    scores(prediction, y_test, X_train, y_train, grid_search)
def randomforest_gridsearch():
    """
    This function retrieves the dataset and uses a random forest
    classifier for predicting credit card frauds. To maximize the recall
    score, we used a grid search method to optimize the parameters going
    into the random forest classifier.
    """
    X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                     ratio=1.0,
                                                     random_state=None)
    ## Random Forest Classifier
    clf = RandomForestClassifier(random_state=4)

    ## Grid search parameter grid to search through
    param_grid = {
        "criterion": ["gini", "entropy"],
        "n_estimators": [10, 100, 200],
        "min_samples_split": [3, 5, 10],
        "max_depth": [5, 15, 25],
        "max_features": [5, 10, 30],
        "min_samples_leaf": [1, 10, 20]
    }

    ## Different scorers for the grid search
    scorers = {
        "precision_score": make_scorer(precision_score),
        "recall_score": make_scorer(recall_score),
        "accuracy_score": make_scorer(accuracy_score)
    }

    ## Creating the grid search object. Using refit="recall_score" to optimize using this score
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring=scorers,
                               refit="recall_score", return_train_score=True,
                               n_jobs=-1)
    grid_search.fit(X_train, y_train)
    prediction = grid_search.predict(X_test)
    scores(prediction, y_test, X_train, y_train, grid_search)
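## For scale: the grid above spans 2 * 3^5 = 486 parameter combinations,
## i.e. 486 * 5 = 2430 fits with cv=5, which makes the n_jobs=-1
## parallelization worthwhile. This count is arithmetic on the grid itself,
## not a measured runtime.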
def randomforest_tuned():
    """
    This function reads the dataset and uses the random forest
    to test the optimized parameters found in the function above.
    """
    X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                     ratio=1.0,
                                                     random_state=None)
    print("shape of X_train " + str(np.shape(X_train)))
    print("shape of y_train " + str(np.shape(y_train)))
    print("shape of X_test " + str(np.shape(X_test)))
    print("shape of y_test " + str(np.shape(y_test)))

    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    prediction = clf.predict(X_test)
    scores(prediction, y_test, X_train, y_train)
def decisiontree_undersamplingratio():
    """
    This function's purpose is to test how the scores vary when using
    different undersampling ratios, and it plots the results.
    """
    n = 61
    ## 61 ratios, logarithmically spaced between 1e-6 and 1
    ratio_ = 10**(-np.linspace(6.0, 0.0, n))
    acc_score = np.zeros(n)
    rec_score = np.zeros(n)
    prec_score = np.zeros(n)

    for i in range(n):
        print(int(100 * i / n), "%", end="\r")
        X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                         ratio=ratio_[i])
        clf = tree.DecisionTreeClassifier()
        clf = clf.fit(X_train, y_train.ravel())
        predict = clf.predict(X_test)
        acc_score[i] = accuracy_score(y_test.ravel(), predict)
        prec_score[i] = precision_score(y_test.ravel(), predict)
        rec_score[i] = recall_score(y_test.ravel(), predict)

    plt.semilogx(ratio_, acc_score)
    plt.semilogx(ratio_, prec_score)
    plt.semilogx(ratio_, rec_score)
    plt.xlabel("Ratio", size=14)
    plt.ylabel("Score", size=14)
    plt.title("Scikit-Learn Decision Tree score for different ratios", size=16)
    plt.legend(["Accuracy", "Precision", "Recall"], prop={'size': 12})
    plt.savefig("plots/dectree_ratiotest.png")
    plt.show()
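## A minimal driver, assuming this module is meant to be run as a script.
## Which functions to call, and in what order, is our choice here and is
## not prescribed by the project:
if __name__ == "__main__":
    logreg_gridsearch()
    logreg_tuned()
    decisiontree_gridsearch()
    decisiontree_tuned()
    neuralnet_gridsearch()
    neuralnet_tuned()
    randomforest_gridsearch()
    randomforest_tuned()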