def ft_random_forest_testing(x_train, y_train, x_test, y_test):
    """Sweep ``max_features`` for the random forest and plot the scores.

    For every candidate ``max_features`` value a ``RandomForestClassifier``
    (``max_depth=7``, ``n_trees=50``) is fitted on the training split. The
    train accuracy, test accuracy and test F1 are printed and collected, and
    the three curves are saved to ``RandomForestFeatures.png``.

    Args:
        x_train: Training features, passed to ``fit`` and ``predict``.
        y_train: Training labels.
        x_test: Test features, passed to ``predict``.
        y_test: Test labels.
    """
    print('Random Forest Feature Loop\n\n')

    # One list drives both the loop and the x-axis so they cannot drift apart.
    feature_counts = [1, 2, 5, 8, 10, 20, 25, 35, 50]
    train_list = []
    test_list = []
    F1_list = []

    for max_features in feature_counts:
        rclf = RandomForestClassifier(
            max_depth=7, max_features=max_features, n_trees=50
        )
        rclf.fit(x_train, y_train)

        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)

        train_accuracy = accuracy_score(preds_train, y_train)
        test_accuracy = accuracy_score(preds_test, y_test)
        # Reuse the test predictions instead of predicting x_test again;
        # assumes predict() is deterministic for a fitted model.
        test_f1 = f1(y_test, preds_test)

        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        print('F1 Test {}'.format(test_f1))

        # Grab the useful number per cycle
        train_list.append(train_accuracy)
        test_list.append(test_accuracy)
        F1_list.append(test_f1)

    plt.rcParams['font.family'] = ['serif']
    ax = plt.subplot(111)
    ax.plot(feature_counts, train_list, label='training')
    ax.plot(feature_counts, test_list, label='testing')
    ax.plot(feature_counts, F1_list, label='F1')
    plt.xlabel("max_features")
    plt.xticks(feature_counts)
    plt.ylabel("Accuracies")
    ax.legend()
    plt.savefig("RandomForestFeatures.png")
    # Clear the current figure so later plots start from an empty canvas.
    plt.clf()
def random_forest_various_features(x_train, y_train, x_test, y_test):
    """Sweep ``max_features``, tabulate and plot the scores, return the best.

    A ``RandomForestClassifier`` (``max_depth=7``, ``n_trees=50``) is fitted
    for each candidate value. Train accuracy, test accuracy and test F1 are
    printed, gathered into a pandas table, and plotted to ``q2pd.png``.

    Args:
        x_train: Training features, passed to ``fit`` and ``predict``.
        y_train: Training labels.
        x_test: Test features, passed to ``predict``.
        y_test: Test labels.

    Returns:
        The ``max_features`` value with the highest test F1 (the first such
        value when several tie).
    """
    # Candidate values for max_features.
    feature_counts = [1, 2, 5, 8, 10, 20, 25, 35, 50]

    train_scores = []
    test_scores = []
    f1_scores = []

    # let the user know which test this is
    print("== Beginning test for various max_features.\n")

    for features in feature_counts:
        print("max_features: ", features)
        rclf = RandomForestClassifier(
            max_depth=7, max_features=features, n_trees=50
        )
        rclf.fit(x_train, y_train)

        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)

        # Compute each metric once and reuse it for both the list and the
        # printout; the test predictions are also reused for F1 (assumes
        # predict() is deterministic for a fitted model).
        train_accuracy = accuracy_score(preds_train, y_train)
        test_accuracy = accuracy_score(preds_test, y_test)
        test_f1 = f1(y_test, preds_test)

        train_scores.append(train_accuracy)
        test_scores.append(test_accuracy)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        f1_scores.append(test_f1)
        print('F1 Test {}\n'.format(test_f1))

    # print lengths for debugging
    print("== Length of Train", len(train_scores))
    print("== Length of Test", len(test_scores))
    print("== Length of F1", len(f1_scores))

    # table for easily reading data
    table2 = pd.DataFrame({
        "max_features": feature_counts,
        "Train Accuracy": train_scores,
        "Test Accuracy": test_scores,
        "F1 Accuracy": f1_scores,
    })
    print(table2)

    # Plot on figure number 3 and write it to a file. The figure is left
    # open, as in the original code.
    plt.figure(3)
    plt.xlabel('Max Features')
    plt.ylabel('Performance')
    plt.title('Accuracy & F1 Score vs Max Features')
    plt.plot('max_features', 'Train Accuracy', data=table2, color='blue')
    plt.plot('max_features', 'Test Accuracy', data=table2, color='green')
    plt.plot('max_features', 'F1 Accuracy', data=table2, color='red')
    plt.legend()
    plt.savefig('q2pd.png')

    # return best value for max_features to use in main
    return feature_counts[f1_scores.index(max(f1_scores))]
def random_forest_testing(x_train, y_train, x_test, y_test, feat, tree):
    """Fit one random forest and report accuracy and F1 on both splits.

    NOTE(review): the visible source defines a second ``random_forest_testing``
    further down whose trailing parameters are ``(n_trees, max_features)``.
    If both live in the same module, the later definition replaces this one;
    confirm which is intended.

    Args:
        x_train: Training features, passed to ``fit`` and ``predict``.
        y_train: Training labels.
        x_test: Test features, passed to ``predict``.
        y_test: Test labels.
        feat: Value forwarded as ``max_features``.
        tree: Value forwarded as ``n_trees``.

    Returns:
        A tuple ``(train_accuracy, test_accuracy, train_f1, test_f1)``.
    """
    print('Random Forest\n\n')
    rclf = RandomForestClassifier(max_depth=7, max_features=feat, n_trees=tree)
    rclf.fit(x_train, y_train)

    # Predict each split once and reuse the result for every metric; assumes
    # predict() is deterministic for a fitted model.
    preds_train = rclf.predict(x_train)
    preds_test = rclf.predict(x_test)

    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    train_f1 = f1(y_train, preds_train)
    test_f1 = f1(y_test, preds_test)

    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    print('F1 Test {}'.format(test_f1))

    return train_accuracy, test_accuracy, train_f1, test_f1
def random_forest_various_seeds(x_train, y_train, x_test, y_test,
                                best_max_features, best_n_trees):
    """Refit the forest under ten random seeds and print a score table.

    A single ``RandomForestClassifier`` (``max_depth=7``) is created. For each
    of ten rounds its ``seed`` attribute is set to a random integer from
    ``np.random.randint(1, 1000)`` and the model is refitted. The per-seed
    train accuracy, test accuracy and test F1 are printed and then shown in a
    table whose last row holds the averages.

    Args:
        x_train: Training features, passed to ``fit`` and ``predict``.
        y_train: Training labels.
        x_test: Test features, passed to ``predict``.
        y_test: Test labels.
        best_max_features: Value forwarded as ``max_features``.
        best_n_trees: Value forwarded as ``n_trees``.
    """
    # let the user know which test this is
    print("== Beginning test for best result with random seeds.\n")

    # to hold data points
    used_seeds = []
    train_scores = []
    test_scores = []
    f1_scores = []

    rclf = RandomForestClassifier(
        max_depth=7, max_features=best_max_features, n_trees=best_n_trees
    )

    for _ in range(10):
        # NOTE(review): assumes fit() reads the ``seed`` attribute each time
        # it is called; confirm against RandomForestClassifier.
        rclf.seed = np.random.randint(1, 1000)
        used_seeds.append(rclf.seed)
        rclf.fit(x_train, y_train)

        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)

        # Compute each metric once; the test predictions are reused for F1
        # (assumes predict() is deterministic for a fitted model).
        train_accuracy = accuracy_score(preds_train, y_train)
        test_accuracy = accuracy_score(preds_test, y_test)
        test_f1 = f1(y_test, preds_test)

        train_scores.append(train_accuracy)
        test_scores.append(test_accuracy)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        f1_scores.append(test_f1)
        print('F1 Test {}\n'.format(test_f1))

    # get averages
    average_train = sum(train_scores) / len(train_scores)
    average_test = sum(test_scores) / len(test_scores)
    average_f1 = sum(f1_scores) / len(f1_scores)

    # get table for data + add averages at the end
    table3 = pd.DataFrame({
        "Seed": used_seeds + ["Average"],
        "Train Accuracy": train_scores + [average_train],
        "Test Accuracy": test_scores + [average_test],
        "F1 Score": f1_scores + [average_f1],
    })
    print(table3)
def random_forest_testing(x_train, y_train, x_test, y_test, n_trees, max_features):
    """Fit one random forest and report accuracy and F1 on both splits.

    NOTE(review): the visible source also defines an earlier
    ``random_forest_testing`` whose trailing parameters are ``(feat, tree)``,
    i.e. max_features first. If both live in the same module, this later
    definition is the one callers get; confirm call sites use this order.

    Args:
        x_train: Training features, passed to ``fit`` and ``predict``.
        y_train: Training labels.
        x_test: Test features, passed to ``predict``.
        y_test: Test labels.
        n_trees: Value forwarded as ``n_trees``.
        max_features: Value forwarded as ``max_features``.

    Returns:
        A tuple ``(train_accuracy, test_accuracy, train_f1, test_f1)``.
    """
    print('Random Forest')
    print("max_depth: %d, max_features: %d, n_trees: %d" % (7, max_features, n_trees))

    # Pass hyper-parameters by keyword, like every other call in this file,
    # so the result does not depend on the constructor's positional order.
    rclf = RandomForestClassifier(
        n_trees=n_trees, max_features=max_features, max_depth=7
    )
    rclf.fit(x_train, y_train)

    # Predict each split once and reuse the result for every metric; assumes
    # predict() is deterministic for a fitted model.
    preds_train = rclf.predict(x_train)
    preds_test = rclf.predict(x_test)

    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    train_f1 = f1(y_train, preds_train)
    test_f1 = f1(y_test, preds_test)

    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    print('F1 Train {}'.format(train_f1))
    print('F1 Test {}\n'.format(test_f1))

    return train_accuracy, test_accuracy, train_f1, test_f1
def random_forest_tune_MaxFeatures(x_train, y_train, x_test, y_test):
    """Sweep ``max_features``, report rounded scores, and return the best.

    A ``RandomForestClassifier`` (``max_depth=7``, ``n_trees=50``) is fitted
    for each candidate value. Train accuracy, test accuracy and test F1 are
    rounded to three decimals, printed, tabulated, and plotted to
    ``random_forest_output_max_features.png``.

    Args:
        x_train: Training features, passed to ``fit`` and ``predict``.
        y_train: Training labels.
        x_test: Test features, passed to ``predict``.
        y_test: Test labels.

    Returns:
        The ``max_features`` value with the highest rounded test F1 (the
        first such value when several tie).
    """
    print('Random Forest tune\n\n')

    plotX = [1, 2, 5, 8, 10, 20, 25, 35, 50]
    plotTrain = []
    plotTest = []
    plotF1 = []

    for max_features in plotX:
        print("MAX_Features: ", max_features)
        rclf = RandomForestClassifier(
            max_depth=7, max_features=max_features, n_trees=50
        )
        rclf.fit(x_train, y_train)

        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)

        train_accuracy = round(accuracy_score(preds_train, y_train), 3)
        test_accuracy = round(accuracy_score(preds_test, y_test), 3)
        # Reuse the test predictions instead of predicting x_test again;
        # assumes predict() is deterministic for a fitted model.
        F1 = round(f1(y_test, preds_test), 3)

        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        print('F1 Test {}'.format(F1))
        print('\n')

        plotTrain.append(train_accuracy)
        plotTest.append(test_accuracy)
        plotF1.append(F1)

    df = pd.DataFrame({
        "MAX_Features": plotX,
        "Train_Accuracy": plotTrain,
        "Test_Accuracy": plotTest,
        "F1_Accuracy": plotF1,
    })
    print(df)

    # The selection uses the rounded F1 values, so near-ties resolve to the
    # earliest candidate.
    maxAccuracy = max(plotF1)
    best_MAX_Features = plotX[plotF1.index(maxAccuracy)]
    print("The best MAX_Features is ", best_MAX_Features, "with F1 accuracy ", maxAccuracy)

    print("Drawing plot")
    plt.plot('MAX_Features', 'Train_Accuracy', data=df, color='red')
    plt.plot('MAX_Features', 'Test_Accuracy', data=df, color='blue')
    plt.plot('MAX_Features', 'F1_Accuracy', data=df, color='black')
    plt.legend()
    plt.savefig('random_forest_output_max_features.png')
    plt.close()

    return best_MAX_Features
def random_forest_various_trees(x_train, y_train, x_test, y_test):
    """Sweep ``n_trees`` from 10 to 200, tabulate and plot, return the best.

    A ``RandomForestClassifier`` (``max_depth=7``, ``max_features=11``) is
    fitted for each tree count in ``range(10, 210, 10)``. Train accuracy,
    test accuracy and test F1 are printed, gathered into a pandas table, and
    plotted to ``q2pb.png``.

    Args:
        x_train: Training features, passed to ``fit`` and ``predict``.
        y_train: Training labels.
        x_test: Test features, passed to ``predict``.
        y_test: Test labels.

    Returns:
        The ``n_trees`` value with the highest test F1 (the first such value
        when several tie).
    """
    # One list drives the loop, the table and the final lookup.
    tree_counts = list(range(10, 210, 10))

    train_scores = []
    test_scores = []
    f1_scores = []

    # let the user know which test this is
    print("== Beginning test for various n_trees.\n")

    # plot accuracies for the number of trees specified in part b
    for n_trees in tree_counts:
        print("n_trees: ", n_trees)
        rclf = RandomForestClassifier(max_depth=7, max_features=11, n_trees=n_trees)
        rclf.fit(x_train, y_train)

        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)

        # Compute each metric once; the test predictions are reused for F1
        # (assumes predict() is deterministic for a fitted model).
        train_accuracy = accuracy_score(preds_train, y_train)
        test_accuracy = accuracy_score(preds_test, y_test)
        test_f1 = f1(y_test, preds_test)

        train_scores.append(train_accuracy)
        test_scores.append(test_accuracy)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        print('F1 Test {}\n'.format(test_f1))
        f1_scores.append(test_f1)

    # table for easily reading data
    table = pd.DataFrame({
        "n_trees": tree_counts,
        "Train Accuracy": train_scores,
        "Test Accuracy": test_scores,
        "F1 Accuracy": f1_scores,
    })
    print(table)

    # Plot on figure number 2 and write it to a file. The figure is left
    # open, as in the original code.
    plt.figure(2)
    plt.xlabel('Number of trees')
    plt.ylabel('Performance')
    plt.title('Accuracy & F1 Score vs Number of Trees in the Forest')
    plt.plot('n_trees', 'Train Accuracy', data=table, color='blue')
    plt.plot('n_trees', 'Test Accuracy', data=table, color='green')
    plt.plot('n_trees', 'F1 Accuracy', data=table, color='red')
    plt.legend()
    plt.savefig('q2pb.png')

    # return our best n_trees value for use in main
    return tree_counts[f1_scores.index(max(f1_scores))]
def rf_tune_all(x_train, y_train, x_test, y_test, depth, features, trees):
    """Fit one forest with the given hyper-parameters and return its test F1.

    Prints the hyper-parameter triple, then the train accuracy, test accuracy
    and test F1, each rounded to three decimals.

    Args:
        x_train: Training features, passed to ``fit`` and ``predict``.
        y_train: Training labels.
        x_test: Test features, passed to ``predict``.
        y_test: Test labels.
        depth: Value forwarded as ``max_depth``.
        features: Value forwarded as ``max_features``.
        trees: Value forwarded as ``n_trees``.

    Returns:
        The test F1 score rounded to three decimals.
    """
    print('Random Forest tune 3 parameters\n\n')
    print("[depth,features,trees]")
    print([depth, features, trees])

    rclf = RandomForestClassifier(
        max_depth=depth, max_features=features, n_trees=trees
    )
    rclf.fit(x_train, y_train)

    preds_train = rclf.predict(x_train)
    preds_test = rclf.predict(x_test)

    train_accuracy = round(accuracy_score(preds_train, y_train), 3)
    test_accuracy = round(accuracy_score(preds_test, y_test), 3)
    # Reuse the test predictions instead of predicting x_test again;
    # assumes predict() is deterministic for a fitted model.
    F1 = round(f1(y_test, preds_test), 3)

    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    print('F1 Test {}'.format(F1))
    print('\n')

    return F1
def random_forest_seed_testing(x_train, y_train, x_test, y_test):
    """Refit one forest under fixed seeds 1, 11, ..., 91 and print a table.

    A single ``RandomForestClassifier`` (``max_depth=15``, ``max_features=8``,
    ``n_trees=210``) is created. For each seed its ``seed`` attribute is set
    and the model is refitted. Train accuracy, test accuracy and test F1 are
    rounded to three decimals and printed, then shown in a table whose last
    row holds the averages of the rounded values.

    Args:
        x_train: Training features, passed to ``fit`` and ``predict``.
        y_train: Training labels.
        x_test: Test features, passed to ``predict``.
        y_test: Test labels.
    """
    print('Random Forest seed testing\n\n')
    rclf = RandomForestClassifier(max_depth=15, max_features=8, n_trees=210)

    seeds = list(range(1, 100, 10))
    F1_result = []
    Train_result = []
    Test_result = []

    for seed in seeds:
        # NOTE(review): assumes fit() reads the ``seed`` attribute each time
        # it is called; confirm against RandomForestClassifier.
        rclf.seed = seed
        rclf.fit(x_train, y_train)

        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)

        train_accuracy = round(accuracy_score(preds_train, y_train), 3)
        test_accuracy = round(accuracy_score(preds_test, y_test), 3)
        # Reuse the test predictions instead of predicting x_test again;
        # assumes predict() is deterministic for a fitted model.
        F1 = round(f1(y_test, preds_test), 3)

        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        print('F1 Test {}'.format(F1))
        print('\n')

        F1_result.append(F1)
        Train_result.append(train_accuracy)
        Test_result.append(test_accuracy)

    # Keep the averages separate from the raw lists, so each list only ever
    # holds per-seed values, and add them as the final table row.
    average_f1 = sum(F1_result) / len(F1_result)
    average_train = sum(Train_result) / len(Train_result)
    average_test = sum(Test_result) / len(Test_result)

    df = pd.DataFrame({
        "Seed": seeds + ["Average"],
        "F1": F1_result + [average_f1],
        "Train": Train_result + [average_train],
        "Test": Test_result + [average_test],
    })
    print(df)
def random_forsest_random_seed(x_train, y_train, x_test, y_test, count):
    """Train ``count`` identical forests and plot run-to-run score variation.

    Each run builds a fresh ``RandomForestClassifier`` (``max_depth=7``,
    ``max_features=25``, ``n_trees=151``), fits it, and records accuracy and
    F1 on both splits. Figure 1 shows the accuracies and figure 2 the F1
    scores, both against the run index.

    NOTE(review): no seed is set explicitly here. Any variation between runs
    comes from whatever default seeding RandomForestClassifier uses; confirm
    that is the intent.

    Args:
        x_train: Training features, passed to ``fit`` and ``predict``.
        y_train: Training labels.
        x_test: Test features, passed to ``predict``.
        y_test: Test labels.
        count: Number of runs to perform.
    """
    # The heading used to say "Number of Trees", copied from another
    # experiment; it now names this one.
    print('#Random Forest Random Seed\n\n')

    run_indices = []
    accuracy_training = []
    accuracy_testing = []
    f1_training = []
    f1_testing = []

    for run in range(count):
        rclf = RandomForestClassifier(max_depth=7, max_features=25, n_trees=151)
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)

        run_indices.append(run)
        accuracy_training.append(accuracy_score(preds_train, y_train))
        accuracy_testing.append(accuracy_score(preds_test, y_test))
        f1_training.append(calc_f1(preds_train, y_train))
        f1_testing.append(calc_f1(preds_test, y_test))

    # Named accuracy_fig rather than f1, to avoid confusion with the F1
    # metric helper of that name used elsewhere in this file.
    accuracy_fig = plt.figure(1)
    plt.plot(run_indices, accuracy_training)
    plt.plot(run_indices, accuracy_testing)
    plt.title("Accuracy vs Seed")
    plt.ylabel("Accuracy")
    plt.xlabel("Seed Index")
    plt.legend(['Training Accuracy', 'Testing Accuracy'])
    accuracy_fig.show()

    plt.figure(2)
    plt.plot(run_indices, f1_training)
    plt.plot(run_indices, f1_testing)
    plt.title("F1 vs Seed")
    plt.ylabel("F1")
    plt.xlabel("Seed Index")
    plt.legend(['Training F1', 'Testing F1'])
    plt.show()
def random_forest_testing_max_features(x_train, y_train, x_test, y_test):
    """Sweep ``max_features`` and plot accuracy and F1 for both splits.

    A ``RandomForestClassifier`` (``max_depth=7``, ``n_trees=50``) is fitted
    for each candidate ``max_features`` value. Figure 1 shows train and test
    accuracy and figure 2 shows train and test F1, both against
    ``max_features``.

    Args:
        x_train: Training features, passed to ``fit`` and ``predict``.
        y_train: Training labels.
        x_test: Test features, passed to ``predict``.
        y_test: Test labels.
    """
    # The heading used to say "Number of Trees", copied from another
    # experiment; it now names the parameter actually being varied.
    print('#Random Forest Max Features\n\n')

    features = []
    accuracy_training = []
    accuracy_testing = []
    f1_training = []
    f1_testing = []

    for max_features in [1, 2, 5, 8, 10, 20, 25, 35, 50]:
        rclf = RandomForestClassifier(
            max_depth=7, max_features=max_features, n_trees=50
        )
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)

        features.append(max_features)
        accuracy_training.append(accuracy_score(preds_train, y_train))
        accuracy_testing.append(accuracy_score(preds_test, y_test))
        f1_training.append(calc_f1(preds_train, y_train))
        f1_testing.append(calc_f1(preds_test, y_test))

    # Named accuracy_fig rather than f1, to avoid confusion with the F1
    # metric helper of that name used elsewhere in this file.
    accuracy_fig = plt.figure(1)
    plt.plot(features, accuracy_training)
    plt.plot(features, accuracy_testing)
    plt.title("Accuracy vs Max Features")
    plt.ylabel("Accuracy")
    plt.xlabel("Max Features")
    plt.legend(['Training Accuracy', 'Testing Accuracy'])
    accuracy_fig.show()

    plt.figure(2)
    plt.plot(features, f1_training)
    plt.plot(features, f1_testing)
    plt.title("F1 vs Max Features")
    plt.ylabel("F1")
    plt.xlabel("Max Features")
    plt.legend(['Training F1', 'Testing F1'])
    plt.show()