Пример #1
0
def ft_random_forest_testing(x_train, y_train, x_test, y_test):
    """Sweep ``max_features`` for a random forest and plot the scores.

    For every candidate ``max_features`` value a forest with ``max_depth=7``
    and ``n_trees=50`` is fitted on the training split.  Training accuracy,
    testing accuracy and the testing F1 score are printed per candidate and
    finally plotted together into ``RandomForestFeatures.png``.

    Args:
        x_train: Training samples handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Testing samples.
        y_test: Testing labels.

    Returns:
        None. Results are printed and written to the PNG file.
    """
    print('Random Forest Feature Loop\n\n')

    # Single source of truth for the sweep: the same values drive both the
    # training loop and the x axis, so the two can never drift apart.
    feature_counts = [1, 2, 5, 8, 10, 20, 25, 35, 50]
    train_list = []
    test_list = []
    F1_list = []

    for max_features in feature_counts:
        rclf = RandomForestClassifier(max_depth=7,
                                      max_features=max_features,
                                      n_trees=50)
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)
        train_accuracy = accuracy_score(preds_train, y_train)
        test_accuracy = accuracy_score(preds_test, y_test)
        # Reuse the test predictions instead of predicting a second time, and
        # compute the F1 score once for both the printout and the plot.
        test_f1 = f1(y_test, preds_test)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        print('F1 Test {}'.format(test_f1))

        # Keep the per-candidate numbers for plotting.
        train_list.append(train_accuracy)
        test_list.append(test_accuracy)
        F1_list.append(test_f1)

    plt.rcParams['font.family'] = ['serif']
    ax = plt.subplot(111)
    ax.plot(feature_counts, train_list, label='training')
    ax.plot(feature_counts, test_list, label='testing')
    ax.plot(feature_counts, F1_list, label='F1')
    plt.xlabel("max_features")
    plt.xticks(feature_counts)
    plt.ylabel("Accuracies")
    ax.legend()
    plt.savefig("RandomForestFeatures.png")
    # Clear the current figure so later plots do not inherit these lines.
    plt.clf()
Пример #2
0
def random_forest_various_features(x_train, y_train, x_test, y_test):
    """Evaluate several ``max_features`` values and return the best one.

    A forest with ``max_depth=7`` and ``n_trees=50`` is fitted for each
    candidate.  Training accuracy, testing accuracy and testing F1 are
    printed, tabulated in a DataFrame and plotted to ``q2pd.png``.

    Args:
        x_train: Training samples handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Testing samples.
        y_test: Testing labels.

    Returns:
        The candidate ``max_features`` value with the highest testing F1
        score (the first such value if several tie).
    """
    # Candidate values for max_features.
    use_features = [1, 2, 5, 8, 10, 20, 25, 35, 50]

    train_scores = []
    test_scores = []
    f1_scores = []

    # let the user know which test this is
    print("== Beginning test for various max_features.\n")

    for features in use_features:
        print("max_features: ", features)
        rclf = RandomForestClassifier(max_depth=7,
                                      max_features=features,
                                      n_trees=50)
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)

        # Compute every metric exactly once and reuse it for both the
        # printout and the collected series.
        train_accuracy = accuracy_score(preds_train, y_train)
        test_accuracy = accuracy_score(preds_test, y_test)
        test_f1 = f1(y_test, preds_test)

        train_scores.append(train_accuracy)
        test_scores.append(test_accuracy)
        f1_scores.append(test_f1)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        print('F1 Test {}\n'.format(test_f1))

    # print lengths for debugging
    print("== Length of Train", len(train_scores))
    print("== Length of Test", len(test_scores))
    print("== Length of F1", len(f1_scores))

    # table for easily reading data
    table2 = pd.DataFrame({
        "max_features": use_features,
        "Train Accuracy": train_scores,
        "Test Accuracy": test_scores,
        "F1 Accuracy": f1_scores
    })
    print(table2)

    # Plot on figure number 3 and write the result to a file.
    plt.figure(3)
    plt.xlabel('Max Features')
    plt.ylabel('Performance')
    plt.title('Accuracy & F1 Score vs Max Features')
    plt.plot('max_features', 'Train Accuracy', data=table2, color='blue')
    plt.plot('max_features', 'Test Accuracy', data=table2, color='green')
    plt.plot('max_features', 'F1 Accuracy', data=table2, color='red')
    plt.legend()
    plt.savefig('q2pd.png')

    # return best value for max_features to use in main
    return use_features[f1_scores.index(max(f1_scores))]
Пример #3
0
def random_forest_testing(x_train, y_train, x_test, y_test, feat, tree):
    """Fit one random forest and report accuracy and F1 on both splits.

    The forest uses ``max_depth=7`` together with the supplied
    ``max_features`` and ``n_trees`` settings.  Training accuracy, testing
    accuracy and testing F1 are printed.

    Args:
        x_train: Training samples handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Testing samples.
        y_test: Testing labels.
        feat: Value forwarded as ``max_features``.
        tree: Value forwarded as ``n_trees``.

    Returns:
        Tuple ``(train_accuracy, test_accuracy, train_f1, test_f1)``.
    """
    print('Random Forest\n\n')
    rclf = RandomForestClassifier(max_depth=7, max_features=feat, n_trees=tree)
    rclf.fit(x_train, y_train)

    # Predict each split once; the same predictions feed accuracy and F1.
    preds_train = rclf.predict(x_train)
    preds_test = rclf.predict(x_test)

    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))

    test_f1 = f1(y_test, preds_test)
    print('F1 Test {}'.format(test_f1))
    return train_accuracy, test_accuracy, f1(y_train, preds_train), test_f1
Пример #4
0
def random_forest_various_seeds(x_train, y_train, x_test, y_test,
                                best_max_features, best_n_trees):
    """Refit the tuned forest under ten random seeds and tabulate the scores.

    One classifier (``max_depth=7`` plus the supplied hyper-parameters) is
    created and refitted ten times.  Before each fit its ``seed`` attribute
    is set to a fresh integer drawn from ``np.random.randint(1, 1000)``.
    Per-run training accuracy, testing accuracy and testing F1 are printed,
    and a final table with one row per seed plus an ``"Average"`` row is
    printed.

    NOTE(review): this assumes the classifier reads ``seed`` during ``fit``
    and that ``fit`` rebuilds the forest from scratch -- confirm against the
    classifier implementation.

    Args:
        x_train: Training samples handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Testing samples.
        y_test: Testing labels.
        best_max_features: Value forwarded as ``max_features``.
        best_n_trees: Value forwarded as ``n_trees``.

    Returns:
        None. Results are printed only.
    """
    # let the user know which test this is
    print("== Beginning test for best result with random seeds.\n")

    used_seeds = []
    train_scores = []
    test_scores = []
    f1_scores = []

    rclf = RandomForestClassifier(max_depth=7,
                                  max_features=best_max_features,
                                  n_trees=best_n_trees)

    for _ in range(10):
        rclf.seed = np.random.randint(1, 1000)
        used_seeds.append(rclf.seed)
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)

        # Each metric is computed once and reused for storage and printing.
        train_accuracy = accuracy_score(preds_train, y_train)
        test_accuracy = accuracy_score(preds_test, y_test)
        test_f1 = f1(y_test, preds_test)

        train_scores.append(train_accuracy)
        test_scores.append(test_accuracy)
        f1_scores.append(test_f1)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        print('F1 Test {}\n'.format(test_f1))

    # Table of per-seed results with the column means appended as a last row.
    table3 = pd.DataFrame({
        "Seed": used_seeds + ["Average"],
        "Train Accuracy": train_scores + [sum(train_scores) / len(train_scores)],
        "Test Accuracy": test_scores + [sum(test_scores) / len(test_scores)],
        "F1 Score": f1_scores + [sum(f1_scores) / len(f1_scores)]
    })
    print(table3)
Пример #5
0
def random_forest_testing(x_train, y_train, x_test, y_test, n_trees, max_features):
    """Fit one random forest and report accuracy and F1 on both splits.

    The classifier is built as
    ``RandomForestClassifier(n_trees, max_features, max_depth=7)``; the first
    two arguments are passed positionally, so their meaning depends on the
    classifier's own parameter order.

    Args:
        x_train: Training samples handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Testing samples.
        y_test: Testing labels.
        n_trees: First positional constructor argument; formatted with
            ``%d`` in the banner, so it must be numeric.
        max_features: Second positional constructor argument; also formatted
            with ``%d``.

    Returns:
        Tuple ``(train_accuracy, test_accuracy, train_f1, test_f1)``.
    """
    print('Random Forest')
    print("max_depth: %d, max_features: %d, n_trees: %d" % (7,max_features, n_trees))
    rclf = RandomForestClassifier(n_trees, max_features, max_depth=7)
    rclf.fit(x_train, y_train)

    # Predict each split once; the predictions are shared by all metrics.
    preds_train = rclf.predict(x_train)
    preds_test = rclf.predict(x_test)

    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))

    # Compute each F1 score once and reuse it for printing and returning.
    train_f1 = f1(y_train, preds_train)
    test_f1 = f1(y_test, preds_test)
    print('F1 Train {}'.format(train_f1))
    print('F1 Test {}\n'.format(test_f1))

    return train_accuracy, test_accuracy, train_f1, test_f1
Пример #6
0
def random_forest_tune_MaxFeatures(x_train, y_train, x_test, y_test):
    """Tune ``max_features`` by testing F1 and plot the sweep.

    A forest with ``max_depth=7`` and ``n_trees=50`` is fitted for every
    candidate.  Training accuracy, testing accuracy and testing F1 (each
    rounded to three decimals) are printed, tabulated and plotted to
    ``random_forest_output_max_features.png``.

    Args:
        x_train: Training samples handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Testing samples.
        y_test: Testing labels.

    Returns:
        The candidate with the highest rounded testing F1 score (the first
        such candidate if several tie).
    """
    print('Random Forest tune\n\n')
    candidates = [1, 2, 5, 8, 10, 20, 25, 35, 50]
    train_scores = []
    test_scores = []
    f1_scores = []

    for max_features in candidates:
        print("MAX_Features: ", max_features)
        rclf = RandomForestClassifier(max_depth=7,
                                      max_features=max_features,
                                      n_trees=50)
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)
        train_accuracy = round(accuracy_score(preds_train, y_train), 3)
        test_accuracy = round(accuracy_score(preds_test, y_test), 3)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        # The test predictions are reused here rather than recomputed.
        test_f1 = round(f1(y_test, preds_test), 3)
        print('F1 Test {}'.format(test_f1))
        print('\n')
        train_scores.append(train_accuracy)
        test_scores.append(test_accuracy)
        f1_scores.append(test_f1)

    df = pd.DataFrame({
        "MAX_Features": candidates,
        "Train_Accuracy": train_scores,
        "Test_Accuracy": test_scores,
        "F1_Accuracy": f1_scores
    })
    print(df)

    # Selection is done on the rounded F1 values, first maximum wins.
    best_f1 = max(f1_scores)
    best_MAX_Features = candidates[f1_scores.index(best_f1)]
    print("The best MAX_Features is ", best_MAX_Features, "with F1 accuracy ",
          best_f1)

    print("Drawing plot")
    plt.plot('MAX_Features', 'Train_Accuracy', data=df, color='red')
    plt.plot('MAX_Features', 'Test_Accuracy', data=df, color='blue')
    plt.plot('MAX_Features', 'F1_Accuracy', data=df, color='black')
    plt.legend()
    plt.savefig('random_forest_output_max_features.png')
    plt.close()
    return best_MAX_Features
Пример #7
0
def random_forest_various_trees(x_train, y_train, x_test, y_test):
    """Evaluate forests of 10, 20, ..., 200 trees and return the best size.

    Every forest uses ``max_depth=7`` and ``max_features=11``.  Training
    accuracy, testing accuracy and testing F1 are printed, tabulated in a
    DataFrame and plotted to ``q2pb.png``.

    Args:
        x_train: Training samples handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Testing samples.
        y_test: Testing labels.

    Returns:
        The ``n_trees`` value with the highest testing F1 score (the first
        such value if several tie).
    """
    # One definition of the sweep, shared by the loop, the table and the
    # final lookup, so they cannot fall out of sync.
    tree_counts = list(range(10, 210, 10))
    train_scores = []
    test_scores = []
    f1_scores = []

    # let the user know which test this is
    print("== Beginning test for various n_trees.\n")

    for n_trees in tree_counts:
        print("n_trees: ", n_trees)
        rclf = RandomForestClassifier(max_depth=7, max_features=11,
                                      n_trees=n_trees)
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)

        # Compute every metric once; reuse it for storage and printing.
        train_accuracy = accuracy_score(preds_train, y_train)
        test_accuracy = accuracy_score(preds_test, y_test)
        test_f1 = f1(y_test, preds_test)

        train_scores.append(train_accuracy)
        test_scores.append(test_accuracy)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        print('F1 Test {}\n'.format(test_f1))
        f1_scores.append(test_f1)

    # table for easily reading data
    table = pd.DataFrame({
        "n_trees": tree_counts,
        "Train Accuracy": train_scores,
        "Test Accuracy": test_scores,
        "F1 Accuracy": f1_scores
    })
    print(table)

    # Plot on figure number 2 and write the result to a file.
    plt.figure(2)
    plt.xlabel('Number of trees')
    plt.ylabel('Performance')
    plt.title('Accuracy & F1 Score vs Number of Trees in the Forest')
    plt.plot('n_trees', 'Train Accuracy', data=table, color='blue')
    plt.plot('n_trees', 'Test Accuracy', data=table, color='green')
    plt.plot('n_trees', 'F1 Accuracy', data=table, color='red')
    plt.legend()
    plt.savefig('q2pb.png')

    # return our best n_trees value for use in main
    return tree_counts[f1_scores.index(max(f1_scores))]
Пример #8
0
def rf_tune_all(x_train, y_train, x_test, y_test, depth, features, trees):
    """Fit one forest with the given hyper-parameters and return its test F1.

    Training accuracy, testing accuracy and testing F1 are each rounded to
    three decimals and printed.

    Args:
        x_train: Training samples handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Testing samples.
        y_test: Testing labels.
        depth: Value forwarded as ``max_depth``.
        features: Value forwarded as ``max_features``.
        trees: Value forwarded as ``n_trees``.

    Returns:
        The testing F1 score rounded to three decimals.
    """
    print('Random Forest tune 3 parameters\n\n')
    print("[depth,features,trees]")
    print([depth, features, trees])
    rclf = RandomForestClassifier(max_depth=depth,
                                  max_features=features,
                                  n_trees=trees)
    rclf.fit(x_train, y_train)
    preds_train = rclf.predict(x_train)
    preds_test = rclf.predict(x_test)
    train_accuracy = round(accuracy_score(preds_train, y_train), 3)
    test_accuracy = round(accuracy_score(preds_test, y_test), 3)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    # Reuse the test predictions for F1 instead of predicting again.
    test_f1 = round(f1(y_test, preds_test), 3)
    print('F1 Test {}'.format(test_f1))
    print('\n')
    return test_f1
Пример #9
0
def random_forest_seed_testing(x_train, y_train, x_test, y_test):
    """Refit a fixed forest under seeds 1, 11, ..., 91 and tabulate scores.

    One classifier (``max_depth=15``, ``max_features=8``, ``n_trees=210``) is
    created and refitted once per seed after assigning the seed to its
    ``seed`` attribute.  Training accuracy, testing accuracy and testing F1
    are rounded to three decimals, printed per run, and finally printed as a
    table with an extra ``"Average"`` row holding the mean of the rounded
    values.

    NOTE(review): this assumes the classifier reads ``seed`` during ``fit``
    and that ``fit`` rebuilds the forest from scratch -- confirm against the
    classifier implementation.

    Args:
        x_train: Training samples handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Testing samples.
        y_test: Testing labels.

    Returns:
        None. Results are printed only.
    """
    print('Random Forest seed testing\n\n')

    rclf = RandomForestClassifier(max_depth=15, max_features=8, n_trees=210)

    seeds = list(range(1, 100, 10))
    f1_scores = []
    train_scores = []
    test_scores = []

    for seed in seeds:
        rclf.seed = seed
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)
        train_accuracy = round(accuracy_score(preds_train, y_train), 3)
        test_accuracy = round(accuracy_score(preds_test, y_test), 3)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        # Reuse the test predictions for F1 instead of predicting again.
        test_f1 = round(f1(y_test, preds_test), 3)
        print('F1 Test {}'.format(test_f1))
        print('\n')
        f1_scores.append(test_f1)
        train_scores.append(train_accuracy)
        test_scores.append(test_accuracy)

    # Append the column means as a final "Average" row without mutating the
    # per-seed result lists.
    df = pd.DataFrame({
        "Seed": seeds + ["Average"],
        "F1": f1_scores + [sum(f1_scores) / len(f1_scores)],
        "Train": train_scores + [sum(train_scores) / len(train_scores)],
        "Test": test_scores + [sum(test_scores) / len(test_scores)]
    })
    print(df)
Пример #10
0
def random_forsest_random_seed(x_train, y_train, x_test, y_test, count):
    """Refit the same forest configuration ``count`` times and plot the spread.

    Each run builds a new classifier with ``max_depth=7``,
    ``max_features=25`` and ``n_trees=151``.  Accuracy and F1 on both splits
    are collected per run and shown in two figures (accuracy on figure 1,
    F1 on figure 2).

    NOTE(review): no seed is set explicitly here; run-to-run variation relies
    on whatever randomness the classifier uses internally -- confirm against
    the classifier implementation.

    Args:
        x_train: Training samples handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Testing samples.
        y_test: Testing labels.
        count: Number of runs; used as the argument to ``range``.

    Returns:
        None. Results are shown as matplotlib figures.
    """
    # Banner corrected: this is the repeated-run (seed) experiment, not the
    # number-of-trees sweep the original message named.
    print('#Random Forest Random Seed\n\n')
    accuracy_training = []
    accuracy_testing = []
    f1_testing = []
    f1_training = []
    run_indices = []

    for run in range(0, count):
        rclf = RandomForestClassifier(max_depth=7,
                                      max_features=25,
                                      n_trees=151)
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)
        run_indices.append(run)
        accuracy_training.append(accuracy_score(preds_train, y_train))
        accuracy_testing.append(accuracy_score(preds_test, y_test))
        f1_training.append(calc_f1(preds_train, y_train))
        f1_testing.append(calc_f1(preds_test, y_test))

    # Named so the figure handle is not mistaken for an F1 metric.
    accuracy_fig = plt.figure(1)
    plt.plot(run_indices, accuracy_training)
    plt.plot(run_indices, accuracy_testing)
    plt.title("Accuracy vs Seed")
    plt.ylabel("Accuracy")
    plt.xlabel("Seed Index")
    plt.legend(['Training Accuracy', 'Testing Accuracy'])
    accuracy_fig.show()

    plt.figure(2)
    plt.plot(run_indices, f1_training)
    plt.plot(run_indices, f1_testing)
    plt.title("F1 vs Seed")
    plt.ylabel("F1")
    plt.xlabel("Seed Index")
    plt.legend(['Training F1', 'Testing F1'])
    plt.show()
Пример #11
0
def random_forest_testing_max_features(x_train, y_train, x_test, y_test):
    """Sweep ``max_features`` and plot accuracy and F1 for both splits.

    A forest with ``max_depth=7`` and ``n_trees=50`` is fitted for each
    candidate value.  Accuracy on both splits is drawn on figure 1 and F1 on
    both splits on figure 2.

    Args:
        x_train: Training samples handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Testing samples.
        y_test: Testing labels.

    Returns:
        None. Results are shown as matplotlib figures.
    """
    # Banner corrected: this is the max_features sweep, not the
    # number-of-trees sweep the original message named.
    print('#Random Forest Max Features\n\n')
    feature_counts = [1, 2, 5, 8, 10, 20, 25, 35, 50]
    accuracy_training = []
    accuracy_testing = []
    f1_testing = []
    f1_training = []

    for max_features in feature_counts:
        rclf = RandomForestClassifier(max_depth=7,
                                      max_features=max_features,
                                      n_trees=50)
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)
        accuracy_training.append(accuracy_score(preds_train, y_train))
        accuracy_testing.append(accuracy_score(preds_test, y_test))
        f1_training.append(calc_f1(preds_train, y_train))
        f1_testing.append(calc_f1(preds_test, y_test))

    # Named so the figure handle is not mistaken for an F1 metric.
    accuracy_fig = plt.figure(1)
    plt.plot(feature_counts, accuracy_training)
    plt.plot(feature_counts, accuracy_testing)
    plt.title("Accuracy vs Max Features")
    plt.ylabel("Accuracy")
    plt.xlabel("Max Features")
    plt.legend(['Training Accuracy', 'Testing Accuracy'])
    accuracy_fig.show()

    plt.figure(2)
    plt.plot(feature_counts, f1_training)
    plt.plot(feature_counts, f1_testing)
    plt.title("F1 vs Max Features")
    plt.ylabel("F1")
    plt.xlabel("Max Features")
    plt.legend(['Training F1', 'Testing F1'])
    plt.show()