# Imports this snippet relies on (assumed to appear at the top of the original
# script); get_training_accuracy is a project-local helper module, and
# CURSOR_UP_ONE / ERASE_LINE are presumably ANSI escape-code constants ('\x1b[1A',
# '\x1b[2K') defined alongside it for rewriting the progress line in place.
from operator import itemgetter

import pylab
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.tree import DecisionTreeClassifier as dtc

import get_training_accuracy


def run_random_forest(training_data, training_labels, validation_data, validation_labels,
                      best_max_depth=None, best_min_samples_leaf=None):
    """Sweep n_estimators from 1 to 50, plot training/validation accuracy, and return
    the best n_estimators value together with its validation accuracy (in percent)."""
    n_estimators_list = range(1, 51)
    training_accuracy_list = []
    validation_accuracy_list = []
    for this_n_estimator in n_estimators_list:
        print('Processing n estimator: ' + str(this_n_estimator) + '/' + str(len(n_estimators_list)))
        if best_max_depth is None:
            clf = rfc(n_estimators=this_n_estimator)
        else:
            clf = rfc(n_estimators=this_n_estimator, max_depth=best_max_depth,
                      min_samples_leaf=best_min_samples_leaf)
        (training_accuracy, validation_accuracy) = get_training_accuracy.run(clf, training_data,
                                                                             training_labels,
                                                                             validation_data,
                                                                             validation_labels)
        training_accuracy_list.append(training_accuracy)
        validation_accuracy_list.append(validation_accuracy)
        print(CURSOR_UP_ONE + ERASE_LINE + CURSOR_UP_ONE)
    
    # Plot data ------------------------------------------------------------------------------------
    training_accuracy_list = [training_accuracy*100 for training_accuracy
                              in training_accuracy_list]
    validation_accuracy_list = [validation_accuracy*100 for validation_accuracy 
                                in validation_accuracy_list]

    pylab.plot(n_estimators_list, training_accuracy_list)
    pylab.plot(n_estimators_list, validation_accuracy_list)
    
    pylab.xlabel('N Estimators')
    pylab.ylabel('Accuracy (%)')
    pylab.legend(['Training Accuracy', 'Validation Accuracy'], loc=2)
    pylab.grid(True)
    if best_max_depth is None:
        pylab.title('Training and Validation Accuracy as a Function of N Estimators')
        pylab.savefig("Accuracy_vs_N_Estimators.png")
    else:
        pylab.title('Training and Validation Accuracy as a Function of N Estimators with' +
                    ' Best Max Depth and Best Min Samples Leaf')
        pylab.savefig("Accuracy_vs_N_Estimators_modified.png")
    #pylab.show()
    pylab.clf()
    pylab.close()
    # End plot data --------------------------------------------------------------------------------

    (best_index, best_accuracy) = max(enumerate(validation_accuracy_list), key=itemgetter(1))
    best_n_estimator = n_estimators_list[best_index]
    return (best_n_estimator, best_accuracy)


def run_min_samples_leaf(training_data, training_labels, validation_data, validation_labels):
    """Sweep min_samples_leaf from 1 to 50 for an entropy decision tree, plot the
    accuracies, and return the value with the best validation accuracy."""
    min_samples_leaf_list = range(1, 51)
    
    training_accuracy_list = []
    validation_accuracy_list = []
    for this_min_samples_leaf in min_samples_leaf_list:
        print('Processing min samples leaf: ' + str(this_min_samples_leaf) + '/' +
                str(len(min_samples_leaf_list)))
        clf = dtc(criterion='entropy', min_samples_leaf=this_min_samples_leaf)
        (training_accuracy, validation_accuracy) = get_training_accuracy.run(clf, training_data,
                                                                             training_labels,
                                                                             validation_data,
                                                                             validation_labels)
        training_accuracy_list.append(training_accuracy)
        validation_accuracy_list.append(validation_accuracy)
        print(CURSOR_UP_ONE + ERASE_LINE + CURSOR_UP_ONE)
    
    # Plot data ------------------------------------------------------------------------------------
    training_accuracy_list = [training_accuracy*100 for training_accuracy
                              in training_accuracy_list]
    validation_accuracy_list = [validation_accuracy*100 for validation_accuracy 
                                in validation_accuracy_list]

    pylab.plot(min_samples_leaf_list, training_accuracy_list)
    pylab.plot(min_samples_leaf_list, validation_accuracy_list)
    
    pylab.xlabel('Min Samples Leaf')
    pylab.ylabel('Accuracy (%)')
    pylab.title('Training and Validation Accuracy as a Function of Min Samples Leaf')
    pylab.legend(['Training Accuracy', 'Validation Accuracy'], loc=2)
    pylab.grid(True)
    pylab.savefig("Accuracy_vs_Min_Samples_Leaf.png")
    #pylab.show()
    pylab.clf()
    pylab.close()
    # End plot data --------------------------------------------------------------------------------
    
    (best_index, best_accuracy) = max(enumerate(validation_accuracy_list), key=itemgetter(1))
    best_min_samples_leaf = min_samples_leaf_list[best_index]
    return best_min_samples_leaf
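

# Example usage (an illustrative sketch, not part of the original script). It assumes
# that preprocess_data.run_for_training_data(1) returns the 4-tuple
# (training_data, training_labels, validation_data, validation_labels); pass a tuned
# best_max_depth / best_min_samples_leaf to run_random_forest to reproduce the
# "modified settings" curve as well.
if __name__ == '__main__':
    import preprocess_data  # project-local preprocessing module, also used below

    (training_data, training_labels,
     validation_data, validation_labels) = preprocess_data.run_for_training_data(1)

    best_min_samples_leaf = run_min_samples_leaf(training_data, training_labels,
                                                 validation_data, validation_labels)
    (best_n_estimator, best_accuracy) = run_random_forest(training_data, training_labels,
                                                          validation_data, validation_labels)

    print('Best min_samples_leaf: ' + str(best_min_samples_leaf))
    print('Best n_estimators: ' + str(best_n_estimator) +
          ' with validation accuracy: ' + str(best_accuracy) + '%')
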
Example #3

# (This excerpt starts mid-script: the imports and the earlier sweeps that compute
#  best_max_depth, best_min_samples_leaf, best_n_estimator, best_n_estimator_accuracy,
#  best_n_estimator_modified and best_n_estimator_modified_accuracy are not shown.)
print('Optimal N Estimator with default settings was: ' + str(best_n_estimator) +
      ' with accuracy: ' + str(best_n_estimator_accuracy))
print('Optimal N Estimator with modified settings was: ' + str(best_n_estimator_modified) + 
      ' with accuracy: ' + str(best_n_estimator_modified_accuracy))

# Get Test error with best configuration of Decision Tree and Random Forest
(training_data, training_labels, _, _) = preprocess_data.run_for_training_data(1)
(test_data, test_labels) = preprocess_data.run_for_test_data()

# Align data: add the one-hot columns that exist in the training frame but are
# missing from the test frame, filling them with False so both frames share the
# same columns.
missing_headers = training_data.columns.difference(test_data.columns)
for missing_header in missing_headers:
    test_data[missing_header] = False
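
# An equivalent alternative (a sketch, not from the original script): reindexing the
# test frame against the training columns adds the missing dummy columns filled with
# False and also puts the columns in the same order as in training:
#     test_data = test_data.reindex(columns=training_data.columns, fill_value=False)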

# Decision Tree
clf = dtc(criterion='entropy', max_depth=best_max_depth, min_samples_leaf=best_min_samples_leaf)
(_, test_accuracy_dt) = get_training_accuracy.run(clf, training_data, training_labels,
                                                  test_data, test_labels)

# Random Forest
clf = rfc(n_estimators=best_n_estimator_modified, max_depth=best_max_depth,
          min_samples_leaf=best_min_samples_leaf)
(_, test_accuracy_rf) = get_training_accuracy.run(clf, training_data, training_labels,
                                                  test_data, test_labels)

print('Test accuracy for Decision Tree: ' + str(test_accuracy_dt))
print('Test accuracy for Random Forest: ' + str(test_accuracy_rf))

print('\n=========================================================================================')
print('Script complete')