def run_random_forest(training_data, training_labels, validation_data, validation_labels,
                      best_max_depth=None, best_min_samples_leaf=None):
    """Sweep the Random Forest ``n_estimators`` hyperparameter over 1..50.

    Trains one RandomForestClassifier per candidate value, records training and
    validation accuracy, saves both curves to a PNG, and returns the candidate
    with the highest validation accuracy.

    Parameters
    ----------
    training_data, training_labels : training-set features and targets.
    validation_data, validation_labels : validation-set features and targets.
    best_max_depth, best_min_samples_leaf : optional pre-tuned tree parameters.
        When omitted (``None``; the legacy sentinel ``[]`` is also accepted for
        backward compatibility) the classifier uses its default depth/leaf
        settings, and the plot title/filename reflect the default run.

    Returns
    -------
    tuple
        ``(best_n_estimator, best_accuracy)`` — the winning ``n_estimators``
        value and its validation accuracy as a percentage (0-100).
    """
    # NOTE(review): the original signature used mutable default arguments
    # (best_max_depth=[], best_min_samples_leaf=[]); replaced with None
    # sentinels.  [] is still treated as "not provided" so existing callers
    # that pass [] explicitly keep working.
    use_defaults = best_max_depth is None or best_max_depth == []

    n_estimators_list = range(1, 51)
    training_accuracy_list = []
    validation_accuracy_list = []
    for this_n_estimator in n_estimators_list:
        print('Processing n estimator: ' + str(this_n_estimator) + '/' + str(len(n_estimators_list)))
        if use_defaults:
            clf = rfc(n_estimators=this_n_estimator)
        else:
            clf = rfc(n_estimators=this_n_estimator, max_depth=best_max_depth,
                      min_samples_leaf=best_min_samples_leaf)
        (training_accuracy, validation_accuracy) = get_training_accuracy.run(
            clf, training_data, training_labels, validation_data, validation_labels)
        training_accuracy_list.append(training_accuracy)
        validation_accuracy_list.append(validation_accuracy)
        # Rewind/erase the console line so the next progress message
        # overwrites the previous one in place.
        print(CURSOR_UP_ONE + ERASE_LINE + CURSOR_UP_ONE)

    # Plot data ------------------------------------------------------------------------------------
    # Convert fractional accuracies to percentages for plotting and the return value.
    training_accuracy_list = [training_accuracy * 100 for training_accuracy in training_accuracy_list]
    validation_accuracy_list = [validation_accuracy * 100 for validation_accuracy in validation_accuracy_list]
    pylab.plot(n_estimators_list, training_accuracy_list)
    pylab.plot(n_estimators_list, validation_accuracy_list)
    pylab.xlabel('N Estimators')
    pylab.ylabel('Accuracy (% out of 100)')
    pylab.legend(['Training Accuracy', 'Validation Accuracy'], loc=2)
    pylab.grid(True)
    if use_defaults:
        pylab.title('Training and Validation Accuracy as function of N Estimators')
        pylab.savefig("Accuracy_vs_N_Estimators.png")
    else:
        pylab.title('Training and Validation Accuracy as function of N Estimators With' +
                    ' Best Max Depth and Best Min Sample Leaf')
        pylab.savefig("Accuracy_vs_N_Estimators_modified.png")
    #pylab.show()
    pylab.close()
    pylab.clf()
    # End plot data --------------------------------------------------------------------------------

    (best_index, best_accuracy) = max(enumerate(validation_accuracy_list), key=itemgetter(1))
    best_n_estimator = n_estimators_list[best_index]
    return (best_n_estimator, best_accuracy)
def run_min_samples_leaf(training_data, training_labels, validation_data, validation_labels):
    """Sweep ``min_samples_leaf`` over 1..50 for an entropy Decision Tree.

    Fits one DecisionTreeClassifier per candidate, saves the training and
    validation accuracy curves to ``Accuracy_vs_Min_Samples_Leaf.png``, and
    returns the candidate value with the highest validation accuracy.

    Parameters
    ----------
    training_data, training_labels : training-set features and targets.
    validation_data, validation_labels : validation-set features and targets.

    Returns
    -------
    int
        The ``min_samples_leaf`` value that maximized validation accuracy.
    """
    candidates = range(1, 51)
    total = len(candidates)
    train_scores = []
    valid_scores = []
    for leaf_size in candidates:
        print('Processing min samples leaf: ' + str(leaf_size) + '/' + str(total))
        model = dtc(criterion='entropy', min_samples_leaf=leaf_size)
        train_acc, valid_acc = get_training_accuracy.run(
            model, training_data, training_labels, validation_data, validation_labels)
        train_scores.append(train_acc)
        valid_scores.append(valid_acc)
        # Rewind the console cursor so the progress line updates in place.
        print(CURSOR_UP_ONE + ERASE_LINE + CURSOR_UP_ONE)

    # Plot data ------------------------------------------------------------------------------------
    # Scale fractional accuracies up to percentages before plotting.
    train_scores = [score * 100 for score in train_scores]
    valid_scores = [score * 100 for score in valid_scores]
    pylab.plot(candidates, train_scores)
    pylab.plot(candidates, valid_scores)
    pylab.xlabel('Min Samples Leaf')
    pylab.ylabel('Accuracy (% out of 100)')
    pylab.title('Training and Validation Accuracy as function of Min Samples Leaf')
    pylab.legend(['Training Accuracy', 'Validation Accuracy'], loc=2)
    pylab.grid(True)
    pylab.savefig("Accuracy_vs_Min_Samples_Leaf.png")
    #pylab.show()
    pylab.close()
    pylab.clf()
    # End plot data --------------------------------------------------------------------------------

    best_index, _ = max(enumerate(valid_scores), key=itemgetter(1))
    return candidates[best_index]
# Report the tuning results computed earlier in the script.
print('Optimal N Estimator with default settings was: ' + str(best_n_estimator) +
      ' with accuracy: ' + str(best_n_estimator_accuracy))
print('Optimal N Estimator with modified settings was: ' + str(best_n_estimator_modified) +
      ' with accuracy: ' + str(best_n_estimator_modified_accuracy))

# Get Test error with best configuration of Decision Tree and Random Forest
(training_data, training_labels, _, _) = preprocess_data.run_for_training_data(1)
(test_data, test_labels) = preprocess_data.run_for_test_data()

# Align data: the test set may lack columns that exist in the training set
# (e.g. one-hot categories never seen at test time).
# BUG FIX: pandas Index set-difference is `difference`, not `diff`.
missing_headers = training_data.columns.difference(test_data.columns)
# BUG FIX: the original assigned `training_data[missing_headers].applymap(lambda x: False)`,
# a frame carrying the *training* index — pandas index alignment then fills
# NaN for any test row not present in the training index.  The intent is
# simply to add the missing columns filled with False; do that per column
# (robust across pandas versions for brand-new columns).
for column in missing_headers:
    test_data[column] = False

# Decision Tree: test accuracy with the tuned depth/leaf parameters.
clf = dtc(criterion='entropy', max_depth=best_max_depth, min_samples_leaf=best_min_samples_leaf)
(_, test_accuracy_dt) = get_training_accuracy.run(clf, training_data, training_labels, test_data, test_labels)

# Random Forest: test accuracy with the tuned n_estimators/depth/leaf parameters.
clf = rfc(n_estimators=best_n_estimator_modified, max_depth=best_max_depth,
          min_samples_leaf=best_min_samples_leaf)
(_, test_accuracy_rf) = get_training_accuracy.run(clf, training_data, training_labels, test_data, test_labels)

print('Test accuracy for Decision Tree: ' + str(test_accuracy_dt))
print('Test accuracy for Random Forest: ' + str(test_accuracy_rf))

print('\n=========================================================================================')
print('Script complete')