def run_glass(filename, target_class, class_wanted, glass_names):
    """Run naive Bayes and logistic regression on each of five glass folds.

    The data is loaded through ``glass.Glass.setup_data_glass`` and split
    with ``ff.FiveFold.five_fold_sort_class`` using ``target_class`` as the
    sort key.  Every fold is then handed to ``nb_glass`` and ``perform_lr``.

    Args:
        filename: Forwarded to ``setup_data_glass``.
        target_class: Forwarded to ``setup_data_glass`` and ``nb_glass``;
            also used as the ``sortby`` argument of the fold split.
        class_wanted: Forwarded to ``setup_data_glass``.
        glass_names: Forwarded to ``setup_data_glass``.

    Returns:
        A ``(nb_perf, lr_perf)`` pair of five-element lists holding, in fold
        order, the results of ``nb_glass`` and of ``perform_lr``.
    """
    # Setup data
    glass_obj = glass.Glass()
    glass_data = glass_obj.setup_data_glass(filename=filename,
                                            target_class=target_class,
                                            class_wanted=class_wanted,
                                            glass_names=glass_names)

    # Setup five fold cross validation (exactly five folds are expected)
    five_fold = ff.FiveFold()
    fold1, fold2, fold3, fold4, fold5 = five_fold.five_fold_sort_class(
        data=glass_data, sortby=target_class)
    folds = (fold1, fold2, fold3, fold4, fold5)

    # Naive Bayes must see every fold, not only the first one, so that the
    # five entries are independent per-fold results.
    nb_perf = [
        nb_glass(glass_data=fold, target_class=target_class)
        for fold in folds
    ]
    lr_perf = [perform_lr(fold) for fold in folds]

    return nb_perf, lr_perf
def run_votes(filename, target_class, class_wanted, vote_names):
    """Run naive Bayes and logistic regression on each of five votes folds.

    The data is loaded through ``votes.HouseVotes.setup_data_votes`` and
    split with ``ff.FiveFold.five_fold_sort_class`` using ``target_class`` as
    the sort key.  Every fold is then handed to ``nb_votes`` and
    ``perform_lr``.

    Args:
        filename: Forwarded to ``setup_data_votes``.
        target_class: Forwarded to ``setup_data_votes`` and ``nb_votes``;
            also used as the ``sortby`` argument of the fold split.
        class_wanted: Forwarded to ``setup_data_votes``.
        vote_names: Forwarded to ``setup_data_votes``.

    Returns:
        A ``(nb_perf, lr_perf)`` pair of five-element lists holding, in fold
        order, the results of ``nb_votes`` and of ``perform_lr``.
    """
    # Setup data
    votes_obj = votes.HouseVotes()
    votes_data = votes_obj.setup_data_votes(filename=filename,
                                            target_class=target_class,
                                            class_wanted=class_wanted,
                                            vote_names=vote_names)

    # Setup five fold cross validation (exactly five folds are expected)
    five_fold = ff.FiveFold()
    fold1, fold2, fold3, fold4, fold5 = five_fold.five_fold_sort_class(
        data=votes_data, sortby=target_class)
    folds = (fold1, fold2, fold3, fold4, fold5)

    # Naive Bayes must see every fold, not only the first one, so that the
    # five entries are independent per-fold results.
    nb_perf = [
        nb_votes(votes_data=fold, target_class=target_class)
        for fold in folds
    ]
    lr_perf = [perform_lr(fold) for fold in folds]

    return nb_perf, lr_perf
def run_bc(filename, target_class, class_wanted, bc_names):
    """Run naive Bayes and logistic regression on each of five breast cancer folds.

    The data is loaded through ``bc.BreastCancer.setup_data_bc`` and split
    with ``ff.FiveFold.five_fold_sort_class`` using ``target_class`` as the
    sort key.  Every fold is then handed to ``nb_bc`` and ``perform_lr``.

    Args:
        filename: Forwarded to ``setup_data_bc``.
        target_class: Forwarded to ``setup_data_bc`` and ``nb_bc``; also
            used as the ``sortby`` argument of the fold split.
        class_wanted: Forwarded to ``setup_data_bc``.
        bc_names: Forwarded to ``setup_data_bc``.

    Returns:
        A ``(nb_perf, lr_perf)`` pair of five-element lists holding, in fold
        order, the results of ``nb_bc`` and of ``perform_lr``.
    """
    # Setup data
    bc_obj = bc.BreastCancer()
    bc_data = bc_obj.setup_data_bc(filename=filename,
                                   target_class=target_class,
                                   class_wanted=class_wanted,
                                   bc_names=bc_names)

    # Setup five fold cross validation (exactly five folds are expected)
    five_fold = ff.FiveFold()
    fold1, fold2, fold3, fold4, fold5 = five_fold.five_fold_sort_class(
        data=bc_data, sortby=target_class)
    folds = (fold1, fold2, fold3, fold4, fold5)

    # Naive Bayes must see every fold, not only the first one, so that the
    # five entries are independent per-fold results.
    nb_perf = [
        nb_bc(bc_data=fold, target_class=target_class)
        for fold in folds
    ]
    lr_perf = [perform_lr(fold) for fold in folds]

    return nb_perf, lr_perf
def run_iris(filename, target_class, class_wanted, iris_names):
    """Run naive Bayes and logistic regression on each of five iris folds.

    The data is loaded through ``iris.Iris.setup_data_iris`` and split with
    ``ff.FiveFold.five_fold_sort_class`` using ``target_class`` as the sort
    key.  Every fold is then handed to ``nb_iris`` and ``perform_lr``.

    Args:
        filename: Forwarded to ``setup_data_iris``.
        target_class: Forwarded to ``setup_data_iris`` and ``nb_iris``; also
            used as the ``sortby`` argument of the fold split.
        class_wanted: Forwarded to ``setup_data_iris``.
        iris_names: Forwarded to ``setup_data_iris``.

    Returns:
        A ``(nb_perf, lr_perf)`` pair of five-element lists holding, in fold
        order, the results of ``nb_iris`` and of ``perform_lr``.
    """
    # Setup data
    iris_obj = iris.Iris()
    iris_data = iris_obj.setup_data_iris(filename=filename,
                                         target_class=target_class,
                                         class_wanted=class_wanted,
                                         iris_names=iris_names)

    # Setup five fold cross validation (exactly five folds are expected)
    five_fold = ff.FiveFold()
    fold1, fold2, fold3, fold4, fold5 = five_fold.five_fold_sort_class(
        data=iris_data, sortby=target_class)
    folds = (fold1, fold2, fold3, fold4, fold5)

    # Naive Bayes must see every fold, not only the first one, so that the
    # five entries are independent per-fold results.
    nb_perf = [
        nb_iris(iris_data=fold, target_class=target_class)
        for fold in folds
    ]
    lr_perf = [perform_lr(fold) for fold in folds]

    return nb_perf, lr_perf
def run_seg(self, filename, column_names, sortby):
    """Load the segmentation data, split it into five folds, return the data.

    Args:
        filename: Forwarded to ``s.Segmentation.setup_data``.
        column_names: Forwarded to ``s.Segmentation.setup_data``.
        sortby: Forwarded to ``ff.FiveFold.five_fold_sort_class``.

    Returns:
        The object returned by ``setup_data`` (the folds are not returned).
    """
    loader = s.Segmentation()
    loaded = loader.setup_data(filename=filename, column_names=column_names)

    # The five folds are computed but not used by this function; the
    # unpacking still requires the split to yield exactly five parts.
    splitter = ff.FiveFold()
    _fold1, _fold2, _fold3, _fold4, _fold5 = splitter.five_fold_sort_class(
        data=loaded, sortby=sortby)

    return loaded
def run_bc(filename, target_class, class_wanted, bc_names, learning_rate,
           epoch):
    """Train feedforward networks with 0, 1 and 2 hidden layers on breast cancer folds.

    For each hidden-layer count, ``run_feedforward_backpropagation`` is run
    on all five folds; the classification output of fold 1 and the mean of
    the five accuracies are printed.  Nothing is returned.

    Args:
        filename: Forwarded to ``bc.BreastCancer.setup_data_bc``.
        target_class: Forwarded to ``setup_data_bc``; also the ``sortby`` key
            of the fold split.
        class_wanted: Forwarded to ``setup_data_bc``.
        bc_names: Forwarded to ``setup_data_bc``.
        learning_rate: Forwarded to ``run_feedforward_backpropagation``.
        epoch: Forwarded to ``run_feedforward_backpropagation``.
    """
    loader = bc.BreastCancer()
    dataset = loader.setup_data_bc(filename=filename,
                                   target_class=target_class,
                                   class_wanted=class_wanted,
                                   bc_names=bc_names)

    splitter = ff.FiveFold()
    fold_a, fold_b, fold_c, fold_d, fold_e = splitter.five_fold_sort_class(
        data=dataset, sortby=target_class)
    folds = (fold_a, fold_b, fold_c, fold_d, fold_e)

    # Section headings, indexed by the number of hidden layers.
    section_titles = (
        "Breast Cancer 0 layers",
        "Breast Cancer 1 hidden layer",
        "Breast Cancer 2 hidden layers",
    )

    for hidden_layers, title in enumerate(section_titles):
        print(title)
        accuracies = []
        for position, fold in enumerate(folds):
            accuracy, classified = run_feedforward_backpropagation(
                fold, hidden_layers, learning_rate, epoch)
            accuracies.append(accuracy)
            # Only the first fold's classification list is shown.
            if position == 0:
                print("Classification on Breast Cancer {} hidden layers "
                      "fold 1:".format(hidden_layers))
                print(classified)
        mean_accuracy = np.average(accuracies)
        print("Mean Accuracy of Breast Cancer {} hidden layers: {!s}%".format(
            hidden_layers, mean_accuracy))
        print()
def run_machine(self, filename, column_names, columns_to_drop, sortby):
    """Load the machine data, split it into five folds, return the data.

    Args:
        filename: Forwarded to ``m.Machine.setup_data``.
        column_names: Forwarded to ``m.Machine.setup_data``.
        columns_to_drop: Forwarded to ``m.Machine.setup_data``.
        sortby: Forwarded to ``ff.FiveFold.five_fold_sort_class``.

    Returns:
        The object returned by ``setup_data`` (the folds are not returned).
    """
    loader = m.Machine()
    loaded = loader.setup_data(filename=filename,
                               column_names=column_names,
                               columns_to_drop=columns_to_drop)

    # The five folds are computed but not used by this function; the
    # unpacking still requires the split to yield exactly five parts.
    splitter = ff.FiveFold()
    _fold1, _fold2, _fold3, _fold4, _fold5 = splitter.five_fold_sort_class(
        data=loaded, sortby=sortby)

    return loaded
def run_forest(self, filename, column_names, sortby):
    """Load the forest data, split it into five folds, return the data.

    Args:
        filename: Forwarded to ``f.Forest.setup_data``.
        column_names: Forwarded to ``f.Forest.setup_data``.
        sortby: Forwarded to ``ff.FiveFold.five_fold_sort_class``.

    Returns:
        The object returned by ``setup_data`` (the folds are not returned).
    """
    loader = f.Forest()
    loaded = loader.setup_data(filename=filename, column_names=column_names)

    # The five folds are computed but not used by this function; the
    # unpacking still requires the split to yield exactly five parts.
    splitter = ff.FiveFold()
    _fold1, _fold2, _fold3, _fold4, _fold5 = splitter.five_fold_sort_class(
        data=loaded, sortby=sortby)

    return loaded
def run_ecoli(self, filename, column_names, columns_to_drop, sortby):
    """Load the ecoli data, split it into five folds, return the data.

    Args:
        filename: Forwarded to ``e.Ecoli.setup_data``.
        column_names: Forwarded to ``e.Ecoli.setup_data``.
        columns_to_drop: Forwarded to ``e.Ecoli.setup_data``.
        sortby: Forwarded to ``ff.FiveFold.five_fold_sort_class``.

    Returns:
        The object returned by ``setup_data`` (the folds are not returned).
    """
    loader = e.Ecoli()
    loaded = loader.setup_data(filename=filename,
                               column_names=column_names,
                               columns_to_drop=columns_to_drop)

    # The five folds are computed but not used by this function; the
    # unpacking still requires the split to yield exactly five parts.
    splitter = ff.FiveFold()
    _fold1, _fold2, _fold3, _fold4, _fold5 = splitter.five_fold_sort_class(
        data=loaded, sortby=sortby)

    return loaded
def run_spambase(filename, target_class):
    """Run naive Bayes and logistic regression on each of five spambase folds.

    The data is loaded through ``spambase.Spambase.setup_data_spambase`` and
    split with ``ff.FiveFold.five_fold_sort_class`` using ``target_class`` as
    the sort key.  Every fold is then handed to ``nb_spambase`` and
    ``perform_lr``.

    Args:
        filename: Forwarded to ``setup_data_spambase``.
        target_class: Forwarded to ``setup_data_spambase`` and
            ``nb_spambase``; also used as the ``sortby`` argument of the
            fold split.

    Returns:
        A ``(nb_perf, lr_perf)`` pair of five-element lists holding, in fold
        order, the results of ``nb_spambase`` and of ``perform_lr``.
    """
    # Setup data
    spambase_obj = spambase.Spambase()
    spambase_data = spambase_obj.setup_data_spambase(
        filename=filename, target_class=target_class)

    # Setup five fold cross validation (exactly five folds are expected)
    five_fold = ff.FiveFold()
    fold1, fold2, fold3, fold4, fold5 = five_fold.five_fold_sort_class(
        data=spambase_data, sortby=target_class)
    folds = (fold1, fold2, fold3, fold4, fold5)

    # Naive Bayes must see every fold, not only the first one, so that the
    # five entries are independent per-fold results.
    nb_perf = [
        nb_spambase(spambase_data=fold, target_class=target_class)
        for fold in folds
    ]
    lr_perf = [perform_lr(fold) for fold in folds]

    return nb_perf, lr_perf
def run_iris(filename, target_class, class_wanted, iris_names, learning_rate,
             epoch):
    """Train feedforward networks with 0, 1 and 2 hidden layers on iris folds.

    For each hidden-layer count, ``run_feedforward_backpropagation`` is run
    on all five folds; the classification output of fold 1 and the mean of
    the five accuracies are printed.  Nothing is returned.

    Args:
        filename: Forwarded to ``iris.Iris.setup_data_iris``.
        target_class: Forwarded to ``setup_data_iris``; also the ``sortby``
            key of the fold split.
        class_wanted: Forwarded to ``setup_data_iris``.
        iris_names: Forwarded to ``setup_data_iris``.
        learning_rate: Forwarded to ``run_feedforward_backpropagation``.
        epoch: Forwarded to ``run_feedforward_backpropagation``.
    """
    loader = iris.Iris()
    dataset = loader.setup_data_iris(filename=filename,
                                     target_class=target_class,
                                     class_wanted=class_wanted,
                                     iris_names=iris_names)

    splitter = ff.FiveFold()
    fold_a, fold_b, fold_c, fold_d, fold_e = splitter.five_fold_sort_class(
        data=dataset, sortby=target_class)
    folds = (fold_a, fold_b, fold_c, fold_d, fold_e)

    # Section headings, indexed by the number of hidden layers.
    section_titles = (
        "Iris 0 layers",
        "Iris 1 hidden layer",
        "Iris 2 hidden layers",
    )

    for hidden_layers, title in enumerate(section_titles):
        print(title)
        accuracies = []
        for position, fold in enumerate(folds):
            accuracy, classified = run_feedforward_backpropagation(
                fold, hidden_layers, learning_rate, epoch)
            accuracies.append(accuracy)
            # Only the first fold's classification list is shown.
            if position == 0:
                print("Classification on Iris {} hidden layers "
                      "fold 1:".format(hidden_layers))
                print(classified)
        mean_accuracy = np.average(accuracies)
        print("Mean Accuracy of Iris {} hidden layers: {!s}%".format(
            hidden_layers, mean_accuracy))
        print()
def run_soybean(filename, target_class, learning_rate, epoch):
    """Train feedforward networks with 0, 1 and 2 hidden layers on soybean folds.

    For each hidden-layer count, ``run_feedforward_backpropagation`` is run
    on all five folds; the classification output of fold 1 and the mean of
    the five accuracies are printed.  Nothing is returned.

    Args:
        filename: Forwarded to ``soybean.Soybean.setup_data_soybean``.
        target_class: Forwarded to ``setup_data_soybean``; also the
            ``sortby`` key of the fold split.
        learning_rate: Forwarded to ``run_feedforward_backpropagation``.
        epoch: Forwarded to ``run_feedforward_backpropagation``.
    """
    loader = soybean.Soybean()
    dataset = loader.setup_data_soybean(filename=filename,
                                        target_class=target_class)

    splitter = ff.FiveFold()
    fold_a, fold_b, fold_c, fold_d, fold_e = splitter.five_fold_sort_class(
        data=dataset, sortby=target_class)
    folds = (fold_a, fold_b, fold_c, fold_d, fold_e)

    # Section headings, indexed by the number of hidden layers.
    section_titles = (
        "Soybean 0 layers",
        "Soybean 1 hidden layer",
        "Soybean 2 hidden layers",
    )

    for hidden_layers, title in enumerate(section_titles):
        print(title)
        accuracies = []
        for position, fold in enumerate(folds):
            accuracy, classified = run_feedforward_backpropagation(
                fold, hidden_layers, learning_rate, epoch)
            accuracies.append(accuracy)
            # Only the first fold's classification list is shown.
            if position == 0:
                print("Classification on Soybean {} hidden layers "
                      "fold 1:".format(hidden_layers))
                print(classified)
        mean_accuracy = np.average(accuracies)
        print("Mean Accuracy of Soybean {} hidden layers: {!s}%".format(
            hidden_layers, mean_accuracy))
        print()
def run_car(self, filename, sortby):
    """Build, test and prune a decision tree on each of five car folds.

    The car data is split 10% / 90%: the 10% sample is the validation set
    used both for testing and for pruning, and the remaining 90% is divided
    into five folds.  One tree is built per fold; unpruned and pruned
    accuracies are printed per fold, followed by their averages.  Nothing is
    returned.

    Args:
        filename: Forwarded to ``c.Car.setup_data``.
        sortby: Forwarded to ``ff.FiveFold.five_fold_sort_class``.
    """
    print()
    column_template = ("buying", "maint", "doors", "persons", "lug boot",
                       "safety", "class")
    feature_names = list(column_template)

    # Setup data
    loader = c.Car()
    full_frame = loader.setup_data(filename=filename,
                                   column_names=feature_names)

    # 10% validation sample; everything else is used for the folds.
    holdout = full_frame.sample(frac=.10)
    training_pool = full_frame.drop(holdout.index)

    # reset_index without drop=True keeps the old index as an 'index' column.
    holdout.reset_index(inplace=True)
    training_pool.reset_index(inplace=True)

    # Setup five fold cross validation
    splitter = ff.FiveFold()
    fold_a, fold_b, fold_c, fold_d, fold_e = splitter.five_fold_sort_class(
        data=training_pool, sortby=sortby)
    folds = (fold_a, fold_b, fold_c, fold_d, fold_e)
    for fold in folds:
        fold.drop(columns='index', axis=1, inplace=True)

    unpruned_scores = []
    pruned_scores = []
    for fold_no, fold in enumerate(folds, start=1):
        tree = dt.DecisionTree()
        root = tree.create_decision_tree(data=fold,
                                         features_list=feature_names)
        unpruned = tree.run_test(holdout, root)
        print("Unpruned accuracy for car{}: {!s}%".format(fold_no, unpruned))
        pruned_root = tree.prune_tree(holdout, root)
        pruned = tree.run_test(holdout, pruned_root)
        print("Pruned accuracy for car{}: {!s}%".format(fold_no, pruned))
        print()
        unpruned_scores.append(unpruned)
        pruned_scores.append(pruned)
        # Every tree after the first receives a brand-new names list --
        # presumably create_decision_tree mutates features_list; TODO confirm
        # against dt.DecisionTree.
        feature_names = list(column_template)

    unpruned_accuracy_average = np.average(unpruned_scores)
    pruned_accuracy_average = np.average(pruned_scores)
    print("Unpruned accuracy average for car data: {!s}%".format(
        unpruned_accuracy_average))
    print("Pruned accuracy average for car data: {!s}%".format(
        pruned_accuracy_average))
    print()
def run_seg(self, filename, sortby):
    """Build, test and prune a decision tree on each of five segmentation folds.

    The segmentation data is split 10% / 90%: the 10% sample is the
    validation set used both for testing and for pruning, and the remaining
    90% is divided into five folds.  One tree is built per fold; unpruned and
    pruned accuracies are printed per fold, followed by their averages.
    Nothing is returned.

    Args:
        filename: Forwarded to ``s.Segmentation.setup_data``.
        sortby: Forwarded to ``ff.FiveFold.five_fold_sort_class``.
    """
    print()
    column_template = (
        "class", "cen col", "cen row", "pix count", "sld -5", "sld -2",
        "vedge mean", "vedge sd", "hedge mean", "hedge sd", "intensity mean",
        "rawred mean", "rawblue mean", "rawgreen mean", "exred mean",
        "exblue mean", "exgreen mean", "value mean", "sat mean", "hue mean",
    )
    feature_names = list(column_template)

    # Setup data
    loader = s.Segmentation()
    full_frame = loader.setup_data(filename=filename,
                                   column_names=feature_names)

    # 10% validation sample; everything else is used for the folds.
    holdout = full_frame.sample(frac=.10)
    training_pool = full_frame.drop(holdout.index)

    # reset_index without drop=True keeps the old index as an 'index' column.
    holdout.reset_index(inplace=True)
    training_pool.reset_index(inplace=True)

    # Setup five fold cross validation
    splitter = ff.FiveFold()
    fold_a, fold_b, fold_c, fold_d, fold_e = splitter.five_fold_sort_class(
        data=training_pool, sortby=sortby)
    folds = (fold_a, fold_b, fold_c, fold_d, fold_e)
    for fold in folds:
        fold.drop(columns='index', axis=1, inplace=True)

    unpruned_scores = []
    pruned_scores = []
    for fold_no, fold in enumerate(folds, start=1):
        tree = dt.DecisionTree()
        root = tree.create_decision_tree(data=fold,
                                         features_list=feature_names)
        unpruned = tree.run_test(holdout, root)
        print("Unpruned accuracy for seg{}: {!s}%".format(fold_no, unpruned))
        pruned_root = tree.prune_tree(holdout, root)
        pruned = tree.run_test(holdout, pruned_root)
        print("Pruned accuracy for seg{}: {!s}%".format(fold_no, pruned))
        print()
        unpruned_scores.append(unpruned)
        pruned_scores.append(pruned)
        # Every tree after the first receives a brand-new names list --
        # presumably create_decision_tree mutates features_list; TODO confirm
        # against dt.DecisionTree.
        feature_names = list(column_template)

    unpruned_accuracy_average = np.average(unpruned_scores)
    pruned_accuracy_average = np.average(pruned_scores)
    print("Unpruned accuracy average for seg data: {!s}%".format(
        unpruned_accuracy_average))
    print("Pruned accuracy average for seg data: {!s}%".format(
        pruned_accuracy_average))
    print()
def run_abalone(self, filename, sortby):
    """Build, test and prune a decision tree on each of five abalone folds.

    The abalone data is split 10% / 90%: the 10% sample is the validation
    set used both for testing and for pruning, and the remaining 90% is
    divided into five folds.  One tree is built per fold; unpruned and pruned
    accuracies are printed per fold, followed by their averages.  Nothing is
    returned.

    Args:
        filename: Forwarded to ``a.Abalone.setup_data``.
        sortby: Forwarded to ``ff.FiveFold.five_fold_sort_class``.
    """
    print()
    column_template = ("sex", "length", "diameter", "height", "whole weight",
                       "shucked weight", "viscera weight", "shell weight",
                       "class")
    abalone_names = list(column_template)

    # Setup data
    abalone = a.Abalone()
    abalone_data = abalone.setup_data(filename=filename,
                                      column_names=abalone_names)

    # Split the data set into 10% and 90%
    abalone_validation_data = abalone_data.sample(frac=.10)
    abalone_data_rest = abalone_data.drop(abalone_validation_data.index)

    # Reset indexes on data frames; without drop=True the old index is kept
    # as an 'index' column.
    abalone_validation_data.reset_index(inplace=True)
    abalone_data_rest.reset_index(inplace=True)

    # Setup five fold cross validation on the 90% remainder only, so that no
    # validation row ends up in a training fold.
    five_fold = ff.FiveFold()
    fold1, fold2, fold3, fold4, fold5 = five_fold.five_fold_sort_class(
        data=abalone_data_rest, sortby=sortby)
    folds = (fold1, fold2, fold3, fold4, fold5)
    for fold in folds:
        # Remove the column introduced by reset_index above.
        fold.drop(columns='index', axis=1, inplace=True)

    unpruned_scores = []
    pruned_scores = []
    for fold_no, fold in enumerate(folds, start=1):
        tree = dt.DecisionTree()
        tree_node = tree.create_decision_tree(data=fold,
                                              features_list=abalone_names)
        accuracy = tree.run_test(abalone_validation_data, tree_node)
        print("Unpruned accuracy for abalone{}: {!s}%".format(
            fold_no, accuracy))
        tree_pruned = tree.prune_tree(abalone_validation_data, tree_node)
        accuracy_pruned = tree.run_test(abalone_validation_data, tree_pruned)
        print("Pruned accuracy for abalone{}: {!s}%".format(
            fold_no, accuracy_pruned))
        print()
        unpruned_scores.append(accuracy)
        pruned_scores.append(accuracy_pruned)
        # Every tree after the first receives a brand-new names list --
        # presumably create_decision_tree mutates features_list; TODO confirm
        # against dt.DecisionTree.
        abalone_names = list(column_template)

    unpruned_accuracy_average = np.average(unpruned_scores)
    pruned_accuracy_average = np.average(pruned_scores)
    print("Unpruned accuracy average for abalone data: {!s}%".format(
        unpruned_accuracy_average))
    print("Pruned accuracy average for abalone data: {!s}%".format(
        pruned_accuracy_average))
    print()