import statistics

import pandas as pd

# Assumed local module paths; adjust these to match this project's layout.
# split_into_random_stratified_groups is sketched below the first runner.
from decision_tree import DecisionTree
from k_nearest_neighbors import KNearestNeighbors
from backpropagation_neural_network import BackpropagationNeuralNetwork
from naive_bayes import NaiveBayes


def run_id3_decision_tree(df, prune=False):
    """
    This function runs an ID3 decision tree on the data frame and outputs
    statistics from five experiments
    :param df: The data set to run the algorithm on
    :param prune: Whether to also evaluate a pruned copy of each tree
    """
    # Split dataset 5-fold stratified
    print(f"Size of total dataset = {len(df)}")
    train1, train2, train3, train4, train5 = split_into_random_stratified_groups(df)
    datasets = [train1, train2, train3, train4, train5]
    scores = []
    pruned_scores = []
    for i, d in enumerate(datasets):
        print("-------------")
        print(f"Experiment #{i + 1}")
        print("-------------")
        # Use one subset as a test set
        df_test = d
        print(f"Test set size = {len(df_test)}")
        training_sets = datasets.copy()
        # Create a training set from the remaining subsets
        del training_sets[i]
        df_train = pd.concat(training_sets)
        print(f"Training set size = {len(df_train)}")
        # Build the decision tree from the training set
        id3 = DecisionTree(df_train)
        id3.build_id3_tree()
        # id3.print_tree()
        # Test the decision tree
        accuracy = id3.validate(id3.root, df_test)
        print(f"Percent accurate: {accuracy}%")
        scores.append(accuracy)
        # If pruning is turned on, test pruned tree accuracy
        if prune:
            p_accuracy = id3.validate_pruned_tree(df_test)
            print(f"Pruned Tree Percent Accurate: {p_accuracy}%")
            pruned_scores.append(p_accuracy)
    print("----------------------------")
    print("Averages over 5 experiments")
    print("----------------------------")
    print(f"ID3 Decision Tree Average = {statistics.mean(scores)}%")
    if prune:
        print(f"Pruned ID3 Decision Tree Average = {statistics.mean(pruned_scores)}%")
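# The stratified splitter used by every runner in this file is not shown here.
# The definition below is a minimal sketch of the behavior the runners rely on,
# assuming the class label sits in the LAST column of df; it is an
# illustration, not necessarily the project's original implementation.
def split_into_random_stratified_groups(df, n_groups=5, seed=None):
    """Shuffle df and deal the rows of each class round-robin into n_groups
    folds, so every fold roughly preserves the overall class proportions."""
    shuffled = df.sample(frac=1, random_state=seed)
    label_column = shuffled.columns[-1]  # assumption: class label is last
    folds = [[] for _ in range(n_groups)]
    for _, class_rows in shuffled.groupby(label_column):
        for position, (_, row) in enumerate(class_rows.iterrows()):
            folds[position % n_groups].append(row)
    return tuple(pd.DataFrame(fold) for fold in folds)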
def run_k_nearest_neighbor_experiments(df, k, run_condensed, classification=True):
    """
    This function runs k-nearest neighbors on the data frame and outputs
    statistics from five experiments
    :param df: The data set to run the algorithm on
    :param k: The number of neighbors that vote on each prediction
    :param run_condensed: Whether to also run condensed k-nearest neighbors
    :param classification: Whether this is a classification (vs. regression) task
    """
    # Split dataset 5-fold stratified
    print(f"Size of total dataset = {len(df)}")
    train1, train2, train3, train4, train5 = split_into_random_stratified_groups(df)
    # Run five experiments, using one of the sets as a test set each time
    k_scores = []
    k_condensed_scores = []
    datasets = [train1, train2, train3, train4, train5]
    for i, d in enumerate(datasets):
        print("-------------")
        print(f"Experiment #{i + 1}")
        print("-------------")
        df_test = d
        print(f"Test set size = {len(df_test)}")
        training_sets = datasets.copy()
        del training_sets[i]
        df_train = pd.concat(training_sets)
        print(f"Training set size = {len(df_train)}")
        # Run K-Nearest Neighbors
        print(f"k = {k}")
        print("Running k nearest neighbors...")
        knn = KNearestNeighbors(df_test, k, df.columns, classification)
        accuracy = knn.run(df_train)
        print(f"Percent accurate: {accuracy}%")
        k_scores.append(accuracy)
        if run_condensed:
            # Run Condensed K-Nearest Neighbors
            knn = KNearestNeighbors(df_test, k, df.columns, classification)
            accuracy = knn.run_condensed(df_train)
            print(f"Condensed percent accurate: {accuracy}%")
            k_condensed_scores.append(accuracy)
    print("----------------------------------------")
    print(f"Averages over 5 experiments where k={k}")
    print("----------------------------------------")
    print(f"k-Nearest Neighbors = {statistics.mean(k_scores)}%")
    if run_condensed:
        print(f"Condensed k-Nearest Neighbors = {statistics.mean(k_condensed_scores)}%")
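# Minimal sketch of the prediction step the KNearestNeighbors class is assumed
# to perform for classification: Euclidean distance from the query to every
# training row, then a majority vote among the k nearest labels. This is an
# illustration of the technique, not the project's actual class.
from collections import Counter

def knn_predict(train_features, train_labels, query, k):
    """Return the majority label among the k training rows nearest to query."""
    distances = [(sum((a - b) ** 2 for a, b in zip(row, query)) ** 0.5, label)
                 for row, label in zip(train_features, train_labels)]
    nearest = sorted(distances, key=lambda pair: pair[0])[:k]
    return Counter(label for _, label in nearest).most_common(1)[0][0]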
def run_backpropagation(df, num_features, num_hidden):
    """
    This function runs a backpropagation neural network on the data frame and
    outputs statistics from five experiments
    :param df: The data set to run the algorithm on
    :param num_features: The number of features in this dataset
    :param num_hidden: The number of hidden nodes in the network
    """
    # Split dataset 5-fold stratified
    print(f"Size of total dataset = {len(df)}")
    train1, train2, train3, train4, train5 = split_into_random_stratified_groups(df)
    datasets = [train1, train2, train3, train4, train5]
    nn_scores = []
    for i, d in enumerate(datasets):
        print("-------------")
        print(f"Experiment #{i + 1}")
        print("-------------")
        # Use one subset as a test set
        df_test = d
        print(f"Test set size = {len(df_test)}")
        training_sets = datasets.copy()
        # Create a training set from the remaining subsets
        del training_sets[i]
        df_train = pd.concat(training_sets)
        print(f"Training set size = {len(df_train)}")
        # Create the backpropagation neural network
        print(df_train.iloc[:, 0:num_features + 1].head())  # preview training slice
        nn = BackpropagationNeuralNetwork(df_train.columns[0:num_features],
                                          df_train.iloc[:, 0:num_features + 1],
                                          df_train.iloc[:, num_features],
                                          df_test.iloc[:, 0:num_features + 1],
                                          df_test.iloc[:, num_features],
                                          int(num_hidden))
        # Train the network
        nn.learn()
        # Test the network's accuracy on the held-out fold
        nn_accuracy = nn.make_predictions()
        print(f"Percent accurate: {nn_accuracy}%")
        nn_scores.append(nn_accuracy)
    return statistics.mean(nn_scores)
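# Illustrative sweep over hidden-layer sizes built on the runner above; the
# candidate sizes here are placeholders, not values from the original
# experiments.
def tune_hidden_units(df, num_features, candidates=(2, 4, 8)):
    """Run the 5-fold backpropagation experiment for each candidate size and
    report the one with the best mean accuracy."""
    results = {h: run_backpropagation(df, num_features, h) for h in candidates}
    best = max(results, key=results.get)
    print(f"Best hidden-layer size: {best} ({results[best]}% accurate)")
    return best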
def run_naive_bayes(df, num_features):
    """
    This function runs Naive Bayes on the data frame and outputs statistics
    from five experiments
    :param df: The data set to run the algorithm on
    :param num_features: The number of features in this dataset
    """
    # Split dataset 5-fold stratified
    print(f"Size of total dataset = {len(df)}")
    train1, train2, train3, train4, train5 = split_into_random_stratified_groups(df)
    datasets = [train1, train2, train3, train4, train5]
    nb_scores = []
    for i, d in enumerate(datasets):
        print("-------------")
        print(f"Experiment #{i + 1}")
        print("-------------")
        # Use one subset as a test set
        df_test = d
        print(f"Test set size = {len(df_test)}")
        training_sets = datasets.copy()
        # Create a training set from the remaining subsets
        del training_sets[i]
        df_train = pd.concat(training_sets)
        print(f"Training set size = {len(df_train)}")
        # Create Naive Bayes
        nb = NaiveBayes(df_train.iloc[:, 0:num_features],
                        df_train.iloc[:, num_features],
                        df_test.iloc[:, 0:num_features],
                        df_test.iloc[:, num_features])
        # Train with Naive Bayes
        nb.learn()
        # Test the accuracy of Naive Bayes
        nb_accuracy = nb.validate()
        print(f"Naive Bayes percent accurate: {nb_accuracy}%")
        nb_scores.append(nb_accuracy)
    return statistics.mean(nb_scores)
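# Example entry point, assuming a CSV data set whose last column is the class
# label. The file name and parameter values are placeholders, not the original
# experiment configuration.
if __name__ == "__main__":
    df = pd.read_csv("data.csv")  # placeholder path
    num_features = len(df.columns) - 1
    run_id3_decision_tree(df, prune=True)
    run_k_nearest_neighbor_experiments(df, k=5, run_condensed=True)
    print(f"Backpropagation mean accuracy = "
          f"{run_backpropagation(df, num_features, num_hidden=4)}%")
    print(f"Naive Bayes mean accuracy = {run_naive_bayes(df, num_features)}%")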