def run_kmeans_experiment(data_set_path, number_of_clusters, learner, fraction_of_data_used=1, data_type=float):
    """
    The main workhorse for running the experiments and outputting the appropriate information to a file.

    Works by reading in the data (training and test data), creating the KMeans learner, and passing the
    needed data to SFS, which returns the selected features and their Fisher score. The means are then
    retrieved for those features, and all of the data is clustered based on the means.
    Finally, all needed information is printed in a human readable way.
    """
    print("Running {0} Experiment with k clusters = {1}".format(data_set_path, number_of_clusters))

    all_data = CustomCSVReader.read_file(data_set_path, data_type)
    feature_selection_data = all_data[:int(len(all_data) / fraction_of_data_used)]

    feature_length = len(all_data[0]) - 1
    features = list(range(feature_length))

    # best_features[0] is the list of selected feature indices, best_features[1] the Fisher score.
    best_features = SFS.select_features(features, feature_selection_data, all_data, learner)

    means = learner.learn(best_features[0], all_data)
    data_clusters = learner.get_clusters_for_means(means, best_features[0], all_data)

    print("The Final Selected Features are: (features are zero indexed)")
    print("{}\n".format(best_features[0]))

    print("The Fisher Score for the clustering is:")
    print("{}\n".format(best_features[1]))

    pp = pprint.PrettyPrinter(indent=2, width=400)
    print("For Clustered points, the key in the dictionary represents the cluster each data point belongs to.")
    print("Clustered points:")
    pp.pprint(data_clusters)
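# A minimal usage sketch. The KMeans class name, its constructor signature, and
# the iris data path are assumptions and not confirmed by this file.
if __name__ == "__main__":
    kmeans_learner = KMeans(number_of_clusters=3)
    # Use 1/10 of the data for SFS feature selection, then cluster the full set.
    run_kmeans_experiment("data/iris.data.txt", 3, kmeans_learner, fraction_of_data_used=10)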
def run_classification_experiment(data_set_path, k_nearest, learner, data_type=float):
    """
    The main workhorse for running the experiments and outputting the appropriate information to a file.

    Works by reading in the data (training and test data) and creating the cross validation object with
    the correct k-NN algorithm (classification k-NN). Then runs the cross validation (classification)
    and gets the outputs from the cross validation.
    Finally, all needed information is printed in a human readable way.
    """
    print("Running {0} Experiment with k nearest = {1}".format(data_set_path, k_nearest))

    all_data = CustomCSVReader.read_file(data_set_path, data_type)

    cv = CrossValidation(5, learner)
    average_error_rate = cv.cross_validation_classification(all_data)

    print("Average Error Rate: {}".format(average_error_rate[0]))
    print("Standard Deviation: {}".format(average_error_rate[1]))

    print("Last Cross Validation Set Predicted Values: \n(Predicted Value, Actual Value)")
    cv_predicted_values = average_error_rate[2]
    cv_actual_values = average_error_rate[3]
    for predicted, actual in zip(cv_predicted_values[4], cv_actual_values[4]):
        print("{0}, {1}".format(predicted, actual))

    return average_error_rate[0]
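# A minimal usage sketch. The KNN class name and constructor signature are
# assumptions, as is the choice of data set.
if __name__ == "__main__":
    knn_learner = KNN(k_nearest=5)
    run_classification_experiment("data/breast-cancer-wisconsin.data.new.txt", 5, knn_learner)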
def run_classification_experiment(data_set_path, learner, positive_class_name, data_type=float):
    """
    The main workhorse for running the experiments and outputting the appropriate information to a file.

    Works by reading in the data (training and test data) and creating the cross validation object with
    the neural network learner. Then runs the cross validation (classification) and gets the outputs
    from the cross validation.
    Finally, all needed information is printed in a human readable way.
    """
    print("Running {0} Experiment with positive class = {1}".format(data_set_path, positive_class_name))

    # Network structure.
    print("Number of Hidden Layers: {}".format(len(learner.weights) - 1))
    print("Number of Nodes in First Hidden Layer: {}".format(learner.num_in_hidden_layer_1))
    print("Number of Nodes in Second Hidden Layer: {}".format(learner.num_in_hidden_layer_2))

    all_data = CustomCSVReader.read_file(data_set_path, data_type)

    # Pre-process the data to split it into 2 classes: positive and not positive.
    all_data = learner.pre_process(all_data, positive_class_name)

    cv = CrossValidation(5, learner)
    average_error_rate = cv.cross_validation_classification(all_data)

    print("Average Error Rate: {}".format(average_error_rate[0]))
    print("Standard Deviation: {}".format(average_error_rate[1]))

    print("Learned NN Model (Weights)")
    pp = pprint.PrettyPrinter(indent=2)
    pp.pprint(average_error_rate[2][4])
    print()

    print("Last Cross Validation Set Predicted Values: \n(Predicted Value, Actual Value)")
    cv_predicted_values = average_error_rate[3]
    cv_actual_values = average_error_rate[4]
    for predicted, actual in zip(cv_predicted_values[4], cv_actual_values[4]):
        print("{0}, {1}".format(predicted, actual))

    return average_error_rate[0]
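# A minimal usage sketch. The NeuralNetwork class name and its constructor
# signature are assumptions; the data set and positive class mirror the
# node-search script used elsewhere in this project.
if __name__ == "__main__":
    nn_learner = NeuralNetwork(num_in_hidden_layer_1=10, num_in_hidden_layer_2=10)
    run_classification_experiment("data/breast-cancer-wisconsin.data.new.txt", nn_learner, positive_class_name=1)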
def run_hac_experiment(data_set_path, number_of_clusters, hac, fraction_of_data_used=1, data_type=float):
    """
    The main workhorse for running the experiments and outputting the appropriate information to a file.

    Works by reading in the data (training and test data), creating the HAC learner, and passing the
    needed data to the genetic algorithm, which returns a list of selected features. The clustering of
    data points by HAC is then retrieved for those features on the test data (the clusters of data point
    ids are the "model"). We then retrieve the full clustering of the data from HAC by passing in the
    "model" it returned, which results in all of the data points being clustered by HAC.
    Finally, all needed information is printed in a human readable way.
    """
    print("Running {0} Experiment with k clusters = {1}".format(data_set_path, number_of_clusters))

    all_data = CustomCSVReader.read_file(data_set_path, data_type)
    feature_selection_data = all_data[:int(len(all_data) / fraction_of_data_used)]

    feature_length = len(all_data[0]) - 1

    ga = GeneticAlgorithmFeatureSelection()
    best_features = ga.select_features_ga(hac, feature_selection_data, all_data, feature_length)
    selected_features = ga.get_selected_features(best_features)

    clusters_of_datapoint_ids = hac.learn(selected_features, feature_selection_data)
    full_clusters = hac.get_full_clusters_of_data(clusters_of_datapoint_ids, selected_features, all_data)

    print("The Final Selected Features are: (features are zero indexed)")
    print("{}\n".format(selected_features))

    print("The Fisher Score for the clustering is:")
    print("{}\n".format(best_features["evaluation"]))

    pp = pprint.PrettyPrinter(indent=2, width=400)
    print("For Clustered points, the key in the dictionary represents the cluster each data point belongs to.")
    print("Clustered points:")
    pp.pprint(full_clusters)
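# A minimal usage sketch. The HAC class name, its constructor signature, and the
# iris data path are assumptions and not confirmed by this file.
if __name__ == "__main__":
    hac_learner = HAC(number_of_clusters=3)
    # Use 1/10 of the data for GA feature selection, then cluster the full set.
    run_hac_experiment("data/iris.data.txt", 3, hac_learner, fraction_of_data_used=10)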
""" Created by Max 10/8/2017 """ import math import sys from customCsvReader import CustomCSVReader """ This file was used to calculate the Max MSE that exists in the data sets for comparison of how well the k-NN regression algorithm did when comparing MSE """ all_data = CustomCSVReader.read_file("D:\\Documents\\JHU-Masters\\605.449 Intro to Machine Learning\\projects\\project3\\data\\machine.data.new.txt", float) # all_data = CustomCSVReader.read_file("D:\\Documents\\JHU-Masters\\605.449 Intro to Machine Learning\\projects\\project3\\data\\forestfires.data.new.txt", float) max_distance = 0 max_dist_points = () for datapoint_i in all_data: for datapoint_j in all_data: dist = (datapoint_i[-1] - datapoint_j[-1])**2 if dist > max_distance: max_distance = dist max_dist_points = (datapoint_i, datapoint_j) print(max_distance) print(max_distance**2) print((max_dist_points[0][-1] - max_dist_points[1][-1])**2) print(max_dist_points[0]) print(max_dist_points[1])
            actual_prediction = prediction

            actuals.append(test_item[-1])
            # Count a misclassification whenever the prediction disagrees with the actual label.
            if actual_prediction != test_item[-1]:
                num_errors += 1

        error_rate = num_errors / len(predictions)
        return error_rate


# <editor-fold desc="Cancer Data">
print("Cancer")
node_search = hidden_layer1_node_search(90, 1)
all_data = CustomCSVReader.read_file("data/breast-cancer-wisconsin.data.new.txt", float)
optimal_node_number = node_search.find_optimal_hidden_node_1(start_number=5, max_number=20, step=5,
                                                             dataset=all_data, positive_class_name=1)
print(optimal_node_number)
# </editor-fold>

# <editor-fold desc="Soybean">
print("Soybean")
node_search = hidden_layer1_node_search(204, 1)
all_data = CustomCSVReader.read_file("data/soybean-small.data.new.txt", float)
optimal_node_number = node_search.find_optimal_hidden_node_1(start_number=5, max_number=20, step=5,
                                                             dataset=all_data, positive_class_name="D1")
print(optimal_node_number)
# </editor-fold>
""" Created by Max 9/24/2017 """ import numpy as np import matplotlib.pyplot as plt from customCsvReader import CustomCSVReader def data_munge(selected_features, data): new_data = [] for data_point in data: new_data_point = [] for selected_feature in selected_features: new_data_point.append(data_point[selected_feature]) new_data.append(new_data_point) return new_data all_data = CustomCSVReader.read_file("data/iris.data.txt", float) data = data_munge([2], all_data) np.zeros_like(data) plt.plot(data, np.zeros_like(data), 'x', color='red') plt.show()