def _re_calculate_proximity_matrix(self):
    start_time = time.time()
    # Use more chunks than available processors because the work is not evenly spread
    num_processes = config.MAX_AVAILABLE_CPU_CORES * 3

    # Spread the work out across all the processes.
    # Each process handles roughly the same number of data instances, but chunks with higher
    # indices have fewer calculations to do than chunks with lower indices.
    range_indices = np.array(range(0, self.num_data_instances))
    divided_indices = np.array_split(range_indices, num_processes)

    # Execute the distance calculations across multiple processes
    with concurrent.futures.ProcessPoolExecutor(max_workers=config.MAX_AVAILABLE_CPU_CORES) as executor:
        futures = [executor.submit(initial_proxy_matrix_distance_calculator_thread,
                                   index,
                                   divided_indices[index][0],
                                   divided_indices[index][-1],
                                   self.num_data_instances,
                                   self.training_data)
                   for index in range(num_processes)]

        # Collect the results as the processes complete and fill out the proximity matrix
        for f in concurrent.futures.as_completed(futures):
            index, results = f.result()
            for x in range(results.shape[0]):
                x_pos = divided_indices[index][0] + x
                for y in range(x_pos + 1, self.num_data_instances):
                    self.proximity_matrix[x_pos][y] = results[x][y]
                    self.proximity_matrix[y][x_pos] = results[x][y]

    config.PrintDebug("Completed in {:.2f} seconds. Sum {}".format((time.time() - start_time), np.nansum(self.proximity_matrix)))
    with open(self.INITIAL_PROXY_MATRIX_FILE_NAME, 'wb') as f:
        config.PrintDebug("Saving Initial Proximity Matrix Into {}".format(self.INITIAL_PROXY_MATRIX_FILE_NAME))
        np.save(f, self.proximity_matrix)
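# initial_proxy_matrix_distance_calculator_thread is referenced above but not shown in this
# section. Below is a minimal sketch of its expected contract, inferred from the call site:
# fill the upper-triangle distance rows for data instances start_index..end_index and return
# them together with the chunk index. The Euclidean metric between histograms is an assumption;
# swap in the project's actual distance function.
def initial_proxy_matrix_distance_calculator_thread(index, start_index, end_index, num_data_instances, training_data):
    num_rows = end_index - start_index + 1
    results = np.full((num_rows, num_data_instances), np.nan)
    for x in range(num_rows):
        x_pos = start_index + x
        for y in range(x_pos + 1, num_data_instances):
            # Assumed metric: Euclidean distance between the two histograms
            results[x][y] = np.linalg.norm(training_data[x_pos] - training_data[y])
    return (index, results)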
def _create_and_save_training_data_histograms(self, num_bins, image_paths_list):
    all_imagenames = []
    all_histograms = []
    for image_path in image_paths_list:
        im_name = image_path.split('\\')[-1]
        config.PrintDebug("Processing: {}".format(im_name))
        # Load each image file and rescale the pixel range to 0 - 1
        image = skimage.io.imread(fname=image_path, as_gray=True) / config.U16_MAX_VAL
        # Convert the image to a histogram
        histogram, bin_edges = np.histogram(image, bins=num_bins, range=(0, 1))
        # Cache the histogram and the image name
        all_histograms.append(histogram)
        all_imagenames.append(im_name)

    # Convert to numpy arrays and save them to file
    all_histograms = np.array(all_histograms)
    with open(self.HISTOGRAM_FILENAME, 'wb') as f:
        config.PrintDebug("Saving Histogram Data Into {}".format(self.HISTOGRAM_FILENAME))
        np.save(f, all_histograms)
    config.PrintDebug("Process complete. Saved: {} Rows Of Data".format(all_histograms.shape[0]))

    all_imagenames = np.array(all_imagenames)
    with open(self.TRAINING_IMG_NAMES_FILENAME, "wb") as f:
        config.PrintDebug("Saving Training Image Filenames Into {}".format(self.TRAINING_IMG_NAMES_FILENAME))
        np.save(f, all_imagenames)
    return (all_histograms, all_imagenames)
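# A quick illustration of the histogram step above: with range=(0, 1), every normalized pixel
# falls into one of num_bins equal-width intensity buckets, so each histogram row sums to the
# image's pixel count regardless of content. Hypothetical toy example (names illustrative only):
#
#   toy_image = np.array([[0.0, 0.25], [0.5, 0.99]])
#   histogram, bin_edges = np.histogram(toy_image, bins=4, range=(0, 1))
#   # histogram -> array([1, 1, 1, 1]); histogram.sum() == toy_image.size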
def _load_merge_history(self):
    with open(self.MERGE_HISTORY_FILE_NAME, 'rb') as f:
        config.PrintDebug("Loading Merge History From {}".format(self.MERGE_HISTORY_FILE_NAME))
        self.merge_history = np.load(f)
    with open(self.MERGE_CLUSTER_HISTORY_FILE_NAME, 'rb') as f:
        config.PrintDebug("Loading Merge Cluster History From {}".format(self.MERGE_CLUSTER_HISTORY_FILE_NAME))
        self.cluster_history = pickle.load(f)
def _predict_cluster_and_accuracy(self, image_path, num_bins, num_clusters):
    # 1. Load the image file and rescale the pixel range to 0 - 1
    config.PrintDebug("Predicting: {}".format(image_path.split('\\')[-1]))
    image = skimage.io.imread(fname=image_path, as_gray=True) / config.U16_MAX_VAL

    # 2. Convert the image to a histogram
    histogram, bin_edges = np.histogram(image, bins=num_bins, range=(0, 1))

    # 3. Find the closest cluster and the distance to it
    cluster_distances = []
    for c in range(num_clusters):
        # dist_to_cluster = self._calculate_distance_to_cluster_V2(c, histogram)
        dist_to_cluster = self._calculate_distance_to_cluster(c, histogram, LINKAGE.AVERAGE)
        cluster_distances.append((c, dist_to_cluster))
    cluster_distances = sorted(cluster_distances, key=lambda x: x[1])

    # 4. Subtract the within-cluster distance
    closest_cluster = cluster_distances[0][0]
    dist_to_closest = cluster_distances[0][1]
    # If the distance is less than 0, the image falls inside the cluster
    # if dist_to_closest < 0:
    #     dist_to_closest = 0

    # 5. Return the results
    return (closest_cluster, dist_to_closest)
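# _calculate_distance_to_cluster is referenced above but not shown in this section. A minimal
# sketch of the average-linkage case, assuming self.clusters holds sets of training-data
# indices, self.training_data holds the training histograms, and Euclidean distance is the
# metric (all assumptions; the project's real metric and other linkage types live in the
# actual helper):
def _calculate_distance_to_cluster(self, cluster_index, histogram, linkage_type):
    member_indices = list(self.clusters[cluster_index])
    # Average linkage: the mean distance from the query histogram to every cluster member
    distances = [np.linalg.norm(histogram - self.training_data[i]) for i in member_indices]
    return np.nanmean(distances)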
def _create_clusters_from_training_data(self, num_clusters):
    # If recreating the clusters, delete the existing info
    if os.path.isdir(self.CLUSTER_INFO_FOLDER_NAME):
        shutil.rmtree(self.CLUSTER_INFO_FOLDER_NAME)

    # 1. Create an array with one set for each cluster
    self.clusters = []
    for i in range(num_clusters):
        self.clusters.append(set())

    # 2. Populate the first set with all the elements
    for val in self.cluster_history[-1]:
        self.clusters[0].add(val)
    print("First has: {} elements".format(len(self.clusters[0])))

    # 3. Walk the merge history backwards and split off the merged parts
    hist_ind = -2
    cluster_to_set = self.get_empty_clusterset_index(self.clusters)
    while cluster_to_set != -1:
        cluster_to_split = -1
        for val in self.cluster_history[hist_ind]:
            # Find the cluster these elements currently belong to
            if cluster_to_split == -1:
                for k in range(num_clusters):
                    if val in self.clusters[k]:
                        cluster_to_split = k
                        break
            self.clusters[cluster_to_split].remove(val)
            self.clusters[cluster_to_set].add(val)
        hist_ind -= 1
        cluster_to_set = self.get_empty_clusterset_index(self.clusters)

    self.clusters = np.array(self.clusters)
    with open(self.CLUSTER_GROUPS_FILE_NAME, 'wb') as f:
        config.PrintDebug("Saving Cluster Groups Into {}".format(self.CLUSTER_GROUPS_FILE_NAME))
        np.save(f, self.clusters)

    if not os.path.exists(self.CLUSTER_INFO_FOLDER_NAME):
        os.makedirs(self.CLUSTER_INFO_FOLDER_NAME)

    # Save the image file names of each cluster into a file for evaluation
    for c in range(self.clusters.shape[0]):
        cluster_indices = list(self.clusters[c])
        with open("{}/Cluster_{}.txt".format(self.CLUSTER_INFO_FOLDER_NAME, c), 'w') as f:
            for cluster_index in cluster_indices:
                f.write("{}\n".format(self.training_filenames[cluster_index]))
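# get_empty_clusterset_index is used above to drive the split loop but is not shown in this
# section. A minimal sketch of its expected behavior, inferred from the call sites: return the
# index of the first still-empty cluster set, or -1 once every set has been populated.
def get_empty_clusterset_index(self, clusters):
    for index in range(len(clusters)):
        if len(clusters[index]) == 0:
            return index
    return -1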
def calculate_within_cluster_variance(self):
    self.within_cluster_variance = [0.0] * len(self.clusters)
    for cluster_index in range(len(self.clusters)):
        print("Calculating Within-Cluster Variance for cluster: {}".format(cluster_index))
        cluster = list(self.clusters[cluster_index])

        # Calculate the mean pairwise distance within the cluster
        total_dist = []
        for i in range(0, len(cluster) - 1):
            for k in range(i + 1, len(cluster)):
                total_dist.append(self.proximity_matrix[cluster[i]][cluster[k]])
        within_dist_with = np.nanmean(total_dist)

        num_processes = config.MAX_AVAILABLE_CPU_CORES
        # Spread the work out across all the processes.
        # Each process handles roughly the same number of data instances, but chunks with higher
        # indices have fewer calculations to do than chunks with lower indices.
        range_indices = np.array(range(0, len(cluster)))
        divided_indices = np.array_split(range_indices, num_processes)

        largest_diff = 0.0
        # Execute the variance calculations across multiple processes
        with concurrent.futures.ProcessPoolExecutor(max_workers=config.MAX_AVAILABLE_CPU_CORES) as executor:
            futures = [executor.submit(within_cluster_variance_calculator_thread,
                                       index,
                                       divided_indices[index][0],
                                       divided_indices[index][-1],
                                       cluster,
                                       self.proximity_matrix,
                                       within_dist_with)
                       for index in range(num_processes)]
            # Collect the results as the processes complete and keep the largest deviation
            for f in concurrent.futures.as_completed(futures):
                result = f.result()
                if result > largest_diff:
                    largest_diff = result

        self.within_cluster_variance[cluster_index] = largest_diff
        print("Cluster {}'s within-cluster distance variance is {}".format(cluster_index, largest_diff))

    with open(self.CLUSTER_VARIANCE_FILE_NAME, 'wb') as f:
        config.PrintDebug("Saving Cluster Variance Into {}".format(self.CLUSTER_VARIANCE_FILE_NAME))
        np.save(f, self.within_cluster_variance)
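# within_cluster_variance_calculator_thread is referenced above but not shown in this section.
# A minimal sketch of its expected contract, inferred from the call site: scan the pairwise
# distances for cluster members start_index..end_index and return the largest absolute
# deviation from the mean within-cluster distance (the exact deviation measure is an
# assumption; substitute the project's actual implementation).
def within_cluster_variance_calculator_thread(index, start_index, end_index, cluster, proximity_matrix, mean_within_dist):
    largest_diff = 0.0
    for i in range(start_index, end_index + 1):
        for k in range(i + 1, len(cluster)):
            diff = abs(proximity_matrix[cluster[i]][cluster[k]] - mean_within_dist)
            if not np.isnan(diff) and diff > largest_diff:
                largest_diff = diff
    return largest_diff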
def _load_within_cluster_variance(self):
    with open(self.CLUSTER_VARIANCE_FILE_NAME, 'rb') as f:
        config.PrintDebug("Loading Cluster Variance From {}".format(self.CLUSTER_VARIANCE_FILE_NAME))
        self.within_cluster_variance = np.load(f, allow_pickle=True)
def _load_cluster_groups(self):
    with open(self.CLUSTER_GROUPS_FILE_NAME, 'rb') as f:
        config.PrintDebug("Loading Cluster Groups From {}".format(self.CLUSTER_GROUPS_FILE_NAME))
        self.clusters = np.load(f, allow_pickle=True)
def _merge_clusters(self, linkage_type):
    self.merge_history = []
    proxy_matrix_dupe = np.array(self.proximity_matrix, copy=True)  # create a duplicate of the initial proximity matrix
    current_largest_cluster_index = proxy_matrix_dupe.shape[0]  # rolling cluster index, each new cluster gets a new index
    row_indices = np.arange(current_largest_cluster_index)  # helper array to refer back to the original histogram

    # Create an array of arrays tracking which elements are in each cluster.
    # At the start, each cluster index only contains itself.
    self.cluster_history = []
    for i in range(current_largest_cluster_index):
        self.cluster_history.append([i])

    while proxy_matrix_dupe.shape[0] > 1:
        start_time = time.time()
        print("Merging Next Closest Clusters. {} Clusters are remaining".format(proxy_matrix_dupe.shape[0]))

        # 1. Find the next closest pair of clusters
        smallest_index = self._find_smallest_index(proxy_matrix_dupe)  # the row and column indices of the closest pair
        smallest_distance = proxy_matrix_dupe[smallest_index[0]][smallest_index[1]]  # the actual distance between the pair
        smallest_original_index = [row_indices[smallest_index[0]], row_indices[smallest_index[1]]]  # the original indices referencing the original training data position

        # 2. Get which elements are in both clusters being merged
        cluster_one = self.cluster_history[smallest_original_index[0]]
        cluster_two = self.cluster_history[smallest_original_index[1]]
        both_clusters = cluster_one + cluster_two
        self.cluster_history.append(both_clusters)  # keep track of all the elements in this new cluster

        # 3. Delete the data belonging to the clusters being merged
        proxy_matrix_dupe = np.delete(proxy_matrix_dupe, [smallest_index[0], smallest_index[1]], axis=0)  # delete rows
        proxy_matrix_dupe = np.delete(proxy_matrix_dupe, [smallest_index[0], smallest_index[1]], axis=1)  # delete cols
        # Delete the row indices as well so the found index keeps referencing the original index
        row_indices = np.delete(row_indices, [smallest_index[0], smallest_index[1]])

        # 4. Calculate the distances of this new cluster against all other clusters.
        # This has to happen after the deletion so we don't waste time.
        distances = self._calculate_subset_of_proximity_matrix(linkage_type, row_indices, both_clusters, self.cluster_history)

        # 5. Add the distances to the proximity matrix, both as a column and as a row
        proxy_matrix_dupe = np.vstack((proxy_matrix_dupe, distances))  # add a new row to the 2d matrix
        distances = np.append(distances, np.nan)  # append a NaN for the new cluster's distance to itself
        proxy_matrix_dupe = np.column_stack((proxy_matrix_dupe, distances))  # add a new column to the 2d matrix

        # 6. Keep tracking the index of this new cluster
        row_indices = np.append(row_indices, current_largest_cluster_index)
        current_largest_cluster_index += 1

        # 7. Add the result to merge_history.
        # The merge history for each merge is an array of size 4:
        # Index 0 and 1 reference the indices of the clusters that got merged
        # Index 2 is the distance between the merged clusters
        # Index 3 is the size of the new cluster, i.e. how many training data instances it contains
        cluster_size = len(both_clusters)
        res = [smallest_original_index[0], smallest_original_index[1], smallest_distance, cluster_size]
        self.merge_history.append(res)
        config.PrintDebug("Merging {} with {} completed in {} seconds. Distance: {}, Cluster Size: {}".format(smallest_original_index[0], smallest_original_index[1], (time.time() - start_time), smallest_distance, cluster_size))

    self.merge_history = np.array(self.merge_history)
    with open(self.MERGE_HISTORY_FILE_NAME, 'wb') as f:
        config.PrintDebug("Saving Merge History Into {}".format(self.MERGE_HISTORY_FILE_NAME))
        np.save(f, self.merge_history)
    with open(self.MERGE_CLUSTER_HISTORY_FILE_NAME, 'wb') as f:
        config.PrintDebug("Saving Merge Cluster History Into {}".format(self.MERGE_CLUSTER_HISTORY_FILE_NAME))
        pickle.dump(self.cluster_history, f)
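# _find_smallest_index is referenced in step 1 above but not shown in this section. A minimal
# sketch of its expected contract (NaN marks a cluster's distance to itself):
def _find_smallest_index(self, matrix):
    # Ignore NaNs and return the (row, col) of the smallest remaining distance
    return np.unravel_index(np.nanargmin(matrix), matrix.shape)

# Because each merge_history row is [cluster_a, cluster_b, distance, new_cluster_size] and new
# clusters are numbered upwards from the leaf count, the saved array follows the same row
# layout as SciPy's linkage matrix and can be visualized directly. A hypothetical helper,
# assuming scipy and matplotlib are available in the environment:
def plot_merge_dendrogram(merge_history_file_name):
    from scipy.cluster.hierarchy import dendrogram
    import matplotlib.pyplot as plt
    with open(merge_history_file_name, 'rb') as f:
        Z = np.load(f).astype(float)  # linkage rows: [idx_a, idx_b, distance, size]
    dendrogram(Z)
    plt.show()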
def _load_initial_proximity_matrix(self):
    with open(self.INITIAL_PROXY_MATRIX_FILE_NAME, 'rb') as f:
        config.PrintDebug("Loading Initial Proximity Matrix From {}".format(self.INITIAL_PROXY_MATRIX_FILE_NAME))
        self.proximity_matrix = np.load(f)