def relative_validity_hard_large_data(X):
    '''
    Runs kmeans for every number of clusters from 2 to 10, seeding it with
    centroids taken from a single BSAS run, and calculates the
    Davies-Bouldin index of each resulting clustering.

    Parameters:
    X((N x m) numpy array): a data set of N instances and m features

    Returns:
    no_of_clusters_list: the different values of the clusters number
    DB: the array holding the Davies-Bouldin value for each clusters number
    '''
    # Initialization
    no_of_clusters_list = list(range(2, 11))
    DB = np.zeros(len(no_of_clusters_list))

    # Centroids must remain the same. The only parameter that should change
    # is the number of clusters, so BSAS runs once, outside the loop.
    _, centroids_BSAS, _ = BSAS.basic_sequential_scheme(X)
    bsas_count = len(centroids_BSAS)

    # tqdm wraps the list itself (not the enumerate object) so that it knows
    # the length and can display a total and an ETA.
    for i, total_clusters in enumerate(tqdm(no_of_clusters_list)):
        if bsas_count < total_clusters:
            centroids = np.zeros((total_clusters, len(X[0])))
            # First centroids values
            centroids[:bsas_count, :] = centroids_BSAS
            # Last centroids values: data points picked at random
            # (np.random.randint samples with replacement)
            random_indices = np.random.randint(
                len(X), size=total_clusters - bsas_count)
            centroids[bsas_count:, :] = X[random_indices, :]
        elif bsas_count > total_clusters:
            centroids = centroids_BSAS[:total_clusters, :]
        else:
            centroids = centroids_BSAS

        X_, centroids, centroids_history = kmeans_clustering.kmeans(
            X, total_clusters, centroids_initial=centroids)

        DB[i] = Davies_Bouldin(X_, centroids)

    return no_of_clusters_list, DB
def testBlobs(self):
    '''
    Clusters a synthetic four-blob data set with kmeans (seeded by BSAS
    centroids), plots the outcome and runs the internal and external
    validity tests on it.
    '''
    no_of_clusters = 4

    # Synthetic data set together with its generating labels
    X, y = make_blobs(
        n_samples=500,
        centers=no_of_clusters,
        n_features=2,
        random_state=185)

    # A sequential pass comes first; only its centroids are used, as the
    # starting centroids of kmeans
    _, seed_centroids, _ = BSAS.basic_sequential_scheme(X)
    X, centroids, centroids_history = kmeans_clustering.kmeans(
        X, no_of_clusters, centroids_initial=seed_centroids)

    # Plotting
    plot_data(X, no_of_clusters, centroids, centroids_history)

    # Cluster validity through statistical tests
    internal_outcome = internal_criteria.internal_validity(
        X, no_of_clusters, kmeans_clustering.kmeans)
    initial_gamma, list_of_gammas, result = internal_outcome

    external_outcome = external_criteria.external_validity(
        X, no_of_clusters, y, kmeans_clustering.kmeans)
    initial_indices, list_of_indices, result_list = external_outcome

    # Histograms for the internal and the external criteria
    hist_internal_criteria(initial_gamma, list_of_gammas, result)
    hist_external_criteria(initial_indices, list_of_indices, result_list)

    plt.show()
def testImageSegmentation(self):
    '''
    Segments image 181091 with kmeans (seeded by BSAS centroids), merges
    the resulting clusters, prints the Rand index against the external
    segmentation file and draws the clustered image.
    '''
    # NOTE(review): if `ndimage` is scipy.ndimage, `imread` was removed in
    # SciPy 1.2 -- confirm the SciPy version this suite runs against.
    image = ndimage.imread('..//..//images//181091.jpg').astype(
        np.int32, copy=False)

    # Algorithm execution. BSAS runs first; only its centroids are used,
    # as initial estimates for kmeans
    number_of_clusters = 3
    _, seed_centroids, _ = BSAS.basic_sequential_scheme(image)
    X_, _, _ = kmeans_clustering.kmeans(
        image,
        no_of_clusters=number_of_clusters,
        centroids_initial=seed_centroids)

    # Merging procedure
    X_ = image_segm_utility.merging_procedure(X_, 500)

    # Rand index: similarity of the clustering to the external data
    original_image, seg_file = '181091.jpg', '181091.seg'
    external_info = image_segm_utility.insert_clusters(
        original_image, seg_file)
    rand_index = image_segm_utility.rand_index_calculation(X_, external_info)
    print(rand_index)

    # Draw the clustered image
    draw_clustered_image(X_, image.shape, rand_index)
    plt.show()
def testMoons(self):
    '''
    Clusters a noisy two-moons data set with BSAS (threshold 1), plots the
    outcome and runs the internal and external validity tests on it.
    '''
    # Synthetic data set together with its generating labels
    X, y = make_moons(
        n_samples=500,
        shuffle=True,
        noise=0.1,
        random_state=121)

    # Clustering; the number of clusters is whatever BSAS reports
    bsas_outcome = BSAS.basic_sequential_scheme(X, threshold=1)
    X, centroids, no_of_clusters = bsas_outcome

    # Plotting
    plot_data(X, no_of_clusters, centroids)

    # Cluster validity through statistical tests
    internal_outcome = internal_criteria.internal_validity(
        X, no_of_clusters, BSAS.basic_sequential_scheme)
    initial_gamma, list_of_gammas, result = internal_outcome

    external_outcome = external_criteria.external_validity(
        X, no_of_clusters, y, BSAS.basic_sequential_scheme)
    initial_indices, list_of_indices, result_list = external_outcome

    # Histograms for the internal and the external criteria
    hist_internal_criteria(initial_gamma, list_of_gammas, result)
    hist_external_criteria(initial_indices, list_of_indices, result_list)

    plt.show()
def testImageSegmentation(self):
    '''
    Segments image 181091 with fuzzy clustering into 28 clusters, merges
    the resulting clusters, prints the Rand index against the external
    segmentation file and draws the clustered image.
    '''
    # NOTE(review): if `ndimage` is scipy.ndimage, `imread` was removed in
    # SciPy 1.2 -- confirm the SciPy version this suite runs against.
    image = ndimage.imread('..//..//images//181091.jpg').astype(
        np.int32, copy=False)

    # Algorithm execution.
    clusters_number_to_execute = 28

    # NOTE(review): none of the BSAS outputs is read afterwards; the call is
    # kept as-is because its side effects (if any) are not visible here.
    _bsas_data, _bsas_centroids, _bsas_total = BSAS.basic_sequential_scheme(
        image)

    fuzzy_outcome = fuzzy_clustering.fuzzy(
        image, no_of_clusters=clusters_number_to_execute)
    X_, centroids, ita, centroids_history, partition_matrix = fuzzy_outcome

    # Merging procedure
    X_ = image_segm_utility.merging_procedure(X_, 500)

    # Rand index: similarity of the clustering to the external data
    original_image, seg_file = '181091.jpg', '181091.seg'
    external_info = image_segm_utility.insert_clusters(
        original_image, seg_file)
    rand_index = image_segm_utility.rand_index_calculation(X_, external_info)
    print(rand_index)

    # Draw the clustered image
    draw_clustered_image(X_, image.shape, rand_index)
    plt.show()
def relative_validity_hard_sequential(X):
    '''
    Defines the several values of the BSAS parameter. Then conducts
    successive executions of the algorithm by passing to it those values
    and calculates all the proper relative indices.

    Parameters:
    X((N x m) numpy array): a data set of N instances and m features

    Returns:
    no_of_threshold_values: the different values of the threshold parameter
    DI, DB, SI: the arrays holding the values of the relative indices
    '''
    # Initialization
    threshold, bins = BSAS.thresholding_BSAS(X)
    threshold_index = np.where(bins == threshold)[0][0]

    # Finds the threshold values against which to run the BSAS algorithm:
    # a window of bins around the position of the suggested threshold
    number_of_threshold_values = 10
    if threshold_index >= number_of_threshold_values:
        first = threshold_index - number_of_threshold_values
        last = min(threshold_index + number_of_threshold_values,
                   len(bins) - 1)
    else:
        first = 0
        # Clamped to len(bins): without it, a threshold located in the upper
        # half of a short `bins` array indexes past its end (IndexError)
        last = min(threshold_index + threshold_index, len(bins))
    no_of_threshold_values = [bins[i] for i in range(first, last)]

    DI = np.zeros(len(no_of_threshold_values))
    DB = np.zeros(len(no_of_threshold_values))
    SI = np.zeros(len(no_of_threshold_values))

    # tqdm wraps the list itself (not the enumerate object) so that it knows
    # the length and can display a total and an ETA.
    for i, threshold_value in enumerate(tqdm(no_of_threshold_values)):
        X_, centroids_BSAS, total_clusters_ = BSAS.basic_sequential_scheme(
            X, threshold=threshold_value)

        DI[i] = Dunn_index(X_)
        DB[i] = Davies_Bouldin(X_, centroids_BSAS)
        SI[i] = silhouette_index(X_)

    return no_of_threshold_values, DI, DB, SI
def relative_validity_hard(X):
    '''
    Defines the several values of the kmeans parameter. Then conducts
    successive executions of the algorithm by passing to it those values
    and calculates all the proper relative indices.

    Parameters:
    X((N x m) numpy array): a data set of N instances and m features

    Returns:
    no_of_clusters_list: the different values of the clusters number
    DI, DB, SI, GI: the arrays holding the values of the relative indices
    '''
    # Initialization
    no_of_clusters_list = list(range(2, 11))
    DI = np.zeros(len(no_of_clusters_list))
    DB = np.zeros(len(no_of_clusters_list))
    SI = np.zeros(len(no_of_clusters_list))
    GI = np.zeros(len(no_of_clusters_list))

    # Centroids must remain the same. The only parameter that should change
    # is the number of clusters, so BSAS runs once, outside the loop.
    _, centroids_BSAS, _ = BSAS.basic_sequential_scheme(X)
    bsas_count = len(centroids_BSAS)

    # tqdm wraps the list itself (not the enumerate object) so that it knows
    # the length and can display a total and an ETA.
    for i, total_clusters in enumerate(tqdm(no_of_clusters_list)):
        if bsas_count < total_clusters:
            centroids = np.zeros((total_clusters, len(X[0])))
            # First centroids values
            centroids[:bsas_count, :] = centroids_BSAS
            # Last centroids values: data points picked at random
            # (np.random.randint samples with replacement)
            random_indices = np.random.randint(
                len(X), size=total_clusters - bsas_count)
            centroids[bsas_count:, :] = X[random_indices, :]
        elif bsas_count > total_clusters:
            centroids = centroids_BSAS[:total_clusters, :]
        else:
            centroids = centroids_BSAS

        X_, centroids, centroids_history = kmeans_clustering.kmeans(
            X, total_clusters, centroids_initial=centroids)

        DI[i] = Dunn_index(X_)
        DB[i] = Davies_Bouldin(X_, centroids)
        SI[i] = silhouette_index(X_)
        GI[i] = gap_index(X_, total_clusters, kmeans_clustering.kmeans)

    return no_of_clusters_list, DI, DB, SI, GI
def relative_validity_hard(X, no_of_clusters):
    '''
    Runs kmeans for every number of clusters from 2 to 10, seeding it with
    centroids taken from a single BSAS run, and calculates the relative
    indices of each resulting clustering. The clustering whose number of
    clusters equals no_of_clusters is also plotted.

    Parameters:
    X((N x m) numpy array): a data set of N instances and m features
    no_of_clusters(integer): the clusters number whose clustering is plotted

    Returns:
    no_of_clusters_list: the different values of the clusters number
    DI, DB, SI, GI: the arrays holding the values of the relative indices
    '''
    # Initialization
    no_of_clusters_list = list(range(2, 11))
    DI = np.zeros(len(no_of_clusters_list))
    DB = np.zeros(len(no_of_clusters_list))
    SI = np.zeros(len(no_of_clusters_list))
    GI = np.zeros(len(no_of_clusters_list))

    # Centroids must remain the same. The only parameter that should change
    # is the number of clusters, so BSAS runs once, outside the loop.
    _, centroids_BSAS, _ = BSAS.basic_sequential_scheme(X)
    bsas_count = len(centroids_BSAS)

    # tqdm wraps the list itself (not the enumerate object) so that it knows
    # the length and can display a total and an ETA.
    for i, total_clusters in enumerate(tqdm(no_of_clusters_list)):
        if bsas_count < total_clusters:
            centroids = np.zeros((total_clusters, len(X[0])))
            # First centroids values
            centroids[:bsas_count, :] = centroids_BSAS
            # Last centroids values: data points picked at random
            # (np.random.randint samples with replacement)
            random_indices = np.random.randint(
                len(X), size=total_clusters - bsas_count)
            centroids[bsas_count:, :] = X[random_indices, :]
        elif bsas_count > total_clusters:
            # Truncate to the number of clusters of THIS iteration, so that
            # kmeans receives exactly total_clusters initial centroids
            # (slicing by no_of_clusters handed it the wrong count)
            centroids = centroids_BSAS[:total_clusters, :]
        else:
            centroids = centroids_BSAS

        X_, centroids, centroids_history = kmeans_clustering.kmeans(
            X, total_clusters, centroids_initial=centroids)

        DI[i] = Dunn_index(X_)
        DB[i] = Davies_Bouldin(X_, centroids)
        SI[i] = silhouette_index(X_)
        GI[i] = gap_index(X_, total_clusters)

        # Print just one clustering effort, the correct one in order to
        # compare it with the indices' signals
        if total_clusters == no_of_clusters:
            plot_data(X_, centroids, total_clusters, centroids_history)

    return no_of_clusters_list, DI, DB, SI, GI
def testImageSegmentation(self):
    '''
    Segments image 113044 with BSAS (at most 1000 clusters, threshold 185),
    prints the Rand index against the external segmentation file and draws
    the clustered image.
    '''
    # NOTE(review): if `ndimage` is scipy.ndimage, `imread` was removed in
    # SciPy 1.2 -- confirm the SciPy version this suite runs against.
    image = ndimage.imread('..//..//images//113044.jpg').astype(
        np.int32, copy=False)

    # Algorithm execution: BSAS alone produces the clustering here
    bsas_outcome = BSAS.basic_sequential_scheme(
        image, max_number_of_clusters=1000, threshold=185)
    X_, centroids, total_clusters = bsas_outcome

    # Rand index: similarity of the clustering to the external data
    original_image, seg_file = '113044.jpg', '113044.seg'
    external_info = image_segm_utility.insert_clusters(
        original_image, seg_file)
    rand_index = image_segm_utility.rand_index_calculation(X_, external_info)
    print(rand_index)

    # Draw the clustered image
    draw_clustered_image(X_, image.shape, total_clusters, rand_index)
    plt.show()
def testBlobs(self):
    '''
    Clusters a synthetic six-blob data set with BSAS, plots the outcome and
    runs the internal and external validity tests on it.
    '''
    generated_blobs = 6

    # Synthetic data set together with its generating labels
    X, y = make_blobs(
        n_samples=500,
        centers=generated_blobs,
        n_features=2,
        random_state=50)

    # Clustering; from here on the number of clusters is the one BSAS
    # reports, not the number of generated blobs
    bsas_outcome = BSAS.basic_sequential_scheme(X)
    X, centroids, no_of_clusters = bsas_outcome

    # Plotting
    plot_data(X, centroids, no_of_clusters)

    # Cluster validity through statistical tests
    internal_outcome = internal_criteria.internal_validity(
        X, no_of_clusters, BSAS.basic_sequential_scheme)
    initial_gamma, list_of_gammas, result = internal_outcome

    external_outcome = external_criteria.external_validity(
        X, no_of_clusters, y, BSAS.basic_sequential_scheme)
    initial_indices, list_of_indices, result_list = external_outcome

    # Histograms for the internal and the external criteria
    hist_internal_criteria(initial_gamma, list_of_gammas, result)
    hist_external_criteria(initial_indices, list_of_indices, result_list)

    plt.show()