예제 #1
0
def relative_validity_hard_large_data(X):
    """Evaluate k-means on X for 2 to 10 clusters using Davies_Bouldin only.

    BSAS is run once on X and its centroids seed every k-means run. When BSAS
    yields fewer centroids than requested, the missing seeds are rows of X
    drawn at random; when it yields more, only the first ones are used.

    Parameters:
        X: the data set; it is indexed as X[rows, :] and len(X[0]) is taken as
            the number of features, so an (N x m) numpy array is assumed

    Returns:
        no_of_clusters_list: the cluster counts that were tried (2 to 10)
        DB: numpy array holding the Davies_Bouldin value for each cluster count
    """
    # Initialization
    no_of_clusters_list = list(range(2, 11))
    DB = np.zeros(len(no_of_clusters_list))

    # Centroids must remain the same. The only parameter that should change
    # is the number of clusters, so BSAS is executed a single time up front.
    _, centroids_BSAS, _ = BSAS.basic_sequential_scheme(X)
    seeds_available = len(centroids_BSAS)

    # enumerate(tqdm(...)) instead of tqdm(enumerate(...)): enumerate hides
    # the length of the list, which leaves the progress bar without a total.
    for i, total_clusters in enumerate(tqdm(no_of_clusters_list)):

        if seeds_available < total_clusters:
            centroids = np.zeros((total_clusters, len(X[0])))
            # First centroids come from BSAS
            centroids[:seeds_available, :] = centroids_BSAS
            # Remaining centroids are randomly chosen rows of X
            random_indices = np.random.randint(
                len(X), size=total_clusters - seeds_available)
            centroids[seeds_available:, :] = X[random_indices, :]
        elif seeds_available > total_clusters:
            centroids = centroids_BSAS[:total_clusters, :]
        else:
            centroids = centroids_BSAS

        X_, centroids, _ = kmeans_clustering.kmeans(
            X, total_clusters, centroids_initial=centroids)

        DB[i] = Davies_Bouldin(X_, centroids)

    return no_of_clusters_list, DB
    def testBlobs(self):
        """Cluster four synthetic blobs with BSAS-seeded k-means and inspect validity.

        The clustering is plotted, then the internal and external validity
        procedures are run with k-means as the clustering routine and their
        results are drawn as histograms.
        """
        no_of_clusters = 4

        # Synthetic data set: 500 samples, 2 features, one blob per cluster
        samples, labels = make_blobs(random_state=185,
                                     n_features=2,
                                     centers=no_of_clusters,
                                     n_samples=500)

        # A sequential pass provides the starting centroids for k-means
        bsas_output = BSAS.basic_sequential_scheme(samples)
        clustered_data, seeds, total_clusters = bsas_output
        kmeans_output = kmeans_clustering.kmeans(samples,
                                                 no_of_clusters,
                                                 centroids_initial=seeds)
        samples, final_centroids, history = kmeans_output

        # Draw the clustering together with the path of the centroids
        plot_data(samples, no_of_clusters, final_centroids, history)

        # Statistical tests of cluster validity
        gamma_0, gammas, internal_result = internal_criteria.internal_validity(
            samples, no_of_clusters, kmeans_clustering.kmeans)
        indices_0, indices, external_results = external_criteria.external_validity(
            samples, no_of_clusters, labels, kmeans_clustering.kmeans)

        # Histograms for the internal and the external criteria
        hist_internal_criteria(gamma_0, gammas, internal_result)
        hist_external_criteria(indices_0, indices, external_results)

        plt.show()
    def testImageSegmentation(self):
        """Segment image 181091.jpg with BSAS-seeded k-means and print the Rand index.

        BSAS is run on the image to obtain initial centroids, k-means is run
        with 3 clusters, the result goes through the merging procedure, and
        the outcome is compared against the segmentation in 181091.seg.
        """
        # NOTE(review): assuming `ndimage` was scipy.ndimage -- its imread was
        # removed in SciPy 1.2 and the deprecation notice names
        # matplotlib.pyplot.imread as the replacement.
        image = plt.imread('..//..//images//181091.jpg')
        image = image.astype(np.int32, copy=False)

        # Algorithm execution. We run BSAS first to get estimates for the centroids
        number_of_clusters = 3
        _, centroids, _ = BSAS.basic_sequential_scheme(image)
        X_, centroids, centroids_history = kmeans_clustering.kmeans(
            image,
            no_of_clusters=number_of_clusters,
            centroids_initial=centroids)

        ###################################################################
        # Merging procedure

        X_ = image_segm_utility.merging_procedure(X_, 500)

        # Calculate the Rand Index to test similarity to external data
        original_image = '181091.jpg'
        seg_file = '181091.seg'
        external_info = image_segm_utility.insert_clusters(
            original_image, seg_file)
        rand_index = image_segm_utility.rand_index_calculation(
            X_, external_info)
        print(rand_index)

        # Draw the clustered image
        draw_clustered_image(X_, image.shape, rand_index)
        plt.show()
예제 #4
0
    def testMoons(self):
        """Cluster a noisy two-moons data set with BSAS and inspect validity.

        BSAS runs with threshold=1; its clustering is plotted and then fed to
        the internal and external validity procedures, whose results are
        drawn as histograms.
        """
        # Data set: 500 shuffled samples with noise 0.1
        samples, labels = make_moons(random_state=121,
                                     noise=0.1,
                                     shuffle=True,
                                     n_samples=500)

        # BSAS decides the number of clusters on its own
        bsas_output = BSAS.basic_sequential_scheme(samples, threshold=1)
        samples, centroids, no_of_clusters = bsas_output

        # Draw the clustering
        plot_data(samples, no_of_clusters, centroids)

        # Statistical tests of cluster validity
        gamma_0, gammas, internal_result = internal_criteria.internal_validity(
            samples, no_of_clusters, BSAS.basic_sequential_scheme)
        indices_0, indices, external_results = external_criteria.external_validity(
            samples, no_of_clusters, labels, BSAS.basic_sequential_scheme)

        # Histograms for the internal and the external criteria
        hist_internal_criteria(gamma_0, gammas, internal_result)
        hist_external_criteria(indices_0, indices, external_results)

        plt.show()
예제 #5
0
    def testImageSegmentation(self):
        """Segment image 181091.jpg with fuzzy clustering and print the Rand index.

        Fuzzy clustering is run with 28 clusters, the result goes through the
        merging procedure, and the outcome is compared against the
        segmentation in 181091.seg.
        """
        # NOTE(review): assuming `ndimage` was scipy.ndimage -- its imread was
        # removed in SciPy 1.2 and the deprecation notice names
        # matplotlib.pyplot.imread as the replacement.
        image = plt.imread('..//..//images//181091.jpg')
        image = image.astype(np.int32, copy=False)

        # Algorithm execution.
        clusters_number_to_execute = 28
        # NOTE(review): nothing returned by this BSAS run is used below (the
        # fuzzy call takes no initial centroids). The call is kept only in
        # case it has side effects -- confirm and remove if it has none.
        BSAS.basic_sequential_scheme(image)
        X_, centroids, ita, centroids_history, partition_matrix = fuzzy_clustering.fuzzy(
            image, no_of_clusters=clusters_number_to_execute)

        ###################################################################
        # Merging procedure

        X_ = image_segm_utility.merging_procedure(X_, 500)

        # Calculate the Rand Index to test similarity to external data
        original_image = '181091.jpg'
        seg_file = '181091.seg'
        external_info = image_segm_utility.insert_clusters(
            original_image, seg_file)
        rand_index = image_segm_utility.rand_index_calculation(
            X_, external_info)
        print(rand_index)

        # Draw the clustered image
        draw_clustered_image(X_, image.shape, rand_index)
        plt.show()
예제 #6
0
def relative_validity_hard_sequential(X):
    """Defines the several values of the BSAS parameter. Then conducts successive executions of the algorithm by passing to it
    those values and calculates all the proper relative indices.

    The candidate thresholds are a window of the bins returned by
    BSAS.thresholding_BSAS, taken around the position of the threshold that
    the same call suggests.

    Parameters:
        X((N x m) numpy array): a data set of N instances and m features

    Returns:
        no_of_threshold_values: the different values of the threshold parameter
        DI, DB, SI: the arrays holding the values of the relative indices
    """
    # Initialization
    threshold, bins = BSAS.thresholding_BSAS(X)
    threshold_index = np.where(bins == threshold)[0][0]

    # Finds the threshold values against which to run the BSAS algorithm
    number_of_threshold_values = 10
    if threshold_index >= number_of_threshold_values:
        window_start = threshold_index - number_of_threshold_values
        window_stop = min(threshold_index + number_of_threshold_values,
                          len(bins) - 1)
    else:
        # Too close to the start for a full window: take a window of the same
        # width on both sides of the threshold. The stop is clamped so that a
        # short `bins` cannot be indexed past its end.
        window_start = 0
        window_stop = min(threshold_index + threshold_index, len(bins))
    no_of_threshold_values = [
        bins[i] for i in range(window_start, window_stop)
    ]

    DI = np.zeros(len(no_of_threshold_values))
    DB = np.zeros(len(no_of_threshold_values))
    SI = np.zeros(len(no_of_threshold_values))

    # enumerate(tqdm(...)) instead of tqdm(enumerate(...)): enumerate hides
    # the length of the list, which leaves the progress bar without a total.
    for i, threshold_value in enumerate(tqdm(no_of_threshold_values)):

        X_, centroids_BSAS, _ = BSAS.basic_sequential_scheme(
            X, threshold=threshold_value)

        DI[i] = Dunn_index(X_)
        DB[i] = Davies_Bouldin(X_, centroids_BSAS)
        SI[i] = silhouette_index(X_)

    return no_of_threshold_values, DI, DB, SI
예제 #7
0
def relative_validity_hard(X):
    """Defines the several values of the kmeans parameter. Then conducts successive executions of the algorithm by passing to it
    those values and calculates all the proper relative indices.

    BSAS is run once on X and its centroids seed every k-means run. When BSAS
    yields fewer centroids than requested, the missing seeds are rows of X
    drawn at random; when it yields more, only the first ones are used.

    Parameters:
        X((N x m) numpy array): a data set of N instances and m features

    Returns:
        no_of_clusters_list: the different values of the clusters number
        DI, DB, SI, GI: the arrays holding the values of the relative indices
    """
    # Initialization
    no_of_clusters_list = list(range(2, 11))

    DI = np.zeros(len(no_of_clusters_list))
    DB = np.zeros(len(no_of_clusters_list))
    SI = np.zeros(len(no_of_clusters_list))
    GI = np.zeros(len(no_of_clusters_list))

    # Centroids must remain the same. The only parameter that should change
    # is the number of clusters, so BSAS is executed a single time up front.
    _, centroids_BSAS, _ = BSAS.basic_sequential_scheme(X)
    seeds_available = len(centroids_BSAS)

    # enumerate(tqdm(...)) instead of tqdm(enumerate(...)): enumerate hides
    # the length of the list, which leaves the progress bar without a total.
    for i, total_clusters in enumerate(tqdm(no_of_clusters_list)):

        if seeds_available < total_clusters:
            centroids = np.zeros((total_clusters, len(X[0])))
            # First centroids come from BSAS
            centroids[:seeds_available, :] = centroids_BSAS
            # Remaining centroids are randomly chosen rows of X
            random_indices = np.random.randint(
                len(X), size=total_clusters - seeds_available)
            centroids[seeds_available:, :] = X[random_indices, :]
        elif seeds_available > total_clusters:
            centroids = centroids_BSAS[:total_clusters, :]
        else:
            centroids = centroids_BSAS

        X_, centroids, _ = kmeans_clustering.kmeans(
            X, total_clusters, centroids_initial=centroids)

        DI[i] = Dunn_index(X_)
        DB[i] = Davies_Bouldin(X_, centroids)
        SI[i] = silhouette_index(X_)
        GI[i] = gap_index(X_, total_clusters, kmeans_clustering.kmeans)

    return no_of_clusters_list, DI, DB, SI, GI
예제 #8
0
def relative_validity_hard(X, no_of_clusters):
    """Run k-means on X for 2 to 10 clusters and compute the relative indices.

    BSAS is run once on X and its centroids seed every k-means run. When BSAS
    yields fewer centroids than requested, the missing seeds are rows of X
    drawn at random; when it yields more, only the first ones are used. The
    clustering obtained for `no_of_clusters` clusters is also plotted.

    Parameters:
        X((N x m) numpy array): a data set of N instances and m features
        no_of_clusters: the cluster count whose clustering is plotted

    Returns:
        no_of_clusters_list: the different values of the clusters number
        DI, DB, SI, GI: the arrays holding the values of the relative indices
    """
    # Initialization
    no_of_clusters_list = list(range(2, 11))

    DI = np.zeros(len(no_of_clusters_list))
    DB = np.zeros(len(no_of_clusters_list))
    SI = np.zeros(len(no_of_clusters_list))
    GI = np.zeros(len(no_of_clusters_list))

    # Centroids must remain the same. The only parameter that should change
    # is the number of clusters, so BSAS is executed a single time up front.
    _, centroids_BSAS, _ = BSAS.basic_sequential_scheme(X)
    seeds_available = len(centroids_BSAS)

    # enumerate(tqdm(...)) instead of tqdm(enumerate(...)): enumerate hides
    # the length of the list, which leaves the progress bar without a total.
    for i, total_clusters in enumerate(tqdm(no_of_clusters_list)):

        if seeds_available < total_clusters:
            centroids = np.zeros((total_clusters, len(X[0])))
            # First centroids come from BSAS
            centroids[:seeds_available, :] = centroids_BSAS
            # Remaining centroids are randomly chosen rows of X
            random_indices = np.random.randint(
                len(X), size=total_clusters - seeds_available)
            centroids[seeds_available:, :] = X[random_indices, :]
        elif seeds_available > total_clusters:
            # Truncate to the count used by THIS run (total_clusters), not to
            # the caller's no_of_clusters: k-means below is asked for
            # total_clusters clusters and needs that many seeds.
            centroids = centroids_BSAS[:total_clusters, :]
        else:
            centroids = centroids_BSAS

        X_, centroids, centroids_history = kmeans_clustering.kmeans(
            X, total_clusters, centroids_initial=centroids)

        DI[i] = Dunn_index(X_)
        DB[i] = Davies_Bouldin(X_, centroids)
        SI[i] = silhouette_index(X_)
        GI[i] = gap_index(X_, total_clusters)

        # Print just one clustering effort, the correct one in order to compare it with the indices' signals
        if total_clusters == no_of_clusters:
            plot_data(X_, centroids, total_clusters, centroids_history)

    return no_of_clusters_list, DI, DB, SI, GI
예제 #9
0
    def testImageSegmentation(self):
        """Segment image 113044.jpg with BSAS and print the Rand index.

        BSAS is run with threshold=185 and at most 1000 clusters; the outcome
        is compared against the segmentation in 113044.seg and then drawn.
        """
        # NOTE(review): assuming `ndimage` was scipy.ndimage -- its imread was
        # removed in SciPy 1.2 and the deprecation notice names
        # matplotlib.pyplot.imread as the replacement.
        image = plt.imread('..//..//images//113044.jpg')
        image = image.astype(np.int32, copy=False)

        # Algorithm execution
        X_, centroids, total_clusters = BSAS.basic_sequential_scheme(
            image, max_number_of_clusters=1000, threshold=185)

        # Calculate the Rand Index to test similarity to external data
        original_image = '113044.jpg'
        seg_file = '113044.seg'
        external_info = image_segm_utility.insert_clusters(
            original_image, seg_file)
        rand_index = image_segm_utility.rand_index_calculation(
            X_, external_info)
        print(rand_index)

        # Draw the clustered image
        draw_clustered_image(X_, image.shape, total_clusters, rand_index)
        plt.show()
예제 #10
0
    def testBlobs(self):
        """Cluster six synthetic blobs with BSAS and inspect validity.

        The clustering is plotted, then the internal and external validity
        procedures are run with BSAS as the clustering routine and their
        results are drawn as histograms.
        """
        no_of_clusters = 6

        # Synthetic data set: 500 samples, 2 features, one blob per cluster
        samples, labels = make_blobs(random_state=50,
                                     n_features=2,
                                     centers=no_of_clusters,
                                     n_samples=500)

        # From here on the cluster count is the one reported by BSAS
        bsas_output = BSAS.basic_sequential_scheme(samples)
        samples, centroids, no_of_clusters = bsas_output

        # Draw the clustering
        plot_data(samples, centroids, no_of_clusters)

        # Statistical tests of cluster validity
        gamma_0, gammas, internal_result = internal_criteria.internal_validity(
            samples, no_of_clusters, BSAS.basic_sequential_scheme)
        indices_0, indices, external_results = external_criteria.external_validity(
            samples, no_of_clusters, labels, BSAS.basic_sequential_scheme)

        # Histograms for the internal and the external criteria
        hist_internal_criteria(gamma_0, gammas, internal_result)
        hist_external_criteria(indices_0, indices, external_results)

        plt.show()