示例#1
0
def find_best_distance_epsilon(dbscan_xs_points_k_distance_method, threshold_majority):
    """Pick a DBSCAN epsilon from the cumulative frequencies of the k-distances.

    The distinct distances are sorted, the cumulative relative frequency of
    each distinct distance is computed, and ``compute_arg_majority`` is asked
    for the index (into the sorted distinct distances) of the value to return.

    Args:
        dbscan_xs_points_k_distance_method: Sequence of k-distances, one per
            data point. The values must be hashable (they are counted).
        threshold_majority: Threshold forwarded to ``compute_arg_majority``.

    Returns:
        The distinct distance selected by ``compute_arg_majority``.
    """

    num_points_dbscan_xs_data = len(dbscan_xs_points_k_distance_method)

    dbscan_xs_points_k_distance_method_unique = array_sort(array_unique(dbscan_xs_points_k_distance_method))

    distances_occurrences = collection_counter(dbscan_xs_points_k_distance_method)

    # Read the counts in the same (sorted) order as the distinct distances.
    # Iterating the counter's values directly yields first-occurrence order,
    # which only matches the sorted distances when the input is pre-sorted.
    sorted_distances_occurrences = array_matrix(
        [distances_occurrences[distance] for distance in dbscan_xs_points_k_distance_method_unique])

    accumulative_probabilities = array_matrix(
        list(accumulate_sum(sorted_distances_occurrences / num_points_dbscan_xs_data)))

    final_distance_epsilon = dbscan_xs_points_k_distance_method_unique[
        compute_arg_majority(accumulative_probabilities, threshold_majority)]

    return final_distance_epsilon
示例#2
0
def dbscan_final_clustering(xs_features_data, ys_labels_true, best_epsilon_value, num_closest_k_neighbors = 5):
    """Run the final DBSCAN clustering, then plot, score and report it.

    The number of clusters is taken as ``nan_max(labels) + 1``. The
    centroids/radii plot and the HTML report are always produced. The
    performance metrics and the confusion-matrix heatmap are only produced
    when at least two clusters were found, and the silhouette analysis only
    when there are between 2 and 26 clusters.

    Args:
        xs_features_data: Feature matrix handed to ``dbscan_clustering_method``.
        ys_labels_true: Ground-truth labels used for the external metrics.
        best_epsilon_value: Epsilon forwarded to DBSCAN and to the plots.
        num_closest_k_neighbors: Forwarded to ``dbscan_clustering_method``.

    Returns:
        A 7-tuple ``(silhouette, precision, recall, rand_index, f1,
        adjusted_rand, confusion_matrix_rand_index)``. Every entry is
        ``None`` when fewer than two clusters were found.
    """

    # Default the metrics so the return below is well-defined even when the
    # scoring branch is skipped (it used to fail with UnboundLocalError).
    (final_silhouette, final_precision, final_recall, final_rand_index,
     final_f1, final_adjusted_rand, final_confusion_matrix) = (None,) * 7

    # Only the labels and the core points are needed here; the remaining
    # outputs of the clustering helper are discarded.
    ys_labels_predicted, _, clusters_centroids_points, _, _, _ = dbscan_clustering_method(
        xs_features_data,
        best_epsilon_value,
        num_closest_k_neighbors = num_closest_k_neighbors)

    num_clusters_centroids = ( nan_max(ys_labels_predicted) + 1 )

    plot_clusters_centroids_and_radii(
        "DBSCAN",
        xs_features_data,
        ys_labels_predicted,
        clusters_centroids_points,
        num_clusters = num_clusters_centroids,
        epsilon = best_epsilon_value,
        damping = None,
        final_clustering = True)

    if num_clusters_centroids >= 2:

        if num_clusters_centroids <= 26:

            plot_silhouette_analysis(
                "DBSCAN",
                xs_features_data,
                ys_labels_predicted,
                clusters_centroids_points,
                num_clusters_centroids,
                epsilon = best_epsilon_value,
                damping = None,
                final_clustering = True)

        # The algorithm label is "DBSCAN" (it was "K-Means", a copy-paste slip
        # that mislabelled these metrics).
        (final_silhouette, final_precision, final_recall, final_rand_index,
         final_f1, final_adjusted_rand, final_confusion_matrix) = compute_clustering_performance_metrics(
            "DBSCAN",
            xs_features_data,
            ys_labels_true,
            ys_labels_predicted,
            num_clusters_centroids,
            final_clustering = True)

        plot_confusion_matrix_rand_index_clustering_heatmap(
            "DBSCAN",
            final_confusion_matrix,
            num_clusters_centroids,
            epsilon = best_epsilon_value,
            damping = None,
            final_clustering = True)

    xs_ids_examples = list(range(0, len(ys_labels_predicted)))

    html_report_cluster_labels(array_matrix(xs_ids_examples), ys_labels_predicted, "dbscan.html")

    return (final_silhouette, final_precision, final_recall, final_rand_index,
            final_f1, final_adjusted_rand, final_confusion_matrix)
示例#3
0
def k_means_final_clustering(xs_features_data, ys_labels_true, num_clusters=4):
    """Run the final K-Means clustering, then plot, score and report it.

    The centroids/radii plot and the HTML report are always produced. The
    performance metrics and the confusion-matrix heatmap are only produced
    when ``num_clusters >= 2``, and the silhouette analysis only when
    ``2 <= num_clusters <= 26``.

    Args:
        xs_features_data: Feature matrix handed to ``k_means_clustering_method``.
        ys_labels_true: Ground-truth labels used for the external metrics.
        num_clusters: Number of clusters requested from K-Means.

    Returns:
        An 8-tuple ``(clustering_error, silhouette, precision, recall,
        rand_index, f1, adjusted_rand, confusion_matrix_rand_index)``. The
        seven metric entries are ``None`` when ``num_clusters < 2``.
    """

    # Default the metrics so the return below is well-defined even when the
    # scoring branch is skipped (it used to fail with UnboundLocalError).
    (final_silhouette, final_precision, final_recall, final_rand_index,
     final_f1, final_adjusted_rand, final_confusion_matrix) = (None,) * 7

    ys_labels_predicted, k_means_estimator_centroids, k_means_final_clustering_error = k_means_clustering_method(
        xs_features_data, num_clusters=num_clusters, max_iterations=300)

    plot_clusters_centroids_and_radii("K-Means",
                                      xs_features_data,
                                      ys_labels_predicted,
                                      k_means_estimator_centroids,
                                      num_clusters=num_clusters,
                                      epsilon=None,
                                      damping=None,
                                      final_clustering=True)

    if num_clusters >= 2:

        if num_clusters <= 26:

            plot_silhouette_analysis("K-Means",
                                     xs_features_data,
                                     ys_labels_predicted,
                                     k_means_estimator_centroids,
                                     num_clusters,
                                     epsilon=None,
                                     damping=None,
                                     final_clustering=True)

        (final_silhouette, final_precision, final_recall, final_rand_index,
         final_f1, final_adjusted_rand, final_confusion_matrix) = compute_clustering_performance_metrics(
            "K-Means",
            xs_features_data,
            ys_labels_true,
            ys_labels_predicted,
            num_clusters,
            final_clustering=True)

        plot_confusion_matrix_rand_index_clustering_heatmap(
            "K-Means",
            final_confusion_matrix,
            num_clusters,
            epsilon=None,
            damping=None,
            final_clustering=True)

    xs_ids_examples = list(range(0, len(xs_features_data)))

    html_report_cluster_labels(array_matrix(xs_ids_examples),
                               ys_labels_predicted, "k-means.html")

    return (k_means_final_clustering_error, final_silhouette, final_precision,
            final_recall, final_rand_index, final_f1, final_adjusted_rand,
            final_confusion_matrix)
示例#4
0
def affinity_propagation_final_clustering(xs_features_data, ys_labels_true, best_damping_value = 0.5):
    """Run the final Affinity Propagation clustering, then score and report it.

    The number of clusters is taken as ``nan_max(labels) + 1``. The HTML
    report is always produced; the performance metrics and the
    confusion-matrix heatmap are only produced when at least two clusters
    were found.

    Args:
        xs_features_data: Feature matrix handed to
            ``affinity_propagation_clustering_method``.
        ys_labels_true: Ground-truth labels used for the external metrics.
        best_damping_value: Damping forwarded to the clustering method and to
            the heatmap plot.

    Returns:
        A 7-tuple ``(silhouette, precision, recall, rand_index, f1,
        adjusted_rand, confusion_matrix_rand_index)``. Every entry is
        ``None`` when fewer than two clusters were found.
    """

    # Default the metrics so the return below is well-defined even when the
    # scoring branch is skipped (it used to fail with UnboundLocalError).
    (final_silhouette, final_precision, final_recall, final_rand_index,
     final_f1, final_adjusted_rand, final_confusion_matrix) = (None,) * 7

    # The exemplar indices/points returned by the helper are not used here.
    ys_labels_predicted, _, _ = affinity_propagation_clustering_method(
        xs_features_data,
        damping_value = best_damping_value,
        max_iterations = 300)

    num_clusters_centroids = ( nan_max(ys_labels_predicted) + 1 )

    if num_clusters_centroids >= 2:

        (final_silhouette, final_precision, final_recall, final_rand_index,
         final_f1, final_adjusted_rand, final_confusion_matrix) = compute_clustering_performance_metrics(
            "Affinity-Propagation",
            xs_features_data,
            ys_labels_true,
            ys_labels_predicted,
            num_clusters_centroids,
            final_clustering = True)

        plot_confusion_matrix_rand_index_clustering_heatmap(
            "Affinity-Propagation",
            final_confusion_matrix,
            num_clusters_centroids,
            epsilon = None,
            damping = best_damping_value,
            final_clustering = True)

    xs_ids_examples = list(range(0, len(ys_labels_predicted)))

    html_report_cluster_labels(array_matrix(xs_ids_examples), ys_labels_predicted, "affinity-propagation.html")

    return (final_silhouette, final_precision, final_recall, final_rand_index,
            final_f1, final_adjusted_rand, final_confusion_matrix)
示例#5
0
def dbscan_clustering_method(xs_features_data, current_epsilon, num_closest_k_neighbors = 5):
    """Fit DBSCAN on the data and split the points into core/border/noise.

    Args:
        xs_features_data: 2-D feature array; it is indexed with boolean masks
            and with ``[indices, :]``, so it must support NumPy-style indexing.
        current_epsilon: Passed to the estimator as ``eps``.
        num_closest_k_neighbors: Passed to the estimator as ``min_samples``.

    Returns:
        A 6-tuple ``(ys_labels_predicted, clusters_centroids_indices,
        clusters_centroids_points, clusters_border_points,
        xs_features_data_inliers, xs_features_data_outliers)`` where:

        * ``ys_labels_predicted`` is the estimator's ``labels_`` (``-1`` is
          treated as noise below),
        * ``clusters_centroids_indices`` / ``clusters_centroids_points`` are
          the estimator's ``core_sample_indices_`` and the matching rows,
        * ``clusters_border_points`` are the rows that belong to a cluster
          but are not core samples,
        * ``xs_features_data_inliers`` / ``xs_features_data_outliers`` are
          the rows with label ``!= -1`` / ``== -1``.
    """

    # NOTE(review): `dbscan` is presumably scikit-learn's DBSCAN (it exposes
    # `labels_` and `core_sample_indices_`) - confirm against the imports.
    dbscan_clustering = dbscan(eps = current_epsilon, min_samples = num_closest_k_neighbors)

    dbscan_clustering.fit(xs_features_data)

    ys_labels_predicted = dbscan_clustering.labels_

    clusters_centroids_indices = dbscan_clustering.core_sample_indices_

    xs_features_data_inliers = xs_features_data[ys_labels_predicted != -1]

    xs_features_data_outliers = xs_features_data[ys_labels_predicted == -1]

    clusters_centroids_points = xs_features_data[clusters_centroids_indices, :]

    # Select border points by index rather than with `point not in array`:
    # on a 2-D NumPy array `in` is true as soon as ANY single coordinate
    # matches, so genuine border points were being dropped.
    core_samples_indices = {int(core_index) for core_index in clusters_centroids_indices}

    clusters_border_points = array_matrix([
        list(xs_features_data[example_index])
        for example_index, example_label in enumerate(ys_labels_predicted)
        if example_label != -1 and example_index not in core_samples_indices
    ])

    return ys_labels_predicted, clusters_centroids_indices, clusters_centroids_points, clusters_border_points, xs_features_data_inliers, xs_features_data_outliers
示例#6
0
# Elbow method for K-Means: plot the inertias and obtain the chosen number of
# clusters, then run the final K-Means clustering with it.
(
    k_means_xs_points_elbow_method,
    k_means_ys_points_elbow_method,
    best_num_clusters,
) = plot_elbow_method(
    "K-Means",
    k_means_squared_errors_sums_intertias,
    best_num_clusters,
    num_max_clusters = NUM_MAX_CLUSTERS,
)

error_k_means_final_clustering = k_means_final_clustering(
    normalized_data_xs_best_features_priori,
    ys_labels_true,
    num_clusters = best_num_clusters,
)

# ---- K-Means Clustering ----


# ---- DBScan Clustering ----

# Sweep epsilon over [START_EPSILON, END_EPSILON] and collect, per epsilon,
# the cluster counts and the scores returned by the pre-clustering helper.
(
    dbscan_num_centroids,
    dbscan_num_inliers,
    dbscan_num_outliers,
    dbscan_silhouette_scores,
    dbscan_precision_scores,
    dbscan_recall_scores,
    dbscan_rand_index_scores,
    dbscan_f1_scores,
    dbscan_adjusted_rand_scores,
) = dbscan_pre_clustering(
    normalized_data_xs_best_features_priori,
    ys_labels_true,
    start_epsilon = START_EPSILON,
    end_epsilon = END_EPSILON,
    step_epsilon = STEP_EPSILON,
)

(
    num_data_points_sorted_by_distance,
    k_neighbors_distances_epsilons,
) = compute_distances_nearest_neighbors(
    normalized_data_xs_best_features_priori,
    num_closest_k_neighbors = NUM_K_NEAREST_NEIGHBORS,
)

# k-distance curve: x is the point rank, y is the matching k-distance.
dbscan_xs_points_k_distance_method = array_matrix(range(num_data_points_sorted_by_distance))
dbscan_ys_points_k_distance_method = k_neighbors_distances_epsilons

if KNEED_LIB_IN_USE:

    # Locate the elbow of the k-distance curve and read the distance there.
    kneed_locator_k_distance = knee_locator(
        dbscan_xs_points_k_distance_method,
        dbscan_ys_points_k_distance_method,
        S = 1.0,
        curve = "convex",
        direction = "increasing",
    )

    best_distance_epsilon = dbscan_ys_points_k_distance_method[
        round(kneed_locator_k_distance.elbow, 0)
    ]

else:

    # Fallback that does not need the knee-locator library.
    best_distance_epsilon = find_best_distance_epsilon(
        dbscan_ys_points_k_distance_method,
        THRESHOLD_MAJORITY,
    )

print("The best Distance ( ε (Epsilon Value) ), for DBScan, found:")
示例#7
0
def bisecting_k_means_clustering(xs_features_data,
                                 examples_ids,
                                 ys_labels_true,
                                 final_max_num_clusters_and_iterations=2):
    """Cluster the data by repeatedly bisecting the largest cluster.

    Starting from a single cluster holding every example, the cluster with
    the most examples is split in two with
    ``bissect_k_means_into_two_sub_clusters`` until
    ``final_max_num_clusters_and_iterations`` clusters exist. The split
    history of every example is recorded, the final labelling is plotted and
    scored (only when at least two distinct labels exist) and a hierarchical
    HTML report is produced.

    Args:
        xs_features_data: Feature data of all the examples.
        examples_ids: Ids of the examples.
            NOTE(review): the ids are matched against the positions
            ``0..len(xs_features_data)-1`` - presumably they are positional
            indices; confirm against the caller.
        ys_labels_true: Ground-truth labels used for the external metrics.
        final_max_num_clusters_and_iterations: Number of clusters at which
            the bisection stops.

    Returns:
        A 6-tuple ``(clusters_ids, clusters_data, ys_final_labels_predicted,
        tree_predictions_lists_without_offset, num_clusters,
        effective_num_clusters)``.
    """

    num_examples = len(xs_features_data)

    # Four parallel lists describing the current leaf clusters: position i of
    # each list refers to the same cluster. The root has no centroid yet, so
    # a placeholder keeps the centroid list aligned.
    clusters_ids = [0]
    clusters_data = [xs_features_data]
    clusters_examples_ids = [examples_ids]
    clusters_centroids_points = [0]

    num_clusters = 1

    # One (initially empty) history list per example.
    tree_predictions_lists_with_offset = array_empty(
        (num_examples, 0)).tolist()
    tree_predictions_lists_without_offset = array_empty(
        (num_examples, 0)).tolist()

    while (num_clusters < final_max_num_clusters_and_iterations):

        # Largest cluster; on ties the first one wins.
        cluster_index_with_more_examples = max(
            range(num_clusters),
            key=lambda index_cluster: len(clusters_data[index_cluster]))

        # Remove the chosen cluster from every parallel list BY INDEX.
        # `list.remove(array)` compares with `==`, which is ambiguous for
        # arrays, and `clusters_ids` used to be popped only on the first
        # split, which left it misaligned with `clusters_data` afterwards.
        cluster_id_to_be_divided = clusters_ids.pop(
            cluster_index_with_more_examples)
        cluster_data_to_be_divided = clusters_data.pop(
            cluster_index_with_more_examples)
        cluster_examples_ids_to_be_divided = clusters_examples_ids.pop(
            cluster_index_with_more_examples)
        clusters_centroids_points.pop(cluster_index_with_more_examples)

        # On the first split both offsets evaluate to 0 (root id is 0 and
        # num_clusters is 1), so a single call covers every iteration.
        (two_sub_clusters_ids, two_sub_clusters_examples_ids,
         two_sub_clusters_data, two_sub_clusters_centroids,
         _ys_labels_predicted_without_offset, ys_labels_predicted_with_offset,
         _cluster_squared_error_sum_intertia) = bissect_k_means_into_two_sub_clusters(
            cluster_data_to_be_divided,
            cluster_examples_ids_to_be_divided,
            left_leaf_cluster_id_offset=cluster_id_to_be_divided,
            right_leaf_cluster_id_offset=(num_clusters - 1))

        ys_labels_predicted_with_offset_unique = array_sort(
            array_unique_values(ys_labels_predicted_with_offset))

        for sub_cluster_id in range(2):

            sub_cluster_examples_ids = two_sub_clusters_examples_ids[
                sub_cluster_id]

            clusters_ids.append(two_sub_clusters_ids[sub_cluster_id])
            clusters_data.append(two_sub_clusters_data[sub_cluster_id])
            clusters_examples_ids.append(sub_cluster_examples_ids)
            clusters_centroids_points.append(
                two_sub_clusters_centroids[sub_cluster_id])

            # The split may yield a single distinct label; in that case the
            # second sub-cluster has no offset label to record.
            has_label_with_offset = (
                sub_cluster_id < len(ys_labels_predicted_with_offset_unique))

            for example_index in range(num_examples):

                if example_index not in sub_cluster_examples_ids:
                    continue

                if has_label_with_offset:

                    tree_predictions_lists_with_offset[example_index].append(
                        ys_labels_predicted_with_offset_unique[sub_cluster_id])

                tree_predictions_lists_without_offset[example_index].append(
                    sub_cluster_id)

        num_clusters = (num_clusters + 1)

    # The final label of an example is the last label recorded for it.
    # Examples with no recorded label (e.g. no split was performed) keep
    # label 0 instead of raising IndexError.
    ys_final_labels_predicted = matrix_array_zeros(num_examples)

    for example_index, example_predictions in enumerate(
            tree_predictions_lists_with_offset):

        if example_predictions:

            ys_final_labels_predicted[example_index] = example_predictions[-1]

    clusters_centroids_points = array_matrix(clusters_centroids_points)

    effective_num_clusters = len(
        array_unique_values(ys_final_labels_predicted))

    if (effective_num_clusters >= 2):

        if (effective_num_clusters <= 26):

            plot_clusters_centroids_and_radii("Bisecting-K-Means",
                                              xs_features_data,
                                              ys_final_labels_predicted,
                                              clusters_centroids_points,
                                              effective_num_clusters,
                                              epsilon=None,
                                              damping=None,
                                              final_clustering=True)

            plot_silhouette_analysis("Bisecting-K-Means",
                                     xs_features_data,
                                     ys_final_labels_predicted,
                                     clusters_centroids_points,
                                     effective_num_clusters,
                                     epsilon=None,
                                     damping=None,
                                     final_clustering=True)

        # Only the confusion matrix (last value) is used afterwards; the
        # scalar scores are computed by the helper but not returned here.
        (_silhouette, _precision, _recall, _rand_index, _f1, _adjusted_rand,
         final_confusion_matrix) = compute_clustering_performance_metrics(
            "Bisecting-K-Means",
            xs_features_data,
            ys_labels_true,
            ys_final_labels_predicted,
            effective_num_clusters,
            final_clustering=True)

        plot_confusion_matrix_rand_index_clustering_heatmap(
            "Bisecting-K-Means",
            final_confusion_matrix,
            effective_num_clusters,
            epsilon=None,
            damping=None,
            final_clustering=True)

    xs_ids_examples = list(range(0, len(examples_ids)))

    html_report_cluster_labels_hierarchical(
        xs_ids_examples, tree_predictions_lists_without_offset,
        "bisecting-k-means-hierarchical.html")

    return clusters_ids, clusters_data, ys_final_labels_predicted, tree_predictions_lists_without_offset, num_clusters, effective_num_clusters