Пример #1
0
    def fit_new_trainig(vectors,
                        algo,
                        metric,
                        path_to_save=None,
                        nb_clusters=300,
                        max_no_improvement=1000,
                        verbose=1,
                        shuffle=True):
        """Fit a MiniBatchKMeans or KMedoids clusterer on the vectors.

        NOTE(review): shuffling is done in place, so the caller's array is
        mutated when shuffle=True.
        """
        assert algo in ['MiniBatchKMeans', 'KMedoids']
        if shuffle:
            np.random.shuffle(vectors)

        if algo == 'MiniBatchKMeans':
            clusterer = MiniBatchKMeans(n_clusters=nb_clusters,
                                        verbose=verbose,
                                        max_no_improvement=max_no_improvement)
        if algo == 'KMedoids':
            # Only forward `metric` when one was actually supplied.
            kwargs = {'n_clusters': nb_clusters,
                      'max_iter': max_no_improvement}
            if metric is not None:
                kwargs['metric'] = metric
            clusterer = KMedoids(**kwargs)
        clusterer.fit(vectors)

        # Persist the learned cluster centers when a target path is given.
        if path_to_save is not None:
            np.save(path_to_save, clusterer.cluster_centers_)

        return clusterer
Пример #2
0
def test_kmedoids(dtw_value, cluster_num, seed):
    """Cluster a precomputed DTW distance matrix with KMedoids and dump
    the per-sample labels and the medoid indices to CSV files."""
    # Declare the "precomputed" metric so KMedoids consumes our own
    # distance matrix instead of recomputing pairwise distances.
    km = KMedoids(n_clusters=cluster_num,
                  random_state=seed,
                  metric="precomputed",
                  init='k-medoids++',
                  max_iter=30000)
    dists = dtw_value
    y_pred = km.fit_predict(dists)
    with open(r".//res//grid_pred_d" + str(cluster_num) + ".csv",
              "w",
              encoding='UTF-8',
              newline='') as csvfile:
        writer = csv.writer(csvfile)
        # One label per row (the unused running counter was removed).
        for row in y_pred:
            writer.writerow([row])
    with open(r".//res//grid_centroids_d" + str(cluster_num) + ".csv",
              "w",
              encoding='UTF-8',
              newline='') as csvfile:
        writer = csv.writer(csvfile)
        for yi in range(cluster_num):
            writer.writerow([km.medoid_indices_[yi]])
    print('finish')
Пример #3
0
def build_k_medoids(factors: np.ndarray):
    """Build a KMedoids model from the given factors.

    If CONFIG.use_k is set, fit one model with CONFIG.k clusters.
    Otherwise fit models for k = 2 .. CONFIG.max_clusters, keep the one
    with the best silhouette score, and write all scores to
    outputs/k_medoids_silhouette.csv.
    """
    if CONFIG.use_k:
        k = CONFIG.k
        LOG.debug('Running K-Medoids with k=%s clusters...', k)
        model = KMedoids(n_clusters=k, max_iter=500).fit(factors)
    else:
        best_model, best_score = None, float('-inf')
        scores = []

        for i in range(1, CONFIG.max_clusters):
            k = i + 1
            LOG.debug('Starting K-medoids with %s clusters...', k)
            start = time()
            model = KMedoids(n_clusters=k, max_iter=500).fit(factors)
            score = silhouette_score(factors, model.labels_)
            scores.append(score)
            LOG.info(
                'Finished K-medoids with %s clusters in %s seconds, score: %s',
                k, round(time() - start), score)
            if score > best_score:
                best_model = model
                best_score = score
                # BUG FIX: the message reports k, so log k, not the score.
                LOG.debug('Better score! Saving model with k=%s.', k)

        model = best_model

        # BUG FIX: index rows by the actual cluster count; the loop above
        # starts at k=2, so an index starting at 1 mislabelled every row.
        scores = pd.DataFrame(scores, index=np.arange(2, len(scores) + 2))
        scores.to_csv('outputs/k_medoids_silhouette.csv')

    return model
def test_kmedoids_empty_clusters():
    """When a cluster is empty, it should throw a warning."""
    rng = np.random.RandomState(seed)
    # Three identical points cannot fill two distinct clusters, so one
    # cluster is guaranteed to come out empty.
    data = [[1], [1], [1]]
    model = KMedoids(n_clusters=2, random_state=rng)
    with pytest.warns(UserWarning, match="Cluster 1 is empty!"):
        model.fit(data)
Пример #5
0
def calculate_clusters(answers, real_classes):
    """Fit KMedoids on raw and scaled answers, print the labels, then
    print several clustering-quality metrics for both models."""
    kmedoids1 = KMedoids(n_clusters=5,
                         metric=distance_function,
                         random_state=None).fit(answers)
    print(kmedoids1.labels_)

    kmedoids2 = KMedoids(n_clusters=5,
                         metric=distance_function,
                         random_state=None).fit(scale(answers))
    print(kmedoids2.labels_)

    # Each metric is printed for the raw model, then the scaled model.
    reports = (
        ("Rand index:", rand_index),
        ("Adjusted rand index:", adjusted_rand_index),
        ("Sum error:", sum_error),
        ("NMI:", mutual_info_score),
    )
    for header, metric_fn in reports:
        print(header)
        print(metric_fn(real_classes, kmedoids1.labels_))
        print(metric_fn(real_classes, kmedoids2.labels_))
Пример #6
0
    def k_medoids_clustering(self, n_clusters=3, normalized=True, n_repeats=1, criterion='avg_silhouette'):
        """
        Method that assigns phase labels to PhaseIdentification object obtained by performing K-medoids++ on the specified feeder.
        A number of repetitions can be specified, the best result according to the specified criterion will be returned
        By default the features will be normalized first. By scaling the features to have a mean of 0 and unit variance.
        (More info: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)

        Raises:
            ValueError: if `criterion` is not 'avg_silhouette' or
                'global_silhouette'. (BUG FIX: an unknown criterion
                previously fell through both branches and crashed later
                with a NameError on `best_cluster_labels`.)
        """
        if normalized:
            scaler = StandardScaler()
            data = scaler.fit_transform(self.voltage_features)
        else:
            data = self.voltage_features

        # Both criteria share the same repeat-and-keep-best loop; only the
        # scoring function differs, so select it once.
        if criterion == 'avg_silhouette':
            score_fn = silhouette_score
        elif criterion == 'global_silhouette':
            score_fn = global_silhouette_criterion
        else:
            raise ValueError("Unknown criterion: %r" % (criterion,))

        best_cluster_labels = np.zeros(np.size(data, 0))
        score = -1
        for _ in range(n_repeats):
            i_cluster_labels = KMedoids(n_clusters, init='k-medoids++').fit(data).labels_
            i_score = score_fn(data, i_cluster_labels)
            if i_score > score:
                score = i_score
                best_cluster_labels = i_cluster_labels

        self._algorithm = 'k-medoids++'
        self._n_repeats = n_repeats
        # Phase labels are 1-based for downstream matching.
        self.partial_phase_labels = best_cluster_labels + 1
        self.match_labels()
def test_kmedoids_iris():
    """Test kmedoids on the Iris dataset"""
    rng = np.random.RandomState(seed)
    iris_data = load_iris()["data"]

    # K-Means sets the reference bar: its mean point-to-closest-center
    # distance should be roughly matched by K-Medoids.
    ref_model = KMeans(n_clusters=3).fit(iris_data)
    avg_dist_to_closest_centroid = (
        ref_model.transform(iris_data).min(axis=1).mean()
    )

    for init in ["random", "heuristic", "k-medoids++"]:
        distance_metric = "euclidean"
        model = KMedoids(n_clusters=3,
                         metric=distance_metric,
                         init=init,
                         random_state=rng)
        model.fit(iris_data)

        # test convergence in reasonable number of steps
        assert model.n_iter_ < (len(iris_data) // 10)

        pairwise = PAIRWISE_DISTANCE_FUNCTIONS[distance_metric](iris_data)
        avg_dist_to_random_medoid = np.mean(pairwise.ravel())
        avg_dist_to_closest_medoid = model.inertia_ / iris_data.shape[0]
        # We want distance-to-closest-medoid to be reduced from average
        # distance by more than 50%
        assert avg_dist_to_random_medoid > 2 * avg_dist_to_closest_medoid
        # When K-Medoids is using Euclidean distance,
        # we can compare its performance to
        # K-Means. We want the average distance to cluster centers
        # to be similar between K-Means and K-Medoids
        assert_allclose(avg_dist_to_closest_medoid,
                        avg_dist_to_closest_centroid,
                        rtol=0.1)
    def __init__(self, data):
        """Fit a 3-cluster KMedoids on the scalar values of `data` and
        record the cluster ids ordered by their center value."""
        self.states = data.keys()
        self.kmeans = KMedoids(n_clusters=3)
        values = np.array(tuple(data.values())).reshape(-1, 1)
        self.kmeans.fit(values)

        # Cluster ids sorted ascending by center magnitude.
        order = np.argsort(np.squeeze(self.kmeans.cluster_centers_))
        self.mapping = list(order)
Пример #9
0
def test_kmedoid_results(method, init):
    """Labels should match the two-blob ground truth up to label swap."""
    expected = np.hstack([np.zeros(50), np.ones(50)])
    km = KMedoids(n_clusters=2, init=init, method=method, random_state=rng)
    km.fit(X_cc)
    # This test use data that are not perfectly separable so the
    # accuracy is not 1. Accuracy around 0.85
    accuracy = np.mean(km.labels_ == expected)
    assert accuracy > 0.8 or (1 - accuracy) > 0.8
def test_callable_distance_metric():
    """KMedoids should accept a user-supplied callable as metric."""
    rng = np.random.RandomState(seed)

    def my_metric(a, b):
        # Plain Euclidean distance, spelled out by hand.
        diff = a - b
        return np.sqrt(np.sum(np.power(diff, 2)))

    model = KMedoids(random_state=rng, metric=my_metric)
    labels1 = model.fit_predict(X)
    assert len(labels1) == 100
    assert_array_equal(labels1, model.labels_)
def test_kmedoids_on_sparse_input():
    """fit_predict should accept a scipy sparse matrix."""
    rng = np.random.RandomState(seed)
    model = KMedoids(n_clusters=2, random_state=rng)
    # Two rows, each with a single nonzero entry in a different column.
    sparse_X = csc_matrix(
        (np.array([1, 1]), (np.array([1, 0]), np.array([0, 4]))),
        shape=(2, 5),
    )
    labels = model.fit_predict(sparse_X)
    assert len(labels) == 2
    assert_array_equal(labels, model.labels_)
Пример #12
0
def sklearn_kmedoids(ds, numClusters, numSamples):
    """Run KMedoids on the first numSamples rows of ds over (x1, x2) and
    return the cluster labels as a one-column DataFrame."""
    model = KMedoids(n_clusters=numClusters, random_state=0)

    points = ds.df[["x1", "x2"]][:numSamples]
    model.fit(points[["x1", "x2"]].to_numpy())

    return pd.DataFrame(model.labels_, columns=["cluster"])
Пример #13
0
def run_KMedoids(n_clusters, pca_components, data, components):
    """Cluster the PCA components with KMedoids and return `data` joined
    with the components plus the assigned cluster label.

    BUG FIX: `n_clusters` was previously ignored — the cluster count was
    hard-coded to 7.
    """
    clustering = KMedoids(n_clusters=n_clusters, random_state=0)
    clustering.fit(pca_components)
    df_seg_pca_kmedoids = pd.concat(
        [data.reset_index(drop=True),
         pd.DataFrame(pca_components)], axis=1)
    # Rename the trailing `components` columns to "Component i".
    df_seg_pca_kmedoids.columns.values[(-1 * components):] = [
        "Component " + str(i + 1) for i in range(components)
    ]
    df_seg_pca_kmedoids['Cluster'] = clustering.labels_
    return df_seg_pca_kmedoids
def test_heuristic_deterministic():
    """Result of heuristic init method should not depend on random state."""
    rng1 = np.random.RandomState(1)
    rng2 = np.random.RandomState(2)
    X = load_iris()["data"]
    D = euclidean_distances(X)

    # Two different random states must produce identical medoids.
    medoids_1 = KMedoids(init="heuristic")._initialize_medoids(D, 10, rng1)

    medoids_2 = KMedoids(init="heuristic")._initialize_medoids(D, 10, rng2)

    assert_array_equal(medoids_1, medoids_2)
Пример #15
0
def test_max_iter():
    """Test that warning message is thrown when max_iter is reached."""
    rng = np.random.RandomState(seed)
    iris_data = load_iris()["data"]

    # A single iteration is not enough for 10 clusters to converge.
    model = KMedoids(n_clusters=10, init="random", random_state=rng,
                     max_iter=1)
    msg = "Maximum number of iteration reached before"
    with pytest.warns(UserWarning, match=msg):
        model.fit(iris_data)
def test_outlier_robustness():
    """K-Medoids should resist the outlier that drags a K-Means center."""
    rng = np.random.RandomState(seed)
    kmeans = KMeans(n_clusters=2, random_state=rng)
    kmedoids = KMedoids(n_clusters=2, random_state=rng)

    # Six inliers on a line plus one extreme outlier at x=1000.
    points = [[-11, 0], [-10, 0], [-9, 0], [0, 0], [1, 0], [2, 0],
              [1000, 0]]
    kmeans.fit(points)
    kmedoids.fit(points)

    assert_array_equal(kmeans.labels_, [0, 0, 0, 0, 0, 0, 1])
    assert_array_equal(kmedoids.labels_, [0, 0, 0, 1, 1, 1, 1])
def test_kmedoid_nclusters(method, init):
    """Even with max_iter=1, all requested medoids must be distinct."""
    n_clusters = 50

    model = KMedoids(n_clusters=n_clusters, init=init, method=method,
                     max_iter=1, random_state=rng)
    model.fit(X_cc)
    assert len(np.unique(model.medoid_indices_)) == n_clusters
def test_build():
    """The BUILD init alone (max_iter=0) should yield a decent clustering."""
    X, y = fetch_20newsgroups_vectorized(return_X_y=True)
    # Select only the first 500 samples
    X, y = X[:500], y[:500]
    # Precompute cosine distance matrix
    diss = cosine_distances(X)
    # run build
    ske = KMedoids(20, "precomputed", init="build", max_iter=0)
    ske.fit(diss)
    assert ske.inertia_ <= 230
    assert len(np.unique(ske.labels_)) == 20
Пример #19
0
def generate_clusteringInfo(filePath):
    """Read per-image feature vectors from a CSV, cluster them with
    KMedoids, and write the labels and cluster centres to CSV files.

    The filename prefix is "cnn" when "cnn" appears in filePath,
    otherwise "basic"; basic features are clustered with a chi-square
    distance, CNN features with cosine distance.
    """
    results = []

    numberOfImages = 0
    # store features of each image into the results
    with open(filePath) as f:
        reader = csv.reader(f)
        for row in reader:
            features = [float(x) for x in row[:]]
            results.append(features)
            numberOfImages += 1
    # FIX: removed the redundant f.close() that followed the `with`
    # block — the context manager already closed the file.

    # Define prefix of the lables and centres file
    prefix = "basic"
    if "cnn" in filePath:
        prefix = "cnn"

    # store results as an np array
    allImages = np.array(results)

    if prefix == "basic":
        Kmedoids = KMedoids(n_clusters=10,
                            metric=chi2_distance,
                            method='pam',
                            random_state=0).fit(allImages)
    else:
        Kmedoids = KMedoids(n_clusters=10,
                            metric='cosine',
                            method='pam',
                            random_state=0).fit(allImages)

    labels = Kmedoids.labels_

    centres = Kmedoids.cluster_centers_

    # write centroid and label information to prefix_lables and prefix_centres file
    # FIX: output files are now opened with `with` so they are closed
    # even if a write fails part-way through.
    with open(prefix + "_labels.csv", "w") as output:
        labelsInfo = [str(l) for l in labels]
        output.write(",".join(labelsInfo))

    with open(prefix + "_centres.csv", "w") as output:
        for i in range(0, len(centres)):
            centre = []
            for j in range(0, len(centres[i])):
                centre.append(str(centres[i][j]))
            output.write(",".join(centre) + "\n")

    SC = metrics.silhouette_score(allImages, labels)
    print(prefix, "Silhouette Coefficient: ", SC)
Пример #20
0
def find_optimal_clusters_and_display(pca_components):
    """Locate the elbow of the WCSS curve over 1..20 clusters, report it
    via streamlit, and return the chosen cluster count."""
    max_clusters = 21
    cluster_range = [i for i in range(1, max_clusters)]
    wcss = []
    for k in cluster_range:
        model = KMedoids(n_clusters=k, random_state=0)
        model.fit(pca_components)
        wcss.append(model.inertia_)
    # The knee of a convex, decreasing inertia curve is the elbow.
    n_clusters = KneeLocator(cluster_range,
                             wcss,
                             curve='convex',
                             direction='decreasing').knee
    st.write("Optimal number of clusters", n_clusters)
    return n_clusters
def test_kpp_called(_kpp_init_mocked):
    """KMedoids._kpp_init method should be called by _initialize_medoids"""
    dist = np.array([[0, 1], [1, 0]])
    k = 2
    rng = np.random.RandomState(seed)
    estimator = KMedoids()
    estimator.init = "k-medoids++"
    # set _kpp_init_mocked.return_value to a singleton
    result = estimator._initialize_medoids(dist, k, rng)

    # assert that _kpp_init was called and its result was returned.
    _kpp_init_mocked.assert_called_once_with(dist, k, rng)
    assert result == _kpp_init_mocked.return_value
def test_update_medoid_idxs_empty_cluster():
    """Label is unchanged for an empty cluster."""
    dist = np.zeros((3, 3))
    labels = np.array([0, 0, 0])
    medoid_idxs = np.array([0, 1])
    estimator = KMedoids(n_clusters=2)

    # Swallow empty cluster warning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        estimator._update_medoid_idxs_in_place(dist, labels, medoid_idxs)

    # Medoid 1 owns no points, so its index must stay untouched.
    assert_array_equal(medoid_idxs, [0, 1])
def test_seuclidean():
    """fit/predict/transform with the seuclidean metric must not warn.

    FIX: `pytest.warns(None)` has been an error since pytest 7; record
    warnings with the stdlib instead and assert none were emitted.
    """
    data = np.array([0, 0, 0, 1]).reshape((4, 1))
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        km = KMedoids(2, metric="seuclidean", method="pam")
        km.fit(data)
        km.predict(data)
        km.transform(data)
    assert len(record) == 0
Пример #24
0
class Clustering:
    """Partition states into three clusters by their scalar value, with
    the clusters ordered by center magnitude."""

    def __init__(self, data):
        self.states = data.keys()
        self.kmeans = KMedoids(n_clusters=3)
        values = np.array(tuple(data.values())).reshape(-1, 1)
        self.kmeans.fit(values)

        # Cluster ids sorted ascending by their center value.
        self.mapping = list(
            np.argsort(np.squeeze(self.kmeans.cluster_centers_)))

    def cluster(self):
        """Return three lists of states: low, middle, high center."""
        buckets = [[], [], []]
        for state, label in zip(self.states, self.kmeans.labels_):
            buckets[self.mapping.index(label)].append(state)
        return buckets
Пример #25
0
 def __init__(self, cluster_name='KMedoids', columns=None,
              eval_inertia=False, eval_silhouette=False, eval_chi=False, eval_dbi=False, eval_sample_size=None,
              **kwargs):
     """Wrap a KMedoids model plus flags selecting which evaluation
     metrics to compute; extra kwargs are forwarded to KMedoids."""
     self.cluster_name = cluster_name
     self.columns = columns
     self.model = KMedoids(**kwargs)
     # Which evaluation metrics to compute, and on how many samples.
     self.eval_inertia = eval_inertia
     self.eval_silhouette = eval_silhouette
     self.eval_chi = eval_chi
     self.eval_dbi = eval_dbi
     self.eval_sample_size = eval_sample_size
     # Populated later by fitting/evaluation code.
     self.transform_cols = None
     self.eval_df = None
     self.centroid_df = None
Пример #26
0
def find_cluster_centres(text, num_clusters):
    """Return the sentences of `text` whose embeddings are the KMedoids
    cluster centers (deduplicated, in discovery order)."""
    sentences = nltk.sent_tokenize(text)
    embeddings = embedder.encode(sentences)
    model = KMedoids(n_clusters=num_clusters,
                     random_state=0,
                     metric="cosine")
    model.fit(embeddings)
    centers = []
    # Each medoid center equals one sentence embedding exactly, so an
    # exact array comparison recovers the source sentence.
    for center in model.cluster_centers_:
        for idx, embedding in enumerate(embeddings):
            if np.array_equal(embedding, center):
                if sentences[idx] not in centers:
                    centers.append(sentences[idx])
    return centers
Пример #27
0
def EM_build_and_swap(args):
    """Run KMedoids on a random sample of images and report the loss
    (mean distance from each image to its closest medoid)."""
    total_images, total_labels, sigma = load_data(args)
    np.random.seed(args.seed)
    if args.metric != "L2":
        raise Exception("EM does not support metrics other than L2")

    # Draw a sample without replacement, then cluster it.
    sample_idx = np.random.choice(range(len(total_images)),
                                  size=args.sample_size,
                                  replace=False)
    imgs = total_images[sample_idx]
    kmedoids = KMedoids(n_clusters=args.num_medoids,
                        metric='euclidean',
                        random_state=None).fit(imgs)
    medoids = kmedoids.medoid_indices_.tolist()
    best_distances, closest_medoids = get_best_distances(medoids,
                                                         imgs,
                                                         metric='L2')
    loss = np.mean(best_distances)

    if args.verbose >= 1:
        print("Final results:")
        print(medoids)
        print(loss)

    return medoids, loss
Пример #28
0
    def _cluster_matrix(self,
                        matrix: pd.DataFrame,
                        n_clusters=5) -> pd.DataFrame:
        '''
        clusters rule matrix (without support,confidence, group and level columns) with Jaccard distance
        :param matrix: rule matrix
        :param n_clusters: number of clusters
        :return: returns clustered matrix (without support,confidence, group and level columns) with rows in one cluster
            next to each other

        * clustering was not used in the final visual design
        '''
        tmp = matrix.copy()

        # remove redundant info for clustering
        for col in ['support', 'confidence', 'group', 'level']:
            if col in tmp.columns:
                tmp.drop(columns=col, inplace=True)

        # create a binary matrix
        # BUG FIX: binarize the reduced frame `tmp`, not the original
        # `matrix` — otherwise the dropped numeric columns leak into the
        # Jaccard clustering as constant all-True features.
        df = np.where(tmp == ' ', False, True)

        # find clusters
        kmedoids_labels = KMedoids(n_clusters=n_clusters, random_state=0, metric='jaccard', init='k-medoids++') \
            .fit_predict(df)
        labels = pd.Series(kmedoids_labels, index=tmp.index)

        # sort rows by clusters
        tmp['labels'] = labels
        tmp.sort_values(by='labels', inplace=True)

        return tmp.drop(columns=['labels'])
Пример #29
0
    def fit(self, x, output_filename_suffix='output.pdf'):
        """Project x with PCA, pick the cluster count via the elbow of a
        KElbowVisualizer sweep, then fit the final KMedoids clusterer."""
        x = np.array(x)
        num_samples, num_features = x.shape[0], x.shape[1]
        self.__pca = PCA(n_components=min(num_samples, num_features),
                         random_state=0)
        x_transformed = self.__pca.fit_transform(x)

        visualizer = KElbowVisualizer(KMedoids(random_state=0),
                                      k=(1, num_samples),
                                      timings=False,
                                      locate_elbow=True)
        visualizer.fit(x_transformed)
        # Fall back to a single cluster when no elbow is detected.
        if visualizer.elbow_value_ is not None:
            best_n_clusters = visualizer.elbow_value_
        else:
            best_n_clusters = 1

        self.__clusterer = KMedoids(n_clusters=best_n_clusters, random_state=0)
        self.__clusterer.fit(x_transformed)
Пример #30
0
def generate_clustering_pcoa(distance_file,
                             biom_file,
                             metadata_file,
                             num_clusters,
                             output_file=None,
                             plot=False,
                             L=2):
    """Cluster a precomputed distance matrix with both agglomerative
    clustering and K-medoids, drawing a PCoA figure for each.

    NOTE(review): `metadata_file` and `output_matrix` are unused in the
    lines visible here; the function may continue beyond this excerpt.
    """
    # Accept either a path to read via CSV or an already-loaded matrix.
    if not isinstance(distance_file, list):
        distance_matrix = CSV.read(distance_file)
    else:
        distance_matrix = distance_file

    output_matrix = []

    # Both clusterers consume the precomputed distances directly.
    AgglomerativeCluster = AgglomerativeClustering(
        n_clusters=num_clusters, affinity='precomputed',
        linkage='complete').fit_predict(distance_matrix)
    KMedoidsCluster = KMedoids(n_clusters=num_clusters,
                               metric='precomputed',
                               method='pam',
                               init='heuristic').fit_predict(distance_matrix)

    # PCoA figure coloured by the agglomerative clustering.
    figure = pcoa.PCoA_total_from_matrix_clustering(distance_matrix,
                                                    biom_file,
                                                    AgglomerativeCluster,
                                                    plot=plot)
    if output_file is not None:
        plt.savefig('../src/images/out_L{0}_agglomerative_pcoa.png'.format(L))
    # PCoA figure coloured by the K-medoids clustering.
    figure = pcoa.PCoA_total_from_matrix_clustering(distance_matrix,
                                                    biom_file,
                                                    KMedoidsCluster,
                                                    plot=plot)
    if output_file is not None:
        plt.savefig('../src/images/out_L{0}_kmedoids_pcoa.png'.format(L))