def grade3():
    marks = 0
    try:
        data = np.array([[i, i] for i in range(5)])
        centers = np.array([[1., 1.], [2., 2.], [3., 3.]])
        op = np.array([[0.5, 0.5], [2.0, 2.0], [3.5, 3.5]])

        kmeans = KMeans(D=2, n_clusters=3)
        kmeans.cluster_centers = centers
        it = kmeans.train(data, 1)
        if np.allclose(kmeans.cluster_centers, op) and it == 0:
            marks += 0.5

        data = np.array([[i + 1, i * 2.3] for i in range(5)])
        centers = np.array([[5., 1.], [-1., 2.], [3., 6.]])
        op = np.array([[5, 1], [1.5, 1.15], [4.0, 6.8999999999999995]])

        kmeans = KMeans(D=2, n_clusters=3)
        kmeans.cluster_centers = centers
        it = kmeans.train(data, 1)
        if np.allclose(kmeans.cluster_centers, op) and it == 0:
            marks += 0.5

        data = np.array([[i + 1, i * 2.3] for i in range(3)])
        centers = np.array([[5, 1], [-1., 2]])
        op = np.array([[3.0, 4.6], [1.5, 1.15]])
        kmeans = KMeans(D=2, n_clusters=2)
        kmeans.cluster_centers = centers
        it = kmeans.train(data, 5)
        if np.allclose(kmeans.cluster_centers, op) and it == 1:
            marks += 1
    except Exception as exc:
        print('Error in k-means:', exc)
    return marks
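For context, a minimal NumPy sketch of the interface this grader exercises: a `KMeans` class with a writable `cluster_centers` attribute and a `train(data, max_iter)` method returning the index of the iteration at which the centers stopped moving. The convergence rule and the empty-cluster handling below are assumptions inferred from the expected outputs, not the graded solution itself.

import numpy as np

class KMeans:
    def __init__(self, D, n_clusters):
        self.n_clusters = n_clusters
        self.cluster_centers = np.zeros((n_clusters, D))

    def train(self, data, max_iter):
        # assumes max_iter >= 1
        for it in range(max_iter):
            # assign each point to its nearest center
            dists = np.linalg.norm(
                data[:, None, :] - self.cluster_centers[None, :, :], axis=2)
            labels = np.argmin(dists, axis=1)
            # recompute centers; an empty cluster keeps its old center
            new_centers = self.cluster_centers.copy()
            for c in range(self.n_clusters):
                if np.any(labels == c):
                    new_centers[c] = data[labels == c].mean(axis=0)
            if np.allclose(new_centers, self.cluster_centers):
                return it  # centers unchanged: converged at iteration `it`
            self.cluster_centers = new_centers
        return it  # out of iterations; report the last one performed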
Example #2
 def test02_non_fitted_model_raises_not_fitted_error_message(self):
     model = KMeans(k=2)
     try:
         model.predict(np.array([[1, 0], [0, 1]]))
         self.fail()
     except Exception as e:
         self.assertEqual(str(e), KMeans.NOT_FITTED_ERROR_MESSAGE)
Example #3
def kmeans_image_compression():
    im = plt.imread('baboon.tiff')
    N, M = im.shape[:2]
    im = im / 255

    # convert to RGB array
    data = im.reshape(N * M, 3)

    k_means = KMeans(n_cluster=16, max_iter=100, e=1e-6)
    centroids, _, i = k_means.fit(data)

    print('RGB centroids computed in {} iterations'.format(i))
    new_im = transform_image(im, centroids)

    assert new_im.shape == im.shape, \
        'Shape of transformed image should be same as image'

    mse = np.sum((im - new_im)**2) / (N * M)
    print('Mean square error per pixel is {}'.format(mse))
    plt.imsave('plots/compressed_baboon.png', new_im)

    np.savez('results/k_means_compression.npz',
             im=im,
             centroids=centroids,
             step=i,
             new_image=new_im,
             pixel_error=mse)
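`transform_image` is not defined in this snippet; a minimal sketch, assuming it simply replaces every pixel with its nearest centroid color, could look like this:

import numpy as np

def transform_image(im, centroids):
    # flatten the image to an (N*M, 3) array of RGB pixels
    H, W = im.shape[:2]
    pixels = im.reshape(-1, 3)
    # squared Euclidean distance from each pixel to each centroid
    d2 = ((pixels[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2)
    nearest = np.argmin(d2, axis=1)
    # rebuild the image from the centroid colors
    return centroids[nearest].reshape(H, W, 3)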
Example #4
    def _initialise_prams(self, X):

        # Get initial clusters using Kmeans
        kmeans = KMeans(k=self.k, max_iters=500)
        kmeans.fit(X)
        kmeans_preds = kmeans.predict(X)

        N, col_length = X.shape
        mixture_labels = np.unique(kmeans_preds)
        initial_mean = np.zeros((self.k, col_length))
        initial_cov = np.zeros((self.k, col_length, col_length))
        initial_pi = np.zeros(self.k)

        for index, mixture_label in enumerate(mixture_labels):
            mixture_indices = (kmeans_preds == mixture_label)
            Nk = X[mixture_indices].shape[0]

            # Initial pi
            initial_pi[index] = Nk / N

            # Initial mean
            initial_mean[index, :] = np.mean(X[mixture_indices], axis=0)

            # Initial covariance
            de_meaned = X[mixture_indices] - initial_mean[index, :]
            initial_cov[index] = np.dot(initial_pi[index] * de_meaned.T,
                                        de_meaned) / Nk
        assert np.isclose(np.sum(initial_pi), 1.0)
        return initial_pi, initial_mean, initial_cov
Example #5
def do_KMeans_clustering(N_cluster, X, device):
	"""
	Use the k-means clustering method to label the training data
	according to each point's proximity to a cluster.
	Input:
		N_cluster: number of clusters estimated by Gap Statistics
		X: Training data for the input layer
	Output:
		cluster_label: label assigned to every point
		over_coef: this will be used in the oversampling method to increase
				   number of points in the less dense cluster regions
	"""

	X = X.to(device)

	#Instantiating kmeans object
	kmeans = KMeans(n_clusters=N_cluster, mode='euclidean', verbose=1)
	cluster_label = kmeans.fit_predict(X)

	#Calculate the size of each cluster (number of points assigned to its centroid)
	cluster_size = torch.zeros(N_cluster, dtype=torch.int32).to(device)
	for cluster in range(N_cluster):
		cluster_size[cluster] = len(torch.where(cluster_label==cluster)[0])

	over_coef = torch.zeros(N_cluster, dtype=torch.int32).to(device)
	for cluster in range(N_cluster):
		over_coef[cluster] = torch.clone((max(cluster_size))/cluster_size[cluster]).to(device)
		if over_coef[cluster] > 10:
			over_coef[cluster] = 10

	return cluster_label.cpu(), over_coef.cpu()
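A hedged usage sketch for the oversampling coefficients returned above. The data here is synthetic; the real project feeds its training tensor, and `do_KMeans_clustering` still requires the GPU k-means package the snippet instantiates (not standard PyTorch):

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X = torch.randn(500, 8)  # stand-in for the training data
labels, over_coef = do_KMeans_clustering(4, X, device)
# repeat each sample over_coef[its cluster] times so sparse clusters
# catch up with the densest one (capped at 10x above)
reps = over_coef[labels].long()
X_oversampled = torch.repeat_interleave(X, reps, dim=0)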
Example #6
File: main.py Project: L4v/ori
def main():
    # NOTE(Jovan): Load data
    data = pd.read_csv("data/skincancer.csv", delimiter=',', index_col=0)
    mort = data.Mort.values
    lat = data.Lat.values
    lon = data.Long.values

    # NOTE(Jovan): Init LinearRegression and predict
    lin_reg = LinearRegression(lat, mort)
    hawaii = lin_reg.predict(20)
    print("Prediction for hawaii[lat=20]:", hawaii)

    # NOTE(Jovan): Init KMeans and add lat and long points
    k_means = KMeans()
    for i, j in zip(lat, lon):
        k_means.points.append(Point(i, j))
    k_means.split(2, 0.01)

    # NOTE(Jovan): Plot clusters
    fig = plt.figure()
    ax = fig.add_axes([0,0,1,1])
    # NOTE(Jovan): First clusters
    for p in k_means._clusters[0].points:
        ax.scatter(p.x, p.y, c="#ff0000")
    # NOTE(Jovan): Second clusters
    for p in k_means._clusters[1].points:
        ax.scatter(p.x, p.y, c="#00ff00")

    # NOTE(Jovan): Plot cluster centers
    center1 = k_means._clusters[0].center
    center2 = k_means._clusters[1].center
    ax.scatter(center1.x, center1.y, marker="P", c="#ff0000")
    ax.scatter(center2.x, center2.y, marker="P", c="#00ff00")
    plt.show()
Example #7
 def test_fit_with_different_initial_centroids(self):
     expected_labels = [0, 0, 0, 1, 1, 1]
     expected_centroids = [[-1.6666667, -1.6666667], [1.6666667, 1.6666667]]
     k_means = KMeans(num_clusters=self.num_clusters, seed=0)
     k_means.fit(self.data)
     self.assertEqual(expected_labels, k_means.labels_)
     np.testing.assert_almost_equal(expected_centroids, k_means.centroids_)
Example #8
    def test_cluster_points_two_cluster(self):
        test_vector = self.create_test_data_vector()

        kmeans = KMeans(test_vector, 2)

        test_point0 = datapoint.DataPoint()
        test_point0.add_dimension(1.1)
        test_point0.add_dimension(2.1)
        test_point0.add_dimension(3.1)
        test_point1 = datapoint.DataPoint()
        test_point1.add_dimension(3.1)
        test_point1.add_dimension(1.1)
        test_point1.add_dimension(2.1)
        test_cluster = datapoint.DataVector()
        test_cluster.add_point(test_point0)
        test_cluster.add_point(test_point1)

        self.assertEqual(
            [1.0, 2.0, 3.0],
            kmeans.cluster_points(test_cluster)[0].data_points[0].coordinates)
        self.assertEqual(
            [2.0, 3.0, 1.0],
            kmeans.cluster_points(test_cluster)[0].data_points[1].coordinates)
        self.assertEqual(
            [3.0, 1.0, 2.0],
            kmeans.cluster_points(test_cluster)[1].data_points[0].coordinates)
Example #9
File: gmm.py Project: paulesta55/CS-7641
    def _init_components(self, points, K, **kwargs):  # [5pts]
        """
        Args:
            points: NxD numpy array, the observations
            K: number of components
            kwargs: any other args you want
        Return:
            pi: numpy array of length K, prior
            mu: KxD numpy array, the center for each gaussian.
            sigma: KxDxD numpy array, the diagonal standard deviation of each gaussian. You will have KxDxD numpy
            array for full covariance matrix case

        """
        sigma = np.zeros((K, points.shape[1], points.shape[1]))
        pi = np.full(K, 1.0 / K)  # uniform prior over the K components
        clusters_idx, mu, _ = KMeans()(points,
                                       K,
                                       max_iters=10000,
                                       verbose=False)
        for k in range(K):
            n_k = len(np.where(clusters_idx == k)[0])
            mu_k = mu[k]
            sigma[k] = pi[k] * np.outer(mu_k, mu_k) / n_k
        print("sigma shape {}".format(sigma.shape))
        return pi, mu, sigma
Example #10
 def score(self):
     scores_dict = {}
     for ivecset in self.ivecs:
         name = os.path.normpath(ivecset.name)
         ivecs = ivecset.get_all()
         loginfo('[Diarization.score] Scoring {} ...'.format(name))
         size = ivecset.size()
         if size > 0:
             if ivecset.num_speakers is not None:
                 num_speakers = min(ivecset.num_speakers, size)
                 sklearnkmeans = sklearnKMeans(
                     n_clusters=num_speakers).fit(ivecs)
                 centroids = KMeans(sklearnkmeans.cluster_centers_,
                                    num_speakers, self.plda).fit(ivecs)
             else:
                 num_speakers, centroids = self.get_num_speakers(ivecs)
             if self.norm_list is None:
                 scores_dict[name] = self.plda.score(
                     ivecs, centroids, self.scale, self.shift)
             else:
                 scores_dict[name] = self.s_norm(ivecs, centroids)
         else:
             logwarning(
                 '[Diarization.score] No i-vectors to score in {}.'.format(
                     ivecset.name))
     return scores_dict
Example #11
    def get_num_speakers(self, ivecs, min_speakers=2, max_speakers=6):
        """ Obtain number of speakers from pretrained model.

            :param ivecs: input i-vectors
            :type ivecs: numpy.array
            :param min_speakers: minimal number of speakers from model
            :type min_speakers: int
            :param max_speakers: maximal number of speakers from model
            :type max_speakers: int
            :returns: estimated number of speakers and KMeans centroid
            :rtype: tuple
        """
        avg, centroids_list = [], []
        features = []
        for num_speakers in range(min_speakers, max_speakers + 1):
            sklearnkmeans = sklearnKMeans(n_clusters=num_speakers).fit(ivecs)
            centroids = KMeans(sklearnkmeans.cluster_centers_, num_speakers,
                               self.plda).fit(ivecs)
            centroids_list.append(centroids)
            scores = self.s_norm(centroids,
                                 centroids)[np.tril_indices(num_speakers, -1)]
            features.append(Normalization.get_features(scores))
        num_speakers = np.argmax(
            np.sum(self.model.test(features, prob=True), axis=0))
        # raw_input('ENTER')
        return num_speakers + min_speakers, centroids_list[num_speakers]
Example #12
    def test_assign_points(self):
        """
        Tests initialize methods of the KMeans class. 
        """
        X, y, centers = generate_cluster_samples()
        n_samples = X.shape[0]
        k = centers.shape[0]

        kmeans = KMeans(k, N_ITER)

        # Set cluster centers so that assignment is deterministic
        kmeans.cluster_centers = centers
        assignments, distances = kmeans.assign_points(X)

        # check assignment array shape
        self.assertEqual(assignments.ndim, 1)
        self.assertEqual(assignments.shape[0], n_samples)

        # check distances array shape
        self.assertEqual(distances.ndim, 1)
        self.assertEqual(distances.shape[0], n_samples)

        # check that assignments only include valid cluster indices (0 <= idx < k)
        self.assertTrue(
            np.all(np.logical_and(assignments < k, assignments >= 0)))

        # Check cluster assignments are correct
        self.assertTrue(np.all(assignments[:25] == 0))
        self.assertTrue(np.all(assignments[25:50] == 1))
        self.assertTrue(np.all(assignments[50:75] == 2))
        self.assertTrue(np.all(assignments[75:] == 3))
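An `assign_points` consistent with these checks might look like the method sketch below (an assumption, not the tested class itself; it takes `cluster_centers` to be a (k, D) array):

import numpy as np

def assign_points(self, X):
    # (n_samples, k) matrix of Euclidean distances to every center
    dists = np.linalg.norm(X[:, None, :] - self.cluster_centers[None, :, :], axis=2)
    assignments = np.argmin(dists, axis=1)
    # distance of each sample to its own (nearest) center
    distances = dists[np.arange(X.shape[0]), assignments]
    return assignments, distances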
Example #13
    def test_initialize(self):
        """
        Tests initialize methods of the KMeans class. 
        """
        k = 3
        n_samples = 100
        n_features = 10

        for i in range(N_TRIALS):
            X = np.random.randn(n_samples, n_features)

            kmeans = KMeans(k, N_ITER)
            kmeans.initialize_clusters(X)

            # ensure that the cluster_centers matrix has the right shape
            self.assertEqual(kmeans.cluster_centers.ndim, 2)
            self.assertEqual(kmeans.cluster_centers.shape[0], k)
            self.assertEqual(kmeans.cluster_centers.shape[1], n_features)

            # Check that every center is one of the points in X.
            # Calculate the distances between every cluster center
            # and every point in X.  Find the closest matches.
            # Check that the distances are nearly 0.0
            distances = find_smallest_distances(X, kmeans.cluster_centers)
            for d in distances:
                self.assertAlmostEqual(d, 0.0)
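Since the test only requires every center to coincide with some point of X, Forgy initialization would satisfy it; a method sketch, assuming the constructor stores the cluster count as `self.k`:

import numpy as np

def initialize_clusters(self, X):
    # pick k distinct data points as the initial centers
    idx = np.random.choice(X.shape[0], size=self.k, replace=False)
    self.cluster_centers = X[idx].copy()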
Example #14
    def test_whole(self):
        """
        Tests the score method.
        """

        X, y, centers = generate_cluster_samples()
        n_samples = X.shape[0]
        n_features = X.shape[1]
        k = centers.shape[0]

        # run N_TRIALS, pick best model
        best_model = None
        for i in range(N_TRIALS):
            kmeans = KMeans(k, N_ITER)
            kmeans.fit(X)
            if best_model is None:
                best_model = kmeans
            elif kmeans.score(X) < best_model.score(X):
                best_model = kmeans

        # check sum squared errors
        sum_squared_errors = best_model.score(X)
        self.assertLess(sum_squared_errors / n_samples, EPS)

        # compare centers to expected centers
        smallest_distances = find_smallest_distances(
            best_model.cluster_centers, centers)
        for distance in smallest_distances:
            self.assertLess(distance, EPS)
Example #15
    def test_update_centers(self):
        """
        Tests update centers
        """
        X, y, centers = generate_cluster_samples()
        n_samples = X.shape[0]
        n_features = X.shape[1]
        k = centers.shape[0]

        kmeans = KMeans(k, N_ITER)

        # Set cluster centers so that assignment is deterministic
        kmeans.cluster_centers = centers
        assignments, distances = kmeans.assign_points(X)
        assignments = kmeans.reinitialize_empty_clusters(
            X, assignments, distances)

        # clear out centers to test method
        kmeans.cluster_centers = np.zeros((k, n_features))
        kmeans.update_centers(X, assignments)

        # calculate average difference in coordinates of estimated
        # and real centers
        error = np.linalg.norm(kmeans.cluster_centers - centers) / k
        self.assertLess(error, EPS)
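An `update_centers` matching this test could be the plain mean update; a sketch, assuming empty clusters were already handled by `reinitialize_empty_clusters` beforehand:

import numpy as np

def update_centers(self, X, assignments):
    for c in range(self.cluster_centers.shape[0]):
        members = X[assignments == c]
        if members.size:
            # each center moves to the centroid of its assigned points
            self.cluster_centers[c] = members.mean(axis=0)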
Example #16
def B4(pca=False):
    '''
    Evaluate using NMI and visualize in 2D.
    '''
    fnames = [
        'digits-embedding.csv', 'digits-embedding-2467.csv',
        'digits-embedding-67.csv'
    ]
    nmi = zeros(len(fnames))
    for i, k, fname in zip([0, 1, 2], [8, 4, 2], fnames):
        raw = genfromtxt(fname, delimiter=',')
        X = raw[:, 2:]
        y = get_normalized_labels(raw[:, 1])
        kmeans = KMeans(n_clusters=k)
        ind = kmeans.fit(X, y)
        _, _, nmi[i] = kmeans.get_evals()
        figure()
        perm = permutation(X.shape[0])[:1000]
        X = X[perm]
        ind = ind[perm]
        colors = rand(k, 3)[ind, :]
        scatter(X[:, 0], X[:, 1], c=colors, alpha=0.9, s=30)
    print(fnames)
    print("NMI =", nmi)
    show()
Example #17
def kmeans_trials(k=3, r=1):

    # Create and train r models for trials
    models = [KMeans(k, data) for _ in range(r)]
    training_err = [m.train(data) for m in models]

    # Sort models by sum-of-squares error
    results = [(err[-1], model) for err, model in zip(training_err, models)]
    results = sorted(results,
                     key=lambda x: x[0])  # Sort ascending by sum square error

    # Plot trial results
    for i, trial in enumerate(results):
        final_err = round(trial[0], 2)
        m = trial[1]

        plt.title(f'Trial {i+1} Cluster Assignments (SSE={final_err})')
        plotKClusters(m, k, data)
        plt.show()

    # Show best model from r trials
    best_sse = round(results[0][0], 2)
    best_model = results[0][1]
    plt.title(f"Best model (SSE={best_sse})")
    plotKClusters(best_model, k, data)
    plt.show()
Example #18
def B1(pca=False):
    '''
    Plot WC_SSD and SC over K.
    '''
    K = [2, 4, 6, 8, 16, 32]
    fnames = [
        'digits-embedding.csv', 'digits-embedding-2467.csv',
        'digits-embedding-67.csv'
    ]
    wc_ssd_val = zeros((len(fnames), len(K)))
    sc_val = zeros((len(fnames), len(K)))
    for i, fname in enumerate(fnames):
        X = genfromtxt(fname, delimiter=',')[:, 2:]
        for j, k in enumerate(K):
            kmeans = KMeans(n_clusters=k)
            kmeans.fit(X)
            wc_ssd_val[i, j], sc_val[i, j], _ = kmeans.get_evals()
    # Plot WC_SSD
    figure()
    for i, fname in enumerate(fnames):
        plot(K, wc_ssd_val[i], label=fname)
    legend()
    title('WC_SSD v.s. K')
    figure()
    for i, fname in enumerate(fnames):
        plot(K, sc_val[i], label=fname)
    legend()
    title('SC v.s. K')
    show()
Example #19
 def test_predict(self):
     test_samples = [[-3, -3], [3, 3], [-1, -1], [1, 1]]
     expected_predictions = [0, 1, 0, 1]
     k_means = KMeans(num_clusters=self.num_clusters, seed=1)
     k_means.fit(self.data)
     predictions = k_means.predict(test_samples)
     self.assertEqual(expected_predictions, predictions)
Example #20
def cluster_colors(img, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters)
    color_vectors = cv.cvtColor(img, cv.COLOR_BGR2RGB).reshape([-1, 3])
    centroids = kmeans.fit(color_vectors)
    labels = kmeans.predict(color_vectors)
    pred = labels.reshape(img.shape[:-1])

    # Initialize img for clusters
    cluster_img = np.zeros(img.shape)
    for i in range(n_clusters):
        cluster_img[np.where(pred == i)] = centroids[i]

    cluster_img = cluster_img.astype(np.uint8)
    plt.figure(figsize=(10, 10))
    plt.imshow(cluster_img)

    colors = ["Cluster {}".format(i) for i in range(n_clusters)]
    patches = [
        mpatches.Patch(color=centroids[i] / 255, label=colors[i])
        for i in range(len(colors))
    ]
    plt.legend(handles=patches,
               bbox_to_anchor=(1.05, 1),
               loc=2,
               borderaxespad=0.)
    plt.show()

    return kmeans
Example #21
 def test_select_initial_centroids(self):
     expected_initial_centroids = [[2, 1], [-1, -2]]
     k_means = KMeans(num_clusters=self.num_clusters, seed=3)
     k_means.fit(self.data)
     initial_centroids = k_means._select_initial_centroids(self.data)
     self.assertEqual(expected_initial_centroids, initial_centroids)
     self.assertEqual(self.num_clusters, len(initial_centroids))
Example #22
def spectral(X, sigma, k, centroids):
    """
    Ng spectral clustering algorithm
    :param X: data points
    :param sigma: parameter
    :param k: parameter
    :return: accu, the clustering accuracy
    """
    (n, d) = X.shape
    L_sym, L = get_L(X, k, sigma)
    eig, eigvec = np.linalg.eig(L_sym)  # eigenvectors are stored column-wise
    # eig_index = np.argsort(eig)[1:d+1]
    eig_index = np.argsort(eig)[:d]  # indices of the d smallest eigenvalues
    U = eigvec[:, eig_index]
    T = np.zeros(U.shape)
    for i in range(n):
        for j in range(d):
            T[i][j] = U[i][j] / np.linalg.norm(U[i])
    Y = T
    # visual(Y, k=k, sigma=sigma, save=1)

    cluster = KMeans(2, 100, centroids)
    cluster.fit(Y)
    labels = cluster.labels

    if labels[0] == 0:
        n1 = 100 - sum(labels[:100])
        n2 = sum(labels[100:])
    else:
        n1 = sum(labels[:100])
        n2 = 100 - sum(labels[100:])
    accu = (n1 + n2) / n
    print('---------------------sigma=%.2f, k=%d, accu=%.4f' %
          (sigma, k, accu))
    return accu
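`get_L` is not shown in this snippet. A plausible sketch, assuming a Gaussian affinity restricted to each point's k nearest neighbors and the symmetric normalized Laplacian of the Ng-Jordan-Weiss construction (all of this is an assumption about the missing helper):

import numpy as np

def get_L(X, k, sigma):
    n = X.shape[0]
    # pairwise squared distances and Gaussian affinities
    d2 = ((X[:, None, :] - X[None, :, :]) ** 2).sum(axis=2)
    W = np.exp(-d2 / (2 * sigma ** 2))
    np.fill_diagonal(W, 0.0)
    # keep only each point's k nearest neighbors, symmetrized
    nn = np.argsort(d2, axis=1)[:, 1:k + 1]
    mask = np.zeros_like(W, dtype=bool)
    mask[np.repeat(np.arange(n), k), nn.ravel()] = True
    W = np.where(mask | mask.T, W, 0.0)
    deg = W.sum(axis=1)
    L = np.diag(deg) - W              # unnormalized Laplacian
    d_inv_sqrt = np.diag(1.0 / np.sqrt(np.maximum(deg, 1e-12)))
    L_sym = d_inv_sqrt @ L @ d_inv_sqrt  # symmetric normalized Laplacian
    return L_sym, L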
Example #23
def main():
    tagged_words = brown.tagged_words()
    words_corpus = brown.words()

    word2vec = Word2Vec()
    word2vec.train(words_corpus)

    word_vecs = [word2vec.word2vec(word) for word in words_corpus]

    n_clusters = 10 # random number for now
    kmeans = KMeans(n_clusters)
    kmeans.compute(word_vecs)

    # word-cluster HMM
    p_word = {}
    p_cluster = {}

    p_cluster_given_word = None # softmax
    p_word_given_cluster = None # joint probability formula

    p_transition_cluster = None # count
    p_initial_cluster = None # count

    # cluster-tag HMM
    p_cluster_given_tag = None # softmax
    p_transition_tag = None # count from tagged data
    p_initial_tag = None # count from tagged data

    hmm_word_cluster = HMM(p_initial_cluster, p_transition_cluster, p_word_given_cluster)
    hmm_cluster_tag = HMM(p_initial_tag, p_transition_tag, p_cluster_given_tag)

    words = []
    clusters = hmm_word_cluster.viterbi(words)
    tags = hmm_cluster_tag.viterbi(clusters)
Example #24
def main():
    filepath = "./data/self_test.csv"
    #filepath = "./data/self_test_petit.csv"
    #filepath = "./data/iris.csv"

    # load the data
    data, labels = load_dataset(filepath)

    # initialize the KMeans object
    kmeans = KMeans(n_clusters=3,
                    max_iter=100,
                    early_stopping=True,
                    tol=1e-6,
                    display=True)

    # compute the clusters
    kmeans.fit(data)

    # compute the purity of our clusters
    score = kmeans.score(data, labels)
    print("Purity: {}".format(score))



    input("Press any key to exit...")
Example #25
def kmeans_image_compression():

    print("[+] K-Means Image Compression")
    im = plt.imread('baboon.tiff')
    N, M = im.shape[:2]
    im = im / 255

    # convert to RGB array
    data = im.reshape(N * M, 3)
    # print(im)

    k_means = KMeans(n_cluster=16, max_iter=100, e=1e-6)
    centroids, _, i = k_means.fit(data)

    # print(centroids.shape)

    print('[+] RGB centroids computed in {} iterations'.format(i))
    new_im = transform_image(im, centroids)

    assert new_im.shape == im.shape, \
        'Shape of transformed image should be same as image'

    mse = np.sum((im - new_im)**2) / (N * M)
    print('[+] Mean square error per pixel is {}\n'.format(mse))
    plt.imsave('plots/compressed_baboon.png', new_im)
Example #26
def initialize_clusters(X, n_clusters):
    """ 
        Initialize the clusters by storing the information in the data matrix X into the clusters

        Parameter:
            X: Input feature matrix
            n_clusters: Number of clusters we are trying to classify

        Return:
            cluster: List of clusters. Each cluster center is calculated by the KMeans algorithm above.
    """
    clusters = []
    index = np.arange(X.shape[0])

    # We use the KMeans centroids to initialise the GMM

    kmeans = KMeans().fit(X)
    mu_k = kmeans.centers

    for i in range(n_clusters):
        clusters.append({
            'w_k': 1.0 / n_clusters,
            'mu_k': mu_k[i],
            'cov_k': np.identity(X.shape[1], dtype=np.float64)
        })

    return clusters
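One way these cluster dicts would be consumed is the E-step of EM; a hedged sketch of the responsibility computation (the function name is illustrative, and scipy is assumed available for the Gaussian pdf):

import numpy as np
from scipy.stats import multivariate_normal

def responsibilities(X, clusters):
    # weighted density of each sample under each component
    R = np.column_stack([
        c['w_k'] * multivariate_normal.pdf(X, mean=c['mu_k'], cov=c['cov_k'])
        for c in clusters
    ])
    # normalize rows so responsibilities sum to 1 per sample
    return R / R.sum(axis=1, keepdims=True)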
Example #27
def kmeans_builder(centroid_func):
    samples_per_cluster = 50
    n_cluster = 9

    x, y = toy_dataset(n_cluster, samples_per_cluster)
    fig = Figure()
    fig.ax.scatter(x[:, 0], x[:, 1], c=y)
    fig.savefig('plots/toy_dataset_real_labels.png')

    fig.ax.scatter(x[:, 0], x[:, 1])
    fig.savefig('plots/toy_dataset.png')

    k_means = KMeans(n_cluster=n_cluster, max_iter=100, e=1e-8)

    centroids, membership, i = k_means.fit(x, centroid_func)



    assert centroids.shape == (n_cluster, 2), \
        ('centroids for toy dataset should be numpy array of size {} X 2'
            .format(n_cluster))

    assert membership.shape == (samples_per_cluster * n_cluster,), \
        'membership for toy dataset should be a vector of size {}'.format(len(membership))

    assert type(i) == int and i > 0,  \
        'Number of updates for toy datasets should be integer and positive'

    print('[success] : kmeans clustering done on toy dataset')
    print('Toy dataset K means clustering converged in {} steps'.format(i))

    fig = Figure()
    fig.ax.scatter(x[:, 0], x[:, 1], c=membership)
    fig.ax.scatter(centroids[:, 0], centroids[:, 1], c='red')
    fig.savefig('plots/toy_dataset_predicted_labels.png')
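`centroid_func` is passed straight through to `fit`; a minimal candidate, assuming a (n_cluster, x) -> initial-centroids signature implied by that call, is plain random sampling:

import numpy as np

def random_centroid_func(n_cluster, x):
    # choose n_cluster distinct data points as the starting centroids
    idx = np.random.choice(x.shape[0], size=n_cluster, replace=False)
    return x[idx]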
Example #28
 def __init__(self,
              n_cluster: int,
              data: np.ndarray,
              use_kmeans: bool = False,
              w: float = 0.9,
              c1: float = 0.5,
              c2: float = 0.3,
              flag: int = 1,
              weights: list = None):
     index = np.random.choice(list(range(len(data))), n_cluster)
     self.centroids = data[index].copy()
     if use_kmeans:
         kmeans = KMeans(n_cluster=n_cluster, init_pp=False)
         kmeans.fit(data)
         self.centroids = kmeans.centroid.copy()
     self.best_position = self.centroids.copy()
     self.best_score = quantization_error(self.centroids, self._predict(data), data)
     self.flag = flag
     if self.flag % 2 == 1:
         self.best_sse = calc_sse(self.centroids, self._predict(data), data)
     else:
         self.best_sse = calc_sse2(self.centroids, self._predict(data), data, weights)
     self.velocity = np.zeros_like(self.centroids)
     self._w = w
     self._c1 = c1
     self._c2 = c2
Example #29
def kmeans_toy():
    x, y = toy_dataset(4)
    fig = Figure()
    fig.ax.scatter(x[:, 0], x[:, 1], c=y)
    fig.savefig('plots/toy_dataset_real_labels.png')

    fig.ax.scatter(x[:, 0], x[:, 1])
    fig.savefig('plots/toy_dataset.png')
    n_cluster = 4
    k_means = KMeans(n_cluster=n_cluster, max_iter=100, e=1e-8)
    centroids, membership, i = k_means.fit(x)

    assert centroids.shape == (n_cluster, 2), \
        ('centroids for toy dataset should be numpy array of size {} X 2'
            .format(n_cluster))

    assert membership.shape == (50 * n_cluster,), \
        'membership for toy dataset should be a vector of size 200'

    assert type(i) == int and i > 0,  \
        'Number of updates for toy datasets should be integer and positive'

    print('[success] : kmeans clustering done on toy dataset')
    print('Toy dataset K means clustering converged in {} steps'.format(i))

    fig = Figure()
    fig.ax.scatter(x[:, 0], x[:, 1], c=membership)
    fig.ax.scatter(centroids[:, 0], centroids[:, 1], c='red')
    fig.savefig('plots/toy_dataset_predicted_labels.png')

    np.savez('results/k_means_toy.npz',
             centroids=centroids,
             step=i,
             membership=membership,
             y=y)
Example #30
def cluster_newsgroups():
    """ Cluster newsgroup categories. """

    from kmeans import KMeans
    from similarity import simMatrix

    corpus, dictionary = build_dictionary(bigram=True)
    tfidf = TFIDF(dictionary)
    newsgroups = tfidf.vectorize(corpus)
    dictionary = tfidf.dictionary

    categories = sorted(corpus.keys())

    N = 6
    print("\n{}-Most Common Words".format(N))
    for index, category in enumerate(categories):
        nlargest = np.argpartition(newsgroups[index, :], -N)[-N:]
        nlargest = nlargest[np.argsort(newsgroups[index, nlargest])][::-1]
        print("{:>24} {}".format(category, dictionary[nlargest]))
    print()

    K = 3
    km = KMeans(n_clusters=K)
    km.fit(newsgroups)

    labels = km.labels_

    print "\nKMeans Label Assignment, K = {}".format(K)
    for category, label, in zip(categories, labels):
        print int(label), category

    simMatrix(newsgroups).plot().show()