예제 #1
0
    def cluster(self):
        """Cluster ``self.data`` with the method selected in ``self.options``.

        The method name is read from ``self.options.get_cluster_method()``.
        Only ``'kmeans'`` is handled; its result is stored in
        ``self.clusters``.

        Raises:
            ValueError: If the configured method name is not recognised.
        """
        method = self.options.get_cluster_method()

        if method != 'kmeans':
            # ValueError derives from Exception, so callers that caught the
            # old generic Exception keep working. %-formatting also copes
            # with a non-string method name, where '+' concatenation raised
            # an unrelated TypeError.
            raise ValueError('Unknown cluster method: %s' % (method,))

        self.clusters = kmeans.cluster(self.options, self.data)
예제 #2
0
def gmm_init(k, samples):
    """
    Build starting parameters for a k-component Gaussian mixture model.

    The samples are first partitioned with ``km.kmeans`` / ``km.cluster``.
    Each partition then yields one ``[mean, sigma, weight]`` entry: its
    k-means center, the averaged outer product of the members' offsets
    from that center, and the raw member count (so the weights are counts
    and do not sum up to 1).
    """
    centers = km.kmeans(k, samples)
    clusters = km.cluster(samples, centers)

    # Every sigma has the shape of one sample's outer product with itself.
    sigma_shape = np.outer(samples[0], samples[0]).shape

    params = []
    for idx in range(k):
        members = clusters[idx]
        mean = centers[idx]
        weight = len(members)

        # Accumulate the scatter around the center one member at a time.
        sigma = np.zeros(sigma_shape)
        for point in members:
            offset = point - mean
            sigma += np.outer(offset, offset)
        sigma /= weight

        params.append([mean, sigma, weight])
    return params
예제 #3
0
def extract_centroids_histogram(descs, k=DEFAULT_K_CLUSTERS):
    """Cluster descriptors and return the centroids with a normalized histogram.

    Args:
        descs: Sized collection of descriptors handed to ``kmeans.cluster``.
        k: Number of clusters requested.

    Returns:
        ``(centroids, hist)`` where ``hist`` has been passed through
        ``normalize_hist(hist, k)``; ``([], [])`` when ``descs`` is empty.
    """
    # Explicit len() check kept on purpose: presumably descs can be a NumPy
    # array, whose truth value would be ambiguous -- TODO confirm.
    if len(descs) == 0:
        return [], []

    # A single-argument print(...) emits the same text under Python 2 and is
    # valid syntax under Python 3, unlike the bare print statement.
    print("Performing clustering on " + str(len(descs)) + " descriptors (k=" + str(k) + ")...")

    # Perform clustering to find the best grouping of the descriptors
    centroids, hist = kmeans.cluster(descs, k)
    print("Found " + str(len(centroids)) + " clusters in training descriptors ")

    hist = normalize_hist(hist, k)
    return centroids, hist
예제 #4
0
def extract_centroids_histogram(descs, k=DEFAULT_K_CLUSTERS):
    """Group descriptors into ``k`` clusters and build their histogram.

    Args:
        descs: Sized collection of descriptors handed to ``kmeans.cluster``.
        k: Number of clusters requested.

    Returns:
        A ``(centroids, hist)`` pair; ``hist`` is the histogram returned by
        ``kmeans.cluster`` after ``normalize_hist(hist, k)``. An empty
        ``descs`` short-circuits to ``([], [])`` without clustering.
    """
    num_descs = len(descs)
    if num_descs == 0:
        return [], []

    # print(...) with one argument behaves the same on Python 2 and 3; the
    # former print statement was a SyntaxError on Python 3.
    print("Performing clustering on " + str(num_descs)
          + " descriptors (k=" + str(k) + ")...")

    # Perform clustering to find the best grouping of the descriptors
    centroids, hist = kmeans.cluster(descs, k)
    print("Found " + str(len(centroids)) + " clusters in training descriptors ")

    return centroids, normalize_hist(hist, k)
예제 #5
0
def cluster(data, method, k):
    """Interactively cluster ``data`` into human-labelled groups.

    Only ``method == 'kmeans'`` is implemented. Each round runs
    ``kmeans.cluster(data, k)``, shows every centroid that is not yet in
    ``human_points`` (a list defined outside this function) and asks the
    operator to label it. A label repeated within one round lowers ``k`` by
    one and starts another round. Otherwise the operator is asked whether
    ``k`` clusters are enough; answering ``no`` raises ``k`` by one, any
    other answer accepts the round.

    Args:
        data: Samples handed to ``kmeans.cluster`` and ``kmeans.fit``.
        method: Name of the clustering method.
        k: Initial number of clusters.

    Returns:
        The value of ``kmeans.fit(data, centroids)`` for the accepted round,
        or an empty list when ``method`` is not supported.
    """
    # Joined into one string so the output matches the old multi-item print
    # statement and the call is valid on both Python 2 and Python 3.
    print(' '.join(['clustering with method: ', str(method), ' and k: ', str(k)]))
    clusters = []

    if method != 'kmeans':
        # who knows what will go here? long-term goals
        # Nothing in the labelling loop ever changes `method`, so report once
        # and return instead of looping forever; the old branch also read
        # `method.m`, which does not exist on a string.
        print(' '.join([str(method), ' is not a valid method right now. sorry!']))
        return clusters

    # raw_input exists only on Python 2; fall back to input on Python 3.
    try:
        ask = raw_input
    except NameError:
        ask = input

    human_needed = True
    while human_needed:
        duplicates = False
        unique_labels = []
        # TODO: move all of this out into a separate function
        # 1. cluster with kmeans
        centroids = kmeans.cluster(data, k)

        # 2. ask a human to label every centroid that has no label yet
        for cent in centroids:
            if cent in human_points:
                continue

            display_char(cent, 16, 8, 'Centroid')
            label = ask('Please label this centroid: ')

            # remember the point together with its label
            human_points.append(cent)
            human_labels.append(label)
            if label in unique_labels:
                # Label already used this round: presumably two centroids
                # cover the same class, so retry with one cluster fewer.
                k -= 1
                duplicates = True
            else:
                unique_labels.append(label)

        # 3. ask if there needs to be more clusters
        if duplicates:
            print('Removing duplicate clusters')
            continue

        more = ask('Is ' + str(k) + ' enough clusters? (yes/no): ')
        if more == 'no':
            k += 1
        else:
            human_needed = False
            clusters = kmeans.fit(data, centroids)

    return clusters
예제 #6
0
def processData():
    """Fetch the articles, dump them to ``Articles.txt`` and cluster them.

    Returns:
        dict: ``{'clusters': ...}`` holding the value returned by
        ``kmeans.cluster(5, articles)``.
    """
    articles = getArticles()

    # Keep a JSON copy of the fetched articles on disk before clustering.
    with open('Articles.txt', 'w') as articles_file:
        json.dump({'articles': articles}, articles_file)

    return {'clusters': kmeans.cluster(5, articles)}
예제 #7
0
def test_K(args):
    """Cluster one corpus with ``k`` clusters and score the result.

    The three settings arrive packed in a single tuple, exactly as callers
    passed them to the original tuple-parameter signature.

    Args:
        args: ``(k, corpus_filename, algorithm)``.

    Returns:
        ``(k, metric)`` where ``metric`` is the value of
        ``wssse(clusters_data, centers)``, or the exception instance if any
        step of the pipeline raised.
    """
    # Tuple parameters in a signature were removed in Python 3 (PEP 3113);
    # unpacking here accepts the same single-tuple call on both versions and,
    # as before, a malformed tuple fails outside the try block.
    k, corpus_filename, algorithm = args
    try:
        X, labels = Xy(corpus_filename)
        pred_clusters, centers = cluster(
            X,
            seed=1,
            n_clusters=k,
            alg=algorithm
        )
        clusters_data = extract_clusters(X, pred_clusters, labels, k)
        metric = wssse(clusters_data, centers)
    except Exception as exc:
        # Deliberate best-effort: the failure is handed back in place of the
        # score so the caller still gets a (k, ...) pair for this run.
        metric = exc
    return k, metric
예제 #8
0
    def test_vs_sklearn(self):
        """Check centroids and SSE against scikit-learn's KMeans on iris"""

        iris = skdatasets.load_iris().data
        k = 3

        # Erisoglu seeding is deterministic, so both runs share one start
        initial_centroids = erisoglu.generate(iris, k)

        ours = mykm.cluster(iris, k, initial_centroids)

        reference = skcluster.KMeans(
            n_clusters=k, n_init=1, init=initial_centroids)
        reference.fit(iris)

        # Centroids have to agree to 6 decimal places
        np.testing.assert_array_almost_equal(
            ours['centroids'], reference.cluster_centers_, decimal=6)

        # The reported SSE has to agree with sklearn's inertia to 8 places
        self.assertAlmostEqual(ours['inertia'], reference.inertia_, places=8)
예제 #9
0
import dataset
import kmeans
import matplotlib.pyplot as plt
import numpy as np

# Read the raw iris table.
iris_data = dataset.load_dataset('iris.csv')

# Replace the 'species' names with numeric codes.
iris_data, iris_classes = dataset.to_numeric(iris_data, 'species')

# Convert every column except the last one from string to float.
attrs_conv = list(iris_data.axes[1][:-1])
data = dataset.from_str(iris_data, attrs_conv)

# Matrix form of the table.
iris_ds = dataset.to_matrix(iris_data)
print(type(iris_ds))

# Run k-means with k = 3 on the matrix with column 4 removed.
features = np.delete(iris_ds, 4, axis=1)
centroids, cluster_assignments, iters, orig_centroids = kmeans.cluster(features, 3)

# Report the outcome.
print('Number of iterations:', iters)
print('\nFinal centroids:\n', centroids)
print('\nCluster membership and error of first 10 instances:\n',
      cluster_assignments[:10])
print('\nOriginal centroids:\n', orig_centroids)

# plot cost function for different values of K
# to get the optimum K with elbow method

import numpy as np
import matplotlib.pyplot as plt

from kmeans import cluster

low, high = 1, 10  # K takes every value in range(low, high)

J = []  # J[n] is the averaged cost for K = low + n

for K in range(low, high):
    result = cluster(K)
    cost = 0
    length = 0
    for idx in range(K):
        members = np.array(result[idx])
        centroid = np.mean(members, axis=0)
        residuals = members - centroid
        # squared distance of every member to its cluster mean
        cost += np.sum(residuals ** 2)
        length += np.size(members, axis=0)
    # normalise by the total number of clustered points
    J.append(cost / length)

# NOTE(review): the "seaborn" style name was deprecated in Matplotlib 3.6 --
# confirm it is still available in the targeted version.
plt.figure()
plt.style.use("seaborn")
plt.plot(range(low, high), J, "r--")
plt.show()
예제 #11
0
def clusterFeatures(features, k=500):
    """Cluster each feature set independently with k-means.

    Args:
        features: Iterable of feature sets; each one is passed on its own to
            ``kmeans.cluster``.
        k: Number of clusters requested per feature set. Defaults to 500,
            the value that used to be hard-coded.

    Returns:
        list: One ``(centers, weights)`` tuple per feature set, in input
        order.
    """
    ret = []
    for feat in features:
        # kmeans.cluster returns three values; the middle one is not needed.
        centers, _codes, weights = kmeans.cluster(feat, k)
        ret.append((centers, weights))
    return ret
예제 #12
0
import kmeans
import random
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np

# Script entry point: draw k random starting centers, build an initial
# clustering, retry until kmeans.check_count accepts it, then start refining
# the centers.
if __name__ == "__main__":
    k = 3  # number of clusters
    init_flag = False  # set from kmeans.check_count for the initial clustering
    while not (init_flag):
        print("\n loading dataset")
        feature_vectors = kmeans.gen_feature_vectors("iris.data.txt")
        print ("\n Initializing cluster centres")
        # k feature vectors sampled without replacement act as starting centers
        centers = random.sample(feature_vectors, k)
        clustered_data = kmeans.cluster(centers, feature_vectors, True) # note that this is only an initial clustering
        print("\ncounting cluster members for each cluster")
        new_count = kmeans.count_elements(clustered_data, k)
        print("\n verifying that the clusters are acceptable") 
        # presumably check_count rejects a clustering that contains an empty
        # cluster (see the message printed below) -- verify in kmeans
        init_flag = kmeans.check_count(new_count, k)
        if not (init_flag):
            print("\n Zero cluster is detected .. reinitializing algorithm")
        else:
            print("\n clusters are acceptable .. proceeding for optimization of cluster centers")
        # (re)set on every pass of the initialisation loop, before the
        # refinement loop below reads them
        converge_flag = False
        counter = 1
    
    # NOTE(review): within the lines visible here nothing updates
    # converge_flag or counter, so this loop cannot terminate as written; the
    # snippet appears to be cut off after calculate_centers -- confirm
    # against the full script.
    while not bool(converge_flag):
        print("\n Iteration no :", counter)
        # keep the previous member counts before the centers are recomputed
        old_count = new_count
        centers = kmeans.calculate_centers(clustered_data, k)
# record the results of k-means clustering
# using the optimum value of K as obtained by elbow method

import json
from kmeans import cluster

# K is typed in by the user once the elbow plot has been assessed
K = int(input())
results = cluster(K)

# write the clustering results to disk as JSON
filename = "results.json"
with open(filename, "w") as results_file:
    json.dump(results, results_file)
예제 #14
0
sns.set()
import kmeans
from util import dataset

# Three Gaussian blobs of 150, 50 and 50 two-dimensional points.
points = np.vstack(((np.random.randn(150, 2) * 0.75 + np.array([1, 0])),
                    (np.random.randn(50, 2) * 0.25 + np.array([-0.5, 0.5])),
                    (np.random.randn(50, 2) * 0.5 + np.array([-0.5, -0.5]))))

# NOTE: this rebinds the imported `dataset` name to the new instance.
dataset = dataset(points)
dataset.reduce(5)

plt.scatter(points[:, 0], points[:, 1])
#ax = plt.gca()
#ax.add_artist(plt.Circle(np.array([1, 0]), 0.75/2, fill=False, lw=3))
#ax.add_artist(plt.Circle(np.array([-0.5, 0.5]), 0.25/2, fill=False, lw=3))
#ax.add_artist(plt.Circle(np.array([-0.5, -0.5]), 0.5/2, fill=False, lw=3))

#centroids = kmeans.cluster(points, 3)
centroids, closest = kmeans.cluster(dataset.reduced_data, 3)

# Rows assigned to cluster 0. np.argwhere returns an (n, 1) index array, so
# points[arg1] used to come out as (n, 1, 2) and cluster1[:, 1] raised
# IndexError; flat indices keep cluster1 two-dimensional.
# NOTE(review): assumes `closest` is a 1-D array holding one label per row of
# `points` -- confirm against kmeans.cluster and dataset.reduce.
arg1 = np.flatnonzero(closest == 0)
cluster1 = points[arg1]
print(cluster1)
plt.scatter(cluster1[:, 0], cluster1[:, 1], c='g')

# Insert the values of `closest` as a third column next to the coordinates.
whole = np.insert(points, 2, closest, axis=1)
print(whole)

print(centroids[:, 0])
plt.scatter(centroids[:, 0], centroids[:, 1], c='r', s=100)