from collections import defaultdict

import numpy as np
from django.conf import settings
from scikits.learn.cluster import KMeans

# `google_TSP` and `PointGeoTag` are project-level helpers assumed to be
# importable from the surrounding application.


def cluster_objects(objects, optimize_within_clusters=False,
                    round_trip=False, initial=None):
    """
    Return a list of objects clustered by geographical position.

    :param objects: The list of objects or a queryset. The objects must be
        instances of PointGeoTag or implement
        `get_point_coordinates(self, as_string=False, inverted=False)`
        to obtain the coordinates.
    :param optimize_within_clusters: a boolean specifying whether each
        cluster should be ordered along a (near-)optimal route.
    :returns: A list of clusters. Example: [[<p1>, <p2>], [<p3>, <p4>, <p5>]]
    """
    # Keep only objects that actually have coordinates, so the rows of X
    # stay aligned with the objects we later assign to clusters.
    located = [obj for obj in objects
               if obj.get_point_coordinates(as_string=False, inverted=True)]
    X = np.array([list(obj.get_point_coordinates(as_string=False,
                                                 inverted=True))
                  for obj in located])

    # Affinity propagation could determine the number of clusters
    # automatically:
    # X_norms = np.sum(X*X, axis=1)
    # S = - X_norms[:,np.newaxis] - X_norms[np.newaxis,:] + 2 * np.dot(X, X.T)
    # p = 10*np.median(S)
    # af = AffinityPropagation()
    # af.fit(S, p)
    # n_clusters_ = len(af.cluster_centers_indices_)

    n_items = len(X)
    max_items = getattr(settings, 'ITEMS_PER_BUCKET', 10) - 1
    # Ceiling division: no bucket may exceed max_items, and we always need
    # at least one cluster (the old floor division produced n_clusters=0
    # whenever n_items < max_items, which crashed KMeans).
    n_clusters = max(1, -(-n_items // max_items))

    # KMeans is the right tool when the number of clusters is fixed upfront.
    km = KMeans(k=n_clusters, init='k-means++')
    km.fit(X)

    cluster_dict = defaultdict(list)
    for i, cluster_id in enumerate(km.labels_):
        cluster_dict[cluster_id].append(located[i])
    clusters = cluster_dict.values()

    if optimize_within_clusters:
        if initial:
            result = []
            for cluster in clusters:
                if initial in cluster:
                    # Route the cluster holding the starting point first,
                    # with the starting point at its head.
                    cluster.remove(initial)
                    cluster.insert(0, initial)
                    result.insert(0, google_TSP(cluster,
                                                round_trip=round_trip))
                else:
                    result.append(google_TSP(cluster, round_trip=round_trip))
            return result
        else:
            return [google_TSP(cluster, round_trip=round_trip)
                    for cluster in clusters]
    return clusters
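# A minimal usage sketch for cluster_objects. `Stop` and its coordinates are
# hypothetical stand-ins for any PointGeoTag-like object; a configured Django
# settings module is assumed, as in the application above.

class Stop(object):
    def __init__(self, name, lat, lon):
        self.name, self.lat, self.lon = name, lat, lon

    def get_point_coordinates(self, as_string=False, inverted=False):
        # inverted=True yields (lon, lat), the ordering the clusterer expects.
        return (self.lon, self.lat) if inverted else (self.lat, self.lon)

stops = [Stop('a', 40.42, -3.70), Stop('b', 40.45, -3.69),
         Stop('c', 41.39, 2.17)]
for bucket in cluster_objects(stops):
    print [s.name for s in bucket]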
import numpy as np
from scikits.learn.cluster import KMeans


def cluster_centroids(x, k=32, max_iter=300, km_kwargs=None):
    """Return centroids ordered by their euclidean norm."""
    # Pass max_iter through (it was previously hardcoded to 300) and avoid
    # a mutable default argument for km_kwargs.
    km = KMeans(k, init='k-means++', max_iter=max_iter, **(km_kwargs or {}))
    trained = km.fit(x)
    centroids = trained.cluster_centers_
    # Sort rows by vector norm so the ordering is deterministic across runs.
    ind = np.argsort(np.linalg.norm(centroids, axis=1))
    return centroids[ind]
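# A quick sanity check for cluster_centroids, reusing the imports above.
# Two well-separated blobs should yield two centroids, with the one nearer
# the origin listed first; the data here is illustrative only.

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    blob_a = rng.randn(100, 2)              # centered near (0, 0)
    blob_b = rng.randn(100, 2) + [10, 10]   # centered near (10, 10)
    cents = cluster_centroids(np.vstack([blob_a, blob_b]), k=2)
    print cents  # first row ~ (0, 0), second row ~ (10, 10)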
import time

import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.datasets.samples_generator import make_blobs

##############################################################################
# Generate sample data
np.random.seed(0)

batch_size = 45
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
X, labels_true = make_blobs(n_samples=1200, centers=centers, cluster_std=0.7)

##############################################################################
# Compute clustering with KMeans
k_means = KMeans(init='k-means++', k=3)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0
k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels_unique = np.unique(k_means_labels)

##############################################################################
# Compute clustering with MiniBatchKMeans
mbk = MiniBatchKMeans(init='k-means++', k=3, chunk_size=batch_size)
t0 = time.time()
mbk.fit(X)
t_mini_batch = time.time() - t0
mbk_means_labels = mbk.labels_
mbk_means_cluster_centers = mbk.cluster_centers_
mbk_means_labels_unique = np.unique(mbk_means_labels)
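##############################################################################
# A small, optional comparison of the two fits (a sketch, not part of the
# original example): report wall-clock time and how far each mini-batch
# center drifted from its nearest full-batch counterpart.
print 'KMeans fit in %.3fs, MiniBatchKMeans fit in %.3fs' % (
    t_batch, t_mini_batch)
for center in mbk_means_cluster_centers:
    dists = np.sqrt(((k_means_cluster_centers - center) ** 2).sum(axis=1))
    print 'mini-batch center %s is %.3f from the nearest full-batch one' % (
        center, dists.min())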
import numpy as np
from scikits.learn.cluster import KMeans

n_points_per_cluster = 250
n_clusters = 3
n_points = n_points_per_cluster * n_clusters
means = np.array([[1, 1], [-1, -1], [1, -1]])
std = .6

X = np.empty((0, 2))
for i in range(n_clusters):
    X = np.r_[X, means[i] + std * np.random.randn(n_points_per_cluster, 2)]

################################################################################
# Compute clustering with KMeans
km = KMeans(init='k-means++', k=3, n_init=1)
km.fit(X)
labels = km.labels_
cluster_centers = km.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print "number of estimated clusters : %d" % n_clusters_

################################################################################
# Plot result
import pylab as pl
from itertools import cycle

pl.figure(1)
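# The fragment above stops right after pl.figure(1). A plausible
# continuation, in the style of the classic clustering examples, colors each
# cluster and marks its center; this is a sketch, not the original code.
pl.clf()
colors = cycle('bgrcmyk')
for j, col in zip(range(n_clusters_), colors):
    members = labels == j
    center = cluster_centers[j]
    pl.plot(X[members, 0], X[members, 1], col + '.')
    pl.plot(center[0], center[1], 'o', markerfacecolor=col,
            markeredgecolor='k', markersize=14)
pl.title('Estimated number of clusters: %d' % n_clusters_)
pl.show()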
# Fragment: `time`, `data`, `n_samples`, `SelfOrganizingMap` and `pseudo_F`
# are defined earlier in the original script.
print

################################################################################
# Digits dataset clustering using Self-Organizing Map
print "Self-Organizing Map "
t0 = time()
grid_width = 4
som = SelfOrganizingMap(size=grid_width,
                        n_iterations=n_samples * 5,
                        learning_rate=1)
som.fit(data)
print "done in %0.3fs" % (time() - t0)
print

F = pseudo_F(data, som.labels_, som.neurons_)
print 'pseudo_F %0.2f | %0.2f%%' % (F, 100 * (F / (1 + F)))
print

################################################################################
# Digits dataset clustering using KMeans
print "KMeans "
t0 = time()
km = KMeans(init='k-means++', k=grid_width ** 2, n_init=10)
km.fit(data)
print "done in %0.3fs" % (time() - t0)
print

F = pseudo_F(data, km.labels_, km.cluster_centers_)
print 'pseudo_F %0.2f | %0.2f%%' % (F, 100 * (F / (1 + F)))
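################################################################################
# `pseudo_F` is defined elsewhere in the original script. Assuming it is the
# Calinski-Harabasz pseudo-F statistic (between-cluster dispersion over
# within-cluster dispersion, scaled by degrees of freedom), a minimal sketch:
import numpy as np

def pseudo_F_sketch(X, labels, centroids):
    # Accept SOM neuron grids as well by flattening to (k, n_features).
    centroids = np.asarray(centroids).reshape(-1, X.shape[1])
    labels = np.asarray(labels).ravel()
    mean = X.mean(axis=0)
    n, k = X.shape[0], centroids.shape[0]
    between = sum(np.sum(labels == j) * np.sum((centroids[j] - mean) ** 2)
                  for j in range(k))
    within = sum(np.sum((X[labels == j] - centroids[j]) ** 2)
                 for j in range(k))
    return (between / (k - 1)) / (within / (n - k))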
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import division

import logging
import pickle

import numpy as np
from scikits.learn.cluster import KMeans

logging.basicConfig(level=logging.DEBUG)

LEARN_SIZE = 100
K = 8
ITER = 10

# Each pickle maps file names to descriptor arrays; stack the first
# LEARN_SIZE arrays of each class into one training matrix. (Renamed the
# loop variables to stop shadowing the builtins `file` and `all`, and open
# the pickles in binary mode.)
moto, plane = [pickle.load(open(name, 'rb')) for name in ['moto', 'plane']]
logging.info('Data loaded')
m = np.vstack([v for f, v in moto.items()[:LEARN_SIZE]])
p = np.vstack([v for f, v in plane.items()[:LEARN_SIZE]])
data = np.vstack([m, p])

km = KMeans(k=K, max_iter=ITER)
km.fit(data)

filename = 'centroids_%d_%d_%d' % (LEARN_SIZE, K, ITER)
pickle.dump(km.cluster_centers_, open(filename, 'wb'))
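# A sketch of how the saved vocabulary might be used later: reload the
# centroids and quantize a descriptor matrix into a bag-of-visual-words
# histogram. The file name reuses `filename` from the dump above; the rest
# is an illustrative assumption, not part of the original script.
centroids = pickle.load(open(filename, 'rb'))

def bag_of_words(descriptors, centroids):
    # Squared distance from every descriptor to every centroid, then count
    # how many descriptors fall into each vocabulary cell.
    d2 = ((descriptors[:, np.newaxis, :] -
           centroids[np.newaxis, :, :]) ** 2).sum(axis=2)
    words = d2.argmin(axis=1)
    hist = np.zeros(len(centroids), dtype=int)
    for w in words:
        hist[w] += 1
    return hist

print bag_of_words(m[:50], centroids)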