def test_kmeans_expr(self):
    """Fit KMeans on forced random points created with an explicit tile hint.

    The tile hint is ``(divup(N_PTS, num_workers), N_DIM)``, where
    ``num_workers`` comes from the current blob context. The test makes no
    assertions; it only checks that ``fit`` completes without raising.
    """
    blob_ctx = spartan.blob_ctx.get()
    # presumably divup is a ceiling division, giving one row band per worker
    # -- TODO confirm against its definition.
    rows_per_tile = divup(N_PTS, blob_ctx.num_workers)
    points = expr.rand(N_PTS, N_DIM, tile_hint=(rows_per_tile, N_DIM)).force()

    KMeans(N_CENTERS, ITER).fit(points)
def benchmark_kmeans(ctx, timer): print "#worker:", ctx.num_workers N_PTS = 1000 * 256 N_CENTERS = 10 N_DIM = 512 ITER = 1 pts = expr.rand(N_PTS, N_DIM) k = KMeans(N_CENTERS, ITER) t1 = datetime.now() k.fit(pts) t2 = datetime.now() cost_time = millis(t1, t2) print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time, cost_time/ITER)
def benchmark_kmeans(ctx, timer): print "#worker:", ctx.num_workers N_PTS = 1000 * 256 N_CENTERS = 10 N_DIM = 512 ITER = 1 pts = expr.rand(N_PTS, N_DIM) k = KMeans(N_CENTERS, ITER) t1 = datetime.now() k.fit(pts) t2 = datetime.now() cost_time = millis(t1, t2) print "total cost time:%s ms, per iter cost time:%s ms" % ( cost_time, cost_time / ITER)
def test_kmeans(self):
    """Time point generation plus a KMeans fit and pass the cost to ``_verify_cost``.

    The problem size scales with the worker count: 1000 * 1000 points per
    worker, 10 dimensions, 10 centers and 5 iterations. The timed region
    covers both creating (and forcing) the points and the ``fit`` call.
    """
    # presumably skips this test on Travis CI -- confirm against the helper.
    _skip_if_travis()

    total_points = 1000 * 1000 * self.ctx.num_workers
    iterations, dims, centers = 5, 10, 10

    began = time.time()
    data = expr.rand(total_points, dims).force()
    KMeans(centers, iterations).fit(data)
    elapsed = time.time() - began

    self._verify_cost("kmeans", elapsed)
def test_kmeans(self):
    """Time point generation plus a KMeans fit and pass the cost to ``_verify_cost``.

    The problem size scales with the worker count: 1000 * 1000 points per
    worker, 10 dimensions, 10 centers and 5 iterations. The timed region
    covers both creating (and evaluating) the points and the ``fit`` call.
    """
    # presumably skips this test on Travis CI -- confirm against the helper.
    _skip_if_travis()

    total_points = 1000 * 1000 * self.ctx.num_workers
    iterations, dims, centers = 5, 10, 10

    began = time.time()
    data = expr.rand(total_points, dims).evaluate()
    KMeans(centers, iterations).fit(data)
    elapsed = time.time() - began

    self._verify_cost("kmeans", elapsed)
def spectral_cluster(points, k=10, num_iter=10, similarity_measurement='rbf'):
    '''
    Cluster data points with spectral clustering followed by kmeans.

    Args:
      points(Expr or DistArray): the data points to be clustered.
      k(int): the number of clusters to generate.
      num_iter(int): the max number of iterations the kmeans step runs.
      similarity_measurement(str): distance method used to measure the
        similarity between two points.

    Returns:
      The labels produced by ``KMeans.fit`` (the second element of its result).
    '''
    n_points = points.shape[0]

    # Adjacency matrix: pairwise similarity between every two points.
    adjacency = expr.shuffle(points,
                             _row_similarity_mapper,
                             kw=dict(similarity_measurement=similarity_measurement),
                             shape_hint=(n_points, n_points))

    # Diagonal (degree) matrix D, built as the row sums of the adjacency matrix.
    degree = expr.sum(adjacency, axis=1, tile_hint=(adjacency.shape[0],))

    # Normalized Laplacian of the form L = D^(-0.5) A D^(-0.5).
    laplacian = expr.shuffle(adjacency,
                             _laplacian_mapper,
                             kw=dict(D=degree),
                             shape_hint=adjacency.shape)

    # Eigen-decomposition with the Lanczos solver, requesting up to 2k
    # vectors (capped at the matrix width) and keeping the first k columns.
    num_dims = adjacency.shape[1]
    overshoot = min(k * 2, num_dims)
    _eigvals, eigvecs = lanczos.solve(laplacian, laplacian, overshoot, True)
    eigvecs = eigvecs[:, 0:k]

    # Initial centers: for each column, the row holding that column's maximum.
    seed_rows = np.argmax(eigvecs, axis=0)
    init_clusters = eigvecs[seed_rows]

    # Run kmeans on the embedded points, starting from those centers.
    model = KMeans(k, num_iter)
    embedded = expr.from_numpy(eigvecs)
    _centers, labels = model.fit(embedded, init_clusters)
    return labels
def spectral_cluster(points, k=10, num_iter=10, similarity_measurement='rbf'):
    '''
    Cluster data points with spectral clustering followed by kmeans.

    Args:
      points(Expr or DistArray): the data points to be clustered.
      k(int): the number of clusters to generate.
      num_iter(int): the max number of iterations the kmeans step runs.
      similarity_measurement(str): distance method used to measure the
        similarity between two points.

    Returns:
      The labels produced by ``KMeans.fit`` (the second element of its result).
    '''
    # Adjacency matrix: pairwise similarity between every two points.
    adjacency = expr.shuffle(points,
                             _row_similarity_mapper,
                             kw=dict(similarity_measurement=similarity_measurement))

    # Diagonal (degree) matrix D, built as the row sums of the adjacency matrix.
    degree = expr.sum(adjacency, axis=1, tile_hint=(adjacency.shape[0],))

    # Normalized Laplacian of the form L = D^(-0.5) A D^(-0.5).
    laplacian = expr.shuffle(adjacency, _laplacian_mapper, kw=dict(D=degree))

    # Eigen-decomposition with the Lanczos solver, requesting up to 2k
    # vectors (capped at the matrix width) and keeping the first k columns.
    num_dims = adjacency.shape[1]
    overshoot = min(k * 2, num_dims)
    _eigvals, eigvecs = lanczos.solve(laplacian, laplacian, overshoot, True)
    eigvecs = eigvecs[:, 0:k]

    # Initial centers: for each column, the row holding that column's maximum.
    seed_rows = np.argmax(eigvecs, axis=0)
    init_clusters = eigvecs[seed_rows]

    # Run kmeans on the embedded points, starting from those centers.
    model = KMeans(k, num_iter)
    embedded = expr.from_numpy(eigvecs)
    _centers, labels = model.fit(embedded, init_clusters)
    return labels
def test_kmeans_expr(self):
    """Fit KMeans on random points with ``FLAGS.opt_parakeet_gen`` set to 0.

    The points are passed to ``fit`` without being forced here. The test makes
    no assertions; it only checks that ``fit`` completes without raising.
    """
    # NOTE(review): presumably turns off parakeet code generation -- confirm.
    FLAGS.opt_parakeet_gen = 0

    points = expr.rand(N_PTS, N_DIM)
    KMeans(N_CENTERS, ITER).fit(points)