def benchmark_svm(ctx, timer):
  print "#worker:", ctx.num_workers
  max_iter = 2
  #N = 200000 * ctx.num_workers
  N = 1000 * 64
  D = 64

  # create data
  data = expr.randn(N, D, dtype=np.float64,
                    tile_hint=(N, util.divup(D, ctx.num_workers)))
  labels = expr.shuffle(data, _init_label_mapper, shape_hint=(data.shape[0], 1))

  t1 = datetime.now()
  w = fit(data, labels, T=max_iter).force()
  t2 = datetime.now()
  util.log_warn('train time per iteration:%s ms, final w:%s',
                millis(t1, t2) / max_iter, w.glom().T)

  correct = 0
  for i in range(10):
    new_data = expr.randn(1, D, dtype=np.float64, tile_hint=[1, D])
    new_label = predict(w, new_data)
    #print 'point %s, predict %s' % (new_data.glom(), new_label)
    new_data = new_data.glom()
    if (new_data[0, 0] >= new_data[0, 1] and new_label == 1.0) or \
       (new_data[0, 0] < new_data[0, 1] and new_label == -1.0):
      correct += 1
  print 'predict precision:', correct * 1.0 / 10
def benchmark_cg(ctx, timer):
  print "#worker:", ctx.num_workers
  l = int(math.sqrt(ctx.num_workers))
  n = 2000 * 16
  #n = 4000 * l
  la = 20
  niter = 5
  tile_hint = (n, n / ctx.num_workers)
  #nonzer = 7
  #nz = n * (nonzer + 1) * (nonzer + 1) + n * (nonzer + 2)
  #density = 0.5 * nz / (n * n)

  A = expr.rand(n, n, tile_hint=tile_hint)
  A = (A + expr.transpose(A)) * 0.5
  I = expr.sparse_diagonal((n, n), tile_hint=tile_hint) * la
  I.force()
  A = expr.eager(A - I)

  #x1 = numpy_cg(A.glom(), niter)
  util.log_warn('begin cg!')
  t1 = datetime.now()
  x2 = conj_gradient(A, niter).force()
  t2 = datetime.now()

  cost_time = millis(t1, t2)
  print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time, cost_time / niter)
def benchmark_cg(ctx, timer):
  print "#worker:", ctx.num_workers
  l = int(math.sqrt(ctx.num_workers))
  #n = 2000 * 16
  n = 500 * ctx.num_workers
  la = 20
  niter = 5
  #nonzer = 7
  #nz = n * (nonzer + 1) * (nonzer + 1) + n * (nonzer + 2)
  #density = 0.5 * nz / (n * n)

  A = expr.rand(n, n)
  A = (A + expr.transpose(A)) * 0.5
  I = expr.sparse_diagonal((n, n)) * la
  A = A - I

  #x1 = numpy_cg(A.glom(), niter)
  util.log_warn('begin cg!')
  t1 = datetime.now()
  x2 = conj_gradient(A, niter).force()
  t2 = datetime.now()

  cost_time = millis(t1, t2)
  print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time, cost_time / niter)
def benchmark_naive_bayes(ctx, timer):
  print "#worker:", ctx.num_workers
  #N = 100000 * ctx.num_workers
  N = 10000 * 64
  D = 128

  # create data
  data = expr.randint(N, D, low=0, high=D, tile_hint=(N, D / ctx.num_workers))
  labels = expr.shuffle(expr.ndarray((data.shape[0], 1), dtype=np.int),
                        _init_label_mapper,
                        kw={'data': data},
                        shape_hint=(data.shape[0], 1),
                        cost_hint={hash(data): {'00': 0, '10': np.prod(data.shape)}})

  #util.log_warn('data:%s, label:%s', data.glom(), labels.glom())
  util.log_warn('begin train')
  t1 = datetime.now()
  model = fit(data, labels, D)
  t2 = datetime.now()
  util.log_warn('train time:%s ms', millis(t1, t2))

  correct = 0
  for i in range(10):
    new_data = expr.randint(1, D, low=0, high=D, tile_hint=(1, D))
    new_label = predict(model, new_data)
    #print 'point %s, predict %s' % (new_data.glom(), new_label)
    new_data = new_data.glom()
    if np.isclose(new_data[0, new_label], np.max(new_data)):
      correct += 1
  print 'predict precision:', correct * 1.0 / 10
def benchmark_cholesky(ctx, timer):
  print "#worker:", ctx.num_workers

  #n = int(math.pow(ctx.num_workers, 1.0 / 3.0))
  n = int(math.sqrt(ctx.num_workers))
  #ARRAY_SIZE = 1600 * 4
  ARRAY_SIZE = 1600 * n

  util.log_warn('prepare data!')
  #A = np.random.randn(ARRAY_SIZE, ARRAY_SIZE)
  #A = np.dot(A, A.T)
  #A = expr.force(from_numpy(A, tile_hint=(ARRAY_SIZE/n, ARRAY_SIZE/n)))
  #A = expr.randn(ARRAY_SIZE, ARRAY_SIZE, tile_hint=(ARRAY_SIZE/n, ARRAY_SIZE/n))
  A = expr.randn(ARRAY_SIZE, ARRAY_SIZE)

  # FIXME: Ideally we should be able to get rid of tile_hint.
  # However, the current extent.change_partition_axis relies on the
  # one-dimensional size to change the tiling to grid tiling, and it
  # assumes that every extent is partitioned with the same size.
  # Trace extent.pyx to think about how to fix it!
  A = expr.dot(A, expr.transpose(A),
               tile_hint=(ARRAY_SIZE, ARRAY_SIZE / ctx.num_workers)).force()

  util.log_warn('begin cholesky!')
  t1 = datetime.now()
  L = cholesky(A).glom()
  t2 = datetime.now()

  assert np.all(np.isclose(A.glom(), np.dot(L, L.T.conj())))
  cost_time = millis(t1, t2)
  print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time, cost_time / n)
def benchmark_lda(ctx, timer):
  print "#worker:", ctx.num_workers
  NUM_TERMS = 160
  NUM_DOCS = 200 * ctx.num_workers
  #NUM_DOCS = 10 * 64

  # create data
  #NUM_TERMS = 41807
  #NUM_DOCS = 21578
  #terms_docs_matrix = from_file("/scratch/cq/numpy_dense_matrix", sparse=False,
  #                              tile_hint=(NUM_TERMS, int((NUM_DOCS + ctx.num_workers - 1) / ctx.num_workers))).evaluate()
  terms_docs_matrix = expr.randint(NUM_TERMS, NUM_DOCS, low=0, high=100)

  max_iter = 3
  k_topics = 16

  t1 = datetime.now()
  doc_topics, topic_term_count = learn_topics(terms_docs_matrix, k_topics, max_iter=max_iter)
  doc_topics.optimized().evaluate()
  topic_term_count.optimized().evaluate()
  t2 = datetime.now()

  time_cost = millis(t1, t2)
  util.log_warn('total_time:%s ms, train time per iteration:%s ms',
                time_cost, time_cost / max_iter)
def benchmark_naive_bayes(ctx, timer):
  print "#worker:", ctx.num_workers
  N = 100000 * ctx.num_workers
  D = 128

  # create data
  data = expr.randint(N, D, low=0, high=D, tile_hint=(N / ctx.num_workers, D))
  labels = expr.eager(expr.shuffle(data, _init_label_mapper))

  #util.log_warn('data:%s, label:%s', data.glom(), labels.glom())
  util.log_warn('begin train')
  t1 = datetime.now()
  model = fit(data, labels, D)
  t2 = datetime.now()
  util.log_warn('train time:%s ms', millis(t1, t2))

  correct = 0
  for i in range(10):
    new_data = expr.randint(1, D, low=0, high=D, tile_hint=(1, D))
    new_label = predict(model, new_data)
    #print 'point %s, predict %s' % (new_data.glom(), new_label)
    new_data = new_data.glom()
    if np.isclose(new_data[0, new_label], np.max(new_data)):
      correct += 1
  print 'predict precision:', correct * 1.0 / 10
def numpy_cg(A, num_iter):
  x = np.ones((A.shape[1], 1))
  for iter in range(num_iter):
    util.log_warn('iteration:%d', iter)
    z = numpy_cgit(A, x)
    x = z / np.linalg.norm(z, 2)
  return x
def sparse_multiply(wts, p, p_tile_hint):
  avg_time = 0.0
  for i in range(num_iter):
    util.log_warn('iteration %d begin!', i)
    t1 = datetime.now()
    p = expr.dot(wts, p, tile_hint=p_tile_hint).force()
    t2 = datetime.now()
    time_cost = millis(t1, t2)
    print "iteration %d sparse * dense: %s ms" % (i, time_cost)
    avg_time += time_cost
  return avg_time / num_iter
def benchmark_lreg(ctx, timer):
  print "#worker:", ctx.num_workers
  FLAGS.opt_parakeet_gen = 0
  N_EXAMPLES = 4000000 * ctx.num_workers
  #N_EXAMPLES = 5000000 * 64
  x = expr.rand(N_EXAMPLES, N_DIM)
  y = expr.rand(N_EXAMPLES, 1)

  start = time.time()
  linear_regression.linear_regression(x, y, ITERATION)
  total = time.time() - start
  util.log_warn("time cost : %s s" % (total * 1.0 / ITERATION,))
def benchmark_ridgereg(ctx, timer):
  print "#worker:", ctx.num_workers
  #N_EXAMPLES = 100000000 * ctx.num_workers
  N_EXAMPLES = 90000000 * ctx.num_workers
  x = expr.rand(N_EXAMPLES, N_DIM)
  y = expr.rand(N_EXAMPLES, 1)

  start = time.time()
  ridge_regression.ridge_regression(x, y, 1, ITERATION)
  total = time.time() - start
  util.log_warn("time cost : %s s" % (total * 1.0 / ITERATION,))
def benchmark_logreg(ctx, timer):
  print "#worker:", ctx.num_workers
  #N_EXAMPLES = 40000000 * ctx.num_workers
  N_EXAMPLES = 5000000 * 64
  x = expr.eager(expr.rand(N_EXAMPLES, N_DIM,
                           tile_hint=(N_EXAMPLES / ctx.num_workers, N_DIM)))
  y = expr.eager(expr.rand(N_EXAMPLES, 1,
                           tile_hint=(N_EXAMPLES / ctx.num_workers, 1)))

  start = time.time()
  logistic_regression.logistic_regression(x, y, ITERATION)
  total = time.time() - start
  util.log_warn("time cost : %s s" % (total * 1.0 / ITERATION,))
def _cholesky_dsyrk_dgemm_mapper(extents, tiles):
  util.log_warn("dgemm %s" % str(extents))
  input = tiles[0]
  ex = extents[0]
  A_mk = tiles[1].T
  if ex.ul[0] == ex.ul[1] and ex.lr[0] == ex.lr[1]:
    # diag block
    return ex, linalg.blas.dsyrk(-1.0, A_mk, 1.0, input, lower=1)
  else:
    # other block
    A_lk = tiles[2]
    return ex, linalg.blas.dgemm(-1.0, A_lk, A_mk.T, 1.0, input)
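# Illustrative sketch (hypothetical helper, not used by the benchmarks): the two
# BLAS calls in _cholesky_dsyrk_dgemm_mapper are the trailing-submatrix updates
# of a blocked Cholesky factorization. In plain NumPy they reduce to the
# expressions below; the shapes here are arbitrary examples chosen only for
# illustration.
def _numpy_reference_trailing_update():
  import numpy as np
  from scipy.linalg import blas

  rng = np.random.RandomState(0)
  A_mk = rng.randn(4, 2)    # panel block for the current column k
  A_lk = rng.randn(4, 2)    # a second row block of the same panel
  C_diag = np.eye(4)        # diagonal tile being updated
  C_off = np.ones((4, 4))   # off-diagonal tile being updated

  # DSYRK: C_diag - A_mk.dot(A_mk.T); only the lower triangle is referenced.
  d1 = blas.dsyrk(-1.0, A_mk, 1.0, C_diag, lower=1)
  assert np.allclose(np.tril(d1), np.tril(C_diag - A_mk.dot(A_mk.T)))

  # DGEMM: C_off - A_lk.dot(A_mk.T)
  d2 = blas.dgemm(-1.0, A_lk, A_mk.T, 1.0, C_off)
  assert np.allclose(d2, C_off - A_lk.dot(A_mk.T))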
def benchmark_als(ctx, timer):
  print "#worker:", ctx.num_workers
  #USER_SIZE = 400 * ctx.num_workers
  USER_SIZE = 200 * 64
  MOVIE_SIZE = 12800
  num_features = 20
  num_iter = 5

  A = expr.eager(expr.randint(USER_SIZE, MOVIE_SIZE, low=0, high=5,
                              tile_hint=(USER_SIZE / ctx.num_workers, MOVIE_SIZE)))

  util.log_warn('begin als!')
  t1 = datetime.now()
  U, M = als(A, implicit_feedback=True, num_features=num_features, num_iter=num_iter)
  U.force()
  M.force()
  t2 = datetime.now()

  cost_time = millis(t1, t2)
  print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time, cost_time / num_iter)
def numpy_cgit(A, x):
  z = np.zeros(x.shape)
  r = x
  rho = np.dot(r.T, r)
  util.log_warn('rho:%s', rho)
  p = r

  for i in xrange(15):
    q = np.dot(A, p)
    alpha = rho / np.dot(p.T, q)
    #util.log_warn('alpha:%s', alpha)
    z = z + p * alpha
    rho0 = rho
    r = r - q * alpha
    rho = np.dot(r.T, r)
    beta = rho / rho0
    #util.log_warn('beta:%s', beta)
    p = r + p * beta
  return z
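# Illustrative sketch (hypothetical helper, not invoked by the benchmarks): the
# commented-out line `x1 = numpy_cg(A.glom(), niter)` in benchmark_cg suggests
# using the local NumPy implementation above as a reference for the distributed
# conj_gradient result. A minimal cross-check might look like the following,
# assuming conj_gradient is importable here and both implementations normalize
# their result vectors the same way.
def _check_cg_against_numpy(A_expr, niter):
  A_local = A_expr.glom()                       # pull the distributed matrix locally
  x_ref = numpy_cg(A_local, niter)              # NumPy reference solution
  x_dist = conj_gradient(A_expr, niter).glom()  # distributed solution
  # Both are unit vectors, so compare up to sign.
  return min(np.linalg.norm(x_ref - x_dist), np.linalg.norm(x_ref + x_dist))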
def benchmark_als(ctx, timer):
  print "#worker:", ctx.num_workers
  #USER_SIZE = 100 * ctx.num_workers
  USER_SIZE = 320
  #USER_SIZE = 200 * 64
  MOVIE_SIZE = 12800
  num_features = 20
  num_iter = 2

  A = expr.randint(USER_SIZE, MOVIE_SIZE, low=0, high=5,
                   tile_hint=(USER_SIZE, util.divup(MOVIE_SIZE, ctx.num_workers)))
  #A = expr.randint(USER_SIZE, MOVIE_SIZE, low=0, high=5)

  util.log_warn('begin als!')
  t1 = datetime.now()
  U, M = als(A, implicit_feedback=True, num_features=num_features, num_iter=num_iter)
  U.force()
  M.force()
  t2 = datetime.now()

  cost_time = millis(t1, t2)
  print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time, cost_time / num_iter)
def benchmark_cholesky(ctx, timer):
  print "#worker:", ctx.num_workers

  #n = int(math.pow(ctx.num_workers, 1.0 / 3.0))
  n = int(math.sqrt(ctx.num_workers))
  #ARRAY_SIZE = 1600 * 4
  ARRAY_SIZE = 900 * n

  util.log_warn('prepare data!')
  #A = np.random.randn(ARRAY_SIZE, ARRAY_SIZE)
  #A = np.dot(A, A.T)
  A = expr.randn(ARRAY_SIZE, ARRAY_SIZE)
  A = expr.dot(A, expr.transpose(A))

  util.log_warn('begin cholesky!')
  t1 = datetime.now()
  L = cholesky(A).optimized().glom()
  t2 = datetime.now()

  #assert np.all(np.isclose(A.glom(), np.dot(L, L.T.conj())))
  cost_time = millis(t1, t2)
  print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time, cost_time / n)
def benchmark_cholesky(ctx, timer):
  print "#worker:", ctx.num_workers

  #n = int(math.pow(ctx.num_workers, 1.0 / 3.0))
  n = int(math.sqrt(ctx.num_workers))
  ARRAY_SIZE = 1600 * 4
  #ARRAY_SIZE = 1600 * n

  util.log_warn("prepare data!")
  #A = np.random.randn(ARRAY_SIZE, ARRAY_SIZE)
  #A = np.dot(A, A.T)
  #A = expr.force(from_numpy(A, tile_hint=(ARRAY_SIZE/n, ARRAY_SIZE/n)))
  A = expr.randn(ARRAY_SIZE, ARRAY_SIZE, tile_hint=(ARRAY_SIZE / n, ARRAY_SIZE / n))
  A = expr.dot(A, expr.transpose(A)).force()

  util.log_warn("begin cholesky!")
  t1 = datetime.now()
  L = cholesky(A).glom()
  t2 = datetime.now()

  assert np.all(np.isclose(A.glom(), np.dot(L, L.T.conj())))
  cost_time = millis(t1, t2)
  print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time, cost_time / n)
def benchmark_lda(ctx, timer):
  print "#worker:", ctx.num_workers
  NUM_TERMS = 160
  NUM_DOCS = 200 * ctx.num_workers
  #NUM_DOCS = 10 * 64

  # create data
  #NUM_TERMS = 41807
  #NUM_DOCS = 21578
  #terms_docs_matrix = from_file("/scratch/cq/numpy_dense_matrix", sparse=False,
  #                              tile_hint=(NUM_TERMS, int((NUM_DOCS + ctx.num_workers - 1) / ctx.num_workers))).force()
  terms_docs_matrix = expr.randint(NUM_TERMS, NUM_DOCS, low=0, high=100)

  max_iter = 3
  k_topics = 16

  t1 = datetime.now()
  doc_topics, topic_term_count = learn_topics(terms_docs_matrix, k_topics, max_iter=max_iter)
  doc_topics.optimized().force()
  topic_term_count.optimized().force()
  t2 = datetime.now()

  time_cost = millis(t1, t2)
  util.log_warn('total_time:%s ms, train time per iteration:%s ms',
                time_cost, time_cost / max_iter)
def benchmark_jacobi(ctx, timer):
  global base, ITERATION
  util.log_warn('util.log_warn: %s', ctx.num_workers)

  A, b = jacobi.jacobi_init(base * ctx.num_workers)
  A, b = A.evaluate(), b.evaluate()

  start = time.time()
  result = jacobi.jacobi_method(A, b, ITERATION).glom()
  cost = time.time() - start

  util.log_info('\nresult =\n%s', result)
  util.log_warn('time cost: %s s', cost)
  util.log_warn('cost per iteration: %s s\n', cost / ITERATION)
def fit(self, X, centers=None, implementation='outer'):
  """Compute k-means clustering.

  Parameters
  ----------
  X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
  centers : numpy.ndarray. The initial centers. If None, they will be randomly generated.
  """
  num_dim = X.shape[1]
  num_points = X.shape[0]

  labels = expr.zeros((num_points, 1), dtype=np.int)

  if implementation == 'map2':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper,
                         fn_kw={"centers": centers},
                         shape=(X.shape[0], ))

      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0], ))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero vector. We reseed
        # these centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # Avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
    return centers, labels

  elif implementation == 'outer':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.outer((X, centers), (0, None), fn=kmeans_outer_dist_mapper,
                          shape=(X.shape[0],))
      #labels = expr.argmin(distances, axis=1)
      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0], ))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero vector. We reseed
        # these centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # Avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

  elif implementation == 'broadcast':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      util.log_warn("k_means_ %d %d", i, time.time())
      X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
      centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
      distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
      labels = expr.argmin(distances, axis=1)

      center_idx = expr.arange((1, centers.shape[0]))
      matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
      matches = matches.astype(np.int64)
      counts = expr.sum(matches, axis=0)
      centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0],
                                                              matches.shape[1], 1)),
                         axis=0)

      counts = counts.optimized().glom()
      centers = centers.optimized().glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero vector. We reseed
        # these centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # Avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

  elif implementation == 'shuffle':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      # Reset the accumulators to zero.
      new_centers = expr.ndarray((self.n_clusters, num_dim),
                                 reduce_fn=lambda a, b: a + b)
      new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int,
                                reduce_fn=lambda a, b: a + b)
      _ = expr.shuffle(X, _find_cluster_mapper,
                       kw={'d_pts': X,
                           'old_centers': centers,
                           'new_centers': new_centers,
                           'new_counts': new_counts,
                           'labels': labels},
                       shape_hint=(1,),
                       cost_hint={hash(labels): {'00': 0, '01': np.prod(labels.shape)}})
      _.force()

      new_counts = new_counts.glom()
      new_centers = new_centers.glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (new_counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero vector. We reseed
        # these centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # Avoid dividing by zero.
        new_counts[zcount_indices] = 1
        new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      new_centers = new_centers / new_counts
      centers = new_centers
    return centers, labels
def sparse_multiply(wts, p, p_tile_hint):
  for i in range(num_iter):
    util.log_warn('iteration %d begin!', i)
    p = expr.dot(wts, p).optimized()
    p.evaluate()
  return
def fit(self, X, centers=None, implementation='map2'):
  """Compute k-means clustering.

  Parameters
  ----------
  X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
  centers : numpy.ndarray. The initial centers. If None, they will be randomly generated.
  """
  num_dim = X.shape[1]
  num_points = X.shape[0]

  labels = expr.zeros((num_points, 1), dtype=np.int)

  if implementation == 'map2':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper,
                         fn_kw={"centers": centers},
                         shape=(X.shape[0], ))

      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0], ))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero vector. We reseed
        # these centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # Avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
    return centers, labels

  elif implementation == 'outer':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.outer((X, centers), (0, None), fn=kmeans_outer_dist_mapper,
                          shape=(X.shape[0], ))
      #labels = expr.argmin(distances, axis=1)
      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0], ))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero vector. We reseed
        # these centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # Avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

  elif implementation == 'broadcast':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      util.log_warn("k_means_ %d %d", i, time.time())
      X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
      centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
      distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
      labels = expr.argmin(distances, axis=1)

      center_idx = expr.arange((1, centers.shape[0]))
      matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
      matches = matches.astype(np.int64)
      counts = expr.sum(matches, axis=0)
      centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0],
                                                              matches.shape[1], 1)),
                         axis=0)

      counts = counts.optimized().glom()
      centers = centers.optimized().glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero vector. We reseed
        # these centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # Avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

  elif implementation == 'shuffle':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      # Reset the accumulators to zero.
      new_centers = expr.ndarray((self.n_clusters, num_dim),
                                 reduce_fn=lambda a, b: a + b)
      new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int,
                                reduce_fn=lambda a, b: a + b)
      _ = expr.shuffle(X, _find_cluster_mapper,
                       kw={'d_pts': X,
                           'old_centers': centers,
                           'new_centers': new_centers,
                           'new_counts': new_counts,
                           'labels': labels},
                       shape_hint=(1, ),
                       cost_hint={hash(labels): {'00': 0, '01': np.prod(labels.shape)}})
      _.force()

      new_counts = new_counts.glom()
      new_centers = new_centers.glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (new_counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero vector. We reseed
        # these centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # Avoid dividing by zero.
        new_counts[zcount_indices] = 1
        new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      new_centers = new_centers / new_counts
      centers = new_centers
    return centers, labels
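# Illustrative sketch (hypothetical usage, not part of this module): driving the
# fit() method above from a benchmark. The estimator class name KMeans and its
# constructor arguments are assumptions; only fit()'s signature comes from the
# code above.
def _kmeans_usage_example(ctx):
  N_PTS = 1000 * ctx.num_workers
  N_DIM = 10
  pts = expr.rand(N_PTS, N_DIM)            # row-tiled sample matrix
  model = KMeans(n_clusters=10, n_iter=5)  # hypothetical estimator class
  centers, labels = model.fit(pts, implementation='map2')
  # For the 'map2' path, centers comes back as a NumPy array while labels is a
  # lazy Spartan expression; materialize labels before inspecting it.
  util.log_warn('centers: %s, labels: %s', centers.shape, labels.glom().shape)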