def test_reshape_dot(self):
    """Check that dot products over reshaped operands agree with numpy."""
    tolerance = 10e-9

    def check(left_shape, right_shape, left_new=None, right_new=None):
        # Draw the operands in the same order for every case: left, then right.
        left = np.random.random(left_shape)
        right = np.random.random(right_shape)

        np_left = left if left_new is None else np.reshape(left, left_new)
        np_right = right if right_new is None else np.reshape(right, right_new)
        expected = np.dot(np_left, np_right)

        dist_left = expr.from_numpy(left)
        dist_right = expr.from_numpy(right)
        if left_new is not None:
            dist_left = expr.reshape(dist_left, left_new)
        if right_new is not None:
            dist_right = expr.reshape(dist_right, right_new)
        product = expr.dot(dist_left, dist_right)

        Assert.all_eq(expected, product.glom(), tolerance)

    # 2-D matrix reshaped to another 2-D shape on the left-hand side.
    check((357, 93), (31, 357), left_new=(1071, 31))
    # 1-D vector reshaped to a column on the right-hand side.
    check((357, 718), (718, ), right_new=(718, 1))
    # 1-D vector reshaped to a column on the left-hand side.
    check((718, ), (1, 357), left_new=(718, 1))
def test_reshape5(self):
    """Chain reshapes of a 35511-element range and compare with a row vector."""
    total = 35511
    current = expr.arange((total, ))
    for shape in ((133, 267), (267, 133), (1, total)):
        current = expr.reshape(current, shape)

    expected = expr.arange((1, total))
    Assert.all_eq(current.glom(), expected.glom())
def test_reshape3(self):
    """Flatten a 100x100 range, then view it as a column and as a row."""
    total = 10000
    current = expr.arange((100, 100))
    for shape in ((total, ), (total, 1), (1, total)):
        current = expr.reshape(current, shape)

    expected = expr.arange((1, total))
    Assert.all_eq(current.glom(), expected.glom())
def test_reshape6(self):
    """Chain reshapes of a 12319-element range and compare with a row vector."""
    total = 12319
    current = expr.arange((total, ))
    for shape in ((127, 97), (97, 127), (1, total)):
        current = expr.reshape(current, shape)

    expected = expr.arange((1, total))
    Assert.all_eq(current.glom(), expected.glom())
def test_reshape3(self):
    """Flatten a 100x100 range, then view it as a column and as a row."""
    total = 10000
    current = expr.arange((100, 100))
    for shape in ((total, ), (total, 1), (1, total)):
        current = expr.reshape(current, shape)

    expected = expr.arange((1, total))
    Assert.all_eq(current.glom(), expected.glom())
def test_reshape8(self):
    """Reshape sparse diagonal arrays to their swapped shape and compare with scipy."""
    shapes = ((137, 113), (113, 137))

    # Build both sources first, then both reshapes, before anything is evaluated.
    diagonals = [expr.sparse_diagonal(shape) for shape in shapes]
    reshaped = [expr.reshape(diag, shape[::-1])
                for diag, shape in zip(diagonals, shapes)]

    for result, shape in zip(reshaped, shapes):
        swapped = shape[::-1]
        Assert.all_eq(result.glom().todense(),
                      sp.eye(shape[0], shape[1]).tolil().reshape(swapped).todense())
def kneighbors(self, X, n_neighbors=None):
    """Find the nearest neighbors of each query point.

    Parameters
    ----------
    X : array-like, last dimension same as that of fit data
        The query points. A ``numpy.ndarray`` is wrapped with
        ``expr.from_numpy`` before use.
    n_neighbors : int, optional
        Number of neighbors to get. When given it also overwrites
        ``self.n_neighbors``; when omitted the stored ``self.n_neighbors``
        is used.

    Returns
    -------
    dist : array
        Distances from each query point to its selected neighbors.
    ind : array
        Indices of the nearest points in the population matrix.
    """
    if n_neighbors is not None:
        self.n_neighbors = n_neighbors
    # Always slice with the stored value: the raw argument defaults to None,
    # and ``[:, :None]`` would keep every column instead of the configured count.
    k = self.n_neighbors

    if isinstance(X, np.ndarray):
        X = expr.from_numpy(X)

    if self.algorithm in ('auto', 'brute'):
        # Broadcast the queries against the fitted points to obtain every
        # pairwise squared distance, then keep the k smallest per query.
        X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
        fit_X_broadcast = expr.reshape(
            self.X, (1, self.X.shape[0], self.X.shape[1]))
        distances = expr.sum((X_broadcast - fit_X_broadcast) ** 2, axis=2)

        neigh_ind = expr.argsort(distances, axis=1)
        neigh_ind = neigh_ind[:, :k].optimized().glom()
        neigh_dist = expr.sort(distances, axis=1)
        neigh_dist = expr.sqrt(neigh_dist[:, :k]).optimized().glom()
        return neigh_dist, neigh_ind

    results = self.X.foreach_tile(mapper_fn=_knn_mapper,
                                  kw={'X': self.X,
                                      'Q': X,
                                      'n_neighbors': k,
                                      'algorithm': self.algorithm})

    # Gather the KNN candidates reported for each tile of X side by side,
    # then pick the real k nearest among all candidates.
    dist = None
    ind = None
    for _, v in results.iteritems():
        if dist is None:
            dist = v[0]
            ind = v[1]
        else:
            dist = np.concatenate((dist, v[0]), axis=1)
            ind = np.concatenate((ind, v[1]), axis=1)

    mask = np.argsort(dist, axis=1)[:, :k]
    new_dist = np.array([row[sel] for row, sel in zip(dist, mask)])
    new_ind = np.array([row[sel] for row, sel in zip(ind, mask)])
    return new_dist, new_ind
def test_reshape4(self):
    """Push a 10000-element range through several 2-D shapes and back to a row."""
    total = 10000
    current = expr.arange((total, ))
    for shape in ((10, 1000), (1000, 10), (20, 500), (500, 20), (1, total)):
        current = expr.reshape(current, shape)

    expected = expr.arange((1, total))
    Assert.all_eq(current.glom(), expected.glom())
def fit(self, X, centers=None):
    """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.

    Returns
    -------
    centers : the centers after the last iteration (wrapped with
        ``expr.from_numpy`` once at least one iteration has run).
    labels : the assignment expression of the last iteration; an all-zero
        (n_samples, 1) array when ``self.n_iter`` is 0.
    """
    num_dim = X.shape[1]
    num_points = X.shape[0]

    # ``np.int`` was only an alias of the builtin ``int`` and is gone from
    # recent NumPy releases; the builtin yields the same dtype everywhere.
    labels = expr.zeros((num_points, 1), dtype=int)

    if centers is None:
        centers = expr.from_numpy(np.random.rand(self.n_clusters, num_dim))

    for _ in range(self.n_iter):
        # Squared distance between every point and every center via broadcasting.
        X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
        centers_broadcast = expr.reshape(
            centers, (1, centers.shape[0], centers.shape[1]))
        distances = expr.sum(expr.square(X_broadcast - centers_broadcast),
                             axis=2)
        labels = expr.argmin(distances, axis=1)

        # One-hot membership matrix: matches[p, c] is 1 when point p is in cluster c.
        center_idx = expr.arange((1, centers.shape[0]))
        matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
        matches = matches.astype(np.int64)
        counts = expr.sum(matches, axis=0)
        centers = expr.sum(
            X_broadcast * expr.reshape(matches,
                                       (matches.shape[0], matches.shape[1], 1)),
            axis=0)

        counts = counts.optimized().glom()
        centers = centers.optimized().glom()

        # Centroids without any assigned point would stay at the zero vector.
        # Reseed them with new random values and set their count to 1 so the
        # division below never divides by zero.
        zcount_indices = (counts == 0).reshape(self.n_clusters)
        if np.any(zcount_indices):
            n_points = np.count_nonzero(zcount_indices)
            counts[zcount_indices] = 1
            centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
        centers = expr.from_numpy(centers)

    return centers, labels
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
    '''
    Cluster data points using the fuzzy kmeans clustering method.

    Args:
      points(Expr or DistArray): the input data points matrix.
      k(int): the number of clusters.
      num_iter(int): the max iterations to run.
      m(float): the parameter of fuzzy kmeans. Must not be 1, because the
        membership exponent is ``2.0 / (m - 1)``.
      centers(Expr or DistArray): the initialized centers of each cluster.
        Generated with ``expr.rand(k, num_dim)`` when None.

    Returns:
      The label expression of the last iteration (index of the cluster with
      the highest membership for each point); an all-zero array when
      ``num_iter`` is 0.
    '''
    points = expr.force(points)
    num_dim = points.shape[1]

    if centers is None:
        centers = expr.rand(k, num_dim)

    # ``np.int`` was only an alias of the builtin ``int`` and is gone from
    # recent NumPy releases.
    labels = expr.zeros((points.shape[0], ), dtype=int)

    for _ in range(num_iter):
        centers = expr.as_array(centers)
        points_broadcast = expr.reshape(
            points, (points.shape[0], 1, points.shape[1]))
        centers_broadcast = expr.reshape(
            centers, (1, centers.shape[0], centers.shape[1]))
        distances = expr.sum(expr.square(points_broadcast - centers_broadcast),
                             axis=2)
        # This is used to avoid dividing zero
        distances = distances + 0.00000000001
        util.log_info('distances shape %s' % str(distances.shape))

        # own[p, i, 0] is the distance of point p to cluster i;
        # others[p, 0, j] is its distance to every cluster j.
        own = expr.reshape(distances,
                           (distances.shape[0], distances.shape[1], 1))
        others = expr.reshape(distances,
                              (distances.shape[0], 1, distances.shape[1]))
        # Membership of p in i is 1 / sum_j (d_pi / d_pj) ** e, so closer
        # clusters get the larger membership. The previous ratio was the
        # other way round (d_pj / d_pi), which made membership grow with
        # distance and argmax pick the farthest center.
        # NOTE(review): distances are squared here while the exponent is the
        # one usually paired with plain distances -- confirm intended m.
        prob = 1.0 / expr.sum(expr.power(own / others, 2.0 / (m - 1)), axis=2)
        prob.force()

        counts = expr.sum(prob, axis=0)
        counts = expr.reshape(counts, (counts.shape[0], 1))
        labels = expr.argmax(prob, axis=1)
        centers = expr.sum(
            points_broadcast *
            expr.reshape(prob, (prob.shape[0], prob.shape[1], 1)),
            axis=0)

        # We assume that the size of centers are relative small that can be
        # handled on the master.
        counts = counts.glom()
        centers = centers.glom()

        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector. We reseed
        # these centroids with new random values and set their counts to 1
        # in order to get rid of dividing by zero.
        zcount_indices = (counts == 0).reshape(k)
        if np.any(zcount_indices):
            counts[zcount_indices, :] = 1
            centers[zcount_indices, :] = np.random.rand(
                np.count_nonzero(zcount_indices), num_dim)

        centers = centers / counts

    return labels
def update(self):
    """Return the ridge-regression gradient direction as a (1, N_DIM) row.

    The gradient of ||xw - y||^2 + lambda * ||w||^2 with respect to w is
    2xTxw - 2xTy + 2 * lambda * w. This method returns
    xTxw - xTy + lambda * w, i.e. the same direction without the constant
    factor 2 (the factor was never applied, so step sizes stay unchanged).

    The xTy term must be subtracted: xTxw and xTy always carry opposite
    signs in the gradient, whichever way the caller applies the update.
    """
    xT = expr.transpose(self.x)
    data_term = expr.dot(expr.dot(xT, self.x), self.w)
    target_term = expr.dot(xT, self.y)
    penalty_term = self.ridge_lambda * self.w
    gradient = data_term - target_term + penalty_term
    return expr.reshape(gradient, (1, self.N_DIM))
def test_optimization_shape(self):
    """Run one slicing/reshape/dot pipeline through expr and numpy and compare.

    The same sequence of operations is applied to distributed arrays and to
    their numpy counterparts; the optimized expr result must match numpy.
    """
    shape = (200, 800)
    # ``np.int`` was only an alias of the builtin ``int`` and is gone from
    # recent NumPy releases.
    na = np.arange(np.prod(shape), dtype=int).reshape(shape)
    nb = np.random.randint(1, 1000, (1000, 1000))
    nc = np.random.randint(1, 1000, (1000, 1000))

    def pipeline(lib, a, b, c):
        # ``lib`` is either ``expr`` or ``np``; both provide reshape, ravel
        # and dot, so one definition serves both sides of the comparison.
        d = b + c
        e = b + d
        f = d[200:900, 200:900]
        g = e[200:900, 200:900]
        h = f + g
        i = f + h
        j = h[100:500, 100:500]
        k = i[100:300, 100:300]
        flat = lib.reshape(lib.ravel(j), (800, 200))
        m = lib.dot(a, flat)
        n = m + k
        o = n + m
        return o[100:200, 100:200]

    q = pipeline(expr,
                 expr.arange(shape, dtype=int),
                 expr.from_numpy(nb),
                 expr.from_numpy(nc))
    nq = pipeline(np, na, nb, nc)

    Assert.all_eq(nq, q.optimized().glom(), tolerance=1e-10)
def center_data(X, y, fit_intercept, normalize=False):
    """
    Center the data so each column has zero mean along axis 0.

    Nearly all linear models want centered data. When ``fit_intercept`` is
    false nothing is shifted and placeholder statistics are returned instead.
    When ``normalize`` is true the centered X is also divided by the
    per-column root of the summed squares (zeros replaced by 1).

    Returns the tuple ``(X, y, X_mean, y_mean, X_std)``.
    """
    if not fit_intercept:
        # No centering requested: report zero means and unit scales.
        X_mean = expr.zeros(X.shape[1])
        X_std = expr.ones(X.shape[1])
        if y.ndim == 1:
            y_mean = 0.
        else:
            y_mean = expr.zeros(y.shape[1], dtype=X.dtype)
        return X, y, X_mean, y_mean, X_std

    column_means = X.mean(axis=0)
    # Keep the means as a single row so they broadcast over the rows of X.
    X_mean = expr.reshape(column_means, (1, column_means.shape[0]))
    X -= X_mean

    if normalize:
        X_std = expr.sqrt(expr.sum(X ** 2, axis=0)).force()
        # Guard against dividing by zero for constant columns.
        X_std[X_std == 0] = 1
        X /= X_std
    else:
        X_std = expr.ones(X.shape[1])

    y_mean = y.mean(axis=0)
    y -= y_mean
    return X, y, X_mean, y_mean, X_std
def test_reshape1(self):
    """Flattening a 10x10 range must equal a 100-element range."""
    length = 100
    flattened = expr.reshape(expr.arange((10, 10)), (length, ))
    expected = expr.arange((length, ))
    Assert.all_eq(flattened.glom(), expected.glom())
def test_reshape7(self):
    """Reshape ranges of several source shapes into each target shape.

    Every source holds 276000 elements, so reshaping any of them to a target
    shape must equal a range created directly with that target shape.
    """
    target_shapes = [(23, 120, 100), (12, 230, 100), (276000, 1), (1, 276000)]
    expected = [expr.arange(shape).glom() for shape in target_shapes]

    source_shapes = [(100, 23, 120), (12, 23, 1000), (1, 276000),
                     (276000, 1), (276000, )]
    sources = [expr.arange(shape) for shape in source_shapes]

    for source in sources:
        for shape, want in zip(target_shapes, expected):
            Assert.all_eq(expr.reshape(source, shape).glom(), want)
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
    '''
    Cluster data points using the fuzzy kmeans clustering method.

    Args:
      points(Expr or DistArray): the input data points matrix.
      k(int): the number of clusters.
      num_iter(int): the max iterations to run.
      m(float): the parameter of fuzzy kmeans. Must not be 1, because the
        membership exponent is ``2.0 / (m - 1)``.
      centers(Expr or DistArray): the initialized centers of each cluster.
        Generated with ``expr.rand(k, num_dim)`` when None.

    Returns:
      The label expression of the last iteration (index of the cluster with
      the highest membership for each point); an all-zero array when
      ``num_iter`` is 0.
    '''
    points = expr.force(points)
    num_dim = points.shape[1]

    if centers is None:
        centers = expr.rand(k, num_dim)

    # ``np.int`` was only an alias of the builtin ``int`` and is gone from
    # recent NumPy releases.
    labels = expr.zeros((points.shape[0], ), dtype=int)

    for _ in range(num_iter):
        centers = expr.as_array(centers)
        points_broadcast = expr.reshape(
            points, (points.shape[0], 1, points.shape[1]))
        centers_broadcast = expr.reshape(
            centers, (1, centers.shape[0], centers.shape[1]))
        distances = expr.sum(expr.square(points_broadcast - centers_broadcast),
                             axis=2)
        # This is used to avoid dividing zero
        distances = distances + 0.00000000001
        util.log_info('distances shape %s' % str(distances.shape))

        # own[p, i, 0] is the distance of point p to cluster i;
        # others[p, 0, j] is its distance to every cluster j.
        own = expr.reshape(distances,
                           (distances.shape[0], distances.shape[1], 1))
        others = expr.reshape(distances,
                              (distances.shape[0], 1, distances.shape[1]))
        # Membership of p in i is 1 / sum_j (d_pi / d_pj) ** e, so closer
        # clusters get the larger membership. The previous ratio was the
        # other way round (d_pj / d_pi), which made membership grow with
        # distance and argmax pick the farthest center.
        # NOTE(review): distances are squared here while the exponent is the
        # one usually paired with plain distances -- confirm intended m.
        prob = 1.0 / expr.sum(expr.power(own / others, 2.0 / (m - 1)), axis=2)
        prob.force()

        counts = expr.sum(prob, axis=0)
        counts = expr.reshape(counts, (counts.shape[0], 1))
        labels = expr.argmax(prob, axis=1)
        centers = expr.sum(
            points_broadcast *
            expr.reshape(prob, (prob.shape[0], prob.shape[1], 1)),
            axis=0)

        # We assume that the size of centers are relative small that can be
        # handled on the master.
        counts = counts.glom()
        centers = centers.glom()

        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector. We reseed
        # these centroids with new random values and set their counts to 1
        # in order to get rid of dividing by zero.
        zcount_indices = (counts == 0).reshape(k)
        if np.any(zcount_indices):
            counts[zcount_indices, :] = 1
            centers[zcount_indices, :] = np.random.rand(
                np.count_nonzero(zcount_indices), num_dim)

        centers = centers / counts

    return labels
def test_reshape2(self):
    """Smoke test: force a tiled 1-D range through a 2-D reshape and back."""
    length = 1000
    source = expr.arange((length, ), tile_hint=[100])
    as_matrix = expr.reshape(source, (10, 100)).force()
    # Only evaluation is exercised here; the result is not compared to anything.
    expr.reshape(as_matrix, (length, )).force()
def test_reshape1(self):
    """Flattening a 10x10 range must equal a 100-element range."""
    length = 100
    flattened = expr.reshape(expr.arange((10, 10)), (length, ))
    expected = expr.arange((length, ))
    Assert.all_eq(flattened.glom(), expected.glom())
def test_reshape2(self):
    """Smoke test: force a tiled 1-D range through a 2-D reshape and back."""
    length = 1000
    source = expr.arange((length, ), tile_hint=[100])
    as_matrix = expr.reshape(source, (10, 100)).force()
    # Only evaluation is exercised here; the result is not compared to anything.
    expr.reshape(as_matrix, (length, )).force()
def fit(self, X, centers=None, implementation='outer'):
    """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
    implementation : str
        Which formulation runs each iteration: 'map2', 'outer',
        'broadcast' or 'shuffle'.

    Returns
    -------
    centers : the centers after the last iteration. A numpy array for
        'map2' and 'shuffle'; wrapped with ``expr.from_numpy`` for 'outer'
        and 'broadcast' once at least one iteration has run.
    labels : the label array/expression of the last iteration (for
        'shuffle', the (n_samples, 1) array handed to the mapper).

    Raises
    ------
    ValueError
        If ``implementation`` is not one of the supported names.
    """
    num_dim = X.shape[1]
    num_points = X.shape[0]

    # ``np.int`` was only an alias of the builtin ``int`` and is gone from
    # recent NumPy releases; the builtin yields the same dtype everywhere.
    labels = expr.zeros((num_points, 1), dtype=int)

    def _reseed_and_average(sums, counts):
        # Turn per-cluster coordinate sums into means. Clusters without any
        # point would stay at the zero vector, so they are reseeded with
        # random values and their count set to 1 to avoid dividing by zero.
        empty = (counts == 0).reshape(self.n_clusters)
        if np.any(empty):
            n_empty = np.count_nonzero(empty)
            counts[empty] = 1
            sums[empty, :] = np.random.randn(n_empty, num_dim)
        return sums / counts.reshape(sums.shape[0], 1)

    def _cluster_stats(assignments, current):
        # Per-cluster point counts and coordinate sums for the given labels,
        # evaluated and fetched to the master.
        counts = expr.map2(assignments, 0, fn=kmeans_count_mapper,
                           fn_kw={'centers_count': self.n_clusters},
                           shape=(current.shape[0], ))
        sums = expr.map2((X, assignments), (0, 0), fn=kmeans_center_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(current.shape[0], current.shape[1]))
        counts = counts.optimized().glom()
        sums = sums.optimized().glom()
        return sums, counts

    if implementation == 'map2':
        if centers is None:
            centers = np.random.rand(self.n_clusters, num_dim)

        for _ in range(self.n_iter):
            labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper,
                               fn_kw={"centers": centers},
                               shape=(X.shape[0], ))
            sums, counts = _cluster_stats(labels, centers)
            centers = _reseed_and_average(sums, counts)
        return centers, labels

    if implementation == 'outer':
        if centers is None:
            centers = expr.rand(self.n_clusters, num_dim)

        for _ in range(self.n_iter):
            labels = expr.outer((X, centers), (0, None),
                                fn=kmeans_outer_dist_mapper,
                                shape=(X.shape[0], ))
            sums, counts = _cluster_stats(labels, centers)
            centers = expr.from_numpy(_reseed_and_average(sums, counts))
        return centers, labels

    if implementation == 'broadcast':
        if centers is None:
            centers = expr.rand(self.n_clusters, num_dim)

        for i in range(self.n_iter):
            util.log_warn("k_means_ %d %d", i, time.time())
            # Squared distance between every point and every center.
            X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
            centers_broadcast = expr.reshape(
                centers, (1, centers.shape[0], centers.shape[1]))
            distances = expr.sum(expr.square(X_broadcast - centers_broadcast),
                                 axis=2)
            labels = expr.argmin(distances, axis=1)

            # One-hot membership: matches[p, c] is 1 when point p is in cluster c.
            center_idx = expr.arange((1, centers.shape[0]))
            matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
            matches = matches.astype(np.int64)
            counts = expr.sum(matches, axis=0)
            sums = expr.sum(
                X_broadcast * expr.reshape(
                    matches, (matches.shape[0], matches.shape[1], 1)),
                axis=0)

            counts = counts.optimized().glom()
            sums = sums.optimized().glom()
            centers = expr.from_numpy(_reseed_and_average(sums, counts))
        return centers, labels

    if implementation == 'shuffle':
        if centers is None:
            centers = np.random.rand(self.n_clusters, num_dim)

        for _ in range(self.n_iter):
            # Fresh accumulators for every iteration; updates are summed.
            new_centers = expr.ndarray((self.n_clusters, num_dim),
                                       reduce_fn=lambda a, b: a + b)
            new_counts = expr.ndarray((self.n_clusters, 1), dtype=int,
                                      reduce_fn=lambda a, b: a + b)

            shuffled = expr.shuffle(
                X, _find_cluster_mapper,
                kw={'d_pts': X,
                    'old_centers': centers,
                    'new_centers': new_centers,
                    'new_counts': new_counts,
                    'labels': labels},
                shape_hint=(1, ),
                cost_hint={hash(labels): {'00': 0,
                                          '01': np.prod(labels.shape)}})
            shuffled.force()

            new_counts = new_counts.glom()
            new_centers = new_centers.glom()
            centers = _reseed_and_average(new_centers, new_counts)
        return centers, labels

    # Previously an unknown name fell through and returned None silently.
    raise ValueError("unknown k-means implementation: %r" % (implementation, ))
def fit(self, X, centers=None, implementation='map2'):
    """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
    implementation : str
        Which formulation runs each iteration: 'map2', 'outer',
        'broadcast' or 'shuffle'.

    Returns
    -------
    centers : the centers after the last iteration. A numpy array for
        'map2' and 'shuffle'; wrapped with ``expr.from_numpy`` for 'outer'
        and 'broadcast' once at least one iteration has run.
    labels : the label array/expression of the last iteration (for
        'shuffle', the (n_samples, 1) array handed to the mapper).

    Raises
    ------
    ValueError
        If ``implementation`` is not one of the supported names.
    """
    num_dim = X.shape[1]
    num_points = X.shape[0]

    # ``np.int`` was only an alias of the builtin ``int`` and is gone from
    # recent NumPy releases; the builtin yields the same dtype everywhere.
    labels = expr.zeros((num_points, 1), dtype=int)

    def _reseed_and_average(sums, counts):
        # Turn per-cluster coordinate sums into means. Clusters without any
        # point would stay at the zero vector, so they are reseeded with
        # random values and their count set to 1 to avoid dividing by zero.
        empty = (counts == 0).reshape(self.n_clusters)
        if np.any(empty):
            n_empty = np.count_nonzero(empty)
            counts[empty] = 1
            sums[empty, :] = np.random.randn(n_empty, num_dim)
        return sums / counts.reshape(sums.shape[0], 1)

    def _cluster_stats(assignments, current):
        # Per-cluster point counts and coordinate sums for the given labels,
        # evaluated and fetched to the master.
        counts = expr.map2(assignments, 0, fn=kmeans_count_mapper,
                           fn_kw={'centers_count': self.n_clusters},
                           shape=(current.shape[0], ))
        sums = expr.map2((X, assignments), (0, 0), fn=kmeans_center_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(current.shape[0], current.shape[1]))
        counts = counts.optimized().glom()
        sums = sums.optimized().glom()
        return sums, counts

    if implementation == 'map2':
        if centers is None:
            centers = np.random.rand(self.n_clusters, num_dim)

        for _ in range(self.n_iter):
            labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper,
                               fn_kw={"centers": centers},
                               shape=(X.shape[0], ))
            sums, counts = _cluster_stats(labels, centers)
            centers = _reseed_and_average(sums, counts)
        return centers, labels

    if implementation == 'outer':
        if centers is None:
            centers = expr.rand(self.n_clusters, num_dim)

        for _ in range(self.n_iter):
            labels = expr.outer((X, centers), (0, None),
                                fn=kmeans_outer_dist_mapper,
                                shape=(X.shape[0], ))
            sums, counts = _cluster_stats(labels, centers)
            centers = expr.from_numpy(_reseed_and_average(sums, counts))
        return centers, labels

    if implementation == 'broadcast':
        if centers is None:
            centers = expr.rand(self.n_clusters, num_dim)

        for i in range(self.n_iter):
            util.log_warn("k_means_ %d %d", i, time.time())
            # Squared distance between every point and every center.
            X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
            centers_broadcast = expr.reshape(
                centers, (1, centers.shape[0], centers.shape[1]))
            distances = expr.sum(expr.square(X_broadcast - centers_broadcast),
                                 axis=2)
            labels = expr.argmin(distances, axis=1)

            # One-hot membership: matches[p, c] is 1 when point p is in cluster c.
            center_idx = expr.arange((1, centers.shape[0]))
            matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
            matches = matches.astype(np.int64)
            counts = expr.sum(matches, axis=0)
            sums = expr.sum(
                X_broadcast * expr.reshape(
                    matches, (matches.shape[0], matches.shape[1], 1)),
                axis=0)

            counts = counts.optimized().glom()
            sums = sums.optimized().glom()
            centers = expr.from_numpy(_reseed_and_average(sums, counts))
        return centers, labels

    if implementation == 'shuffle':
        if centers is None:
            centers = np.random.rand(self.n_clusters, num_dim)

        for _ in range(self.n_iter):
            # Fresh accumulators for every iteration; updates are summed.
            new_centers = expr.ndarray((self.n_clusters, num_dim),
                                       reduce_fn=lambda a, b: a + b)
            new_counts = expr.ndarray((self.n_clusters, 1), dtype=int,
                                      reduce_fn=lambda a, b: a + b)

            shuffled = expr.shuffle(
                X, _find_cluster_mapper,
                kw={'d_pts': X,
                    'old_centers': centers,
                    'new_centers': new_centers,
                    'new_counts': new_counts,
                    'labels': labels},
                shape_hint=(1, ),
                cost_hint={hash(labels): {'00': 0,
                                          '01': np.prod(labels.shape)}})
            shuffled.force()

            new_counts = new_counts.glom()
            new_centers = new_centers.glom()
            centers = _reseed_and_average(new_centers, new_counts)
        return centers, labels

    # Previously an unknown name fell through and returned None silently.
    raise ValueError("unknown k-means implementation: %r" % (implementation, ))
def kneighbors(self, X, n_neighbors=None):
    """Find the nearest neighbors of each query point.

    Parameters
    ----------
    X : array-like, last dimension same as that of fit data
        The query points. A ``numpy.ndarray`` is wrapped with
        ``expr.from_numpy`` before use.
    n_neighbors : int, optional
        Number of neighbors to get. When given it also overwrites
        ``self.n_neighbors``; when omitted the stored ``self.n_neighbors``
        is used.

    Returns
    -------
    dist : array
        Distances from each query point to its selected neighbors.
    ind : array
        Indices of the nearest points in the population matrix.
    """
    if n_neighbors is not None:
        self.n_neighbors = n_neighbors
    # Always slice with the stored value: the raw argument defaults to None,
    # and ``[:, :None]`` would keep every column instead of the configured count.
    k = self.n_neighbors

    if isinstance(X, np.ndarray):
        X = expr.from_numpy(X)

    if self.algorithm in ('auto', 'brute'):
        # Broadcast the queries against the fitted points to obtain every
        # pairwise squared distance, then keep the k smallest per query.
        X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
        fit_X_broadcast = expr.reshape(
            self.X, (1, self.X.shape[0], self.X.shape[1]))
        distances = expr.sum((X_broadcast - fit_X_broadcast) ** 2, axis=2)

        neigh_ind = expr.argsort(distances, axis=1)
        neigh_ind = neigh_ind[:, :k].optimized().glom()
        neigh_dist = expr.sort(distances, axis=1)
        neigh_dist = expr.sqrt(neigh_dist[:, :k]).optimized().glom()
        return neigh_dist, neigh_ind

    results = self.X.foreach_tile(mapper_fn=_knn_mapper,
                                  kw={'X': self.X,
                                      'Q': X,
                                      'n_neighbors': k,
                                      'algorithm': self.algorithm})

    # Gather the KNN candidates reported for each tile of X side by side,
    # then pick the real k nearest among all candidates.
    dist = None
    ind = None
    for _, v in results.iteritems():
        if dist is None:
            dist = v[0]
            ind = v[1]
        else:
            dist = np.concatenate((dist, v[0]), axis=1)
            ind = np.concatenate((ind, v[1]), axis=1)

    mask = np.argsort(dist, axis=1)[:, :k]
    new_dist = np.array([row[sel] for row, sel in zip(dist, mask)])
    new_ind = np.array([row[sel] for row, sel in zip(ind, mask)])
    return new_dist, new_ind