def cgit(A, x):
  '''
  CGIT Conjugate Gradient iteration.

  z = cgit(A, x) generates an approximate solution to A*z = x.

  Args:
    A(Expr): matrix to be processed.
    x(Expr): the input vector.
  '''
  z = expr.zeros(x.shape, tile_hint=(A.tile_shape()[1], 1))
  r = x
  rho = expr.sum(r * r).glom()
  #util.log_warn('rho:%s', rho)
  p = r

  for i in xrange(15):
    q = expr.dot(A, p, tile_hint=(A.tile_shape()[1], 1))
    alpha = rho / expr.sum(p * q).glom()
    #util.log_warn('alpha:%s', alpha)
    z = z + p * alpha
    rho0 = rho
    r = r - q * alpha
    rho = expr.sum(r * r).glom()
    beta = rho / rho0
    #util.log_warn('beta:%s', beta)
    p = r + p * beta

  return z
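# Usage sketch (added for illustration; not part of the original source). It shows
# how cgit() above might be driven, assuming `expr` is Spartan's lazy expression
# module, that `A` has been materialized (so A.tile_shape() is available), and
# that A is symmetric positive-definite so conjugate gradient converges. The
# sizes and the residual check are arbitrary choices for this sketch.
def _cgit_usage_sketch():
  from spartan import expr

  n = 1000
  # Placeholder operands; a real, well-conditioned SPD matrix is assumed here.
  A = expr.eager(expr.rand(n, n))
  x = expr.eager(expr.rand(n, 1))

  z = cgit(A, x)                                   # approximate solution of A * z = x
  residual = expr.sum((expr.dot(A, z) - x) ** 2).glom()
  return z, residual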
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  Cluster data points using the fuzzy k-means method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the max iterations to run.
    m(float): the fuzziness parameter of fuzzy k-means.
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
  points = expr.force(points)
  num_dim = points.shape[1]
  if centers is None:
    centers = expr.rand(k, num_dim)

  labels = expr.zeros((points.shape[0],), dtype=np.int)

  for iter in range(num_iter):
    centers = expr.as_array(centers)
    points_broadcast = expr.reshape(points, (points.shape[0], 1, points.shape[1]))
    centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
    distances = expr.sum(expr.square(points_broadcast - centers_broadcast), axis=2)
    # Add a small epsilon to avoid division by zero.
    distances = distances + 0.00000000001
    util.log_info('distances shape %s' % str(distances.shape))
    distances_broadcast = expr.reshape(distances, (distances.shape[0], 1,
                                                   distances.shape[1]))
    distances_broadcast2 = expr.reshape(distances, (distances.shape[0],
                                                    distances.shape[1], 1))
    prob = 1.0 / expr.sum(expr.power(distances_broadcast / distances_broadcast2,
                                     2.0 / (m - 1)), axis=2)
    prob.force()

    counts = expr.sum(prob, axis=0)
    counts = expr.reshape(counts, (counts.shape[0], 1))
    labels = expr.argmax(prob, axis=1)
    centers = expr.sum(expr.reshape(points, (points.shape[0], 1, points.shape[1])) *
                       expr.reshape(prob, (prob.shape[0], prob.shape[1], 1)),
                       axis=0)

    # We assume the centers are small enough to be handled on the master.
    counts = counts.glom()
    centers = centers.glom()

    # If any centroids don't have any points assigned to them, reseed them.
    zcount_indices = (counts == 0).reshape(k)
    if np.any(zcount_indices):
      # One or more centroids may not have any points assigned to them, which
      # results in their position being the zero-vector. We reseed these
      # centroids with new random values and set their counts to 1 to avoid
      # division by zero.
      counts[zcount_indices, :] = 1
      centers[zcount_indices, :] = np.random.rand(np.count_nonzero(zcount_indices),
                                                  num_dim)

    centers = centers / counts
  return labels
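# Usage sketch (added for illustration; not part of the original source). It
# exercises the fuzzy_kmeans() variant above on synthetic data; the sizes,
# cluster count, and fuzziness value are arbitrary, and `expr` is assumed to be
# Spartan's expression module as used throughout this file.
def _fuzzy_kmeans_usage_sketch():
  from spartan import expr

  points = expr.rand(10000, 16)                    # 10k points in 16 dimensions
  labels = fuzzy_kmeans(points, k=8, num_iter=5, m=2.0)
  return labels.glom()                             # pull the per-point labels to the master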
def fit(self, X, centers=None):
  """Compute k-means clustering.

  Parameters
  ----------
  X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
  centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
  """
  num_dim = X.shape[1]
  num_points = X.shape[0]

  labels = expr.zeros((num_points, 1), dtype=np.int)

  if centers is None:
    centers = expr.from_numpy(np.random.rand(self.n_clusters, num_dim))

  for i in range(self.n_iter):
    X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
    centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
    distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
    labels = expr.argmin(distances, axis=1)
    center_idx = expr.arange((1, centers.shape[0]))
    matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
    matches = matches.astype(np.int64)
    counts = expr.sum(matches, axis=0)
    centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0],
                                                            matches.shape[1], 1)),
                       axis=0)

    counts = counts.optimized().glom()
    centers = centers.optimized().glom()

    # If any centroids don't have any points assigned to them, reseed them.
    zcount_indices = (counts == 0).reshape(self.n_clusters)
    if np.any(zcount_indices):
      # One or more centroids may not have any points assigned to them, which
      # results in their position being the zero-vector. We reseed these
      # centroids with new random values and set their counts to 1 to avoid
      # division by zero.
      n_points = np.count_nonzero(zcount_indices)
      counts[zcount_indices] = 1
      centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

    centers = centers / counts.reshape(centers.shape[0], 1)
    centers = expr.from_numpy(centers)
  return centers, labels
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  Cluster data points using the fuzzy k-means method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the max iterations to run.
    m(float): the fuzziness parameter of fuzzy k-means.
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
  points = points.evaluate()
  num_dim = points.shape[1]
  if centers is None:
    centers = expr.rand(k, num_dim)

  #labels = expr.zeros((points.shape[0],), dtype=np.int)

  for iter in range(num_iter):
    centers = centers.glom()
    fuzzy = expr.map2(points, 0, fn=kmeans_map2_dist_mapper,
                      fn_kw={"centers": centers, "m": m},
                      shape=(points.shape[0], centers.shape[0]))

    labels = expr.argmax(fuzzy, axis=1)

    new_centers = expr.map2((points, fuzzy), (0, 0), fn=kmeans_map2_center_mapper,
                            fn_kw={"centers": centers, "m": m},
                            shape=(centers.shape[0], centers.shape[1]),
                            reducer=np.add)
    new_centers /= expr.sum(fuzzy ** m, axis=0)[:, expr.newaxis]
    centers = new_centers
  return labels
def _step():
  yp = expr.dot(x, w)
  Assert.all_eq(yp.shape, y.shape)

  diff = x * (yp - y)
  grad = expr.sum(diff, axis=0).glom().reshape((N_DIM, 1))
  wprime = w - grad * 1e-6
  expr.force(wprime)
def kneighbors(self, X, n_neighbors=None):
  """Find the K-neighbors of a point and return the distances and indices.

  Parameters
  ----------
  X : array-like, last dimension same as that of fit data
      The new point.
  n_neighbors : int
      Number of neighbors to get (default is the value passed to the constructor).

  Returns
  -------
  dist : array
      Array representing the lengths to the points, only present if
      return_distance=True.
  ind : array
      Indices of the nearest points in the population matrix.
  """
  if n_neighbors is not None:
    self.n_neighbors = n_neighbors

  if isinstance(X, np.ndarray):
    X = expr.from_numpy(X)

  if self.algorithm in ('auto', 'brute'):
    X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
    fit_X_broadcast = expr.reshape(self.X, (1, self.X.shape[0], self.X.shape[1]))
    distances = expr.sum((X_broadcast - fit_X_broadcast) ** 2, axis=2)
    neigh_ind = expr.argsort(distances, axis=1)
    neigh_ind = neigh_ind[:, :n_neighbors].optimized().glom()
    neigh_dist = expr.sort(distances, axis=1)
    neigh_dist = expr.sqrt(neigh_dist[:, :n_neighbors]).optimized().glom()
    return neigh_dist, neigh_ind
  else:
    results = self.X.foreach_tile(mapper_fn=_knn_mapper,
                                  kw={'X': self.X, 'Q': X,
                                      'n_neighbors': self.n_neighbors,
                                      'algorithm': self.algorithm})
    dist = None
    ind = None
    # Get the KNN candidates for each tile of X, then find the real KNN.
    for k, v in results.iteritems():
      if dist is None:
        dist = v[0]
        ind = v[1]
      else:
        dist = np.concatenate((dist, v[0]), axis=1)
        ind = np.concatenate((ind, v[1]), axis=1)

    mask = np.argsort(dist, axis=1)[:, :self.n_neighbors]
    new_dist = np.array([dist[i][mask[i]] for i, r in enumerate(dist)])
    new_ind = np.array([ind[i][mask[i]] for i, r in enumerate(ind)])
    return new_dist, new_ind
def fit(data, labels, label_size, alpha=1.0):
  '''
  Train a standard naive bayes model.

  Args:
    data(Expr): documents to be trained.
    labels(Expr): the correct labels of the training data.
    label_size(int): the number of different labels.
    alpha(float): alpha parameter of the naive bayes model.
  '''
  labels = expr.force(labels)

  # Calculate the document frequency.
  df = expr.reduce(data, axis=0,
                   dtype_fn=lambda input: input.dtype,
                   local_reduce_fn=lambda ex, data, axis: (data > 0).sum(axis),
                   accumulate_fn=np.add,
                   tile_hint=(data.shape[1],))
  idf = expr.log(data.shape[0] * 1.0 / (df + 1)) + 1

  # The normalized frequency of a feature in a document is the feature
  # frequency divided by the root mean square of the feature frequencies in
  # that document.
  square_sum = expr.reduce(data, axis=1,
                           dtype_fn=lambda input: input.dtype,
                           local_reduce_fn=lambda ex, data, axis: np.square(data).sum(axis),
                           accumulate_fn=np.add,
                           tile_hint=(data.shape[0],))
  rms = expr.sqrt(square_sum * 1.0 / data.shape[1])

  # Calculate the weight-normalized Tf-Idf.
  data = data / rms.reshape((data.shape[0], 1)) * idf.reshape((1, data.shape[1]))

  # Add up all the feature vectors with the same labels.
  sum_instance_by_label = expr.ndarray((label_size, data.shape[1]),
                                       dtype=np.float64,
                                       reduce_fn=np.add,
                                       tile_hint=(label_size / len(labels.tiles), data.shape[1]))
  sum_instance_by_label = expr.shuffle(data,
                                       _sum_instance_by_label_mapper,
                                       target=sum_instance_by_label,
                                       kw={'labels': labels, 'label_size': label_size})

  # Sum up all the weights for each label from the previous step.
  weights_per_label = expr.sum(sum_instance_by_label, axis=1, tile_hint=(label_size,))

  # Generate the naive bayes per_label_and_feature weights.
  weights_per_label_and_feature = expr.shuffle(sum_instance_by_label,
                                               _naive_bayes_mapper,
                                               kw={'weights_per_label': weights_per_label,
                                                   'alpha': alpha})

  return {'scores_per_label_and_feature': weights_per_label_and_feature.force(),
          'scores_per_label': weights_per_label.force(),
          }
def move(galaxy, dt):
  '''Move the bodies.

  First find the forces and update the velocities, then move the positions.
  '''
  # `.reshape(add_tuple(a, 1))` is the spartan way of doing
  # `ndarray[:, np.newaxis]` in numpy. While syntactically different, both
  # add a dimension of length 1 after the other dimensions,
  # e.g. (5, 5) becomes (5, 5, 1).

  # Calculate all distances component wise (with sign).
  dx_new = galaxy['x'].reshape(add_tuple(galaxy['x'].shape, [1]))
  dy_new = galaxy['y'].reshape(add_tuple(galaxy['y'].shape, [1]))
  dz_new = galaxy['z'].reshape(add_tuple(galaxy['z'].shape, [1]))
  dx = (galaxy['x'] - dx_new) * -1
  dy = (galaxy['y'] - dy_new) * -1
  dz = (galaxy['z'] - dz_new) * -1

  # Euclidean distances (all bodies).
  r = sqrt(dx**2 + dy**2 + dz**2)
  r = set_diagonal(r, 1.0)

  # Prevent collision.
  mask = r < 1.0
  #r = r * ~mask + 1.0 * mask
  r = spartan.map((r, mask), lambda x, m: x * ~m + 1.0 * m)

  m = galaxy['m'].reshape(add_tuple(galaxy['m'].shape, [1]))

  # Calculate the acceleration component wise.
  fx = G*m*dx / r**3
  fy = G*m*dy / r**3
  fz = G*m*dz / r**3

  # Set the force (acceleration) a body exerts on itself to zero.
  fx = set_diagonal(fx, 0.0)
  fy = set_diagonal(fy, 0.0)
  fz = set_diagonal(fz, 0.0)

  galaxy['vx'] += dt*expr.sum(fx, axis=0)
  galaxy['vy'] += dt*expr.sum(fy, axis=0)
  galaxy['vz'] += dt*expr.sum(fz, axis=0)

  galaxy['x'] += dt*galaxy['vx']
  galaxy['y'] += dt*galaxy['vy']
  galaxy['z'] += dt*galaxy['vz']
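# Usage sketch (added for illustration; not part of the original source). It
# shows the kind of outer loop that would drive move() above: step the galaxy
# forward with a fixed dt for a number of iterations. The galaxy dict layout
# ('x', 'y', 'z', 'vx', 'vy', 'vz', 'm') is taken from the keys move() reads;
# how those arrays are initially populated is assumed to happen elsewhere.
def _nbody_loop_sketch(galaxy, n_steps=100, dt=0.01):
  for _ in range(n_steps):
    move(galaxy, dt)          # update velocities from forces, then positions
  return galaxy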
def train(self):
  self.x = expr.eager(self.x)
  self.y = expr.eager(self.y)

  for i in range(self.iterations):
    diff = self.update()
    grad = expr.sum(diff, axis=0, tile_hint=[self.N_DIM]).glom().reshape((self.N_DIM, 1))
    self.w = self.w - grad * self.alpha
  return self.w
def fn2():
  a = expr.ones((N, N))
  b = expr.ones((N, N/2))
  g = expr.dot(a, b) + expr.dot(expr.sum(a, axis=1).reshape((1, N)), b)

  t1 = time.time()
  g_opt = g.optimized()
  #g_opt.force()
  t2 = time.time()

  print t2 - t1
  print g_opt
def fit(data, labels, label_size, alpha=1.0):
  '''
  Train a standard naive bayes model.

  Args:
    data(Expr): documents to be trained.
    labels(Expr): the correct labels of the training data.
    label_size(int): the number of different labels.
    alpha(float): alpha parameter of the naive bayes model.
  '''
  # Calculate the document frequency.
  df = expr.reduce(data, axis=0,
                   dtype_fn=lambda input: input.dtype,
                   local_reduce_fn=lambda ex, data, axis: (data > 0).sum(axis),
                   accumulate_fn=np.add)
  idf = expr.log(data.shape[0] * 1.0 / (df + 1)) + 1

  # The normalized frequency of a feature in a document is the feature
  # frequency divided by the root mean square of the feature frequencies in
  # that document.
  square_sum = expr.reduce(data, axis=1,
                           dtype_fn=lambda input: input.dtype,
                           local_reduce_fn=lambda ex, data, axis: np.square(data).sum(axis),
                           accumulate_fn=np.add)
  rms = expr.sqrt(square_sum * 1.0 / data.shape[1])

  # Calculate the weight-normalized Tf-Idf.
  data = data / rms.reshape((data.shape[0], 1)) * idf.reshape((1, data.shape[1]))

  # Add up all the feature vectors with the same labels.
  #weights_per_label_and_feature = expr.ndarray((label_size, data.shape[1]), dtype=np.float64)
  #for i in range(label_size):
  #  i_mask = (labels == i)
  #  weights_per_label_and_feature = expr.assign(weights_per_label_and_feature, np.s_[i, :], expr.sum(data[i_mask, :], axis=0))
  weights_per_label_and_feature = expr.shuffle(expr.retile(data, tile_hint=util.calc_tile_hint(data, axis=0)),
                                               _sum_instance_by_label_mapper,
                                               target=expr.ndarray((label_size, data.shape[1]),
                                                                   dtype=np.float64,
                                                                   reduce_fn=np.add),
                                               kw={'labels': labels, 'label_size': label_size},
                                               cost_hint={hash(labels): {'00': 0, '01': np.prod(labels.shape)}})

  # Sum up all the weights for each label from the previous step.
  weights_per_label = expr.sum(weights_per_label_and_feature, axis=1)

  # Generate the naive bayes per_label_and_feature weights.
  weights_per_label_and_feature = expr.log((weights_per_label_and_feature + alpha) /
                                           (weights_per_label.reshape((weights_per_label.shape[0], 1)) +
                                            alpha * weights_per_label_and_feature.shape[1]))

  return {'scores_per_label_and_feature': weights_per_label_and_feature.optimized().force(),
          'scores_per_label': weights_per_label.optimized().force(),
          }
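# Usage sketch (added for illustration; not part of the original source). It
# shows how the naive bayes fit() above might be called: `data` is a
# document-by-feature matrix and `labels` holds one integer class per document.
# The random arrays below are placeholders; real term-frequency inputs are
# assumed to come from elsewhere in the pipeline.
def _naive_bayes_usage_sketch():
  import numpy as np
  from spartan import expr

  num_docs, num_features, num_labels = 1000, 500, 10
  data = expr.rand(num_docs, num_features)
  labels = expr.from_numpy(np.random.randint(0, num_labels, size=num_docs))

  model = fit(data, labels, label_size=num_labels, alpha=1.0)
  # model['scores_per_label_and_feature'] and model['scores_per_label'] hold the
  # materialized per-class feature weights and per-class totals.
  return model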
def fit(self, X, y):
  """Transform to distarray if it's a numpy array."""
  if isinstance(X, np.ndarray):
    X = expr.make_from_numpy(X)
  if isinstance(y, np.ndarray):
    y = expr.make_from_numpy(y)

  X, y, X_mean, y_mean, X_std = self._center_data(
      X, y, self.fit_intercept, self.normalize)

  N_DIM = X.shape[1]
  self._coef = np.random.randn(N_DIM, 1)

  for i in range(self.iterations):
    yp = expr.dot(X, self._coef)
    print expr.sum((yp - y) ** 2).glom()
    diff = X * (yp - y)
    grad = expr.sum(diff, axis=0).glom().reshape((N_DIM, 1))
    self._coef = self._coef - grad * 1e-6
  return self
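# Note and usage sketch (added for illustration; not part of the original
# source). Each iteration of fit() above is plain batch gradient descent on the
# squared error: with predictions yp = X w, the gradient of sum((yp - y)^2)/2
# with respect to w is X^T (yp - y), which the code forms row-wise as
# X * (yp - y) summed over axis 0; the hard-coded 1e-6 is the learning rate.
# A minimal call, assuming `model` is an instance of the estimator class this
# method belongs to:
def _linear_regression_usage_sketch(model, n=100000, d=10):
  import numpy as np
  X = np.random.rand(n, d)
  y = np.random.rand(n, 1)
  return model.fit(X, y)      # fit() converts numpy inputs to distarrays itself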
def _get_norm_of_each_item(self, rating_table):
  """Get the norm of each item vector.

  For each item, calculate the norm of the item vector.

  Parameters
  ----------
  rating_table : Spartan matrix of shape (M, N).
                 Each column represents the ratings of one item.

  Returns
  ---------
  item_norm : Spartan matrix of shape (N,).
              item_norm[i] equals || rating_table[:, i] ||.
  """
  return expr.sqrt(expr.sum(expr.multiply(rating_table, rating_table), axis=0))
def _get_norm_of_each_item(self, rating_table):
  """Get the norm of each item vector.

  For each item, calculate the norm of the item vector.

  Parameters
  ----------
  rating_table : Spartan matrix of shape (M, N).
                 Each column represents the ratings of one item.

  Returns
  ---------
  item_norm : Spartan matrix of shape (N,).
              item_norm[i] equals || rating_table[:, i] ||.
  """
  ctx = blob_ctx.get()
  if isinstance(rating_table, array.distarray.DistArray):
    rating_table = expr.lazify(rating_table)

  res = expr.sqrt(expr.sum(expr.multiply(rating_table, rating_table),
                           axis=0,
                           tile_hint=(rating_table.shape[1] / ctx.num_workers,)))
  return res.force()
def fit(data, labels, T=50, la=1.0):
  '''
  Train an SVM model using the disdca (2013) algorithm.

  Args:
    data(Expr): points to be trained.
    labels(Expr): the correct labels of the training data.
    T(int): max training iterations.
    la(float): lambda parameter of this SVM model.
  '''
  w = expr.zeros((data.shape[1], 1), dtype=np.float64)
  alpha = expr.zeros((data.shape[0], 1), dtype=np.float64)

  for i in range(T):
    alpha = expr.shuffle(expr.retile(data, tile_hint=util.calc_tile_hint(data, axis=0)),
                         _svm_mapper,
                         kw={'labels': labels, 'alpha': alpha, 'w': w,
                             'lambda_n': la * data.shape[0]},
                         shape_hint=alpha.shape,
                         cost_hint={hash(labels): {'00': 0, '01': np.prod(labels.shape)},
                                    hash(alpha): {'00': 0, '01': np.prod(alpha.shape)}})
    w = expr.sum(data * alpha * 1.0 / la / data.shape[0], axis=0).reshape((data.shape[1], 1))
    w = w.optimized()
  return w
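# Usage sketch (added for illustration; not part of the original source). A
# minimal call to the SVM fit() above; the inputs are random placeholders, and
# the labels are assumed to be +1/-1 as is usual for this formulation (the exact
# encoding expected by _svm_mapper is not shown here).
def _svm_usage_sketch():
  import numpy as np
  from spartan import expr

  n, d = 10000, 20
  data = expr.rand(n, d)
  labels = expr.from_numpy(np.sign(np.random.randn(n, 1)))
  w = fit(data, labels, T=10, la=1.0)
  return w.glom()                                  # pull the weight vector to the master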
def als(A, la=0.065, alpha=40, implicit_feedback=False, num_features=20,
        num_iter=10, M=None):
  '''
  Compute the factorization A = U M' using the alternating least-squares (ALS)
  method, where `A` is the "ratings" matrix which maps from a user and item to
  a rating score, and `U` and `M` are the factor matrices representing user and
  item preferences.

  Args:
    A(Expr or DistArray): the rating matrix which maps from a user and item to a rating score.
    la(float): the lambda parameter of ALS.
    alpha(int): confidence parameter used on implicit feedback.
    implicit_feedback(bool): whether to use the implicit feedback method for ALS.
    num_features(int): dimension of the feature space.
    num_iter(int): max iterations to run.
  '''
  num_users = A.shape[0]
  num_items = A.shape[1]

  AT = expr.transpose(A)

  avg_rating = expr.sum(A, axis=0) * 1.0 / expr.count_nonzero(A, axis=0)

  M = expr.rand(num_items, num_features)
  M = expr.assign(M, np.s_[:, 0], avg_rating.reshape((avg_rating.shape[0], 1)))

  #A = expr.retile(A, tile_hint=util.calc_tile_hint(A, axis=0))
  #AT = expr.retile(AT, tile_hint=util.calc_tile_hint(AT, axis=0))
  for i in range(num_iter):
    # Recompute U.
    shape = (num_users, num_features)
    U = expr.outer((A, M), (0, None), fn=_solve_U_or_M_mapper,
                   fn_kw={'la': la, 'alpha': alpha,
                          'implicit_feedback': implicit_feedback, 'shape': shape},
                   shape=shape, dtype=np.float)

    # Recompute M.
    shape = (num_items, num_features)
    M = expr.outer((AT, U), (0, None), fn=_solve_U_or_M_mapper,
                   fn_kw={'la': la, 'alpha': alpha,
                          'implicit_feedback': implicit_feedback, 'shape': shape},
                   shape=shape, dtype=np.float)
  return U, M
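# Usage sketch (added for illustration; not part of the original source). It
# shows how als() above might be invoked on a ratings matrix and how a low-rank
# reconstruction could be formed from the factors; the matrix here is a random
# placeholder, and real usage assumes a sparse user-by-item ratings array
# produced elsewhere.
def _als_usage_sketch():
  from spartan import expr

  ratings = expr.rand(1000, 400)                   # users x items placeholder
  U, M = als(ratings, num_features=20, num_iter=5)
  # Approximate reconstruction: A ~= U M'
  approx = expr.dot(U, expr.transpose(M))
  return approx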
def test_optimization_reduced(self):
  na = np.random.rand(1000, 1000)
  nb = np.random.rand(1000, 1000)
  a = expr.from_numpy(na)
  b = expr.from_numpy(nb)

  c = a - b
  d = a + c
  f = c[200:900, 200:900]
  g = d[200:900, 200:900]
  h = f - g
  i = f + h
  j = h[100:500, 100:500]
  k = i[100:500, 100:500]
  l = expr.dot(j, k)
  m = j + k
  n = k - l
  o = n - m
  q = n + o
  r = q - m
  s = expr.sum(r)

  nc = na - nb
  nd = na + nc
  nf = nc[200:900, 200:900]
  ng = nd[200:900, 200:900]
  nh = nf - ng
  ni = nf + nh
  nj = nh[100:500, 100:500]
  nk = ni[100:500, 100:500]
  nl = np.dot(nj, nk)
  nm = nj + nk
  nn = nk - nl
  no = nn - nm
  nq = nn + no
  nr = nq - nm
  ns = np.sum(nr)

  # Our sum seems to reduce precision.
  Assert.all_eq(ns, s.optimized().glom(), tolerance=1e-6)
def als(A, la=0.065, alpha=40, implicit_feedback=False, num_features=20, num_iter=10):
  '''
  Compute the factorization A = U M' using the alternating least-squares (ALS)
  method, where `A` is the "ratings" matrix which maps from a user and item to
  a rating score, and `U` and `M` are the factor matrices representing user and
  item preferences.

  Args:
    A(Expr or DistArray): the rating matrix which maps from a user and item to a rating score.
    la(float): the lambda parameter of ALS.
    alpha(int): confidence parameter used on implicit feedback.
    implicit_feedback(bool): whether to use the implicit feedback method for ALS.
    num_features(int): dimension of the feature space.
    num_iter(int): max iterations to run.
  '''
  A = expr.force(A)
  AT = expr.shuffle(expr.ndarray((A.shape[1], A.shape[0]), dtype=A.dtype,
                                 tile_hint=(A.shape[1] / len(A.tiles), A.shape[0])),
                    _transpose_mapper, kw={'orig_array': A})

  num_items = A.shape[1]

  avg_rating = expr.sum(A, axis=0, tile_hint=(num_items / len(A.tiles),)) * 1.0 / \
               expr.count_nonzero(A, axis=0, tile_hint=(num_items / len(A.tiles),))

  M = expr.shuffle(expr.ndarray((num_items, num_features),
                                tile_hint=(num_items / len(A.tiles), num_features)),
                   _init_M_mapper, kw={'avg_rating': avg_rating})
  #util.log_warn('avg_rating:%s M:%s', avg_rating.glom(), M.glom())

  for i in range(num_iter):
    # Recompute U.
    U = expr.shuffle(A, _solve_U_or_M_mapper,
                     kw={'U_or_M': M, 'la': la, 'alpha': alpha,
                         'implicit_feedback': implicit_feedback})

    # Recompute M.
    M = expr.shuffle(AT, _solve_U_or_M_mapper,
                     kw={'U_or_M': U, 'la': la, 'alpha': alpha,
                         'implicit_feedback': implicit_feedback})
  return U, M
def test_linear_reg(self):
  _skip_if_travis()
  N_EXAMPLES = 10 * 1000 * 1000 * self.ctx.num_workers
  N_DIM = 10
  x = expr.rand(N_EXAMPLES, N_DIM,
                tile_hint=(N_EXAMPLES / self.ctx.num_workers, N_DIM)).astype(np.float32)
  y = expr.rand(N_EXAMPLES, 1,
                tile_hint=(N_EXAMPLES / self.ctx.num_workers, 1)).astype(np.float32)

  w = np.random.rand(N_DIM, 1).astype(np.float32)
  x = expr.eager(x)
  y = expr.eager(y)

  start = time.time()

  for i in range(5):
    yp = expr.dot(x, w)
    diff = x * (yp - y)
    grad = expr.sum(diff, axis=0, tile_hint=[N_DIM]).glom().reshape((N_DIM, 1))
    w = w - grad * 1e-6

  cost = time.time() - start
  self._verify_cost("linear_reg", cost)
def spectral_cluster(points, k=10, num_iter=10, similarity_measurement='rbf'):
  '''
  Cluster data points using the k-means spectral clustering method.

  Args:
    points(Expr or DistArray): the data points to be clustered.
    k(int): the number of clusters we need to generate.
    num_iter(int): the max number of iterations that the k-means clustering method runs.
    similarity_measurement(str): distance method used to measure similarity between two points.
  '''
  # Calculate the similarity for each pair of points to generate the adjacency matrix A.
  A = expr.shuffle(points, _row_similarity_mapper,
                   kw={'similarity_measurement': similarity_measurement})

  num_dims = A.shape[1]

  # Construct the diagonal matrix D.
  D = expr.sum(A, axis=1, tile_hint=(A.shape[0],))

  # Calculate the normalized Laplacian of the form: L = D^(-0.5)AD^(-0.5).
  L = expr.shuffle(A, _laplacian_mapper, kw={'D': D})

  # Perform eigen-decomposition using the Lanczos solver.
  overshoot = min(k * 2, num_dims)
  d, U = lanczos.solve(L, L, overshoot, True)
  U = U[:, 0:k]

  # Generate initial clusters by picking a row as a center if that row contains
  # the max eigenvalue in its column.
  init_clusters = U[np.argmax(U, axis=0)]

  # Run k-means clustering with init_clusters.
  kmeans = KMeans(k, num_iter)
  U = expr.from_numpy(U)
  centers, labels = kmeans.fit(U, init_clusters)

  return labels
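# Usage sketch (added for illustration; not part of the original source). A
# minimal call to spectral_cluster() above; the data is a random placeholder,
# the cluster count is arbitrary, and 'rbf' is just the default taken from the
# signature above.
def _spectral_cluster_usage_sketch():
  from spartan import expr

  points = expr.rand(2000, 8)
  labels = spectral_cluster(points, k=5, num_iter=10, similarity_measurement='rbf')
  return labels.glom()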
def center_data(X, y, fit_intercept, normalize=False):
  """
  Centers data to have mean zero along axis 0.

  This is here because nearly all linear models will want their data to be
  centered.
  """
  if fit_intercept:
    X_mean = X.mean(axis=0)
    X_mean = expr.reshape(X_mean, (1, X_mean.shape[0]))
    X -= X_mean

    if normalize:
      X_std = expr.sqrt(expr.sum(X ** 2, axis=0)).force()
      X_std[X_std == 0] = 1
      X /= X_std
    else:
      X_std = expr.ones(X.shape[1])

    y_mean = y.mean(axis=0)
    y -= y_mean
  else:
    X_mean = expr.zeros(X.shape[1])
    X_std = expr.ones(X.shape[1])
    y_mean = 0. if y.ndim == 1 else expr.zeros(y.shape[1], dtype=X.dtype)
  return X, y, X_mean, y_mean, X_std
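# Usage sketch (added for illustration; not part of the original source). It
# demonstrates what center_data() above returns: X shifted to zero column
# means (and optionally scaled), y shifted to zero mean, plus the statistics
# needed to undo the transform. The inputs are random placeholders.
def _center_data_usage_sketch():
  import numpy as np
  from spartan import expr

  X = expr.from_numpy(np.random.rand(100, 5))
  y = expr.from_numpy(np.random.rand(100, 1))
  X_c, y_c, X_mean, y_mean, X_std = center_data(X, y, fit_intercept=True, normalize=False)
  # Column sums of the centered data should now be (numerically) zero.
  return expr.sum(X_c, axis=0).glom()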
def kneighbors(self, X, n_neighbors=None):
  """Find the K-neighbors of a point and return the distances and indices.

  Parameters
  ----------
  X : array-like, last dimension same as that of fit data
      The new point.
  n_neighbors : int
      Number of neighbors to get (default is the value passed to the constructor).

  Returns
  -------
  dist : array
      Array representing the lengths to the points, only present if
      return_distance=True.
  ind : array
      Indices of the nearest points in the population matrix.
  """
  if n_neighbors is not None:
    self.n_neighbors = n_neighbors

  if isinstance(X, np.ndarray):
    X = expr.from_numpy(X)

  if self.algorithm in ('auto', 'brute'):
    X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
    fit_X_broadcast = expr.reshape(self.X, (1, self.X.shape[0], self.X.shape[1]))
    distances = expr.sum((X_broadcast - fit_X_broadcast)**2, axis=2)
    neigh_ind = expr.argsort(distances, axis=1)
    neigh_ind = neigh_ind[:, :n_neighbors].optimized().glom()
    neigh_dist = expr.sort(distances, axis=1)
    neigh_dist = expr.sqrt(neigh_dist[:, :n_neighbors]).optimized().glom()
    return neigh_dist, neigh_ind
  else:
    results = self.X.foreach_tile(mapper_fn=_knn_mapper,
                                  kw={'X': self.X, 'Q': X,
                                      'n_neighbors': self.n_neighbors,
                                      'algorithm': self.algorithm})
    dist = None
    ind = None
    # Get the KNN candidates for each tile of X, then find the real KNN.
    for k, v in results.iteritems():
      if dist is None:
        dist = v[0]
        ind = v[1]
      else:
        dist = np.concatenate((dist, v[0]), axis=1)
        ind = np.concatenate((ind, v[1]), axis=1)

    mask = np.argsort(dist, axis=1)[:, :self.n_neighbors]
    new_dist = np.array([dist[i][mask[i]] for i, r in enumerate(dist)])
    new_ind = np.array([ind[i][mask[i]] for i, r in enumerate(ind)])
    return new_dist, new_ind
def simulate(ts_all, te_all, lamb_all, num_paths):
  """Range over a number of independent products.

  :param ts_all: DistArray
    Start dates for a series of swaptions.
  :param te_all: DistArray
    End dates for a series of swaptions.
  :param lamb_all: DistArray
    Parameter values for a series of swaptions.
  :param num_paths: Int
    Number of paths used in random walk.

  :rtype: DistArray
  """
  swaptions = []
  i = 0
  for ts_a, te, lamb in zip(ts_all, te_all, lamb_all):
    for ts in ts_a:
      # start = time()
      print i
      time_structure = arange(None, 0, ts + DELTA, DELTA)
      maturity_structure = arange(None, 0, te, DELTA)

      ############# MODEL ###############
      # Variance reduction technique - Antithetic Variates.
      eps_tmp = randn(time_structure.shape[0] - 1, num_paths)
      eps = concatenate(eps_tmp, -eps_tmp, 1)

      # Forward LIBOR rates for the construction of the spot measure.
      f_kk = zeros((time_structure.shape[0], 2 * num_paths))
      f_kk = assign(f_kk, np.s_[0, :], F_0)

      # Plane kxN of simulated LIBOR rates.
      f_kn = ones((maturity_structure.shape[0], 2 * num_paths)) * F_0

      # Simulations of the plane f_kn for each time step.
      for t in xrange(1, time_structure.shape[0]):
        f_kn_new = f_kn[1:, :] * exp(lamb * mu(f_kn, lamb) * DELTA -
                                     0.5 * lamb * lamb * DELTA +
                                     lamb * eps[t - 1, :] * sqrt(DELTA))
        f_kk = assign(f_kk, np.s_[t, :], f_kn_new[0])
        f_kn = f_kn_new

      ############## PRODUCT ###############
      # Value of zero coupon bonds.
      zcb = ones((int((te - ts) / DELTA) + 1, 2 * num_paths))
      f_kn_modified = 1 + DELTA * f_kn
      for j in xrange(zcb.shape[0] - 1):
        zcb = assign(zcb, np.s_[j + 1], zcb[j] / f_kn_modified[j])

      # Swaption price at maturity.
      last_row = zcb[zcb.shape[0] - 1, :].reshape((20,))
      swap_ts = maximum(1 - last_row - THETA * DELTA * expr.sum(zcb[1:], 0), 0)

      # Spot measure used for discounting.
      b_ts = ones((2 * num_paths,))
      tmp = 1 + DELTA * f_kk
      for j in xrange(int(ts / DELTA)):
        b_ts *= tmp[j].reshape((20,))

      # Swaption price at time 0.
      swaption = swap_ts / b_ts

      # Save the expected value in bps and the std.
      me = mean((swaption[0:num_paths] + swaption[num_paths:]) / 2) * 10000
      st = std((swaption[0:num_paths] + swaption[num_paths:]) / 2) / sqrt(num_paths) * 10000

      swaptions.append([me.optimized().force(), st.optimized().force()])
      # print time() - start
      i += 1
  return swaptions
def fn3():
  a = expr.ones((N, N))
  b = expr.ones((N, N / 2))
  g = expr.dot(a, b) + expr.dot(expr.sum(a, axis=1).reshape((1, N)), b)
  return g
def gen_reduce(a):
  if hasattr(a, 'shape') and len(a.shape) > 0:
    return [expr.sum(a, axis=random.randrange(len(a.shape)))]
  return [a]
def als(A, la=0.065, alpha=40, implicit_feedback=False, num_features=20,
        num_iter=10, M=None):
  '''
  Compute the factorization A = U M' using the alternating least-squares (ALS)
  method, where `A` is the "ratings" matrix which maps from a user and item to
  a rating score, and `U` and `M` are the factor matrices representing user and
  item preferences.

  Args:
    A(Expr or DistArray): the rating matrix which maps from a user and item to a rating score.
    la(float): the lambda parameter of ALS.
    alpha(int): confidence parameter used on implicit feedback.
    implicit_feedback(bool): whether to use the implicit feedback method for ALS.
    num_features(int): dimension of the feature space.
    num_iter(int): max iterations to run.
  '''
  num_users = A.shape[0]
  num_items = A.shape[1]

  AT = expr.transpose(A)

  avg_rating = expr.sum(A, axis=0) * 1.0 / expr.count_nonzero(A, axis=0)

  M = expr.rand(num_items, num_features)
  M = expr.assign(M, np.s_[:, 0], avg_rating.reshape((avg_rating.shape[0], 1)))

  #A = expr.retile(A, tile_hint=util.calc_tile_hint(A, axis=0))
  #AT = expr.retile(AT, tile_hint=util.calc_tile_hint(AT, axis=0))
  for i in range(num_iter):
    # Recompute U.
    shape = (num_users, num_features)
    U = expr.outer((A, M), (0, None), fn=_solve_U_or_M_mapper,
                   fn_kw={'la': la, 'alpha': alpha,
                          'implicit_feedback': implicit_feedback, 'shape': shape},
                   shape=shape, dtype=np.float)

    # Recompute M.
    shape = (num_items, num_features)
    M = expr.outer((AT, U), (0, None), fn=_solve_U_or_M_mapper,
                   fn_kw={'la': la, 'alpha': alpha,
                          'implicit_feedback': implicit_feedback, 'shape': shape},
                   shape=shape, dtype=np.float)
  return U, M
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  Cluster data points using the fuzzy k-means method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the max iterations to run.
    m(float): the fuzziness parameter of fuzzy k-means.
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
  points = expr.force(points)
  num_dim = points.shape[1]
  if centers is None:
    centers = expr.rand(k, num_dim)

  labels = expr.zeros((points.shape[0],), dtype=np.int)

  for iter in range(num_iter):
    centers = expr.as_array(centers)
    points_broadcast = expr.reshape(points, (points.shape[0], 1, points.shape[1]))
    centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
    distances = expr.sum(expr.square(points_broadcast - centers_broadcast), axis=2)
    # Add a small epsilon to avoid division by zero.
    distances = distances + 0.00000000001
    util.log_info('distances shape %s' % str(distances.shape))
    distances_broadcast = expr.reshape(distances, (distances.shape[0], 1,
                                                   distances.shape[1]))
    distances_broadcast2 = expr.reshape(distances, (distances.shape[0],
                                                    distances.shape[1], 1))
    prob = 1.0 / expr.sum(expr.power(distances_broadcast / distances_broadcast2,
                                     2.0 / (m - 1)), axis=2)
    prob.force()

    counts = expr.sum(prob, axis=0)
    counts = expr.reshape(counts, (counts.shape[0], 1))
    labels = expr.argmax(prob, axis=1)
    centers = expr.sum(expr.reshape(points, (points.shape[0], 1, points.shape[1])) *
                       expr.reshape(prob, (prob.shape[0], prob.shape[1], 1)),
                       axis=0)

    # We assume the centers are small enough to be handled on the master.
    counts = counts.glom()
    centers = centers.glom()

    # If any centroids don't have any points assigned to them, reseed them.
    zcount_indices = (counts == 0).reshape(k)
    if np.any(zcount_indices):
      # One or more centroids may not have any points assigned to them, which
      # results in their position being the zero-vector. We reseed these
      # centroids with new random values and set their counts to 1 to avoid
      # division by zero.
      counts[zcount_indices, :] = 1
      centers[zcount_indices, :] = np.random.rand(np.count_nonzero(zcount_indices),
                                                  num_dim)

    centers = centers / counts
  return labels
def train(self):
  for i in range(self.iterations):
    diff = self.update()
    grad = expr.sum(diff, axis=0).optimized().glom().reshape((self.N_DIM, 1))
    self.w = self.w - grad * self.alpha
  return self.w
def simulate(ts_all, te_all, lamb_all, num_paths):
  '''Range over a number of independent products.

  :param ts_all: DistArray
    Start dates for a series of swaptions.
  :param te_all: DistArray
    End dates for a series of swaptions.
  :param lamb_all: DistArray
    Parameter values for a series of swaptions.
  :param num_paths: Int
    Number of paths used in random walk.

  :rtype: DistArray
  '''
  swaptions = []
  i = 0
  for ts_a, te, lamb in zip(ts_all, te_all, lamb_all):
    for ts in ts_a:
      #start = time()
      print i
      time_structure = arange(None, 0, ts + DELTA, DELTA)
      maturity_structure = arange(None, 0, te, DELTA)

      ############# MODEL ###############
      # Variance reduction technique - Antithetic Variates.
      eps_tmp = randn(time_structure.shape[0] - 1, num_paths)
      eps = concatenate(eps_tmp, -eps_tmp, 1)

      # Forward LIBOR rates for the construction of the spot measure.
      f_kk = zeros((time_structure.shape[0], 2*num_paths))
      f_kk = assign(f_kk, np.s_[0, :], F_0)

      # Plane kxN of simulated LIBOR rates.
      f_kn = ones((maturity_structure.shape[0], 2*num_paths))*F_0

      # Simulations of the plane f_kn for each time step.
      for t in xrange(1, time_structure.shape[0]):
        f_kn_new = f_kn[1:, :]*exp(lamb*mu(f_kn, lamb)*DELTA - 0.5*lamb*lamb*DELTA +
                                   lamb*eps[t - 1, :]*sqrt(DELTA))
        f_kk = assign(f_kk, np.s_[t, :], f_kn_new[0])
        f_kn = f_kn_new

      ############## PRODUCT ###############
      # Value of zero coupon bonds.
      zcb = ones((int((te-ts)/DELTA)+1, 2*num_paths))
      f_kn_modified = 1 + DELTA*f_kn
      for j in xrange(zcb.shape[0] - 1):
        zcb = assign(zcb, np.s_[j + 1], zcb[j] / f_kn_modified[j])

      # Swaption price at maturity.
      last_row = zcb[zcb.shape[0] - 1, :].reshape((20, ))
      swap_ts = maximum(1 - last_row - THETA*DELTA*expr.sum(zcb[1:], 0), 0)

      # Spot measure used for discounting.
      b_ts = ones((2*num_paths, ))
      tmp = 1 + DELTA * f_kk
      for j in xrange(int(ts/DELTA)):
        b_ts *= tmp[j].reshape((20, ))

      # Swaption price at time 0.
      swaption = swap_ts/b_ts

      # Save the expected value in bps and the std.
      me = mean((swaption[0:num_paths] + swaption[num_paths:])/2) * 10000
      st = std((swaption[0:num_paths] + swaption[num_paths:])/2)/sqrt(num_paths)*10000

      swaptions.append([me.optimized().force(), st.optimized().force()])
      #print time() - start
      i += 1
  return swaptions
def fit(self, X, centers=None, implementation='map2'):
  """Compute k-means clustering.

  Parameters
  ----------
  X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
  centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
  """
  num_dim = X.shape[1]
  num_points = X.shape[0]

  labels = expr.zeros((num_points, 1), dtype=np.int)

  if implementation == 'map2':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper,
                         fn_kw={"centers": centers},
                         shape=(X.shape[0],))

      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0],))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # If any centroids don't have any points assigned to them, reseed them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them, which
        # results in their position being the zero-vector. We reseed these
        # centroids with new random values and set their counts to 1 to avoid
        # division by zero.
        n_points = np.count_nonzero(zcount_indices)
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
    return centers, labels

  elif implementation == 'outer':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.outer((X, centers), (0, None), fn=kmeans_outer_dist_mapper,
                          shape=(X.shape[0],))
      #labels = expr.argmin(distances, axis=1)
      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0],))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # If any centroids don't have any points assigned to them, reseed them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them, which
        # results in their position being the zero-vector. We reseed these
        # centroids with new random values and set their counts to 1 to avoid
        # division by zero.
        n_points = np.count_nonzero(zcount_indices)
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

  elif implementation == 'broadcast':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      util.log_warn("k_means_ %d %d", i, time.time())
      X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
      centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
      distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
      labels = expr.argmin(distances, axis=1)
      center_idx = expr.arange((1, centers.shape[0]))
      matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
      matches = matches.astype(np.int64)
      counts = expr.sum(matches, axis=0)
      centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0],
                                                              matches.shape[1], 1)),
                         axis=0)

      counts = counts.optimized().glom()
      centers = centers.optimized().glom()

      # If any centroids don't have any points assigned to them, reseed them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them, which
        # results in their position being the zero-vector. We reseed these
        # centroids with new random values and set their counts to 1 to avoid
        # division by zero.
        n_points = np.count_nonzero(zcount_indices)
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

  elif implementation == 'shuffle':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      # Reset them to zero.
      new_centers = expr.ndarray((self.n_clusters, num_dim),
                                 reduce_fn=lambda a, b: a + b)
      new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int,
                                reduce_fn=lambda a, b: a + b)

      _ = expr.shuffle(X, _find_cluster_mapper,
                       kw={'d_pts': X,
                           'old_centers': centers,
                           'new_centers': new_centers,
                           'new_counts': new_counts,
                           'labels': labels},
                       shape_hint=(1,),
                       cost_hint={hash(labels): {'00': 0, '01': np.prod(labels.shape)}})
      _.force()

      new_counts = new_counts.glom()
      new_centers = new_centers.glom()

      # If any centroids don't have any points assigned to them, reseed them.
      zcount_indices = (new_counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them, which
        # results in their position being the zero-vector. We reseed these
        # centroids with new random values and set their counts to 1 to avoid
        # division by zero.
        n_points = np.count_nonzero(zcount_indices)
        new_counts[zcount_indices] = 1
        new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      new_centers = new_centers / new_counts
      centers = new_centers
    return centers, labels
def fit(data, labels, label_size, alpha=1.0):
  '''
  Train a standard naive bayes model.

  Args:
    data(Expr): documents to be trained.
    labels(Expr): the correct labels of the training data.
    label_size(int): the number of different labels.
    alpha(float): alpha parameter of the naive bayes model.
  '''
  # Calculate the document frequency.
  df = expr.reduce(data, axis=0,
                   dtype_fn=lambda input: input.dtype,
                   local_reduce_fn=lambda ex, data, axis: (data > 0).sum(axis),
                   accumulate_fn=np.add)
  idf = expr.log(data.shape[0] * 1.0 / (df + 1)) + 1

  # The normalized frequency of a feature in a document is the feature
  # frequency divided by the root mean square of the feature frequencies in
  # that document.
  square_sum = expr.reduce(data, axis=1,
                           dtype_fn=lambda input: input.dtype,
                           local_reduce_fn=lambda ex, data, axis: np.square(data).sum(axis),
                           accumulate_fn=np.add)
  rms = expr.sqrt(square_sum * 1.0 / data.shape[1])

  # Calculate the weight-normalized Tf-Idf.
  data = data / rms.reshape((data.shape[0], 1)) * idf.reshape((1, data.shape[1]))

  # Add up all the feature vectors with the same labels.
  #weights_per_label_and_feature = expr.ndarray((label_size, data.shape[1]), dtype=np.float64)
  #for i in range(label_size):
  #  i_mask = (labels == i)
  #  weights_per_label_and_feature = expr.assign(weights_per_label_and_feature, np.s_[i, :], expr.sum(data[i_mask, :], axis=0))
  weights_per_label_and_feature = expr.shuffle(expr.retile(data, tile_hint=util.calc_tile_hint(data, axis=0)),
                                               _sum_instance_by_label_mapper,
                                               target=expr.ndarray((label_size, data.shape[1]),
                                                                   dtype=np.float64,
                                                                   reduce_fn=np.add),
                                               kw={'labels': labels, 'label_size': label_size},
                                               cost_hint={hash(labels): {'00': 0, '01': np.prod(labels.shape)}})

  # Sum up all the weights for each label from the previous step.
  weights_per_label = expr.sum(weights_per_label_and_feature, axis=1)

  # Generate the naive bayes per_label_and_feature weights.
  weights_per_label_and_feature = expr.log((weights_per_label_and_feature + alpha) /
                                           (weights_per_label.reshape((weights_per_label.shape[0], 1)) +
                                            alpha * weights_per_label_and_feature.shape[1]))

  return {'scores_per_label_and_feature': weights_per_label_and_feature.optimized().force(),
          'scores_per_label': weights_per_label.optimized().force(),
          }
def fn3():
  a = expr.ones((N, N))
  b = expr.ones((N, N/2))
  g = expr.dot(a, b) + expr.dot(expr.sum(a, axis=1).reshape((1, N)), b)
  return g
def fit(self, X, centers=None, implementation='outer'):
  """Compute k-means clustering.

  Parameters
  ----------
  X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
  centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
  """
  num_dim = X.shape[1]
  num_points = X.shape[0]

  labels = expr.zeros((num_points, 1), dtype=np.int)

  if implementation == 'map2':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper,
                         fn_kw={"centers": centers},
                         shape=(X.shape[0],))

      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0],))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # If any centroids don't have any points assigned to them, reseed them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them, which
        # results in their position being the zero-vector. We reseed these
        # centroids with new random values and set their counts to 1 to avoid
        # division by zero.
        n_points = np.count_nonzero(zcount_indices)
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
    return centers, labels

  elif implementation == 'outer':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.outer((X, centers), (0, None), fn=kmeans_outer_dist_mapper,
                          shape=(X.shape[0],))
      #labels = expr.argmin(distances, axis=1)
      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0],))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # If any centroids don't have any points assigned to them, reseed them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them, which
        # results in their position being the zero-vector. We reseed these
        # centroids with new random values and set their counts to 1 to avoid
        # division by zero.
        n_points = np.count_nonzero(zcount_indices)
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

  elif implementation == 'broadcast':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      util.log_warn("k_means_ %d %d", i, time.time())
      X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
      centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
      distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
      labels = expr.argmin(distances, axis=1)
      center_idx = expr.arange((1, centers.shape[0]))
      matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
      matches = matches.astype(np.int64)
      counts = expr.sum(matches, axis=0)
      centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0],
                                                              matches.shape[1], 1)),
                         axis=0)

      counts = counts.optimized().glom()
      centers = centers.optimized().glom()

      # If any centroids don't have any points assigned to them, reseed them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them, which
        # results in their position being the zero-vector. We reseed these
        # centroids with new random values and set their counts to 1 to avoid
        # division by zero.
        n_points = np.count_nonzero(zcount_indices)
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

  elif implementation == 'shuffle':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      # Reset them to zero.
      new_centers = expr.ndarray((self.n_clusters, num_dim),
                                 reduce_fn=lambda a, b: a + b)
      new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int,
                                reduce_fn=lambda a, b: a + b)

      _ = expr.shuffle(X, _find_cluster_mapper,
                       kw={'d_pts': X,
                           'old_centers': centers,
                           'new_centers': new_centers,
                           'new_counts': new_counts,
                           'labels': labels},
                       shape_hint=(1,),
                       cost_hint={hash(labels): {'00': 0, '01': np.prod(labels.shape)}})
      _.force()

      new_counts = new_counts.glom()
      new_centers = new_centers.glom()

      # If any centroids don't have any points assigned to them, reseed them.
      zcount_indices = (new_counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them, which
        # results in their position being the zero-vector. We reseed these
        # centroids with new random values and set their counts to 1 to avoid
        # division by zero.
        n_points = np.count_nonzero(zcount_indices)
        new_counts[zcount_indices] = 1
        new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      new_centers = new_centers / new_counts
      centers = new_centers
    return centers, labels