def _update_centroids(self, x: DNDarray, matching_centroids: DNDarray):
    """
    Recompute each cluster center as the mean of the samples in ``x`` currently assigned to it.

    Parameters
    ----------
    x : DNDarray
        Input data
    matching_centroids : DNDarray
        For every sample point in ``x``, the index ``i`` of the cluster ``ci`` it is assigned to

    Returns
    -------
    DNDarray
        A copy of ``self._cluster_centers`` whose slice ``[i:i + 1, :]`` has been overwritten
        with the freshly computed center for every cluster index ``i``. The stored
        ``self._cluster_centers`` itself is not modified here.
    """
    updated_centers = self._cluster_centers.copy()

    for cluster_idx in range(self.n_clusters):
        # 0/1 membership mask: one where the sample belongs to this cluster, zero elsewhere
        is_member = matching_centroids == cluster_idx
        member_mask = is_member.astype(ht.int64)

        # samples outside the cluster are multiplied by zero and drop out of the sum below
        masked_samples = x * member_mask

        # number of members, floored at one so an empty cluster never divides by zero
        member_count = member_mask.sum(axis=0, keepdim=True)
        member_count = member_count.clip(1.0, ht.iinfo(ht.int64).max)

        # scale each contribution by the member count first, then reduce over axis 0
        scaled_samples = masked_samples / member_count
        updated_centers[cluster_idx:cluster_idx + 1, :] = scaled_samples.sum(
            axis=0, keepdim=True
        )

    return updated_centers
def fit(self, X):
    """
    Run the k-means iterations on ``X`` and store the resulting centroids and labels.

    On return the estimator holds ``_cluster_centers`` (latest centroids), ``_labels``
    (squeezed cluster assignment of the last iteration), ``_n_iter`` (number of iterations
    performed) and ``_inertia`` (summed squared difference between the centroids of the last
    two iterations).

    Parameters
    ----------
    X : ht.DNDarray, shape=(n_samples, n_features)
        Training instances to cluster.

    Returns
    -------
    self
        The fitted estimator, to allow call chaining.

    Raises
    ------
    ValueError
        If ``X`` is not a ``ht.DNDarray``.
    """
    # reject anything that is not a DNDarray before touching any state
    if not isinstance(X, ht.DNDarray):
        message = "input needs to be a ht.DNDarray, but was {}".format(type(X))
        raise ValueError(message)

    # seed the centroids and reset the iteration counter
    self._initialize_cluster_centers(X)
    self._n_iter = 0

    # placeholder labels; these are what gets stored if the loop below never runs
    labels = ht.zeros(X.shape[0], split=X.split, device=X.device, comm=X.comm)

    # add a trailing axis; centroids are indexed along the third axis below
    X = X.expand_dims(axis=2)
    updated_centers = self._cluster_centers.copy()

    for _ in range(self.max_iter):
        self._n_iter += 1

        # assign every sample to a centroid
        labels = self._fit_to_cluster(X)

        # recompute every centroid as the mean of its assigned samples
        for cluster_idx in range(self.n_clusters):
            # 0/1 membership mask for the current cluster
            member_mask = (labels == cluster_idx).astype(ht.int64)

            # sum of member samples; non-members are multiplied by zero
            member_sum = (X * member_mask).sum(axis=0, keepdim=True)

            # member count, floored at one so an empty cluster never divides by zero
            member_count = member_mask.sum(axis=0, keepdim=True)
            member_count = member_count.clip(1.0, ht.iinfo(ht.int64).max)

            updated_centers[:, :, cluster_idx:cluster_idx + 1] = member_sum / member_count

        # centroid movement of this iteration, measured before the centroids are replaced
        displacement = self._cluster_centers - updated_centers
        self._inertia = (displacement ** 2).sum()
        self._cluster_centers = updated_centers.copy()

        # stop early once the movement falls to or below the tolerance, if one is set
        if self.tol is not None:
            if self._inertia <= self.tol:
                break

    self._labels = labels.squeeze()
    return self
def test_iinfo(self):
    """Check ``ht.iinfo`` for ``ht.int32`` and its rejection of non-integer-type arguments."""
    int32_info = ht.iinfo(ht.int32)

    # width and value range of a signed 32-bit integer
    self.assertEqual(int32_info.bits, 32)
    self.assertEqual(int32_info.max, 2 ** 31 - 1)
    self.assertEqual(int32_info.min, -(2 ** 31))

    # a float value, a float type and a type name given as a string must all raise TypeError
    self.assertRaises(TypeError, ht.iinfo, 1.0)
    self.assertRaises(TypeError, ht.iinfo, ht.float64)
    self.assertRaises(TypeError, ht.iinfo, 'int16')