def find_centroids(centroids, data, labels, pairwise_distances, zero_point, C):
    """Recompute centroid positions in place using weighted bincounts.

    Parameters
    ----------
    centroids : ndarray
        2-D output array with one row per centroid. Every column is
        overwritten with the per-label sum of the matching column of
        ``data`` and then divided in place by the per-label point count, so
        it should have a floating-point dtype.
    data : ndarray
        2-D array of points; ``data.shape[1]`` is the number of coordinates.
    labels : ndarray
        1-D array of non-negative integers giving the centroid index
        assigned to each row of ``data``.
    pairwise_distances : ndarray
        2-D array that is reduced with a minimum along axis 1. Presumably
        point-to-centroid distances with one row per point -- confirm
        against the caller.
    zero_point
        Not used by this implementation.
    C : int
        Number of centroids; used as ``minlength`` so that centroids with no
        assigned points still get an entry.

    Returns
    -------
    scalar
        Sum over the rows of ``pairwise_distances`` of each row's minimum.
    """
    # Number of points associated with each centroid.
    counts = np.bincount(labels, minlength=C)

    # A bincount weighted by one coordinate yields the per-centroid sum of
    # that coordinate. Weights must be 1-D, so each dimension is handled
    # separately.
    for dim in range(data.shape[1]):
        centroids[:, dim] = np.bincount(
            labels, weights=data[:, dim], minlength=C
        )

    # NumPy has no combined amin/argmin, so pairwise_distances is traversed
    # here independently of however the labels were derived from it.
    distance_sum = np.sum(np.amin(pairwise_distances, axis=1))

    # Avoid dividing by zero: a centroid with no points keeps its (zero) sum
    # instead of becoming NaN. The scalar 1 keeps the counts integral; an
    # int64/uint64 array mix would silently promote them to float64.
    counts = np.maximum(counts, 1)
    centroids /= counts[:, np.newaxis]
    return distance_sum
def find_centroids(data, labels, C, D):
    """Return the mean of the points assigned to each centroid.

    The points are sorted by label, split into one contiguous chunk per
    centroid, and each chunk is summed and divided by its size.

    Parameters
    ----------
    data : ndarray
        2-D array of points, one row per point with ``D`` columns.
    labels : ndarray
        1-D array of non-negative integers giving the centroid index
        assigned to each row of ``data``.
    C : int
        Number of centroids.
    D : int
        Number of coordinates per point.

    Returns
    -------
    ndarray
        Array of shape ``(C, D)`` holding the centroid positions. A centroid
        with no assigned points has all-zero coordinates.
    """
    # Sort the points by label so each centroid's points are contiguous.
    order = np.argsort(labels)
    sorted_points = data[order]

    # Cumulative counts give the end offset of each centroid's chunk.
    counts = np.bincount(labels, minlength=C)
    boundaries = np.cumsum(counts)

    # Splitting at every end offset yields one chunk per centroid, plus a
    # trailing empty chunk that is never read.
    centroids = np.empty((C, D), dtype=data.dtype)
    chunks = np.split(sorted_points, boundaries)
    # range, not the Python-2-only xrange, so this runs on Python 3.
    for idx in range(C):
        centroids[idx, :] = np.sum(chunks[idx], axis=0)

    # Avoid dividing by zero: a centroid with no points keeps its zero sum
    # rather than becoming NaN.
    counts = np.maximum(counts, 1)
    return centroids / counts[:, np.newaxis]
def find_centroids(centroids, data, labels, pairwise_distances, zero_point, C, D):
    """Recompute centroid positions in place using per-centroid masks.

    Parameters
    ----------
    centroids : ndarray
        2-D output array with one row per centroid. Each row is overwritten
        with the sum of the points assigned to that centroid and then
        divided in place by the point count, so it should have a
        floating-point dtype.
    data : ndarray
        2-D array of points, one row per point.
    labels : ndarray
        1-D array of non-negative integers giving the centroid index
        assigned to each row of ``data``.
    pairwise_distances : ndarray
        2-D array indexed as ``[point, centroid]``.
    zero_point
        Value substituted for the rows of ``data`` that do not belong to the
        centroid being summed. Presumably zeros broadcastable against
        ``data`` -- confirm against the caller.
    C : int
        Number of centroids.
    D : int
        Not used by this implementation.

    Returns
    -------
    float
        Sum over all points of ``pairwise_distances[point, labels[point]]``.
    """
    # Number of points associated with each centroid.
    counts = np.bincount(labels, minlength=C)

    # Build a label mask for each centroid and sum across all the points
    # associated with it.
    distance_sum = 0.0
    for idx in range(C):
        # Boolean mask selecting the points assigned to this centroid.
        in_cluster = labels == idx
        centroids[idx, :] = np.sum(
            np.where(in_cluster[..., np.newaxis], data, zero_point), axis=0
        )
        distance_sum += np.sum(
            np.where(in_cluster, pairwise_distances[:, idx], 0.0)
        )

    # Avoid dividing by zero: a centroid with no points keeps its summed
    # value instead of becoming NaN. The scalar 1 keeps the counts integral;
    # an int64/uint64 array mix would silently promote them to float64.
    counts = np.maximum(counts, 1)
    centroids /= counts[:, np.newaxis]
    return distance_sum