def _k_means_mod(seeds, subset, num_clusters):
    """The KMeansMod() step of the algorithm"""
    clustering = _k_means(seeds, subset, num_clusters)
    centroids = clustering.cluster_centers_
    # labels_ returned by kmeans are arbitrarily numbered, so work out the
    # assignments ourselves from the returned centroids
    distances = distance_table(subset, centroids)
    assigned = set(distances.argmin(axis=1))
    missing = set(range(num_clusters)) - assigned
    if missing:
        # Some clusters came back empty: swap each orphaned seed for one of
        # the points whose nearest centroid is furthest away, then re-run
        furthest = _find_furthest(distances, len(missing))
        for replacement, clusterid in zip(furthest, missing):
            seeds[clusterid] = subset[replacement]
        clustering = _k_means(seeds, subset, num_clusters)
        centroids = clustering.cluster_centers_
    return centroids
def find_centers(self):
    """Main Initialisation interface method"""
    # i-iv) The point furthest from the centre, plus the two main axes
    first, axes = self._initialise()
    # v) Incrementally find points most remote from latest seed
    candidates = self._generate_candidates(first, axes)
    # Guard against the eternal problem of duplicate candidates
    if len(np.unique(candidates, axis=0)) < self._num_clusters:
        raise InitialisationException("Duplicate candidates found")
    # vi) Turn the candidates into means of initial clusters
    labels = kmeans.distance_table(self._data, candidates, axes).argmin(axis=1)
    means = [np.mean(self._data[labels == k, :], axis=0)
             for k in range(self._num_clusters)]
    return np.array(means)
def find_centers(self):
    """Repeatedly accept one centroid and strip away its cluster."""
    remaining = self._data
    centroids = []
    # Each pass keeps one centroid and removes its points, so the
    # number still to find counts down from k to 2
    for to_find in range(self._num_clusters, 1, -1):
        accepted = self._find_first_centroid(remaining)
        centroids.append(accepted)
        temp_centroids = np.array([accepted])
        while len(temp_centroids) < to_find:
            nxt = self._find_furthest(temp_centroids, remaining)
            temp_centroids = np.vstack((temp_centroids, nxt))
        # Drop the points whose nearest temp centroid is the accepted one
        nearest = np.argmin(distance_table(temp_centroids, remaining), axis=0)
        remaining = np.delete(remaining, np.where(nearest == 0)[0], axis=0)
    # Finally just get the mean of the remaining points
    centroids.append(np.mean(remaining, axis=0))
    return np.array(centroids)
def test_with_1_empty(self):
    """Seeds and data known to leave one empty cluster after k_means(),
    and thus trigger k_means_mod() to reassign a centroid"""
    seeds = np.array([
        [5.4, 3.0, 4.5, 1.5],
        [6.7, 3.0, 5.0, 1.7],
        [5.1, 3.8, 1.5, 0.3],  # Doesn't get any data points assigned
    ])
    data = np.array([
        # Assigned to 0 but is furthest, so becomes the new 2
        [6.4, 2.9, 4.3, 1.3],
        [6.3, 3.4, 5.6, 2.4],
        [6.8, 3.0, 5.5, 2.1],
        [5.0, 2.0, 3.5, 1.0],
        [5.8, 2.7, 5.1, 1.9],
    ])
    expected_labels = [2, 1, 1, 0, 0]
    expected_centroids = [
        [5.4, 2.35, 4.3, 1.45],
        [6.55, 3.2, 5.55, 2.25],
        [6.4, 2.9, 4.3, 1.3],  # The new 2
    ]
    result = bfinit._k_means_mod(seeds, data, len(seeds))
    result_labels = kmeans.distance_table(data, result).argmin(axis=1)
    np.testing.assert_array_equal(result_labels, expected_labels)
    np.testing.assert_array_equal(result, expected_centroids)
def _calc_density(self, point, latestdata):
    """Sum of distances to its nearest neighbours"""
    # Neighbourhood size scales inversely with the number of clusters
    neighbours = len(latestdata) // self._num_clusters + 1
    dists = distance_table(np.array([point]), latestdata)[0]
    # argpartition gives the indices of the `neighbours` smallest
    # distances without a full sort
    nearest_idx = np.argpartition(dists, neighbours)[:neighbours]
    return np.sum(dists[nearest_idx])
def test_distance_table(self):
    """Calculate matrix of distances between two sets of data points"""
    points = np.array([[1, 1], [2, 3], [4, 4]])
    centres = np.array([[2, 2], [3, 3]])
    table = mykm.distance_table(points, centres)
    # One row per data point, one column per centroid
    self.assertEqual(table.shape, (3, 2))
    # NB: the expected values are *squared* Euclidean distances,
    # e.g. (1,1) to (2,2) is 1**2 + 1**2 == 2
    expected = np.array([[2, 8], [1, 1], [8, 2]])
    self.assertTrue(np.array_equal(table, expected))
def find_centers(self): """Main method""" # 1-3) The most densely surrounded point is the first initial centroid centre_h = self._find_hdp() # 4) Add X_h to C as the first centroid centroids = np.array([centre_h]) # Find the remaining required centroids while len(centroids) < self._num_clusters: # 5) For each point xi, set D(xi)... distances = distance_table(self._data, centroids) mins_d = np.min(distances, axis=1) # 6) Find y as ... # Though why it's supposedly recalculated on each loop is puzzling dist_h = distance_table(np.array([centre_h]), self._data)[0] dist_h = dist_h[dist_h != 0] # Anderson skips the 0 one partition = np.partition(dist_h, self._how_many)[:self._how_many] my_y = sum(partition) # 7-8) Find the unique integer i so that... i = 0 accum_dist = 0 while accum_dist < my_y: accum_dist = accum_dist + mins_d[i] i = i + 1 # 9) Add X_i to C # But surely the i found here isn't a meaningful index to X? # It just looks like we're cycling thought the data in a way # that's highly dependent on its arbitrary order centroids = np.vstack((centroids, self._data[i])) return centroids
def find_centers(self):
    """Main method"""
    # Start from the point with the largest L2/Euclidean norm,
    # as suggested by the R kkz() documentation
    start = np.argmax(np.linalg.norm(self._data, axis=1))
    codebook = np.array([self._data[start]])
    while codebook.shape[0] < self._num_clusters:
        # Next code: the point whose nearest existing code is furthest
        nearest = distance_table(self._data, codebook).min(axis=1)
        pick = self._data[np.argmax(nearest)]
        codebook = np.append(codebook, [pick], axis=0)
    return codebook
def find_centers(self):
    """k-means++ style seeding.

    Picks the first centroid uniformly at random, then each subsequent
    centroid with probability proportional to its squared distance from
    the nearest centroid already chosen.
    """
    # Initial centroid, chosen uniformly at random
    randindex = np.random.choice(self._num_samples, replace=False)
    centroids = np.array([self._data[randindex]])
    # Remaining required centroids
    while len(centroids) < self._num_clusters:
        distances = kmeans.distance_table(self._data, centroids)
        # Squared distance of each point to its nearest existing centroid,
        # computed once instead of separately for numerator and denominator
        # NOTE(review): if distance_table already returns squared
        # distances, the **2 here weights by D^4 — confirm intent
        sq_d = distances.min(axis=1) ** 2
        probabilities = sq_d / np.sum(sq_d)
        randindex = np.random.choice(self._num_samples, replace=False,
                                     p=probabilities)
        centroids = np.append(centroids, [self._data[randindex]], axis=0)
    return centroids
def _find_hdp(self):
    """The highest density point"""
    # The table is symmetric, so summing over either axis gives each
    # point's total distance to all the others; the smallest total
    # marks the most densely surrounded point
    totals = distance_table(self._data, self._data).sum(axis=1)
    return self._data[totals.argmin()]
def _find_furthest(self, temp_centroids, latestdata):
    """The furthest-nearest point (exact opposite of Yuan)"""
    # For each point, the distance to its nearest centroid; pick the
    # point for which that distance is largest
    nearest = distance_table(latestdata, temp_centroids).min(axis=1)
    return latestdata[nearest.argmax()]
def _objective_function(data, centroids):
    """Sum of intra-cluster distances"""
    # Each point contributes its distance to the nearest centroid
    nearest = distance_table(data, centroids).min(axis=1)
    return np.sum(nearest)