def kmeans_plus_plus(self, num_clusters=100, exponent=2):
    '''
    Select k initial clusters by kmeans++ scheme,
    see https://en.wikipedia.org/wiki/K-means%2B%2B

    The first center is a uniformly random row of ``self.data``. Every
    further center is drawn from the rows not chosen yet, with a weight of
    ``d ** exponent`` where ``d`` is the distance (as reported by
    ``stats_utils.euclidean_distance``) from the row to its *nearest*
    already-chosen center.

    :param num_clusters: number of centers to pick. Expected to lie between
        1 and the number of rows in ``self.data``; asking for more centers
        than there are rows exhausts the candidate pool and raises
        ``IndexError``.
    :param exponent: power applied to the nearest-center distance to obtain
        the sampling weight (2 gives the classic kmeans++ weighting).
    :return: array of shape ``(num_clusters, p)`` holding the chosen rows,
        where ``p`` is the number of columns of ``self.data``.
    '''
    X = self.data
    n, p = X.shape
    clusters = np.zeros([num_clusters, p])

    first_row = np.random.randint(0, n)
    clusters[0] = X[first_row]
    # np.delete returns a copy, so self.data itself is left untouched.
    X = np.delete(X, first_row, 0)

    # nearest[j] is the distance from candidate row j to its closest chosen
    # center. Adding a center can only shrink it, so it is enough to compare
    # against the newest center on each pass instead of all earlier ones.
    nearest = np.full(len(X), np.inf)

    print("Starting kmeans++ initialization...")
    for i in range(1, num_clusters):
        newest_center = clusters[i - 1]
        index = 0
        rank = 0
        for j, sample in enumerate(X):
            distance = stats_utils.euclidean_distance(newest_center, sample)
            if distance < nearest[j]:
                nearest[j] = distance
            weight = nearest[j] ** exponent
            if weight > 0:
                # weighted reservoir sampling: the row with the largest key
                # u ** (1 / w) wins. 1.0 forces true division on Python 2.
                new_rank = pow(np.random.rand(), 1.0 / weight)
            else:
                # The row coincides with a chosen center (zero weight): give
                # it the lowest key instead of dividing by zero.
                new_rank = 0.0
            if new_rank > rank:
                rank = new_rank
                index = j
        clusters[i] = X[index]
        # Drop the chosen row from both the pool and its cached distances so
        # the two stay aligned and no row is picked twice.
        X = np.delete(X, index, 0)
        nearest = np.delete(nearest, index)
    return clusters
def get_closest_cluster_index(self, sample, clusters):
    '''
    Return the index of the entry in ``clusters`` that is closest to
    ``sample``, using ``stats_utils.euclidean_distance`` as the measure.

    Ties are resolved in favour of the lowest index, because only a
    strictly smaller distance replaces the current best.

    :param sample: the point to assign.
    :param clusters: indexable sequence of cluster centers.
    :return: position of the closest center, or ``None`` when ``clusters``
        is empty or no distance compares below infinity.
    '''
    # np.inf rather than the np.Inf alias, which NumPy 2.0 removed.
    smallest_distance = np.inf
    index = None
    for k, center in enumerate(clusters):
        distance = stats_utils.euclidean_distance(center, sample)
        if distance < smallest_distance:
            smallest_distance = distance
            index = k
    return index