def update(self, X, n_clusters, labels, k, l, id):
    """Incrementally refresh the cached state for clusters ``k`` and ``l``.

    NOTE(review): ``k``/``l`` look like the old and new cluster of point
    ``id`` and ``labels`` the labelling after the move -- confirm with callers.

    Only the numerators, cached cross-cluster distances and accumulator
    entries of ``k`` and ``l`` are recomputed.

    Returns:
        ``sum(self.accumulator) / len(labels)``.
    """
    n_points = len(labels)
    moved = X[id]
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    self.centroids = cluster_centroid.update_centroids(
        np.copy(self.centroids), np.copy(self.cluster_sizes), moved, k, l)

    # Rebuild the point-to-centroid distance sums of the two touched clusters.
    self.numerators[k] = 0.0
    self.numerators[l] = 0.0
    for idx in range(n_points):
        label = labels[idx]
        if label == k or label == l:
            self.numerators[label] += utils.euclidian_dist(
                X[idx], self.centroids[label])

    # Distances between ``id`` and points labelled ``k`` are (re)computed;
    # entries between ``id`` and points labelled ``l`` are zeroed.
    for idx in range(n_points):
        if labels[idx] == k:
            dist_to_moved = utils.euclidian_dist(X[idx], X[id])
            self.inner_max_dists[idx][id] = dist_to_moved
            self.inner_max_dists[id][idx] = dist_to_moved
        if labels[idx] == l:
            self.inner_max_dists[idx][id] = 0
            self.inner_max_dists[id][idx] = 0
    self.outer_min_dists[l][id] = sys.float_info.max

    for cluster in (k, l):
        for idx in range(n_points):
            if labels[idx] == cluster:
                continue
            row = self.inner_max_dists[idx]
            # Largest cached distance from point ``idx`` to a member of ``cluster``.
            farthest = max(
                [0] + [row[j] for j in range(len(row)) if labels[j] == cluster])
            if farthest != 0:
                self.outer_min_dists[cluster][idx] = farthest
        self.accumulator[cluster] = (
            self.numerators[cluster] / np.amin(self.outer_min_dists[cluster]))
    return sum(self.accumulator) / n_points
def update(self, X, n_clusters, labels, k, l, id):
    """Incrementally refresh ``s_c``, ``n_w`` and the cached extreme distances.

    NOTE(review): ``k``/``l`` look like the old and new cluster of point
    ``id`` and ``labels`` the labelling after the move -- confirm with callers.

    ``s_min`` / ``s_max`` are only re-selected when ``n_w`` changed by more
    than ``0.1 * len(labels)``; otherwise the cached selections are reused.

    Returns:
        ``(s_c - sum(s_min)) / (sum(s_max) - sum(s_min))``.
    """
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    self.centroids = cluster_centroid.update_centroids(
        np.copy(self.centroids), np.copy(self.cluster_sizes), X[id], k, l)

    # Distances from ``id`` to points labelled ``k`` leave the within-cluster
    # sum, distances to points labelled ``l`` enter it.
    for idx, label in enumerate(labels):
        if label == k:
            self.s_c -= utils.euclidian_dist(X[idx], X[id])
        if label == l:
            self.s_c += utils.euclidian_dist(X[idx], X[id])

    size_k = self.cluster_sizes[k]
    size_l = self.cluster_sizes[l]
    previous_n_w = self.n_w
    # Swap the pair counts of the old sizes (size_k + 1, size_l - 1) for the
    # pair counts of the current sizes.
    self.n_w = (self.n_w
                - (size_k + 1) * size_k / 2 + size_k * (size_k - 1) / 2
                - (size_l - 1) * (size_l - 2) / 2 + size_l * (size_l - 1) / 2)

    tolerance = 0.1
    if abs(self.n_w - previous_n_w) > tolerance * len(labels):
        self.s_min = heapq.nsmallest(int(self.n_w), self.distances)
        self.s_max = heapq.nlargest(int(self.n_w), self.distances)

    smallest_total = sum(self.s_min)
    largest_total = sum(self.s_max)
    return (self.s_c - smallest_total) / (largest_total - smallest_total)
def find(self, X, labels, n_clusters):
    """Compute the index from scratch and cache the state used by updates.

    The numerator is the smallest ``delta[i][j]`` over distinct clusters,
    where ``delta[i][j] = (sums[i] + sums[j]) / (size_i + size_j)`` and
    ``sums[c]`` is the summed point-to-centroid distance of cluster ``c``.
    The denominator is the largest distance between two points that share a
    label.

    Fixes relative to the previous version:
      * only same-cluster pairs are recorded in ``dists_same_c`` and counted
        in the "same cluster" maximum (previously every pair was);
      * ``dists_same_c`` is reset on every call instead of growing.

    Args:
        X: indexable collection of points.
        labels: cluster index of each point.
        n_clusters: number of clusters.

    Returns:
        ``-(min delta / max same-cluster distance)``.
    """
    n_points = len(labels)
    self.diameter = utils.find_diameter(X)
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    self.dists = [[0 for _ in range(n_points)] for _ in range(n_points)]
    self.centroid_dists = [0 for _ in range(n_points)]
    self.delta = [[0 for _ in range(n_clusters)] for _ in range(n_clusters)]
    self.sums = [0 for _ in range(n_clusters)]
    self.dists_same_c = []

    minimum_dif_c = sys.float_info.max  # min dist in different clusters
    # sys.float_info.min is the smallest positive float, so the denominator
    # is never exactly zero even without any same-cluster pair.
    maximum_same_c = sys.float_info.min  # max dist in the same cluster

    for i in range(n_points):
        self.centroid_dists[i] = utils.euclidian_dist(
            X[i], self.centroids[labels[i]])
        self.sums[labels[i]] += self.centroid_dists[i]

    for i in range(n_points - 1):
        for j in range(i + 1, n_points):
            self.dists[i][j] = utils.euclidian_dist(X[i], X[j])
            self.dists[j][i] = self.dists[i][j]
            if labels[i] == labels[j]:
                self.dists_same_c.append([i, j])
                maximum_same_c = max(self.dists[i][j], maximum_same_c)

    for i in range(n_clusters):
        for j in range(n_clusters):
            if i != j:
                self.delta[i][j] = (self.sums[i] + self.sums[j]) / float(
                    self.cluster_sizes[i] + self.cluster_sizes[j])
                minimum_dif_c = min(minimum_dif_c, self.delta[i][j])
    return -(minimum_dif_c / maximum_same_c)
def sv(X, labels, n_clusters):
    """Return the negated SV index of a clustering.

    Numerator: for every cluster, the distance from its centroid to the
    nearest *other* centroid, summed over clusters.
    Denominator: for every cluster, ``10 / |C_k|`` times the sum of the
    ``ceil(0.1 * |C_k|)`` largest point-to-centroid distances.

    Fixes relative to the previous version:
      * the nearest-centroid search now considers every other cluster, not
        only those with a larger index (the last cluster used to contribute
        nothing);
      * the local list no longer shadows the builtin ``list``.

    Args:
        X: indexable collection of points.
        labels: cluster index of each point.
        n_clusters: number of clusters.

    Returns:
        ``-numerator / denominator``.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    numerator = 0.0
    for k in range(n_clusters):
        to_others = [
            utils.euclidian_dist(centroids[k], centroids[other])
            for other in range(n_clusters) if other != k
        ]
        # With a single cluster there is no neighbour; contribute nothing,
        # as before.
        if to_others:
            numerator += min(to_others)

    denominator = 0.0
    for k in range(n_clusters):
        member_dists = [
            utils.euclidian_dist(X[i], centroids[k])
            for i in range(len(labels)) if labels[i] == k
        ]
        # Sum of the 0.1 * |C_k| largest member-to-centroid distances.
        n_top = int(math.ceil(0.1 * cluster_sizes[k]))
        acc = sum(heapq.nlargest(n_top, member_dists), 0.0)
        denominator += acc * 10.0 / cluster_sizes[k]
    return -numerator / denominator
def cs_index(X, labels, n_clusters):
    """Return the CS index of a clustering.

    Numerator: for every point, the distance to the farthest point of its own
    cluster divided by that cluster's size, summed over all points.
    Denominator: for every cluster, the distance from its centroid to the
    nearest other centroid, summed over clusters.

    Fixes relative to the previous version:
      * the farthest-neighbour search covers every pair of points; it used to
        skip the last point and, for point ``i``, all points before ``i``;
      * the nearest-centroid search covers every other cluster; the last
        cluster used to add ``sys.float_info.max`` to the denominator;
      * the per-point maximum starts at ``0.0`` (``sys.float_info.min`` is a
        tiny positive number, not a lower bound).

    Args:
        X: 2-D array of points (``X.shape`` is unpacked into two values).
        labels: cluster index of each point.
        n_clusters: number of clusters.

    Returns:
        ``numerator / denominator``.

    Raises:
        AssertionError: if the denominator is zero.
    """
    elements, _ = X.shape
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    # max_dists[i]: distance from point i to the farthest point of its cluster.
    max_dists = [0.0] * elements
    for i in range(elements - 1):
        for j in range(i + 1, elements):
            if labels[i] != labels[j]:
                continue
            pair_dist = utils.euclidian_dist(X[i], X[j])
            max_dists[i] = max(max_dists[i], pair_dist)
            max_dists[j] = max(max_dists[j], pair_dist)

    numerator = 0.0
    for i in range(elements):
        numerator += max_dists[i] / cluster_sizes[labels[i]]

    denominator = 0.0
    for i in range(n_clusters):
        # Stays at float max only when there is no other cluster at all.
        min_centroids_dist = sys.float_info.max
        for j in range(n_clusters):
            if j != i:
                min_centroids_dist = min(
                    utils.euclidian_dist(centroids[i], centroids[j]),
                    min_centroids_dist)
        denominator += min_centroids_dist
    assert denominator != 0.0
    return numerator / denominator
def dunn53(X, labels, n_clusters):
    """Return the negated generalized Dunn index built from centroid distances.

    With ``S_c`` the summed point-to-centroid distance of cluster ``c`` and
    ``n_c`` its size:

      * separation of clusters ``a`` and ``b``: ``(S_a + S_b) / (n_a + n_b)``;
      * diameter of cluster ``c``: ``2 * S_c / n_c``.

    The result is ``-(min separation) / (max diameter)``.

    Fixes relative to the previous version:
      * the diagonal of the separation matrix (always zero) no longer enters
        the minimum, which forced the result to ``-0.0``;
      * the diameter is doubled; it used to have ``2.0`` added to it;
      * sums are taken once per point; the old loop visited only the first
        half of the rows and added one term per partner point;
      * empty clusters are skipped instead of dividing by zero.

    NOTE(review): the formulas follow the ``find``/``update`` methods
    elsewhere in this file that use the same sums -- confirm this is the
    intended definition.

    Args:
        X: indexable collection of points.
        labels: cluster index of each point.
        n_clusters: number of clusters.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)

    point_in_c = [0] * n_clusters
    sums = [0.0] * n_clusters
    for i in range(len(labels)):
        point_in_c[labels[i]] += 1
        sums[labels[i]] += utils.euclidian_dist(X[i], centroids[labels[i]])

    minimum_dif_c = sys.float_info.max  # min dist in different clusters
    for a in range(n_clusters - 1):
        for b in range(a + 1, n_clusters):
            pooled = point_in_c[a] + point_in_c[b]
            if pooled == 0:
                continue
            minimum_dif_c = min(
                minimum_dif_c, (sums[a] + sums[b]) / float(pooled))

    # Smallest positive float as the start value avoids a zero denominator.
    maximum_same_c = sys.float_info.min  # max dist in the same cluster
    for c in range(n_clusters):
        if point_in_c[c] != 0:
            maximum_same_c = max(
                2.0 * sums[c] / point_in_c[c], maximum_same_c)
    return -minimum_dif_c / maximum_same_c
def find(self, X, labels, n_clusters):
    """Compute the index from scratch and cache the state used by updates.

    Numerator: smallest centroid-to-centroid distance over pairs of distinct
    labels that occur in ``labels``.
    Denominator: largest ``2 * sums[c] / cluster_sizes[c]``, where
    ``sums[c]`` is the summed point-to-centroid distance of cluster ``c``.

    Changes relative to the previous version:
      * the minimum is taken over the set of occurring labels instead of
        scanning every pair of points; the result is the same;
      * clusters of size zero keep their (zero) sum instead of dividing by
        zero, matching the guard in the companion ``update``.

    Args:
        X: indexable collection of points.
        labels: cluster index of each point.
        n_clusters: number of clusters.

    Returns:
        ``-(min centroid distance / max scaled sum)``.
    """
    self.diameter = utils.find_diameter(X)
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    self.sums = [0 for _ in range(n_clusters)]
    self.centroid_dists = [0 for _ in range(len(labels))]
    for i in range(len(labels)):
        self.centroid_dists[i] = utils.euclidian_dist(
            X[i], self.centroids[labels[i]])
        self.sums[labels[i]] += self.centroid_dists[i]

    # The diagonal keeps float max so it never wins a minimum.
    self.centers = np.array(
        [[sys.float_info.max] * n_clusters for _ in range(n_clusters)])
    for i in range(n_clusters):
        for j in range(n_clusters):
            if i != j:
                self.centers[i][j] = utils.euclidian_dist(
                    self.centroids[i], self.centroids[j])

    minimum_dif_c = sys.float_info.max  # min dist in different clusters
    present = set(labels)
    for a in present:
        for b in present:
            if a != b:
                minimum_dif_c = min(self.centers[a][b], minimum_dif_c)

    denominator = list(self.sums)
    for i in range(n_clusters):
        if self.cluster_sizes[i] != 0:
            denominator[i] *= (2 / self.cluster_sizes[i])
    return -(minimum_dif_c / max(denominator))
def dunn43(X, labels, n_clusters):
    """Return the negated generalized Dunn index using centroid separation.

    Separation of two clusters is the distance between their centroids; the
    diameter of cluster ``c`` is ``2 * S_c / n_c`` with ``S_c`` the summed
    point-to-centroid distance and ``n_c`` the cluster size.  The result is
    ``-(min separation) / (max diameter)``.

    Fixes relative to the previous version:
      * the diameter is doubled; it used to have ``2.0`` added to it;
      * each point-to-centroid distance is summed once; it used to be added
        once per same-cluster partner, so the later division by the size
        produced the sum rather than the mean;
      * the O(points^2) scan is replaced by a scan over non-empty clusters,
        which covers the same label pairs.

    NOTE(review): the diameter formula follows the ``find`` method elsewhere
    in this file that uses the same separation -- confirm it is intended.

    Args:
        X: indexable collection of points.
        labels: cluster index of each point.
        n_clusters: number of clusters.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)

    point_in_c = [0] * n_clusters
    sums = [0.0] * n_clusters
    for i in range(len(labels)):
        point_in_c[labels[i]] += 1
        sums[labels[i]] += utils.euclidian_dist(X[i], centroids[labels[i]])

    occupied = [c for c in range(n_clusters) if point_in_c[c] > 0]

    minimum_dif_c = sys.float_info.max  # min dist in different clusters
    for pos, a in enumerate(occupied):
        for b in occupied[pos + 1:]:
            minimum_dif_c = min(
                utils.euclidian_dist(centroids[a], centroids[b]),
                minimum_dif_c)

    # Smallest positive float as the start value avoids a zero denominator.
    maximum_same_c = sys.float_info.min  # max dist in the same cluster
    for c in occupied:
        maximum_same_c = max(2.0 * sums[c] / point_in_c[c], maximum_same_c)
    return -minimum_dif_c / maximum_same_c
def update(self, X, n_clusters, labels, k, l, id):
    """Incrementally refresh the cached state for clusters ``k`` and ``l``.

    NOTE(review): ``k``/``l`` look like the old and new cluster of point
    ``id`` and ``labels`` the labelling after the move -- confirm with callers.

    Fixes relative to the previous version:
      * centroid distances are refreshed for *every* cluster paired with
        ``k`` or ``l``; only pairs with a larger index used to be updated,
        leaving entries for smaller-indexed clusters stale;
      * the "did the centroid move" tests are evaluated once instead of once
        per point.

    NOTE(review): entries are written at ``[smaller][larger]``, which matches
    the original write pattern (``[k][i]`` with ``i > k``) -- confirm that
    ``dist_centroids`` is meant to be upper-triangular.

    Returns:
        ``-(max centroid distance / (sum(dist_ps) * n_clusters))``.
    """
    point = X[id]
    prev_centroids = np.copy(self.centroids)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    self.centroids = cluster_centroid.update_centroids(
        np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

    for changed in (k, l):
        for other in range(n_clusters):
            if other == changed:
                continue
            lo, hi = min(changed, other), max(changed, other)
            self.dist_centroids[lo][hi] = utils.euclidian_dist(
                self.centroids[other], self.centroids[changed])
    numerator = np.amax(self.dist_centroids)

    # Per-point terms are recomputed only when the owning centroid moved by
    # more than a small fraction of the data diameter.
    delta = 10**(-math.log(len(X), 10) - 1)
    moved_k = utils.euclidian_dist(
        prev_centroids[k], self.centroids[k]) > delta * self.diameter
    moved_l = utils.euclidian_dist(
        prev_centroids[l], self.centroids[l]) > delta * self.diameter
    for i in range(len(labels)):
        if (labels[i] == k and moved_k) or (labels[i] == l and moved_l):
            self.dist_ps[i] = utils.d_ps(
                X, labels, X[i], labels[i], self.centroids)

    denominator = sum(self.dist_ps)
    return -(numerator / (denominator * n_clusters))
def c_ind(X, labels, n_clusters):
    """Return the C-index of a clustering.

    ``s_c`` is the sum of distances over pairs of points sharing a label,
    ``n_w`` the number of such pairs, and ``s_min`` / ``s_max`` the sums of
    the ``n_w`` smallest / largest pairwise distances overall.

    Fixes relative to the previous version:
      * ``s_c`` is restricted to same-cluster pairs ``i < j``; it used to sum
        distances regardless of labels over ``i`` in all rows and ``j`` in
        the first half of the rows;
      * pairwise distances are computed once and shared;
      * the dot product with a vector of ones is replaced by ``sum``.

    Args:
        X: indexable collection of points.
        labels: cluster index of each point.
        n_clusters: number of clusters.

    Returns:
        ``(s_c - s_min) / (s_max - s_min)``.
    """
    n_points = len(labels)
    s_c = 0
    distances = []
    for i in range(n_points - 1):
        for j in range(i + 1, n_points):
            pair_dist = utils.euclidian_dist(X[i], X[j])
            distances.append(pair_dist)
            if labels[i] == labels[j]:
                s_c += pair_dist

    cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    n_w = 0
    for k in range(n_clusters):
        n_w += cluster_sizes[k] * (cluster_sizes[k] - 1) / 2
    n_w = int(n_w)

    s_min_c = sum(heapq.nsmallest(n_w, distances))
    s_max_c = sum(heapq.nlargest(n_w, distances))
    return (s_c - s_min_c) / (s_max_c - s_min_c)
def update(self, X, n_clusters, labels, k, l, id):
    """Incrementally refresh the cached state for clusters ``k`` and ``l``.

    NOTE(review): ``k``/``l`` look like the old and new cluster of point
    ``id`` and ``labels`` the labelling after the move -- confirm with callers.

    Numerator: sum over clusters of the distance to the nearest other
    centroid.  Denominator: per cluster, ``10 / size`` times the sum of the
    ``ceil(0.1 * size)`` largest cached point-to-centroid distances.

    Fixes relative to the previous version:
      * centroid distances are refreshed for every cluster paired with ``k``
        or ``l``; pairs with a smaller-indexed cluster used to stay stale;
      * the distance of point ``id`` to the centroid of ``l`` is always
        recorded; it used to stay at its old value unless that centroid
        moved past the threshold;
      * the "did the centroid move" tests are evaluated once, not per point.

    Returns:
        ``-(numerator / denominator)``.
    """
    point = X[id]
    prev_centroids = np.copy(self.centroids)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    self.centroids = cluster_centroid.update_centroids(
        np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

    for changed in (k, l):
        for other in range(n_clusters):
            if other == changed:
                continue
            centroid_gap = utils.euclidian_dist(
                self.centroids[other], self.centroids[changed])
            self.centroid_dists[changed][other] = centroid_gap
            self.centroid_dists[other][changed] = centroid_gap

    numerator = 0.0
    for i in range(n_clusters):
        numerator += np.amin(self.centroid_dists[i])

    # Point ``id`` no longer contributes to ``k`` and now contributes to ``l``.
    self.dists[k][id] = 0.
    self.dists[l][id] = utils.euclidian_dist(point, self.centroids[l])

    # Other cached distances are refreshed only when the owning centroid
    # moved by more than a small fraction of the data diameter.
    delta = 10**(-math.log(len(X), 10) - 1)
    moved_k = utils.euclidian_dist(
        prev_centroids[k], self.centroids[k]) > delta * self.diameter
    moved_l = utils.euclidian_dist(
        prev_centroids[l], self.centroids[l]) > delta * self.diameter
    for i in range(len(labels)):
        if (labels[i] == k and moved_k) or (labels[i] == l and moved_l):
            self.dists[labels[i]][i] = utils.euclidian_dist(
                X[i], self.centroids[labels[i]])

    denominator = 0.0
    for c in range(n_clusters):
        # Sum of the 0.1 * |C_c| largest member-to-centroid distances.
        n_top = int(math.ceil(0.1 * self.cluster_sizes[c]))
        acc = sum(heapq.nlargest(n_top, self.dists[c]), 0.0)
        denominator += acc * 10.0 / self.cluster_sizes[c]
    return -(numerator / denominator)
def find(self, X, labels, n_clusters):
    """Compute the index from scratch and cache the state used by updates.

    Caches ``s_c`` (sum of distances over same-label pairs), ``n_w`` (number
    of same-label pairs), every pairwise distance, and the ``n_w`` smallest
    and largest of those distances.

    Args:
        X: 2-D array of points (``X.shape`` is unpacked into two values).
        labels: cluster index of each point.
        n_clusters: number of clusters.

    Returns:
        ``(s_c - sum(s_min)) / (sum(s_max) - sum(s_min))``.
    """
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    self.diameter = utils.find_diameter(X)
    self.cluster_sizes = []
    self.distances = []
    self.s_c = 0
    self.n_w = 0

    n_rows, _ = X.shape
    for first in range(n_rows - 1):
        for second in range(first + 1, n_rows):
            if labels[first] == labels[second]:
                self.s_c += utils.euclidian_dist(X[first], X[second])

    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    for cluster in range(n_clusters):
        size = self.cluster_sizes[cluster]
        self.n_w += size * (size - 1) / 2

    n_points = len(labels)
    for first in range(n_points - 1):
        for second in range(first + 1, n_points):
            self.distances.append(utils.euclidian_dist(X[first], X[second]))

    n_pairs = int(self.n_w)
    self.s_min = heapq.nsmallest(n_pairs, self.distances)
    self.s_max = heapq.nlargest(n_pairs, self.distances)
    smallest_total = sum(self.s_min)
    largest_total = sum(self.s_max)
    return (self.s_c - smallest_total) / (largest_total - smallest_total)
def update(self, X, n_clusters, labels, k, l, id):
    """Incrementally refresh the cached sigmas and density term.

    NOTE(review): ``k``/``l`` look like the old and new cluster of point
    ``id`` and ``labels`` the labelling after the move -- confirm with callers.

    A cluster's sigma is recomputed only if its centroid moved by more than
    ``delta * self.diameter``; ``self.dens`` is rebuilt if either centroid
    moved that far, otherwise the cached value is reused.

    Returns:
        ``sum(sigmas) / (n_clusters * normed_sigma_x) + self.dens``.
    """
    point = X[id]
    prev_centroids = np.copy(self.centroids)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    self.centroids = cluster_centroid.update_centroids(
        np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

    delta = 10**(-math.log(len(X), 10) - 1)
    moved_k = utils.euclidian_dist(
        prev_centroids[k], self.centroids[k]) > delta * self.diameter
    if moved_k:
        self.sigmas[k] = self.normed_cluster_sigma(X, labels, k)
    moved_l = utils.euclidian_dist(
        prev_centroids[l], self.centroids[l]) > delta * self.diameter
    if moved_l:
        self.sigmas[l] = self.normed_cluster_sigma(X, labels, l)

    term1 = sum(self.sigmas) / (n_clusters * self.normed_sigma_x)
    stdev_val = self.stdev(n_clusters)

    if moved_k or moved_l:
        self.dens = 0.0
        for first in range(n_clusters):
            for second in range(n_clusters):
                between = self.den2(
                    X, labels, self.centroids, first, second, stdev_val)
                densest = max(
                    self.den1(X, labels, self.centroids, first, stdev_val),
                    self.den1(X, labels, self.centroids, second, stdev_val))
                self.dens += between / densest
        self.dens /= n_clusters * (n_clusters - 1)
    return (term1 + self.dens)
def find(self, X, labels, n_clusters):
    """Compute the index from scratch and cache the state used by updates.

    Numerator: smallest centroid-to-centroid distance over pairs of points
    with different labels.  Denominator: largest distance between two points
    with the same label.  All pairwise point distances, the centroid distance
    matrix and the list of same-label index pairs are cached on ``self``.

    Args:
        X: 2-D array of points (``X.shape`` is unpacked into two values).
        labels: cluster index of each point.
        n_clusters: number of clusters.

    Returns:
        ``-(min centroid distance / max same-cluster distance)``.
    """
    self.diameter = utils.find_diameter(X)
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    self.dist_same_c = []

    n_rows, _ = X.shape
    self.dists = [[0.] * n_rows for _ in range(n_rows)]

    closest_centroids = sys.float_info.max  # min dist in different clusters
    widest_cluster = sys.float_info.min  # max dist in the same cluster

    # The diagonal keeps float max so it never wins a minimum.
    self.centers = np.array(
        [[sys.float_info.max] * n_clusters for _ in range(n_clusters)])
    for a in range(n_clusters):
        for b in range(n_clusters):
            if a == b:
                continue
            self.centers[a][b] = utils.euclidian_dist(
                self.centroids[a], self.centroids[b])

    for first in range(n_rows - 1):
        for second in range(first + 1, n_rows):
            pair_dist = utils.euclidian_dist(X[first], X[second])
            self.dists[first][second] = pair_dist
            self.dists[second][first] = self.dists[first][second]
            if labels[first] == labels[second]:
                self.dist_same_c.append([first, second])
                widest_cluster = max(self.dists[first][second], widest_cluster)
            else:
                centroid_gap = self.centers[labels[first]][labels[second]]
                closest_centroids = min(centroid_gap, closest_centroids)
    return -(closest_centroids / widest_cluster)
def find(self, X, labels, n_clusters):
    """Compute the index from scratch and cache the state used by updates.

    Numerator: smallest non-zero entry of ``delta`` after each entry is
    divided by the product of the two cluster sizes.  Denominator: largest
    ``2 * sums[c] / point_in_c[c]``, where ``sums[c]`` is the summed
    point-to-centroid distance of cluster ``c``.

    Fixes relative to the previous version:
      * ``self.sums`` keeps the raw sums; it used to be scaled in place, so
        the companion ``update`` (which scales the cached sums itself)
        scaled untouched clusters twice;
      * clusters of size zero are skipped instead of dividing by zero.

    NOTE(review): a pair's distance is added at
    ``delta[labels[i]][labels[j]]`` with ``i < j``, so the total for two
    clusters can be split between ``delta[a][b]`` and ``delta[b][a]``.  That
    storage convention is kept because the companion ``update`` relies on it.

    Args:
        X: 2-D array of points (``X.shape`` is unpacked into two values).
        labels: cluster index of each point.
        n_clusters: number of clusters.

    Returns:
        ``-(min delta / max scaled sum)``.
    """
    n_points = len(labels)
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    self.dists = [[0. for _ in range(n_points)] for _ in range(n_points)]
    self.sums = [0 for _ in range(n_clusters)]
    rows, _ = X.shape
    self.point_in_c = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    self.delta_l = [[0.0] * n_clusters for _ in range(n_clusters)]
    self.delta = np.array(self.delta_l)
    self.centroid_dists = [0 for _ in range(n_points)]
    minimum_dif_c = sys.float_info.max

    for i in range(n_points):
        self.centroid_dists[i] = utils.euclidian_dist(
            X[i], self.centroids[labels[i]])
        self.sums[labels[i]] += self.centroid_dists[i]

    for i in range(rows - 1):
        for j in range(i + 1, rows):
            self.dists[i][j] = utils.euclidian_dist(X[i], X[j])
            self.dists[j][i] = self.dists[i][j]
            if labels[i] != labels[j]:
                self.delta[labels[i]][labels[j]] += self.dists[i][j]

    for i in range(n_clusters):
        for j in range(n_clusters):
            pair_count = self.point_in_c[i] * self.point_in_c[j]
            # With an empty cluster the entry is zero already.
            if pair_count != 0:
                self.delta[i][j] /= float(pair_count)
            if self.delta[i][j] != 0:
                minimum_dif_c = min(minimum_dif_c, self.delta[i][j])

    # Scale a copy so the raw sums stay available for later updates.
    denominator = list(self.sums)
    for i in range(n_clusters):
        if self.point_in_c[i] != 0:
            denominator[i] *= (2 / self.point_in_c[i])
    return -(minimum_dif_c / max(denominator))
def find(self, X, labels, n_clusters):
    """Compute the index from scratch and cache the state used by updates.

    Numerator: sum over clusters of the distance from its centroid to the
    nearest other centroid.  Denominator: per cluster, ``10 / size`` times
    the sum of the ``ceil(0.1 * size)`` largest point-to-centroid distances.

    Args:
        X: indexable collection of points.
        labels: cluster index of each point.
        n_clusters: number of clusters.

    Returns:
        ``-(numerator / denominator)``.
    """
    n_points = len(labels)
    self.diameter = utils.find_diameter(X)
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    # The diagonal keeps float max so it never wins a row minimum.
    self.centroid_dists = [
        [sys.float_info.max] * n_clusters for _ in range(n_clusters)]
    self.dists = [[0] * n_points for _ in range(n_clusters)]

    for a in range(n_clusters - 1):
        for b in range(a + 1, n_clusters):
            self.centroid_dists[a][b] = utils.euclidian_dist(
                self.centroids[a], self.centroids[b])
            self.centroid_dists[b][a] = self.centroid_dists[a][b]

    numerator = 0.0
    for row in self.centroid_dists:
        numerator += np.amin(row)

    # dists[c][i] holds the distance of point i to centroid c when i belongs
    # to c, and 0 otherwise.
    for cluster in range(n_clusters):
        for idx in range(n_points):
            if labels[idx] == cluster:
                self.dists[cluster][idx] = utils.euclidian_dist(
                    X[idx], self.centroids[cluster])

    denominator = 0.0
    for cluster in range(n_clusters):
        n_top = int(math.ceil(0.1 * self.cluster_sizes[cluster]))
        largest = heapq.nlargest(n_top, self.dists[cluster])
        denominator += sum(largest, 0.0) * 10.0 / self.cluster_sizes[cluster]
    return -(numerator / denominator)
def update(self, X, n_clusters, labels, k, l, id):
    """Incrementally recompute the index for clusters ``k`` and ``l``.

    NOTE(review): ``k``/``l`` look like the old and new cluster of point
    ``id`` and ``labels`` the labelling after the move -- confirm with callers.

    Point-to-centroid distances of a touched cluster are recomputed only if
    its centroid moved by more than ``dell * self.diameter``; the refreshed
    distances and sums are local and are not written back to ``self``.

    Returns:
        ``-(min delta / max scaled sum)``; the scaled sum of an empty
        cluster is infinity.
    """
    point = X[id]
    old_sizes = list(self.cluster_sizes)
    old_centroids = np.copy(self.centroids)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(
        np.copy(labels), n_clusters)
    self.centroids = cluster_centroid.update_centroids(
        self.centroids, self.cluster_sizes, point, k, l)
    smallest_delta = sys.float_info.max  # min dist in different clusters

    # Numerator: refresh point-to-centroid distances where needed.
    fresh_dists = list(self.centroid_dists)
    dell = 10**(-math.log(len(X), 10) - 1)
    moved_k = utils.euclidian_dist(
        old_centroids[k], self.centroids[k]) > dell * self.diameter
    moved_l = utils.euclidian_dist(
        old_centroids[l], self.centroids[l]) > dell * self.diameter
    for idx in range(len(labels)):
        if (labels[idx] == k and moved_k) or (labels[idx] == l and moved_l):
            fresh_dists[idx] = utils.euclidian_dist(
                X[idx], self.centroids[labels[idx]])

    for a in range(n_clusters):
        for b in range(n_clusters):
            self.delta[a][b] *= (old_sizes[a] + old_sizes[b])

    # Sums of untouched clusters are reused; k and l are rebuilt.
    fresh_sums = [
        0 if c == k or c == l else self.sums[c] for c in range(n_clusters)]
    for idx in range(len(labels)):
        if labels[idx] == k or labels[idx] == l:
            fresh_sums[labels[idx]] += fresh_dists[idx]

    for a in range(n_clusters):
        for b in range(n_clusters):
            if a == b:
                continue
            pooled = self.cluster_sizes[a] + self.cluster_sizes[b]
            if pooled == 0:
                self.delta[a][b] = float('inf')
            else:
                self.delta[a][b] = (fresh_sums[a] + fresh_sums[b]) / float(pooled)
            smallest_delta = min(smallest_delta, self.delta[a][b])

    # Denominator: scaled per-cluster sums.
    denominator = list(fresh_sums)
    for c in range(n_clusters):
        if self.cluster_sizes[c] == 0:
            denominator[c] = float('inf')
        else:
            denominator[c] *= (2 / self.cluster_sizes[c])
    return -(smallest_delta / max(denominator))
def update(self, X, n_clusters, labels, k, l, id):
    """Incrementally recompute the index for clusters ``k`` and ``l``.

    NOTE(review): ``k``/``l`` look like the old and new cluster of point
    ``id`` and ``labels`` the labelling after the move -- confirm with callers.

    ``delta`` is turned back into raw sums with the previous cluster sizes,
    the cached distances between ``id`` and every other point are removed
    from the ``k`` entries and added to the ``l`` entries, and the matrix is
    normalised again with the current sizes.

    Fixes relative to the previous version:
      * every entry of ``delta`` is re-normalised and considered for the
        minimum; only the upper triangle used to be, although every entry is
        un-normalised first and the companion ``find`` fills and scans the
        full matrix;
      * entries involving an empty cluster are left at zero instead of being
        divided by zero;
      * the "did the centroid move" tests are evaluated once, not per point.

    NOTE(review): ``utils.find_diameter(X)`` is recomputed on every call,
    presumably because the companion ``find`` does not cache it -- confirm.

    Returns:
        ``-(min non-zero delta / max scaled sum)``.
    """
    self.diameter = utils.find_diameter(X)
    prev_point_in_c = list(self.point_in_c)
    prev_centroids = np.copy(self.centroids)
    self.point_in_c = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    self.centroids = cluster_centroid.update_centroids(
        np.copy(self.centroids), np.copy(self.point_in_c), X[id], k, l)
    minimum_dif_c = sys.float_info.max  # min dist in different clusters

    # Numerator: back to raw between-cluster distance sums.
    for i in range(n_clusters):
        for j in range(n_clusters):
            self.delta[i][j] *= (prev_point_in_c[i] * prev_point_in_c[j])

    # A pair is stored under (label of lower-indexed point, label of
    # higher-indexed point).
    for i in range(len(labels)):
        if labels[i] != k and id < i:
            self.delta[k][labels[i]] -= self.dists[id][i]
        if labels[i] != k and id > i:
            self.delta[labels[i]][k] -= self.dists[i][id]
        if labels[i] != l and id < i:
            self.delta[l][labels[i]] += self.dists[id][i]
        if labels[i] != l and id > i:
            self.delta[labels[i]][l] += self.dists[i][id]

    for i in range(n_clusters):
        for j in range(n_clusters):
            pair_count = self.point_in_c[i] * self.point_in_c[j]
            if pair_count != 0:
                self.delta[i][j] /= float(pair_count)
            if self.delta[i][j] != 0:
                minimum_dif_c = min(minimum_dif_c, self.delta[i][j])

    # Denominator: refresh point-to-centroid distances only when the owning
    # centroid moved by more than a small fraction of the data diameter.
    new_centroid_dists = list(self.centroid_dists)
    dell = 10 ** (-math.log(len(X), 10) - 1)
    moved_k = utils.euclidian_dist(
        prev_centroids[k], self.centroids[k]) > dell * self.diameter
    moved_l = utils.euclidian_dist(
        prev_centroids[l], self.centroids[l]) > dell * self.diameter
    for i in range(len(labels)):
        if (labels[i] == k and moved_k) or (labels[i] == l and moved_l):
            new_centroid_dists[i] = utils.euclidian_dist(
                X[i], self.centroids[labels[i]])

    new_sums = [0 for _ in range(n_clusters)]
    for i in range(n_clusters):
        if i != k and i != l:
            new_sums[i] = self.sums[i]
    for i in range(len(labels)):
        if labels[i] == k or labels[i] == l:
            new_sums[labels[i]] += new_centroid_dists[i]

    denominator = list(new_sums)
    for i in range(n_clusters):
        if self.point_in_c[i] != 0:
            denominator[i] *= (2 / self.point_in_c[i])
    return -(minimum_dif_c / max(denominator))
def bcd_score(X, labels, n_clusters, centroids, cluster_sizes):
    """Return the size-weighted mean distance of centroids to the data mean.

    Each centroid's distance to ``np.mean(X, axis=0)`` is weighted by its
    cluster size; the total is divided by ``len(labels)`` and then by
    ``n_clusters``.

    Args:
        X: data array accepted by ``np.mean(X, axis=0)``.
        labels: cluster index of each point (only its length is used).
        n_clusters: number of clusters.
        centroids: per-cluster centroid, indexable by cluster index.
        cluster_sizes: per-cluster size, indexable by cluster index.
    """
    overall_mean = np.mean(X, axis=0)
    weighted_spread = sum(
        (cluster_sizes[c] * utils.euclidian_dist(centroids[c], overall_mean)
         for c in range(n_clusters)),
        0.0)
    return weighted_spread / len(labels) / n_clusters
def os(X, labels, n_clusters):
    """Return the negated ratio of summed ``ov`` values to a compactness term.

    Numerator: ``ov(X, labels, x, c)`` summed over every point ``x`` and its
    cluster ``c``, visited cluster by cluster.  Denominator: per cluster,
    ``10 / size`` times the sum of the ``ceil(0.1 * size)`` largest
    point-to-centroid distances.

    Args:
        X: indexable collection of points.
        labels: cluster index of each point.
        n_clusters: number of clusters.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    n_points = len(labels)

    numerator = 0.0
    for cluster in range(n_clusters):
        for idx in range(n_points):
            if labels[idx] == cluster:
                numerator += ov(X, labels, X[idx], cluster)

    denominator = 0.0
    for cluster in range(n_clusters):
        member_dists = [
            utils.euclidian_dist(X[idx], centroids[cluster])
            for idx in range(n_points) if labels[idx] == cluster
        ]
        n_top = int(math.ceil(0.1 * cluster_sizes[cluster]))
        largest = heapq.nlargest(n_top, member_dists)
        denominator += sum(largest, 0.0) * 10.0 / cluster_sizes[cluster]
    return -numerator / denominator
def find(self, X, labels, n_clusters):
    """Compute the index from scratch and cache the state used by updates.

    Numerator: smallest non-zero ``delta[a][b]`` with ``a < b`` after
    dividing by the product of the two cluster sizes.  Denominator: largest
    distance between two points with the same label.

    NOTE(review): a pair's distance is added at
    ``delta[labels[i]][labels[j]]`` with ``i < j``, so part of a cluster
    pair's total can land below the diagonal, which is neither normalised
    nor scanned here -- confirm this is intended.

    Args:
        X: 2-D array of points (``X.shape`` is unpacked into two values).
        labels: cluster index of each point.
        n_clusters: number of clusters.

    Returns:
        ``-min delta / max same-cluster distance``.
    """
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    self.diameter = utils.find_diameter(X)

    n_points = len(labels)
    self.dists = [[0.] * n_points for _ in range(n_points)]
    self.dist_same_c = []
    n_rows, _ = X.shape
    self.delta = np.array([[0.0] * n_clusters for _ in range(n_clusters)])

    smallest_between = sys.float_info.max  # min dist in different clusters
    largest_within = sys.float_info.min  # max dist in the same cluster

    for first in range(n_rows - 1):
        for second in range(first + 1, n_rows):
            self.dists[first][second] = utils.euclidian_dist(X[first], X[second])
            self.dists[second][first] = self.dists[first][second]
            if labels[first] == labels[second]:
                self.dist_same_c.append([first, second])
                largest_within = max(self.dists[first][second], largest_within)
            else:
                self.delta[labels[first]][labels[second]] += self.dists[first][second]

    for a in range(n_clusters - 1):
        for b in range(a + 1, n_clusters):
            self.delta[a][b] /= float(
                self.cluster_sizes[a] * self.cluster_sizes[b])
            if self.delta[a][b] != 0:
                smallest_between = min(smallest_between, self.delta[a][b])
    return -smallest_between / largest_within
def find(self, X, labels, n_clusters):
    """Compute the index from scratch and cache the state used by updates.

    For each cluster ``i`` the largest ``(s_i + s_j) / d(c_i, c_j)`` over the
    other clusters ``j`` is taken, where ``s`` comes from ``self.s`` and
    ``d`` is the centroid distance; pairs with zero centroid distance keep
    the value 0.  The result is the mean of these maxima.

    Args:
        X: indexable collection of points.
        labels: cluster index of each point.
        n_clusters: number of clusters.
    """
    self.diameter = utils.find_diameter(X)
    self.s_clusters = [0. for _ in range(n_clusters)]
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    db = 0
    self.points_in_clusters = cluster_centroid.count_cluster_sizes(
        labels, n_clusters)
    for cluster in range(n_clusters):
        self.s_clusters[cluster] = self.s(
            X, cluster, self.points_in_clusters, labels, self.centroids)

    self.sums = [[0] * n_clusters for _ in range(n_clusters)]
    for a in range(n_clusters):
        for b in range(n_clusters):
            if a == b:
                continue
            centroid_gap = utils.euclidian_dist(
                self.centroids[a], self.centroids[b])
            if centroid_gap != 0:
                self.sums[a][b] = (
                    self.s_clusters[a] + self.s_clusters[b]) / centroid_gap
        db += np.amax(self.sums[a])
    db /= float(n_clusters)
    return db
def s(self, X, cluster_k_index, cluster_sizes, labels, centroids):
    """Return the mean distance of a cluster's points to its centroid.

    Fix relative to the previous version: the ``cluster_sizes`` and
    ``centroids`` arguments are now used.  They were ignored in favour of
    ``self.cluster_sizes`` / ``self.centroids``, which breaks for a caller
    that keeps its sizes under another attribute name.

    Args:
        X: indexable collection of points.
        cluster_k_index: index of the cluster to measure.
        cluster_sizes: per-cluster size, indexable by cluster index.
        labels: cluster index of each point.
        centroids: per-cluster centroid, indexable by cluster index.

    Returns:
        The mean point-to-centroid distance, or ``float('inf')`` when the
        cluster is empty.
    """
    size = cluster_sizes[cluster_k_index]
    if size == 0:
        return float('inf')
    centroid = centroids[cluster_k_index]
    total = 0
    for i in range(len(labels)):
        if labels[i] == cluster_k_index:
            total += utils.euclidian_dist(X[i], centroid)
    return total / size
def update(self, X, n_clusters, labels, k, l, id):
    """Incrementally recompute the index for clusters ``k`` and ``l``.

    NOTE(review): ``k``/``l`` look like the old and new cluster of point
    ``id`` and ``labels`` the labelling after the move -- confirm with callers.

    Point-to-centroid distances of a touched cluster are recomputed only if
    its centroid moved by more than ``dell * self.diameter``; the refreshed
    distances and sums are local and are not written back to ``self``.  The
    centroid distance matrix ``self.centers`` is updated in place.

    Returns:
        ``-(min of self.centers / max scaled sum)``.
    """
    point = X[id]
    old_centroids = np.copy(self.centroids)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(
        np.copy(labels), n_clusters)
    self.centroids = cluster_centroid.update_centroids(
        np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

    # Denominator.
    fresh_dists = list(self.centroid_dists)
    dell = 10**(-math.log(len(X), 10) - 1)
    moved_k = utils.euclidian_dist(
        old_centroids[k], self.centroids[k]) > dell * self.diameter
    moved_l = utils.euclidian_dist(
        old_centroids[l], self.centroids[l]) > dell * self.diameter
    for idx in range(len(labels)):
        if (labels[idx] == k and moved_k) or (labels[idx] == l and moved_l):
            fresh_dists[idx] = utils.euclidian_dist(
                X[idx], self.centroids[labels[idx]])

    # Sums of untouched clusters are reused; k and l are rebuilt.
    fresh_sums = [
        0 if c == k or c == l else self.sums[c] for c in range(n_clusters)]
    for idx in range(len(labels)):
        if labels[idx] == k or labels[idx] == l:
            fresh_sums[labels[idx]] += fresh_dists[idx]

    denominator = list(fresh_sums)
    for c in range(n_clusters):
        if self.cluster_sizes[c] != 0:
            denominator[c] *= (2 / self.cluster_sizes[c])

    # Numerator: refresh the rows and columns of the two touched centroids.
    for other in range(n_clusters):
        if other != k:
            self.centers[other][k] = utils.euclidian_dist(
                self.centroids[other], self.centroids[k])
            self.centers[k][other] = self.centers[other][k]
        if other != l:
            self.centers[other][l] = utils.euclidian_dist(
                self.centroids[other], self.centroids[l])
            self.centers[l][other] = self.centers[other][l]
    closest_centroids = np.amin(self.centers)
    return -(closest_centroids / max(denominator))
def a(X, labels, x_i, cluster_k_index):
    """Return the mean distance from ``x_i`` to the points of one cluster.

    Every point whose label equals ``cluster_k_index`` is counted, including
    ``x_i`` itself if it is one of them.

    Raises:
        ZeroDivisionError: if no point carries that label.
    """
    member_dists = [
        utils.euclidian_dist(x_i, X[idx])
        for idx in range(len(labels)) if labels[idx] == cluster_k_index
    ]
    return sum(member_dists, 0.0) / len(member_dists)
def dl(X, labels, distance, n_clusters):
    """Count pairs of points in different clusters that are closer than ``distance``.

    Fix relative to the previous version: the loops run over all points.
    They used to run over ``range(n_clusters)`` while indexing ``labels`` and
    ``X`` as points, so only the first ``n_clusters`` points were compared.

    Args:
        X: indexable collection of points.
        labels: cluster index of each point.
        distance: exclusive upper bound on the pair distance.
        n_clusters: unused; kept so existing callers keep working.

    Returns:
        The number of unordered pairs with different labels whose distance is
        strictly below ``distance``.
    """
    n_points = len(labels)
    result = 0
    for first in range(n_points - 1):
        for second in range(first + 1, n_points):
            if labels[first] == labels[second]:
                continue
            if utils.euclidian_dist(X[first], X[second]) < distance:
                result += 1
    return result
def find(self, X, labels, n_clusters):
    """Compute the index from scratch and cache the state used by updates.

    For each cluster ``c`` the summed point-to-centroid distance is divided
    by the smallest value in ``outer_min_dists[c]``; each of those values is,
    for a point outside ``c``, its largest distance to a member of ``c``
    (entries without such a value stay at float max).  The result is the sum
    of these ratios divided by the number of points.

    Args:
        X: indexable collection of points.
        labels: cluster index of each point.
        n_clusters: number of clusters.
    """
    n_points = len(labels)
    self.diameter = utils.find_diameter(X)
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    self.numerators = [0.0] * n_clusters
    for idx in range(n_points):
        self.numerators[labels[idx]] += utils.euclidian_dist(
            X[idx], self.centroids[labels[idx]])

    self.inner_max_dists = [[0] * n_points for _ in range(n_points)]
    self.outer_min_dists = [
        [sys.float_info.max] * n_points for _ in range(n_clusters)]
    self.accumulator = [0 for _ in range(n_clusters)]

    # Cache the distance between every outside point and every member of
    # each cluster (stored symmetrically).
    for cluster in range(n_clusters):
        outsiders = [i for i in range(n_points) if labels[i] != cluster]
        members = [j for j in range(n_points) if labels[j] == cluster]
        for i in outsiders:
            for j in members:
                self.inner_max_dists[i][j] = utils.euclidian_dist(X[i], X[j])
                self.inner_max_dists[j][i] = self.inner_max_dists[i][j]

    for cluster in range(n_clusters):
        for idx in range(n_points):
            if labels[idx] == cluster:
                continue
            row = self.inner_max_dists[idx]
            # Largest cached distance from point ``idx`` to a member of ``cluster``.
            farthest = max(
                [0] + [row[j] for j in range(len(row)) if labels[j] == cluster])
            if farthest != 0:
                self.outer_min_dists[cluster][idx] = farthest
        self.accumulator[cluster] = (
            self.numerators[cluster] / np.amin(self.outer_min_dists[cluster]))
    return sum(self.accumulator) / n_points
def dunn41(X, labels, n_clusters):
    """Return the negated generalized Dunn index with centroid separation.

    Separation: smallest centroid-to-centroid distance over pairs of points
    with different labels.  Diameter: largest distance between two points
    with the same label.

    Fix relative to the previous version: every unordered pair of points is
    examined.  The outer loop used to stop at half the rows, so pairs whose
    two points both lie in the second half were never seen.

    Args:
        X: 2-D array of points (``X.shape`` is unpacked into two values).
        labels: cluster index of each point.
        n_clusters: number of clusters.

    Returns:
        ``-min separation / max diameter``.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    rows, _ = X.shape
    minimum_dif_c = sys.float_info.max  # min dist in different clusters
    # Smallest positive float as the start value avoids a zero denominator.
    maximum_same_c = sys.float_info.min  # max dist in the same cluster

    centers = np.array([[0.0] * n_clusters for _ in range(n_clusters)])
    for i in range(n_clusters):
        for j in range(n_clusters):
            centers[i][j] = utils.euclidian_dist(centroids[i], centroids[j])

    for i in range(rows - 1):
        for j in range(i + 1, rows):
            if labels[i] != labels[j]:
                dist = centers[labels[i]][labels[j]]
                minimum_dif_c = min(dist, minimum_dif_c)
            else:
                dist = utils.euclidian_dist(X[i], X[j])
                maximum_same_c = max(dist, maximum_same_c)
    return -minimum_dif_c / maximum_same_c
def update(self, X, n_clusters, labels, k, l, id):
    """Incrementally recompute the index for clusters ``k`` and ``l``.

    NOTE(review): ``k``/``l`` look like the old and new cluster of point
    ``id`` and ``labels`` the labelling after the move -- confirm with callers.

    A cluster's scatter (``self.s``) is recomputed only if its centroid moved
    by more than ``delta * self.diameter``.  The result is the mean over
    clusters of ``max(max_s_sum[i]) / min(min_centroids_dist[i])``.

    Fix relative to the previous version: the pair entries are refreshed for
    every cluster paired with ``k`` or ``l``; pairs with a smaller-indexed
    cluster used to stay stale.

    NOTE(review): entries are written at ``[smaller][larger]``, which matches
    the original write pattern (``[k][i]`` with ``i > k``) -- confirm that
    both matrices are meant to be upper-triangular.
    """
    point = X[id]
    prev_centroids = np.copy(self.centroids)
    delta = 10**(-math.log(len(X), 10) - 1)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    self.centroids = cluster_centroid.update_centroids(
        self.centroids, self.cluster_sizes, point, k, l)

    for changed in (k, l):
        moved = utils.euclidian_dist(
            prev_centroids[changed], self.centroids[changed])
        if moved > delta * self.diameter:
            self.s_clusters[changed] = self.s(
                X, changed, self.cluster_sizes, labels, self.centroids)

    for changed in (k, l):
        for other in range(n_clusters):
            if other == changed:
                continue
            lo, hi = min(changed, other), max(changed, other)
            self.max_s_sum[lo][hi] = (
                self.s_clusters[other] + self.s_clusters[changed])
            self.min_centroids_dist[lo][hi] = utils.euclidian_dist(
                self.centroids[other], self.centroids[changed])

    numerator = 0.0
    for i in range(n_clusters):
        numerator += np.max(self.max_s_sum[i]) / np.min(self.min_centroids_dist[i])
    return numerator / n_clusters
def dunn(X, labels):
    """Return the negated Dunn index of a clustering.

    Separation: smallest distance between two points with different labels.
    Diameter: largest distance between two points with the same label.

    Fix relative to the previous version: every unordered pair of points is
    examined.  The outer loop used to stop at half the rows, so pairs whose
    two points both lie in the second half were never seen.

    Args:
        X: 2-D array of points (``X.shape`` is unpacked into two values).
        labels: cluster index of each point.

    Returns:
        ``-min separation / max diameter``.
    """
    rows, _ = X.shape
    minimum_dif_c = sys.float_info.max  # min dist in different clusters
    # Smallest positive float as the start value avoids a zero denominator.
    maximum_same_c = sys.float_info.min  # max dist in the same cluster
    for i in range(rows - 1):
        for j in range(i + 1, rows):
            dist = utils.euclidian_dist(X[i], X[j])
            if labels[i] != labels[j]:
                minimum_dif_c = min(dist, minimum_dif_c)
            else:
                maximum_same_c = max(dist, maximum_same_c)
    return -minimum_dif_c / maximum_same_c