def agglomerative_l_method(X, method='ward'):
    """Cluster ``X`` hierarchically, cutting at the L-method cluster count.

    The merge history is computed with ``linkage`` (fastcluster, per the
    original comment) using the Euclidean metric.  ``refined_l_method`` is
    given the candidate cluster counts ``2..len(X)`` together with the merge
    distances in reverse merge order, and returns the number of clusters to
    keep.  The first ``len(X) - cluster_count`` merges are then replayed
    through a ``DisjointSet`` to assign every sample to a cluster.

    Args:
        X: The samples; passed to ``linkage`` and must support ``len()``.
        method: Linkage method name forwarded to ``linkage``.

    Returns:
        ``Result(labels, centroids)`` where ``labels`` holds one cluster id
        per sample, numbered from 0 in order of first appearance, and
        ``centroids`` is whatever ``get_centroids(X, labels)`` returns.
    """
    n_samples = len(X)

    linkage_rows = linkage(X, method=method, metric='euclidean',
                           preserve_input=True)

    # x axis for the L-method: number of clusters, from 2 up to N.
    candidate_counts = list(range(2, n_samples + 1))
    # Column 2 of every merge row, flipped so it lines up with the x axis.
    heights = [row[2] for row in linkage_rows]
    heights.reverse()

    cluster_count = refined_l_method(candidate_counts, heights)

    # Replay only as many merges as needed to leave `cluster_count` groups.
    # NOTE(review): if `linkage` numbers merged clusters as N + step (the
    # SciPy convention), ids >= N reach `join` here -- confirm that
    # DisjointSet accepts them.
    groups = DisjointSet(n_samples)
    for first, second, _dist, _size in islice(linkage_rows,
                                              n_samples - cluster_count):
        groups.join(int(first), int(second))

    # Representative of the set each sample ended up in.
    roots = list(map(groups.parent, range(n_samples)))

    # Relabel the representatives densely: 0, 1, 2, ... by first appearance.
    dense_id = {}
    labels = []
    for root in roots:
        labels.append(dense_id.setdefault(root, len(dense_id)))

    return Result(labels, get_centroids(X, labels))
def fit(self, X, max_merge_dist):
    """Cluster ``X`` with Ward linkage, stopping at a merge-distance cap.

    The merge history from ``linkage`` is replayed in order through a
    ``DisjointSet``; replay stops at the first merge whose distance is
    greater than ``max_merge_dist``.

    Args:
        X: The samples; passed to ``linkage`` and must support ``len()``.
            Stored on the instance as ``self.X``.
        max_merge_dist: Largest merge distance that is still applied.
            Stored on the instance as ``self.max_merge_dist``.

    Returns:
        The ``(centroids, cluster_member_cnt)`` pair produced by
        ``self.get_centroids``; ``centroids`` is also stored as
        ``self.cluster_centers_``.
    """
    self.X = X
    self.max_merge_dist = max_merge_dist

    n_samples = len(X)
    linkage_rows = linkage(X, method='ward', metric='euclidean',
                           preserve_input=True)

    # NOTE(review): if `linkage` numbers merged clusters as N + step (the
    # SciPy convention), ids >= N reach `join` here -- confirm that
    # DisjointSet accepts them.
    groups = DisjointSet(n_samples)
    for first, second, height, _size in linkage_rows:
        if height > max_merge_dist:
            # Stop at the first merge that exceeds the cap.
            break
        groups.join(int(first), int(second))

    # Representative of the set each sample ended up in.
    roots = list(map(groups.parent, range(n_samples)))

    # Relabel the representatives densely: 0, 1, 2, ... by first appearance.
    dense_id = {}
    labels = []
    for root in roots:
        labels.append(dense_id.setdefault(root, len(dense_id)))

    centroids, cluster_member_cnt = self.get_centroids(X, labels)
    self.cluster_centers_ = centroids
    return centroids, cluster_member_cnt