def kclustering(distances_unsorted, k):
    """Single-link cluster the nodes of a weighted edge list into ``k`` clusters.

    Args:
        distances_unsorted: Collection of ``(u, v, d)`` triples giving the
            distance ``d`` between nodes ``u`` and ``v``.  It is iterated
            twice, so it must be re-iterable (e.g. a list).
        k: Number of clusters at which merging stops.

    Returns:
        A ``(clusters, min_distance)`` tuple: the ``unionfind.UnionFind``
        holding the clusters, and the distance of the closest pair of nodes
        that are not in the same cluster once merging has stopped.

    Raises:
        IndexError: If the edge list runs out before ``k`` clusters are
            reached, or before a pair spanning two clusters is found (for
            example when everything ends up in a single cluster).
    """
    # Collect every node mentioned by an edge.
    nodes = set()
    for u, v, _ in distances_unsorted:
        nodes.add(u)
        nodes.add(v)

    clusters = unionfind.UnionFind(nodes)

    # Ascending by distance so the closest pairs are consumed first.
    distances = sorted(distances_unsorted, key=lambda edge: edge[2])

    # Walk the sorted list with a cursor instead of list.pop(0): popping from
    # the front shifts every remaining element, which made both loops
    # quadratic in the number of edges.
    cursor = 0

    # Single-link clustering: while there are more than k clusters, merge the
    # clusters containing the closest remaining pair.
    while len(clusters.get_heads()) > k:
        u, v, _ = distances[cursor]
        cursor += 1
        clusters.union(u, v)

    # The remaining edges may join nodes that already share a cluster (ties,
    # or edges made redundant by earlier merges).  The first edge that still
    # spans two clusters is the closest pair not in the same cluster.
    while True:
        u, v, min_distance = distances[cursor]
        cursor += 1
        if clusters.find(u) != clusters.find(v):
            break
    return clusters, min_distance
def zero_barcode_pointcloud(dist_matrix, alpha):
    """Compute the 0-dimensional barcode of a point cloud up to scale ``alpha``.

    Args:
        dist_matrix: Matrix of pairwise distances, as accepted by
            ``nx.from_numpy_matrix``.
        alpha: Largest edge weight kept in the neighborhood graph.

    Returns:
        A list with one ``[birth, death]`` pair per node, indexed by node
        label.  Every bar is born at 0.  ``death`` is the weight of the edge
        whose insertion merged that bar's component into another one, or
        ``float("inf")`` if the component is never absorbed.
    """
    # Build the alpha-neighborhood graph: all nodes, only edges <= alpha.
    complete_graph = nx.from_numpy_matrix(dist_matrix)
    G_epsilon = nx.Graph()
    for node in complete_graph.nodes():
        G_epsilon.add_node(node)
    for a, b in complete_graph.edges():
        weight = complete_graph[a][b]['weight']
        if weight <= alpha:
            G_epsilon.add_edge(a, b, weight=weight)

    U = unionfind.UnionFind()
    bars = []
    # One set and one default (never-dying) bar per node.
    for vi in G_epsilon.nodes():
        U.find(vi)  # per the original comment, find() creates the set for vi
        bars.append([0, float("inf")])

    # Process edges by increasing weight, Kruskal style.
    edges = sorted(G_epsilon.edges(data=True), key=lambda e: e[2]['weight'])
    for a, b, data in edges:
        root_a = U.find(a)
        root_b = U.find(b)
        if root_a == root_b:
            continue
        U.union(a, b)
        # Exactly one of the two old representatives stops being a root; that
        # component's bar dies at this weight.  The previous code always wrote
        # to bars[a], which could overwrite an already-finished bar and leave
        # a connected graph with several infinite bars.
        # NOTE(review): assumes find() returns a node label (used as an index
        # into bars, as the original did with edge[0]) - confirm.
        absorbed = root_b if U.find(a) == root_a else root_a
        bars[absorbed][1] = data['weight']
    return bars
def main(args):
    """Union the address pairs of every block height, then write the clusters.

    Heights from 1 to ``dq.get_max_height()`` are processed in batches of
    10000.  Each height is handed to ``one_time_change`` in a worker pool; the
    address pairs it yields are merged in one union-find.  On completion the
    result is passed to ``db_write``.

    Args:
        args: Parsed command-line namespace; only ``args.dbpath`` is read.
    """
    term = 10000
    start_height = 1
    end_height = dq.get_max_height()
    # cpu_count() // 2 is 0 on a single-core host and Pool(0) raises
    # ValueError, so never go below one worker.
    pool_num = max(1, multiprocessing.cpu_count() // 2)
    cdq = ClusterDB(args.dbpath)
    stime = time.time()
    u = uf.UnionFind(int(dq.get_max_address()) + 1)
    try:
        # One pool for the whole run instead of forking a new set of workers
        # for every batch of heights.
        with multiprocessing.Pool(pool_num) as p:
            for sheight, eheight in zip(
                    range(start_height, end_height, term),
                    range(start_height + term, end_height + term, term)):
                # The last batch is clamped so that end_height is included.
                if eheight >= end_height:
                    eheight = end_height + 1
                result = p.imap(one_time_change, range(sheight, eheight))
                for addr_list in result:
                    for addr_set in addr_list:
                        addr_1 = addr_set[0]
                        addr_2 = addr_set[1]
                        print(addr_1, ',', addr_2)
                        u.union(int(addr_1), int(addr_2))
        # rank is not read again in this function; presumably dropped to free
        # memory before writing.
        del u.rank
        db_write(stime, cdq, u)
    except KeyboardInterrupt:
        print('Keyboard Interrupt Detected! Commit transactions...')
        cdq.commit_transactions()
def graph_to_dot3(infile):
    """Return Graphviz DOT text for the graph in *infile*, versions collapsed.

    Nodes connected by an edge whose metadata type is ``"version"`` are merged
    into one union-find set.  Every other edge is emitted once per distinct
    pair of set representatives.
    """
    graph, metadata = json_to_graph_data(infile)

    def is_version(edge):
        return metadata[edge.label].data["cf:type"] == "version"

    uf = unionfind.UnionFind(graph.keys())
    for v, edges in graph.items():
        for edge in edges:
            if is_version(edge):
                uf.union(edge.dest, v)

    def node2str(node):
        # File-name nodes are labelled by path, everything else by
        # "<type>, <representative>".
        data = metadata[node].data
        dtype = str(data["cf:type"])
        if dtype != "file_name":
            return dtype + ", " + uf.find(node)
        return data["cf:pathname"]

    seen_pairs = set()
    lines = ["digraph prov {"]
    for v, edges in graph.items():
        for edge in edges:
            if is_version(edge):
                continue
            key = (uf.find(v), uf.find(edge.dest))
            if key not in seen_pairs:
                seen_pairs.add(key)
                lines.append('\t"%s" -> "%s";' % (node2str(v),
                                                  node2str(edge.dest)))
    lines.append("}")
    return "\n".join(lines)
def _update_segmentation(self, action_state):
    """Recompute ``self.segmentation`` from the current seed points.

    For every pair of seed points the largest edge weight on their path in
    ``self.mst_graph`` is determined; the smallest of these values becomes
    the threshold.  All MST edges strictly below the threshold are merged and
    the resulting label image is written into ``self.segmentation``.

    Does nothing when fewer than two points are set.  ``action_state`` is not
    used.
    """
    if len(self.points) < 2:
        return

    spatial_shape = self.embedding.data.shape[1:]
    # np.product was removed in NumPy 2.0; np.prod is the supported spelling.
    n_pixels = np.prod(spatial_shape)
    uf = unionfind.UnionFind(n_pixels)

    # Find the minimal threshold: the smallest, over all point pairs, of the
    # heaviest edge on the path connecting the pair.
    threshold = None
    for p0 in self.points:
        for p1 in self.points:
            if not p0 < p1:
                continue
            short_path = nx.shortest_path(self.mst_graph, p0, p1)
            # p0 != p1 here, so the path has at least one edge.
            min_max_edge = max(
                self.mst_graph.get_edge_data(u, v)['weight']
                for u, v in zip(short_path, short_path[1:]))
            if threshold is None or threshold > min_max_edge:
                threshold = min_max_edge

    # NOTE(review): `mst` is not defined in this method; it is presumably a
    # module-level array of (u, v, weight) rows - confirm, or use an attribute.
    uf.union_array(mst[mst[:, 2] < threshold][:, :2].astype(np.uint32))
    label_image = uf.get_label_image(spatial_shape)[None]
    label_image = label_image.astype(np.uint32)
    self.segmentation[:] = label_image
    self.update_view()
def get_structurally_symmetric(A, force_unique = False):
    """Partition the indices ``0 .. len(A) - 1`` using the automorphisms of *A*.

    Each index is joined with every entry of its row in the table returned by
    ``get_automorphisms(A)``.  With ``force_unique`` set, no joining is done
    and every index stays in its own component.

    Returns:
        ``(components, component_mapping)`` as reported by the union-find.
    """
    n = len(A)
    disj_set = uf.UnionFind()
    for node in range(n):
        disj_set.add(node)

    if not force_unique:
        autos = get_automorphisms(A)
        for node in range(n):
            # The column count is taken from the first row, as before.
            for col in range(len(autos[0])):
                disj_set.union(node, autos[node][col])

    return disj_set.components(), disj_set.component_mapping()
def IsConnected(Permutation, Reference, InteractionPairs):
    """Tell whether the diagram built from the given edges is one piece.

    The edge set is ``InteractionPairs`` plus one
    ``(Reference[i], Permutation[i])`` edge per index of ``Permutation``.

    Returns:
        True when the union-find reports exactly one circle after all
        non-loop edges have been merged.
    """
    edges = set(InteractionPairs)
    edges.update(
        (Reference[i], Permutation[i]) for i in range(len(Permutation)))

    forest = unionfind.UnionFind(2 * len(InteractionPairs))
    for edge in edges:
        a, b = edge[0], edge[1]
        # Self-loops and already-connected endpoints need no merge.
        if a == b or forest.is_connected(a, b):
            continue
        forest.union(a, b)
    return forest.get_n_circles() == 1
def main(args):
    """Union the address pairs from a CSV file and insert clusters into the DB.

    Reads pairs from ``args.csv_file``, merges them in a union-find of
    90000000 slots, then inserts ``(address, representative)`` rows into the
    cluster table in batches, printing timing information along the way.
    """
    load_start = time.time()
    csv_list = read_csv(args.csv_file)
    load_end = time.time()
    cdq = ClusterDB('/home/dnlab/DataHDD/database/multi-input25man.db')
    cdq.create_cluster_table()
    print("DEBUG:", csv_list[0], load_end - load_start)

    print("START UNION FIND")
    stime = time.time()
    #u = uf.UnionFind(int(dq.get_max_address())+1)
    u = uf.UnionFind(90000000)
    print(f"MAKE ADDRESS END, TOTAL TIME:{time.time() - stime}")
    for count, (first, second) in enumerate(csv_list):
        u.union(first, second)
        if count % 10000000 == 0:
            print(f"COUNT {count} END, TOTAL TIME: {time.time() - stime}")
    print(f"UNION FIND END TOTAL TIME:{time.time() - stime}")
    # rank is not read again below; presumably dropped to free memory.
    del u.rank

    print("START CLUSTERING")
    stime = time.time()
    batch = []
    for index, cluster in enumerate(u.par):
        batch.append((str(index), u.find(cluster)))
        # A batch is flushed whenever the index is a multiple of 10000
        # (including index 0); the row just appended belongs to that flush.
        if index % 10000 != 0:
            continue
        cdq.begin_transactions()
        cdq.insert_cluster_many(batch)
        cdq.commit_transactions()
        print(
            f"COUNT {index} END, TOTAL TIME: {time.time() - stime}, {batch[-1]}"
        )
        batch = []

    # Write whatever is left after the last full batch.
    cdq.begin_transactions()
    cdq.insert_cluster_many(batch)
    cdq.commit_transactions()
    del batch
    elapsed = time.time() - stime
    del u.par
    print(f"CLUSTERING END:{elapsed}")
def __init__(
    self,
    rules: Rule,
    discards: typing.List[Domino] = None,
    union: unionfind.UnionFind = None,
):
    """Store the rules and create the state objects that were not supplied.

    Args:
        rules: Active rules; checked for ``Rule.MIGHTY_DUEL`` to pick the
            grid size.
        discards: Existing discard list, or None for a new empty list.
        union: Existing union-find, or None for a new empty one.
    """
    self.rules = rules
    self.discards = [] if discards is None else discards
    self.union = unionfind.UnionFind() if union is None else union

    grid_size = GridSize.STANDARD
    if Rule.MIGHTY_DUEL in self.rules:
        grid_size = GridSize.MIGHTY_DUEL
    self.grid = Grid(grid_size)
def main(args):
    """Union the address pairs from a CSV file and update clusters in the DB.

    Reads pairs from ``args.csv_file``, merges them in a union-find sized
    from ``cdq.get_max_address()``, then sends ``(representative, address)``
    rows to ``cdq.update_cluster_many`` in batches, printing timing
    information along the way.

    ``cdq`` is not created here; it is expected to exist at module level.
    """
    count = 0
    stime = time.time()
    csv_list = read_csv(args.csv_file)
    etime = time.time()
    print("DEBUG:", csv_list[0], etime-stime)

    print("START UNION FIND")
    stime = time.time()
    u = uf.UnionFind(int(cdq.get_max_address())+1)
    #u = uf.UnionFind(int(100000000))
    etime = time.time()
    print(f"MAKE ADDRESS END, TOTAL TIME:{etime - stime}")
    for first, second in csv_list:
        u.union(first, second)
        if count % 10000000 == 0:
            etime = time.time()
            print(f"COUNT {count} END, TOTAL TIME: {etime - stime}")
        count += 1
    etime = time.time()
    print(f"UNION FIND END TOTAL TIME:{etime - stime}")
    # rank is not read again below; presumably dropped to free memory.
    del u.rank

    print("START CLUSTERING")
    stime = time.time()
    addr_list = []
    for index, cluster in enumerate(u.par):
        addr_list.append((u.find(cluster), str(index)))
        if index % 10000 == 0:
            cdq.begin_transactions()
            cdq.update_cluster_many(addr_list)
            cdq.commit_transactions()
            etime = time.time()
            print(f"COUNT {index} END, TOTAL TIME: {etime - stime}, {addr_list[-1]}")
            addr_list = []

    # Flush the tail: rows collected after the last multiple of 10000 were
    # previously dropped because nothing wrote them once the loop ended.
    if addr_list:
        cdq.begin_transactions()
        cdq.update_cluster_many(addr_list)
        cdq.commit_transactions()
    del addr_list

    etime = time.time()
    del u.par
    print(f"CLUSTERING END:{etime - stime}")
def run(self):
    """Mark the edges of a maximum-weight spanning forest in ``self.result``.

    Edges are visited from heaviest to lightest (Kruskal's algorithm on the
    key ``max_weight - weight``).  An edge is selected when its endpoints are
    still in different union-find sets.

    Returns:
        True, always.
    """
    # get parameters
    weight = self.dataSet["weight"]

    # init result: no edge selected
    self.result.setAllEdgeValue(False)

    # The maximum does not change while sorting, so query it once instead of
    # once per edge.
    max_weight = weight.getEdgeMax()

    # Sort edges by increasing (max - weight), i.e. by DEcreasing weight.
    edges = [(e, max_weight - weight[e]) for e in self.graph.getEdges()]
    edges.sort(key=lambda pair: pair[1])

    # init Union-Find data structure
    uf = unionfind.UnionFind()
    for e, _ in edges:
        src = self.graph.source(e)
        tgt = self.graph.target(e)
        # Keep the edge only if it joins two components.
        if uf[src] != uf[tgt]:
            uf.union(src, tgt)
            self.result[e] = True
    return True
def main(args):
    """Union the address pairs from a CSV file and dump the clusters to CSV.

    Reads pairs from ``args.csv_file``, merges them in a union-find of
    200000000 slots, and writes one ``addr,number`` row per slot to
    ``/home/dnlab/DataHDD/cluster_result/40man.csv``.
    """
    load_start = time.time()
    csv_list = read_csv(args.csv_file)
    load_end = time.time()
    # Not used below, but constructing it is kept in case opening the
    # database has side effects.
    cdq = ClusterDB('/home/dnlab/DataHDD/dbv3cluster.db')
    print("DEBUG:", csv_list[0], load_end - load_start)

    print("START UNION FIND")
    stime = time.time()
    #u = uf.UnionFind(int(cdq.get_max_address())+1)
    u = uf.UnionFind(int(200000000))
    print(f"MAKE ADDRESS END, TOTAL TIME:{time.time() - stime}")
    for count, (first, second) in enumerate(csv_list):
        u.union(first, second)
        if count % 10000000 == 0:
            print(f"COUNT {count} END, TOTAL TIME: {time.time() - stime}")
    print(f"UNION FIND END TOTAL TIME:{time.time() - stime}")
    # rank is not read again below; presumably dropped to free memory.
    del u.rank

    print("START CLUSTERING")
    stime = time.time()
    addr_list = [
        (str(index), u.find(cluster)) for index, cluster in enumerate(u.par)
    ]
    del u.par
    df = pd.DataFrame(
        addr_list,
        columns=['addr', 'number'],
    )
    df.to_csv('/home/dnlab/DataHDD/cluster_result/40man.csv', index=False)
    print(f"CLUSTERING END:{time.time() - stime}")
def implicitClustering(nodeList, maxDist=3, bitsLabel=24):
    """Count the clusters formed by joining nodes with equal or masked labels.

    Every node is joined with all nodes sharing its label, and with all nodes
    whose label equals ``swapByMask(node, mask)`` for any mask produced by
    ``calculateMasks(bitsLabel)``.  ``maxDist`` is accepted but not used.

    Returns:
        ``totalComponents`` of the union-find after all joins.
    """
    last_index = len(nodeList) - 1
    ufstruct = uf.UnionFind(list(range(len(nodeList))))
    swapMasks = calculateMasks(bitsLabel)
    labelDict = createLabelDict(nodeList)

    for i, node1 in enumerate(nodeList):
        print("Node {} of {}".format(i, last_index))

        # Nodes carrying exactly the same label.
        for twin in labelDict[node1]:
            if twin == i:
                continue
            ufstruct.union(i, twin)

        # Nodes whose label is this one transformed by a mask.
        for mask in swapMasks:
            modLabel = swapByMask(node1, mask)
            if modLabel not in labelDict:
                continue
            for index in labelDict[modLabel]:
                ufstruct.union(i, index)

    return ufstruct.totalComponents
def hamming_clustering(nodes, min_distance):
    '''
    Performs clustering on a list of nodes whose position is expressed as a
    binary sequence in ndimensional space.  Requires a function find_pairs()
    that finds all pairs of nodes separated by a given Hamming distance, i.e.
    the number of bits that must be flipped to make two numbers identical
    (e.g. the Hamming distance from 1010 to 1001 is 2).

    All pairs at Hamming distance 1 .. min_distance - 1 are merged.
    Returns the UnionFind object representation of the clusters.

    >>> nodes = read_nodes('kclustering_tests/tcbig1.txt')
    >>> print(hamming_clustering(nodes, 2))
    Found 15 edges with length 1
    {6: [6, 7, 4, 5, 0, 1, 3, 2, 8, 9]}
    >>> nodes = read_nodes('kclustering_tests/tcbig2.txt')
    >>> print(hamming_clustering(nodes, 2))
    Found 2 edges with length 1
    {1: [1, 3, 2], 13: [13]}
    '''
    # Get maximum length of binary representation of nodes, i.e. ndim
    ndim = max(len("{:b}".format(node)) for node in nodes.keys())

    # Create UnionFind to store clusters
    x = unionfind.UnionFind(nodes.keys())

    for d in range(1, min_distance):
        pairs = find_pairs(nodes, ndim, d)
        # Parenthesised single-argument print prints the same text on
        # Python 2 and is valid syntax on Python 3.
        print("Found {} edges with length {}".format(len(pairs), d))
        for pair in pairs:
            x.union(*pair)
    return x
dist = np.empty([nnode, nnode], dtype=int) for i in np.arange(nnode - 1): for j in np.arange(nnode - 1 - i) + 1 + i: diff = np.logical_xor(data[i, :], data[j, :]) dist[i, j] = len(np.where(diff == True)[0]) #dist[i,j] = sum(ch1 != ch2 for ch1, ch2 in zip(data[i,:],data[j,:])) print 'finish distance calculation' return dist def cluster(data, nf, dist, mindist): nnode = len(data[:, 0]) for i in np.arange(nnode - 1): for j in np.arange(nnode - 1 - i) + 1 + i: if dist[i, j] == mindist: if nf.find(i) != nf.find(j): nf.union(i, j) return nf if __name__ == "__main__": data = np.int_(np.loadtxt(sys.argv[1], skiprows=1)) nf = unionfind.UnionFind() nf.insert_objects(np.arange(len(data[:, 0]))) dist = builddist(data) nf = cluster(data, nf, dist, 0) nf = cluster(data, nf, dist, 1) nf = cluster(data, nf, dist, 2) print nf print len(set(nf.parent_pointers.values()))
def perform_cluster(corpus, num_processes, language_type, save_group_id):
    """Cluster *corpus* by title, then merge first-pass clusters by topic.

    Args:
        corpus: Objects exposing ``messageTitle``.
        num_processes: Worker count handed to the word-segmentation helpers.
        language_type: 0 selects the jieba segmenter, 1 the nltk segmenter.
        save_group_id: Passed through to ``first_cluster``/``second_cluster``.

    Returns:
        The cluster results: from ``get_result_corpus`` when the first pass
        finds nothing, from ``get_result`` when there is nothing to cluster
        in the second pass, otherwise from ``return_data`` on the merged ids.

    Raises:
        ValueError: If ``language_type`` is neither 0 nor 1.  (Previously
            this surfaced as an UnboundLocalError a few lines later.)
    """
    # Word segmentation with stop-word filtering (titles only).
    first_articles = [(obj.messageTitle, '') for obj in corpus]
    if language_type == 0:
        first_cut_result = jieba_multipro.multi_cut_words_for_cluster(
            first_articles, num_processes)
    elif language_type == 1:
        first_cut_result = nltk_multipro.multi_cut_words(
            first_articles, num_processes)
    else:
        raise ValueError('unsupported language_type: %r' % (language_type,))
    logger.info('first multi_cut_words_for_cluster....')

    # Build the TF-IDF matrix.
    first_tfidf_matrix = tfidfWeight.getTfidf(first_cut_result, True)
    logger.info('first construct_matrix....')

    # DBSCAN clustering.
    first_cluster_results = first_cluster(corpus, first_tfidf_matrix,
                                          save_group_id, language_type)
    logger.info('first first_cluster....')
    for label, dict_ in first_cluster_results.items():
        logger.debug('label:%d, cluster_topic: %s, ids: %s' %
                     (label, dict_['cluster_topic'], '-'.join(dict_['ids'])))

    cluster_results = []
    # First pass produced nothing: fall back to a sample of the corpus.
    if len(first_cluster_results) == 0:
        logger.info('first_cluster_results 为空, 随机选择10条数据作为聚类结果....')
        return get_result_corpus(corpus)

    # Second pass: cluster the first-pass clusters by their topic message.
    second_corpus = [(label, topic['cluster_message'])
                     for (label, topic) in first_cluster_results.items()
                     if topic['cluster_message'] != '']
    second_articles = [article[1] for article in second_corpus]
    for label, message in second_corpus:
        logger.debug('label:%d, message: %s' % (label, message))

    # Nothing to cluster in the second pass: return the first-pass result.
    if len(second_articles) == 0:
        return get_result(first_cluster_results)

    # Build the TF-IDF matrix.
    second_tfidf_matrix = tfidfWeight.getTfidf(second_articles, True)
    logger.info('second construct_matrix....')

    # DBSCAN clustering.
    second_cluster_results = second_cluster(second_corpus,
                                            second_tfidf_matrix,
                                            save_group_id)
    logger.info('second second_cluster....')

    # Merge: every first-pass label is its own group, and every second-pass
    # cluster with more than one member ties its labels together.
    first_ids = [[label] for (label, topic) in first_cluster_results.items()]
    logger.info('first_ids: %s', first_ids)
    # list(...) so the groups are plain lists on Python 3 too, not key views.
    second_ids = [
        list(clusters.keys())
        for (label, clusters) in second_cluster_results.items()
        if len(clusters) > 1
    ]
    logger.info('second_ids: %s', second_ids)
    first_ids.extend(second_ids)

    u = unionfind.UnionFind(first_ids)
    u.create_tree()
    ids_list = u.get_tree()
    logger.info('ids_list: %s', ids_list)
    cluster_results = return_data(ids_list, first_cluster_results)
    logger.info('cluster_results....')
    return cluster_results
"""
Code taken from Hyperbolic Hierarchical Clustering (HypHC) by Chami et al.
for more details visit https://github.com/HazyResearch/HypHC
"""
import numpy as np
import unionfind

if __name__ == '__main__':
    # Small manual check: merge four pairs over five elements and show the
    # resulting parent and tree arrays.
    merge_pairs = np.array([[0, 1], [2, 3], [0, 4], [3, 4]])
    forest = unionfind.UnionFind(5)
    forest.merge(merge_pairs)
    print(forest.parent)
    print(forest.tree)
def detect(self, image):
    """
    Detect maximally stable extremal regions

    The image is thresholded at ``self.num_thresh`` evenly spaced levels in
    [0, 255].  At each level the pixels above the threshold are connected to
    their neighbours in a union-find that records the component history; the
    history is then searched for MSERs.

    @type image: np.ndarray
    @param image: The image from which MSER are to be extracted from

    @rtype: list
    @return: A list of contours representing detected MSERs

    @raise TypeError: if the image is not two-dimensional
    """
    # 1-D input used to fail later with an unrelated error; reject it here.
    if image.ndim != 2:
        raise TypeError(
            f'A grayscale image expected but image had {image.ndim} dimensions')

    image_shape = image.shape
    image = image.flatten()
    n_pixels = image.size
    sorting_indices = np.argsort(image)
    threshold_list = np.linspace(0, 255, self.num_thresh, endpoint=True)
    sorted_intensity = image[sorting_indices]

    # connected component history
    UF = unionfind.UnionFind(n_pixels)

    for threshold in threshold_list:
        # Index of the first pixel whose intensity is > threshold
        end_index = bisect.bisect(sorted_intensity, threshold)
        if end_index >= n_pixels:
            continue

        # Flat ids of all pixels above the threshold ("white" pixels)
        ids = sorting_indices[end_index:]

        # Boolean lookup table instead of `neighbor in ids`, which scanned
        # the whole id array for every neighbour of every white pixel.
        is_white = np.zeros(n_pixels, dtype=bool)
        is_white[ids] = True

        # Find the neighboring pixels of all white pixels
        neighbors = np.apply_along_axis(find_neighbors, 0, ids, image_shape).T

        for j, pix_neighbors in enumerate(neighbors):
            pix = ids[j]
            for neighbor in pix_neighbors:
                # TODO: Handle pixels on image edges (3 neighbors)
                # Valid flat ids are 0 .. n_pixels - 1; the former check
                # (`> image.size`) let neighbor == image.size through.
                if neighbor < 0 or neighbor >= n_pixels:
                    continue

                # A white neighbor belongs to the same connected component,
                # anything else is a border pixel of it.
                if is_white[int(neighbor)]:
                    UF.union(neighbor, pix)
                else:
                    UF.add_neighbor(pix, neighbor)

    all_history = UF.get_top_level_history()
    history = filter(self.is_possibly_MSER, all_history)

    # Identify connected components corresponding to the MSERs
    msers = []
    for parent_comp in history:
        ph = [parent_comp.size, parent_comp.size]
        pq = [float('inf'), float('inf')]
        MSER.find_msers(parent_comp, msers, ph, pq, self.max_area,
                        self.min_area, self.max_var)

    contours = []
    for mser in msers:
        # Border = recorded neighbours that are not members themselves
        border_ids = mser.neighbors.difference(mser.members)
        border_pixels = np.array(list(border_ids))
        contour = np.apply_along_axis(id2coords, 0, border_pixels,
                                      image_shape).T
        # Make contours OpenCV compatible (switch x and y)
        contour = np.flip(contour, axis=1)
        contour = MSER.order_rotationally(contour)
        contours.append(contour)
    return contours
def perform_cluster(corpus, num_processes, min_sample):
    """Cluster *corpus* by title and content, then merge clusters by title list.

    Args:
        corpus: Objects exposing ``messageTitle`` and ``messageContent``.
        num_processes: Worker count handed to the word-segmentation helper.
        min_sample: Passed through to ``return_data``.

    Returns:
        The cluster results: from ``get_result_corpus`` when the first pass
        finds nothing, otherwise from ``return_data`` on the merged ids.
    """
    first_articles = [(obj.messageTitle, obj.messageContent) for obj in corpus]
    first_cut_results = nltk_multipro_new.multi_cut_words(
        first_articles, num_processes)
    logger.info('first_cut_results end....')

    # Build the TF-IDF matrix.
    from utils import tfidfWeight
    first_tf_matrix = tfidfWeight.getTfidf(first_cut_results, True)
    logger.info('first_tfidf_matrix end....')

    # Clustering.
    first_labels = run_cluster(first_tf_matrix)
    first_cluster_results = first_get_return_data(first_labels, corpus)
    logger.info('first_cluster_results end....')
    for label, dict_ in first_cluster_results.items():
        logger.debug('label:%d, cluster_topic: %s, ids: %s' %
                     (label, dict_['title_list'], '-'.join(dict_['ids'])))

    # First pass produced nothing: fall back to a sample of the corpus.
    if len(first_cluster_results) == 0:
        logger.info('first_cluster_results 为空, 随机选择10条数据作为聚类结果....')
        return get_result_corpus(corpus)

    # Second pass: cluster the first-pass clusters by their joined titles.
    second_corpus = [(label, topic['title_list'])
                     for (label, topic) in first_cluster_results.items()
                     if '。'.join(topic['title_list']).strip() != '']
    second_articles = [('。'.join(article[1]), '')
                       for article in second_corpus]
    second_cut_results = nltk_multipro_new.multi_cut_words(
        second_articles, num_processes)
    logger.info('second_cut_results end....')

    # Drop entries whose segmentation result is the '@' placeholder.
    # The old filter was `t[1] != '@' > 0`, a chained comparison that
    # evaluates `'@' > 0` (TypeError on Python 3), and its lazy filter object
    # was exhausted by the first comprehension, leaving the second one empty.
    kept = [(item, cut)
            for item, cut in zip(second_corpus, second_cut_results)
            if cut != '@']
    second_corpus = [item for item, _ in kept]
    second_cut_results = [cut for _, cut in kept]

    # Build the TF-IDF matrix.
    second_tf_matrix = tfidfWeight.getTfidf(second_cut_results, True)
    logger.info('second_tf_matrix end....')
    second_labels = run_cluster(second_tf_matrix)
    second_cluster_results = second_get_return_data(
        second_labels, second_corpus)
    logger.info('second_cluster_results end....')

    # Merge: every first-pass label is its own group, and every second-pass
    # cluster with more than one member ties its labels together.
    first_ids = [[label] for (label, topic) in first_cluster_results.items()]
    logger.info('first_ids: %s', first_ids)
    # list(...) so the groups are plain lists on Python 3 too, not key views.
    second_ids = [
        list(clusters.keys())
        for (label, clusters) in second_cluster_results.items()
        if len(clusters) > 1
    ]
    logger.info('second_ids: %s', second_ids)
    first_ids.extend(second_ids)

    u = unionfind.UnionFind(first_ids)
    u.create_tree()
    ids_list = u.get_tree()
    logger.info('ids_list: %s', ids_list)
    cluster_results = return_data(ids_list, first_cluster_results, min_sample)
    logger.info('cluster_results....')
    return cluster_results
def localsearch(self):
    """Try dropping each conquered kingdom and keep a cheaper candidate.

    For every kingdom ``i`` in a copy of ``self.conquered_kingdoms`` a
    candidate (kingdom list, closed walk) without ``i`` is assembled and
    stored in ``self.candidates`` under its cost change ``potential``.  The
    candidate with the smallest key is adopted when that key is negative.

    Side effects: rewrites ``self.candidates``, ``self.neis``, ``self.T``,
    ``self.p``, ``self.copykingdoms``, ``self.copywalks``,
    ``self.newclosedwalk``, ``self.subG`` and, when a candidate is adopted,
    ``self.closed_walk``, ``self.conquered_kingdoms``, ``self.totalcost``
    and the module-level ``localcount``.

    NOTE(review): the source of this method had lost its line breaks and
    indentation; the nesting below is a reconstruction and must be confirmed
    against the original file before it is relied on.
    """
    self.candidates= {}
    global localcount
    self.neis=self.conquered_kingdoms[:]
    # NOTE(review): self.neis is appended to at the bottom of this loop while
    # it is being iterated - confirm that this is intended.
    for i in self.neis:
        # NOTE(review): T and p are reset on every iteration, so the
        # `self.T -= 1` steps below never accumulate - confirm.
        self.T = 2000000000
        self.p = exp(-1/self.T)
        self.copykingdoms = self.conquered_kingdoms[:]
        self.copywalks = self.closed_walk[:]
        passs = False # it means that the node we are changing is at the last part
        fullcopywalk = False
        # Removing i saves its own cost M[i][i].
        potential = -self.M[i][i]
        checker = True
        index = i
        # NTR: neighbours of i collected by the loop below (keys are indices)
        NTR = {}
        size = 0
        # check whether below really removes or not
        self.copykingdoms.remove(i)
        dontconquer = index
        for j in self.getn(index):
            jindex = j[0]
            if jindex not in self.conquered_kingdoms:
                for k in self.getn(jindex):
                    kindex = k[0]
                    if kindex in self.copykingdoms:
                        checker = False
                # NOTE(review): checker is never reset to True, so once one
                # neighbour fails the test no later neighbour is added.
                if checker:
                    size += 1
                    NTR[jindex] = [self.M[i][jindex] + self.M[jindex][jindex],0,0]
        first={}
        uniongroup={}
        unioned=[]
        uf = unionfind.UnionFind(len(NTR.keys()))
        if len(NTR.keys()) == 0:
            passs = True
        else:
            # Group the NTR nodes that are neighbours of each other
            # (uniongroup); nodes without neighbours go to `first`.
            for p in NTR:
                root = None
                # dont know whether use below
                addiunion = False
                neigh = self.getns(p)
                if p not in unioned:
                    if len(neigh) > 0:
                        uniongroup[p] = [p]
                        unioned += [p]
                        root = p
                    else:
                        first[p] = [self.M[p][dontconquer] + self.M[p][p],0,0]
                else:
                    root = uf._root(p)
                for j in neigh:
                    if j in NTR.keys():
                        uf.union(p,j)
                        if j not in unioned:
                            uniongroup[root] += [j]
                            unioned+=[j]
            self.newclosedwalk=[]
            for q in first:
                # Walk entries just before / just after i's position.
                back = self.closed_walk[self.conquered_at[i] - 2]
                # NOTE(review): bare except hides every error, not only an
                # index past the end of the walk.
                try :
                    front = self.closed_walk[self.conquered_at[i]]
                except:
                    front = None
                if q == back:
                    self.copykingdoms +=[q]
                    size -= 1
                    if size != 0:
                        potential += self.M[q][q]
                    else :
                        newbackcost, newpath = self.dijkreturn(back)
                        # NOTE(review): `back` is a walk entry but is used as
                        # a slice bound here - confirm.
                        self.copywalks = self.closed_walk[:back] + newpath
                        potential += newbackcost - self.originalbackcost + self.M[q][q] - self.M[i][q]
                        fullcopywalk = True
                        # think about it
                # check if front has str type
                elif q == front:
                    self.copykingdoms += [q]
                    potential += self.M[q][q]
                    size -= 1
                # ----------------------------------------maybe I don't need below
                else:
                    potential += first[q][0]
                    self.copykingdoms+=[q]
                    self.newclosedwalk += [q, dontconquer]
                # ----------------------------------------------
            usedroot=[]
            submains=[]
            # One sub-problem per union-find root among the grouped nodes.
            for q in unioned:
                tosubgraph=[]
                root = uf._root(q)
                if root not in usedroot:
                    usedroot += [root]
                    for j in uniongroup[root]:
                        tosubgraph += [j]
                    self.subG = self.G.subgraph(tosubgraph)
                    # !!!!!!!self.subG -- adjacency matrix. check!
                    self.subG[dontconquer][dontconquer] = 4000000001
                    submains += [submain(self.list_of_kingdom_names, dontconquer, self.subG, self.subG, 0, params=[])]
            # Fold every sub-solution into the candidate.
            for q in submains:
                self.newclosedwalk += q.closed_walk
                self.copykingdoms += q.conquered_kingdoms
                potential += q.totalcost
            dontconquerindex = self.conquered_at[dontconquer] - 1
            if not fullcopywalk:
                # Splice the new walk in right after i's position.
                self.copywalks = self.closed_walk[:dontconquerindex + 1] + self.newclosedwalk + self.closed_walk[dontconquerindex + 1:]
        if not passs:
            # Keyed by cost change: candidates with equal potential overwrite
            # each other.
            self.candidates[potential] = [self.copykingdoms, self.copywalks]
        if self.candidates:
            finalpotential = min(self.candidates)
            if finalpotential < 0:
                localcount += 1
                self.closed_walk = self.candidates[finalpotential][1]
                self.conquered_kingdoms = self.candidates[finalpotential][0]
                self.totalcost += finalpotential
                self.T -= 1
                if self.T < 0:
                    # NOTE(review): binds a local T, not self.T.
                    T=0.00001
            elif self.p > random.random():
                self.T -= 1
                if self.T < 0:
                    # NOTE(review): binds a local T, not self.T.
                    T=0.00001
        # Queue at most one of the NTR nodes for a later iteration.
        coun = 0
        for l in NTR:
            if coun ==0:
                self.neis +=[l]
                coun +=1
    print('ha')
def __init__(self, graph):
    """Keep *graph* and create an empty union-find and a work queue."""
    # Argument handed to q.Queue; presumably its capacity - confirm.
    queue_size = 20
    self.graph = graph
    self.sets = u.UnionFind()
    self.my_queue = q.Queue(queue_size)
def setUp(self):
    """Give each test a new ``UnionFind(10)`` as ``self.forest``."""
    forest = unionfind.UnionFind(10)
    self.forest = forest
def check_completed(now, prv, size):
    """Join each index below *size* with its ``prv`` and ``now`` entries.

    Returns:
        Whatever ``one_roop()`` reports for the resulting union-find.
    """
    uf = unf.UnionFind(len(prv))
    for i in range(size):
        for linked in (prv[i], now[i]):
            uf.unite(i, linked)
    return uf.one_roop()
def resume():
    """Cluster the addresses added since the last stored run and merge them in.

    Only runs when ``is_resume()`` is true.  Heights from the stored maximum
    up to ``dq.get_max_height()`` are processed in batches of 10000; address
    ids are shifted by the stored maximum address so that the union-find only
    covers the new range.  Each resulting group is then reconciled with the
    cluster numbers already stored for its addresses.

    NOTE(review): ``cdq``, ``insert_cluster_many`` and ``update_cluster_many``
    are not defined in this function and are expected at module level; the
    nesting was reconstructed from a whitespace-collapsed source - confirm.
    """
    if not is_resume():
        return

    term = 10000
    start_height = cdq.get_max_height()
    # Was `dq.gext_max_height()`, a typo for the accessor used by main().
    end_height = dq.get_max_height()
    # cpu_count() // 2 is 0 on a single-core host and Pool(0) raises.
    pool_num = max(1, multiprocessing.cpu_count() // 2)
    s_index = cdq.get_max_address()
    # stime was read by the progress print below but never assigned.
    stime = time.time()
    u = uf.UnionFind(dq.get_max_address() - s_index + 1)
    try:
        with multiprocessing.Pool(pool_num) as p:
            for sheight, eheight in zip(
                    range(start_height, end_height, term),
                    range(start_height + term, end_height + term, term)):
                # The last batch is clamped so that end_height is included.
                if eheight >= end_height:
                    eheight = end_height + 1
                result = p.imap(multi_input, range(sheight, eheight))
                for addr_list in result:
                    for addr_set in addr_list:
                        addr_1 = addr_set[0]
                        addr_2 = addr_set[1]
                        u.union(int(addr_1) - s_index, int(addr_2) - s_index)
                etime = time.time()
                print('height: {}, time:{}'.format(eheight, etime-stime))
        del u.rank
    except KeyboardInterrupt:
        print('Keyboard Interrupt Detected! Commit transactions...')
        cdq.commit_transactions()

    # Shift indices and representatives back into the global address range.
    addr_list = [(str(index + s_index), u.find(cluster) + s_index)
                 for index, cluster in enumerate(u.par)]
    df = pd.DataFrame(addr_list, columns =['Address', 'ClusterNum'])
    # Was `mi_df.groupby(...)`; mi_df is not defined anywhere in view and the
    # frame built just above was otherwise unused.
    mi_group = df.groupby('ClusterNum')
    for cluster_number, addr_group in mi_group:
        if cluster_number == -1:
            continue
        addr_list = list(addr_group.Address)
        cluster_num_list = list(cdq.get_cluster_number(addr_list))
        if len(cluster_num_list) <= 1:
            # cluster_num used to be read here before any assignment.  Per
            # the design notes below: no stored number -> insert under the
            # new number, exactly one -> insert under that number.
            cluster_num = cluster_num_list[0] if cluster_num_list else -1
            if cluster_num == -1:
                insert_cluster_many(
                    list(zip(addr_list, [cluster_number] * len(addr_list))))
            else:
                insert_cluster_many(
                    list(zip(addr_list, [cluster_num] * len(addr_list))))
        else:
            # Several stored numbers: use the smallest one that is not -1.
            cluster_num_list.sort()
            cluster_num = cluster_num_list.pop(0)
            if cluster_num == -1:
                cluster_num = cluster_num_list.pop(0)
            # TODO: if the same address already exists, update; otherwise
            # insert.
            update_cluster_many(
                list(zip([cluster_num] * len(addr_list), addr_list)))


# Design notes for a function that keeps the Bitcoin address clusters up to
# date incrementally:
#   1. Compare the current address with the maximum address (a meta table is
#      recommended for this).
#   2. If they differ, start clustering.
#   3. start_height = Metatable.blk + 1
#      end_height = dq.get_max_height()
#   4. index = cur_addr
#   5. uf.UnionFind(max_addr - cur_addr + 1)
#   6. Similar to the code below:
#      ** u.union(int(addr_1) - index, int(addr_2) - index) **
#          ==> function 1: union
#      addr_list.append((str(index) + index, u.find(cluster)+index))
#          ==> function 2: clustering
#      df = pd.DataFrame(addr_list)
#      group by cluster and fetch each address list:  ==> dbwrite
#        - fetch every cluster number the addresses already belong to;
#          if there is none, add them as they are
#        - if there is exactly one cluster number, add them under it
#        - if there are several, add them under the smallest one and update
#          the rows that carry another cluster number


def main(args):
    """Union the address pairs of every block height, then write the clusters.

    Heights from 1 to ``dq.get_max_height()`` are processed in batches of
    10000.  Each height is handed to ``multi_input`` in a worker pool; the
    address pairs it yields are merged in one union-find.  On completion the
    result is passed to ``db_write``.

    Args:
        args: Parsed command-line namespace; only ``args.dbpath`` is read.
    """
    term = 10000
    start_height = 1
    end_height = dq.get_max_height()
    # cpu_count() // 2 is 0 on a single-core host and Pool(0) raises.
    pool_num = max(1, multiprocessing.cpu_count() // 2)
    cdq = ClusterDB(args.dbpath)
    stime = time.time()
    u = uf.UnionFind(int(dq.get_max_address())+1)
    try:
        # One pool for the whole run instead of a new one per batch.
        with multiprocessing.Pool(pool_num) as p:
            for sheight, eheight in zip(
                    range(start_height, end_height, term),
                    range(start_height + term, end_height + term, term)):
                # The last batch is clamped so that end_height is included.
                if eheight >= end_height:
                    eheight = end_height + 1
                result = p.imap(multi_input, range(sheight, eheight))
                for addr_list in result:
                    for addr_set in addr_list:
                        addr_1 = addr_set[0]
                        addr_2 = addr_set[1]
                        u.union(int(addr_1), int(addr_2))
                etime = time.time()
                print('height: {}, time:{}'.format(eheight, etime-stime))
        # rank is not read again in this function; presumably dropped to free
        # memory before writing.
        del u.rank
        db_write(stime, cdq, u)
    except KeyboardInterrupt:
        print('Keyboard Interrupt Detected! Commit transactions...')
        cdq.commit_transactions()


def _str2bool(value):
    """Parse a command-line boolean; ``type=bool`` treats "False" as True."""
    return str(value).strip().lower() in ('1', 'true', 't', 'yes', 'y')


if __name__=="__main__":
    import argparse
    parser = argparse.ArgumentParser(description='Heuristics Clusterings')
    parser.add_argument('--dbpath', '-d', type=str, required=True,
                        help='insert make dbpath')
    parser.add_argument('--resume', '-r', type=_str2bool, default=False,
                        help='execute resume')
    args = parser.parse_args()
    # NOTE(review): args.resume is parsed but never consulted; main() always
    # runs - confirm whether resume() should be dispatched here.
    main(args)
def create_unionfind_ds(edgelist):
    """Return a ``UnionFind`` over the nodes that ``create_node_list`` extracts from *edgelist*."""
    return uf.UnionFind(create_node_list(edgelist))