def count_clusters(points, dist):
    n = len(points)
    uf = UnionFind(n)
    # compute all offsets (dx,dy,dz,dt) such that abs(dx) + abs(dy) + abs(dz) + abs(dt) <= dist,
    # excluding (0,0,0,0)
    DIRS = [(dx, dy, dz, dt)
            for dx in xrange(-dist, dist + 1)
            for dy in xrange(-dist + abs(dx), dist + 1 - abs(dx))
            for dz in xrange(-dist + abs(dx) + abs(dy), dist + 1 - abs(dx) - abs(dy))
            for dt in xrange(-dist + abs(dx) + abs(dy) + abs(dz), dist + 1 - abs(dx) - abs(dy) - abs(dz))
            if (dx, dy, dz, dt) != (0, 0, 0, 0)]
    # map each point to its index
    point_to_index = {p: i for i, p in enumerate(points)}
    # for each pair of points within distance dist, union the indices of the two points
    for i in xrange(n):
        x, y, z, t = points[i]
        for dx, dy, dz, dt in DIRS:
            p2 = (x + dx, y + dy, z + dz, t + dt)
            if p2 in point_to_index:
                uf.union(i, point_to_index[p2])
    # return the number of disjoint sets we found
    return uf.count()
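# The snippets in this collection all assume a union-find (disjoint-set) class -- called
# UnionFind here and UF below -- providing union, find, connected, count, and str.
# That implementation is not included; the following is only a minimal sketch of the
# assumed interface (path compression + union by rank), not the authors' actual class.
class UnionFind(object):
    def __init__(self, n):
        self._id = list(range(n))   # parent pointer of each element
        self._rank = [0] * n        # rank used to balance unions
        self._count = n             # current number of disjoint sets

    def find(self, x):
        # follow parent pointers to the root, compressing the path along the way
        while self._id[x] != x:
            self._id[x] = self._id[self._id[x]]
            x = self._id[x]
        return x

    def union(self, x, y):
        root_x, root_y = self.find(x), self.find(y)
        if root_x == root_y:
            return
        # union by rank: attach the shallower tree below the deeper one
        if self._rank[root_x] < self._rank[root_y]:
            root_x, root_y = root_y, root_x
        self._id[root_y] = root_x
        if self._rank[root_x] == self._rank[root_y]:
            self._rank[root_x] += 1
        self._count -= 1

    def connected(self, x, y):
        return self.find(x) == self.find(y)

    def count(self):
        return self._count

    def __str__(self):
        return " ".join(str(x) for x in self._id)


UF = UnionFind  # alias used by the snippets below (an assumption, not part of the originals)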
def check_alg_for_root_comp(root_comp, words, comps):
    global GRAPH_TIME
    root_node = generate_algorithm(root_comp)
    if DEBUG:
        print("({}) Starting checking of algorithms with root value {}".format(
            strftime("%Y-%m-%d %H:%M:%S", gmtime()), root_comp))

    # Note: we do not want to manipulate the root - different root values will be checked in other executions.
    # Compute three subsets of the words and of the tree.
    bigger_list, equal_list, smaller_list = MY_UTIL.divide_words(root_comp, words)

    # Union-find data structure used to track whether the underlying ordering graph is already weakly connected.
    cc = UF(n)
    cc.union(root_comp[0], root_comp[1])

    # Graph that keeps track of transitive dependencies.
    G = nx.DiGraph()
    G.add_nodes_from(range(n))
    start = time.time()
    G_smaller = G.copy()
    G_equal = G.copy()
    G_bigger = G.copy()
    G_smaller.add_edge(root_comp[0], root_comp[1])
    G_equal.add_edge(root_comp[0], root_comp[1])
    G_equal.add_edge(root_comp[1], root_comp[0])
    G_bigger.add_edge(root_comp[1], root_comp[0])
    GRAPH_TIME += (time.time() - start)

    # If, for a word w = a_1 a_2 ... a_n, we already know that the max suffix lies in the subword a_i ... a_n
    # and a comparison between a_i and a_j yields a_i < a_j, we subsequently only need to
    # investigate the subword a_{i+1} a_{i+2} ... a_n.
    comps_smaller = [c for c in comps if c != root_comp]
    if root_comp[0] == 0:
        comps_smaller = [c for c in comps_smaller if c[0] != 0] + [c for c in comps_smaller if c[0] == 0]
        first_rel_char_smaller = 1
    else:
        first_rel_char_smaller = 0

    if (check_alg(root_node.children[0], smaller_list, comps_smaller, cc, G_smaller, first_rel_char_smaller)
            and check_alg(root_node.children[1], equal_list, [c for c in comps if c != root_comp], cc, G_equal, 0)
            and check_alg(root_node.children[2], bigger_list, [c for c in comps if c != root_comp], cc, G_bigger, 0)):
        return root_node
    else:
        return None
def get_cc(recompute):
    if not recompute:
        with open("pickles/cc_hist_testdb.pickle", "rb") as f:
            dct = pickle.load(f)
        return dct, len(dct.keys())

    print("identifying nodes...")
    addr_dct = {x.identifier: x for x in BtcAddresses.scan()}
    addr_identifiers = addr_dct.keys()
    # max_id = get_num_addresses()
    num_nodes = max(addr_identifiers) + 1
    print("num addresses: " + str(len(addr_identifiers)))

    union_find = UF(num_nodes)
    for identifier, address_struct in addr_dct.items():
        neighbor_addrs = json.loads(address_struct.neighbor_addrs)
        parent = identifier
        if len(neighbor_addrs) != 0:
            for neighbor in neighbor_addrs:
                union_find.union(neighbor, parent)
    print("===== union find done =====")

    print("normalizing node ids...")
    # need to re-number groups so that node numbers go from 0 to num_connected_components
    init_node_id = 0
    # addr to curr_node_id
    addr_to_node_id = {}
    # pre_node_id to curr_node_id
    used_node_ids = {}
    for identifier in addr_identifiers:
        node_id_pre = union_find.find(identifier)
        if node_id_pre in used_node_ids:
            node_id_post = used_node_ids[node_id_pre]
        else:
            used_node_ids[node_id_pre] = init_node_id
            node_id_post = used_node_ids[node_id_pre]
            init_node_id += 1
        addr_to_node_id[identifier] = node_id_post

    with open("pickles/cc_hist.pickle", "wb") as handle:
        pickle.dump(addr_to_node_id, handle)
    print("cc pickled, creating graph...")
    return addr_to_node_id, len(addr_to_node_id.keys())
def get_cc_for_addrs(addr_ids):
    addr_ids_list = list(addr_ids)

    # retrieve all addresses needed for the transactions
    # that have yet to be clustered
    print("retrieving addrs in transaction...")
    addrs = {}
    '''qset = BtcAddress.objects(
        ref_id__in=addr_ids_list).only('ref_id', 'neighbor_addrs')
    for addr in qset:
        addrs[addr.ref_id] = addr'''
    for addr_id in addr_ids_list:
        addrs[addr_id] = BtcAddress.objects(ref_id=addr_id).only(
            'ref_id', 'neighbor_addrs').first()
    print("done retrieving addrs...")

    # normalize address ids to the range [0, num_addrs)
    addrs_to_normalized_addrs = {}
    start_inx = 0
    sorted_keys = sorted(addrs.keys())
    for key in sorted_keys:
        addrs_to_normalized_addrs[key] = start_inx
        start_inx += 1

    num_addrs = len(addr_ids_list)
    print("number of nodes: ", num_addrs)
    union_find = UF(num_addrs)
    print("identifying nodes...")
    possible_otc_addrs = []
    for identifer, addr_obj in addrs.items():
        identifer_node_id = addrs_to_normalized_addrs[identifer]
        neighbor_addrs = addr_obj.neighbor_addrs
        if len(neighbor_addrs) != 0:
            for reference in neighbor_addrs:
                if reference in addrs_to_normalized_addrs:
                    ref_node_id = addrs_to_normalized_addrs[reference]
                    union_find.union(ref_node_id, identifer_node_id)
                else:
                    possible_otc_addrs.append(identifer)
    print("=====union find done=====")

    # {addr_ref_id: node_id}
    uf_dict = {
        x: union_find.find(addrs_to_normalized_addrs[x])
        for x in addrs.keys()
    }
    # TODO: change addresses
    return uf_dict
def __init__(self, components_dict, relations_dict, component_name_to_label=dict(), label_min_group_size=1):
    self.components = components_dict
    self.relations = relations_dict
    self.component_name_to_label = component_name_to_label
    self.label_min_group_size = label_min_group_size
    self.group_counts = dict()
    self.component_id_to_label = dict()
    self.component_number = dict()
    counter = 0
    for id, component in self.components.items():
        if component["operation"] == "delete" or id in self.component_number.keys():
            continue
        self.component_number[id] = counter
        component_name = component['data'].get("name", None)
        if component_name in self.component_name_to_label.keys():
            self.component_id_to_label[id] = self.component_name_to_label[component_name]
        counter += 1
    self.unionfind = UF(counter)
def test_graph_2():
    txs = [[1, 2, 3], [4, 5], [4, 6], [5, 7]]
    uf = UF(8)
    for tx in txs:
        parent = min(tx)
        for node in tx:
            uf.union(parent, node)
    print(uf.connected(5, 6))
    print(uf.find(4))
    print(uf.find(7))
    print(uf.find(3))
    return uf.count()
def get_true_clusters(nodes):
    uf = UF(len(nodes))
    print(len(nodes))
    n_map = {}
    for n in range(len(nodes)):
        n_map[nodes[n]] = n
    prev_in = '0'
    prev_out = '0'
    prev_tx = '0'
    with open('../nerd/temp.dat', 'r') as f:
        edge = f.readline()
        while edge is not None:
            tokens = edge.strip().split('\t')
            if len(tokens) < 4:
                break
            txid = tokens[0]
            in_id = int(tokens[1]) + cst
            out_id = int(tokens[2]) + cst
            if in_id == out_id:
                edge = f.readline()
                continue
            elif txid == prev_tx:
                if uf.find(n_map[in_id]) != uf.find(n_map[prev_in]):
                    uf.union(n_map[in_id], n_map[prev_in])
                # if uf.find(n_map[out_id]) != uf.find(n_map[prev_out]):
                #     uf.union(n_map[out_id], n_map[prev_out])
            prev_tx = txid
            prev_in = in_id
            prev_out = out_id
            edge = f.readline()
    c_map = {}
    for id in range(len(uf._id)):
        if nodes[uf._id[id]] in c_map:
            c_map[nodes[uf._id[id]]].append(nodes[id])
        else:
            c_map[nodes[uf._id[id]]] = [nodes[id]]
    print(c_map)
    print(len(c_map.keys()))
    return uf._id
def setUp(self):
    self.N = 10
    self.uf = UF(self.N)
    self.pairs = ((0, 1), (1, 2), (4, 5), (7, 8), (8, 9))
class TestUnionFind(unittest.TestCase):

    def setUp(self):
        self.N = 10
        self.uf = UF(self.N)
        self.pairs = ((0, 1), (1, 2), (4, 5), (7, 8), (8, 9))

    def test_count(self):
        self.assertEqual(self.uf.count(), self.N)
        self.assertEqual(self.count_sets(), self.N)
        for x, y in self.pairs:
            self.uf.union(x, y)
        n = self.N - len(self.pairs)
        self.assertEqual(self.uf.count(), n)
        self.assertEqual(self.count_sets(), n)

    def test_find(self):
        for i in range(self.N):
            self.assertEqual(self.uf.find(i), i)
        for x, y in self.pairs:
            self.uf.union(x, y)
        for x, y in self.pairs:
            self.assertEqual(self.uf.find(x), self.uf.find(y))

    def test_connected(self):
        for i in range(self.N):
            for j in range(self.N):
                if i == j:
                    continue
                self.assertFalse(self.uf.connected(i, j))
        for x, y in self.pairs:
            self.uf.union(x, y)
        for x, y in self.pairs:
            self.assertTrue(self.uf.connected(x, y))

    def test_str_empty_uf(self):
        self.assertEqual(str(UF(0)), "")

    def test_str_uf(self):
        s = " ".join([str(x) for x in range(self.N)])
        self.assertEqual(str(self.uf), s)

    def count_sets(self):
        return len(set([self.uf.find(x) for x in range(self.N)]))

    def tearDown(self):
        pass
class UcmdbComponentGroups(object):

    def __init__(self, components_dict, relations_dict, component_name_to_label=dict(), label_min_group_size=1):
        self.components = components_dict
        self.relations = relations_dict
        self.component_name_to_label = component_name_to_label
        self.label_min_group_size = label_min_group_size
        self.group_counts = dict()
        self.component_id_to_label = dict()
        self.component_number = dict()
        counter = 0
        for id, component in self.components.items():
            if component["operation"] == "delete" or id in self.component_number.keys():
                continue
            self.component_number[id] = counter
            component_name = component['data'].get("name", None)
            if component_name in self.component_name_to_label.keys():
                self.component_id_to_label[id] = self.component_name_to_label[component_name]
            counter += 1
        self.unionfind = UF(counter)

    def label_groups(self):
        self._union_groups()
        self._calculate_group_counts()
        self._label_components()

    def _union_groups(self):
        for id, relation in self.relations.items():
            if 'source_id' in relation and 'target_id' in relation:
                source_number = self.component_number.get(relation['source_id'], None)
                target_number = self.component_number.get(relation['target_id'], None)
                if (source_number is not None and target_number is not None
                        and not self.unionfind.connected(source_number, target_number)):
                    self.unionfind.union(source_number, target_number)

    def _calculate_group_counts(self):
        for id, component in self.components.items():
            component_number = self.component_number.get(id, None)
            if component_number is None:
                continue
            group_id = self.unionfind.find(component_number)
            if group_id in self.group_counts.keys():
                self.group_counts[group_id] += 1
            else:
                self.group_counts[group_id] = 1

    def _label_components(self):
        group_number_to_label = dict()
        for id, label in self.component_id_to_label.items():
            component_number = self.component_number[id]
            group = self.unionfind.find(component_number)
            group_number_to_label[group] = label
        for id, component in self.components.items():
            component_number = self.component_number.get(id, None)
            if component_number is None:
                continue
            group_id = self.unionfind.find(component_number)
            label = group_number_to_label.get(group_id, None)
            if label is None:
                group_size = self.group_counts.get(group_id, None)
                if group_size is not None and group_size >= self.label_min_group_size:
                    self._append_label(component['data'], "group_of_size_%s" % group_size)
            else:
                self._append_label(component['data'], label)

    def _append_label(self, data, label):
        data['label.connected_group'] = label

    def get_components(self):
        return self.components

    def get_relations(self):
        return self.relations
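# Hypothetical usage sketch for UcmdbComponentGroups. The dict shapes below (the
# "operation", "data"/"name", and "source_id"/"target_id" keys) are inferred from how the
# class reads its inputs above; the concrete ids and names are made up for illustration.
components = {
    "c1": {"operation": "add", "data": {"name": "web-server"}},
    "c2": {"operation": "add", "data": {"name": "database"}},
    "c3": {"operation": "delete", "data": {"name": "old-node"}},  # skipped by __init__
}
relations = {
    "r1": {"source_id": "c1", "target_id": "c2"},
}
groups = UcmdbComponentGroups(components, relations,
                              component_name_to_label={"web-server": "frontend"},
                              label_min_group_size=2)
groups.label_groups()
# components connected to a labelled component inherit its label via their group
print(groups.get_components()["c2"]["data"].get("label.connected_group"))  # -> "frontend"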
def check_alg_for_root_comp(root_comp, words, comps):
    global TREE
    root_node = generate_algorithm(root_comp)
    TREE.node(SIGNATURE, '', style='filled',
              fillcolor='{};0.5:{}'.format(COLORS[root_comp[0]], COLORS[root_comp[1]]),
              pos="{},{}!".format(0, m - 1))
    filename = build_filename(root_node)
    TREE.render('{}/{}'.format(OUTPUT_DIR, filename))
    if DEBUG:
        print("({}) Starting checking of algorithms with root value {}".format(
            strftime("%Y-%m-%d %H:%M:%S", gmtime()), root_comp))

    bigger_list, equal_list, smaller_list = MY_UTIL.divide_words(root_comp, words)

    # Union-find data structure used to track whether the underlying ordering graph is already weakly connected.
    G_smaller = None
    G_equal = None
    G_bigger = None
    cc = None
    if USE_ALL_OPTIM:
        cc = UF(n)
        cc.union(root_comp[0], root_comp[1])
        # graph that keeps track of transitive dependencies
        G = nx.DiGraph()
        G.add_nodes_from(range(n))
        G_smaller = G.copy()
        G_equal = G.copy()
        G_bigger = G.copy()
        G_smaller.add_edge(root_comp[0], root_comp[1])
        G_equal.add_edge(root_comp[0], root_comp[1])
        G_equal.add_edge(root_comp[1], root_comp[0])
        G_bigger.add_edge(root_comp[1], root_comp[0])

    # If, for a word w = a_1 a_2 ... a_n, we already know that the max suffix lies in the subword a_i ... a_n
    # and a comparison between a_i and a_j yields a_i < a_j, we subsequently only need to
    # investigate the subword a_{i+1} a_{i+2} ... a_n.
    comps_new_smaller = comps
    comps_new_equal = comps
    comps_new_bigger = comps
    first_rel_char_smaller = None
    if USE_OPTIM2 or USE_ALL_OPTIM:
        comps_new_smaller = [c for c in comps if c != root_comp]
        comps_new_equal = [c for c in comps if c != root_comp]
        comps_new_bigger = [c for c in comps if c != root_comp]
        if root_comp[0] == 0:
            comps_new_smaller = [c for c in comps if c[0] != 0] + [c for c in comps if c[0] == 0]
            first_rel_char_smaller = 1
        else:
            first_rel_char_smaller = 0

    if (check_alg(root_node.children[0], smaller_list, comps_new_smaller, cc, G_smaller, first_rel_char_smaller)
            and check_alg(root_node.children[1], equal_list, comps_new_equal, cc, G_equal, 0)
            and check_alg(root_node.children[2], bigger_list, comps_new_bigger, cc, G_bigger, 0)):
        MY_UTIL.save_current_graph(root_node.root, is_final=True)
        return root_node
    else:
        MY_UTIL.save_current_graph(root_node.root, is_final=True)
        return None
def test_str_empty_uf(self):
    self.assertEqual(str(UF(0)), "")
def compute_fuzzier(word):
    n = len(word)
    count = 0
    cc = UF(n)
    G = nx.DiGraph()
    G.add_nodes_from(range(n))
    max_positions = {0}
    max_value = word[0]
    unconsidered_positions = [i for i in range(n)]
    i = 0
    j = 2
    unconsidered_positions.remove(0)
    unconsidered_positions.remove(2)

    # Step 1: explore
    while cc.count() > 1:
        mp_list = list(max_positions)
        if cc.count() <= 3 and len(max_positions) > 1 and (mp_list[1] - mp_list[0]) > 1:
            i = mp_list[0] + 1
            j = mp_list[1] + 1
            while True:
                if j == n - 1 or i == mp_list[1]:
                    max_positions = {mp_list[0]}
                    i = mp_list[0]
                    if len(unconsidered_positions) > 0:
                        j = unconsidered_positions[len(unconsidered_positions) // 2]
                        unconsidered_positions.remove(j)
                    elif cc.count() > 1:
                        for new_index in range(n):
                            if cc.find(new_index) != cc.find(i):
                                j = new_index
                                break
                    break
                count += 1
                if i in unconsidered_positions:
                    unconsidered_positions.remove(i)
                if j in unconsidered_positions:
                    unconsidered_positions.remove(j)
                cc.union(i, j)
                if word[i] < word[j]:
                    max_positions = {mp_list[1]}
                    j = mp_list[1]
                    if len(unconsidered_positions) > 0:
                        i = unconsidered_positions[len(unconsidered_positions) // 2]
                        unconsidered_positions.remove(i)
                    elif cc.count() > 1:
                        for new_index in range(n):
                            if cc.find(new_index) != cc.find(j):
                                i = new_index
                                break
                elif word[i] > word[j]:
                    max_positions = {mp_list[0]}
                    i = mp_list[0]
                    if len(unconsidered_positions) > 0:
                        j = unconsidered_positions[len(unconsidered_positions) // 2]
                        unconsidered_positions.remove(j)
                    elif cc.count() > 1:
                        for new_index in range(n):
                            if cc.find(new_index) != cc.find(i):
                                j = new_index
                                break
                else:
                    i += 1
                    j += 1
        else:
            count += 1
            cc.union(i, j)
            if word[i] < word[j]:
                G.add_edge(i, j)
                if word[j] > max_value:
                    max_value = word[j]
                    max_positions = {j}
                if len(unconsidered_positions) > 0:
                    i = unconsidered_positions[len(unconsidered_positions) // 2]
                    unconsidered_positions.remove(i)
                else:
                    for new_index in range(n):
                        if cc.find(new_index) != j:
                            i = new_index
                            break
            elif word[i] > word[j]:
                G.add_edge(j, i)
                if word[i] > max_value:
                    max_value = word[i]
                    max_positions = {i}
                if len(unconsidered_positions) > 0:
                    j = unconsidered_positions[len(unconsidered_positions) // 2]
                    unconsidered_positions.remove(j)
                elif cc.count() > 1:
                    for new_index in range(n):
                        if cc.find(new_index) != cc.find(i):
                            j = new_index
                            break
            else:
                G.add_edge(i, j)
                G.add_edge(j, i)
                if word[i] >= max_value:
                    if i != n - 1:
                        max_positions.add(i)
                    if j != n - 1:
                        max_positions.add(j)
                if len(unconsidered_positions) > 0:
                    i = unconsidered_positions[0]
                    unconsidered_positions.remove(i)
                elif cc.count() > 1:
                    for new_index in range(n):
                        if cc.find(new_index) != cc.find(j):
                            i = new_index
                            break

    if len(max_positions) == 1:
        return max_positions.pop(), count
    # Step 2: find the max suffix among the maximum occurrences
    else:
        mp_list = list(set(max_positions))
        mp_list.sort()
        mp_candidates = [mp_list[0]]
        longest_streak = 1
        current_streak = 1
        for i in range(len(mp_list) - 1):
            if mp_list[i + 1] - mp_list[i] == 1:  # two consecutive maxima
                current_streak += 1
                if current_streak > longest_streak:
                    longest_streak = current_streak
                    mp_candidates = [mp_list[i - current_streak + 2]]
                elif current_streak == longest_streak:
                    mp_candidates.append(mp_list[i - current_streak + 2])
            else:
                current_streak = 1
                if current_streak == longest_streak:
                    mp_candidates.append(mp_list[i + 1])
        if len(mp_candidates) == 1:
            return mp_candidates.pop(), count
        elif mp_candidates[-1] == n - longest_streak:
            mp_candidates.remove(n - longest_streak)
        while len(mp_candidates) > 1:
            count += 1
            i = mp_candidates.pop()
            j = mp_candidates.pop()
            if word[i + 1] > word[j + 1]:
                G.add_edge(j + 1, i + 1)
                mp_candidates.append(i)
            elif word[i + 1] < word[j + 1]:
                G.add_edge(i + 1, j + 1)
                mp_candidates.append(j)
            else:
                return j, count
        return mp_candidates.pop(), count
def get_cc():
    try:
        '''addrs = {}
        num_addrs = BtcAddress.objects().count()
        batch_size = 10000
        batch_start = 0
        print("querying all addresses")
        while batch_start < num_addrs - 1:
            batch_end = batch_start + batch_size
            print("processing addrs up to id: ", batch_end)
            batch_addrs = BtcAddress.objects(ref_id__lte=batch_end, ref_id__gte=batch_start).only('ref_id', 'neighbor_addrs').all()
            print("batch received going to add to dict...")
            for addr in batch_addrs:
                addrs[addr.ref_id] = addr
            print("finished adding batch to dict...")
            batch_start += batch_size'''
        # NOTE: the batch query above is commented out, so `addrs` must be populated before this point
        num_nodes = max(addrs.keys()) + 1
        print("number of nodes: ", num_nodes)
        union_find = UF(num_nodes)
        print("identifying nodes...")
        possible_otc_addrs = []
        for identifer, addr_obj in addrs.items():
            neighbor_addrs = addr_obj.neighbor_addrs
            if len(neighbor_addrs) != 0:
                for reference in neighbor_addrs:
                    union_find.union(reference, identifer)
            else:
                possible_otc_addrs.append(identifer)
        print("=====union find done=====")

        uf_dict = {x: union_find.find(x) for x in addrs.keys()}
        for addr_ref_id in possible_otc_addrs:
            num_txs_using_addr_as_input = AddressTransactionLink.objects(
                addr_ref_id=addr_ref_id, addr_used_as_input=True).count()
            num_txs_using_addr_as_output = AddressTransactionLink.objects(
                addr_ref_id=addr_ref_id, addr_used_as_input=False).count()
            if num_txs_using_addr_as_input == 0 and num_txs_using_addr_as_output == 1:
                output_tx_link = AddressTransactionLink.objects(
                    addr_ref_id=addr_ref_id, addr_used_as_input=False).only('tx_ref_id').first()
                output_tx = BtcTransaction.objects(
                    ref_id=output_tx_link.tx_ref_id).first()
                otc_addr = check_otc_conditions(output_tx, addr_ref_id)
                if otc_addr:
                    input_addr = output_tx.input_addrs[0].addr_ref_id
                    node_id = uf_dict[input_addr]
                    uf_dict[addr_ref_id] = node_id

        with open("pickles/cc.pickle", "wb") as f:
            pickle.dump(uf_dict, f)
        send_email_notif("Address Clustering Finished",
                         "The address clustering script has finished")
        return uf_dict
    except Exception as e:
        print("Exception occurred while clustering data")
        body = """
        Please check the download_net.py script\n
        The following exception has occurred: %s
        """ % (str(e))
        send_email_notif("Error Occurred", body)
        traceback.print_exc()
        sys.exit(1)