Пример #1
def count_clusters(points, dist):
    """Count clusters of 4-D points linked by Manhattan distance.

    Two points end up in the same union-find set when one can be reached
    from the other through a chain of points whose consecutive members are
    at Manhattan distance ``<= dist`` from each other.

    Args:
        points: Sequence of hashable 4-tuples ``(x, y, z, t)``. Neighbours
            are looked up by exact tuple, so coordinates are expected to be
            integers. If the same tuple occurs more than once, only its last
            index is used for lookups.
        dist: Maximum Manhattan distance (inclusive) that links two points.

    Returns:
        The value of ``uf.count()`` after all unions have been performed.
    """
    n = len(points)
    uf = UnionFind(n)

    # Enumerate every offset (dx, dy, dz, dt) with
    # abs(dx) + abs(dy) + abs(dz) + abs(dt) <= dist, except the zero offset.
    # Each nested range is narrowed by the budget the outer axes already used.
    offsets = []
    for dx in range(-dist, dist + 1):
        left_x = dist - abs(dx)
        for dy in range(-left_x, left_x + 1):
            left_y = left_x - abs(dy)
            for dz in range(-left_y, left_y + 1):
                left_z = left_y - abs(dz)
                for dt in range(-left_z, left_z + 1):
                    if (dx, dy, dz, dt) != (0, 0, 0, 0):
                        offsets.append((dx, dy, dz, dt))

    # Map each point to its index so neighbour lookups are O(1).
    point_to_index = {p: i for i, p in enumerate(points)}

    # Union every point with each existing point found at one of the offsets.
    for i in range(n):
        x, y, z, t = points[i]
        for dx, dy, dz, dt in offsets:
            neighbour = (x + dx, y + dy, z + dz, t + dt)
            if neighbour in point_to_index:
                uf.union(i, point_to_index[neighbour])

    # The number of disjoint sets left is the number of clusters.
    return uf.count()
Пример #2
def check_alg_for_root_comp(root_comp, words, comps):
    """Check the three subtrees below a fixed root comparison.

    Builds the root node for ``root_comp``, splits ``words`` by the outcome
    of that comparison and calls ``check_alg`` on the smaller, equal and
    bigger child in turn, stopping at the first failure.

    Args:
        root_comp: Pair of positions ``(i, j)`` compared at the root.
        words: Words handed to ``MY_UTIL.divide_words`` for partitioning.
        comps: Candidate comparisons; ``root_comp`` is excluded from the
            lists passed down to the children.

    Returns:
        ``root_node`` when all three ``check_alg`` calls are truthy,
        otherwise ``None`` (implicit fall-through).

    Side effects:
        Adds the time spent preparing the graphs to the global
        ``GRAPH_TIME``; prints a start message when ``DEBUG`` is set.
    """
    global GRAPH_TIME
    root_node = generate_algorithm(root_comp)

    if DEBUG:
        print("({}) Starting checking of algorithms with root value {}".format(
            strftime("%Y-%m-%d %H:%M:%S", gmtime()), root_comp))
    # Note: We do not want to manipulate the root - different root-values
    # will be checked in other executions.
    # Compute three subsets of the words and of the tree
    bigger_list, equal_list, smaller_list = MY_UTIL.divide_words(
        root_comp, words)

    # union-find datastructure that is used to keep track if the underlying
    # ordering graph is yet weakly connected
    # NOTE(review): `n` is not defined in this function; presumably a
    # module-level word length - confirm.
    cc = UF(n)
    cc.union(root_comp[0], root_comp[1])

    # graph that keeps track of transitive dependencies
    G = nx.DiGraph()

    start = time.time()
    G_smaller = G.copy()
    G_equal = G.copy()
    G_bigger = G.copy()

    # One edge direction per strict outcome, both directions for equality.
    G_smaller.add_edge(root_comp[0], root_comp[1])
    G_equal.add_edge(root_comp[0], root_comp[1])
    G_equal.add_edge(root_comp[1], root_comp[0])
    G_bigger.add_edge(root_comp[1], root_comp[0])

    GRAPH_TIME += (time.time() - start)

    # If, for a word w=a_1 a_2 ... a_n, we already know that the max_suffix
    # is in the subword a_i ... a_n and we conduct a comparison between the
    # a_i and a_j which yields a_i < a_j we can subsequently only
    # investigate the subword a_{i+1} a_{i+2} ... a_n
    comps_smaller = [c for c in comps if c != root_comp]
    if root_comp[0] == 0:
        # Move comparisons that start at position 0 to the end of the list.
        comps_smaller = [c for c in comps_smaller if c[0] != 0
                         ] + [c for c in comps_smaller if c[0] == 0]
        first_rel_char_smaller = 1
    else:
        # Restored branch: without it the name is unbound whenever
        # root_comp[0] != 0 and the call below raises NameError.
        first_rel_char_smaller = 0

    # NOTE(review): the final argument of the third call was truncated in
    # the source; 0 mirrors the equal branch - confirm.
    if (check_alg(root_node.children[0], smaller_list, comps_smaller, cc,
                  G_smaller, first_rel_char_smaller)
            and check_alg(root_node.children[1], equal_list,
                          [c for c in comps if c != root_comp], cc, G_equal, 0)
            and check_alg(root_node.children[2], bigger_list,
                          [c for c in comps if c != root_comp], cc, G_bigger,
                          0)):
        return root_node
Пример #3
def get_cc(recompute):
    """Return an address -> component-id mapping and the mapping's size.

    Args:
        recompute: When falsy, the mapping is loaded from
            ``pickles/cc_hist_testdb.pickle``. When truthy, it is rebuilt
            from ``BtcAddresses.scan()`` and written to
            ``pickles/cc_hist.pickle``.

    Returns:
        A ``(mapping, size)`` tuple. ``mapping`` maps each address
        identifier to a component id; on the recompute path ids are
        renumbered consecutively from 0 in order of first appearance.
        ``size`` is ``len(mapping)``, i.e. the number of addresses.

    NOTE(review): the cached path reads ``cc_hist_testdb.pickle`` while the
    recompute path writes ``cc_hist.pickle`` - confirm this is intended.
    """
    if not recompute:
        # Unpickling executes arbitrary code; only load files this project
        # produced itself.
        with open("pickles/cc_hist_testdb.pickle", "rb") as f:
            dct = pickle.load(f)

        return dct, len(dct.keys())

    print("identifying nodes...")
    addr_dct = {x.identifier: x for x in BtcAddresses.scan()}
    addr_identifiers = addr_dct.keys()
    # The union-find is indexed directly by identifier, so it needs
    # max(identifier) + 1 slots.
    num_nodes = max(addr_identifiers) + 1
    print("num addresses: " + str(len(addr_identifiers)))
    union_find = UF(num_nodes)

    for identifier, address_struct in addr_dct.items():
        # neighbor_addrs is stored as a JSON-encoded collection of identifiers.
        for neighbor in json.loads(address_struct.neighbor_addrs):
            union_find.union(neighbor, identifier)

    print("===== union find done =====")
    print("normalizing node ids...")

    # Re-number the union-find roots so that component ids are consecutive
    # integers starting at 0.
    next_node_id = 0

    # addr to renumbered component id
    addr_to_node_id = {}

    # union-find root to renumbered component id
    used_node_ids = {}

    for identifier in addr_identifiers:
        node_id_pre = union_find.find(identifier)
        if node_id_pre not in used_node_ids:
            # First address seen for this root: allocate the next id.
            used_node_ids[node_id_pre] = next_node_id
            next_node_id += 1

        addr_to_node_id[identifier] = used_node_ids[node_id_pre]

    with open("pickles/cc_hist.pickle", "wb") as handle:
        pickle.dump(addr_to_node_id, handle)

    print("cc pickled, creating graph...")

    return addr_to_node_id, len(addr_to_node_id.keys())
Пример #4
def get_cc_for_addrs(addr_ids):
    """Cluster the given addresses by their neighbour links.

    Each address is fetched individually, mapped to a dense index and
    unioned with every neighbour that is also part of ``addr_ids``.

    Args:
        addr_ids: Iterable of address ``ref_id`` values to cluster.

    Returns:
        Dict ``{addr_ref_id: node_id}`` where ``node_id`` is the union-find
        root (a dense index) of the address's cluster.

    NOTE(review): ``.first()`` presumably returns ``None`` for an unknown
    ``ref_id``, which would raise ``AttributeError`` in the clustering loop
    - confirm callers only pass existing ids.
    """
    addr_ids_list = list(addr_ids)

    # retrieve all addresses necessary for transactions
    # that are yet to be clustered
    print("retrieving addrs in transaction...")
    addrs = {}
    # The source also carried a disabled variant of this lookup that used a
    # single `ref_id__in` query instead of one query per id.
    for addr_id in addr_ids_list:
        addrs[addr_id] = BtcAddress.objects(ref_id=addr_id).only(
            'ref_id', 'neighbor_addrs').first()
    print("done retrieving addrs...")

    # normalize address ids to dense indices 0..len(addrs)-1, in sorted order
    addrs_to_normalized_addrs = {
        key: index for index, key in enumerate(sorted(addrs.keys()))
    }

    num_addrs = len(addr_ids_list)
    print("number of nodes: ", num_addrs)
    union_find = UF(num_addrs)

    print("identifying nodes...")

    for identifier, addr_obj in addrs.items():
        identifier_node_id = addrs_to_normalized_addrs[identifier]
        for reference in addr_obj.neighbor_addrs:
            # Neighbours outside the requested set have no index; skip them.
            if reference in addrs_to_normalized_addrs:
                ref_node_id = addrs_to_normalized_addrs[reference]
                union_find.union(ref_node_id, identifier_node_id)
    print("=====union find done=====")

    # {addr_ref_id: node_id}
    uf_dict = {
        x: union_find.find(addrs_to_normalized_addrs[x])
        for x in addrs.keys()
    }

    # TODO: change addresses

    return uf_dict
 def __init__(self, components_dict, relations_dict, component_name_to_label=None, label_min_group_size=1):
     """Number the non-deleted components and create the union-find.

     Args:
         components_dict: Mapping of component id to a component dict with
             at least the keys ``"operation"`` and ``"data"``.
         relations_dict: Mapping of relation id to relation dict.
         component_name_to_label: Optional mapping from a component's
             ``data["name"]`` to a label; defaults to an empty mapping.
         label_min_group_size: Stored as ``self.label_min_group_size``.
     """
     # A fresh dict per instance avoids sharing one mutable default object.
     if component_name_to_label is None:
         component_name_to_label = {}
     self.components = components_dict
     self.relations = relations_dict
     self.component_name_to_label = component_name_to_label
     self.label_min_group_size = label_min_group_size
     self.group_counts = dict()
     self.component_id_to_label = dict()
     self.component_number = dict()
     counter = 0
     for component_id, component in self.components.items():
         # Deleted (or already numbered) components get no union-find slot.
         if component["operation"] == "delete" or component_id in self.component_number:
             continue
         self.component_number[component_id] = counter
         component_name = component['data'].get("name", None)
         if component_name in self.component_name_to_label:
             self.component_id_to_label[component_id] = self.component_name_to_label[component_name]
         counter += 1
     # One union-find element per numbered component.
     self.unionfind = UF(counter)
Пример #6
def test_graph_2():
    """Union the members of four sample transactions over 8 nodes.

    Prints whether nodes 5 and 6 ended up connected and returns the
    union-find's set count.
    """
    uf = UF(8)
    transactions = [[1, 2, 3], [4, 5], [4, 6], [5, 7]]

    for members in transactions:
        # Link every member to the smallest id in its transaction.
        anchor = min(members)
        for member in members:
            uf.union(anchor, member)

    linked = uf.connected(5, 6)
    print(linked)
    return uf.count()
Пример #7
def get_true_clusters(nodes):
    # Builds a union-find over `nodes` from the tab-separated edge file
    # '../nerd/temp.dat' and returns the union-find's internal `_id` list.
    # NOTE(review): this snippet has lost lines and does not parse as shown
    # (see the notes below); restore it from the original source before use.
    uf = UF(len(nodes))
    # n_map: node value -> its index in `nodes`
    n_map = {}
    for n in range(len(nodes)):
        n_map[nodes[n]] = n

    # State carried over from the previously read edge line.
    prev_in = '0'
    prev_out = '0'
    prev_tx = '0'
    with open('../nerd/temp.dat', 'r') as f:
        edge = f.readline()
        # readline() returns '' at end of file, never None, so this condition
        # alone cannot end the loop.
        while edge is not None:
            tokens = edge.strip().split('\t')
            # NOTE(review): the body of this `if` is missing; presumably it
            # left the loop (`break`) on a short or empty line - confirm.
            if len(tokens) < 4:
            txid = tokens[0]
            # NOTE(review): `cst` is not defined in this snippet; presumably a
            # module-level id offset - confirm.
            in_id = int(tokens[1]) + cst
            out_id = int(tokens[2]) + cst

            if in_id == out_id:
                edge = f.readline()

            elif txid == prev_tx:
                # Same transaction id as the previous line: merge the two
                # input ids.
                if uf.find(n_map[in_id]) != uf.find(n_map[prev_in]):
                    uf.union(n_map[in_id], n_map[prev_in])

                # if uf.find(n_map[out_id]) != uf.find(n_map[prev_out]):
                #    uf.union(n_map[out_id], n_map[prev_out])

            prev_tx = txid
            prev_in = in_id
            prev_out = out_id

            edge = f.readline()

    # NOTE(review): as written, an entry is only overwritten when its key is
    # already present, so `c_map` stays empty; an append/else pair was
    # presumably lost. `c_map` is not returned either way.
    c_map = {}
    for id in range(len(uf._id)):
        if nodes[uf._id[id]] in c_map:
            c_map[nodes[uf._id[id]]] = [nodes[id]]

    return uf._id
    def setUp(self):
        """Create a union-find of 10 elements and the pairs to be merged."""
        size = 10
        self.N = size
        self.pairs = ((0, 1), (1, 2), (4, 5), (7, 8), (8, 9))
        self.uf = UF(size)

    def test_count(self):
        """The set count starts at N and drops by one per merged pair."""
        expected = self.N
        self.assertEqual(self.uf.count(), expected)
        self.assertEqual(self.count_sets(), expected)

        for left, right in self.pairs:
            self.uf.union(left, right)

        expected = self.N - len(self.pairs)
        self.assertEqual(self.uf.count(), expected)
        self.assertEqual(self.count_sets(), expected)

    def test_find(self):
        """find() is the identity before unions and agrees within a pair after."""
        for element in range(self.N):
            root = self.uf.find(element)
            self.assertEqual(root, element)

        for left, right in self.pairs:
            self.uf.union(left, right)

        for left, right in self.pairs:
            left_root = self.uf.find(left)
            right_root = self.uf.find(right)
            self.assertEqual(left_root, right_root)

    def test_connected(self):
        """Distinct elements start disconnected; merged pairs are connected."""
        for i in range(self.N):
            for j in range(self.N):
                if i == j:
                    # Only distinct elements are expected to be disconnected.
                    continue
                self.assertFalse(self.uf.connected(i, j))

        for x, y in self.pairs:
            self.uf.union(x, y)

        for x, y in self.pairs:
            self.assertTrue(self.uf.connected(x, y))

    def test_str_empty_uf(self):
        """An empty union-find renders as the empty string."""
        empty = UF(0)
        self.assertEqual(str(empty), "")

    def test_str_uf(self):
        """A fresh union-find renders as its indices joined by spaces."""
        expected = " ".join(map(str, range(self.N)))
        self.assertEqual(str(self.uf), expected)

    def count_sets(self):
        """Return the number of distinct roots among all N elements."""
        roots = {self.uf.find(element) for element in range(self.N)}
        return len(roots)

    def tearDown(self):
        """Nothing to clean up; setUp builds a fresh fixture for every test."""
        pass
class UcmdbComponentGroups(object):
    """Group components that are connected by relations and label them.

    Non-deleted components are numbered and tracked in a union-find.
    Relations merge the groups of their source and target components. Each
    component's ``data`` then receives a ``label.connected_group`` entry:
    either a label configured by component name for some member of its
    group, or ``group_of_size_<n>`` when the group has at least
    ``label_min_group_size`` members.
    """

    def __init__(self, components_dict, relations_dict, component_name_to_label=None, label_min_group_size=1):
        """Number the non-deleted components and create the union-find.

        Args:
            components_dict: Mapping of component id to a component dict
                with at least the keys ``"operation"`` and ``"data"``.
            relations_dict: Mapping of relation id to relation dict;
                relations are used when they carry both ``"source_id"`` and
                ``"target_id"``.
            component_name_to_label: Optional mapping from a component's
                ``data["name"]`` to a label; defaults to an empty mapping.
            label_min_group_size: Minimum group size for which the generic
                ``group_of_size_<n>`` label is applied.
        """
        # A fresh dict per instance avoids sharing one mutable default object.
        if component_name_to_label is None:
            component_name_to_label = {}
        self.components = components_dict
        self.relations = relations_dict
        self.component_name_to_label = component_name_to_label
        self.label_min_group_size = label_min_group_size
        self.group_counts = dict()
        self.component_id_to_label = dict()
        self.component_number = dict()
        counter = 0
        for component_id, component in self.components.items():
            # Deleted (or already numbered) components get no union-find slot.
            if component["operation"] == "delete" or component_id in self.component_number:
                continue
            self.component_number[component_id] = counter
            component_name = component['data'].get("name", None)
            if component_name in self.component_name_to_label:
                self.component_id_to_label[component_id] = self.component_name_to_label[component_name]
            counter += 1
        # One union-find element per numbered component.
        self.unionfind = UF(counter)

    def label_groups(self):
        """Merge related components, count group sizes and apply labels."""
        # NOTE(review): the body was missing in the source. The order is
        # forced by data flow: counts need the final roots, and labelling
        # reads the counts - confirm against the original implementation.
        self._union_groups()
        self._calculate_group_counts()
        self._label_components()

    def _union_groups(self):
        """Union the source and target of every relation with known ends."""
        for relation in self.relations.values():
            if 'source_id' in relation and 'target_id' in relation:
                source_number = self.component_number.get(relation['source_id'], None)
                target_number = self.component_number.get(relation['target_id'], None)
                # Relations touching unnumbered components are ignored.
                if source_number is not None and target_number is not None and not self.unionfind.connected(source_number, target_number):
                    self.unionfind.union(source_number, target_number)

    def _calculate_group_counts(self):
        """Count the numbered components in each union-find group."""
        for component_id in self.components:
            component_number = self.component_number.get(component_id, None)
            if component_number is None:
                continue
            group_id = self.unionfind.find(component_number)
            self.group_counts[group_id] = self.group_counts.get(group_id, 0) + 1

    def _label_components(self):
        """Write the group label into each numbered component's data."""
        # Labels configured by component name apply to the whole group; when
        # several members carry one, the last one iterated wins.
        group_number_to_label = dict()
        for component_id, label in self.component_id_to_label.items():
            group = self.unionfind.find(self.component_number[component_id])
            group_number_to_label[group] = label

        for component_id, component in self.components.items():
            component_number = self.component_number.get(component_id, None)
            if component_number is None:
                continue
            group_id = self.unionfind.find(component_number)
            label = group_number_to_label.get(group_id, None)
            if label is None:
                # No configured label: fall back to a size-based label for
                # groups that are large enough.
                group_size = self.group_counts.get(group_id, None)
                if group_size is not None and group_size >= self.label_min_group_size:
                    self._append_label(component['data'], "group_of_size_%s" % group_size)
            else:
                self._append_label(component['data'], label)

    def _append_label(self, data, label):
        """Store ``label`` under the ``label.connected_group`` key of ``data``."""
        data['label.connected_group'] = label

    def get_components(self):
        """Return the components mapping passed to the constructor."""
        return self.components

    def get_relations(self):
        """Return the relations mapping passed to the constructor."""
        return self.relations
Пример #11
def check_alg_for_root_comp(root_comp, words, comps):
    # Variant of the root-comparison check that also renders the global TREE
    # to OUTPUT_DIR and saves the current graph through MY_UTIL.
    # NOTE(review): this snippet has lost lines and does not parse as shown
    # (see the notes below); restore it from the original source before use.
    global TREE

    root_node = generate_algorithm(root_comp)

    # NOTE(review): the next line is the tail of a call whose opening line(s)
    # are missing; `m` is not defined in this snippet.
              pos="{},{}!".format(0, m - 1))
    filename = build_filename(root_node)
    TREE.render('{}/{}'.format(OUTPUT_DIR, filename))

    if DEBUG:
        print("({}) Starting checking of algorithms with root value {}".format(
            strftime("%Y-%m-%d %H:%M:%S", gmtime()), root_comp))

    # Split the words by the outcome of the root comparison.
    bigger_list, equal_list, smaller_list = MY_UTIL.divide_words(
        root_comp, words)

    # union-find datastructure that is used to keep track if the underlying ordering graph is yet weakly connected
    G_smaller = None
    G_equal = None
    G_bigger = None
    cc = None
    # NOTE(review): the statement guarding the indented block below is
    # missing (presumably an `if` on a feature flag); `n` is not defined in
    # this snippet either.
        cc = UF(n)
        cc.union(root_comp[0], root_comp[1])

        # graph that keeps track of transitive dependencies
        G = nx.DiGraph()

        G_smaller = G.copy()
        G_equal = G.copy()
        G_bigger = G.copy()

        G_smaller.add_edge(root_comp[0], root_comp[1])
        G_equal.add_edge(root_comp[0], root_comp[1])
        G_equal.add_edge(root_comp[1], root_comp[0])
        G_bigger.add_edge(root_comp[1], root_comp[0])

    # If, for a word w=a_1 a_2 ... a_n, we already know that the max_suffix is in the subword a_i ... a_n and we
    # conduct a comparison between the a_i and a_j which yields  a_i < a_j we can subsequently only
    # investigate the subword a_{i+1} a_{i+2} ... a_n
    comps_new_smaller = comps
    comps_new_equal = comps
    comps_new_bigger = comps
    first_rel_char_smaller = None
    # NOTE(review): the statement guarding the indented block below is
    # missing as well, and so is the `else:` between the two assignments to
    # first_rel_char_smaller.
        comps_new_smaller = [c for c in comps if c != root_comp]
        comps_new_equal = [c for c in comps if c != root_comp]
        comps_new_bigger = [c for c in comps if c != root_comp]
        if root_comp[0] == 0:
            comps_new_smaller = [c for c in comps if c[0] != 0
                                 ] + [c for c in comps if c[0] == 0]
            first_rel_char_smaller = 1
            first_rel_char_smaller = 0

    # All three children must check out for the root to be returned; the
    # `and` chain stops at the first falsy result.
    if (check_alg(root_node.children[0], smaller_list, comps_new_smaller, cc,
                  G_smaller, first_rel_char_smaller)
            and check_alg(root_node.children[1], equal_list, comps_new_equal,
                          cc, G_equal, 0)
            and check_alg(root_node.children[2], bigger_list, comps_new_bigger,
                          cc, G_bigger, 0)):
        MY_UTIL.save_current_graph(root_node.root, is_final=True)
        return root_node
    # NOTE(review): the line below is unreachable as indented; it presumably
    # belonged to a lost `else:` branch - confirm.
        MY_UTIL.save_current_graph(root_node.root, is_final=True)
Пример #12
Пример #13
Пример #14
class TestUnionFind(unittest.TestCase):
    """Unit tests for the ``UF`` union-find structure."""

    def setUp(self):
        """Create a union-find of 10 elements and the pairs to be merged."""
        self.N = 10
        self.uf = UF(self.N)
        self.pairs = ((0, 1), (1, 2), (4, 5), (7, 8), (8, 9))

    def test_count(self):
        """The set count starts at N and drops by one per merged pair."""
        self.assertEqual(self.uf.count(), self.N)
        self.assertEqual(self.count_sets(), self.N)

        for x, y in self.pairs:
            self.uf.union(x, y)
        n = self.N - len(self.pairs)
        self.assertEqual(self.uf.count(), n)
        self.assertEqual(self.count_sets(), n)

    def test_find(self):
        """find() is the identity before unions and agrees within a pair after."""
        for i in range(self.N):
            self.assertEqual(self.uf.find(i), i)

        for x, y in self.pairs:
            self.uf.union(x, y)

        for x, y in self.pairs:
            self.assertEqual(self.uf.find(x), self.uf.find(y))

    def test_connected(self):
        """Distinct elements start disconnected; merged pairs are connected."""
        for i in range(self.N):
            for j in range(self.N):
                if i == j:
                    # Only distinct elements are expected to be disconnected.
                    continue
                self.assertFalse(self.uf.connected(i, j))

        for x, y in self.pairs:
            self.uf.union(x, y)

        for x, y in self.pairs:
            self.assertTrue(self.uf.connected(x, y))

    def test_str_empty_uf(self):
        """An empty union-find renders as the empty string."""
        self.assertEqual(str(UF(0)), "")

    def test_str_uf(self):
        """A fresh union-find renders as its indices joined by spaces."""
        s = " ".join(str(x) for x in range(self.N))
        self.assertEqual(str(self.uf), s)

    def count_sets(self):
        """Return the number of distinct roots among all N elements."""
        return len({self.uf.find(x) for x in range(self.N)})

    def tearDown(self):
        """Nothing to clean up; setUp builds a fresh fixture for every test."""
        pass
Пример #15
def compute_fuzzier(word):
    # Compares positions of `word` pairwise and returns a `(position, count)`
    # tuple, where `count` is the number of comparisons that were counted.
    # Presumably the position is the start of the maximal suffix (see the
    # "Step 2" comment below) - confirm against the original source.
    # NOTE(review): this snippet has lost many lines (`else:` branches, `if`
    # bodies, the exit of `while True`) and does not parse as shown; the
    # notes below mark the visible gaps. Restore it from the original source
    # before relying on it.
    n = len(word)
    count = 0
    # cc: union-find over positions; positions are merged once compared.
    cc = UF(n)
    # G: directed graph of comparison outcomes; only written to in this
    # snippet, never read.
    G = nx.DiGraph()
    max_positions = {0}
    max_value = word[0]
    unconsidered_positions = [i for i in range(n)]

    # Positions of the next pair to compare.
    i = 0
    j = 2

    # Step 1: explore
    while cc.count() > 1:
        mp_list = list(max_positions)
        if cc.count() <= 3 and len(max_positions) > 1 and (mp_list[1] - mp_list[0]) > 1:
            # Two non-adjacent maximum candidates: compare the characters
            # that follow them, position by position.
            i = mp_list[0] + 1
            j = mp_list[1] + 1

            # NOTE(review): no `break` is visible inside this loop.
            while True:
                if j == n - 1 or i == mp_list[1]:
                    max_positions = {mp_list[0]}
                    i = mp_list[0]
                    if len(unconsidered_positions) > 0:
                        j = unconsidered_positions[len(unconsidered_positions) // 2]
                    elif cc.count() > 1:
                        for new_index in range(n):
                            if cc.find(new_index) != cc.find(i):
                                j = new_index
                count += 1
                # NOTE(review): the bodies of the next two `if` statements
                # are missing; presumably they removed i and j from
                # unconsidered_positions - confirm.
                if i in unconsidered_positions:
                if j in unconsidered_positions:
                cc.union(i, j)
                if word[i] < word[j]:
                    max_positions = {mp_list[1]}
                    j = mp_list[1]
                    if len(unconsidered_positions) > 0:
                        i = unconsidered_positions[len(unconsidered_positions) // 2]
                    elif cc.count() > 1:
                        for new_index in range(n):
                            if cc.find(new_index) != cc.find(j):
                                i = new_index
                elif word[i] > word[j]:
                    max_positions = {mp_list[0]}
                    i = mp_list[0]
                    if len(unconsidered_positions) > 0:
                        j = unconsidered_positions[len(unconsidered_positions) // 2]
                    elif cc.count() > 1:
                        for new_index in range(n):
                            if cc.find(new_index) != cc.find(i):
                                j = new_index
                    # NOTE(review): the two increments below presumably
                    # belonged to a lost `else:` (equal characters) branch.
                    i += 1
                    j += 1

            # NOTE(review): the lines below presumably belonged to the lost
            # `else:` of the outer `if` (the general comparison step).
            count += 1
            cc.union(i, j)
            if word[i] < word[j]:
                G.add_edge(i, j)
                if word[j] > max_value:
                    max_value = word[j]
                    max_positions = {j}
                if len(unconsidered_positions) > 0:
                    i = unconsidered_positions[len(unconsidered_positions) // 2]
                    # NOTE(review): an `elif cc.count() > 1:` is presumably
                    # missing before this loop; unlike its siblings, it
                    # compares against `j` rather than `cc.find(j)`.
                    for new_index in range(n):
                        if cc.find(new_index) != j:
                            i = new_index

            elif word[i] > word[j]:
                G.add_edge(j, i)
                if word[i] > max_value:
                    max_value = word[i]
                    max_positions = {i}
                if len(unconsidered_positions) > 0:
                    j = unconsidered_positions[len(unconsidered_positions) // 2]
                elif cc.count() > 1:
                    for new_index in range(n):
                        if cc.find(new_index) != cc.find(i):
                            j = new_index
                # NOTE(review): the lines below presumably belonged to a lost
                # `else:` (equal characters) branch; the bodies of the
                # `i != n-1` and `j != n-1` checks are missing too.
                G.add_edge(i, j)
                G.add_edge(j, i)
                if word[i] >= max_value:
                    if i != n-1:
                    if j != n-1:
                if len(unconsidered_positions) > 0:
                    i = unconsidered_positions[0]
                elif cc.count() > 1:
                    for new_index in range(n):
                        if cc.find(new_index) != cc.find(j):
                            i = new_index

    # A single remaining candidate is the answer.
    if len(max_positions) == 1:
        return max_positions.pop(), count

    # Step 2 find max suffix from max occurences
    # NOTE(review): the indented block below has no visible header
    # (presumably a lost `else:`).
        mp_list = list(set(max_positions))
        mp_candidates = [mp_list[0]]

        # Track runs of consecutive candidate positions.
        longest_streak = 1
        current_streak = 1
        for i in range(len(mp_list) - 1):
            if mp_list[i + 1] - mp_list[i] == 1:
                # two consecutive maxima
                current_streak += 1
                if current_streak > longest_streak:
                    longest_streak = current_streak
                    mp_candidates = [mp_list[i - current_streak + 2]]
                elif current_streak == longest_streak:
                    mp_candidates.append(mp_list[i - current_streak + 2])
                # NOTE(review): the three lines below presumably belonged to
                # a lost `else:` (streak interrupted) branch.
                current_streak = 1
                if current_streak == longest_streak:
                    mp_candidates.append(mp_list[i + 1])

        if len(mp_candidates) == 1:
            return mp_candidates.pop(), count
        elif mp_candidates[-1] == n - longest_streak:
            mp_candidates.remove(n - longest_streak)
        # Compare the characters following two candidates at a time.
        # NOTE(review): the handling of each outcome (which candidate is
        # kept, and when `return j, count` applies) is incomplete here.
        while len(mp_candidates) > 1:
            count += 1
            i = mp_candidates.pop()
            j = mp_candidates.pop()
            if word[i + 1] > word[j + 1]:
                G.add_edge(j + 1, i + 1)
            elif word[i + 1] < word[j + 1]:
                G.add_edge(i + 1, j + 1)
                return j, count
        return mp_candidates.pop(), count
Пример #16
def get_cc():
    # Clusters all addresses with a union-find, pickles the resulting
    # {address ref_id: union-find root} dict to "pickles/cc.pickle", sends a
    # notification e-mail and returns the dict.
    # NOTE(review): this snippet has lost lines and does not parse as shown:
    # the `try:` matching the `except` below is missing, `addrs` is only
    # built inside the disabled string block, and two calls further down are
    # never closed. Restore it from the original source before use.
        '''addrs = {}

        num_addrs = BtcAddress.objects().count()
        batch_size = 10000
        batch_start = 0
        print("querying all addresses")
        while batch_start < num_addrs - 1:
            batch_end = batch_start + batch_size
            print("processing addrs up to id: ", batch_end)

            batch_addrs = BtcAddress.objects(ref_id__lte=batch_end, 
                ref_id__gte=batch_start).only('ref_id', 'neighbor_addrs').all()
            print("batch received going to add to dict...")

            for addr in batch_addrs:
                addrs[addr.ref_id] = addr

            print("finished adding batch to dict...")
            batch_start += batch_size'''

        # The union-find is indexed directly by ref_id, so it needs
        # max(ref_id) + 1 slots.
        num_nodes = max(addrs.keys()) + 1
        print("number of nodes: ", num_nodes)

        union_find = UF(num_nodes)

        print("identifying nodes...")
        # Never appended to in this snippet, so the loop over it below does
        # nothing as shown.
        possible_otc_addrs = []

        for identifer, addr_obj in addrs.items():
            neighbor_addrs = addr_obj.neighbor_addrs
            if len(neighbor_addrs) != 0:
                for reference in neighbor_addrs:
                    union_find.union(reference, identifer)
        print("=====union find done=====")

        uf_dict = {x: union_find.find(x) for x in addrs.keys()}

        for addr_ref_id in possible_otc_addrs:
            num_txs_using_addr_as_input = AddressTransactionLink.objects(
                addr_ref_id=addr_ref_id, addr_used_as_input=True).count()
            num_txs_using_addr_as_output = AddressTransactionLink.objects(
                addr_ref_id=addr_ref_id, addr_used_as_input=False).count()

            # Address never used as an input and used as an output exactly
            # once.
            if num_txs_using_addr_as_input == 0 and num_txs_using_addr_as_output == 1:
                # NOTE(review): the arguments and closing parentheses of the
                # next two calls are missing.
                output_tx_link = AddressTransactionLink.objects(
                output_tx = BtcTransaction.objects(
                otc_addr = check_otc_conditions(output_tx, addr_ref_id)
                if otc_addr:
                    # Put the address into the cluster of the transaction's
                    # first input address.
                    input_addr = output_tx.input_addrs[0].addr_ref_id
                    node_id = uf_dict[input_addr]
                    uf_dict[addr_ref_id] = node_id

        with open("pickles/cc.pickle", "wb") as f:
            pickle.dump(uf_dict, f)

        send_email_notif("Address Clustering Finished",
                         "The address clustering script has finished")

        return uf_dict
    # Any failure is reported by e-mail; nothing is re-raised or returned in
    # the lines visible here.
    except Exception as e:
        print("Exception occured while clustering data")
        body = """
        Please check the download_net.py script\n
        he following exception has occured: %s
        """ % (str(e))
        send_email_notif("Error Occured", body)