Example #1
from operator import itemgetter


def kruskal(g, n):

    # Set initial conditions for Kruskal's algorithm
    mst = []

    ds = DisjointSet(n)

    edges = []

    for u in range(n):
        for v in range(u + 1, n):  # v > u always, so each undirected edge appears once
            edges.append((u, v, g[u][v]))

    # Sort tuples (u, v, cost) by cost
    edges.sort(key=itemgetter(2))

    for u, v, c in edges:
        if ds.find(u) != ds.find(v):
            mst.append((u, v, c))
            ds.union(u, v)

    total_cost = 0
    print("(origem, destino) -> custo")
    for u, v, c in mst:
        total_cost += c
        print("({}, {}) -> {}".format(u, v, c))

    print("Custo total: {}".format(total_cost))
Example #2
def Kruskal(M):
    """
    Computes and returns the minimum spanning tree using Kruskal's algorithm.
    The function takes a complete, weighted graph as a matrix (a list of
    sublists of equal length whose elements are numbers).
    It uses the helper function from_adjacency_matrix() to turn the matrix
    (list of sublists) into a list of (i, j, weight) tuples with i < j.
    It returns the minimum spanning tree as a list of sublists
    (via the helper function to_adjacency_matrix()).
    """
    assert (type(M) == list), "The matrix must be given as a list of lists"
    assert all((type(el) == list) for el in M), "The list's elements must be lists"
    if not all(len(l) == len(next(iter(M))) for l in iter(M)):
        raise ValueError('The sublists of M must all have the same length')
    assert all((type(el) == int or type(el) == float) for sublist in M for el in sublist), "The sublists' elements must be numbers"
    n = len(M)  # dimension of the matrix M, i.e. the number of sublists
    T = []  # edges chosen for the spanning tree
    x = from_adjacency_matrix(M)  # helper function
    y = sorted(x, key=lambda el: el[2])  # sort the edge tuples by weight
    ds = DisjointSet()  # https://pypi.org/project/disjoint-set/
    for i in y:  # for each (u, v, weight) tuple
        if ds.find(i[0]) != ds.find(i[1]):  # endpoints lie in different sets
            T.append(i)  # keep this edge in the tree
            ds.union(i[0], i[1])  # join the two vertices
    return to_adjacency_matrix(T)
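The snippet leans on two helpers that are not shown. A plausible sketch of their contract — an assumption inferred from the docstring: from_adjacency_matrix() yields one (i, j, weight) tuple per pair with i < j, and to_adjacency_matrix() inverts that back into a symmetric matrix:

def from_adjacency_matrix(M):
    # one (i, j, weight) tuple per unordered pair, i < j
    n = len(M)
    return [(i, j, M[i][j]) for i in range(n) for j in range(i + 1, n)]


def to_adjacency_matrix(T):
    # rebuild a symmetric matrix from edge tuples; absent edges stay 0
    n = max(max(i, j) for i, j, _ in T) + 1
    M = [[0] * n for _ in range(n)]
    for i, j, w in T:
        M[i][j] = M[j][i] = w
    return M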
Example #3
def disjoint_set(items, join_checker):
    """ 按照一定的相连规则分组

    :param items: 项目清单
    :param join_checker: 检查任意两个对象是否相连,进行分组
    :return:

    算法:因为会转成下标,按照下标进行分组合并,所以支持items里有重复值,或者unhashable对象

    >>> disjoint_set([-1, -2, 2, 0, 0, 1], lambda x, y: x*y>0)
    [[-1, -2], [2, 1], [0], [0]]
    """

    # 1 Register the elements
    ds = DisjointSet()
    items = tuple(items)
    n = len(items)
    for i in range(n):
        ds.find(i)

    # 2 Connect and group
    for i, j in combinations(range(n), 2):
        if join_checker(items[i], items[j]):
            ds.union(i, j)

    # 3 Return the grouping
    res = []
    for group in ds.itersets():
        group_elements = [items[g] for g in group]
        res.append(group_elements)
    return res
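Since the grouping happens on indices, the docstring's claim about unhashable objects is easy to verify; lists, which cannot be set members or dict keys, group fine:

groups = disjoint_set([[1], [1], [2]], lambda x, y: x == y)
print(groups)  # [[[1], [1]], [[2]]]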
Example #4
def print_opcode_groups(similarities):
    print("\nOpcode grouping based on the computed cosine similarity values:")
    ds = DisjointSet()
    for i, j in similarities:
        ds.find(i)
        ds.find(j)
        if similarities[i, j] > 0.85:
            ds.union(i, j)
    print(list(ds.itersets()))
Example #5
def kruskal_min_upper_bound(lower_bound, edges, n):
    sets = DisjointSet(n)
    for w, u, v in edges:
        if w < lower_bound:
            continue
        sets.merge(u, v)
        if sets.find(0) == sets.find(n - 1):
            return w
    return float('inf')
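With edges pre-sorted by ascending weight, this returns the smallest weight w such that vertices 0 and n-1 become connected using only edges whose weight lies in [lower_bound, w]. A quick check, assuming the sized DisjointSet with merge/find that the snippet expects:

edges = [(1, 0, 1), (2, 1, 2), (5, 0, 2)]      # (w, u, v), ascending by w
print(kruskal_min_upper_bound(0, edges, 3))    # 2: path 0-1-2 closes at w=2
print(kruskal_min_upper_bound(2, edges, 3))    # 5: the w=1 edge is filtered out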
Example #6
def count_clusters_with_djset(graph: Iterable[Tuple[int, int]]):
    all_clusters = DisjointSet()

    for from_node, to_node in graph:
        from_cluster_idx = all_clusters.find(from_node)
        to_cluster_idx = all_clusters.find(to_node)

        if from_cluster_idx != to_cluster_idx:
            all_clusters.union(from_node, to_node)
    return len(list(all_clusters.itersets()))
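A quick sanity check, assuming the auto-registering find/union of the disjoint-set package (isolated vertices that appear in no edge are not counted):

print(count_clusters_with_djset([(0, 1), (1, 2), (3, 4)]))  # 2: {0, 1, 2} and {3, 4}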
Example #7
def kruskal(graph):
    ds = DisjointSet()
    A = set()
    for v in graph.V:
        ds.find(v)

    for u, v, w in sorted(graph.edges, key=lambda x: x[2]):
        if ds.find(u) != ds.find(v):
            A = A | {(u, v)}
            ds.union(u, v)
    return A
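Any object exposing V (vertices) and edges ((u, v, w) triples) works here; SimpleNamespace below is just a throwaway stand-in for the graph type, not part of the original code:

from types import SimpleNamespace

g = SimpleNamespace(V=['a', 'b', 'c'],
                    edges=[('a', 'b', 1), ('b', 'c', 2), ('a', 'c', 3)])
print(kruskal(g))  # {('a', 'b'), ('b', 'c')} (set order may vary)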
Example #8
def create_PUG2(cb_records:list, ec_dict):
    """
    first partitions the records according to their ECs (and potnetial transcript overlap thereoff)
    then creates the PUG
    """
    ds = DisjointSet()
    for r in cb_records:
        the_set = ec_dict[r.EC]
        ds.find(the_set[0])
        for s in the_set[1:]:
            ds.union(the_set[0], s)

    # group the records based on the connected component they fall into
    grouping = collections.defaultdict(list)
    for r in cb_records:
        the_set = ec_dict[r.EC]
        roots = [ds.find(_) for _ in the_set]
        assert np.all(np.array(roots) == roots[0])
        grouping[roots[0]].append(r)

    nodes = set()
    edges = []
    for records in grouping.values():
        if len(records) == 1:
            nodes.add(records[0])
        else:
            for r1, r2 in itertools.combinations(records, 2):
                node1 = r1
                node2 = r2
                nodes.add(node1)
                nodes.add(node2)

                hd = hamming_distance(r1.UMI, r2.UMI)
                if hd > 1:
                    continue
                if set(ec_dict[r1.EC]).isdisjoint(ec_dict[r2.EC]):
                    continue

                if hd <= 1 and r1.COUNT > 2 * r2.COUNT - 1:
                    # print(r1, r2)
                    e = (node1, node2)
                    edges.append(e)
                elif hd <= 1 and r2.COUNT > 2 * r1.COUNT - 1:
                    e = (node2, node1)
                    edges.append(e)
                elif hd <= 1:
                    e1 = (node1, node2)
                    e2 = (node2, node1)
                    edges.append(e1)
                    edges.append(e2)
    G = nx.DiGraph()
    G.add_nodes_from(nodes)
    G.add_edges_from(edges)
    return G
Example #9
    def basic_formula_one_agent(
            self, agent_id: int, current_states: Set[int],
            first_winning_state_id: int, winning_states: DisjointSet,
            custom_can_go_there: List[dict]) -> Tuple[Set[int], bool]:
        result_states = set()
        modified = False  # flips to True when a new epistemic class is merged in
        first_winning_state_id = winning_states.find(first_winning_state_id)
        pre_image = set()
        actions = self.agents_actions[agent_id]
        for winning_state in current_states:
            for pre_state in self.pre_states[winning_state]:
                pre_image.add(
                    self.epistemic_class_membership[agent_id][pre_state])

        for state_epistemic_class in pre_image:
            state = next(
                iter(self.imperfect_information[agent_id]
                     [state_epistemic_class]))
            state = winning_states.find(state)
            if state == first_winning_state_id:
                continue

            same_states = self.imperfect_information[agent_id][
                state_epistemic_class]

            for action in actions:
                states_can_go = custom_can_go_there[state_epistemic_class][
                    action]

                if len(states_can_go) == 0:
                    continue

                is_ok = True
                new_states_can_go = set()

                for state_can in states_can_go:
                    new_state_can = winning_states.find(state_can)

                    if first_winning_state_id != new_state_can:
                        is_ok = False

                    new_states_can_go.add(new_state_can)

                custom_can_go_there[state_epistemic_class][
                    action] = new_states_can_go

                if is_ok:
                    result_states.update(same_states)
                    winning_states.union(first_winning_state_id, state)
                    first_winning_state_id = winning_states.find(
                        first_winning_state_id)
                    modified = True
                    break

        return result_states, modified
Example #10
def kruskal(edges, n):
    sorted_edges = sorted([(w, u, v) for u, v, w in edges])
    sets = DisjointSet(n)
    selected = []
    total = 0
    for w, u, v in sorted_edges:
        if sets.find(u) == sets.find(v):
            continue
        sets.merge(u, v)
        selected.append((u, v, w))
        total += w
    return selected, total
Example #11
def Kruskals(self):
    """Kruskal's Algorithm"""
    Dset = DisjointSet(self._numVertices)

    # Generate numbers that will act as a wall between two cells in a row
    rows = set()
    pre = .5
    for i in range(self._columns):
        for j in range(self._rows - 1):
            rows.add(pre)
            pre += 1
        pre += 1

    # Generate numbers that will act as a wall between two cells in a column
    columns = set()
    offset = self._rows / 2
    pre = offset
    for i in range(self._rows):
        for j in range(self._columns - 1):
            columns.add(pre)
            pre += 1

    while Dset.nsets != 1:
        if random() < 0.5:
            """Pick a random row"""
            random_row_edge = sample(rows, 1)[0]
            rows.remove(random_row_edge)

            left_cell = int(random_row_edge - .5)
            right_cell = int(random_row_edge + .5)
            # If the left and right cell are not part of the same set merge them
            if Dset.find(left_cell) != Dset.find(right_cell):
                # print("Joining two rows: ", left_cell, right_cell)
                Dset.merge(left_cell, right_cell)
                self.add_edge((left_cell, right_cell))
                self.genTile(left_cell)
                self.genTile(right_cell)
        else:
            """Pick a random column"""
            random_column_edge = sample(columns, 1)[0]
            columns.remove(random_column_edge)

            left_cell = int(random_column_edge - offset)
            right_cell = int(random_column_edge + offset)
            # If the top and bottom cell are not part of the same set merge them
            if Dset.find(left_cell) != Dset.find(right_cell):
                # print("Joining two columns: ", left_cell, right_cell)
                Dset.merge(left_cell, right_cell)
                self.add_edge((left_cell, right_cell))
                self.genTile(left_cell)
                self.genTile(right_cell)
Example #12
 def kruskal(self) -> Set[Tuple[int, int]]:
     edge_queue = []
     for e, cost in self.edges.items():
         heapq.heappush(edge_queue, (cost, e))
     vertex_set = DisjointSet()
     for v in self.vertexes:
         vertex_set.find(v)
     known_edges = set()
     while len(list(vertex_set.itersets())) > 1:
         cost, edge = heapq.heappop(edge_queue)
         if not vertex_set.connected(*edge):
             known_edges.add(tuple(sorted(list(edge))))
             vertex_set.union(*edge)
     return known_edges
Example #13
def kruskal(graphe: Graphe):
    retour = []

    sommets = [sommet for sommet in graphe]

    aretes = graphe.liens(trier=True)

    ds = DisjointSet(sommets)

    for arete in aretes:
        if ds.find(arete[0]) != ds.find(arete[1]):
            retour.append(f'{arete[0]}-{arete[1]}')
            ds.union(arete[0], arete[1])

    return retour
Example #14
def _dbscan(points, eps, min_pts):
    N = len(points)
    labels = label(points, eps=eps, min_pts=min_pts)
    # The indices of CORE points.
    cores = np.arange(N)[labels == CORE]
    # Rather than adding edges as the algorithm in the book does,
    # I'll utilize a disjoint set to maintain information about the group each point belongs to.
    clusters = DisjointSet()

    # Assign cores in the vicinity of each other to the same group.
    for a, i in enumerate(cores):
        for j in cores[a + 1:]:
            if norm(points[i], points[j]) <= eps:
                clusters.union(i, j)

    # For each border point, we'll simply assign it to the cluster
    # of the first CORE point we stumble upon in its vicinity.
    for i in np.arange(N)[labels == BORDER]:
        for j in cores:
            if norm(points[i], points[j]) <= eps:
                clusters.union(i, j)
                break

    # Every CORE and BORDER point is now assigned to a group. Next, transform the disjoint set
    # into a list of lists, to make it easier for us to gauge what's in what.
    results = {}
    for i in np.arange(N)[labels != NOISE]:
        results.setdefault(clusters.find(i), []).append(i)

    # A tuple (groups, noise) where each of these is a list containing indices.
    return list(results.values()), np.arange(N)[labels == NOISE]
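The helpers label() and norm() are not shown. Under the usual DBSCAN definitions (CORE: at least min_pts neighbours within eps; BORDER: within eps of a core; otherwise NOISE), plausible stand-ins — with the three label constants assumed — look like:

import numpy as np

CORE, BORDER, NOISE = 0, 1, 2

def norm(a, b):
    # Euclidean distance between two points
    return np.linalg.norm(a - b)

def label(points, eps, min_pts):
    N = len(points)
    labels = np.full(N, NOISE)
    dists = np.linalg.norm(points[:, None] - points[None, :], axis=-1)
    # a point is CORE if it has at least min_pts neighbours within eps
    # (the count includes the point itself, as in most formulations)
    labels[(dists <= eps).sum(axis=1) >= min_pts] = CORE
    # a non-core point within eps of some core becomes BORDER
    for i in np.where(labels == NOISE)[0]:
        if np.any((dists[i] <= eps) & (labels == CORE)):
            labels[i] = BORDER
    return labels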
Example #15
class graph(object):
    """graph object, on which the MST is calculated"""
    def __init__(self, vertices, edges):

        self.vertices = vertices

        # keep the edges sorted by weight so kruskal() can scan them in order
        self.edges = sorted(edges, key=lambda edge: edge[2])

    def kruskal(self):

        # edge_i walks the sorted edge list; edge_n counts edges added to the MST
        edge_i, edge_n = (0, 0)
        self.ds = DisjointSet()
        self.mst = []
        while edge_n < len(self.vertices) - 1:
            vertex_1, vertex_2, weight = self.edges[edge_i]
            edge_i += 1
            cluster_1 = self.ds.find(vertex_1)
            cluster_2 = self.ds.find(vertex_2)
            if cluster_1 != cluster_2:
                self.ds.union(cluster_1, cluster_2)
                self.mst.append([vertex_1, vertex_2, weight])
                edge_n += 1

        return self.mst
Example #16
def Ellers(self):
    Dset = DisjointSet(self._numTiles[0])
    for i in range(self._numTiles[1]):
        self.genTile(i * self._numTiles[0])
        for j in range(1, self._numTiles[0]):

            if Dset.find(j) == Dset.find(j - 1):
                self.genTile(self.toIndex((j, i)))
                continue

            idx = self.toIndex((j, i))
            shouldMerge = bool(randint(int(i == self._numTiles[1] - 1), 1))
            if shouldMerge:
                Dset.merge(j - 1, j)
                self.add_edge((idx - 1, idx))
            self.genTile(idx)

        if i != self._numTiles[1] - 1:
            remainders = [i for i in range(self._numTiles[0])]

            for idx, s in enumerate(Dset.Sets):
                if s is None:
                    continue

                s = s.copy()
                numDownward = randint(1, len(s))
                for k in range(numDownward):
                    c = randint(0, len(s) - 1)
                    cid = s[c]
                    c1 = self.toIndex((s[c], i))
                    c2 = self.toIndex((s[c], i + 1))
                    self.add_edge((c1, c2))
                    self.genTile(c2)
                    s.pop(c)
                    remainders.remove(cid)

            #recreate the disjoint set with the correct set/cell locations
            for r in remainders:
                n = next((i for i, v in enumerate(Dset.Sets) if v is None))
                Dset.Cells[r] = n
                Dset.Sets[n] = [n]
            Dset.Sets = [None] * self._numTiles[0]
            for i, v in enumerate(Dset.Cells):
                if Dset.Sets[v] is None:
                    Dset.Sets[v] = [i]
                if i not in Dset.Sets[v]:
                    Dset.Sets[v].append(i)
Example #17
def greedy_clusters(graph, homes, homes_to_index, shortest, k):
    """ We will return the solution for all values of k """

    # Create all pairs of homes
    pairs = []
    for i in range(len(homes)):
        for j in range(i + 1, len(homes)):
            pairs.append((i, j))
    
    # Sort the pairs
    pairs = sorted(pairs, key = lambda pair: shortest[homes_to_index[homes[pair[0]]]][homes_to_index[homes[pair[1]]]])

    # Create a WQU for clustering
    quick_union = DisjointSet()
    for i in range(len(homes)):
        quick_union.find(i)

    # print(list(quick_union.itersets()))
    # Greedy combine pairs together until all the homes are together
    while len(list(quick_union.itersets())) > k:
        curr = pairs.pop(0)
        quick_union.union(curr[0], curr[1])

    
    # map home to the cluster that it is in
    home_to_cluster = {}
    cluster_index_homes = list(quick_union.itersets())
    clusters_answer = []

    # We just want to move from indices back to homes
    for lst in cluster_index_homes:
        new_lst = []
        for i in lst:
            new_lst.append(homes_to_index[homes[i]])
            home_to_cluster[homes[i]] = new_lst
        clusters_answer.append(new_lst)

    # Now with these cluster_homes, we can add to all the surrounding neighbors 
    # We look at each node, and then we add the node to the cluster of its nearest home
    vals = list(homes_to_index.values())
    for node in graph.nodes:
        if node not in vals: #if it is not a home
            closest_home = min(homes, key = lambda home: shortest[homes_to_index[home]][node])
            home_to_cluster[closest_home].append(node)

    return clusters_answer
Example #18
def kruskal(points):
    """finds minimum spanning tree of given points

    :param points: numpy array of shape (n_points, 2)
    :return: adjacency list (list of neighbour lists) of the minimum spanning tree
    """
    length = points.shape[0]
    disjoint_set = DisjointSet(length)
    edges = [(np.linalg.norm(points[i] - points[j]), i, j)
             for i in range(length) for j in range(i + 1, length)]
    edges.sort(key=lambda x: x[0])
    adjacency_list = [[] for _ in range(length)]
    for w, u, v in edges:
        if disjoint_set.find(u) != disjoint_set.find(v):
            adjacency_list[u].append(v)
            adjacency_list[v].append(u)
            disjoint_set.union(u, v)
    return adjacency_list
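A short run over the corners of a unit square, assuming an integer-sized DisjointSet like the sketch after Example #1; note the function returns an adjacency list (see the corrected docstring):

import numpy as np

pts = np.array([[0, 0], [1, 0], [0, 1], [1, 1]])
print(kruskal(pts))  # [[1, 2], [0, 3], [0], [1]]: three unit-length edges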
Example #19
def kruskal(args):
    vertex_list, edge_list = args
    disj_set = DisjointSet()
    min_span_tree = []

    for u in vertex_list:
        disj_set.make_set(u)

    edges = list(edge_list)
    edges.sort()

    for edge in edges:
        weight, u, v = edge
        if disj_set.find(u) != disj_set.find(v):
            disj_set.union(u, v)
            min_span_tree.append(edge)

    return min_span_tree
Example #20
 def compute_must_alias(self):
     pointsto_map = self.table.pointsto_map()
     variables = pointsto_map.keys()
     alias_sets = DisjointSet()
     print(f"\t\t#Variables= {len(variables)}")
     """A O(N logN) algorithm to unify variables. Brute-force approach matches (u,v)  and unifies them if pt(u) = pt(v). 
     This approach maintains a list of visited_heap_objects which maps the integer representation of set of heap 
     objects. If a matching heap objects is found in the visited_heap_objects then the variables are unified,  
     otherwise it updates the visited_heap_objects."""
     visited_heap_objects = defaultdict()
     for v_i in variables:
         heap_objs = int(pointsto_map[v_i])
         if heap_objs in visited_heap_objects.keys():
             v_j = visited_heap_objects[heap_objs]
             if not alias_sets.connected(v_i, v_j):
                 alias_sets.union(v_i, v_j)
         else:
             alias_sets.find(v_i)
         visited_heap_objects[heap_objs] = v_i
     return alias_sets.itersets()
Example #21
 def __KruChoose(self):
     PartitionSize = dict(zip(range(self.Vcount), [1] * self.Vcount))
     ListofMaxPartitionSizes = []
     Echoose = []
     # Components = self.Vcount
     ds = DisjointSet()
     for I, J, W in self.E:
         if ds.find(I) != ds.find(J):
             MergedSize = PartitionSize[ds.find(I)] + PartitionSize[ds.find(
                 J)]
             ds.union(I, J)
             # PartitionSize[I] = 0
             PartitionSize[ds.find(J)] = MergedSize
             ListofMaxPartitionSizes.append(max(PartitionSize.values()))
             Echoose.append(True)
             # Components -= 1
             # if Components == 1:
             #     break
             continue
         Echoose.append(False)
     return Echoose, ListofMaxPartitionSizes
Example #22
def boruvka(adj_list):
    disj_set = DisjointSet()

    for u in adj_list.keys():
        disj_set.make_set(u)

    min_span_tree = []
    while True:
        minima = {}
        for u in adj_list.keys():
            root = disj_set.find(u)
            for v in adj_list[u]:
                if disj_set.find(v) != root and (root not in minima or adj_list[u][v] < minima[root][0]):
                    minima[root] = (adj_list[u][v], u, v)

        if len(minima) == 0:
            break

        for edge in minima.items():
            if disj_set.union(edge[0], edge[1][2]):
                min_span_tree.append(edge[1])

    return min_span_tree
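The expected input is a nested-dict adjacency list with symmetric weights. A tiny triangle run, assuming a DisjointSet whose make_set/find/union API matches Example #19 and whose union() reports whether a merge actually happened:

adj_list = {
    'a': {'b': 1, 'c': 3},
    'b': {'a': 1, 'c': 2},
    'c': {'a': 3, 'b': 2},
}
print(boruvka(adj_list))  # [(1, 'a', 'b'), (2, 'c', 'b')]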
Example #23
def generate_mst(self, verts):
    num_sites = len(verts)

    # Create connected graph
    edges = []
    for i in range(num_sites):
        for j in range(i + 1, num_sites):
            src, dest = verts[i], verts[j]
            dist = abs(self.x_loc[dest] -
                       self.x_loc[src]) + abs(self.y_loc[dest] -
                                              self.y_loc[src])
            edges.append((src, dest, dist))

    # Find MST
    edges.sort(key=lambda x: x[2])
    ds = DisjointSet(301)  # fixed capacity; assumes vertex ids stay below 301
    mst = defaultdict(list)
    for src, dest, dist in edges:
        if ds.find(src) != ds.find(dest):
            ds.union(src, dest)
            mst[src].append(dest)
            mst[dest].append(src)

    return mst
Example #24
    def kruskal(self):
        self.visitados = []
        self.grafoN = dict()
        self.llenarGrafo()
        self.pq = PriorityQueue()
        self.disjointSet = []
        self.grafoKruskal = dict()

        for particula in self.capturador.lista:
            self.pq.put([
                particula.distancia,
                ((particula.origenX, particula.origenY), (particula.destinoX,
                                                          particula.destinoY))
            ])

        self.llenarDisjointSet()

        conjunto = DisjointSet(self.disjointSet)
        print(conjunto.get())

        while not self.pq.empty():
            actual = self.pq.get()

            if actual[1][0] not in conjunto.find(actual[1][1]):
                self.llenarGrafoKrustal(actual[1][0], actual[1][1], actual[0])
                print(actual[1][0], actual[1][1])
                conjunto.union(actual[1][0], actual[1][1])

        self.mostrarGrafoG()

        self.pen.setWidth(3)
        for item in self.grafoKruskal.items():
            print(item)
            for item2 in item[1]:
                self.pen.setColor(QColor(0, 0, 0))
                self.scene.addLine(item[0][0], item[0][1], item2[0][0],
                                   item2[0][1], self.pen)
                print(item[0][0], item[0][1], item2[0][0], item2[0][1])
        self.pen.setWidth(1)
Example #25
    def minimum_formula_one_agent(self, agent_id: int,
                                  winning_states: Set[int]) -> Set[int]:
        result_states = self.prepare_result_states(winning_states)
        current_states = winning_states.copy()
        winning_states_disjoint = DisjointSet(0)
        winning_states_disjoint.subsets = copy.deepcopy(
            self.epistemic_class_disjoint[agent_id].subsets)
        first_winning = winning_states_disjoint.find(
            next(iter(winning_states)))
        epistemic_class_ids = set()
        for state_id in winning_states:
            epistemic_class_id = self.epistemic_class_membership[agent_id][
                state_id]
            epistemic_class_ids.add(epistemic_class_id)

        for epistemic_class_id in epistemic_class_ids:
            epistemic_states = self.imperfect_information[agent_id][
                epistemic_class_id]
            is_ok = True
            for epistemic_state in epistemic_states:
                state_id = epistemic_state
                if epistemic_state not in winning_states:
                    is_ok = False
                    break
            if is_ok:
                winning_states_disjoint.union(first_winning, state_id)

        custom_can_go_there = self.can_go_there[agent_id][:]

        while True:
            current_states, modified = self.basic_formula_one_agent(
                agent_id, current_states, first_winning,
                winning_states_disjoint, custom_can_go_there)
            result_states.update(current_states)
            if not modified:
                break

        return result_states
Example #26
    def union(self, x, y):
        # make union diagram
        xRoot = self.find(x)
        yRoot = self.find(y)
        print(x, y, xRoot.value, yRoot.value)
        if xRoot == yRoot:
            return

        # get unions of nodes
        l = len(self.dst_dict)
        max_alphabet = len(self.trans[0])
        bfsq = [(xRoot.value, yRoot.value)]
        ds = DisjointSet([])

        while bfsq:
            v1, v2 = bfsq.pop(0)
            if v1 not in ds:
                ds.add(v1)
            if v2 not in ds:
                ds.add(v2)

            if ds.find(v1) == ds.find(v2):
                continue

            print('\nmerging', v1, v2)
            # if v1 in [16, 25, 26, 64, 56] or v2 in [16, 25, 26, 64, 56]:
            print(ds.get(v1), self.trans[v1][0], self.trans[v1][1])
            print(ds.get(v2), self.trans[v2][0], self.trans[v2][1])

            assert v1 == self.find(v1).value
            assert v2 == self.find(v2).value
            ds.union(v1, v2)
            # v1 = ds.find(v1).value
            # v2 = ds.find(v2).value
            for i in range(max_alphabet):
                newv1 = self.trans[v1][i]
                newv2 = self.trans[v2][i]

                if newv1 != -1 and newv2 != -1:
                    r1 = self.find(newv1)
                    r2 = self.find(newv2)

                    bfsq.append((r1.value, r2.value))
                elif newv1 != -1:
                    self.trans[v2][i] = newv1
                elif newv2 != -1:
                    self.trans[v1][i] = newv2

        # clean up unions
        # Should I have to use the method with union and cleanup?
        unionlist = ds.get()
        print(unionlist)
        topmostnodes = []
        for nodes in unionlist:
            if len(nodes) == 1:
                continue

            # get max rank
            max_rank = -1
            max_rank_nodeid = -1
            max_rank_double = 0
            for node in nodes:
                rank = self.dst_dict[node].rank
                if max_rank < rank:
                    max_rank = rank
                    max_rank_nodeid = node
                    max_rank_double = 0
                elif max_rank == rank:
                    max_rank_double = 1

            # move transition to the topmost node
            topmostnode = self.dst_dict[max_rank_nodeid]
            topmostnodes.append(topmostnode)
            for node in nodes:
                self.dst_dict[node].parent = topmostnode
            topmostnode.rank = max_rank + max_rank_double

        # rearrange transitions
        # new_unions = self.get()
        # max_alphabet = len(self.trans[0])
        # for union in new_unions:
        # 	transitions = [-1 for _ in range(max_alphabet)]
        # 	for i, node in enumerate(union):
        # 		for self.trans[node]

        for nodes in unionlist:
            root = None
            for node in nodes:
                if root is None:
                    root = self.find(node)
                else:
                    assert root == self.find(node)

                for alphabet, c in enumerate(self.trans[root.value]):
                    to = self.trans[node][alphabet]
                    # if to != -1 and c != -1:
                    # 	assert self.find(self.trans[root.value][alphabet]) == self.find(to).value
                    if to != -1:
                        self.trans[root.value][alphabet] = self.find(to).value
Example #27
def gen_company_mapping(df: pd.DataFrame, path: str) -> None:
    """
    Function generates storefront-company mapping by:
        1. Assigning storefronts with same product offering to same company
        2. Assigning storefronts with same phone to same company
        3. Assigning storefronts with same email to same company
        4. Generating a UUID for each company
        5. Adding these UUIDs to the panel dataframe (grouped by storefront ID)
        6. Dropping all columns besides storefront and company ID
    A union-find data structure is used to keep track of the company assignments
    throughout the function.
    """
    # drop unneeded columns
    if 'phone' in df.columns:
        df_sim = df[['email', 'slug', 'phone', 'product_name']]
    else:
        df_sim = df[['email', 'slug', 'product_name']]

    # groupby name and agg columns appropriately for company grouping
    if 'phone' in df.columns:
        df_sim_2 = df_sim.groupby('slug').agg({
            'email':
            'first',
            'phone':
            'first',
            'product_name':
            lambda x: frozenset(x)
        })
    else:
        df_sim_2 = df_sim.groupby('slug').agg({
            'email':
            'first',
            'product_name':
            lambda x: frozenset(x)
        })
    ds = DisjointSet()

    # add all storefront IDs to DJS
    for sid in df_sim_2.index:
        ds.find(sid)

    # setup hash tables for later unions
    product_ht = defaultdict(lambda: [])
    email_ht = defaultdict(lambda: [])
    phone_ht = defaultdict(lambda: [])

    prod_set = email = phone = None  # avoid reusing the previous row's value when a field is null
    for sid in df_sim_2.index:
        row = df_sim_2.loc[sid]
        if not row.isnull()['product_name']:
            prod_set = row['product_name']
        if not row.isnull()['email']:
            email = row['email']
        if 'phone' in df.columns:
            if not row.isnull()['phone']:
                phone = row['phone']
            phone_ht[phone].append(sid)
        product_ht[prod_set].append(sid)
        email_ht[email].append(sid)

    # union storefront IDs with same product offering
    for key in product_ht:
        first_sid = product_ht[key][0]
        for curr_sid in product_ht[key][1:]:
            ds.union(first_sid, curr_sid)

    # union storefront IDs with same email (check NaN)
    for key in email_ht:
        first_sid = email_ht[key][0]
        for curr_sid in email_ht[key][1:]:
            ds.union(first_sid, curr_sid)

    # union storefront IDs with same phone
    if 'phone' in df.columns:
        for key in phone_ht:
            first_sid = phone_ht[key][0]
            for curr_sid in phone_ht[key][1:]:
                ds.union(first_sid, curr_sid)

    # add company_id column to df
    df_sim_2['company_id'] = np.zeros(df_sim_2.index.shape)

    for comp in ds.itersets():
        comp_id = str(uuid.uuid1())
        for sid in comp:
            df_sim_2.loc[sid, 'company_id'] = comp_id

    mapping = df_sim_2[['company_id']]

    mapping.to_csv(path, line_terminator='\n')
Example #28
def compute_mws_prim_segmentation(edge_weight_exp,
                                  valid_edges_exp,
                                  offsets,
                                  number_of_attractive_channels,
                                  image_shape):

    visited = np.zeros(edge_weight_exp.size, dtype=bool)
    node_labeling = np.zeros(image_shape).ravel()
    number_of_nodes = node_labeling.size
    number_of_attractive_edges = number_of_nodes * number_of_attractive_channels
    ndims = len(offsets[0])
    array_stride = np.empty(ndims, dtype=np.int64)
    current_stride = 1
    mutexes = {}
    for i in range(ndims-1, -1, -1):
        array_stride[i] = current_stride
        current_stride *= image_shape[i]

    offset_strides = []
    for offset in offsets:
        stride = 0
        for i in range(len(offset)):
            stride += offset[i] * array_stride[i]
        offset_strides.append(stride)

    offset_strides = np.asarray(offset_strides)
    node_ufd = DisjointSet()
    for lbl in range(number_of_nodes):
        node_ufd.find(lbl)

    # mutexes = np.ndarray(number_of_nodes)
    pq = queue.PriorityQueue()

    # start prim from top left node
    add_neighbours(0, offset_strides, number_of_nodes, edge_weight_exp, valid_edges_exp, node_ufd, visited, pq)
    # iterate over all edges
    while not pq.empty():
        # extract next element from the queue
        position_vector = pq.get()
        edge_id = position_vector[1]
        u = position_vector[2]
        v = position_vector[3]

        if visited[edge_id]:
            continue
        visited[edge_id] = 1
        # find the current reps and skip if identical or mtx exists
        ru = node_ufd.find(u)
        rv = node_ufd.find(v)
        if ru == rv or check_mutex(ru, rv, mutexes):
            continue

        # check whether this edge is mutex via the edge offset
        if edge_id >= number_of_attractive_edges:
            insert_mutex(ru, rv, edge_id, mutexes)
        else:
            node_ufd.union(u,v)
            if node_ufd.find(ru) == rv:
                rv, ru = ru, rv
            merge_mutexes(rv, ru, mutexes)

        # add the next node to pq
        add_neighbours(v, offset_strides, number_of_nodes, edge_weight_exp, valid_edges_exp, node_ufd, visited, pq)

    # create node labeling from disjoint sets
    # 0's indicate no labeling
    for idx, cc in enumerate(node_ufd.itersets()):
        for node in cc:
            node_labeling[node] = idx+1

    return node_labeling
Example #29
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 17 17:05:21 2020

@author: wyue
"""

from disjoint_set import DisjointSet


class A():
    def __init__(self, a):
        A.v = a


ds = DisjointSet()

ds.find(1)
ds.find(2)
ds.find(3)
ds.find(4)
print(ds.connected(1, 2))
ds.union(1, 2)

print(ds.connected(1, 2))

# print all unique sets
allsets = list(ds.itersets())
print(allsets)
print("Num of sets:", len(allsets))
Example #30
def iterate_words(corpus):
    for sentence in corpus.sentences:
        for token in sentence:
            if token['pos'] in ['NOUN', 'VERB']:
                yield token['word'].lower(), token['lemma'].lower()


ds = DisjointSet()
for word, lemma in iterate_words(corpus):
    ds.union(word, lemma)

print(ds.find('voyage'))
print(ds.find('voyages'))
print(ds.find('voyager'))
print(ds.find('voyagent'))

print(ds.find('chant'))
print(ds.find('chants'))
print(ds.find('chanter'))
print(ds.find('chante'))
print(ds.find('chantant'))

Example #31
class new:
    def __init__(self,length,width,walls=0.40):
        self.__length = length
        self.__width = width
        self.__exits = []
        self.__map = []
        self.__buf_map = []
        self.__gen_initial_map(walls)
        self.__ds = DisjointSet()
        self.__cpt = (int(self.__length/2),int(self.__width/2))

    def resize_map(self, new_length, new_width, center=True):
        new_map = [[WALL for i in range(new_width)]
                   for j in range(new_length)]
        ox = int(new_width/2.0-self.__width/2.0+0.5)
        oy = int(new_length/2.0-self.__length/2.0+0.5)
        for i in range(self.__width):
            for j in range(self.__length):
                x2 = ox + i
                y2 = oy + j
                if (x2 >= 0 and
                    y2 >= 0 and
                    x2 < new_width and
                    y2 < new_length):
                    # maps are indexed [row][col]
                    new_map[y2][x2] = self.__map[j][i]
        self.__map = new_map
        self.__length = new_length
        self.__width = new_width
        self.__exits = []
        self.__cpt = (int(self.__length/2),int(self.__width/2))

    def print_map(self):
        for c in range(0, self.__width):
            for r in range(0, self.__length):
                if self.__map[r][c] == WALL:
                    sys.stdout.write('#')
                elif self.__map[r][c] == TUNNEL:
                    sys.stdout.write('+')
                else:
                    sys.stdout.write(' ')
            print()
        print()

    def iterate_walls(self):
        for c in range(0, self.__width):
            for r in range(0, self.__length):
                if self.__map[r][c] == WALL:
                    if (self.__adj_flr_count(r, c) > 0):
                        yield (c, r)

    def iterate_map(self, cell_type):
        for c in range(0, self.__width):
            for r in range(0, self.__length):
                if self.__map[r][c] == cell_type:
                    yield (c, r)

    def add_exit(self, pt1, pt2):
        while (pt1 != pt2):
            if (pt1[0] < 0 or
                pt1[0] >= self.__width or
                pt1[1] < 0 or
                pt1[1] >= self.__length):
                sys.exit('WARN: Exit out of range: %s' % (pt1,))
            else:
                self.__exits.append(pt1)
            # step one cell towards pt2 (sign of the difference on each axis)
            pt1 = (pt1[0] + (pt2[0] > pt1[0]) - (pt2[0] < pt1[0]),
                   pt1[1] + (pt2[1] > pt1[1]) - (pt2[1] < pt1[1]))

    def purge_exits(self):
        self.__exits = []
        for c in range(0, self.__width):
            for r in range(0, self.__length):
                if (c == 0 or c == self.__width-1 or
                    r == 0 or r == self.__length-1):
                    self.__map[r][c] = WALL

    def grow_map(self):
        self.__generation(1, 2, -1)

    def reduce_map(self):
        self.__generation(1, 7, -1)

    def gen_map(self, mode='default'):
        if mode == 'room':
            # One large cavern room
            self.__generation(4, 5, -1)
            self.__join_rooms()
            self.__generation(1, 5, -1)
        else:
            # Windy passages.
            # Repeat 4: W'(p) = R1(p) >= 5 || R2(p) <= 2
            # Repeat 3: W'(p) = R1(p) >= 5
            # We do the above, with a cave join pass right before the final
            # iteration. This helps smooth out any sharp edges after the join
            # pass.
            self.__generation(4, 5, 2)
            self.__generation(2, 5, -1)
            self.__join_rooms()
            self.__generation(1, 5, -1)

    def __generation(self, count, r1_cutoff, r2_cutoff):
        while (count > 0):
            self.__buf_map = [[WALL for i in range(self.__width)]
                              for j in range(self.__length)]
            self.__gen_walls(self.__buf_map)
            self.__gen_walls(self.__map)
            for r in range(1, self.__length-1):
                for c in range(1, self.__width-1):
                    adjcount_r1 = self.__adj_wall_count(r,c,1)
                    adjcount_r2 = self.__adj_wall_count(r,c,2)
                    if(adjcount_r1 >= r1_cutoff or
                       adjcount_r2 <= r2_cutoff):
                        self.__buf_map[r][c] = WALL
                    else:
                        self.__buf_map[r][c] = FLOOR
            self.__map = list(self.__buf_map)
            count -= 1

    def __gen_initial_map(self, fillprob):
        def rwall(fillprob):
            if (random() < fillprob):
                return WALL
            return FLOOR

        self.__map = [[rwall(fillprob) for i in range(self.__width)]
                      for j in range(self.__length)]
        self.__gen_walls(self.__map)

    def __gen_walls(self, a_map):
        for j in range(0,self.__length):
            a_map[j][0] = WALL
            a_map[j][self.__width-1] = WALL

        for j in range(0,self.__width):
            a_map[0][j] = WALL
            a_map[self.__length-1][j] = WALL

        # Force the exits to be floor. We grow them out from the edge a bit to
        # make sure they don't get sealed off. 
        for pos in self.__exits:
            a_map[pos[0]][pos[1]] = FLOOR
            for pos2 in ((-1,0), (1,0), (0,-1), (0,1),
                         (-2,0), (2,0), (0,-2), (0,2)):
                p = (pos[0]+pos2[0], pos[1]+pos2[1])
                if (p[0] < 1 or p[1] < 1):
                    continue
                if (p[0] >= self.__width-1 or
                    p[1] >= self.__length-1):
                    continue
                a_map[p[0]][p[1]] = FLOOR

    def __adj_flr_count(self,sr,sc):
        count = 0
        for pos in ((-1,0), (1,0), (0,-1), (0,1)):
            p = (sr+pos[0], sc+pos[1])
            if (p[0] < 0 or p[1] < 0):
                continue
            if (p[0] > self.__width-1 or
                p[1] > self.__length-1):
                continue
            if (self.__map[p[0]][p[1]] == FLOOR):
                count += 1
        return count

    def __adj_wall_count(self,sr,sc,rng=1):
        count = 0

        for r in range(-rng, rng+1):
            for c in range(-rng, rng+1):
                #if (r == 0 and c == 0):
                #    continue
                if (abs(r) == 2 and abs(c) == 2):
                    continue
                if (sr + r < 0 or sc + c < 0):
                    continue
                if (sr + r >= self.__length or sc + c >= self.__width):
                    continue
                if self.__map[sr + r][sc + c] == WALL:
                    count += 1

        return count

    def __join_rooms(self):
        # Divide all cells into joined sets
        for r in range(0, self.__length):
            for c in range(0, self.__width):
                if self.__map[r][c] != WALL:
                    self.__union_adj_sqr(r,c)

        all_caves = self.__ds.split_sets()

        while len(all_caves) > 1:
            self.__join_points(all_caves[choice(list(all_caves.keys()))][0])
            all_caves = self.__ds.split_sets()

    def __union_adj_sqr(self,sr,sc):
        loc = (sr,sc)
        root1 = self.__ds.find(loc)
        # A cell is connected to other cells only in cardinal directions.
        # (diagonals don't count for movement).
        for pos in ((-1,0), (1,0), (0,-1), (0,1)):
            if (sr+pos[0] < 0 or sc+pos[1] < 0):
                continue
            if (sr+pos[0] >= self.__length or
                sc+pos[1] >= self.__width):
                continue
            nloc = (sr+pos[0],sc+pos[1])
            if self.__map[nloc[0]][nloc[1]] == FLOOR:
                root2 = self.__ds.find(nloc)
                if root1 != root2:
                    self.__ds.union(root1,root2)

    def __join_points(self,pt1):
        next_pt = pt1
        while 1:
            dir = self.__get_tunnel_dir(pt1,self.__cpt)
            move = randrange(0,3)

            if move == 0:
                next_pt = (pt1[0] + dir[0],pt1[1])
            elif move == 1:
                next_pt = (pt1[0],pt1[1] + dir[1])
            else:
                next_pt = (pt1[0] + dir[0],pt1[1] + dir[1])

            root1 = self.__ds.find(next_pt)
            root2 = self.__ds.find(pt1)

            if root1 != root2:
                self.__ds.union(root1,root2)

            for pos in ((0,0), (-1,0), (1,0), (0,-1), (0,1)):
                if (next_pt[0]+pos[0] < 0 or next_pt[1]+pos[1] < 0 or
                    next_pt[0]+pos[0] >= self.__length or
                    next_pt[1]+pos[1] >= self.__width):
                    continue
                if (self.__map[next_pt[0]+pos[0]][next_pt[1]+pos[1]] == WALL):
                    self.__map[next_pt[0]+pos[0]][next_pt[1]+pos[1]] = TUNNEL

            if self.__stop_drawing(pt1,next_pt,self.__cpt):
                return

            pt1 = next_pt

    def __stop_drawing(self,pt,npt,cpt):
        if self.__ds.find(npt) == self.__ds.find(cpt):
            return 1
        if (self.__ds.find(pt) != self.__ds.find(npt) and
            self.__map[npt[0]][npt[1]] != WALL):
            return 1
        return 0

    def __get_tunnel_dir(self,pt1,pt2):
        if pt1[0] < pt2[0]:
            h_dir = +1
        elif pt1[0] > pt2[0]:
            h_dir = -1
        else:
            h_dir = 0

        if pt1[1] < pt2[1]:
            v_dir = +1
        elif pt1[1] > pt2[1]:
            v_dir = -1
        else:
            v_dir = 0

        return (h_dir,v_dir)
Example #32
    def connected_components(image: np.ndarray, mute=True) -> tuple:
        """
        This method uses the Two Pass algorithm to find all connected components inside an image.
        Connected components is defined as foreground pixels(255) that are connected in the image matrix.
        The Two Pass algorithm consists of two steps:
        1. Looping through all the matrix and put connected components in the unionfind
        2. Compress the unionfind the be more efficient.

       prerequisites:
       1. normalized
       2. binary (black or white values)

        :param mute: mute the console output
        :param image: The image for processing
        :return: Labels matrix that contains all connected components and a list of the connected components
        """
        if mute:
            print("Connected Components Labeling...", end='', flush=True)
            old_stdout = sys.stdout
            sys.stdout = open(os.devnull, 'w')

        print("\nConnected Components Labeling...")
        unionfind = DisjointSet()

        print("First Pass")
        label = 1
        labels = np.zeros_like(image).astype(np.uint32)
        for i in range(1, image.shape[0] - 1):
            for k in range(1, image.shape[1] - 1):
                pix = image[i, k]
                if pix == 255:
                    neighbours = np.unique(
                        np.array([
                            labels[i, k - 1], labels[i - 1, k - 1],
                            labels[i - 1, k], labels[i - 1, k + 1]
                        ]))
                    if neighbours.any():
                        neighbours = np.delete(neighbours,
                                               np.where(neighbours == 0))
                        labels[i, k] = neighbours[0]
                        for n in range(1, neighbours.shape[0]):
                            unionfind.union(neighbours[n], neighbours[0])

                    else:
                        labels[i, k] = label
                        unionfind.union(label, label)
                        label += 1

        print("Second Pass")
        i = 0
        for row in labels:
            j = 0
            for pix in row:
                if pix != 0:
                    root = unionfind.find(pix)
                    if root != pix:
                        labels[i, j] = root
                j += 1
            i += 1

        unique, counts = np.unique(labels, return_counts=True)
        unique_labels = dict(zip(unique, counts))
        print(
            "Connected Components Labeling Filter Was Successfully Completed!\n"
        )

        if mute:
            sys.stdout = old_stdout
            print("[DONE]")

        return unique_labels, labels
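A minimal smoke test on a tiny binary image with two white blobs, calling the method as the static function it is written as; mute=False keeps the progress output visible:

import numpy as np

img = np.zeros((6, 6), dtype=np.uint8)
img[1:3, 1:3] = 255   # first blob
img[4, 4] = 255       # second blob
unique_labels, labels = connected_components(img, mute=False)
print(unique_labels)  # pixel count per label, including background label 0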
Example #33
class POSCorpus(object):
    """Corpus for analyzing POS flexibility. After creation, corpus.sentences should consist of
  a list of sentences, each with a list of words. Example structure:
  [
    [
      {'word': "I", 'lemma': "i", 'pos': "PRON"},
      {'word': "love", 'lemma': "love", 'pos': "VERB"},
      {'word': "cats", 'lemma': "cat", 'pos': "NOUN"},
    ]
  ]
  """
    def __init__(self):
        self.lemma_merge_ds = None

    @classmethod
    def create_from_ud(cls, data_file_list):
        """Initialize corpus from a path to a file in conllu format"""
        corpus = POSCorpus()
        corpus.sentences = []

        for data_file_path in data_file_list:
            with open(data_file_path, "r", encoding="utf-8") as data_file:
                data = data_file.read()
                data = conllu.parse(data)

            for token_list in data:
                sentence = []
                for token in token_list:
                    pos = token['upostag']
                    lemma = token['lemma']
                    word = token['form']
                    # Sometimes the corpus doesn't have words, only underscores
                    if word == '_' or lemma == '_':
                        continue
                    sentence.append({'word': word, 'lemma': lemma, 'pos': pos})
                if len(sentence) > 0:
                    corpus.sentences.append(sentence)

        return corpus

    @classmethod
    def create_from_pickle(cls, data_file_path):
        """Initialize corpus from pkl file, containing lists of sentences"""
        corpus = POSCorpus()
        with open(data_file_path, 'rb') as f:
            corpus.sentences = pickle.load(f)
        return corpus

    def get_per_lemma_stats(self, flexibility_threshold=0.05):
        # Gather usages of each lemma
        # {lemma -> (POS, word)}
        lemma_forms = defaultdict(list)
        for sentence in self.sentences:
            for token in sentence:
                lemma = token['lemma']
                word = token['word']
                pos = token['pos']
                lemma_forms[lemma].append((pos, word))

        # Noun/Verb statistics for each lemma
        lemma_count_df = []
        for lemma, lemma_occurrences in lemma_forms.items():
            noun_count = len(
                [word for (pos, word) in lemma_occurrences if pos == 'NOUN'])
            verb_count = len(
                [word for (pos, word) in lemma_occurrences if pos == 'VERB'])
            lemma_count_df.append({
                'lemma': lemma,
                'noun_count': noun_count,
                'verb_count': verb_count
            })
        lemma_count_df = pd.DataFrame(lemma_count_df)

        lemma_count_df = lemma_count_df[lemma_count_df['noun_count'] +
                                        lemma_count_df['verb_count'] > 0]
        lemma_count_df['majority_tag'] = np.where(
            lemma_count_df['noun_count'] >= lemma_count_df['verb_count'],
            'NOUN', 'VERB')
        lemma_count_df['total_count'] = lemma_count_df[[
            'noun_count', 'verb_count'
        ]].sum(axis=1)
        lemma_count_df['minority_count'] = lemma_count_df[[
            'noun_count', 'verb_count'
        ]].min(axis=1)
        lemma_count_df['minority_ratio'] = lemma_count_df[
            'minority_count'] / lemma_count_df['total_count']
        lemma_count_df['is_flexible'] = lemma_count_df[
            'minority_ratio'] > flexibility_threshold

        return lemma_count_df

    # Helper iterator to return N/V words and lemmas in corpus, lowercased
    def _iterate_words(self):
        for sentence in self.sentences:
            for token in sentence:
                if token['pos'] in ['NOUN', 'VERB']:
                    yield token['word'].lower(), token['lemma'].lower(
                    ), token['pos']

    def _setup_lemma_merges(self):
        self.lemma_merge_ds = DisjointSet()
        for word, lemma, _ in self._iterate_words():
            self.lemma_merge_ds.union(word, lemma)

        # Group words that share the same lemma
        lemma_counter = Counter()
        for _, lemma, _ in self._iterate_words():
            lemma_counter[lemma] += 1

        lemma_groups = defaultdict(set)
        for word, lemma, _ in self._iterate_words():
            lemma_groups[self.lemma_merge_ds.find(word)].add(word)

        # Name of the group is the most frequent lemma in the group
        # Eg: [voyage, voyages, voyagerai, ...] should map to the same lemma
        self.merged_lemma_table = {}
        for word, lemma, _ in self._iterate_words():
            if word in self.merged_lemma_table:
                continue
            maxn, maxw = 0, None
            for w in lemma_groups[self.lemma_merge_ds.find(word)]:
                if lemma_counter[w] > maxn:
                    maxn = lemma_counter[w]
                    maxw = w
            self.merged_lemma_table[word] = maxw

    def get_lemma_stats_merge_method(self, flexibility_threshold=0.05):
        if self.lemma_merge_ds is None:
            self._setup_lemma_merges()

        # Gather usages of each lemma
        # {lemma -> (POS, word)}
        lemma_forms = defaultdict(list)
        for word, lemma, pos in self._iterate_words():
            lemma_forms[self.lemma_merge_ds.find(lemma)].append((pos, word))

        # Noun/Verb statistics for each lemma
        lemma_count_df = []
        for lemma, lemma_occurrences in lemma_forms.items():
            noun_count = len(
                [word for (pos, word) in lemma_occurrences if pos == 'NOUN'])
            verb_count = len(
                [word for (pos, word) in lemma_occurrences if pos == 'VERB'])
            lemma_count_df.append({
                'lemma': self.merged_lemma_table[lemma],
                'noun_count': noun_count,
                'verb_count': verb_count
            })
        lemma_count_df = pd.DataFrame(lemma_count_df)

        lemma_count_df = lemma_count_df[lemma_count_df['noun_count'] +
                                        lemma_count_df['verb_count'] > 0]
        lemma_count_df['majority_tag'] = np.where(
            lemma_count_df['noun_count'] >= lemma_count_df['verb_count'],
            'NOUN', 'VERB')
        lemma_count_df['total_count'] = lemma_count_df[[
            'noun_count', 'verb_count'
        ]].sum(axis=1)
        lemma_count_df['minority_count'] = lemma_count_df[[
            'noun_count', 'verb_count'
        ]].min(axis=1)
        lemma_count_df['minority_ratio'] = lemma_count_df[
            'minority_count'] / lemma_count_df['total_count']
        lemma_count_df['is_flexible'] = lemma_count_df[
            'minority_ratio'] > flexibility_threshold

        return lemma_count_df
Example #34
def compute_partial_mws_prim_segmentation(edge_weight_exp,
                                          valid_edges_exp,
                                          offsets,
                                          number_of_attractive_channels,
                                          image_shape, iterations=None):
    visited = np.zeros(edge_weight_exp.size, dtype=bool)
    node_labeling = np.zeros(image_shape).ravel()
    number_of_nodes = node_labeling.size
    number_of_attractive_edges = number_of_nodes * number_of_attractive_channels
    ndims = len(offsets[0])
    array_stride = np.empty(ndims, dtype=np.int64)
    current_stride = 1
    mutexes = {}
    for i in range(ndims-1, -1, -1):
        array_stride[i] = current_stride
        current_stride *= image_shape[i]

    offset_strides = []
    for offset in offsets:
        stride = 0
        for i in range(len(offset)):
            stride += offset[i] * array_stride[i]
        offset_strides.append(stride)

    offset_strides = np.asarray(offset_strides)
    node_ufd = DisjointSet()
    for lbl in range(number_of_nodes):
        node_ufd.find(lbl)

    # mutexes = np.ndarray(number_of_nodes)
    pq = queue.PriorityQueue()

    # start prim from top left node
    add_neighbours(0, offset_strides, number_of_nodes, edge_weight_exp, valid_edges_exp, node_ufd, visited, pq)
    itr = 0
    # iterate over all edges
    cut_edges = []
    used_mtxs = []
    while not pq.empty():
        # extract next element from the queue
        position_vector = pq.get()
        edge_id = position_vector[1]
        u = position_vector[2]
        v = position_vector[3]

        if visited[edge_id]:
            continue
        visited[edge_id] = 1
        # find the current reps and skip if identical or mtx exists
        ru = node_ufd.find(u)
        rv = node_ufd.find(v)
        if ru == rv:
            continue
        if check_mutex(ru, rv, mutexes):
            if edge_id < number_of_attractive_edges:
                # this edge is attractive and the neighbour has a different class
                cut_edges.append(edge_id)
            continue

        # check whether this edge is mutex via the edge offset
        if edge_id >= number_of_attractive_edges:
            used_mtxs.append(edge_id)
            insert_mutex(ru, rv, edge_id, mutexes)
        else:
            node_ufd.union(u,v)
            if node_ufd.find(ru) == rv:
                rv, ru = ru, rv
            merge_mutexes(rv, ru, mutexes)

        # add the next node to pq
        add_neighbours(v, offset_strides, number_of_nodes, edge_weight_exp, valid_edges_exp, node_ufd, visited, pq)
        itr += 1
        if iterations is not None:
            if itr > iterations:
                break

    # recover essential edges and neighbors
    class CutFeatures:
        def __init__(self, cut_edges, mutexes):
            self.cut_edges = cut_edges
            self.mutexes = mutexes
    neighbors_features = {}
    for e_id in cut_edges:
        n1 = e_id % number_of_nodes
        n2 = n1 + offset_strides[e_id//number_of_nodes]
        r1, r2 = node_ufd.find(n1), node_ufd.find(n2)
        sm = min(r1, r2)
        bg = max(r1, r2)
        if (sm, bg) in neighbors_features:
            neighbors_features[(sm, bg)].cut_edges += [e_id]
        else:
            neighbors_features[(sm, bg)] = CutFeatures([e_id],
                                                        get_common_mtxs(node_ufd.find(sm), node_ufd.find(bg), mutexes))

    # create node labeling from disjoint sets
    # 0's indicate no labeling
    for idx, cc in enumerate(node_ufd.itersets()):
        for node in cc:
            node_labeling[node] = idx+1

    return node_labeling, cut_edges, used_mtxs, neighbors_features
Example #35
    def query_by_tags(self, query, group_adjacent=True):
        if len(query) == 0:
            return None

        res = []

        db, cl = self.connect_db()
        guard_col = db[GUARD_COL]
        iterator = guard_col.find(query)

        for doc in iterator:
            res.append(doc)

        # for the same collector_id and collection_seq
        # group adjacently connected rows
        if group_adjacent is True:
            frame_lookup = {}

            for i in range(len(res)):
                hash_key = hash(res[i]["collector_id"] +
                                res[i]["collection_seq"] + res[i]["frame"])
                frame_lookup[hash_key] = i

            ds = DisjointSet()

            # for each unique frame, union it with its start and end frame
            for frame_key, idx in frame_lookup.items():
                # union with the start frame (only when it exists in the lookup)
                start_key = hash(res[idx]["collector_id"] +
                                 res[idx]["collection_seq"] +
                                 res[idx]["start_frame"])

                if start_key in frame_lookup and ds.find(frame_key) != ds.find(
                        start_key):
                    ds.union(frame_key, start_key)

                # union with the end frame (only when it exists in the lookup)
                end_key = hash(res[idx]["collector_id"] +
                               res[idx]["collection_seq"] +
                               res[idx]["end_frame"])

                if end_key in frame_lookup and ds.find(frame_key) != ds.find(
                        end_key):
                    ds.union(frame_key, end_key)

            grouped_res = []

            for group in list(ds.itersets()):
                scenario = []

                for hash_key in group:
                    # will only group frames in detected set only
                    if hash_key in frame_lookup:
                        scenario.append(res[frame_lookup[hash_key]])

                scenario = sorted(scenario, key=lambda i: i['frame'])

                grouped_res.append(scenario)

            cl.close()
            return grouped_res

        return res