Пример #1
0
        def assemble_table_path_provenance(o_drs, paths, relation):
            """
            Record the provenance of every table-level join path in ``paths``
            on ``o_drs`` and return the accumulated DRS.

            Each path is a sequence of ``(hit, sibling)`` pairs. The first pair
            is the origin of the path (its sibling must be None) and the last
            pair is the target of the path.

            :param o_drs: the DRS that accumulates provenance and results
            :param paths: iterable of paths, each a sequence of (hit, sibling) pairs
            :param relation: not used by this function
            :return: the accumulated DRS (``o_drs`` itself when ``paths`` is empty)
            """

            for path in paths:
                src, src_sibling = path[0]
                assert (src_sibling is None)  # sibling of source should be None, as source is an origin
                tgt, tgt_sibling = path[-1]
                # The source of the path is registered as an ORIGIN node
                origin = DRS([src], Operation(OP.ORIGIN))
                o_drs.absorb_provenance(origin)
                prev_c = src
                # Intermediate hops: everything between the origin and the target
                for c, sibling in path[1:-1]:
                    # the sibling is recorded as reached from the previous hit via PKFK
                    nxt = DRS([sibling], Operation(OP.PKFK, params=[prev_c]))
                    o_drs.absorb_provenance(nxt)
                    if c.nid != sibling.nid:  # avoid loop on head nodes of the graph
                        # c is recorded as derived from its sibling via a TABLE operation
                        linker = DRS([c], Operation(OP.TABLE, params=[sibling]))
                        o_drs.absorb_provenance(linker)
                    prev_c = c
                # Last hop: the target's sibling is reached from the last visited hit
                sink = DRS([tgt_sibling], Operation(OP.PKFK, params=[prev_c]))

                #The join path at the target has None sibling
                if tgt is not None and tgt_sibling is not None and tgt.nid != tgt_sibling.nid:
                    o_drs = o_drs.absorb_provenance(sink)
                    linker = DRS([tgt], Operation(OP.TABLE, params=[tgt_sibling]))
                    # NOTE(review): the return value of absorb is discarded here, unlike
                    # the else branch -- presumably absorb updates o_drs in place; confirm
                    o_drs.absorb(linker)
                else:
                    o_drs = o_drs.absorb(sink)
            return o_drs
Пример #2
0
    def __traverse(self, a: DRS, primitive, max_hops=2) -> DRS:
        """
        Breadth-first expansion over the network, following ``primitive``
        from the elements of an initial DRS.

        :param a: a nid, node, tuple, or DRS used as the starting point
        :param primitive: the element to search on every hop
        :param max_hops: maximum number of rounds on the graph
        :return: a DRS with the union of the neighbors found in all rounds
        :raises ValueError: if the converted input is in table mode
        """
        start = self._general_to_drs(a)
        result = DRS([], Operation(OP.NONE))

        if start.mode == DRSMode.TABLE:
            raise ValueError(
                'input mode DRSMode.TABLE not supported')

        result.absorb_provenance(start)
        frontier = start
        hops_left = max_hops
        while hops_left > 0:
            hops_left -= 1
            for hit in frontier:
                neighbors = self._network.neighbors_id(hit, primitive)
                result = self.union(result, neighbors)
            # the next round expands from everything collected so far
            frontier = result
        return result
Пример #3
0
    def __neighbor_search(self, input_data, relation: Relation):
        """
        Find the neighbors of the given input that are connected to it
        through ``relation``.

        :param input_data: an nid, node tuple, Hit, or DRS
        :param relation: the relation the neighbors must satisfy
        :return: a DRS carrying the provenance of the input and the neighbors
        """
        # normalize whatever was passed in to a DRS
        source_drs = self._general_to_drs(input_data)

        # the output starts empty and inherits the provenance of the input
        result = DRS([], Operation(OP.NONE)).absorb_provenance(source_drs)

        if source_drs.mode == DRSMode.TABLE:
            # NOTE(review): the returned value is not captured -- presumably the
            # call expands source_drs in place; confirm
            self._general_to_field_drs(source_drs)

        if relation.from_metadata():
            # metadata relations are resolved through the metadata search
            md_relation = self._relation_to_mdrelation(relation)
            for hit in source_drs:
                md_hits = self.md_search(hit, md_relation)
                found = self._network.md_neighbors_id(hit, md_hits, relation)
                result = result.absorb(found)
            return result

        for hit in source_drs:
            found = self._network.neighbors_id(hit, relation)
            result = result.absorb(found)
        return result
Пример #4
0
 def table(self, drs: DRS) -> DRS:
     """
     Switch the given DRS to the table view.

     :param drs: the DRS to reconfigure; ``set_table_mode`` is called on it
     :return: the very same DRS object that was passed in
     """
     configured = drs
     configured.set_table_mode()
     return configured
Пример #5
0
 def drs_expand_to_table(self, drs: DRS) -> DRS:
     """
     For every hit of the input, fetch all the hits of the table named by
     the hit's ``source_name`` and absorb them into a fresh DRS.

     :param drs: the DRS whose hits are expanded
     :return: the DRS that the per-table results were absorbed into
     """
     expanded = DRS([], Operation(OP.NONE))
     for hit in drs:
         table_hits = self.__network.get_hits_from_table(hit.source_name)
         # each table's hits are linked back to the hit that triggered them
         table_drs = DRS(list(table_hits), Operation(OP.TABLE, params=[hit]))
         expanded.absorb(table_drs)
     return expanded
Пример #6
0
 def fields(self, drs: DRS) -> DRS:
     """
     Switch the given DRS to the field view (the default view).

     :param drs: the DRS to reconfigure; ``set_fields_mode`` is called on it
     :return: the very same DRS object that was passed in
     """
     configured = drs
     configured.set_fields_mode()
     return configured
Пример #7
0
 def table_names_search(self, kws: [str], max_results=10) -> DRS:
     """
     Run ``table_name_search`` for each keyword and merge all the matches
     into one DRS.

     :param kws: collection (iterable) of keywords (strings)
     :param max_results: forwarded to every individual search
     :return: a DRS with the absorbed results of all the searches
     """
     matches = DRS([], Operation(OP.NONE))
     for keyword in kws:
         matches = matches.absorb(
             self.table_name_search(keyword, max_results=max_results))
     return matches
Пример #8
0
 def assemble_field_path_provenance(o_drs, path, relation):
     """
     Record a field-level path on ``o_drs``: the first element as ORIGIN and
     every following element as reached from its predecessor through PKFK.

     :param o_drs: the DRS that accumulates the provenance
     :param path: non-empty sequence of hits, from source to target
     :param relation: not used by this function
     :return: the DRS returned by absorbing the target of the path
     """
     first = path[0]
     last = path[-1]
     o_drs.absorb_provenance(DRS([first], Operation(OP.ORIGIN)))

     # intermediate hops only; origin and target are handled separately
     middle = path[1:-1]
     for previous, current in zip(path, middle):
         hop = DRS([current], Operation(OP.PKFK, params=[previous]))
         o_drs.absorb_provenance(hop)

     # the target hangs from the last intermediate hop, or from the origin
     # when the path has no intermediate hops
     before_last = middle[-1] if middle else first
     target = DRS([last], Operation(OP.PKFK, params=[before_last]))
     return o_drs.absorb(target)
Пример #9
0
    def test_drs_table_iteration(self):
        """Smoke test: a DRS switched to table mode can be iterated and printed."""
        print(self._testMethodName)

        # (nid, table, field) of the four hits, two per table
        rows = [
            (0, "table_a", "a"),
            (1, "table_a", "b"),
            (2, "table_b", "c"),
            (3, "table_b", "d"),
        ]
        hits = [Hit(nid, "dba", table, field, -1) for nid, table, field in rows]
        drs = DRS(hits, Operation(OP.ORIGIN))
        drs.set_table_mode()

        for element in drs:
            print(str(element))

        # nothing to verify beyond "iteration did not raise"
        self.assertTrue(True)
Пример #10
0
 def traverse(self, a: DRS, primitives, max_hops) -> DRS:
     """
     Expand the input DRS for up to ``max_hops`` rounds, collecting the
     neighbors reachable through ``primitives``.

     :param a: the starting DRS; table mode is not supported
     :param primitives: forwarded to the network's ``neighbors_id``
     :param max_hops: maximum number of expansion rounds
     :return: a DRS with the union of the neighbors found; an empty list
         (after printing an error) when the input is in table mode
     """
     result = DRS([], Operation(OP.NONE))
     if a.mode == DRSMode.TABLE:
         print("ERROR: input mode TABLE not supported")
         # NOTE(review): a plain list is returned here despite the DRS annotation
         return []

     frontier = [hit for hit in a]
     result.absorb_provenance(a)
     remaining = max_hops
     while remaining > 0:
         remaining -= 1
         for hit in frontier:
             neighbors = self.__network.neighbors_id(hit, primitives)
             result = self.union(result, neighbors)
         # the next round starts from a snapshot of everything found so far
         frontier = [hit for hit in result]
     return result
Пример #11
0
    def make_drs(self, general_input):
        """
        Build a single DRS out of ``general_input``.

        ``general_input`` is either one value accepted by ``_general_to_drs``
        or a list of such values (strings, Hits, DRS's, etc.). A list is
        converted element by element and the resulting DRS's are merged with
        ``union``.

        If the conversion fails, a usage message is printed and None is
        returned. Only ordinary errors are handled that way; interpreter-level
        signals such as KeyboardInterrupt or SystemExit propagate to the caller.

        :param general_input: a single input or a list of inputs
        :return: the resulting DRS, or None when the input could not be converted
        """
        try:
            # If this is a list of inputs, condense it into a single drs
            if isinstance(general_input, list):
                converted = [
                    self._general_to_drs(x) for x in general_input]

                combined_drs = DRS([], Operation(OP.NONE))
                for drs in converted:
                    combined_drs = self.union(combined_drs, drs)
                general_input = combined_drs

            # else, just convert it to a DRS
            return self._general_to_drs(general_input)
        except Exception:
            # Best-effort interactive helper: report how to call it instead of
            # surfacing the conversion error
            msg = (
                '--- Error ---' +
                '\nThis function returns domain result set from the ' +
                'supplied input' +
                '\nusage:\n\tmake_drs( table name/hit id | [table name/hit ' +
                'id, drs/hit/string/int] )' +
                '\ne.g.:\n\tmake_drs(1600820766)')
            print(msg)
            return None
Пример #12
0
        def get_table_neighbors(hit, relation, paths):
            """
            Return the fields of the tables neighboring ``hit`` through
            ``relation``, skipping neighbors for which ``check_membership``
            reports they are already in ``paths``.

            :param hit: the hit whose neighbors are explored
            :param relation: the relation used to find direct neighbors
            :param paths: the paths built so far, used for the membership check
            :return: list of ``(field, neighbor)`` tuples, where ``neighbor`` is
                the direct neighbor whose table the field was taken from
            """
            # Keep only the neighbors that live in a table other than the input's
            foreign_hits = []
            for candidate in self.neighbors_id(hit, relation):
                if candidate.source_name != hit.source_name:
                    foreign_hits.append(candidate)

            # Rewrite provenance: the surviving neighbors derive from the input hit
            foreign_drs = DRS(
                foreign_hits,
                Operation(self.get_op_from_relation(relation), params=[hit]))

            # FIXME: filter out already seen nodes here
            pairs = []
            for neighbor in foreign_drs:
                if check_membership(neighbor, paths):
                    continue
                # api.make_drs(neighbor) is not used here: it would take only the
                # input hit, not all of its table neighbors
                if lean_search:
                    table_fields = api._drs_from_table_hit_lean_no_provenance(
                        neighbor)
                else:
                    table_fields = api.drs_from_table_hit(neighbor)  # old API
                pairs.extend((field, neighbor) for field in table_fields)
            return pairs
Пример #13
0
    def _general_to_drs(self, general_input) -> DRS:
        """
        Convert an nid, table name, node tuple, Hit, DRS or None into a DRS.

        Accepted inputs:
          * DRS: returned as is
          * None: converted to an empty DRS
          * int, or string representing an int: treated as an nid
          * any other string: treated as a table name
          * tuple that is not a Hit: treated as a node
            (db_name, source_name, field_name)
          * Hit: converted to a DRS; a Hit with an empty or missing field name
            is converted in table mode

        :param general_input: the value to convert
        :return: DRS
        :raises ValueError: if the input is of none of the supported kinds
        """
        # test for DRS initially for speed
        if isinstance(general_input, DRS):
            return general_input

        if general_input is None:
            return DRS(data=[], operation=Operation(OP.NONE))

        # Test for ints or strings that represent integers
        if self._represents_int(general_input):
            general_input = self._nid_to_hit(general_input)

        # Test for strings that represent tables
        if isinstance(general_input, str):
            hits = self._network.get_hits_from_table(general_input)
            general_input = DRS([x for x in hits], Operation(OP.ORIGIN))

        # Test for tuples that are not Hits
        if (isinstance(general_input, tuple) and
                not isinstance(general_input, Hit)):
            general_input = self._node_to_hit(general_input)

        # Test for Hits
        if isinstance(general_input, Hit):
            field = general_input.field_name
            # Compare by value: identity checks against a string literal depend
            # on interning and are not reliable
            if field is None or field == '':
                # If the Hit's field is not defined, it is in table mode
                # and all Hits from the table need to be found
                general_input = self._hit_to_drs(
                    general_input, table_mode=True)
            else:
                general_input = self._hit_to_drs(general_input)

        if isinstance(general_input, DRS):
            return general_input

        raise ValueError(
            'Input is not None, an integer, field tuple, Hit, or DRS')
Пример #14
0
 def drs_from_table(self, source: str) -> DRS:
     """
     Fetch the hits the network returns for the table ``source`` and wrap
     them in a DRS marked as ORIGIN.

     :param source: string with the name of the table
     :return: a DRS holding the hits of the table
     """
     table_hits = self.__network.get_hits_from_table(source)
     return DRS(list(table_hits), Operation(OP.ORIGIN))
Пример #15
0
 def difference(self, a: DRS, b: DRS) -> DRS:
     """
     Returns the set difference of a and b, computed as ``a.set_difference(b)``

     Both inputs must be in the same mode (fields or table).

     :param a: the DRS to take elements from
     :param b: the DRS passed to ``a.set_difference``
     :return: the DRS returned by ``a.set_difference(b)``
     """
     assert a.mode == b.mode, "Input parameters are not in the same mode (fields, table)"
     return a.set_difference(b)
Пример #16
0
 def intersection(self, a: DRS, b: DRS) -> DRS:
     """
     Intersect two DRS objects that are in the same mode.

     :param a: the DRS whose ``intersection`` method is invoked
     :param b: the DRS passed to it
     :return: the DRS returned by ``a.intersection(b)``
     """
     same_mode = a.mode == b.mode
     assert same_mode, "Input parameters are not in the same mode (fields, table)"
     return a.intersection(b)
Пример #17
0
    def test_absorb(self):
        """
        Absorbing one DRS into another must yield exactly the union of the
        elements of both; the provenance graph is printed for inspection.
        """
        print(self._testMethodName)

        # First DRS: four fields content-similar to table_c.v
        first_origin = Hit(10, "dba", "table_c", "v", -1)
        first = DRS(
            [
                Hit(0, "dba", "table_a", "a", -1),
                Hit(1, "dba", "table_a", "b", -1),
                Hit(2, "dba", "table_b", "c", -1),
                Hit(3, "dba", "table_b", "d", -1),
            ],
            Operation(OP.CONTENT_SIM, params=[first_origin]))

        # Second DRS: two fields schema-similar to table_a.b
        second_origin = Hit(1, "dba", "table_a", "b", -1)
        second = DRS(
            [
                Hit(16, "dba", "table_d", "a", -1),
                Hit(17, "dba", "table_d", "b", -1),
            ],
            Operation(OP.SCHEMA_SIM, params=[second_origin]))

        merged = first.absorb(second)

        graph = merged.get_provenance().prov_graph()
        graph_nodes = graph.nodes()
        print("NODES")
        for node in graph_nodes:
            print(str(node))
        print(" ")
        graph_edges = graph.edges(keys=True)
        print("EDGES")
        for edge in graph_edges:
            print(str(edge))
        print(" ")

        first_items = {hit for hit in first}
        second_items = {hit for hit in second}
        merged_items = {hit for hit in merged}

        missing = len(first_items | second_items) - len(merged_items)

        print("Len must be 0: " + str(missing))

        self.assertTrue(missing == 0)
Пример #18
0
 def keyword_search(self, kw: str, max_results=10) -> DRS:
     """
     Keyword search over the content of the data.

     :param kw: the keyword to search
     :param max_results: the maximum number of results to return
     :return: a DRS with the hits, tagged with a KW_LOOKUP operation on ``kw``
     """
     found = store_client.search_keywords(kw, KWType.KW_TEXT, max_results)
     materialized = list(found)  # the store hands back a generator
     return DRS(materialized, Operation(OP.KW_LOOKUP, params=[kw]))
Пример #19
0
 def schema_name_search(self, kw: str, max_results=10) -> DRS:
     """
     Keyword search over the attribute/field names of the data.

     :param kw: the keyword to search
     :param max_results: the maximum number of results to return
     :return: a DRS with the hits, tagged with a SCHNAME_LOOKUP operation
         on ``kw``
     """
     found = store_client.search_keywords(kw, KWType.KW_SCHEMA, max_results)
     materialized = list(found)  # the store hands back a generator
     return DRS(materialized, Operation(OP.SCHNAME_LOOKUP, params=[kw]))
Пример #20
0
 def schema_neighbors_of(self, i_drs: DRS) -> DRS:
     """
     Collect, for every hit of the input DRS, all the hits of the table named
     by the hit's ``source_name``.

     :param i_drs: the input DRS; when it is in table mode, this call switches
         it to fields mode
     :return: a DRS carrying the provenance of the input plus, per input hit,
         the hits of its table linked through an OP.TABLE operation
     """
     o_drs = DRS([], Operation(OP.NONE))
     o_drs = o_drs.absorb_provenance(i_drs)
     if i_drs.mode == DRSMode.TABLE:
         # Table-mode input: switch to fields mode and absorb the result of
         # drs_from_table_hit for every hit
         i_drs.set_fields_mode()
         for h in i_drs:
             fields_table = self.drs_from_table_hit(h)
             # NOTE(review): i_drs is rebound while it is being iterated --
             # presumably absorb returns the same object; confirm
             i_drs = i_drs.absorb(fields_table)
     for h in i_drs:
         hits = self.__network.get_hits_from_table(h.source_name)
         # the hits of the table are recorded as derived from h
         hits_drs = DRS([x for x in hits], Operation(OP.TABLE, params=[h]))
         o_drs = o_drs.absorb(hits_drs)
     return o_drs
Пример #21
0
    def exact_search(self, kw: str, kw_type: KWType, max_results=10):
        """
        Exact-match counterpart of 'search': only exact matches are returned.

        :param kw: the keyword to search
        :param kw_type: passed to the store as the field to search on
        :param max_results: the maximum number of hits requested
        :return: a DRS with the hits, tagged with a KW_LOOKUP operation on ``kw``
        """
        matches = self._store_client.exact_search_keywords(
            keywords=kw, elasticfieldname=kw_type, max_hits=max_results)
        materialized = list(matches)  # consume the store's result up front
        return DRS(materialized, Operation(OP.KW_LOOKUP, params=[kw]))
Пример #22
0
 def schema_neighbors(self, field: (str, str, str)) -> DRS:
     """
     Return the attributes/fields that appear in the same relation as the
     provided field.

     :param field: the provided field as (db_name, source_name, field_name)
     :return: a DRS with the hits of the field's table, tagged with a TABLE
         operation whose parameter is a Hit built for the provided field
     """
     db_name, source_name, field_name = field
     table_hits = self.__network.get_hits_from_table(source_name)
     origin = Hit(
         id_from(db_name, source_name, field_name),
         db_name,
         source_name,
         field_name,
         0)
     return DRS(list(table_hits), Operation(OP.TABLE, params=[origin]))
Пример #23
0
 def entity_search(self, kw: str, max_results=10) -> DRS:
     """
     Keyword search over the entities represented by the data.

     :param kw: the keyword to search
     :param max_results: the maximum number of results to return
     :return: a DRS with the hits, tagged with an ENTITY_LOOKUP operation
         on ``kw``
     """
     found = store_client.search_keywords(
         kw, KWType.KW_ENTITIES, max_results)
     materialized = list(found)  # the store hands back a generator
     return DRS(materialized, Operation(OP.ENTITY_LOOKUP, params=[kw]))
Пример #24
0
 def paths_between(self, a: DRS, b: DRS, primitives, max_hops=2) -> DRS:
     """
     Look for transitive relationships between the elements of ``a`` and the
     elements of ``b``, constrained to the given primitive (singular for now).

     Both inputs must be in the same mode. In fields mode paths are searched
     between hits, in table mode between tables.

     :param a: DRS with the source elements
     :param b: DRS with the target elements
     :param primitives: the primitive the paths must follow
     :param max_hops: forwarded to the network's path search
     :return: a DRS with the provenance of both inputs and the paths found;
         the search stops at the first pair whose elements compare equal and
         returns what was collected up to that point
     """
     assert(a.mode == b.mode)
     collected = DRS([], Operation(OP.NONE))
     collected.absorb_provenance(a)
     collected.absorb_provenance(b)

     # choose the path finder that matches the mode of the inputs
     if a.mode == DRSMode.FIELDS:
         def find_path(source, target):
             return self.__network.find_path_hit(
                 source, target, primitives, max_hops=max_hops)
     elif a.mode == DRSMode.TABLE:
         def find_path(source, target):
             return self.__network.find_path_table(
                 source, target, primitives, self, max_hops=max_hops)
     else:
         return collected

     for source in a:
         for target in b:
             if source == target:
                 # same source and target: stop and return what was gathered
                 return collected
             collected = collected.absorb(find_path(source, target))
     return collected
Пример #25
0
    def union(self, a: DRS, b: DRS) -> DRS:
        """
        Union of two inputs, each converted to a DRS first.

        :param a: first input, converted with ``_general_to_drs``
        :param b: second input, converted with ``_general_to_drs``
        :return: the DRS returned by the ``union`` of the two converted inputs
        """
        left = self._general_to_drs(a)
        right = self._general_to_drs(b)
        # both sides must be in the same mode before they are merged
        self._assert_same_mode(left, right)
        return left.union(right)
Пример #26
0
    def test_sdifference(self):
        """
        The set difference of a five-element DRS and a DRS sharing one of its
        elements must hold four elements; the provenance graph is printed.
        """
        print(self._testMethodName)

        # First DRS: five origin hits
        first = DRS(
            [
                Hit(10, "dba", "table_c", "v", -1),
                Hit(0, "dba", "table_a", "a", -1),
                Hit(1, "dba", "table_a", "b", -1),
                Hit(2, "dba", "table_b", "c", -1),
                Hit(3, "dba", "table_b", "d", -1),
            ],
            Operation(OP.ORIGIN))

        # Second DRS: table_a.b (also in the first DRS) plus two new hits
        second = DRS(
            [
                Hit(1, "dba", "table_a", "b", -1),
                Hit(16, "dba", "table_d", "a", -1),
                Hit(17, "dba", "table_d", "b", -1),
            ],
            Operation(OP.ORIGIN))

        remaining = first.set_difference(second)

        graph = remaining.get_provenance().prov_graph()
        graph_nodes = graph.nodes()
        print("NODES")
        for node in graph_nodes:
            print(str(node))
        print(" ")
        graph_edges = graph.edges(keys=True)
        print("EDGES")
        for edge in graph_edges:
            print(str(edge))
        print(" ")

        count = len([hit for hit in remaining])

        print("Len must be 4: " + str(count))

        self.assertTrue(count == 4)
Пример #27
0
 def paths(self, a: DRS, primitives) -> DRS:
     """
     Look for transitive relationships between any two elements of ``a``,
     constrained to the given primitive (singular for now).

     In fields mode, pairs whose elements compare equal are skipped; in table
     mode every pair is searched.

     :param a: the DRS whose elements are paired with each other
     :param primitives: the primitive the paths must follow
     :return: a DRS with the provenance of the input and the paths found
     """
     collected = DRS([], Operation(OP.NONE)).absorb_provenance(a)

     if a.mode == DRSMode.FIELDS:
         for source in a:  # source is a Hit
             for target in a:  # target is a Hit
                 if source == target:
                     continue
                 found = self.__network.find_path_hit(
                     source, target, primitives)
                 collected = collected.absorb(found)
         return collected

     if a.mode == DRSMode.TABLE:
         for source in a:  # source is a table: str
             for target in a:  # target is a table: str
                 found = self.__network.find_path_table(
                     source, target, primitives, self)
                 collected = collected.absorb(found)
     return collected
Пример #28
0
 def md_neighbors_id(self, hit: Hit, md_neighbors: MRS, relation: Relation) -> DRS:
     """
     Build a DRS with the neighbors of ``hit`` described by the metadata
     results in ``md_neighbors``.

     For each metadata result, the neighbor is its ``target``, or its
     ``source`` when the target is the input itself.

     :param hit: the input, either a Hit or a string with its nid
     :param md_neighbors: iterable of metadata results exposing ``source`` and
         ``target``
     :param relation: the relation that determines the provenance operation
     :return: a DRS with one Hit per metadata result, whose operation has the
         input ``hit`` as parameter
     :raises TypeError: if ``hit`` is neither a Hit nor a str
     """
     if isinstance(hit, Hit):
         nid = str(hit.nid)
     elif isinstance(hit, str):
         nid = hit
     else:
         raise TypeError("hit must be a Hit or a str holding its nid")

     data = []
     score = 1.0 # TODO: return more meaningful score results
     # The loop variable must not reuse the name `hit`: the input hit is
     # needed afterwards as the provenance parameter
     for md_hit in md_neighbors:
         k = md_hit.target if md_hit.target != nid else md_hit.source
         (db_name, source_name, field_name, _data_type) = self.__id_names[k]
         data.append(Hit(k, db_name, source_name, field_name, score))
     op = self.get_op_from_relation(relation)
     return DRS(data, Operation(op, params=[hit]))
Пример #29
0
    def test_creation_initial_provenance(self):
        """
        Smoke test: a freshly created DRS exposes a provenance graph whose
        nodes and edges can be listed and printed.
        """
        print(self._testMethodName)

        origin = Hit(10, "dba", "table_c", "v", -1)
        similar = [
            Hit(0, "dba", "table_a", "a", -1),
            Hit(1, "dba", "table_a", "b", -1),
            Hit(2, "dba", "table_b", "c", -1),
            Hit(3, "dba", "table_b", "d", -1),
        ]
        drs = DRS(similar, Operation(OP.CONTENT_SIM, params=[origin]))

        graph = drs.get_provenance().prov_graph()
        graph_nodes = graph.nodes()
        print("NODES")
        for node in graph_nodes:
            print(str(node))
        print(" ")
        graph_edges = graph.edges(keys=True)
        print("EDGES")
        for edge in graph_edges:
            print(str(edge))
        print(" ")

        # nothing to verify beyond "building and printing did not raise"
        self.assertTrue(True)
Пример #30
0
        def get_table_neighbors(hit, relation, paths):
            """
            Return the fields of the tables neighboring ``hit`` through
            ``relation``, skipping neighbors for which ``check_membership``
            reports they are already in ``paths``.

            :param hit: the hit whose neighbors are explored
            :param relation: the relation used to find direct neighbors
            :param paths: the paths built so far, used for the membership check
            :return: list of ``(field, neighbor)`` tuples, where ``neighbor`` is
                the direct neighbor whose table the field was taken from
            """
            # Drop the neighbors that share the input's table
            other_table_hits = []
            for candidate in self.neighbors_id(hit, relation):
                if candidate.source_name == hit.source_name:
                    continue
                other_table_hits.append(candidate)

            # Rewrite provenance: the surviving neighbors derive from the input hit
            neighbor_op = Operation(
                self.get_op_from_relation(relation), params=[hit])
            neighbors = DRS(other_table_hits, neighbor_op)

            # FIXME: filter out already seen nodes here
            found = []
            for neighbor in neighbors:
                if check_membership(neighbor, paths):
                    continue
                table_fields = api.drs_from_table_hit(neighbor)
                found.extend((field, neighbor) for field in table_fields)
            return found