Exemplo n.º 1
0
    def test_drs_table_iteration(self):
        """Smoke test: iterate a DRS in table mode and print each element.

        Builds a DRS from four hits spread over two tables, switches it to
        table mode and prints whatever the iteration yields. The closing
        assertion is unconditional, so the test only fails if something
        raises along the way.
        """
        print(self._testMethodName)

        hits = [
            Hit(0, "dba", "table_a", "a", -1),
            Hit(1, "dba", "table_a", "b", -1),
            Hit(2, "dba", "table_b", "c", -1),
            Hit(3, "dba", "table_b", "d", -1),
        ]
        table_drs = DRS(hits, Operation(OP.ORIGIN))
        table_drs.set_table_mode()

        for element in table_drs:
            print(str(element))

        self.assertTrue(True)
Exemplo n.º 2
0
 def fuzzy_keyword_match(self, keywords, max_hits=15):
     """
     Runs a fuzzy ``match`` query (fuzziness "AUTO") on the "text" field of the
     "text" index and yields one Hit per returned document.
     :param keywords: the keywords to match, used as the match query string
     :param max_hits: maximum number of returned objects
     :return: a generator of Hit objects built from the returned documents;
         it yields nothing when the response carries no hits
     """
     filter_path = ['hits.hits._source.id',
                    'hits.hits._score',
                    'hits.total',
                    'hits.hits._source.dbName',
                    'hits.hits._source.sourceName',
                    'hits.hits._source.columnName']
     index = "text"
     query_body = {
         "from": 0, "size": max_hits,
         "query": {
             "match": {
                 "text": {
                     "query": keywords,
                     "fuzziness": "AUTO"
                 }
             }
         }
     }
     res = client.search(index=index, body=query_body,
                         filter_path=filter_path)
     # Response filtering can leave 'hits.hits' out entirely when nothing
     # matched, and 'hits.total' is an object rather than an int on
     # Elasticsearch 7+, so do not rely on `total == 0`: read the list
     # defensively instead. This is a generator, so "no results" simply
     # means yielding nothing.
     for el in res.get('hits', {}).get('hits', []):
         source = el['_source']
         yield Hit(source['id'], source['dbName'], source['sourceName'],
                   source['columnName'], el['_score'])
Exemplo n.º 3
0
 def get_hits_from_table(self, table) -> [Hit]:
     """
     Builds a Hit (with score 0) for every field of the given table.
     :param table: the source/table passed to get_fields_of_source
     :return: list of Hit, one per info tuple returned by get_info_for
     """
     field_ids = self.get_fields_of_source(table)
     table_hits = []
     for nid, db_name, s_name, f_name in self.get_info_for(field_ids):
         table_hits.append(Hit(nid, db_name, s_name, f_name, 0))
     return table_hits
Exemplo n.º 4
0
 def enumerate_relation(self, relation):
     """
     Yields a "<hit> - <neighbor>" string for every node and each of its
     neighbors under the given relation.
     :param relation: the relation forwarded to neighbors_id
     :return: a generator of strings
     """
     for node_id in self.iterate_ids():
         # The fourth element (data type) is not needed to build the Hit
         db, source, column, _ = self.__id_names[node_id]
         origin = Hit(node_id, db, source, column, 0)
         for neighbor in self.neighbors_id(origin, relation):
             yield " - ".join((str(origin), str(neighbor)))
Exemplo n.º 5
0
 def drs_from_raw_field(self, field: (str, str, str)) -> DRS:
     """
     Turns a raw field description into a DRS.
     :param field: a (db_name, source_name, field_name) tuple
     :return: the result of drs_from_hit for a Hit built from that tuple,
         with the id computed by id_from and score 0
     """
     db_name, source_name, field_name = field
     raw_hit = Hit(id_from(db_name, source_name, field_name),
                   db_name, source_name, field_name, 0)
     return self.drs_from_hit(raw_hit)
Exemplo n.º 6
0
 def _node_to_hit(self, node: (str, str, str)) -> Hit:
     """
     Converts a node tuple into a Hit.
     :param node: a (db_name, source_name, field_name) tuple
     :return: a Hit whose id is computed by id_from and whose score is 0
     """
     db_name, source_name, field_name = node
     return Hit(id_from(db_name, source_name, field_name),
                db_name, source_name, field_name, 0)
Exemplo n.º 7
0
 def _nid_to_hit(self, nid: int) -> Hit:
     """
     Looks a node id up in the network and wraps the result in a Hit.
     :param nid: int or string; it is converted to a string before the lookup
     :return: a Hit built from the first info tuple returned for the id,
         with score 0.0
     """
     info = self._network.get_info_for([str(nid)])
     node_id, db, source, field = info[0]
     return Hit(node_id, db, source, field, 0.0)
Exemplo n.º 8
0
 def enumerate_relation(self, relation):
     """
     Yields a "<hit> - <neighbor>" string for each node/neighbor pair under
     the given relation, skipping a pair when its reversed form was already
     emitted.
     :param relation: the relation forwarded to neighbors_id
     :return: a generator of strings
     """
     emitted = set()
     for node_id in self.iterate_ids():
         # The fourth element (data type) is not needed to build the Hit
         db, source, column, _ = self.__id_names[node_id]
         origin = Hit(node_id, db, source, column, 0)
         for neighbor in self.neighbors_id(origin, relation):
             # (neighbor, node) already recorded means this edge was
             # reported from the other endpoint
             if (neighbor.nid, node_id) in emitted:
                 continue
             emitted.add((node_id, neighbor.nid))
             yield " - ".join((str(origin), str(neighbor)))
Exemplo n.º 9
0
 def schema_neighbors(self, field: (str, str, str)) -> DRS:
     """
     Returns the fields of the table that the provided field belongs to.
     :param field: a (db_name, source_name, field_name) tuple
     :return: a DRS holding the hits of the field's table, with an OP.TABLE
         operation whose parameter is the Hit of the provided field
     """
     db_name, source_name, field_name = field
     table_hits = self.__network.get_hits_from_table(source_name)
     origin_id = id_from(db_name, source_name, field_name)
     origin_hit = Hit(origin_id, db_name, source_name, field_name, 0)
     return DRS(list(table_hits), Operation(OP.TABLE, params=[origin_hit]))
Exemplo n.º 10
0
 def add_fields(self, list_of_fields):
     """
     Creates one graph node per field and adds all of them to the graph.
     :param list_of_fields: iterable of (nid, source_name, field_name) tuples
     :return: the list of newly created nodes (Hit objects with score -1)
     """
     # NOTE(review): Hit is built with four arguments here - confirm this
     # matches the Hit signature used by the rest of the code base.
     nodes = [Hit(nid, sn, fn, -1) for nid, sn, fn in list_of_fields]
     self.__G.add_nodes_from(nodes)
     return nodes
Exemplo n.º 11
0
 def md_neighbors_id(self, hit: Hit, md_neighbors: MRS, relation: Relation) -> DRS:
     """
     Builds a DRS with the nodes at the other end of each metadata neighbor
     of the given origin.
     :param hit: the origin node, either a Hit or its node id as a string
     :param md_neighbors: iterable of records exposing ``source`` and
         ``target`` node ids
     :param relation: relation used to select the provenance operation
     :return: a DRS with one Hit (score 1.0) per metadata neighbor, whose
         operation carries the origin ``hit`` as its parameter
     :raises TypeError: if ``hit`` is neither a Hit nor a str
     """
     if isinstance(hit, Hit):
         nid = str(hit.nid)
     elif isinstance(hit, str):
         nid = hit
     else:
         # Previously this fell through and failed later with an
         # UnboundLocalError on `nid`; fail with an explicit message instead.
         raise TypeError(
             "hit must be a Hit or a str node id, got " + type(hit).__name__)
     score = 1.0  # TODO: return more meaningful score results
     data = []
     # The loop variable must not be named `hit`: doing so rebinds the
     # parameter, and the provenance operation below would then record the
     # last metadata neighbor instead of the origin.
     for md_hit in md_neighbors:
         # Pick whichever endpoint of the record is not the origin node
         k = md_hit.target if md_hit.target != nid else md_hit.source
         db_name, source_name, field_name, _data_type = self.__id_names[k]
         data.append(Hit(k, db_name, source_name, field_name, score))
     op = self.get_op_from_relation(relation)
     return DRS(data, Operation(op, params=[hit]))
Exemplo n.º 12
0
    def test_creation_initial_provenance(self):
        """Smoke test: build a DRS with an origin hit and dump its provenance.

        Creates a DRS of four hits produced by a CONTENT_SIM operation on one
        origin hit, then prints the nodes and edges of its provenance graph.
        The closing assertion is unconditional, so the test only fails if
        something raises along the way.
        """
        print(self._testMethodName)

        origin = Hit(10, "dba", "table_c", "v", -1)

        results = [
            Hit(0, "dba", "table_a", "a", -1),
            Hit(1, "dba", "table_a", "b", -1),
            Hit(2, "dba", "table_b", "c", -1),
            Hit(3, "dba", "table_b", "d", -1),
        ]
        drs = DRS(results, Operation(OP.CONTENT_SIM, params=[origin]))

        graph = drs.get_provenance().prov_graph()
        graph_nodes = graph.nodes()
        print("NODES")
        for node in graph_nodes:
            print(str(node))
        print(" ")
        graph_edges = graph.edges(keys=True)
        print("EDGES")
        for edge in graph_edges:
            print(str(edge))
        print(" ")

        self.assertTrue(True)
Exemplo n.º 13
0
 def neighbors_id(self, hit: Hit, relation: Relation) -> DRS:
     """
     Builds a DRS with the graph neighbors of a node under a given relation.
     :param hit: the origin node, either a Hit or its node id as a string
     :param relation: only neighbors whose edge data contains this relation
         are returned; it also selects the provenance operation
     :return: a DRS with one Hit per matching neighbor, scored with the
         'score' stored for that relation, whose operation carries the
         origin ``hit`` as its parameter
     :raises TypeError: if ``hit`` is neither a Hit nor a str
     """
     if isinstance(hit, Hit):
         nid = str(hit.nid)
     elif isinstance(hit, str):
         nid = hit
     else:
         # Previously this fell through and failed later with an
         # UnboundLocalError on `nid`; fail with an explicit message instead.
         raise TypeError(
             "hit must be a Hit or a str node id, got " + type(hit).__name__)
     data = []
     neighbours = self.__G[nid]
     for k, v in neighbours.items():
         # v maps relations to their edge attributes for neighbor k
         if relation in v:
             score = v[relation]['score']
             db_name, source_name, field_name, _data_type = self.__id_names[k]
             data.append(Hit(k, db_name, source_name, field_name, score))
     op = self.get_op_from_relation(relation)
     return DRS(data, Operation(op, params=[hit]))
Exemplo n.º 14
0
    def test_absorb(self):
        """Check that absorbing a DRS keeps every element of both inputs.

        Builds two DRS objects, absorbs the second into the first, prints the
        provenance graph of the result and asserts that the result holds as
        many distinct elements as the union of the two inputs.
        """
        print(self._testMethodName)

        # First DRS: CONTENT_SIM results for one origin hit
        first_origin = Hit(10, "dba", "table_c", "v", -1)
        first_hits = [
            Hit(0, "dba", "table_a", "a", -1),
            Hit(1, "dba", "table_a", "b", -1),
            Hit(2, "dba", "table_b", "c", -1),
            Hit(3, "dba", "table_b", "d", -1),
        ]
        drs1 = DRS(first_hits, Operation(OP.CONTENT_SIM, params=[first_origin]))

        # Second DRS: SCHEMA_SIM results for one origin hit
        second_origin = Hit(1, "dba", "table_a", "b", -1)
        second_hits = [
            Hit(16, "dba", "table_d", "a", -1),
            Hit(17, "dba", "table_d", "b", -1),
        ]
        drs2 = DRS(second_hits, Operation(OP.SCHEMA_SIM, params=[second_origin]))

        merged = drs1.absorb(drs2)

        graph = merged.get_provenance().prov_graph()
        graph_nodes = graph.nodes()
        print("NODES")
        for node in graph_nodes:
            print(str(node))
        print(" ")
        graph_edges = graph.edges(keys=True)
        print("EDGES")
        for edge in graph_edges:
            print(str(edge))
        print(" ")

        drs1_data = set(drs1)
        drs2_data = set(drs2)
        merged_data = set(merged)

        # Difference between the expected (union) and the actual size
        size_gap = len(drs1_data | drs2_data) - len(merged_data)

        print("Len must be 0: " + str(size_gap))

        self.assertTrue(size_gap == 0)
Exemplo n.º 15
0
    def test_sdifference(self):
        """Check the size of the set difference of two DRS objects.

        The first DRS holds five hits and the second holds three, one of
        which (id 1) has the same values as a hit of the first. The test
        prints the provenance graph of the difference and asserts that
        iterating it yields four elements.
        """
        print(self._testMethodName)

        # First DRS: five hits
        left_hits = [
            Hit(10, "dba", "table_c", "v", -1),
            Hit(0, "dba", "table_a", "a", -1),
            Hit(1, "dba", "table_a", "b", -1),
            Hit(2, "dba", "table_b", "c", -1),
            Hit(3, "dba", "table_b", "d", -1),
        ]
        drs1 = DRS(left_hits, Operation(OP.ORIGIN))

        # Second DRS: three hits, the first one repeats id 1 from above
        right_hits = [
            Hit(1, "dba", "table_a", "b", -1),
            Hit(16, "dba", "table_d", "a", -1),
            Hit(17, "dba", "table_d", "b", -1),
        ]
        drs2 = DRS(right_hits, Operation(OP.ORIGIN))

        difference = drs1.set_difference(drs2)

        graph = difference.get_provenance().prov_graph()
        graph_nodes = graph.nodes()
        print("NODES")
        for node in graph_nodes:
            print(str(node))
        print(" ")
        graph_edges = graph.edges(keys=True)
        print("EDGES")
        for edge in graph_edges:
            print(str(edge))
        print(" ")

        remaining = len(list(difference))

        print("Len must be 4: " + str(remaining))

        self.assertTrue(remaining == 4)
Exemplo n.º 16
0
 def exact_search_keywords(self, keywords, elasticfieldname, max_hits=15):
     """
     Like search_keywords, but using ``term`` queries so only exact results
     are returned.
     :param keywords: the value to look for in the selected field
     :param elasticfieldname: a KWType value selecting where to search:
         KW_CONTENT -> index "text", field "text";
         KW_SCHEMA -> index "profile", field "columnNameNA";
         KW_ENTITIES -> index "profile", field "entities";
         KW_TABLE -> index "profile", field "sourceNameNA"
     :param max_hits: maximum number of returned objects
     :return: a generator of Hit objects (with the id converted to str) built
         from the returned documents; it yields nothing when the response
         carries no hits
     """
     index = None
     query_body = None
     filter_path = ['hits.hits._source.id',
                    'hits.hits._score',
                    'hits.total',
                    'hits.hits._source.dbName',
                    'hits.hits._source.sourceName',
                    'hits.hits._source.columnName']
     term_field = None
     if elasticfieldname == KWType.KW_CONTENT:
         index, term_field = "text", "text"
     elif elasticfieldname == KWType.KW_SCHEMA:
         index, term_field = "profile", "columnNameNA"
     elif elasticfieldname == KWType.KW_ENTITIES:
         index, term_field = "profile", "entities"
     elif elasticfieldname == KWType.KW_TABLE:
         index, term_field = "profile", "sourceNameNA"
     # NOTE(review): for any other elasticfieldname both index and query_body
     # stay None and are passed to the client as such - confirm this is
     # intended.
     if term_field is not None:
         query_body = {"from": 0, "size": max_hits,
                       "query": {"term": {term_field: keywords}}}
     res = client.search(index=index, body=query_body,
                         filter_path=filter_path)
     # Response filtering can leave 'hits.hits' out entirely when nothing
     # matched, and 'hits.total' is an object rather than an int on
     # Elasticsearch 7+, so do not rely on `total == 0`: read the list
     # defensively instead. This is a generator, so "no results" simply
     # means yielding nothing.
     for el in res.get('hits', {}).get('hits', []):
         source = el['_source']
         yield Hit(str(source['id']), source['dbName'], source['sourceName'],
                   source['columnName'], el['_score'])
Exemplo n.º 17
0
 def search_keywords(self, keywords, elasticfieldname, max_hits=15):
     """
     Performs a ``match`` query for the provided keywords on the field
     selected by elasticfieldname.
     :param keywords: the keywords to match
     :param elasticfieldname: a KWType value selecting where to search:
         KW_TEXT -> index "text", field "text";
         KW_SCHEMA -> index "profile", field "columnName";
         KW_ENTITIES -> index "profile", field "entities";
         KW_TABLE -> index "profile", field "sourceName"
     :param max_hits: maximum number of returned objects
     :return: a generator of Hit objects built from the returned documents;
         it yields nothing when the response carries no hits
     """
     index = None
     query_body = None
     filter_path = ['hits.hits._source.id',
                    'hits.hits._score',
                    'hits.total',
                    'hits.hits._source.dbName',
                    'hits.hits._source.sourceName',
                    'hits.hits._source.columnName']
     match_field = None
     if elasticfieldname == KWType.KW_TEXT:
         index, match_field = "text", "text"
     elif elasticfieldname == KWType.KW_SCHEMA:
         index, match_field = "profile", "columnName"
     elif elasticfieldname == KWType.KW_ENTITIES:
         index, match_field = "profile", "entities"
     elif elasticfieldname == KWType.KW_TABLE:
         index, match_field = "profile", "sourceName"
     # NOTE(review): for any other elasticfieldname both index and query_body
     # stay None and are passed to the client as such - confirm this is
     # intended.
     if match_field is not None:
         query_body = {"from": 0, "size": max_hits,
                       "query": {"match": {match_field: keywords}}}
     res = client.search(index=index, body=query_body,
                         filter_path=filter_path)
     # Response filtering can leave 'hits.hits' out entirely when nothing
     # matched, and 'hits.total' is an object rather than an int on
     # Elasticsearch 7+, so do not rely on `total == 0`: read the list
     # defensively instead. This is a generator, so "no results" simply
     # means yielding nothing.
     for el in res.get('hits', {}).get('hits', []):
         source = el['_source']
         yield Hit(source['id'], source['dbName'], source['sourceName'],
                   source['columnName'], el['_score'])
Exemplo n.º 18
0
 def get_hits_from_info(self, info):
     """
     Converts info tuples into Hit objects with score 0.
     :param info: iterable of (nid, db_name, source_name, field_name) tuples
     :return: list of Hit, one per tuple, in the same order
     """
     converted = []
     for nid, db_name, s_name, f_name in info:
         converted.append(Hit(nid, db_name, s_name, f_name, 0))
     return converted
Exemplo n.º 19
0
def build_hit(sn, fn):
    """Return a Hit for the given source and field names.

    The id comes from compute_field_id(sn, fn) and the last Hit argument
    is fixed to -1.
    """
    return Hit(compute_field_id(sn, fn), sn, fn, -1)