def test_drs_table_iteration(self):
    """
    Smoke test: builds a DRS of four hits spread over two tables, switches it
    to table mode and prints every element produced by iterating it.
    """
    print(self._testMethodName)

    # (nid, source_name, field_name) for each hit; all share db "dba" and score -1
    fields = [
        (0, "table_a", "a"),
        (1, "table_a", "b"),
        (2, "table_b", "c"),
        (3, "table_b", "d"),
    ]
    hits = [Hit(nid, "dba", table, column, -1) for nid, table, column in fields]

    drs = DRS(hits, Operation(OP.ORIGIN))
    drs.set_table_mode()

    for element in drs:
        print(str(element))

    # Nothing is asserted on the output; the test only checks iteration does not raise
    self.assertTrue(True)
def fuzzy_keyword_match(self, keywords, max_hits=15):
    """
    Runs a fuzzy "match" query (fuzziness "AUTO") for the keywords against the
    "text" field of the "text" index and yields one Hit per returned document.

    This is a generator function: results are produced lazily, and the
    ``return []`` below only ends the iteration early when there are no hits.

    :param keywords: the keywords to match, passed as the match query
    :param max_hits: maximum number of returned objects
    :return: a generator of Hit objects built from the matching documents
    """
    fuzzy_match = {"query": keywords, "fuzziness": "AUTO"}
    query_body = {
        "from": 0,
        "size": max_hits,
        "query": {"match": {"text": fuzzy_match}},
    }
    # Only fetch the pieces of the response needed to build a Hit
    filter_path = [
        'hits.hits._source.id',
        'hits.hits._score',
        'hits.total',
        'hits.hits._source.dbName',
        'hits.hits._source.sourceName',
        'hits.hits._source.columnName',
    ]

    res = client.search(index="text", body=query_body, filter_path=filter_path)
    if res['hits']['total'] == 0:
        return []

    for doc in res['hits']['hits']:
        source = doc['_source']
        yield Hit(source['id'], source['dbName'], source['sourceName'],
                  source['columnName'], doc['_score'])
def get_hits_from_table(self, table) -> [Hit]:
    """
    Builds one Hit (with score 0) for every field of the given table.

    :param table: the source/table passed to get_fields_of_source
    :return: the list of Hit objects, in the order returned by get_info_for
    """
    field_ids = self.get_fields_of_source(table)
    hits = []
    for nid, db_name, s_name, f_name in self.get_info_for(field_ids):
        hits.append(Hit(nid, db_name, s_name, f_name, 0))
    return hits
def enumerate_relation(self, relation):
    """
    Yields a "<hit> - <neighbor>" string for every node id and each of its
    neighbors under the given relation.

    :param relation: the relation passed to neighbors_id
    :return: a generator of strings, one per (node, neighbor) pair
    """
    for nid in self.iterate_ids():
        # The stored record also carries a data type, which is not needed here
        db_name, source_name, field_name, _ = self.__id_names[nid]
        origin = Hit(nid, db_name, source_name, field_name, 0)
        for neighbor in self.neighbors_id(origin, relation):
            yield " - ".join((str(origin), str(neighbor)))
def drs_from_raw_field(self, field: (str, str, str)) -> DRS:
    """
    Builds the DRS that represents a field identified by its names.

    :param field: a (db_name, source_name, field_name) tuple
    :return: the DRS produced by drs_from_hit for the corresponding Hit
    """
    db_name, source_name, field_name = field
    node_id = id_from(db_name, source_name, field_name)
    hit = Hit(node_id, db_name, source_name, field_name, 0)
    return self.drs_from_hit(hit)
def _node_to_hit(self, node: (str, str, str)) -> Hit:
    """
    Converts a node described by its names into a Hit with score 0.

    :param node: a (db_name, source_name, field_name) tuple
    :return: a Hit whose id is computed with id_from from the three names
    """
    db_name, source_name, field_name = node
    return Hit(id_from(db_name, source_name, field_name),
               db_name, source_name, field_name, 0)
def _nid_to_hit(self, nid: int) -> Hit:
    """
    Given a node id, looks up its info in the network and converts it to a Hit.

    :param nid: int or string; it is converted to a string before the lookup
    :return: a Hit with score 0.0 built from the first info record returned
    """
    info = self._network.get_info_for([str(nid)])
    node_id, db, source, field = info[0]
    return Hit(node_id, db, source, field, 0.0)
def enumerate_relation(self, relation):
    """
    Yields a "<hit> - <neighbor>" string for every node id and each of its
    neighbors under the given relation, skipping a pair when the reversed
    pair has already been emitted.

    :param relation: the relation passed to neighbors_id
    :return: a generator of strings, one per emitted (node, neighbor) pair
    """
    seen_pairs = set()
    for nid in self.iterate_ids():
        # The stored record also carries a data type, which is not needed here
        db_name, source_name, field_name, _ = self.__id_names[nid]
        origin = Hit(nid, db_name, source_name, field_name, 0)
        for neighbor in self.neighbors_id(origin, relation):
            # (neighbor, node) already recorded means this edge was emitted
            # from the other endpoint
            if (neighbor.nid, nid) in seen_pairs:
                continue
            seen_pairs.add((nid, neighbor.nid))
            yield " - ".join((str(origin), str(neighbor)))
def schema_neighbors(self, field: (str, str, str)) -> DRS:
    """
    Returns the hits of the table (source) that the provided field belongs to.

    :param field: a (db_name, source_name, field_name) tuple
    :return: a DRS holding the hits returned by get_hits_from_table for the
        source, with an OP.TABLE operation whose parameter is the Hit of the
        provided field
    """
    db_name, source_name, field_name = field
    table_hits = self.__network.get_hits_from_table(source_name)
    origin_id = id_from(db_name, source_name, field_name)
    origin_hit = Hit(origin_id, db_name, source_name, field_name, 0)
    table_op = Operation(OP.TABLE, params=[origin_hit])
    # The DRS receives a fresh list rather than the object returned by the network
    return DRS([table_hit for table_hit in table_hits], table_op)
def add_fields(self, list_of_fields):
    """
    Creates a graph node (Hit) for each field and adds them all to the graph.

    :param list_of_fields: iterable of (nid, source_name, field_name) tuples
    :return: the newly added list of field nodes
    """
    nodes = [Hit(nid, sn, fn, -1) for nid, sn, fn in list_of_fields]
    self.__G.add_nodes_from(nodes)
    return nodes
def md_neighbors_id(self, hit: Hit, md_neighbors: MRS, relation: Relation) -> DRS:
    """
    Builds a DRS with the nodes found at the other end of each metadata
    neighbor entry, relative to the given hit.

    :param hit: the origin, either a Hit or a node id; anything that is not a
        Hit is converted to a string and used as the node id
    :param md_neighbors: iterable of entries exposing ``source`` and ``target``
    :param relation: the relation, translated to an operation with
        get_op_from_relation
    :return: a DRS with one Hit (score 1.0) per entry, whose operation carries
        the original ``hit`` as its parameter
    """
    nid = str(hit.nid) if isinstance(hit, Hit) else str(hit)
    score = 1.0  # TODO: return more meaningful score results
    data = []
    # The loop variable must not be named `hit`: rebinding it would make the
    # Operation below record the last neighbor entry instead of the origin.
    for edge in md_neighbors:
        # Pick whichever endpoint of the entry is not the origin node
        k = edge.target if edge.target != nid else edge.source
        (db_name, source_name, field_name, data_type) = self.__id_names[k]
        data.append(Hit(k, db_name, source_name, field_name, score))
    op = self.get_op_from_relation(relation)
    return DRS(data, Operation(op, params=[hit]))
def test_creation_initial_provenance(self):
    """
    Smoke test: creates a DRS from a CONTENT_SIM operation and prints the
    nodes and edges of its provenance graph.
    """
    print(self._testMethodName)

    origin = Hit(10, "dba", "table_c", "v", -1)
    results = [
        Hit(0, "dba", "table_a", "a", -1),
        Hit(1, "dba", "table_a", "b", -1),
        Hit(2, "dba", "table_b", "c", -1),
        Hit(3, "dba", "table_b", "d", -1),
    ]
    drs = DRS(results, Operation(OP.CONTENT_SIM, params=[origin]))

    prov_graph = drs.get_provenance().prov_graph()

    graph_nodes = prov_graph.nodes()
    print("NODES")
    for node in graph_nodes:
        print(str(node))
    print(" ")

    graph_edges = prov_graph.edges(keys=True)
    print("EDGES")
    for edge in graph_edges:
        print(str(edge))
    print(" ")

    # Nothing is asserted on the graph; the test only checks it can be built and walked
    self.assertTrue(True)
def neighbors_id(self, hit: Hit, relation: Relation) -> DRS:
    """
    Builds a DRS with the graph neighbors of the given hit that are connected
    to it through the given relation.

    :param hit: the origin, either a Hit or a node id; anything that is not a
        Hit is converted to a string and used as the node id (previously a
        value that was neither Hit nor str failed with an unbound local)
    :param relation: the relation to follow; only neighbors whose edge data
        contains it are returned
    :return: a DRS with one Hit per matching neighbor, scored with the edge's
        'score' for that relation, whose operation carries ``hit`` as parameter
    """
    nid = str(hit.nid) if isinstance(hit, Hit) else str(hit)
    data = []
    for k, v in self.__G[nid].items():
        if relation in v:
            score = v[relation]['score']
            (db_name, source_name, field_name, data_type) = self.__id_names[k]
            data.append(Hit(k, db_name, source_name, field_name, score))
    op = self.get_op_from_relation(relation)
    return DRS(data, Operation(op, params=[hit]))
def test_absorb(self):
    """
    Checks that absorbing one DRS into another yields exactly the union of
    the elements of both, and prints the resulting provenance graph.
    """
    print(self._testMethodName)

    # DRS 1: content-similarity results for an origin hit
    origin_1 = Hit(10, "dba", "table_c", "v", -1)
    results_1 = [
        Hit(0, "dba", "table_a", "a", -1),
        Hit(1, "dba", "table_a", "b", -1),
        Hit(2, "dba", "table_b", "c", -1),
        Hit(3, "dba", "table_b", "d", -1),
    ]
    drs1 = DRS(results_1, Operation(OP.CONTENT_SIM, params=[origin_1]))

    # DRS 2: schema-similarity results for an origin hit
    origin_2 = Hit(1, "dba", "table_a", "b", -1)
    results_2 = [
        Hit(16, "dba", "table_d", "a", -1),
        Hit(17, "dba", "table_d", "b", -1),
    ]
    drs2 = DRS(results_2, Operation(OP.SCHEMA_SIM, params=[origin_2]))

    drs = drs1.absorb(drs2)

    prov_graph = drs.get_provenance().prov_graph()

    graph_nodes = prov_graph.nodes()
    print("NODES")
    for node in graph_nodes:
        print(str(node))
    print(" ")

    graph_edges = prov_graph.edges(keys=True)
    print("EDGES")
    for edge in graph_edges:
        print(str(edge))
    print(" ")

    drs1_data = {x for x in drs1}
    drs2_data = {x for x in drs2}
    merged_data = {x for x in drs}

    # Difference between the size of the expected union and the merged result
    size_gap = len(drs1_data | drs2_data) - len(merged_data)
    print("Len must be 0: " + str(size_gap))
    self.assertTrue(size_gap == 0)
def test_sdifference(self):
    """
    Checks that the set difference of a five-hit DRS and a three-hit DRS
    sharing one hit contains four elements, and prints the provenance graph.
    """
    print(self._testMethodName)

    # DRS 1
    hits_1 = [
        Hit(10, "dba", "table_c", "v", -1),
        Hit(0, "dba", "table_a", "a", -1),
        Hit(1, "dba", "table_a", "b", -1),
        Hit(2, "dba", "table_b", "c", -1),
        Hit(3, "dba", "table_b", "d", -1),
    ]
    drs1 = DRS(hits_1, Operation(OP.ORIGIN))

    # DRS 2: its first hit has the same values as one of the hits in DRS 1
    hits_2 = [
        Hit(1, "dba", "table_a", "b", -1),
        Hit(16, "dba", "table_d", "a", -1),
        Hit(17, "dba", "table_d", "b", -1),
    ]
    drs2 = DRS(hits_2, Operation(OP.ORIGIN))

    drs = drs1.set_difference(drs2)

    prov_graph = drs.get_provenance().prov_graph()

    graph_nodes = prov_graph.nodes()
    print("NODES")
    for node in graph_nodes:
        print(str(node))
    print(" ")

    graph_edges = prov_graph.edges(keys=True)
    print("EDGES")
    for edge in graph_edges:
        print(str(edge))
    print(" ")

    remaining = [x for x in drs]
    remaining_count = len(remaining)
    print("Len must be 4: " + str(remaining_count))
    self.assertTrue(remaining_count == 4)
def exact_search_keywords(self, keywords, elasticfieldname, max_hits=15):
    """
    Like search_keywords, but uses a "term" query so only exact results are
    returned. Yields one Hit per returned document.

    This is a generator function: results are produced lazily, and the
    ``return []`` below only ends the iteration early when there are no hits.

    :param keywords: the value the selected field is matched against
    :param elasticfieldname: a KWType value selecting the index and field to
        query; for any other value both the index and the query body passed
        to the client are None
    :param max_hits: maximum number of returned objects
    :return: a generator of Hit objects (the document id is converted to str)
    """
    index = None
    term_field = None
    if elasticfieldname == KWType.KW_CONTENT:
        index, term_field = "text", "text"
    elif elasticfieldname == KWType.KW_SCHEMA:
        index, term_field = "profile", "columnNameNA"
    elif elasticfieldname == KWType.KW_ENTITIES:
        index, term_field = "profile", "entities"
    elif elasticfieldname == KWType.KW_TABLE:
        index, term_field = "profile", "sourceNameNA"

    query_body = None
    if term_field is not None:
        query_body = {
            "from": 0,
            "size": max_hits,
            "query": {"term": {term_field: keywords}},
        }

    # Only fetch the pieces of the response needed to build a Hit
    filter_path = [
        'hits.hits._source.id',
        'hits.hits._score',
        'hits.total',
        'hits.hits._source.dbName',
        'hits.hits._source.sourceName',
        'hits.hits._source.columnName',
    ]

    res = client.search(index=index, body=query_body, filter_path=filter_path)
    if res['hits']['total'] == 0:
        return []

    for doc in res['hits']['hits']:
        source = doc['_source']
        yield Hit(str(source['id']), source['dbName'], source['sourceName'],
                  source['columnName'], doc['_score'])
def search_keywords(self, keywords, elasticfieldname, max_hits=15):
    """
    Performs a "match" query for the provided keywords on the store field
    selected by elasticfieldname. Yields one Hit per returned document.

    This is a generator function: results are produced lazily, and the
    ``return []`` below only ends the iteration early when there are no hits.

    :param keywords: the keywords to match
    :param elasticfieldname: a KWType value selecting the index and field to
        query; for any other value both the index and the query body passed
        to the client are None
    :param max_hits: maximum number of returned objects
    :return: a generator of Hit objects built from the matching documents
    """
    index = None
    match_field = None
    if elasticfieldname == KWType.KW_TEXT:
        index, match_field = "text", "text"
    elif elasticfieldname == KWType.KW_SCHEMA:
        index, match_field = "profile", "columnName"
    elif elasticfieldname == KWType.KW_ENTITIES:
        index, match_field = "profile", "entities"
    elif elasticfieldname == KWType.KW_TABLE:
        index, match_field = "profile", "sourceName"

    query_body = None
    if match_field is not None:
        query_body = {
            "from": 0,
            "size": max_hits,
            "query": {"match": {match_field: keywords}},
        }

    # Only fetch the pieces of the response needed to build a Hit
    filter_path = [
        'hits.hits._source.id',
        'hits.hits._score',
        'hits.total',
        'hits.hits._source.dbName',
        'hits.hits._source.sourceName',
        'hits.hits._source.columnName',
    ]

    res = client.search(index=index, body=query_body, filter_path=filter_path)
    if res['hits']['total'] == 0:
        return []

    for doc in res['hits']['hits']:
        source = doc['_source']
        yield Hit(source['id'], source['dbName'], source['sourceName'],
                  source['columnName'], doc['_score'])
def get_hits_from_info(self, info):
    """
    Converts info records into Hit objects with score 0.

    :param info: iterable of (nid, db_name, source_name, field_name) tuples
    :return: the list of Hit objects, in the same order as the records
    """
    hits = []
    for nid, db_name, s_name, f_name in info:
        hits.append(Hit(nid, db_name, s_name, f_name, 0))
    return hits
def build_hit(sn, fn):
    """
    Builds a Hit for a field, using compute_field_id to derive its id.

    :param sn: first argument to compute_field_id (presumably the source name)
    :param fn: second argument to compute_field_id (presumably the field name)
    :return: Hit(nid, sn, fn, -1); the trailing -1 is presumably a placeholder score
    """
    return Hit(compute_field_id(sn, fn), sn, fn, -1)