def fuzzy_keyword_match(self, keywords, max_hits=15):
    """
    Run a fuzzy "match" query for the keywords against the "text" field of
    the "text" index and yield one Hit per returned document.

    :param keywords: the keywords to match (used as the match query value)
    :param max_hits: maximum number of returned objects
    :return: generator of Hit objects built from the id, dbName, sourceName,
        columnName and score of each result; yields nothing when the
        reported total is 0
    """
    # Restrict the response to the fields needed to build a Hit.
    wanted_fields = [
        'hits.hits._source.id',
        'hits.hits._score',
        'hits.total',
        'hits.hits._source.dbName',
        'hits.hits._source.sourceName',
        'hits.hits._source.columnName'
    ]
    fuzzy_clause = {"query": keywords, "fuzziness": "AUTO"}
    request = {
        "from": 0,
        "size": max_hits,
        "query": {"match": {"text": fuzzy_clause}}
    }
    response = client.search(index="text", body=request, filter_path=wanted_fields)
    if response['hits']['total'] != 0:
        for doc in response['hits']['hits']:
            src = doc['_source']
            yield Hit(str(src['id']), src['dbName'], src['sourceName'],
                      src['columnName'], doc['_score'])
def drs_from_raw_field(self, field: (str, str, str)) -> DRS:
    """
    Build the DRS for a field identified by its raw names.

    :param field: a (db_name, source_name, field_name) tuple
    :return: whatever self.drs_from_hit returns for a Hit with score 0 whose
        id is computed by id_from from the three names
    """
    db_name, source_name, field_name = field
    return self.drs_from_hit(
        Hit(id_from(db_name, source_name, field_name),
            db_name, source_name, field_name, 0)
    )
def _node_to_hit(self, node: (str, str, str)) -> Hit:
    """
    Turn a node description into a Hit.

    :param node: a (db_name, source_name, field_name) tuple
    :return: Hit with score 0 whose id is computed by id_from from the
        three names
    """
    db_name, source_name, field_name = node
    return Hit(id_from(db_name, source_name, field_name),
               db_name, source_name, field_name, 0)
def _nid_to_hit(self, nid: int) -> Hit:
    """
    Convert a node id into a Hit using the information stored in the network.

    :param nid: int or string; it is converted to a string before the lookup
    :return: Hit with score 0.0 built from the first entry returned by
        self._network.get_info_for
    """
    info = self._network.get_info_for([str(nid)])
    node_id, db_name, source_name, field_name = info[0]
    return Hit(node_id, db_name, source_name, field_name, 0.0)
def add_fields(self, list_of_fields):
    """
    Create one graph node per field and add all of them to the graph.

    :param list_of_fields: iterable of (nid, source_name, field_name) tuples
    :return: the list of newly created nodes (Hit objects with score -1)
    """
    new_nodes = [
        Hit(nid, source_name, field_name, -1)
        for nid, source_name, field_name in list_of_fields
    ]
    self.__G.add_nodes_from(new_nodes)
    return new_nodes
def schema_neighbors(self, field: (str, str, str)) -> DRS:
    """
    Return the hits of the table (source) the provided field belongs to.

    :param field: a (db_name, source_name, field_name) tuple
    :return: a DRS holding the hits the network returns for source_name,
        with an OP.TABLE operation whose parameter is the Hit of the
        provided field
    """
    db_name, source_name, field_name = field
    table_hits = self.__network.get_hits_from_table(source_name)
    # The provided field is recorded as the origin of the operation.
    origin = Hit(id_from(db_name, source_name, field_name),
                 db_name, source_name, field_name, 0)
    return DRS(list(table_hits), Operation(OP.TABLE, params=[origin]))
def enumerate_relation(self, relation, as_str=True):
    """
    Yield every pair of nodes connected by the given relation.

    A pair is skipped when its reverse (neighbor, node) has already been
    recorded, so each connection is reported from one side only.

    :param relation: the relation whose neighbors are enumerated
    :param as_str: when true yield "<hit> - <neighbor>" strings, otherwise
        yield (hit, neighbor) tuples
    :return: generator of strings or tuples, depending on as_str
    """
    recorded = set()
    for nid in self.iterate_ids():
        db_name, source_name, field_name, _data_type = self.__id_names[nid]
        origin = Hit(nid, db_name, source_name, field_name, 0)
        for neighbor in self.neighbors_id(origin, relation):
            if (neighbor.nid, nid) in recorded:
                continue
            recorded.add((nid, neighbor.nid))
            if as_str:
                yield str(origin) + " - " + str(neighbor)
            else:
                yield origin, neighbor
def md_neighbors_id(self, hit: Hit, md_neighbors: MRS, relation: Relation) -> DRS:
    """
    Build a DRS with the nodes linked to ``hit`` by the given metadata hits.

    :param hit: the origin, either a Hit or a node id (any non-Hit value is
        converted to a string id)
    :param md_neighbors: iterable of metadata hits exposing ``source`` and
        ``target`` node ids
    :param relation: relation used to pick the operation recorded in the DRS
    :return: a DRS with one Hit (score 1.0) per metadata hit, and an
        operation whose parameter is the original ``hit`` argument
    """
    # Accept a Hit or a raw id; previously any other type left nid unbound.
    nid = str(hit.nid) if isinstance(hit, Hit) else str(hit)
    score = 1.0  # TODO: return more meaningful score results
    data = []
    # The loop variable must not be called `hit`: it would shadow the
    # parameter and the operation below would record the last metadata hit
    # instead of the origin.
    for md_hit in md_neighbors:
        # Take the endpoint of the link that is not the origin node.
        k = md_hit.target if md_hit.target != nid else md_hit.source
        db_name, source_name, field_name, _data_type = self.__id_names[k]
        data.append(Hit(k, db_name, source_name, field_name, score))
    op = self.get_op_from_relation(relation)
    return DRS(data, Operation(op, params=[hit]))
def neighbors_id(self, hit: Hit, relation: Relation) -> DRS:
    """
    Return the nodes connected to ``hit`` through ``relation`` in the graph.

    :param hit: the origin, either a Hit or a node id (any non-Hit value is
        converted to a string id)
    :param relation: the relation an edge must carry to be included
    :return: a DRS with one Hit per neighbor whose edge data contains
        ``relation``, scored with the 'score' stored for that relation, and
        an operation whose parameter is ``hit``
    """
    # Accept a Hit or a raw id; previously any other type left nid unbound
    # and failed with UnboundLocalError.
    nid = str(hit.nid) if isinstance(hit, Hit) else str(hit)
    data = []
    for k, edge_data in self.__G[nid].items():
        if relation not in edge_data:
            continue
        score = edge_data[relation]['score']
        db_name, source_name, field_name, _data_type = self.__id_names[k]
        data.append(Hit(k, db_name, source_name, field_name, score))
    op = self.get_op_from_relation(relation)
    return DRS(data, Operation(op, params=[hit]))
def search_keywords(self, keywords, elasticfieldname, max_hits=15):
    """
    Run a "match" query for the keywords on the store field selected by
    elasticfieldname and yield one Hit per returned document.

    :param keywords: the keywords to match
    :param elasticfieldname: a KWType value selecting where to search:
        KW_CONTENT -> field "text" of index "text"; KW_SCHEMA -> "columnName",
        KW_ENTITIES -> "entities", KW_TABLE -> "sourceName", all three in
        index "profile"
    :param max_hits: maximum number of returned objects
    :return: generator of Hit objects; yields nothing when the reported
        total is 0
    :raises ValueError: when elasticfieldname is none of the values above.
        Because this is a generator, the error surfaces on first iteration.
    """
    # Restrict the response to the fields needed to build a Hit.
    filter_path = [
        'hits.hits._source.id',
        'hits.hits._score',
        'hits.total',
        'hits.hits._source.dbName',
        'hits.hits._source.sourceName',
        'hits.hits._source.columnName'
    ]
    # (keyword type, index to query, document field to match against)
    targets = (
        (KWType.KW_CONTENT, "text", "text"),
        (KWType.KW_SCHEMA, "profile", "columnName"),
        (KWType.KW_ENTITIES, "profile", "entities"),
        (KWType.KW_TABLE, "profile", "sourceName"),
    )
    target = next(
        ((index, doc_field) for kw_type, index, doc_field in targets
         if elasticfieldname == kw_type),
        None,
    )
    if target is None:
        # Previously an unknown type fell through with index=None and
        # body=None, silently issuing an unconstrained search.
        raise ValueError("Unsupported keyword type: %r" % (elasticfieldname,))
    index, doc_field = target
    query_body = {
        "from": 0,
        "size": max_hits,
        "query": {"match": {doc_field: keywords}}
    }
    res = client.search(index=index, body=query_body, filter_path=filter_path)
    if res['hits']['total'] == 0:
        return
    for el in res['hits']['hits']:
        src = el['_source']
        yield Hit(str(src['id']), src['dbName'], src['sourceName'],
                  src['columnName'], el['_score'])
def exact_search_keywords(self, keywords, elasticfieldname, max_hits=15):
    """
    Like search_keywords, but using a "term" query so only exact results
    are returned.

    :param keywords: the value to look up
    :param elasticfieldname: a KWType value selecting where to search:
        KW_CONTENT -> field "text" of index "text"; KW_SCHEMA ->
        "columnNameNA", KW_ENTITIES -> "entities", KW_TABLE ->
        "sourceNameNA", all three in index "profile"
    :param max_hits: maximum number of returned objects
    :return: generator of Hit objects; yields nothing when the reported
        total is 0
    :raises ValueError: when elasticfieldname is none of the values above.
        Because this is a generator, the error surfaces on first iteration.
    """
    # Restrict the response to the fields needed to build a Hit.
    filter_path = [
        'hits.hits._source.id',
        'hits.hits._score',
        'hits.total',
        'hits.hits._source.dbName',
        'hits.hits._source.sourceName',
        'hits.hits._source.columnName'
    ]
    # (keyword type, index to query, document field for the term lookup)
    targets = (
        (KWType.KW_CONTENT, "text", "text"),
        (KWType.KW_SCHEMA, "profile", "columnNameNA"),
        (KWType.KW_ENTITIES, "profile", "entities"),
        (KWType.KW_TABLE, "profile", "sourceNameNA"),
    )
    target = next(
        ((index, doc_field) for kw_type, index, doc_field in targets
         if elasticfieldname == kw_type),
        None,
    )
    if target is None:
        # Previously an unknown type fell through with index=None and
        # body=None, silently issuing an unconstrained search.
        raise ValueError("Unsupported keyword type: %r" % (elasticfieldname,))
    index, doc_field = target
    query_body = {
        "from": 0,
        "size": max_hits,
        "query": {"term": {doc_field: keywords}}
    }
    res = client.search(index=index, body=query_body, filter_path=filter_path)
    if res['hits']['total'] == 0:
        return
    for el in res['hits']['hits']:
        src = el['_source']
        yield Hit(str(src['id']), src['dbName'], src['sourceName'],
                  src['columnName'], el['_score'])
def get_hits_from_table(self, table) -> [Hit]:
    """
    Return one Hit per field of the given table.

    :param table: the source (table) passed to self.get_fields_of_source
    :return: list of Hit objects with score 0, built from the
        (nid, db_name, source_name, field_name) entries that
        self.get_info_for returns for the fields of the table
    """
    field_ids = self.get_fields_of_source(table)
    hits = []
    for nid, db_name, s_name, f_name in self.get_info_for(field_ids):
        hits.append(Hit(nid, db_name, s_name, f_name, 0))
    return hits
def get_hits_from_info(self, info):
    """
    Convert node information entries into Hit objects.

    :param info: iterable of (nid, db_name, source_name, field_name) entries
    :return: list with one Hit (score 0) per entry, in the same order
    """
    hits = []
    for nid, db_name, s_name, f_name in info:
        hits.append(Hit(nid, db_name, s_name, f_name, 0))
    return hits
def build_hit(sn, fn):
    """
    Build a Hit for a field from its two names.

    :param sn: first name, presumably the source name (TODO confirm)
    :param fn: second name, presumably the field name (TODO confirm)
    :return: Hit whose id is compute_field_id(sn, fn), with score -1
    """
    return Hit(compute_field_id(sn, fn), sn, fn, -1)