def assemble_table_path_provenance(o_drs, paths, relation):
    """
    Record the provenance of every table-level join path into o_drs.

    Each path is a sequence of (hit, sibling) pairs: the first pair is the
    origin of the path (its sibling must be None) and the last pair is the
    target.

    :param o_drs: the DRS that accumulates the provenance
    :param paths: iterable of paths, each a sequence of (hit, sibling) pairs
    :param relation: not read by this function; every hop is recorded with
        OP.PKFK
    :return: o_drs after the sink of every path has been absorbed
    """
    for hops in paths:
        first_hit, first_sibling = hops[0]
        # sibling of source should be None, as source is an origin
        assert (first_sibling is None)
        last_hit, last_sibling = hops[-1]

        o_drs.absorb_provenance(DRS([first_hit], Operation(OP.ORIGIN)))

        previous = first_hit
        for hit, sibling in hops[1:-1]:
            pkfk_step = DRS([sibling], Operation(OP.PKFK, params=[previous]))
            o_drs.absorb_provenance(pkfk_step)
            # avoid loop on head nodes of the graph
            if hit.nid != sibling.nid:
                table_step = DRS([hit], Operation(OP.TABLE, params=[sibling]))
                o_drs.absorb_provenance(table_step)
            previous = hit

        sink = DRS([last_sibling], Operation(OP.PKFK, params=[previous]))

        # The join path at the target has None sibling
        target_has_distinct_sibling = (
            last_hit is not None
            and last_sibling is not None
            and last_hit.nid != last_sibling.nid
        )
        if target_has_distinct_sibling:
            # The sink is provenance only; the target itself is the result.
            o_drs = o_drs.absorb_provenance(sink)
            o_drs.absorb(DRS([last_hit],
                             Operation(OP.TABLE, params=[last_sibling])))
        else:
            o_drs = o_drs.absorb(sink)
    return o_drs
def __neighbor_search(self, input_data, relation: Relation):
    """
    Find the neighbors of the input that are linked through a relation.

    :param input_data: an nid, node tuple, Hit, or DRS
    :param relation: the Relation to follow
    :return: a DRS carrying the provenance of the input plus the neighbors
        found for each of its hits
    """
    # convert whatever input to a DRS
    source_drs = self._general_to_drs(input_data)

    # prepare an output DRS that already knows where the input came from
    result = DRS([], Operation(OP.NONE)).absorb_provenance(source_drs)

    # get all of the table Hits in a DRS, if necessary.
    # NOTE(review): the return value is discarded, so this presumably relies
    # on the helper updating source_drs in place - confirm.
    if source_drs.mode == DRSMode.TABLE:
        self._general_to_field_drs(source_drs)

    if relation.from_metadata():
        # metadata relations are resolved through a metadata search first
        md_relation = self._relation_to_mdrelation(relation)
        for hit in source_drs:
            md_hits = self.md_search(hit, md_relation)
            found = self._network.md_neighbors_id(hit, md_hits, relation)
            result = result.absorb(found)
    else:
        for hit in source_drs:
            found = self._network.neighbors_id(hit, relation)
            result = result.absorb(found)
    return result
def fields(self, drs: DRS) -> DRS:
    """
    Switch a DRS to the fields view.

    :param drs: the DRS to reconfigure; set_fields_mode() is called on it
    :return: the same DRS object that was passed in
    """
    # The mode switch happens on the argument itself; no copy is made here.
    drs.set_fields_mode()
    return drs
def table(self, drs: DRS) -> DRS:
    """
    Switch a DRS to the table view.

    :param drs: the DRS to reconfigure; set_table_mode() is called on it
    :return: the same DRS object that was passed in
    """
    # The mode switch happens on the argument itself; no copy is made here.
    drs.set_table_mode()
    return drs
def drs_expand_to_table(self, drs: DRS) -> DRS:
    """
    Expand every hit of a DRS into all the hits of the table it belongs to.

    :param drs: the DRS whose hits are expanded
    :return: a new DRS that absorbed, for each input hit, the hits of its
        source table, recorded with an OP.TABLE operation parameterised by
        that input hit
    """
    o_drs = DRS([], Operation(OP.NONE))
    for h in drs:
        hits = self.__network.get_hits_from_table(h.source_name)
        # Use a dedicated name: rebinding `drs` here would shadow the input
        # that this loop is iterating over.
        table_drs = DRS(list(hits), Operation(OP.TABLE, params=[h]))
        # Keep the value returned by absorb, like the other methods that
        # accumulate results into an output DRS.
        o_drs = o_drs.absorb(table_drs)
    return o_drs
def table_names_search(self, kws: [str], max_results=10) -> DRS:
    """
    Run table_name_search for each keyword and merge the results.

    :param kws: collection (iterable) of keywords (strings)
    :param max_results: forwarded to each individual table_name_search call
    :return: a DRS that absorbed the result of every per-keyword search
    """
    merged = DRS([], Operation(OP.NONE))
    for keyword in kws:
        matches = self.table_name_search(keyword, max_results=max_results)
        merged = merged.absorb(matches)
    return merged
def assemble_field_path_provenance(o_drs, path, relation):
    """
    Record the provenance of a single field-level path into o_drs.

    The first element of the path is registered as the origin, every
    intermediate element as a provenance-only OP.PKFK step from its
    predecessor, and the last element is absorbed as the result.

    :param o_drs: the DRS that accumulates the provenance
    :param path: sequence of hits, from source to target
    :param relation: not read by this function; every hop is recorded with
        OP.PKFK
    :return: the value returned by absorbing the final hop into o_drs
    """
    source, target = path[0], path[-1]

    o_drs.absorb_provenance(DRS([source], Operation(OP.ORIGIN)))

    previous = source
    for hop in path[1:-1]:
        step = DRS([hop], Operation(OP.PKFK, params=[previous]))
        o_drs.absorb_provenance(step)
        previous = hop

    # Only the target is absorbed as data; everything before is provenance.
    final_step = DRS([target], Operation(OP.PKFK, params=[previous]))
    return o_drs.absorb(final_step)
def traverse(self, a: DRS, primitives, max_hops) -> DRS:
    """
    Expand a DRS through the network for a number of hops.

    On every hop the neighbors of each hit in the fringe are unioned into
    the output, and the whole output becomes the next fringe.

    :param a: the starting DRS; table mode is not supported
    :param primitives: the relation passed to the network's neighbors_id
    :param max_hops: number of expansion rounds to run
    :return: a DRS with the hits reached; an empty DRS when the input is in
        table mode
    """
    o_drs = DRS([], Operation(OP.NONE))
    if a.mode == DRSMode.TABLE:
        print("ERROR: input mode TABLE not supported")
        # Return the empty DRS rather than a bare list, so callers always
        # receive the type the signature declares.
        return o_drs
    fringe = list(a)
    o_drs.absorb_provenance(a)
    while max_hops > 0:
        max_hops -= 1
        for h in fringe:
            hits_drs = self.__network.neighbors_id(h, primitives)
            o_drs = self.union(o_drs, hits_drs)
        fringe = list(o_drs)  # grow the initial input
    return o_drs
def make_drs(self, general_input):
    """
    Makes a DRS from general_input.

    general_input can include an array of strings, Hits, DRS's, etc,
    or just a single DRS.

    :param general_input: a single convertible value, or a list of them
    :return: the resulting DRS, or None when the conversion failed (a usage
        message is printed in that case)
    """
    try:
        # If this is a list of inputs, condense it into a single drs
        if isinstance(general_input, list):
            converted = [self._general_to_drs(x) for x in general_input]
            combined_drs = DRS([], Operation(OP.NONE))
            for drs in converted:
                combined_drs = self.union(combined_drs, drs)
            general_input = combined_drs

        # else, just convert it to a DRS
        return self._general_to_drs(general_input)
    except Exception:
        # Best-effort entry point: report usage instead of raising. Only
        # Exception is caught so that KeyboardInterrupt and SystemExit still
        # propagate.
        msg = (
            '--- Error ---' +
            '\nThis function returns domain result set from the ' +
            'supplied input' +
            '\nusage:\n\tmake_drs( table name/hit id | [table name/hit ' +
            'id, drs/hit/string/int] )' +
            '\ne.g.:\n\tmake_drs(1600820766)')
        print(msg)
        return None
def _drs_from_table_hit_lean_no_provenance(self, hit: Hit) -> DRS:
    """
    Build a lean DRS holding every hit of the table that `hit` belongs to.

    :param hit: the Hit whose source_name identifies the table
    :return: a DRS created with lean_drs=True and an OP.TABLE operation
        parameterised by `hit`
    """
    # TODO: migrated from old ddapi as there's no good swap
    table_hits = self._network.get_hits_from_table(hit.source_name)
    return DRS(list(table_hits),
               Operation(OP.TABLE, params=[hit]),
               lean_drs=True)
def _general_to_drs(self, general_input) -> DRS:
    """
    Convert an nid, node, table name, Hit, None or DRS into a DRS.

    :param general_input: one of
        - a DRS, returned unchanged
        - None, turned into an empty DRS
        - an int, or anything else self._represents_int accepts, resolved
          through self._nid_to_hit
        - any other str, used as a table name
        - a tuple that is not a Hit, e.g. (db_name, source_name, field_name),
          resolved through self._node_to_hit
        - a Hit
    :return: DRS
    :raises ValueError: when the input ends up as none of the above
    """
    # test for DRS initially for speed
    if isinstance(general_input, DRS):
        return general_input

    value = general_input
    if value is None:
        value = DRS(data=[], operation=Operation(OP.NONE))

    # ints, or strings that represent integers, are node ids
    if self._represents_int(value):
        value = self._nid_to_hit(value)

    # any string left at this point names a table
    if isinstance(value, str):
        table_hits = self._network.get_hits_from_table(value)
        value = DRS(list(table_hits), Operation(OP.ORIGIN))

    # tuples that are not Hits are node tuples
    if isinstance(value, tuple) and not isinstance(value, Hit):
        value = self._node_to_hit(value)

    if isinstance(value, Hit):
        field = value.field_name
        # If the Hit's field is not defined, it is in table mode
        # and all Hits from the table need to be found
        field_missing = field == '' or field is None
        if field_missing:
            value = self._hit_to_drs(value, table_mode=True)
        else:
            value = self._hit_to_drs(value)

    if not isinstance(value, DRS):
        raise ValueError(
            'Input is not None, an integer, field tuple, Hit, or DRS')
    return value
def intersection(self, a: DRS, b: DRS) -> DRS:
    """
    Return the elements present in both a and b.

    :param a: a DRS
    :param b: another DRS, in the same mode as a
    :return: the result of a.intersection(b)
    :raises AssertionError: when a and b are in different modes
    """
    same_mode = a.mode == b.mode
    assert same_mode, "Input parameters are not in the same mode (fields, table)"
    return a.intersection(b)
def difference(self, a: DRS, b: DRS) -> DRS:
    """
    Return the set difference of a and b.

    :param a: the DRS to subtract from
    :param b: the DRS to subtract, in the same mode as a
    :return: the result of a.set_difference(b)
    :raises AssertionError: when a and b are in different modes
    """
    same_mode = a.mode == b.mode
    assert same_mode, "Input parameters are not in the same mode (fields, table)"
    return a.set_difference(b)
def drs_from_table(self, source: str) -> DRS:
    """
    Retrieve all the hits of a table and wrap them in a DRS.

    :param source: string with the name of the table
    :return: a DRS holding the hits the network returns for that table,
        with an OP.ORIGIN operation
    """
    table_hits = self.__network.get_hits_from_table(source)
    return DRS(list(table_hits), Operation(OP.ORIGIN))
def schema_neighbors_of(self, i_drs: DRS) -> DRS:
    """
    Return, for every hit of the input, all the hits of that hit's table.

    A DRS in table mode is first switched to fields mode and has the fields
    of each of its hits' tables absorbed into it, so the input is modified.

    :param i_drs: the input DRS
    :return: a DRS with the provenance of the input and one OP.TABLE result
        set absorbed per input hit
    """
    result = DRS([], Operation(OP.NONE)).absorb_provenance(i_drs)

    if i_drs.mode == DRSMode.TABLE:
        i_drs.set_fields_mode()
        for table_hit in i_drs:
            i_drs = i_drs.absorb(self.drs_from_table_hit(table_hit))

    for hit in i_drs:
        same_table = self.__network.get_hits_from_table(hit.source_name)
        table_drs = DRS(list(same_table), Operation(OP.TABLE, params=[hit]))
        result = result.absorb(table_drs)
    return result
def exact_search(self, kw: str, kw_type: KWType, max_results=10):
    """
    See 'search'. This only returns exact matches.

    :param kw: the keyword to search
    :param kw_type: passed to the store client as the elastic field name
    :param max_results: maximum number of hits requested from the store
    :return: a DRS with the hits found, with an OP.KW_LOOKUP operation
        parameterised by kw
    """
    matches = self._store_client.exact_search_keywords(
        keywords=kw,
        elasticfieldname=kw_type,
        max_hits=max_results,
    )
    # list() materializes the generator
    return DRS(list(matches), Operation(OP.KW_LOOKUP, params=[kw]))
def schema_name_search(self, kw: str, max_results=10) -> DRS:
    """
    Keyword search over the attribute/field names of the data.

    :param kw: the keyword to search
    :param max_results: the maximum number of results to return
    :return: a DRS with the hits found, with an OP.SCHNAME_LOOKUP operation
        parameterised by kw
    """
    matches = store_client.search_keywords(kw, KWType.KW_SCHEMA, max_results)
    # list() materializes the generator
    return DRS(list(matches), Operation(OP.SCHNAME_LOOKUP, params=[kw]))
def keyword_search(self, kw: str, max_results=10) -> DRS:
    """
    Keyword search over the content of the data.

    :param kw: the keyword to search
    :param max_results: the maximum number of results to return
    :return: a DRS with the hits found, with an OP.KW_LOOKUP operation
        parameterised by kw
    """
    matches = store_client.search_keywords(kw, KWType.KW_CONTENT, max_results)
    # list() materializes the generator
    return DRS(list(matches), Operation(OP.KW_LOOKUP, params=[kw]))
def entity_search(self, kw: str, max_results=10) -> DRS:
    """
    Keyword search over the entities represented by the data.

    :param kw: the keyword to search
    :param max_results: the maximum number of results to return
    :return: a DRS with the hits found, with an OP.ENTITY_LOOKUP operation
        parameterised by kw
    """
    matches = store_client.search_keywords(kw, KWType.KW_ENTITIES, max_results)
    # list() materializes the generator
    return DRS(list(matches), Operation(OP.ENTITY_LOOKUP, params=[kw]))
def union(self, a: DRS, b: DRS) -> DRS:
    """
    Return the elements that are in either a or b.

    Both arguments are first converted with _general_to_drs and must then
    pass _assert_same_mode.

    :param a: a DRS, or anything _general_to_drs accepts
    :param b: another DRS, or anything _general_to_drs accepts
    :return: the result of the union of the two converted inputs
    """
    left = self._general_to_drs(a)
    right = self._general_to_drs(b)
    self._assert_same_mode(left, right)
    return left.union(right)
def schema_neighbors(self, field: (str, str, str)) -> DRS:
    """
    Return all the attributes/fields that appear in the same relation as
    the provided field.

    :param field: a (db_name, source_name, field_name) tuple
    :return: a DRS with the hits of the field's table, with an OP.TABLE
        operation parameterised by a Hit built from the provided field
    """
    db_name, source_name, field_name = field
    table_hits = self.__network.get_hits_from_table(source_name)
    # The origin Hit is created with a score of 0.
    origin_id = id_from(db_name, source_name, field_name)
    origin_hit = Hit(origin_id, db_name, source_name, field_name, 0)
    return DRS(list(table_hits), Operation(OP.TABLE, params=[origin_hit]))
def paths(self, drs_a: DRS, drs_b: DRS = None, relation=Relation.PKFK,
          max_hops=2, lean_search=False) -> DRS:
    """
    Is there a transitive relationship between any element in a with any
    element in b? This function finds the answer constrained on the
    primitive (singular for now) that is passed as a parameter.
    If b is not passed, assumes the user is searching for paths between
    elements in a.

    :param drs_a: DRS, or anything _general_to_drs accepts
    :param drs_b: DRS, or anything _general_to_drs accepts; when omitted or
        None, drs_a is used for both ends
    :param relation: the Relation the paths must follow
    :param max_hops: forwarded to the network path search
    :param lean_search: forwarded to the table-mode path search only
    :return: a DRS that absorbed the path result of every (a, b) pair
    """
    drs_a = self._general_to_drs(drs_a)
    # create b if it wasn't passed in.
    if drs_b is None:
        drs_b = drs_a
    else:
        drs_b = self._general_to_drs(drs_b)
    self._assert_same_mode(drs_a, drs_b)

    # absorb the provenance of both a and b
    o_drs = DRS([], Operation(OP.NONE))
    o_drs.absorb_provenance(drs_a)
    if drs_b != drs_a:
        o_drs.absorb_provenance(drs_b)

    for h1, h2 in itertools.product(drs_a, drs_b):
        # there are different network operations for table and field mode
        if drs_a.mode == DRSMode.FIELDS:
            res_drs = self._network.find_path_hit(
                h1, h2, relation, max_hops=max_hops)
        else:
            res_drs = self._network.find_path_table(
                h1, h2, relation, self,
                max_hops=max_hops, lean_search=lean_search)
        o_drs = o_drs.absorb(res_drs)
    return o_drs
def md_neighbors_id(self, hit: Hit, md_neighbors: MRS,
                    relation: Relation) -> DRS:
    """
    Turn the metadata neighbors of a hit into a DRS of Hits.

    For every metadata record, the endpoint that is not the input node is
    looked up in the id-to-names table and added as a Hit.

    :param hit: the input, either a Hit or a str node id
    :param md_neighbors: iterable of records exposing `source` and `target`
    :param relation: the Relation, mapped to an operation through
        get_op_from_relation
    :return: a DRS with the neighbor Hits, whose operation is parameterised
        by the input hit
    :raises TypeError: when hit is neither a Hit nor a str
    """
    if isinstance(hit, str):
        nid = hit
    elif isinstance(hit, Hit):
        nid = str(hit.nid)
    else:
        raise TypeError(
            'hit must be a Hit or a str node id, got {}'.format(
                type(hit).__name__))

    score = 1.0  # TODO: return more meaningful score results
    data = []
    # The loop variable must not be called `hit`: the input is still needed
    # afterwards as the provenance parameter of the operation.
    for md_hit in md_neighbors:
        # pick the end of the metadata link that is not the input node
        k = md_hit.target if md_hit.target != nid else md_hit.source
        (db_name, source_name, field_name, data_type) = self.__id_names[k]
        data.append(Hit(k, db_name, source_name, field_name, score))

    op = self.get_op_from_relation(relation)
    return DRS(data, Operation(op, params=[hit]))
def paths(self, a: DRS, primitives) -> DRS:
    """
    Look for transitive relationships between any two elements of a,
    constrained on the primitive (singular for now) passed as parameter.

    In fields mode a pair of equal hits is skipped; in table mode every
    pair, including an element with itself, is searched.

    :param a: the input DRS
    :param primitives: the relation passed to the network path search
    :return: a DRS with the provenance of a plus every path result absorbed
    """
    result = DRS([], Operation(OP.NONE)).absorb_provenance(a)

    if a.mode == DRSMode.FIELDS:
        for start in a:  # start is a Hit
            for end in a:  # end is a Hit
                if start == end:
                    continue
                found = self.__network.find_path_hit(start, end, primitives)
                result = result.absorb(found)
    elif a.mode == DRSMode.TABLE:
        for start in a:  # start is a table: str
            for end in a:  # end is a table: str
                found = self.__network.find_path_table(
                    start, end, primitives, self)
                result = result.absorb(found)
    return result
def __traverse(self, a: DRS, primitive, max_hops=2) -> DRS:
    """
    Expand an initial DRS through the network, one hop per round, following
    a single primitive.

    :param a: a nid, node, tuple, or DRS
    :param primitive: the relation passed to the network's neighbors_id
    :param max_hops: maximum number of rounds on the graph
    :return: a DRS with the hits reached
    :raises ValueError: when the converted input is in table mode
    """
    a = self._general_to_drs(a)
    reached = DRS([], Operation(OP.NONE))
    if a.mode == DRSMode.TABLE:
        raise ValueError('input mode DRSMode.TABLE not supported')

    reached.absorb_provenance(a)
    frontier = a
    remaining = max_hops
    while remaining > 0:
        remaining -= 1
        for hit in frontier:
            neighbors = self._network.neighbors_id(hit, primitive)
            reached = self.union(reached, neighbors)
        # grow the initial input: everything reached so far is expanded next
        frontier = reached
    return reached
def neighbors_id(self, hit: Hit, relation: Relation) -> DRS:
    """
    Return the graph neighbors of a node that are linked by a relation.

    :param hit: the input, either a Hit or a str node id
    :param relation: the Relation an edge must carry to be included
    :return: a DRS with one Hit per matching neighbor, scored with the
        edge's 'score', whose operation is parameterised by the input hit
    :raises TypeError: when hit is neither a Hit nor a str
    """
    if isinstance(hit, str):
        nid = hit
    elif isinstance(hit, Hit):
        nid = str(hit.nid)
    else:
        # Fail with a clear message instead of hitting an unbound `nid`.
        raise TypeError(
            'hit must be a Hit or a str node id, got {}'.format(
                type(hit).__name__))

    data = []
    neighbours = self.__G[nid]
    for k, v in neighbours.items():
        # v holds the edge data, keyed by relation
        if relation in v:
            score = v[relation]['score']
            (db_name, source_name, field_name, data_type) = self.__id_names[k]
            data.append(Hit(k, db_name, source_name, field_name, score))

    op = self.get_op_from_relation(relation)
    return DRS(data, Operation(op, params=[hit]))
def similar_content_to(self, i_drs: DRS) -> DRS:
    """
    Return the fields linked to the fields of the input through
    Relation.CONTENT_SIM.

    A DRS in table mode is first switched to fields mode and has the fields
    of each of its hits' tables absorbed into it, so the input is modified.

    :param i_drs: the input DRS
    :return: a DRS with the provenance of the input plus the neighbors of
        every input hit
    """
    result = DRS([], Operation(OP.NONE)).absorb_provenance(i_drs)

    if i_drs.mode == DRSMode.TABLE:
        i_drs.set_fields_mode()
        for table_hit in i_drs:
            i_drs = i_drs.absorb(self.drs_from_table_hit(table_hit))

    for hit in i_drs:
        similar = self.__network.neighbors_id(hit, Relation.CONTENT_SIM)
        result = result.absorb(similar)
    return result
def search(self, kw: str, kw_type: KWType, max_results=10) -> DRS:
    """
    Keyword search over the data, scoped by kw_type.

    The scope specifies where elasticsearch should be looking for matches,
    i.e. table titles (SOURCE), columns (FIELD), or comment (SOURCE).

    :param kw: the keyword to search
    :param kw_type: the context type on which to search
    :param max_results: maximum number of results to return
    :return: a DRS with the hits found, with an OP.KW_LOOKUP operation
        parameterised by kw
    """
    matches = self._store_client.search_keywords(
        keywords=kw,
        elasticfieldname=kw_type,
        max_hits=max_results,
    )
    # list() materializes the generator
    return DRS(list(matches), Operation(OP.KW_LOOKUP, params=[kw]))
def get_table_neighbors(hit, relation, paths):
    """
    Collect the table-level neighbors of a hit as (hit, sibling) pairs.

    Reads `self`, `api`, `lean_search` and `check_membership` from the
    enclosing scope.

    :param hit: the hit whose neighbors are searched
    :param relation: the relation the direct neighbors must satisfy
    :param paths: passed to check_membership to skip some neighbors
    :return: list of (x, n) tuples, where n is a direct neighbor in another
        table and x is each hit of n's table
    """
    # Rewriting results - filtering out results that are in the same table
    # as the input. Rewriting prov
    other_table_hits = [
        candidate
        for candidate in self.neighbors_id(hit, relation)
        if candidate.source_name != hit.source_name
    ]
    op = self.get_op_from_relation(relation)
    direct_neighbors = DRS(other_table_hits, Operation(op, params=[hit]))

    results = []
    # FIXME: filter out already seen nodes here
    for neighbor in direct_neighbors:
        if check_membership(neighbor, paths):
            continue
        if lean_search:
            table_hits = api._drs_from_table_hit_lean_no_provenance(neighbor)
        else:
            table_hits = api.drs_from_table_hit(neighbor)  # Brought old API
        # XXX: api.make_drs(neighbor) won't take all table neighbors,
        # only the input one
        # note how the neighbor is included as sibling of x here
        results.extend((x, neighbor) for x in table_hits)
    return results
def pkfk_of(self, i_drs: DRS) -> DRS:
    """
    Return the fields linked to the fields of the input through
    Relation.PKFK.

    A DRS in table mode is first switched to fields mode and has the fields
    of each of its hits' tables absorbed into it, so the input is modified.

    :param i_drs: the input DRS
    :return: a DRS with the provenance of the input plus the neighbors of
        every input hit
    """
    # alternative provenance propagation
    result = DRS([], Operation(OP.NONE)).absorb_provenance(i_drs)

    if i_drs.mode == DRSMode.TABLE:
        i_drs.set_fields_mode()
        for table_hit in i_drs:
            i_drs = i_drs.absorb(self.drs_from_table_hit(table_hit))

    for hit in i_drs:
        linked = self.__network.neighbors_id(hit, Relation.PKFK)
        result = result.absorb(linked)
    return result