class ElasticSearcher(Searcher): index: str baseuri: typing.Dict = field(default_factory=dict) propbaseuri: typing.Dict = field(default_factory=dict) es_kwargs: typing.Dict = field(default_factory=dict) parts: bool = False prop_uri: typing.Dict[str, typing.Dict] = field(default_factory=dict) prop_baseuri: typing.Dict = field(default_factory=dict) typer: Typer = SimpleTyper() NUM = re.compile("^[\d\W]+$") def __enter__(self): self.es = Elasticsearch(**self.es_kwargs) return self def _about(self, source): about = {} for k, vs in list(source.items()): baseuri = self.prop_baseuri.get(k, "") k = self.prop_uri.get(k, k) if isinstance(vs, list): about[k] = [(baseuri + v if isinstance(v, str) else v) for v in vs] return about def get_parts(self, query): for char in "([,:": for qpart in query.split(char): qpart = qpart.translate(str.maketrans("", "", ")]")).strip() if qpart != query and not qpart.isnumeric(): yield qpart def make_query_body(self, query, **kwargs): return {"id": "query", "params": {"query": query, **kwargs}} def search_entities( self, query_params, limit=1, add_about=False, ispart=False, ): # Simplify classes query_params = tuple(query_params) if not query_params: return it = iter(query_params) while True: query_chunk = tuple(itertools.islice(it, 10**3)) # chunk per 1000 if not query_chunk: break log.debug(f"Submitting ES multiquery of size {len(query_chunk)}") bodies = [] for query, params in query_chunk: context, classes = params.get("context", []), params.get("classes", []) context = [{ "value": c } for c in (context or []) if not self.NUM.match(c)] classes = [{ "value": c.split("/")[-1] } for c in (classes or [])] body = self.make_query_body(query, context=context, classes=classes, limit=limit) bodies.append(()) bodies.append(body) esresponses = self.es.msearch_template(index=self.index, body=bodies).get( "responses", []) for (query, params), esresponse in zip(query_chunk, esresponses): context, classes = params.get("context", []), params.get("classes", []) results = [] for hit in esresponse.get("hits", {}).get("hits", []): uri = hit.get("_source", {}).get("id") if self.baseuri: uri = self.baseuri + uri score = hit.get("_score", 0) about = self._about(hit.get("_source", {})) context_matches: typing.Dict = {} if ("context" in about) and isinstance(context, dict): for ec in about["context"]: prop = ec.get("prop") if self.propbaseuri: prop = self.propbaseuri + prop vals = ec.get("value", []) for v in vals if isinstance(vals, list) else [vals]: for c, csource in context.items(): for m in self.typer.literal_match(v, c): pms = context_matches.setdefault( csource, {}) pms.setdefault(prop, []).append(m) sr = SearchResult(uri, about, context_matches=context_matches, score=score) results.append(sr) if not results: # log.debug(f"No {self} results for {query}") if self.parts and (not ispart): partqueries = [(p, context) for p in self.get_parts(query)] more = self.search_entities( partqueries, limit=limit, add_about=add_about, ispart=True, ) for srs in more: results += srs yield results @classmethod def create( cls, input: Path = None, format: str = "ttl", surfaces: Path = None, refcounts: Path = None, es_index: str = None, es_kwargs: typing.Dict = None, recreate: bool = True, thread_count: int = 8, baseuris: typing.Sequence[str] = (), uri_prefLabel: typing.Sequence[str] = ( "http://www.w3.org/2004/02/skos/core#prefLabel", ), uri_altLabel: typing.Sequence[str] = ( "http://www.w3.org/2004/02/skos/core#altLabel", ), uri_type: str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", lang: str = "en", extract_surface: bool = False, uri_ignore: str = None, surface_norm: str = "[^\s\w]", context_label_threshold: float = 1, output_statements: bool = True, ): import re baseuris_re = [re.compile(b) for b in baseuris] uri_ignore_re = re.compile(uri_ignore) if uri_ignore else None ignore_uri = lambda uri: uri_ignore_re and uri_ignore_re.match(uri) surface_norm_re = re.compile(surface_norm) def normalize_surface(s): if isinstance(s, dict): s = "".join(s.values()) return surface_norm_re.sub(" ", s.replace("_", " ").lower()).strip() def debase(uri): for baseuri in baseuris_re: uri = baseuri.sub("", uri) return uri id_surfaceformscores = {} if surfaces and surfaces.exists(): import tqdm, json, sys import urllib.parse as ul for line in tqdm.tqdm(open(surfaces), desc="Loading surface forms"): try: id, surf = line.split("\t", 1) id = debase(ul.unquote_plus(id)) id_surfaceformscores[id] = { normalize_surface(k): v for k, v in json.loads(surf).items() } except Exception as e: print(e, file=sys.stderr) id_refcount = {} if refcounts and refcounts.exists(): import tqdm, json, sys import urllib.parse as ul for line in tqdm.tqdm(open(refcounts), desc="Loading refcounts"): try: id, n = line.split("\t", 1) id = debase(ul.unquote_plus(id)) id_refcount[id] = int(n) except Exception as e: print(e, file=sys.stderr) def parse_n3(node, base=False): import urllib.parse as ul if node: if node[0] == "[" and node[-1] == "]": return {} if node[0] == "<" and node[-1] == ">": uri = ul.unquote_plus(node[1:-1]) return {"id": uri if base else debase(uri)} elif '"^^' in node: val, dtype = node.rsplit("^^", 1) if dtype.endswith(">"): dtype = ul.unquote_plus(dtype[1:-1]) return {(dtype if base else debase(dtype)): val[1:-1]} elif '"@' in node: val, l = node.rsplit("@", 1) if l == lang: return {"str": val[1:-1]} elif node[0] == '"' and node[-1] == '"': return {"str": node[1:-1]} elif ":" in node: return {"id": node} def parse_ttl(fname): for line in open(fname): try: s_n3, claims = line.split(None, 1) id = parse_n3(s_n3)["id"] statements = [] surface_score = {} for claim in set( claims.rstrip().rstrip(".").rstrip().split(" ; ")): p_n3, o_n3 = claim.split(None, 1) p, o = parse_n3(p_n3)["id"], parse_n3(o_n3) if " " in p: continue if any(o.values()): if p in uri_prefLabel and ("str" in o): surface_score[normalize_surface( o["str"])] = 1.0 elif p in uri_altLabel and ("str" in o): surface_score[normalize_surface( o["str"])] = 0.5 else: statements.append({"prop": p, **o}) yield id, surface_score, statements except Exception as e: log.warn(e) pass # raise e def parse_json(fname): for line in open(fname): try: if line[0] == "[": continue doc = json.loads(line if line[-2] != "," else line[:-2]) label = doc.get("labels", {}).get(lang, {}).get("value") surface_score = {} if label: surface_score[normalize_surface(label)] = 1.0 for alias in doc.get("aliases", {}).get(lang, []): if alias.get("value"): surface_score[normalize_surface( alias["value"])] = 0.5 yield doc.get( "id"), surface_score, ElasticDB._wd_statements(doc) except Exception as e: raise e def stream(): lines = parse_ttl(input) if format == "ttl" else parse_json(input) for id, surface_score, statements in lines: if ignore_uri(id): continue if extract_surface: surface_score[normalize_surface(id)] = 1 types = set() prop_context: typing.Dict = {} filtered_statements = [] for st in statements: if ignore_uri(st.get("id", "")): continue if st.get("prop") == uri_type and ("id" in st): types.add(st["id"]) else: filtered_statements.append(st) if "id" in st and "prop" in st: vals = set() if extract_surface: vals.add(normalize_surface(st["id"])) for l, ls in id_surfaceformscores.get( st["id"], {}).items(): if ls >= context_label_threshold: vals.add(l) if vals: prop_context.setdefault(st["prop"], set()).update(vals) context = [{ "prop": p, "value": list(vs) } for p, vs in prop_context.items()] surface_score.update(id_surfaceformscores.get(id, {})) if not surface_score: continue yield { "id": id, "type": list(types), "surface": [{ "value": l, "score": c } for l, c in surface_score.items()], **({ "statements": filtered_statements } if output_statements else {}), "context": context, **({ "refs": id_refcount[id] } if id in id_refcount else {}), } if es_index: from elasticsearch import Elasticsearch, helpers import time, sys es_kwargs = es_kwargs or {} es = Elasticsearch(timeout=1000, **es_kwargs) if recreate: cls.init_index(es_index=es_index, es_kwargs=es_kwargs) results = helpers.parallel_bulk( es, ({ "_index": es_index, **d } for d in stream()), thread_count=thread_count, ) for i, (status, r) in enumerate(results): if not status: print("ERROR", r, file=sys.stderr) time.sleep(1) print( f"Indexed {es.count(index=es_index).get('count')} documents", file=sys.stderr, ) else: import json for doc in stream(): print(json.dumps(doc)) @classmethod def test(cls, index: str, *query: str, limit: int = 1, es_kwargs: typing.Dict = None): """Search an Elasticsearch index for a query string """ import json es_kwargs = es_kwargs or {} with cls(index, es_kwargs=es_kwargs) as es: queries = [(q, ()) for q in query] for es in es.search_entities(queries, limit=limit): for e in es: print(json.dumps(e)) @classmethod def store_template(cls, es_kwargs: typing.Dict = None): es_kwargs = es_kwargs or {} body = {"script": {"lang": "mustache", "source": QUERY_SCRIPT}} host = es_kwargs.get("host", "localhost") port = es_kwargs.get("port", "9200") import requests return requests.post(f"http://{host}:{port}/_scripts/query", json=body).text @classmethod def init_index(cls, es_index: str, es_kwargs: typing.Dict = None): import sys es_kwargs = dict(es_kwargs or {}) es = Elasticsearch(timeout=1000, **es_kwargs) es.indices.delete(index=es_index, ignore=[400, 404]) print("Creating index...", file=sys.stderr) es.indices.create(index=es_index, body=SETTINGS) print(cls.store_template(es_kwargs=es_kwargs))