Example #1

# Imports assumed by this snippet (the original example omits its module
# header); Searcher, SearchResult, Typer, SimpleTyper, ElasticDB,
# QUERY_SCRIPT and SETTINGS are provided by the surrounding package.
import itertools
import logging
import re
import typing
from dataclasses import dataclass, field
from pathlib import Path

from elasticsearch import Elasticsearch

log = logging.getLogger(__name__)


@dataclass  # field(default_factory=...) defaults assume dataclass processing
class ElasticSearcher(Searcher):
    index: str
    baseuri: str = ""  # prefix prepended to entity ids (concatenated as a string below)
    propbaseuri: str = ""  # prefix prepended to context property ids
    es_kwargs: typing.Dict = field(default_factory=dict)
    parts: bool = False  # retry unmatched queries on sub-parts of the query string
    prop_uri: typing.Dict[str, typing.Dict] = field(default_factory=dict)
    prop_baseuri: typing.Dict = field(default_factory=dict)
    typer: Typer = SimpleTyper()
    NUM = re.compile(r"^[\d\W]+$")  # strings of only digits and punctuation

    def __enter__(self):
        # The matching __exit__ is assumed to come from the Searcher base class
        self.es = Elasticsearch(**self.es_kwargs)
        return self

    def _about(self, source):
        # Rewrite property keys via prop_uri and prefix list values with their base URI
        about = {}
        for k, vs in list(source.items()):
            baseuri = self.prop_baseuri.get(k, "")
            k = self.prop_uri.get(k, k)
            if isinstance(vs, list):
                about[k] = [(baseuri + v if isinstance(v, str) else v)
                            for v in vs]
        return about

    def get_parts(self, query):
        # Yield sub-parts of the query, split on brackets and separators;
        # e.g. "Foo (Bar)" yields "Foo", "Bar" and the bracket-stripped query
        for char in "([,:":
            for qpart in query.split(char):
                qpart = qpart.translate(str.maketrans("", "", ")]")).strip()
                if qpart != query and not qpart.isnumeric():
                    yield qpart

    def make_query_body(self, query, **kwargs):
        # Parameters for the stored search template (registered by store_template)
        return {"id": "query", "params": {"query": query, **kwargs}}

    def search_entities(
        self,
        query_params,
        limit=1,
        add_about=False,
        ispart=False,
    ):
        # Materialize the queries so we can return early when there are none
        query_params = tuple(query_params)
        if not query_params:
            return

        it = iter(query_params)
        while True:
            query_chunk = tuple(itertools.islice(it, 10**3))  # chunk per 1000
            if not query_chunk:
                break

            log.debug(f"Submitting ES multiquery of size {len(query_chunk)}")

            bodies = []
            for query, params in query_chunk:
                context = params.get("context", [])
                classes = params.get("classes", [])
                context = [
                    {"value": c} for c in (context or []) if not self.NUM.match(c)
                ]
                classes = [{"value": c.split("/")[-1]} for c in (classes or [])]

                body = self.make_query_body(
                    query, context=context, classes=classes, limit=limit
                )
                bodies.append({})  # msearch header line; empty means default index
                bodies.append(body)

            esresponses = self.es.msearch_template(
                index=self.index, body=bodies
            ).get("responses", [])
            for (query, params), esresponse in zip(query_chunk, esresponses):
                context = params.get("context", [])
                classes = params.get("classes", [])
                results = []
                for hit in esresponse.get("hits", {}).get("hits", []):
                    uri = hit.get("_source", {}).get("id")
                    if self.baseuri:
                        uri = self.baseuri + uri
                    score = hit.get("_score", 0)
                    about = self._about(hit.get("_source", {}))

                    context_matches: typing.Dict = {}
                    if ("context" in about) and isinstance(context, dict):
                        for ec in about["context"]:
                            prop = ec.get("prop")
                            if self.propbaseuri:
                                prop = self.propbaseuri + prop
                            vals = ec.get("value", [])
                            for v in vals if isinstance(vals, list) else [vals]:
                                for c, csource in context.items():
                                    for m in self.typer.literal_match(v, c):
                                        pms = context_matches.setdefault(csource, {})
                                        pms.setdefault(prop, []).append(m)

                    sr = SearchResult(uri,
                                      about,
                                      context_matches=context_matches,
                                      score=score)
                    results.append(sr)

                if not results:
                    # log.debug(f"No {self} results for {query}")
                    if self.parts and (not ispart):
                        # Retry with sub-parts of the query, passing the
                        # original params through as (query, params) pairs
                        partqueries = [(p, params) for p in self.get_parts(query)]
                        more = self.search_entities(
                            partqueries,
                            limit=limit,
                            add_about=add_about,
                            ispart=True,
                        )
                        for srs in more:
                            results += srs

                yield results

    @classmethod
    def create(
        cls,
        input: typing.Optional[Path] = None,
        format: str = "ttl",
        surfaces: typing.Optional[Path] = None,
        refcounts: typing.Optional[Path] = None,
        es_index: typing.Optional[str] = None,
        es_kwargs: typing.Optional[typing.Dict] = None,
        recreate: bool = True,
        thread_count: int = 8,
        baseuris: typing.Sequence[str] = (),
        uri_prefLabel: typing.Sequence[str] = (
            "http://www.w3.org/2004/02/skos/core#prefLabel",
        ),
        uri_altLabel: typing.Sequence[str] = (
            "http://www.w3.org/2004/02/skos/core#altLabel",
        ),
        uri_type: str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
        lang: str = "en",
        extract_surface: bool = False,
        uri_ignore: typing.Optional[str] = None,
        surface_norm: str = r"[^\s\w]",
        context_label_threshold: float = 1,
        output_statements: bool = True,
    ):
        import re

        baseuris_re = [re.compile(b) for b in baseuris]
        uri_ignore_re = re.compile(uri_ignore) if uri_ignore else None
        ignore_uri = lambda uri: uri_ignore_re and uri_ignore_re.match(uri)

        surface_norm_re = re.compile(surface_norm)

        def normalize_surface(s):
            if isinstance(s, dict):
                s = "".join(s.values())
            return surface_norm_re.sub(" ",
                                       s.replace("_", " ").lower()).strip()

        def debase(uri):
            for baseuri in baseuris_re:
                uri = baseuri.sub("", uri)
            return uri
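
        # e.g. with baseuris=(r"http://example\.org/entity/",) (made-up prefix),
        # debase("http://example.org/entity/Q42") returns "Q42".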

        id_surfaceformscores = {}
        if surfaces and surfaces.exists():
            import tqdm, json, sys
            import urllib.parse as ul

            for line in tqdm.tqdm(open(surfaces),
                                  desc="Loading surface forms"):
                try:
                    id, surf = line.split("\t", 1)
                    id = debase(ul.unquote_plus(id))
                    id_surfaceformscores[id] = {
                        normalize_surface(k): v
                        for k, v in json.loads(surf).items()
                    }
                except Exception as e:
                    print(e, file=sys.stderr)

        id_refcount = {}
        if refcounts and refcounts.exists():
            import tqdm, json, sys
            import urllib.parse as ul

            for line in tqdm.tqdm(open(refcounts), desc="Loading refcounts"):
                try:
                    id, n = line.split("\t", 1)
                    id = debase(ul.unquote_plus(id))
                    id_refcount[id] = int(n)
                except Exception as e:
                    print(e, file=sys.stderr)

        def parse_n3(node, base=False):
            import urllib.parse as ul

            if node:
                if node[0] == "[" and node[-1] == "]":
                    return {}
                if node[0] == "<" and node[-1] == ">":
                    uri = ul.unquote_plus(node[1:-1])
                    return {"id": uri if base else debase(uri)}
                elif '"^^' in node:
                    val, dtype = node.rsplit("^^", 1)
                    if dtype.endswith(">"):
                        dtype = ul.unquote_plus(dtype[1:-1])
                    return {(dtype if base else debase(dtype)): val[1:-1]}
                elif '"@' in node:
                    val, l = node.rsplit("@", 1)
                    if l == lang:
                        return {"str": val[1:-1]}
                elif node[0] == '"' and node[-1] == '"':
                    return {"str": node[1:-1]}
                elif ":" in node:
                    return {"id": node}

        def parse_ttl(fname):
            for line in open(fname):
                try:
                    s_n3, claims = line.split(None, 1)
                    id = parse_n3(s_n3)["id"]
                    statements = []
                    surface_score = {}
                    for claim in set(
                            claims.rstrip().rstrip(".").rstrip().split(" ; ")):
                        p_n3, o_n3 = claim.split(None, 1)
                        p, o = parse_n3(p_n3)["id"], parse_n3(o_n3)
                        if " " in p:
                            continue
                        if any(o.values()):
                            if p in uri_prefLabel and ("str" in o):
                                surface_score[normalize_surface(o["str"])] = 1.0
                            elif p in uri_altLabel and ("str" in o):
                                surface_score[normalize_surface(o["str"])] = 0.5
                            else:
                                statements.append({"prop": p, **o})
                    yield id, surface_score, statements
                except Exception as e:
                    log.warning(e)

        def parse_json(fname):
            import json

            for line in open(fname):
                if line[0] == "[":
                    continue
                # strip the trailing comma from JSON-array dump lines
                doc = json.loads(line if line[-2] != "," else line[:-2])
                label = doc.get("labels", {}).get(lang, {}).get("value")
                surface_score = {}
                if label:
                    surface_score[normalize_surface(label)] = 1.0
                for alias in doc.get("aliases", {}).get(lang, []):
                    if alias.get("value"):
                        surface_score[normalize_surface(alias["value"])] = 0.5
                yield doc.get("id"), surface_score, ElasticDB._wd_statements(doc)

        def stream():
            lines = parse_ttl(input) if format == "ttl" else parse_json(input)
            for id, surface_score, statements in lines:
                if ignore_uri(id):
                    continue

                if extract_surface:
                    surface_score[normalize_surface(id)] = 1

                types = set()
                prop_context: typing.Dict = {}
                filtered_statements = []
                for st in statements:
                    if ignore_uri(st.get("id", "")):
                        continue

                    if st.get("prop") == uri_type and ("id" in st):
                        types.add(st["id"])
                    else:
                        filtered_statements.append(st)
                        if "id" in st and "prop" in st:
                            vals = set()
                            if extract_surface:
                                vals.add(normalize_surface(st["id"]))
                            for l, ls in id_surfaceformscores.get(st["id"], {}).items():
                                if ls >= context_label_threshold:
                                    vals.add(l)
                            if vals:
                                prop_context.setdefault(st["prop"], set()).update(vals)
                context = [
                    {"prop": p, "value": list(vs)} for p, vs in prop_context.items()
                ]

                surface_score.update(id_surfaceformscores.get(id, {}))

                if not surface_score:
                    continue

                yield {
                    "id": id,
                    "type": list(types),
                    "surface": [
                        {"value": l, "score": c} for l, c in surface_score.items()
                    ],
                    **({"statements": filtered_statements} if output_statements else {}),
                    "context": context,
                    **({"refs": id_refcount[id]} if id in id_refcount else {}),
                }
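
                # Each streamed document looks like (illustrative values):
                #   {"id": "Q42", "type": ["Q5"],
                #    "surface": [{"value": "douglas adams", "score": 1.0}],
                #    "statements": [...], "context": [...], "refs": 123}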

        if es_index:
            from elasticsearch import Elasticsearch, helpers
            import time, sys

            es_kwargs = es_kwargs or {}
            es = Elasticsearch(timeout=1000, **es_kwargs)

            if recreate:
                cls.init_index(es_index=es_index, es_kwargs=es_kwargs)

            results = helpers.parallel_bulk(
                es,
                ({"_index": es_index, **d} for d in stream()),
                thread_count=thread_count,
            )
            for status, r in results:
                if not status:
                    print("ERROR", r, file=sys.stderr)

            time.sleep(1)
            print(
                f"Indexed {es.count(index=es_index).get('count')} documents",
                file=sys.stderr,
            )
        else:
            import json

            for doc in stream():
                print(json.dumps(doc))

    @classmethod
    def test(cls,
             index: str,
             *query: str,
             limit: int = 1,
             es_kwargs: typing.Optional[typing.Dict] = None):
        """Search an Elasticsearch index for a query string"""
        import json

        es_kwargs = es_kwargs or {}
        with cls(index, es_kwargs=es_kwargs) as searcher:
            # params must be a mapping, since search_entities calls params.get()
            queries = [(q, {}) for q in query]
            for results in searcher.search_entities(queries, limit=limit):
                for e in results:
                    print(json.dumps(e))

    @classmethod
    def store_template(cls, es_kwargs: typing.Optional[typing.Dict] = None):
        es_kwargs = es_kwargs or {}
        body = {"script": {"lang": "mustache", "source": QUERY_SCRIPT}}
        host = es_kwargs.get("host", "localhost")
        port = es_kwargs.get("port", "9200")
        import requests

        return requests.post(f"http://{host}:{port}/_scripts/query",
                             json=body).text

    @classmethod
    def init_index(cls, es_index: str, es_kwargs: typing.Optional[typing.Dict] = None):
        import sys

        es_kwargs = dict(es_kwargs or {})
        es = Elasticsearch(timeout=1000, **es_kwargs)
        es.indices.delete(index=es_index, ignore=[400, 404])
        print("Creating index...", file=sys.stderr)
        es.indices.create(index=es_index, body=SETTINGS)
        print(cls.store_template(es_kwargs=es_kwargs))
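

# --- Usage sketch (hypothetical; not part of the original example) ----------
# Assumes an Elasticsearch server reachable with default client settings and
# that the surrounding package provides Searcher, SearchResult, SimpleTyper,
# QUERY_SCRIPT and SETTINGS. The index name and query are made up; populate
# the index with ElasticSearcher.create(...) before expecting results.
if __name__ == "__main__":
    ElasticSearcher.init_index(es_index="demo-entities")
    with ElasticSearcher(index="demo-entities") as searcher:
        queries = [("douglas adams", {"classes": [], "context": {}})]
        for results in searcher.search_entities(queries, limit=3):
            for result in results:
                print(result)  # SearchResult(uri, about, context_matches, score)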