Example #1
import pytest


@pytest.fixture(scope="module")
def r():
    """ Provide a module-scoped Greynir instance as a test fixture """
    from reynir import Greynir
    r = Greynir()
    yield r
    # Do teardown here
    r.__class__.cleanup()
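A minimal sketch of a test consuming the fixture above (the test function name and the sample sentence are illustrative, not from the original project):

def test_parse_simple_sentence(r):
    # pytest injects the module-scoped Greynir instance via the "r" fixture
    sent = r.parse_single("Hundurinn eltir köttinn.")
    assert sent is not None
    assert sent.tree is not None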
Example #2
def parse_text_file(file_handle, affix_lemma=1, id_prefix=None, start_index=1, **options):
    """ Parse contiguous text into reynir simple trees in bracket format """
    text = file_handle.read()
    r = Greynir(**options)
    dd = r.parse(text)
    for idx, sent in enumerate(dd["sentences"]):
        nltk_tree = reynir_sentence_to_annotree(sent)
        id_prefix = "" if id_prefix is None else id_prefix
        id_str = "{}.{}".format(id_prefix, idx)
        meta_node = AnnoTree(
            "META",
            [
                AnnoTree("ID-CORPUS", [id_str]),
                AnnoTree("ID-LOCAL", [id_str]),
                AnnoTree("URL", ["greynir.is"]),
                AnnoTree("COMMENT", [""]),
            ],
        )
        meta_tree = AnnoTree("", [meta_node, nltk_tree])
        yield meta_tree
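A hypothetical invocation of parse_text_file; AnnoTree and reynir_sentence_to_annotree are assumed to be defined elsewhere in the same module, and the file name is illustrative:

with open("corpus.txt", "r", encoding="utf-8") as fh:
    for tree in parse_text_file(fh, id_prefix="corpus"):
        # Each yielded item is an AnnoTree wrapping a META node and the parse tree
        print(tree)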
Example #3
def lemmas(q: str, all_lemmas: bool = False) -> Response:
    """ Lemmatization API. """
    if not q:
        return _err("Missing query parameter")
    if len(q) > _MAX_LEMMAS_TXT_LEN:
        return _err(f"Param exceeds max length ({_MAX_LEMMAS_TXT_LEN} chars)")

    # Lazy-load Greynir engine
    global greynir
    if greynir is None:
        greynir = Greynir()

    resp: Dict[str, Any] = dict(q=q)
    try:
        lem: List[Any] = []
        for m in greynir.lemmatize(q, all_lemmas=all_lemmas):
            # TODO: postprocess in some way?
            lem.append(m)
        resp["err"] = False
        resp["lemmas"] = lem
    except Exception as e:
        return _err(f"Villa kom upp við lemmun texta: '{e}'")

    return JSONResponse(content=resp)
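A sketch of how the endpoint might be exercised, assuming the handler is registered as a GET route at /lemmas on a FastAPI app object exported from a hypothetical app module (the route path and module name are assumptions):

from fastapi.testclient import TestClient

from app import app  # hypothetical module exposing the FastAPI app

client = TestClient(app)
r = client.get("/lemmas", params={"q": "konurnar"})
data = r.json()
assert data["err"] is False
print(data["lemmas"])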
Example #4
def lemmatize(self, sent: Iterable[Tok]) -> Iterable[LemmaTuple]:
    """ Lemmatize a sentence (list of tokens), returning
        an iterable of (lemma, category) tuples """
    if self._g is None:
        # Initialize parser singleton
        self.__class__._g = Greynir()
    # Attempt to parse the sentence
    assert self._g is not None
    s = self._g.parse_tokens(sent)
    if s.tree is None:
        # Unable to parse: fall back to simple lemmatizer
        yield from super().lemmatize(sent)
    else:
        # Successfully parsed: obtain the (lemma, category) tuples
        # from the terminals of the parse tree
        yield from s.tree.lemmas_and_cats
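A usage sketch for the method above, assuming it belongs to a lemmatizer class (called GreynirLemmatizer here purely for illustration) whose base class provides the fallback lemmatize(); the tokens come from the tokenizer package that Greynir builds on:

from tokenizer import tokenize

lemmatizer = GreynirLemmatizer()  # hypothetical concrete class
tokens = list(tokenize("Konan las bókina."))
for lemma, cat in lemmatizer.lemmatize(tokens):
    print(lemma, cat)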
Example #5
    assert db.cast_to_accusative("Kattarhestur") == "Kattarhest"
    assert db.cast_to_dative("Kattarhestur") == "Kattarhesti"
    assert db.cast_to_genitive("Kattarhestur") == "Kattarhests"

    f = lambda mm: [m for m in mm if "2" not in m.beyging]
    assert db.cast_to_accusative("fjórir", meaning_filter_func=f) == "fjóra"
    assert db.cast_to_dative("fjórir", meaning_filter_func=f) == "fjórum"
    assert db.cast_to_genitive("fjórir", meaning_filter_func=f) == "fjögurra"

    assert db.cast_to_accusative("Suður-Afríka") == "Suður-Afríku"
    assert db.cast_to_dative("Suður-Afríka") == "Suður-Afríku"
    assert db.cast_to_genitive("Suður-Afríka") == "Suður-Afríku"

    assert db.cast_to_accusative("Vestur-Þýskaland") == "Vestur-Þýskaland"
    assert db.cast_to_dative("Vestur-Þýskaland") == "Vestur-Þýskalandi"
    assert db.cast_to_genitive("Vestur-Þýskaland") == "Vestur-Þýskalands"

    f = lambda mm: sorted(mm, key=lambda m: "2" in m.beyging or "3" in m.beyging)
    assert db.cast_to_accusative("Kópavogur", meaning_filter_func=f) == "Kópavog"
    assert db.cast_to_dative("Kópavogur", meaning_filter_func=f) == "Kópavogi"
    assert db.cast_to_genitive("Kópavogur", meaning_filter_func=f) == "Kópavogs"


if __name__ == "__main__":
    # When invoked as a main module, do a verbose test
    from reynir import Greynir
    r = Greynir()
    test_cases(r)
    test_casting()
    r.__class__.cleanup()
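For context, db in the assertions above is the BÍN inflection database wrapper; a plausible setup sketch follows, but the exact class name, import path and constructor differ between reynir versions and are assumptions here:

from reynir.bindb import GreynirBin  # assumed import path

def test_casting():
    db = GreynirBin()
    assert db.cast_to_accusative("Kattarhestur") == "Kattarhest"
    assert db.cast_to_dative("Kattarhestur") == "Kattarhesti"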
Example #6
def __init__(self, use_icebert: bool = True) -> None:
    self.g = Greynir()
    if use_icebert:
        self.ib = IcebertModel.pos_from_settings()
Example #7
class Lemmatizer:

    SPLIT_WC = 100

    g = None  # Placeholder for Greynir instance
    ib = None  # Placeholder for IceBERT model instance
    device = "cpu"

    def __init__(self, use_icebert: bool = True) -> None:
        self.g = Greynir()
        if use_icebert:
            self.ib = IcebertModel.pos_from_settings()

    def lemmatize_sentence(self, sentence: _Sentence) -> TokLem:
        if sentence.tree is not None:
            return self.g_lemmatize(sentence)
        a_lemmas = []
        a_tokens = []
        tokens = sentence.tokens
        # Split words to not hit 512 token limit in IceBERT
        # Consider making this smarter if dealing with a lot of long sentences.
        for i in range(0, len(tokens), self.SPLIT_WC):
            p_lemmas, p_tokens = self.ib_lemmatize(tokens[i : i + self.SPLIT_WC])
            a_lemmas += p_lemmas
            a_tokens += p_tokens
        return a_lemmas, a_tokens

    def lemmatize(self, text: str) -> List[Tuple[List[str], List[str], str]]:
        lemmatized = []
        parsed = self.parse(text)
        for sentence in parsed:
            lemmas, tokens = self.lemmatize_sentence(sentence)
            lemmatized.append((list(lemmas), tokens, sentence.tidy_text))
        return lemmatized

    def lemmatize_pretty(self, text: str) -> None:
        lemmatized_data = self.lemmatize(text)
        for lemmatized, tokens, _ in lemmatized_data:
            print("---")
            print("\t".join("{:10}".format(v) for v in tokens))
            print("\t".join("{:10}".format(v.lower()) for v in lemmatized))

    def g_lemmatize(self, g_sentence: _Sentence) -> TokLem:
        tokens = [t.txt for t in g_sentence.tokens]
        if g_sentence.tree is None:
            return tokens, tokens
        return g_sentence.tree.lemmas, tokens

    def ib_lemmatize(self, g_tokens: TokenList) -> TokLem:
        tokens = [t.txt for t in g_tokens]
        sent = " ".join(tokens)
        if self.ib is None:
            raise ValueError("Lemmatizer needs to be instantiated with use_icebert.")
        ifds = self.ib.predict_to_idf(sent, device=self.device)
        lemmas = []

        for idx, tok in enumerate(g_tokens):

            cands = tok.val
            if isinstance(cands, int) or isinstance(cands, float):
                # Number
                lemmas.append(tok.txt)
                continue
            if cands and len(cands) > 1 and (isinstance(cands[0], int) or isinstance(cands[0], float)):
                # Punctuation
                lemmas.append(tok.txt)
                continue
            if not cands:
                lemmas.append(tok.txt)
                continue

            lemm_cands = set(c.stofn for c in cands if hasattr(c, "stofn"))
            if len(lemm_cands) == 1:
                # Only one candidate, we use that one
                lemmas.append(lemm_cands.pop())
                continue

            found = False
            for c in cands:
                if hasattr(c, "name"):
                    lemmas.append(c.name)
                    found = True
                    break
                if isinstance(c[0], int):
                    lemmas.append(tok.txt)
                    found = True
                    break
                try:
                    ifd = IFD_Tagset(
                        k=tok.kind,
                        c=c.ordfl,
                        t=c.ordfl,
                        f=c.fl,
                        txt=tok.txt,
                        s=c.stofn,
                        b=c.beyging,
                    )
                except:  # noqa
                    lemmas.append(tok.txt)
                    found = True
                    break
                try:
                    str_ifd = str(ifd)
                except TypeError:
                    # Some oddity in ifdtagger
                    str_ifd = ""
                if str_ifd == ifds[idx]:
                    lemmas.append(c.stofn)
                    found = True
                    break
            if not found:
                lemmas.append(tok.txt)

        return lemmas, tokens

    def parse(self, text: str) -> List[_Sentence]:
        if self.g is None:
            raise ValueError("Greynir needs to be instantiated.")
        text = text.replace("\n", " ").replace("  ", " ")
        return self.g.parse(text)["sentences"]  # type: ignore
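A hypothetical end-to-end use of the Lemmatizer class above; IcebertModel, IFD_Tagset and the TokLem/TokenList/_Sentence aliases are assumed to be defined in the surrounding module, and the sample text is illustrative:

# Greynir-only mode: skip loading the IceBERT model
lemmatizer = Lemmatizer(use_icebert=False)

# Pretty-print tokens and lemmas side by side
lemmatizer.lemmatize_pretty("Greynir þáttar íslenskan texta.")

# Or work with the raw (lemmas, tokens, tidy_text) triples
for lemmas, tokens, text in lemmatizer.lemmatize("Konan las bókina."):
    print(text, "->", lemmas)

Note that with use_icebert=False, any sentence Greynir fails to parse falls through to ib_lemmatize, which raises ValueError because no IceBERT model was loaded.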