Example #1
def query_which_route(query, session, result):
    """ Which routes stop at a given bus stop """
    stop_name = result.stop_name  # 'Einarsnes', 'Fiskislóð'...

    if stop_name in {"þar", "þangað"}:
        # Referring to a bus stop mentioned earlier
        ctx = query.fetch_context()
        if ctx and "bus_stop" in ctx:
            stop_name = ctx["bus_stop"]
            result.qkey = stop_name
        else:
            answer = voice_answer = "Ég veit ekki við hvaða stað þú átt."
            response = dict(answer=answer)
            return response, answer, voice_answer

    bus_noun = result.bus_noun  # 'strætó', 'vagn', 'leið'...
    stops = straeto.BusStop.named(stop_name, fuzzy=True)
    if not stops:
        a = [stop_name, "þekkist ekki."]
        va = [
            "Ég", "þekki", "ekki", "biðstöðina", stop_name.capitalize(),
        ]
    else:
        routes = set()
        if query.location:
            straeto.BusStop.sort_by_proximity(stops, query.location)
        stop = stops[0]
        for route_id in stop.visits.keys():
            number = straeto.BusRoute.lookup(route_id).number
            routes.add(number)
        va = [bus_noun, "númer"]
        a = va[:]
        nroutes = len(routes)
        cnt = 0
        for rn in sorted(routes, key=lambda t: int(t)):
            if cnt:
                sep = "og" if cnt + 1 == nroutes else ","
                va.append(sep)
                a.append(sep)
            # We convert inflectable numbers to their text equivalents
            # since the speech engine can't be relied upon to get the
            # inflection of numbers right
            va.append(numbers_to_neutral(rn))
            a.append(rn)
            cnt += 1
        tail = ["stoppar á", to_dative(stop.name)]
        va.extend(tail)
        a.extend(tail)
        # Store a location coordinate and a bus stop name in the context
        query.set_context({"location": stop.location, "bus_stop": stop.name})

    voice_answer = correct_spaces(" ".join(va) + ".")
    answer = correct_spaces(" ".join(a))
    answer = answer[0].upper() + answer[1:]
    response = dict(answer=answer)
    return response, answer, voice_answer
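The loop above joins route numbers with commas and the word 'og' ("and") before the last one, relying on correct_spaces to tidy the spacing afterwards. A minimal, self-contained sketch of just that separator logic, with made-up route numbers:

routes = {"1", "3", "6", "12"}          # hypothetical route numbers
parts = ["leið", "númer"]
nroutes = len(routes)
for cnt, rn in enumerate(sorted(routes, key=int)):
    if cnt:
        # Comma between numbers, 'og' before the last one
        parts.append("og" if cnt + 1 == nroutes else ",")
    parts.append(rn)
print(" ".join(parts))  # 'leið númer 1 , 3 , 6 og 12'; correct_spaces() then fixes ' ,'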
Example #2
 def VillaEndingANA(self, txt: str, variants: str,
                    node: Node) -> AnnotationDict:
     # 'þingflokkana' should probably be 'þingflokkanna'
     # In this case, we need the genitive form
     # of the token in self._tokens[node.start]
     tnode = self._terminal_nodes[node.start]
     suggestion = tnode.genitive_np
     correct_np = correct_spaces(suggestion)
     canonical_np = tnode.canonical_np
     if canonical_np.endswith("ar"):
         # This might be something like 'landsteinar' which is only plural
         detail = ("Karlkyns orð sem enda á '-ar' í nefnifalli fleirtölu, "
                   "eins og '{0}', eru rituð "
                   "'{1}' með tveimur n-um í eignarfalli fleirtölu, "
                   "ekki '{2}' með einu n-i.").format(
                       canonical_np, correct_np, txt)
     else:
         detail = ("Karlkyns orð sem enda á '-{3}' í nefnifalli eintölu, "
                   "eins og '{0}', eru rituð "
                   "'{1}' með tveimur n-um í eignarfalli fleirtölu, "
                   "ekki '{2}' með einu n-i.").format(
                       canonical_np, correct_np, txt, canonical_np[-2:])
     return dict(
         text="Á sennilega að vera '{0}'".format(correct_np),
         detail=detail,
         suggestion=suggestion,
     )
Example #3
def parse_tsv_file(file_handle, reorder=True):
    """ Parse .tsv file of the format:
            flag, uuid, sentence_index, text, url [, datetime]
        if the number of sentences in text is not 1 (according to the tokenizer/parser)
        then they will be merged naively.
        """
    parser = Reynir()
    filtered = []
    for (line_idx, line) in enumerate(file_handle):
        flags, uuid, idx, text, url, *_ = line.strip().split("\t")[:6]
        should_export = False if not flags else "1" in flags
        if not should_export:
            continue
        filtered.append(
            CorpusEntry(flags=flags, uuid=uuid, text=text, index=idx, url=url)
        )
    if reorder:
        filtered = sorted(filtered, key=lambda e: len(e.text.split(" ")))
    for entry in filtered:
        res = parser.parse(correct_spaces(entry.text))
        annotrees = []
        for sent in res["sentences"]:
            tree = reynir_sentence_to_annotree(sent)
            annotrees.append(tree)
        first, *rest = annotrees
        for tree in rest:
            first.insert(len(first), tree)
        id_corpus = "{0}.{1}".format(entry.uuid, entry.index)
        yield CorpusTree(id_corpus=id_corpus, tree=first, url=entry.url)
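For reference, a minimal sketch (made-up line and values, no Reynir or CorpusEntry dependency) of how one .tsv line is split into columns and filtered by the export flag, mirroring the loop above:

line = "1\tabc-123\t0\tHalló heimur\thttps://example.com\t2019-01-01"  # hypothetical entry
flags, uuid, idx, text, url, *_ = line.strip().split("\t")[:6]
should_export = False if not flags else "1" in flags
print(should_export, uuid, idx, text, url)
# -> True abc-123 0 Halló heimur https://example.com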
Example #4
 def VillaFsMeðFallstjórn(self, txt: str, variants: str,
                          node: Node) -> AnnotationDict:
     # The preposition z should govern case x, not case y
     tnode = self._terminal_nodes[node.start]
     p = tnode.enclosing_tag("PP")
     subj = None
     if p is not None:
         try:
             subj = p.NP
         except AttributeError:
             pass
     if subj:
         preposition = p.P.text
         suggestion = preposition + " " + self.cast_to_case(variants, subj)
         correct_np = correct_spaces(suggestion)
         return dict(
             text="Á sennilega að vera '{0}'".format(correct_np),
             detail=("Forsetningin '{0}' stýrir {1}falli.".format(
                 preposition.lower(),
                 CASE_NAMES[variants],
             )),
             suggestion=suggestion,
         )
     # In this case, there's no suggested correction
     return dict(
         text="Forsetningin '{0}' stýrir {1}falli.".format(
             txt.split()[0].lower(), CASE_NAMES[variants]
         )
     )
Example #5
    def parse(self, result):
        """ Parse the query from its string, returning True if valid """
        self._tree = None  # Erase previous tree, if any
        self._error = None  # Erase previous error, if any
        self._qtype = None  # Erase previous query type, if any
        self._key = None
        self._toklist = None

        q = self._query.strip()
        if not q:
            self.set_error("E_EMPTY_QUERY")
            return False

        toklist = tokenize(q, auto_uppercase=self._auto_uppercase and q.islower())
        toklist = list(recognize_entities(toklist, enclosing_session=self._session))

        actual_q = correct_spaces(" ".join(t.txt for t in toklist if t.txt))
        if actual_q:
            actual_q = actual_q[0].upper() + actual_q[1:]
            if not any(actual_q.endswith(s) for s in ("?", ".", "!")):
                actual_q += "?"

        # Update the beautified query string, as the actual_q string
        # probably has more correct capitalization
        self.set_beautified_query(actual_q)

        if Settings.DEBUG:
            # Log the query string as seen by the parser
            print("Query is: '{0}'".format(actual_q))

        parse_result, trees = Query._parse(toklist)

        if not trees:
            # No parse at all
            self.set_error("E_NO_PARSE_TREES")
            return False

        result.update(parse_result)

        if result["num_sent"] != 1:
            # Queries must be one sentence
            self.set_error("E_MULTIPLE_SENTENCES")
            return False
        if result["num_parsed_sent"] != 1:
            # Unable to parse the single sentence
            self.set_error("E_NO_PARSE")
            return False
        if 1 not in trees:
            # No sentence number 1
            self.set_error("E_NO_FIRST_SENTENCE")
            return False
        # Looks good
        # Store the resulting parsed query as a tree
        tree_string = "S1\n" + trees[1]
        if Settings.DEBUG:
            # Log the resulting parse tree
            print(tree_string)
        self._tree = Tree()
        self._tree.load(tree_string)
        # Store the token list
        self._toklist = toklist
        return True
Example #6
def query_which_route(query, session, result):
    """ Which routes stop at a given bus stop """
    stop_name = result.stop_name  # 'Einarsnes', 'Fiskislóð'...
    bus_noun = result.bus_noun  # 'strætó', 'vagn', 'leið'...
    stops = straeto.BusStop.named(stop_name, fuzzy=True)
    if not stops:
        a = [stop_name, "þekkist ekki."]
        va = [
            "Ég",
            "þekki",
            "ekki",
            "biðstöðina",
            stop_name.capitalize(),
        ]
    else:
        routes = set()
        if query.location:
            straeto.BusStop.sort_by_proximity(stops, query.location)
        stop = stops[0]
        for route_id in stop.visits.keys():
            number = straeto.BusRoute.lookup(route_id).number
            routes.add(number)
        va = [bus_noun, "númer"]
        a = va[:]
        nroutes = len(routes)
        cnt = 0
        for rn in sorted(routes, key=lambda t: int(t)):
            if cnt:
                sep = "og" if cnt + 1 == nroutes else ","
                va.append(sep)
                a.append(sep)
            # We convert inflectable numbers to their text equivalents
            # since the speech engine can't be relied upon to get the
            # inflection of numbers right
            va.append(NUMBERS_NEUTRAL.get(rn, rn))
            a.append(rn)
            cnt += 1
        tail = ["stoppar á", to_dative(stop.name)]
        va.extend(tail)
        a.extend(tail)

    voice_answer = correct_spaces(" ".join(va) + ".")
    answer = correct_spaces(" ".join(a))
    response = dict(answer=answer)
    return response, answer, voice_answer
Example #7
def recent_persons(limit=_RECENT_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news """
    toplist = dict()

    with SessionContext(read_only=True) as session:

        q = (
            session.query(Person.name, Person.title, Person.article_url, Article.id)
            .join(Article)
            .join(Root)
            .filter(Root.visible)
            .order_by(desc(Article.timestamp))[
                0 : limit * 2
            ]  # Go through up to 2 * N records
        )

        def is_better_title(new_title, old_title):
            len_new = len(new_title)
            len_old = len(old_title)
            if len_old >= _MAX_TITLE_LENGTH:
                # Too long: we want a shorter one
                return len_new < len_old
            if len_new >= _MAX_TITLE_LENGTH:
                # This one is too long: we don't want it
                return False
            # Otherwise, longer is better
            return len_new > len_old

        with BIN_Db.get_db() as bindb:
            for p in q:
                # Insert the name into the list if it's not already there,
                # or if the new title is longer than the previous one
                if p.name not in toplist or is_better_title(
                    p.title, toplist[p.name][0]
                ):
                    toplist[p.name] = (
                        correct_spaces(p.title),
                        p.article_url,
                        p.id,
                        bindb.lookup_name_gender(p.name),
                    )
                    if len(toplist) >= limit:
                        # We now have as many names as we initially wanted: terminate the loop
                        break

    with changedlocale() as strxfrm:
        # Convert the dictionary to a sorted list of dicts
        return sorted(
            [
                dict(name=name, title=tu[0], gender=tu[3], url=tu[1], uuid=tu[2])
                for name, tu in toplist.items()
            ],
            key=lambda x: strxfrm(x["name"]),
        )
Example #8
def append_answers(rd, q, prop_func):
    """ Iterate over query results and add them to the result dictionary rd """
    for p in q:
        s = correct_spaces(prop_func(p))
        ai = dict(
            domain=p.domain,
            uuid=p.id,
            heading=p.heading,
            timestamp=p.timestamp,
            ts=p.timestamp.isoformat()[0:16],
            url=p.url,
        )
        rd[s][p.id] = ai  # Add to a dict of UUIDs
Example #9
 def annotate_wrong_subject_case(subj_case_abbr, correct_case_abbr):
     """ Create an annotation that describes a verb having a subject
         in the wrong case """
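     # Note: this helper is an excerpt; 'self', 'tnode', 'node' and 'verb'
     # are assumed to come from an enclosing scope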
     wrong_case = CASE_NAMES[subj_case_abbr]
     # Retrieve the correct case
     correct_case = CASE_NAMES[correct_case_abbr]
     # Try to recover the verb's subject
     subj = self.find_verb_subject(tnode)
     code = "P_WRONG_CASE_" + subj_case_abbr + "_" + correct_case_abbr
     personal = "persónuleg" if correct_case_abbr == "nf" else "ópersónuleg"
     if subj is not None:
         # We know what the subject is: annotate it
         start, end = subj.span
         subj_text = subj.tidy_text
         suggestion = self.cast_to_case(correct_case_abbr, subj)
         correct_np = correct_spaces(suggestion)
         correct_np = emulate_case(correct_np, subj_text)
         # Skip the annotation if it suggests the same text as the
         # original one; this can happen if the word forms for two
         # cases are identical
         if subj_text != correct_np:
             self._ann.append(
                 Annotation(
                     start=start,
                     end=end,
                     code=code,
                     text="Á líklega að vera '{0}'".format(correct_np),
                     detail="Sögnin 'að {0}' er {3}. "
                     "Frumlag hennar á að vera "
                     "í {1}falli í stað {2}falls.".format(
                         verb, correct_case, wrong_case, personal),
                     suggest=suggestion,
                 ))
     else:
         # We don't seem to find the subject, so just annotate the verb.
         # In this case, there's no suggested correction.
         index = node.token.index
         self._ann.append(
             Annotation(
                 start=index,
                 end=index,
                 code=code,
                 text="Frumlag sagnarinnar 'að {0}' "
                 "á að vera í {1}falli".format(verb, correct_case),
                 detail="Sögnin 'að {0}' er {3}. "
                 "Frumlag hennar á að vera "
                 "í {1}falli í stað {2}falls.".format(
                     verb, correct_case, wrong_case, personal),
             ))
Example #10
 def _query_string_from_toklist(toklist: Iterable[Tok]) -> str:
     """ Re-create a query string from an auto-capitalized token list """
     actual_q = correct_spaces(" ".join(t.txt for t in toklist if t.txt))
     if actual_q:
         # Fix stuff that the auto-capitalization tends to get wrong,
         # such as 'í Dag'
         for wrong, correct in _CAPITALIZATION_REPLACEMENTS:
             actual_q = actual_q.replace(wrong, correct)
         # Capitalize the first letter of the query
         actual_q = actual_q[0].upper() + actual_q[1:]
         # Terminate the query with a question mark,
         # if not otherwise terminated
         if not any(actual_q.endswith(s) for s in ("?", ".", "!")):
             actual_q += "?"
     return actual_q
Example #11
def query_api(version=1):
    """ Respond to a query string """

    if not (1 <= version <= 1):
        return better_jsonify(valid=False, reason="Unsupported version")

    if request.method == "GET":
        q = request.args.get("q", "")
    else:
        q = request.form.get("q", "")
    q = q.strip()[0:_MAX_QUERY_LENGTH]

    # Auto-uppercasing can be turned off by sending autouppercase: false in the query JSON
    auto_uppercase = bool_from_request(request, "autouppercase", True)
    result = dict()
    ql = q.lower()

    if ql in _SPECIAL_QUERIES or (ql + "?") in _SPECIAL_QUERIES:
        result["valid"] = True
        result["qtype"] = "Special"
        result["q"] = q
        if ql in _SPECIAL_QUERIES:
            result["response"] = _SPECIAL_QUERIES[ql]
        else:
            result["response"] = _SPECIAL_QUERIES[ql + "?"]
    else:
        with SessionContext(commit=True) as session:

            toklist = tokenize(
                q, auto_uppercase=q.islower() if auto_uppercase else False
            )
            toklist = list(recognize_entities(toklist, enclosing_session=session))
            actual_q = correct_spaces(" ".join(t.txt for t in toklist if t.txt))

            # if Settings.DEBUG:
            #     # Log the query string as seen by the parser
            #     print("Query is: '{0}'".format(actual_q))

            # Try to parse and process as a query
            try:
                is_query = process_query(session, toklist, result)
            except Exception:
                is_query = False

        result["valid"] = is_query
        result["q"] = actual_q

    return better_jsonify(**result)
Example #12
 def VillaEndingIR(self, txt: str, variants: str,
                   node: Node) -> AnnotationDict:
     # 'læknirinn' should probably be 'lækninn'
     # In this case, we need the accusative form
     # of the token in self._tokens[node.start]
     tnode = self._terminal_nodes[node.start]
     suggestion = tnode.accusative_np
     correct_np = correct_spaces(suggestion)
     article = " með greini" if "gr" in tnode.all_variants else ""
     return dict(
         text="Á sennilega að vera '{0}'".format(correct_np),
         detail=("Karlkyns orð sem enda á '-ir' í nefnifalli eintölu, "
                 "eins og '{0}', eru rituð "
                 "'{1}' í þolfalli{2}.".format(tnode.canonical_np,
                                               correct_np, article)),
         suggestion=suggestion,
     )
Example #13
def append_names(rd, q, prop_func):
    """ Iterate over query results and add them to the result dictionary rd,
        assuming that the key is a person name """
    for p in q:
        s = correct_spaces(prop_func(p))
        ai = dict(
            domain=p.domain,
            uuid=p.id,
            heading=p.heading,
            timestamp=p.timestamp,
            ts=p.timestamp.isoformat()[0:16],
            url=p.url,
        )
        # Obtain the key within rd that should be updated with new
        # data. This may be an existing key, a new key or None if no
        # update is to be performed.
        s = name_key_to_update(rd, s)
        if s is not None:
            rd[s][p.id] = ai  # Add to a dict of UUIDs
Example #14
def query_person_title(session, name):
    """ Return the most likely title for a person """
    def we_dont_like(answer):
        """ Return False if we don't like this title and
            would prefer another one """
        # Skip titles that simply say that somebody is the husband or
        # wife of somebody else
        return answer.startswith(_DONT_LIKE_TITLE)

    rl = _query_person_titles(session, name)
    len_rl = len(rl)
    index = 0
    while index < len_rl and we_dont_like(rl[index]["answer"]):
        index += 1
    if index >= len_rl:
        # If we don't like any answer anyway, go back to the topmost one
        index = 0
    if index >= len_rl:
        # The result list is empty: no title to return
        return "", None
    return correct_spaces(
        rl[index]["answer"]), rl[index]["sources"][0]["domain"]
Example #15
 def correct_text(self,
                  text: StringIterable,
                  *,
                  only_rare: bool = False) -> str:
     """Attempt to correct all words within a text, returning the corrected text.
     If only_rare is True, correction is only attempted on rare words."""
     result: List[str] = []
     look_back = -MAX_ORDER + 1
     for token in tokenize(text):
         if token.kind == TOK.WORD:
             if only_rare and not self.is_rare(token.txt):
                 # The word is not rare, so we don't attempt correction
                 result.append(token.txt)
             else:
                 # Correct the word and return the result
                 result.append(
                     self.correct(token.txt,
                                  context=tuple(result[look_back:])))
         elif token.txt:
             result.append(token.txt)
         elif token.kind in {TOK.S_BEGIN, TOK.S_END}:
             result.append("")
     return correct_spaces(" ".join(result))
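The context passed to self.correct above is a sliding window over the tokens emitted so far. A standalone illustration of that negative-index slice, assuming MAX_ORDER is 3 (the real value comes from the correction module):

MAX_ORDER = 3                      # assumed value, for illustration only
look_back = -MAX_ORDER + 1         # -2: keep at most the two preceding tokens
result = ["þetta", "er", "prófun"]
print(tuple(result[look_back:]))   # -> ('er', 'prófun')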
Example #16
    def _node_text(self, node: Node, original_case: bool = False) -> str:
        """ Return the text within the span of the node """
        def text(t):
            """ If the token t is a word token, return a lower case
                version of its text, unless we have a reason to keep
                the original case, i.e. if it is a lemma that is upper case
                in BÍN """
            if t.kind != TOK.WORD:
                # Not a word token: keep the original text
                return t.txt
            if len(t.txt) > 1 and t.txt.isupper():
                # All uppercase: keep it that way
                return t.txt
            if t.val and any(m.stofn[0].isupper() for m in t.val):
                # There is an uppercase lemma for this word in BÍN:
                # keep the original form
                return t.txt
            # No uppercase lemma in BÍN: return a lower case copy
            return t.txt.lower()

        first, last = self._node_span(node)
        text_func = (lambda t: t.txt) if original_case else text
        return correct_spaces(" ".join(
            text_func(t) for t in self._tokens[first:last + 1] if t.txt))
Example #17
def query_entity_def(session, name):
    """ Return a single (best) definition of an entity """
    rl = _query_entity_definitions(session, name)
    return correct_spaces(rl[0]["answer"]) if rl else ""
Example #18
def query_person_title(session, name):
    """ Return the most likely title for a person """
    rl = _query_person_titles(session, name)
    return correct_spaces(rl[0]["answer"]) if rl else ""
Example #19
 def annotate(self, sent: Sentence) -> List[Annotation]:
     """Returns a list of annotations for a sentence object, containing
     spelling and grammar annotations of that sentence"""
     ann: List[Annotation] = []
     parsed = sent.deep_tree is not None
     # Create a mapping from token indices to terminal indices.
     # This is necessary because not all tokens are included in
     # the token list that is passed to the parser, and therefore
     # the terminal-token matches can be fewer than the original tokens.
     token_to_terminal: Dict[int, int] = {}
     if parsed:
         token_to_terminal = {
             tnode.index: ix
             for ix, tnode in enumerate(sent.terminal_nodes)
             if tnode.index is not None
         }
     grammar = self.parser.grammar
     # First, add token-level annotations and count words that occur in BÍN
     words_in_bin = 0
     words_not_in_bin = 0
     for ix, t in enumerate(sent.tokens):
         if t.kind == TOK.WORD:
             if t.has_meanings:
                 # The word has at least one meaning
                 words_in_bin += 1
             else:
                 # The word has no recognized meaning
                 words_not_in_bin += 1
         elif t.kind == TOK.PERSON:
             # Person names count as recognized words
             words_in_bin += 1
         elif t.kind == TOK.ENTITY:
             # Entity names do not count as recognized words;
             # we count each enclosed word in the entity name
             words_not_in_bin += t.txt.count(" ") + 1
         # Note: these tokens and indices are the original tokens from
         # the submitted text, including ones that are not understood
         # by the parser, such as quotation marks and exotic punctuation
         annotate = False
         if getattr(t, "error_code", None):
             # This is a CorrectToken instance (or a duck typing equivalent)
             assert isinstance(t, CorrectToken)  # Satisfy Mypy
             annotate = True
             if parsed and ix in token_to_terminal:
                 # For the call to suggestion_does_not_match(), we need a
                 # BIN_Token instance, which we obtain in a bit of a hacky
                 # way by creating it on the fly
                 bin_token = BIN_Parser.wrap_token(t, ix)
                 # Obtain the original BIN_Terminal instance from the grammar
                 terminal_index = token_to_terminal[ix]
                 terminal_node = sent.terminal_nodes[terminal_index]
                 original_terminal = terminal_node.original_terminal
                 if original_terminal not in grammar.terminals:
                     # At least one case, finna→Finna, gets the terminal "person_kvk"
                     # which isn't found in grammar.terminals!
                     annotate = False
                     continue
                 assert original_terminal is not None
                 terminal = grammar.terminals[original_terminal]
                 assert isinstance(terminal, VariantHandler)
                 if t.suggestion_does_not_match(terminal, bin_token):
                     # If this token is annotated with a spelling suggestion,
                     # do not add it unless it works grammatically
                     annotate = False
             if annotate:
                 a = Annotation(
                     start=ix,
                     end=ix + t.error_span - 1,
                     code=t.error_code,
                     text=t.error_description,
                     detail=t.error_detail,
                     references=t.error_references,
                     original=t.error_original,
                     suggest=t.error_suggest,
                 )
                 ann.append(a)
     # Then, look at the whole sentence
     num_words = words_in_bin + words_not_in_bin
     if (num_words > 2 and words_in_bin / num_words < ICELANDIC_RATIO
             and "E004" not in self._ignore_rules):
         # The sentence contains less than 50% Icelandic
         # words: assume it's in a foreign language and discard the
         # token level annotations
         ann = [
             # E004: The sentence is probably not in Icelandic
             Annotation(
                 start=0,
                 end=len(sent.tokens) - 1,
                 code="E004",
                 text="Málsgreinin er sennilega ekki á íslensku",
                 detail=
                 "{0:.0f}% orða í henni finnast ekki í íslenskri orðabók".
                 format(words_not_in_bin / num_words * 100.0),
             )
         ]
     elif not parsed:
         if self._annotate_unparsed_sentences and "E001" not in self._ignore_rules:
             # If the sentence couldn't be parsed,
             # put an annotation on it as a whole.
             # In this case, we keep the token-level annotations.
             err_index = sent.err_index or 0
             start = max(0, err_index - 1)
             end = min(len(sent.tokens), err_index + 2)
             toktext = correct_spaces(" ".join(
                 t.txt for t in sent.tokens[start:end] if t.txt))
             ann.append(
                 # E001: Unable to parse sentence
                 Annotation(
                     start=0,
                     end=len(sent.tokens) - 1,
                     code="E001",
                     text="Málsgreinin fellur ekki að reglum",
                     detail="Þáttun brást í kringum {0}. tóka ('{1}')".
                     format(err_index + 1, toktext),
                 ))
     else:
         # Successfully parsed:
         # Add annotations for error-marked nonterminals from the grammar
         # found in the parse tree
         ErrorFinder(ann, sent).run()
         # Run the pattern matcher on the sentence,
         # annotating questionable patterns
         PatternMatcher(ann, sent).run()
     # Sort the annotations by their start token index,
     # and then by decreasing span length
     ann.sort(key=lambda a: (a.start, -a.end))
     # Eliminate duplicates, i.e. identical annotation
     # codes for identical spans
     i = 1
     while i < len(ann):
         a, prev = ann[i], ann[i - 1]
         if a.code == prev.code and a.start == prev.start and a.end == prev.end:
             # Identical annotation: remove it from the list
             del ann[i]
         else:
             # Check the next pair
             i += 1
     # Remove ignored annotations
     ann = [a for a in ann if a.code not in self._ignore_rules]
     return ann
Example #20
 def annotate(sent: _Sentence) -> List[Annotation]:
     """ Returns a list of annotations for a sentence object, containing
         spelling and grammar annotations of that sentence """
     ann: List[Annotation] = []
     words_in_bin = 0
     words_not_in_bin = 0
     # First, add token-level annotations
     for ix, t in enumerate(sent.tokens):
         if t.kind == TOK.WORD:
             if t.val:
                 # The word has at least one meaning
                 words_in_bin += 1
             else:
                 # The word has no recognized meaning
                 words_not_in_bin += 1
         elif t.kind == TOK.PERSON:
             # Person names count as recognized words
             words_in_bin += 1
         # Note: these tokens and indices are the original tokens from
         # the submitted text, including ones that are not understood
         # by the parser, such as quotation marks and exotic punctuation
         if hasattr(t, "error_code"):
             assert isinstance(t, CorrectToken)
             if t.error_code:
                 ann.append(
                     Annotation(
                         start=ix,
                         end=ix + t.error_span - 1,
                         code=t.error_code,
                         text=t.error_description,
                     ))
     # Then, look at the whole sentence
     num_words = words_in_bin + words_not_in_bin
     if num_words > 2 and words_in_bin / num_words < ICELANDIC_RATIO:
         # The sentence contains less than 50% Icelandic
         # words: assume it's in a foreign language and discard the
         # token level annotations
         ann = [
             # E004: The sentence is probably not in Icelandic
             Annotation(
                 start=0,
                 end=len(sent.tokens) - 1,
                 code="E004",
                 text="Málsgreinin er sennilega ekki á íslensku",
                 detail=
                 "{0:.0f}% orða í henni finnast ekki í íslenskri orðabók".
                 format(words_not_in_bin / num_words * 100.0))
         ]
     elif sent.deep_tree is None:
         # If the sentence couldn't be parsed,
         # put an annotation on it as a whole.
         # In this case, we keep the token-level annotations.
         err_index = sent.err_index or 0
         start = max(0, err_index - 1)
         end = min(len(sent.tokens), err_index + 2)
         toktext = correct_spaces(" ".join(t.txt
                                           for t in sent.tokens[start:end]
                                           if t.txt))
         ann.append(
             # E001: Unable to parse sentence
             Annotation(
                 start=0,
                 end=len(sent.tokens) - 1,
                 code="E001",
                 text="Málsgreinin fellur ekki að reglum",
                 detail="Þáttun brást í kring um {0}. tóka ('{1}')".format(
                     err_index + 1, toktext)))
     else:
         # Successfully parsed:
         # Add annotations for error-marked nonterminals from the grammar
         # found in the parse tree
         ErrorFinder(ann, sent).go()
         # Run the pattern matcher on the sentence,
         # annotating questionable patterns
         PatternMatcher(ann, sent).go()
     # Sort the annotations by their start token index,
     # and then by decreasing span length
     ann.sort(key=lambda a: (a.start, -a.end))
     return ann