Exemplo n.º 1
0
 def try_to_help(cls, query: str, result: ResponseDict) -> None:
     """Try to offer help text for a query that could not be answered.

     Every purely alphabetic word of the query is looked up in BÍN (first
     as-is in lower case, then capitalized). The lemmas found are matched
     against the help text functions registered in ``cls._help_texts``; if
     any match, one of them is picked at random, invoked with the matching
     lemma, and its text is stored in ``result`` as both the written and
     the spoken answer, with the result marked valid.
     """
     found_lemmas = set()
     with GreynirBin.get_db() as bin_db:
         for word in query.lower().split():
             if not word.isalpha():
                 continue
             # Fall back to a capitalized lookup if the lower case form
             # has no meanings
             meanings = bin_db.meanings(word) or bin_db.meanings(
                 word.capitalize())
             if not meanings:
                 continue
             # Lemmas are normalized to lower case without hyphens
             found_lemmas.update({
                 meaning.stofn.lower().replace("-", "")
                 for meaning in meanings
             })
     # Pair each lemma with every help text function registered for it
     candidates: List[Tuple[str, HelpFunc]] = [
         (lemma, func)
         for lemma in found_lemmas
         for func in cls._help_texts.get(lemma, [])
     ]
     if not candidates:
         # Nothing to suggest: leave the result untouched
         return
     # Pick one candidate at random and let it generate the help text
     chosen_lemma, chosen_func = random.choice(candidates)
     help_text = chosen_func(chosen_lemma)
     result["answer"] = help_text
     result["voice"] = help_text
     result["valid"] = True
Exemplo n.º 2
0
def _answer_phonenum4name_query(q: Query, result: Result) -> AnswerTuple:
    """Answer query of the form "hvað er síminn hjá [íslenskt mannsnafn]?"

    Looks up ``result.qkey`` via ``query_ja_api``. If the lookup yields no
    people, an apology is returned. If several people match, the answer asks
    the user to disambiguate, with an example built from the first entry
    that has a usable address. If exactly one person matches, the number
    chosen by ``_best_number`` is returned and stored in the query context.

    Args:
        q: The query being answered; receives context and source on success.
        result: Parse result; its ``qkey`` holds the name that was asked about.

    Returns:
        An answer tuple: either whatever ``gen_answer`` produces, or
        ``(response dict, answer text, voice text)`` for a successful lookup.
    """
    res = query_ja_api(result.qkey)

    # Dative form of the name, falling back to the name as given
    nþgf = NounPhrase(result.qkey).dative or result.qkey

    # Verify that we have a sane response with at least 1 result
    if not res or not res.get("people") or not res["people"].get("items"):
        return gen_answer("Ekki tókst að fletta upp {0}.".format(nþgf))

    # Check if we have a single canonical match from API
    allp = res["people"]["items"]
    first = allp[0]
    fname = first["name"]
    if len(allp) != 1:
        # Many found with that name, generate smart message asking for disambiguation
        name_components = result.qkey.split()
        one_name_only = len(name_components) == 1
        with GreynirBin.get_db() as bdb:
            fn = name_components[0].title()
            gender = bdb.lookup_name_gender(fn)
        msg = (
            "Það fundust {0} með það nafn. Prófaðu að tilgreina {1}heimilisfang"
            .format(
                "margar" if gender == "kvk" else "margir",
                "fullt nafn og " if one_name_only else "",
            ))
        # Try to generate example, e.g. "Jón Jónssón á Smáragötu"
        for i in allp:
            try:
                street_nf = i["address_nominative"].split()[0]
                street_þgf = i["address"].split()[0]
                msg = msg + " t.d. {0} {1} {2}".format(
                    fname, iceprep_for_street(street_nf), street_þgf)
                break
            except (KeyError, IndexError, ValueError) as e:
                # KeyError: the entry has no address fields.
                # IndexError: an address field is empty, so split() yields
                # no first word. Either way, skip to the next entry.
                logging.warning("Exception: %s", e)
                continue
        return gen_answer(msg)

    # Scan API call result, try to find the best phone number to provide
    phone_number = _best_number(first)
    if not phone_number:
        return gen_answer("Ég finn ekki símanúmerið hjá {0}".format(nþgf))

    # Sanitize number and generate answer
    phone_number = phone_number.replace("-", "").replace(" ", "")
    answ = phone_number
    fn = NounPhrase(fname).dative or fname
    # The voice version spells out the number digit by digit
    voice = "Síminn hjá {0} er {1}".format(fn, " ".join(phone_number))

    q.set_context(dict(phone_number=phone_number, name=fname))
    q.set_source(_JA_SOURCE)

    return dict(answer=answ), answ, voice
Exemplo n.º 3
0
def to_genitive(np: str,
                *,
                filter_func: Optional[EntryFilterFunc] = None) -> str:
    """Cast a nominative noun phrase to the genitive case and return it.

    The work is delegated to ``_to_case``, which is handed the BÍN lookup
    function and the genitive casting function of the database, along with
    the optional ``filter_func``.
    """
    with GreynirBin.get_db() as bin_db:
        lookup_func = bin_db.lookup_g
        cast_func = bin_db.cast_to_genitive
        return _to_case(np, lookup_func, cast_func, filter_func=filter_func)
Exemplo n.º 4
0
 def add(word: str) -> None:
     """Register a (wrongly capitalized) word stem in the error sets.

     The last space-separated part of ``word`` is looked up in BÍN with its
     capitalization reversed, and only lemma forms are considered. The
     resulting word is added to ``CapitalizationErrors.SET`` and its
     reverse-cased form to ``CapitalizationErrors.SET_REV``.

     Raises:
         ConfigError: if BÍN has no lemma for the word, or if the word has
             already been defined.
     """
     # Compound phrases such as 'félags- og barnamálaráðherra' are supported:
     # everything up to and including the last space is kept as a prefix
     head, space, suffix = word.rpartition(" ")
     prefix = head + space
     # Hyphen splitting only applies to single words such as
     # 'norður-kórea' and 'nýja-sjáland'
     split_on_hyphen = not space and "-" in word
     db = GreynirBin().get_db()
     # The suffix may only exist in BÍN as a compound, in which case
     # we want its hyphenated lemma
     suffix_rev = CapitalizationErrors.reverse_capitalization(
         suffix, split_on_hyphen=split_on_hyphen)
     _, meanings = db.lookup_g(suffix_rev)
     # Keep only entries where the word form is the lemma itself
     lemma_entries = [mm for mm in meanings if mm.stofn == mm.ordmynd]
     if not lemma_entries:
         raise ConfigError(
             "No BÍN meaning for '{0}' (from error word '{1}') in capitalization_errors section"
             .format(suffix_rev, word))
     lemma = lemma_entries[0].stofn
     if prefix:
         # E.g. 'félags- og barnamálaráðherra', which comes out
         # with a lemma of 'félags- og barnamála-ráðherra'
         word = prefix + lemma
     else:
         # E.g. 'barnamálaráðherra', which comes out
         # with a lemma of 'barnamála-ráðherra'
         word = CapitalizationErrors.emulate_case(lemma, template=word)
     if word in CapitalizationErrors.SET:
         raise ConfigError(
             "Multiple definition of '{0}' in capitalization_errors section"
             .format(word))
     # Store the word along with its reverse-cased counterpart
     word_rev = CapitalizationErrors.reverse_capitalization(
         word, split_on_hyphen=split_on_hyphen)
     CapitalizationErrors.SET.add(word)
     CapitalizationErrors.SET_REV.add(word_rev)
Exemplo n.º 5
0
 def add(word: str, replacement: str, explanation: str) -> None:
     """Register a taboo word with its replacement and explanation.

     ``word`` may carry a word category after an underscore; the part
     before the underscore must be found in BÍN, and if a category is
     given, at least one BÍN entry must have that category.

     Raises:
         ConfigError: if the word is already defined, or is not found in
             BÍN (with the given category, if any).
     """
     if word in TabooWords.DICT:
         raise ConfigError(
             "Multiple definition of '{0}' in taboo_words section".format(
                 word))
     bin_db = GreynirBin.get_db()
     parts = word.split("_")
     stem = parts[0]
     # The category is optional: None means that any category will do
     category = parts[1] if len(parts) >= 2 else None
     _, meanings = bin_db.lookup_g(stem)
     found = bool(meanings) and (
         category is None
         or any(mm.ordfl == category for mm in meanings))
     if not found:
         raise ConfigError(
             "The taboo word '{0}' is not found in BÍN".format(word))
     TabooWords.DICT[word] = (replacement, explanation)
Exemplo n.º 6
0
def recent_persons(limit=_RECENT_PERSONS_LENGTH):
    """Return a list of names and titles appearing recently in the news.

    Up to ``2 * limit`` of the most recent person records are scanned,
    keeping the preferred title for each name, until ``limit`` distinct
    names have been collected. The result is a list of dicts with the keys
    ``name``, ``title``, ``gender``, ``url`` and ``uuid``, sorted by name
    using the collation function supplied by ``changedlocale()``.
    """

    def is_better_title(new_title, old_title):
        """Return True if new_title should replace old_title"""
        new_len, old_len = len(new_title), len(old_title)
        if old_len >= _MAX_TITLE_LENGTH:
            # The current title is too long: any shorter one is an improvement
            return new_len < old_len
        # Otherwise we prefer the longer title, unless it is itself too long
        return new_len < _MAX_TITLE_LENGTH and new_len > old_len

    # Maps name -> (title, article URL, article id, gender)
    best: Dict[str, Tuple[str, str, str, str]] = {}

    with SessionContext(read_only=True) as session:

        query = session.query(Person.name, Person.title, Person.article_url,
                              Article.id)
        query = query.join(Article).join(Root).filter(Root.visible)
        query = query.order_by(desc(cast(Column, Article.timestamp)))
        # Go through up to 2 * N records
        rows = query[0:limit * 2]

        with GreynirBin.get_db() as bindb:
            for row in rows:
                current = best.get(row.name)
                if current is not None and not is_better_title(
                        row.title, current[0]):
                    # We already have an equally good or better title
                    continue
                best[row.name] = (
                    correct_spaces(row.title),
                    row.article_url,
                    row.id,
                    bindb.lookup_name_gender(row.name),
                )
                if len(best) >= limit:
                    # We have as many names as were asked for: stop scanning
                    break

    with changedlocale() as strxfrm:
        # Convert the dictionary to a list of dicts, sorted by name
        persons = [
            dict(name=name,
                 title=entry[0],
                 gender=entry[3],
                 url=entry[1],
                 uuid=entry[2]) for name, entry in best.items()
        ]
        persons.sort(key=lambda person: strxfrm(person["name"]))
        return persons
Exemplo n.º 7
0
 def tagset(self, word, at_sentence_start=False):
     """Return a list of (probability, tag) tuples for the given word.

     ``word`` is a sequence of strings that is joined with spaces and
     tokenized; only the first resulting token is tagged. A word token
     that carries no value is first looked up in BÍN, and falls back to
     an empty meaning list if the lookup fails for any reason.
     """
     tokens = list(parse_tokens(" ".join(word)))
     first = tokens[0]
     form = word[0]
     if first.kind != TOK.WORD or first.val is not None:
         # Not a bare word token: tag it as it is
         return self._ngram_tagger.tag_single_token(first)
     try:
         with GreynirBin.get_db() as db:
             form, meanings = db.lookup_g(first.txt, at_sentence_start)
     except Exception:
         # Best effort: treat the word as unknown
         form, meanings = first.txt, []
     return self._ngram_tagger.tag_single_token(TOK.Word(form, meanings))
Exemplo n.º 8
0
def _mynameis_handler(q: Query, ql: str) -> bool:
    """Handle queries of the form "Ég heiti X" and store the name given.

    Returns True if the query was recognized and answered, False otherwise.
    """
    # Use the first regex that matches the query, if any
    match: Optional[Match[str]] = next(
        (mo for mo in (re.search(rx, ql) for rx in _MY_NAME_IS_REGEXES)
         if mo),
        None,
    )
    if not match:
        return False

    fname = match.group(1).strip()
    if not fname:
        return False

    # Clean up the name string by cutting off trailing chatter:
    # "ég heiti X og blablabla", "ég heiti X hvað heitir þú"
    name = fname.partition(" og ")[0].partition(" hvað ")[0]

    components = name.split()
    first_word = components[0]
    if first_word == "ekki":
        # "ég heiti ekki X": ask what the name is, then
        q.set_answer(*gen_answer("Hvað heitirðu þá?"))
        return True

    # Look up the gender of the first name for a gender-tailored response
    with GreynirBin.get_db() as bdb:
        fn = first_word.title()
        gender = bdb.lookup_name_gender(fn) or "hk"
        answ = _MY_NAME_IS_RESPONSES[gender].format(fn)
        if fn == "Embla":
            answ = "Sæl og blessuð. Ég heiti líka Embla!"

    # Remember the name of this user, if the client is known
    if q.client_id:
        qdata: ClientDataDict = {
            "full": name.title(),
            "first": fn,
            "gender": gender,
        }
        q.set_client_data("name", qdata)

    # Beautify the query by capitalizing the name provided
    q.set_beautified_query(q.beautified_query.replace(name, name.title()))

    # Generate the answer; the voice version has no commas
    q.set_answer(dict(answer=answ), answ, answ.replace(",", ""))
    q.query_is_command()

    return True
Exemplo n.º 9
0
def top_authors(days=_TOP_AUTHORS_PERIOD, session=None):
    """Generate a list of top authors with their parse percentage.

    Considers at most the first 20 authors returned by
    ``BestAuthorsQuery.period`` for the last ``days`` days (with at least
    10 articles each), drops those whose name has the gender "hk", and
    returns at most 10 dicts with the keys ``name``, ``gender`` and ``perc``.
    """
    period_end = datetime.utcnow()
    period_start = period_end - timedelta(days=days)
    candidates = BestAuthorsQuery.period(
        period_start,
        period_end,
        enclosing_session=session,
        min_articles=10,
    )[:20]

    ranked = []
    with GreynirBin.get_db() as bindb:
        for row in candidates:
            name = row[0]
            gender = bindb.lookup_name_gender(name)
            if gender != "hk":
                # "hk" denotes unnamed authors (e.g. "Ritstjórn Vísis"),
                # which are left out
                ranked.append({
                    "name": name,
                    "gender": gender,
                    "perc": round(float(row[4]), 2),
                })

    return ranked[:10]
Exemplo n.º 10
0
def lookup_best_word(word: str) -> Optional[Tuple[str, str, str, str]]:
    """Look up a noun in BÍN and return its four case forms.

    The word is looked up as given and, failing that, capitalized. Returns
    a (nominative, accusative, dative, genitive) tuple for the first
    matching noun, or None if no noun is found.
    """
    with GreynirBin().get_db() as db:

        def find_nouns(form: str):
            """Return the noun entries that BÍN has for the given form"""
            return [
                entry for entry in db.lookup_nominative_g(form)
                if entry.ordfl in ("kk", "kvk", "hk")
            ]

        # Try the word as given, then with an uppercase first character
        nouns = find_nouns(word) or find_nouns(word.capitalize())
        if not nouns:
            return None

        # OK, we have one or more matching nouns
        # TODO: Pick best result; for now the first one is always used
        best = nouns[0]

        wid = best.utg

        # TODO: If more than one declension form possible (e.g. gen. björns vs. bjarnar)
        # we should also list such variations
        def sort_by_preference(m_list: BinEntryIterable) -> BinEntryList:
            # Keep only the entries having the same word ID as the one
            # we successfully looked up in BÍN
            # NOTE(review): the ID is read via 'utg' above but compared
            # against 'bin_id' here - confirm that both attributes exist
            same_word = [entry for entry in m_list if entry.bin_id == wid]
            # Discourage rarer declension forms, i.e. ÞGF2 and ÞGF3
            return sorted(
                same_word,
                key=lambda entry: any(digit in entry.mark for digit in "23"),
            )

        # Look up all cases of the word in BÍN
        nom = best.stofn
        acc = db.cast_to_accusative(nom, filter_func=sort_by_preference)
        dat = db.cast_to_dative(nom, filter_func=sort_by_preference)
        gen = db.cast_to_genitive(nom, filter_func=sort_by_preference)
        return nom, acc, dat, gen
Exemplo n.º 11
0
def dump_tokens(limit):
    """Print the tokens of parsed articles along with their matched terminals.

    Articles that have a parse tree are visited in timestamp order; if
    ``limit`` is not None, only the first ``limit`` of them are printed.
    For word tokens, the stem as given by the terminal descriptor is
    printed as well.
    """

    # Cache of terminal descriptors, keyed by terminal name
    descriptors = {}
    with GreynirBin.get_db() as db, SessionContext(commit=True) as session:
        query = session.query(Article)
        # '!= None' is intentional here: it is a SQLAlchemy column comparison
        query = query.filter(Article.tree != None)
        query = query.order_by(Article.timestamp)
        articles = query.all() if limit is None else query[0:limit]
        for article in articles:
            print(
                "\nARTICLE\nHeading: '{0.heading}'\nURL: {0.url}\nTimestamp: {0.timestamp}"
                .format(article)
            )
            tree = TreeTokenList()
            tree.load(article.tree)
            for ix, toklist in tree.token_lists():
                print("\nSentence {0}:".format(ix))
                # Stays True until the first word token has been processed
                at_start = True
                for t in toklist:
                    if t.tokentype != "WORD":
                        print("    {0.token} {0.cat} {0.terminal}".format(t))
                        continue
                    # Cut off the first and last character of the token text
                    wrd = t.token[1:-1]
                    if t.terminal not in descriptors:
                        descriptors[t.terminal] = TerminalDescriptor(t.terminal)
                    td = descriptors[t.terminal]
                    stem = td.stem(db, wrd, at_start)
                    at_start = False
                    print("    {0} {1} {2}".format(wrd, stem, t.terminal))
Exemplo n.º 12
0
def test() -> None:
    """Run the corrector on a few sample texts and print the results.

    For each sample, the original and the corrected text are printed,
    reformatted into columns with a left margin of 8 spaces, followed
    by the total time taken for all samples.
    """

    with GreynirBin.get_db() as db:
        c = Corrector(db)

        txts = [
            """
        FF er flokkur með rasisku ívafi og tilhneygjingu til að einkavinavæða alla fjölmiðla
        Íslands og færa þar með elítunni að geta ein haft áhrif á skoðanamyndandi áhri í
        fjölmiðlaheiminum, er ekki viðbúið að svona flokkur gamgi til samstarf við íhaldið
        eftir kosningar en ekki þessa vondu félagshyggjuflokka
            """,
            """
        fæ alveg hræðileg drauma vegna fyrri áfalla og það hjálpar mér að ná góðum svef og þar með
        betri andlegri lýðan og líka til að auka matarlist. Tek samt skýrt fram að ég hef bæði
        missnotað kannabis og ekki. Hef engan áhuga á að vera undir áhrifum kannabis alla dag.
        Mikil munur á að nota og missnota !
            """,
            """
        Bæði , lyf gegn áfengissyki (leiða) , mér hefur ekki leiðst mikið seinustu 30 ár. Gegn
        Taugaveiklun, konan hamrar á mér alla daga , skærur hennar eru langar og strangar. En ef ég fæ
        eina pípu og gríp gitarinn má hún tuða í mér klukkutímum saman.Ég er bæði rólegur og læri hratt
        á gítarinn, eftir 10 ára hjónaband er ég bara ótrúlega heill og stefni hátt. Ég og gitarinn erum
        orðnir samvaxnir. Auðvitað stefnum við á skilnað og þá mun ég sakna skalaæfinganna.
            """,
            """
        biddu nu hæg - var Kvennalistinn eins malefnis hreyfing. Hvað attu við - ef þu telur malefnið
        hafa verið eitt hvert var það? Kannski leikskola fyrir öll börn? Sömu laun fyrir sömu störf?
        Að borgarskipulag tæki mið af þörfum beggja kynja? Að kynjagleraugu væru notuð við gerð
        fjarlaga? Að þjoðfelagið opnaði augun fyrir kynferðsofbeldinu og sifjaspellum? (hvorutveggja
        sagt aðeins viðgangast i utlöndum). Þetta eru aðeins örfa dæmi um malefni sem brunnu a okkur
        og við börðumst fyrir. Ekki ertu i alvöru að tala framlag okkur niður. Tæplega
        telurðu það EITT malefni þo að i grunninn hafi baratta okkar sem stoðum að Kvennaframboðinu
        og -listanum gengið ut a að ,,betri,, helmingur þjoðarinnar öðlast - ekki bara i orði heldur
        einnig a borði - sömu rettindi og raðandi helmingurinn
            """,
            """
        Salvör ekki standa i að reyna að klora yfir mistök þin. Reynsluheimur kvenna visar að sjalsögðu
        til þess að helmingur mannkynsins - -konur - er olikur hinum helmingnum bæði sökum lffræðilegs munar og
        þess að þær eru gerðar að konum (sb de Beauvoir) þe fra frumbernsku er drengjum hrosað fyrir annað en
        stulkum og væntingar foreldra eru aðrar til dætra en sona og auk þess er ætlast til að dætur læri af mæðrum en synir af
        feðrum. Það er þetta sem gerir konur - helming mannkynsins - frabrugðna körlum sem hafa fra örofi alda verið
        ,,raðandi,, kynið. Það var gegn þvi orettlæti að reynsluheimur kvenna speglaðist ekki i politiskum akvörðunum sem við
        sem stofnaði Kvennafranboðið og - listann börðumst gegn - a öllum vigstöðvum. Að skilgreina barattu okkar
        Kvennalistans - fyrir rettindum halfrar þjoðarinnar til að skapa ,,rettlatara samfelag,, - sem eins mals flokk er
        fjarstæða.
            """,
        ]

        def linebreak(txt: str, margin: int = 80, left_margin: int = 0) -> str:
            """Return a nicely column-formatted string representation of the given text,
            where each line is not longer than the given margin (if possible).
            A left margin can be optionally added, as a sequence of spaces.
            The lines are joined by newlines ('\n') but there is no trailing
            newline."""
            result: List[str] = []
            line: List[str] = []
            len_line = 0
            for wrd in txt.split():
                # Only break if the current line has content: otherwise a
                # first word that is wider than the margin would cause
                # an empty line to be emitted in front of it
                if line and len_line + 1 + len(wrd) > margin:
                    result.append(" ".join(line))
                    line = []
                    len_line = 0
                line.append(wrd)
                len_line += 1 + len(wrd)
            if line:
                # Flush the last, partially filled line
                result.append(" ".join(line))
            indent = " " * left_margin
            return "\n".join(indent + ln for ln in result)

        t0 = time.time()

        for t in txts:
            print("\nOriginal:\n")
            print(linebreak(t, left_margin=8))
            print("\nCorrected:\n")
            print(linebreak(c.correct_text(t), left_margin=8))

        t1 = time.time()
        print("\nTotal time: {0:.2f} seconds".format(t1 - t0))
Exemplo n.º 13
0
def recognize_entities(
    token_stream: Iterator[Tok],
    enclosing_session: Optional[Session] = None,
    token_ctor: Type[TOK] = TOK,
) -> Iterator[Tok]:
    """ Parse a stream of tokens looking for (capitalized) entity names
        The algorithm implements N-token lookahead where N is the
        length of the longest entity name having a particular initial word.
        Adds a named entity recognition layer on top of the
        reynir.bintokenizer.tokenize() function.

        This is a generator: tokens from token_stream are yielded as-is,
        except that a run of tokens matching a known entity name is
        replaced by a single entity token created via token_ctor, and an
        uppercase token matching the last name of a previously seen
        full name may be replaced by an entity or person token.
        Entity names are fetched from the database through
        enclosing_session (or a session derived from it by SessionContext).
    """

    # Token queue
    tq: List[Tok] = []
    # Phrases we're considering. Note that an entry of None
    # indicates that the accumulated phrase so far is a complete
    # and valid known entity name.
    # The keys are the words that may come next; each value is a list of
    # (remaining words after that one, entity) tuples.
    state: Dict[Union[str, None], List[Tuple[List[str],
                                             Entity]]] = defaultdict(list)
    # Entity definition cache
    ecache: Dict[str, List[Entity]] = dict()
    # Last name to full name mapping ('Clinton' -> 'Hillary Clinton')
    lastnames: Dict[str, Tok] = dict()

    with GreynirBin.get_db() as db, SessionContext(session=enclosing_session,
                                                   commit=True,
                                                   read_only=True) as session:

        def fetch_entities(w: str, fuzzy: bool = True) -> List[Entity]:
            """ Return a list of entities matching the word(s) given,
                exactly if fuzzy = False, otherwise also as a starting word(s) """
            try:
                q = session.query(Entity.name, Entity.verb, Entity.definition)
                if fuzzy:
                    q = q.filter(
                        Entity.name.like(w + " %") | (Entity.name == w))
                else:
                    q = q.filter(Entity.name == w)
                return q.all()
            except OperationalError as e:
                # A database error is logged and treated as 'no entities found'
                logging.warning("SQL error in fetch_entities(): {0}".format(e))
                return []

        def query_entities(w: str) -> List[Entity]:
            """ Return a list of entities matching the initial word given """
            # Results (including empty ones) are cached per word in ecache
            e = ecache.get(w)
            if e is None:
                ecache[w] = e = fetch_entities(w)
            return e

        def lookup_lastname(lastname: str) -> Optional[Tok]:
            """ Look up a last name in the lastnames registry,
                eventually without a possessive 's' at the end, if present """
            fullname = lastnames.get(lastname)
            if fullname is not None:
                # Found it
                return fullname
            # Try without a possessive 's', if present
            if lastname.endswith("s"):
                return lastnames.get(lastname[0:-1])
            # Nope, no match
            return None

        def flush_match():
            """ Flush a match that has been accumulated in the token queue """
            if len(tq) == 1 and lookup_lastname(tq[0].txt) is not None:
                # If single token, it may be the last name of a
                # previously seen entity or person
                return token_or_entity(tq[0])
            # Reconstruct original text behind phrase
            ename = " ".join([t.txt for t in tq])
            # We don't include the definitions in the token - they should be looked up
            # on the fly when processing or displaying the parsed article
            return token_ctor.Entity(ename)

        def token_or_entity(token: Tok) -> Tok:
            """ Return a token as-is or, if it is a last name of a person
                that has already been mentioned in the token stream by full name,
                refer to the full name """
            assert token.txt[0].isupper()
            tfull = lookup_lastname(token.txt)
            if tfull is None:
                # Not a last name of a previously seen full name
                return token
            if tfull.kind != TOK.PERSON:
                # Return an entity token with no definitions
                # (this will eventually need to be looked up by full name when
                # displaying or processing the article)
                return token_ctor.Entity(token.txt)
            # Return the full name meanings
            return token_ctor.Person(token.txt, tfull.person_names)

        try:

            # The loop is terminated by next() raising StopIteration,
            # which is caught by the handler below
            while True:

                token = next(token_stream)

                if not token.txt:  # token.kind != TOK.WORD:
                    # A token without text ends any match in progress
                    if state:
                        if None in state:
                            yield flush_match()
                        else:
                            yield from tq
                        tq = []
                        state = defaultdict(list)
                    yield token
                    continue

                # Look for matches in the current state and build a new state
                newstate = defaultdict(list)
                w = token.txt  # Original word

                def add_to_state(slist, entity):
                    """ Add the list of subsequent words to the new parser state """
                    # An empty word list is stored under the None key,
                    # marking a complete entity name
                    wrd = slist[0] if slist else None
                    rest = slist[1:]
                    newstate[wrd].append((rest, entity))

                if w in state:
                    # This matches an expected token
                    tq.append(token)  # Add to lookahead token queue
                    # Add the matching tails to the new state
                    for sl, entity in state[w]:
                        add_to_state(sl, entity)
                    # Update the lastnames mapping
                    fullname = " ".join([t.txt for t in tq])
                    parts = fullname.split()
                    # If we now have 'Hillary Rodham Clinton',
                    # make sure we delete the previous 'Rodham' entry
                    for p in parts[1:-1]:
                        if p in lastnames:
                            del lastnames[p]
                    if parts[-1][0].isupper():
                        # 'Clinton' -> 'Hillary Rodham Clinton'
                        lastnames[parts[-1]] = token_ctor.Entity(fullname)
                else:
                    # Not a match for an expected token
                    if state:
                        if None in state:
                            # We have an accumulated match, but if the next token
                            # is an uppercase word without a BÍN meaning, we
                            # append it to the current entity regardless.
                            # This means that 'Charley Lucknow' is handled as a single
                            # new entity name even if 'Charley' already exists
                            # as an entity.
                            while w and w[0].isupper() and not token.val:
                                # Append to the accumulated token queue, which will
                                # be squashed to a single token in flush_match()
                                tq.append(token)
                                # If the stream is exhausted here, StopIteration
                                # propagates to the handler below and the queue
                                # is flushed after the loop
                                token = next(token_stream)
                                w = token.txt
                            # Flush the already accumulated match
                            yield flush_match()
                        else:
                            # The partial match did not complete:
                            # emit the queued tokens unchanged
                            yield from tq
                        tq = []

                    # Add all possible new states for entity names
                    # that could be starting
                    weak = True
                    cnt = 1
                    upper = w and w[0].isupper()
                    parts = []

                    if upper and " " in w:
                        # For all uppercase phrases (words, entities, persons),
                        # maintain a map of last names to full names
                        parts = w.split()
                        lastname = parts[-1]
                        # Clinton -> Hillary [Rodham] Clinton
                        if lastname[0].isupper():
                            # Look for Icelandic patronyms/matronyms
                            _, m = db.lookup_g(lastname, False)
                            if m and any(mm.fl in {"föð", "móð"} for mm in m):
                                # We don't store Icelandic patronyms/matronyms
                                # as surnames
                                pass
                            else:
                                lastnames[lastname] = token

                    if token.kind == TOK.WORD and upper and w not in Abbreviations.DICT:
                        if " " in w:
                            # w may be a person name with more than one embedded word
                            # parts is assigned in the if statement above
                            cnt = len(parts)
                        elif not token.has_meanings or (
                                "-" in token.meanings[0].stofn):
                            # No BÍN meaning for this token, or the meanings
                            # were constructed by concatenation (indicated by a hyphen
                            # in the stem)
                            weak = False  # Accept single-word entity references
                        # elist is a list of Entity instances
                        elist = query_entities(w)
                    else:
                        elist = []

                    if elist:
                        # This word might be a candidate to start an entity reference
                        candidate = False
                        for e in elist:
                            # List of subsequent words in entity name
                            # (skipping the cnt words already present in this token)
                            sl = e.name.split()[cnt:]
                            if sl:
                                # Here's a candidate for a longer entity reference
                                # than we already have
                                candidate = True
                            if sl or not weak:
                                add_to_state(sl, e)
                        if weak and not candidate:
                            # Found no potential entity reference longer than this token
                            # already is - and we have a BÍN meaning for it:
                            # Abandon the effort
                            assert not newstate
                            assert not tq
                            yield token_or_entity(token)
                        else:
                            # Go for it: Initialize the token queue
                            tq = [token]
                    else:
                        # Not a start of an entity reference: simply yield the token
                        assert not tq
                        if upper:
                            # Might be a last name referring to a full name
                            yield token_or_entity(token)
                        else:
                            yield token

                # Transition to the new state
                state = newstate

        except StopIteration:
            # Token stream is exhausted
            pass

        # Yield an accumulated match if present
        if state:
            if None in state:
                yield flush_match()
            else:
                yield from tq
            tq = []

    # print("\nEntity cache:\n{0}".format("\n".join("'{0}': {1}".format(k, v) for k, v in ecache.items())))
    # print("\nLast names:\n{0}".format("\n".join("{0}: {1}".format(k, v) for k, v in lastnames.items())))

    # At this point, all queued tokens must have been emitted
    assert not tq
Exemplo n.º 14
0
            port=Settings.PORT,
            debug=Settings.DEBUG,
            use_reloader=not ptvsd_attached,
            extra_files=extra_files,
        )
    except socket_error as e:
        if e.errno == errno.EADDRINUSE:  # Address already in use
            logging.error(
                "Greynir web app is already running at host {0}:{1}".format(
                    Settings.HOST, Settings.PORT))
            sys.exit(1)
        else:
            raise
    finally:
        ArticleProxy.cleanup()
        GreynirBin.cleanup()

else:
    app.config["PRODUCTION"] = True

    # Suppress information log messages from Werkzeug
    werkzeug_log = logging.getLogger("werkzeug")
    if werkzeug_log:
        werkzeug_log.setLevel(logging.WARNING)

    # Log our startup
    log_str = ("Greynir instance starting with "
               "host={0}:{1}, db_host={2}:{3} on Python {4}".format(
                   Settings.HOST,
                   Settings.PORT,
                   Settings.DB_HOSTNAME,
Exemplo n.º 15
0
    db_conn = sqlite3.connect(db_path, check_same_thread=False)
    db_conn.row_factory = lambda c, r: dict(
        zip([col[0] for col in c.description], r))

    q = "SELECT DISTINCT nafn FROM ornefni;"

    res = db_conn.cursor().execute(q)

    matches = [row["nafn"] for row in res]

    num_bin = 0
    num_comb = 0
    num_fail = 0

    with GreynirBin.get_db() as db:
        for m in matches:
            w = m.strip()
            if " " in w or "-" in w or "-" in w:
                continue

            # Direct BÍN lookup
            meanings = db.meanings(w)
            if meanings:
                num_bin += 1
                continue

            # Lookup using BÍN and combinator
            _, meanings = db.lookup_g(w, auto_uppercase=True)
            if meanings:
                num_comb += 1