def try_to_help(cls, query: str, result: ResponseDict) -> None:
    """Attempt to help the user in the case of a failed query,
    based on lemmas in the query string"""
    # Collect a set of lemmas that occur in the query string
    lemmas = set()
    with GreynirBin.get_db() as db:
        for token in query.lower().split():
            if token.isalpha():
                m = db.meanings(token)
                if not m:
                    # Try an uppercase version, just in case (pun intended)
                    m = db.meanings(token.capitalize())
                if m:
                    lemmas |= set(mm.stofn.lower().replace("-", "") for mm in m)
    # Collect a list of potential help text functions from the query modules
    help_text_funcs: List[Tuple[str, HelpFunc]] = []
    for lemma in lemmas:
        help_text_funcs.extend(
            [
                (lemma, help_text_func)
                for help_text_func in cls._help_texts.get(lemma, [])
            ]
        )
    if help_text_funcs:
        # Found at least one help text func matching a lemma in the query.
        # Select a function at random and invoke it with the matched
        # lemma as a parameter
        lemma, help_text_func = random.choice(help_text_funcs)
        result["answer"] = result["voice"] = help_text_func(lemma)
        result["valid"] = True
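
# --- Illustrative usage sketch (not part of the original module) ---
# Assumes a dispatcher class exposing try_to_help() as a classmethod and a
# plain-dict ResponseDict; 'QueryDispatcher' and the query string below are
# hypothetical names for illustration only.
#
# result: ResponseDict = {"valid": False}
# QueryDispatcher.try_to_help("hvernig er veðrið", result)
# if result["valid"]:
#     print(result["answer"])  # A randomly chosen help text for a matched lemma
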
def _answer_phonenum4name_query(q: Query, result: Result) -> AnswerTuple:
    """ Answer query of the form "hvað er síminn hjá [íslenskt mannsnafn]?" """
    res = query_ja_api(result.qkey)
    nþgf = NounPhrase(result.qkey).dative or result.qkey
    # Verify that we have a sane response with at least 1 result
    if not res or not res.get("people") or not res["people"].get("items"):
        return gen_answer("Ekki tókst að fletta upp {0}.".format(nþgf))
    # Check if we have a single canonical match from API
    allp = res["people"]["items"]
    single = len(allp) == 1
    first = allp[0]
    fname = first["name"]
    if not single:
        # Many found with that name, generate smart message asking for disambiguation
        name_components = result.qkey.split()
        one_name_only = len(name_components) == 1
        with GreynirBin.get_db() as bdb:
            fn = name_components[0].title()
            gender = bdb.lookup_name_gender(fn)
        msg = "Það fundust {0} með það nafn. Prófaðu að tilgreina {1}heimilisfang".format(
            "margar" if gender == "kvk" else "margir",
            "fullt nafn og " if one_name_only else "",
        )
        # Try to generate example, e.g. "Jón Jónsson á Smáragötu"
        for i in allp:
            try:
                street_nf = i["address_nominative"].split()[0]
                street_þgf = i["address"].split()[0]
                msg = msg + " t.d. {0} {1} {2}".format(
                    fname, iceprep_for_street(street_nf), street_þgf
                )
                break
            except (KeyError, ValueError) as e:
                logging.warning("Exception: " + str(e))
                continue
        return gen_answer(msg)
    # Scan API call result, try to find the best phone number to provide
    phone_number = _best_number(first)
    if not phone_number:
        return gen_answer("Ég finn ekki símanúmerið hjá {0}".format(nþgf))
    # Sanitize number and generate answer
    phone_number = phone_number.replace("-", "").replace(" ", "")
    answ = phone_number
    fn = NounPhrase(fname).dative or fname
    voice = "Síminn hjá {0} er {1}".format(fn, " ".join(list(phone_number)))
    q.set_context(dict(phone_number=phone_number, name=fname))
    q.set_source(_JA_SOURCE)
    return dict(answer=answ), answ, voice
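
# Return shape sketch (values illustrative, not real data): on success the
# function returns an AnswerTuple of (response dict, answer text, voice text).
# Note how the voice string spells out the phone number digit by digit:
#
# expected = (
#     {"answer": "5812345"},
#     "5812345",
#     "Síminn hjá Jóni Jónssyni er 5 8 1 2 3 4 5",
# )
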
def to_genitive(np: str, *, filter_func: Optional[EntryFilterFunc] = None) -> str:
    """ Return the noun phrase after casting it
    from nominative to genitive case """
    with GreynirBin.get_db() as db:
        return _to_case(
            np,
            db.lookup_g,
            db.cast_to_genitive,
            filter_func=filter_func,
        )
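
# Usage sketch: the actual output depends on the contents of the BÍN database,
# so the result shown here is an assumption, not a verified value.
# to_genitive("hinn ágæti maður")  # -> e.g. "hins ágæta manns"
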
def add(word: str) -> None:
    """Add the given (wrongly capitalized) word stem to the stem set"""
    # We support compound words such as 'félags- og barnamálaráðherra' here
    split_on_hyphen = False
    if " " in word:
        prefix, suffix = word.rsplit(" ", maxsplit=1)
        prefix += " "
    else:
        prefix, suffix = "", word
        # split_on_hyphen is True for e.g. 'norður-kórea' and 'nýja-sjáland'
        split_on_hyphen = "-" in word
    db = GreynirBin.get_db()
    # The suffix may not be in BÍN except as a compound, and in that
    # case we want its hyphenated lemma
    suffix_rev = CapitalizationErrors.reverse_capitalization(
        suffix, split_on_hyphen=split_on_hyphen
    )
    _, m = db.lookup_g(suffix_rev)
    # Only consider lemmas
    m = [mm for mm in m if mm.stofn == mm.ordmynd]
    if not m:
        raise ConfigError(
            "No BÍN meaning for '{0}' (from error word '{1}') "
            "in capitalization_errors section".format(suffix_rev, word)
        )
    if not prefix:
        # This might be something like 'barnamálaráðherra' which comes out
        # with a lemma of 'barnamála-ráðherra'
        word = CapitalizationErrors.emulate_case(m[0].stofn, template=word)
    else:
        # This might be something like 'félags- og barnamálaráðherra' which comes out
        # with a lemma of 'félags- og barnamála-ráðherra'
        word = prefix + m[0].stofn
    if word in CapitalizationErrors.SET:
        raise ConfigError(
            "Multiple definition of '{0}' in capitalization_errors section".format(word)
        )
    # Construct the reverse casing of the word
    word_rev = CapitalizationErrors.reverse_capitalization(
        word, split_on_hyphen=split_on_hyphen
    )
    # Add the word and its reverse case to the set of errors
    CapitalizationErrors.SET.add(word)
    CapitalizationErrors.SET_REV.add(word_rev)
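
# Usage sketch, built from the examples already mentioned in the comments
# above; in practice these entries come from the capitalization_errors
# config section rather than direct calls:
# add("félags- og barnamálaráðherra")  # compound; BÍN lemma is hyphenated
# add("norður-kórea")                  # split_on_hyphen case
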
def add(word: str, replacement: str, explanation: str) -> None:
    if word in TabooWords.DICT:
        raise ConfigError(
            "Multiple definition of '{0}' in taboo_words section".format(word)
        )
    db = GreynirBin.get_db()
    a = word.split("_")
    _, m = db.lookup_g(a[0])
    if not m or (len(a) >= 2 and all(mm.ordfl != a[1] for mm in m)):
        raise ConfigError("The taboo word '{0}' is not found in BÍN".format(word))
    TabooWords.DICT[word] = (replacement, explanation)
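
# Usage sketch (hypothetical entry): an optional '_<ordfl>' suffix, e.g. '_kk',
# restricts the match to a particular BÍN word category, as checked above.
# add("fáviti_kk", "einstaklingur með þroskahömlun", "Þetta orð þykir niðrandi")
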
def recent_persons(limit=_RECENT_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news """
    toplist: Dict[str, Tuple[str, str, str, str]] = dict()
    with SessionContext(read_only=True) as session:
        q = (
            session.query(Person.name, Person.title, Person.article_url, Article.id)
            .join(Article)
            .join(Root)
            .filter(Root.visible)
            # Go through up to 2 * N records
            .order_by(desc(cast(Column, Article.timestamp)))[0 : limit * 2]
        )

        def is_better_title(new_title, old_title):
            len_new = len(new_title)
            len_old = len(old_title)
            if len_old >= _MAX_TITLE_LENGTH:
                # Too long: we want a shorter one
                return len_new < len_old
            if len_new >= _MAX_TITLE_LENGTH:
                # This one is too long: we don't want it
                return False
            # Otherwise, longer is better
            return len_new > len_old

        with GreynirBin.get_db() as bindb:
            for p in q:
                # Insert the name into the list if it's not already there,
                # or if the new title is longer than the previous one
                if p.name not in toplist or is_better_title(
                    p.title, toplist[p.name][0]
                ):
                    toplist[p.name] = (
                        correct_spaces(p.title),
                        p.article_url,
                        p.id,
                        bindb.lookup_name_gender(p.name),
                    )
                    if len(toplist) >= limit:
                        # We now have as many names as we initially wanted:
                        # terminate the loop
                        break

    with changedlocale() as strxfrm:
        # Convert the dictionary to a sorted list of dicts
        return sorted(
            [
                dict(name=name, title=tu[0], gender=tu[3], url=tu[1], uuid=tu[2])
                for name, tu in toplist.items()
            ],
            key=lambda x: strxfrm(x["name"]),
        )
def tagset(self, word, at_sentence_start=False):
    """ Return a list of (probability, tag) tuples for the given word """
    toklist = list(parse_tokens(" ".join(word)))
    token = toklist[0]
    w = word[0]
    if token.kind == TOK.WORD and token.val is None:
        try:
            with GreynirBin.get_db() as db:
                w, m = db.lookup_g(token.txt, at_sentence_start)
        except Exception:
            w, m = token.txt, []
        token = TOK.Word(w, m)
    return self._ngram_tagger.tag_single_token(token)
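
# Usage sketch: 'word' is a sequence of word forms, joined before tokenizing.
# The tagger instance, the tags, and the probabilities shown are illustrative
# assumptions, not output from a real model:
# tagger.tagset(["hestur"])  # -> e.g. [(0.9, "nken"), (0.1, "nkeo")]
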
def _mynameis_handler(q: Query, ql: str) -> bool:
    """ Handle queries of the form "Ég heiti X", store this information. """
    m: Optional[Match[str]] = None
    for rx in _MY_NAME_IS_REGEXES:
        m = re.search(rx, ql)
        if m:
            break
    if m:
        fname = m.group(1).strip()
        if not fname:
            return False
        # Clean up name string
        name = fname.split(" og ")[0]  # "ég heiti X og blablabla"
        name = name.split(" hvað ")[0]  # "ég heiti X hvað heitir þú"
        # Handle "ég heiti ekki X"
        components = name.split()
        if components[0] == "ekki":
            q.set_answer(*gen_answer("Hvað heitirðu þá?"))
            return True
        # Get first name, look up gender for a gender-tailored response
        with GreynirBin.get_db() as bdb:
            fn = components[0].title()
            gender = bdb.lookup_name_gender(fn) or "hk"
        resp = _MY_NAME_IS_RESPONSES[gender]
        answ = resp.format(fn)
        if fn == "Embla":
            answ = "Sæl og blessuð. Ég heiti líka Embla!"
        # Save this info about user to query data table
        if q.client_id:
            qdata: ClientDataDict = dict(full=name.title(), first=fn, gender=gender)
            q.set_client_data("name", qdata)
        # Beautify query by capitalizing the name provided
        bq = q.beautified_query
        q.set_beautified_query(bq.replace(name, name.title()))
        # Generate answer
        voice = answ.replace(",", "")
        q.set_answer(dict(answer=answ), answ, voice)
        q.query_is_command()
        return True
    return False
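
# Behavior sketch, derived from the branches above (query strings illustrative):
# "ég heiti jón"          -> stores the name, responds with the male template
# "ég heiti ekki gunnar"  -> responds "Hvað heitirðu þá?"
# "ég heiti anna og ..."  -> the name is cut at " og " before processing
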
def top_authors(days=_TOP_AUTHORS_PERIOD, session=None):
    """ Generate list of top authors w. parse percentage. """
    end = datetime.utcnow()
    start = end - timedelta(days=days)
    authors = BestAuthorsQuery.period(
        start, end, enclosing_session=session, min_articles=10
    )[:20]
    authresult = list()
    with GreynirBin.get_db() as bindb:
        for a in authors:
            name = a[0]
            gender = bindb.lookup_name_gender(name)
            if gender == "hk":
                # Skip unnamed authors (e.g. "Ritstjórn Vísis")
                continue
            perc = round(float(a[4]), 2)
            authresult.append({"name": name, "gender": gender, "perc": perc})
    return authresult[:10]
def lookup_best_word(word: str) -> Optional[Tuple[str, str, str, str]]:
    """ Look up word in BÍN, pick right one acc. to a criterion. """
    with GreynirBin.get_db() as db:

        def nouns_only(bin_meaning: BIN_Tuple) -> bool:
            return bin_meaning.ordfl in ("kk", "kvk", "hk")

        res = list(filter(nouns_only, db.lookup_nominative_g(word)))
        if not res:
            # Try with uppercase first char
            capw = word.capitalize()
            res = list(filter(nouns_only, db.lookup_nominative_g(capw)))
            if not res:
                return None
        # OK, we have one or more matching nouns
        if len(res) == 1:
            m = res[0]
        else:
            # TODO: Pick best result
            m = res[0]  # For now
        wid = m.utg

        # TODO: If more than one declension form possible (e.g. gen. björns vs. bjarnar)
        # we should also list such variations
        def sort_by_preference(m_list: BinEntryIterable) -> BinEntryList:
            # Filter out words that don't have the same "utg" i.e. word ID as
            # the one we successfully looked up in BÍN
            mns = list(filter(lambda w: w.bin_id == wid, m_list))
            # Discourage rarer declension forms, i.e. ÞGF2 and ÞGF3
            return sorted(mns, key=lambda m: "2" in m.mark or "3" in m.mark)

        # Look up all cases of the word in BÍN
        nom = m.stofn
        acc = db.cast_to_accusative(nom, filter_func=sort_by_preference)
        dat = db.cast_to_dative(nom, filter_func=sort_by_preference)
        gen = db.cast_to_genitive(nom, filter_func=sort_by_preference)
        return nom, acc, dat, gen
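
# Usage sketch: returns all four cases of the chosen noun, or None if no
# noun was found. The declension shown is the standard BÍN paradigm:
# lookup_best_word("hestur")  # -> ("hestur", "hest", "hesti", "hests")
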
def dump_tokens(limit):
    """ Iterate through parsed articles and print a list
    of tokens and their matched terminals """
    dtd = dict()
    with GreynirBin.get_db() as db, SessionContext(commit=True) as session:
        # Iterate through the articles
        q = (
            session.query(Article)
            .filter(Article.tree != None)
            .order_by(Article.timestamp)
        )
        if limit is None:
            q = q.all()
        else:
            q = q[0:limit]
        for a in q:
            print(
                "\nARTICLE\nHeading: '{0.heading}'\nURL: {0.url}\nTimestamp: {0.timestamp}".format(
                    a
                )
            )
            tree = TreeTokenList()
            tree.load(a.tree)
            for ix, toklist in tree.token_lists():
                print("\nSentence {0}:".format(ix))
                at_start = True
                for t in toklist:
                    if t.tokentype == "WORD":
                        wrd = t.token[1:-1]
                        td = dtd.get(t.terminal)
                        if td is None:
                            td = TerminalDescriptor(t.terminal)
                            dtd[t.terminal] = td
                        stem = td.stem(db, wrd, at_start)
                        at_start = False
                        print("   {0} {1} {2}".format(wrd, stem, t.terminal))
                    else:
                        print("   {0.token} {0.cat} {0.terminal}".format(t))
def test() -> None:
    with GreynirBin.get_db() as db:
        c = Corrector(db)

        txts = [
            """
            FF er flokkur með rasisku ívafi og tilhneygjingu til að einkavinavæða
            alla fjölmiðla Íslands og færa þar með elítunni að geta ein haft áhrif
            á skoðanamyndandi áhri í fjölmiðlaheiminum, er ekki viðbúið að svona
            flokkur gamgi til samstarf við íhaldið eftir kosningar en ekki þessa
            vondu félagshyggjuflokka
            """,
            """
            fæ alveg hræðileg drauma vegna fyrri áfalla og það hjálpar mér að ná
            góðum svef og þar með betri andlegri lýðan og líka til að auka
            matarlist. Tek samt skýrt fram að ég hef bæði missnotað kannabis og
            ekki. Hef engan áhuga á að vera undir áhrifum kannabis alla dag.
            Mikil munur á að nota og missnota !
            """,
            """
            Bæði , lyf gegn áfengissyki (leiða) , mér hefur ekki leiðst mikið
            seinustu 30 ár. Gegn Taugaveiklun, konan hamrar á mér alla daga ,
            skærur hennar eru langar og strangar. En ef ég fæ eina pípu og gríp
            gitarinn má hún tuða í mér klukkutímum saman.Ég er bæði rólegur og
            læri hratt á gítarinn, eftir 10 ára hjónaband er ég bara ótrúlega
            heill og stefni hátt. Ég og gitarinn erum orðnir samvaxnir. Auðvitað
            stefnum við á skilnað og þá mun ég sakna skalaæfinganna.
            """,
            """
            biddu nu hæg - var Kvennalistinn eins malefnis hreyfing. Hvað attu
            við - ef þu telur malefnið hafa verið eitt hvert var það? Kannski
            leikskola fyrir öll börn? Sömu laun fyrir sömu störf? Að
            borgarskipulag tæki mið af þörfum beggja kynja? Að kynjagleraugu
            væru notuð við gerð fjarlaga? Að þjoðfelagið opnaði augun fyrir
            kynferðsofbeldinu og sifjaspellum? (hvorutveggja sagt aðeins
            viðgangast i utlöndum). Þetta eru aðeins örfa dæmi um malefni sem
            brunnu a okkur og við börðumst fyrir. Ekki ertu i alvöru að tala
            framlag okkur niður. Tæplega telurðu það EITT malefni þo að i
            grunninn hafi baratta okkar sem stoðum að Kvennaframboðinu og
            -listanum gengið ut a að ,,betri,, helmingur þjoðarinnar öðlast -
            ekki bara i orði heldur einnig a borði - sömu rettindi og raðandi
            helmingurinn
            """,
            """
            Salvör ekki standa i að reyna að klora yfir mistök þin.
            Reynsluheimur kvenna visar að sjalsögðu til þess að helmingur
            mannkynsins - -konur - er olikur hinum helmingnum bæði sökum
            lffræðilegs munar og þess að þær eru gerðar að konum (sb de
            Beauvoir) þe fra frumbernsku er drengjum hrosað fyrir annað en
            stulkum og væntingar foreldra eru aðrar til dætra en sona og auk
            þess er ætlast til að dætur læri af mæðrum en synir af feðrum. Það
            er þetta sem gerir konur - helming mannkynsins - frabrugðna körlum
            sem hafa fra örofi alda verið ,,raðandi,, kynið. Það var gegn þvi
            orettlæti að reynsluheimur kvenna speglaðist ekki i politiskum
            akvörðunum sem við sem stofnaði Kvennafranboðið og - listann
            börðumst gegn - a öllum vigstöðvum. Að skilgreina barattu okkar
            Kvennalistans - fyrir rettindum halfrar þjoðarinnar til að skapa
            ,,rettlatara samfelag,, - sem eins mals flokk er fjarstæða.
            """,
        ]

        def linebreak(txt: str, margin: int = 80, left_margin: int = 0) -> str:
            """Return a nicely column-formatted string representation of the given text,
            where each line is not longer than the given margin (if possible).
            A left margin can be optionally added, as a sequence of spaces.
            The lines are joined by newlines ('\n') but there is no trailing newline."""
            result: List[str] = []
            line: List[str] = []
            len_line = 0
            for wrd in txt.split():
                if len_line + 1 + len(wrd) > margin:
                    result.append(" ".join(line))
                    line = []
                    len_line = 0
                line.append(wrd)
                len_line += 1 + len(wrd)
            if line:
                result.append(" ".join(line))
            return "\n".join(" " * left_margin + line for line in result)

        t0 = time.time()
        for t in txts:
            print("\nOriginal:\n")
            print(linebreak(t, left_margin=8))
            print("\nCorrected:\n")
            print(linebreak(c.correct_text(t), left_margin=8))
        t1 = time.time()
        print("\nTotal time: {0:.2f} seconds".format(t1 - t0))
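
# Behavior sketch for the nested linebreak() helper (traced by hand, since the
# helper is local to test()): with margin=7, "aa bb cc dd" wraps after two
# words, because a joining space is counted for every word, the first included:
# linebreak("aa bb cc dd", margin=7)  # -> "aa bb\ncc dd"
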
def recognize_entities(
    token_stream: Iterator[Tok],
    enclosing_session: Optional[Session] = None,
    token_ctor: Type[TOK] = TOK,
) -> Iterator[Tok]:
    """ Parse a stream of tokens looking for (capitalized) entity names.
    The algorithm implements N-token lookahead where N is the
    length of the longest entity name having a particular initial word.
    Adds a named entity recognition layer on top of the
    reynir.bintokenizer.tokenize() function. """
    # Token queue
    tq: List[Tok] = []
    # Phrases we're considering. Note that an entry of None
    # indicates that the accumulated phrase so far is a complete
    # and valid known entity name.
    state: Dict[Union[str, None], List[Tuple[List[str], Entity]]] = defaultdict(list)
    # Entity definition cache
    ecache: Dict[str, List[Entity]] = dict()
    # Last name to full name mapping ('Clinton' -> 'Hillary Clinton')
    lastnames: Dict[str, Tok] = dict()

    with GreynirBin.get_db() as db, SessionContext(
        session=enclosing_session, commit=True, read_only=True
    ) as session:

        def fetch_entities(w: str, fuzzy: bool = True) -> List[Entity]:
            """ Return a list of entities matching the word(s) given,
            exactly if fuzzy = False, otherwise also as a starting word(s) """
            try:
                q = session.query(Entity.name, Entity.verb, Entity.definition)
                if fuzzy:
                    q = q.filter(Entity.name.like(w + " %") | (Entity.name == w))
                else:
                    q = q.filter(Entity.name == w)
                return q.all()
            except OperationalError as e:
                logging.warning("SQL error in fetch_entities(): {0}".format(e))
                return []

        def query_entities(w: str) -> List[Entity]:
            """ Return a list of entities matching the initial word given """
            e = ecache.get(w)
            if e is None:
                ecache[w] = e = fetch_entities(w)
            return e

        def lookup_lastname(lastname: str) -> Optional[Tok]:
            """ Look up a last name in the lastnames registry,
            eventually without a possessive 's' at the end, if present """
            fullname = lastnames.get(lastname)
            if fullname is not None:
                # Found it
                return fullname
            # Try without a possessive 's', if present
            if lastname.endswith("s"):
                return lastnames.get(lastname[0:-1])
            # Nope, no match
            return None

        def flush_match():
            """ Flush a match that has been accumulated in the token queue """
            if len(tq) == 1 and lookup_lastname(tq[0].txt) is not None:
                # If single token, it may be the last name of a
                # previously seen entity or person
                return token_or_entity(tq[0])
            # Reconstruct original text behind phrase
            ename = " ".join([t.txt for t in tq])
            # We don't include the definitions in the token - they should be looked up
            # on the fly when processing or displaying the parsed article
            return token_ctor.Entity(ename)

        def token_or_entity(token: Tok) -> Tok:
            """ Return a token as-is or, if it is a last name of a person
            that has already been mentioned in the token stream by full name,
            refer to the full name """
            assert token.txt[0].isupper()
            tfull = lookup_lastname(token.txt)
            if tfull is None:
                # Not a last name of a previously seen full name
                return token
            if tfull.kind != TOK.PERSON:
                # Return an entity token with no definitions
                # (this will eventually need to be looked up by full name when
                # displaying or processing the article)
                return token_ctor.Entity(token.txt)
            # Return the full name meanings
            return token_ctor.Person(token.txt, tfull.person_names)

        try:
            while True:
                token = next(token_stream)
                if not token.txt:  # token.kind != TOK.WORD:
                    if state:
                        if None in state:
                            yield flush_match()
                        else:
                            yield from tq
                        tq = []
                        state = defaultdict(list)
                    yield token
                    continue
                # Look for matches in the current state and build a new state
                newstate = defaultdict(list)
                w = token.txt  # Original word

                def add_to_state(slist, entity):
                    """ Add the list of subsequent words to the new parser state """
                    wrd = slist[0] if slist else None
                    rest = slist[1:]
                    newstate[wrd].append((rest, entity))

                if w in state:
                    # This matches an expected token
                    tq.append(token)  # Add to lookahead token queue
                    # Add the matching tails to the new state
                    for sl, entity in state[w]:
                        add_to_state(sl, entity)
                    # Update the lastnames mapping
                    fullname = " ".join([t.txt for t in tq])
                    parts = fullname.split()
                    # If we now have 'Hillary Rodham Clinton',
                    # make sure we delete the previous 'Rodham' entry
                    for p in parts[1:-1]:
                        if p in lastnames:
                            del lastnames[p]
                    if parts[-1][0].isupper():
                        # 'Clinton' -> 'Hillary Rodham Clinton'
                        lastnames[parts[-1]] = token_ctor.Entity(fullname)
                else:
                    # Not a match for an expected token
                    if state:
                        if None in state:
                            # We have an accumulated match, but if the next token
                            # is an uppercase word without a BÍN meaning, we
                            # append it to the current entity regardless.
                            # This means that 'Charley Lucknow' is handled as a single
                            # new entity name even if 'Charley' already exists
                            # as an entity.
                            while w and w[0].isupper() and not token.val:
                                # Append to the accumulated token queue, which will
                                # be squashed to a single token in flush_match()
                                tq.append(token)
                                token = next(token_stream)
                                w = token.txt
                            # Flush the already accumulated match
                            yield flush_match()
                        else:
                            yield from tq
                        tq = []
                    # Add all possible new states for entity names
                    # that could be starting
                    weak = True
                    cnt = 1
                    upper = w and w[0].isupper()
                    parts = []
                    if upper and " " in w:
                        # For all uppercase phrases (words, entities, persons),
                        # maintain a map of last names to full names
                        parts = w.split()
                        lastname = parts[-1]
                        # Clinton -> Hillary [Rodham] Clinton
                        if lastname[0].isupper():
                            # Look for Icelandic patronyms/matronyms
                            _, m = db.lookup_g(lastname, False)
                            if m and any(mm.fl in {"föð", "móð"} for mm in m):
                                # We don't store Icelandic patronyms/matronyms
                                # as surnames
                                pass
                            else:
                                lastnames[lastname] = token
                    if token.kind == TOK.WORD and upper and w not in Abbreviations.DICT:
                        if " " in w:
                            # w may be a person name with more than one embedded word;
                            # parts is assigned in the if statement above
                            cnt = len(parts)
                        elif not token.has_meanings or ("-" in token.meanings[0].stofn):
                            # No BÍN meaning for this token, or the meanings
                            # were constructed by concatenation (indicated by a hyphen
                            # in the stem)
                            weak = False  # Accept single-word entity references
                        # elist is a list of Entity instances
                        elist = query_entities(w)
                    else:
                        elist = []
                    if elist:
                        # This word might be a candidate to start an entity reference
                        candidate = False
                        for e in elist:
                            # List of subsequent words in entity name
                            sl = e.name.split()[cnt:]
                            if sl:
                                # Here's a candidate for a longer entity reference
                                # than we already have
                                candidate = True
                            if sl or not weak:
                                add_to_state(sl, e)
                        if weak and not candidate:
                            # Found no potential entity reference longer than this token
                            # already is - and we have a BÍN meaning for it:
                            # Abandon the effort
                            assert not newstate
                            assert not tq
                            yield token_or_entity(token)
                        else:
                            # Go for it: Initialize the token queue
                            tq = [token]
                    else:
                        # Not a start of an entity reference: simply yield the token
                        assert not tq
                        if upper:
                            # Might be a last name referring to a full name
                            yield token_or_entity(token)
                        else:
                            yield token
                # Transition to the new state
                state = newstate
        except StopIteration:
            # Token stream is exhausted
            pass

        # Yield an accumulated match if present
        if state:
            if None in state:
                yield flush_match()
            else:
                yield from tq
            tq = []

    # print("\nEntity cache:\n{0}".format("\n".join("'{0}': {1}".format(k, v) for k, v in ecache.items())))
    # print("\nLast names:\n{0}".format("\n".join("{0}: {1}".format(k, v) for k, v in lastnames.items())))
    assert not tq
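
# Pipeline sketch: layering entity recognition on top of the tokenizer
# referenced in the docstring (reynir.bintokenizer.tokenize). The sentence
# and the surrounding setup are illustrative assumptions.
# from reynir.bintokenizer import tokenize
# for tok in recognize_entities(tokenize("Hillary Rodham Clinton kom til Íslands.")):
#     print(tok.kind, tok.txt)
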
            port=Settings.PORT,
            debug=Settings.DEBUG,
            use_reloader=not ptvsd_attached,
            extra_files=extra_files,
        )
    except socket_error as e:
        if e.errno == errno.EADDRINUSE:
            # Address already in use
            logging.error(
                "Greynir web app is already running at host {0}:{1}".format(
                    Settings.HOST, Settings.PORT
                )
            )
            sys.exit(1)
        else:
            raise
    finally:
        ArticleProxy.cleanup()
        GreynirBin.cleanup()
else:
    app.config["PRODUCTION"] = True

    # Suppress information log messages from Werkzeug
    werkzeug_log = logging.getLogger("werkzeug")
    if werkzeug_log:
        werkzeug_log.setLevel(logging.WARNING)

    # Log our startup
    log_str = (
        "Greynir instance starting with "
        "host={0}:{1}, db_host={2}:{3} on Python {4}".format(
            Settings.HOST,
            Settings.PORT,
            Settings.DB_HOSTNAME,
db_conn = sqlite3.connect(db_path, check_same_thread=False)
db_conn.row_factory = lambda c, r: dict(zip([col[0] for col in c.description], r))

q = "SELECT DISTINCT nafn FROM ornefni;"
res = db_conn.cursor().execute(q)
matches = [row["nafn"] for row in res]

num_bin = 0
num_comb = 0
num_fail = 0

with GreynirBin.get_db() as db:
    for m in matches:
        w = m.strip()
        if " " in w or "-" in w:
            continue
        # Direct BÍN lookup
        meanings = db.meanings(w)
        if meanings:
            num_bin += 1
            continue
        # Lookup using BÍN and combinator
        _, meanings = db.lookup_g(w, auto_uppercase=True)
        if meanings:
            num_comb += 1