def pick_words(limit=50, verbose=False):
    """
    Pick etymologically ambiguous nouns for creating a manual clustering.
    """
    # Select non-inflection noun headwords with more than one distinct
    # etymology, most frequent first
    query = select([
        headword.c.name,
        freqs.c.freq,
    ]).select_from(joined_freq).where(
        word_sense.c.etymology_index.isnot(None)
        & (word_sense.c.pos == "Noun")
        & word_sense.c.inflection_of_id.is_(None)
    ).group_by(
        headword.c.id
    ).having(
        count(distinct(word_sense.c.etymology_index)) > 1
    ).order_by(freqs.c.freq.desc()).limit(limit)
    session = get_session()
    candidates = session.execute(query).fetchall()
    for word, freq in candidates:
        print(word + ".Noun", "#", freq)
    if verbose:
        print("\n")
        for word, _ in candidates:
            print("#", word)
            pprint(session.execute(select([
                word_sense.c.sense_id,
                word_sense.c.sense,
            ]).select_from(joined).where(
                headword.c.name == word
            )).fetchall())


def get_session():
    # Lazily create and cache a single shared database session
    global db_session
    if db_session is not None:
        return db_session
    # Import under an alias to avoid shadowing this function's own name
    from wikiparse.utils.db import get_session as upstream_get_session
    db_session = upstream_get_session()
    return db_session


def decompile(inf, out_dir):
    session = get_session()
    for lemma, grouping in gen_groupings(inf):
        with open(pjoin(out_dir, lemma), "w") as outf:
            first = True
            for group_num, synsets in grouping.items():
                # Separate sense groups with a blank line
                if not first:
                    outf.write("\n")
                else:
                    first = False
                for synset in synsets:
                    outf.write(synset)
                    outf.write(" # ")
                    if is_wn_ref(synset):
                        # WordNet reference: take the definition from WordNet
                        sense = wordnet.of2ss(synset).definition()
                    else:
                        # Otherwise fetch the Wiktionary sense from the database
                        sense = session.execute(select([
                            word_sense.c.sense,
                        ]).select_from(joined).where(
                            (headword.c.name == lemma)
                            & (word_sense.c.sense_id == synset)
                        )).fetchone()["sense"]
                    tokens = word_tokenize(sense)
                    outf.write(" ".join(tokens))
                    outf.write("\n")
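

# A sketch of the per-lemma file layout decompile writes (the sense ids and
# glosses below are hypothetical, not taken from the data): each line is
# "<sense id> # <tokenised definition>", with a blank line between sense
# groups, e.g.
#
#     02084071-n # a member of the genus Canis
#     fi-koira-1 # a domesticated canid
#
#     02121620-n # feline mammal usually having thick soft fur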


def parse_stats_cov(inf, insert):
    df = pd.read_csv(inf)
    top = dict(top_events(df))
    got_defns = top.get("got_defns", 0)
    defns_empty = top.get("defns_empty", 0)
    total = got_defns + defns_empty
    # Every remaining column counts one error type per word
    error_df = df.drop(
        ["got_defns", "wf", "defns_empty"], axis=1, errors="ignore"
    ).set_index("word")
    # Words with some error recorded, excluding the fully empty ones,
    # got only partial definitions
    partial_success = error_df.sum(axis=1).astype(bool).sum() - defns_empty
    complete_success = got_defns - partial_success
    cov_stats = {
        "success": got_defns,
        "partial_success": partial_success,
        "complete_success": complete_success,
        "empty": defns_empty,
        "total": total,
        "partial_coverage": "{:.1f}".format((got_defns / total) * 100),
        "full_coverage": "{:.1f}".format((complete_success / total) * 100),
    }
    if insert:
        db = get_session()
        insert_metadata(db, cov_stats)
        db.commit()
    for k, v in cov_stats.items():
        print(k.title().replace("_", " "), v)


def create_db(db_path):
    from lextract.mweproc.db.tables import metadata as mweproc_metadata

    session = get_session(db_path)
    engine = session().get_bind().engine
    mweproc_metadata.create_all(engine)
    return session


def gen(words, out_dir):
    """
    Generate unclustered words in OUT_DIR from word list WORDS
    """
    session = get_session()
    for word in words:
        word_pos = word.split("#")[0].strip()
        word, pos = word_pos.split(".")
        assert pos == "Noun"
        with open(pjoin(out_dir, word_pos), "w") as outf:
            # Get Wiktionary results, ordered so etymologies stay together
            results = session.execute(select([
                word_sense.c.sense_id,
                word_sense.c.etymology_index,
                word_sense.c.sense,
                word_sense.c.extra,
            ]).select_from(joined).where(
                (headword.c.name == word)
                & (word_sense.c.pos == "Noun")
            ).order_by(word_sense.c.etymology_index)).fetchall()
            prev_ety = None
            for row in results:
                # Separate etymologies with a blank line
                if prev_ety is not None and row["etymology_index"] != prev_ety:
                    outf.write("\n")
                outf.write("{} # {}\n".format(
                    row["sense_id"],
                    row["extra"]["raw_defn"].strip().replace("\n", " --- "),
                ))
                prev_ety = row["etymology_index"]
            # Get WordNet results
            for synset_id, lemma_objs in get_lemma_objs(word, WORDNETS, "n").items():
                wordnets = {wn for wn, _ in lemma_objs}
                outf.write("\n")
                outf.write("{} # [{}] {}\n".format(
                    pre_id_to_post(synset_id),
                    ", ".join(wordnets),
                    annotation_comment(lemma_objs),
                ))
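

# gen expects WORDS entries in the shape pick_words prints, e.g.
# "kuusi.Noun # 0.00012" (the word and frequency are illustrative);
# everything after "#" is ignored, and only nouns are accepted.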


def add_keyed_words_cmd(ignore_bare_lemma: bool, add_surf: bool, dry_run: bool):
    """
    Index multiwords/inflections/frames into database
    """
    session = get_session()
    metadata = extend_mweproc()
    if not dry_run:
        metadata.create_all(session().get_bind().engine)
    inner_it = session.execute(mwe_for_indexing())
    # Show a progress bar only when INFO logging would not interleave with it
    if logger.isEnabledFor(logging.INFO):
        ctx = contextlib.nullcontext(inner_it)
    else:
        ctx = click.progressbar(inner_it, label="Inserting word keys")
    cnt: Counter = Counter()
    with ctx as outer_it:
        for indexing_result in add_keyed_words(
            session, outer_it, ignore_bare_lemma, add_surf, dry_run
        ):
            if indexing_result == IndexingResult.HEAD_INDEXED:
                cnt["headword_idxd"] += 1
            elif indexing_result == IndexingResult.RAREST_INDEXED:
                cnt["rarest_idxd"] += 1
            else:
                cnt["fail_idxd"] += 1
    if not dry_run:
        session.commit()
def drop_trunc(which, extra=""): metadata = get_metadata() session = get_session() for t in reversed(metadata.sorted_tables): logger.info("%s %s", which, t.name) session.execute(f"{which} {t.name} {extra} CASCADE;") session.commit()


def add_freq_data():
    """
    Add table of frequencies to DB
    """
    session = get_session()
    metadata.create_all(session().get_bind().engine)
    with click.progressbar(
        wordfreq.get_frequency_dict("fi").items(),
        label="Inserting frequencies",
    ) as name_freqs:
        for name, freq in name_freqs:
            insert(session, freqs, name=name, freq=freq)
    session.commit()


def extract_toks_cmd():
    paragraph = sys.stdin.read()
    omorfi = get_omorfi()
    tokenised = omorfi.tokenise(paragraph)
    # Track each token's character offset within the paragraph
    starts = []
    start = 0
    for token in tokenised:
        start = paragraph.index(token["surf"], start)
        starts.append(start)
    surfs = [tok["surf"] for tok in tokenised]
    session = get_session().get_bind()
    pprint(list(extract_toks(session, surfs)))


def lookup(word):
    session = get_session()
    query = lemma_info_query([word])
    print("Counts")
    for row in session.execute(headword_rels_counts_query([word])):
        if not row[0]:
            print("Not found")
            continue
        print("# " + row[0])
        for (name, _, _), cnt in zip(RELATED, row[1:]):
            print(name, cnt)
    print("Senses")
    for row in session.execute(query):
        pprint(row)


def wiktionary_batcher():
    # Yield (ids, defns) batches of non-empty Wiktionary definitions
    session = get_session()
    rows = session.execute(wiktionary_query_all()).fetchall()
    batch_size = get_batch_size()
    ids = []
    defns = []
    for row in rows:
        defn = row["sense"].strip()
        if not defn:
            continue
        ids.append(wiktionary_sense_id(row))
        defns.append(defn)
        if len(ids) >= batch_size:
            yield ids, defns
            ids = []
            defns = []
    # Flush the final partial batch
    if len(ids):
        yield ids, defns
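

# A minimal consumer sketch for wiktionary_batcher; `encode` is a hypothetical
# batch-embedding callable passed in by the caller, not something defined in
# this codebase.
def encode_wiktionary_defns(encode):
    # Pair each sense id with the vector produced for its definition
    for ids, defns in wiktionary_batcher():
        for sense_id, vec in zip(ids, encode(defns)):
            yield sense_id, vec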


def get_wiktionary_defns(
    lemma_name,
    pos,
    skip_empty=True,
    tokenize=True,
    lower=False,
):
    session = get_session()
    for row in get_wiktionary(session, lemma_name, pos):
        tokens = row["sense"].strip()
        if skip_empty and not tokens:
            sys.stderr.write(
                f"Empty defn: {row['sense_id']} '{row['sense']}'\n"
            )
            continue
        if tokenize:
            tokens = word_tokenize(tokens)
        if lower:
            # Lowercasing is only defined on the tokenised form
            assert tokenize
            tokens = [token.lower() for token in tokens]
        yield row["sense_id"], tokens
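

# Hedged usage sketch (the lemma is illustrative):
#
#     for sense_id, tokens in get_wiktionary_defns("kuusi", "Noun", lower=True):
#         print(sense_id, " ".join(tokens))
#
# With tokenize=False the second element is the raw definition string rather
# than a token list.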
def main(mode="all"): if mode != "wiki": for lemma, wns in WordnetFin.lemma_names().items(): lemma_obj = next(iter(get_lemma_objs(lemma, wns).values()))[0][1] pos = lemma_obj.synset().pos() if pos == "s": pos = "a" lemma = norm(lemma) print(f"{lemma},{pos}") if mode != "wn": session = get_session() for pos in POS_MAP.keys(): for row in session.execute( select([headword.c.name]).select_from(joined).where( word_sense.c.pos.in_(POS_MAP[pos]) & word_sense.c.inflection_of_id.is_(None)).distinct() ).fetchall(): lemma = norm(row[0]) print(f"{lemma},{pos}")


def import_clust(indir):
    session = get_session()
    metadata.create_all(session().get_bind().engine)

    def clus_batch(line):
        line = line.decode("utf-8")
        lemma_clus, sense, exemp = line.strip().rsplit(",", 2)
        lemma, clus = lemma_clus.rsplit(".", 1)
        session.execute(cluster.insert().values(
            lemma=lemma,
            clus=int(clus),
            source=Source.wordnet if is_wn_ref(sense) else Source.wiktionary,
            sense=sense,
            exemp=exemp == "1",
        ))

    with click.progressbar(
        IterDirOrTar(indir), label="Inserting clusts"
    ) as clusts_chunks:
        lines = (
            line
            for _, clusts_chunk in clusts_chunks
            for line in clusts_chunk
        )
        batch_commit(session, lines, clus_batch)
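

# import_clust expects lines of the form "<lemma>.<clus>,<sense>,<exemp>",
# e.g. "kuusi.0,02084071-n,1" (this sense id is hypothetical); exemp is "1"
# when the sense is marked as the cluster's exemplar and "0" otherwise.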


def create():
    metadata = get_metadata()
    session = get_session()
    metadata.create_all(session().get_bind().engine)


def trunc_clust():
    print("Truncating", cluster.name)
    session = get_session()
    session.execute(f"TRUNCATE {cluster.name} RESTART IDENTITY CASCADE;")
    session.commit()


def insert_dir(indir: str, filterfile: Optional[TextIO]):
    members = parse_filterfile(filterfile)
    insert_dir_inner(get_session(), indir, members)