Example #1
def pick_words(limit=50, verbose=False):
    """
    Pick etymologically ambiguous nouns for manual clustering.
    """
    query = select([
            headword.c.name,
            freqs.c.freq,
        ]).select_from(joined_freq).where(
            word_sense.c.etymology_index.isnot(None) &
            (word_sense.c.pos == "Noun") &
            word_sense.c.inflection_of_id.is_(None)
        ).group_by(
            headword.c.id
        ).having(
            count(
                distinct(word_sense.c.etymology_index)
            ) > 1
        ).order_by(freqs.c.freq.desc()).limit(limit)
    session = get_session()
    candidates = session.execute(query).fetchall()
    for word, freq in candidates:
        print(word + ".Noun", "#", freq)
    if verbose:
        print("\n")
        for word, _ in candidates:
            print("#", word)
            pprint(session.execute(select([
                word_sense.c.sense_id,
                word_sense.c.sense,
            ]).select_from(joined).where(
                headword.c.name == word
            )).fetchall())
Example #2
def get_session():
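    # Lazily create the session and cache it at module level for reuse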
    global db_session
    if db_session:
        return db_session
    from wikiparse.utils.db import get_session
    db_session = get_session()
    return db_session
Example #3
def decompile(inf, out_dir):
    session = get_session()
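    # One output file per lemma; groups of synsets are separated by blank lines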
    for lemma, grouping in gen_groupings(inf):
        with open(pjoin(out_dir, lemma), "w") as outf:
            first = True
            for group_num, synsets in grouping.items():
                if not first:
                    outf.write("\n")
                else:
                    first = False
                for synset in synsets:
                    outf.write(synset)
                    outf.write(" # ")
                    if is_wn_ref(synset):
                        sense = wordnet.of2ss(synset).definition()
                    else:
                        sense = session.execute(select([
                            word_sense.c.sense,
                        ]).select_from(joined).where(
                            (headword.c.name == lemma) &
                            (word_sense.c.sense_id == synset)
                        )).fetchone()["sense"]
                    tokens = word_tokenize(sense)
                    outf.write(" ".join(tokens))
                    outf.write("\n")
Example #4
def parse_stats_cov(inf, insert):
    df = pd.read_csv(inf)
    top = dict(top_events(df))
    got_defns = top.get("got_defns", 0)
    defns_empty = top.get("defns_empty", 0)
    total = got_defns + defns_empty
    error_df = df.drop(["got_defns", "wf", "defns_empty"],
                       axis=1,
                       errors="ignore").set_index("word")
    partial_success = error_df.sum(axis=1).astype(bool).sum() - defns_empty
    complete_success = got_defns - partial_success
    cov_stats = {
        "success": got_defns,
        "partial_success": partial_success,
        "complete_success": complete_success,
        "empty": defns_empty,
        "total": total,
        "partial_coverage": "{:.1f}".format((got_defns / total) * 100),
        "full_coverage": "{:.1f}".format((complete_success / total) * 100),
    }
    if insert:
        db = get_session()
        insert_metadata(db, cov_stats)
        db.commit()
    for k, v in cov_stats.items():
        print(k.title().replace("_", " "), v)
Example #5
def create_db(db_path):
    from lextract.mweproc.db.tables import metadata as mweproc_metadata

    session = get_session(db_path)
    engine = session().get_bind().engine
    mweproc_metadata.create_all(engine)
    return session
Example #6
def gen(words, out_dir):
    """
    Generate unclustered words in OUT_DIR from word list WORDS
    """
    session = get_session()
    for word in words:
        word_pos = word.split("#")[0].strip()
        word, pos = word_pos.split(".")
        assert pos == "Noun"
        with open(pjoin(out_dir, word_pos), "w") as outf:
            # Get Wiktionary results
            results = session.execute(select([
                word_sense.c.sense_id,
                word_sense.c.etymology_index,
                word_sense.c.sense,
                word_sense.c.extra,
            ]).select_from(joined).where(
                (headword.c.name == word) &
                (word_sense.c.pos == "Noun")
            ).order_by(word_sense.c.etymology_index)).fetchall()
            prev_ety = None
            for row in results:
                if prev_ety is not None and row["etymology_index"] != prev_ety:
                    outf.write("\n")
                outf.write("{} # {}\n".format(row["sense_id"], row["extra"]["raw_defn"].strip().replace("\n", " --- ")))
                prev_ety = row["etymology_index"]

            # Get WordNet results
            for synset_id, lemma_objs in get_lemma_objs(word, WORDNETS, "n").items():
                wordnets = {wn for wn, _ in lemma_objs}
                outf.write("\n")
                outf.write("{} # [{}] {}\n".format(pre_id_to_post(synset_id), ", ".join(wordnets), annotation_comment(lemma_objs)))
Example #7
def add_keyed_words_cmd(ignore_bare_lemma: bool, add_surf: bool,
                        dry_run: bool):
    """
    Index multiwords/inflections/frames into the database
    """
    session = get_session()
    metadata = extend_mweproc()
    if not dry_run:
        metadata.create_all(session().get_bind().engine)
    inner_it = session.execute(mwe_for_indexing())
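    # Skip the progress bar when INFO-level logging is enabled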
    if logger.isEnabledFor(logging.INFO):
        ctx = contextlib.nullcontext(inner_it)
    else:
        ctx = click.progressbar(inner_it, label="Inserting word keys")
    cnt: Counter = Counter()
    with ctx as outer_it:
        for indexing_result in add_keyed_words(session, outer_it,
                                               ignore_bare_lemma, add_surf,
                                               dry_run):
            if indexing_result == IndexingResult.HEAD_INDEXED:
                cnt["headword_idxd"] += 1
            elif indexing_result == IndexingResult.RAREST_INDEXED:
                cnt["rarest_idxd"] += 1
            else:
                cnt["fail_idxd"] += 1

    if not dry_run:
        session.commit()
Example #8
    def drop_trunc(which, extra=""):
        metadata = get_metadata()
        session = get_session()

        for t in reversed(metadata.sorted_tables):
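            # Emit DROP or TRUNCATE (per `which`) with CASCADE for each table,
            # dependants first thanks to reversed(sorted_tables)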
            logger.info("%s %s", which, t.name)
            session.execute(f"{which} {t.name} {extra} CASCADE;")
        session.commit()
Example #9
def add_freq_data():
    """
    Add a table of word frequencies to the DB
    """
    session = get_session()
    metadata.create_all(session().get_bind().engine)
    with click.progressbar(wordfreq.get_frequency_dict("fi").items(),
                           label="Inserting frequencies") as name_freqs:
        for name, freq in name_freqs:
            insert(session, freqs, name=name, freq=freq)
    session.commit()
Example #10
def extract_toks_cmd():
    paragraph = sys.stdin.read()
    omorfi = get_omorfi()
    tokenised = omorfi.tokenise(paragraph)
    starts = []
    start = 0
    for token in tokenised:
        start = paragraph.index(token["surf"], start)
        starts.append(start)

    surfs = [tok["surf"] for tok in tokenised]
    session = get_session().get_bind()
    pprint(list(extract_toks(session, surfs)))
Example #11
def lookup(word):
    session = get_session()
    query = lemma_info_query([word])
    print("Counts")
    for row in session.execute(headword_rels_counts_query([word])):
        if not row[0]:
            print("Not found")
            continue
        print("# " + row[0])
        for (name, _, _), cnt in zip(RELATED, row[1:]):
            print(name, cnt)
    print("Senses")
    for row in session.execute(query):
        pprint(row)
Example #12
def wiktionary_batcher():
    session = get_session()
    rows = session.execute(wiktionary_query_all()).fetchall()
    batch_size = get_batch_size()
    ids = []
    defns = []
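    # Accumulate (sense_id, definition) pairs and yield a full batch once
    # batch_size is reached; any remainder is yielded at the end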
    for row in rows:
        defn = row["sense"].strip()
        if not defn:
            continue
        ids.append(wiktionary_sense_id(row))
        defns.append(defn)
        if len(ids) >= batch_size:
            yield ids, defns
            ids = []
            defns = []
    if len(ids):
        yield ids, defns
Example #13
def get_wiktionary_defns(
    lemma_name,
    pos,
    skip_empty=True,
    tokenize=True,
    lower=False,
):
    session = get_session()
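    # Yield (sense_id, tokens) pairs, optionally skipping empty definitions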
    for row in get_wiktionary(session, lemma_name, pos):
        tokens = row["sense"].strip()
        if skip_empty and not tokens:
            sys.stderr.write(
                f"Empty defn: {row['sense_id']} '{row['sense']}'\n")
            continue
        if tokenize:
            tokens = word_tokenize(tokens)
        if lower:
            assert tokenize
            tokens = [token.lower() for token in tokens]
        yield row["sense_id"], tokens
Example #14
def main(mode="all"):
    if mode != "wiki":
        for lemma, wns in WordnetFin.lemma_names().items():
            lemma_obj = next(iter(get_lemma_objs(lemma, wns).values()))[0][1]
            pos = lemma_obj.synset().pos()
            if pos == "s":
                pos = "a"
            lemma = norm(lemma)
            print(f"{lemma},{pos}")

    if mode != "wn":
        session = get_session()

        for pos in POS_MAP.keys():
            for row in session.execute(
                    select([headword.c.name]).select_from(joined).where(
                        word_sense.c.pos.in_(POS_MAP[pos])
                        & word_sense.c.inflection_of_id.is_(None)).distinct()
            ).fetchall():
                lemma = norm(row[0])
                print(f"{lemma},{pos}")
Example #15
    def import_clust(indir):
        session = get_session()
        metadata.create_all(session().get_bind().engine)

        def clus_batch(line):
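            # Each line is "lemma.cluster,sense,exemplar_flag"; splitting from
            # the right keeps dots and commas inside the lemma intact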
            line = line.decode('utf-8')
            lemma_clus, sense, exemp = line.strip().rsplit(",", 2)
            lemma, clus = lemma_clus.rsplit(".", 1)

            session.execute(cluster.insert().values(
                lemma=lemma,
                clus=int(clus),
                source=Source.wordnet if is_wn_ref(sense) else Source.wiktionary,
                sense=sense,
                exemp=exemp == "1",
            ))
        with click.progressbar(IterDirOrTar(indir), label="Inserting clusts") as clusts_chunks:
            lines = (
                line
                for _, clusts_chunk in clusts_chunks
                for line in clusts_chunk)
            batch_commit(session, lines, clus_batch)
Example #16
def create():
    metadata = get_metadata()
    session = get_session()
    metadata.create_all(session().get_bind().engine)
Example #17
def trunc_clust():
    print("Truncating", cluster.name)
    session = get_session()
    session.execute(f"TRUNCATE {cluster.name} RESTART IDENTITY CASCADE;")
    session.commit()
Example #18
def insert_dir(indir: str, filterfile: Optional[TextIO]):
    members = parse_filterfile(filterfile)
    insert_dir_inner(get_session(), indir, members)