Пример #1
0
def build_cnter(query: dict, *, verbose=False) -> Counter[str]:
    """Count how often each value of one field occurs across all sentences.

    Args:
        query: A mapping with exactly one item. Its key selects which field
            of every item in every sentence is counted; its value is used
            only as a label in the title of the verbose report.
        verbose: When true, report the 10 most common values (pretty-printed
            to stderr) and the number of distinct values.

    Returns:
        A Counter mapping each observed field value to its number of
        occurrences over everything yielded by ``mecab_into_sentences()``.

    Raises:
        ValueError: If ``query`` does not contain exactly one item.
    """
    # The single-element destructuring doubles as validation: anything other
    # than exactly one (key, value) pair fails to unpack.
    [(tgt_key, tgt_val)] = query.items()

    cnter: Counter[str] = Counter()
    for sentence in tqdm(mecab_into_sentences()):
        # update() adds the counts in place. The previous
        # `cnter += Counter(...)` built a temporary Counter per sentence and,
        # in CPython, re-scanned the whole accumulated counter each time to
        # drop non-positive entries (which cannot occur here).
        cnter.update(d[tgt_key] for d in sentence)

    if verbose:
        with Renderer(f"「{tgt_val}」の出現頻度") as out:
            out.header("上位 10 個")
            pprint.pprint(cnter.most_common(10), stream=sys.stderr)
            out.result("種類", len(cnter))

    return cnter
Пример #2
0
https://nlp100.github.io/ja/ch04.html#33-aのb

[Usage]
python knock33.py
"""
import os
import sys
from typing import Dict, List

from knock30 import mecab_into_sentences

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer  # noqa: E402 isort:skip

# One morpheme: a mapping from field name to string value. Presumably this
# is the element type produced by mecab_into_sentences(); the script below
# reads the "pos" and "surface" fields of each element.
Morpheme = Dict[str, str]
# One sentence: a list of morphemes.
Sentence = List[Morpheme]

if __name__ == "__main__":
    # Collect every run of three adjacent elements shaped like
    # (pos == "名詞", surface == "の", pos == "名詞") and report the
    # concatenated surface forms.
    title = "AのB"
    wanted = ("名詞", "の", "名詞")

    phrases = [
        "".join(m["surface"] for m in (head, particle, tail))
        for morphemes in mecab_into_sentences()
        # Three staggered views of the same sentence give a sliding window
        # of width 3.
        for head, particle, tail in zip(
            morphemes, morphemes[1:], morphemes[2:]
        )
        if (head["pos"], particle["surface"], tail["pos"]) == wanted
    ]

    total = len(phrases)
    distinct = len(set(phrases))
    with Renderer(title) as out:
        out.result("数", total)
        out.result("種類", distinct)
        out.result("上から 10 個", phrases[:10])