Пример #1
0
from knock30 import load_morpheme_list
from knock36 import get_word_frequency

from typing import List, Dict, Tuple, Iterable
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties
from os.path import expanduser

# Load the Japanese font used for the plot title and axis labels.
# The font file is looked up under ~/Library/Fonts (presumably the macOS
# per-user font directory -- adjust the path on other systems).
fp = FontProperties(
    fname="/".join([expanduser("~"), "Library", "Fonts", "NotoSansCJKjp-DemiLight.otf"])
)


def zipf(freq: Iterable[Tuple[str, int]]) -> None:
    """Draw a log-log scatter plot of word frequency against frequency rank.

    Args:
        freq: (word, count) pairs. Only the counts are used. The pairs may
            arrive in any order; the counts are sorted here before ranking.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Rank 1 must be the most frequent word. Sort explicitly instead of
    # trusting the caller's ordering, since any iterable is accepted.
    value = sorted((v for _, v in freq), reverse=True)  # type: List[int]
    rank = range(1, len(value) + 1)

    plt.scatter(rank, value, s=1)
    plt.title("Zipfの法則", fontproperties=fp)
    plt.xlabel("単語の出現頻度順位", fontproperties=fp)
    plt.ylabel("単語の出現頻度", fontproperties=fp)
    # Both axes are logarithmic for the rank/frequency plot.
    plt.xscale("log")
    plt.yscale("log")
    plt.show()


if __name__ == "__main__":
    # Load the morphemes, count word frequencies, then plot them.
    morphemes = load_morpheme_list()
    frequencies = get_word_frequency(morphemes)
    zipf(frequencies)
Пример #2
0
# 31. 動詞
# 動詞の表層形をすべて抽出せよ.

from knock30 import load_morpheme_list
from typing import List, Dict

# Alias for a morpheme sequence: each morpheme is a str-to-str mapping.
M = List[Dict[str, str]]


def get_verbs(morphemes: M) -> List[str]:
    """Collect the surface form of every verb morpheme.

    Args:
        morphemes: Morpheme dicts. Each one is read through its "pos" key
            and, when it is a verb, through its "surface" key.

    Returns:
        The "surface" values of all morphemes whose "pos" equals "動詞"
        (verb), in input order.
    """
    return [entry["surface"] for entry in morphemes if entry["pos"] == "動詞"]


if __name__ == "__main__":
    # Print each verb surface form on its own line.
    morphemes = load_morpheme_list()
    for surface in get_verbs(morphemes):
        print(surface)
Пример #3
0
# 33. サ変名詞
# サ変接続の名詞をすべて抽出せよ.

from knock30 import load_morpheme_list
from typing import List, Dict

# Alias for a morpheme sequence: each morpheme is a str-to-str mapping.
M = List[Dict[str, str]]


def get_sahen_noun(morphemes: M) -> List[str]:
    """Collect the surface form of every sa-hen connection morpheme.

    Args:
        morphemes: Morpheme dicts. Each one is read through its "pos1" key
            and, on a match, through its "surface" key.

    Returns:
        The "surface" values of all morphemes whose "pos1" equals
        "サ変接続", in input order. Only "pos1" is checked.
    """
    return [
        entry["surface"] for entry in morphemes if entry["pos1"] == "サ変接続"
    ]


if __name__ == "__main__":
    # Print each extracted noun on its own line.
    morphemes = load_morpheme_list()
    for surface in get_sahen_noun(morphemes):
        print(surface)
Пример #4
0
# 34. 「AのB」
# 2つの名詞が「の」で連結されている名詞句を抽出せよ

from knock30 import load_morpheme_list
from typing import List, Dict

# Alias for a morpheme sequence: each morpheme is a str-to-str mapping.
M = List[Dict[str, str]]


def get_noun_phrases(m: M) -> List[str]:
    """Extract "A の B" phrases: two nouns joined by the particle "の".

    Args:
        m: Morpheme dicts in sentence order. The "pos" and "surface" keys
            are read.

    Returns:
        One string per window of three consecutive morphemes in which the
        first and third have "pos" equal to "名詞" (noun) and the middle
        one has "surface" equal to "の". Each string is the three surfaces
        concatenated. Windows may overlap.
    """
    found = []
    # Slide a three-morpheme window over the sequence.
    for left, joiner, right in zip(m, m[1:], m[2:]):
        both_nouns = left["pos"] == right["pos"] == "名詞"
        if both_nouns and joiner["surface"] == "の":
            phrase = left["surface"] + joiner["surface"] + right["surface"]
            found.append(phrase)
    return found


if __name__ == "__main__":
    # Print each "A の B" noun phrase on its own line.
    morphemes = load_morpheme_list()
    for phrase in get_noun_phrases(morphemes):
        print(phrase)
Пример #5
0
from knock30 import load_morpheme_list
from knock36 import get_word_frequency

from typing import Dict, List, Tuple, Iterable
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties
from os.path import expanduser

# Load the Japanese font used for the plot title, tick labels and axis labels.
# The font file is looked up under ~/Library/Fonts (presumably the macOS
# per-user font directory -- adjust the path on other systems).
fp = FontProperties(fname="/".join(
    [expanduser("~"), "Library", "Fonts", "NotoSansCJKjp-DemiLight.otf"]))


def plot_word_frequency(freq: List[Tuple[str, int]], num: int) -> None:
    """Draw a bar chart of the first ``num`` entries of ``freq``.

    Args:
        freq: (word, count) pairs. The leading entries are plotted as
            given; this function does not sort them.
        num: Maximum number of entries to plot. If ``freq`` holds fewer
            entries, all of them are plotted. Negative values are treated
            as 0.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Clamp so a negative ``num`` cannot turn into a "drop from the end" slice.
    top = freq[:max(num, 0)]
    labels = [s[0] for s in top]
    counts = [s[1] for s in top]
    # Positions follow the entries actually taken, not ``num``. Otherwise
    # a short ``freq`` gives bar() mismatched lengths and it fails.
    positions = range(len(top))

    plt.bar(positions, counts, tick_label=labels, align="center")
    # Set the tick labels again so they use the Japanese font.
    plt.xticks(positions, labels, fontproperties=fp)
    plt.title(f"頻度上位 {num} 語", fontproperties=fp)
    plt.xlabel(f"出現頻度が高い {num} 語", fontproperties=fp)
    plt.ylabel("出現頻度", fontproperties=fp)
    plt.show()


if __name__ == "__main__":
    # Plot the first 10 entries of the word-frequency list.
    plot_word_frequency(get_word_frequency(load_morpheme_list()), 10)
Пример #6
0
# 32. 動詞の原形
# 動詞の原形をすべて抽出せよ.

from knock30 import load_morpheme_list
from typing import List, Dict

# Alias for a morpheme sequence: each morpheme is a str-to-str mapping.
M = List[Dict[str, str]]


def get_base_form_verbs(morphemes: M) -> List[str]:
    """Collect the base (dictionary) form of every verb morpheme.

    Args:
        morphemes: Morpheme dicts. Each one is read through its "pos" key
            and, when it is a verb, through its "base" key.

    Returns:
        The "base" values of all morphemes whose "pos" equals "動詞"
        (verb), in input order. Duplicates are kept.
    """
    return [entry["base"] for entry in morphemes if entry["pos"] == "動詞"]


if __name__ == "__main__":
    # Print each verb base form on its own line.
    morphemes = load_morpheme_list()
    for base_form in get_base_form_verbs(morphemes):
        print(base_form)