# Plot word-frequency rank against word frequency on log-log axes (Zipf's law).
from knock30 import load_morpheme_list
from knock36 import get_word_frequency
from typing import List, Dict, Tuple, Iterable
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties
from os.path import expanduser

# Load a font for the Japanese title and axis labels.
_FONT_FILE = "/".join(
    [expanduser("~"), "Library", "Fonts", "NotoSansCJKjp-DemiLight.otf"]
)
fp = FontProperties(fname=_FONT_FILE)


def zipf(freq: Iterable[Tuple[str, int]]) -> None:
    """Show a log-log scatter plot of rank versus count.

    Args:
        freq: (word, count) pairs. Ranks 1, 2, ... are assigned in iteration
            order; nothing is re-sorted here, so the input is presumably
            already ordered by descending count (confirm against
            ``get_word_frequency``).
    """
    counts = [count for _word, count in freq]  # type: List[int]
    ranks = range(1, 1 + len(counts))

    # Both axes are logarithmic.
    plt.xscale("log")
    plt.yscale("log")
    plt.scatter(ranks, counts, s=1)

    plt.title("Zipfの法則", fontproperties=fp)
    plt.xlabel("単語の出現頻度順位", fontproperties=fp)
    plt.ylabel("単語の出現頻度", fontproperties=fp)
    plt.show()


if __name__ == "__main__":
    zipf(get_word_frequency(load_morpheme_list()))
# 31. Verbs
# Extract the surface form of every verb.
from knock30 import load_morpheme_list
from typing import List, Dict

M = List[Dict[str, str]]


def get_verbs(morphemes: M) -> List[str]:
    """Return the surface form of each verb, in input order.

    A morpheme is treated as a verb when its ``"pos"`` entry equals "動詞".
    Duplicates are kept.
    """
    return [entry["surface"] for entry in morphemes if entry["pos"] == "動詞"]


if __name__ == "__main__":
    for surface in get_verbs(load_morpheme_list()):
        print(surface)
# 33. Sa-hen nouns
# Extract every noun of the sa-hen connection type.
from knock30 import load_morpheme_list
from typing import List, Dict

M = List[Dict[str, str]]


def get_sahen_noun(morphemes: M) -> List[str]:
    """Return the surface form of each sa-hen connection morpheme.

    Selection looks only at the ``"pos1"`` entry (must equal "サ変接続");
    the ``"pos"`` entry is not checked. Input order and duplicates are kept.
    """
    return [
        entry["surface"] for entry in morphemes if entry["pos1"] == "サ変接続"
    ]


if __name__ == "__main__":
    for surface in get_sahen_noun(load_morpheme_list()):
        print(surface)
# 34. "A no B"
# Extract noun phrases in which two nouns are joined by "の".
from knock30 import load_morpheme_list
from typing import List, Dict

M = List[Dict[str, str]]


def get_noun_phrases(m: M) -> List[str]:
    """Return every "noun + の + noun" phrase as one concatenated string.

    Each run of three consecutive morphemes is examined, so overlapping
    matches are all reported (A の B の C yields both "AのB" and "BのC").
    Inputs shorter than three morphemes yield an empty list.
    """
    # Three views of the list, offset by 0, 1 and 2, form the sliding window.
    windows = zip(m, m[1:], m[2:])
    return [
        "".join((head["surface"], joiner["surface"], tail["surface"]))
        for head, joiner, tail in windows
        if head["pos"] == tail["pos"] == "名詞" and joiner["surface"] == "の"
    ]


if __name__ == "__main__":
    for phrase in get_noun_phrases(load_morpheme_list()):
        print(phrase)
# Draw a bar chart of the most frequent words.
from knock30 import load_morpheme_list
from knock36 import get_word_frequency
from typing import Dict, List, Tuple, Iterable
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties
from os.path import expanduser

# Load a font for the Japanese tick labels, title and axis labels.
fp = FontProperties(fname="/".join(
    [expanduser("~"), "Library", "Fonts", "NotoSansCJKjp-DemiLight.otf"]))


def plot_word_frequency(freq: List[Tuple[str, int]], num: int) -> None:
    """Show a bar chart of the first ``num`` entries of ``freq``.

    Args:
        freq: (word, count) pairs. The first ``num`` entries are plotted as
            given; presumably the list is already sorted by descending
            count (confirm against ``get_word_frequency``).
        num: Number of entries requested. If ``freq`` holds fewer, only the
            available entries are drawn; a negative value draws none.
    """
    # Bar positions come from the slice actually plotted, not from ``num``.
    # Otherwise a short ``freq`` (or a negative ``num``) gives positions and
    # heights of different lengths, and ``plt.bar`` raises a shape mismatch.
    top = freq[:max(num, 0)]
    labels = [entry[0] for entry in top]
    counts = [entry[1] for entry in top]
    positions = list(range(len(top)))

    plt.bar(positions, counts, tick_label=labels, align="center")
    # Re-apply the tick labels so they render with the Japanese font.
    plt.xticks(positions, labels, fontproperties=fp)
    # The captions keep quoting the requested ``num``, as before.
    plt.title(f"頻度上位 {num} 語", fontproperties=fp)
    plt.xlabel(f"出現頻度が高い {num} 語", fontproperties=fp)
    plt.ylabel("出現頻度", fontproperties=fp)
    plt.show()


if __name__ == "__main__":
    result = get_word_frequency(load_morpheme_list())
    plot_word_frequency(result, 10)
# 32. Base forms of verbs
# Extract the base form of every verb.
from knock30 import load_morpheme_list
from typing import List, Dict

M = List[Dict[str, str]]


def get_base_form_verbs(morphemes: M) -> List[str]:
    """Return the base form of each verb, in input order.

    A morpheme is treated as a verb when its ``"pos"`` entry equals "動詞";
    the value collected is its ``"base"`` entry. Duplicates are kept.
    """
    return [entry["base"] for entry in morphemes if entry["pos"] == "動詞"]


if __name__ == "__main__":
    for base_form in get_base_form_verbs(load_morpheme_list()):
        print(base_form)