Example #1
def write_file(filename, rng):
    category_cnter = defaultdict(int)
    with open(filename + ".txt", "w") as f:
        for i in rng:
            print(data[i]["CATEGORY"], data[i]["TITLE"], sep="\t", file=f)
            category_cnter[data[i]["CATEGORY"]] += 1
    with Renderer(filename) as out:
        for tag, name in categories.items():
            out.result(name, category_cnter[tag])
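A usage sketch, assuming the globals this snippet relies on: `data` (records with "CATEGORY" and "TITLE" keys) and `categories` (a tag-to-name mapping). Both values below are hypothetical stand-ins:

data = [
    {"CATEGORY": "b", "TITLE": "Fed official says weak data caused by weather"},
    {"CATEGORY": "t", "TITLE": "New phone chip unveiled"},
    {"CATEGORY": "b", "TITLE": "Stocks close higher"},
]
categories = {"b": "business", "t": "science and technology"}

# writes train.txt and renders per-category counts
write_file("train", range(len(data)))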
Example #2
def build_cnter(query: dict, *, verbose=False) -> Counter[str]:
    [(tgt_key, tgt_val)] = query.items()

    cnter = Counter()
    for sentence in tqdm(mecab_into_sentences()):
        cnter += Counter(d[tgt_key] for d in sentence)

    if verbose:
        with Renderer(f"「{tgt_val}」の出現頻度") as out:
            out.header("上位 10 個")
            pprint.pprint(cnter.most_common(10), stream=sys.stderr)
            out.result("種類", len(cnter))

    return cnter
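A hedged usage example: the query is a single-item dict mapping a morpheme key to its display name, so counting surface forms might look like the call below (the "surface" key follows the MeCab dicts produced by mecab_into_sentences):

cnter = build_cnter({"surface": "表層形"}, verbose=True)
print(cnter.most_common(3))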
Example #3
def test_extract(query: dict, *, verbose=False) -> list:
    [(src_key, src_val)] = query["src"].items()
    [(dst_key, dst_val)] = query["dst"].items()

    res = []
    for sentence in mecab_into_sentences():
        res.extend([d[dst_key] for d in sentence if d[src_key] == src_val])

    if verbose:
        with Renderer(f"「{src_val}」の「{dst_val}」") as out:
            out.result("数", len(res))
            out.result("種類", len(set(res)))
            out.result("上から 10 個", res[:10])

    return res
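For example, extracting the surface forms of all nouns would pass a filter under "src" and a target key under "dst"; the dict values are only used as display names in the verbose output. A hypothetical call (key names as in the other snippets in this repo):

res = test_extract(
    {"src": {"pos": "名詞"}, "dst": {"surface": "表層形"}},
    verbose=True,
)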
Example #4
[Usage]
python knock03.py
"""
import doctest
import os
import re
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip

if __name__ == "__main__":
    doctest.testmod(verbose=True)

    s = ("Now I need a drink, alcoholic of course, "
         "after the heavy lectures involving quantum mechanics.")

    with Renderer("knock03") as out:
        out.result(
            "replace + list comprehension",
            [len(w) for w in s.replace(",", "").replace(".", "").split()],
        )
        out.result(
            "rstrip + list comprehension",
            [len(w.rstrip(",.")) for w in s.split()],
        )
        out.result("re.findall + map", list(map(len, re.findall(r"\w+", s))))
        out.result("re.sub + map", [*map(len, re.sub(r"[,.]", "", s).split())])
        out.result("re.split + map", [*map(len, re.split(r"\W+", s)[:-1])])
Example #5
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip
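
# NOTE: n_gram is defined earlier in the original file and is not shown in
# this excerpt. A minimal sketch of what the calls below assume (the list of
# length-n slices of seq, so both set() and .pop() work on the result):
def n_gram(seq, n):
    return [seq[i:i + n] for i in range(len(seq) - n + 1)]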

if __name__ == "__main__":

    w1 = "paraparaparadise"
    w2 = "paragraph"
    n = 2

    X = set(n_gram(w1, n))
    Y = set(n_gram(w2, n))
    tgt = n_gram("se", n).pop()

    print("X =", X)
    print("Y =", Y)

    print("X ∪ Y = {}".format(X | Y))  # X.union(Y)
    print("X ∩ Y = {}".format(X & Y))  # X.intersection(Y)
    print("X \\ Y = {}".format(X - Y))  # X.difference(Y)
    print("Y \\ X = {}".format(Y - X))  # Y.difference(X)

    print(f"X includes 'se': {tgt in X}")
    print(f"Y includes 'se': {tgt in Y}")

    with Renderer("MEMO") as out:
        out.result(r"X ∪ Y", X.union(Y))
        out.result(f"X ∩ Y", X.intersection(Y))
        out.result(rf"X \ Y", X.difference(n_gram(w2, n)))
        out.result(fr"Y \ X", Y.difference(n_gram(w1, n)))
Example #6
from typing import Iterator, Match, Tuple

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip
from kiyuna.utils.pickle import load  # noqa: E402 isort:skip


def exec_search(wiki: str, pattern: str) -> Iterator[Tuple[str, Match]]:
    reg = re.compile(pattern)
    for line in wiki.split("\n"):
        match = reg.search(line)
        if match:
            yield line, match


if __name__ == "__main__":
    wiki = load("UK")

    pat_category_only = r"\[\[Category:(?P<Category_name>.+?)(\||])"
    for _, match in exec_search(wiki, pat_category_only):
        print(match.group("Category_name"))

    pats = (
        pat_category_only,
        r"\[\[Category:(?P<Category_name>[^|]+)\|*(?P<Sortkey>.*)\]\]",
    )
    for pat in pats:
        with Renderer(pat) as out:
            for line, match in exec_search(wiki, pat):
                out.result(line, match.groups())
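A quick demonstration of the second pattern on a typical category line (the sample line is hypothetical but follows the markup the snippet targets):

import re
line = "[[Category:イギリス|*]]"
pat = r"\[\[Category:(?P<Category_name>[^|]+)\|*(?P<Sortkey>.*)\]\]"
m = re.search(pat, line)
print(m.group("Category_name"), repr(m.group("Sortkey")))  # イギリス '*'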
Example #7
def exec_findall(wiki: str, pattern: str) -> Iterator[Tuple[str, Group]]:
    reg = re.compile(pattern)
    for line in wiki.split("\n"):
        for match in reg.findall(line):
            yield line, match


if __name__ == "__main__":
    wiki = load("UK")

    pat = (
        r"(?:\s=\s)?"  # workaround for 「基礎情報 国」 (country infobox) lines
        r"([^:=]+)"  # adding '/' to this class would also catch file names inside <ref>
        # (?i:...) scopes case-insensitivity to the extension; a bare (?i) in
        # mid-pattern is an error on Python 3.11+
        r"\.((?i:png|gif|jpg|jpeg|xcf|pdf|mid|ogg|svg|djvu))")
    with Renderer("knock24") as out:
        for line, filename in exec_findall(wiki, pat):
            fname = ".".join(filename)
            if "/" not in fname:  # <ref> 対策
                out.result(trunc(line), green(fname))
    """ NOTE
    - ウィキペディアの画像
        - [[ファイル:Uk topo en.jpg|thumb|200px|イギリスの地形図]]
    - 基礎情報 国
        - |国旗画像 = Flag of the United Kingdom.svg
    - <gallery>
        - Stonehenge2007 07 30.jpg|[[ストーンヘンジ]]
    - <ref>
        - <ref>[http://warp.da.ndl.go.jp/.../country.pdf
    """
Example #8
[Ref]
- reversed
    - https://docs.python.org/ja/3/library/functions.html#reversed
- strings are immutable sequence types
    - https://docs.python.org/ja/3/reference/datamodel.html

[Usage]
python knock00.py
"""
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip

if __name__ == "__main__":

    s = "stressed"

    with Renderer("knock00") as out:
        out.result("slice", s[::-1])
        out.result("reversed", "".join(reversed(s)))

    with Renderer("MEMO") as out:
        out.result("reversed の返り値は", reversed(s))
        try:
            s.reverse()
        except AttributeError as e:
            out.result("string は変更不能なシーケンス型(immutable sequence)", e)
Example #9
[MEMO]
Corresponds to knock94-95 in the 2015 edition.
"""
import os
import sys
from zipfile import ZipFile

from scipy.stats import spearmanr

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import message, Renderer  # noqa: E402 isort:skip
from kiyuna.utils.pickle import load  # noqa: E402 isort:skip

if __name__ == "__main__":
    wv = load("chap07-embeddings")
    preds, labels = [], []
    with ZipFile("wordsim353.zip") as myzip:
        message(myzip.infolist())
        with myzip.open("combined.csv") as myfile:
            myfile = map(lambda x: x.decode(), myfile)
            message("[header]", next(myfile))  # Word 1,Word 2,Human (mean)
            for line in myfile:
                word1, word2, human = line.split(",")
                preds.append(wv.similarity(word1, word2))
                labels.append(float(human))  # CSV fields are strings; spearmanr needs numbers
    with Renderer("knock66") as out:
        out.result("Spearman corr", spearmanr(preds, labels)[0])
"""result
0.6849564489532376
"""
Example #10
    return list(country_names)


def country_embeddings():
    wv = load("chap07-embeddings")
    country_names = np.array(list_country_names(), dtype=object)
    embeddings = [wv[country_name] for country_name in country_names]
    dump([embeddings, country_names], "chap07-embeddings-country")
    return embeddings, country_names


if __name__ == "__main__":
    embeddings, country_names = country_embeddings()
    kmeans = KMeans(n_clusters=5).fit(embeddings)
    dump(kmeans, "chap07-kmeans")
    with Renderer("knock67", start=0) as out:
        for i in range(5):
            out.result(f"Class {i}", country_names[kmeans.labels_ == i])
"""result
[*]  0. Class 0
['Bhutan' 'Bahrain' 'Japan' 'Morocco' 'Indonesia' 'Pakistan' 'Thailand'
 'Tunisia' 'Oman' 'Egypt' 'Turkey' 'Qatar' 'Iraq' 'Laos' 'Libya' 'Lebanon'
 'Jordan' 'Afghanistan' 'Bangladesh' 'Syria' 'Nepal' 'China' 'Vietnam'
 'Iran']
[*]  1. Class 1
['Samoa' 'Chile' 'Dominica' 'Australia' 'Ecuador' 'Fiji' 'Bahamas'
 'Canada' 'Jamaica' 'Nicaragua' 'Cuba' 'Peru' 'Venezuela' 'Uruguay'
 'Guyana' 'Honduras' 'Belize' 'Greenland' 'Philippines' 'Taiwan' 'Tuvalu'
 'Suriname']
[*]  2. Class 2
['Ghana' 'Malawi' 'Gabon' 'Gambia' 'Namibia' 'Guinea' 'Uganda' 'Somalia'
Example #11
def build_word_frequency_cnter(path: str, trans: F = str) -> Counter[T]:
    with open(path) as f:
        return collections.Counter(map(trans, f.read().split()))


def get_vocab(path: str, trans: F = str) -> Set[T]:
    with open(path) as f:
        return {trans(w) for w in f.read().split()}


if __name__ == "__main__":
    path = sys.argv[1]

    if path == "MEMO":
        with Renderer("個人メモ") as out:
            out.result(
                "type hints",
                (
                    typing.get_type_hints(list_word_freq),
                    build_word_frequency_cnter.__annotations__,
                ),
            )
            out.header("with 内で return しても大丈夫なはず")
            dis.dis(build_word_frequency_cnter, file=sys.stderr)
            out.header("doctest")
            doctest.testmod(verbose=True)
            out.header("check serialize")
            cnter = list_word_freq("../../test/00-input.txt")
            dump(cnter, "cnter")
            cnter = load("cnter")
Example #12
[Usage]
python knock08.py
"""
import os
import string
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip


def cipher(text: str) -> str:
    # knock08's cipher: map each lowercase letter c to chr(219 - ord(c)),
    # i.e. 'a' <-> 'z', 'b' <-> 'y', ... It is an involution (applying it
    # twice decodes), in the style of Atbash; despite the original name,
    # it is not ROT13.
    return "".join(chr(219 - ord(c)) if c.islower() else c for c in text)


if __name__ == "__main__":

    d = {
        "printable": string.printable.strip(),
        "ascii_lowercase": string.ascii_lowercase,
    }

    for title, s in d.items():
        with Renderer(title) as out:
            out.result("plaintext", s)
            out.header("encode")
            out.result("ciphertext", rot13(s))
            out.header("decode")
            out.result("plaintext", rot13(rot13(s)))
Example #13
        right = get_S_expr(best_edge[sym_ij][1], best_edge, words)
        return f"({sym} {left} {right})"
    else:
        return f"({sym} {words[int(i)]})"


if __name__ == "__main__":
    if sys.argv[1] == "test":
        grammar_file = "../../test/08-grammar.txt"
        input_file = "../../test/08-input.txt"
    else:
        grammar_file = "../../data/wiki-en-test.grammar"
        input_file = "../../data/wiki-en-short.tok"

    s, t = 0, 1
    with Renderer(sys.argv[1]) as out:
        for i, s_expr in enumerate(cky(grammar_file, input_file, s=s, t=t)):
            message("=" * 3, "line:", s + i, "=" * 3)
            tree = Tree.fromstring(s_expr)
            out.result("S-expression", s_expr)
            out.result("nltk.tree.Tree", tree)
            out.header("nltk.tree.Tree.pretty_print")
            tree.pretty_print()
            # tree.draw()
"""result
[+] main
=== line: 0 ===
[*]  1. S-expression
(S (PP (IN Among) (NP (DT these) (NP' (, ,) (NP' (JJ supervised) (NP' (NN learning) (NNS approaches)))))) (S' (VP (VBP have) (VP (VBN been) (VP' (NP (DT the) (NP' (ADJP (RBS most) (JJ successful)) (NNS algorithms))) (PP (TO to) (NP_NN date))))) (. .)))
[*]  2. nltk.tree.Tree
(S
Example #14
def exec_match(wiki: str, pattern: str) -> Iterator[Tuple[str, Match]]:
    reg = re.compile(pattern)
    for line in wiki.split("\n"):
        match = reg.match(line)
        if match:
            yield line, match


if __name__ == "__main__":
    wiki = load("UK")

    pat = r"(?P<Level>=+)\s*(?P<Heading>.+)\s*(?P=Level)"
    for _, match in exec_match(wiki, pat):
        level, heading = match.group(1, 2)
        print(
            "  " * (len(level) - 2),
            "+",
            f" lv{len(level) - 1} ",
            heading,
            sep="",
        )

    with Renderer("re.match() vs. re.search()") as out:
        pat_hat = r"^" + pat
        it = zip(exec_match(wiki, pat), exec_search(wiki, pat_hat))
        for (line, match1), (_, match2) in it:
            assert match1.groups() == match2.groups(), line
        else:
            message("same")
Example #15
"""
55. 混同行列の作成
52で学習したロジスティック回帰モデルの混同行列(confusion matrix)を,
学習データおよび評価データ上で作成せよ.

[MEMO]
2015 年版の knock77 に対応
"""
import os
import sys

from sklearn.metrics import confusion_matrix

from knock53 import load_dataset

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer  # noqa: E402 isort:skip
from kiyuna.utils.pickle import dump, load  # noqa: E402 isort:skip

if __name__ == "__main__":
    classifier = load("chap06-classifier")
    with Renderer("knock55") as out:
        for name in "train", "test":
            features, labels = load_dataset(f"./{name}.feature.txt")
            predicts = classifier.predict(features)
            out.result(name, confusion_matrix(labels, predicts))
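In scikit-learn's confusion_matrix, rows are true labels and columns are predictions, both in sorted label order; e.g.:

from sklearn.metrics import confusion_matrix
print(confusion_matrix(["b", "e", "b"], ["b", "b", "b"]))
# [[2 0]
#  [1 0]]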
Example #16
def grid_search(
    model: Model,
    path_test: str,
    *,
    rng: Optional[Tuple[float, float, float]] = None,
    rng1: Optional[Tuple[float, float, float]] = None,
    rng2: Optional[Tuple[float, float, float]] = None,
    save: Optional[str] = None,
) -> Tuple[float, float]:
    def get_param(idx: np.ndarray) -> np.ndarray:
        return (np.array([rng1[0], rng2[0]]) +
                np.array([rng1[2], rng2[2]]) * idx)

    if rng:
        rng1 = rng2 = rng
    assert rng1 is not None
    assert rng2 is not None

    with Renderer("grid search") as out:
        cnt1 = len(np.arange(*rng1))
        cnt2 = len(np.arange(*rng2))
        E = np.zeros((cnt2, cnt1))
        for j, λ_2 in enumerate(np.arange(*rng2)):
            message(f"{j + 1:2d} / {cnt2}", CR=True, type="status")
            for i, λ_1 in enumerate(np.arange(*rng1)):
                E[j, i] = model.test(path_test, λ_1=λ_1, λ_2=λ_2)["entropy_H"]
        message("", CR=True)

        ma_y, ma_x = np.where(E == E.max())
        mi_y, mi_x = np.where(E == E.min())
        out.result("max", (E.max(), get_param(np.hstack([ma_x, ma_y]))))
        out.result("min", (E.min(), get_param(np.hstack([mi_x, mi_y]))))

    if save:
        fig = plt.figure()
        ax = fig.add_subplot(111)

        mappable = ax.pcolor(E, cmap="jet", edgecolors="k", alpha=0.8)
        fig.colorbar(mappable)

        ax.scatter(ma_x + 0.5, ma_y + 0.5, c="r", label="max")
        ax.scatter(mi_x + 0.5, mi_y + 0.5, c="b", label="min")

        ax.set_xticks(np.arange(cnt1) + 0.5, minor=False)
        ax.set_yticks(np.arange(cnt2) + 0.5, minor=False)
        ax.set_xticklabels(
            map(lambda x: f"{x:.2f}"[1:], np.arange(*rng1)),
            minor=False,
            rotation=45,
        )
        ax.set_yticklabels(
            map(lambda x: f"{x:.2f}"[1:], np.arange(*rng2)),
            minor=False,
        )
        ax.set_title(f"Entropy {get_ext(model.WittenBell)}")
        ax.set_xlabel("$λ_1$")
        ax.set_ylabel("$λ_2$")
        ax.set_aspect("equal")
        ax.legend(loc="lower right")
        plt.savefig(save)

    return get_param(np.hstack([mi_x, mi_y]))
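Each rng tuple is (start, stop, step), as consumed by np.arange, and get_param maps grid indices back to parameter values as start + step * idx. A hypothetical call (the model and test path are assumptions, and unpacking the return assumes a unique minimum), searching both weights over [0.05, 1.0) in steps of 0.05 and saving the heatmap:

λ_1, λ_2 = grid_search(model, "../../data/wiki-en-test.word",
                       rng=(0.05, 1.0, 0.05), save="out/grid.png")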
Example #17
https://nlp100.github.io/ja/ch04.html#33-aのb

[Usage]
python knock33.py
"""
import os
import sys
from typing import Dict, List

from knock30 import mecab_into_sentences

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer  # noqa: E402 isort:skip

Morpheme = Dict[str, str]
Sentence = List[Morpheme]

if __name__ == "__main__":
    tgt = "AのB"

    res = []
    for sentence in mecab_into_sentences():
        for a, no, b in zip(sentence, sentence[1:], sentence[2:]):
            if (a["pos"], no["surface"], b["pos"]) == ("名詞", "の", "名詞"):
                res.append("".join(map(lambda x: x["surface"], (a, no, b))))

    with Renderer(tgt) as out:
        out.result("数", len(res))
        out.result("種類", len(set(res)))
        out.result("上から 10 個", res[:10])
Example #18
[Command]
wc (word count)
    -c print the byte count
    -l print the newline count
    -m print the character count (multibyte-aware)
    -w print the word count

[Usage]
INPUT_PATH=./popular-names.txt
python knock10.py $INPUT_PATH
wc -l $INPUT_PATH
cat $INPUT_PATH | wc -l
diff -sw <(python knock10.py $INPUT_PATH) <(cat $INPUT_PATH | wc -l)
"""
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip

if __name__ == "__main__":
    path = sys.argv[1]

    with Renderer("knock10") as out, open(path) as f:
        out.result("generator", sum(1 for _ in f))
        out.result("readlines", len(open(path).readlines()))
        out.result("read", len(open(path).read().rstrip("\n").split("\n")))

    print(sum(1 for _ in open(path)))
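A quick way to confirm the three counting approaches agree (the file contents here are hypothetical):

with open("tiny.txt", "w") as f:
    f.write("a\nb\nc\n")
assert sum(1 for _ in open("tiny.txt")) == 3   # streams, constant memory
assert len(open("tiny.txt").readlines()) == 3  # loads every line
assert len(open("tiny.txt").read().rstrip("\n").split("\n")) == 3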