示例#1
0
def train() -> None:
    """Fit a logistic-regression model on the stored features/labels and persist it."""
    # Load the previously serialized feature matrix and gold labels.
    X, y = deserialize("features"), deserialize("labels")

    # Learn the logistic-regression weights (fit) and store the fitted model.
    serialize("model", LogisticRegression().fit(X, y))
示例#2
0
def main():
    """Cluster the country vectors with Ward's method and display the dendrogram."""
    # Load the country vectors and the country-name -> row index.
    vectors = deserialize("country.matrix")
    name_index = deserialize("country.index")

    # Ward hierarchical clustering (euclidean metric) on a labeled frame.
    frame = pd.DataFrame(vectors, name_index.keys())
    linked = linkage(frame, method="ward", metric="euclidean")

    # Render the dendrogram with country names on the leaves.
    dendrogram(linked, labels=list(name_index.keys()), leaf_font_size=8)
    plt.show()
示例#3
0
def main():
    """Group the country vectors into 5 k-means clusters and print each assignment."""
    # Load the country vectors and the country-name -> row index.
    vectors = deserialize("country.matrix")
    name_index = deserialize("country.index")

    # Cluster with k=5; labels_ holds the cluster id of each row.
    assignments = KMeans(n_clusters=5).fit(vectors).labels_

    # One line per country: left-padded name, then its cluster id.
    for cluster_id, name in zip(assignments, name_index.keys()):
        print(f"{name.ljust(12)} : {cluster_id}")
示例#4
0
def main() -> None:
    """Print gold label, predicted label, and top class probability for each sample."""
    # Load the fitted model plus the evaluation features and gold labels.
    model = deserialize("model")
    X = deserialize("features")
    gold = deserialize("labels")

    # Predicted class and per-class probability distribution per sample.
    predicted = model.predict(X)
    distributions = model.predict_proba(X)

    # Tab-separated: gold label, predicted label, max probability (6 decimals).
    for answer, prediction, dist in zip(gold, predicted, distributions):
        print(f"{answer}\t{prediction}\t{max(dist):.6f}")
示例#5
0
def main():
    """Plot the precision-recall curve for the positive class."""
    model = deserialize("model")
    X = deserialize("features")
    gold = deserialize("labels")

    # Probability of the positive (+1) class for every sample.
    positive_probs = model.predict_proba(X)[:, 1]
    # Precision/recall pairs across all decision thresholds
    # (thresholds themselves are not needed for the plot).
    precision, recall, _thresholds = precision_recall_curve(gold, positive_probs)

    plt.plot(recall, precision)
    plt.xlabel("recall")
    plt.ylabel("precision")
    plt.show()
示例#6
0
def main() -> None:
    """Classify the sentences in sample.txt and print each prediction with its confidence."""
    # Load the fitted model and the vocabulary fixed at training time.
    model = deserialize("model")
    vocab = deserialize("vocabs")

    # Extract sentences from sample.txt and vectorize them with the training
    # vocabulary so the feature columns line up with what the model expects.
    sentences, _ = create_feature("./sample.txt")
    vectorizer = TfidfVectorizer(vocabulary=vocab)
    feature = vectorizer.fit_transform(sentences).toarray()

    # model.predict        : predicted class per sentence
    # model.predict_proba  : per-class probability distribution per sentence
    pp = zip(model.predict(feature), model.predict_proba(feature))
    for predict, prob in pp:
        # Fix: use ".6f" (fixed 6 decimal places) to match the probability
        # formatting used by the other reports; the previous ".6" printed
        # 6 significant digits, which differs for probabilities below 0.1.
        print(f"{int(predict):>3} : {max(prob):.6f}")
示例#7
0
def main():
    """Project the country vectors to 2-D with t-SNE, colored by k-means cluster."""
    # Load the country-name -> row index and the vector matrix.
    name_index = deserialize("country.index")
    vectors = np.array(deserialize("country.matrix"))

    # 2-D embedding for display, plus a cluster id per country for coloring.
    embedded = TSNE(perplexity=30, learning_rate=500).fit_transform(vectors)
    clusters = KMeans(n_clusters=5).fit_predict(vectors)

    figure, axes = plt.subplots()
    palette = plt.get_cmap("Set1")
    for row, name in enumerate(name_index.keys()):
        # Map the cluster id (0..4) onto the colormap's [0, 1] range.
        color = palette(clusters[row] / 4)
        axes.scatter(embedded[row, 0], embedded[row, 1], marker=".", color=color)
        axes.annotate(name, xy=(embedded[row, 0], embedded[row, 1]), color=color)
    plt.show()
示例#8
0
def main() -> None:
    """Report 5-fold cross-validated accuracy, precision, recall, and F1."""
    model = deserialize("model")
    X = deserialize("features")
    y = deserialize("labels")

    # Metrics to collect: accuracy, precision, recall, F1 score.
    metrics = ["accuracy", "precision", "recall", "f1"]
    # Run 5-fold cross-validation, then take the arithmetic mean per metric.
    raw_scores = cross_validate(model, X, y, cv=5, scoring=metrics)
    averaged = {name: mean(values) for name, values in raw_scores.items()}

    print(f"正解率 : {averaged['test_accuracy']}")
    print(f"適合率 : {averaged['test_precision']}")
    print(f"再現率 : {averaged['test_recall']}")
    print(f"F1     : {averaged['test_f1']}")
示例#9
0
def weight_rank() -> None:
    """Print the 10 most negative and 10 most positive model weights with their feature names."""
    model = deserialize("model")
    names = deserialize("names")

    # Per-feature weights of the model's first (only, for binary) coefficient row.
    weights = model.coef_[0].tolist()
    # Pair each weight with its feature name and sort ascending by weight.
    ranked = sorted(zip(weights, names))

    print("\n# rank : worst 10")
    for weight, name in ranked[:10]:
        # Fix: include the separating space before the weight, matching the
        # "top 10" block below (the original omitted it here only).
        print(f"{name:<10} {weight:.6f}")

    print("\n# rank : top 10")
    # [:-11:-1] walks the sorted list backwards, i.e. the 10 largest weights.
    for weight, name in ranked[:-11:-1]:
        print(f"{name:<10} {weight:.6f}")
示例#10
0
def main():
    """Extract the word2vec vectors of country names into a reduced matrix/index pair."""
    # Load the full word2vec matrix and its token -> row index.
    matrix, t_index = deserialize("matrix"), deserialize("t_index")

    # Load the list of country names from the JSON reference file.
    with open("../data/country.json", "r", encoding="utf-8") as f:
        data = json.load(f)
    countries = [d["name"] for d in data]

    # Keep only the countries that actually appear in the vocabulary.
    new_matrix, new_index = [], {}
    for country in countries:
        # Fix: idiomatic "x not in y" instead of "not x in y".
        if country not in t_index:
            continue
        new_matrix.append(matrix[t_index[country]])
        # NOTE(review): this stores the ORIGINAL row id, which does not index
        # new_matrix. Downstream code only iterates .keys() in insertion
        # order, so this is harmless today — confirm before using the values.
        new_index[country] = t_index[country]

    serialize("country.matrix", new_matrix)
    serialize("country.index", new_index)
示例#11
0
def main():
    """Drive knocks 86-89: similarity lookups over the loaded word2vec vectors."""
    # Parse the word2vec text dump, then load the cached matrix and index.
    load_word2vec("../data/w2v.txt")
    matrix, t_index = deserialize("matrix"), deserialize("t_index")

    # knock86: the vector for "United_States".
    us_vector = matrix[t_index["United_States"]]

    # knock87: cosine similarity between "U.S" and "United_States".
    print(cosine_similarity(matrix[t_index["U.S"]], us_vector))

    # knock88: 10 words most similar to "England"
    # (entry 0 is skipped — presumably the query word itself; confirm).
    england = matrix[t_index["England"]]
    for entry in similar_list(matrix, t_index, england)[1:11]:
        print(f"{entry[0].ljust(10)} : {entry[1]}")

    # knock89: analogy vector Spain - Madrid + Athens, top 10 (entry 0 skipped).
    for entry in multi_vec(matrix, t_index, "Spain", "Madrid", "Athens")[1:11]:
        print(f"{entry[0].ljust(10)} : {entry[1]}")
示例#12
0
import numpy as np
import sys, pathlib
from knock90 import multi_vec
from scipy.io import loadmat
from tqdm import tqdm
import faiss

chap08 = pathlib.Path().parent / ".." / "chapter08"
chap09 = pathlib.Path().parent / ".." / "chapter09"
sys.path.extend([str(chap08), str(chap09)])

from knock72 import serialize, deserialize
from knock87 import cosine_similarity
from knock80 import file_reader

# PPMI matrix (MATLAB .mat file) and its token -> row index.
ppmi, p_index = loadmat("knock85.matrix")["knock85.matrix"], deserialize(
    "p_index")
p_keys = list(p_index.keys())

# word2vec matrix and its token -> row index.
w2v, t_index = np.array(deserialize("matrix")), deserialize("t_index")
t_keys = list(t_index.keys())

# Flat FAISS indexes over 300-dimensional vectors ("IP" = inner product),
# one for the PPMI vectors and one for the word2vec vectors.
# FAISS requires contiguous float32 input, hence the conversions.
faiss_ppmi = faiss.IndexFlatIP(300)
faiss_ppmi.add(np.ascontiguousarray(ppmi.astype("float32")))

faiss_w2v = faiss.IndexFlatIP(300)
faiss_w2v.add(np.ascontiguousarray(w2v.astype("float32")))

# Process each line of the knock91 output with both indexes.
# NOTE(review): the for-loop body is not visible in this chunk — the
# statement is truncated here; see the full file for what runs per line.
with open("./results/faiss.ppmi.out.txt",
          "w") as ppmi_out, open("./results/faiss.w2v.out.txt",
                                 "w") as w2v_out:
    for line in open("./results/knock91.output.txt", "r", encoding="utf-8"):
示例#13
0
import numpy as np
import sys, pathlib
from knock90 import multi_vec
from scipy.io import loadmat
from tqdm import tqdm

chap08 = pathlib.Path().parent / ".." / "chapter08"
chap09 = pathlib.Path().parent / ".." / "chapter09"
sys.path.extend([str(chap08), str(chap09)])

from knock72 import serialize, deserialize
from knock87 import cosine_similarity
from knock80 import file_reader

# word2vec matrix and its token -> row index, loaded from the cache.
w2v, w2v_index = deserialize("matrix"), deserialize("t_index")
# PPMI matrix (MATLAB .mat file) and its token -> row index.
ppmi, ppmi_index = loadmat("knock85.matrix")["knock85.matrix"], deserialize(
    "p_index")


def apply_w2v(line: str, f) -> None:
    """Answer one analogy line using the word2vec vectors and append the result to f.

    Writes the original line followed by the top candidate word and its
    similarity score, or by "- -" when no candidate is found.
    """
    stripped = line.rstrip()
    # Only the first three tokens of the line are used as the analogy terms.
    t1, t2, t3 = stripped.split()[:3]
    result = multi_vec(w2v, w2v_index, t1, t2, t3)
    if result:
        word, sim = result[0]
        print(stripped, word, sim, file=f)
    else:
        print(stripped, "-", "-", file=f)


def apply_ppmi(line: str, f) -> None: