# Evaluate trained CBOW word vectors: nearest neighbors and word analogies.
# Imports assume most_similar and analogy live in util, as in the sibling
# scripts in this repo.
import pickle

from util import most_similar, analogy


def main() -> None:
    with open('cbow_params.pkl', 'rb') as f:
        params = pickle.load(f)
    word_vecs = params['word_vecs']
    word_to_id = params['word_to_id']
    id_to_word = params['id_to_word']

    queries = ['you', 'year', 'car', 'toyota']
    for query in queries:
        most_similar(query, word_to_id, id_to_word, word_vecs, top=5)

    analogy('king', 'man', 'queen', word_to_id, id_to_word, word_vecs)
    analogy('take', 'took', 'go', word_to_id, id_to_word, word_vecs)
    analogy('car', 'cars', 'child', word_to_id, id_to_word, word_vecs)
    analogy('good', 'better', 'bad', word_to_id, id_to_word, word_vecs)


if __name__ == '__main__':
    main()
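# For reference, a minimal sketch of what an analogy helper like util.analogy
# typically computes: solve a : b = c : ? by ranking the vocabulary on cosine
# similarity to vec(b) - vec(a) + vec(c). A hedged illustration under those
# assumptions, not the actual util implementation; analogy_sketch is a
# hypothetical name.
import numpy as np


def analogy_sketch(a, b, c, word_to_id, id_to_word, word_vecs, top=5):
    for word in (a, b, c):
        if word not in word_to_id:
            print('%s is not found' % word)
            return
    # Target vector for the fourth term of the proportion a : b = c : ?.
    query_vec = (word_vecs[word_to_id[b]] - word_vecs[word_to_id[a]]
                 + word_vecs[word_to_id[c]])
    query_vec = query_vec / (np.linalg.norm(query_vec) + 1e-8)
    # Cosine similarity of every row of word_vecs against the target vector.
    sims = word_vecs @ query_vec / (np.linalg.norm(word_vecs, axis=1) + 1e-8)
    for i in (-sims).argsort()[:top]:
        print(' %s: %s' % (id_to_word[i], sims[i]))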
# Load SVD word vectors from disk and print nearest neighbors for every word
# in the vocabulary. The id2word file is assumed to hold one "<id>\t<word>"
# pair per line; most_similar is assumed to come from util as elsewhere in
# this repo.
import numpy as np

from util import most_similar


def main(args):
    """Display the top-k most similar words for each vocabulary word."""
    print(args)
    word_vec_svd = np.load(args.model_path)
    with open(args.id2word_path) as f:
        pairs = f.readlines()

    word_to_id = {}
    id_to_word = {}
    for p in pairs:
        idx, word = p.rstrip("\n").split("\t")
        idx = int(idx)
        id_to_word[idx] = word
        word_to_id[word] = idx

    for word in word_to_id:
        most_similar(word, word_to_id, id_to_word, word_vec_svd)
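# For reference, a hedged sketch of the most_similar helper used above: rank
# every vocabulary word by cosine similarity to the query vector and print the
# top matches. The real util version may differ in output formatting;
# most_similar_sketch is a hypothetical name.
import numpy as np


def most_similar_sketch(query, word_to_id, id_to_word, word_vecs, top=5):
    if query not in word_to_id:
        print('%s is not found' % query)
        return
    query_vec = word_vecs[word_to_id[query]]
    sims = word_vecs @ query_vec / (
        np.linalg.norm(word_vecs, axis=1) * np.linalg.norm(query_vec) + 1e-8)
    print('\n[query] ' + query)
    count = 0
    for i in (-sims).argsort():
        if id_to_word[i] == query:
            continue  # skip the query word itself
        print(' %s: %s' % (id_to_word[i], sims[i]))
        count += 1
        if count >= top:
            break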
# Count-based pipeline on the PTB corpus: co-occurrence counts -> PPMI ->
# truncated SVD -> nearest-neighbor queries. Imports assume the helpers live
# in util and the dataset loader in ptb, as elsewhere in this repo.
from sklearn.utils.extmath import randomized_svd

from util import create_co_matrix, ppmi, most_similar
import ptb


def main():
    window_size = 2
    wordvec_size = 100

    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size = len(word_to_id)

    print('counting co-occurrence...')
    co_matrix = create_co_matrix(corpus, vocab_size, window_size)
    print('calculating PPMI...')
    W = ppmi(co_matrix, verbose=True)

    print('calculating SVD...')
    U, S, V = randomized_svd(W, n_components=wordvec_size, n_iter=5,
                             random_state=42)
    word_vecs = U[:, :wordvec_size]

    queries = ['you', 'year', 'car', 'toyota']
    for query in queries:
        most_similar(query, word_to_id, id_to_word, word_vecs, top=5)
    print('DONE')


if __name__ == '__main__':
    main()
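# For reference, a hedged sketch of the PPMI transform assumed above:
# PPMI(i, j) = max(0, log2(C[i, j] * N / (S[i] * S[j]))), where N is the total
# co-occurrence count and S the per-word marginal counts. util.ppmi may differ
# in its eps handling and progress reporting; ppmi_sketch is a hypothetical
# name.
import numpy as np


def ppmi_sketch(C, eps=1e-8):
    M = np.zeros_like(C, dtype=np.float32)
    N = np.sum(C)          # total number of co-occurrences
    S = np.sum(C, axis=0)  # marginal count per word
    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            pmi = np.log2(C[i, j] * N / (S[i] * S[j]) + eps)
            M[i, j] = max(0, pmi)  # clamp negative PMI to zero
    return M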
# %%
import numpy as np

from util import create_co_matrix, preprocess, cos_similarity

text = "You say goodbye and I say hello."
corpus, word_to_id, id_to_word = preprocess(text)
C = create_co_matrix(corpus, len(word_to_id))

c0 = C[word_to_id['you']]
c2 = C[word_to_id['i']]
print(cos_similarity(c0, c2))

# %%
from util import most_similar

most_similar('hello', word_to_id, id_to_word, C)

# %%
# Convert the co-occurrence matrix into a PPMI matrix.
import numpy as np

from util import create_co_matrix, preprocess, cos_similarity, ppmi

text = "You say goodbye and I say hello."
corpus, word_to_id, id_to_word = preprocess(text)
C = create_co_matrix(corpus, len(word_to_id))
W = ppmi(C, True)

np.set_printoptions(precision=3)
print('co-occurrence matrix')
print(C)
print('-' * 50)
print('PPMI')
print(W)

# %%
# Dimensionality reduction with SVD (p. 84)
import numpy as np
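# %%
# For reference, a hedged sketch of the cos_similarity helper imported above;
# the eps term guards against division by zero for all-zero rows. The actual
# util implementation may differ slightly; cos_similarity_sketch is a
# hypothetical name.
import numpy as np


def cos_similarity_sketch(x, y, eps=1e-8):
    nx = x / (np.sqrt(np.sum(x ** 2)) + eps)  # normalize x
    ny = y / (np.sqrt(np.sum(y ** 2)) + eps)  # normalize y
    return np.dot(nx, ny)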
# Evaluate trained skip-gram (or CBOW) vectors: nearest neighbors,
# vector-distance analogies, and text-based analogies.
import pickle

from util import most_similar, analogy, analogy_text

if __name__ == '__main__':
    # fname = 'cbow_params.pkl'
    fname = 'skip_gram_params.pkl'
    with open(fname, 'rb') as f:
        params = pickle.load(f)
    word_vecs = params['word_vecs']
    word_to_id = params['word_to_id']
    id_to_word = params['id_to_word']

    # most similar
    queries = ['you', 'year', 'car', 'toyota', 'cat', 'music']
    for query in queries:
        most_similar(query, word_to_id, id_to_word, word_vecs, top=5)

    # analogy
    print('\n-' + ' (distance) ' + '-' * 46)
    analogy('king', 'man', 'queen', word_to_id, id_to_word, word_vecs, top=30)
    analogy('take', 'took', 'go', word_to_id, id_to_word, word_vecs)
    analogy('car', 'cars', 'child', word_to_id, id_to_word, word_vecs, top=10)
    analogy('good', 'better', 'bad', word_to_id, id_to_word, word_vecs, top=30)

    print('\n-' + ' (text) ' + '-' * 50)
    analogy_text('king', 'man', 'queen', word_to_id, id_to_word, word_vecs, top=30)
word1 = "猫" word2 = "ライオン" word3 = "犬" emb1 = embedding[v_to_i[word1]] emb2 = embedding[v_to_i[word2]] emb3 = embedding[v_to_i[word3]] cos_sim_1_2 = U.cos_similarity(emb1, emb2) cos_sim_1_3 = U.cos_similarity(emb1, emb3) cos_sim_2_3 = U.cos_similarity(emb2, emb3) print("===== cosine similarity =====") print("{} : {} = {}".format(word1, word2, cos_sim_1_2)) print("{} : {} = {}".format(word1, word3, cos_sim_1_3)) print("{} : {} = {}".format(word2, word3, cos_sim_2_3)) ### 類似単語トップ5 print("\n===== most similar =====") query = [] query.append('猫') print(U.most_similar(query[0], v_to_i, i_to_v, embedding, top=5)) ### 類推 print("\n===== analogy =====") U.analogy('日本', '東京', 'アメリカ', v_to_i, i_to_v, embedding) U.analogy('王', '男', '女王', v_to_i, i_to_v, embedding)
# Build and evaluate SVD word vectors from a co-occurrence matrix, comparing
# two PPMI implementations (the textbook's ppmi_text vs. the author's own
# ppmi). Assumes corpus, word_to_id, id_to_word, window_size, and vec_size
# were defined earlier in the script.
print('vocab size: {0}'.format(len(word_to_id)))
print('corpus size: {0}'.format(len(corpus)))

# Co-occurrence matrix
print('counting co-occurrence..')
c = create_co_matrix(corpus, vocab_size=len(word_to_id), window_size=window_size)

# PPMI
print('calculating ppmi (text) ..')
m_t = ppmi_text(c, verbose=True)
print('calculating ppmi (self) ..')
m = ppmi(c)

# Dimensionality reduction via SVD
print('calculating svd..')
U, S, V = randomized_svd(m, n_components=vec_size)
U_t, S_t, V_t = randomized_svd(m_t, n_components=vec_size)

# Evaluation
queries = ['you', 'year', 'car', 'toyota']
for q in queries:
    print('SVD (self ppmi)')
    most_similar(q, word_to_id, id_to_word, U)
    print('SVD (text ppmi)')
    most_similar(q, word_to_id, id_to_word, U_t)