def test_cos_similarity(self):
        """Check cos_similarity on co-occurrence vectors of a toy sentence.

        Builds the co-occurrence matrix of a one-sentence corpus, takes the
        rows for "you" and "i" as word vectors, and compares the similarity
        of ("you", "you") and ("you", "i") against reference values.
        """
        text = 'you say goodbye and I say hello.'
        corpus, w2id, id2w = preprocess(text)

        vocab_size = len(w2id)

        C = create_co_matrix(corpus, vocab_size)

        c0 = C[w2id['you']]
        c1 = C[w2id['i']]

        # Reference values sit slightly below 1 and 1/sqrt(2) -- presumably
        # because cos_similarity adds a small epsilon to the norms; confirm
        # against its implementation.
        expected_c0 = 0.9999999800000005
        expected_c1 = 0.7071067691154799

        # Floating-point results can differ in the last bits across
        # platforms and library versions, so compare approximately instead
        # of demanding bit-for-bit equality.
        self.assertAlmostEqual(cos_similarity(c0, c0), expected_c0)
        self.assertAlmostEqual(cos_similarity(c0, c1), expected_c1)
示例#2
0
def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    '''Print the words most similar to ``query``.

    :param query: query word (text)
    :param word_to_id: dictionary mapping each word to its word ID
    :param id_to_word: dictionary mapping each word ID to its word
    :param word_matrix: matrix of word vectors; row ``i`` is assumed to hold
        the vector of the word whose ID is ``i``
    :param top: how many of the top-ranked words to print
    '''
    if query not in word_to_id:
        print('%s is not found' % query)
        return

    print('\n[query] ' + query)
    query_vec = word_matrix[word_to_id[query]]

    # Score every vocabulary entry against the query vector.
    vocab_size = len(id_to_word)
    scores = np.zeros(vocab_size)
    for word_id in range(vocab_size):
        scores[word_id] = cos_similarity(word_matrix[word_id], query_vec)

    # argsort orders ascending, so sorting the negated scores walks the
    # vocabulary from most similar to least similar.
    shown = 0
    for word_id in np.argsort(-scores):
        word = id_to_word[word_id]
        if word == query:
            # The query itself is not reported.
            continue
        print(' %s: %s' % (word, scores[word_id]))
        shown += 1
        if shown >= top:
            return
    def test_cos_similarity(self):
        """Check cos_similarity on an all-zero pair and a parallel pair."""
        # Each case is (x, y, expected similarity).
        cases = [
            (np.array([0, 0]), np.array([0, 0]), 0),
            (np.array([3, 3, 3, 3]), np.array([4, 4, 4, 4]), 1),
        ]

        for x, y, expected in cases:
            actual = cos_similarity(x, y)
            self.assertAlmostEqual(actual, expected)
示例#4
0
def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    """Print the ``top`` words whose vectors are closest to ``query``.

    Closeness is measured with ``cos_similarity`` between rows of
    ``word_matrix``. If ``query`` is not a key of ``word_to_id`` a
    "not found" message is printed instead. Always returns ``None``.
    """
    if query not in word_to_id:
        print('%s is not found' % query)
        return

    print('\n[query] ' + query)
    query_id = word_to_id[query]
    query_vec = word_matrix[query_id]

    # One similarity score per word ID.
    vocab_size = len(id_to_word)
    similarity = np.zeros(vocab_size)
    for row in range(vocab_size):
        similarity[row] = cos_similarity(word_matrix[row], query_vec)

    # Negating before argsort yields IDs in descending similarity order.
    descending = (-1 * similarity).argsort()
    # Lazily drop the query word itself from the ranking.
    candidates = (i for i in descending if id_to_word[i] != query)
    for rank, i in enumerate(candidates, start=1):
        print(' %s: %s' % (id_to_word[i], similarity[i]))
        if rank >= top:
            return
示例#5
0
文件: similarity.py 项目: d86518/NLP
# coding: utf-8
import sys
sys.path.append('..')
from common.util import preprocess, create_co_matrix, cos_similarity


# Build the co-occurrence matrix for a one-sentence toy corpus.
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
C = create_co_matrix(corpus, len(word_to_id))

# The rows of C for "you" and "i" serve as their word vectors.
you_id, i_id = word_to_id['you'], word_to_id['i']
c0, c1 = C[you_id], C[i_id]
similarity = cos_similarity(c0, c1)
print(similarity)
示例#6
0
# Show the row of C for word ID 0, then the row for the word "goodbye".
first_row = C[0]
print(first_row)
goodbye_id = word_to_id["goodbye"]
print(C[goodbye_id])

#%% [markdown]
# ### ベクトル間の類似度
# 共起行列を用いてベクトル間の類似度を計測する。
# ベクトルの内積やユークリッド距離などがあるが、単語のベクトル表現の類似度には
# コサイン類似度(cosine similarity)がよく用いられる。
# $$
# similarity(x, y) = \frac{x \cdot y}{\|x\| \|y\|} = \frac{x_1 y_1 + \cdots + x_n y_n}{\sqrt{x_1^2 + \cdots + x_n^2} \sqrt{y_1^2 + \cdots + y_n^2}}
# $$

#%%
from common import util

# NOTE(review): `preprocess` is called unqualified although this cell only
# imports `util` -- presumably it is defined or imported in an earlier cell;
# confirm.
text = "You say goodbye and I say hello."
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = util.create_co_matrix(corpus, vocab_size)

# Word vectors for "you" and "i", taken as rows of the co-occurrence matrix.
c0, c1 = (C[word_to_id[word]] for word in ("you", "i"))
print(c0, c1)
similarity = util.cos_similarity(c0, c1)
print(similarity)

#%%
import numpy as np

# argsort gives the indices that would sort the array in ascending order.
x = np.array([100, -20, 2])
ascending_order = np.argsort(x)
print(ascending_order)
示例#7
0
# coding: utf-8
import sys
sys.path.append(r'C:\Users\pc\Desktop\고영국\개발\AI\DeepLearning\Scratch2')
from common.util import preprocess, create_co_matrix, cos_similarity

# NOTE(review): 'goodbey' looks like a typo for 'goodbye'; it is runtime data,
# so it is left unchanged here.
text = 'You say goodbey and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)

# Word vectors for "you" and "i": their rows in the co-occurrence matrix.
you_id = word_to_id['you']
c0 = C[you_id]
i_id = word_to_id['i']
c1 = C[i_id]

# Prints 0.70...; the closer the value is to 1, the more similar the words.
similarity = cos_similarity(c0, c1)
print(similarity)
示例#8
0
# -*- coding: utf-8 -*-
"""
Created on Sat Jan 23 00:36:44 2021

@author: ghqls
"""

import sys
sys.path.append('..')
from common.util import preprocess, create_co_matrix, cos_similarity

# Toy corpus: a single sentence turned into a co-occurrence matrix.
text = "You say goodbye and I say hello."
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)

# Rows of C for "you" and "i" are used as the two word vectors.
you_id = word_to_id['you']
C0 = C[you_id]
i_id = word_to_id['i']
C1 = C[i_id]

result = cos_similarity(C0, C1)
print(result)
# Size the co-occurrence matrix by the number of distinct words.
vocabulary_size = len(word_to_id)
C = create_co_matrix(corpus=corpus, vocabulary_size=vocabulary_size)

print(vocabulary_size)
print(C)

# Fetch every word vector (a row of C) up front, before any similarity is
# printed.
vec_you = C[word_to_id['you']]
others = [(word, C[word_to_id[word]])
          for word in ('i', 'hello', 'say', 'goodbye', 'and')]

# Compare "you" against each of the other words in turn.
for word, vec in others:
    print('you, ' + word)
    print(cos_similarity(vec_you, vec))

most_similar('you', word_to_id, id_to_word, C, top=5)

# NOTE(review): W is not used in the visible part of this script.
W = ppmi(C, verbose=True)
np.set_printoptions(precision=3)
# NOTE(review): C comes from create_co_matrix, so this label presumably means
# "co-occurrence matrix"; the printed string is left as-is.
print('covariance matrix')
print(C)
示例#10
0
        for i in range(1, window_size + 1):
            left_idx = idx - i
            right_idx = idx + i

            if left_idx >= 0:
                left_word_id = corpus[left_idx]
                matrix[word_id, left_word_id] += 1
            if right_idx < corpus_size:
                right_word_id = corpus[right_idx]
                matrix[word_id, right_word_id] += 1
    return matrix


if __name__ == '__main__':
    # Demo: build the co-occurrence matrix of a one-sentence corpus.
    sentence = "You say goodbye and I say hello."
    corpus, word_to_id, id_to_word = preprocess(sentence)
    print("corpus:", corpus)
    vocab_size = len(word_to_id)
    co_matrix = create_co_occurence_matrix(corpus, vocab_size)
    print(co_matrix)

    # Try out cosine similarity on a few word pairs.
    print('cosine similarities...')
    you_vec = co_matrix[word_to_id['you']]
    i_vec = co_matrix[word_to_id['i']]
    sim_you_i = cos_similarity(you_vec, i_vec)
    print('you and i:', sim_you_i)

    hello_vec = co_matrix[word_to_id['hello']]
    # This equals the "you and i" value because the corpus is too small.
    sim_you_hello = cos_similarity(you_vec, hello_vec)
    print('you and hello:', sim_you_hello)
    sim_i_hello = cos_similarity(i_vec, hello_vec)
    print('i and hello:', sim_i_hello)