예제 #1
0
import sys
sys.path.append('..')
import numpy as np
from preprocess import most_similar, create_co_matrix, ppmi
from dataset import ptb

window_size = 2
wordvec_size = 100

corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)
print('counting co-occurrence ...')
C = create_co_matrix(corpus, vocab_size, window_size)
W = ppmi(C, verbose=True)

print('calculating SVD ...')
try:
    from sklean.utils.extmath import randomized_svd
    U, S, V = randomized_svd(W,
                             n_components=wordvec_size,
                             n_iter=5,
                             random_state=None)

except ImportError:
    U, S, V = np.linalg.svd(W)

    word_vecs = U[:, :wordvec_size]

    querys = ['you', 'year', 'car', 'toyota']
    for query in querys:
        most_similar(query, word_to_id, id_to_word, word_vecs, top=5)
예제 #2
0
import sys
sys.path.append('..')
import numpy as np
from preprocess import preprocess, create_co_matrix, cos_similarity, ppmi

text = "Tomoki is dog."
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)
W = ppmi(C)

np.set_printoptions(precision=3)
print('covariance matrix')
print(C)
print('-' * 50)
print('PPMI')
print(W)