Example #1
import sys
sys.path.append('..')
import numpy as np
from common.util import preprocess, create_co_matrix, ppmi


def main():
    text = 'You say goodbye and I say hello.'
    corpus, word_to_id, id_to_word = preprocess(text)
    vocab_size = len(word_to_id)
    C = create_co_matrix(corpus, vocab_size)
    W = ppmi(C)

    np.set_printoptions(precision=3)  # display to three significant digits
    print('co-occurrence matrix')
    print(C)
    print('-'*50)
    print('PPMI')
    print(W)


if __name__ == '__main__':
    main()
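For reference, ppmi computes M[i, j] = max(0, log2(C[i, j] * N / (S[i] * S[j]))), where N is the total number of co-occurrences and S holds the per-word marginal counts. A minimal vectorized sketch of the same idea, assuming only NumPy (ppmi_dense is a hypothetical name, not part of common.util):

import numpy as np

def ppmi_dense(C, eps=1e-8):
    # Hypothetical vectorized PPMI sketch; not the book's loop-based version.
    N = np.sum(C)             # total co-occurrence count
    S = np.sum(C, axis=0)     # marginal count of each word
    M = np.log2(C * N / np.outer(S, S) + eps)
    return np.maximum(M, 0).astype(np.float32)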
Example #2
import sys
sys.path.append('..')
import unittest
import numpy as np
from common.util import preprocess, create_co_matrix, ppmi


# NOTE: the enclosing test class is reconstructed; the snippet showed only the method.
class PPMITest(unittest.TestCase):
    def test_ppmi(self):
        text = 'you say goodbye and I say hello.'
        corpus, w2id, id2w = preprocess(text)
        vocab_size = len(w2id)
        C = create_co_matrix(corpus, vocab_size)
        W = ppmi(C)

        W = np.around(W, 3)

        expected = np.array([[0., 1.807, 0., 0., 0., 0., 0.],
                             [1.807, 0., 0.807, 0., 0.807, 0.807, 0.],
                             [0., 0.807, 0., 1.807, 0., 0., 0.],
                             [0., 0., 1.807, 0., 1.807, 0., 0.],
                             [0., 0.807, 0., 1.807, 0., 0., 0.],
                             [0., 0.807, 0., 0., 0., 0., 2.807],
                             [0., 0., 0., 0., 0., 2.807, 0.]])

        np.testing.assert_array_almost_equal(W, expected)
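The expected values can be checked by hand: with window_size=1 this corpus produces N = 14 total co-occurrences and marginal counts S = [1, 4, 2, 2, 2, 2, 1] for (you, say, goodbye, and, i, hello, .), so for example:

import numpy as np

print(np.log2(1 * 14 / (1 * 4)))  # PPMI('you', 'say')     -> 1.807
print(np.log2(1 * 14 / (4 * 2)))  # PPMI('say', 'goodbye') -> 0.807
print(np.log2(1 * 14 / (2 * 1)))  # PPMI('hello', '.')     -> 2.807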
Example #3
import sys
sys.path.append('..')
import numpy as np
import matplotlib.pyplot as plt
from common.util import preprocess, create_co_matrix, ppmi


def main():
    text = 'You say goodbye and I say hello.'
    corpus, word_to_id, id_to_word = preprocess(text)
    vocab_size = len(word_to_id)
    C = create_co_matrix(corpus, vocab_size, window_size=1)
    W = ppmi(C)

    # SVD
    U, S, V = np.linalg.svd(W)

    print(C[0])  # co-occurrence matrix
    print(W[0])  # PPMI matrix
    print(U[0])  # SVD

    print(U[0, :2])

    for word, word_id in word_to_id.items():
        plt.annotate(word, (U[word_id, 0], U[word_id, 1]))

    plt.scatter(U[:, 0], U[:, 1], alpha=0.5)
    plt.show()


if __name__ == '__main__':
    main()
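Because W is the full vocab_size x vocab_size PPMI matrix, the SVD above is exact, and the plot uses the first two columns of U as 2-D word coordinates. A small sanity check on the decomposition, assuming the same W (check_svd is a hypothetical helper, not from common.util):

import numpy as np

def check_svd(W):
    # np.linalg.svd returns V already transposed (rows = right singular vectors),
    # so U @ diag(S) @ V should reconstruct W up to floating-point error.
    U, S, V = np.linalg.svd(W)
    np.testing.assert_allclose(U @ np.diag(S) @ V, W, atol=1e-5)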
Example #4
import sys

sys.path.append('..')
import numpy as np
from common.util import preprocess, create_co_matrix, cos_similarity, ppmi

text = "You say goodbye and I say hello."
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)
W = ppmi(C)

np.set_printoptions(precision=3)  # display to three significant digits
print('co-occurrence matrix')
print(C)
print('-' * 50)
print('PPMI')
print(W)
Example #5
# coding: utf-8
import sys
sys.path.append('/home/hiromasa/deep-learning-from-scratch-2')
import numpy as np
from common.util import most_similar, create_co_matrix, ppmi
from dataset import ptb

window_size = 2
wordvec_size = 100

corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)
print('counting co-occurrence ...')
C = create_co_matrix(corpus, vocab_size, window_size)
print('calculating PPMI ...')
W = ppmi(C, verbose=True)

print('calculating SVD ...')
try:
    # truncated SVD (fast!)
    from sklearn.utils.extmath import randomized_svd
    U, S, V = randomized_svd(W,
                             n_components=wordvec_size,
                             n_iter=5,
                             random_state=None)
except ImportError:
    # SVD (slow)
    U, S, V = np.linalg.svd(W)

word_vecs = U[:, :wordvec_size]
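most_similar is imported above but the snippet stops after slicing word_vecs; in the book's count_method_big.py the vectors are then queried along these lines:

queries = ['you', 'year', 'car', 'toyota']
for query in queries:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)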
Example #6
import sys
sys.path.append('..')
import numpy as np
from common.util import preprocess, create_co_matrix


# NOTE: the snippet began mid-function; the signature and setup below are
# reconstructed from common.util.ppmi.
def ppmi(C, verbose=False, eps=1e-8):
    M = np.zeros_like(C, dtype=np.float32)  # output PPMI matrix
    N = np.sum(C)                           # total co-occurrence count
    S = np.sum(C, axis=0)                   # marginal count per word
    total = C.shape[0] * C.shape[1]
    cnt = 0

    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            pmi = np.log2(C[i, j] * N / (S[j] * S[i]) + eps)
            M[i, j] = max(0, pmi)

            if verbose:
                cnt += 1
                if cnt % (total // 100 + 1) == 0:  # +1 avoids modulo-by-zero on small matrices
                    print('%.1f%% done' % (100 * cnt / total))
    return M
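The eps inside the log keeps log2 finite for word pairs that never co-occur (C[i, j] = 0 would give log2 of zero), and max(0, pmi) then clips the resulting large negative values to zero; that clipping is what makes this the positive PMI.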


text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)
W = ppmi(C)

np.set_printoptions(precision=3)  # display to three significant digits
print('co-occurrence matrix')
print(C)
# [[0 1 0 0 0 0 0]
#  [1 0 1 0 1 1 0]
#  [0 1 0 1 0 0 0]
#  [0 0 1 0 1 0 0]
#  [0 1 0 1 0 0 0]
#  [0 1 0 0 0 0 1]
#  [0 0 0 0 0 1 0]]
print('-' * 50)
print('PPMI')
print(W)
# [[0.    1.807 0.    0.    0.    0.    0.   ]