예제 #1
0
def test_word2vec_n_closest():
    """Check that ``n_closest`` with the cosine metric yields a non-empty result.

    Builds a ``malaya.Word2Vec`` from the ``'nce_weights'`` and
    ``'dictionary'`` entries returned by ``malaya.malaya_word2vec(256)`` and
    queries the 8 closest entries for the word ``'anwar'``.
    """
    payload = malaya.malaya_word2vec(256)
    model = malaya.Word2Vec(payload['nce_weights'], payload['dictionary'])
    query = 'anwar'
    neighbours = model.n_closest(word=query, num_closest=8, metric='cosine')
    assert len(neighbours) > 0
예제 #2
0
def test_word2vec_n_closest_without_similarity():
    """Check ``n_closest`` still yields a non-empty result with ``return_similarity=False``.

    Uses the same setup as the plain ``n_closest`` test: a ``malaya.Word2Vec``
    built from ``malaya.malaya_word2vec(256)``, queried for ``'anwar'`` with
    the cosine metric and ``num_closest=8``.
    """
    payload = malaya.malaya_word2vec(256)
    model = malaya.Word2Vec(payload['nce_weights'], payload['dictionary'])
    query = 'anwar'
    neighbours = model.n_closest(
        word=query,
        num_closest=8,
        metric='cosine',
        return_similarity=False,
    )
    assert len(neighbours) > 0
예제 #3
0
def test_word2vec_tsne():
    """Check that ``project_2d(0, 100)`` returns an array with two columns.

    The model is built from ``malaya.malaya_word2vec(32)``. ``project_2d``
    is unpacked into two values; only the first is inspected here.
    """
    payload = malaya.malaya_word2vec(32)
    model = malaya.Word2Vec(payload['nce_weights'], payload['dictionary'])
    projection, _words = model.project_2d(0, 100)
    # Second axis of the projection must have length 2.
    assert projection.shape[1] == 2
예제 #4
0
def test_word2vec_analogy():
    """Check that ``analogy`` returns exactly as many items as requested (5)."""
    payload = malaya.malaya_word2vec(256)
    model = malaya.Word2Vec(payload['nce_weights'], payload['dictionary'])
    answers = model.analogy('anwar', 'penjara', 'kerajaan', 5)
    assert len(answers) == 5
예제 #5
0
def word_count(str):
    """Count how often each whitespace-separated word occurs in a string.

    Args:
        str: Text to tokenize with its ``split()`` method (splits on runs of
            whitespace). The parameter name shadows the builtin ``str``; it
            is kept unchanged so existing keyword callers keep working.

    Returns:
        A plain ``dict`` mapping each word to its number of occurrences, with
        keys in first-seen order. An empty or whitespace-only input yields
        an empty dict.
    """
    # Function-scope import so the block does not depend on file-level imports.
    from collections import Counter

    # Counter does the tallying; convert back to dict to keep the original
    # return type.
    return dict(Counter(str.split()))


''' TEST DIFFERENT EMBEDDING PERFORMANCE ON DIFFERENT DATASET '''

# Fetch the word2vec payload from malaya. 256 is presumably the embedding
# size -- TODO confirm against the malaya API.
embedded = malaya.malaya_word2vec(256)

# Sanity output: number of dictionary entries and the shape of the weights.
print(len(embedded['dictionary']), embedded['nce_weights'].shape)
word_vector = malaya.Word2Vec(embedded['nce_weights'], embedded['dictionary'])
''' FEATURE SELECTION '''

# NOTE(review): TfidfVectorizer, chi2, plt, train_x and train_y are not
# defined in the visible part of this file; presumably scikit-learn /
# matplotlib imports and a previously loaded dataset -- confirm.
tvec = TfidfVectorizer(max_features=100000, ngram_range=(1, 3))
x_train_tfidf = tvec.fit_transform(train_x)
# Keep only the first element of chi2's return value (presumably the
# per-feature chi-squared statistics -- verify against the chi2 docs).
chi2score = chi2(x_train_tfidf, train_y)[0]

plt.figure(figsize=(15, 10))
# Pair every feature name with its score.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); confirm which version this script targets.
wscores = zip(tvec.get_feature_names(), chi2score)
# Ascending sort by score, so the highest-scoring features end up last.
wchi2 = sorted(wscores, key=lambda x: x[1])
# Take the 20 highest-scoring pairs and transpose them into
# (names, scores) tuples.
topchi2 = list(zip(*wchi2[-20:]))
# One position per selected score; labels are the matching feature names.
x = range(len(topchi2[1]))
labels = topchi2[0]
예제 #6
0
def test_word2vec_calculator_n_closest_without_similarity():
    """Check ``calculator`` gives a non-empty result with ``return_similarity=False``.

    The expression string passed is ``'anwar + mahathir'``, with
    ``num_closest=8`` and the cosine metric.
    """
    payload = malaya.malaya_word2vec(256)
    model = malaya.Word2Vec(payload['nce_weights'], payload['dictionary'])
    outcome = model.calculator(
        'anwar + mahathir',
        num_closest=8,
        metric='cosine',
        return_similarity=False,
    )
    assert len(outcome) > 0
예제 #7
0
def test_word2vec_calculator_bracket():
    """Check ``calculator`` gives a non-empty result for a bracketed expression.

    The expression string passed is ``'(anwar+hadi) * mahathir'``, with
    ``num_closest=8`` and the cosine metric.
    """
    payload = malaya.malaya_word2vec(256)
    model = malaya.Word2Vec(payload['nce_weights'], payload['dictionary'])
    outcome = model.calculator(
        '(anwar+hadi) * mahathir',
        num_closest=8,
        metric='cosine',
    )
    assert len(outcome) > 0