Пример #1
0
    def __init__(self, filename='vectors.bin', font_path='./ipag.ttc',
                 font_size=20):
        """Load word vectors and prepare the text style used for labels.

        Args:
            filename: Path handed to ``w2v.load``; the loaded object is kept
                in ``self.data``.
            font_path: Font file used to build the matplotlib
                ``FontProperties`` stored in ``self.TEXT_KW``.
            font_size: Value stored under ``fontsize`` in ``self.TEXT_KW``.
        """
        font = matplotlib.font_manager.FontProperties(fname=font_path)
        # Keyword arguments bundled once; presumably unpacked into matplotlib
        # text calls elsewhere in the class -- confirm against the callers.
        self.TEXT_KW = dict(fontsize=font_size,
                            fontweight='bold',
                            fontproperties=font)

        # Single-argument parenthesised print: identical output on Python 2
        # and valid syntax on Python 3.
        print('loading')
        self.data = w2v.load(filename)
        print('loaded')
Пример #2
0
"""
Given a word, visualize the words nearest to it.
"""
import word2vec_boostpython as w2v
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import matplotlib.font_manager

# Text style for labels: bold, FONT_SIZE points, using the font file
# ./ipag.ttc from the current working directory.
font = matplotlib.font_manager.FontProperties(fname='./ipag.ttc')
FONT_SIZE = 10
TEXT_KW = dict(fontsize=FONT_SIZE, fontweight='bold', fontproperties=font)

filename = 'word2vec/jawiki.bin'
#filename = 'word2vec/orj.bin'
# Single-argument parenthesised print behaves the same on Python 2 and is
# valid syntax on Python 3.
print('loading')
data = w2v.load(filename)
print('loaded')
nbest = 15

# Typing "nbest=<int>" at the prompt changes how many search hits are kept.
NBEST_PREFIX = 'nbest='

try:
    read_query = raw_input  # Python 2
except NameError:
    read_query = input  # Python 3: input() returns the line unevaluated

while True:
    query = read_query('query: ')
    if query.startswith(NBEST_PREFIX):
        value = query[len(NBEST_PREFIX):]
        try:
            nbest = int(value)
        except ValueError:
            # A typo must not kill the session and force a model reload;
            # keep the previous nbest and prompt again.
            print('invalid nbest value: ' + value)
        continue
    if ', ' not in query:
        # Single word: the word itself followed by its top search hits.
        words = [query] + w2v.search(data, query)[:nbest]
    else:
        # Explicit list of words separated by ", ".
        words = query.split(', ')
    print(', '.join(words))
    mat = w2v.get_vectors(data)
    word_indexes = [w2v.get_word_index(data, w) for w in words]
Пример #3
0
"""
Given a word, visualize the words nearest to it.
"""
import word2vec_boostpython as w2v
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import matplotlib.font_manager

# Text style for labels: bold, FONT_SIZE points, using the font file
# ./ipag.ttc from the current working directory.
font = matplotlib.font_manager.FontProperties(fname="./ipag.ttc")
FONT_SIZE = 10
TEXT_KW = dict(fontsize=FONT_SIZE, fontweight="bold", fontproperties=font)

filename = "word2vec/jawiki.bin"
# filename = 'word2vec/orj.bin'
# Single-argument parenthesised print behaves the same on Python 2 and is
# valid syntax on Python 3.
print("loading")
data = w2v.load(filename)
print("loaded")
nbest = 15

# Typing "nbest=<int>" at the prompt changes how many search hits are kept.
NBEST_PREFIX = "nbest="

try:
    read_query = raw_input  # Python 2
except NameError:
    read_query = input  # Python 3: input() returns the line unevaluated

while True:
    query = read_query("query: ")
    if query.startswith(NBEST_PREFIX):
        value = query[len(NBEST_PREFIX):]
        try:
            nbest = int(value)
        except ValueError:
            # A typo must not kill the session and force a model reload;
            # keep the previous nbest and prompt again.
            print("invalid nbest value: " + value)
        continue
    if ", " not in query:
        # Single word: the word itself followed by its top search hits.
        words = [query] + w2v.search(data, query)[:nbest]
    else:
        # Explicit list of words separated by ", ".
        words = query.split(", ")
    print(", ".join(words))
    mat = w2v.get_vectors(data)
    word_indexes = [w2v.get_word_index(data, w) for w in words]
Пример #4
0
     [0.1442144811153412, -0.274962455034256, -1.260462999343872,
      -0.42235079407691956, 0.15925364196300507, -0.6748839616775513,
      -1.4316737651824951, 0.473715215921402, 0.5771697759628296,
      0.05916936323046684])

# Vector returned by w2v.get_invector(1); the last checks in this section
# compare it with what the saved file yields for index 1.
invec = w2v.get_invector(1)
test(invec, [
    -0.21694795787334442, 0.38501447439193726, 1.080917239189148,
    0.25622865557670593, -0.22400374710559845, 0.7944273948669434,
    0.8532216548919678, -0.3066456913948059, -0.6160392761230469,
    -0.19778962433338165,
])

num_words = w2v.get_num_words_from_model()

# Both loads below read the same file.
CORPUS_BIN = "tiny_corpus.bin"

# Regular load: word count must match the value reported above.
data = w2v.load(CORPUS_BIN)
test(w2v.get_num_words(data), num_words)

# First five search results for the query "1".
test(w2v.search(data, "1")[:5], ['a', 'A', 'g', 'G', '6'])

# Load via load_without_normalize: index 1 is the word 'd', and its vector,
# fetched by word or by row, is compared with invec.
data2 = w2v.load_without_normalize(CORPUS_BIN)

word_count = w2v.get_num_words(data2)
words = [w2v.get_word(data2, i) for i in range(word_count)]

test(words[1], 'd')
test(w2v.get_vector(data2, 'd'), invec)
test(list(w2v.get_vectors(data2)[1]), invec)