def __init__(self, filename='vectors.bin'):
    """Build the text-style settings and load the vector file.

    filename: path handed to ``w2v.load`` (default ``'vectors.bin'``).

    Sets two attributes:
      self.TEXT_KW -- dict with ``fontsize`` (20), ``fontweight`` ('bold') and
                      ``fontproperties`` (a FontProperties built from
                      ./ipag.ttc).
      self.data    -- whatever ``w2v.load(filename)`` returns.
    """
    font = matplotlib.font_manager.FontProperties(fname='./ipag.ttc')
    font_size = 20
    self.TEXT_KW = dict(
        fontsize=font_size, fontweight='bold', fontproperties=font)

    # Parenthesized single-argument print emits the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('loading')
    self.data = w2v.load(filename)
    print('loaded')
""" given a word and visualize near words """ import word2vec_boostpython as w2v from sklearn.decomposition import PCA import matplotlib.pyplot as plt import matplotlib.font_manager font = matplotlib.font_manager.FontProperties(fname='./ipag.ttc') FONT_SIZE = 10 TEXT_KW = dict(fontsize=FONT_SIZE, fontweight='bold', fontproperties=font) filename = 'word2vec/jawiki.bin' #filename = 'word2vec/orj.bin' print 'loading' data = w2v.load(filename) print 'loaded' nbest = 15 while True: query = raw_input('query: ') if query.startswith('nbest='): nbest = int(query[6:]) continue if ', ' not in query: words = [query] + w2v.search(data, query)[:nbest] else: words = query.split(', ') print ', '.join(words) mat = w2v.get_vectors(data) word_indexes = [w2v.get_word_index(data, w) for w in words]
""" given a word and visualize near words """ import word2vec_boostpython as w2v from sklearn.decomposition import PCA import matplotlib.pyplot as plt import matplotlib.font_manager font = matplotlib.font_manager.FontProperties(fname="./ipag.ttc") FONT_SIZE = 10 TEXT_KW = dict(fontsize=FONT_SIZE, fontweight="bold", fontproperties=font) filename = "word2vec/jawiki.bin" # filename = 'word2vec/orj.bin' print "loading" data = w2v.load(filename) print "loaded" nbest = 15 while True: query = raw_input("query: ") if query.startswith("nbest="): nbest = int(query[6:]) continue if ", " not in query: words = [query] + w2v.search(data, query)[:nbest] else: words = query.split(", ") print ", ".join(words) mat = w2v.get_vectors(data) word_indexes = [w2v.get_word_index(data, w) for w in words]
[0.1442144811153412, -0.274962455034256, -1.260462999343872, -0.42235079407691956, 0.15925364196300507, -0.6748839616775513, -1.4316737651824951, 0.473715215921402, 0.5771697759628296, 0.05916936323046684])
# The line above closes a call that is opened earlier in the file.
# NOTE(review): `test` is defined elsewhere; it is called as
# test(actual, expected) throughout -- presumably an equality check, confirm.

# Fetch the vector w2v.get_invector returns for index 1 and compare it with
# the recorded values.
invec = w2v.get_invector(1)
test(invec, [-0.21694795787334442, 0.38501447439193726, 1.080917239189148, 0.25622865557670593, -0.22400374710559845, 0.7944273948669434, 0.8532216548919678, -0.3066456913948059, -0.6160392761230469, -0.19778962433338165])

# The word count reported by get_num_words_from_model must match the count
# of the data loaded from tiny_corpus.bin.
num_words = w2v.get_num_words_from_model()
data = w2v.load("tiny_corpus.bin");
test( w2v.get_num_words(data), num_words)
# Expected first five search results for the query "1".
test( w2v.search(data, "1")[:5], ['a', 'A', 'g', 'G', '6'])

# Load the same file through load_without_normalize and check that word 1 is
# 'd' and that its vector equals invec, both via lookup by word and via row 1
# of the full vector table.
data2 = w2v.load_without_normalize("tiny_corpus.bin");
words = [w2v.get_word(data2, i) for i in range(w2v.get_num_words(data2))]
test(words[1], 'd')
test(w2v.get_vector(data2, 'd'), invec)
test(list(w2v.get_vectors(data2)[1]), invec)