def plot(self, query, nbest=15):
    """Scatter-plot the 2-component PCA projection of a set of word vectors.

    query -- either a single word, in which case it is plotted together
             with up to `nbest` results of `w2v.search`, or several words
             separated by ', ', in which case exactly those words are used.
    nbest -- number of search results kept for a single-word query.

    Words missing from the vocabulary (index -1) are reported and left out
    of the plot; if no word is left, 'not in vocabulary' is printed and the
    method returns without drawing anything.
    """
    if ', ' not in query:
        words = [query] + w2v.search(self.data, query)[:nbest]
    else:
        words = query.split(', ')
    print(', '.join(words))

    mat = w2v.get_vectors(self.data)
    word_indexes = [w2v.get_word_index(self.data, w) for w in words]

    # get_word_index yields -1 for an unknown word.  Left in place, -1 would
    # index the LAST row of `mat` and plot an unrelated vector under the
    # unknown word's label, so unknown words are filtered out here.
    known = [(w, idx) for w, idx in zip(words, word_indexes) if idx != -1]
    if not known:
        print('not in vocabulary')
        return
    if len(known) < len(words):
        missing = [w for w, idx in zip(words, word_indexes) if idx == -1]
        print('not in vocabulary: ' + ', '.join(missing))
        words = [w for w, _ in known]
        word_indexes = [idx for _, idx in known]

    # do PCA
    X = mat[word_indexes]
    pca = PCA(n_components=2)
    pca.fit(X)
    print(pca.explained_variance_ratio_)
    X = pca.transform(X)
    xs = X[:, 0]
    ys = X[:, 1]

    # draw
    plt.figure(figsize=(12, 8))
    plt.scatter(xs, ys, marker='o')
    for w, x, y in zip(words, xs, ys):
        # Words are decoded from UTF-8 before being handed to matplotlib;
        # undecodable bytes are dropped rather than raising.
        plt.annotate(
            w.decode('utf-8', 'ignore'),
            xy=(x, y), xytext=(3, 3),
            textcoords='offset points', ha='left', va='top',
            **self.TEXT_KW)
    plt.show()
# Keyword arguments shared by the text annotations drawn on a plot.
TEXT_KW = dict(fontsize=FONT_SIZE, fontweight='bold', fontproperties=font)

# Model file handed to w2v.load ('word2vec/orj.bin' is an alternative).
filename = 'word2vec/jawiki.bin'
print('loading')
data = w2v.load(filename)
print('loaded')

# Number of search results kept for a single-word query.
nbest = 15
while True:
    query = raw_input('query: ')

    # "nbest=N" changes the result count instead of running a query.
    if query.startswith('nbest='):
        try:
            nbest = int(query[6:])
        except ValueError:
            # A malformed number must not terminate the interactive loop.
            print('nbest must be an integer')
        continue

    # A single word is expanded with its search results; a ', '-separated
    # list is used exactly as typed.
    if ', ' not in query:
        words = [query] + w2v.search(data, query)[:nbest]
    else:
        words = query.split(', ')
    print(', '.join(words))

    mat = w2v.get_vectors(data)
    word_indexes = [w2v.get_word_index(data, w) for w in words]

    # get_word_index yields -1 for an unknown word.  Left in place, -1 would
    # index the LAST row of `mat`, so unknown words are filtered out here.
    known = [(w, idx) for w, idx in zip(words, word_indexes) if idx != -1]
    if not known:
        print('not in vocabulary')
        continue
    if len(known) < len(words):
        missing = [w for w, idx in zip(words, word_indexes) if idx == -1]
        print('not in vocabulary: ' + ', '.join(missing))
        words = [w for w, _ in known]
        word_indexes = [idx for _, idx in known]

    # do PCA
    X = mat[word_indexes]
    pca = PCA(n_components=2)
    pca.fit(X)
    print(pca.explained_variance_ratio_)
    X = pca.transform(X)
# Keyword arguments shared by the text annotations drawn on a plot.
TEXT_KW = dict(fontsize=FONT_SIZE, fontweight="bold", fontproperties=font)

# Model file handed to w2v.load ("word2vec/orj.bin" is an alternative).
filename = "word2vec/jawiki.bin"
print("loading")
data = w2v.load(filename)
print("loaded")

# Number of search results kept for a single-word query.
nbest = 15
while True:
    query = raw_input("query: ")

    # "nbest=N" changes the result count instead of running a query.
    if query.startswith("nbest="):
        try:
            nbest = int(query[6:])
        except ValueError:
            # A malformed number must not terminate the interactive loop.
            print("nbest must be an integer")
        continue

    # A single word is expanded with its search results; a ", "-separated
    # list is used exactly as typed.
    if ", " not in query:
        words = [query] + w2v.search(data, query)[:nbest]
    else:
        words = query.split(", ")
    print(", ".join(words))

    mat = w2v.get_vectors(data)
    word_indexes = [w2v.get_word_index(data, w) for w in words]

    # get_word_index yields -1 for an unknown word.  Left in place, -1 would
    # index the LAST row of `mat`, so unknown words are filtered out here.
    known = [(w, idx) for w, idx in zip(words, word_indexes) if idx != -1]
    if not known:
        print("not in vocabulary")
        continue
    if len(known) < len(words):
        missing = [w for w, idx in zip(words, word_indexes) if idx == -1]
        print("not in vocabulary: " + ", ".join(missing))
        words = [w for w, _ in known]
        word_indexes = [idx for _, idx in known]

    # do PCA
    X = mat[word_indexes]
    pca = PCA(n_components=2)
    pca.fit(X)
    print(pca.explained_variance_ratio_)
    X = pca.transform(X)
# Regression checks; `test` and `invec` are defined outside this excerpt
# (presumably test(a, b) compares its two arguments -- confirm).
test(
    invec,
    [
        -0.21694795787334442,
        0.38501447439193726,
        1.080917239189148,
        0.25622865557670593,
        -0.22400374710559845,
        0.7944273948669434,
        0.8532216548919678,
        -0.3066456913948059,
        -0.6160392761230469,
        -0.19778962433338165,
    ]
)

# The word count reported for the model must match the loaded data.
num_words = w2v.get_num_words_from_model()
data = w2v.load("tiny_corpus.bin")
test(w2v.get_num_words(data), num_words)

# First five search results for the word "1".
test(w2v.search(data, "1")[:5], ['a', 'A', 'g', 'G', '6'])

# Same file loaded through load_without_normalize: the word at index 1 is
# 'd', and both vector accessors must return `invec` for it.
data2 = w2v.load_without_normalize("tiny_corpus.bin")
words = [
    w2v.get_word(data2, i)
    for i in range(w2v.get_num_words(data2))
]
test(words[1], 'd')
test(w2v.get_vector(data2, 'd'), invec)
test(list(w2v.get_vectors(data2)[1]), invec)

# 'A' + 'b' - 'a'
test(
    w2v.find_sub(data, ['A', 'b'], ['a'])[:5],
    ['B', '2', 'H', 'h', '8']
)