Example #1
    def plot(self, query, nbest=15):
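        # Project the selected word vectors to 2-D with PCA and plot them with text labels.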
        if ', ' not in query:
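            # Single word: add its nbest nearest neighbours from the model.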
            words = [query] + w2v.search(self.data, query)[:nbest]
        else:
            words = query.split(', ')
            print ', '.join(words)
        mat = w2v.get_vectors(self.data)
        word_indexes = [w2v.get_word_index(self.data, w) for w in words]
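        # get_word_index returns -1 for unknown words; only the single-word case is guarded here.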
        if word_indexes == [-1]:
            print 'not in vocabulary'
            return

        # do PCA
        X = mat[word_indexes]
        pca = PCA(n_components=2)
        pca.fit(X)
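        # Show how much variance the two principal components retain.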
        print pca.explained_variance_ratio_
        X = pca.transform(X)
        xs = X[:, 0]
        ys = X[:, 1]

        # draw
        plt.figure(figsize=(12, 8))
        plt.scatter(xs, ys, marker='o')
        for i, w in enumerate(words):
            plt.annotate(w.decode('utf-8', 'ignore'),
                         xy=(xs[i], ys[i]),
                         xytext=(3, 3),
                         textcoords='offset points',
                         ha='left',
                         va='top',
                         **self.TEXT_KW)

        plt.show()
Example #2
TEXT_KW = dict(fontsize=FONT_SIZE, fontweight='bold', fontproperties=font)
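# FONT_SIZE and font (a matplotlib FontProperties) are assumed to be defined earlier in the script.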

filename = 'word2vec/jawiki.bin'
#filename = 'word2vec/orj.bin'
print 'loading'
data = w2v.load(filename)
print 'loaded'
nbest = 15

while True:
    query = raw_input('query: ')
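    # 'nbest=N' updates the neighbour count; anything else is treated as a query.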
    if query.startswith('nbest='):
        nbest = int(query[6:])
        continue
    if ', ' not in query:
        words = [query] + w2v.search(data, query)[:nbest]
    else:
        words = query.split(', ')
    print ', '.join(words)
    mat = w2v.get_vectors(data)
    word_indexes = [w2v.get_word_index(data, w) for w in words]
    if word_indexes == [-1]:
        print 'not in vocabulary'
        continue

    # do PCA
    X = mat[word_indexes]
    pca = PCA(n_components=2)
    pca.fit(X)
    print pca.explained_variance_ratio_
    X = pca.transform(X)
Example #3
File: vis.py Project: nishio/mycorpus
TEXT_KW = dict(fontsize=FONT_SIZE, fontweight="bold", fontproperties=font)

filename = "word2vec/jawiki.bin"
# filename = 'word2vec/orj.bin'
print "loading"
data = w2v.load(filename)
print "loaded"
nbest = 15

while True:
    query = raw_input("query: ")
    if query.startswith("nbest="):
        nbest = int(query[6:])
        continue
    if ", " not in query:
        words = [query] + w2v.search(data, query)[:nbest]
    else:
        words = query.split(", ")
    print ", ".join(words)
    mat = w2v.get_vectors(data)
    word_indexes = [w2v.get_word_index(data, w) for w in words]
    if word_indexes == [-1]:
        print "not in vocabulary"
        continue

    # do PCA
    X = mat[word_indexes]
    pca = PCA(n_components=2)
    pca.fit(X)
    print pca.explained_variance_ratio_
    X = pca.transform(X)
Example #4
test(invec,
     [-0.21694795787334442, 0.38501447439193726, 1.080917239189148,
      0.25622865557670593, -0.22400374710559845, 0.7944273948669434,
      0.8532216548919678, -0.3066456913948059, -0.6160392761230469,
      -0.19778962433338165])

num_words = w2v.get_num_words_from_model()


data = w2v.load("tiny_corpus.bin")
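# The loaded corpus should report the same vocabulary size as the model.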
test(
    w2v.get_num_words(data),
    num_words)

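# Nearest neighbours of '1' in the normalised model.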
test(
    w2v.search(data, "1")[:5],
    ['a', 'A', 'g', 'G', '6'])

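# Loading without normalisation keeps the raw vector values for comparison with invec.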
data2 = w2v.load_without_normalize("tiny_corpus.bin")

words = [w2v.get_word(data2, i) for i in range(w2v.get_num_words(data2))]

test(words[1], 'd')
test(w2v.get_vector(data2, 'd'), invec)
test(list(w2v.get_vectors(data2)[1]), invec)


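# Analogy query: vectors for ['A', 'b'] are added, ['a'] is subtracted, nearest words returned.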
test(w2v.find_sub(data, ['A', 'b'], ['a'])[:5],
     ['B', '2', 'H', 'h', '8'])  # 'A' + 'b' - 'a'