Example #1
def main():
    sentences, word2idx = get_wikipedia_data(n_files=10,
                                             n_vocab=1500,
                                             by_paragraph=True)
    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    # build term document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i, j] += 1
        j += 1
    print "finished getting raw counts"

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A)
    # print "type(A):", type(A)
    # exit()
    A = A.toarray()

    idx2word = {v: k for k, v in word2idx.items()}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:, 0], Z[:, 1])
    for i in range(V):
        try:
            plt.annotate(idx2word[i].encode("utf8").decode("utf8"), xy=(Z[i, 0], Z[i, 1]))
        except Exception:
            print("bad string:", idx2word[i])
    plt.show()

    # create a higher-D word embedding, try word analogies
    # tsne = TSNE(n_components=3)
    # We = tsne.fit_transform(A)
    We = Z
    find_analogies('king', 'man', 'woman', We, word2idx)
    find_analogies('france', 'paris', 'london', We, word2idx)
    find_analogies('france', 'paris', 'rome', We, word2idx)
    find_analogies('paris', 'france', 'italy', We, word2idx)
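The find_analogies helper called throughout these examples comes from the course's util code and is not shown in the snippets. As a rough guide, here is a minimal sketch of what such a helper typically does; the cosine-distance ranking and the exclusion of the query words are assumptions, and the real helper's signature can differ (the GloVe example further down passes a concat flag and file names instead of a matrix):

import numpy as np

def find_analogies(w1, w2, w3, We, word2idx, idx2word=None):
    # e.g. king - man + woman should land near queen
    if idx2word is None:
        idx2word = {i: w for w, i in word2idx.items()}
    for w in (w1, w2, w3):
        if w not in word2idx:
            print("%s not in vocabulary" % w)
            return
    v0 = We[word2idx[w1]] - We[word2idx[w2]] + We[word2idx[w3]]

    # rank every word by cosine distance to the target vector
    distances = 1 - We.dot(v0) / (np.linalg.norm(We, axis=1) * np.linalg.norm(v0) + 1e-10)
    for i in distances.argsort():
        if idx2word[i] not in (w1, w2, w3):  # skip the query words themselves
            print("%s - %s = %s - %s" % (w1, w2, idx2word[i], w3))
            return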
Example #2
def main():
    # sentences, word2idx = get_wikipedia_data(n_files=10, n_vocab=1500, by_paragraph=True)
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=2000)
    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    # build term document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i, j] += 1
        j += 1
    print('finished getting raw counts')

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A)  # sparse matrix

    A = A.toarray()  # convert to a dense array

    idx2word = {v: k for k, v in word2idx.items()}
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:, 0], Z[:, 1])
    for i in range(V):
        try:
            plt.annotate(idx2word[i].encode('utf8').decode('utf8'), xy=(Z[i, 0], Z[i, 1]))
        except Exception:
            print('bad string:', idx2word[i])

    # optionally get a higher-dimensionality word embedding
    # tsne = TSNE(n_components=3)
    # We = tsne.fit_transform(A)
    # or keep the 2-D t-SNE embedding
    We = Z
    find_analogies('king', 'man', 'woman', We, word2idx)
    find_analogies('france', 'paris', 'london', We, word2idx)
    find_analogies('france', 'paris', 'rome', We, word2idx)
    find_analogies('paris', 'france', 'italy', We, word2idx)
    plt.show()
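The data loaders used above (get_wikipedia_data and get_sentences_with_word2idx_limit_vocab) also come from the course's util code and are not shown. A rough sketch of the second one, assuming the NLTK Brown corpus as the data source, just to make the expected return types concrete (a list of word-index lists plus a word2idx dict); the real helper may tokenize and filter differently:

from collections import Counter
from nltk.corpus import brown

def get_sentences_with_word2idx_limit_vocab(n_vocab=2000):
    sentences = [[w.lower() for w in s] for s in brown.sents()]

    # keep the n_vocab most frequent words, map everything else to UNKNOWN
    counts = Counter(w for s in sentences for w in s)
    vocab = [w for w, _ in counts.most_common(n_vocab - 1)]
    word2idx = {w: i for i, w in enumerate(vocab)}
    word2idx['UNKNOWN'] = len(word2idx)
    unk = word2idx['UNKNOWN']

    indexed = [[word2idx.get(w, unk) for w in s] for s in sentences]
    return indexed, word2idx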
Example #3
def main():
    sentences, word2idx = get_wikipedia_data(n_files=10, n_vocab=1500, by_paragraph=True)
    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    # build term document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i,j] += 1
        j += 1
    print "finished getting raw counts"

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A)
    # print "type(A):", type(A)
    # exit()
    A = A.toarray()

    idx2word = {v: k for k, v in word2idx.items()}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:,0], Z[:,1])
    for i in range(V):
        try:
            plt.annotate(idx2word[i].encode("utf8").decode("utf8"), xy=(Z[i,0], Z[i,1]))
        except Exception:
            print("bad string:", idx2word[i])
    plt.show()

    # create a higher-D word embedding, try word analogies
    # tsne = TSNE(n_components=3)
    # We = tsne.fit_transform(A)
    We = Z
    find_analogies('king', 'man', 'woman', We, word2idx)
    find_analogies('france', 'paris', 'london', We, word2idx)
    find_analogies('france', 'paris', 'rome', We, word2idx)
    find_analogies('paris', 'france', 'italy', We, word2idx)
Example #4
def main():
    sentences, word2idx = get_wikipedia_data(n_files=10, n_vocab=3000, by_paragraph=True)
    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    # build term document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i, j] += 1
        j += 1
    print('finished getting raw counts')

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A)
    A = A.toarray()
    idx2word = {v: k for k, v in word2idx.items()}

    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:, 0], Z[:, 1])

    for i in range(V):
        try:
            plt.annotate(idx2word[i].encode('utf8').decode('utf8'), xy=(Z[i, 0], Z[i, 1]))
        except Exception:
            print('bad string:', idx2word[i])
    plt.show()

    # optionally get a higher-dimensionality word embedding
    # tsne = TSNE(n_components=3)
    # We = tsne.fit_transform(A)
    We = Z
    find_analogies('king', 'man', 'woman', We, word2idx)
    find_analogies('france', 'paris', 'london', We, word2idx)
    find_analogies('france', 'paris', 'rome', We, word2idx)
Example #5
    w2i = 'glove_word2idx_50.json'
    main(we, w2i, use_brown=False)

    # load back embeddings
    npz = np.load(we)
    W1 = npz['arr_0']
    W2 = npz['arr_1']

    with open(w2i) as f:
        word2idx = json.load(f)
        idx2word = {i: w for w, i in word2idx.items()}

    for concat in (True, False):
        print("** concat:", concat)

        if concat:
            We = np.hstack([W1, W2.T])
        else:
            We = (W1 + W2.T) / 2

        find_analogies('king', 'man', 'woman', We, word2idx, idx2word)
        find_analogies('france', 'paris', 'london', We, word2idx, idx2word)
        find_analogies('france', 'paris', 'rome', We, word2idx, idx2word)
        find_analogies('paris', 'france', 'italy', We, word2idx, idx2word)
        find_analogies('france', 'french', 'english', We, word2idx, idx2word)
        find_analogies('japan', 'japanese', 'chinese', We, word2idx, idx2word)
        find_analogies('japan', 'japanese', 'italian', We, word2idx, idx2word)
        find_analogies('japan', 'japanese', 'australian', We, word2idx,
                       idx2word)
        find_analogies('december', 'november', 'june', We, word2idx, idx2word)
Example #7
def main():
    analogies_to_try = (
        ('king', 'man', 'woman'),
        ('france', 'paris', 'london'),
        ('france', 'paris', 'rome'),
        ('paris', 'france', 'italy'),
    )

    ### choose a data source ###
    # sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=1500)
    sentences, word2idx = get_wikipedia_data(n_files=3, n_vocab=2000, by_paragraph=True)
    # with open('tfidf_word2idx.json', 'w') as f:
    #     json.dump(word2idx, f)

    notfound = False
    for word_list in analogies_to_try:
        for w in word_list:
            if w not in word2idx:
                print("%s not found in vocab, remove it from \
                    analogies to try or increase vocab size" % w)
                notfound = True
    if notfound:
        exit()


    # build term document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    print("V:", V, "N:", N)
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i,j] += 1
        j += 1
    print("finished getting raw counts")

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A.T).T

    # tsne requires a dense array
    A = A.toarray()

    # map back to word in plot
    idx2word = {v:k for k, v in iteritems(word2idx)}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:,0], Z[:,1])
    for i in range(V):
        try:
            plt.annotate(idx2word[i].encode("utf8").decode("utf8"), xy=(Z[i,0], Z[i,1]))
        except Exception:
            print("bad string:", idx2word[i])
    plt.draw()

    ### multiple ways to create vectors for each word ###
    # 1) simply set it to the TF-IDF matrix
    # We = A

    # 2) create a higher-D word embedding
    tsne = TSNE(n_components=3)
    We = tsne.fit_transform(A)

    # 3) use a classic dimensionality reduction technique
    # svd = KernelPCA(n_components=20, kernel='rbf')
    # We = svd.fit_transform(A)

    for word_list in analogies_to_try:
        w1, w2, w3 = word_list
        find_analogies(w1, w2, w3, We, word2idx, idx2word)

    plt.show() # pause script until plot is closed


if __name__ == '__main__':
    main()
    model.save(we_file)


if __name__ == '__main__':
    print("Start")
    we = 'glove_svd_50.npz'
    w2i = 'glove_word2idx_50.json'
    # we = 'glove_svd_brown.npz'
    # w2i = 'glove_word2idx_brown.json'
    main(we, w2i, use_brown=True)

    # load back embeddings
    npz = np.load(we)
    W1 = npz['arr_0']
    W2 = npz['arr_1']

    with open(w2i) as f:
        word2idx = json.load(f)
        idx2word = {i: w for w, i in word2idx.items()}

    for concat in (True, False):
        print("** concat:", concat)

        if concat:
            We = np.hstack([W1, W2.T])
        else:
            We = (W1 + W2.T) / 2

        find_analogies('japan', 'japanese', 'chinese', We, word2idx, idx2word)
        find_analogies('japan', 'japanese', 'italian', We, word2idx, idx2word)
Example #10
File: glove.py  Project: noceanfish/nlp
        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(80, V, 10)
    model.fit(sentences=sentences,
              cc_matrix=cc_matrix,
              learning_rate=3 * 10e-5,
              reg=0.01,
              epoches=2000,
              gd=True,
              use_theano=False)
    model.save(we_file)


if __name__ == '__main__':
    we = 'glove_model_50.npz'
    w2i = 'glove_word2idx_50.json'
    main(we, w2i)
    for concat in (True, False):
        print("** concat:", concat)

        find_analogies('king', 'man', 'woman', concat, we, w2i)
        find_analogies('france', 'paris', 'london', concat, we, w2i)
        find_analogies('france', 'paris', 'rome', concat, we, w2i)
        find_analogies('paris', 'france', 'italy', concat, we, w2i)
        find_analogies('france', 'french', 'english', concat, we, w2i)
        find_analogies('japan', 'japanese', 'chinese', concat, we, w2i)
        find_analogies('japan', 'japanese', 'italian', concat, we, w2i)
        find_analogies('japan', 'japanese', 'australian', concat, we, w2i)
        find_analogies('december', 'november', 'june', concat, we, w2i)
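The Glove model above (and again in Example #12) is fit on a cc_matrix built elsewhere in the original glove.py. As a rough guide, a minimal sketch of one common way to assemble such a word-word co-occurrence matrix from the indexed sentences; the context window size and the 1/distance weighting are assumptions, not necessarily what this project uses:

import numpy as np

def build_cc_matrix(sentences, V, context_size=10):
    # X[i, j] accumulates how often word j appears near word i,
    # with closer neighbors weighted more heavily
    X = np.zeros((V, V))
    for sentence in sentences:
        n = len(sentence)
        for pos, wi in enumerate(sentence):
            for k in range(1, context_size + 1):
                if pos + k < n:
                    wj = sentence[pos + k]
                    X[wi, wj] += 1.0 / k
                    X[wj, wi] += 1.0 / k
    return X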
Example #11
def main():
    analogies_to_try = (
        ('king', 'man', 'woman'),
        ('france', 'paris', 'london'),
        ('france', 'paris', 'rome'),
        ('paris', 'france', 'italy'),
    )

    # sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=1500)
    sentences, word2idx = get_wikipedia_data(n_files=20, n_vocab=2000, by_paragraph=True)
    # with open('tfidf_word2idx.json', 'w') as f:
    #     json.dump(word2idx, f)

    notfound = False
    for word_list in analogies_to_try:
        for w in word_list:
            if w not in word2idx:
                print("%s not found in vocab, remove it from \
                    analogies to try or increase vocab size")
                notfound = True
    if notfound:
        exit()


    # build term document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i,j] += 1
        j += 1
    print("finished getting raw counts")

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A)
    # print("type(A):", type(A))
    # exit()
    A = A.toarray()

    idx2word = {v:k for k, v in iteritems(word2idx)}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:,0], Z[:,1])
    for i in range(V):
        try:
            plt.annotate(idx2word[i].encode("utf8").decode("utf8"), xy=(Z[i,0], Z[i,1]))
        except Exception:
            print("bad string:", idx2word[i])
    plt.draw()

    # create a higher-D word embedding, try word analogies
    # tsne = TSNE(n_components=3)
    # We = tsne.fit_transform(A)
    We = Z

    for word_list in analogies_to_try:
        w1, w2, w3 = word_list
        find_analogies(w1, w2, w3, We, word2idx)

    plt.show() # pause script until plot is closed
Example #12
    model = Glove(100, V, 10)
    model.fit(sentences, cc_matrix=cc_matrix, epochs=200)
    model.save(we_file)


if __name__ == '__main__':
    we = 'glove_model_50.npz'
    w2i = 'glove_word2idx_50.json'
    main(we, w2i, use_brown=False)

    # load back embeddings
    npz = np.load(we)
    W1 = npz['arr_0']
    W2 = npz['arr_1']

    with open(w2i) as f:
        word2idx = json.load(f)
        idx2word = {i: w for w, i in word2idx.items()}

    for concat in (True, False):
        print("** concat:", concat)

        if concat:
            We = np.hstack([W1, W2.T])
        else:
            We = (W1 + W2.T) / 2

        find_analogies('france', 'french', 'english', We, word2idx, idx2word)
        find_analogies('japan', 'japanese', 'chinese', We, word2idx, idx2word)
        find_analogies('japan', 'japanese', 'italian', We, word2idx, idx2word)
Example #13
def main():
    analogies_to_try = (
        ('king', 'man', 'woman'),
        ('france', 'paris', 'london'),
        ('france', 'paris', 'rome'),
        ('paris', 'france', 'italy'),
    )

    # sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=1500)
    sentences, word2idx = get_wikipedia_data(n_files=20,
                                             n_vocab=2000,
                                             by_paragraph=True)
    # with open('tfidf_word2idx.json', 'w') as f:
    #     json.dump(word2idx, f)

    notfound = False
    for word_list in analogies_to_try:
        for w in word_list:
            if w not in word2idx:
                print("%s not found in vocab, remove it from \
                    analogies to try or increase vocab size")
                notfound = True
    if notfound:
        exit()

    # build term document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i, j] += 1
        j += 1
    print("finished getting raw counts")

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A)
    # print("type(A):", type(A))
    # exit()
    A = A.toarray()

    idx2word = {v: k for k, v in iteritems(word2idx)}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:, 0], Z[:, 1])
    for i in range(V):
        try:
            plt.annotate(idx2word[i].encode("utf8").decode("utf8"),
                         xy=(Z[i, 0], Z[i, 1]))
        except Exception:
            print("bad string:", idx2word[i])
    plt.draw()

    # create a higher-D word embedding, try word analogies
    # tsne = TSNE(n_components=3)
    # We = tsne.fit_transform(A)
    We = Z

    for word_list in analogies_to_try:
        w1, w2, w3 = word_list
        find_analogies(w1, w2, w3, We, word2idx)

    plt.show()  # pause script until plot is closed
Example #14
def main():
    analogies_to_try = (
        ('king', 'man', 'woman'),
        ('france', 'paris', 'london'),
        ('france', 'paris', 'rome'),
        ('paris', 'france', 'italy'),
    )

    ### choose a data source ###
    # sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=1500)
    sentences, word2idx = get_wikipedia_data(n_files=3, n_vocab=2000, by_paragraph=True)
    # with open('tfidf_word2idx.json', 'w') as f:
    #     json.dump(word2idx, f)

    notfound = False
    for word_list in analogies_to_try:
        for w in word_list:
            if w not in word2idx:
                print("%s not found in vocab, remove it from \
                    analogies to try or increase vocab size")
                notfound = True
    if notfound:
        exit()


    # build term document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i,j] += 1
        j += 1
    print("finished getting raw counts")

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A.T).T

    # tsne requires a dense array
    A = A.toarray()

    # map back to word in plot
    idx2word = {v:k for k, v in iteritems(word2idx)}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:,0], Z[:,1])
    for i in range(V):
        try:
            plt.annotate(idx2word[i].encode("utf8").decode("utf8"), xy=(Z[i,0], Z[i,1]))
        except Exception:
            print("bad string:", idx2word[i])
    plt.draw()

    ### multiple ways to create vectors for each word ###
    # 1) simply set it to the TF-IDF matrix
    # We = A

    # 2) create a higher-D word embedding
    tsne = TSNE(n_components=3)
    We = tsne.fit_transform(A)

    # 3) use a classic dimensionality reduction technique
    # svd = KernelPCA(n_components=20, kernel='rbf')
    # We = svd.fit_transform(A)

    for word_list in analogies_to_try:
        w1, w2, w3 = word_list
        find_analogies(w1, w2, w3, We, word2idx, idx2word)

    plt.show() # pause script until plot is closed