import json

import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.manifold import TSNE

# course-local helpers; exact module path assumed
from util import get_wikipedia_data, find_analogies


def main():
    sentences, word2idx = get_wikipedia_data(n_files=10, n_vocab=1500, by_paragraph=True)
    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    # build term-document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i, j] += 1
        j += 1
    print("finished getting raw counts")

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A)  # returns a sparse matrix
    A = A.toarray()  # t-SNE requires a dense array

    idx2word = {v: k for k, v in word2idx.items()}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:, 0], Z[:, 1])
    for i in range(V):
        try:
            plt.annotate(s=idx2word[i].encode("utf8").decode("utf8"), xy=(Z[i, 0], Z[i, 1]))
        except:
            print("bad string:", idx2word[i])
    plt.show()

    # create a higher-D word embedding, try word analogies
    # tsne = TSNE(n_components=3)
    # We = tsne.fit_transform(A)
    We = Z
    find_analogies('king', 'man', 'woman', We, word2idx)
    find_analogies('france', 'paris', 'london', We, word2idx)
    find_analogies('france', 'paris', 'rome', We, word2idx)
    find_analogies('paris', 'france', 'italy', We, word2idx)
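# `find_analogies` is a course-local helper whose body is not shown in these
# scripts. The function below is a hypothetical minimal sketch of what such a
# helper typically does, assuming word vectors are rows of We indexed by
# word2idx: it answers "w1 is to w2 as ? is to w3" via vector arithmetic and
# cosine distance. The name and body are illustrative, not the course's code.
import numpy as np

def find_analogies_sketch(w1, w2, w3, We, word2idx, idx2word=None):
    if idx2word is None:
        idx2word = {i: w for w, i in word2idx.items()}
    # e.g. king - man + woman should land near queen
    v0 = We[word2idx[w1]] - We[word2idx[w2]] + We[word2idx[w3]]
    # cosine distance from v0 to every word vector (epsilon avoids 0-division)
    norms = np.linalg.norm(We, axis=1) * np.linalg.norm(v0) + 1e-10
    distances = 1 - We.dot(v0) / norms
    for idx in distances.argsort():  # nearest first
        if idx2word[idx] not in (w1, w2, w3):
            print("%s - %s = %s - %s" % (w1, w2, idx2word[idx], w3))
            break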
def main():
    # sentences, word2idx = get_wikipedia_data(n_files=10, n_vocab=1500, by_paragraph=True)
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=2000)
    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    # build term-document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i, j] += 1
        j += 1
    print('finished getting raw counts')

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A)  # returns a sparse matrix
    A = A.toarray()  # convert to a dense array

    idx2word = {v: k for k, v in word2idx.items()}

    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:, 0], Z[:, 1])
    for i in range(V):
        try:
            plt.annotate(s=idx2word[i].encode('utf8').decode('utf8'), xy=(Z[i, 0], Z[i, 1]))
        except:
            print('bad string:', idx2word[i])

    # optionally get a higher-dimensionality word embedding:
    # tsne = TSNE(n_components=3)
    # We = tsne.fit_transform(A)
    # or just reuse the 2-D projection:
    We = Z
    find_analogies('king', 'man', 'woman', We, word2idx)
    find_analogies('france', 'paris', 'london', We, word2idx)
    find_analogies('france', 'paris', 'rome', We, word2idx)
    find_analogies('paris', 'france', 'italy', We, word2idx)
    plt.show()
def main():
    sentences, word2idx = get_wikipedia_data(n_files=10, n_vocab=3000, by_paragraph=True)
    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    # build term-document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i, j] += 1
        j += 1
    print('finished getting raw counts')

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A)
    A = A.toarray()

    idx2word = {v: k for k, v in word2idx.items()}

    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:, 0], Z[:, 1])
    for i in range(V):
        try:
            plt.annotate(s=idx2word[i].encode('utf8').decode('utf8'), xy=(Z[i, 0], Z[i, 1]))
        except:
            print('bad string:', idx2word[i])
    plt.show()

    # optionally get a higher-dimensionality word embedding
    # tsne = TSNE(n_components=3)
    # We = tsne.fit_transform(A)
    We = Z
    find_analogies('king', 'man', 'woman', We, word2idx)
    find_analogies('france', 'paris', 'london', We, word2idx)
    find_analogies('france', 'paris', 'rome', We, word2idx)
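# The dense np.zeros((V, N)) counts matrix in the scripts above can exhaust
# memory once V and N grow. A sketch of the same raw-count construction using
# scipy.sparse instead (the helper name is made up for illustration);
# TfidfTransformer accepts sparse input directly, so only t-SNE still needs
# the .toarray() call.
from scipy.sparse import csr_matrix

def build_term_doc_matrix(sentences, V):
    # collect (term, doc) coordinates instead of allocating V x N upfront
    rows, cols, vals = [], [], []
    for j, sentence in enumerate(sentences):
        for i in sentence:
            rows.append(i)
            cols.append(j)
            vals.append(1.0)
    # duplicate (i, j) entries are summed, yielding the raw counts
    return csr_matrix((vals, (rows, cols)), shape=(V, len(sentences)))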
we = 'glove_model_50.npz'  # assumed: filename taken from the companion GloVe script below
w2i = 'glove_word2idx_50.json'
main(we, w2i, use_brown=False)

# load back embeddings
npz = np.load(we)
W1 = npz['arr_0']
W2 = npz['arr_1']

with open(w2i) as f:
    word2idx = json.load(f)
idx2word = {i: w for w, i in word2idx.items()}

for concat in (True, False):
    print("** concat:", concat)
    if concat:
        We = np.hstack([W1, W2.T])
    else:
        We = (W1 + W2.T) / 2

    find_analogies('king', 'man', 'woman', We, word2idx, idx2word)
    find_analogies('france', 'paris', 'london', We, word2idx, idx2word)
    find_analogies('france', 'paris', 'rome', We, word2idx, idx2word)
    find_analogies('paris', 'france', 'italy', We, word2idx, idx2word)
    find_analogies('france', 'french', 'english', We, word2idx, idx2word)
    find_analogies('japan', 'japanese', 'chinese', We, word2idx, idx2word)
    find_analogies('japan', 'japanese', 'italian', We, word2idx, idx2word)
    find_analogies('japan', 'japanese', 'australian', We, word2idx, idx2word)
    find_analogies('december', 'november', 'june', We, word2idx, idx2word)
    for i in range(V):
        try:
            plt.annotate(s=idx2word[i].encode("utf8").decode("utf8"), xy=(Z[i, 0], Z[i, 1]))
        except:
            print("bad string:", idx2word[i])
    plt.draw()

    # create a higher-D word embedding, try word analogies
    # tsne = TSNE(n_components=3)
    # We = tsne.fit_transform(A)
    We = Z

    for word_list in analogies_to_try:
        w1, w2, w3 = word_list
        find_analogies(w1, w2, w3, We, word2idx)

    plt.show()  # pause script until plot is closed


if __name__ == '__main__':
    main()
    model.save(we_file)


if __name__ == '__main__':
    print("Start")
    we = 'glove_svd_50.npz'
    w2i = 'glove_word2idx_50.json'
    # we = 'glove_svd_brown.npz'
    # w2i = 'glove_word2idx_brown.json'
    main(we, w2i, use_brown=True)

    # load back embeddings
    npz = np.load(we)
    W1 = npz['arr_0']
    W2 = npz['arr_1']

    with open(w2i) as f:
        word2idx = json.load(f)
    idx2word = {i: w for w, i in word2idx.items()}

    for concat in (True, False):
        print("** concat:", concat)
        if concat:
            We = np.hstack([W1, W2.T])
        else:
            We = (W1 + W2.T) / 2

        find_analogies('japan', 'japanese', 'chinese', We, word2idx, idx2word)
        find_analogies('japan', 'japanese', 'italian', We, word2idx, idx2word)
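# The body of main() above is not shown; it evidently saves two factor
# matrices (arr_0, arr_1) that are later recombined by concatenation or
# averaging. A hypothetical sketch of an SVD factorization that would produce
# such a pair from a log co-occurrence matrix (assumed, not the course's code):
import numpy as np

def svd_embed(logX, D=50):
    # logX: (V, V) log word-word co-occurrence matrix
    U, s, Vt = np.linalg.svd(logX, full_matrices=False)
    W1 = U[:, :D] * np.sqrt(s[:D])         # word embeddings, (V, D)
    W2 = np.sqrt(s[:D])[:, None] * Vt[:D]  # context embeddings, (D, V)
    return W1, W2  # logX is approximately W1.dot(W2), so W2.T aligns with W1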
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(80, V, 10)
    model.fit(
        sentences=sentences,
        cc_matrix=cc_matrix,
        learning_rate=3 * 10e-5,
        reg=0.01,
        epochs=2000,
        gd=True,
        use_theano=False,
    )
    model.save(we_file)


if __name__ == '__main__':
    we = 'glove_model_50.npz'
    w2i = 'glove_word2idx_50.json'
    main(we, w2i)
    for concat in (True, False):
        print("** concat:", concat)
        find_analogies('king', 'man', 'woman', concat, we, w2i)
        find_analogies('france', 'paris', 'london', concat, we, w2i)
        find_analogies('france', 'paris', 'rome', concat, we, w2i)
        find_analogies('paris', 'france', 'italy', concat, we, w2i)
        find_analogies('france', 'french', 'english', concat, we, w2i)
        find_analogies('japan', 'japanese', 'chinese', concat, we, w2i)
        find_analogies('japan', 'japanese', 'italian', concat, we, w2i)
        find_analogies('japan', 'japanese', 'australian', concat, we, w2i)
        find_analogies('december', 'november', 'june', concat, we, w2i)
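# cc_matrix is built earlier in the script (not shown above). A minimal sketch
# of a standard windowed co-occurrence count with 1/distance weighting, as in
# the GloVe paper; the function name and exact weighting are assumptions:
import numpy as np

def build_cc_matrix(sentences, V, context_size=10):
    X = np.zeros((V, V))
    for sentence in sentences:
        n = len(sentence)
        for pos, wi in enumerate(sentence):
            # only look right; add both (wi, wj) and (wj, wi) for symmetry
            for d in range(1, context_size + 1):
                if pos + d >= n:
                    break
                wj = sentence[pos + d]
                X[wi, wj] += 1.0 / d  # closer context words count more
                X[wj, wi] += 1.0 / d
    return X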
def main():
    analogies_to_try = (
        ('king', 'man', 'woman'),
        ('france', 'paris', 'london'),
        ('france', 'paris', 'rome'),
        ('paris', 'france', 'italy'),
    )

    # sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=1500)
    sentences, word2idx = get_wikipedia_data(n_files=20, n_vocab=2000, by_paragraph=True)
    # with open('tfidf_word2idx.json', 'w') as f:
    #     json.dump(word2idx, f)

    notfound = False
    for word_list in analogies_to_try:
        for w in word_list:
            if w not in word2idx:
                print("%s not found in vocab, remove it from "
                      "analogies to try or increase vocab size" % w)
                notfound = True
    if notfound:
        exit()

    # build term-document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i, j] += 1
        j += 1
    print("finished getting raw counts")

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A)
    A = A.toarray()  # t-SNE requires a dense array

    idx2word = {v: k for k, v in iteritems(word2idx)}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:, 0], Z[:, 1])
    for i in range(V):
        try:
            plt.annotate(s=idx2word[i].encode("utf8").decode("utf8"), xy=(Z[i, 0], Z[i, 1]))
        except:
            print("bad string:", idx2word[i])
    plt.draw()

    # create a higher-D word embedding, try word analogies
    # tsne = TSNE(n_components=3)
    # We = tsne.fit_transform(A)
    We = Z

    for word_list in analogies_to_try:
        w1, w2, w3 = word_list
        find_analogies(w1, w2, w3, We, word2idx)

    plt.show()  # pause script until plot is closed
    model = Glove(100, V, 10)
    model.fit(sentences, cc_matrix=cc_matrix, epochs=200)
    model.save(we_file)


if __name__ == '__main__':
    we = 'glove_model_50.npz'
    w2i = 'glove_word2idx_50.json'
    main(we, w2i, use_brown=False)

    # load back embeddings
    npz = np.load(we)
    W1 = npz['arr_0']
    W2 = npz['arr_1']

    with open(w2i) as f:
        word2idx = json.load(f)
    idx2word = {i: w for w, i in word2idx.items()}

    for concat in (True, False):
        print("** concat:", concat)
        if concat:
            We = np.hstack([W1, W2.T])
        else:
            We = (W1 + W2.T) / 2

        find_analogies('france', 'french', 'english', We, word2idx, idx2word)
        find_analogies('japan', 'japanese', 'chinese', We, word2idx, idx2word)
        find_analogies('japan', 'japanese', 'italian', We, word2idx, idx2word)
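# The Glove class is defined elsewhere in the repo; its fit() presumably
# minimizes the weighted least-squares cost from the GloVe paper. A sketch of
# that cost under assumed shapes: factor matrices W, U of shape (V, D) and
# bias vectors b, c of shape (V,); xmax and alpha follow the paper's defaults.
import numpy as np

def glove_cost(X, W, U, b, c, xmax=100.0, alpha=0.75):
    logX = np.log(X + 1)                     # smooth zeros before the log
    fX = np.minimum((X / xmax) ** alpha, 1)  # downweight very frequent pairs
    delta = W.dot(U.T) + b[:, None] + c[None, :] - logX
    return (fX * delta * delta).sum()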
import json

import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.manifold import TSNE
from sklearn.decomposition import KernelPCA
from future.utils import iteritems  # assumed source of iteritems

# course-local helpers; exact module path assumed
from util import get_wikipedia_data, get_sentences_with_word2idx_limit_vocab, find_analogies


def main():
    analogies_to_try = (
        ('king', 'man', 'woman'),
        ('france', 'paris', 'london'),
        ('france', 'paris', 'rome'),
        ('paris', 'france', 'italy'),
    )

    ### choose a data source ###
    # sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=1500)
    sentences, word2idx = get_wikipedia_data(n_files=3, n_vocab=2000, by_paragraph=True)
    # with open('tfidf_word2idx.json', 'w') as f:
    #     json.dump(word2idx, f)

    notfound = False
    for word_list in analogies_to_try:
        for w in word_list:
            if w not in word2idx:
                print("%s not found in vocab, remove it from "
                      "analogies to try or increase vocab size" % w)
                notfound = True
    if notfound:
        exit()

    # build term-document matrix
    V = len(word2idx)
    N = len(sentences)
    print("V:", V, "N:", N)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i, j] += 1
        j += 1
    print("finished getting raw counts")

    # TfidfTransformer expects (documents x terms), hence the transposes
    transformer = TfidfTransformer()
    A = transformer.fit_transform(A.T).T

    # t-SNE requires a dense array
    A = A.toarray()

    # map back to word in plot
    idx2word = {v: k for k, v in iteritems(word2idx)}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:, 0], Z[:, 1])
    for i in range(V):
        try:
            plt.annotate(s=idx2word[i].encode("utf8").decode("utf8"), xy=(Z[i, 0], Z[i, 1]))
        except:
            print("bad string:", idx2word[i])
    plt.draw()

    ### multiple ways to create vectors for each word ###
    # 1) simply set it to the TF-IDF matrix
    # We = A

    # 2) create a higher-D word embedding
    tsne = TSNE(n_components=3)
    We = tsne.fit_transform(A)

    # 3) use a classic dimensionality reduction technique
    # svd = KernelPCA(n_components=20, kernel='rbf')
    # We = svd.fit_transform(A)

    for word_list in analogies_to_try:
        w1, w2, w3 = word_list
        find_analogies(w1, w2, w3, We, word2idx, idx2word)

    plt.show()  # pause script until plot is closed
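# Option 3 above applies KernelPCA to the densified matrix. A TruncatedSVD
# variant (classic LSA) would work on the sparse TF-IDF matrix directly,
# before any .toarray() call; the helper name is illustrative:
from sklearn.decomposition import TruncatedSVD

def lsa_embedding(A_sparse, D=20):
    # A_sparse: (V, N) sparse TF-IDF term-document matrix
    svd = TruncatedSVD(n_components=D)
    return svd.fit_transform(A_sparse)  # (V, D) word vectors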