def main(we_file, w2i_file, use_brown=True, n_files=100):
    if use_brown:
        cc_matrix = "cc_matrix_brown.npy"
    else:
        cc_matrix = "cc_matrix_%s.npy" % n_files

    # hacky way of checking if we need to re-load the raw data or not
    # remember, only the co-occurrence matrix is needed for training
    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = []  # dummy - we won't actually use it
    else:
        if use_brown:
            keep_words = set([
                'king', 'man', 'woman',
                'france', 'paris', 'london', 'rome', 'italy', 'britain', 'england',
                'french', 'english', 'japan', 'japanese', 'chinese', 'italian',
                'australia', 'australian', 'december', 'november', 'june',
                'january', 'february', 'march', 'april', 'may', 'july', 'august',
                'september', 'october',
            ])
            sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=5000, keep_words=keep_words)
        else:
            sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)
        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(100, V, 10)

    # alternating least squares method
    model.fit(sentences, cc_matrix=cc_matrix)
    model.save(we_file)
def main(we_file, w2i_file, n_files=50):
    cc_matrix = "cc_matrix_%s.npy" % n_files

    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = []
    else:
        sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)
        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(80, V, 10)
    model.fit(sentences, cc_matrix=cc_matrix, epochs=20)
    # model.fit(
    #     sentences=sentences,
    #     cc_matrix=cc_matrix,
    #     learning_rate=3*10e-5,
    #     reg=0.01,
    #     epochs=2000,
    #     gd=True,
    #     use_theano=False,
    # )
    model.save(we_file)
def main(we_file, w2i_file, use_brown=True, n_files=50):
    if use_brown:
        cc_matrix = "cc_matrix_brown.npy"
    else:
        cc_matrix = "cc_matrix_%s.npy" % n_files

    # hacky way of checking if we need to re-load the raw data or not
    # remember, only the co-occurrence matrix is needed for training
    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = []  # dummy - we won't actually use it
    else:
        if use_brown:
            keep_words = set([
                'king', 'man', 'woman',
                'france', 'paris', 'london', 'rome', 'italy', 'britain', 'england',
                'french', 'english', 'japan', 'japanese', 'chinese', 'italian',
                'australia', 'australian', 'december', 'november', 'june',
                'january', 'february', 'march', 'april', 'may', 'july', 'august',
                'september', 'october',
            ])
            sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=5000, keep_words=keep_words)
        else:
            sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)
        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(100, V, 10)
    model.fit(sentences, cc_matrix=cc_matrix, epochs=200)
    model.save(we_file)
def main():
    sentences, word2idx = get_wikipedia_data(n_files=1, n_vocab=2000)
    V = len(word2idx)
    model = Model(10, V, 2)
    # fp = open('/Users/macuser/Code/word2vec-proto/wiki.en.text')
    # model.fit(fp)
    model.fit(sentences)
    model.save('w2v_model.npz')
def main():
    sentences, word2idx = get_wikipedia_data(n_files=1, n_vocab=2000)
    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    V = len(word2idx)
    model = Model(80, V, 10)
    # fitt() is the Theano-based fit variant in this version of the Model class
    # (see the merged word2vec main() further down)
    model.fitt(sentences, learning_rate=10e-4, mu=0, epochs=5)
    model.save('w2v_model.npz')
def main(use_brown=True):
    if use_brown:
        # sentences, word2idx = get_sentences_with_word2idx_limit_vocab()
        sentences, word2idx = get_sentences_with_word2idx()
    else:
        sentences, word2idx = get_wikipedia_data(n_files=1, n_vocab=2000)

    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    V = len(word2idx)
    model = Model(50, V, 5)
    model.fit(sentences, learning_rate=1e-3, mu=0, epochs=3, num_neg_samples=5)
    model.save('w2v_model.npz')
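# Example driver for the word2vec main() above (not part of the original script):
# a minimal sketch, assuming the Model class and the data-loading helpers are
# importable from the surrounding module.
if __name__ == '__main__':
    main(use_brown=True)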
def main():
    sentences, word2idx = get_wikipedia_data(n_files=10, n_vocab=1500, by_paragraph=True)
    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    V = len(word2idx)
    N = len(sentences)
    A = np.zeros((V, N))  # np.zeros takes the shape as a single tuple
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i, j] += 1
        j += 1
    print("finished getting raw counts")
def main():
    sentences, word2idx = get_wikipedia_data(n_files=10, n_vocab=1500, by_paragraph=True)
    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    # build term document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i, j] += 1
        j += 1
    print "finished getting raw counts"

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A)
    # print "type(A):", type(A)
    # exit()
    A = A.toarray()

    idx2word = {v: k for k, v in word2idx.iteritems()}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:, 0], Z[:, 1])
    for i in xrange(V):
        try:
            plt.annotate(s=idx2word[i].encode("utf8"), xy=(Z[i, 0], Z[i, 1]))
        except:
            print "bad string:", idx2word[i]
    plt.show()

    # create a higher-D word embedding, try word analogies
    # tsne = TSNE(n_components=3)
    # We = tsne.fit_transform(A)
    We = Z

    find_analogies('king', 'man', 'woman', We, word2idx)
    find_analogies('france', 'paris', 'london', We, word2idx)
    find_analogies('france', 'paris', 'rome', We, word2idx)
    find_analogies('paris', 'france', 'italy', We, word2idx)
def main(we_file, w2i_file, n_files=50):
    # co-occurrence matrix
    cc_matrix = 'cc_matrix_%s.npy' % n_files

    if os.path.exists(cc_matrix):
        with open(w2i_file, 'r') as f:
            word2idx = json.load(f)
        sentences = []
    else:
        sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)
        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(80, V, 10)
    model.fit(
        sentences=sentences,
        cc_matrix=cc_matrix,
        learning_rate=3 * 10e-5,
        reg=0.01,
        epochs=20,
        gd=False,
        use_theano=False,
    )
    model.save(we_file)
def main(we_file, w2i_file, n_files=75, use_brown=False):
    if use_brown:
        cc_matrix = "cc_matrix_brown.npy"
    else:
        cc_matrix = "cc_matrix_%s.npy" % n_files

    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = []
    else:
        if use_brown:
            keep_words = set([
                'king', 'man', 'woman',
                'france', 'paris', 'london', 'rome', 'italy', 'britain', 'england',
                'french', 'english', 'japan', 'japanese', 'chinese', 'italian',
                'australia', 'australian', 'december', 'november', 'june',
                'january', 'february', 'march', 'april', 'may', 'july', 'august',
                'september', 'october',
            ])
            sentences, word2idx = get_sentences_with_word2idx_limit_vocab(
                n_vocab=10000, keep_words=keep_words)
        else:
            sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=10000)
        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(137, V, 10)
    model.fit(sentences, cc_matrix=cc_matrix, epochs=100)
    # model.fit(
    #     sentences=sentences,
    #     cc_matrix=cc_matrix,
    #     learning_rate=3*10e-5,
    #     reg=0.01,
    #     epochs=2000,
    #     gd=True,
    #     use_theano=False,
    # )
    model.save(we_file)
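# Example driver for the GloVe main() variants above (not part of the original
# scripts): a minimal sketch using illustrative output filenames for the trained
# embeddings and the word2idx mapping.
if __name__ == '__main__':
    we_file = 'glove_model.npz'       # hypothetical embeddings output path
    w2i_file = 'glove_word2idx.json'  # hypothetical word2idx output path
    main(we_file, w2i_file, n_files=75)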
def main():
    analogies_to_try = (
        ('king', 'man', 'woman'),
        ('france', 'paris', 'london'),
        ('france', 'paris', 'rome'),
        ('paris', 'france', 'italy'),
    )

    ### choose a data source ###
    # sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=1500)
    sentences, word2idx = get_wikipedia_data(n_files=3, n_vocab=2000, by_paragraph=True)
    # with open('tfidf_word2idx.json', 'w') as f:
    #     json.dump(word2idx, f)

    notfound = False
    for word_list in analogies_to_try:
        for w in word_list:
            if w not in word2idx:
                print("%s not found in vocab, remove it from analogies to try or increase vocab size" % w)
                notfound = True
    if notfound:
        exit()

    # build term document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i, j] += 1
        j += 1
    print("finished getting raw counts")

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A.T).T

    # tsne requires a dense array
    A = A.toarray()

    # map back to word in plot
    idx2word = {v: k for k, v in iteritems(word2idx)}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:, 0], Z[:, 1])
    for i in range(V):
        try:
            plt.annotate(s=idx2word[i].encode("utf8").decode("utf8"), xy=(Z[i, 0], Z[i, 1]))
        except:
            print("bad string:", idx2word[i])
    plt.draw()

    ### multiple ways to create vectors for each word ###
    # 1) simply set it to the TF-IDF matrix
    # We = A

    # 2) create a higher-D word embedding
    tsne = TSNE(n_components=3)
    We = tsne.fit_transform(A)

    # 3) use a classic dimensionality reduction technique
    # svd = KernelPCA(n_components=20, kernel='rbf')
    # We = svd.fit_transform(A)

    for word_list in analogies_to_try:
        w1, w2, w3 = word_list
        find_analogies(w1, w2, w3, We, word2idx, idx2word)

    plt.show()  # pause script until plot is closed
def main():
    analogies_to_try = (
        ('king', 'man', 'woman'),
        ('france', 'paris', 'london'),
        ('france', 'paris', 'rome'),
        ('paris', 'france', 'italy'),
    )

    ### choose a data source ###
    # sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=1500)
    sentences, word2idx = get_wikipedia_data(n_files=3, n_vocab=2000, by_paragraph=True)
    # with open('tfidf_word2idx.json', 'w') as f:
    #     json.dump(word2idx, f)

    notfound = False
    for word_list in analogies_to_try:
        for w in word_list:
            if w not in word2idx:
                print("%s not found in vocab, remove it from analogies to try or increase vocab size" % w)
                notfound = True
    if notfound:
        exit()

    # build term document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    print("V:", V, "N:", N)
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i, j] += 1
        j += 1
    print("finished getting raw counts")

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A.T).T

    # tsne requires a dense array
    A = A.toarray()

    # map back to word in plot
    idx2word = {v: k for k, v in iteritems(word2idx)}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:, 0], Z[:, 1])
    for i in range(V):
        try:
            plt.annotate(s=idx2word[i].encode("utf8").decode("utf8"), xy=(Z[i, 0], Z[i, 1]))
        except:
            print("bad string:", idx2word[i])
    plt.draw()

    ### multiple ways to create vectors for each word ###
    # 1) simply set it to the TF-IDF matrix
    # We = A

    # 2) create a higher-D word embedding
    tsne = TSNE(n_components=3)
    We = tsne.fit_transform(A)

    # 3) use a classic dimensionality reduction technique
    # svd = KernelPCA(n_components=20, kernel='rbf')
    # We = svd.fit_transform(A)

    for word_list in analogies_to_try:
        w1, w2, w3 = word_list
        find_analogies(w1, w2, w3, We, word2idx, idx2word)

    plt.show()  # pause script until plot is closed
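# Imports assumed by the TF-IDF / t-SNE analogy snippets in this section
# (reconstructed, not copied from the original files):
# import json
# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.feature_extraction.text import TfidfTransformer
# from sklearn.manifold import TSNE
# from sklearn.decomposition import KernelPCA
# from future.utils import iteritems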
def main():
    # merge conflict resolved in favour of the upstream/master version,
    # which matches the later revisions of this script
    analogies_to_try = (
        ('king', 'man', 'woman'),
        ('france', 'paris', 'london'),
        ('france', 'paris', 'rome'),
        ('paris', 'france', 'italy'),
    )

    # sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=1500)
    sentences, word2idx = get_wikipedia_data(n_files=20, n_vocab=2000, by_paragraph=True)
    # with open('tfidf_word2idx.json', 'w') as f:
    #     json.dump(word2idx, f)

    notfound = False
    for word_list in analogies_to_try:
        for w in word_list:
            if w not in word2idx:
                print("%s not found in vocab, remove it from analogies to try or increase vocab size" % w)
                notfound = True
    if notfound:
        exit()
def main():
    analogies_to_try = (
        ('king', 'man', 'woman'),
        ('france', 'paris', 'london'),
        ('france', 'paris', 'rome'),
        ('paris', 'france', 'italy'),
    )

    # sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=1500)
    sentences, word2idx = get_wikipedia_data(n_files=20, n_vocab=2000, by_paragraph=True)
    # with open('tfidf_word2idx.json', 'w') as f:
    #     json.dump(word2idx, f)

    notfound = False
    for word_list in analogies_to_try:
        for w in word_list:
            if w not in word2idx:
                print("%s not found in vocab, remove it from analogies to try or increase vocab size" % w)
                notfound = True
    if notfound:
        exit()

    # build term document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i, j] += 1
        j += 1
    print("finished getting raw counts")

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A)
    # print("type(A):", type(A))
    # exit()
    A = A.toarray()

    idx2word = {v: k for k, v in iteritems(word2idx)}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:, 0], Z[:, 1])
    for i in range(V):
        try:
            plt.annotate(s=idx2word[i].encode("utf8").decode("utf8"), xy=(Z[i, 0], Z[i, 1]))
        except:
            print("bad string:", idx2word[i])
    plt.draw()

    # create a higher-D word embedding, try word analogies
    # tsne = TSNE(n_components=3)
    # We = tsne.fit_transform(A)
    We = Z

    for word_list in analogies_to_try:
        w1, w2, w3 = word_list
        find_analogies(w1, w2, w3, We, word2idx)

    plt.show()  # pause script until plot is closed
    # fragment: tail of the model's save() method, which writes both weight matrices to disk
    arrays = [self.W1, self.W2]
    np.savez(fn, *arrays)


def main(use_brown=True):
    # merge conflict resolved in favour of the upstream/master version;
    # the HEAD choices are kept as comments
    if use_brown:
        sentences, word2idx = get_sentences_with_word2idx_limit_vocab()
        # sentences, word2idx = get_sentences_with_word2idx()
        # sentences, word2idx = get_text8()
    else:
        sentences, word2idx = get_wikipedia_data(n_files=1, n_vocab=2000)

    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    V = len(word2idx)
    model = Model(50, V, 5)
    # use numpy
    # model.fit(sentences, learning_rate=1e-3, mu=0, epochs=5, num_neg_samples=5)
    # use theano
    model.fitt(sentences, learning_rate=1e-3, mu=0, epochs=5, num_neg_samples=5)