def train_wikipedia(we_file='./rnn_class/word_embeddings.npy', w2i_file='./rnn_class/wikipedia_word2idx.json', RecurrentUnit=LSTM):
    # there are 32 files
    ### note: you can pick between Wikipedia data and Brown corpus
    ### just comment one out, and uncomment the other!
    # Wikipedia data
    # sentences, word2idx = get_wikipedia_data(n_files=100, n_vocab=2000)
    # use brown from NLTK
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab()
    print('finished retrieving data')
    print('vocab size:', len(word2idx), 'number of sentences:', len(sentences))

    rnn = RNN(30, [30], len(word2idx))
    rnn.fit(sentences, learning_rate=1e-5, epochs=20, activation=T.nnet.relu, show_fig=True, RecurrentUnit=RecurrentUnit)

    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)
def train_corpus(we_file="word_embeddings.npy", w2i_file='corpus_word2idx.json', RecurrentUnit=GRU):
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab()
    print("finished retrieving data")
    print("vocab size:", len(word2idx), "number of sentences:", len(sentences))

    rnn = RNN(50, [50], len(word2idx))
    rnn.fit(sentences, learning_rate=10e-6, epochs=10, show_fig=True, activation=T.nnet.relu)

    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f_out:
        json.dump(word2idx, f_out)
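# --- illustrative usage sketch (not part of the original scripts) ---
# A minimal way to inspect the embeddings saved by train_corpus() above.
# Assumption: rnn.We is saved as a (V x D) array whose rows are indexed by
# word2idx; the file names match the defaults above, everything else here
# (function name, k) is hypothetical.
import json
import numpy as np

def nearest_neighbors(word, we_file="word_embeddings.npy", w2i_file="corpus_word2idx.json", k=5):
    We = np.load(we_file)                       # (V, D) embedding matrix (assumed layout)
    with open(w2i_file) as f:
        word2idx = json.load(f)
    idx2word = {i: w for w, i in word2idx.items()}

    v = We[word2idx[word]]
    # cosine similarity of `word` against every row of We
    sims = We.dot(v) / (np.linalg.norm(We, axis=1) * np.linalg.norm(v) + 1e-10)
    best = np.argsort(-sims)[1:k + 1]           # skip the word itself
    return [(idx2word[i], float(sims[i])) for i in best]

# example: nearest_neighbors('king')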
def train_wikipedia(we_file='word_embeddings.npy', w2i_file='wikipedia_word2idx.json', RecurrentUnit=GRU):
    # WIKIPEDIA
    # sentences, word2idx = get_wikipedia_data(n_files=100, n_vocab=2000)
    # vs BROWN
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=2000)
    print("finished retrieving data")
    print("vocab size:", len(word2idx), "number of sentences:", len(sentences))

    rnn = RNN(50, [50], len(word2idx))
    rnn.fit(sentences, learning_rate=10e-6, epochs=10, show_fig=True, activation=T.nnet.relu)

    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)
def train_wikipedia(we_file='word_embeddings.npy', w2i_file='wikipedia_word2idx.json', RecurrentUnit=GRU):
    # there are 32 files
    ### note: you can pick between Wikipedia data and Brown corpus
    ### just comment one out, and uncomment the other!
    # sentences, word2idx = get_wikipedia_data(n_files=100, n_vocab=2000)
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab()
    print("finished retrieving data")
    print("vocab size:", len(word2idx), "number of sentences:", len(sentences))

    rnn = RNN(30, [30], len(word2idx))
    rnn.fit(sentences, learning_rate=2*1e-4, epochs=10, show_fig=True, activation=T.nnet.relu)

    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)
def main():
    sentences, word2idx = brown.get_sentences_with_word2idx_limit_vocab(2000, 20000)
    # sentences, word2idx = brown.get_sentences_with_word2idx_limit_vocab(10, 10)
    V = len(word2idx)
    print(f"word total: {V}")

    start_idx = word2idx['START']
    end_idx = word2idx['END']
    print(f'Start index={start_idx} and End index = {end_idx}')

    # train a logistic model
    W = np.random.randn(V, V) / np.sqrt(V)  # initial random values for W of shape V x V
    # print(f'W\n{W}')

    losses = []
    epochs = 1
    lr = 1e-2

    t0 = datetime.now()
    for epoch in range(epochs):
        print(f"In iteration NO.{epoch}")
        # shuffle sentences each epoch
        random.shuffle(sentences)

        j = 0  # sentence counter
        for sentence in sentences:
            # convert sentence into one-hot coded inputs and targets
            sentence = [start_idx] + sentence + [end_idx]
            # print(sentence)
            n = len(sentence)
            print(f"Length of sentence {n}")
            inputs = np.zeros((n - 1, V))
            targets = np.zeros((n - 1, V))
            inputs[np.arange(n - 1), sentence[:n - 1]] = 1   # the sentence itself, ignoring the end index, shape n-1 x V
            targets[np.arange(n - 1), sentence[1:]] = 1      # the next word as target, shape n-1 x V
            # one-hot encoding of word vectors
            # print(f'inputs:\n{inputs.shape}')
            # print(f'targets:\n{targets.shape}')

            # get output prediction
            predictions = softmax(inputs.dot(W))  # shape n-1 x V
            # print(f"Shape of predictions after softmax {predictions.shape}")  # one for each word in the sentence
            # print(f"predictions:\n{predictions}")

            # do a gradient descent step
            W = W - lr * inputs.T.dot(predictions - targets)

            # keep track of the loss - cross entropy cost function
            loss = -np.sum(targets * np.log(predictions)) / (n - 1)  # elementwise multiplication
            losses.append(loss)

            '''
            # keep track of the bigram loss
            # only do it for the first epoch to avoid redundancy
            if epoch == 0:
                bigram_predictions = softmax(inputs.dot(W_bigram))
                bigram_loss = -np.sum(targets*np.log(bigram_predictions))/(n-1)
                bigram_losses.append(bigram_loss)
            '''

            if j % 1000 == 0:
                print(f"epoch: {epoch}, sentence: {j}/{len(sentences)}, loss: {loss}")
            j += 1
            if j == 2:
                break  # note: stops after 2 sentences per epoch (debug limiter)

    print(f"Elapsed time training: {datetime.now()-t0}")
    plt.plot(losses)

    '''
    # plot a horizontal line for the bigram loss
    avg_bigram_loss = np.mean(bigram_losses)
    print('avg_bigram_loss', avg_bigram_loss)
    plt.axhline(y=avg_bigram_loss, color='r', linestyle='-')
    '''

    # plot a smoothed loss curve to reduce variability
    def smoothed_loss(x, decay=0.99):
        y = np.zeros(len(x))
        last = 0
        for t in range(len(x)):
            z = decay * last + (1 - decay) * x[t]
            y[t] = z / (1 - decay**(t + 1))
            last = z
        return y

    plt.plot(smoothed_loss(losses))
    plt.show()
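# --- illustrative sketch (not part of the original script) ---
# The logistic bigram model trained in main() scores the next word with
# softmax(one_hot(prev).dot(W)), which is just the softmax of row W[prev].
# This helper scores a sentence (list of word indexes) under a trained W;
# it assumes W, start_idx, and end_idx are the objects from main()
# (e.g. if main() were changed to return them). The helper name is hypothetical.
import numpy as np

def sentence_log_prob(sentence, W, start_idx, end_idx):
    seq = [start_idx] + sentence + [end_idx]
    log_p = 0.0
    for prev, curr in zip(seq[:-1], seq[1:]):
        logits = W[prev]                    # row of W = scores for every possible next word
        logits = logits - logits.max()      # numerical stability
        probs = np.exp(logits) / np.exp(logits).sum()
        log_p += np.log(probs[curr])
    return log_p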
from brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx
from markov import get_bigram_probs


def softmax(a):
    a = a - a.max()  # avoid numeric overflow
    exp_a = np.exp(a)
    return exp_a / exp_a.sum(axis=1, keepdims=True)


if __name__ == '__main__':
    # load in the data
    # note: sentences are already converted to sequences of word indexes
    # note: you can limit the vocab size if you run out of memory
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(2000)
    # sentences, word2idx = get_sentences_with_word2idx()

    # vocab size
    V = len(word2idx)
    print("Vocab size:", V)

    # we will also treat beginning of sentence and end of sentence as bigrams
    # START -> first word
    # last word -> END
    start_idx = word2idx['START']
    end_idx = word2idx['END']

    # a matrix where:
    # row = last word
    # col = current word
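    # --- illustrative sketch (not the actual markov.get_bigram_probs) ---
    # The snippet above stops right before building that (V x V) matrix.
    # A plausible count-and-smooth implementation matching the call signature
    # used in these snippets (get_bigram_probs(sentences, V, start_idx, end_idx,
    # smoothing=...)) could look like the following; treat it as an assumption
    # about what the helper does, not the library code itself.
    def get_bigram_probs_sketch(sentences, V, start_idx, end_idx, smoothing=0.1):
        counts = np.ones((V, V)) * smoothing        # additive smoothing
        for sentence in sentences:
            seq = [start_idx] + sentence + [end_idx]
            for prev, curr in zip(seq[:-1], seq[1:]):
                counts[prev, curr] += 1
        # normalize each row so it sums to 1: p(current word | last word)
        return counts / counts.sum(axis=1, keepdims=True)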
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range, input

import numpy as np
import matplotlib.pyplot as plt
import random
from datetime import datetime

import os
import sys
sys.path.append(os.path.abspath('..'))
from rnn_class.util import get_wikipedia_data
from brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx
from markov import get_bigram_probs


if __name__ == '__main__':
    # returns indexed sentences and the word-to-index mapping
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(2000)

    # vocab size
    V = len(word2idx)
    print("Vocab length:", V)

    # special flags for beginning and end of sentence
    start_idx = word2idx['START']
    end_idx = word2idx['END']

    bigram_probs = get_bigram_probs(sentences, V, start_idx, end_idx, smoothing=0.1)

    D = 137  # size of the 1st hidden layer
    W1 = np.random.randn(V, D) / np.sqrt(V)
    W2 = np.random.randn(D, V) / np.sqrt(D)
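    # --- illustrative sketch (not part of the original script) ---
    # W1 and W2 above define a one-hidden-layer bigram model: one_hot(prev) -> D -> V.
    # A minimal forward pass and gradient step, assuming tanh hidden units and a
    # softmax output trained with cross-entropy; `inputs`/`targets` are one-hot
    # matrices built the same way as in the logistic bigram model earlier.
    # Function names here are hypothetical; numpy comes from the imports above.
    def softmax(a):
        a = a - a.max(axis=1, keepdims=True)   # row-wise, defined locally for this sketch
        exp_a = np.exp(a)
        return exp_a / exp_a.sum(axis=1, keepdims=True)

    def train_step(inputs, targets, W1, W2, lr=1e-2):
        hidden = np.tanh(inputs.dot(W1))       # (n-1, D)
        predictions = softmax(hidden.dot(W2))  # (n-1, V)

        # gradients of cross-entropy w.r.t. W2 and W1 (standard backprop)
        d_out = predictions - targets                        # (n-1, V)
        gW2 = hidden.T.dot(d_out)                            # (D, V)
        d_hidden = d_out.dot(W2.T) * (1 - hidden * hidden)   # tanh derivative
        gW1 = inputs.T.dot(d_hidden)                         # (V, D)

        W1 = W1 - lr * gW1
        W2 = W2 - lr * gW2
        loss = -np.sum(targets * np.log(predictions)) / len(inputs)
        return W1, W2, loss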
def main(we_file, w2i_file, use_brown=True, n_files=100):
    if use_brown:
        cc_matrix = "cc_matrix_brown.npy"
    else:
        cc_matrix = "cc_matrix_%s.npy" % n_files

    # hacky way of checking if we need to re-load the raw data or not
    # remember, only the co-occurrence matrix is needed for training
    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = []  # dummy - we won't actually use it
    else:
        if use_brown:
            keep_words = set([
                'king', 'man', 'woman',
                'france', 'paris', 'london', 'rome', 'italy', 'britain', 'england',
                'french', 'english', 'japan', 'japanese', 'chinese', 'italian',
                'australia', 'australian', 'december', 'november', 'june',
                'january', 'february', 'march', 'april', 'may', 'july', 'august',
                'september', 'october',
            ])
            sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=5000, keep_words=keep_words)
        else:
            sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)

        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(100, V, 10)

    # alternating least squares method
    model.fit(sentences, cc_matrix=cc_matrix)
    model.save(we_file)
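# --- illustrative sketch (not the actual Glove class) ---
# The Glove model above trains only from a cached co-occurrence matrix
# (cc_matrix); the code that builds and saves it is not shown here. A common
# GloVe-style construction weights each co-occurring pair by 1/distance within
# a fixed context window, roughly as below. The function name is hypothetical
# and the weighting scheme is an assumption about what the model expects.
import numpy as np

def build_cc_matrix(sentences, V, context_sz=10):
    X = np.zeros((V, V))
    for sentence in sentences:
        n = len(sentence)
        for i, wi in enumerate(sentence):
            for j in range(i + 1, min(i + 1 + context_sz, n)):
                wj = sentence[j]
                X[wi, wj] += 1.0 / (j - i)   # closer context words count more
                X[wj, wi] += 1.0 / (j - i)   # keep the matrix symmetric
    return X

# e.g. np.save("cc_matrix_brown.npy", build_cc_matrix(sentences, len(word2idx)))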