def __init__(self, texts):
    # Create the training data
    xy = self.preprocess_text(texts)
    self.X_train = xy['x']
    self.y_train = xy['y']
    self.model = RNNTheano(_VOCABULARY_SIZE, hidden_dim=_HIDDEN_DIM)
    self.train_with_sgd()
def train(X_train, y_train, vocabulary_size, hiddenDim, modelFiles):
    model = RNNTheano(vocabulary_size, hidden_dim=hiddenDim)
    t1 = time.time()
    model.sgd_step(X_train[10], y_train[10], _LEARNING_RATE)
    t2 = time.time()
    print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)
    if modelFiles != None:
        load_model_parameters_theano(modelFiles, model)
    train_with_sgd(model, X_train, y_train, nepoch=_NEPOCH, learning_rate=_LEARNING_RATE)
def train_model(x_train, y_train, training_data_name="training_data_name",
                load_model_file="", num_epochs=50, learning_rate=0.010,
                hidden_dim=100, vocab_size=8000):
    model = RNNTheano(vocab_size, hidden_dim=hidden_dim)
    t1 = time.time()
    model.sgd_step(x_train[10], y_train[10], learning_rate)
    t2 = time.time()
    print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)
    if load_model_file != "":
        print "loading model: %s" % load_model_file
        load_model_parameters_theano(load_model_file, model)
    train_with_sgd(model, x_train, y_train, nepoch=num_epochs,
                   learning_rate=learning_rate,
                   training_data_name=training_data_name)
def import_model(model_location, csv_file=_CSV_FILE):
    m = re.findall(r'\d+', model_location)
    vocab_size = int(m[1])
    hidden_d = int(m[0])
    print "Vocab size= %d, hidden dimensions= %d" % (vocab_size, hidden_d)
    model2 = RNNTheano(vocab_size, hidden_dim=hidden_d)
    load_model_parameters_theano(model_location, model2)
    _, idx_to_word, word_to_idx = load_data_set(csv_file, vocab_size)
    return model2, idx_to_word, word_to_idx
def generate_examples(model_name, index_to_word, word_to_index, vocab_size=8000,
                      hidden_dim=100, num_sentences=10, sentences_min_length=4):
    model = RNNTheano(vocab_size, hidden_dim)
    load_model_parameters_theano(model_name, model)
    sentences = []
    for i in range(num_sentences):
        sent = []
        while len(sent) < sentences_min_length:
            sent = generate_sentence(model, index_to_word, word_to_index)
        print " ".join(sent)
    while len(sentences) < num_sentences:
        sent = generate_sentence(model, index_to_word, word_to_index)
        if len(sent) >= sentences_min_length:
            sentences.append(sent)
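# A minimal usage sketch tying import_model() and generate_examples() together.
# The parameter file name below is hypothetical; import_model() only requires that
# the first number in the name is the hidden dimension and the second the vocabulary
# size, since it recovers both with re.findall(r'\d+', model_location).
model, idx_to_word, word_to_idx = import_model('./data/rnn-theano-100-8000.npz')
generate_examples('./data/rnn-theano-100-8000.npz', idx_to_word, word_to_idx,
                  vocab_size=8000, hidden_dim=100, num_sentences=5)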
def train_theano():
    model = RNNTheano(Config._VOCABULARY_SIZE, hidden_dim=Config._HIDDEN_DIM)
    t1 = time.time()
    model.sgd_step(X_train[10], y_train[10], Config._LEARNING_RATE)
    t2 = time.time()
    print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)
    model.train_with_sgd(X_train, y_train, nepoch=Config._NEPOCH, learning_rate=Config._LEARNING_RATE)
    if Config._MODEL_FILE != None:
        print "start saving model..."
        save_model_parameters_theano(Config._MODEL_FILE, model)
        print "model saved!"
# creating vocabulary
if not os.path.isfile(vocab_file):
    vocab = construct_vocabulary(train_file)
    write_vocabulary(vocab, vocab_file)

# read the vocab
index_to_word, word_to_index = read_vocabulary(vocab_file, 8000)

# adding special symbols
index_to_word.append(sentence_end_token)
index_to_word.append(sentence_start_token)
word_to_index[sentence_start_token] = VOCAB_SIZE + 1
word_to_index[sentence_end_token] = VOCAB_SIZE + 2

if THEANO:
    rnn = RNNTheano(VOCAB_SIZE + SPEC_SYMBOLS_COUNT, hidden_dim=50)
else:
    rnn = RNN(VOCAB_SIZE + SPEC_SYMBOLS_COUNT, VOCAB_SIZE + SPEC_SYMBOLS_COUNT, hidden_dim=100)

# generate sentences
print("training the model")
loss = [rnn.total_loss(itertools.islice(tokenize_file(word_to_index, train_file), MAX_L_SENTENCES))]
for e in range(EPOCHS):
    i = 0
    print("--- Epoch " + str(e + 1) + " ---")
    loss.append(rnn.total_loss(itertools.islice(tokenize_file(word_to_index, train_file), MAX_L_SENTENCES)))
    sentences = tokenize_file(word_to_index, train_file)
    for sentence in itertools.islice(sentences, MAX_SENTENCES):
        i += 1
        sentence.insert(0, word_to_index[sentence_start_token])
        y = copy.copy(sentence)
        y.pop(0)
def train_numpy(self, x_train, y_train, iterations):
    self.model = RNNNumpy(word_dim=self.vocabulary_size, hidden_dim=100, bptt_truncate=4)
    self.model.sgd(x_train, y_train, 0.01, iterations)
np.save('data/ixtoword.npy', index_to_word)

# Create the training data
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokens])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokens])

feat_path = './data/feats.npy'
feats = np.load(feat_path)
dim_embed = 256
dim_hidden = 256
dim_image = 4096
feats = np.load(feat_path)
feats = feats[:158900]

encode_img_W = np.random.uniform(-0.1, 0.1, (dim_image, dim_hidden))
encode_img_b = np.zeros((dim_hidden))
# Project the (N, dim_image) features into the hidden space: (N, dim_hidden)
bv = np.dot(feats, encode_img_W) + encode_img_b

model = RNNTheano(len(index_to_word), hidden_dim=dim_hidden)
t1 = time.time()
model.sgd_step(X_train[10], bv[10], y_train[10], _LEARNING_RATE)
t2 = time.time()
print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)
if _MODEL_FILE != None:
    load_model_parameters_theano(_MODEL_FILE, model)
train_with_sgd(model, X_train, bv, y_train, nepoch=1000, learning_rate=0.01)
# Replace all words not in our vocabulary with the unknown token
# todo needs cleaner text preprocessing
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

# Create the training data
X_train = numpy.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = numpy.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

######################################################################## construct RNN
print "constructing model..."
# todo try a smarter initialization - wrt vanishing gradients
model = RNNTheano(vocabulary_size, hidden_dim=HIDDEN_DIM)
gradient_check_theano(model, X_train[10], y_train[10], h=0.0000001, error_threshold=0.01)

######################################################################## train
if RETRAIN:
    # run a single step to get a feel for training time
    print "run a single step..."
    t1 = time.time()
    model.sgd_step(X_train[10], y_train[10], LEARNING_RATE)
    t2 = time.time()
        w if w in word_to_index else unknown_token for w in sent
    ]

print("\nExample sentence: '%s'" % sentences[0])
print("\nExample sentence after pre-processing: '%s'" % tokenized_sentences[0])

# Create the training data
# Note that the length of each sentence is different
# X_train - every word of each sentence except for the last one
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
# y_train - every word except for the first one
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

model = RNNTheano(vocabulary_size, hidden_dim=50)
load_model_parameters_theano('./trained-model-theano.npz', model)

def generate_sentence(model):
    # We start the sentence with the start token
    new_sentence = [word_to_index[sentence_start_token]]
    # Repeat until we get an end token
    while not new_sentence[-1] == word_to_index[sentence_end_token]:
        next_word_probs = model.forward_propagation(new_sentence)
        sampled_word = word_to_index[unknown_token]
        # We don't want to sample unknown words
        while sampled_word == word_to_index[unknown_token]:
            samples = np.random.multinomial(1, next_word_probs[-1])
            sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
index_to_word.append(unknown_token)
word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])

print "Using vocabulary size %d." % vocabulary_size
print "The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1])

# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

# Create the training data
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

model = RNNTheano(vocabulary_size, hidden_dim=_HIDDEN_DIM)
# t1 = time.time()
# model.sgd_step(X_train[10], y_train[10], _LEARNING_RATE)
# t2 = time.time()
# print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)
if _MODEL_FILE != None:
    load_model_parameters_theano(_MODEL_FILE, model)
# train_with_sgd(model, X_train, y_train, nepoch=_NEPOCH, learning_rate=_LEARNING_RATE)

o = model.forward_propagation(X_train[1])
print o
print [index_to_word[x] for x in X_train[1]]
p = 1
            time, num_examples_seen, epoch, accuracy)
        # For each training example (SGD step)...
        for i in range(len(y_train)):
            # One SGD step
            model.sgd_step(X_train[i], y_train[i], learning_rate)
            num_examples_seen += 1

# Create the training data
x, y = cPickle.load(open('OnlyNPs_codeonly.cPickle', 'rb'))
X_train = np.array(x, dtype='float32')
y_train = np.array(y, dtype='float32')

# Specify model and timing one SGD step
model = RNNTheano(10, 333, hidden_dim=30)
# t1 = time.time()
# model.sgd_step(X_train[10], y_train[10], 0.005)
# t2 = time.time()
# print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

# Train model for n epochs
train_with_sgd(model, X_train, y_train, nepoch=150, learning_rate=0.01, evaluate_loss_after=1)

# Save model parameters
save_model_parameters_theano('model_parameters_OnlyNPs', model)
# creating vocabulary
if not os.path.isfile(vocab_file):
    vocab = construct_vocabulary(train_file)
    write_vocabulary(vocab, vocab_file)

# read the vocab
index_to_word, word_to_index = read_vocabulary(vocab_file, 8000)

# adding special symbols
index_to_word.append(sentence_end_token)
index_to_word.append(sentence_start_token)
word_to_index[sentence_start_token] = VOCAB_SIZE + 1
word_to_index[sentence_end_token] = VOCAB_SIZE + 2

if THEANO:
    rnn = RNNTheano(VOCAB_SIZE + SPEC_SYMBOLS_COUNT, hidden_dim=50)
else:
    rnn = RNN(VOCAB_SIZE + SPEC_SYMBOLS_COUNT, VOCAB_SIZE + SPEC_SYMBOLS_COUNT, hidden_dim=100)

# generate sentences
print("training the model")
loss = [
    rnn.total_loss(
        itertools.islice(tokenize_file(word_to_index, train_file), MAX_L_SENTENCES))
]
for e in range(EPOCHS):
    i = 0
    print("--- Epoch " + str(e + 1) + " ---")
    loss.append(
import preprocess
from rnn_numpy import RNNNumpy
from rnn_theano import RNNTheano
import numpy as np
import cProfile

X_train, y_train, vocabulary_size = preprocess.create_train_data()

np.random.seed(10)
model = RNNNumpy(vocabulary_size)

np.random.seed(10)
model = RNNNumpy(vocabulary_size)
# cProfile.run("model.numpy_sdg_step(X_train[10], y_train[10], 0.005)")
# print("----------------------------------------------------------------")

np.random.seed(10)
model_theano = RNNTheano(vocabulary_size)
# cProfile.run("model_theano.train_with_sgd(X_train[10], y_train[10], 0.005)")
print("----------------------------------------------------------------")

losses_numpy = model.train_with_sgd(X_train[:100], y_train[:100], nepoch=5, evaluate_loss_after=1)
losses_theano = model_theano.train_with_sgd(X_train[:100], y_train[:100], nepoch=5, evaluate_loss_after=1)
num_sentences = 10
senten_min_length = 7

for i in range(num_sentences):
    sent = []
    # We want long sentences, not sentences with one or two words
    while len(sent) < senten_min_length:
        sent = generate_sentence(model)
    print " ".join(sent)

## Evaluating on Theano with CPU
import os
os.environ["THEANO_FLAGS"] = "mode=FAST_RUN,device=cpu,floatX=float32"

import theano
import theano.tensor as T

from utils import *
from rnn_theano import RNNTheano, gradient_check_theano

np.random.seed(10)
# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 5
model = RNNTheano(grad_check_vocab_size, 10)
gradient_check_theano(model, [0, 1, 2, 3], [1, 2, 3, 4])

## Without Theano in fast mode it took 1m 12s. Python alone took 750 ms.
np.random.seed(20)
model = RNNTheano(vocabulary_size)
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)
print "Expected Loss for random predictions: %f" % np.log(model.word_dim) print "Actual loss: %f" % model.calculate_loss(X_train[:100], y_train[:100]) def test_performance(model, learning_rate): print "\ntest performance: " + str(type(model)) t1 = time.time() model.sgd_step(X_train[10], y_train[10], learning_rate) t2 = time.time() print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.) model_gru = GRUTheano(word_dim=_VOCABULARY_SIZE, hidden_dim=_HIDDEN_DIM, bptt_truncate=-1) model_theano = RNNTheano(word_dim=_VOCABULARY_SIZE, hidden_dim=_HIDDEN_DIM, bptt_truncate=-1) model_rnn = RNNNumpy(word_dim=_VOCABULARY_SIZE, hidden_dim=_HIDDEN_DIM, bptt_truncate=-1) test_performance(model_gru, _LEARNING_RATE) test_performance(model_theano, _LEARNING_RATE) test_performance(model_rnn, _LEARNING_RATE) test_loss(model_gru) test_loss(model_theano) test_loss(model_rnn)
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

print "\nExample sentence: '%s'" % sentences[0]
print "\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0]

# Create the training data
X_train = np.asarray([[np.int32(word_to_index[w]) for w in sent] for sent in tokenized_sentences[:-1]])
Y_train = np.asarray([[np.int32(word_to_index[w]) for w in sent] for sent in tokenized_sentences[1:]])
print X_train, type(X_train)
print Y_train, type(Y_train)

np.random.seed(10)
model = RNNTheano(vocabulary_size)
# model = RNNNumpy(vocabulary_size)
# o, s = model.forward_propagation(X_train[1])
# print o.shape
# print o

l = [8, 9, 0, 1, 2, 3, 4, 5, 6, 7]
x = np.asarray([np.int32(a) for a in l])
l2 = [3, 4, 5, 9, 0, 1]
x2 = np.asarray([np.int32(a) for a in l2])
# x = np.asarray([np.int32(a) for a in range(0, 10)])
# print x, type(x)
print "input", x, x2
# x[0] = 10
# print x, type(x)
# o = model.forward_propagation(x)
# print "o.shape", (o).shape, o
print("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0]) # Create the training data X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences]) y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences]) # Print an training data example x_example, y_example = X_train[17], y_train[17] print ("x:\n%s\n%s" % (" ".join([index_to_word[x] for x in x_example]), x_example)) print ("\ny:\n%s\n%s" % (" ".join([index_to_word[x] for x in y_example]), y_example)) #Training our Network with Theano and the GPU # To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking. grad_check_vocab_size = 5 model = RNNTheano(grad_check_vocab_size, 10) gradient_check_theano(model, [0,1,2,3], [1,2,3,4]) np.random.seed(10) model = RNNTheano(vocabulary_size) model.sgd_step(X_train[10], y_train[10], 0.005) # Run the model model = RNNTheano(vocabulary_size, hidden_dim=50) load_model_parameters_theano('./data/trained-model-theano.npz', model) def generate_sentence(model): # We start the sentence with the start token new_sentence = [word_to_index[sentence_start_token]] # Repeat until we get an end token
# ### Training our Network with Theano and the GPU
#
# I have previously written a [tutorial](http://www.wildml.com/2015/09/speeding-up-your-neural-network-with-theano-and-the-gpu/) on Theano, and since all our logic will stay exactly the same I won't go through optimized code here again. I defined a `RNNTheano` class that replaces the numpy calculations with corresponding calculations in Theano. Just like the rest of this post, [the code is also available on Github](https://github.com/dennybritz/rnn-tutorial-rnnlm).

# In[20]:

from rnn_theano import RNNTheano, gradient_check_theano

# In[ ]:

np.random.seed(10)
# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 5
model = RNNTheano(grad_check_vocab_size, 10)
gradient_check_theano(model, [0, 1, 2, 3], [1, 2, 3, 4])

# In[ ]:

np.random.seed(10)
model = RNNTheano(vocabulary_size)
# get_ipython().magic(u'timeit model.sgd_step(X_train[10], y_train[10], 0.005)')

# This time, one SGD step takes 70ms on my Mac (without GPU) and 23ms on a [g2.2xlarge](https://aws.amazon.com/ec2/instance-types/#g2) Amazon EC2 instance with GPU. That's a 15x improvement over our initial implementation and means we can train our model in hours/days instead of weeks. There are still a vast number of optimizations we could make, but we're good enough for now.
#
# To help you avoid spending days training a model I have pre-trained a Theano model with a hidden layer dimensionality of 50 and a vocabulary size of 8000. I trained it for 50 epochs in about 20 hours. The loss was still decreasing and training longer would probably have resulted in a better model, but I was running out of time and wanted to publish this post. Feel free to try it out yourself and train for longer. You can find the model parameters in `data/trained-model-theano.npz` in the Github repository and load them using the `load_model_parameters_theano` method:

# In[ ]:

from utils import load_model_parameters_theano, save_model_parameters_theano
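# A short, hedged sketch of the load/save round trip described above, using the
# pre-trained parameters mentioned in the text (hidden_dim=50, vocabulary size 8000).
model = RNNTheano(vocabulary_size, hidden_dim=50)
load_model_parameters_theano('./data/trained-model-theano.npz', model)
# ...optionally continue training, then persist the updated weights:
# save_model_parameters_theano('./data/trained-model-theano.npz', model)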
'''
np.random.seed(10)  # FLAG
# Train on a small subset of the data to see what happens
model = RNNNumpy(vocabulary_size)
losses = train_with_sgd(model, X_train[:100], y_train[:100], nepoch=10, evaluate_loss_after=1)

np.random.seed(10)  # FLAG
model = RNNTheano(vocabulary_size)
model.sgd_step(X_train[10], y_train[10], 0.005)
'''

from utils import load_model_parameters_theano, save_model_parameters_theano

model = RNNTheano(vocabulary_size, hidden_dim=50)
# losses = train_with_sgd(model, X_train, y_train, nepoch=50)
# save_model_parameters_theano('./data/trained-model-theano.npz', model)
load_model_parameters_theano('/home/ihasdapie/Documents/AI/Data/trained-model-theano.npz', model)

def generate_sentence(model):
    # We start the sentence with the start token
    new_sentence = [word_to_index[sentence_start_token]]
    # Repeat until we get an end token
    while not new_sentence[-1] == word_to_index[sentence_end_token]:
        next_word_probs = model.forward_propagation(new_sentence)
        sampled_word = word_to_index[unknown_token]
        # We don't want to sample unknown words
        while sampled_word == word_to_index[unknown_token]:
            samples = np.random.multinomial(1, next_word_probs[-1])
            sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
class RNNLM:
    def __init__(self):
        self.unknown_token = "UNKNOWN_TOKEN"
        self.sentence_start_token = "SENTENCE_START"
        self.sentence_end_token = "SENTENCE_END"
        self.index_to_word = None
        self.word_to_index = None
        self.model = None

    def tokenize_data(self, n=-1):
        # download dependent nltk resources if you haven't.
        # nltk.download('punkt')
        # Read the data and append SENTENCE_START and SENTENCE_END tokens
        print "Reading sentences from gutenberg corpus ..."
        from nltk.corpus import gutenberg
        tokenized_sentences = []
        for s in gutenberg.sents('austen-emma.txt'):
            tokenized_sentences.append([self.sentence_start_token] + s[1:-1] + [self.sentence_end_token])
        print "Parsed %d sentences." % (len(tokenized_sentences))
        if n > 0:
            tokenized_sentences = tokenized_sentences[:n]

        # count the word frequencies
        word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
        print "Found %d unique words tokens." % len(word_freq.items())
        self.vocabulary_size = int(len(word_freq.items()) * 0.95)

        # get the most common words, treat other words as unknown.
        vocab = word_freq.most_common(self.vocabulary_size - 1)
        print "Using vocabulary size %d." % self.vocabulary_size
        print "The least frequent word is '%s' and appeared %d times." % \
            (vocab[-1][0], vocab[-1][1])
        self.index_to_word = [x[0] for x in vocab]
        self.index_to_word.append(self.unknown_token)
        self.word_to_index = dict([(w, i) for i, w in enumerate(self.index_to_word)])

        # replace all words not in our vocabulary with the unknown token
        for i, sent in enumerate(tokenized_sentences):
            tokenized_sentences[i] = [w if w in self.word_to_index else self.unknown_token for w in sent]

        # create training data
        x_train = np.asarray([[self.word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
        y_train = np.asarray([[self.word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])
        print ""
        print "Example sentence: '%s'" % tokenized_sentences[0]
        print "By word indexes: '%s'" % \
            [self.word_to_index[w] for w in tokenized_sentences[0]]
        return (x_train, y_train)

    def train_numpy(self, x_train, y_train, iterations):
        self.model = RNNNumpy(word_dim=self.vocabulary_size, hidden_dim=100, bptt_truncate=4)
        self.model.sgd(x_train, y_train, 0.01, iterations)

    def train_theano(self, x_train, y_train, iterations):
        self.model = RNNTheano(word_dim=self.vocabulary_size, hidden_dim=100, bptt_truncate=4)
        self.model.sgd(x_train, y_train, 0.01, iterations)

    def train_lstm_theano(self, x_train, y_train, iterations):
        self.model = RNNTheano(word_dim=self.vocabulary_size, hidden_dim=100, bptt_truncate=4)
        self.model.sgd(x_train, y_train, 0.01, iterations)

    def generate_sentence(self):
        # repeat until we get an end token
        sentence_start_idx = self.word_to_index[self.sentence_start_token]
        sentence_end_idx = self.word_to_index[self.sentence_end_token]
        unknown_word_idx = self.word_to_index[self.unknown_token]
        # start the sentence with the start token
        new_sentence = [sentence_start_idx]
        while new_sentence[-1] != sentence_end_idx:
            next_word_probs = self.model.forward_propagation(new_sentence)
            sampled_word = unknown_word_idx
            # skip unknown words
            while sampled_word == unknown_word_idx or \
                    sampled_word == sentence_start_idx:
                samples = np.random.multinomial(1, next_word_probs[0])
                sampled_word = np.argmax(samples)
            new_sentence.append(sampled_word)
        return new_sentence

    def generate_sentences(self, num_sentences, min_length):
        for i in xrange(num_sentences):
            sent = []
            # We want long sentences, not sentences with one or two words
            while len(sent) < min_length:
                sent = self.generate_sentence()
            sent_str = [self.index_to_word[x] for x in sent[1:-1]]
            print " ".join(sent_str).encode('utf-8')
            print ""
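# A hedged usage sketch for the RNNLM class above: tokenize a slice of the Gutenberg
# corpus, train the Theano model, then sample sentences. The sentence limit and
# iteration count are illustrative values, not taken from the original source.
lm = RNNLM()
x_train, y_train = lm.tokenize_data(n=1000)   # use the first 1000 sentences
lm.train_theano(x_train, y_train, iterations=5)
lm.generate_sentences(num_sentences=10, min_length=7)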
def train_lstm_theano(self, x_train, y_train, iterations):
    self.model = RNNTheano(word_dim=self.vocabulary_size, hidden_dim=100, bptt_truncate=4)
    self.model.sgd(x_train, y_train, 0.01, iterations)
def load_trained_model():
    model = RNNTheano(Config._VOCABULARY_SIZE, hidden_dim=Config._HIDDEN_DIM)
    print 'start loading...'
    load_model_parameters_theano(Config._MODEL_FILE, model)
    print 'load over!'
    return model
def train_numpy(self, x_train, y_train, iterations):
    self.model = RNNNumpy(word_dim=self.vocabulary_size, hidden_dim=100, bptt_truncate=4)
    self.model.sgd(x_train, y_train, 0.01, iterations)
# Get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])

print "Using vocabulary size %d." % vocabulary_size
print "The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1])

# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]
'''

# Create the training data
# X_train = np.asarray([[ord(char)] for char in chars])
# y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])
X_train = np.asarray(sentences_tokens_x)
y_train = np.asarray(sentences_tokens_y)

model = RNNTheano(vocabulary_size, hidden_dim=_HIDDEN_DIM)
t1 = time.time()
model.sgd_step(X_train[10], y_train[10], _LEARNING_RATE)
t2 = time.time()
print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)
if _MODEL_FILE != None:
    load_model_parameters_theano(_MODEL_FILE, model)
train_with_sgd(model, X_train, y_train, nepoch=_NEPOCH, learning_rate=_LEARNING_RATE)
def train_lstm_theano(self, x_train, y_train, iterations):
    self.model = RNNTheano(word_dim=self.vocabulary_size, hidden_dim=100, bptt_truncate=4)
    self.model.sgd(x_train, y_train, 0.01, iterations)
# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
# print 'Found %d unique words tokens.' % len(word_freq.items())

# Get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(_VOCABULARY_SIZE - 1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])

with open(_PICKLE_IDX_WRD_FILE, 'wb') as pkl_file:
    pickle.dump(index_to_word, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)
with open(_PICKLE_WRD_IDX_FILE, 'wb') as pkl_file:
    pickle.dump(word_to_index, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

model = RNNTheano(_VOCABULARY_SIZE, hidden_dim=_HIDDEN_DIM)
load_model_parameters_theano(_MODEL_FILE, model)

def clean_up_sentence(sent):
    clean_dict = {' ,': ',', ' !': '!', ' :': ':', ' ?': '?',
                  ' .': '.', ' \'': '\'', ' --': '--'}
    for bad, good in clean_dict.iteritems():
        sent = sent.replace(bad, good)
    pms = clean_dict.values()
ALPHA = 0.015
EPOCHS = 5
HIDDEN_LAYER_SIZE = 50
PRELOAD_WEIGHTS = False

# read the vocab
index_to_word, word_to_index = load_vocab(vocab_file)
VOCAB_SIZE = len(index_to_word)

# adding special symbols
index_to_word[VOCAB_SIZE] = sentence_start_token
index_to_word[VOCAB_SIZE + 1] = sentence_end_token
word_to_index[sentence_start_token] = VOCAB_SIZE
word_to_index[sentence_end_token] = VOCAB_SIZE + 1

rnn = RNNTheano(VOCAB_SIZE + SPEC_SYMBOLS_COUNT, hidden_dim=HIDDEN_LAYER_SIZE)

if PRELOAD_WEIGHTS:
    print "preloading weights"
    rnn.preload_weights(weights_file)
    train_loss = []
    test_loss = []
else:
    print "training the model"
    train_loss = []
    test_loss = []
    for e in range(EPOCHS):
        i = 0
        print("--- Epoch " + str(e + 1) + " ---")
        train_loss.append(rnn.total_loss(itertools.islice(load_songs(train_file, word_to_index), MAX_L_SENTENCES)))
        test_loss.append(rnn.total_loss(itertools.islice(load_songs(test_file, word_to_index), MAX_L_SENTENCES)))
        sentences = load_songs(train_file, word_to_index)
def generate_sentence(model, index_to_word, word_to_index, min_length=5):
    # We start the sentence with the start token
    new_sentence = [word_to_index[SENTENCE_START_TOKEN]]
    # Repeat until we get an end token
    while not new_sentence[-1] == word_to_index[SENTENCE_END_TOKEN]:
        next_word_probs = model.predict(new_sentence)[-1]
        samples = np.random.multinomial(1, next_word_probs)
        sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
        # Sometimes we get stuck if the sentence becomes too long, e.g. "........" :(
        # And: We don't want sentences with UNKNOWN_TOKEN's
        if len(new_sentence) > 100 or sampled_word == word_to_index[UNKNOWN_TOKEN]:
            return None
    if len(new_sentence) < min_length:
        return None
    return new_sentence

# cProfile.run("model.numpy_sdg_step(X_train[10], y_train[10], 0.005)")
# print("----------------------------------------------------------------")
np.random.seed(10)
model_theano = RNNTheano(vocabulary_size)
# cProfile.run("model_theano.train_with_sgd(X_train[10], y_train[10], 0.005)")
print("----------------------------------------------------------------")
# losses_numpy = model.train_with_sgd(X_train[:100], y_train[:100], nepoch=5, evaluate_loss_after=1)
losses_theano = model_theano.train_with_sgd(X_train[:100], y_train[:100], nepoch=5, evaluate_loss_after=1)

generated_sentence = generate_sentence(model_theano, index_to_word, word_to_index)
print generated_sentence
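# A short follow-up sketch: generate_sentence() returns None for sentences that grow
# past 100 tokens, sample UNKNOWN_TOKEN, or end up shorter than min_length, so keep
# sampling until enough sentences survive and then map indices back to words.
# Assumes index_to_word and word_to_index are still in scope from the preprocessing step.
num_sentences = 5
generated = []
while len(generated) < num_sentences:
    sent = generate_sentence(model_theano, index_to_word, word_to_index)
    if sent is not None:
        generated.append(" ".join(index_to_word[idx] for idx in sent[1:-1]))
for s in generated:
    print s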
class LM_With_RNN:
    def __init__(self, texts):
        # Create the training data
        xy = self.preprocess_text(texts)
        self.X_train = xy['x']
        self.y_train = xy['y']
        self.model = RNNTheano(_VOCABULARY_SIZE, hidden_dim=_HIDDEN_DIM)
        self.train_with_sgd()

    def train_with_sgd(self, nepoch=_NEPOCH, evaluate_loss_after=5, learning_rate=_LEARNING_RATE):
        # We keep track of the losses so we can plot them later
        losses = []
        num_examples_seen = 0
        for epoch in range(nepoch):
            # Optionally evaluate the loss
            if (epoch % evaluate_loss_after == 0):
                loss = self.model.calculate_loss(self.X_train, self.y_train)
                losses.append((num_examples_seen, loss))
                # Adjust the learning rate if loss increases
                if (len(losses) > 1 and losses[-1][1] > losses[-2][1]):
                    learning_rate = learning_rate * 0.5
            # For each training example...
            for i in range(len(self.y_train)):
                # One SGD step
                self.model.sgd_step(self.X_train[i], self.y_train[i], learning_rate)
                num_examples_seen += 1
        return self.model

    def calculate_score(self, text):
        texts = [text]
        xy = self.preprocess_text(texts)
        X_train = xy['x']
        y_train = xy['y']
        o = self.model.forward_propagation(X_train[0])
        p = 0
        i = -1
        for w in X_train[0]:
            i += 1
            p += -1 * np.log10(o[i][w])
        return p

    def preprocess_text(self, texts, vocabulary_size=_VOCABULARY_SIZE):
        unknown_token = "UNKNOWN_TOKEN"
        sentence_start_token = "SENTENCE_START"
        sentence_end_token = "SENTENCE_END"

        # Split full comments into sentences
        # sentences = itertools.chain(*[nltk.sent_tokenize(x.decode('utf-8').lower()) for x in texts])
        sentences = itertools.chain(*[nltk.sent_tokenize(x.lower()) for x in texts])
        # Append SENTENCE_START and SENTENCE_END
        sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
        # Tokenize the sentences into words
        tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
        # Count the word frequencies
        word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
        # Get the most common words and build index_to_word and word_to_index vectors
        if vocabulary_size == -1:
            vocab = word_freq.elements()
        else:
            vocab = word_freq.most_common(vocabulary_size - 1)
        index_to_word = [x[0] for x in vocab]
        index_to_word.append(unknown_token)
        word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])
        # Replace all words not in our vocabulary with the unknown token
        for i, sent in enumerate(tokenized_sentences):
            tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]
        # Create the training data
        X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
        y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])
        return {
            'x': X_train,
            'y': y_train,
            'index_to_word': index_to_word,
            'word_to_index': word_to_index,
        }
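# A hedged usage sketch for LM_With_RNN: calculate_score() sums the per-word negative
# log10 probabilities under the trained model, so lower scores mean the model considers
# the text more likely. The corpus below is illustrative only, and the module-level
# constants (_VOCABULARY_SIZE, _HIDDEN_DIM, _NEPOCH, _LEARNING_RATE) are assumed to be set.
corpus = ["the cat sat on the mat .", "the dog chased the ball ."]
lm = LM_With_RNN(corpus)
print lm.calculate_score("the cat sat on the mat .")
print lm.calculate_score("colorless green ideas sleep furiously .")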
_LEARNING_RATE = float(os.environ.get('LEARNING_RATE', '0.005'))
_NEPOCH = int(os.environ.get('NEPOCH', '100'))
_MODEL_FILE = os.environ.get('MODEL_FILE')

vocabulary_size = _VOCABULARY_SIZE
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

print "Reading the vocabulary..."
with open('data/vocab', 'r') as vfile:
    index_to_word, word_to_index = pickle.load(vfile)
print "Using vocabulary size %d." % vocabulary_size

model = RNNTheano(vocabulary_size, hidden_dim=_HIDDEN_DIM)
load_model_parameters_theano(
    './data/rnn-theano-80-8000-2017-07-14-01-15-51.npz', model)

def generate_sentence(model):
    # We start the sentence with the start token
    new_sentence = [word_to_index[sentence_start_token]]
    # Repeat until we get an end token
    while not new_sentence[-1] == word_to_index[sentence_end_token]:
        next_word_probs = model.forward_propagation(new_sentence)
        sampled_word = word_to_index[unknown_token]
        # We don't want to sample unknown words
        while sampled_word == word_to_index[unknown_token]:
            samples = np.random.multinomial(1, next_word_probs[-1])
            sampled_word = np.argmax(samples)
# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 100
np.random.seed(10)
model = RNNNumpy(grad_check_vocab_size, 10, bptt_truncate=1000)
model.gradient_check([0, 1, 2, 3], [1, 2, 3, 4])

np.random.seed(10)
model = RNNNumpy(vocabulary_size)
t0 = time()
model.sgd_step(X_train[10], y_train[10], 0.005)
print time() - t0

np.random.seed(10)
# Train on a small subset of the data to see what happens
model = RNNTheano(vocabulary_size)
# load_model_parameters_theano("model.data", model)
# train_with_sgd(model, X_train, y_train, nepoch=100, evaluate_loss_after=10)
# model = RNNNumpy(vocabulary_size)
train_with_sgd(model, X_train, y_train, nepoch=10, evaluate_loss_after=1)
# train_with_sgd(model, X_train[:100], y_train[:100], nepoch=10, evaluate_loss_after=1)
# with open("model.data", "w") as out_file:
#     save_model_parameters_theano(out_file, model)

def generate_sentence(model):
    # We start the sentence with the start token
    new_sentence = [char_map[sentence_start]]
    # Repeat until we get an end token
    while not new_sentence[-1] == char_map[sentence_end]:
        next_word_probs = model.forward_propagation(new_sentence)
print "Using vocabulary size %d." % vocabulary_size print "The least frequent word in our vocabulary is '%s' and appeared %d times." % ( vocab[-1][0], vocab[-1][1]) # Replace all words not in our vocabulary with the unknown token for i, sent in enumerate(tokenized_sentences): tokenized_sentences[i] = [ w if w in word_to_index else unknown_token for w in sent ] # Create the training data X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences]) y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences]) model = RNNTheano(vocabulary_size, hidden_dim=_HIDDEN_DIM) t1 = time.time() model.sgd_step(X_train[10], y_train[10], _LEARNING_RATE) t2 = time.time() print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.) if _MODEL_FILE != None: load_model_parameters_theano(_MODEL_FILE, model) train_with_sgd(model, X_train, y_train, nepoch=_NEPOCH, learning_rate=_LEARNING_RATE)