Example #1
def train_wikipedia(we_file='./rnn_class/word_embeddings.npy',
                    w2i_file='./rnn_class/wikipedia_word2idx.json',
                    RecurrentUnit=LSTM):
    # there are 32 files
    ### note: you can pick between Wikipedia data and Brown corpus
    ###       just comment one out, and uncomment the other!

    # Wikipedia data
    # sentences, word2idx = get_wikipedia_data(n_files=100, n_vocab=2000)
    # use brown from NLTK
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab()

    print('finished retrieving data')
    print('vocab size:', len(word2idx), 'number of sentences:', len(sentences))
    rnn = RNN(30, [30], len(word2idx))
    rnn.fit(sentences,
            learning_rate=1e-5,
            epochs=20,
            activation=T.nnet.relu,
            show_fig=True,
            RecurrentUnit=RecurrentUnit)

    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)
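Loading the artifacts saved by this function back into memory is straightforward; a minimal sketch using only standard NumPy/JSON calls and the default file names from the signature above:

import json
import numpy as np

# load the trained word embeddings and the word-to-index mapping saved by train_wikipedia()
We = np.load('./rnn_class/word_embeddings.npy')
with open('./rnn_class/wikipedia_word2idx.json') as f:
    word2idx = json.load(f)

idx2word = {i: w for w, i in word2idx.items()}
print('embedding matrix shape:', We.shape)  # expected (vocab size, embedding dim)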
Example #2
def train_corpus(we_file="word_embeddings.npy", w2i_file='corpus_word2idx.json', RecurrentUnit=GRU):
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab()
    print("finished retrieving data")
    print("vocab size: ", len(word2idx), " number of sentences:", len(sentences))

    rnn = RNN(50, [50], len(word2idx))
    rnn.fit(sentences, learning_rate=10e-6, epochs=10, show_fig=True, activation=T.nnet.relu)
   
    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f_out:
        json.dump(word2idx, f_out)
Example #3
def train_wikipedia(we_file='word_embeddings.npy', w2i_file='wikipedia_word2idx.json', RecurrentUnit=GRU):
    # WIKIPEDIA
    # sentences, word2idx = get_wikipedia_data(n_files=100, n_vocab=2000)
    # vs BROWN
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=2000)
    print("finished retrieving data")
    print("vocab size:", len(word2idx), "number of sentences:", len(sentences))
    rnn = RNN(50, [50], len(word2idx))
    rnn.fit(sentences, learning_rate=10e-6, epochs=10, show_fig=True, activation=T.nnet.relu)

    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)
Example #4
def train_wikipedia(we_file='word_embeddings.npy', w2i_file='wikipedia_word2idx.json', RecurrentUnit=GRU):
    # there are 32 files
    ### note: you can pick between Wikipedia data and Brown corpus
    ###       just comment one out, and uncomment the other!
    # sentences, word2idx = get_wikipedia_data(n_files=100, n_vocab=2000)
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab()
    print "finished retrieving data"
    print "vocab size:", len(word2idx), "number of sentences:", len(sentences)
    rnn = RNN(30, [30], len(word2idx))
    rnn.fit(sentences, learning_rate=2*1e-4, epochs=10, show_fig=True, activation=T.nnet.relu)

    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)
Example #5
def main():

    sentences, word2idx = brown.get_sentences_with_word2idx_limit_vocab(
        2000, 20000)
    #sentences, word2idx = brown.get_sentences_with_word2idx_limit_vocab(10,10)
    V = len(word2idx)
    print(f"word total: {V}")
    start_idx = word2idx['START']
    end_idx = word2idx['END']
    print(f'Start index={start_idx} and End index = {end_idx}')

    # train a logistic model
    W = np.random.randn(V, V) / np.sqrt(V)  # initialize W (shape V x V) with small random values
    # print(f'W\n{W}')

    losses = []
    epochs = 1
    lr = 1e-2

    t0 = datetime.now()
    for epoch in range(epochs):
        print(f"In iteration NO.{epoch}")
        #suffle sentences each epoch
        random.shuffle(sentences)

        j = 0  # sentence counter
        for sentence in sentences:
            # surround the sentence with START/END tokens
            sentence = [start_idx] + sentence + [end_idx]
            # print(sentence)
            n = len(sentence)
            # print(f"sentence length: {n}")

            # one-hot encode the inputs (all words but the last) and the
            # targets (all words but the first); both have shape (n-1, V)
            inputs = np.zeros((n - 1, V))
            targets = np.zeros((n - 1, V))
            inputs[np.arange(n - 1), sentence[:n - 1]] = 1
            targets[np.arange(n - 1), sentence[1:]] = 1
            # print(f'inputs:\n{inputs.shape}')
            # print(f'targets:\n{targets.shape}')

            # get output predictions, shape (n-1, V): one distribution per input word
            predictions = softmax(inputs.dot(W))
            # print(f"predictions shape: {predictions.shape}")
            # print(f"predictions:\n{predictions}")

            # do a gradient descent step
            W = W - lr * inputs.T.dot(predictions - targets)

            # keep track of the loss (cross-entropy cost), averaged over the sentence
            loss = -np.sum(targets * np.log(predictions)) / (n - 1)
            losses.append(loss)
            # keep track of the bigram loss
            # only do it for the first epoch to avoid redundancy
            # if epoch == 0:
            #     bigram_predictions = softmax(inputs.dot(W_bigram))
            #     bigram_loss = -np.sum(targets * np.log(bigram_predictions)) / (n - 1)
            #     bigram_losses.append(bigram_loss)
            if j % 1000 == 0:
                print(f"epoch: {epoch}, sentence: {j}/{len(sentences)}, loss: {loss}")

            j += 1

            # uncomment to stop early when debugging
            # if j == 2:
            #     break
        print(f"Elapsed time training: {datetime.now()-t0}")
        plt.plot(losses)
        # plot a horizontal line for the bigram loss
        # avg_bigram_loss = np.mean(bigram_losses)
        # print('avg_bigram_loss', avg_bigram_loss)
        # plt.axhline(y=avg_bigram_loss, color='r', linestyle='-')

        # plot a smoothed loss curve to reduce variability
        def smoothed_loss(x, decay=0.99):
            y = np.zeros(len(x))
            last = 0
            for t in range(len(x)):
                z = decay * last + (1 - decay) * x[t]
                y[t] = z / (1 - decay**(t + 1))
                last = z
            return y

        plt.plot(smoothed_loss(losses))
        plt.show()
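Materializing full one-hot matrices of shape (n-1, V) is wasteful even for a 2000-word vocabulary. The same logistic bigram model and the same gradient step can be done by indexing rows of W directly; a minimal sketch of that variant (the helper name sgd_step_sparse is illustrative, and the scatter-add relies on np.add.at to handle repeated input words correctly):

import numpy as np

def sgd_step_sparse(W, sentence, lr, start_idx, end_idx):
    # same logistic bigram model as above, but using word indices
    # instead of dense one-hot matrices
    tokens = [start_idx] + sentence + [end_idx]
    inputs = np.array(tokens[:-1])
    targets = np.array(tokens[1:])

    # forward pass: each input word just selects a row of W
    logits = W[inputs]                                   # shape (n-1, V)
    logits = logits - logits.max(axis=1, keepdims=True)  # numeric stability
    predictions = np.exp(logits)
    predictions /= predictions.sum(axis=1, keepdims=True)

    # cross-entropy loss, averaged over the sentence
    loss = -np.mean(np.log(predictions[np.arange(len(targets)), targets]))

    # gradient (predictions - one_hot(targets)) scattered back into the
    # rows of W selected by the inputs
    grad = predictions
    grad[np.arange(len(targets)), targets] -= 1
    np.add.at(W, inputs, -lr * grad)
    return loss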
Example #6
def train_wikipedia(we_file='word_embeddings.npy', w2i_file='wikipedia_word2idx.json', RecurrentUnit=GRU):
    # there are 32 files
    ### note: you can pick between Wikipedia data and Brown corpus
    ###       just comment one out, and uncomment the other!
    # sentences, word2idx = get_wikipedia_data(n_files=100, n_vocab=2000)
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab()
Example #7
from brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx

from markov import get_bigram_probs


def softmax(a):
    a = a - a.max()  # avoid numeric overflow
    exp_a = np.exp(a)
    return exp_a / exp_a.sum(axis=1, keepdims=True)


if __name__ == '__main__':
    # load in the data
    # note: sentences are already converted to sequences of word indexes
    # note: you can limit the vocab size if you run out of memory
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(2000)
    # sentences, word2idx = get_sentences_with_word2idx()

    # vocab size
    V = len(word2idx)
    print("Vocab size:", V)

    # we will also treat beginning of sentence and end of sentence as bigrams
    # START -> first word
    # last word -> END
    start_idx = word2idx['START']
    end_idx = word2idx['END']

    # a matrix where:
    # row = last word
    # col = current word
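The snippet stops right before that matrix is built. A minimal sketch of what a smoothed bigram-probability matrix could look like, following the comment above (row = last word, col = current word); the helper name build_bigram_probs and the add-constant smoothing scheme are assumptions, not necessarily what markov.get_bigram_probs does internally:

import numpy as np

def build_bigram_probs(sentences, V, start_idx, end_idx, smoothing=1.0):
    # count transitions; row = previous word, col = current word
    counts = np.ones((V, V)) * smoothing  # add-constant smoothing avoids zero probabilities
    for sentence in sentences:
        tokens = [start_idx] + sentence + [end_idx]
        for prev, curr in zip(tokens[:-1], tokens[1:]):
            counts[prev, curr] += 1
    # normalize each row so it sums to 1 -> p(current | previous)
    return counts / counts.sum(axis=1, keepdims=True)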
Example #8
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range, input
import numpy as np
import matplotlib.pyplot as plt
import random
from datetime import datetime
import os
import sys
sys.path.append(os.path.abspath('..'))
from rnn_class.util import get_wikipedia_data
from brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx
from markov import get_bigram_probs

if __name__ == '__main__':
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(2000)  # returns sentences as sequences of word indexes plus the word-to-index mapping

    # vocab size
    V = len(word2idx)
    print("Vocab length:", V)

    # special flags for beginning and end of sentence.
    start_idx = word2idx['START']
    end_idx = word2idx['END']

    bigram_probs = get_bigram_probs(sentences, V, start_idx, end_idx, smoothing=0.1)

    D = 137     # dimensionality of the hidden layer
    W1 = np.random.randn(V, D) / np.sqrt(V)
    W2 = np.random.randn(D, V) / np.sqrt(D)
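The snippet ends after initializing the weights. A plausible forward pass for this one-hidden-layer neural bigram model is sketched below; the tanh nonlinearity is an assumption, since the snippet does not say which activation is used:

import numpy as np

def forward(inputs_onehot, W1, W2):
    # inputs_onehot: shape (n-1, V), one row per input word
    hidden = np.tanh(inputs_onehot.dot(W1))   # shape (n-1, D)
    logits = hidden.dot(W2)                   # shape (n-1, V)
    # row-wise softmax over the vocabulary
    logits = logits - logits.max(axis=1, keepdims=True)
    expl = np.exp(logits)
    return expl / expl.sum(axis=1, keepdims=True), hidden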
Example #9
def main(we_file, w2i_file, use_brown=True, n_files=100):
    if use_brown:
        cc_matrix = "cc_matrix_brown.npy"
    else:
        cc_matrix = "cc_matrix_%s.npy" % n_files

    # hacky way of checking if we need to re-load the raw data or not
    # remember, only the co-occurrence matrix is needed for training
    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = []  # dummy - we won't actually use it
    else:
        if use_brown:
            keep_words = set([
                'king',
                'man',
                'woman',
                'france',
                'paris',
                'london',
                'rome',
                'italy',
                'britain',
                'england',
                'french',
                'english',
                'japan',
                'japanese',
                'chinese',
                'italian',
                'australia',
                'australian',
                'december',
                'november',
                'june',
                'january',
                'february',
                'march',
                'april',
                'may',
                'july',
                'august',
                'september',
                'october',
            ])
            sentences, word2idx = get_sentences_with_word2idx_limit_vocab(
                n_vocab=5000, keep_words=keep_words)
        else:
            sentences, word2idx = get_wikipedia_data(n_files=n_files,
                                                     n_vocab=2000)

        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(100, V, 10)  # presumably (embedding dimension, vocab size, context window size)

    # alternating least squares method
    model.fit(sentences, cc_matrix=cc_matrix)
    model.save(we_file)
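The keep_words list above suggests the embeddings are meant for analogy queries (king - man + woman, country/capital pairs, months). A minimal sketch of such a query, assuming we_file holds a single (V, D) embedding matrix saved with np.save; model.save may in fact store the two GloVe factor matrices separately, in which case they are typically summed or averaged first. The file names below are placeholders for whatever was passed as we_file and w2i_file:

import json
import numpy as np

def analogy(pos1, neg1, pos2, We, word2idx, idx2word, top=5):
    # vector arithmetic: pos1 - neg1 + pos2, e.g. 'king' - 'man' + 'woman'
    v = We[word2idx[pos1]] - We[word2idx[neg1]] + We[word2idx[pos2]]
    # cosine similarity against every embedding
    sims = We.dot(v) / (np.linalg.norm(We, axis=1) * np.linalg.norm(v) + 1e-10)
    best = np.argsort(-sims)[:top]
    return [idx2word[i] for i in best]

We = np.load('glove_word_embeddings.npy')       # placeholder for we_file
with open('glove_word2idx.json') as f:          # placeholder for w2i_file
    word2idx = json.load(f)
idx2word = {i: w for w, i in word2idx.items()}
print(analogy('king', 'man', 'woman', We, word2idx, idx2word))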