Пример #1
def train_wikipedia(we_file='./rnn_class/word_embeddings.npy',
                    w2i_file='./rnn_class/wikipedia_word2idx.json'):
    """Build an RNN over the corpus vocabulary and save its embeddings.

    Args:
        we_file: Path the embedding matrix (``rnn.We``) is written to with
            ``np.save``.
        w2i_file: Path the word-to-index mapping is written to as JSON.
            NOTE(review): the original parameter list was cut off after
            ``we_file``; this default is an assumption placed next to
            ``we_file`` -- confirm the intended path.
    """
    # Data source: either Wikipedia or the Brown corpus (from NLTK).
    # To switch, comment one loader out and uncomment the other.

    # Wikipedia data
    # sentences, word2idx = get_wikipedia_data(n_files=100, n_vocab=2000)
    # Brown corpus
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab()

    print('finished retrieving data')
    print('vocab size:', len(word2idx), 'number of sentences:', len(sentences))
    rnn = RNN(30, [30], len(word2idx))
    # NOTE(review): no rnn.fit(...) call is visible between construction and
    # saving, so whatever `rnn.We` holds at this point is written as-is --
    # confirm whether a training step was dropped here.

    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)
Пример #2
def train_corpus(we_file="word_embeddings.npy", w2i_file='corpus_word2idx.json', RecurrentUnit=GRU):
    """Fit an RNN on the limited-vocabulary corpus and persist the results.

    The embedding matrix (``We``) is written to ``we_file`` via ``np.save``
    and the word-to-index mapping is written to ``w2i_file`` as JSON.
    ``RecurrentUnit`` is accepted but not referenced in this body.
    """
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab()
    print("finished retrieving data")
    vocab_size = len(word2idx)
    print("vocab size: ", vocab_size, " number of sentences:", len(sentences))

    # Hyper-parameters handed to RNN.fit, collected in one place.
    fit_options = dict(
        learning_rate=10e-6,
        epochs=10,
        show_fig=True,
        activation=T.nnet.relu,
    )
    model = RNN(50, [50], vocab_size)
    model.fit(sentences, **fit_options)

    # Persist the embeddings first, then the vocabulary mapping.
    embeddings = model.We.get_value()
    np.save(we_file, embeddings)
    with open(w2i_file, 'w') as handle:
        json.dump(word2idx, handle)
Пример #3
def train_wikipedia(we_file='word_embeddings.npy', w2i_file='wikipedia_word2idx.json', RecurrentUnit=GRU):
    """Fit an RNN on the corpus and save its embeddings and vocabulary map.

    Args:
        we_file: Path the embedding matrix (``rnn.We``) is written to with
            ``np.save``.
        w2i_file: Path the word-to-index mapping is written to as JSON.
        RecurrentUnit: Accepted for interface compatibility; not referenced
            in this body.
    """
    # Alternative data source (Wikipedia instead of Brown):
    # sentences, word2idx = get_wikipedia_data(n_files=100, n_vocab=2000)
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=2000)

    # Single-argument print(...) produces the same text on Python 2 and 3;
    # the former print statements were a syntax error on Python 3.
    print("finished retrieving data")
    print("vocab size: %d number of sentences: %d" % (len(word2idx), len(sentences)))

    rnn = RNN(50, [50], len(word2idx))
    rnn.fit(sentences, learning_rate=10e-6, epochs=10, show_fig=True, activation=T.nnet.relu)

    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)
Пример #4
def train_wikipedia(we_file='word_embeddings.npy', w2i_file='wikipedia_word2idx.json', RecurrentUnit=GRU):
    """Fit an RNN on the corpus and save its embeddings and vocabulary map.

    Args:
        we_file: Path the embedding matrix (``rnn.We``) is written to with
            ``np.save``.
        w2i_file: Path the word-to-index mapping is written to as JSON.
        RecurrentUnit: Accepted for interface compatibility; not referenced
            in this body.
    """
    # Data source: either Wikipedia or the Brown corpus.
    # To switch, comment one loader out and uncomment the other.
    # sentences, word2idx = get_wikipedia_data(n_files=100, n_vocab=2000)
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab()

    # Single-argument print(...) produces the same text on Python 2 and 3;
    # the former print statements were a syntax error on Python 3.
    print("finished retrieving data")
    print("vocab size: %d number of sentences: %d" % (len(word2idx), len(sentences)))

    rnn = RNN(30, [30], len(word2idx))
    rnn.fit(sentences, learning_rate=2*1e-4, epochs=10, show_fig=True, activation=T.nnet.relu)

    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)
Пример #5
def main():
    # Trains a V x V softmax next-word model, one gradient step per sentence,
    # on the sentences returned by brown.get_sentences_with_word2idx_limit_vocab.
    # NOTE(review): this snippet has lost lines in several places (flagged
    # below) and does not parse as shown; the notes mark what is missing.

    sentences, word2idx = brown.get_sentences_with_word2idx_limit_vocab(
        2000, 20000)
    #sentences, word2idx = brown.get_sentences_with_word2idx_limit_vocab(10,10)
    V = len(word2idx)
    print(f"word total: {V}")
    start_idx = word2idx['START']
    end_idx = word2idx['END']
    print(f'Start index={start_idx} and End index = {end_idx}')

    #train a logistic model
    W = np.random.randn(V, V) / np.sqrt(
        V)  #initial random values to W of shape V x V

    # NOTE(review): `losses` is never appended to in the visible code.
    losses = []
    epochs = 1
    lr = 1e-2

    t0 = datetime.now()
    for epoch in range(epochs):
        print(f"In iteration NO.{epoch}")
        #shuffle sentences each epoch
        # NOTE(review): no shuffle call is visible here.

        j = 0  #sentence counter
        for sentence in sentences:
            #convert sentence into one-hot coded inputs and targets
            sentence = [start_idx] + sentence + [end_idx]
            n = len(sentence)
            print(f"Length of sentence {n}")

            inputs = np.zeros((n - 1, V))
            targets = np.zeros((n - 1, V))
            # NOTE(review): the next two statements have lost their opening
            # text; presumably `inputs[` and `targets[` index assignments
            # that set the one-hot entries -- confirm against the original.
                np.arange(n - 1),
                sentence[:n -
                         1]] = 1  #the sentence itself, ignoring the end index, shape n-1 x V
                np.arange(n - 1),
                sentence[1:]] = 1  #the next word of the target, shape n-1 x V
            # one-hot encoding of word vectors
            #print (f'inputs:\n{inputs.shape}')

            #get output prediction
            predictions = softmax(inputs.dot(W))  #shape n-1 x V
            #print(f"Shape of predictions after softmax {predictions.shape}")#one for each word in the sentence
            #do a gradient descent step
            W = W - lr * inputs.T.dot(predictions - targets)

            #keep track of the loss - cross entropy cost function
            # summed over all positions and divided by the n-1 positions
            loss = -np.sum(targets * np.log(predictions)) / (
                n - 1)  #array multiplication
           #keep track of the bigram loss
           #only do it for the first epoch to avoid redundancy
           # NOTE(review): this section is indented one column short of the
           # surrounding loop body, and `W_bigram` is not defined anywhere
           # in the visible code.
           if epoch ==0:
               bigram_predictions = softmax(inputs.dot(W_bigram))
               bigram_loss = -np.sum(targets*np.log(bigram_predictions))/(n-1)
            # NOTE(review): the body below is a bare f-string expression; the
            # enclosing print( call appears to have been dropped.
            if j % 1000 == 0:
                    f"epoch: {epoch}, sentence: {j}/{len(sentences)}, loss: {loss}"

            j += 1

            # NOTE(review): this `if` has no body in the visible code.
            if j == 2:
        print(f"Elapsed time training: {datetime.now()-t0}")
       #plot a horizontal line for the bigram loss
       # NOTE(review): `avg_bigram_loss` is not defined in the visible code,
       # and these lines are indented one column short of their neighbours.
       print('avg_bigram_loss', avg_bigram_loss)
       plt.axhline(y=avg_bigram_loss, color='r', linestyle='-')

        #plot a smoothed losses line to reduce variability
        # Returns an exponentially decayed running average of x; each value is
        # divided by (1 - decay**(t + 1)) to compensate for starting at 0.
        # NOTE(review): the helper is defined but not called in the visible code.
        def smoothed_loss(x, decay=0.99):
            y = np.zeros(len(x))
            last = 0
            for t in range(len(x)):
                z = decay * last + (1 - decay) * x[t]
                y[t] = z / (1 - decay**(t + 1))
                last = z
            return y

Пример #6
def train_wikipedia(we_file='word_embeddings.npy', w2i_file='wikipedia_word2idx.json', RecurrentUnit=GRU):
    # Loads the sentences and the word-to-index mapping.
    # NOTE(review): the body ends right after data loading in this snippet;
    # we_file, w2i_file and RecurrentUnit are not used by the visible code --
    # presumably the remaining steps were cut off.
    # there are 32 files
    ### note: you can pick between Wikipedia data and Brown corpus
    ###       just comment one out, and uncomment the other!
    # sentences, word2idx = get_wikipedia_data(n_files=100, n_vocab=2000)
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab()
from brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx

from markov import get_bigram_probs

def softmax(a):
    """Return the row-wise softmax of a 2-D array.

    Args:
        a: Array of shape (N, K); each row is normalised independently.

    Returns:
        Array of shape (N, K) whose rows are non-negative and sum to 1.
    """
    # Subtract each row's own maximum rather than the global maximum.
    # The result is mathematically unchanged, and it avoids both overflow
    # and the 0/0 = NaN that occurs when an entire row lies far below the
    # global maximum and every exponential in it underflows to zero.
    shifted = a - a.max(axis=1, keepdims=True)
    exp_a = np.exp(shifted)
    return exp_a / exp_a.sum(axis=1, keepdims=True)

# Script entry point: load the corpus, report the vocabulary size, and look
# up the indexes of the sentence-boundary tokens.
if __name__ == '__main__':
    # load in the data
    # note: sentences are already converted to sequences of word indexes
    # note: you can limit the vocab size if you run out of memory
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(2000)
    # sentences, word2idx = get_sentences_with_word2idx()

    # vocab size
    V = len(word2idx)
    print("Vocab size:", V)

    # we will also treat beginning of sentence and end of sentence as bigrams
    # START -> first word
    # last word -> END
    # (raises KeyError if either token is absent from word2idx)
    start_idx = word2idx['START']
    end_idx = word2idx['END']

    # a matrix where:
    # row = last word
    # col = current word
    # NOTE(review): the statement this comment describes is not visible in
    # this snippet.
Пример #8
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range, input
import numpy as np
import matplotlib.pyplot as plt
import random
from datetime import datetime
import os
import sys
from rnn_class.util import get_wikipedia_data
from brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx
from markov import get_bigram_probs

# Script entry point: load the corpus, compute bigram probabilities, and
# randomly initialise two weight matrices.
if __name__ == '__main__':
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(2000) # returns indexed sentence and word to index conversion

    # vocab size
    V = len(word2idx)
    print("Vocab length:", V)

    # special flags for beginning and end of sentence.
    # (raises KeyError if either token is absent from word2idx)
    start_idx = word2idx['START']
    end_idx = word2idx['END']

    # NOTE(review): presumably a V x V table of bigram probabilities with
    # additive smoothing -- confirm against markov.get_bigram_probs.
    bigram_probs = get_bigram_probs(sentences, V, start_idx, end_idx, smoothing=0.1)

    D = 137     # shape of the 1st hidden layer H
    # Standard-normal draws divided by the square root of each matrix's row
    # count: W1 has shape (V, D), W2 has shape (D, V).
    W1 = np.random.randn(V, D)/np.sqrt(V)
    W2 = np.random.randn(D, V)/np.sqrt(D)
Пример #9
def main(we_file, w2i_file, use_brown=True, n_files=100):
    # Fits a Glove model, loading word2idx from w2i_file when the
    # co-occurrence matrix file already exists on disk.
    # NOTE(review): this snippet has lost lines (flagged below) and does not
    # parse as shown; `we_file` is not used by the visible code.
    if use_brown:
        cc_matrix = "cc_matrix_brown.npy"
        # NOTE(review): as shown, the next line immediately overwrites the
        # Brown filename; presumably an `else:` was dropped before it.
        cc_matrix = "cc_matrix_%s.npy" % n_files

    # hacky way of checking if we need to re-load the raw data or not
    # remember, only the co-occurrence matrix is needed for training
    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = []  # dummy - we won't actually use it
        # NOTE(review): presumably an `else:` for the "matrix file missing"
        # case was dropped here; as shown, the data loading below sits
        # inside the "file exists" branch.
        if use_brown:
            # NOTE(review): the contents and closing `])` of this set literal
            # are missing from the snippet.
            keep_words = set([
            sentences, word2idx = get_sentences_with_word2idx_limit_vocab(
                n_vocab=5000, keep_words=keep_words)
            # NOTE(review): presumably the non-Brown (`else:`) branch; the
            # call's remaining arguments and closing parenthesis are missing.
            sentences, word2idx = get_wikipedia_data(n_files=n_files,

        # Save the mapping so a later run can load it from w2i_file.
        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(100, V, 10)

    # alternating least squares method
    model.fit(sentences, cc_matrix=cc_matrix)