Example no. 1
def train(X_train, y_train, vocabulary_size, hiddenDim, modelFiles):
    model = RNNTheano(vocabulary_size, hidden_dim=hiddenDim)
    t1 = time.time()
    model.sgd_step(X_train[10], y_train[10], _LEARNING_RATE)
    t2 = time.time()
    print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

    if modelFiles is not None:
        load_model_parameters_theano(modelFiles, model)

    train_with_sgd(model, X_train, y_train, nepoch=_NEPOCH, learning_rate=_LEARNING_RATE)

def train_model(x_train, y_train, training_data_name="training_data_name", load_model_file="", num_epochs=50, learning_rate=0.010, hidden_dim=100, vocab_size=8000):
    model = RNNTheano(vocab_size, hidden_dim=hidden_dim)
    t1 = time.time()
    model.sgd_step(x_train[10], y_train[10], learning_rate)
    t2 = time.time()
    print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

    if load_model_file != "":
        print "loading model: %s" % load_model_file
        load_model_parameters_theano(load_model_file, model)

    train_with_sgd(model, x_train, y_train, nepoch=num_epochs, learning_rate=learning_rate, training_data_name=training_data_name)
def train_theano():
    model = RNNTheano(Config._VOCABULARY_SIZE, hidden_dim=Config._HIDDEN_DIM)
    t1 = time.time()
    model.sgd_step(X_train[10], y_train[10], Config._LEARNING_RATE)
    t2 = time.time()
    print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

    model.train_with_sgd(X_train, y_train, nepoch=Config._NEPOCH, learning_rate=Config._LEARNING_RATE)

    if Config._MODEL_FILE is not None:
        print "start saving model..."
        save_model_parameters_theano(Config._MODEL_FILE, model)
        print "model saved!"
Example no. 4
print "Using vocabulary size %d." % vocabulary_size
print "The least frequent word in our vocabulary is '%s' and appeared %d times." % (
    vocab[-1][0], vocab[-1][1])

# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [
        w if w in word_to_index else unknown_token for w in sent
    ]

# Create the training data
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]]
                      for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]]
                      for sent in tokenized_sentences])
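# Illustration with made-up indices: for a tokenized sentence mapped to
# [0, 6, 3, 4, 1] (SENTENCE_START ... SENTENCE_END), the training pair is
#   x = [0, 6, 3, 4]   # every token except the last
#   y = [6, 3, 4, 1]   # every token except the first
# so at each position the network is trained to predict the following word.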

model = RNNTheano(vocabulary_size, hidden_dim=_HIDDEN_DIM)
t1 = time.time()
model.sgd_step(X_train[10], y_train[10], _LEARNING_RATE)
t2 = time.time()
print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

if _MODEL_FILE is not None:
    load_model_parameters_theano(_MODEL_FILE, model)

train_with_sgd(model,
               X_train,
               y_train,
               nepoch=_NEPOCH,
               learning_rate=_LEARNING_RATE)
# Get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])

print "Using vocabulary size %d." % vocabulary_size
print "The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1])

# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]
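# Illustration with made-up words: if "zymurgy" did not make the top
# (vocabulary_size - 1) most frequent words, then
#   ["i", "love", "zymurgy"]  ->  ["i", "love", "UNKNOWN_TOKEN"]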
# Create the training data
#X_train = np.asarray([[ord(char)] for char in chars])
#y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])
X_train = np.asarray(sentences_tokens_x)
y_train = np.asarray(sentences_tokens_y)

model = RNNTheano(vocabulary_size, hidden_dim=_HIDDEN_DIM)
t1 = time.time()

model.sgd_step(X_train[10], y_train[10], _LEARNING_RATE)
t2 = time.time()
print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

if _MODEL_FILE is not None:
    load_model_parameters_theano(_MODEL_FILE, model)

train_with_sgd(model, X_train, y_train, nepoch=_NEPOCH, learning_rate=_LEARNING_RATE)
Example no. 6
# Print a training data example
x_example, y_example = X_train[17], y_train[17]
print ("x:\n%s\n%s" % (" ".join([index_to_word[x] for x in x_example]), x_example))
print ("\ny:\n%s\n%s" % (" ".join([index_to_word[x] for x in y_example]), y_example))

# Training our Network with Theano and the GPU

# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 5
model = RNNTheano(grad_check_vocab_size, 10)
gradient_check_theano(model, [0,1,2,3], [1,2,3,4])

np.random.seed(10)
model = RNNTheano(vocabulary_size)
model.sgd_step(X_train[10], y_train[10], 0.005)


# Run the model
model = RNNTheano(vocabulary_size, hidden_dim=50)
load_model_parameters_theano('./data/trained-model-theano.npz', model)

def generate_sentence(model):
    # We start the sentence with the start token
    new_sentence = [word_to_index[sentence_start_token]]
    # Repeat until we get an end token
    while not new_sentence[-1] == word_to_index[sentence_end_token]:
        next_word_probs = model.forward_propagation(new_sentence)
        sampled_word = word_to_index[unknown_token]
        # We don't want to sample unknown words
        while sampled_word == word_to_index[unknown_token]:
            samples = np.random.multinomial(1, next_word_probs[-1])
            sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
    # Strip the start/end tokens and map indices back to words
    sentence_str = [index_to_word[x] for x in new_sentence[1:-1]]
    return sentence_str
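# A short (hypothetical) driver for generate_sentence: sample a handful of
# sentences and re-sample any that come out too short, assuming the model
# loaded above and the word_to_index/index_to_word mappings are in scope.
num_sentences = 5
senten_min_length = 7
for _ in range(num_sentences):
    sent = []
    while len(sent) < senten_min_length:
        sent = generate_sentence(model)
    print " ".join(sent)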
Example no. 7

# ==========================================================================
# ================= USE THE GUY'S RNN CLASS FOR THEANO: =====================
# ==========================================================================
from rnn_theano import RNNTheano, gradient_check_theano

np.random.seed(10)
# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 5
model = RNNTheano(grad_check_vocab_size, 10)
gradient_check_theano(model, [0,1,2,3], [1,2,3,4])

np.random.seed(10)
model = RNNTheano(vocabulary_size)
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)

# LOAD the model parameters that he trained:
model = RNNTheano(vocabulary_size, hidden_dim=50)
utils.load_model_parameters_theano('./data/trained-model-theano.npz', model)

# TRAIN the model if desired, but the tutorial author said he trained his for 20 hrs:
# losses = train_with_sgd(model, X_train, y_train, nepoch=50)
# save_model_parameters_theano('./data/trained-model-theano.npz', model)


# ======================================================
# ================= BUILD OWN RNN: =====================
# ======================================================
'''
    The input x = X_train[i] will be a sequence of words
Example no. 8
class LM_With_RNN:
    def __init__(self, texts):

        # Create the training data
        xy = self.preprocess_text(texts)
        self.X_train = xy['x']
        self.y_train = xy['y']

        self.model = RNNTheano(_VOCABULARY_SIZE, hidden_dim=_HIDDEN_DIM)

        self.train_with_sgd()

    def train_with_sgd(self, nepoch=_NEPOCH, evaluate_loss_after=5, learning_rate=_LEARNING_RATE):
        # We keep track of the losses so we can plot them later
        losses = []
        num_examples_seen = 0
        for epoch in range(nepoch):
            # Optionally evaluate the loss
            if (epoch % evaluate_loss_after == 0):
                loss = self.model.calculate_loss(self.X_train, self.y_train)
                losses.append((num_examples_seen, loss))
                # Adjust the learning rate if loss increases
                if (len(losses) > 1 and losses[-1][1] > losses[-2][1]):
                    learning_rate = learning_rate * 0.5
            # For each training example...
            for i in range(len(self.y_train)):
                # One SGD step
                self.model.sgd_step(self.X_train[i], self.y_train[i], learning_rate)
                num_examples_seen += 1
        return self.model

    def calculate_score(self, text):
        texts = [text]
        xy = self.preprocess_text(texts)
        X_train = xy['x']
        y_train = xy['y']
        o = self.model.forward_propagation(X_train[0])
        # o[i] is the predicted distribution over the word that follows position i,
        # i.e. over y_train[0][i], so sum the negative log-probabilities of the
        # observed next words.
        p = 0
        for i, w in enumerate(y_train[0]):
            p += -np.log10(o[i][w])
        return p
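    # Worked example with made-up probabilities: if the model assigns the three
    # observed next words probabilities 0.10, 0.50 and 0.20, then
    #   score = -log10(0.10) - log10(0.50) - log10(0.20)
    #         = 1.000 + 0.301 + 0.699
    #         = 2.000
    # Lower scores mean the model considers the text more likely.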

    def preprocess_text(self, texts, vocabulary_size=_VOCABULARY_SIZE):
        unknown_token = "UNKNOWN_TOKEN"
        sentence_start_token = "SENTENCE_START"
        sentence_end_token = "SENTENCE_END"

        # Split full comments into sentences
        # sentences = itertools.chain(*[nltk.sent_tokenize(x.decode('utf-8').lower()) for x in texts])
        sentences = itertools.chain(*[nltk.sent_tokenize(x.lower()) for x in texts])
        # Append SENTENCE_START and SENTENCE_END
        sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]

        # Tokenize the sentences into words
        tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

        # Count the word frequencies
        word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))

        # Get the most common words and build index_to_word and word_to_index vectors
        if vocabulary_size == -1:
            # Keep every word: most_common() with no argument returns all (word, count) pairs
            vocab = word_freq.most_common()
        else:
            vocab = word_freq.most_common(vocabulary_size - 1)
        index_to_word = [x[0] for x in vocab]
        index_to_word.append(unknown_token)
        word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])

        # Replace all words not in our vocabulary with the unknown token
        for i, sent in enumerate(tokenized_sentences):
            tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

        # Create the training data
        X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
        y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

        return {
            'x': X_train,
            'y': y_train,
            'index_to_word': index_to_word,
            'word_to_index': word_to_index
        }
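# A minimal (hypothetical) usage sketch for LM_With_RNN, assuming the module-level
# constants (_VOCABULARY_SIZE, _HIDDEN_DIM, _NEPOCH, _LEARNING_RATE) are defined and
# the nltk 'punkt' tokenizer data is installed. Construction runs SGD training, so
# even toy corpora take a little while; left commented out as a sketch.
#
# lm = LM_With_RNN(["the cat sat on the mat.", "the dog chased the cat."])
# print lm.calculate_score("the cat chased the dog.")  # lower score = more likely under the model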