Example No. 1
class LM_With_RNN:
    def __init__(self, texts):

        # Create the training data
        xy = self.preprocess_text(texts)
        self.X_train = xy['x']
        self.y_train = xy['y']

        self.model = RNNTheano(_VOCABULARY_SIZE, hidden_dim=_HIDDEN_DIM)

        self.train_with_sgd()

    def train_with_sgd(self, nepoch=_NEPOCH, evaluate_loss_after=5, learning_rate=_LEARNING_RATE):
        # We keep track of the losses so we can plot them later
        losses = []
        num_examples_seen = 0
        for epoch in range(nepoch):
            # Optionally evaluate the loss
            if epoch % evaluate_loss_after == 0:
                loss = self.model.calculate_loss(self.X_train, self.y_train)
                losses.append((num_examples_seen, loss))
                # Halve the learning rate if the loss increased since the last evaluation
                if len(losses) > 1 and losses[-1][1] > losses[-2][1]:
                    learning_rate = learning_rate * 0.5
            # For each training example...
            for i in range(len(self.y_train)):
                # One SGD step
                self.model.sgd_step(self.X_train[i], self.y_train[i], learning_rate)
                num_examples_seen += 1
        return self.model

    def calculate_score(self, text):
        # Score a text as the negative log10-likelihood of its target
        # (next-word) sequence under the model; lower is better.
        texts = [text]
        xy = self.preprocess_text(texts)
        x = xy['x']
        y = xy['y']
        o = self.model.forward_propagation(x[0])
        p = 0
        for i, w in enumerate(y[0]):
            p += -np.log10(o[i][w])
        return p

    def preprocess_text(self, texts, vocabulary_size=_VOCABULARY_SIZE):
        unknown_token = "UNKNOWN_TOKEN"
        sentence_start_token = "SENTENCE_START"
        sentence_end_token = "SENTENCE_END"

        # Split full comments into sentences
        # sentences = itertools.chain(*[nltk.sent_tokenize(x.decode('utf-8').lower()) for x in texts])
        sentences = itertools.chain(*[nltk.sent_tokenize(x.lower()) for x in texts])
        # Append SENTENCE_START and SENTENCE_END
        sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]

        # Tokenize the sentences into words
        tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

        # Count the word frequencies
        word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))

        # Get the most common words and build index_to_word and word_to_index vectors
        if vocabulary_size == -1:
            # Keep the whole vocabulary; most_common() with no argument returns
            # all (word, count) pairs, the same tuple format used below.
            vocab = word_freq.most_common()
        else:
            vocab = word_freq.most_common(vocabulary_size - 1)
        index_to_word = [x[0] for x in vocab]
        index_to_word.append(unknown_token)
        word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])

        # Replace all words not in our vocabulary with the unknown token
        for i, sent in enumerate(tokenized_sentences):
            tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

        # Create the training data
        X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
        y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

        return {
            'x': X_train,
            'y': y_train,
            'index_to_word': index_to_word,
            'word_to_index': word_to_index
        }
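
A minimal usage sketch: the class above expects several imports and module-level constants from its original file, so everything below is an illustrative assumption (the stand-in constant values and the rnn_theano import path in particular).

# Preamble the class depends on; these imports and constants have to exist
# before the class statement itself, because _VOCABULARY_SIZE, _NEPOCH and
# _LEARNING_RATE are used as default argument values in its method signatures.
import itertools
import nltk
import numpy as np
from rnn_theano import RNNTheano   # assumed import path for the Theano RNN class

_VOCABULARY_SIZE = 2000   # stand-in values; the original project sets these elsewhere
_HIDDEN_DIM = 80
_NEPOCH = 10
_LEARNING_RATE = 0.005

# nltk.download('punkt')  # required once for sent_tokenize / word_tokenize

# Driver: training happens inside __init__, scoring returns the negative
# log10-likelihood of the next-word sequence (lower is better).
texts = ["the quick brown fox jumps over the lazy dog .",
         "the lazy dog sleeps all day ."]
lm = LM_With_RNN(texts)
print(lm.calculate_score("the lazy dog sleeps ."))
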
Example No. 2
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])


model = RNNTheano(vocabulary_size, hidden_dim=_HIDDEN_DIM)
# t1 = time.time()
# model.sgd_step(X_train[10], y_train[10], _LEARNING_RATE)
# t2 = time.time()
# print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

if _MODEL_FILE is not None:
    load_model_parameters_theano(_MODEL_FILE, model)

# train_with_sgd(model, X_train, y_train, nepoch=_NEPOCH, learning_rate=_LEARNING_RATE)


o = model.forward_propagation(X_train[1])
print o
print [index_to_word[x] for x in X_train[1]]
# Probability the model assigns to the sentence: the product of the predicted
# probabilities of the target (next) words.
p = 1
for i, w in enumerate(y_train[1]):
    p *= o[i][w]
print p
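
A small follow-up sketch, assuming p and y_train are defined as above: the product p is the probability the model assigns to the whole target sequence, and it is commonly reported as per-word perplexity, p ** (-1/N) for N target words.

N = len(y_train[1])            # number of target words scored above
perplexity = p ** (-1.0 / N)   # per-word perplexity; lower is better
print(perplexity)
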

# def generate_sentence(model):
#     # We start the sentence with the start token
#     new_sentence = [word_to_index[sentence_start_token]]
#     # Repeat until we get an end token
#     while not new_sentence[-1] == word_to_index[sentence_end_token]:
#         next_word_probs = model.forward_propagation(new_sentence)
Example No. 3
class RNNLM:
    def __init__(self):
        self.unknown_token = "UNKNOWN_TOKEN"
        self.sentence_start_token = "SENTENCE_START"
        self.sentence_end_token = "SENTENCE_END"
        self.index_to_word = None
        self.word_to_index = None
        self.model = None

    def tokenize_data(self, n=-1):
        # download dependent nltk resources if you haven't already.
        # nltk.download('punkt')

        # Read the data and append SENTENCE_START and SENTENCE_END tokens
        print "Reading sentences from gutenberg corpus ..."
        from nltk.corpus import gutenberg
        tokenized_sentences = []
        for s in gutenberg.sents('austen-emma.txt'):
            tokenized_sentences.append([self.sentence_start_token] + s[1:-1] +
                                       [self.sentence_end_token])
        print "Parsed %d sentences." % (len(tokenized_sentences))

        if n > 0:
            tokenized_sentences = tokenized_sentences[:n]

        # count the word frequencies
        word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
        print "Found %d unique words tokens." % len(word_freq.items())

        self.vocabulary_size = int(len(word_freq.items()) * 0.95)

        # get the most common words and treat all other words as unknown.
        vocab = word_freq.most_common(self.vocabulary_size - 1)
        print "Using vocabulary size %d." % self.vocabulary_size
        print "The least frequent word is '%s' and appeared %d times." % \
              (vocab[-1][0], vocab[-1][1])
        self.index_to_word = [x[0] for x in vocab]
        self.index_to_word.append(self.unknown_token)
        self.word_to_index = dict([(w, i)
                                   for i, w in enumerate(self.index_to_word)])

        # replace all words not in our vocabulary with the unknown token
        for i, sent in enumerate(tokenized_sentences):
            tokenized_sentences[i] = [
                w if w in self.word_to_index else self.unknown_token
                for w in sent
            ]

        # create training data
        x_train = np.asarray([[self.word_to_index[w] for w in sent[:-1]]
                              for sent in tokenized_sentences])
        y_train = np.asarray([[self.word_to_index[w] for w in sent[1:]]
                              for sent in tokenized_sentences])

        print ""
        print "Example sentence: '%s'" % tokenized_sentences[0]
        print "By word indexes: '%s'" % \
              [self.word_to_index[w] for w in tokenized_sentences[0]]

        return (x_train, y_train)

    def train_numpy(self, x_train, y_train, iterations):
        self.model = RNNNumpy(word_dim=self.vocabulary_size,
                              hidden_dim=100,
                              bptt_truncate=4)
        self.model.sgd(x_train, y_train, 0.01, iterations)

    def train_theano(self, x_train, y_train, iterations):
        self.model = RNNTheano(word_dim=self.vocabulary_size,
                               hidden_dim=100,
                               bptt_truncate=4)
        self.model.sgd(x_train, y_train, 0.01, iterations)

    def train_lstm_theano(self, x_train, y_train, iterations):
        # Note: this currently builds the same plain RNNTheano model as
        # train_theano; no separate LSTM implementation is wired in here.
        self.model = RNNTheano(word_dim=self.vocabulary_size,
                               hidden_dim=100,
                               bptt_truncate=4)
        self.model.sgd(x_train, y_train, 0.01, iterations)

    def generate_sentence(self):
        # repeat until we get an end token
        sentence_start_idx = self.word_to_index[self.sentence_start_token]
        sentence_end_idx = self.word_to_index[self.sentence_end_token]
        unknown_word_idx = self.word_to_index[self.unknown_token]
        # start the sentence with the start token
        new_sentence = [sentence_start_idx]
        while new_sentence[-1] != sentence_end_idx:
            next_word_probs = self.model.forward_propagation(new_sentence)
            sampled_word = unknown_word_idx
            # skip unknown words and the start token
            while sampled_word == unknown_word_idx or \
                  sampled_word == sentence_start_idx:
                # sample from the distribution predicted after the most recent word
                samples = np.random.multinomial(1, next_word_probs[-1])
                sampled_word = np.argmax(samples)
            new_sentence.append(sampled_word)
        return new_sentence

    def generate_sentences(self, num_sentences, min_length):
        for i in xrange(num_sentences):
            sent = []
            # We want long sentences, not sentences with one or two words
            while len(sent) < min_length:
                sent = self.generate_sentence()
                sent_str = [self.index_to_word[x] for x in sent[1:-1]]
            print " ".join(sent_str).encode('utf-8')
            print ""