Example #1
import itertools

from nltk import sent_tokenize, word_tokenize
from nltk.lm import Laplace
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.tokenize.treebank import TreebankWordDetokenizer

# Joins a token list back into a readable sentence string.
detokenize = TreebankWordDetokenizer().detokenize


class NGramSentences:
    def __init__(self, n=3, filename='cache/book.txt'):
        with open(filename) as file:
            text = file.read()

        # Lowercased word tokens, one list per sentence.
        tokens = [
            list(map(str.lower, word_tokenize(sent)))
            for sent in sent_tokenize(text)
        ]
        # Pad sentences with <s>/</s> and build the training ngrams plus
        # the vocabulary; use the caller's n rather than a hard-coded 3.
        train, vocab = padded_everygram_pipeline(n, tokens)

        self.model = Laplace(n)
        self.model.fit(train, vocab)

    def generate(self, prev_word='<s>', max_words=25):
        # Skip leading <s> padding and stop at the first </s>.
        return detokenize(
            list(
                itertools.takewhile(
                    lambda word: word != '</s>',
                    itertools.dropwhile(
                        lambda word: word == '<s>',
                        self.model.generate(max_words, text_seed=[prev_word])))))
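
A minimal usage sketch for the class above, assuming NLTK's punkt tokenizer data is installed and a corpus file exists at cache/book.txt; the seed word 'the' is just an illustration, not part of the original example:

# Train a trigram model on the cached book and sample one sentence.
model = NGramSentences(n=3)
print(model.generate(prev_word='the'))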
Example #2
from nltk import trigrams
from nltk.lm import Laplace, Vocabulary
from nltk.lm.preprocessing import flatten, pad_both_ends

# `y` and `text` come from earlier, elided preprocessing; both are assumed
# to be lists of tokenized sentences (with `y` already padded). Despite the
# original name `bigramsList`, the list below holds trigrams.
trigramsList = list(flatten(trigrams(sent) for sent in y))

# Pad every sentence with <s>/</s> and collect the vocabulary.
vocab = list(flatten(pad_both_ends(sent, n=2) for sent in text))
vocab = list(Vocabulary(vocab, unk_cutoff=1))

# Alternative: padded_everygram_pipeline(2, text) from nltk.lm.preprocessing
# builds both the training ngrams and the vocabulary in one call.

lm = Laplace(3)
lm.fit([trigramsList], vocabulary_text=vocab)

# Sample four words following the seed bigram "government had".
lm.generate(4, text_seed=["government", "had"])


def generateSentences(v):
    # Greedily extend the seed until the model emits the </s> token.
    sent = v
    v = [lm.generate(1, text_seed=v)]
    sent = sent + v
    while v[0] != '</s>':
        # Condition each new word on the last two words generated so far.
        v = [lm.generate(1, text_seed=sent[-2:])]
        sent = sent + v
    return sent


sen = generateSentences(['<s>', 'india'])
sen = " ".join(sen)