Example #1
def train_models(filename):
    """ Vous ajoutez à partir d'ici tout le code dont vous avez besoin
        pour construire les différents modèles N-grammes.
        Voir les consignes de l'énoncé du travail pratique concernant les modèles à entraîner.

        Vous pouvez ajouter au fichier toutes les fonctions, classes, méthodes et variables que vous jugerez nécessaire.
        Merci de ne pas modifier les signatures (noms de fonctions et arguments) déjà présentes dans le fichier.
    """
    proverbs = load_proverbs(filename)
    print("\nNombre de proverbes : ", len(proverbs))

    tokens = [word_tokenize(sentence) for sentence in proverbs]

    # Laplace (add-one) unigram, bigram and trigram models, plus a custom Bigrams model under key 20
    global models
    models = {1: Laplace(1), 2: Laplace(2), 3: Laplace(3), 20: Bigrams()}

    models[1].fit(
        [list(ngrams(pad_both_ends(sentence, 1), 1)) for sentence in tokens],
        set(flatten(pad_both_ends(sentence, 1) for sentence in tokens)))
    models[2].fit(
        [list(ngrams(pad_both_ends(sentence, 2), 2)) for sentence in tokens],
        set(flatten(pad_both_ends(sentence, 2) for sentence in tokens)))
    models[3].fit(
        [list(ngrams(pad_both_ends(sentence, 3), 3)) for sentence in tokens],
        set(flatten(pad_both_ends(sentence, 3) for sentence in tokens)))
    models[20].fit(
        [list(ngrams(pad_both_ends(sentence, 2), 2)) for sentence in tokens],
        set(flatten(pad_both_ends(sentence, 2) for sentence in tokens)))
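Once train_models has run, the NLTK Laplace models stored in the global models dict can be queried directly. A minimal sketch, assuming a proverbs file is available; the filename and the test proverb below are hypothetical:

train_models("proverbes.txt")  # hypothetical filename: one proverb per line

bigram_model = models[2]
# add-one probability of a word given the previous word
print(bigram_model.score("nouvelles", ["bonnes"]))

# perplexity of one padded, tokenized proverb
test = list(ngrams(pad_both_ends(word_tokenize("qui vivra verra"), 2), 2))
print(bigram_model.perplexity(test))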
Example #2
    def fit(self, values: List[Any]):
        patterns = [self.generator(v) for v in values]
        padded_patterns = [pad_both_ends(p, n=self.n) for p in patterns]
        ngrams_ = [ngrams(pp, n=self.n) for pp in padded_patterns]

        self.vocab = list(flatten(
            pad_both_ends(p, n=self.n) for p in patterns))
        self.model = MLE(self.n)
        self.model.fit(ngrams_, self.vocab)
Example #3
def evaluate(model, corpus):
    """
    Renvoie la perplexité du modèle sur une phrase de test.

    :param model: nltk.lm.api.LanguageModel, un modèle de langue
    :param corpus: list(list(str)), une corpus tokenizé
    :return: float
    """
    ngrams = ngram.extract_ngrams(corpus, model.order)
    ngrams = flatten(ngrams)
    return model.perplexity(ngrams)
Example #4
def evaluate(model, corpus):
    """
    Renvoie la perplexité du modèle sur une phrase de test.

    :param model: nltk.lm.api.LanguageModel, un modèle de langue
    :param corpus: list(list(str)), une corpus tokenizé
    :return: float
    """

    ngrams, words = padded_everygram_pipeline(model.order, corpus)
    ngrams = flatten(ngrams)  # concatenate the sentences end to end so perplexity is computed over one stream
    return model.perplexity(ngrams)
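A minimal usage sketch for evaluate, assuming flatten and padded_everygram_pipeline come from nltk.lm.preprocessing; the toy training sentences below are hypothetical, and Laplace smoothing is used so unseen test n-grams do not force the perplexity to infinity:

from nltk import word_tokenize
from nltk.lm import Laplace
from nltk.lm.preprocessing import flatten, padded_everygram_pipeline

train_sents = [word_tokenize(s) for s in ["the cat sat on the mat", "the dog sat on the rug"]]
train, vocab = padded_everygram_pipeline(2, train_sents)
model = Laplace(2)
model.fit(train, vocab)

test_corpus = [word_tokenize("the cat sat on the rug")]
print(evaluate(model, test_corpus))  # perplexity of the held-out sentence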
Example #5
def model_iterator(n):
        perp = []
        n = n+1
        for n in range(1,n):
            print(n)
            train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)
            #model = MLE(n)
            #model = Laplace(n) #only add-one smoothing here
            #model = Lidstone(0.1, n) #Lidstone's first argument is the gamma (alpha/delta) value
            #model = WittenBellInterpolated(n)
            model = KneserNeyInterpolated(n, discount = 0.88) #only order and discount needed, WB only order
            print(n,model)
            model.fit(train_data, padded_sents)
            print(model.vocab)
            vocab_list = []
            for word in model.vocab:
                vocab_list.append(word)
            #print(vocab_list)
            print("value",model. score('<UNK>'))
            #print(generate_sent_text_seed(model, 30, random_seed=['thicc']))

            #print(generate_sent(model, 50, random_seed = 30))
            entropy_fin = 0
            lense = 1000
            i = 0
            for z in range(lense):
                #print(contents[i])
                tokenized_test = [list(map(str.lower, word_tokenize(contents[i])))]
                if len(tokenized_test[0]) > 0:
                    for g in range(len(tokenized_test[0])):
                        if tokenized_test[0][g] not in vocab_list:
                            tokenized_test[0][g] = '<UNK>'
                    test_text_pad = list(flatten(pad_both_ends(sent, n) for sent in tokenized_test))
                    test_text_everygram = list(everygrams(test_text_pad, max_len=n))
                    #print(test_text_everygram)
                    #test_data, padded_sents_test = padded_everygram_pipeline(n, tokenized_test)
                    #print(i)
                    #print(model.entropy(test_text_bigram))
                    #print(model.entropy(test_text_everygram))
                    entropy_fin += model.entropy(test_text_everygram)
                i += 1
            print(entropy_fin)
            avg_entr = entropy_fin/lense
            print("perplexity",2**avg_entr)
            perp.append([n,2**avg_entr])
        import pandas as pd
        DF = pd.DataFrame(perp)
        return DF
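A short usage sketch for model_iterator, assuming the globals tokenized_text and contents used above are already defined; the DataFrame columns default to 0 (order) and 1 (perplexity):

df = model_iterator(4)  # sweep orders 1..4
df.columns = ["order", "perplexity"]
best = df.loc[df["perplexity"].idxmin()]
print("best order:", int(best["order"]), "perplexity:", best["perplexity"])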
Example #6
def KN_best_discount(n):
        perp = []
        for dis in range(80,100,2):
            dis = dis/100
            print(dis)
            train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)
            model = KneserNeyInterpolated(n, discount = dis) #only order and discount needed, WB only order
            print(n,model)
            model.fit(train_data, padded_sents)
            print(model.vocab)
            vocab_list = []
            for word in model.vocab:
                vocab_list.append(word)
            #print(vocab_list)
            print("value",model. score('<UNK>'))
            #print(generate_sent_text_seed(model, 30, random_seed=['thicc']))

            #print(generate_sent(model, 50, random_seed = 30))
            entropy_fin = 0
            lense = 100
            i = 0
            for z in range(lense):
                #print(contents[i])
                tokenized_test = [list(map(str.lower, word_tokenize(contents[i])))]
                if len(tokenized_test[0]) > 0:
                    for g in range(len(tokenized_test[0])):
                        if tokenized_test[0][g] not in vocab_list:
                            tokenized_test[0][g] = '<UNK>'
                    test_text_pad = list(flatten(pad_both_ends(sent, n) for sent in tokenized_test))
                    test_text_everygram = list(everygrams(test_text_pad, max_len=n))
                    #print(test_text_everygram)
                    #test_data, padded_sents_test = padded_everygram_pipeline(n, tokenized_test)
                    #print(i)
                    #print(model.entropy(test_text_bigram))
                    #print(model.entropy(test_text_everygram))
                    entropy_fin += model.entropy(test_text_everygram)
                i += 1
            print(entropy_fin)
            avg_entr = entropy_fin/lense
            print("perplexity",2**avg_entr)
            perp.append([dis,2**avg_entr])
        import pandas as pd
        DF = pd.DataFrame(perp)
        return DF
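Similarly, a short sketch for reading the best discount off the returned DataFrame (same assumptions about the surrounding globals):

df = KN_best_discount(3)
df.columns = ["discount", "perplexity"]
print(df.loc[df["perplexity"].idxmin()])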
Example #7
    def __init__(self, n=3):
        tokens = []
        for book in shakespeare.fileids():
            elt = shakespeare.xml(book)
            iterator = elt.iter()  # getiterator() was removed from ElementTree; iter() is equivalent
            for node in iterator:
                lines = node.findall("LINE")
                for line in lines:
                    line_tokens = list(str(line.text))
                    line_tokens.insert(0, "<L>")
                    line_tokens.append("</L>")
                    tokens.append(line_tokens)
        t = (everygrams(x, max_len=n) for x in tokens)
        v = flatten(tokens)
        lm = Laplace(order=n)  # add-one smoothing
        lm.fit(t, v)

        self._n = n
        self._lm = lm
        self._tokenize_pattern = re.compile(r'(<L>)|(</L>)')
Example #8
n = 3

tokenized_text = [
    list(map(str.lower, word_tokenize(sent))) for sent in sent_tokenize(text)
]
# Preprocess the tokenized text for 3-grams language modelling

train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

##Test data must be padded the same way, to match the padded_everygram_pipeline used above
test_text = "3rd edition , hypothesis test using ."
tokenized_test = [
    list(map(str.lower, word_tokenize(sent)))
    for sent in sent_tokenize(test_text)
]
test_text_pad = list(flatten(
    pad_both_ends(sent, n) for sent in tokenized_test))
test_text_everygram = list(everygrams(test_text_pad, max_len=n))
test_data, padded_sents_test = padded_everygram_pipeline(n, tokenized_test)
print(test_text_everygram)

##How to import and instantiate the various language models
from nltk.lm import Lidstone, Laplace, MLE
from nltk.lm.models import InterpolatedLanguageModel, KneserNeyInterpolated
from nltk.lm.api import LanguageModel, Smoothing
from nltk.lm.smoothing import KneserNey, WittenBell
model = MLE(n)
#model = Laplace(n) #only add-one smoothing here
#model = Lidstone(0.1, n) #Lidstone's first argument is the gamma (alpha/delta) value
#model = InterpolatedLanguageModel(WittenBell, n) #equivalent to WittenBellInterpolated(n)
#model = KneserNeyInterpolated(n, discount = 0.1) #only order and discount needed, WB only order
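The snippet above prepares the training and test data and picks a model but never fits or evaluates it. A minimal continuation under the same variable names (note that with plain MLE the unseen test n-grams can make the perplexity infinite, so one of the smoothed models commented out above may be preferable):

model.fit(train_data, padded_sents)
print(len(model.vocab))                       # vocabulary learned from the padded sentences
print(model.perplexity(test_text_everygram))  # perplexity on the padded test everygrams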
Example #9
text = [clean(doc).split() for doc in data]
#print(doc_clean)


# Creating bigrams (pad each document, then take its bigrams)
bigram_lists = [list(bigrams(pad_both_ends(doc, n=2))) for doc in text]

# Creating unigrams as well as bigrams
#padded_docs = [list(pad_both_ends(doc, n=2)) for doc in text]
#everygram_lists = [list(everygrams(p, max_len=2)) for p in padded_docs]
#print(everygram_lists)


# During training and evaluation our model will rely on a vocabulary that defines which words are “known” to the model. To create this vocabulary we need to pad our sentences (just like for counting ngrams) and then combine the sentences into one flat stream of words.
vocab_stream = list(flatten(pad_both_ends(doc, n=2) for doc in text))

# Training a Model
train, vocab = padded_everygram_pipeline(2, text)

# let us train a **Maximum Likelihood Estimator (MLE)**. We only need to specify the highest ngram order to instantiate it, which here matches the pipeline order.
lm = MLE(2)

lm.fit(train, vocab)

# The vocabulary helps us handle words that have not occurred during training.
# Looking for known words
#print(lm.vocab.lookup(text[11]))

# Looking for unknown (UNK) words
#lm.vocab.lookup(["aliens", "from", "Mars"])
Example #10
def main():
    #Preprocessing Phase

    #A very very small corpus with 2 sentences
    text = [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']]

    #Getting the first sentence
    first_text = text[0]

    #Perform padding to be able to find the most likely word beginning/ending a sentence
    padded_text = list(pad_both_ends(first_text, n=2))  # 2 for bigrams

    #Everygrams calculates unigram, bigram, trigram, . . ., ngram for us (max_len = 3 means from unigram to trigram)
    print(list(everygrams(padded_text, max_len=3)))

    #Flatten the sentences into a single stream of words to build a vocabulary
    flattened = list(flatten(pad_both_ends(sent, n=3) for sent in text))

    print(flattened)
    train, vocab = padded_everygram_pipeline(3, text)
    print(train, vocab)

    #Training Phase
    #Maximum Likelihood Estimation
    # (estimates the probability of each word/n-gram by its relative frequency in the corpus)
    # 3 means the maximum gram order (trigram)
    lm = MLE(3)  #MLE

    #First output = a blank vocabulary (will be filled when the model is trained)
    print("Our first empty vocabulary", len(lm.vocab))

    #Give the training data and our vocab to the library to train for us
    lm.fit(train, vocab)
    print("Our vocabulary after trained", lm.vocab)

    #Lets play with our model!

    #We can look up words in the vocabulary (<UNK> means the word does not belong to the vocab!)
    print(lm.vocab.lookup(['a', 'b', 'c', 'x']))

    #Counts per gram order. The counter reports 3 ngram orders and 45 ngrams: 45 is the total number of unigram, bigram and trigram tokens over the two padded sentences
    print(lm.counts)

    #Let's begin with counting a unigram
    print(lm.counts['a'])

    #Then bigram
    print(lm.counts[['a']]['b'])

    #Then trigram. . .
    print(lm.counts[['a', 'b']]['c'])

    #Relative frequency (unigram probability)
    print(lm.score('a'))

    #Which word has the highest score?
    print(lm.score('<s>'))

    total_score_unigram = 0
    print(flattened)

    #unigram score for w = P(w) = Count(w)/N
    print(lm.score('a'))
    print(lm.score('b'))

    #To avoid underflow we use log probabilities
    print(lm.logscore('a'))
    print(lm.logscore('b'))

    #The scores over the vocabulary for each gram order must sum to exactly 1
    bag_of_word = set(flattened)
    for word in bag_of_word:
        total_score_unigram += lm.score(word)

    print(total_score_unigram)
    total_score_bigram = 0

    #score for a word w given x = P(w | x) = C(x, w) / C(x)
    for word in bag_of_word:
        total_score_bigram += lm.score(word, ['a'])

    print(total_score_bigram)

    #Generating a random sequence of words
    print(lm.generate(10))

    print(lm.generate(10, random_seed=3))  #Same random seed, the same sequence

    print(lm.generate(10, text_seed=['a'],
                      random_seed=3))  #given 'a' as the preceding word

    #Testing Phase
    #Given a heldout corpus (an additional corpus distinct from the training corpus)
    test_seq = [('a', 'b'), ('c', 'd'), ('e', 'f')]
    test_good_seq = list(bigrams(lm.vocab.lookup(['f', 'b', 'c', 'e'])))
    print(test_good_seq)
    #Evaluate by calculating entropy or perplexity of sequences of words
    print("Entropy: ", lm.entropy(test_seq), "Perplexity: ",
          lm.perplexity(test_seq))
    print("Entropy: ", lm.entropy(test_good_seq), "Perplexity: ",
          lm.perplexity(test_good_seq))

    #Homework
    #Why are entropy and perplexity sometimes inf?
    #How can we avoid that?
    #Hint: smoothing, backoff/interpolation

    #Try using a different corpus and
    #generate a random sentence with the unigram, bigram and trigram models and compare

    #Helper: list the words that follow a given word in the bigrams of the flattened corpus
    def list_followers(word):
        followers = set()
        for tup in list(bigrams(flattened)):
            if tup[0] == word:
                followers.add(tup[1])
        print(followers)
        return followers
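One way to approach the homework question above: under MLE, any n-gram unseen in training gets probability 0, so entropy and perplexity become inf; refitting with add-one (Laplace) smoothing keeps them finite. A self-contained sketch on the same toy corpus:

from nltk.lm import MLE, Laplace
from nltk.lm.preprocessing import padded_everygram_pipeline

toy = [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']]
unseen = [('e', 'a'), ('f', 'd')]  # bigrams that never occur in the toy corpus

train, vocab = padded_everygram_pipeline(2, toy)
mle = MLE(2)
mle.fit(train, vocab)
print(mle.perplexity(unseen))  # inf: MLE assigns the unseen bigrams probability 0

train, vocab = padded_everygram_pipeline(2, toy)
lap = Laplace(2)
lap.fit(train, vocab)
print(lap.perplexity(unseen))  # finite thanks to add-one smoothing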
Example #11
from nltk import bigrams, trigrams

text = reuters.sents()

text = [[j.lower() for j in i] for i in text]
text = [[''.join(c for c in s if c not in ["."]) for s in k] for k in text]

from nltk.lm.preprocessing import pad_both_ends

y = [list(pad_both_ends(sent, n=2)) for sent in text]

from nltk.lm.preprocessing import flatten
from nltk.util import everygrams
# build trigrams per padded sentence, then flatten into one stream
# (note: the sentences above were padded with n=2; for a trigram model, padding with n=3 is more usual)
trigramsList = list(map(lambda x: list(trigrams(x)), y))
trigramsList = list(flatten(trigramsList))
#list(everygrams(trigramsList, max_len=2))

vocab = list(flatten(pad_both_ends(sent, n=2) for sent in text))
from nltk.lm import Vocabulary
vocab = list(Vocabulary(vocab, unk_cutoff=1))
'''
from nltk.lm.preprocessing import padded_everygram_pipeline
train, vocab = padded_everygram_pipeline(2, text)
'''

lm = Laplace(3)
lm.fit([trigramsList], vocabulary_text=list(vocab))

lm.generate(4, text_seed=["government", "had"])
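A follow-up probe of the fitted model; whether these particular Reuters words co-occur is not guaranteed, but add-one smoothing keeps the score finite either way:

print(lm.score("said", ["government", "had"]))  # P(said | government had), add-one smoothed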
Example #12
from nltk.lm.preprocessing import flatten, pad_both_ends


def preprocess_text(sequences):
    # pad each sentence on both ends, then flatten everything into one stream of tokens
    preprocessed = [pad_both_ends(s.split(' '), n=2) for s in sequences]
    return list(flatten(preprocessed))
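A quick usage check for preprocess_text, reusing the two sentences from example #13 below:

print(preprocess_text(['i like chinese food', 'chinese people like food']))
# ['<s>', 'i', 'like', 'chinese', 'food', '</s>', '<s>', 'chinese', 'people', 'like', 'food', '</s>']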
Example #13
# bigramodel.py
# Author: Sébastien Combéfis
# Version: March 8, 2020

from nltk import bigrams, ConditionalFreqDist, FreqDist
from nltk.lm.preprocessing import flatten, pad_both_ends

text = ['i like chinese food', 'chinese people like food']
preprocessed = [pad_both_ends(s.split(' '), n=2) for s in text]
tokens = list(flatten(preprocessed))

fd = FreqDist(tokens)
print(fd['like'])

model = bigrams(tokens)
cfd = ConditionalFreqDist(model)
print(cfd['like']['food'])
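To turn the conditional counts into an estimated bigram probability, the FreqDist stored under each condition can be queried directly; a small extension of the example above:

# P(food | like) by relative frequency: 'like' is followed once by 'food' and once by 'chinese'
print(cfd['like'].freq('food'))  # 0.5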