Example #1
    def fit(self, values: List[Any]):
        patterns = [self.generator(v) for v in values]
        padded_patterns = [pad_both_ends(p, n=self.n) for p in patterns]
        ngrams_ = [ngrams(pp, n=self.n) for pp in padded_patterns]

        self.vocab = list(flatten(
            pad_both_ends(p, n=self.n) for p in patterns))
        self.model = MLE(self.n)
        self.model.fit(ngrams_, self.vocab)
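
A minimal standalone sketch of the same flow (not from the source class), assuming the generator yields a sequence of symbols per value, e.g. the characters of a string, and n = 2; the names below are illustrative:

from nltk.lm import MLE
from nltk.lm.preprocessing import flatten, pad_both_ends
from nltk.util import ngrams

values = ["abc", "abd"]
n = 2
patterns = [list(v) for v in values]                       # stand-in for self.generator(v)
padded = [list(pad_both_ends(p, n=n)) for p in patterns]   # ['<s>', 'a', 'b', 'c', '</s>'], ...
train = [list(ngrams(pp, n=n)) for pp in padded]           # bigram tuples per pattern
vocab = list(flatten(pad_both_ends(p, n=n) for p in patterns))

model = MLE(n)
model.fit(train, vocab)
print(model.score("b", ["a"]))                             # P(b | a)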
Example #2
def train_models(filename):
    """ Vous ajoutez à partir d'ici tout le code dont vous avez besoin
        pour construire les différents modèles N-grammes.
        Voir les consignes de l'énoncé du travail pratique concernant les modèles à entraîner.

        Vous pouvez ajouter au fichier toutes les fonctions, classes, méthodes et variables que vous jugerez nécessaire.
        Merci de ne pas modifier les signatures (noms de fonctions et arguments) déjà présentes dans le fichier.
    """
    proverbs = load_proverbs(filename)
    print("\nNombre de proverbes : ", len(proverbs))

    tokens = [word_tokenize(sentence) for sentence in proverbs]

    global models
    models = {1: Laplace(1), 2: Laplace(2), 3: Laplace(3), 20: Bigrams()}

    models[1].fit(
        [list(ngrams(pad_both_ends(sentence, 1), 1)) for sentence in tokens],
        set(flatten(pad_both_ends(sentence, 1) for sentence in tokens)))
    models[2].fit(
        [list(ngrams(pad_both_ends(sentence, 2), 2)) for sentence in tokens],
        set(flatten(pad_both_ends(sentence, 2) for sentence in tokens)))
    models[3].fit(
        [list(ngrams(pad_both_ends(sentence, 3), 3)) for sentence in tokens],
        set(flatten(pad_both_ends(sentence, 3) for sentence in tokens)))
    models[20].fit(
        [list(ngrams(pad_both_ends(sentence, 2), 2)) for sentence in tokens],
        set(flatten(pad_both_ends(sentence, 2) for sentence in tokens)))
Example #3
def prepare_data_ngram(data_path, test_size=0.3, n=3, random_state=42):
    """
    生成N_gram的训练测试数据 [(x_{i-n},...,x_{i-1}), x_i)]
    """
    data = load_data(data_path)
    data = [
        list(pad_both_ends(list(line.replace('\n', '')), 2))
        for line in data[:5] if len(line) >= 2
    ]

    print(data[:2])
    word_to_ix = generate_vocab(data)

    ngram_list = []
    ngram_list.append([(sen[i:i + n - 1], sen[i + n - 1]) for sen in data
                       for i in range(len(sen) - 2)])

    print(ngram_list[:5])
    ngram_list = reduce(operator.add, ngram_list)
    print(ngram_list[:5])
    print(len(ngram_list))

    x_train, x_test = train_test_split(ngram_list,
                                       test_size=test_size,
                                       random_state=random_state)

    print(x_train[:3])
    print(x_test[:3])
    print('train={}, test={}'.format(len(x_train), len(x_test)))

    # indexing
    x_train, y_train = word2index(x_train, word_to_ix)
    x_test, y_test = word2index(x_test, word_to_ix)

    return x_train, x_test, y_train, y_test, word_to_ix
def sent_tokenize(sentence, n):
    """Return tokens of words from a sentence, n indicates n-gram, for padding purposes."""
    cleaned = clean(sentence)
    sent = cleaned.translate(str.maketrans('', '', string.punctuation))
    sent_tokens = nltk.word_tokenize(sent)
    tokens = []
    if n == 1:
        sent_tokens = pad_both_ends(sent_tokens, 1)
    if n == 2:
        sent_tokens = pad_both_ends(sent_tokens, 2)
    if n == 3:
        sent_tokens = pad_both_ends(sent_tokens, 3)
    if n == 4:
        sent_tokens = pad_both_ends(sent_tokens, 4)
    sent_tokens = [lemmatizer.lemmatize(token) for token in sent_tokens]
    tokens.extend(sent_tokens)
    return tokens
def calcular_perplexidade(modelo,frase):
    palavras_texto = WhitespaceTokenizer().tokenize(frase)
    palavras_com_fake_char = [list(pad_both_ends(palavra,n = 2)) for palavra in palavras_texto]
    palavras_bigramas = [list(bigrams(palavra)) for palavra in palavras_com_fake_char]
    perplexidade = 0
    for palavra in palavras_bigramas:
        perplexidade += modelo.perplexity(palavra)

    return perplexidade
    def score_text(self, t, model):

        n = 3

        tokenized = normalize_text(t).split()

        return sum(
            model.logscore(trigram[-1], trigram[:-1]) for trigram in ngrams(
                pad_both_ends(tokenized, n=n), n=n)) / len(tokenized)
def tokenize(text, n):
    """Return tokens of words, n indicates n-gram, for padding purposes."""
    sentences = nltk.sent_tokenize(text)
    tokens = []
    for sent in sentences:
        sent = sent.translate(str.maketrans('', '', string.punctuation))
        sent_tokens = nltk.word_tokenize(sent)
        if n == 1:
            sent_tokens = pad_both_ends(sent_tokens, 1)
        if n == 2:
            sent_tokens = pad_both_ends(sent_tokens, 2)
        if n == 3:
            sent_tokens = pad_both_ends(sent_tokens, 3)
        if n == 4:
            sent_tokens = pad_both_ends(sent_tokens, 4)
        sent_tokens = [lemmatizer.lemmatize(token) for token in sent_tokens]
        tokens.extend(sent_tokens)
    return tokens
Example #8
File: format.py, Project: minhptx/spade
    def perplexity(self, sentence):
        tokenized_text = word_tokenize(sentence)
        tokenized_text = list(pad_both_ends(tokenized_text, 2))
        prob = 1
        for i in range(len(tokenized_text) - 1):
            prob = prob * self.model.score(tokenized_text[i + 1],
                                           [tokenized_text[i]])

        return (prob)**(1.0 / len(tokenized_text))
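
For reference, the standard definition of bigram-model perplexity uses a negative exponent, i.e. the inverse probability normalized by the number of tokens; in LaTeX notation:

PP(W) = P(w_1 \dots w_N)^{-1/N} = \Bigl(\prod_{i=2}^{N} P(w_i \mid w_{i-1})\Bigr)^{-1/N}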
Example #9
File: q2.py, Project: nazaninsbr/NLP-UT
def prep_test_data(data):
    preped_test_data = {news_type: {'word': {'unigram': [], 'bigram': []}, 'character': {'unigram': [], 'bigram': []}} for news_type in data.keys()}

    for news_type in data.keys():
        for news in data[news_type]:
            this_news_sentences = []
            this_news_characters = []
            for sent in news:
                this_news_sentences.extend(sent)
                for word in sent:
                    all_chars = [c for c in word]+[' ']
                    this_news_characters.extend(all_chars)

            preped_test_data[news_type]['word']['bigram'].append(list(ngrams(pad_both_ends(this_news_sentences, n=2), 2)))
            preped_test_data[news_type]['word']['unigram'].append(list(ngrams(pad_both_ends(this_news_sentences, n=1), 1)))

            preped_test_data[news_type]['character']['bigram'].append(list(ngrams(pad_both_ends(this_news_characters, n=2), 2)))
            preped_test_data[news_type]['character']['unigram'].append(list(ngrams(pad_both_ends(this_news_characters, n=1), 1)))
    
    return preped_test_data
Example #10
 def judge(self, o: Any) -> list:
     scores = []
     p = self.generator(o)
     p = list(pad_both_ends(p, n=self.n))
     for i, v in enumerate(p):
         if i < self.n - 1:
             continue
         letters = []
         for j in range(i - (self.n - 1), i):
             letters.append(p[j])
         scores.append(self.model.score(v, letters))
     return heapq.nsmallest(self.dim, scores)
def main():
    for i in range(len(models)):
        order = i + 1
        print('calculating ' + str(order) + '-gram perplexity...')
        p = 0
        model = models[i]
        for line in test_data:
            tokens = line.strip().split(' ')
            test_ngrams = list(ngrams(pad_both_ends(tokens, n=order), order))
            pp = model.perplexity(test_ngrams)
            p += pp
        print('{}-gram perplexity:'.format(order), p)
Example #12
	def get_MLELM(self, tokens, n_gram = 2) -> MLE:
		'''
			Train an MLE language model and cache it on the instance so it can be reused
		'''
		paddedLine = [list(pad_both_ends(tokens, n=n_gram))]
		train, vocab = padded_everygram_pipeline(2, paddedLine)
			
		if (tokens not in self.lms.keys()):
			lm = MLE(n_gram)
			lm.fit(train, vocab)
			self.lms[tokens] = lm
			
		return self.lms[tokens]
Example #13
def run_q4_code(best_model, test_data_dict):
    test_data_dict = test_data_dict['UnknownLabel']
    with open('../Result.csv', 'w') as fp:
        fp.write('Filename,Class\n')
        for filename in test_data_dict.keys():
            this_text = test_data_dict[filename]
            this_text_arr = []
            for sent in this_text:
                this_text_arr.extend(sent)
            preped_data = list(ngrams(pad_both_ends(this_text_arr, n=1), 1))
            predicted_class = predict_the_class(best_model, preped_data)
            line_to_write = filename + ',' + predicted_class + '\n'
            fp.write(line_to_write)
Example #14
def extract_ngrams_from_sentence(sentence, n):
    """
    Return the list of n-grams present in the sentence `sentence`.

    >>> extract_ngrams_from_sentence(["Alice", "est", "là"], 2)
    [("<s>", "Alice"), ("Alice", "est"), ("est", "là"), ("là", "</s>")]

    Be careful to handle the padding (start and end of sentence).

    :param sentence: list(str), a tokenized sentence
    :param n: int, the order of the n-grams
    :return: list(tuple(str)), the list of n-grams present in `sentence`
    """
    sentence = list(pad_both_ends(sentence,n))
    return list(nltk.ngrams(sentence,n))
Example #15
def model_iterator(n):
        perp = []
        n = n+1
        for n in range(1,n):
            print(n)
            train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)
            #model = MLE(n)
            #model = Laplace(n) #only add-one smoothing here
            #model = Lidstone(0.1, n) #Lidstone's first argument is the gamma/alpha/delta value
            #model = WittenBellInterpolated(n)
            model = KneserNeyInterpolated(n, discount = 0.88) #only order and discount needed, WB only order
            print(n,model)
            model.fit(train_data, padded_sents)
            print(model.vocab)
            vocab_list = []
            for word in model.vocab:
                vocab_list.append(word)
            #print(vocab_list)
            print("value",model. score('<UNK>'))
            #print(generate_sent_text_seed(model, 30, random_seed=['thicc']))

            #print(generate_sent(model, 50, random_seed = 30))
            entropy_fin = 0
            lense = 1000
            i = 0
            for z in range(lense):
                #print(contents[i])
                tokenized_test = [list(map(str.lower, word_tokenize(contents[i])))]
                if len(tokenized_test[0]) > 0:
                    for g in range(len(tokenized_test[0])):
                        if tokenized_test[0][g] not in vocab_list:
                            tokenized_test[0][g] = '<UNK>'
                    test_text_pad = list(flatten(pad_both_ends(sent, n) for sent in tokenized_test))
                    test_text_everygram = list(everygrams(test_text_pad, max_len=n))
                    #print(test_text_everygram)
                    #test_data, padded_sents_test = padded_everygram_pipeline(n, tokenized_test)
                    #print(i)
                    #print(model.entropy(test_text_bigram))
                    #print(model.entropy(test_text_everygram))
                    entropy_fin += model.entropy(test_text_everygram)
                i += 1
            print(entropy_fin)
            avg_entr = entropy_fin/lense
            print("perplexity",2**avg_entr)
            perp.append([n,2**avg_entr])
        import pandas as pd
        DF = pd.DataFrame(perp)
        return DF
Example #16
def KN_best_discount(n):
        perp = []
        for dis in range(80,100,2):
            dis = dis/100
            print(dis)
            train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)
            model = KneserNeyInterpolated(n, discount = dis) #only order and discount needed, WB only order
            print(n,model)
            model.fit(train_data, padded_sents)
            print(model.vocab)
            vocab_list = []
            for word in model.vocab:
                vocab_list.append(word)
            #print(vocab_list)
            print("value",model. score('<UNK>'))
            #print(generate_sent_text_seed(model, 30, random_seed=['thicc']))

            #print(generate_sent(model, 50, random_seed = 30))
            entropy_fin = 0
            lense = 100
            i = 0
            for z in range(lense):
                #print(contents[i])
                tokenized_test = [list(map(str.lower, word_tokenize(contents[i])))]
                if len(tokenized_test[0]) > 0:
                    for g in range(len(tokenized_test[0])):
                        if tokenized_test[0][g] not in vocab_list:
                            tokenized_test[0][g] = '<UNK>'
                    test_text_pad = list(flatten(pad_both_ends(sent, n) for sent in tokenized_test))
                    test_text_everygram = list(everygrams(test_text_pad, max_len=n))
                    #print(test_text_everygram)
                    #test_data, padded_sents_test = padded_everygram_pipeline(n, tokenized_test)
                    #print(i)
                    #print(model.entropy(test_text_bigram))
                    #print(model.entropy(test_text_everygram))
                    entropy_fin += model.entropy(test_text_everygram)
                i += 1
            print(entropy_fin)
            avg_entr = entropy_fin/lense
            print("perplexity",2**avg_entr)
            perp.append([dis,2**avg_entr])
        import pandas as pd
        DF = pd.DataFrame(perp)
        return DF
Example #17
def cloze_test(incomplete_proverb, choices, n=3):
    """ Fonction qui complète un texte à trous en ajoutant le(s) bon(s) mot(s).
        En anglais, on nomme ce type de tâche un cloze test.

        La paramètre n désigne le modèle utilisé.
        1 - unigramme NLTK, 2 - bigramme NLTK, 3 - trigramme NLTK, 20 - votre modèle bigramme
    """
    results = []

    for choice in choices:
        split = incomplete_proverb.split()
        index = [
            character for character, token in enumerate(split)
            if '***' in token
        ][0]
        proverb = incomplete_proverb.replace('***', choice)

        tokens = word_tokenize(proverb)
        order = 2 if n == 20 else n
        multigrams = list(ngrams(pad_both_ends(tokens, order), order))

        if n == 1:
            logscore = models[n].logscore(choice)
            choice_perplexity = models[n].perplexity(multigrams)
        elif n == 2:
            logscore = models[n].logscore(choice, [split[index - 1]])
            choice_perplexity = models[n].perplexity(multigrams)
        elif n == 3:
            logscore = models[n].logscore(choice,
                                          [split[index - 2], split[index - 1]])
            choice_perplexity = models[n].perplexity(multigrams)
        elif n == 20:
            logscore = models[n].logscore(choice, split[index - 1])
            choice_perplexity = models[n].perplexity(multigrams)

        results.append((proverb, multigrams, logscore))  # choice_perplexity

    results.sort(key=lambda x: x[2], reverse=True)  # False
    result = results[0][0]
    perplexity = models[n].perplexity(results[0][1])

    return result, perplexity
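
A hypothetical call, assuming train_models(...) has already populated the global models; the proverb string and choices below are illustrative, not taken from the assignment data:

result, perplexity = cloze_test("a beau mentir qui vient de ***", ["loin", "ici"], n=3)
print(result, perplexity)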
Example #18
    def output_prob_for_excel(self, excel, num):
        """
        解析Excel,并返回加入了prob的dataFrame
        """
        if num < 0 or num > self.n-1:
            logging.error('size of context is larger than ngram setting!!')
            return None
        else:
            df = pd.read_excel(excel)
            print(df.shape)
            # Clean, tokenize, and fill initial labels
            data = np.array(df)
            data = [list(pad_both_ends(str(line), n=2)) for line in data[:,3]]
            print(len(data), type(data))
            print(data[0], type(data[0]))

            if num == 0:
                prob_list = []
                for line in data:
                    prob_dict = []
                    for word in line:
                        prob_dict.append(math.log(self.calculate_marginal_prob(word)+1e-8))
                    prob_list.append(sum(prob_dict)/len(prob_dict))

            else:
                prob_list = []
                for line in data:
                    prob_dict = []
                    for i in range(len(line)-num):
                        context = line[i:i+num]
                        word = line[i+num]
                        prob = self.calculate_conditional_prob(word, context)
                        # prob_dict.append((word+'|'+','.join(context), prob))
                        prob_dict.append(math.log(prob + 1e-8))
                    prob_list.append(sum(prob_dict)/len(prob_dict))

            # Update the DataFrame
            df['prob'] = prob_list
            df.to_excel(excel)

            return df
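
In other words, for a padded row w_1 ... w_L and context size k = num > 0, the value written to the prob column is the average conditional log-probability; in LaTeX notation:

\text{prob} = \frac{1}{L-k} \sum_{i=1}^{L-k} \log\bigl(P(w_{i+k} \mid w_i, \dots, w_{i+k-1}) + 10^{-8}\bigr)

with the marginal P(w_i) used instead when num = 0.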
Example #19
def generate_MLE(positive, negative):
    tokenized = []
    text = ""
    n = 2 # n-gram size

    locations = [positive, negative]
    for location in locations:
        for filename in os.listdir(location):
            if filename.endswith(".txt"):
                contents = open(location + "/" + filename, "r", encoding="windows-1252").read()
                review = sent_tokenize(contents)
                for word in review:
                    tokenized.append(word.lower())
                    
                text += contents + " "

    paddedLine = [list(pad_both_ends(word_tokenize(text.lower()), n))]
    train, vocab = padded_everygram_pipeline(n, paddedLine)

    model = MLE(n)
    model.fit(train, vocab)

    return model
Example #20
def extract_ngrams_from_sentence(sentence, n):
    """
    Return the list of n-grams present in the sentence `sentence`.

    >>> extract_ngrams_from_sentence(["Alice", "est", "là"], 2)
    [("<s>", "Alice"), ("Alice", "est"), ("est", "là"), ("là", "</s>")]

    Be careful to handle the padding (start and end of sentence).

    :param sentence: list(str), a tokenized sentence
    :param n: int, the order of the n-grams
    :return: list(tuple(str)), the list of n-grams present in `sentence`
    """

    sentence = list(pad_both_ends(sentence, n))

    n_gram_list = []
    for i in range(len(sentence) - n + 1):
        n_gram = []
        for j in range(n):
            n_gram.append(sentence[i + j])
        n_gram_list.append(tuple(n_gram))

    return n_gram_list
Example #21
def prepare_data_cbow(data_path, test_size=0.3, n=3, random_state=42):
    """
    生成CBOW的训练测试数据, [(x_{i-n},...,x_{i-1}, x_{i+1},...,x_{i+n}), x_i)]
    """
    data = load_data(data_path)
    data = [
        list(pad_both_ends(list(line.replace('\n', '')), 2))
        for line in data[:5] if len(line) >= 2
    ]

    print(data[:2])
    word_to_ix = generate_vocab(data)

    ngram_list = []
    for sen in data:
        for i in range(n, len(sen) - n):
            context = sen[i - n:i] + sen[i + 1:i + n + 1]
            target = sen[i]
            ngram_list.append((context, target))

    print(ngram_list[:5])
    print(len(ngram_list))

    x_train, x_test = train_test_split(ngram_list,
                                       test_size=test_size,
                                       random_state=random_state)

    print(x_train[:3])
    print(x_test[:3])
    print('train={}, test={}'.format(len(x_train), len(x_test)))

    # indexing
    x_train, y_train = word2index(x_train, word_to_ix)
    x_test, y_test = word2index(x_test, word_to_ix)

    return x_train, x_test, y_train, y_test, word_to_ix
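
A tiny worked illustration of the (context, target) pairs the loop above builds, using a toy padded character sequence rather than the real data file:

# With n = 3 and a character sequence padded by pad_both_ends(..., 2):
sen = ['<s>', 'h', 'e', 'l', 'l', 'o', '</s>']
n = 3
pairs = [(sen[i - n:i] + sen[i + 1:i + n + 1], sen[i]) for i in range(n, len(sen) - n)]
# pairs == [(['<s>', 'h', 'e', 'l', 'o', '</s>'], 'l')]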
Example #22
import nltk
from nltk.lm import MLE, Laplace, KneserNeyInterpolated
from nltk.corpus import reuters
nltk.download('reuters')
from collections import Counter
from nltk import bigrams, trigrams

text = reuters.sents()

text = [[j.lower() for j in i] for i in text]
text = [[''.join(c for c in s if c not in ["."]) for s in k] for k in text]

from nltk.lm.preprocessing import pad_both_ends

y = [list(pad_both_ends(sent, n=2)) for sent in text]

from nltk.lm.preprocessing import flatten
from nltk.util import everygrams
bigramsList = list(map(lambda x: list(trigrams(x)), y))
bigramsList = list(flatten(bigramsList))
#list(everygrams(bigramsList, max_len=2))

vocab = list(flatten(pad_both_ends(sent, n=2) for sent in text))
from nltk.lm import Vocabulary
vocab = list(Vocabulary(vocab, unk_cutoff=1))
'''
from nltk.lm.preprocessing import padded_everygram_pipeline
train, vocab = padded_everygram_pipeline(2, text)
'''
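
One way to finish this preparation is the pipeline sketched in the commented block; a minimal hedged continuation (not part of the original snippet), reusing the tokenized Reuters sentences in text:

from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

train, vocab = padded_everygram_pipeline(2, text)  # text: the lower-cased Reuters sentences above
lm = MLE(2)
lm.fit(train, vocab)
print(lm.score('said'))           # unigram relative frequency
print(lm.score('said', ['he']))   # P(said | he)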
Example #23
 def encode(self, sentence):
     return tuple(pad_both_ends(tuple(map(str.lower, word_tokenize(sentence))), self.order))
from nltk.util import bigrams
from nltk.lm.preprocessing import pad_both_ends

# Extracting the bigrams from the test text
textos_teste = "Alura"
bigrama_texto_teste = bigrams(textos_teste)
print("Bigram of {} => {}".format(textos_teste,list(bigrama_texto_teste)))

'''
Adding fake chars before the first and after the last character, so that every character of the sentence appears in
the same number of bigrams and we can easily identify the start and end characters of the sentence.
'''
padded_bigram = list(bigrams(pad_both_ends(textos_teste,n=2)))
print("Padded bigram => {}".format(padded_bigram))
Example #25
 def perplexity(self, sent: str):
     text = pad_both_ends(sent, n=self.order)
     text_ngrams = ngrams(text, n=self.order)
     return self.model.perplexity(text_ngrams)
tokenized_text = [
    list(map(str.lower, word_tokenize(sent))) for sent in sent_tokenize(text)
]
# Preprocess the tokenized text for 3-gram language modelling
n = 3  # highest n-gram order
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

## Test data must be padded with this technique, to match the padded_everygram_pipeline above
test_text = "3rd edition , hypothesis test using ."
tokenized_test = [
    list(map(str.lower, word_tokenize(sent)))
    for sent in sent_tokenize(test_text)
]
test_text_pad = list(flatten(
    pad_both_ends(sent, n) for sent in tokenized_test))
test_text_everygram = list(everygrams(test_text_pad, max_len=n))
test_data, padded_sents_test = padded_everygram_pipeline(n, tokenized_test)
print(test_text_everygram)

## How to import and instantiate the various language models
from nltk.lm import Lidstone, Laplace, MLE
from nltk.lm.models import InterpolatedLanguageModel, KneserNeyInterpolated
from nltk.lm.api import LanguageModel, Smoothing
from nltk.lm.smoothing import KneserNey, WittenBell
model = MLE(n)
#model = Laplace(n) #only add-one smoothing here
#model = Lidstone(0.1, n) #Lidstone's first argument is the gamma/alpha/delta value
#model = InterpolatedLanguageModel(WittenBell, n) #
#model = KneserNeyInterpolated(n, discount = 0.1) #only order and discount needed, WB only order
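
A hedged continuation of the snippet above (assuming n = 3 as set earlier): fit the chosen model on the pipeline output and evaluate the padded test everygrams, mirroring the model_iterator and KN_best_discount examples:

model.fit(train_data, padded_sents)
print(model.entropy(test_text_everygram))     # average cross-entropy over the test everygrams
print(model.perplexity(test_text_everygram))  # 2 ** entropy; can be inf for an unsmoothed MLE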
Example #27
def main():
    #Preprocessing Phase

    #A very very small corpus with 2 sentences
    text = [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']]

    #Getting the first sentence
    first_text = text[0]

    #Perform padding to be able to find the most likely word beginning/ending a sentence
    padded_text = list(pad_both_ends(first_text, n=2))  # 2 for bigrams

    #Everygrams calculates unigram, bigram, trigram, . . ., ngram for us (max_len = 3 means from unigram to trigram)
    print(list(everygrams(padded_text, max_len=3)))

    #Flatten the sentences into a single stream of words to build a vocabulary
    flattened = list(flatten(pad_both_ends(sent, n=3) for sent in text))

    print(flattened)
    train, vocab = padded_everygram_pipeline(3, text)
    print(train, vocab)

    #Training Phase
    #Maximum Likelihood Estimation
    # (the probability of each word is estimated from its relative frequency in the corpus)
    # 3 means the maximum n-gram order (trigram)
    lm = MLE(3)  #MLE

    #First output = a blank vocabulary (will be filled when the model is trained)
    print("Our first empty vocabulary", len(lm.vocab))

    #Give the training data and our vocab to the library to train for us
    lm.fit(train, vocab)
    print("Our vocabulary after trained", lm.vocab)

    #Lets play with our model!

    #We can find word in the vocabulary (<UNK> means the word does not belong to the vocab!)
    print(lm.vocab.lookup(['a', 'b', 'c', 'x']))

    #Finding count in each gram. We have 3 ngram order and 45 ngrams (what does it mean by 45 ngrams?)
    print(lm.counts)

    #Let's begin with counting a unigram
    print(lm.counts['a'])

    #Then bigram
    print(lm.counts[['a']]['b'])

    #Then trigram. . .
    print(lm.counts[['a', 'b']]['c'])

    #Relative Frequency (Joint probability)
    print(lm.score('a'))

    #Which word has the highest score?
    print(lm.score('<s>'))

    total_score_unigram = 0
    print(flattened)

    #unigram score for w = P(w) = Count(w)/N
    print(lm.score('a'))
    print(lm.score('b'))

    #To avoid underflow we use log probabilities
    print(lm.logscore('a'))
    print(lm.logscore('b'))

    #The score in the model for each gram must sum up to exactly 1
    bag_of_word = set(flattened)
    for word in bag_of_word:
        total_score_unigram += lm.score(word)

    print(total_score_unigram)
    total_score_bigram = 0

    #score for a word w given x = P(w | x) = C(w,x)/C(x)
    for word in bag_of_word:
        total_score_bigram += lm.score(word, ['a'])

    print(total_score_bigram)

    #Generating a random sequence of words
    print(lm.generate(10))

    print(lm.generate(10, random_seed=3))  #Same random seed, the same sequence

    print(lm.generate(10, text_seed=['a'],
                      random_seed=3))  #given 'a' as the preceding word

    #Testing Phase
    #Given a held-out corpus (an additional corpus distinct from the training corpus)
    test_seq = [('a', 'b'), ('c', 'd'), ('e', 'f')]
    test_good_seq = list(bigrams(lm.vocab.lookup(['f', 'b', 'c', 'e'])))
    print(test_good_seq)
    #Evaluate by calculating entropy or perplexity of sequences of words
    print("Entropy: ", lm.entropy(test_seq), "Perplexity: ",
          lm.perplexity(test_seq))
    print("Entropy: ", lm.entropy(test_good_seq), "Perplexity: ",
          lm.perplexity(test_good_seq))

    #Homework
    #Why are entropy and perplexity sometimes inf?
    #How can that be avoided?
    #Hint: Smoothing, backoff-interpolations

    #Try using a different corpus and
    #generate random sentences with the unigram, bigram and trigram models to compare

    def list_followers(self, word):
        followers = set()
        for tup in list(bigrams(flattened)):
            if tup[0] == word:
                followers.add(tup[1])
        print(followers)
        return followers
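
A minimal sketch of the smoothing hint from the homework comments: Laplace (add-one) smoothing assigns non-zero probability to unseen n-grams, so perplexity stays finite where the unsmoothed MLE returns inf (the corpus is the same toy text used above):

from nltk.lm import MLE, Laplace
from nltk.lm.preprocessing import padded_everygram_pipeline

text = [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']]
unseen = [('b', 'a')]  # the bigram ('b', 'a') never occurs in the corpus

train, vocab = padded_everygram_pipeline(3, text)
lm_mle = MLE(3)
lm_mle.fit(train, vocab)
print(lm_mle.perplexity(unseen))      # inf: MLE gives the unseen bigram probability 0

train, vocab = padded_everygram_pipeline(3, text)  # the pipeline returns generators, so rebuild
lm_laplace = Laplace(3)
lm_laplace.fit(train, vocab)
print(lm_laplace.perplexity(unseen))  # finite: add-one smoothing gives it a small probability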
Example #28
# # Preparing Data
# Before we train our **ngram** models it is necessary to make sure the data we put in them is in the right format.
def clean(doc):
    stop = set(stopwords.words('english'))
    exclude = set(string.punctuation) 
    stop_free = " ".join([i for i in doc.lower().split('[^A-Za-z]+') if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    return punc_free

text = [clean(doc).split() for doc in data]
#print(doc_clean)


# Creating bigrams
text = list(bigrams(pad_both_ends(text, n=2)))

# Creating unigrams as well as bigrams
#padded_bigrams = list(pad_both_ends(text, n=2))
#text = list(everygrams(padded_bigrams, max_len=2))
#print(text)


# During training and evaluation our model will rely on a vocabulary that defines which words are “known” to the model. To create this vocabulary we need to pad our sentences (just like for counting ngrams) and then combine the sentences into one flat stream of words.
text = list(flatten(pad_both_ends(sent, n=2) for sent in text))

# # Training a Model
train, vocab = padded_everygram_pipeline(2, text)

# let us train a **Maximum Likelihood Estimator (MLE)**. We only need to specify the highest ngram order to instantiate it.
lm = MLE(10)
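
For comparison, a self-contained sketch of the pipeline the comments describe, with the model order matching the pipeline order (the sentences are illustrative, not the original data):

from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

sentences = [['language', 'models', 'are', 'fun'], ['models', 'need', 'data']]
order = 2
train, vocab = padded_everygram_pipeline(order, sentences)
lm = MLE(order)
lm.fit(train, vocab)
print(lm.score('models'))                # unigram relative frequency
print(lm.score('models', ['language']))  # P(models | language)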
Example #29
def my_pad_both_ends(sentence):
    return list(pad_both_ends(sentence, n=2))
Example #30
def preprocess_text(sequences):
    # Preprocessed : iterator on sequence
    preprocessed = [pad_both_ends(s.split(' '), n=2) for s in sequences]
    return list(flatten(preprocessed))
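
A quick check of the helper above (expected output shown as a comment, assuming the default '<s>'/'</s>' pad symbols):

print(preprocess_text(["the cat sat", "dogs bark"]))
# ['<s>', 'the', 'cat', 'sat', '</s>', '<s>', 'dogs', 'bark', '</s>']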