Example #1
    def fit(self, values: List[Any]):
        patterns = [self.generator(v) for v in values]
        padded_patterns = [pad_both_ends(p, n=self.n) for p in patterns]
        ngrams_ = [ngrams(pp, n=self.n) for pp in padded_patterns]

        self.vocab = list(flatten(
            pad_both_ends(p, n=self.n) for p in patterns))
        self.model = MLE(self.n)
        self.model.fit(ngrams_, self.vocab)
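
A minimal standalone sketch of the same flow (not from the source class), assuming the generator yields a sequence of symbols per value, e.g. the characters of a string, and n = 2; the names below are illustrative:

from nltk.lm import MLE
from nltk.lm.preprocessing import flatten, pad_both_ends
from nltk.util import ngrams

values = ["abc", "abd"]
n = 2
patterns = [list(v) for v in values]                       # stand-in for self.generator(v)
padded = [list(pad_both_ends(p, n=n)) for p in patterns]   # ['<s>', 'a', 'b', 'c', '</s>'], ...
train = [list(ngrams(pp, n=n)) for pp in padded]           # bigram tuples per pattern
vocab = list(flatten(pad_both_ends(p, n=n) for p in patterns))

model = MLE(n)
model.fit(train, vocab)
print(model.score("b", ["a"]))                             # P(b | a)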
Example #2
def train_models(filename):
    """ Vous ajoutez à partir d'ici tout le code dont vous avez besoin
        pour construire les différents modèles N-grammes.
        Voir les consignes de l'énoncé du travail pratique concernant les modèles à entraîner.

        Vous pouvez ajouter au fichier toutes les fonctions, classes, méthodes et variables que vous jugerez nécessaire.
        Merci de ne pas modifier les signatures (noms de fonctions et arguments) déjà présentes dans le fichier.
    """
    proverbs = load_proverbs(filename)
    print("\nNombre de proverbes : ", len(proverbs))

    tokens = [word_tokenize(sentence) for sentence in proverbs]

    global models
    models = {1: Laplace(1), 2: Laplace(2), 3: Laplace(3), 20: Bigrams()}

    models[1].fit(
        [list(ngrams(pad_both_ends(sentence, 1), 1)) for sentence in tokens],
        set(flatten(pad_both_ends(sentence, 1) for sentence in tokens)))
    models[2].fit(
        [list(ngrams(pad_both_ends(sentence, 2), 2)) for sentence in tokens],
        set(flatten(pad_both_ends(sentence, 2) for sentence in tokens)))
    models[3].fit(
        [list(ngrams(pad_both_ends(sentence, 3), 3)) for sentence in tokens],
        set(flatten(pad_both_ends(sentence, 3) for sentence in tokens)))
    models[20].fit(
        [list(ngrams(pad_both_ends(sentence, 2), 2)) for sentence in tokens],
        set(flatten(pad_both_ends(sentence, 2) for sentence in tokens)))
Example #3
def prepare_data_ngram(data_path, test_size=0.3, n=3, random_state=42):
    """
    生成N_gram的训练测试数据 [(x_{i-n},...,x_{i-1}), x_i)]
    """
    data = load_data(data_path)
    data = [
        list(pad_both_ends(list(line.replace('\n', '')), 2))
        for line in data[:5] if len(line) >= 2
    ]

    print(data[:2])
    word_to_ix = generate_vocab(data)

    ngram_list = []
    ngram_list.append([(sen[i:i + n - 1], sen[i + n - 1]) for sen in data
                       for i in range(len(sen) - 2)])

    print(ngram_list[:5])
    ngram_list = reduce(operator.add, ngram_list)
    print(ngram_list[:5])
    print(len(ngram_list))

    x_train, x_test = train_test_split(ngram_list,
                                       test_size=test_size,
                                       random_state=random_state)

    print(x_train[:3])
    print(x_test[:3])
    print('train={}, test={}'.format(len(x_train), len(x_test)))

    # indexing
    x_train, y_train = word2index(x_train, word_to_ix)
    x_test, y_test = word2index(x_test, word_to_ix)

    return x_train, x_test, y_train, y_test, word_to_ix
def sent_tokenize(sentence, n):
    """Return tokens of words from a sentence, n indicates n-gram, for padding purposes."""
    cleaned = clean(sentence)
    sent = cleaned.translate(str.maketrans('', '', string.punctuation))
    sent_tokens = nltk.word_tokenize(sent)
    tokens = []
    if n == 1:
        sent_tokens = pad_both_ends(sent_tokens, 1)
    if n == 2:
        sent_tokens = pad_both_ends(sent_tokens, 2)
    if n == 3:
        sent_tokens = pad_both_ends(sent_tokens, 3)
    if n == 4:
        sent_tokens = pad_both_ends(sent_tokens, 4)
    sent_tokens = [lemmatizer.lemmatize(token) for token in sent_tokens]
    tokens.extend(sent_tokens)
    return tokens
def calcular_perplexidade(modelo,frase):
    palavras_texto = WhitespaceTokenizer().tokenize(frase)
    palavras_com_fake_char = [list(pad_both_ends(palavra,n = 2)) for palavra in palavras_texto]
    palavras_bigramas = [list(bigrams(palavra)) for palavra in palavras_com_fake_char]
    perplexidade = 0
    for palavra in palavras_bigramas:
        perplexidade += modelo.perplexity(palavra)

    return perplexidade
    def score_text(self, t, model):

        n = 3

        tokenized = normalize_text(t).split()

        return sum(
            model.logscore(trigram[-1], trigram[:-1]) for trigram in ngrams(
                pad_both_ends(tokenized, n=n), n=n)) / len(tokenized)
def tokenize(text, n):
    """Return tokens of words, n indicates n-gram, for padding purposes."""
    sentences = nltk.sent_tokenize(text)
    tokens = []
    for sent in sentences:
        sent = sent.translate(str.maketrans('', '', string.punctuation))
        sent_tokens = nltk.word_tokenize(sent)
        if n == 1:
            sent_tokens = pad_both_ends(sent_tokens, 1)
        if n == 2:
            sent_tokens = pad_both_ends(sent_tokens, 2)
        if n == 3:
            sent_tokens = pad_both_ends(sent_tokens, 3)
        if n == 4:
            sent_tokens = pad_both_ends(sent_tokens, 4)
        sent_tokens = [lemmatizer.lemmatize(token) for token in sent_tokens]
        tokens.extend(sent_tokens)
    return tokens
Example #8
File: format.py, Project: minhptx/spade
    def perplexity(self, sentence):
        tokenized_text = word_tokenize(sentence)
        tokenized_text = list(pad_both_ends(tokenized_text, 2))
        prob = 1
        for i in range(len(tokenized_text) - 1):
            prob = prob * self.model.score(tokenized_text[i + 1],
                                           [tokenized_text[i]])

        return (prob)**(1.0 / len(tokenized_text))
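
For reference, the standard definition of bigram-model perplexity uses a negative exponent, i.e. the inverse probability normalized by the number of tokens; in LaTeX notation:

PP(W) = P(w_1 \dots w_N)^{-1/N} = \Bigl(\prod_{i=2}^{N} P(w_i \mid w_{i-1})\Bigr)^{-1/N}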
Example #9
File: q2.py, Project: nazaninsbr/NLP-UT
def prep_test_data(data):
    preped_test_data = {news_type: {'word': {'unigram': [], 'bigram': []}, 'character': {'unigram': [], 'bigram': []}} for news_type in data.keys()}

    for news_type in data.keys():
        for news in data[news_type]:
            this_news_sentences = []
            this_news_characters = []
            for sent in news:
                this_news_sentences.extend(sent)
                for word in sent:
                    all_chars = [c for c in word]+[' ']
                    this_news_characters.extend(all_chars)

            preped_test_data[news_type]['word']['bigram'].append(list(ngrams(pad_both_ends(this_news_sentences, n=2), 2)))
            preped_test_data[news_type]['word']['unigram'].append(list(ngrams(pad_both_ends(this_news_sentences, n=1), 1)))

            preped_test_data[news_type]['character']['bigram'].append(list(ngrams(pad_both_ends(this_news_characters, n=2), 2)))
            preped_test_data[news_type]['character']['unigram'].append(list(ngrams(pad_both_ends(this_news_characters, n=1), 1)))
    
    return preped_test_data
Example #10
 def judge(self, o: Any) -> list:
     scores = []
     p = self.generator(o)
     p = list(pad_both_ends(p, n=self.n))
     for i, v in enumerate(p):
         if i < self.n - 1:
             continue
         letters = []
         for j in range(i - (self.n - 1), i):
             letters.append(p[j])
         scores.append(self.model.score(v, letters))
     return heapq.nsmallest(self.dim, scores)
def main():
    for i in range(len(models)):
        order = i + 1
        print('calculating ' + str(order) + '-gram perplexity...')
        p = 0
        model = models[i]
        for line in test_data:
            tokens = line.strip().split(' ')
            test_ngrams = list(ngrams(pad_both_ends(tokens, n=order), order))
            pp = model.perplexity(test_ngrams)
            p += pp
        print('{}-gram perplexity:'.format(order), p)
Example #12
	def get_MLELM(self, tokens, n_gram = 2) -> MLE:
		'''
			Train an MLE language model and cache it on the instance so it can be reused
		'''
		paddedLine = [list(pad_both_ends(tokens, n=n_gram))]
		train, vocab = padded_everygram_pipeline(2, paddedLine)
			
		if (tokens not in self.lms.keys()):
			lm = MLE(n_gram)
			lm.fit(train, vocab)
			self.lms[tokens] = lm
			
		return self.lms[tokens]
Example #13
def run_q4_code(best_model, test_data_dict):
    test_data_dict = test_data_dict['UnknownLabel']
    with open('../Result.csv', 'w') as fp:
        fp.write('Filename,Class\n')
        for filename in test_data_dict.keys():
            this_text = test_data_dict[filename]
            this_text_arr = []
            for sent in this_text:
                this_text_arr.extend(sent)
            preped_data = list(ngrams(pad_both_ends(this_text_arr, n=1), 1))
            predicted_class = predict_the_class(best_model, preped_data)
            line_to_write = filename + ',' + predicted_class + '\n'
            fp.write(line_to_write)
Example #14
def extract_ngrams_from_sentence(sentence, n):
    """
    Return the list of n-grams present in the sentence `sentence`.

    >>> extract_ngrams_from_sentence(["Alice", "est", "là"], 2)
    [("<s>", "Alice"), ("Alice", "est"), ("est", "là"), ("là", "</s>")]

    Be careful to handle the padding (start and end of sentence).

    :param sentence: list(str), a tokenized sentence
    :param n: int, the order of the n-grams
    :return: list(tuple(str)), the list of n-grams present in `sentence`
    """
    sentence = list(pad_both_ends(sentence,n))
    return list(nltk.ngrams(sentence,n))
Example #15
def model_iterator(n):
        perp = []
        n = n+1
        for n in range(1,n):
            print(n)
            train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)
            #model = MLE(n)
            #model = Laplace(n) #only add-one smoothing here
            #model = Lidstone(0.1, n) #Lidstone's first argument is the gamma/alpha/delta value
            #model = WittenBellInterpolated(n)
            model = KneserNeyInterpolated(n, discount = 0.88) #only order and discount needed, WB only order
            print(n,model)
            model.fit(train_data, padded_sents)
            print(model.vocab)
            vocab_list = []
            for word in model.vocab:
                vocab_list.append(word)
            #print(vocab_list)
            print("value",model. score('<UNK>'))
            #print(generate_sent_text_seed(model, 30, random_seed=['thicc']))

            #print(generate_sent(model, 50, random_seed = 30))
            entropy_fin = 0
            lense = 1000
            i = 0
            for z in range(lense):
                #print(contents[i])
                tokenized_test = [list(map(str.lower, word_tokenize(contents[i])))]
                if len(tokenized_test[0]) > 0:
                    for g in range(len(tokenized_test[0])):
                        if tokenized_test[0][g] not in vocab_list:
                            tokenized_test[0][g] = '<UNK>'
                    test_text_pad = list(flatten(pad_both_ends(sent, n) for sent in tokenized_test))
                    test_text_everygram = list(everygrams(test_text_pad, max_len=n))
                    #print(test_text_everygram)
                    #test_data, padded_sents_test = padded_everygram_pipeline(n, tokenized_test)
                    #print(i)
                    #print(model.entropy(test_text_bigram))
                    #print(model.entropy(test_text_everygram))
                    entropy_fin += model.entropy(test_text_everygram)
                i += 1
            print(entropy_fin)
            avg_entr = entropy_fin/lense
            print("perplexity",2**avg_entr)
            perp.append([n,2**avg_entr])
        import pandas as pd
        DF = pd.DataFrame(perp)
        return DF
Example #16
def KN_best_discount(n):
        perp = []
        for dis in range(80,100,2):
            dis = dis/100
            print(dis)
            train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)
            model = KneserNeyInterpolated(n, discount = dis) #only order and discount needed, WB only order
            print(n,model)
            model.fit(train_data, padded_sents)
            print(model.vocab)
            vocab_list = []
            for word in model.vocab:
                vocab_list.append(word)
            #print(vocab_list)
            print("value",model. score('<UNK>'))
            #print(generate_sent_text_seed(model, 30, random_seed=['thicc']))

            #print(generate_sent(model, 50, random_seed = 30))
            entropy_fin = 0
            lense = 100
            i = 0
            for z in range(lense):
                #print(contents[i])
                tokenized_test = [list(map(str.lower, word_tokenize(contents[i])))]
                if len(tokenized_test[0]) > 0:
                    for g in range(len(tokenized_test[0])):
                        if tokenized_test[0][g] not in vocab_list:
                            tokenized_test[0][g] = '<UNK>'
                    test_text_pad = list(flatten(pad_both_ends(sent, n) for sent in tokenized_test))
                    test_text_everygram = list(everygrams(test_text_pad, max_len=n))
                    #print(test_text_everygram)
                    #test_data, padded_sents_test = padded_everygram_pipeline(n, tokenized_test)
                    #print(i)
                    #print(model.entropy(test_text_bigram))
                    #print(model.entropy(test_text_everygram))
                    entropy_fin += model.entropy(test_text_everygram)
                i += 1
            print(entropy_fin)
            avg_entr = entropy_fin/lense
            print("perplexity",2**avg_entr)
            perp.append([dis,2**avg_entr])
        import pandas as pd
        DF = pd.DataFrame(perp)
        return DF
Example #17
def cloze_test(incomplete_proverb, choices, n=3):
    """ Fonction qui complète un texte à trous en ajoutant le(s) bon(s) mot(s).
        En anglais, on nomme ce type de tâche un cloze test.

        La paramètre n désigne le modèle utilisé.
        1 - unigramme NLTK, 2 - bigramme NLTK, 3 - trigramme NLTK, 20 - votre modèle bigramme
    """
    results = []

    for choice in choices:
        split = incomplete_proverb.split()
        index = [
            character for character, token in enumerate(split)
            if '***' in token
        ][0]
        proverb = incomplete_proverb.replace('***', choice)

        tokens = word_tokenize(proverb)
        order = 2 if n == 20 else n
        multigrams = list(ngrams(pad_both_ends(tokens, order), order))

        if n == 1:
            logscore = models[n].logscore(choice)
            choice_perplexity = models[n].perplexity(multigrams)
        elif n == 2:
            logscore = models[n].logscore(choice, [split[index - 1]])
            choice_perplexity = models[n].perplexity(multigrams)
        elif n == 3:
            logscore = models[n].logscore(choice,
                                          [split[index - 2], split[index - 1]])
            choice_perplexity = models[n].perplexity(multigrams)
        elif n == 20:
            logscore = models[n].logscore(choice, split[index - 1])
            choice_perplexity = models[n].perplexity(multigrams)

        results.append((proverb, multigrams, logscore))  # choice_perplexity

    results.sort(key=lambda x: x[2], reverse=True)  # False
    result = results[0][0]
    perplexity = models[n].perplexity(results[0][1])

    return result, perplexity
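
A hypothetical call, assuming train_models(...) has already populated the global models; the proverb string and choices below are illustrative, not taken from the assignment data:

result, perplexity = cloze_test("a beau mentir qui vient de ***", ["loin", "ici"], n=3)
print(result, perplexity)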
Example #18
    def output_prob_for_excel(self, excel, num):
        """
        解析Excel,并返回加入了prob的dataFrame
        """
        if num < 0 or num > self.n-1:
            logging.error('size of context is larger than ngram setting!!')
            return None
        else:
            df = pd.read_excel(excel)
            print(df.shape)
            # Clean, tokenize, and fill initial labels
            data = np.array(df)
            data = [list(pad_both_ends(str(line), n=2)) for line in data[:,3]]
            print(len(data), type(data))
            print(data[0], type(data[0]))

            if num == 0:
                prob_list = []
                for line in data:
                    prob_dict = []
                    for word in line:
                        prob_dict.append(math.log(self.calculate_marginal_prob(word)+1e-8))
                    prob_list.append(sum(prob_dict)/len(prob_dict))

            else:
                prob_list = []
                for line in data:
                    prob_dict = []
                    for i in range(len(line)-num):
                        context = line[i:i+num]
                        word = line[i+num]
                        prob = self.calculate_conditional_prob(word, context)
                        # prob_dict.append((word+'|'+','.join(context), prob))
                        prob_dict.append(math.log(prob + 1e-8))
                    prob_list.append(sum(prob_dict)/len(prob_dict))

            # Update the DataFrame
            df['prob'] = prob_list
            df.to_excel(excel)

            return df
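
In other words, for a padded row w_1 ... w_L and context size k = num > 0, the value written to the prob column is the average conditional log-probability; in LaTeX notation:

\text{prob} = \frac{1}{L-k} \sum_{i=1}^{L-k} \log\bigl(P(w_{i+k} \mid w_i, \dots, w_{i+k-1}) + 10^{-8}\bigr)

with the marginal P(w_i) used instead when num = 0.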
Example #19
def generate_MLE(positive, negative):
    tokenized = []
    text = ""
    n = 2 # n-gram size

    locations = [positive, negative]
    for location in locations:
        for filename in os.listdir(location):
            if filename.endswith(".txt"):
                contents = open(location + "/" + filename, "r", encoding="windows-1252").read()
                review = sent_tokenize(contents)
                for word in review:
                    tokenized.append(word.lower())
                    
                text += contents + " "

    paddedLine = [list(pad_both_ends(word_tokenize(text.lower()), n))]
    train, vocab = padded_everygram_pipeline(n, paddedLine)

    model = MLE(n)
    model.fit(train, vocab)

    return model
Example #20
def extract_ngrams_from_sentence(sentence, n):
    """
    Return the list of n-grams present in the sentence `sentence`.

    >>> extract_ngrams_from_sentence(["Alice", "est", "là"], 2)
    [("<s>", "Alice"), ("Alice", "est"), ("est", "là"), ("là", "</s>")]

    Be careful to handle the padding (start and end of sentence).

    :param sentence: list(str), a tokenized sentence
    :param n: int, the order of the n-grams
    :return: list(tuple(str)), the list of n-grams present in `sentence`
    """

    sentence = list(pad_both_ends(sentence, n))

    n_gram_list = []
    for i in range(len(sentence) - n + 1):
        n_gram = []
        for j in range(n):
            n_gram.append(sentence[i + j])
        n_gram_list.append(tuple(n_gram))

    return n_gram_list
Example #21
def prepare_data_cbow(data_path, test_size=0.3, n=3, random_state=42):
    """
    生成CBOW的训练测试数据, [(x_{i-n},...,x_{i-1}, x_{i+1},...,x_{i+n}), x_i)]
    """
    data = load_data(data_path)
    data = [
        list(pad_both_ends(list(line.replace('\n', '')), 2))
        for line in data[:5] if len(line) >= 2
    ]

    print(data[:2])
    word_to_ix = generate_vocab(data)

    ngram_list = []
    for sen in data:
        for i in range(n, len(sen) - n):
            context = sen[i - n:i] + sen[i + 1:i + n + 1]
            target = sen[i]
            ngram_list.append((context, target))

    print(ngram_list[:5])
    print(len(ngram_list))

    x_train, x_test = train_test_split(ngram_list,
                                       test_size=test_size,
                                       random_state=random_state)

    print(x_train[:3])
    print(x_test[:3])
    print('train={}, test={}'.format(len(x_train), len(x_test)))

    # indexing
    x_train, y_train = word2index(x_train, word_to_ix)
    x_test, y_test = word2index(x_test, word_to_ix)

    return x_train, x_test, y_train, y_test, word_to_ix
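
A tiny worked illustration of the (context, target) pairs the loop above builds, using a toy padded character sequence rather than the real data file:

# With n = 3 and a character sequence padded by pad_both_ends(..., 2):
sen = ['<s>', 'h', 'e', 'l', 'l', 'o', '</s>']
n = 3
pairs = [(sen[i - n:i] + sen[i + 1:i + n + 1], sen[i]) for i in range(n, len(sen) - n)]
# pairs == [(['<s>', 'h', 'e', 'l', 'o', '</s>'], 'l')]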
Example #22
import nltk
from nltk.lm import MLE, Laplace, KneserNeyInterpolated
from nltk.corpus import reuters
nltk.download('reuters')
from collections import Counter
from nltk import bigrams, trigrams

text = reuters.sents()

text = [[j.lower() for j in i] for i in text]
text = [[''.join(c for c in s if c not in ["."]) for s in k] for k in text]

from nltk.lm.preprocessing import pad_both_ends

y = [list(pad_both_ends(sent, n=2)) for sent in text]

from nltk.lm.preprocessing import flatten
from nltk.util import everygrams
bigramsList = list(map(lambda x: list(trigrams(x)), y))
bigramsList = list(flatten(bigramsList))
#list(everygrams(bigramsList, max_len=2))

vocab = list(flatten(pad_both_ends(sent, n=2) for sent in text))
from nltk.lm import Vocabulary
vocab = list(Vocabulary(vocab, unk_cutoff=1))
'''
from nltk.lm.preprocessing import padded_everygram_pipeline
train, vocab = padded_everygram_pipeline(2, text)
'''
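
One way to finish this preparation is the pipeline sketched in the commented block; a minimal hedged continuation (not part of the original snippet), reusing the tokenized Reuters sentences in text:

from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

train, vocab = padded_everygram_pipeline(2, text)  # text: the lower-cased Reuters sentences above
lm = MLE(2)
lm.fit(train, vocab)
print(lm.score('said'))           # unigram relative frequency
print(lm.score('said', ['he']))   # P(said | he)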
Example #23
 def encode(self, sentence):
     return tuple(pad_both_ends(tuple(map(str.lower, word_tokenize(sentence))), self.order))
from nltk.util import bigrams
from nltk.lm.preprocessing import pad_both_ends

# Extracting the bigrams from the test text
textos_teste = "Alura"
bigrama_texto_teste = bigrams(textos_teste)
print("Bigram of {} => {}".format(textos_teste,list(bigrama_texto_teste)))

'''
Adding fake chars before the first and after the last character, so that every character of the sentence appears in
the same number of bigrams and we can easily identify the start and end characters of the sentence.
'''
padded_bigram = list(bigrams(pad_both_ends(textos_teste,n=2)))
print("Padded bigram => {}".format(padded_bigram))
Example #25
 def perplexity(self, sent: str):
     text = pad_both_ends(sent, n=self.order)
     text_ngrams = ngrams(text, n=self.order)
     return self.model.perplexity(text_ngrams)
tokenized_text = [
    list(map(str.lower, word_tokenize(sent))) for sent in sent_tokenize(text)
]
# Preprocess the tokenized text for 3-gram language modelling
n = 3  # highest n-gram order
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

## Test data must be padded with this technique, to match the padded_everygram_pipeline above
test_text = "3rd edition , hypothesis test using ."
tokenized_test = [
    list(map(str.lower, word_tokenize(sent)))
    for sent in sent_tokenize(test_text)
]
test_text_pad = list(flatten(
    pad_both_ends(sent, n) for sent in tokenized_test))
test_text_everygram = list(everygrams(test_text_pad, max_len=n))
test_data, padded_sents_test = padded_everygram_pipeline(n, tokenized_test)
print(test_text_everygram)

## How to import and instantiate the various language models
from nltk.lm import Lidstone, Laplace, MLE
from nltk.lm.models import InterpolatedLanguageModel, KneserNeyInterpolated
from nltk.lm.api import LanguageModel, Smoothing
from nltk.lm.smoothing import KneserNey, WittenBell
model = MLE(n)
#model = Laplace(n) #only add-one smoothing here
#model = Lidstone(0.1, n) #Lidstone's first argument is the gamma/alpha/delta value
#model = InterpolatedLanguageModel(WittenBell, n) #
#model = KneserNeyInterpolated(n, discount = 0.1) #only order and discount needed, WB only order
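
A hedged continuation of the snippet above (assuming n = 3 as set earlier): fit the chosen model on the pipeline output and evaluate the padded test everygrams, mirroring the model_iterator and KN_best_discount examples:

model.fit(train_data, padded_sents)
print(model.entropy(test_text_everygram))     # average cross-entropy over the test everygrams
print(model.perplexity(test_text_everygram))  # 2 ** entropy; can be inf for an unsmoothed MLE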
Example #27
def main():
    #Preprocessing Phase

    #A very very small corpus with 2 sentences
    text = [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']]

    #Getting the first sentence
    first_text = text[0]

    #Perform padding to be able to find the most likely word beginning/ending a sentence
    padded_text = list(pad_both_ends(first_text, n=2))  # 2 for bigrams

    #Everygrams calculates unigram, bigram, trigram, . . ., ngram for us (max_len = 3 means from unigram to trigram)
    print(list(everygrams(padded_text, max_len=3)))

    #Flatten the sentences into a single stream of words to build a vocabulary
    flattened = list(flatten(pad_both_ends(sent, n=3) for sent in text))

    print(flattened)
    train, vocab = padded_everygram_pipeline(3, text)
    print(train, vocab)

    #Training Phase
    #Maximum Likelihood Estimation
    # (the probability of each word is estimated from its relative frequency in the corpus)
    # 3 means the maximum n-gram order (trigram)
    lm = MLE(3)  #MLE

    #First output = a blank vocabulary (will be filled when the model is trained)
    print("Our first empty vocabulary", len(lm.vocab))

    #Give the training data and our vocab to the library to train for us
    lm.fit(train, vocab)
    print("Our vocabulary after trained", lm.vocab)

    #Lets play with our model!

    #We can find word in the vocabulary (<UNK> means the word does not belong to the vocab!)
    print(lm.vocab.lookup(['a', 'b', 'c', 'x']))

    #Finding count in each gram. We have 3 ngram order and 45 ngrams (what does it mean by 45 ngrams?)
    print(lm.counts)

    #Let's begin with counting a unigram
    print(lm.counts['a'])

    #Then bigram
    print(lm.counts[['a']]['b'])

    #Then trigram. . .
    print(lm.counts[['a', 'b']]['c'])

    #Relative Frequency (Joint probability)
    print(lm.score('a'))

    #Which word has the highest score?
    print(lm.score('<s>'))

    total_score_unigram = 0
    print(flattened)

    #unigram score for w = P(w) = Count(w)/N
    print(lm.score('a'))
    print(lm.score('b'))

    #To avoid underflow we use log probabilities
    print(lm.logscore('a'))
    print(lm.logscore('b'))

    #The score in the model for each gram must sum up to exactly 1
    bag_of_word = set(flattened)
    for word in bag_of_word:
        total_score_unigram += lm.score(word)

    print(total_score_unigram)
    total_score_bigram = 0

    #score for a word w given x = P(w | x) = C(w,x)/C(x)
    for word in bag_of_word:
        total_score_bigram += lm.score(word, ['a'])

    print(total_score_bigram)

    #Generating a random sequence of words
    print(lm.generate(10))

    print(lm.generate(10, random_seed=3))  #Same random seed, the same sequence

    print(lm.generate(10, text_seed=['a'],
                      random_seed=3))  #given 'a' as the preceding word

    #Testing Phase
    #Given a held-out corpus (an additional corpus distinct from the training corpus)
    test_seq = [('a', 'b'), ('c', 'd'), ('e', 'f')]
    test_good_seq = list(bigrams(lm.vocab.lookup(['f', 'b', 'c', 'e'])))
    print(test_good_seq)
    #Evaluate by calculating entropy or perplexity of sequences of words
    print("Entropy: ", lm.entropy(test_seq), "Perplexity: ",
          lm.perplexity(test_seq))
    print("Entropy: ", lm.entropy(test_good_seq), "Perplexity: ",
          lm.perplexity(test_good_seq))

    #Homework
    #Why are entropy and perplexity sometimes inf?
    #How can that be avoided?
    #Hint: Smoothing, backoff-interpolations

    #Try using a different corpus and
    #generate random sentences with the unigram, bigram and trigram models to compare

    def list_followers(self, word):
        followers = set()
        for tup in list(bigrams(flattened)):
            if tup[0] == word:
                followers.add(tup[1])
        print(followers)
        return followers
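
A minimal sketch of the smoothing hint from the homework comments: Laplace (add-one) smoothing assigns non-zero probability to unseen n-grams, so perplexity stays finite where the unsmoothed MLE returns inf (the corpus is the same toy text used above):

from nltk.lm import MLE, Laplace
from nltk.lm.preprocessing import padded_everygram_pipeline

text = [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']]
unseen = [('b', 'a')]  # the bigram ('b', 'a') never occurs in the corpus

train, vocab = padded_everygram_pipeline(3, text)
lm_mle = MLE(3)
lm_mle.fit(train, vocab)
print(lm_mle.perplexity(unseen))      # inf: MLE gives the unseen bigram probability 0

train, vocab = padded_everygram_pipeline(3, text)  # the pipeline returns generators, so rebuild
lm_laplace = Laplace(3)
lm_laplace.fit(train, vocab)
print(lm_laplace.perplexity(unseen))  # finite: add-one smoothing gives it a small probability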
Example #28
# # Preparing Data
# Before we train our **ngram** models it is necessary to make sure the data we put in them is in the right format.
def clean(doc):
    stop = set(stopwords.words('english'))
    exclude = set(string.punctuation) 
    stop_free = " ".join([i for i in doc.lower().split('[^A-Za-z]+') if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    return punc_free

text = [clean(doc).split() for doc in data]
#print(doc_clean)


# Creating bigrams
text = list(bigrams(pad_both_ends(text, n=2)))

# Creating unigrams as well as bigrams
#padded_bigrams = list(pad_both_ends(text, n=2))
#text = list(everygrams(padded_bigrams, max_len=2))
#print(text)


# During training and evaluation our model will rely on a vocabulary that defines which words are “known” to the model. To create this vocabulary we need to pad our sentences (just like for counting ngrams) and then combine the sentences into one flat stream of words.
text = list(flatten(pad_both_ends(sent, n=2) for sent in text))

# # Training a Model
train, vocab = padded_everygram_pipeline(2, text)

# let us train a **Maximum Likelihood Estimator (MLE)**. We only need to specify the highest ngram order to instantiate it.
lm = MLE(10)
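
For comparison, a self-contained sketch of the pipeline the comments describe, with the model order matching the pipeline order (the sentences are illustrative, not the original data):

from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

sentences = [['language', 'models', 'are', 'fun'], ['models', 'need', 'data']]
order = 2
train, vocab = padded_everygram_pipeline(order, sentences)
lm = MLE(order)
lm.fit(train, vocab)
print(lm.score('models'))                # unigram relative frequency
print(lm.score('models', ['language']))  # P(models | language)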
Example #29
def my_pad_both_ends(sentence):
    return list(pad_both_ends(sentence, n=2))
Example #30
def preprocess_text(sequences):
    # Preprocessed : iterator on sequence
    preprocessed = [pad_both_ends(s.split(' '), n=2) for s in sequences]
    return list(flatten(preprocessed))
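
A quick check of the helper above (expected output shown as a comment, assuming the default '<s>'/'</s>' pad symbols):

print(preprocess_text(["the cat sat", "dogs bark"]))
# ['<s>', 'the', 'cat', 'sat', '</s>', '<s>', 'dogs', 'bark', '</s>']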