Example #1
File: test_models.py  Project: rmalouf/nltk
class KneserNeyInterpolatedTrigramTests(unittest.TestCase):
    def setUp(self):
        vocab, training_text = _prepare_test_data(3)
        self.model = KneserNeyInterpolated(3, vocabulary=vocab)
        self.model.fit(training_text)

    score_tests = [
        # For unigram scores revert to uniform
        # Vocab size: 8
        # count('c'): 1
        ("c", None, 1.0 / 8),
        # in vocabulary but unseen, still uses uniform
        ("z", None, 1 / 8),
        # out of vocabulary should use "UNK" score, i.e. again uniform
        ("y", None, 1.0 / 8),
        # alpha = count('bc') - discount = 1 - 0.1 = 0.9
        # gamma(['b']) = discount * number of unique words that follow ['b'] = 0.1 * 2
        # normalizer = total number of bigrams with this context = 2
        # the final should be: (alpha + gamma * unigram_score("c")) / normalizer
        ("c", ["b"], (0.9 + 0.2 * (1 / 8)) / 2),
        # building on that, let's try 'a b c' as the trigram
        # alpha = count('abc') - discount = 1 - 0.1 = 0.9
        # gamma(['a', 'b']) = 0.1 * 1
        # normalizer = total number of trigrams with prefix "ab" = 1 => we can ignore it!
        ("c", ["a", "b"], 0.9 + 0.1 * ((0.9 + 0.2 * (1 / 8)) / 2)),
    ]
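The arithmetic in the comments above can be replayed directly; the following is a small sanity check (not part of the original tests) that recomputes the expected bigram and trigram scores from the counts stated in the comments, with discount 0.1 and a vocabulary of 8 words.

import math

discount = 0.1
unigram = 1.0 / 8                     # uniform score over the 8-word vocabulary
alpha_bc = 1 - discount               # count('bc') - discount
gamma_b = discount * 2                # 2 unique words follow 'b'
normalizer_b = 2                      # bigrams with context 'b'
p_c_given_b = (alpha_bc + gamma_b * unigram) / normalizer_b
assert math.isclose(p_c_given_b, (0.9 + 0.2 * (1 / 8)) / 2)

alpha_abc = 1 - discount              # count('abc') - discount
gamma_ab = discount * 1               # 1 unique word follows 'ab'
p_c_given_ab = alpha_abc + gamma_ab * p_c_given_b   # normalizer is 1
assert math.isclose(p_c_given_ab, 0.9 + 0.1 * ((0.9 + 0.2 * (1 / 8)) / 2))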
Example #2
def create_model(self, model_nm):
    self.model = {
        "lidstone": Lidstone(0.5, self.ngram_order),
        "kneserney": KneserNeyInterpolated(self.ngram_order),
        "wittenbell": WittenBellInterpolated(self.ngram_order)
    }[model_nm]
    train, vocab = padded_everygram_pipeline(self.ngram_order, self.text)
    vocab = Vocabulary(vocab, unk_cutoff=2, unk_label="<UNK>")
    print("Creating ngram...")
    self.model.fit(train, vocab)
    print("done")
Example #4
class TestKneserNeyInterpolatedTrigram(metaclass=ParametrizedTests):
    @classmethod
    def setup_method(self):
        vocab, training_text = _prepare_test_data(3)
        self.model = KneserNeyInterpolated(3, discount=0.75, vocabulary=vocab)
        self.model.fit(training_text)

    score_tests = [
        # P(c) = count('*c') / unique('**')
        #      = 1 / 14
        ("c", None, 1.0 / 14),
        # P(z) = count('*z') / unique('**')
        #      = 0 / 14
        # 'z' is in the vocabulary, but it was not seen during training.
        ("z", None, 0.0 / 14),
        # P(y)
        # Out of vocabulary should use "UNK" score.
        # P(y) = P(UNK) = count('*UNK') / unique('**')
        ("y", None, 3 / 14),
        # We start with P(c|b)
        # P(c|b) = alpha('bc') + gamma('b') * P(c)
        # alpha('bc') = max(unique('*bc') - discount, 0) / unique('*b*')
        #             = max(1 - 0.75, 0) / 2
        #             = 0.125
        # gamma('b')  = discount * unique('b*') / unique('*b*')
        #             = (0.75 * 2) / 2
        #             = 0.75
        ("c", ["b"], (0.125 + 0.75 * (1 / 14))),
        # Building on that, let's try P(c|ab).
        # P(c|ab) = alpha('abc') + gamma('ab') * P(c|b)
        # alpha('abc') = max(count('abc') - discount, 0) / count('ab*')
        #              = max(1 - 0.75, 0) / 1
        #              = 0.25
        # gamma('ab')  = (discount * unique('ab*')) / count('ab*')
        #              = 0.75 * 1 / 1
        ("c", ["a", "b"], 0.25 + 0.75 * (0.125 + 0.75 * (1 / 14))),
        # P(c|zb)
        # The ngram 'zbc' was not seen, so we use P(c|b). See issue #2332.
        ("c", ["z", "b"], (0.125 + 0.75 * (1 / 14))),
    ]
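The same hand calculation works for the discount-0.75 variant; this sketch (not from the original file) recomputes P(c), P(c|b), and P(c|ab) from the continuation counts quoted in the comments.

import math

discount = 0.75
p_c = 1 / 14                          # unique('*c') / unique('**')

alpha_bc = max(1 - discount, 0) / 2   # unique('*bc') = 1, unique('*b*') = 2
gamma_b = discount * 2 / 2            # unique('b*') = 2
p_c_given_b = alpha_bc + gamma_b * p_c
assert math.isclose(p_c_given_b, 0.125 + 0.75 * (1 / 14))

alpha_abc = max(1 - discount, 0) / 1  # count('abc') = 1, count('ab*') = 1
gamma_ab = discount * 1 / 1           # unique('ab*') = 1
p_c_given_ab = alpha_abc + gamma_ab * p_c_given_b
assert math.isclose(p_c_given_ab, 0.25 + 0.75 * (0.125 + 0.75 * (1 / 14)))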
Example #5
class TestKneserNeyInterpolatedTrigram(metaclass=ParametrizedTests):
    @classmethod
    def setup_method(self):
        vocab, training_text = _prepare_test_data(3)
        self.model = KneserNeyInterpolated(3, vocabulary=vocab)
        self.model.fit(training_text)

    score_tests = [
        # # of bigrams ending with c = 1
        # total # of unique bigrams = 14
        ("c", None, 1.0 / 14),
        # in vocabulary but unseen
        # # of bigrams ending with z = 0
        ("z", None, 0.0 / 14),
        # out of vocabulary should use "UNK" score
        # # of bigrams ending with <UNK> = 3
        ("y", None, 3 / 14),
        # alpha = max(count('bc') - discount,0)/# of bigrams starting 'b'
        # = (1 - 0.75)/2 = 0.125
        # gamma(['b']) = (discount * number of unique continuations after ['b'])/ # of bigrams starting 'b'
        # = (0.75 * 2)/2 = 0.75
        # the final should be: (alpha + gamma * unigram_score("c"))
        ("c", ["b"], (0.125 + 0.75 * (1 / 14))),
        # building on that, let's try 'a b c' as the trigram
        # alpha = max(count('abc') - discount,0)/# of trigrams starting "ab"
        # = max(1 - 0.75, 0)/1 = 0.25
        # gamma(['a', 'b']) = (discount * number of unique continuations after ['ab'])/ # of bigrams starting 'ab'
        # = 0.75 * 1/1
        # final: alpha + gamma*(P(c|b))
        # alpha of P(c|b) = max(unique trigram continuations '*bc' - discount, 0)/unique continuations '*b*'
        # = (1 - 0.75)/2 = 0.125
        # gamma of P(c|b) = (discount * unique continuations 'b*')/unique continuations '*b*'
        # = 0.75 * 2/2
        ("c", ["a", "b"], 0.25 + 0.75 * (0.125 + 0.75 * (1 / 14))),
        # The ngram 'z b c' was not seen, so we should simply revert to
        # the score of the ngram 'b c'. See issue #2332.
        ("c", ["z", "b"], (0.125 + 0.75 * (1 / 14))),
    ]
Example #6
def trigram_model(tokenized_text, test_sentences, sentence_count):

    n = 3
    average_perplexity = 0.0
    train_data, padded_vocab = padded_everygram_pipeline(n, tokenized_text)
    model = KneserNeyInterpolated(n)
    model.fit(train_data, padded_vocab)

    tokenized_text = [
        list(map(str.lower, nltk.tokenize.word_tokenize(sent)))
        for sent in test_sentences
    ]

    test_data, _ = padded_everygram_pipeline(n, tokenized_text)

    for test in list(test_data):
        ngrams = list(test)
        if model.perplexity(ngrams) != float('inf'):
            average_perplexity += model.perplexity(ngrams)

    average_perplexity /= sentence_count
    print(
        f"Average Perplexity for Trigram model on Test tweets: {round(average_perplexity, 4)}"
    )
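A hypothetical call to trigram_model, using made-up toy data rather than the tweets the original script works with (the NLTK 'punkt' tokenizer data must be installed for word_tokenize):

toy_train = [["a", "b", "c"], ["a", "c", "d", "c"]]   # pre-tokenized training sentences
toy_test = ["a b c", "c d a b"]                       # raw test sentences
trigram_model(toy_train, toy_test, sentence_count=len(toy_test))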
Example #7
        temp = word_tokenize(sent)
        for idx, word in enumerate(temp):
            if word not in vnword:
                temp[idx] = 'unknown'
        result.append(temp)
    print('tokenize done')
    return result


if __name__ == '__main__':
    arg = get_arg()

    # get train data and tokenize
    with open(arg.doc_dir, 'r', encoding='utf-8') as fin:
        doc = fin.readlines()
    corpus = tokenize(doc)
    del doc

    vi_model = KneserNeyInterpolated(arg.ngram)
    train_data, padded_sent = padded_everygram_pipeline(arg.ngram, corpus)
    del corpus
    start_time = time.time()
    vi_model.fit(train_data, padded_sent)
    print('train %s-gram model in %d s' %
          (arg.ngram, time.time() - start_time))
    print('length of vocab = %s' % (len(vi_model.vocab)))

    with open(arg.model_dir, 'wb') as fout:
        pickle.dump(vi_model, fout)
    print('save model successfully!')
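A follow-up sketch, with a hypothetical file name and placeholder query, showing how the pickled model could be reloaded and scored later:

import pickle

with open('vi_lm.pkl', 'rb') as fin:   # hypothetical path; whatever arg.model_dir pointed to
    vi_model = pickle.load(fin)
# Score a word given a two-word context (assuming a trigram model was trained);
# 'unknown' is the placeholder token the tokenizer above maps OOV words to.
print(vi_model.score('unknown', ['<s>', '<s>']))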
Example #8
lm.fit(train, vocab)
print(lm.counts['America'])
print(lm.counts[['bless']]['America'])
print(lm.score('the'))
print(lm.score("America", ["bless"]))

train, vocab = padded_everygram_pipeline(2, state_union.sents())
lm = Laplace(2)
lm.fit(train, vocab)
print(lm.counts['America'])
print(lm.counts[['bless']]['America'])
print(lm.score('the'))
print(lm.score("America", ["bless"]))

train, vocab = padded_everygram_pipeline(2, state_union.sents())
lm = KneserNeyInterpolated(2)
lm.fit(train, vocab)
print(lm.counts['America'])
print(lm.counts[['bless']]['America'])
print(lm.score('the'))
print(lm.score("America", ["bless"]))

#EXERCISE 3

train, vocab = padded_everygram_pipeline(2,
                                         state_union.sents('1945-Truman.txt'))
lm = MLE(2)
lm.fit(train, vocab)
print(lm.generate(100))

# Exercise 4
class SynonymParaphraser:
    def __init__(self, model=None, ngram=3):
        if True:
            stanza.download('sv')  # download Swedish model
            self.nlp = stanza.Pipeline(
                'sv')  # initialize Swedish neural pipeline
            self.base_url = 'https://www.synonymer.se/sv-syn/'

        # Build Language Model from corpus
        if model is None:
            with open('kneyser_lm.pkl', 'rb') as f:
                self.model = pickle.load(f)

        else:
            self.model = KneserNeyInterpolated(ngram)
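            # NOTE: `corpus_file` is not defined anywhere in this snippet; it is
            # assumed to be a module-level path to the training corpus.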
            sentences = np.loadtxt(corpus_file, dtype='U', delimiter='\n')
            text = [
                list(map(str.lower, word_tokenize(sent))) for sent in sentences
            ]
            train_data, padded_sents = padded_everygram_pipeline(ngram, text)
            self.model.fit(train_data, padded_sents)

    def generate_paraphrases(self, source_file):
        # Read data and make a copy to store edited paraphrases
        source_data = pd.read_csv(source_file)['question1']
        paraphrases = source_data.copy()

        for i in range(1688, len(source_data)):
            # Clean source sentences and generate the dependency parse tree
            source_data[i] = clean_str(source_data[i])
            doc = self.nlp(source_data[i])
            print(doc)

            # Iterate all words to find potential words to replace with synonyms
            candidate_words = []
            for word in doc.sentences[0].words:
                if word.upos in ["ADJ", "ADV", "NOUN", "VERB"] and word.feats:
                    candidate_word = {
                        'word': word.text,
                        'index': word.id - 1,
                        'POS': word.upos
                    }
                    valid_candidate = True
                    features = [
                        feature.split('=') for feature in word.feats.split('|')
                    ]
                    for feature in features:
                        if feature[0] == 'VerbForm' and feature[1] == 'Part':
                            valid_candidate = False
                            break
                        candidate_word[feature[0]] = feature[1]
                    if valid_candidate:
                        candidate_words.append(candidate_word)

            replacements = 0
            best_candidate = {'word': '', 'index': 0, 'diff': -np.inf}
            for j, candidate in enumerate(candidate_words):
                candidate_synonyms = self.get_synonyms(candidate['word'])

                if candidate_synonyms == None:
                    continue
                original = (candidate['word'],
                            self.get_score(candidate['word'],
                                           candidate['index'], source_data[i]))
                best_synonym = original
                synonym_count = 0
                for synonym in candidate_synonyms:
                    synonym = self.get_inflection(candidate, synonym)
                    if synonym is None:
                        continue
                    synonym_count += 1
                    # Calculate score for the synonym and compare to the current best
                    score = self.get_score(synonym, candidate['index'],
                                           source_data[i])
                    if score > best_synonym[1]:
                        best_synonym = (synonym, score)

                    diff = score - original[1]

                    if best_candidate['diff'] < diff:
                        best_candidate['word'] = synonym
                        best_candidate['index'] = candidate['index']
                        best_candidate['diff'] = diff
                        print(
                            f'New best candidate: {synonym} with score {diff}')

                # Build paraphrase sentence
                if best_synonym[0] != candidate['word']:
                    new_sentence = ''
                    for (k, w) in enumerate(source_data[i].split()):
                        if k == candidate['index'] and best_synonym[0] != w:
                            new_sentence += best_synonym[0]
                            replacements += 1
                            print(f'Replaced word {w} with {best_synonym[0]}')
                        else:
                            new_sentence += w
                        if k < len(doc.sentences[0].words) - 1:
                            new_sentence += ' '
                    source_data[i] = new_sentence

            # Ensure that at least one word is replaced with a synonym
            if replacements == 0 and best_candidate['word'] != '':
                print(best_candidate.items())
                new_sentence = ''
                for (k, w) in enumerate(source_data[i].split()):
                    if k == best_candidate['index']:
                        new_sentence += best_candidate['word']
                    else:
                        new_sentence += w
                    if k < len(doc.sentences[0].words) - 1:
                        new_sentence += ' '
                source_data[i] = new_sentence

            print(f'{i} sentences done')
            print(source_data[i])
            print(paraphrases[i])
            print('\n')
            with open('synonym_samples_final.txt', 'a') as f:
                f.write(source_data[i] + '\n')

        return source_data

    def get_inflection(self, word, synonym):
        pos = POS[word['POS']]
        url = f"https://ws.spraakbanken.gu.se/ws/karp/v4/query?q=extended||and|pos|equals|{POS[word['POS']]}||and|wf|equals|{synonym}&resource=saldom"
        response = requests.get(url).json()['hits']

        if response['total'] == 0:
            return None

        msd = self.word_grammar(word)
        for i in range(len(response['hits'])):
            if response['hits'][i]['_source']['FormRepresentations'][0][
                    'baseform'] in synonym:
                word_forms = response['hits'][i]['_source']['WordForms']

                for j in range(len(word_forms)):
                    if word_forms[j]['msd'] == msd:
                        if word['POS'] == 'NOUN' and 'Gender' in word.keys():
                            inherent = 'n' if word['Gender'] == 'Neut' else 'u'
                            if inherent != response['hits'][i]['_source'][
                                    'FormRepresentations'][0]['inherent']:
                                return None
                        return word_forms[j]['writtenForm']

    def get_synonyms(self, word):
        synonyms = set()

        url = self.base_url + word
        html_doc = requests.get(url).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        soup = soup.find("div", {"id": "dict-default"})
        if soup == None:
            return None
        else:
            soup = soup.find("div", {"body"}).ul
        for synset in soup.find_all('li'):
            for syns in synset.find_all('ol', class_=lambda x: not x):
                for synonym in syns.find_all('a'):
                    if len(synonym.text.split()) > 1:
                        continue
                    synonyms.add(synonym.text)
        return synonyms

    def get_score(self, word, j, source_sentence):
        scores = []
        sentence_len = len(source_sentence.split())
        if sentence_len >= 3:
            if j >= 2:
                scores.append(
                    self.model.logscore(
                        word,
                        source_sentence.split()[(j - 2):(j - 1)]))
            if j < sentence_len - 2:
                scores.append(
                    self.model.logscore(
                        source_sentence.split()[j + 2],
                        [word, source_sentence.split()[j + 1]]))
            if j >= 1 and j < sentence_len - 1:
                scores.append(
                    self.model.logscore(
                        source_sentence.split()[j - 1],
                        [source_sentence.split()[j + 1], word]))
        else:
            if j == 0:
                scores.append(
                    self.model.logscore(source_sentence.split()[1], [word]))
            else:
                scores.append(
                    self.model.logscore(word, [source_sentence.split()[0]]))
        score = sum(scores) / len(scores)
        return score

    def word_grammar(self, word):
        grammar = None
        if word['POS'] == 'ADJ':
            if 'Degree' not in word:
                return None
            if word['Degree'] == 'Pos':
                grammar = 'pos'
            elif word['Degree'] == 'Cmp':
                grammar = 'komp'
                if 'Case' in word.keys() and word['Case'] == 'Nom':
                    grammar = grammar + ' nom'
                else:
                    grammar = grammar + ' gen'
                return grammar
            elif word['Degree'] == 'Sup':
                grammar = 'super'
                if 'Case' in word.keys() and word['Case'] == 'Nom':
                    grammar = grammar + ' nom'
                else:
                    grammar = grammar + ' gen'
                return grammar

            if 'Definite' not in word:
                return None
            if word['Definite'] == 'Ind':
                grammar = grammar + ' indef'
            elif word['Definite'] == 'Def':
                grammar = grammar + ' def'

            if 'Number' in word.keys():
                if word['Number'] == 'Sing':
                    grammar = grammar + ' sg'
                elif word['Number'] == 'Plur':
                    grammar = grammar + ' pl'

            if 'Gender' in word.keys() and word['Gender'] == 'Neut':
                grammar = grammar + ' n nom'
            else:
                grammar = grammar + ' u nom'

        elif word['POS'] == 'ADV':
            if 'Degree' not in word:
                return None
            else:
                if word['Degree'] == 'Pos':
                    grammar = 'pos'
                elif word['Degree'] == 'Cmp':
                    grammar = 'komp'
                elif word['Degree'] == 'Sup':
                    grammar = 'super'

        elif word['POS'] == 'VERB':
            if word['VerbForm'] == 'Inf':
                grammar = 'inf'
            elif word['VerbForm'] == 'Sup':
                grammar = 'sup'
            elif 'Tense' in word.keys() and word['Tense'] == 'Past':
                grammar = 'pret ind'
            elif word['Mood'] == 'Ind':
                grammar = 'pres ind'
            elif word['Mood'] == 'Imp':
                grammar = 'imper'
                return grammar

            if 'Voice' in word.keys() and word['Voice'] == 'Act':
                grammar = grammar + ' aktiv'
            else:
                grammar = grammar + ' s-form'

            # if
        elif word['POS'] == 'NOUN':
            if 'Number' not in word.keys():
                return None
            if word['Number'] == 'Sing':
                grammar = 'sg'
            elif word['Number'] == 'Plur':
                grammar = 'pl'

            if 'Definite' not in word.keys():
                return None
            elif word['Definite'] == 'Ind':
                grammar = grammar + ' indef'
            elif word['Definite'] == 'Def':
                grammar = grammar + ' def'

            if word['Case'] == 'Gen':
                grammar = grammar + ' gen'
            else:
                grammar = grammar + ' nom'

        return grammar
Example #10
train_data = [pre.token_to_text(tweet) for tweet in train_data]

## Test preprocessing: a list of tweets, each a list of sentences (with EOS markers), each a list of tokens
tweet_list_test = tweet_list[0:1000]
test_data = [pre.tokenize_word(tweet) for tweet in tweet_list_test]
test_data = [pre.clean_tokens(tweet) for tweet in test_data]
test_data = pre.clean_sent(test_data)
test_data = [pre.token_to_text(tweet) for tweet in test_data]

test_data_uni = lm.pad_eos(1, test_data) 
test_data_bi = lm.pad_eos(2, test_data) 
test_data_tri = lm.pad_eos(3, test_data) 

## Train models: unigram, bigram, trigram
## train unigram, bigram, and trigram KneserNeyInterpolated language models (on the 9,000 training tweets)
model_uni = lm.train_ngram(KneserNeyInterpolated(1), train_data, ngram=1)
model_bi = lm.train_ngram(KneserNeyInterpolated(2), train_data, ngram=2)
model_tri = lm.train_ngram(KneserNeyInterpolated(3), train_data, ngram=3)

## Test models: plot perplexity
## Important to use the Kneser-Ney model for zero-probability smoothing (otherwise perplexity goes to inf)
uni_perplexity = 0
bi_perplexity = 0
tri_perplexity = 0
for uni, bi, tri in zip(test_data_uni, test_data_bi, test_data_tri):
    uni_perplexity += model_uni.perplexity(uni)
    bi_perplexity += model_bi.perplexity(bi)
    tri_perplexity += model_tri.perplexity(tri)
uni_perplexity_avg = uni_perplexity/len(test_data)
bi_perplexity_avg = bi_perplexity/len(test_data)
tri_perplexity_avg = tri_perplexity/len(test_data)
Example #11
File: test_models.py  Project: rmalouf/nltk
def setUp(self):
    vocab, training_text = _prepare_test_data(3)
    self.model = KneserNeyInterpolated(3, vocabulary=vocab)
    self.model.fit(training_text)
Example #12
oov_tokens_counter = 0
for token in twitter_test_tokens:
    if token not in twitter_voc:
        oov_tokens_counter += 1
print('-- OOV for twitter test tokens in train twitter vocabulary: ',
      oov_tokens_counter / len(twitter_test_tokens))

## ------------- Q3
## Sentence split, tokenize, and lowercase the Wikipedia data you have collected, then keep the first 9,000 sentences
wiki_tok = [pre.clean_tokens(s) for s in wiki_save]
wiki_tok = pre.clean_sent(wiki_tok)
wiki_tok = [pre.token_to_text(sent) for sent in wiki_tok]
wiki_tok = wiki_tok[0:9000]

## train a trigram KneserNeyInterpolated language model (on these 9,000 Wikipedia sentences)
model_tri = lm.train_ngram(KneserNeyInterpolated(3), wiki_tok, ngram=3)

## Average perplexity of the model on the test Twitter sentences (the split containing 1,000 tweets)
tweet_list_test = tweet_list[0:1000]
test_data = [pre.tokenize_word(tweet) for tweet in tweet_list_test]
test_data = [pre.clean_tokens(tweet) for tweet in test_data]
test_data = pre.clean_sent(test_data)
test_data = [pre.token_to_text(tweet) for tweet in test_data]
test_data = lm.pad_eos(3, test_data)

tri_perplexity = 0
for tweet in test_data:
    tri_perplexity += model_tri.perplexity(tweet)
tri_perplexity_avg = tri_perplexity / len(tweet_list_test)
print("Perplexity on test_data model_tri =", tri_perplexity_avg)
Example #13
# Signature below is reconstructed from the parameters used in the body; the
# padding defaults follow nltk.lm.preprocessing and are an assumption.
def padded_everygram_pipeline(order, text, pad_left=True, pad_right=True,
                              left_pad_symbol="<s>", right_pad_symbol="</s>"):
    """Default preprocessing for a sequence of sentences: pad each sentence and
    return the sentences both as everygrams and as a flat stream of words.

    :param order: Largest ngram length produced by `everygrams`.
    :param text: Text to iterate over. Expected to be an iterable of sentences:
    Iterable[Iterable[str]]
    :return: iterator over text as ngrams, iterator over text as vocabulary data
    """
    padding_fn = partial(
        pad_both_ends,
        n=order,
        pad_left=pad_left,
        pad_right=pad_right,
        left_pad_symbol=left_pad_symbol,
        right_pad_symbol=right_pad_symbol,
    )
    return (
        (everygrams(list(padding_fn(sent)), max_len=order) for sent in text),
        flatten(map(padding_fn, text)),
    )


N = 2

# LM = MLE(N)
LM = KneserNeyInterpolated(N)
corpus = "data/cornell movie-dialogs corpus/formatted_movie_lines.txt"
# corpus = "test_corpus.txt"
with open(corpus) as f:
    raw = f.read()
print("corpus read")
tokens = nltk.word_tokenize(raw)
sents = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(raw)]

voc = Voc(corpus)
print(voc)
for s in sents:
    for w in s:
        voc.addWord(w)
print(voc)
sents = [[str(SOS_token)] + [str(voc.word2index[w]) for w in s] + [str(EOS_token)] for s in sents]
Example #14
for line in input_text_noparens.split('\n'):
    m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)
    sentences_strings_ted.extend(sent for sent in re.split('[。?!]', m.groupdict()['postcolon']) if sent)

del input_text_noparens, input_text

sentences_strings_ted = [re.sub(r'[^\w\s]', '', sent) for sent in sentences_strings_ted]
sentences_strings_ted = [re.sub(r'[a-zA-Z0-9]', '', sent) for sent in sentences_strings_ted]
sentences_strings_ted = filter(None, sentences_strings_ted)
data = ' '.join([re.sub(r'\s', '', sent) for sent in sentences_strings_ted]).split(' ')
datax = [' '.join(sent).split(' ') for sent in data]

del sentences_strings_ted, data

# Train a 5-gram model
lm = KneserNeyInterpolated(5)
train, vocab = padded_everygram_pipeline(5, datax)
lm.fit(train, vocab)

del train, vocab, datax
# Perplexity test
test = '我想带你们体验一下,我们所要实现的“信任”的感觉。'
sent_list = re.sub(r'[^\w\s]', '', test)
sent_list = ','.join(sent_list).split(',')
text = list(ngrams(pad_both_ends(sent_list, 5), 5))

entropy = lm.entropy(text)  # cross-entropy
perplexity = lm.perplexity(text)  # perplexity
print('cross-entropy: %f' % entropy, 'perplexity: %f' % perplexity)
# Save the model  ... the part below runs out of memory locally; run it on Colaboratory or Kaggle (Google's servers) instead
joblib.dump(lm, 'panti_gram.pkl')
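For reference, NLTK's language-model API computes perplexity as 2 raised to the cross-entropy (entropy is measured in bits), so the two values printed above are directly related; appended to the snippet:

import math
assert math.isclose(perplexity, 2 ** entropy)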
Example #15
def setup_method(self):
    vocab, training_text = _prepare_test_data(3)
    self.model = KneserNeyInterpolated(3, discount=0.75, vocabulary=vocab)
    self.model.fit(training_text)
Example #16
def kneserney_trigram_model(trigram_training_data, vocabulary):
    model = KneserNeyInterpolated(order=3, discount=0.75, vocabulary=vocabulary)
    model.fit(trigram_training_data)
    return model
Example #17
def kneserney_bigram_model(bigram_training_data, vocabulary):
    model = KneserNeyInterpolated(order=2, vocabulary=vocabulary)
    model.fit(bigram_training_data)
    return model
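A hypothetical way to drive these helpers end to end; the toy sentences and cutoff are made up, but the preprocessing mirrors the other examples on this page:

from nltk.lm import Vocabulary
from nltk.lm.preprocessing import padded_everygram_pipeline

sentences = [["a", "b", "c"], ["a", "c", "d", "c"]]            # toy tokenized corpus
trigram_data, padded_words = padded_everygram_pipeline(3, sentences)
vocabulary = Vocabulary(padded_words, unk_cutoff=1)            # vocab from the padded word stream
model = kneserney_trigram_model(trigram_data, vocabulary)
print(model.score("c", ["a", "b"]))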
Example #18
def setUp(self):
    vocab, training_text = _prepare_test_data(3)
    self.model = KneserNeyInterpolated(3, vocabulary=vocab)
    self.model.fit(training_text)
Example #19
def __init__(self, n: int = 3):
    self.n = n
    self.model = KneserNeyInterpolated(n)
    self.model_path = LM_PATH / f"{n}_gram.model"
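The snippet stops at the constructor; a hedged sketch of how such a wrapper might fit and persist its model (the helper name and pickle format are assumptions, not from the source):

import pickle
from nltk.lm.preprocessing import padded_everygram_pipeline

def train_and_save(wrapper, sentences):
    # Hypothetical helper: fit the wrapped model on tokenized sentences and pickle it
    # to the path the constructor prepared (LM_PATH / f"{n}_gram.model").
    train_data, vocab = padded_everygram_pipeline(wrapper.n, sentences)
    wrapper.model.fit(train_data, vocab)
    with open(wrapper.model_path, 'wb') as fout:
        pickle.dump(wrapper.model, fout)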