import nltk
from nltk.lm import KneserNeyInterpolated
from nltk.lm.preprocessing import padded_everygram_pipeline


def trigram_model(tokenized_text, test_sentences, sentence_count):
    n = 3
    average_perplexity = 0.0

    # Train an interpolated Kneser-Ney trigram model on the padded training n-grams.
    train_data, padded_vocab = padded_everygram_pipeline(n, tokenized_text)
    model = KneserNeyInterpolated(n)
    model.fit(train_data, padded_vocab)

    # Tokenize and lowercase the test sentences, then build their padded n-grams.
    test_tokens = [
        list(map(str.lower, nltk.tokenize.word_tokenize(sent)))
        for sent in test_sentences
    ]
    test_data, _ = padded_everygram_pipeline(n, test_tokens)

    # Accumulate per-sentence perplexity, skipping sentences whose perplexity is infinite.
    for test in test_data:
        ngrams = list(test)
        perplexity = model.perplexity(ngrams)
        if perplexity != float('inf'):
            average_perplexity += perplexity
    average_perplexity /= sentence_count

    print(
        f"Average Perplexity for Trigram model on Test tweets: "
        f"{round(average_perplexity, 4)}"
    )
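For context, here is a minimal usage sketch of the function above; the training corpus and test tweets are hypothetical placeholders, not data from the original pipeline, and word_tokenize assumes the NLTK punkt data has been downloaded.

# Hypothetical example data; the real pipeline tokenizes a tweet corpus upstream.
# nltk.download('punkt') is assumed to have been run once.
train_corpus = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["the", "dog", "sat", "on", "the", "rug"],
]
test_tweets = ["The cat sat on the rug.", "A dog on the mat."]
trigram_model(train_corpus, test_tweets, sentence_count=len(test_tweets))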
import re
import joblib
from nltk.lm import KneserNeyInterpolated
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends
from nltk.util import ngrams

# sentences_strings_ted is assumed to be the list of raw sentence strings
# extracted from the Chinese TED-talk corpus prepared earlier.
# Strip Latin letters and digits, drop empty sentences, and split each
# sentence into individual characters for a character-level model.
sentences_strings_ted = [re.sub(r'[a-zA-Z0-9]', '', sent) for sent in sentences_strings_ted]
sentences_strings_ted = filter(None, sentences_strings_ted)
data = ' '.join([re.sub(r'\s', '', sent) for sent in sentences_strings_ted]).split(' ')
datax = [' '.join(sent).split(' ') for sent in data]
del sentences_strings_ted, data

# Train a character-level 5-gram model with interpolated Kneser-Ney smoothing.
lm = KneserNeyInterpolated(5)
train, vocab = padded_everygram_pipeline(5, datax)
lm.fit(train, vocab)
del train, vocab, datax

# Perplexity test
test = '我想带你们体验一下,我们所要实现的“信任”的感觉。'
sent_list = re.sub(r'[^\w\s]', '', test)      # remove punctuation
sent_list = ','.join(sent_list).split(',')    # split into single characters
text = list(ngrams(pad_both_ends(sent_list, 5), 5))
entropy = lm.entropy(text)        # cross-entropy
perplexity = lm.perplexity(text)  # perplexity
print('Cross-entropy: %f' % entropy, 'Perplexity: %f' % perplexity)

# Save the model. The steps below run out of memory on my machine,
# so run them on Colaboratory or Kaggle to borrow Google's servers.
joblib.dump(lm, 'kn_5gram.pkl')

# In[]
# Test the saved model
kn = joblib.load('kn_5gram.pkl')
kn_entropy = kn.entropy(text)        # cross-entropy
kn_perplexity = kn.perplexity(text)  # perplexity
print('KN cross-entropy: %f' % kn_entropy, 'KN perplexity: %f' % kn_perplexity)