def add_doc(self, tokens, author):
    """Fold the trigram counts of one document into ``author``'s counts.

    Args:
        tokens: Token sequence of the document; it is passed straight to
            ``trigrams``.
        author: Key identifying the author in ``self.freqdists``,
            ``self.prob_dists`` and ``self.needs_probs_recounted``.

    Side effects:
        Creates the author's frequency and probability distributions on
        first sight, marks the author's probabilities as stale, and adds
        the document's trigram counts to the author's frequency
        distribution.
    """
    is_new_author = author not in self.freqdists
    if is_new_author:
        empty_counts = FreqDist()
        self.freqdists[author] = empty_counts
        self.prob_dists[author] = KneserNeyProbDist(empty_counts)

    # The counts are about to change, so the cached probability
    # distribution for this author has to be rebuilt before its next use.
    self.needs_probs_recounted[author] = True

    doc_counts = FreqDist(trigrams(tokens))
    self.freqdists[author].update(doc_counts)
def model_KN(self, contents):
    '''
    Build a Kneser-Ney smoothed n-gram model of ``contents``.

    Parameters:
    - contents : list containing repaired contents of the file whose n-gram
                 model is to be created

    Returns a dict mapping every n-gram sample known to the model to its
    Kneser-Ney smoothed probability.

    Side effect: stores the discount used by the model in
    ``self.discount_KN``.

    Uses ``KneserNeyProbDist`` from NLTK on a ``FreqDist`` of the
    ``self.N``-grams of ``contents``.
    NOTE(review): NLTK documents KneserNeyProbDist as supporting trigrams
    only, so this presumably requires self.N == 3 -- confirm with callers.
    '''
    ngram_counts = FreqDist(ngrams(contents, self.N))
    kn_prob_dist = KneserNeyProbDist(ngram_counts)
    self.discount_KN = kn_prob_dist.discount()
    return {
        sample: kn_prob_dist.prob(sample)
        for sample in kn_prob_dist.samples()
    }
def predict_author(self, tokens):
    """Return the author whose trigram model best explains ``tokens``.

    Args:
        tokens: Token sequence of the document to attribute. It is first
            converted with ``get_token_shapes`` and the trigrams of the
            result are scored.

    Returns:
        A ``(likely_author, best_score)`` tuple, where the score is the sum
        of ``log(prob + self.smoothing)`` over the document's trigrams.
        Both are ``None`` when no author has been registered.

    Side effects:
        Rebuilds the probability distribution of every author flagged in
        ``self.needs_probs_recounted`` and clears that flag.
    """
    # Built once and reused for every author: trigrams() may hand back a
    # one-shot generator, and the sequence does not depend on the author.
    shape_trigrams = list(trigrams(get_token_shapes(tokens)))

    needs_probs_recounted = self.needs_probs_recounted
    prob_dists = self.prob_dists
    smoothing = self.smoothing

    # Only recount those that have since been modified (by having a doc added).
    for author, freqdist in self.freqdists.items():
        if needs_probs_recounted[author]:
            prob_dists[author] = KneserNeyProbDist(freqdist)
            needs_probs_recounted[author] = False

    best_score = None
    likely_author = None
    for author, probdist in prob_dists.items():
        probs = array(
            [probdist.prob(trigram) for trigram in shape_trigrams],
            dtype='float')
        score = log(probs + smoothing).sum()
        # Explicit None check: ordering a number against None only works on
        # Python 2 and raises TypeError on Python 3.
        if best_score is None or score > best_score:
            likely_author = author
            best_score = score
    return likely_author, best_score
class KneserNeyModel(BaseNgramModel):
    """N-gram model whose probabilities come from NLTK's KneserNeyProbDist."""

    def __init__(self, *args):
        """Initialise the base model, then build the smoothed distribution.

        All positional arguments are forwarded unchanged to the parent
        initialiser.
        """
        super(KneserNeyModel, self).__init__(*args)
        # self.ngrams is presumably populated by the parent initialiser --
        # verify against BaseNgramModel.
        self.model = KneserNeyProbDist(self.ngrams)

    def score(self, word, context):
        """Return the smoothed probability of ``word`` following ``context``.

        Only the first two items of ``context`` are used; together with
        ``word`` they form the trigram that is looked up.
        """
        first, second = context[0], context[1]
        return self.model.prob((first, second, word))

    def samples(self):
        """Return the samples known to the underlying distribution."""
        return self.model.samples()

    def prob(self, sample):
        """Return the smoothed probability of ``sample`` (a trigram)."""
        return self.model.prob(sample)
def complete(self, author, tokens, num_words, iters=100):
    """Generate a continuation of ``tokens`` from ``author``'s model.

    Args:
        author: Key of the author whose model is used.
        tokens: Tokens that precede the text to generate.
        num_words: Forwarded to ``generate``; presumably the number of
            words to produce -- confirm against ``generate``.
        iters: Forwarded to ``generate``; presumably the number of
            sampling attempts -- confirm against ``generate``.

    Returns:
        Whatever ``generate`` returns for the chosen context.

    Side effects:
        Rebuilds the author's probability distribution when it is flagged
        as stale and clears the flag, and (re)attaches unigram counts to
        the distribution via ``add_unigrams``.
    """
    if self.needs_probs_recounted[author]:
        self.prob_dists[author] = KneserNeyProbDist(self.freqdists[author])
        # Clear the flag so the model is not rebuilt on every call; this
        # matches what predict_author does after a rebuild.
        self.needs_probs_recounted[author] = False

    probdist = self.prob_dists[author]
    # Always attach the unigram counts: the distribution may have been
    # rebuilt elsewhere (e.g. by predict_author) without them.
    add_unigrams(probdist)

    # Chop off the end of tokens until we see a bigram we know.
    known_bigrams = probdist._bigrams
    context_tokens = list(tokens)
    while context_tokens and tuple(context_tokens[-2:]) not in known_bigrams:
        context_tokens.pop()

    # No known bigram anywhere in the input: fall back to an empty context.
    context = tuple(context_tokens[-2:]) if context_tokens else (None, None)
    return generate(probdist, context, num_words, iters)
print(out)

# The purpose of maximum likelihood estimation is to use the known sample
# results to infer the parameter values most likely (with the highest
# probability) to have produced those results.
# Maximum likelihood estimation on Wikipedia:
# https://zh.wikipedia.org/zh-cn/%E6%9C%80%E5%A4%A7%E4%BC%BC%E7%84%B6%E4%BC%B0%E8%AE%A1

# Hidden Markov model (HMM) estimation
corpus = nltk.corpus.brown.tagged_sents(categories='adventure')[:700]
print(len(corpus))
tag_set = unique_list(tag for sent in corpus for (word, tag) in sent)
print(len(tag_set))

# Smoothing
# gt = lambda fd, bins:SimpleGoodTuringProbDist(fd, bins=1e5)
# train_and_test(gt)

# Smoothing corpus: every sentence becomes a list of
# ((word1, word2, word3), (tag1, tag2, tag3)) pairs built from its trigrams.
corpus = [
    [
        ((first[0], second[0], third[0]), (first[1], second[1], third[1]))
        for first, second, third in nltk.trigrams(sent)
    ]
    for sent in corpus[:100]
]
tag_set = unique_list(tag for sent in corpus for (word, tag) in sent)
print(len(tag_set))
symbols = unique_list(word for sent in corpus for (word, tag) in sent)
print(len(symbols))
trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)

# Every tenth sentence (indices 0, 10, 20, ...) is held out for testing;
# the rest is used for training.
train_corpus = []
test_corpus = []
for i, trigram_sent in enumerate(corpus):
    if i % 10 == 0:
        test_corpus.append(trigram_sent)
    else:
        train_corpus.append(trigram_sent)
print(len(train_corpus))
print(len(test_corpus))

# Estimator factory with an (fd, bins) signature; bins is accepted but
# ignored because KneserNeyProbDist is built from the FreqDist alone.
kn = lambda fd, bins: KneserNeyProbDist(fd)
# train_and_test(kn)
def __init__(self, *args):
    """Initialise the base model, then build the Kneser-Ney distribution.

    All positional arguments are forwarded unchanged to the parent
    initialiser; the smoothed distribution is stored in ``self.model``.
    """
    super(KneserNeyModel, self).__init__(*args)
    # self.ngrams is presumably populated by the parent initialiser --
    # verify against the base class.
    smoothed_dist = KneserNeyProbDist(self.ngrams)
    self.model = smoothed_dist
if __name__ == '__main__':
    # Script entry point: build a Kneser-Ney trigram model from the tweet
    # corpus and use it to complete a fixed list of sentence openings.
    print("Lab 4 Exercise 2")

    corpus_reader = PlaintextCorpusReader(
        root="./twitter-files",
        # Raw string: "\." is not a valid escape in a normal string literal
        # and triggers a warning on recent Python versions. The pattern
        # value itself is unchanged.
        fileids=r".*\.txt",
        word_tokenizer=TweetTokenizer())

    # Convert tweets to tri-grams
    tweets = list(corpus_reader.sents())
    tweet_trigrams = [
        list(
            ngrams(sequence=tweet,
                   n=3,
                   pad_left=True,
                   pad_right=True,
                   left_pad_symbol="<START>",
                   right_pad_symbol="<END>"))
        for tweet in tweets
    ]
    # Flatten the per-tweet trigram lists into one list for counting.
    all_trigrams = [gram for tweet in tweet_trigrams for gram in tweet]

    # Initialize the language model
    freq_dist = FreqDist(all_trigrams)
    model = KneserNeyProbDist(freq_dist)

    # Predict sentences
    inputs = [
        "make America", "I am the", "China is", "The President of",
        "This election", "I love", "Fake News"
    ]
    print("Inputs: {}".format(inputs))
    complete_sentence(inputs, model)
best_generated = list(generated) best_score = prob return best_generated, best_score**(1. / num) #geometric mean def add_unigrams(kn): unigrams = {} for k, v in kn._bigrams.iteritems(): for w in k: unigrams[w] = unigrams.get(w, 0) + 1. kn._unigrams = unigrams if __name__ == '__main__': from cPickle import load from code import interact animal_farm_toks = load(open('animal_farm_toks')) niniteen_eightyfour_farm_toks = load(open('1984_toks')) fd = FreqDist(trigrams(animal_farm_toks)) # fd.update(FreqDist(trigrams(niniteen_eightyfour_farm_toks))) kn = KneserNeyProbDist(fd) add_unigrams(kn) #print generate(kn, ('the', 'day'), 10, 100) interact(local=locals()) #http://www.gilesthomas.com/2010/05/generating-political-news-using-nltk/ #content_model = NgramModel(3, tokenized_content) #starting_words = content_model.generate(100)[-2:] #content = content_model.generate(words_to_generate, starting_words) #print u' '.join(content)