def conditional_freq(self):
    """Flatten the bigram model into (condition, word, frequency) triples.

    Builds a ConditionalFreqDist from ``self.bigram_list`` and returns one
    tuple per (condition, following word) pair observed, carrying its count.
    """
    dist = ConditionalFreqDist(self.bigram_list)
    return [
        (condition, word, count)
        for condition, freq_dist in dist.items()
        for word, count in freq_dist.items()
    ]
class BigramWordCandidateProvider(object):
    """Provides candidate next words given a word using a bigram model."""

    def __init__(self, corpus):
        """Initializer of the BigramWordCandidateProvider.

        Args:
            corpus: An iterable of word strings.
        """
        self._cfd = ConditionalFreqDist(bigrams(corpus))

    def candidates(self, word_sequence):
        """Returns a list of candidate next words given a word sequence."""
        last_word = word_sequence[-1]
        following = self._cfd[last_word].most_common()
        return [candidate for candidate, _ in following]

    def random_word(self):
        # Pick a uniformly random condition (word) known to the model.
        return random.choice(list(self._cfd.items()))[0]
#%%
from nltk.corpus import inaugural
from nltk import ConditionalFreqDist
from nltk.probability import FreqDist

# Unigram frequency distribution over every token of the inaugural corpus.
# FreqDist accepts any iterable directly; no intermediate list is needed.
fd3 = FreqDist(inaugural.words())
print(fd3.freq('freedom'))

# Count frequency of word lengths, per address, in descending order of count.
# The fileid-only filter is applied BEFORE iterating the words of each file,
# so excluded addresses are never tokenized (the original filtered after the
# inner loop, tokenizing every file).  Fileids begin with the year, so the
# lexicographic comparison selects the 1981-2009 addresses.
cfd = ConditionalFreqDist(
    (fileid, len(w))
    for fileid in inaugural.fileids()
    if '1980' < fileid < '2010'
    for w in inaugural.words(fileid)
)
print(cfd.items())
cfd.plot()
# %%
# NOTE(review): this chunk starts mid-script — `lpt`, `r`, `vocab`, and `fr`
# are defined earlier in the file (not visible here), and the first statement
# was probably indented inside an enclosing loop.  The indentation below is a
# reconstruction from a whitespace-mangled source; confirm nesting against the
# original file before relying on it.
UNK += lpt.prob(r[0])
print('UNK | ', UNK)

print('=========== BIGRAMS ===========')
# Read the sample corpus and strip the sentence-boundary markers before
# tokenizing; a single '<s>' is re-appended afterwards.
file = open('sampledata.txt', 'r')
filetext = file.read()
filetext = filetext.replace('</s>', '')
filetext = filetext.replace('<s>', '')
tokens = word_tokenize(filetext)
tokens.append('<s>')
print(set(tokens))

# Extend the vocabulary with the end marker and an unknown token.
# NOTE(review): `vocab2 = vocab` aliases the list, so the appends below also
# mutate `vocab` — confirm this is intended.  'UTK' looks like a typo for
# 'UNK'; left as-is since it is a runtime value.
vocab2 = vocab
vocab2.append('</s>')
vocab2.append('UTK')

# Conditional frequency distribution over adjacent token pairs.
big = bigrams(tokens)
cfds = ConditionalFreqDist((w0, w1) for w0, w1 in big)
print(cfds.items())

# For each vocabulary word, print bigram probabilities using unigram counts
# taken from `fr` (presumably a FreqDist built earlier — verify).  Words with
# no bigram entry are counted as UNK.
for v3 in vocab2:
    Unk2 = 0
    fr2 = cfds.get(v3)
    if (fr2 != None):
        for i in fr2.items():
            unigramCount = 0
            # Linear scan for v3's unigram count in `fr`.
            for s in fr.items():
                if v3 == s[0]:
                    unigramCount = s[1]
            print('P(' + v3 + '|' + str(i[0]) + ') = ' + str((i[1] / unigramCount).__round__(2)))
    else:
        Unk2 += 1
        print('P(' + v3 + '|UNK) = ' + str(Unk2))
print('======= BIGRAMS SMOOTHING =======')