def generate_model(self, train):
    '''
    Build a Kneser-Ney smoothed ngram model from `train` (a list of
    tokenized sentences) and store per-prefix log-probabilities in
    self.kn_ngram_prob.  Also sets self.freq (prefix start counts) and
    self.full_discount (log fallback mass for unseen prefixes).
    '''
    # NOTE(review): removed unused local `every_word` from the original.
    self.freq = {}
    condFreq = {}
    continuation = {}
    # accumulate conditional ngram frequencies over every training sentence
    for sent in train:
        condFreq = AccumCondFreqs(
            condFreq, CondFreqs(generate_ngrams, list(sent), self.n))
    # count how often each prefix starts an ngram, and for each
    # continuation word record the set of distinct prefixes it follows
    for p, s in condFreq.iteritems():
        self.freq[p] = float(sum(s.values()))
        for w, c in s.iteritems():
            if w not in continuation:
                continuation[w] = set()
            continuation[w].add(p)
    # now calculate the model parameters
    unique_ngram_starts = float(len(condFreq))
    self.full_discount = log(self.discount / reduce(add, self.freq.values()))
    for p, s in condFreq.iteritems():
        self.kn_ngram_prob[p] = {}
        # lambda(p): discounted mass redistributed over continuations
        interpolation_weight = self.discount * float(len(s)) / self.freq[p]
        for w, c in s.iteritems():
            # |{p' : w follows p'}| -- the KN continuation count
            initial_term_count = float(len(continuation[w]))
            self.kn_ngram_prob[p][w] = log(
                max(c - self.discount, 0.0) / self.freq[p] +
                (interpolation_weight * initial_term_count) / unique_ngram_starts)
    SmoothedModel.generate_model(self, train)
def generate_model(self, train):
    '''
    Build a Kneser-Ney smoothed ngram model from `train` (a list of
    tokenized sentences) and store per-prefix log-probabilities in
    self.kn_ngram_prob.  Also sets self.freq (prefix start counts) and
    self.full_discount (log fallback mass for unseen prefixes).
    '''
    # NOTE(review): removed unused local `every_word` from the original.
    self.freq = {}
    condFreq = {}
    continuation = {}
    # accumulate conditional ngram frequencies over every training sentence
    for sent in train:
        condFreq = AccumCondFreqs(
            condFreq, CondFreqs(generate_ngrams, list(sent), self.n))
    # count how often each prefix starts an ngram, and for each
    # continuation word record the set of distinct prefixes it follows
    for p, s in condFreq.iteritems():
        self.freq[p] = float(sum(s.values()))
        for w, c in s.iteritems():
            if w not in continuation:
                continuation[w] = set()
            continuation[w].add(p)
    # now calculate the model parameters
    unique_ngram_starts = float(len(condFreq))
    self.full_discount = log(self.discount / reduce(add, self.freq.values()))
    for p, s in condFreq.iteritems():
        self.kn_ngram_prob[p] = {}
        # lambda(p): discounted mass redistributed over continuations
        interpolation_weight = self.discount * float(len(s)) / self.freq[p]
        for w, c in s.iteritems():
            # |{p' : w follows p'}| -- the KN continuation count
            initial_term_count = float(len(continuation[w]))
            self.kn_ngram_prob[p][w] = log(
                max(c - self.discount, 0.0) / self.freq[p] +
                (interpolation_weight * initial_term_count) / unique_ngram_starts)
    SmoothedModel.generate_model(self, train)
def generate_model(self, train):
    '''
    given a list of lists of tokenized sentences, generate and store a
    model corresponding to this type of smoothing.
    >>>
    '''
    # fold the conditional ngram frequencies of every sentence into one map
    acc = {}
    for sentence in train:
        freqs = CondFreqs(generate_ngrams, list(sentence), self.n)
        acc = AccumCondFreqs(acc, freqs)
    # frequency-of-frequencies table for the regression fit
    fof = [(count, float(len(grams))) for count, grams in freqOfFreq(acc)]
    regression = linear_regression(fof, log)
    self.model = model_probs(smoothed_counts(acc, regression))
    SmoothedModel.generate_model(self, train)
def generate_model(self, train):
    '''
    given a list of lists of tokenized sentences, generate and store a
    model corresponding to this type of smoothing.
    >>>
    '''
    # fold the conditional ngram frequencies of every sentence into one map
    acc = {}
    for sentence in train:
        freqs = CondFreqs(generate_ngrams, list(sentence), self.n)
        acc = AccumCondFreqs(acc, freqs)
    # frequency-of-frequencies table for the regression fit
    fof = [(count, float(len(grams))) for count, grams in freqOfFreq(acc)]
    regression = linear_regression(fof, log)
    self.model = self.__model_probs(smoothed_counts(acc, regression))
    SmoothedModel.generate_model(self, train)
def generate_model(self, train):
    '''
    given a list of lists of tokenized sentences, generate and store a
    model corresponding to this type of smoothing.
    >>> from nltk.data import load
    >>> stok = load('tokenizers/punkt/english.pickle')
    >>> from nltk.corpus import gutenberg as g
    >>> from ngram_helpers import *
    >>> train = [tokenize(preprocess(sent)) for sent in stok.tokenize(g.raw('austen-emma.txt'))]
    >>> from additive_smoothing import AdditiveSmoothing
    >>> a_s = AdditiveSmoothing()
    >>> a_s.generate_model(train)
    >>> a_s.model[0].items()[:2]
    [('blessed her', {'before': 0.00027991602519244227}), ('long understood', {'me': 0.00027987685418415898, 'you': 0.00027987685418415898})]
    >>>
    '''
    # accumulate conditional ngram frequencies across the whole corpus
    acc = {}
    for sentence in train:
        freqs = CondFreqs(generate_ngrams, list(sentence), self.n)
        acc = AccumCondFreqs(acc, freqs)
    # additive smoothing turns raw counts into smoothed probabilities
    self.model = self.__smoothed_probs(acc)
    SmoothedModel.generate_model(self, train)
def generate_model(self, train):
    '''
    given a list of lists of tokenized sentences, generate and store a
    model corresponding to this type of smoothing.
    >>> from nltk.data import load
    >>> stok = load('tokenizers/punkt/english.pickle')
    >>> from nltk.corpus import gutenberg as g
    >>> from ngram_helpers import *
    >>> train = [tokenize(preprocess(sent)) for sent in stok.tokenize(g.raw('austen-emma.txt'))]
    >>> from additive_smoothing import AdditiveSmoothing
    >>> a_s = AdditiveSmoothing()
    >>> a_s.generate_model(train)
    >>> a_s.model[0].items()[:2]
    [('blessed her', {'before': 0.00027991602519244227}), ('long understood', {'me': 0.00027987685418415898, 'you': 0.00027987685418415898})]
    >>>
    '''
    # accumulate conditional ngram frequencies across the whole corpus
    acc = {}
    for sentence in train:
        freqs = CondFreqs(generate_ngrams, list(sentence), self.n)
        acc = AccumCondFreqs(acc, freqs)
    # additive smoothing turns raw counts into smoothed probabilities
    self.model = self.__smoothed_probs(acc)
    SmoothedModel.generate_model(self, train)
def generate_model(self, train):
    '''
    given a list of lists of tokenized sentences, generate and store a
    model corresponding to this type of smoothing.

    Builds ngram frequency tables for every order 1..self.n, applies
    Good-Turing style count smoothing to each higher order, and stores
    (per-order probability maps, smoothed counts) in self.model.
    '''
    # NOTE(review): the original was syntactically broken (missing `in`,
    # bare `n` instead of `self.n`, duplicated `fof =`, iterating a dict
    # without .iteritems(), unreachable super call).  Fixed below; the
    # unused local `freq` was dropped.
    ncf = {}
    smoothedf = {}
    for i in xrange(1, self.n + 1):
        ncf[i] = {}
    for line in train:
        tokens = [t for t in line]
        # order-1 counts are plain (unconditional) frequencies
        ncf[1] = AccumFreqs(ncf[1], Freqs(generate_ngrams, tokens, 1))
        # orders 2..n are conditional (prefix -> {word: count}) frequencies
        for i in xrange(2, self.n + 1):
            ncf[i] = AccumCondFreqs(ncf[i], CondFreqs(generate_ngrams, tokens, i))
    # unigrams keep raw counts plus their total for normalisation
    smoothedf[1] = (ncf[1], sum(ncf[1].values()))
    for i in xrange(2, self.n + 1):
        # frequency-of-frequencies table; avoid shadowing the loop index
        fof = [(c, float(len(grams))) for (c, grams) in freqOfFreq(ncf[i])]
        # smooth the RAW counts ncf[i] -- the original read smoothedf[i],
        # which is unassigned at this point (KeyError)
        smoothedf[i] = smoothed_counts(ncf[i], linear_regression(fof, log))
    # now get the model probabilities for each ngram layer
    modelp = {}
    for i, m in smoothedf.iteritems():
        probs = {}
        if i == 1:
            # unigram layer: (counts, total) tuple -> relative frequency
            for w, c in m[0].iteritems():
                probs[w] = float(c) / m[1]
        else:
            # higher orders: conditional map prefix -> {word: smoothed count}
            # NOTE(review): the original indexed m[0] here as well, which
            # only fits the unigram tuple; assuming smoothed_counts returns
            # the conditional map itself -- confirm against smoothed_counts.
            for p, s in m.iteritems():
                probs[p] = {}
                denom = float(len(s))
                for w, c in s.iteritems():
                    # NOTE(review): normalising by the number of distinct
                    # continuations (len(s)), as the original did -- verify
                    # this is intended rather than sum(s.values()).
                    probs[p][w] = c / denom
        modelp[i] = probs
    self.model = (modelp, smoothedf)
    # run the base-class hook BEFORE returning (it was unreachable before)
    SmoothedModel.generate_model(self, train)
    return self.model