from itertools import chain

# NOTE: import paths are assumptions. This class targets a patched NLTK 2.x
# in which ConditionalFreqDist accepts a `counttype` argument, FreqDist
# supports non-integer increments, and the distributions expose an `update`
# method; adjust the imports to match your fork.
from nltk.util import ingrams
from nltk.probability import ConditionalFreqDist, ConditionalProbDist
from nltk.model import NgramModel


class PartialCountNgramModel(NgramModel):
    '''
    NgramModel that supports storing counts as types other than int.
    '''

    def __init__(self, n, train, estimator, freqtype, padding, backoff,
                 *estimator_args, **estimator_kw_args):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text. An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        @param n: the order of the language model (ngram size)
        @type n: L{int}
        @param train: the training text
        @type train: L{list} of L{str} (or L{list} of L{str} L{list}s)
        @param estimator: a function for generating a probability distribution
            (must take a L{FreqDist} as its first argument; the remaining
            arguments passed to L{ConditionalProbDist} are forwarded to it)
        @type estimator: a function that takes a L{FreqDist} and returns a
            L{ProbDist}
        @param freqtype: the type to use to store the counts in the underlying
            frequency distribution
        @type freqtype: any numeric type
        @param padding: the symbol used to pad each utterance at both ends
        @type padding: any hashable object
        @param backoff: whether or not we should use Katz back-off
        @type backoff: L{bool}
        @param estimator_args: Extra arguments for L{estimator}. These
            arguments are usually used to specify extra properties for the
            probability distributions of individual conditions, such as the
            number of bins they contain.
        @type estimator_args: (any)
        @param estimator_kw_args: Extra keyword arguments for L{estimator}.
        @type estimator_kw_args: (any)
        """
        self._n = n
        cfd = ConditionalFreqDist(counttype=freqtype)
        self._ngrams = set()
        self._padding = (padding,) * (n - 1)
        self._estimator = estimator
        self._freqtype = freqtype
        self._estimator_args = estimator_args
        self._estimator_kw_args = estimator_kw_args

        if train:
            for utterance in train:
                # Pad each utterance so that boundary ngrams are counted too.
                for ngram in ingrams(
                        chain(self._padding, utterance, self._padding), n):
                    self._ngrams.add(ngram)
                    context = tuple(ngram[:-1])
                    token = ngram[-1]
                    cfd[context].inc(token)

        self._model = ConditionalProbDist(cfd, estimator, self._freqtype, n,
                                          *estimator_args, **estimator_kw_args)

        # Recursively construct the lower-order models for Katz back-off.
        self._backoff = PartialCountNgramModel(
            n - 1, train, estimator, freqtype, padding, backoff,
            *estimator_args, **estimator_kw_args) if (backoff and n > 1) else None

    def update(self, samples, increase_amount=1):
        '''
        Update the underlying frequency distributions given the current list
        of samples.
        '''
        cond_samples = []
        for utterance in samples:
            for ngram in ingrams(
                    chain(self._padding, utterance, self._padding), self._n):
                self._ngrams.add(ngram)
                cond_samples.append((tuple(ngram[:-1]), ngram[-1]))
        self._model.update(cond_samples, increase_amount)

        # Recursively update the lower-order models.
        if self._backoff:
            self._backoff.update(samples, increase_amount)

    def prob(self, word, context):
        """
        Evaluate the probability of this word in this context using Katz
        back-off: use the ngram estimate directly when the ngram was seen in
        training, otherwise fall back to the (n-1)-gram model, scaled by the
        back-off weight alpha(context).

        @param word: the word to get the probability of
        @type word: C{string}
        @param context: the context the word is in
        @type context: C{list} of C{string}
        """
        context = tuple(context)
        if not self._backoff or (context + (word,) in self._ngrams):
            return self[context].prob(word)
        else:
            return self._alpha(context) * self._backoff.prob(word, context[1:])
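# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original source): train a bigram
# model over float counts and query it with Katz back-off. The Lidstone
# estimator, the gamma value, the toy training data, and the '<s>' padding
# symbol are all illustrative assumptions, and the sketch presumes the
# patched NLTK noted in the imports above.
# ---------------------------------------------------------------------------
def _demo():
    from nltk.probability import LidstoneProbDist

    def lidstone_estimator(fdist, *args):
        # Lidstone smoothing with gamma = 0.2; ConditionalProbDist forwards
        # extra positional arguments, which this sketch ignores.
        return LidstoneProbDist(fdist, 0.2, fdist.B() + 1)

    train = [["the", "cat", "sat"], ["the", "dog", "sat"]]
    lm = PartialCountNgramModel(2, train, lidstone_estimator, float,
                                '<s>', True)

    print(lm.prob("sat", ["cat"]))  # seen bigram: direct smoothed estimate
    print(lm.prob("dog", ["cat"]))  # unseen bigram: alpha(context) * P_backoff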
# Test fixture: builds the frequency and probability distributions shared by
# the tests from the module-level `tags` and `text` sequences.
def setUp(cls):
    cls.tokens = text
    cond_text = list(zip(tags, text))
    cls.fd = FreqDist(tags)
    cls.cfd = ConditionalFreqDist(cond_text)
    cls.cpd = ConditionalProbDist(cls.cfd, MLEProbDist)
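# ---------------------------------------------------------------------------
# Hypothetical companion test (not in the original source): with an MLE
# estimator, the conditional probability of each word given its tag should
# equal its relative frequency under that condition. Assumes the fixture
# above has run and that `tags`/`text` are defined at module level.
# ---------------------------------------------------------------------------
def test_mle_matches_relative_frequency(self):
    for tag in self.cfd.conditions():
        fdist = self.cfd[tag]
        for word in fdist:
            self.assertAlmostEqual(self.cpd[tag].prob(word),
                                   float(fdist[word]) / fdist.N())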