def __init__(self, n, train, estimator, freqtype, padding, backoff,
             *estimator_args, **estimator_kw_args):
    """
    Build an order-C{n} ngram language model from training utterances.

    Every utterance is wrapped in C{n - 1} copies of C{padding} on both
    sides and cut into ngrams.  Each ngram is remembered, and its final
    token is counted under the context made of the tokens preceding it.
    The resulting counts are handed, together with C{estimator}, to a
    L{ConditionalProbDist} that becomes the model.

    @param n: the order of the language model (ngram size)
    @type n: L{int}
    @param train: the training utterances; each utterance is iterated
        token by token.  A false value (C{None}, empty list) leaves the
        counts empty.
    @type train: L{list} of token sequences
    @param estimator: factory forwarded to L{ConditionalProbDist}
        together with C{freqtype}, C{n} and the extra arguments.
        NOTE(review): presumably it is called once per context with that
        context's frequency distribution first -- confirm against
        L{ConditionalProbDist}.
    @type estimator: a callable
    @param freqtype: the type used to store counts in the underlying
        frequency distribution (passed as C{counttype})
    @type freqtype: any numeric type
    @param padding: the symbol repeated C{n - 1} times before and after
        every utterance
    @type padding: (any)
    @param backoff: whether to recursively build the lower-order models
        (orders C{n - 1} down to 1) used for back-off
    @type backoff: L{bool}
    @param estimator_args: extra positional arguments for L{estimator}
    @type estimator_args: (any)
    @param estimator_kw_args: extra keyword arguments for L{estimator}
    @type estimator_kw_args: (any)
    """
    self._n = n
    counts = ConditionalFreqDist(counttype=freqtype)
    self._ngrams = set()
    self._padding = (padding, ) * (n - 1)

    # Keep the construction parameters on the instance.
    self._estimator = estimator
    self._freqtype = freqtype
    self._estimator_args = estimator_args
    self._estimator_kw_args = estimator_kw_args

    # A false ``train`` is swapped for an empty tuple, so nothing is counted.
    for sentence in (train or ()):
        padded = chain(self._padding, sentence, self._padding)
        for gram in ingrams(padded, n):
            self._ngrams.add(gram)
            history, word = tuple(gram[:-1]), gram[-1]
            counts[history].inc(word)

    self._model = ConditionalProbDist(
        counts, estimator, self._freqtype, n,
        *estimator_args, **estimator_kw_args)

    # Lower-order models are built recursively, stopping at order 1.
    if backoff and n > 1:
        self._backoff = PartialCountNgramModel(
            n - 1, train, estimator, freqtype, padding, backoff,
            *estimator_args, **estimator_kw_args)
    else:
        self._backoff = None
def setUp(cls):
    """
    Prepare the shared fixtures from the module-level C{text} and C{tags}.

    Sets C{tokens} (the text itself), C{fd} (frequency distribution of
    the tags), C{cfd} (conditional frequency distribution built from
    C{(tag, token)} pairs) and C{cpd} (conditional probability
    distribution over C{cfd} using L{MLEProbDist}).
    """
    cls.tokens = text
    # Pair the tags with the tokens before FreqDist reads ``tags``.
    tag_token_pairs = [(tag, token) for tag, token in zip(tags, text)]
    cls.fd = FreqDist(tags)
    conditional_counts = ConditionalFreqDist(tag_token_pairs)
    cls.cfd = conditional_counts
    cls.cpd = ConditionalProbDist(cls.cfd, MLEProbDist)