Example #1
    def __init__(self, n, train, estimator, freqtype, padding, backoff,
                 *estimator_args, **estimator_kw_args):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        @param n: the order of the language model (ngram size)
        @type n: L{int}
        @param train: the training text, as a list of utterances (each
            utterance itself a list of tokens)
        @type train: L{list} of L{list} of L{str}
        @param estimator: a function for generating a probability distribution
            (must take a L{FreqDist} as its first argument, followed by the
            count type and the ngram order)
        @type estimator: a function that takes a L{FreqDist} and returns a
            L{ProbDist}
        @param freqtype: the type used to store the counts in the underlying
            frequency distribution (e.g. C{int} or C{float})
        @type freqtype: any numeric type
        @param padding: the symbol used to pad each utterance at both ends
        @type padding: any immutable value
        @param backoff: whether or not we should use Katz back-off
        @type backoff: L{bool}
        @param estimator_args: Extra arguments for L{estimator}.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
        @type estimator_args: (any)
        @param estimator_kw_args: Extra keyword arguments for L{estimator}.
        @type estimator_kw_args: (any)
        """

        self._n = n

        cfd = ConditionalFreqDist(counttype=freqtype)
        self._ngrams = set()
        self._padding = (padding,) * (n - 1)
        self._estimator = estimator
        self._freqtype = freqtype
        self._estimator_args = estimator_args
        self._estimator_kw_args = estimator_kw_args

        if train:
            for utterance in train:
                for ngram in ingrams(
                        chain(self._padding, utterance, self._padding), n):
                    self._ngrams.add(ngram)
                    context = tuple(ngram[:-1])
                    token = ngram[-1]
                    cfd[context].inc(token)

        self._model = ConditionalProbDist(cfd, estimator, self._freqtype, n,
                                          *estimator_args, **estimator_kw_args)

        # recursively construct the lower-order models
        self._backoff = PartialCountNgramModel(
            n - 1, train, estimator, freqtype, padding, backoff,
            *estimator_args, **estimator_kw_args) if (backoff and n > 1) else None
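A minimal usage sketch, with a hypothetical estimator and toy training data. It assumes the forked API shown above, where the ConditionalProbDist call forwards the count type and the ngram order to the estimator; LidstoneProbDist is the standard NLTK additive-smoothing distribution.

from nltk.probability import LidstoneProbDist

def lidstone_estimator(fdist, freqtype, n, *args, **kwargs):
    # Hypothetical estimator: additive smoothing with an arbitrary
    # gamma of 0.2.  The (fdist, freqtype, n, ...) signature mirrors
    # the ConditionalProbDist(cfd, estimator, freqtype, n, ...) call
    # in __init__ above.
    return LidstoneProbDist(fdist, 0.2, fdist.B() + 1)

train = [["the", "cat", "sat"], ["the", "dog", "sat"]]
lm = PartialCountNgramModel(
    2,                   # bigram model
    train,
    lidstone_estimator,
    float,               # freqtype: allow fractional (partial) counts
    "<s>",               # padding symbol wrapped around each utterance
    True)                # build lower-order models for Katz back-off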
Example #2
class PartialCountNgramModel(NgramModel):
    '''
    NgramModel that supports storing counts as types other than int.
    '''
    def __init__(self, n, train, estimator, freqtype, padding, backoff,
                 *estimator_args, **estimator_kw_args):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        @param n: the order of the language model (ngram size)
        @type n: L{int}
        @param train: the training text, as a list of utterances (each
            utterance itself a list of tokens)
        @type train: L{list} of L{list} of L{str}
        @param estimator: a function for generating a probability distribution
            (must take a L{FreqDist} as its first argument, followed by the
            count type and the ngram order)
        @type estimator: a function that takes a L{FreqDist} and returns a
            L{ProbDist}
        @param freqtype: the type used to store the counts in the underlying
            frequency distribution (e.g. C{int} or C{float})
        @type freqtype: any numeric type
        @param padding: the symbol used to pad each utterance at both ends
        @type padding: any immutable value
        @param backoff: whether or not we should use Katz back-off
        @type backoff: L{bool}
        @param estimator_args: Extra arguments for L{estimator}.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
        @type estimator_args: (any)
        @param estimator_kw_args: Extra keyword arguments for L{estimator}.
        @type estimator_kw_args: (any)
        """

        self._n = n

        cfd = ConditionalFreqDist(counttype=freqtype)
        self._ngrams = set()
        self._padding = (padding,) * (n - 1)
        self._estimator = estimator
        self._freqtype = freqtype
        self._estimator_args = estimator_args
        self._estimator_kw_args = estimator_kw_args

        if train:
            for utterance in train:
                for ngram in ingrams(
                        chain(self._padding, utterance, self._padding), n):
                    self._ngrams.add(ngram)
                    context = tuple(ngram[:-1])
                    token = ngram[-1]
                    cfd[context].inc(token)

        self._model = ConditionalProbDist(cfd, estimator, self._freqtype, n,
                                          *estimator_args, **estimator_kw_args)

        # recursively construct the lower-order models
        self._backoff = PartialCountNgramModel(
            n - 1, train, estimator, freqtype, padding, backoff,
            *estimator_args, **estimator_kw_args) if (backoff and n > 1) else None

    def update(self, samples, increase_amount=1):
        '''
        Update the underlying frequency distributions given the current list of samples.
        '''
        cond_samples = []
        for utterance in samples:
            for ngram in ingrams(
                    chain(self._padding, utterance, self._padding), self._n):
                self._ngrams.add(ngram)
                cond_samples.append((tuple(ngram[:-1]), ngram[-1]))
        self._model.update(cond_samples, increase_amount)

        # Recursively update lower-order models
        if self._backoff:
            self._backoff.update(samples, increase_amount)

    def prob(self, word, context):
        """
        Evaluate the probability of this word in this context using Katz back-off.

        @param word: the word to get the probability of
        @type word: C{string}
        @param context: the context the word is in
        @type context: C{list} of C{string}
        """

        context = tuple(context)
        if not self._backoff or (context + (word,) in self._ngrams):
            return self[context].prob(word)
        else:
            # print "Alpha: {}\tBackoff prob: {}".format(self._alpha(context), self._backoff.prob(word, context[1:]))
            return self._alpha(context) * self._backoff.prob(word, context[1:])
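The back-off recursion in prob() reduces to the following standalone sketch. It is illustrative only: plain dicts stand in for the fitted ConditionalProbDist, and the back-off weights are taken as given rather than computed from discounted probability mass as _alpha does.

def backoff_prob(word, context, probs_by_context, alpha_by_context):
    # probs_by_context maps a context tuple to a {word: probability} dict;
    # alpha_by_context maps a context tuple to its back-off weight.
    context = tuple(context)
    probs = probs_by_context.get(context, {})
    if not context or word in probs:
        # Seen ngram (or we are at the unigram level): use the estimate.
        return probs.get(word, 0.0)
    # Unseen ngram: drop the leftmost context word and scale by alpha,
    # exactly as prob() defers to self._backoff above.
    return alpha_by_context.get(context, 1.0) * backoff_prob(
        word, context[1:], probs_by_context, alpha_by_context)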
Example #3
    def setUp(cls):
        # Build the fixtures from the parallel module-level `tags` and
        # `text` sequences: a FreqDist over tags, a tag-conditioned
        # ConditionalFreqDist over (tag, word) pairs, and an
        # MLE-estimated ConditionalProbDist on top of it.
        cls.tokens = text
        cond_text = list(zip(tags, text))
        cls.fd = FreqDist(tags)
        cls.cfd = ConditionalFreqDist(cond_text)
        cls.cpd = ConditionalProbDist(cls.cfd, MLEProbDist)
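Assuming tags and text are parallel module-level lists of POS tags and tokens, a test against this fixture might verify the MLE estimate directly; "NN" and "dog" below are placeholder values assumed to occur in the data.

    def test_mle_prob(self):
        # MLEProbDist should estimate P("dog" | "NN") as the count of
        # ("NN", "dog") divided by the total count of the "NN" condition.
        self.assertAlmostEqual(
            self.cpd["NN"].prob("dog"),
            self.cfd["NN"]["dog"] / self.cfd["NN"].N())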