Example #1
    def __init__(
        self,
        order,
        vocabulary=None,
        counter=None,
        verbose=True,
    ):
        """Creates new LanguageModel.

        :param vocabulary: If provided, this vocabulary will be used instead
        of creating a new one when training.
        :type vocabulary: `nltk.lm.Vocabulary` or None
        :param counter: If provided, use this object to count ngrams.
        :type counter: `nltk.lm.NgramCounter` or None
        :param ngrams_fn: If given, defines how sentences in training text are turned into ngram
                          sequences.
        :type ngrams_fn: function or None
        :param pad_fn: If given, defines how sentences in training text are padded.
        :type pad_fn: function or None

        """
        self.order = order
        self.vocab = Vocabulary() if vocabulary is None else vocabulary
        self.counts = NgramCounter() if counter is None else counter
        def_dict_callable = partial(defaultdict, float)
        self._cache = defaultdict(def_dict_callable)
        self.verbose = verbose
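
A quick standalone sketch of the nested-defaultdict idiom used for self._cache above (illustrative only, not part of the original class):

from collections import defaultdict
from functools import partial

# partial(defaultdict, float) is a zero-argument factory: missing outer keys get
# a fresh inner defaultdict whose missing values default to 0.0.
cache = defaultdict(partial(defaultdict, float))
cache[("<s>",)]["the"] += 0.5
print(cache[("<s>",)]["unseen"])  # 0.0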
Example #2
def train_LM_model(corpus, model, n, gamma=None, unk_cutoff=2):
    """
    Trains an NLTK n-gram language model of class `model` on the corpus.

    :param corpus: list(list(str)), a tokenized corpus
    :param model: one of (MLE, Lidstone, Laplace)
    :param n: int, the order of the model
    :param gamma: float or None, the gamma parameter (for `model=Lidstone` only). If model=Lidstone,
    this argument must be provided
    :param unk_cutoff: the threshold below which a word is considered unknown and replaced by <UNK>
    :return: a trained model
    """
    lm = None
    ngrams, words = padded_everygram_pipeline(n, corpus)
    vocab = Vocabulary(words, unk_cutoff=unk_cutoff)
    if model == MLE:
        lm = model(n, vocabulary=vocab)
        lm.fit(ngrams)
    elif model == Lidstone:
        if gamma is None:
            raise ValueError("Please enter a value for gamma")
        lm = Lidstone(gamma, order=n, vocabulary=vocab)
        lm.fit(ngrams)
    elif model == Laplace:
        lm = Laplace(order=n, vocabulary=vocab)
        lm.fit(ngrams)
    else:
        raise ValueError("Wrong model in train_LM_model")
    return lm
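
A hypothetical call on a tiny made-up corpus (sentences and parameter values below are purely illustrative):

from nltk.lm import MLE, Lidstone, Laplace, Vocabulary
from nltk.lm.preprocessing import padded_everygram_pipeline

corpus = [["the", "cat", "sleeps"], ["the", "dog", "sleeps"]]
lm = train_LM_model(corpus, Lidstone, n=2, gamma=0.1, unk_cutoff=1)
print(lm.score("sleeps", ["cat"]))  # smoothed bigram probability P(sleeps | cat)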
Example #3
def train_LM_model(corpus, model, n, gamma=None, unk_cutoff=1):
    """
    Trains an NLTK n-gram language model of class `model` on the corpus.

    :param corpus: list(list(str)), a tokenized corpus
    :param model: one of (MLE, Lidstone, Laplace)
    :param n: int, the order of the model
    :param gamma: float or None, the gamma parameter (for `model=Lidstone` only). If model=Lidstone,
    this argument must be provided
    :param unk_cutoff: the threshold below which a word is considered unknown and replaced by <UNK>
    :return: a trained model
    """

    train, words = padded_everygram_pipeline(n, corpus)
    vocab = Vocabulary(words, unk_cutoff=unk_cutoff)

    if (model == Lidstone) and (gamma is not None):
        model = Lidstone(gamma, order=n, vocabulary=vocab)
        model.fit(train)
    elif model == MLE:
        model = mle.train_MLE_model(corpus, n)
    elif model == Laplace:
        model = Laplace(order=n, vocabulary=vocab)
        model.fit(train)
    else:
        raise ValueError("Unsupported model, or missing gamma for Lidstone")

    return model
Example #4
def train_LM_model(corpus, model, n, gamma=None, unk_cutoff=2):
    """
    Trains an NLTK n-gram language model of class `model` on the corpus.

    :param corpus: list(list(str)), a tokenized corpus
    :param model: one of (MLE, Lidstone, Laplace)
    :param n: int, the order of the model
    :param gamma: float or None, the gamma parameter (for `model=Lidstone` only). If model=Lidstone,
    this argument must be provided
    :param unk_cutoff: the threshold below which a word is considered unknown and replaced by <UNK>
    :return: a trained model
    """
    # Flatten the corpus into a single list of tokens so it can be fed to Vocabulary directly
    flat_corpus = [word for sentence in corpus for word in sentence]

    vocab = Vocabulary(flat_corpus, unk_cutoff=unk_cutoff)

    ngram_corpus = mnm.extract_ngrams(corpus, n)

    # Pass the vocabulary at construction time; fit() then only needs the ngrams
    if model == MLE:
        model_res = MLE(n, vocabulary=vocab)
    elif model == Lidstone:
        model_res = Lidstone(gamma, n, vocabulary=vocab)
    elif model == Laplace:
        model_res = Laplace(n, vocabulary=vocab)
    else:
        raise ValueError("Unsupported model in train_LM_model")
    model_res.fit(ngram_corpus)

    return model_res
Example #5
def train_LM_model(corpus, model, n, gamma=None, unk_cutoff=2):
    """
    Trains an NLTK n-gram language model of class `model` on the corpus.

    :param corpus: list(list(str)), a tokenized corpus
    :param model: one of (MLE, Lidstone, Laplace)
    :param n: int, the order of the model
    :param gamma: float or None, the gamma parameter (for `model=Lidstone` only). If model=Lidstone,
    this argument must be provided
    :param unk_cutoff: the threshold below which a word is considered unknown and replaced by <UNK>
    :return: a trained model
    """
    if model not in [MLE, Laplace, Lidstone]:
        raise TypeError("Unknown model type! Supported types: (MLE, Lidstone, Laplace)")

    ngrams, words = padded_everygram_pipeline(n, corpus)
    vocab = Vocabulary(words, unk_cutoff=unk_cutoff)

    params = {
        "order": n,
        "vocabulary": vocab,
    }
    if model == Lidstone:
        params["gamma"] = gamma
    ist_model = model(**params)
    ist_model.fit(ngrams)

    return ist_model
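
A hypothetical usage of this variant; note that gamma is only added to the keyword dictionary when model is Lidstone:

corpus = [["a", "rose", "is", "a", "rose"]]
mle_lm = train_LM_model(corpus, MLE, n=2, unk_cutoff=1)
lid_lm = train_LM_model(corpus, Lidstone, n=2, gamma=0.5, unk_cutoff=1)
print(mle_lm.score("rose", ["a"]), lid_lm.score("rose", ["a"]))  # unsmoothed vs. smoothed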
Example #6
def train_MLE_model(corpus, n):
    """
    Trains an NLTK n-gram MLE language model on the corpus.

    :param corpus: list(list(str)), a tokenized corpus
    :param n: the order of the model
    :return: a trained model
    """
    
    ngrams, words = padded_everygram_pipeline(n, corpus)
    vocab = Vocabulary(words, unk_cutoff=1)
    lm = MLE(n, vocabulary=vocab)
    lm.fit(ngrams)
    
    return lm
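
A hypothetical usage of train_MLE_model on a toy corpus, scoring a bigram and evaluating perplexity on a couple of test ngrams:

corpus = [["the", "cat", "sat"], ["the", "dog", "sat"]]
lm = train_MLE_model(corpus, 2)
print(lm.score("sat", ["cat"]))                         # 1.0: "sat" always follows "cat"
print(lm.perplexity([("the", "cat"), ("cat", "sat")]))  # 2 ** cross-entropy over these bigrams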
Example #7
File: api.py  Project: rmalouf/nltk
    def __init__(self, order, vocabulary=None, counter=None):
        """Creates new LanguageModel.

        :param vocabulary: If provided, this vocabulary will be used instead
        of creating a new one when training.
        :type vocabulary: `nltk.lm.Vocabulary` or None
        :param counter: If provided, use this object to count ngrams.
        :type counter: `nltk.lm.NgramCounter` or None
        :param ngrams_fn: If given, defines how sentences in training text are turned into ngram
                          sequences.
        :type ngrams_fn: function or None
        :param pad_fn: If given, defines how sentences in training text are padded.
        :type pad_fn: function or None

        """
        self.order = order
        self.vocab = Vocabulary() if vocabulary is None else vocabulary
        self.counts = NgramCounter() if counter is None else counter
Example #8
def train_MLE_model(corpus, n):
    """
    Trains an NLTK n-gram MLE language model on the corpus.

    :param corpus: list(list(str)), a tokenized corpus
    :param n: the order of the model
    :return: a trained model
    """
    # Creation of the vocabulary from the given corpus
    flat_corpus = []
    for document in corpus:
        for word in document:
            flat_corpus.append(word)
    vocab = Vocabulary(flat_corpus, unk_cutoff=2)
    # Extraction of the n-grams
    n_grams = mnm.extract_ngrams(corpus, n)
    # Creation and training of the model on the corpus
    model = MLE(n, vocab)
    model.fit(n_grams)
    print("Modèle d'ordre", n, "généré par nltk.lm")
    return model
Example #9
File: api.py  Project: AnAnteup/icp7
class LanguageModel(object):
    """ABC for Language Models.

    Cannot be directly instantiated itself.

    """
    def __init__(self, order, vocabulary=None, counter=None):
        """Creates new LanguageModel.

        :param vocabulary: If provided, this vocabulary will be used instead
        of creating a new one when training.
        :type vocabulary: `nltk.lm.Vocabulary` or None
        :param counter: If provided, use this object to count ngrams.
        :type counter: `nltk.lm.NgramCounter` or None
        :param ngrams_fn: If given, defines how sentences in training text are turned into ngram
                          sequences.
        :type ngrams_fn: function or None
        :param pad_fn: If given, defines how sentences in training text are padded.
        :type pad_fn: function or None

        """
        self.order = order
        self.vocab = Vocabulary() if vocabulary is None else vocabulary
        self.counts = NgramCounter() if counter is None else counter

    def fit(self, text, vocabulary_text=None):
        """Trains the model on a text.

        :param text: Training text as a sequence of sentences.

        """
        if not self.vocab:
            if vocabulary_text is None:
                raise ValueError("Cannot fit without a vocabulary or text to "
                                 "create it from.")
            self.vocab.update(vocabulary_text)
        self.counts.update(self.vocab.lookup(sent) for sent in text)

    def score(self, word, context=None):
        """Masks out of vocab (OOV) words and computes their model score.

        For model-specific logic of calculating scores, see the `unmasked_score`
        method.
        """
        return self.unmasked_score(
            self.vocab.lookup(word),
            self.vocab.lookup(context) if context else None)

    @abstractmethod
    def unmasked_score(self, word, context=None):
        """Score a word given some optional context.

        Concrete models are expected to provide an implementation.
        Note that this method does not mask its arguments with the OOV label.
        Use the `score` method for that.

        :param str word: Word for which we want the score
        :param tuple(str) context: Context the word is in.
        If `None`, compute unigram score.
        :param context: tuple(str) or None
        :rtype: float

        """
        raise NotImplementedError()

    def logscore(self, word, context=None):
        """Evaluate the log score of this word in this context.

        The arguments are the same as for `score` and `unmasked_score`.

        """
        return log_base2(self.score(word, context))

    def context_counts(self, context):
        """Helper method for retrieving counts for a given context.

        Assumes context has been checked and oov words in it masked.
        :type context: tuple(str) or None

        """
        return (self.counts[len(context) +
                            1][context] if context else self.counts.unigrams)

    def entropy(self, text_ngrams):
        """Calculate cross-entropy of model for given evaluation text.

        :param Iterable(tuple(str)) text_ngrams: A sequence of ngram tuples.
        :rtype: float

        """
        return -1 * _mean(
            [self.logscore(ngram[-1], ngram[:-1]) for ngram in text_ngrams])

    def perplexity(self, text_ngrams):
        """Calculates the perplexity of the given text.

        This is simply 2 ** cross-entropy for the text, so the arguments are the same.

        """
        return pow(2.0, self.entropy(text_ngrams))

    def generate(self, num_words=1, text_seed=None, random_seed=None):
        """Generate words from the model.

        :param int num_words: How many words to generate. By default 1.
        :param text_seed: Generation can be conditioned on preceding context.
        :param random_seed: If provided, makes the random sampling part of
        generation reproducible.
        :return: One (str) word or a list of words generated from model.

        Examples:

        >>> from nltk.lm import MLE
        >>> lm = MLE(2)
        >>> lm.fit([[("a", "b"), ("b", "c")]], vocabulary_text=['a', 'b', 'c'])
        >>> lm.fit([[("a",), ("b",), ("c",)]])
        >>> lm.generate(random_seed=3)
        'a'
        >>> lm.generate(text_seed=['a'])
        'b'

        """
        text_seed = [] if text_seed is None else list(text_seed)
        # base recursion case
        if num_words == 1:
            context = (text_seed[-self.order + 1:]
                       if len(text_seed) >= self.order else text_seed)
            samples = self.context_counts(self.vocab.lookup(context))
            while context and not samples:
                context = context[1:] if len(context) > 1 else []
                samples = self.context_counts(self.vocab.lookup(context))
            # sorting achieves two things:
            # - reproducible randomness when sampling
            # - turning Mapping into Sequence which _weighted_choice expects
            samples = sorted(samples)
            return _weighted_choice(
                samples, tuple(self.score(w, context) for w in samples),
                random_seed)
        # build up text one word at a time
        generated = []
        for _ in range(num_words):
            generated.append(
                self.generate(
                    num_words=1,
                    text_seed=text_seed + generated,
                    random_seed=random_seed,
                ))
        return generated
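
Since perplexity is defined above as 2 ** cross-entropy, the relationship can be checked directly with a toy bigram model (a self-contained sketch using public nltk.lm helpers):

from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends
from nltk.util import bigrams

sents = [["a", "b", "c"], ["a", "c", "b"]]
train, vocab_text = padded_everygram_pipeline(2, sents)
lm = MLE(2)
lm.fit(train, vocab_text)

test_ngrams = list(bigrams(pad_both_ends(["a", "b", "c"], n=2)))
print(lm.entropy(test_ngrams))                                     # -mean of per-ngram log2 scores
print(lm.perplexity(test_ngrams) == 2 ** lm.entropy(test_ngrams))  # True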
Example #10
File: api.py  Project: rmalouf/nltk
class LanguageModel(object):
    """ABC for Language Models.

    Cannot be directly instantiated itself.

    """

    def __init__(self, order, vocabulary=None, counter=None):
        """Creates new LanguageModel.

        :param vocabulary: If provided, this vocabulary will be used instead
        of creating a new one when training.
        :type vocabulary: `nltk.lm.Vocabulary` or None
        :param counter: If provided, use this object to count ngrams.
        :type counter: `nltk.lm.NgramCounter` or None
        :param ngrams_fn: If given, defines how sentences in training text are turned into ngram
                          sequences.
        :type ngrams_fn: function or None
        :param pad_fn: If given, defines how sentences in training text are padded.
        :type pad_fn: function or None

        """
        self.order = order
        self.vocab = Vocabulary() if vocabulary is None else vocabulary
        self.counts = NgramCounter() if counter is None else counter

    def fit(self, text, vocabulary_text=None):
        """Trains the model on a text.

        :param text: Training text as a sequence of sentences.

        """
        if not self.vocab:
            if vocabulary_text is None:
                raise ValueError(
                    "Cannot fit without a vocabulary or text to " "create it from."
                )
            self.vocab.update(vocabulary_text)
        self.counts.update(self.vocab.lookup(sent) for sent in text)

    def score(self, word, context=None):
        """Masks out of vocab (OOV) words and computes their model score.

        For model-specific logic of calculating scores, see the `unmasked_score`
        method.
        """
        return self.unmasked_score(
            self.vocab.lookup(word), self.vocab.lookup(context) if context else None
        )

    @abstractmethod
    def unmasked_score(self, word, context=None):
        """Score a word given some optional context.

        Concrete models are expected to provide an implementation.
        Note that this method does not mask its arguments with the OOV label.
        Use the `score` method for that.

        :param str word: Word for which we want the score
        :param tuple(str) context: Context the word is in.
        If `None`, compute unigram score.
        :param context: tuple(str) or None
        :rtype: float

        """
        raise NotImplementedError()

    def logscore(self, word, context=None):
        """Evaluate the log score of this word in this context.

        The arguments are the same as for `score` and `unmasked_score`.

        """
        return log_base2(self.score(word, context))

    def context_counts(self, context):
        """Helper method for retrieving counts for a given context.

        Assumes context has been checked and oov words in it masked.
        :type context: tuple(str) or None

        """
        return (
            self.counts[len(context) + 1][context] if context else self.counts.unigrams
        )

    def entropy(self, text_ngrams):
        """Calculate cross-entropy of model for given evaluation text.

        :param Iterable(tuple(str)) text_ngrams: A sequence of ngram tuples.
        :rtype: float

        """
        return -1 * _mean(
            [self.logscore(ngram[-1], ngram[:-1]) for ngram in text_ngrams]
        )

    def perplexity(self, text_ngrams):
        """Calculates the perplexity of the given text.

        This is simply 2 ** cross-entropy for the text, so the arguments are the same.

        """
        return pow(2.0, self.entropy(text_ngrams))

    def generate(self, num_words=1, text_seed=None, random_seed=None):
        """Generate words from the model.

        :param int num_words: How many words to generate. By default 1.
        :param text_seed: Generation can be conditioned on preceding context.
        :param random_seed: A random seed or an instance of `random.Random`. If provided,
        makes the random sampling part of generation reproducible.
        :return: One (str) word or a list of words generated from model.

        Examples:

        >>> from nltk.lm import MLE
        >>> lm = MLE(2)
        >>> lm.fit([[("a", "b"), ("b", "c")]], vocabulary_text=['a', 'b', 'c'])
        >>> lm.fit([[("a",), ("b",), ("c",)]])
        >>> lm.generate(random_seed=3)
        'a'
        >>> lm.generate(text_seed=['a'])
        'b'

        """
        text_seed = [] if text_seed is None else list(text_seed)
        random_generator = _random_generator(random_seed)
        # base recursion case
        if num_words == 1:
            context = (
                text_seed[-self.order + 1 :]
                if len(text_seed) >= self.order
                else text_seed
            )
            samples = self.context_counts(self.vocab.lookup(context))
            while context and not samples:
                context = context[1:] if len(context) > 1 else []
                samples = self.context_counts(self.vocab.lookup(context))
            # sorting achieves two things:
            # - reproducible randomness when sampling
            # - turning Mapping into Sequence which _weighted_choice expects
            samples = sorted(samples)
            return _weighted_choice(
                samples, tuple(self.score(w, context) for w in samples), random_generator
            )
        # build up text one word at a time
        generated = []
        for _ in range(num_words):
            generated.append(
                self.generate(
                    num_words=1,
                    text_seed=text_seed + generated,
                    random_seed=random_generator,
                )
            )
        return generated
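
In this newer version, _random_generator(random_seed) is created once and the same generator is threaded through every recursive call, so a single seed makes multi-word generation reproducible, as in this small sketch:

from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

sents = [["a", "b", "c"], ["a", "c", "b"]]
train, vocab_text = padded_everygram_pipeline(2, sents)
lm = MLE(2)
lm.fit(train, vocab_text)

# The same seed yields the same 5-word sequence on every call.
print(lm.generate(5, random_seed=3))
print(lm.generate(5, random_seed=3))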
Example #11
class LanguageModel(metaclass=ABCMeta):
    """ABC for Language Models.

    Cannot be directly instantiated itself.

    """
    def __init__(
        self,
        order,
        vocabulary=None,
        counter=None,
        verbose=True,
    ):
        """Creates new LanguageModel.

        :param vocabulary: If provided, this vocabulary will be used instead
        of creating a new one when training.
        :type vocabulary: `nltk.lm.Vocabulary` or None
        :param counter: If provided, use this object to count ngrams.
        :type counter: `nltk.lm.NgramCounter` or None
        :param ngrams_fn: If given, defines how sentences in training text are turned into ngram
                          sequences.
        :type ngrams_fn: function or None
        :param pad_fn: If given, defines how sentences in training text are padded.
        :type pad_fn: function or None

        """
        self.order = order
        self.vocab = Vocabulary() if vocabulary is None else vocabulary
        self.counts = NgramCounter() if counter is None else counter
        def_dict_callable = partial(defaultdict, float)
        self._cache = defaultdict(def_dict_callable)
        self.verbose = verbose

    def _update_cache(self, word):
        i, word = word
        ret_list = []
        for order in range(2, self.order + 1):
            for context in self.counts[order].keys():
                if self.counts[order][context].N() > self.cache_limit:
                    ret_list.append((context, word, self.score(word, context)))
        return ret_list

    def _check_cache_size(self):
        return getsizeof(self._cache) / 1e6

    def fit(self, text, vocabulary_text=None, verbose=True):
        """Trains the model on a text.

        :param text: Training text as a sequence of sentences.

        """
        if not self.vocab:
            if vocabulary_text is None:
                raise ValueError(
                    "Cannot fit without a vocabulary or text to create it from."
                )
            self.vocab.update(vocabulary_text)
        _iter = (self.vocab.lookup(sent) for sent in text)
        self.counts.update(
            progress(_iter, desc="Fitting the model") if self.verbose else _iter
        )

    def score(self, word, context=None):
        """Masks out of vocab (OOV) words and computes their model score.

        For model-specific logic of calculating scores, see the `unmasked_score`
        method.
        """
        return self.unmasked_score(
            self.vocab.lookup(word),
            self.vocab.lookup(context) if context else None)

    @abstractmethod
    def unmasked_score(self, word, context=None):
        """Score a word given some optional context.

        Concrete models are expected to provide an implementation.
        Note that this method does not mask its arguments with the OOV label.
        Use the `score` method for that.

        :param str word: Word for which we want the score
        :param tuple(str) context: Context the word is in.
        If `None`, compute unigram score.
        :param context: tuple(str) or None
        :rtype: float

        """
        raise NotImplementedError()

    def logscore(self, word, context=None):
        """Evaluate the log score of this word in this context.

        The arguments are the same as for `score` and `unmasked_score`.

        """
        return log_base2(self.score(word, context))

    def context_counts(self, context):
        """Helper method for retrieving counts for a given context.

        Assumes context has been checked and oov words in it masked.
        :type context: tuple(str) or None

        """
        return (self.counts[len(context) +
                            1][context] if context else self.counts.unigrams)

    def entropy(self, text_ngrams):
        """Calculate cross-entropy of model for given evaluation text.

        :param Iterable(tuple(str)) text_ngrams: A sequence of ngram tuples.
        :rtype: float

        """
        return -1 * _mean(
            [self.logscore(ngram[-1], ngram[:-1]) for ngram in text_ngrams])

    def perplexity(self, text_ngrams):
        """Calculates the perplexity of the given text.

        This is simply 2 ** cross-entropy for the text, so the arguments are the same.

        """
        return pow(
            2.0,
            self.entropy(
                progress(text_ngrams, desc="Calculating Perplexity")
                if self.verbose
                else text_ngrams
            ),
        )

    def context_probabilities(self, context):
        """Helper method for retrieving probabilities for a given context,
        including all the words in the vocabulary

        Assumes context has been checked and oov words in it masked.
        :type context: tuple(str) or None

        """
        if context not in self._cache.keys():
            self._cache[context] = {
                word: self.score(word, context)
                for word in self.vocab.counts.keys()
            }
        return self._cache[context]

    def _generate_single_word(self, sampler_func, text_seed, random_generator,
                              sampler_kwargs):
        context = tuple(
            text_seed[-self.order +
                      1:] if len(text_seed) >= self.order else text_seed)
        distribution = self.context_probabilities(context)
        # Sorting distribution achieves two things:
        # - reproducible randomness when sampling
        # - turns Dictionary into Sequence which `sampler` expects
        distribution = sorted(distribution.items(),
                              key=lambda x: x[1],
                              reverse=True)
        return sampler_func(distribution,
                            random_generator=random_generator,
                            **sampler_kwargs)

    def generate(
        self,
        sampler_func=greedy_decoding,
        num_words=1,
        text_seed=None,
        random_seed=None,
        sampler_kwargs=None,
        EOS=None,
    ):
        # Use a fresh kwargs dict per call instead of a shared mutable default.
        sampler_kwargs = {} if sampler_kwargs is None else dict(sampler_kwargs)
        # random.sample needs a sequence, so materialize the vocabulary keys.
        text_seed = (random.sample(list(self.vocab.counts), 1)
                     if text_seed is None else list(text_seed))
        random_generator = _random_generator(random_seed)
        if EOS:
            sampler_kwargs["EOS"] = EOS
        # We build up text one word at a time using the preceding context.
        generated = []
        _iter = range(num_words)
        for _ in (progress(_iter, desc="Generating words")
                  if self.verbose else _iter):
            token = self._generate_single_word(
                sampler_func=sampler_func,
                text_seed=text_seed + generated,
                random_generator=random_generator,
                sampler_kwargs=sampler_kwargs,
            )
            generated.append(token)
            if token == EOS:
                break
        return generated
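
A hypothetical call shape for this extended generate; the sampler and any concrete model class are assumed to be defined alongside this code, and the names below are placeholders:

# Assumes lm is an already-fitted instance of a concrete subclass that
# implements unmasked_score, e.g. lm = BigramModel(2, vocabulary=vocab, verbose=False).
tokens = lm.generate(
    sampler_func=greedy_decoding,  # pick the highest-probability word at each step
    num_words=20,
    text_seed=["<s>"],
    EOS="</s>",                    # stop early once the end-of-sentence token is produced
)
print(" ".join(tokens))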