示例#1
0
 def __init__(self, corpus):
     """
     Initialise the parent model, then pre-compute the bigrams of the corpus.

     :param corpus: corpus over which to create the language model
     """
     super(BigramLanguageModel, self).__init__(corpus)
     # The calculator is stored on the instance before it is asked for the
     # 2-grams, so it is available even if the calculation below fails.
     calculator = BasicNgramCalculator(corpus)
     self.ngram_calculator = calculator
     # NOTE(review): the meaning of the two positional flags (True, False)
     # is defined by BasicNgramCalculator.calculate_ngrams, not shown here.
     self.bigrams = calculator.calculate_ngrams(2, True, False)
def q2_calculate_unique_unigrams_in_reuters_training():
    """
    Print the percentage of unique unigrams of a ReutersTrainingCorpus twice:
    once as reported by a UnigramCalculator and once as reported by a
    BasicNgramCalculator asked for n = 1.

    Both values are computed before anything is printed.
    """
    reuters = ReutersTrainingCorpus()
    by_unigram = UnigramCalculator(reuters)
    by_ngram = BasicNgramCalculator(reuters)

    # Tuple elements are evaluated in order, so the unigram calculator is
    # queried first and the n-gram calculator second.
    report = (
        ("Percentage of unique unigrams (unigram calculator): ",
         by_unigram.get_percentage_unique_unigrams()),
        ("Percentage of unique unigrams (ngram calculator): ",
         by_ngram.get_percentage_unique_ngrams(1)),
    )
    for label, percentage in report:
        print(label, percentage)
示例#3
0
def q2_calculate_unique_unigrams_in_reuters_training():
    """
    Report the percentage of unique unigrams in a ReutersTrainingCorpus.

    The figure is obtained from two calculators (a UnigramCalculator and a
    BasicNgramCalculator with n = 1) and each result is printed on its own
    line, after both have been computed.
    """
    training_corpus = ReutersTrainingCorpus()
    unigram_calc = UnigramCalculator(training_corpus)
    ngram_calc = BasicNgramCalculator(training_corpus)

    from_unigram_calc = unigram_calc.get_percentage_unique_unigrams()
    from_ngram_calc = ngram_calc.get_percentage_unique_ngrams(1)

    lines = [
        ("Percentage of unique unigrams (unigram calculator): ",
         from_unigram_calc),
        ("Percentage of unique unigrams (ngram calculator): ",
         from_ngram_calc),
    ]
    for caption, value in lines:
        print(caption, value)
def q3_calculate_unique_ngrams_in_reuters_training_for_n_2_3_4_5_6():
    """
    Print the percentage of unique n-grams of a ReutersTrainingCorpus for
    n = 2, 3, 4, 5 and 6, as reported by a BasicNgramCalculator.

    All five percentages are computed first (in increasing n) and only then
    printed, one line per n.
    """
    reuters = ReutersTrainingCorpus()
    calculator = BasicNgramCalculator(reuters)

    labels_by_order = (
        (2, "Percentage of unique bigrams: "),
        (3, "Percentage of unique trigrams: "),
        (4, "Percentage of unique 4-grams: "),
        (5, "Percentage of unique 5-grams: "),
        (6, "Percentage of unique 6-grams: "),
    )
    results = [
        (label, calculator.get_percentage_unique_ngrams(order))
        for order, label in labels_by_order
    ]

    for label, percentage in results:
        print(label, percentage)
示例#5
0
def q3_calculate_unique_ngrams_in_reuters_training_for_n_2_3_4_5_6():
    """
    Report, for each n from 2 to 6, the percentage of unique n-grams that a
    BasicNgramCalculator finds in a ReutersTrainingCorpus.

    Every percentage is computed before the first line is printed.
    """
    training_corpus = ReutersTrainingCorpus()
    ngram_calc = BasicNgramCalculator(training_corpus)

    orders = (2, 3, 4, 5, 6)
    captions = (
        "Percentage of unique bigrams: ",
        "Percentage of unique trigrams: ",
        "Percentage of unique 4-grams: ",
        "Percentage of unique 5-grams: ",
        "Percentage of unique 6-grams: ",
    )

    # Compute everything up front, in increasing n, then emit the report.
    percentages = [ngram_calc.get_percentage_unique_ngrams(n) for n in orders]
    for caption, value in zip(captions, percentages):
        print(caption, value)
示例#6
0
class BigramLanguageModel(UnigramLanguageModel):
    """
    Language model using bigram probabilities

    Bigram counts are computed once at construction time; unigram counts are
    obtained from the parent class through ``get_unigram_count``.
    """

    def __init__(self, corpus):
        """
        :param corpus: corpus over which to create the language model
        """
        super(BigramLanguageModel, self).__init__(corpus)
        self.ngram_calculator = BasicNgramCalculator(corpus)
        # NOTE(review): the meaning of the two positional flags (True, False)
        # is defined by BasicNgramCalculator.calculate_ngrams, not shown here.
        self.bigrams = self.ngram_calculator.calculate_ngrams(2, True, False)

    def get_bigram_count(self, bigram):
        """
        Gets the count of the bigram in the corpus
        :param bigram: bigram to find in the corpus
        :return: number of times the bigram is found in the corpus; 0 if it is not found
        """
        return self.bigrams[bigram] if bigram in self.bigrams else 0

    def get_bigram_probability(self, bigram):
        """
        Gets the probability of a bigram as calculated by count(bigram) / count(first word of bigram)
        :param bigram: bigram (a pair of words) to get the probability for
        :return: probability of the bigram for the language model; 0 if the first word has a count of 0
        """
        # Unpacking also rejects anything that is not a pair of words.
        first_word, _ = bigram
        first_word_count = self.get_unigram_count(first_word)
        if first_word_count == 0:
            # Avoid dividing by zero for a first word with no recorded count.
            return 0
        return self.get_bigram_count(bigram) / first_word_count

    def get_bigram_log_probability(self, bigram):
        """
        Gets the log of the probability of a bigram; see get_bigram_probability for details
        :param bigram: bigram to get the probability for
        :return: log (in base ``self.BASE``) of the probability of the bigram for the language model;
                 None if bigram probability is 0
        """
        bigram_probability = self.get_bigram_probability(bigram)
        if bigram_probability == 0:
            # log(0) is undefined; callers treat None as "bigram not found".
            return None
        return log(bigram_probability, self.BASE)

    def get_sentence_log_probability(self, sentence):
        """
        Calculates the log probability of a sentence

        The first word is paired with the '<s>' token (presumably the
        start-of-sentence marker used when the bigrams were counted); every
        following word is paired with its predecessor. Bigrams whose
        probability is 0 contribute nothing to the sum and are not counted.

        :param sentence: list of words; may be empty
        :return: tuple ``(log probability of the sentence, number of bigrams with non-zero probability)``
                 (to be used to calculate entropy, perplexity); ``(0, 0)`` for an empty sentence
        """
        log_probability = 0
        found_words = 0
        previous_word = '<s>'
        # Walking the words once (instead of indexing sentence[0] up front)
        # makes an empty sentence yield (0, 0) rather than an IndexError.
        for word in sentence:
            bigram_log_probability = self.get_bigram_log_probability((previous_word, word))
            if bigram_log_probability is not None:
                log_probability += bigram_log_probability
                found_words += 1
            previous_word = word
        return log_probability, found_words
示例#7
0
class TestNgramCalculator:
    """
    Checks NgramCalculator results for n = 1 against a FreqDist built
    directly from the tokens of the corpus sentences.
    """

    # Shared fixtures, built once when the class body is executed.
    corpus = Corpus()
    basic_ngram_calculator = BasicNgramCalculator(corpus)
    ngram_calculator = NgramCalculator(corpus)

    # All tokens of all sentences, in order, as one flat list.
    flattened_sentences = [
        token for sentence in corpus.get_sentences() for token in sentence
    ]
    frequencies = FreqDist(flattened_sentences)

    def test_calculate_unigrams(self):
        """The 1-gram counts must equal the reference frequency table."""
        expected = dict(self.frequencies)
        actual = self.ngram_calculator.calculate_ngrams(1)
        assert expected == actual

    def test_get_percentage_unique_unigrams(self):
        """The unique-unigram percentage must match the reference to 2 decimals."""
        # NOTE(review): 27 is presumably the number of unigrams counted as
        # "unique" in the fixture corpus -- confirm against Corpus().
        expected = round(27 / len(list(self.frequencies)), 2)
        actual = round(self.ngram_calculator.get_percentage_unique_ngrams(1), 2)
        assert expected == actual