def __init__(self, corpus):
    """
    Sets up the bigram model: runs the parent initialiser, then stores an
    n-gram calculator for the corpus together with its 2-gram result

    :param corpus: corpus over which to create the language model
    """
    super(BigramLanguageModel, self).__init__(corpus)
    calculator = BasicNgramCalculator(corpus)
    self.ngram_calculator = calculator
    # Result of the 2-gram calculation, kept for later bigram lookups.
    self.bigrams = calculator.calculate_ngrams(2, True, False)
def q2_calculate_unique_unigrams_in_reuters_training():
    """
    Prints the percentage of unique unigrams in the Reuters training corpus,
    once as reported by UnigramCalculator and once as reported by
    BasicNgramCalculator with n = 1
    """
    reuters = ReutersTrainingCorpus()
    by_unigram_calculator = UnigramCalculator(reuters)
    by_ngram_calculator = BasicNgramCalculator(reuters)

    # Both figures are computed before anything is printed.
    report = [
        ("Percentage of unique unigrams (unigram calculator): ",
         by_unigram_calculator.get_percentage_unique_unigrams()),
        ("Percentage of unique unigrams (ngram calculator): ",
         by_ngram_calculator.get_percentage_unique_ngrams(1)),
    ]
    for label, percentage in report:
        print(label, percentage)
def q3_calculate_unique_ngrams_in_reuters_training_for_n_2_3_4_5_6():
    """
    Prints the percentage of unique n-grams in the Reuters training corpus
    for n = 2, 3, 4, 5 and 6
    """
    labelled_orders = (
        (2, "Percentage of unique bigrams: "),
        (3, "Percentage of unique trigrams: "),
        (4, "Percentage of unique 4-grams: "),
        (5, "Percentage of unique 5-grams: "),
        (6, "Percentage of unique 6-grams: "),
    )
    calculator = BasicNgramCalculator(ReutersTrainingCorpus())

    # Every percentage is computed up front; printing only starts once all
    # five calculations have finished.
    report = [
        (label, calculator.get_percentage_unique_ngrams(order))
        for order, label in labelled_orders
    ]
    for label, percentage in report:
        print(label, percentage)
class BigramLanguageModel(UnigramLanguageModel):
    """
    Language model using bigram probabilities
    """

    def __init__(self, corpus):
        """
        :param corpus: corpus over which to create the language model
        """
        super(BigramLanguageModel, self).__init__(corpus)
        self.ngram_calculator = BasicNgramCalculator(corpus)
        # Bigram counts, looked up by bigram in get_bigram_count.
        self.bigrams = self.ngram_calculator.calculate_ngrams(2, True, False)

    def get_bigram_count(self, bigram):
        """
        Gets the count of the bigram in the corpus

        :param bigram: bigram to find in the corpus
        :return: number of times the bigram is found in the corpus; 0 if it is not found
        """
        return self.bigrams[bigram] if bigram in self.bigrams else 0

    def get_bigram_probability(self, bigram):
        """
        Gets the probability of a bigram as calculated by
        count(bigram) / count(first word of bigram)

        :param bigram: bigram (pair of words) to get the probability for
        :return: probability of the bigram for the language model; 0 if the
                 first word has a unigram count of 0
        """
        # Unpacking also enforces that the bigram has exactly two elements.
        first_word, _ = bigram
        first_word_count = self.get_unigram_count(first_word)
        if first_word_count == 0:
            # Avoid dividing by zero for a first word with no unigram count.
            return 0
        return self.get_bigram_count(bigram) / first_word_count

    def get_bigram_log_probability(self, bigram):
        """
        Gets the log of the probability of a bigram; see get_bigram_probability for details

        :param bigram: bigram to get the probability for
        :return: log (base self.BASE) of the probability of the bigram for the
                 language model; None if bigram probability is 0
        """
        bigram_probability = self.get_bigram_probability(bigram)
        if bigram_probability == 0:
            # log(0) is undefined; callers treat None as "bigram not found".
            return None
        return log(bigram_probability, self.BASE)

    def get_sentence_log_probability(self, sentence):
        """
        Calculates the log probability of a sentence by summing the log
        probabilities of its bigrams, where the first word is paired with the
        start marker '<s>'

        Bigrams with probability 0 contribute nothing to the sum and are not
        counted.

        :param sentence: list of words; may be empty
        :return: tuple (log_probability, found_words): the sum of the bigram
                 log probabilities (to be used to calculate entropy,
                 perplexity) and the number of bigrams that had a non-zero
                 probability; (0, 0) for an empty sentence
        """
        probability = 0
        found_words = 0
        # Walking the sentence with a running "previous word" handles the
        # start-of-sentence bigram and the empty sentence in the same loop.
        previous_word = '<s>'
        for word in sentence:
            bigram_probability = self.get_bigram_log_probability((previous_word, word))
            if bigram_probability is not None:
                probability += bigram_probability
                found_words += 1
            previous_word = word
        return probability, found_words
class TestNgramCalculator:
    """
    Tests for NgramCalculator, checked against a FreqDist built over all the
    items of the corpus sentences
    """

    corpus = Corpus()
    basic_ngram_calculator = BasicNgramCalculator(corpus)
    ngram_calculator = NgramCalculator(corpus)
    # Every item of every sentence, in order, as one flat list.
    flattened_sentences = [
        token
        for sentence in corpus.get_sentences()
        for token in sentence
    ]
    frequencies = FreqDist(flattened_sentences)

    def test_calculate_unigrams(self):
        """1-gram counts from the calculator equal the FreqDist counts"""
        expected = dict(self.frequencies)
        actual = self.ngram_calculator.calculate_ngrams(1)
        assert expected == actual

    def test_get_percentage_unique_unigrams(self):
        """Unique-unigram percentage matches the expected ratio to 2 decimals"""
        distinct_unigrams = len(list(self.frequencies))
        # NOTE(review): 27 is specific to the default Corpus fixture --
        # presumably the number of unigrams that occur exactly once; confirm.
        expected = round(27 / distinct_unigrams, 2)
        actual = round(self.ngram_calculator.get_percentage_unique_ngrams(1), 2)
        assert expected == actual