def initialize(self):
    self._vocabulary_sizes_for_ngram_item_types = self._find_vocabulary_sizes(self._ngram_item_types)

    if logger.isEnabledFor(logging.DEBUG):
        logger.debug("Found vocabulary sizes for ngram types : " + str(self._vocabulary_sizes_for_ngram_item_types))

    for context_type in self._ngram_item_types:
        for context_is_leading in (True, False):
            for target_type in self._ngram_item_types:
                ngram_type, type_key = self._get_ngram_type_and_key(context_is_leading, context_type, target_type)

                if type_key in self._smoothers_for_ngram_types:
                    # counts already calculated, smoother already created for this type
                    continue

                distinct_ngram_count_for_ngram_type = NgramTypeFrequencyFinder.find_distinct_count(
                    self._collection, ngram_type)
                possible_ngram_count_for_ngram_type = reduce(
                    operator.mul,
                    [self._vocabulary_sizes_for_ngram_item_types[ngram_type_item] for ngram_type_item in ngram_type])
                frequency_of_frequency_0 = possible_ngram_count_for_ngram_type - distinct_ngram_count_for_ngram_type

                logger.debug(" Distinct ngram count for ngram type = " + str(distinct_ngram_count_for_ngram_type))
                logger.debug(" Possible ngram count for ngram type = " + str(possible_ngram_count_for_ngram_type))
                logger.debug(" Frequency of frequency 0 (unseen) = " + str(frequency_of_frequency_0))

                frequencies_of_frequencies_for_ngram_type = {}
                for i in range(1, self._smoothing_threshold + 2):
                    frequencies_of_frequencies_for_ngram_type[i] = self._find_frequency_of_frequency(ngram_type, i)

                smoother = SimpleGoodTuringSmoother(
                    self._smoothing_threshold, frequencies_of_frequencies_for_ngram_type, frequency_of_frequency_0)
                self._smoothers_for_ngram_types[type_key] = smoother

    for ngram_type_key, smoother in self._smoothers_for_ngram_types.iteritems():
        smoother.initialize(PLOTTING_MODE)

    if logger.isEnabledFor(logging.DEBUG):
        for ngram_type_key, smoother in self._smoothers_for_ngram_types.iteritems():
            # convert the defaultdict to a normal dict so pprint renders it cleanly
            logger.debug("Found frequencies of ngram frequencies for {}: ".format(ngram_type_key) +
                         pprint.pformat(json.loads(json.dumps(smoother._frequencies_of_frequencies))))
            logger.debug("Found unseen for {}: {}".format(ngram_type_key, smoother._unseen_count))
            logger.debug("Loglin regression coefficient m for {}: {}".format(ngram_type_key, smoother._loglinregression_m))
            logger.debug("Loglin regression coefficient c for {}: {}".format(ngram_type_key, smoother._loglinregression_c))
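# Illustrative sketch (not from the codebase): the unseen count N_0 above is just
# "possible ngrams minus observed distinct ngrams". All names and numbers below are
# hypothetical; only the arithmetic mirrors initialize().
import operator
from functools import reduce  # builtin on Python 2, functools on Python 3

vocabulary_sizes = {'surface': 1000, 'stem': 400, 'lemma_root': 250}  # hypothetical sizes
ngram_type = ['surface', 'stem', 'lemma_root']                        # hypothetical ngram type

possible_ngram_count = reduce(operator.mul, [vocabulary_sizes[t] for t in ngram_type])  # 100000000
distinct_ngram_count = 120000  # hypothetical: distinct ngrams actually observed in the corpus
frequency_of_frequency_0 = possible_ngram_count - distinct_ngram_count  # N_0 fed to the smoother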
def test_with_larger_values_sc_2(self):
    smoother = SimpleGoodTuringSmoother(K, {1: 16181, 2: 2213, 3: 870, 4: 431, 5: 304, 6: 202}, 2111251811)
    smoother.initialize()
    for i in range(0, K + 5):
        logger.info("c_{} : {}, \t c*_{} : {}".format(i, i, i, smoother.smooth(i)))

def test_with_small_values(self):
    smoother = SimpleGoodTuringSmoother(K, {1: 10, 2: 5, 3: 3, 4: 2, 5: 1, 6: 0}, 100)
    smoother.initialize()
    for i in range(0, K + 5):
        logger.info("c_{} : {}, \t c*_{} : {}".format(i, i, i, smoother.smooth(i)))

def test_with_larger_values(self):
    smoother = SimpleGoodTuringSmoother(K, {1: 268, 2: 112, 3: 70, 4: 41, 5: 24, 6: 14, 7: 15, 400: 1, 1918: 1}, 1000)
    smoother.initialize()
    for i in range(0, K + 5):
        logger.info("c_{} : {}, \t c*_{} : {}".format(i, i, i, smoother.smooth(i)))

def test_with_zero_frequencies_in_between(self):
    smoother = SimpleGoodTuringSmoother(K, {1: 268, 2: 0, 3: 70, 4: 0, 5: 24, 6: 14, 7: 15, 400: 1, 1918: 1}, 1000)
    smoother.initialize()
    for i in range(0, K + 5):
        logger.info("c_{} : {}, \t c*_{} : {}".format(i, i, i, smoother.smooth(i)))
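# A minimal sketch of what a Simple Good-Turing smoother computes, assuming the
# constructor shape used by the tests above (threshold K, {c: N_c} frequencies of
# frequencies, and an unseen count N_0). The real SimpleGoodTuringSmoother is not
# shown in this section; the class below is hypothetical and only illustrates the
# standard technique (log-linear regression over N_c, then c* = (c+1) * N_{c+1} / N_c).
import math

class SketchSimpleGoodTuringSmoother(object):
    def __init__(self, smoothing_threshold, frequencies_of_frequencies, unseen_count):
        self._K = smoothing_threshold
        self._Nc = frequencies_of_frequencies  # {c: N_c} for c = 1 .. K+1
        self._N0 = unseen_count                # possible-but-unseen event count

    def initialize(self):
        # Fit log(N_c) = m * log(c) + b by least squares, so that zero or noisy
        # N_c values can be replaced by the regression estimate.
        points = [(math.log(c), math.log(n)) for c, n in self._Nc.items() if n > 0]
        n = len(points)
        sx = sum(x for x, _ in points)
        sy = sum(y for _, y in points)
        sxx = sum(x * x for x, _ in points)
        sxy = sum(x * y for x, y in points)
        self._loglinregression_m = (n * sxy - sx * sy) / (n * sxx - sx * sx)
        self._loglinregression_c = (sy - self._loglinregression_m * sx) / n

    def _smoothed_Nc(self, c):
        return math.exp(self._loglinregression_m * math.log(c) + self._loglinregression_c)

    def smooth(self, c):
        if c == 0:
            # mass reserved for unseen events: c*_0 = N_1 / N_0
            return self._smoothed_Nc(1) / self._N0
        if c > self._K:
            return float(c)  # counts above the threshold are trusted as-is
        # Good-Turing estimate: c* = (c + 1) * N_{c+1} / N_c
        return (c + 1) * self._smoothed_Nc(c + 1) / self._smoothed_Nc(c)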
class SimpleGoodTuringContextlessDistributionSmoother(ContextlessDistributionSmoother):
    AVG_PARSE_RESULTS_FOR_A_WORD = 6    # avg parse result count for a word
    AVG_WORDS_FOR_A_LEXEME = 50         # avg word count for a lexeme
    AVG_WORDS_FOR_A_STEM = 10           # avg word count for a stem

    def __init__(self, smoothing_threshold, unigram_collection):
        self._smoothing_threshold = smoothing_threshold
        self._unigram_collection = unigram_collection

        assert self._smoothing_threshold and self._smoothing_threshold > 1

    def initialize(self):
        logger.debug(
            "Initializing SimpleGoodTuringContextlessDistributionSmoother for K:{}, "
            "AVG_PARSE_RESULTS_FOR_A_WORD:{}, AVG_WORDS_FOR_A_LEXEME:{}".format(
                self._smoothing_threshold, self.AVG_PARSE_RESULTS_FOR_A_WORD, self.AVG_WORDS_FOR_A_LEXEME))

        distinct_parse_result_count = NgramTypeFrequencyFinder.find_distinct_parse_result_count(self._unigram_collection)
        distinct_word_count = NgramTypeFrequencyFinder.find_distinct_word_count(self._unigram_collection)
        distinct_lexeme_count = NgramTypeFrequencyFinder.find_distinct_count(self._unigram_collection, ['lemma_root'])
        distinct_stem_count = NgramTypeFrequencyFinder.find_distinct_count(self._unigram_collection, ['stem'])

        possible_word_count_estimate_from_lexemes = distinct_lexeme_count * self.AVG_WORDS_FOR_A_LEXEME
        possible_word_count_estimate_from_stems = distinct_stem_count * self.AVG_WORDS_FOR_A_STEM
        possible_word_count_estimate = possible_word_count_estimate_from_stems + possible_word_count_estimate_from_lexemes
        unseen_word_count = possible_word_count_estimate - distinct_word_count

        possible_parse_result_count_estimate = possible_word_count_estimate * self.AVG_PARSE_RESULTS_FOR_A_WORD
        unseen_parse_result_count = possible_parse_result_count_estimate - distinct_parse_result_count

        logger.debug("Found {} distinct parse results".format(distinct_parse_result_count))
        logger.debug("Found {} distinct words".format(distinct_word_count))
        logger.debug("Estimated possible parse result count : {}".format(possible_parse_result_count_estimate))
        logger.debug("Estimated unseen parse result count : {}".format(unseen_parse_result_count))
        logger.debug("Found {} distinct lexemes".format(distinct_lexeme_count))
        logger.debug("Estimated possible word count from lexemes: {}".format(possible_word_count_estimate_from_lexemes))
        logger.debug("Estimated possible word count from stems: {}".format(possible_word_count_estimate_from_stems))
        logger.debug("Estimated possible word count: {}".format(possible_word_count_estimate))
        logger.debug("Estimated unseen word count : {}".format(unseen_word_count))

        frequencies_of_parse_result_frequencies = {1: distinct_parse_result_count}
        frequencies_of_word_frequencies = {1: distinct_word_count}
        for i in range(2, self._smoothing_threshold + 2):
            frequencies_of_parse_result_frequencies[i] = NgramTypeFrequencyFinder.find_frequency_of_parse_result_frequency(
                self._unigram_collection, i)
            frequencies_of_word_frequencies[i] = NgramTypeFrequencyFinder.find_frequency_of_word_frequency(
                self._unigram_collection, i)

        logger.debug("Frequencies of parse result frequencies")
        logger.debug(pformat(frequencies_of_parse_result_frequencies))
        logger.debug("Frequencies of word frequencies")
        logger.debug(pformat(frequencies_of_word_frequencies))

        self._parse_result_count_smoother = SimpleGoodTuringSmoother(
            self._smoothing_threshold, frequencies_of_parse_result_frequencies, unseen_parse_result_count)
        self._word_count_smoother = SimpleGoodTuringSmoother(
            self._smoothing_threshold, frequencies_of_word_frequencies, unseen_word_count)

        self._parse_result_count_smoother.initialize()
        self._word_count_smoother.initialize()

    def smooth_parse_result_occurrence_count(self, parse_result_occurrence_count):
        if parse_result_occurrence_count > self._smoothing_threshold:
            return parse_result_occurrence_count
        return self._parse_result_count_smoother.smooth(parse_result_occurrence_count)

    def smooth_word_occurrence_count(self, word_occurrence_count):
        if word_occurrence_count > self._smoothing_threshold:
            return word_occurrence_count
        return self._word_count_smoother.smooth(word_occurrence_count)
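# Hypothetical usage of the class above, assuming `unigram_collection` is the
# collection of unigrams that NgramTypeFrequencyFinder queries and K > 1 is the
# smoothing threshold; the variable names here are illustrative only.
K = 5
smoother = SimpleGoodTuringContextlessDistributionSmoother(K, unigram_collection)
smoother.initialize()

smoothed_word_count = smoother.smooth_word_occurrence_count(2)      # 2 <= K, so it is smoothed
raw_word_count = smoother.smooth_word_occurrence_count(K + 1)       # above K, returned as-is
unseen_parse_result_mass = smoother.smooth_parse_result_occurrence_count(0)  # estimate for unseen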