def test_should_format_for_simple_parseset(self):
    """Check the simple-parseset rendering of the first parse of two sample words."""
    # NOTE(review): the second surface literal looks like UTF-8 text that was
    # decoded with a single-byte codec (a dotless i, U+0131, showing up as two
    # characters) -- confirm the file encoding before trusting this case.
    expectations = (
        (u'kitaba',
         u'(1,"kitap+Noun+A3sg+Pnon+Dat")'),
        (u'yaptırtmayı',
         u'(1,"yap+Verb")(2,"Verb+Caus")(3,"Verb+Caus+Pos")(4,"Noun+Inf+A3sg+Pnon+Acc")'),
    )

    for surface, expected in expectations:
        # Only the first parse returned by the parser is checked.
        first_parse = self.parser.parse(surface)[0]
        rendered = formatter.format_morpheme_container_for_simple_parseset(first_parse)
        assert_that(rendered, equal_to(expected))
def calculate_likelihood(self, target, leading_context, following_context, calculation_context=None):
    """
    Combine the one-way likelihoods of the target for both of its contexts.

    The result is the one-way likelihood for the leading context times
    C{self.WEIGHT_LEADING_CONTEXT} plus the one-way likelihood for the
    following context times C{self.WEIGHT_FOLLOWING_CONTEXT}.

    @param target: item whose likelihood is calculated
    @param leading_context: context passed with C{target_comes_after=True}
    @param following_context: context passed with C{target_comes_after=False}
    @param calculation_context: optional dict; when given, it receives the
        keys 'leading', 'following', 'weight_leading_context' and
        'weight_following_context'
    @return: the weighted sum of the two one-way likelihoods
    """
    if logger.isEnabledFor(logging.DEBUG):
        target_str = formatter.format_morpheme_container_for_simple_parseset(target)
        logger.debug("Calculating likelihood of {1}, {0}, {2}".format(
            target_str, leading_context, following_context))

    collect_details = calculation_context is not None

    # Each direction gets its own detail dict, but only when the caller asked for details.
    calculation_context_leading = {} if collect_details else None
    calculation_context_following = {} if collect_details else None

    leading_term = self.calculate_oneway_likelihood(
        target, leading_context, True, calculation_context_leading) * self.WEIGHT_LEADING_CONTEXT
    following_term = self.calculate_oneway_likelihood(
        target, following_context, False, calculation_context_following) * self.WEIGHT_FOLLOWING_CONTEXT
    likelihood = leading_term + following_term

    if collect_details:
        calculation_context['leading'] = calculation_context_leading
        calculation_context['following'] = calculation_context_following
        calculation_context['weight_leading_context'] = self.WEIGHT_LEADING_CONTEXT
        calculation_context['weight_following_context'] = self.WEIGHT_FOLLOWING_CONTEXT

    logger.debug(" Calculated likelihood is {}".format(likelihood))

    return likelihood
def _find_parse_result_matching_simple_parseset(self, word_part, parse_result_part):
    """
    Return the first parse of C{word_part} whose simple-parseset string matches.

    A parse matches when its formatted string equals C{parse_result_part}, or
    equals the output of
    C{modify_treebank_parse_result_strs_to_look_like_trnltk(parse_result_part)}.

    @param word_part: value handed to C{self.parser.parse}
    @param parse_result_part: expected simple-parseset string
    @return: the matching parse result, or None when no parse matches
    """
    for candidate in self.parser.parse(word_part):
        rendered = formatter.format_morpheme_container_for_simple_parseset(candidate)

        # Exact match first; the converted form is only computed when that fails.
        if parse_result_part == rendered:
            return candidate
        if rendered == modify_treebank_parse_result_strs_to_look_like_trnltk(parse_result_part):
            return candidate

    return None
def _find_parse_result_matching_simple_parseset(self, word_part, parse_result_part):
    """
    Look up the parse of C{word_part} that corresponds to C{parse_result_part}.

    Parses are tried in the order C{self.parser.parse} returns them. The first
    one whose simple-parseset string equals C{parse_result_part}, or equals
    C{modify_treebank_parse_result_strs_to_look_like_trnltk(parse_result_part)},
    is returned.

    @return: the first matching parse result, or None if there is none
    """
    def is_match(candidate):
        rendered = formatter.format_morpheme_container_for_simple_parseset(candidate)
        # "or" short-circuits, so the converted form is computed only when the
        # exact comparison fails.
        return (parse_result_part == rendered
                or rendered == modify_treebank_parse_result_strs_to_look_like_trnltk(parse_result_part))

    candidates = self.parser.parse(word_part)
    return next((candidate for candidate in candidates if is_match(candidate)), None)
def calculate_oneway_likelihood(self, target, context, target_comes_after, calculation_context=None):
    """
    Interpolate the wrapped calculator's one-way likelihood over growing context windows.

    For window sizes 1..len(context) the wrapped calculator is asked for the
    likelihood of the target given the part of the context nearest to it: the
    tail of C{context} when the target comes after it, the head otherwise.
    Each result is multiplied by the matching entry of
    C{self._calculate_interpolation_weights(len(context))} and the products
    are summed.

    @param target: item whose likelihood is calculated
    @param context: sequence of context items; for debug logging each item is
        indexed with C{[0]} and falsy items are shown as "<Unparsable>"
    @param target_comes_after: True when the target follows the context
    @param calculation_context: optional dict that receives the keys
        'interpolation' and 'sum_likelihood'
    @return: the interpolated likelihood (0 for an empty context)
    """
    if logger.isEnabledFor(logging.DEBUG):
        # Both messages carry the same data; only the placeholder order differs.
        if target_comes_after:
            template = u" Calculating oneway likelihood of {1}, {0}"
        else:
            template = u" Calculating oneway likelihood of {0}, {1}"
        target_str = formatter.format_morpheme_container_for_simple_parseset(target)
        surfaces = [t[0].get_surface() if t else "<Unparsable>" for t in context]
        logger.debug(template.format(target_str, surfaces))

    context_len = len(context)

    if calculation_context is not None:
        calculation_context['interpolation'] = {
            'context_length': context_len,
            'likelihood': {},
            'weight': {},
            'item': {},
            'part_weight': {},
        }

    interpolation_weights = self._calculate_interpolation_weights(context_len)

    total_likelihood = 0

    for i in range(len(context)):
        if calculation_context:
            calculation_context_item = {}
        else:
            calculation_context_item = None

        window = i + 1
        if target_comes_after:
            # Target is on the right: take the last `window` context items.
            context_part = context[context_len - window:]
        else:
            # Target is on the left: take the first `window` context items.
            context_part = context[0:window]

        part_likelihood = self._wrapped_calculator.calculate_oneway_likelihood(
            target, context_part, target_comes_after, calculation_context_item)
        weight = interpolation_weights[i]
        part_weight = part_likelihood * weight
        total_likelihood += part_weight

        if calculation_context is not None:
            interpolation = calculation_context['interpolation']
            interpolation['item'][i] = calculation_context_item
            interpolation['likelihood'][i] = part_likelihood
            interpolation['weight'][i] = weight
            interpolation['part_weight'][i] = part_weight

    if calculation_context is not None:
        calculation_context['sum_likelihood'] = total_likelihood

    return total_likelihood
def calculate_likelihood(self, target, leading_context, following_context, calculation_context=None):
    """
    Return the weighted sum of the target's leading and following one-way likelihoods.

    The leading likelihood is computed with C{target_comes_after=True} and
    scaled by C{self.WEIGHT_LEADING_CONTEXT}. The following likelihood is
    computed with C{target_comes_after=False} and scaled by
    C{self.WEIGHT_FOLLOWING_CONTEXT}.

    @param calculation_context: optional dict; if not None it is filled with
        the per-direction detail dicts ('leading', 'following') and the two
        weights ('weight_leading_context', 'weight_following_context')
    @return: the combined likelihood
    """
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(
            "Calculating likelihood of {1}, {0}, {2}".format(
                formatter.format_morpheme_container_for_simple_parseset(target),
                leading_context,
                following_context))

    if calculation_context is None:
        calculation_context_leading = None
        calculation_context_following = None
    else:
        calculation_context_leading = {}
        calculation_context_following = {}

    likelihood = (
        self.calculate_oneway_likelihood(target, leading_context, True, calculation_context_leading)
        * self.WEIGHT_LEADING_CONTEXT
        + self.calculate_oneway_likelihood(target, following_context, False, calculation_context_following)
        * self.WEIGHT_FOLLOWING_CONTEXT
    )

    if calculation_context is not None:
        recorded = (
            ('leading', calculation_context_leading),
            ('following', calculation_context_following),
            ('weight_leading_context', self.WEIGHT_LEADING_CONTEXT),
            ('weight_following_context', self.WEIGHT_FOLLOWING_CONTEXT),
        )
        for key, value in recorded:
            calculation_context[key] = value

    logger.debug(" Calculated likelihood is {}".format(likelihood))

    return likelihood
def calculate_likelihood(self, target, leading_context, following_context, calculation_context=None):
    """
    Calculate the two-way likelihood of the target from whichever contexts are present.

    With both contexts non-empty the result is the sum of the two weighted
    one-way likelihoods. With only one non-empty context the result is that
    side's weighted one-way likelihood alone. At least one of the contexts
    must be non-empty (checked with an assert).

    @param target: item whose likelihood is calculated
    @param leading_context: sequence of context items before the target; for
        debug logging each item is indexed with C{[0]} and falsy items are
        shown as "<Unparsable>"
    @param following_context: sequence of context items after the target,
        same shape as C{leading_context}
    @param calculation_context: optional dict that receives the context
        lengths, the per-direction detail dicts, both weights and
        'sum_likelihood'
    @return: the combined likelihood
    """
    if logger.isEnabledFor(logging.DEBUG):
        leading_surfaces = [t[0].get_surface() if t else "<Unparsable>" for t in leading_context]
        target_str = formatter.format_morpheme_container_for_simple_parseset(target)
        following_surfaces = [t[0].get_surface() if t else "<Unparsable>" for t in following_context]
        logger.debug(u" Calculating twoway likelihood of \n\t{0}\n\t{1}\n\t{2}".format(
            leading_surfaces, target_str, following_surfaces))

    assert leading_context or following_context

    collect_details = calculation_context is not None
    calculation_context_leading = {} if collect_details else None
    calculation_context_following = {} if collect_details else None

    if collect_details:
        calculation_context['leading_context_length'] = len(leading_context)
        calculation_context['following_context_length'] = len(following_context)

    # Compute each side only when its context is non-empty, leading side first.
    leading_term = None
    if leading_context:
        leading_term = self.calculate_oneway_likelihood(
            target, leading_context, True, calculation_context_leading) * self.WEIGHT_LEADING_CONTEXT

    following_term = None
    if following_context:
        following_term = self.calculate_oneway_likelihood(
            target, following_context, False, calculation_context_following) * self.WEIGHT_FOLLOWING_CONTEXT

    if leading_term is None:
        likelihood = following_term
    elif following_term is None:
        likelihood = leading_term
    else:
        likelihood = leading_term + following_term

    if collect_details:
        calculation_context['leading'] = calculation_context_leading
        calculation_context['following'] = calculation_context_following
        calculation_context['weight_leading_context'] = self.WEIGHT_LEADING_CONTEXT
        calculation_context['weight_following_context'] = self.WEIGHT_FOLLOWING_CONTEXT
        calculation_context['sum_likelihood'] = likelihood

    logger.debug(" Calculated likelihood is {}".format(likelihood))

    return likelihood
def parse_result(self, word):
    """
    Parse C{word} and return every parse formatted as a simple-parseset string.

    @param word: value handed to C{self.parser.parse}
    @return: list with one formatted string per parse, in parser order
    """
    formatted_parses = []
    for morpheme_container in self.parser.parse(word):
        formatted_parses.append(
            formatter.format_morpheme_container_for_simple_parseset(morpheme_container))
    return formatted_parses
def calculate_oneway_likelihood(self, target, context, target_comes_after, calculation_context=None):
    """
    Calculate the likelihood of the target given the context on one side of it.

    For every item of
    C{self._get_cartesian_products_of_context_parse_results(context)} two
    numbers are computed: the likelihood of the target given that combination
    of context parse results, and the sequence likelihood of the combination
    itself. The sequence likelihoods are normalized so that they sum to one
    and used as weights. The result is the weighted sum of the target
    likelihoods.

    Debug messages are only built when DEBUG logging is enabled, because they
    format every context parse result and several numpy matrices per
    combination.

    @param target: item whose likelihood is calculated
    @type target: WordFormContainer
    @param context: context items; for debug logging each item is indexed
        with C{[0]} and falsy items are shown as "<Unparsable>"
    @type context: list (the original annotation says list of
        WordFormContainer -- TODO confirm against the callers)
    @param target_comes_after: True when the target follows the context
    @type target_comes_after: bool
    @param calculation_context: optional dict; when given it receives the
        keys 'possibilities' (details per combination) and 'sum_likelihood'
    @type calculation_context: dict or None
    @return: the likelihood, 0.0 when there is no context combination
    @rtype: float
    """
    assert target
    assert context

    # Checked once; every debug message below is skipped when DEBUG is off.
    debug_enabled = logger.isEnabledFor(logging.DEBUG)
    collect_details = calculation_context is not None

    # Only needed for the debug messages inside the loop.
    target_morpheme_container_str = None

    if debug_enabled:
        target_morpheme_container_str = target.format()
        target_simple_parseset_str = formatter.format_morpheme_container_for_simple_parseset(target)
        context_surfaces = [t[0].get_surface() if t else "<Unparsable>" for t in context]
        if target_comes_after:
            logger.debug(u" Calculating oneway likelihood of {1}, {0}".format(
                target_simple_parseset_str, context_surfaces))
        else:
            logger.debug(u" Calculating oneway likelihood of {0}, {1}".format(
                target_simple_parseset_str, context_surfaces))

    cartesian_products_of_context_parse_results = self._get_cartesian_products_of_context_parse_results(context)

    if debug_enabled:
        logger.debug(" Going to check the usages with the following cartesian product of parse results: \n{}".format(
            [[formatter.format_morpheme_container_for_simple_parseset_without_suffixes(mc) for mc in product_item]
             for product_item in cartesian_products_of_context_parse_results]))

    if not cartesian_products_of_context_parse_results or not any(cartesian_products_of_context_parse_results):
        return 0.0

    if collect_details:
        calculation_context['possibilities'] = {}

    target_likelihoods_for_context_parse_results = []
    context_parse_results_likelihoods = []

    for index, context_parse_results in enumerate(cartesian_products_of_context_parse_results):
        word_calc_context = None
        if collect_details:
            word_calc_context = calculation_context['possibilities'][index] = {}

        if debug_enabled:
            context_parse_result_str_list = [
                formatter.format_morpheme_container_for_simple_parseset_without_suffixes(t)
                for t in context_parse_results]
            if target_comes_after:
                logger.debug(u" Calculating oneway likelihood of {1}, {0}".format(
                    target_morpheme_container_str, context_parse_result_str_list))
            else:
                logger.debug(u" Calculating oneway likelihood of {0}, {1}".format(
                    target_morpheme_container_str, context_parse_result_str_list))

        context_counts = self._get_context_form_count_matrix(context_parse_results)
        if debug_enabled:
            logger.debug(" Context form counts: \n{}".format(context_counts))

        smoothed_context_counts = self._smooth_context_cooccurrence_counts(context_counts, context_parse_results)
        if collect_details:
            word_calc_context['smoothed_context_counts'] = smoothed_context_counts
        if debug_enabled:
            logger.debug(" Smoothed context form counts: \n{}".format(smoothed_context_counts))

        if collect_details:
            word_calc_context['context_words'] = {}
            for i, context_item in enumerate(context_parse_results):
                word_calc_context['context_words'][i] = {
                    'surface': context_item.get_surface_with_syntactic_categories(),
                    'stem': context_item.get_stem_with_syntactic_categories(),
                    'lexeme': context_item.get_lemma_root_with_syntactic_categories()
                }

        # One count per (target appender, context appender) pair of the appender matrix.
        target_form_given_context_counts = numpy.zeros((3, 3), dtype=float)
        for i, appender_matrix_row in enumerate(self.APPENDER_MATRIX):
            for j, (target_appender, context_appender) in enumerate(appender_matrix_row):
                target_form_given_count = self._target_form_given_context_counter._count_target_form_given_context(
                    target, context_parse_results, target_comes_after, target_appender, context_appender)
                target_form_given_context_counts[i][j] = target_form_given_count

        if debug_enabled:
            logger.debug(" Target form counts given context forms: \n{}".format(target_form_given_context_counts))

        if collect_details:
            word_calc_context['target_form_counts'] = target_form_given_context_counts
        if debug_enabled:
            logger.debug(" Target form counts: \n{}".format(target_form_given_context_counts))

        smoothed_target_form_given_context_counts = self._smooth_target_context_cooccurrence_counts(
            target_form_given_context_counts, target, context_parse_results, target_comes_after)
        if collect_details:
            word_calc_context['smoothed_target_form_counts'] = smoothed_target_form_given_context_counts
        if debug_enabled:
            logger.debug(" Smoothed target form counts: \n{}".format(smoothed_target_form_given_context_counts))

        # Zero context counts make the division yield inf/nan. Those entries are
        # reset to 0.0 right below, so the floating point warnings are noise.
        with numpy.errstate(divide='ignore', invalid='ignore'):
            target_form_probabilities = smoothed_target_form_given_context_counts / smoothed_context_counts
        target_form_probabilities[numpy.isinf(target_form_probabilities)] = 0.0
        target_form_probabilities[numpy.isnan(target_form_probabilities)] = 0.0
        if collect_details:
            word_calc_context['target_form_probabilities'] = target_form_probabilities
        if debug_enabled:
            logger.debug(" Target form probabilities: \n{}".format(target_form_probabilities))

        target_form_probabilities = target_form_probabilities * self.COEFFICIENTS_TARGET_GIVEN_CONTEXT_FORM
        if collect_details:
            word_calc_context['coefficients_target_given_context_form'] = self.COEFFICIENTS_TARGET_GIVEN_CONTEXT_FORM
            word_calc_context['target_form_probabilities_with_context_form_weights'] = target_form_probabilities
        if debug_enabled:
            logger.debug(" Target form probabilities with context form weights: \n{}".format(target_form_probabilities))

        # Multiplying by a column of ones sums every row.
        target_form_probabilities = numpy.dot(target_form_probabilities, numpy.ones((3, 1), dtype=float))
        if collect_details:
            word_calc_context['summed_target_form_probabilities'] = target_form_probabilities
        if debug_enabled:
            logger.debug(" Summed target form probabilities: \n{}".format(target_form_probabilities))

        weight_summed_target_probability = numpy.dot(
            self.COEFFICIENTS_TARGET_FORM_GIVEN_CONTEXT, target_form_probabilities)
        assert numpy.shape(weight_summed_target_probability) == (1, 1)
        if collect_details:
            word_calc_context['coefficients_target_form_given_context'] = self.COEFFICIENTS_TARGET_FORM_GIVEN_CONTEXT
            word_calc_context['weight_summed_target_probability'] = weight_summed_target_probability
        if debug_enabled:
            logger.debug(" Weight-summed target probability: \n{}".format(weight_summed_target_probability))

        item_likelihood = weight_summed_target_probability[0][0]
        target_likelihoods_for_context_parse_results.append(item_likelihood)
        if debug_enabled:
            logger.debug(" Calculated oneway likelihood for target given context item is {}".format(item_likelihood))

        # say, target_comes_after=True, context={c1,c2} and target=t
        # until now, we looked at collocation of (c1, c2, t) and (c2,t)
        # now we look collocation of (c1,c2)
        # which makes complete sense while calculating the weight for current cartesian product item
        if target_comes_after:
            context_sequence_likelihood_calculation_direction = SequenceLikelihoodCalculator.HIGHEST_WEIGHT_ON_LAST
        else:
            context_sequence_likelihood_calculation_direction = SequenceLikelihoodCalculator.HIGHEST_WEIGHT_ON_FIRST

        sequence_likelihood_context = {} if collect_details else None
        context_likelihood = self._sequence_likelihood_calculator.calculate(
            context_parse_results, context_sequence_likelihood_calculation_direction, sequence_likelihood_context)
        if collect_details:
            word_calc_context['context_sequence_likelihood'] = sequence_likelihood_context
        context_parse_results_likelihoods.append(context_likelihood)
        if debug_enabled:
            logger.debug(" Context likelihood is {}".format(context_likelihood))

    likelihood = 0.0

    # normalize but don't smooth. weights are already smoothed
    total_context_parse_results_weights = sum(context_parse_results_likelihoods)
    if total_context_parse_results_weights:
        normalized_context_parse_results_weights = [
            context_parse_results_item_weight/total_context_parse_results_weights
            for context_parse_results_item_weight in context_parse_results_likelihoods]
    else:
        # All weights are zero: no combination contributes to the result.
        normalized_context_parse_results_weights = [0.0] * len(context_parse_results_likelihoods)

    if debug_enabled:
        logger.debug(" Normalized context parse results weights are {}".format(normalized_context_parse_results_weights))

    for index, context_parse_results in enumerate(cartesian_products_of_context_parse_results):
        target_likelihood_for_context_parse_results_item = target_likelihoods_for_context_parse_results[index]
        context_parse_results_item_likelihood = normalized_context_parse_results_weights[index]
        weighted_parse_result_possibility_likelihood = (
            context_parse_results_item_likelihood * target_likelihood_for_context_parse_results_item)
        likelihood += weighted_parse_result_possibility_likelihood

        if collect_details:
            word_calc_context = calculation_context['possibilities'][index]
            word_calc_context['context_likelihood'] = context_parse_results_item_likelihood
            word_calc_context['weighted_parse_result_possibility_likelihood'] = weighted_parse_result_possibility_likelihood

        if debug_enabled:
            logger.debug(" Weighted context parse result likelihood is {} for context : {}".format(
                weighted_parse_result_possibility_likelihood, context_parse_results))

    if collect_details:
        calculation_context['sum_likelihood'] = likelihood

    if debug_enabled:
        logger.debug(" Calculated oneway likelihood is {}".format(likelihood))

    return likelihood
def test_should_format_for_simple_parseset(self):
    """Assert the simple-parseset string of the first parse for two sample words."""
    def first_parse_as_simple_parseset(surface):
        # Only the first parse returned by the parser is formatted.
        return formatter.format_morpheme_container_for_simple_parseset(self.parser.parse(surface)[0])

    assert_that(
        first_parse_as_simple_parseset(u'kitaba'),
        equal_to(u'(1,"kitap+Noun+A3sg+Pnon+Dat")'))

    # NOTE(review): this surface literal looks like UTF-8 text decoded with a
    # single-byte codec (a dotless i, U+0131, showing up as two characters) --
    # confirm the file encoding before trusting this case.
    assert_that(
        first_parse_as_simple_parseset(u'yaptırtmayı'),
        equal_to(u'(1,"yap+Verb")(2,"Verb+Caus")(3,"Verb+Caus+Pos")(4,"Noun+Inf+A3sg+Pnon+Acc")'))
def calculate_oneway_likelihood(self, target, context, target_comes_after, calculation_context=None):
    """
    Calculate the likelihood of the target given a context on one side of it.

    The context is counted first (with C{_context_word_appender}); when that
    count is falsy the likelihood is 0.0. Otherwise the target is counted
    together with the context once per row of C{self.APPENDER_MATRIX}. The
    counts are divided by the context count, multiplied by
    C{self.COEFFICIENTS_TARGET_GIVEN_CONTEXT_FORM}, summed per row, and
    combined with C{self.COEFFICIENTS_TARGET_FORM_GIVEN_CONTEXT} into a
    single number.

    @type target: WordFormContainer
    @type context: list of WordFormContainer
    @type target_comes_after: bool
    @type calculation_context : dict or None
    @rtype: float
    """
    assert target
    assert context

    if logger.isEnabledFor(logging.DEBUG):
        # Same data in both messages; only the placeholder order differs.
        if target_comes_after:
            template = " Calculating oneway likelihood of {1}, {0}"
        else:
            template = " Calculating oneway likelihood of {0}, {1}"
        logger.debug(template.format(formatter.format_morpheme_container_for_simple_parseset(target), context))

    count_given_context = self._count_target_form_given_context(target, context, False, None, _context_word_appender)

    if not count_given_context:
        return 0.0

    collect_details = calculation_context is not None

    if collect_details:
        for position, context_item in enumerate(context):
            calculation_context[position] = {'surface': context_item}

    target_form_given_context_counts = numpy.zeros((3, 1), dtype=float)
    for row_index, (target_appender, context_appender) in enumerate(self.APPENDER_MATRIX):
        target_form_given_context_counts[row_index] = self._count_target_form_given_context(
            target, context, target_comes_after, target_appender, context_appender)

    logger.debug(" Target form counts given context forms: \n{}".format(target_form_given_context_counts))
    logger.debug(" Found {} context occurrences".format(count_given_context))

    probabilities = target_form_given_context_counts / count_given_context
    # Non-finite entries are treated as zero probability.
    probabilities[numpy.isinf(probabilities)] = 0.0
    probabilities[numpy.isnan(probabilities)] = 0.0

    if collect_details:
        calculation_context['target_form_probabilities'] = probabilities
    logger.debug(" Target form probabilities: \n{}".format(probabilities))

    weighted_probabilities = probabilities * self.COEFFICIENTS_TARGET_GIVEN_CONTEXT_FORM

    if collect_details:
        calculation_context['coefficients_target_given_context_form'] = self.COEFFICIENTS_TARGET_GIVEN_CONTEXT_FORM
        calculation_context['target_form_probabilities_with_context_form_weights'] = weighted_probabilities
    logger.debug(" Target form probabilities with context form weights: \n{}".format(weighted_probabilities))

    # Multiplying by a column of ones sums every row.
    summed_probabilities = numpy.dot(weighted_probabilities, numpy.ones((3, 1), dtype=float))

    if collect_details:
        calculation_context['summed_target_form_probabilities'] = summed_probabilities
    logger.debug(" Summed target form probabilities: \n{}".format(summed_probabilities))

    weight_summed_target_probability = numpy.dot(self.COEFFICIENTS_TARGET_FORM_GIVEN_CONTEXT, summed_probabilities)
    assert numpy.shape(weight_summed_target_probability) == (1, 1)

    if collect_details:
        calculation_context['coefficients_target_form_given_context'] = self.COEFFICIENTS_TARGET_FORM_GIVEN_CONTEXT
        calculation_context['weight_summed_target_probability'] = weight_summed_target_probability
    logger.debug(" Weight-summed target probability: \n{}".format(weight_summed_target_probability))

    likelihood = weight_summed_target_probability[0][0]

    logger.debug(" Calculated oneway likelihood is {}".format(likelihood))

    return likelihood
def parse_result(self, word):
    """
    Return the simple-parseset strings of all parses of C{word}.

    @param word: value handed to C{self.parser.parse}
    @return: list of formatted strings, one per parse, in parser order
    """
    parses = self.parser.parse(word)
    return list(
        formatter.format_morpheme_container_for_simple_parseset(parse)
        for parse in parses)