示例#1
0
    def test_should_format_for_simple_parseset(self):
        """
        The first parse result of each sample word must be rendered as the expected
        simple parseset string.
        """
        # (surface form, expected rendering of its first parse result)
        expectations = (
            (u'kitaba',
             u'(1,"kitap+Noun+A3sg+Pnon+Dat")'),
            (u'yaptırtmayı',
             u'(1,"yap+Verb")(2,"Verb+Caus")(3,"Verb+Caus+Pos")(4,"Noun+Inf+A3sg+Pnon+Acc")'),
        )

        for surface, expected_rendering in expectations:
            first_parse = self.parser.parse(surface)[0]
            rendering = formatter.format_morpheme_container_for_simple_parseset(first_parse)
            assert_that(rendering, equal_to(expected_rendering))
示例#2
0
    def calculate_likelihood(self,
                             target,
                             leading_context,
                             following_context,
                             calculation_context=None):
        """
        Combine the one-way likelihoods of target for the leading and the following context.

        calculate_oneway_likelihood is called once with leading_context (third argument True)
        and once with following_context (third argument False). The first result is multiplied
        by WEIGHT_LEADING_CONTEXT, the second by WEIGHT_FOLLOWING_CONTEXT, and the two
        products are added.

        When calculation_context is not None it receives the per-direction details under
        'leading' and 'following' and the two weights under 'weight_leading_context' and
        'weight_following_context'.
        """
        tracing = calculation_context is not None

        if logger.isEnabledFor(logging.DEBUG):
            target_str = formatter.format_morpheme_container_for_simple_parseset(target)
            logger.debug("Calculating likelihood of {1}, {0}, {2}".format(
                target_str, leading_context, following_context))

        # separate detail dicts per direction, only created when details are requested
        leading_details = {} if tracing else None
        following_details = {} if tracing else None

        weighted_leading = self.calculate_oneway_likelihood(
            target, leading_context, True, leading_details) * self.WEIGHT_LEADING_CONTEXT
        weighted_following = self.calculate_oneway_likelihood(
            target, following_context, False, following_details) * self.WEIGHT_FOLLOWING_CONTEXT
        likelihood = weighted_leading + weighted_following

        if tracing:
            calculation_context['leading'] = leading_details
            calculation_context['following'] = following_details
            calculation_context['weight_leading_context'] = self.WEIGHT_LEADING_CONTEXT
            calculation_context['weight_following_context'] = self.WEIGHT_FOLLOWING_CONTEXT

        logger.debug(" Calculated likelihood is {}".format(likelihood))

        return likelihood
    def _find_parse_result_matching_simple_parseset(self, word_part, parse_result_part):
        """
        Return the first parse result of word_part whose simple parseset string matches
        parse_result_part, or None when there is no such parse result.

        A parse result matches when its formatted string equals parse_result_part as given,
        or equals the value that modify_treebank_parse_result_strs_to_look_like_trnltk
        returns for parse_result_part.
        """
        for candidate in self.parser.parse(word_part):
            candidate_str = formatter.format_morpheme_container_for_simple_parseset(candidate)
            # the conversion is only attempted when the plain comparison fails
            if (parse_result_part == candidate_str
                    or candidate_str == modify_treebank_parse_result_strs_to_look_like_trnltk(parse_result_part)):
                return candidate

        return None
    def _find_parse_result_matching_simple_parseset(self, word_part,
                                                    parse_result_part):
        """
        Look through the parse results of word_part and return the first one whose
        simple parseset rendering corresponds to parse_result_part.

        Returns None when none of the parse results corresponds to it.
        """
        for candidate in self.parser.parse(word_part):
            rendered = formatter.format_morpheme_container_for_simple_parseset(
                candidate)

            # 1) the rendering is exactly the string that was asked for
            if parse_result_part == rendered:
                return candidate

            # 2) the rendering equals the converted form of the requested string
            converted_part = modify_treebank_parse_result_strs_to_look_like_trnltk(
                parse_result_part)
            if rendered == converted_part:
                return candidate

        return None
示例#5
0
    def calculate_oneway_likelihood(self, target, context, target_comes_after, calculation_context=None):
        """
        Interpolate the wrapped calculator's one-way likelihoods over context windows of
        growing size.

        For every window size from 1 to len(context) a slice of context is passed to
        self._wrapped_calculator.calculate_oneway_likelihood: the last items of context when
        target_comes_after is true, the first items otherwise. Each result is multiplied by
        the matching entry of self._calculate_interpolation_weights(len(context)) and the
        products are summed.

        @param target: item whose likelihood is calculated; passed on to the wrapped calculator
        @param context: sequence of context items; for the debug message each item is either
                        falsy (shown as "<Unparsable>") or indexable with a first element
                        offering get_surface()
        @type target_comes_after: bool
        @param calculation_context: optional dict; when not None it receives the details of
                        every window under 'interpolation' and the final value under
                        'sum_likelihood'
        @return: sum of the weighted window likelihoods (0 for an empty context)
        """
        if logger.isEnabledFor(logging.DEBUG):
            # the two messages only differ in the order in which target and context are shown
            if target_comes_after:
                message = u"  Calculating oneway likelihood of {1}, {0}"
            else:
                message = u"  Calculating oneway likelihood of {0}, {1}"
            logger.debug(message.format(formatter.format_morpheme_container_for_simple_parseset(target),
                [t[0].get_surface() if t else "<Unparsable>" for t in context]))

        context_len = len(context)

        # Details are collected whenever a dict was handed in, even an empty one. Test for
        # None explicitly instead of relying on the dict being non-empty.
        collect_details = calculation_context is not None

        if collect_details:
            calculation_context['interpolation'] = {'context_length': context_len, 'likelihood': {}, 'weight': {}, 'item': {}, 'part_weight': {}}

        interpolation_weights = self._calculate_interpolation_weights(context_len)

        total_likelihood = 0

        for i in range(context_len):
            calculation_context_item = {} if collect_details else None

            # window of i + 1 items: taken from the end of the context when the target
            # comes after it, from the start otherwise
            context_part = context[context_len - i - 1:] if target_comes_after else context[0: i + 1]
            part_likelihood = self._wrapped_calculator.calculate_oneway_likelihood(target, context_part, target_comes_after, calculation_context_item)
            part_weight = part_likelihood * interpolation_weights[i]
            total_likelihood += part_weight

            if collect_details:
                interpolation_details = calculation_context['interpolation']
                interpolation_details['item'][i] = calculation_context_item
                interpolation_details['likelihood'][i] = part_likelihood
                interpolation_details['weight'][i] = interpolation_weights[i]
                interpolation_details['part_weight'][i] = part_weight

        if collect_details:
            calculation_context['sum_likelihood'] = total_likelihood

        return total_likelihood
    def calculate_likelihood(self, target, leading_context, following_context, calculation_context=None):
        """
        Return the weighted sum of the two one-way likelihoods of target.

        The likelihood for leading_context is weighted with WEIGHT_LEADING_CONTEXT and the
        likelihood for following_context with WEIGHT_FOLLOWING_CONTEXT.

        If calculation_context is not None, the details of both one-way calculations and the
        two weights are written into it.
        """
        if logger.isEnabledFor(logging.DEBUG):
            message_args = (formatter.format_morpheme_container_for_simple_parseset(target), leading_context, following_context)
            logger.debug("Calculating likelihood of {1}, {0}, {2}".format(*message_args))

        if calculation_context is None:
            leading_details = None
            following_details = None
        else:
            leading_details = {}
            following_details = {}

        likelihood = (
            self.calculate_oneway_likelihood(target, leading_context, True, leading_details) * self.WEIGHT_LEADING_CONTEXT
            + self.calculate_oneway_likelihood(target, following_context, False, following_details) * self.WEIGHT_FOLLOWING_CONTEXT
        )

        if calculation_context is not None:
            collected = (
                ('leading', leading_details),
                ('following', following_details),
                ('weight_leading_context', self.WEIGHT_LEADING_CONTEXT),
                ('weight_following_context', self.WEIGHT_FOLLOWING_CONTEXT),
            )
            for key, value in collected:
                calculation_context[key] = value

        logger.debug(" Calculated likelihood is {}".format(likelihood))

        return likelihood
    def calculate_likelihood(self, target, leading_context, following_context, calculation_context=None):
        """
        Calculate the two-way likelihood of target from whichever contexts are available.

        With both contexts present the result is the leading one-way likelihood times
        WEIGHT_LEADING_CONTEXT plus the following one-way likelihood times
        WEIGHT_FOLLOWING_CONTEXT. With only one context present, only that side's weighted
        likelihood is returned. At least one context must be non-empty (asserted).

        If calculation_context is not None it receives the context lengths, the details of
        each one-way calculation, both weights and the result under 'sum_likelihood'.
        """
        def surfaces_of(context_items):
            # surface of the first entry of every context item, or a marker for empty items
            return [item[0].get_surface() if item else "<Unparsable>" for item in context_items]

        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(u"  Calculating twoway likelihood of \n\t{0}\n\t{1}\n\t{2}".format(
                surfaces_of(leading_context),
                formatter.format_morpheme_container_for_simple_parseset(target),
                surfaces_of(following_context)))

        assert leading_context or following_context

        tracing = calculation_context is not None
        leading_details = {} if tracing else None
        following_details = {} if tracing else None

        if tracing:
            calculation_context['leading_context_length'] = len(leading_context)
            calculation_context['following_context_length'] = len(following_context)

        likelihood = None
        if leading_context:
            likelihood = self.calculate_oneway_likelihood(target, leading_context, True, leading_details) * self.WEIGHT_LEADING_CONTEXT
            if following_context:
                # both sides available: add the weighted following side
                likelihood = likelihood + self.calculate_oneway_likelihood(target, following_context, False, following_details) * self.WEIGHT_FOLLOWING_CONTEXT
        elif following_context:
            likelihood = self.calculate_oneway_likelihood(target, following_context, False, following_details) * self.WEIGHT_FOLLOWING_CONTEXT

        if tracing:
            calculation_context['leading'] = leading_details
            calculation_context['following'] = following_details
            calculation_context['weight_leading_context'] = self.WEIGHT_LEADING_CONTEXT
            calculation_context['weight_following_context'] = self.WEIGHT_FOLLOWING_CONTEXT
            calculation_context['sum_likelihood'] = likelihood

        logger.debug("  Calculated likelihood is {}".format(likelihood))

        return likelihood
 def parse_result(self, word):
     """Parse word and return the simple parseset string of every parse result, in order."""
     formatted_results = []
     for morpheme_container in self.parser.parse(word):
         formatted_results.append(
             formatter.format_morpheme_container_for_simple_parseset(morpheme_container))
     return formatted_results
    def calculate_oneway_likelihood(self, target, context, target_comes_after, calculation_context=None):
        """
        Calculate the likelihood of target given context, averaged over every combination
        of the context items' parse results.

        For each item of the cartesian product of the context parse results:
          - a 3x3 matrix of target-form-given-context counts is built from APPENDER_MATRIX,
            smoothed and divided element-wise by the smoothed context counts
            (inf and nan results are replaced with 0.0),
          - the matrix is multiplied by COEFFICIENTS_TARGET_GIVEN_CONTEXT_FORM, summed per row
            and combined with COEFFICIENTS_TARGET_FORM_GIVEN_CONTEXT into a single value,
          - the likelihood of the context sequence itself is taken from
            self._sequence_likelihood_calculator.
        The context sequence likelihoods are normalized to sum to 1 (all 0.0 when their sum
        is zero) and used as weights for summing the per-item values.

        @type target: WordFormContainer
        @type context: list of WordFormContainer
        @note: the debug logging indexes each context item (t[0]) and allows falsy items, so
               the items look like sequences of parse results - TODO confirm the declared type
        @type target_comes_after: bool
        @param calculation_context: optional dict; when not None, intermediate values of every
               cartesian product item are stored under 'possibilities' and the result under
               'sum_likelihood'
        @rtype: float
        @return: 0.0 when there are no context parse result combinations, the weighted sum otherwise
        """
        assert target
        assert context

        target_morpheme_container_str = target.format()

        if logger.isEnabledFor(logging.DEBUG):
            if target_comes_after:
                logger.debug(u"  Calculating oneway likelihood of {1}, {0}".format(formatter.format_morpheme_container_for_simple_parseset(target),
                    [t[0].get_surface() if t else "<Unparsable>" for t in context]))
            else:
                logger.debug(u"  Calculating oneway likelihood of {0}, {1}".format(formatter.format_morpheme_container_for_simple_parseset(target),
                    [t[0].get_surface() if t else "<Unparsable>" for t in context]))

        cartesian_products_of_context_parse_results = self._get_cartesian_products_of_context_parse_results(context)
        if logger.isEnabledFor(logging.DEBUG):
            # every parse result of every product item gets formatted here, so only build
            # the message when it is actually going to be emitted
            logger.debug("  Going to check the usages with the following cartesian product of parse results: \n{}".format(
                [[formatter.format_morpheme_container_for_simple_parseset_without_suffixes(mc) for mc in product_item]
                 for product_item in cartesian_products_of_context_parse_results]))

        if not cartesian_products_of_context_parse_results or not any(cartesian_products_of_context_parse_results):
            return 0.0

        if calculation_context is not None:
            calculation_context['possibilities'] = {}

        # both lists are filled in step with the cartesian product items
        target_likelihoods_for_context_parse_results = []
        context_parse_results_likelihoods = []

        for index, context_parse_results in enumerate(cartesian_products_of_context_parse_results):
            word_calc_context = None
            if calculation_context is not None:
                word_calc_context = calculation_context['possibilities'][index] = {}

            if logger.isEnabledFor(logging.DEBUG):
                context_parse_result_str_list = [formatter.format_morpheme_container_for_simple_parseset_without_suffixes(t) for t in context_parse_results]
                if target_comes_after:
                    logger.debug(u"   Calculating oneway likelihood of {1}, {0}".format(target_morpheme_container_str, context_parse_result_str_list))
                else:
                    logger.debug(u"   Calculating oneway likelihood of {0}, {1}".format(target_morpheme_container_str, context_parse_result_str_list))

            context_counts = self._get_context_form_count_matrix(context_parse_results)
            logger.debug("       Context form counts: \n{}".format(context_counts))

            smoothed_context_counts = self._smooth_context_cooccurrence_counts(context_counts, context_parse_results)
            if calculation_context is not None:
                word_calc_context['smoothed_context_counts'] = smoothed_context_counts
            logger.debug("       Smoothed context form counts: \n{}".format(smoothed_context_counts))

            if calculation_context is not None:
                word_calc_context['context_words'] = {}
                for i, context_item in enumerate(context_parse_results):
                    word_calc_context['context_words'][i] = {
                        'surface': context_item.get_surface_with_syntactic_categories(),
                        'stem': context_item.get_stem_with_syntactic_categories(),
                        'lexeme': context_item.get_lemma_root_with_syntactic_categories()
                    }

            target_form_given_context_counts = numpy.zeros((3, 3), dtype=float)

            # one count per (target appender, context appender) pair of the appender matrix
            for i, appender_matrix_row in enumerate(self.APPENDER_MATRIX):
                for j, (target_appender, context_appender) in enumerate(appender_matrix_row):
                    target_form_given_count = self._target_form_given_context_counter._count_target_form_given_context(target, context_parse_results, target_comes_after, target_appender,
                        context_appender)
                    target_form_given_context_counts[i][j] = target_form_given_count

            logger.debug("       Target form counts given context forms: \n{}".format(target_form_given_context_counts))

            if calculation_context is not None:
                word_calc_context['target_form_counts'] = target_form_given_context_counts
            logger.debug("       Target form counts: \n{}".format(target_form_given_context_counts))

            smoothed_target_form_given_context_counts = self._smooth_target_context_cooccurrence_counts(target_form_given_context_counts, target,
                context_parse_results, target_comes_after)

            if calculation_context is not None:
                word_calc_context['smoothed_target_form_counts'] = smoothed_target_form_given_context_counts
            logger.debug("       Smoothed target form counts: \n{}".format(smoothed_target_form_given_context_counts))

            # element-wise division; divisions by zero give inf or nan, which are treated as 0.0
            target_form_probabilities = smoothed_target_form_given_context_counts / smoothed_context_counts
            target_form_probabilities[numpy.isinf(target_form_probabilities)] = 0.0
            target_form_probabilities[numpy.isnan(target_form_probabilities)] = 0.0

            if calculation_context is not None:
                word_calc_context['target_form_probabilities'] = target_form_probabilities
            logger.debug("       Target form probabilities: \n{}".format(target_form_probabilities))

            target_form_probabilities = target_form_probabilities * self.COEFFICIENTS_TARGET_GIVEN_CONTEXT_FORM
            if calculation_context is not None:
                word_calc_context['coefficients_target_given_context_form'] = self.COEFFICIENTS_TARGET_GIVEN_CONTEXT_FORM
                word_calc_context['target_form_probabilities_with_context_form_weights'] = target_form_probabilities
            logger.debug("       Target form probabilities with context form weights: \n{}".format(target_form_probabilities))

            # multiplying with a column of ones sums every row
            target_form_probabilities = numpy.dot(target_form_probabilities, numpy.ones((3, 1), dtype=float))
            if calculation_context is not None:
                word_calc_context['summed_target_form_probabilities'] = target_form_probabilities
            logger.debug("       Summed target form probabilities: \n{}".format(target_form_probabilities))

            weight_summed_target_probability = numpy.dot(self.COEFFICIENTS_TARGET_FORM_GIVEN_CONTEXT, target_form_probabilities)
            assert numpy.shape(weight_summed_target_probability) == (1, 1)
            if calculation_context is not None:
                word_calc_context['coefficients_target_form_given_context'] = self.COEFFICIENTS_TARGET_FORM_GIVEN_CONTEXT
                word_calc_context['weight_summed_target_probability'] = weight_summed_target_probability
            logger.debug("       Weight-summed target probability: \n{}".format(weight_summed_target_probability))

            item_likelihood = weight_summed_target_probability[0][0]

            target_likelihoods_for_context_parse_results.append(item_likelihood)

            logger.debug("      Calculated oneway likelihood for target given context item is {}".format(item_likelihood))

            # say, target_comes_after=True, context={c1,c2} and target=t
            # until now, we looked at collocation of (c1, c2, t) and (c2,t)
            # now we look collocation of (c1,c2)
            # which makes complete sense while calculating the weight for current cartesian product item
            context_sequence_likelihood_calculation_direction = SequenceLikelihoodCalculator.HIGHEST_WEIGHT_ON_LAST if target_comes_after else SequenceLikelihoodCalculator.HIGHEST_WEIGHT_ON_FIRST
            sequence_likelihood_context = {} if calculation_context is not None else None
            context_likelihood = self._sequence_likelihood_calculator.calculate(context_parse_results, context_sequence_likelihood_calculation_direction, sequence_likelihood_context)

            if calculation_context is not None:
                word_calc_context['context_sequence_likelihood'] = sequence_likelihood_context

            context_parse_results_likelihoods.append(context_likelihood)

            logger.debug("      Context likelihood is {}".format(context_likelihood))

        likelihood = 0.0

        # normalize but don't smooth. weights are already smoothed
        total_context_parse_results_weights = sum(context_parse_results_likelihoods)
        if total_context_parse_results_weights:
            normalized_context_parse_results_weights = [context_parse_results_item_weight/total_context_parse_results_weights for context_parse_results_item_weight in context_parse_results_likelihoods]
        else:
            # nothing to normalize with: every combination gets a zero weight
            normalized_context_parse_results_weights = [0.0 for context_parse_results_item_weight in context_parse_results_likelihoods]
        logger.debug("     Normalized context parse results weights are {}".format(normalized_context_parse_results_weights))

        for index, context_parse_results in enumerate(cartesian_products_of_context_parse_results):
            target_likelihood_for_context_parse_results_item = target_likelihoods_for_context_parse_results[index]
            context_parse_results_item_likelihood = normalized_context_parse_results_weights[index]
            weighted_parse_result_possibility_likelihood = context_parse_results_item_likelihood * target_likelihood_for_context_parse_results_item
            likelihood += weighted_parse_result_possibility_likelihood
            if calculation_context is not None:
                word_calc_context = calculation_context['possibilities'][index]
                word_calc_context['context_likelihood'] = context_parse_results_item_likelihood
                word_calc_context['weighted_parse_result_possibility_likelihood'] = weighted_parse_result_possibility_likelihood

            logger.debug("      Weighted context parse result likelihood is {} for context : {}".format(weighted_parse_result_possibility_likelihood, context_parse_results))

        if calculation_context is not None:
            calculation_context['sum_likelihood'] = likelihood
        logger.debug("  Calculated oneway likelihood is {}".format(likelihood))

        return likelihood
示例#10
0
    def test_should_format_for_simple_parseset(self):
        """
        Verify the simple parseset rendering of the first parse result of two sample words.
        """
        def render_first_parse(surface):
            # only the first parse result of the word is checked
            first_parse = self.parser.parse(surface)[0]
            return formatter.format_morpheme_container_for_simple_parseset(first_parse)

        assert_that(
            render_first_parse(u'kitaba'),
            equal_to(u'(1,"kitap+Noun+A3sg+Pnon+Dat")'))

        assert_that(
            render_first_parse(u'yaptırtmayı'),
            equal_to(u'(1,"yap+Verb")(2,"Verb+Caus")(3,"Verb+Caus+Pos")(4,"Noun+Inf+A3sg+Pnon+Acc")'))
示例#11
0
    def calculate_oneway_likelihood(self, target, context, target_comes_after, calculation_context=None):
        """
        Calculate the likelihood of target given context from co-occurrence counts.

        A first count (made with _context_word_appender) is used as the divisor; when it is
        zero the result is 0.0. Otherwise one count per row of APPENDER_MATRIX is collected,
        divided by that first count, multiplied by COEFFICIENTS_TARGET_GIVEN_CONTEXT_FORM,
        summed per row and combined with COEFFICIENTS_TARGET_FORM_GIVEN_CONTEXT into a
        single value.

        @type target: WordFormContainer
        @type context: list of WordFormContainer
        @type target_comes_after: bool
        @type calculation_context : dict or None
        @rtype: float
        """
        assert target
        assert context

        tracing = calculation_context is not None

        if logger.isEnabledFor(logging.DEBUG):
            # target and context are shown in the order in which they occur
            if target_comes_after:
                message = "  Calculating oneway likelihood of {1}, {0}"
            else:
                message = "  Calculating oneway likelihood of {0}, {1}"
            logger.debug(message.format(formatter.format_morpheme_container_for_simple_parseset(target), context))

        context_occurrence_count = self._count_target_form_given_context(target, context, False, None, _context_word_appender)

        if not context_occurrence_count:
            return 0.0

        if tracing:
            for position, context_item in enumerate(context):
                calculation_context[position] = {'surface': context_item}

        form_counts = numpy.zeros((3, 1), dtype=float)

        for row_index, (target_appender, context_appender) in enumerate(self.APPENDER_MATRIX):
            form_counts[row_index] = self._count_target_form_given_context(
                target, context, target_comes_after, target_appender, context_appender)

        logger.debug("    Target form counts given context forms: \n{}".format(form_counts))
        logger.debug("    Found {} context occurrences".format(context_occurrence_count))

        probabilities = form_counts / context_occurrence_count
        # neutralize non-finite results of the division
        probabilities[numpy.isinf(probabilities)] = 0.0
        probabilities[numpy.isnan(probabilities)] = 0.0

        if tracing:
            calculation_context['target_form_probabilities'] = probabilities
        logger.debug("    Target form probabilities: \n{}".format(probabilities))

        weighted_probabilities = probabilities * self.COEFFICIENTS_TARGET_GIVEN_CONTEXT_FORM
        if tracing:
            calculation_context['coefficients_target_given_context_form'] = self.COEFFICIENTS_TARGET_GIVEN_CONTEXT_FORM
            calculation_context['target_form_probabilities_with_context_form_weights'] = weighted_probabilities
        logger.debug("    Target form probabilities with context form weights: \n{}".format(weighted_probabilities))

        # multiplying with a column of ones sums every row
        row_sums = numpy.dot(weighted_probabilities, numpy.ones((3, 1), dtype=float))
        if tracing:
            calculation_context['summed_target_form_probabilities'] = row_sums
        logger.debug("    Summed target form probabilities: \n{}".format(row_sums))

        weight_summed = numpy.dot(self.COEFFICIENTS_TARGET_FORM_GIVEN_CONTEXT, row_sums)
        assert numpy.shape(weight_summed) == (1, 1)
        if tracing:
            calculation_context['coefficients_target_form_given_context'] = self.COEFFICIENTS_TARGET_FORM_GIVEN_CONTEXT
            calculation_context['weight_summed_target_probability'] = weight_summed
        logger.debug("    Weight-summed target probability: \n{}".format(weight_summed))

        likelihood = weight_summed[0][0]

        logger.debug("  Calculated oneway likelihood is {}".format(likelihood))

        return likelihood
 def parse_result(self, word):
     """Return the simple parseset strings of all parse results of word."""
     morpheme_containers = self.parser.parse(word)
     return [
         formatter.format_morpheme_container_for_simple_parseset(morpheme_container)
         for morpheme_container in morpheme_containers
     ]