Example #1
0
    def parse(self, input):
        """Parse *input*, additionally trying a lower-cased first letter.

        Delegates to the superclass parser; if the word starts with an
        upper-case letter, results for the variant whose first letter is
        lowered (Turkish-aware via TurkishAlphabet.lower) are appended too.

        @type input: str or unicode
        @rtype: list
        """
        parse_results = super(
            UpperCaseSupportingContextlessMorphologicalParser,
            self).parse(input)
        # Bug fix: guard against empty input before touching input[0],
        # which previously raised IndexError for "".
        if input and input[0].isupper():
            parse_results += super(
                UpperCaseSupportingContextlessMorphologicalParser,
                self).parse(TurkishAlphabet.lower(input[0]) + input[1:])

        return parse_results
    def save_parse_result_for_word(self, word_id, parse_result_uuid):
        """Persist the chosen parse result for a word in the database.

        @type word_id: ObjectId
        @type parse_result_uuid: str or unicode
        @raise Exception: if no word exists for *word_id*
        """
        parse_result = self.sessionmanager.get_parse_result(parse_result_uuid)
        # Bug fix: the message previously formatted the (falsy) parse_result
        # object instead of the uuid that failed to resolve.
        assert parse_result, "No parse result found with id {}".format(parse_result_uuid)

        word = self.dbmanager.get_word(word_id)
        if not word:
            raise Exception("Word not found for setting the correct parse result! {}".format(word_id))

        # check if the parse result belongs to the given word
        assert word['surface'] == parse_result.get_surface() or TurkishAlphabet.lower(word['surface']) == parse_result.get_surface()

        self.dbmanager.set_parse_result_for_word(word, formatter.format_morpheme_container_for_parseset(parse_result), parse_result)
Example #3
0
File: creator.py  Project: aliok/trnltk
    def create_word_binding_from_morpheme_container(self, word_str, morpheme_container):
        """Build a WordBinding (root binding + suffix bindings) from a parsed
        morpheme container.

        @type word_str: str or unicode
        @rtype: WordBinding
        """
        # The surface must match the input word, optionally with its first
        # letter lowered (Turkish-aware) for sentence-initial words.
        assert (word_str == morpheme_container.get_surface_so_far()) or (TurkishAlphabet.lower(word_str[0])+word_str[1:] == morpheme_container.get_surface_so_far())

        # Hoist the repeated get_root()/lexeme attribute chains.
        root_obj = morpheme_container.get_root()
        lexeme = root_obj.lexeme
        root_str = root_obj.str
        root = RootBinding(root_str, lexeme.lemma, lexeme.root,
                           lexeme.syntactic_category,
                           lexeme.secondary_syntactic_category)

        word_syntactic_category = morpheme_container.get_surface_syntactic_category()
        word_secondary_syntactic_category = morpheme_container.get_surface_secondary_syntactic_category()

        parse_result = formatter.format_morpheme_container_for_parseset(morpheme_container)
        word = WordBinding(word_str, parse_result, root, word_syntactic_category, word_secondary_syntactic_category)

        if morpheme_container.get_transitions():
            so_far = root_str
            for transition in morpheme_container.get_transitions():
                application = transition.suffix_form_application
                suffix_form = application.suffix_form
                # Free transitions carry no visible suffix; skip them.
                if isinstance(suffix_form.suffix, FreeTransitionSuffix):
                    continue

                suffix_application = application.fitting_suffix_form
                suffix_actual_application = application.actual_suffix_form
                # While the application is still part of the root itself,
                # base the applied word on the lexeme root instead of the
                # surface accumulated so far.
                if (so_far + suffix_actual_application) == root_str:
                    word_with_suffix_application = lexeme.root + suffix_application
                else:
                    word_with_suffix_application = so_far + suffix_application
                so_far += suffix_actual_application

                # Both binding types take identical constructor arguments;
                # pick the class and build the binding once (removes the
                # duplicated branch bodies and the dead None assignment).
                binding_class = (DerivationalSuffixBinding
                                 if transition.is_derivational()
                                 else InflectionalSuffixBinding)
                suffix = binding_class(
                    suffix_form.suffix.name, suffix_form.suffix.pretty_name,
                    suffix_form.form, suffix_application,
                    suffix_actual_application, word_with_suffix_application,
                    so_far, transition.to_state.syntactic_category)
                word.suffixes.append(suffix)
        return word
    def save_parse_result_for_word(self, word_id, parse_result_uuid):
        """Persist the chosen parse result for a word in the database.

        @type word_id: ObjectId
        @type parse_result_uuid: str or unicode
        @raise Exception: if no word exists for *word_id*
        """
        parse_result = self.sessionmanager.get_parse_result(parse_result_uuid)
        # Bug fix: the message previously formatted the (falsy) parse_result
        # object rather than the uuid that failed to resolve.
        assert parse_result, "No parse result found with id {}".format(
            parse_result_uuid)

        word = self.dbmanager.get_word(word_id)
        if not word:
            raise Exception(
                "Word not found for setting the correct parse result! {}".
                format(word_id))

        # check if the parse result belongs to the given word
        assert word['surface'] == parse_result.get_surface(
        ) or TurkishAlphabet.lower(
            word['surface']) == parse_result.get_surface()

        self.dbmanager.set_parse_result_for_word(
            word,
            formatter.format_morpheme_container_for_parseset(parse_result),
            parse_result)
    def _test_should_parse_simple_parse_set(self, set_number, start_index=0):
        """Parse every word of a simple parse set resource file and assert
        each parse, logging statistics in STATS_MODE.

        @type set_number: int or str - substituted into the resource filename
        @type start_index: int - file lines before this index are skipped
        """
        path = os.path.join(
            os.path.dirname(__file__),
            '../../../../testresources/simpleparsesets/simpleparseset{}.txt'.
            format(set_number))
        logger.info("Parsing simple parse set {}".format(path))
        skipped = 0
        unparsable = 0
        comment = 0
        # Normalizations applied (in order) to expected parse results before
        # comparison: numbered markers collapsed, postposition tags remapped.
        replacements = [
            ('Prog1', 'Prog'), ('Prog2', 'Prog'),
            ('Inf1', 'Inf'), ('Inf2', 'Inf'), ('Inf3', 'Inf'),
            ('WithoutHavingDoneSo1', 'WithoutHavingDoneSo'),
            ('WithoutHavingDoneSo2', 'WithoutHavingDoneSo'),
            ('Hastily', 'Hastily+Pos'),
            ('Postp+PCNom', 'Part'),
            ('Postp+PCDat', 'Postp'), ('Postp+PCAcc', 'Postp'),
            ('Postp+PCLoc', 'Postp'), ('Postp+PCAbl', 'Postp'),
            ('Postp+PCIns', 'Postp'), ('Postp+PCGen', 'Postp'),
        ]
        with codecs.open(path, 'r', 'utf-8-sig') as parse_set_file:
            index = 0
            for line in parse_set_file:
                if start_index > index:
                    index += 1
                    continue

                if line.startswith('#'):
                    comment += 1
                    index += 1
                    continue

                line = line.strip()
                (word, parse_result) = line.split('=')
                # any() accepts a generator; no need to build a list first.
                if any(case_to_skip in parse_result
                       for case_to_skip in cases_to_skip) or word in words_to_skip:
                    if self.LOG_SKIPPED:
                        logger.info(u'Skipped : {} {} {}'.format(
                            index, word, parse_result))
                    skipped += 1
                    index += 1
                    continue

                for old, new in replacements:
                    parse_result = parse_result.replace(old, new)

                if self.STATS_MODE:
                    try:
                        self.assert_parse_correct(word, index, parse_result)
                    except Exception:
                        unparsable += 1
                        logger.info(u'Unparsable : {} {} {}'.format(
                            index, word, parse_result))
                else:
                    self.assert_parse_correct(TurkishAlphabet.lower(word),
                                              index, parse_result)

                index += 1

        if self.STATS_MODE:
            logger.info("Finished simple parse set {}".format(path))
            logger.info("Found {} lines, with {} lines of comments".format(
                index, comment))
            logger.info("Skipped {}, unparsable {}".format(
                skipped, unparsable))
            logger.info("Words that should be parsable : {}".format(index -
                                                                    comment))
            # NOTE(review): raises ZeroDivisionError when the file contains
            # only comments (index == comment) - same as original behavior.
            logger.info("Parse success rate : {}".format(
                float(index - comment - skipped - unparsable) /
                float(index - comment)))
Example #6
0
    def create_word_binding_from_morpheme_container(self, word_str,
                                                    morpheme_container):
        """Build a WordBinding (root binding + suffix bindings) from a
        parsed morpheme container.

        @type word_str: str or unicode
        @rtype: WordBinding
        """
        # The surface must match the input word, optionally with its first
        # letter lowered (Turkish-aware) for sentence-initial words.
        assert (word_str == morpheme_container.get_surface_so_far()) or (
            TurkishAlphabet.lower(word_str[0]) + word_str[1:]
            == morpheme_container.get_surface_so_far())

        # Hoist the repeated get_root()/lexeme attribute chains.
        root_obj = morpheme_container.get_root()
        lexeme = root_obj.lexeme
        root_str = root_obj.str
        root = RootBinding(root_str, lexeme.lemma, lexeme.root,
                           lexeme.syntactic_category,
                           lexeme.secondary_syntactic_category)

        word_syntactic_category = morpheme_container.get_surface_syntactic_category(
        )
        word_secondary_syntactic_category = morpheme_container.get_surface_secondary_syntactic_category(
        )

        parse_result = formatter.format_morpheme_container_for_parseset(
            morpheme_container)
        word = WordBinding(word_str, parse_result, root,
                           word_syntactic_category,
                           word_secondary_syntactic_category)

        if morpheme_container.get_transitions():
            so_far = root_str
            for transition in morpheme_container.get_transitions():
                application = transition.suffix_form_application
                suffix_form = application.suffix_form
                # Free transitions carry no visible suffix; skip them.
                if isinstance(suffix_form.suffix, FreeTransitionSuffix):
                    continue

                suffix_application = application.fitting_suffix_form
                suffix_actual_application = application.actual_suffix_form
                # While the application is still part of the root itself,
                # base the applied word on the lexeme root instead of the
                # surface accumulated so far.
                if (so_far + suffix_actual_application) == root_str:
                    word_with_suffix_application = lexeme.root + suffix_application
                else:
                    word_with_suffix_application = so_far + suffix_application
                so_far += suffix_actual_application

                # Both binding types take identical constructor arguments;
                # pick the class and build the binding once (removes the
                # duplicated branch bodies and the dead None assignment).
                binding_class = (DerivationalSuffixBinding
                                 if transition.is_derivational()
                                 else InflectionalSuffixBinding)
                suffix = binding_class(
                    suffix_form.suffix.name, suffix_form.suffix.pretty_name,
                    suffix_form.form, suffix_application,
                    suffix_actual_application, word_with_suffix_application,
                    so_far, transition.to_state.syntactic_category)
                word.suffixes.append(suffix)
        return word
Example #7
0
File: parser.py  Project: aliok/trnltk
    def parse(self, input):
        """Parse *input*; for words with an upper-case first letter, also
        parse the Turkish-lowered variant and append those results.

        @type input: str or unicode
        @rtype: list
        """
        parse_results = super(UpperCaseSupportingContextlessMorphologicalParser, self).parse(input)
        # Bug fix: guard against empty input before touching input[0],
        # which previously raised IndexError for "".
        if input and input[0].isupper():
            parse_results += super(UpperCaseSupportingContextlessMorphologicalParser, self).parse(TurkishAlphabet.lower(input[0]) + input[1:])

        return parse_results
    def _test_should_parse_simple_parse_set(self, set_number, start_index=0):
        """Parse every word of a simple parse set resource file and assert
        each parse, logging statistics in STATS_MODE.

        @type set_number: int or str - substituted into the resource filename
        @type start_index: int - file lines before this index are skipped
        """
        path = os.path.join(
            os.path.dirname(__file__),
            "../../../../testresources/simpleparsesets/simpleparseset{}.txt".format(set_number),
        )
        logger.info("Parsing simple parse set {}".format(path))
        skipped = 0
        unparsable = 0
        comment = 0
        # Normalizations applied (in order) to expected parse results before
        # comparison: numbered markers collapsed, postposition tags remapped.
        replacements = [
            ("Prog1", "Prog"), ("Prog2", "Prog"),
            ("Inf1", "Inf"), ("Inf2", "Inf"), ("Inf3", "Inf"),
            ("WithoutHavingDoneSo1", "WithoutHavingDoneSo"),
            ("WithoutHavingDoneSo2", "WithoutHavingDoneSo"),
            ("Hastily", "Hastily+Pos"),
            ("Postp+PCNom", "Part"),
            ("Postp+PCDat", "Postp"), ("Postp+PCAcc", "Postp"),
            ("Postp+PCLoc", "Postp"), ("Postp+PCAbl", "Postp"),
            ("Postp+PCIns", "Postp"), ("Postp+PCGen", "Postp"),
        ]
        with codecs.open(path, "r", "utf-8-sig") as parse_set_file:
            index = 0
            for line in parse_set_file:
                if start_index > index:
                    index += 1
                    continue

                if line.startswith("#"):
                    comment += 1
                    index += 1
                    continue

                line = line.strip()
                (word, parse_result) = line.split("=")
                # any() accepts a generator; building a list first is wasteful.
                if any(case_to_skip in parse_result for case_to_skip in cases_to_skip) or word in words_to_skip:
                    if self.LOG_SKIPPED:
                        logger.info("Skipped : {} {} {}".format(index, word, parse_result))
                    skipped += 1
                    index += 1
                    continue

                for old, new in replacements:
                    parse_result = parse_result.replace(old, new)

                if self.STATS_MODE:
                    try:
                        self.assert_parse_correct(word, index, parse_result)
                    except Exception:
                        unparsable += 1
                        logger.info("Unparsable : {} {} {}".format(index, word, parse_result))
                else:
                    self.assert_parse_correct(TurkishAlphabet.lower(word), index, parse_result)

                index += 1

        if self.STATS_MODE:
            logger.info("Finished simple parse set {}".format(path))
            logger.info("Found {} lines, with {} lines of comments".format(index, comment))
            logger.info("Skipped {}, unparsable {}".format(skipped, unparsable))
            logger.info("Words that should be parsable : {}".format(index - comment))
            # NOTE(review): raises ZeroDivisionError when the file contains
            # only comments (index == comment) - same as original behavior.
            logger.info(
                "Parse success rate : {}".format(float(index - comment - skipped - unparsable) / float(index - comment))
            )