Example #1
    def test_should_not_set_category_and_lemma_if_category_is_set_already(self):
        item_org = Lexeme(u'elma', u'elma', SyntacticCategory.NOUN, None, None)
        item_clone = item_org.clone()
        LexiconLoader._set_category_and_lemma(item_clone)
        assert_that(item_org, equal_to(item_clone))

        item_org = Lexeme(u'mavi', u'mavi', SyntacticCategory.ADJECTIVE, None, None)
        item_clone = item_org.clone()
        LexiconLoader._set_category_and_lemma(item_clone)
        assert_that(item_org, equal_to(item_clone))

        item_org = Lexeme(u'aha', u'aha', SyntacticCategory.INTERJECTION, None, None)
        item_clone = item_org.clone()
        LexiconLoader._set_category_and_lemma(item_clone)
        assert_that(item_org, equal_to(item_clone))

        item_org = Lexeme(u'yemek', u'yemek', SyntacticCategory.NOUN, None, None)
        item_clone = item_org.clone()
        LexiconLoader._set_category_and_lemma(item_clone)
        assert_that(item_org, equal_to(item_clone))

        item_org = Lexeme(u'tokmak', u'tokmak', SyntacticCategory.NOUN, None, None)
        item_clone = item_org.clone()
        LexiconLoader._set_category_and_lemma(item_clone)
        assert_that(item_org, equal_to(item_clone))
Example #2
    def test_should_set_category_and_lemma_for_verbs(self):
        item = Lexeme(u'yemek', u'yemek', None, None, None)
        LexiconLoader._set_category_and_lemma(item)
        assert_that(item, equal_to(Lexeme(u'yemek', u'ye', SyntacticCategory.VERB, None, None)))

        item = Lexeme(u'elemek', u'elemek', None, None, None)
        LexiconLoader._set_category_and_lemma(item)
        assert_that(item, equal_to(Lexeme(u'elemek', u'ele', SyntacticCategory.VERB, None, None)))
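Taken together with Examples #1 and #25, these tests pin down the loader's defaulting rule: an entry whose category is already set is left untouched, an entry ending in -mek/-mak is treated as a verb infinitive with the suffix stripped, and everything else defaults to a noun. A minimal sketch of that rule, assuming the field names from Example #17 (the mapping of root = dictionary form, lemma = stem is inferred from Example #24's R: entries) and assuming this is all _set_category_and_lemma does:

# Hypothetical re-implementation of the behaviour the tests above exercise.
def set_category_and_lemma(lexeme):
    if lexeme.syntactic_category is not None:
        return  # Example #1: an explicit category is never overridden
    if lexeme.root.endswith(u'mek') or lexeme.root.endswith(u'mak'):
        lexeme.syntactic_category = SyntacticCategory.VERB
        lexeme.lemma = lexeme.root[:-3]   # u'yemek' -> u'ye'
    else:
        lexeme.syntactic_category = SyntacticCategory.NOUN  # Example #25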
Example #3
    def setUpClass(cls):
        super(_LikelihoodCalculatorTest, cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../../../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

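        # The suffix graphs nest decorator-style: each constructor wraps and extends the graph it is given.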
        suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(cls.root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        cls.contextless_parser = ContextlessMorphologicalParser(suffix_graph, predefined_paths,
            [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
             proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

        cls.mongodb_connection = pymongo.Connection(host='127.0.0.1')
        cls.collection_map = {
            1: cls.mongodb_connection['trnltk']['wordUnigrams999'],
            2: cls.mongodb_connection['trnltk']['wordBigrams999'],
            3: cls.mongodb_connection['trnltk']['wordTrigrams999']
        }

        cls.generator = None
Example #4
    def setUpClass(cls):
        super(MorphemeContainerContextlessProbabilityGeneratorWithContainersTest, cls).setUpClass()
        all_roots = []

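        # Inline lexicon: one entry per line; [P:...] sets the syntactic category,
        # [A:...] lists lexeme attributes (Examples #22 and #24 show the full format).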
        lexicon_lines = u'''
            duvar
            tutku
            saç
            oğul [A:LastVowelDrop]
            demek [A:RootChange, Passive_In, Passive_InIl]
            bu [P:Det]
        '''.strip().splitlines()

        lexemes = LexiconLoader.load_from_lines(lexicon_lines)
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = BasicSuffixGraph()
        suffix_graph.initialize()

        word_root_finder = WordRootFinder(cls.root_map)

        cls.contextless_parser = ContextlessMorphologicalParser(suffix_graph, None,
            [word_root_finder])
Example #5
    def setUp(self):
        self.parseset_creator = ParseSetCreator()

        all_roots = []

        lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map = RootMapGenerator().generate(all_roots)

        suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        self.parser = UpperCaseSupportingContextlessMorphologicalParser(suffix_graph, predefined_paths,
            [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder, proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])
Example #6
    def setUpClass(cls):
        super(
            MorphemeContainerContextlessProbabilityGeneratorWithContainersTest,
            cls).setUpClass()
        all_roots = []

        lexicon_lines = u'''
            duvar
            tutku
            saç
            oğul [A:LastVowelDrop]
            demek [A:RootChange, Passive_In, Passive_InIl]
            bu [P:Det]
        '''.strip().splitlines()

        lexemes = LexiconLoader.load_from_lines(lexicon_lines)
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = BasicSuffixGraph()
        suffix_graph.initialize()

        word_root_finder = WordRootFinder(cls.root_map)

        cls.contextless_parser = ContextlessMorphologicalParser(
            suffix_graph, None, [word_root_finder])
Example #7
    def setUpClass(cls):
        super(TransitionGeneratorTest, cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(cls.root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        cls.parser = ContextlessMorphologicalParser(suffix_graph, predefined_paths,
            [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
             proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

        cls.transition_generator = TransitionGenerator(cls.parser)
Example #8
    def setUpClass(cls):
        super(InterpolatingLikelihoodCalculatorCalculationContextTest,
              cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(
            os.path.join(os.path.dirname(__file__),
                         '../../../../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = CopulaSuffixGraph(
            NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(cls.root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        cls.contextless_parser = UpperCaseSupportingContextlessMorphologicalParser(
            suffix_graph, predefined_paths, [
                word_root_finder, digit_numeral_root_finder,
                text_numeral_root_finder,
                proper_noun_from_apostrophe_root_finder,
                proper_noun_without_apostrophe_root_finder
            ])

        mongodb_connection = pymongo.Connection(host='127.0.0.1')
        cls.collection_map = {
            1: mongodb_connection['trnltk']['wordUnigrams999'],
            2: mongodb_connection['trnltk']['wordBigrams999'],
            3: mongodb_connection['trnltk']['wordTrigrams999']
        }

        database_index_builder = DatabaseIndexBuilder(cls.collection_map)
        target_form_given_context_counter = InMemoryCachingTargetFormGivenContextCounter(
            cls.collection_map)
        ngram_frequency_smoother = CachedSimpleGoodTuringNGramFrequencySmoother()
        sequence_likelihood_calculator = UniformSequenceLikelihoodCalculator()

        wrapped_generator = ContextParsingLikelihoodCalculator(
            database_index_builder, target_form_given_context_counter,
            ngram_frequency_smoother, sequence_likelihood_calculator)

        cls.generator = InterpolatingLikelihoodCalculator(wrapped_generator)
Example #9
    def setUpClass(cls):
        super(ParserTestWithExtendedGraph, cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))


        cls._org_root_map = RootMapGenerator().generate(all_roots)
Example #10
    def setUpClass(cls):
        super(FormatterTest, cls).setUpClass()
        all_roots = []

        dictionary_content = ["kitap", "yapmak"]
        lexemes = LexiconLoader.load_from_lines(dictionary_content)
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        cls.root_map = RootMapGenerator().generate(all_roots)
Example #12
    def create(cls, master_dictionary_path, ngram_collection_map):
        """
        @type master_dictionary_path: str or unicode
        @type ngram_collection_map: dict of (int, Collection)
        @rtype: ContextfulMorphologicalParser
        """
        all_roots = []

        lexemes = LexiconLoader.load_from_file(master_dictionary_path)
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        root_map = root_map_generator.generate(all_roots)

        suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        contextless_parser = UpperCaseSupportingContextlessMorphologicalParser(suffix_graph, predefined_paths,
            [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
             proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

        database_index_builder = DatabaseIndexBuilder(ngram_collection_map)
        target_form_given_context_counter = InMemoryCachingTargetFormGivenContextCounter(ngram_collection_map)
        ngram_frequency_smoother = CachedSimpleGoodTuringNGramFrequencySmoother()
        sequence_likelihood_calculator = SequenceLikelihoodCalculator(None)

        collocation_metric_calculator = ContextParsingLikelihoodCalculator(database_index_builder,
            target_form_given_context_counter, ngram_frequency_smoother,
            sequence_likelihood_calculator)

        interpolating_collocation_metric_calculator = InterpolatingLikelihoodCalculator(collocation_metric_calculator)

        cached_contextless_distribution_smoother = CachedContextlessDistributionSmoother()
        contextless_distribution_metric_calculator = ContextlessDistributionCalculator(database_index_builder,
            target_form_given_context_counter, cached_contextless_distribution_smoother)

        contextful_likelihood_calculator = ContextfulLikelihoodCalculator(interpolating_collocation_metric_calculator,
            contextless_distribution_metric_calculator)

        sequence_likelihood_calculator._contextful_likelihood_calculator = contextful_likelihood_calculator

        contextful_morphological_parser = ContextfulMorphologicalParser(contextless_parser,
            contextful_likelihood_calculator)

        return contextful_morphological_parser
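A possible call site for the create() classmethod above, reusing the pymongo collection-map pattern from Examples #8 and #13; the owning factory class name and the parseset index are assumptions:

# Hypothetical usage sketch; ParserFactory stands in for whatever class hosts create().
mongodb_connection = pymongo.Connection(host='127.0.0.1')
ngram_collection_map = {
    1: mongodb_connection['trnltk']['wordUnigrams001'],
    2: mongodb_connection['trnltk']['wordBigrams001'],
    3: mongodb_connection['trnltk']['wordTrigrams001']
}
parser = ParserFactory.create('resources/master_dictionary.txt', ngram_collection_map)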
Example #13
    def setUpClass(cls):
        super(StatisticalParserTest, cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(
            os.path.join(os.path.dirname(__file__),
                         '../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(
            BasicSuffixGraph()))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(cls.root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        contextless_parser = ContextlessMorphologicalParser(
            suffix_graph, predefined_paths, [
                word_root_finder, digit_numeral_root_finder,
                text_numeral_root_finder,
                proper_noun_from_apostrophe_root_finder,
                proper_noun_without_apostrophe_root_finder
            ])

        parseset_index = "001"
        dom = parse(
            os.path.join(
                os.path.dirname(__file__),
                '../../testresources/parsesets/parseset{}.xml'.format(
                    parseset_index)))
        parseset = ParseSetBinding.build(
            dom.getElementsByTagName("parseset")[0])
        parse_set_word_list = []
        for sentence in parseset.sentences:
            parse_set_word_list.extend(sentence.words)

        complete_word_concordance_index = CompleteWordConcordanceIndex(
            parse_set_word_list)

        cls.parser = StatisticalParser(contextless_parser,
                                       complete_word_concordance_index)
Example #14
    def create_calculator(cls, parseset_index):
        all_roots = []

        lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(cls.root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        cls.contextless_parser = UpperCaseSupportingContextlessMorphologicalParser(suffix_graph, predefined_paths,
            [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
             proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

        mongodb_connection = pymongo.Connection(host='127.0.0.1')
        collection_map = {
            1: mongodb_connection['trnltk']['wordUnigrams{}'.format(parseset_index)],
            2: mongodb_connection['trnltk']['wordBigrams{}'.format(parseset_index)],
            3: mongodb_connection['trnltk']['wordTrigrams{}'.format(parseset_index)]
        }

        database_index_builder = DatabaseIndexBuilder(collection_map)
        target_form_given_context_counter = InMemoryCachingTargetFormGivenContextCounter(collection_map)
        ngram_frequency_smoother = CachedSimpleGoodTuringNGramFrequencySmoother()
        sequence_likelihood_calculator = SequenceLikelihoodCalculator(None)

        collocation_metric_calculator = ContextParsingLikelihoodCalculator(database_index_builder, target_form_given_context_counter, ngram_frequency_smoother,
            sequence_likelihood_calculator)

        interpolating_collocation_metric_calculator = InterpolatingLikelihoodCalculator(collocation_metric_calculator)

        contextless_distribution_metric_calculator = ContextlessDistributionCalculator(database_index_builder, target_form_given_context_counter)

        contextful_likelihood_calculator = ContextfulLikelihoodCalculator(interpolating_collocation_metric_calculator, contextless_distribution_metric_calculator)

        sequence_likelihood_calculator._contextful_likelihood_calculator = contextful_likelihood_calculator

        return contextful_likelihood_calculator
Example #15
    def setUpClass(cls):
        super(PredefinedPathsTest, cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        cls.morpheme_container_map = {}

        cls.suffix_graph = BasicSuffixGraph()
        cls.suffix_graph.initialize()
Example #16
    def setUpClass(cls):
        super(PredefinedPathsTest, cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(
            os.path.join(os.path.dirname(__file__),
                         '../../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        cls.morpheme_container_map = {}

        cls.suffix_graph = BasicSuffixGraph()
        cls.suffix_graph.initialize()
Example #17
    def test_should_validate_master_dict(self):
        path = os.path.join(os.path.dirname(__file__), '../../../resources/master_dictionary.txt')

        items = LexiconLoader.load_from_file(path)
        assert_that(len(items)>0, equal_to(True))
        for item in items:
            assert_that(item.lemma, not_none(), str(item))
            assert_that(item.root, not_none(), str(item))
            assert_that(item.syntactic_category, not_none(), str(item))
            assert_that(SyntacticCategory.ALL, has_item(item.syntactic_category), str(item))

            if item.secondary_syntactic_category:
                assert_that(SecondarySyntacticCategory.ALL, has_item(item.secondary_syntactic_category), str(item))

            if item.attributes:
                for attr in item.attributes:
                    assert_that(LexemeAttribute.ALL, has_item(attr), str(item))
Example #18
    def setUpClass(cls):
        super(_LikelihoodCalculatorTest, cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(
            os.path.join(os.path.dirname(__file__),
                         '../../../../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = CopulaSuffixGraph(
            NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(cls.root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        cls.contextless_parser = ContextlessMorphologicalParser(
            suffix_graph, predefined_paths, [
                word_root_finder, digit_numeral_root_finder,
                text_numeral_root_finder,
                proper_noun_from_apostrophe_root_finder,
                proper_noun_without_apostrophe_root_finder
            ])

        cls.mongodb_connection = pymongo.Connection(host='127.0.0.1')
        cls.collection_map = {
            1: cls.mongodb_connection['trnltk']['wordUnigrams999'],
            2: cls.mongodb_connection['trnltk']['wordBigrams999'],
            3: cls.mongodb_connection['trnltk']['wordTrigrams999']
        }

        cls.generator = None
Example #19
    def setUpClass(cls):
        all_roots = []

        lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../../../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(cls.root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        cls.contextless_parser = ContextlessMorphologicalParser(suffix_graph, predefined_paths,
            [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
             proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

        mongodb_connection = pymongo.Connection(host='127.0.0.1')
        collection_map = {
            1: mongodb_connection['trnltk']['wordUnigrams{}'.format(cls.parseset_index)]
        }

        database_index_builder = DatabaseIndexBuilder(collection_map)
        target_form_given_context_counter = TargetFormGivenContextCounter(collection_map)
        smoother = CachedContextlessDistributionSmoother()
        smoother.initialize()

        cls.calculator = ContextlessDistributionCalculator(database_index_builder, target_form_given_context_counter, smoother)
        cls.calculator.build_indexes()
Example #20
    def setUpClass(cls):
        super(TransitionGeneratorTest, cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(
            os.path.join(os.path.dirname(__file__),
                         '../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = CopulaSuffixGraph(
            NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(cls.root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        cls.parser = ContextlessMorphologicalParser(
            suffix_graph, predefined_paths, [
                word_root_finder, digit_numeral_root_finder,
                text_numeral_root_finder,
                proper_noun_from_apostrophe_root_finder,
                proper_noun_without_apostrophe_root_finder
            ])

        cls.transition_generator = TransitionGenerator(cls.parser)
Example #21
    def setUpClass(cls):
        super(StatisticalParserTest, cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))


        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(BasicSuffixGraph()))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(cls.root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        contextless_parser = ContextlessMorphologicalParser(suffix_graph, predefined_paths,
            [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder, proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

        parseset_index = "001"
        dom = parse(os.path.join(os.path.dirname(__file__), '../../testresources/parsesets/parseset{}.xml'.format(parseset_index)))
        parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])
        parse_set_word_list = []
        for sentence in parseset.sentences:
            parse_set_word_list.extend(sentence.words)

        complete_word_concordance_index = CompleteWordConcordanceIndex(parse_set_word_list)

        cls.parser = StatisticalParser(contextless_parser, complete_word_concordance_index)
Example #22
    def test_should_load_lexicon_from_str(self):
        dictionary_content = u'''
            a [P:Interj]
            aba [P:Adj]
            abadî
            abat [P:Adj; A:NoVoicing]
            Abdal
            abdest [A:NoVoicing]
            abes [P:Adj]
            abes [P:Adv]
            ablak [P:Adj; A:NoVoicing]
            abuk [P:Adj, Dup;A:NoVoicing, NoSuffix]
            acemborusu [A:CompoundP3sg; R:acemboru]
            acembuselik
            aciz [A:LastVowelDrop]
            âciz [P:Adj]
            açık [P:Adj]
            ad
            ad [P:Noun; A:Doubling, InverseHarmony]
            addetmek [A:Voicing, Aorist_A]
            addolmak [A:Causative_dIr]
            ahlat [A:NoVoicing, Plural]
            akşam [P:Noun, Time]
            atamak [A:Causative_It]
            sürtmek
            yemek [P:Noun]
            yemek [A:Causative_dIr]
            ürkmek [A:Causative_It]
        '''
        dictionary_lines = dictionary_content.split('\n')
        dictionary_lines = [l.strip() for l in dictionary_lines]
        dictionary_lines = filter(lambda line: line, dictionary_lines)

        lexemes = LexiconLoader.load_from_lines(dictionary_lines)

        assert_that(lexemes, has_length(len(dictionary_lines)), str(len(lexemes)-len(dictionary_lines)))

        assert_that(lexemes, has_item(Lexeme(u'a', u'a', SyntacticCategory.INTERJECTION, None, None)))
        assert_that(lexemes, has_item(Lexeme(u'aba', u'aba', SyntacticCategory.ADJECTIVE, None, {LexemeAttribute.NoVoicing})))
        assert_that(lexemes, has_item(Lexeme(u'abadî', u'abadî', SyntacticCategory.NOUN, None, {LexemeAttribute.NoVoicing})))
        assert_that(lexemes, has_item(Lexeme(u'abat', u'abat', SyntacticCategory.ADJECTIVE, None, {LexemeAttribute.NoVoicing})))
        assert_that(lexemes, has_item(Lexeme(u'Abdal', u'Abdal', SyntacticCategory.NOUN, SecondarySyntacticCategory.PROPER_NOUN, {LexemeAttribute.NoVoicing})))
        assert_that(lexemes, has_item(Lexeme(u'abdest', u'abdest', SyntacticCategory.NOUN, None, {LexemeAttribute.NoVoicing})))
        assert_that(lexemes, has_item(Lexeme(u'abes', u'abes', SyntacticCategory.ADJECTIVE, None, {LexemeAttribute.NoVoicing})))
        assert_that(lexemes, has_item(Lexeme(u'abes', u'abes', SyntacticCategory.ADVERB, None, None)))
        assert_that(lexemes, has_item(Lexeme(u'ablak', u'ablak', SyntacticCategory.ADJECTIVE, None, {LexemeAttribute.NoVoicing})))
        assert_that(lexemes, has_item(Lexeme(u'abuk', u'abuk', SyntacticCategory.ADJECTIVE, SecondarySyntacticCategory.DUPLICATOR, {LexemeAttribute.NoSuffix, LexemeAttribute.NoVoicing})))
        assert_that(lexemes, has_item(Lexeme(u'acemborusu', u'acemboru', SyntacticCategory.NOUN, None, {LexemeAttribute.CompoundP3sg, LexemeAttribute.NoVoicing})))
        assert_that(lexemes, has_item(Lexeme(u'acembuselik', u'acembuselik', SyntacticCategory.NOUN, None, {LexemeAttribute.Voicing})))
        assert_that(lexemes, has_item(Lexeme(u'aciz', u'aciz', SyntacticCategory.NOUN, None, {LexemeAttribute.LastVowelDrop, LexemeAttribute.NoVoicing})))
        assert_that(lexemes, has_item(Lexeme(u'âciz', u'âciz', SyntacticCategory.ADJECTIVE, None, {LexemeAttribute.NoVoicing})))
        assert_that(lexemes, has_item(Lexeme(u'açık', u'açık', SyntacticCategory.ADJECTIVE, None, {LexemeAttribute.Voicing})))
        assert_that(lexemes, has_item(Lexeme(u'ad', u'ad', SyntacticCategory.NOUN, None, {LexemeAttribute.NoVoicing})))
        assert_that(lexemes, has_item(Lexeme(u'ad', u'ad', SyntacticCategory.NOUN, None, {LexemeAttribute.Doubling, LexemeAttribute.InverseHarmony, LexemeAttribute.NoVoicing})))
        assert_that(lexemes, has_item(Lexeme(u'addetmek', u'addet', SyntacticCategory.VERB, None, {LexemeAttribute.Aorist_A, LexemeAttribute.Causative_dIr, LexemeAttribute.Voicing})))
        assert_that(lexemes, has_item(Lexeme(u'addolmak', u'addol', SyntacticCategory.VERB, None, {LexemeAttribute.Aorist_I, LexemeAttribute.Causative_dIr, LexemeAttribute.NoVoicing, LexemeAttribute.Passive_In})))
        assert_that(lexemes, has_item(Lexeme(u'ahlat', u'ahlat', SyntacticCategory.NOUN, None, {LexemeAttribute.NoVoicing, LexemeAttribute.Plural})))
        assert_that(lexemes, has_item(Lexeme(u'akşam', u'akşam', SyntacticCategory.NOUN, SecondarySyntacticCategory.TIME, {LexemeAttribute.NoVoicing})))
        assert_that(lexemes, has_item(Lexeme(u'atamak', u'ata', SyntacticCategory.VERB, None, {LexemeAttribute.Aorist_I, LexemeAttribute.Causative_It, LexemeAttribute.NoVoicing, LexemeAttribute.Passive_In, LexemeAttribute.ProgressiveVowelDrop})))
        assert_that(lexemes, has_item(Lexeme(u'sürtmek', u'sürt', SyntacticCategory.VERB, None, {LexemeAttribute.Aorist_A, LexemeAttribute.Causative_Ir, LexemeAttribute.NoVoicing})))
        assert_that(lexemes, has_item(Lexeme(u'yemek', u'yemek', SyntacticCategory.NOUN, None, {LexemeAttribute.Voicing})))
        assert_that(lexemes, has_item(Lexeme(u'yemek', u'ye', SyntacticCategory.VERB, None, {LexemeAttribute.Aorist_A, LexemeAttribute.Causative_dIr, LexemeAttribute.NoVoicing, LexemeAttribute.Passive_In, LexemeAttribute.ProgressiveVowelDrop})))
        assert_that(lexemes, has_item(Lexeme(u'ürkmek', u'ürk', SyntacticCategory.VERB, None, {LexemeAttribute.Aorist_A, LexemeAttribute.Causative_It, LexemeAttribute.NoVoicing})))
Example #23
from trnltk.morphology.lexicon.lexiconloader import LexiconLoader
from trnltk.morphology.lexicon.rootgenerator import CircumflexConvertingRootGenerator, RootMapGenerator
from trnltk.morphology.model import formatter
from trnltk.morphology.morphotactics.basicsuffixgraph import BasicSuffixGraph
from trnltk.morphology.morphotactics.copulasuffixgraph import CopulaSuffixGraph
from trnltk.morphology.contextless.parser.parser import  logger as parser_logger, UpperCaseSupportingContextlessMorphologicalParser
from trnltk.morphology.contextless.parser.rootfinder import WordRootFinder, DigitNumeralRootFinder, TextNumeralRootFinder, ProperNounFromApostropheRootFinder, ProperNounWithoutApostropheRootFinder
from trnltk.morphology.contextless.parser.suffixapplier import logger as suffix_applier_logger
from trnltk.morphology.morphotactics.numeralsuffixgraph import NumeralSuffixGraph
from trnltk.morphology.morphotactics.predefinedpaths import PredefinedPaths

from trnltk.morphology.morphotactics.propernounsuffixgraph import ProperNounSuffixGraph

all_roots = []

lexemes = LexiconLoader.load_from_file('trnltk/trnltk/resources/master_dictionary.txt')
for di in lexemes:
    all_roots.extend(CircumflexConvertingRootGenerator.generate(di))

root_map_generator = RootMapGenerator()
root_map = root_map_generator.generate(all_roots)

suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
suffix_graph.initialize()

predefined_paths = PredefinedPaths(root_map, suffix_graph)
predefined_paths.create_predefined_paths()

word_root_finder = WordRootFinder(root_map)
text_numeral_root_finder = TextNumeralRootFinder(root_map)
digit_numeral_root_finder = DigitNumeralRootFinder()
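The script above stops after building the digit root finder; a minimal continuation, modeled on Examples #5 and #7 and using only the names already imported, might look like this:

# Hypothetical continuation mirroring the sibling examples; the remaining root
# finders are wired into the upper-case-supporting contextless parser.
proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

parser = UpperCaseSupportingContextlessMorphologicalParser(suffix_graph, predefined_paths,
    [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
     proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])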
Example #24
    def test_should_create_lexeme_from_line(self):
        
        item = LexiconLoader._crate_lexeme_from_line(u'a [P:Interj]')
        assert_that(item, equal_to(Lexeme(u"a", u"a", SyntacticCategory.INTERJECTION, None, None)))
        
        item = LexiconLoader._crate_lexeme_from_line(u'aba [P:Adj]')
        assert_that(item, equal_to(Lexeme(u"aba", u"aba", SyntacticCategory.ADJECTIVE, None, None)))

        item = LexiconLoader._crate_lexeme_from_line(u'abadî')
        assert_that(item, equal_to(Lexeme(u"abadî", u"abadî", None, None, None)))

        item = LexiconLoader._crate_lexeme_from_line(u'abat [P:Adj; A:NoVoicing]')
        assert_that(item, equal_to(Lexeme(u"abat", u"abat", SyntacticCategory.ADJECTIVE, None, {LexemeAttribute.NoVoicing})))

        item = LexiconLoader._crate_lexeme_from_line(u'Abdal')
        assert_that(item, equal_to(Lexeme(u"Abdal", u"Abdal", None, None, None)))

        item = LexiconLoader._crate_lexeme_from_line(u'abdest [A:NoVoicing]')
        assert_that(item, equal_to(Lexeme(u"abdest", u"abdest", None, None, {LexemeAttribute.NoVoicing})))

        item = LexiconLoader._crate_lexeme_from_line(u'abes [P:Adv]')
        assert_that(item, equal_to(Lexeme(u"abes", u"abes", SyntacticCategory.ADVERB, None, None)))

        item = LexiconLoader._crate_lexeme_from_line(u'ablak [P:Adj; A:NoVoicing]')
        assert_that(item, equal_to(Lexeme(u"ablak", u"ablak", SyntacticCategory.ADJECTIVE, None, {LexemeAttribute.NoVoicing})))

        item = LexiconLoader._crate_lexeme_from_line(u'abuk [P:Adj, Dup;A:NoVoicing, NoSuffix]')
        assert_that(item, equal_to(Lexeme(u"abuk", u"abuk", SyntacticCategory.ADJECTIVE, SecondarySyntacticCategory.DUPLICATOR, {LexemeAttribute.NoVoicing, LexemeAttribute.NoSuffix})))

        item = LexiconLoader._crate_lexeme_from_line(u'acemborusu [A:CompoundP3sg; R:acemboru]')
        assert_that(item, equal_to(Lexeme(u"acemborusu", u"acemboru", None, None, {LexemeAttribute.CompoundP3sg})))

        item = LexiconLoader._crate_lexeme_from_line(u'acembuselik')
        assert_that(item, equal_to(Lexeme(u"acembuselik", u"acembuselik", None, None, None)))

        item = LexiconLoader._crate_lexeme_from_line(u'aciz [A: LastVowelDrop]')
        assert_that(item, equal_to(Lexeme(u"aciz", u"aciz", None, None, {LexemeAttribute.LastVowelDrop})))

        item = LexiconLoader._crate_lexeme_from_line(u'âciz [P:Adj]')
        assert_that(item, equal_to(Lexeme(u"âciz", u"âciz", SyntacticCategory.ADJECTIVE, None, None)))

        item = LexiconLoader._crate_lexeme_from_line(u'açık [P:Adj]')
        assert_that(item, equal_to(Lexeme(u"açık", u"açık", SyntacticCategory.ADJECTIVE, None, None)))

        item = LexiconLoader._crate_lexeme_from_line(u'ad')
        assert_that(item, equal_to(Lexeme(u"ad", u"ad", None, None, None)))

        item = LexiconLoader._crate_lexeme_from_line(u'ad [P:Noun; A:Doubling, InverseHarmony]')
        assert_that(item, equal_to(Lexeme(u"ad", u"ad", SyntacticCategory.NOUN, None, {LexemeAttribute.Doubling, LexemeAttribute.InverseHarmony})))

        item = LexiconLoader._crate_lexeme_from_line(u'addetmek [A:Voicing, Aorist_A]')
        assert_that(item, equal_to(Lexeme(u"addetmek", u"addetmek", None, None, {LexemeAttribute.Voicing, LexemeAttribute.Aorist_A})))

        item = LexiconLoader._crate_lexeme_from_line(u'addolmak')
        assert_that(item, equal_to(Lexeme(u"addolmak", u"addolmak", None, None, None)))

        item = LexiconLoader._crate_lexeme_from_line(u'ahlat [A:NoVoicing, Plural]')
        assert_that(item, equal_to(Lexeme(u"ahlat", u"ahlat", None, None, {LexemeAttribute.NoVoicing, LexemeAttribute.Plural})))

        item = LexiconLoader._crate_lexeme_from_line(u'akşam [P:Noun, Time]')
        assert_that(item, equal_to(Lexeme(u"akşam", u"akşam", SyntacticCategory.NOUN, SecondarySyntacticCategory.TIME, None)))

        item = LexiconLoader._crate_lexeme_from_line(u'yemek [P:Noun]')
        assert_that(item, equal_to(Lexeme(u"yemek", u"yemek", SyntacticCategory.NOUN, None, None)))

        item = LexiconLoader._crate_lexeme_from_line(u'yemek')
        assert_that(item, equal_to(Lexeme(u"yemek", u"yemek", None, None, None)))

        item = LexiconLoader._crate_lexeme_from_line(u'sürtmek')
        assert_that(item, equal_to(Lexeme(u"sürtmek", u"sürtmek", None, None, None)))

        item = LexiconLoader._crate_lexeme_from_line(u'ürkmek [A:Causative_It]')
        assert_that(item, equal_to(Lexeme(u"ürkmek", u"ürkmek", None, None, {LexemeAttribute.Causative_It})))

        item = LexiconLoader._crate_lexeme_from_line(u'akşamsefası [A:CompoundP3sg; R:akşamsefa]')
        assert_that(item, equal_to(Lexeme(u"akşamsefası", u"akşamsefa", None, None, {LexemeAttribute.CompoundP3sg})))

        item = LexiconLoader._crate_lexeme_from_line(u'akşamüstü [P:Noun, Time; A:CompoundP3sg; R:akşamüst]')
        assert_that(item, equal_to(Lexeme(u"akşamüstü", u"akşamüst", SyntacticCategory.NOUN, SecondarySyntacticCategory.TIME, {LexemeAttribute.CompoundP3sg})))

        item = LexiconLoader._crate_lexeme_from_line(u'mi [P:Ques]')
        assert_that(item, equal_to(Lexeme(u"mi", u"mi", SyntacticCategory.QUESTION, None, None)))
Example #25
 def test_should_set_category_and_lemma_for_nonverbs(self):
     item = Lexeme(u'elma', u'elma', None, None, None)
     LexiconLoader._set_category_and_lemma(item)
     assert_that(item, equal_to(Lexeme(u'elma', u'elma', SyntacticCategory.NOUN, None, None)))
Example #26
    def test_should_infer_morphemic_attrs_for_verbs(self):
        PVD = LexemeAttribute.ProgressiveVowelDrop
        PI = LexemeAttribute.Passive_In
        AA = LexemeAttribute.Aorist_A
        AI = LexemeAttribute.Aorist_I
        VO = LexemeAttribute.Voicing
        NVO = LexemeAttribute.NoVoicing

        C_T = LexemeAttribute.Causative_t
        C_IR = LexemeAttribute.Causative_Ir
        C_IT = LexemeAttribute.Causative_It
        C_AR = LexemeAttribute.Causative_Ar
        C_DIR = LexemeAttribute.Causative_dIr

        item = Lexeme(u'gitmek', u'git', SyntacticCategory.VERB, None, {VO, C_DIR})
        LexiconLoader._infer_morphemic_attributes(item)
        assert_that(item, equal_to(Lexeme(u'gitmek', u'git', SyntacticCategory.VERB, None, {VO, C_DIR, AA})))

        item = Lexeme(u'gelmek', u'gel', SyntacticCategory.VERB, None, {AI, C_DIR})
        LexiconLoader._infer_morphemic_attributes(item)
        assert_that(item, equal_to(Lexeme(u'gelmek', u'gel', SyntacticCategory.VERB, None, {AI, C_DIR, PI, NVO})))

        item = Lexeme(u'atmak', u'at', SyntacticCategory.VERB, None, {NVO, C_DIR})
        LexiconLoader._infer_morphemic_attributes(item)
        assert_that(item, equal_to(Lexeme(u'atmak', u'at', SyntacticCategory.VERB, None, {NVO, C_DIR, AA})))

        item = Lexeme(u'atamak', u'ata', SyntacticCategory.VERB, None, None)
        LexiconLoader._infer_morphemic_attributes(item)
        assert_that(item, equal_to(Lexeme(u'atamak', u'ata', SyntacticCategory.VERB, None, {PVD, PI, AI, C_T, NVO})))

        item = Lexeme(u'dolamak', u'dola', SyntacticCategory.VERB, None, None)
        LexiconLoader._infer_morphemic_attributes(item)
        assert_that(item, equal_to(Lexeme(u'dolamak', u'dola', SyntacticCategory.VERB, None, {PVD, PI, AI, C_T, NVO})))

        item = Lexeme(u'tanımak', u'tanı', SyntacticCategory.VERB, None, {AI})
        LexiconLoader._infer_morphemic_attributes(item)
        assert_that(item, equal_to(Lexeme(u'tanımak', u'tanı', SyntacticCategory.VERB, None, {AI, PVD, PI, C_T, NVO})))

        item = Lexeme(u'getirmek', u'getir', SyntacticCategory.VERB, None, {AI})
        LexiconLoader._infer_morphemic_attributes(item)
        assert_that(item, equal_to(Lexeme(u'getirmek', u'getir', SyntacticCategory.VERB, None, {AI, C_T, NVO})))

        item = Lexeme(u'ürkmek', u'ürk', SyntacticCategory.VERB, None, {C_IT})
        LexiconLoader._infer_morphemic_attributes(item)
        assert_that(item, equal_to(Lexeme(u'ürkmek', u'ürk', SyntacticCategory.VERB, None, {C_IT, AA, NVO})))

        item = Lexeme(u'ağlamak', u'ağla', SyntacticCategory.VERB, None, None)
        LexiconLoader._infer_morphemic_attributes(item)
        assert_that(item, equal_to(Lexeme(u'ağlamak', u'ağla', SyntacticCategory.VERB, None, {PVD, PI, AI, C_T, NVO})))
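Read together, the assertions above suggest the defaults _infer_morphemic_attributes fills in. The summary below is an inference from these test cases only, not a statement of the library's actual rules:

# - NoVoicing is assumed unless Voicing was given explicitly
# - vowel-final stems get ProgressiveVowelDrop and Causative_t
# - vowel- or l-final stems get Passive_In
# - monosyllabic stems default to Aorist_A, longer stems to Aorist_I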