def test_should_not_set_category_and_lemma_if_category_is_set_already(self):
    item_org = Lexeme(u'elma', u'elma', SyntacticCategory.NOUN, None, None)
    item_clone = item_org.clone()
    LexiconLoader._set_category_and_lemma(item_clone)
    assert_that(item_org, equal_to(item_clone))

    item_org = Lexeme(u'mavi', u'mavi', SyntacticCategory.ADJECTIVE, None, None)
    item_clone = item_org.clone()
    LexiconLoader._set_category_and_lemma(item_clone)
    assert_that(item_org, equal_to(item_clone))

    item_org = Lexeme(u'aha', u'aha', SyntacticCategory.INTERJECTION, None, None)
    item_clone = item_org.clone()
    LexiconLoader._set_category_and_lemma(item_clone)
    assert_that(item_org, equal_to(item_clone))

    item_org = Lexeme(u'yemek', u'yemek', SyntacticCategory.NOUN, None, None)
    item_clone = item_org.clone()
    LexiconLoader._set_category_and_lemma(item_clone)
    assert_that(item_org, equal_to(item_clone))

    item_org = Lexeme(u'tokmak', u'tokmak', SyntacticCategory.NOUN, None, None)
    item_clone = item_org.clone()
    LexiconLoader._set_category_and_lemma(item_clone)
    assert_that(item_org, equal_to(item_clone))
def test_should_set_category_and_lemma_for_verbs(self):
    item = Lexeme(u'yemek', u'yemek', None, None, None)
    LexiconLoader._set_category_and_lemma(item)
    assert_that(item, equal_to(Lexeme(u'yemek', u'ye', SyntacticCategory.VERB, None, None)))

    item = Lexeme(u'elemek', u'elemek', None, None, None)
    LexiconLoader._set_category_and_lemma(item)
    assert_that(item, equal_to(Lexeme(u'elemek', u'ele', SyntacticCategory.VERB, None, None)))
def setUpClass(cls):
    super(_LikelihoodCalculatorTest, cls).setUpClass()
    all_roots = []

    lexemes = LexiconLoader.load_from_file(
        os.path.join(os.path.dirname(__file__), '../../../../../resources/master_dictionary.txt'))
    for di in lexemes:
        all_roots.extend(RootGenerator.generate(di))

    root_map_generator = RootMapGenerator()
    cls.root_map = root_map_generator.generate(all_roots)

    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    word_root_finder = WordRootFinder(cls.root_map)
    digit_numeral_root_finder = DigitNumeralRootFinder()
    text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
    proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
    proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

    cls.contextless_parser = ContextlessMorphologicalParser(
        suffix_graph, predefined_paths,
        [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
         proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

    cls.mongodb_connection = pymongo.Connection(host='127.0.0.1')
    cls.collection_map = {
        1: cls.mongodb_connection['trnltk']['wordUnigrams999'],
        2: cls.mongodb_connection['trnltk']['wordBigrams999'],
        3: cls.mongodb_connection['trnltk']['wordTrigrams999']
    }

    cls.generator = None
def setUpClass(cls):
    super(MorphemeContainerContextlessProbabilityGeneratorWithContainersTest, cls).setUpClass()
    all_roots = []

    lexicon_lines = u'''
        duvar
        tutku
        saç
        oğul [A:LastVowelDrop]
        demek [A:RootChange, Passive_In, Passive_InIl]
        bu [P:Det]
    '''.strip().splitlines()

    lexemes = LexiconLoader.load_from_lines(lexicon_lines)
    for di in lexemes:
        all_roots.extend(RootGenerator.generate(di))

    root_map_generator = RootMapGenerator()
    cls.root_map = root_map_generator.generate(all_roots)

    suffix_graph = BasicSuffixGraph()
    suffix_graph.initialize()

    word_root_finder = WordRootFinder(cls.root_map)

    cls.contextless_parser = ContextlessMorphologicalParser(suffix_graph, None, [word_root_finder])
def setUp(self):
    self.parseset_creator = ParseSetCreator()

    all_roots = []
    lexemes = LexiconLoader.load_from_file(
        os.path.join(os.path.dirname(__file__), '../../resources/master_dictionary.txt'))
    for di in lexemes:
        all_roots.extend(RootGenerator.generate(di))

    root_map = (RootMapGenerator()).generate(all_roots)

    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    word_root_finder = WordRootFinder(root_map)
    digit_numeral_root_finder = DigitNumeralRootFinder()
    text_numeral_root_finder = TextNumeralRootFinder(root_map)
    proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
    proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

    self.parser = UpperCaseSupportingContextlessMorphologicalParser(
        suffix_graph, predefined_paths,
        [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
         proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])
def setUpClass(cls):
    super(TransitionGeneratorTest, cls).setUpClass()
    all_roots = []

    lexemes = LexiconLoader.load_from_file(
        os.path.join(os.path.dirname(__file__), '../../resources/master_dictionary.txt'))
    for di in lexemes:
        all_roots.extend(RootGenerator.generate(di))

    root_map_generator = RootMapGenerator()
    cls.root_map = root_map_generator.generate(all_roots)

    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    word_root_finder = WordRootFinder(cls.root_map)
    digit_numeral_root_finder = DigitNumeralRootFinder()
    text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
    proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
    proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

    cls.parser = ContextlessMorphologicalParser(
        suffix_graph, predefined_paths,
        [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
         proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

    cls.transition_generator = TransitionGenerator(cls.parser)
def setUpClass(cls):
    super(InterpolatingLikelihoodCalculatorCalculationContextTest, cls).setUpClass()
    all_roots = []

    lexemes = LexiconLoader.load_from_file(
        os.path.join(os.path.dirname(__file__), '../../../../../resources/master_dictionary.txt'))
    for di in lexemes:
        all_roots.extend(RootGenerator.generate(di))

    root_map_generator = RootMapGenerator()
    cls.root_map = root_map_generator.generate(all_roots)

    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    word_root_finder = WordRootFinder(cls.root_map)
    digit_numeral_root_finder = DigitNumeralRootFinder()
    text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
    proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
    proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

    cls.contextless_parser = UpperCaseSupportingContextlessMorphologicalParser(
        suffix_graph, predefined_paths,
        [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
         proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

    mongodb_connection = pymongo.Connection(host='127.0.0.1')
    cls.collection_map = {
        1: mongodb_connection['trnltk']['wordUnigrams999'],
        2: mongodb_connection['trnltk']['wordBigrams999'],
        3: mongodb_connection['trnltk']['wordTrigrams999']
    }

    database_index_builder = DatabaseIndexBuilder(cls.collection_map)
    target_form_given_context_counter = InMemoryCachingTargetFormGivenContextCounter(cls.collection_map)
    ngram_frequency_smoother = CachedSimpleGoodTuringNGramFrequencySmoother()
    sequence_likelihood_calculator = UniformSequenceLikelihoodCalculator()

    wrapped_generator = ContextParsingLikelihoodCalculator(
        database_index_builder, target_form_given_context_counter,
        ngram_frequency_smoother, sequence_likelihood_calculator)

    cls.generator = InterpolatingLikelihoodCalculator(wrapped_generator)
def setUpClass(cls):
    super(ParserTestWithExtendedGraph, cls).setUpClass()
    all_roots = []

    lexemes = LexiconLoader.load_from_file(
        os.path.join(os.path.dirname(__file__), '../../../../resources/master_dictionary.txt'))
    for di in lexemes:
        all_roots.extend(RootGenerator.generate(di))

    cls._org_root_map = (RootMapGenerator()).generate(all_roots)
def setUpClass(cls):
    super(FormatterTest, cls).setUpClass()
    all_roots = []

    dictionary_content = ["kitap", "yapmak"]
    lexemes = LexiconLoader.load_from_lines(dictionary_content)
    for di in lexemes:
        all_roots.extend(RootGenerator.generate(di))

    cls.root_map = RootMapGenerator().generate(all_roots)
def create(cls, master_dictionary_path, ngram_collection_map):
    """
    @type master_dictionary_path: str or unicode
    @param ngram_collection_map: list<Collection>
    @rtype: ContextfulMorphologicalParser
    """
    all_roots = []
    lexemes = LexiconLoader.load_from_file(master_dictionary_path)
    for di in lexemes:
        all_roots.extend(RootGenerator.generate(di))

    root_map_generator = RootMapGenerator()
    root_map = root_map_generator.generate(all_roots)

    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    word_root_finder = WordRootFinder(root_map)
    digit_numeral_root_finder = DigitNumeralRootFinder()
    text_numeral_root_finder = TextNumeralRootFinder(root_map)
    proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
    proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

    contextless_parser = UpperCaseSupportingContextlessMorphologicalParser(
        suffix_graph, predefined_paths,
        [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
         proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

    database_index_builder = DatabaseIndexBuilder(ngram_collection_map)
    target_form_given_context_counter = InMemoryCachingTargetFormGivenContextCounter(ngram_collection_map)
    ngram_frequency_smoother = CachedSimpleGoodTuringNGramFrequencySmoother()
    sequence_likelihood_calculator = SequenceLikelihoodCalculator(None)

    collocation_metric_calculator = ContextParsingLikelihoodCalculator(
        database_index_builder, target_form_given_context_counter,
        ngram_frequency_smoother, sequence_likelihood_calculator)
    interpolating_collocation_metric_calculator = InterpolatingLikelihoodCalculator(collocation_metric_calculator)

    cached_contextless_distribution_smoother = CachedContextlessDistributionSmoother()
    contextless_distribution_metric_calculator = ContextlessDistributionCalculator(
        database_index_builder, target_form_given_context_counter, cached_contextless_distribution_smoother)

    contextful_likelihood_calculator = ContextfulLikelihoodCalculator(
        interpolating_collocation_metric_calculator, contextless_distribution_metric_calculator)

    # resolve the circular dependency: the sequence likelihood calculator
    # was constructed with None above and needs the contextful calculator
    sequence_likelihood_calculator._contextful_likelihood_calculator = contextful_likelihood_calculator

    contextful_morphological_parser = ContextfulMorphologicalParser(contextless_parser, contextful_likelihood_calculator)

    return contextful_morphological_parser
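# A minimal usage sketch for the factory method above. The enclosing class
# name `ContextfulMorphologicalParserFactory` is an assumption for
# illustration; the ngram collection layout mirrors the test fixtures
# elsewhere in this file.
import pymongo

connection = pymongo.Connection(host='127.0.0.1')
ngram_collection_map = {
    1: connection['trnltk']['wordUnigrams999'],
    2: connection['trnltk']['wordBigrams999'],
    3: connection['trnltk']['wordTrigrams999']
}
contextful_parser = ContextfulMorphologicalParserFactory.create(
    'resources/master_dictionary.txt', ngram_collection_map)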
def setUpClass(cls):
    super(StatisticalParserTest, cls).setUpClass()
    all_roots = []

    lexemes = LexiconLoader.load_from_file(
        os.path.join(os.path.dirname(__file__), '../../resources/master_dictionary.txt'))
    for di in lexemes:
        all_roots.extend(RootGenerator.generate(di))

    root_map_generator = RootMapGenerator()
    cls.root_map = root_map_generator.generate(all_roots)

    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(BasicSuffixGraph()))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    word_root_finder = WordRootFinder(cls.root_map)
    digit_numeral_root_finder = DigitNumeralRootFinder()
    text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
    proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
    proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

    contextless_parser = ContextlessMorphologicalParser(
        suffix_graph, predefined_paths,
        [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
         proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

    parseset_index = "001"
    dom = parse(os.path.join(
        os.path.dirname(__file__),
        '../../testresources/parsesets/parseset{}.xml'.format(parseset_index)))
    parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])

    parse_set_word_list = []
    for sentence in parseset.sentences:
        parse_set_word_list.extend(sentence.words)

    complete_word_concordance_index = CompleteWordConcordanceIndex(parse_set_word_list)

    cls.parser = StatisticalParser(contextless_parser, complete_word_concordance_index)
def create_calculator(cls, parseset_index):
    all_roots = []

    lexemes = LexiconLoader.load_from_file(
        os.path.join(os.path.dirname(__file__), '../../../../resources/master_dictionary.txt'))
    for di in lexemes:
        all_roots.extend(RootGenerator.generate(di))

    root_map_generator = RootMapGenerator()
    cls.root_map = root_map_generator.generate(all_roots)

    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    word_root_finder = WordRootFinder(cls.root_map)
    digit_numeral_root_finder = DigitNumeralRootFinder()
    text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
    proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
    proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

    cls.contextless_parser = UpperCaseSupportingContextlessMorphologicalParser(
        suffix_graph, predefined_paths,
        [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
         proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

    mongodb_connection = pymongo.Connection(host='127.0.0.1')
    collection_map = {
        1: mongodb_connection['trnltk']['wordUnigrams{}'.format(parseset_index)],
        2: mongodb_connection['trnltk']['wordBigrams{}'.format(parseset_index)],
        3: mongodb_connection['trnltk']['wordTrigrams{}'.format(parseset_index)]
    }

    database_index_builder = DatabaseIndexBuilder(collection_map)
    target_form_given_context_counter = InMemoryCachingTargetFormGivenContextCounter(collection_map)
    ngram_frequency_smoother = CachedSimpleGoodTuringNGramFrequencySmoother()
    sequence_likelihood_calculator = SequenceLikelihoodCalculator(None)

    collocation_metric_calculator = ContextParsingLikelihoodCalculator(
        database_index_builder, target_form_given_context_counter,
        ngram_frequency_smoother, sequence_likelihood_calculator)
    interpolating_collocation_metric_calculator = InterpolatingLikelihoodCalculator(collocation_metric_calculator)

    contextless_distribution_metric_calculator = ContextlessDistributionCalculator(
        database_index_builder, target_form_given_context_counter)

    contextful_likelihood_calculator = ContextfulLikelihoodCalculator(
        interpolating_collocation_metric_calculator, contextless_distribution_metric_calculator)

    # resolve the circular dependency: the sequence likelihood calculator
    # was constructed with None above and needs the contextful calculator
    sequence_likelihood_calculator._contextful_likelihood_calculator = contextful_likelihood_calculator

    return contextful_likelihood_calculator
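# A hedged usage sketch for create_calculator above. The test-class name
# `LikelihoodCalculatorTest` is hypothetical, and calling build_indexes()
# on the returned calculator is an assumption modeled on the
# ContextlessDistributionCalculator fixture later in this file.
calculator = LikelihoodCalculatorTest.create_calculator("001")
calculator.build_indexes()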
def setUpClass(cls):
    super(PredefinedPathsTest, cls).setUpClass()
    all_roots = []

    lexemes = LexiconLoader.load_from_file(
        os.path.join(os.path.dirname(__file__), '../../../resources/master_dictionary.txt'))
    for di in lexemes:
        all_roots.extend(RootGenerator.generate(di))

    root_map_generator = RootMapGenerator()
    cls.root_map = root_map_generator.generate(all_roots)

    cls.morpheme_container_map = {}

    cls.suffix_graph = BasicSuffixGraph()
    cls.suffix_graph.initialize()
def test_should_validate_master_dict(self):
    path = os.path.join(os.path.dirname(__file__), '../../../resources/master_dictionary.txt')
    items = LexiconLoader.load_from_file(path)
    assert_that(len(items) > 0, equal_to(True))
    for item in items:
        assert_that(item.lemma, not_none(), str(item))
        assert_that(item.root, not_none(), str(item))
        assert_that(item.syntactic_category, not_none(), str(item))
        assert_that(SyntacticCategory.ALL, has_item(item.syntactic_category), str(item))
        if item.secondary_syntactic_category:
            assert_that(SecondarySyntacticCategory.ALL, has_item(item.secondary_syntactic_category), str(item))
        if item.attributes:
            for attr in item.attributes:
                assert_that(LexemeAttribute.ALL, has_item(attr), str(item))
def setUpClass(cls):
    all_roots = []

    lexemes = LexiconLoader.load_from_file(
        os.path.join(os.path.dirname(__file__), '../../../../../resources/master_dictionary.txt'))
    for di in lexemes:
        all_roots.extend(RootGenerator.generate(di))

    root_map_generator = RootMapGenerator()
    cls.root_map = root_map_generator.generate(all_roots)

    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    word_root_finder = WordRootFinder(cls.root_map)
    digit_numeral_root_finder = DigitNumeralRootFinder()
    text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
    proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
    proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

    cls.contextless_parser = ContextlessMorphologicalParser(
        suffix_graph, predefined_paths,
        [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
         proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

    mongodb_connection = pymongo.Connection(host='127.0.0.1')
    collection_map = {
        1: mongodb_connection['trnltk']['wordUnigrams{}'.format(cls.parseset_index)]
    }

    database_index_builder = DatabaseIndexBuilder(collection_map)
    target_form_given_context_counter = TargetFormGivenContextCounter(collection_map)
    smoother = CachedContextlessDistributionSmoother()
    smoother.initialize()

    cls.calculator = ContextlessDistributionCalculator(
        database_index_builder, target_form_given_context_counter, smoother)
    cls.calculator.build_indexes()
def test_should_load_lexicon_from_str(self):
    dictionary_content = u'''
        a [P:Interj]
        aba [P:Adj]
        abadî
        abat [P:Adj; A:NoVoicing]
        Abdal
        abdest [A:NoVoicing]
        abes [P:Adj]
        abes [P:Adv]
        ablak [P:Adj; A:NoVoicing]
        abuk [P:Adj, Dup;A:NoVoicing, NoSuffix]
        acemborusu [A:CompoundP3sg; R:acemboru]
        acembuselik
        aciz [A:LastVowelDrop]
        âciz [P:Adj]
        açık [P:Adj]
        ad
        ad [P:Noun; A:Doubling, InverseHarmony]
        addetmek [A:Voicing, Aorist_A]
        addolmak [A:Causative_dIr]
        ahlat [A:NoVoicing, Plural]
        akşam [P:Noun, Time]
        atamak [A:Causative_It]
        sürtmek
        yemek [P:Noun]
        yemek [A:Causative_dIr]
        ürkmek [A:Causative_It]
    '''
    dictionary_lines = dictionary_content.split('\n')
    dictionary_lines = [l.strip() for l in dictionary_lines]
    dictionary_lines = filter(lambda line: line, dictionary_lines)

    lexemes = LexiconLoader.load_from_lines(dictionary_lines)

    assert_that(lexemes, has_length(len(dictionary_lines)), str(len(lexemes) - len(dictionary_lines)))

    assert_that(lexemes, has_item(Lexeme(u'a', u'a', SyntacticCategory.INTERJECTION, None, None)))
    assert_that(lexemes, has_item(Lexeme(u'aba', u'aba', SyntacticCategory.ADJECTIVE, None, {LexemeAttribute.NoVoicing})))
    assert_that(lexemes, has_item(Lexeme(u'abadî', u'abadî', SyntacticCategory.NOUN, None, {LexemeAttribute.NoVoicing})))
    assert_that(lexemes, has_item(Lexeme(u'abat', u'abat', SyntacticCategory.ADJECTIVE, None, {LexemeAttribute.NoVoicing})))
    assert_that(lexemes, has_item(Lexeme(u'Abdal', u'Abdal', SyntacticCategory.NOUN, SecondarySyntacticCategory.PROPER_NOUN, {LexemeAttribute.NoVoicing})))
    assert_that(lexemes, has_item(Lexeme(u'abdest', u'abdest', SyntacticCategory.NOUN, None, {LexemeAttribute.NoVoicing})))
    assert_that(lexemes, has_item(Lexeme(u'abes', u'abes', SyntacticCategory.ADJECTIVE, None, {LexemeAttribute.NoVoicing})))
    assert_that(lexemes, has_item(Lexeme(u'abes', u'abes', SyntacticCategory.ADVERB, None, None)))
    assert_that(lexemes, has_item(Lexeme(u'ablak', u'ablak', SyntacticCategory.ADJECTIVE, None, {LexemeAttribute.NoVoicing})))
    assert_that(lexemes, has_item(Lexeme(u'abuk', u'abuk', SyntacticCategory.ADJECTIVE, SecondarySyntacticCategory.DUPLICATOR, {LexemeAttribute.NoSuffix, LexemeAttribute.NoVoicing})))
    assert_that(lexemes, has_item(Lexeme(u'acemborusu', u'acemboru', SyntacticCategory.NOUN, None, {LexemeAttribute.CompoundP3sg, LexemeAttribute.NoVoicing})))
    assert_that(lexemes, has_item(Lexeme(u'acembuselik', u'acembuselik', SyntacticCategory.NOUN, None, {LexemeAttribute.Voicing})))
    assert_that(lexemes, has_item(Lexeme(u'aciz', u'aciz', SyntacticCategory.NOUN, None, {LexemeAttribute.LastVowelDrop, LexemeAttribute.NoVoicing})))
    assert_that(lexemes, has_item(Lexeme(u'âciz', u'âciz', SyntacticCategory.ADJECTIVE, None, {LexemeAttribute.NoVoicing})))
    assert_that(lexemes, has_item(Lexeme(u'açık', u'açık', SyntacticCategory.ADJECTIVE, None, {LexemeAttribute.Voicing})))
    assert_that(lexemes, has_item(Lexeme(u'ad', u'ad', SyntacticCategory.NOUN, None, {LexemeAttribute.NoVoicing})))
    assert_that(lexemes, has_item(Lexeme(u'ad', u'ad', SyntacticCategory.NOUN, None, {LexemeAttribute.Doubling, LexemeAttribute.InverseHarmony, LexemeAttribute.NoVoicing})))
    assert_that(lexemes, has_item(Lexeme(u'addetmek', u'addet', SyntacticCategory.VERB, None, {LexemeAttribute.Aorist_A, LexemeAttribute.Causative_dIr, LexemeAttribute.Voicing})))
    assert_that(lexemes, has_item(Lexeme(u'addolmak', u'addol', SyntacticCategory.VERB, None, {LexemeAttribute.Aorist_I, LexemeAttribute.Causative_dIr, LexemeAttribute.NoVoicing, LexemeAttribute.Passive_In})))
    assert_that(lexemes, has_item(Lexeme(u'ahlat', u'ahlat', SyntacticCategory.NOUN, None, {LexemeAttribute.NoVoicing, LexemeAttribute.Plural})))
    assert_that(lexemes, has_item(Lexeme(u'akşam', u'akşam', SyntacticCategory.NOUN, SecondarySyntacticCategory.TIME, {LexemeAttribute.NoVoicing})))
    assert_that(lexemes, has_item(Lexeme(u'atamak', u'ata', SyntacticCategory.VERB, None, {LexemeAttribute.Aorist_I, LexemeAttribute.Causative_It, LexemeAttribute.NoVoicing, LexemeAttribute.Passive_In, LexemeAttribute.ProgressiveVowelDrop})))
    assert_that(lexemes, has_item(Lexeme(u'sürtmek', u'sürt', SyntacticCategory.VERB, None, {LexemeAttribute.Aorist_A, LexemeAttribute.Causative_Ir, LexemeAttribute.NoVoicing})))
    assert_that(lexemes, has_item(Lexeme(u'yemek', u'yemek', SyntacticCategory.NOUN, None, {LexemeAttribute.Voicing})))
    assert_that(lexemes, has_item(Lexeme(u'yemek', u'ye', SyntacticCategory.VERB, None, {LexemeAttribute.Aorist_A, LexemeAttribute.Causative_dIr, LexemeAttribute.NoVoicing, LexemeAttribute.Passive_In, LexemeAttribute.ProgressiveVowelDrop})))
    assert_that(lexemes, has_item(Lexeme(u'ürkmek', u'ürk', SyntacticCategory.VERB, None, {LexemeAttribute.Aorist_A, LexemeAttribute.Causative_It, LexemeAttribute.NoVoicing})))
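# Dictionary line format pinned down by the test above:
#   lemma [P:<category>[, <secondary category>]; A:<attribute>, ...; R:<root>]
# `P:` sets the syntactic category (Adj, Noun, Det, ...), `A:` lists lexeme
# attributes (NoVoicing, LastVowelDrop, ...), and `R:` overrides the root,
# as in `acemborusu [A:CompoundP3sg; R:acemboru]`. Entries without an
# explicit category get one inferred later by _set_category_and_lemma, as
# the surrounding tests show.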
from trnltk.morphology.lexicon.lexiconloader import LexiconLoader
from trnltk.morphology.lexicon.rootgenerator import CircumflexConvertingRootGenerator, RootMapGenerator
from trnltk.morphology.model import formatter
from trnltk.morphology.morphotactics.basicsuffixgraph import BasicSuffixGraph
from trnltk.morphology.morphotactics.copulasuffixgraph import CopulaSuffixGraph
from trnltk.morphology.contextless.parser.parser import logger as parser_logger, UpperCaseSupportingContextlessMorphologicalParser
from trnltk.morphology.contextless.parser.rootfinder import WordRootFinder, DigitNumeralRootFinder, TextNumeralRootFinder, ProperNounFromApostropheRootFinder, ProperNounWithoutApostropheRootFinder
from trnltk.morphology.contextless.parser.suffixapplier import logger as suffix_applier_logger
from trnltk.morphology.morphotactics.numeralsuffixgraph import NumeralSuffixGraph
from trnltk.morphology.morphotactics.predefinedpaths import PredefinedPaths
from trnltk.morphology.morphotactics.propernounsuffixgraph import ProperNounSuffixGraph

all_roots = []
lexemes = LexiconLoader.load_from_file('trnltk/trnltk/resources/master_dictionary.txt')
for di in lexemes:
    all_roots.extend(CircumflexConvertingRootGenerator.generate(di))

root_map_generator = RootMapGenerator()
root_map = root_map_generator.generate(all_roots)

suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
suffix_graph.initialize()

predefined_paths = PredefinedPaths(root_map, suffix_graph)
predefined_paths.create_predefined_paths()

word_root_finder = WordRootFinder(root_map)
text_numeral_root_finder = TextNumeralRootFinder(root_map)
digit_numeral_root_finder = DigitNumeralRootFinder()
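# The snippet above stops before the parser itself is built; below is a
# sketch of the remaining wiring, assuming it follows the same pattern as
# the fixtures elsewhere in this file. The `parse` call and
# `formatter.format_morpheme_container_for_parseset` are assumptions based
# on the imports at the top of the snippet.
proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

parser = UpperCaseSupportingContextlessMorphologicalParser(
    suffix_graph, predefined_paths,
    [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
     proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

for morpheme_container in parser.parse(u'kitaba'):
    print formatter.format_morpheme_container_for_parseset(morpheme_container)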
def test_should_create_lexeme_from_line(self):
    item = LexiconLoader._crate_lexeme_from_line(u'a [P:Interj]')
    assert_that(item, equal_to(Lexeme(u"a", u"a", SyntacticCategory.INTERJECTION, None, None)))

    item = LexiconLoader._crate_lexeme_from_line(u'aba [P:Adj]')
    assert_that(item, equal_to(Lexeme(u"aba", u"aba", SyntacticCategory.ADJECTIVE, None, None)))

    item = LexiconLoader._crate_lexeme_from_line(u'abadî')
    assert_that(item, equal_to(Lexeme(u"abadî", u"abadî", None, None, None)))

    item = LexiconLoader._crate_lexeme_from_line(u'abat [P:Adj; A:NoVoicing]')
    assert_that(item, equal_to(Lexeme(u"abat", u"abat", SyntacticCategory.ADJECTIVE, None, {LexemeAttribute.NoVoicing})))

    item = LexiconLoader._crate_lexeme_from_line(u'Abdal')
    assert_that(item, equal_to(Lexeme(u"Abdal", u"Abdal", None, None, None)))

    item = LexiconLoader._crate_lexeme_from_line(u'abdest [A:NoVoicing]')
    assert_that(item, equal_to(Lexeme(u"abdest", u"abdest", None, None, {LexemeAttribute.NoVoicing})))

    item = LexiconLoader._crate_lexeme_from_line(u'abes [P:Adv]')
    assert_that(item, equal_to(Lexeme(u"abes", u"abes", SyntacticCategory.ADVERB, None, None)))

    item = LexiconLoader._crate_lexeme_from_line(u'ablak [P:Adj; A:NoVoicing]')
    assert_that(item, equal_to(Lexeme(u"ablak", u"ablak", SyntacticCategory.ADJECTIVE, None, {LexemeAttribute.NoVoicing})))

    item = LexiconLoader._crate_lexeme_from_line(u'abuk [P:Adj, Dup;A:NoVoicing, NoSuffix]')
    assert_that(item, equal_to(Lexeme(u"abuk", u"abuk", SyntacticCategory.ADJECTIVE, SecondarySyntacticCategory.DUPLICATOR, {LexemeAttribute.NoVoicing, LexemeAttribute.NoSuffix})))

    item = LexiconLoader._crate_lexeme_from_line(u'acemborusu [A:CompoundP3sg; R:acemboru]')
    assert_that(item, equal_to(Lexeme(u"acemborusu", u"acemboru", None, None, {LexemeAttribute.CompoundP3sg})))

    item = LexiconLoader._crate_lexeme_from_line(u'acembuselik')
    assert_that(item, equal_to(Lexeme(u"acembuselik", u"acembuselik", None, None, None)))

    item = LexiconLoader._crate_lexeme_from_line(u'aciz [A: LastVowelDrop]')
    assert_that(item, equal_to(Lexeme(u"aciz", u"aciz", None, None, {LexemeAttribute.LastVowelDrop})))

    item = LexiconLoader._crate_lexeme_from_line(u'âciz [P:Adj]')
    assert_that(item, equal_to(Lexeme(u"âciz", u"âciz", SyntacticCategory.ADJECTIVE, None, None)))

    item = LexiconLoader._crate_lexeme_from_line(u'açık [P:Adj]')
    assert_that(item, equal_to(Lexeme(u"açık", u"açık", SyntacticCategory.ADJECTIVE, None, None)))

    item = LexiconLoader._crate_lexeme_from_line(u'ad')
    assert_that(item, equal_to(Lexeme(u"ad", u"ad", None, None, None)))

    item = LexiconLoader._crate_lexeme_from_line(u'ad [P:Noun; A:Doubling, InverseHarmony]')
    assert_that(item, equal_to(Lexeme(u"ad", u"ad", SyntacticCategory.NOUN, None, {LexemeAttribute.Doubling, LexemeAttribute.InverseHarmony})))

    item = LexiconLoader._crate_lexeme_from_line(u'addetmek [A:Voicing, Aorist_A]')
    assert_that(item, equal_to(Lexeme(u"addetmek", u"addetmek", None, None, {LexemeAttribute.Voicing, LexemeAttribute.Aorist_A})))

    item = LexiconLoader._crate_lexeme_from_line(u'addolmak')
    assert_that(item, equal_to(Lexeme(u"addolmak", u"addolmak", None, None, None)))

    item = LexiconLoader._crate_lexeme_from_line(u'ahlat [A:NoVoicing, Plural]')
    assert_that(item, equal_to(Lexeme(u"ahlat", u"ahlat", None, None, {LexemeAttribute.NoVoicing, LexemeAttribute.Plural})))

    item = LexiconLoader._crate_lexeme_from_line(u'akşam [P:Noun, Time]')
    assert_that(item, equal_to(Lexeme(u"akşam", u"akşam", SyntacticCategory.NOUN, SecondarySyntacticCategory.TIME, None)))

    item = LexiconLoader._crate_lexeme_from_line(u'yemek [P:Noun]')
    assert_that(item, equal_to(Lexeme(u"yemek", u"yemek", SyntacticCategory.NOUN, None, None)))

    item = LexiconLoader._crate_lexeme_from_line(u'yemek')
    assert_that(item, equal_to(Lexeme(u"yemek", u"yemek", None, None, None)))

    item = LexiconLoader._crate_lexeme_from_line(u'sürtmek')
    assert_that(item, equal_to(Lexeme(u"sürtmek", u"sürtmek", None, None, None)))

    item = LexiconLoader._crate_lexeme_from_line(u'ürkmek [A:Causative_It]')
    assert_that(item, equal_to(Lexeme(u"ürkmek", u"ürkmek", None, None, {LexemeAttribute.Causative_It})))

    item = LexiconLoader._crate_lexeme_from_line(u'akşamsefası [A:CompoundP3sg; R:akşamsefa]')
    assert_that(item, equal_to(Lexeme(u"akşamsefası", u"akşamsefa", None, None, {LexemeAttribute.CompoundP3sg})))

    item = LexiconLoader._crate_lexeme_from_line(u'akşamüstü [P:Noun, Time; A:CompoundP3sg; R:akşamüst]')
    assert_that(item, equal_to(Lexeme(u"akşamüstü", u"akşamüst", SyntacticCategory.NOUN, SecondarySyntacticCategory.TIME, {LexemeAttribute.CompoundP3sg})))

    item = LexiconLoader._crate_lexeme_from_line(u'mi [P:Ques]')
    assert_that(item, equal_to(Lexeme(u"mi", u"mi", SyntacticCategory.QUESTION, None, None)))
def test_should_set_category_and_lemma_for_nonverbs(self):
    item = Lexeme(u'elma', u'elma', None, None, None)
    LexiconLoader._set_category_and_lemma(item)
    assert_that(item, equal_to(Lexeme(u'elma', u'elma', SyntacticCategory.NOUN, None, None)))
def test_should_infer_morphemic_attrs_for_verbs(self):
    PVD = LexemeAttribute.ProgressiveVowelDrop
    PI = LexemeAttribute.Passive_In
    AA = LexemeAttribute.Aorist_A
    AI = LexemeAttribute.Aorist_I
    VO = LexemeAttribute.Voicing
    NVO = LexemeAttribute.NoVoicing
    C_T = LexemeAttribute.Causative_t
    C_IR = LexemeAttribute.Causative_Ir
    C_IT = LexemeAttribute.Causative_It
    C_AR = LexemeAttribute.Causative_Ar
    C_DIR = LexemeAttribute.Causative_dIr

    item = Lexeme(u'gitmek', u'git', SyntacticCategory.VERB, None, {VO, C_DIR})
    LexiconLoader._infer_morphemic_attributes(item)
    assert_that(item, equal_to(Lexeme(u'gitmek', u'git', SyntacticCategory.VERB, None, {VO, C_DIR, AA})))

    item = Lexeme(u'gelmek', u'gel', SyntacticCategory.VERB, None, {AI, C_DIR})
    LexiconLoader._infer_morphemic_attributes(item)
    assert_that(item, equal_to(Lexeme(u'gelmek', u'gel', SyntacticCategory.VERB, None, {AI, C_DIR, PI, NVO})))

    item = Lexeme(u'atmak', u'at', SyntacticCategory.VERB, None, {NVO, C_DIR})
    LexiconLoader._infer_morphemic_attributes(item)
    assert_that(item, equal_to(Lexeme(u'atmak', u'at', SyntacticCategory.VERB, None, {NVO, C_DIR, AA})))

    item = Lexeme(u'atamak', u'ata', SyntacticCategory.VERB, None, None)
    LexiconLoader._infer_morphemic_attributes(item)
    assert_that(item, equal_to(Lexeme(u'atamak', u'ata', SyntacticCategory.VERB, None, {PVD, PI, AI, C_T, NVO})))

    item = Lexeme(u'dolamak', u'dola', SyntacticCategory.VERB, None, None)
    LexiconLoader._infer_morphemic_attributes(item)
    assert_that(item, equal_to(Lexeme(u'dolamak', u'dola', SyntacticCategory.VERB, None, {PVD, PI, AI, C_T, NVO})))

    item = Lexeme(u'tanımak', u'tanı', SyntacticCategory.VERB, None, {AI})
    LexiconLoader._infer_morphemic_attributes(item)
    assert_that(item, equal_to(Lexeme(u'tanımak', u'tanı', SyntacticCategory.VERB, None, {AI, PVD, PI, C_T, NVO})))

    item = Lexeme(u'getirmek', u'getir', SyntacticCategory.VERB, None, {AI})
    LexiconLoader._infer_morphemic_attributes(item)
    assert_that(item, equal_to(Lexeme(u'getirmek', u'getir', SyntacticCategory.VERB, None, {AI, C_T, NVO})))

    item = Lexeme(u'ürkmek', u'ürk', SyntacticCategory.VERB, None, {C_IT})
    LexiconLoader._infer_morphemic_attributes(item)
    assert_that(item, equal_to(Lexeme(u'ürkmek', u'ürk', SyntacticCategory.VERB, None, {C_IT, AA, NVO})))

    item = Lexeme(u'ağlamak', u'ağla', SyntacticCategory.VERB, None, None)
    LexiconLoader._infer_morphemic_attributes(item)
    assert_that(item, equal_to(Lexeme(u'ağlamak', u'ağla', SyntacticCategory.VERB, None, {PVD, PI, AI, C_T, NVO})))
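# What the assertions above pin down for _infer_morphemic_attributes:
# vowel-final verb roots (ata-, dola-, tanı-, ağla-) gain
# ProgressiveVowelDrop, Passive_In, Aorist_I and Causative_t; the
# consonant-final roots here (git-, at-, ürk-) gain Aorist_A; and
# NoVoicing is added whenever Voicing was not given explicitly.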