def setUpClass(cls):
    """Build one StatisticalParser for the whole suite: a contextless parser
    over the master dictionary, backed by a word-concordance index of the
    gold-standard parse set 001."""
    super(StatisticalParserTest, cls).setUpClass()

    # Load the master dictionary and expand every lexeme into its roots.
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   '../../resources/master_dictionary.txt')
    lexemes = LexiconLoader.load_from_file(dictionary_path)
    roots = []
    for lexeme in lexemes:
        roots.extend(RootGenerator.generate(lexeme))
    cls.root_map = RootMapGenerator().generate(roots)

    # Suffix graph layered as: basic -> numeral -> copula.
    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(BasicSuffixGraph()))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    root_finders = [
        WordRootFinder(cls.root_map),
        DigitNumeralRootFinder(),
        TextNumeralRootFinder(cls.root_map),
        ProperNounFromApostropheRootFinder(),
        ProperNounWithoutApostropheRootFinder(),
    ]
    contextless_parser = ContextlessMorphologicalParser(
        suffix_graph, predefined_paths, root_finders)

    # Parse the gold-standard parse set and flatten it into a word list.
    parseset_index = "001"
    parseset_path = os.path.join(
        os.path.dirname(__file__),
        '../../testresources/parsesets/parseset{}.xml'.format(parseset_index))
    dom = parse(parseset_path)
    parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])
    parse_set_words = []
    for sentence in parseset.sentences:
        parse_set_words.extend(sentence.words)

    concordance_index = CompleteWordConcordanceIndex(parse_set_words)
    cls.parser = StatisticalParser(contextless_parser, concordance_index)
def _validate_complete_word_concordance_indexes(self, word_list):
    """Validate the concordance index against its source word list.

    For every key combination stored in the index — surface form alone,
    form + syntactic category, and form + syntactic + secondary syntactic
    category — every returned offset must point at a word in *word_list*
    carrying exactly those attributes.

    NOTE(review): an identical method definition appears later in this
    class and shadows this one at class-creation time.
    """
    index = CompleteWordConcordanceIndex(word_list)
    key_tree = index._offsets._indices

    # Pass 1: lookups by surface form only.
    for surface in key_tree.iterkeys():
        hits = [word_list[o] for o in index.offsets(surface)]
        assert_that(all([w.str == surface for w in hits]))

    # Pass 2: lookups by surface form + syntactic category.
    for surface in key_tree.iterkeys():
        for syn_cat in key_tree[surface].iterkeys():
            hits = [word_list[o] for o in index.offsets(surface, syn_cat)]
            assert_that(all([
                w.str == surface and w.syntactic_category == syn_cat
                for w in hits
            ]))

    # Pass 3: lookups by surface form + both category levels.
    for surface in key_tree.iterkeys():
        for syn_cat in key_tree[surface].iterkeys():
            for sec_cat in key_tree[surface][syn_cat].iterkeys():
                hits = [
                    word_list[o]
                    for o in index.offsets(surface, syn_cat, sec_cat)
                ]
                assert_that(all([
                    w.str == surface
                    and w.syntactic_category == syn_cat
                    and w.secondary_syntactic_category == sec_cat
                    for w in hits
                ]))
def _validate_complete_word_concordance_indexes(self, word_list):
    """Check index consistency: offsets returned for any stored key
    combination must reference words whose attributes match that key.

    Three sweeps over the index's internal key tree, one per lookup arity:
    (form), (form, syntactic_category), (form, syntactic_category,
    secondary_syntactic_category).
    """
    concordance = CompleteWordConcordanceIndex(word_list)
    indices = concordance._offsets._indices

    # Arity 1: surface form only.
    for form in indices.iterkeys():
        words_at_offsets = [word_list[i] for i in concordance.offsets(form)]
        assert_that(all([entry.str == form for entry in words_at_offsets]))

    # Arity 2: surface form + syntactic category.
    for form in indices.iterkeys():
        for category in indices[form].iterkeys():
            words_at_offsets = [
                word_list[i] for i in concordance.offsets(form, category)
            ]
            assert_that(all([
                entry.str == form and entry.syntactic_category == category
                for entry in words_at_offsets
            ]))

    # Arity 3: surface form + syntactic + secondary syntactic category.
    for form in indices.iterkeys():
        for category in indices[form].iterkeys():
            for secondary in indices[form][category].iterkeys():
                words_at_offsets = [
                    word_list[i]
                    for i in concordance.offsets(form, category, secondary)
                ]
                assert_that(all([
                    entry.str == form
                    and entry.syntactic_category == category
                    and entry.secondary_syntactic_category == secondary
                    for entry in words_at_offsets
                ]))
def test_should_find_complete_word_concordance(self):
    """Concordance lookups by surface form, optionally narrowed by
    syntactic and secondary syntactic category, return the expected
    offsets into self.word_list."""
    idx = CompleteWordConcordanceIndex(self.word_list)

    # Each row: (lookup arguments, expected offsets). Rows are checked in
    # the order listed, mirroring the original one-assert-per-line layout.
    expectations = [
        ((u'something',), []),
        ((u"o",), [0, 1, 2]),
        ((u"o", SyntacticCategory.PRONOUN), [0, 1]),
        ((u"o", SyntacticCategory.DETERMINER), [2]),
        ((u"o", SyntacticCategory.PRONOUN,
          SecondarySyntacticCategory.PERSONAL), [0]),
        ((u"o", SyntacticCategory.PRONOUN,
          SecondarySyntacticCategory.DEMONSTRATIVE), [1]),
        ((u"onu", SyntacticCategory.PRONOUN,
          SecondarySyntacticCategory.PERSONAL), [3]),
        ((u"onu", SyntacticCategory.PRONOUN,
          SecondarySyntacticCategory.DEMONSTRATIVE), [4]),
        ((u"gittim",), [6]),
        ((u"gittim", SyntacticCategory.VERB), [6]),
        ((u"giderim",), [7]),
        ((u"giderim", SyntacticCategory.VERB), [7]),
        ((u"gidecekler",), [8, 10]),
        ((u"gidecekler", SyntacticCategory.VERB), [8]),
        ((u"gidecekler", SyntacticCategory.NOUN), [10]),
        ((u"gideceğim",), [9, 11]),
        ((u"gideceğim", SyntacticCategory.VERB), [9]),
        ((u"gideceğim", SyntacticCategory.NOUN), [11]),
    ]
    for lookup_args, expected_offsets in expectations:
        assert_that(idx.offsets(*lookup_args), equal_to(expected_offsets))
def test_should_find_complete_word_concordance(self):
    """Offsets from the concordance index match the known positions of
    each surface form (and category refinement) in self.word_list."""
    idx = CompleteWordConcordanceIndex(self.word_list)
    offsets = idx.offsets  # bind once; every check goes through this

    # Unknown word: no offsets at all.
    assert_that(offsets(u'something'), equal_to([]))

    # "o" occurs three times, split across two syntactic categories and,
    # within PRONOUN, two secondary categories.
    assert_that(offsets(u"o"), equal_to([0, 1, 2]))
    assert_that(offsets(u"o", SyntacticCategory.PRONOUN), equal_to([0, 1]))
    assert_that(offsets(u"o", SyntacticCategory.DETERMINER), equal_to([2]))
    assert_that(
        offsets(u"o", SyntacticCategory.PRONOUN,
                SecondarySyntacticCategory.PERSONAL), equal_to([0]))
    assert_that(
        offsets(u"o", SyntacticCategory.PRONOUN,
                SecondarySyntacticCategory.DEMONSTRATIVE), equal_to([1]))

    # "onu": one personal and one demonstrative pronoun occurrence.
    assert_that(
        offsets(u"onu", SyntacticCategory.PRONOUN,
                SecondarySyntacticCategory.PERSONAL), equal_to([3]))
    assert_that(
        offsets(u"onu", SyntacticCategory.PRONOUN,
                SecondarySyntacticCategory.DEMONSTRATIVE), equal_to([4]))

    # Single-occurrence verbs.
    assert_that(offsets(u"gittim"), equal_to([6]))
    assert_that(offsets(u"gittim", SyntacticCategory.VERB), equal_to([6]))
    assert_that(offsets(u"giderim"), equal_to([7]))
    assert_that(offsets(u"giderim", SyntacticCategory.VERB), equal_to([7]))

    # Forms that occur both as VERB and as NOUN.
    assert_that(offsets(u"gidecekler"), equal_to([8, 10]))
    assert_that(offsets(u"gidecekler", SyntacticCategory.VERB), equal_to([8]))
    assert_that(offsets(u"gidecekler", SyntacticCategory.NOUN), equal_to([10]))
    assert_that(offsets(u"gideceğim"), equal_to([9, 11]))
    assert_that(offsets(u"gideceğim", SyntacticCategory.VERB), equal_to([9]))
    assert_that(offsets(u"gideceğim", SyntacticCategory.NOUN), equal_to([11]))