def create(cls, master_dictionary_path, ngram_collection_map):
    """
    Assemble a ContextfulMorphologicalParser from a master dictionary file and
    a set of n-gram collections.

    @type master_dictionary_path: str or unicode
    @param master_dictionary_path: path handed to LexiconLoader.load_from_file
    @param ngram_collection_map: n-gram collections, handed unchanged to
        DatabaseIndexBuilder and InMemoryCachingTargetFormGivenContextCounter
        (historically documented here as list<Collection>)
    @rtype: ContextfulMorphologicalParser
    """
    # --- contextless part: roots, suffix graph, predefined paths -----------
    roots = [
        root
        for lexeme in LexiconLoader.load_from_file(master_dictionary_path)
        for root in RootGenerator.generate(lexeme)
    ]
    root_map = RootMapGenerator().generate(roots)

    graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    graph.initialize()

    paths = PredefinedPaths(root_map, graph)
    paths.create_predefined_paths()

    root_finders = [
        WordRootFinder(root_map),
        DigitNumeralRootFinder(),
        TextNumeralRootFinder(root_map),
        ProperNounFromApostropheRootFinder(),
        ProperNounWithoutApostropheRootFinder(),
    ]
    contextless_parser = UpperCaseSupportingContextlessMorphologicalParser(graph, paths, root_finders)

    # --- contextful part: likelihood calculators over the n-gram data ------
    index_builder = DatabaseIndexBuilder(ngram_collection_map)
    context_counter = InMemoryCachingTargetFormGivenContextCounter(ngram_collection_map)
    frequency_smoother = CachedSimpleGoodTuringNGramFrequencySmoother()

    # The sequence calculator and the contextful calculator refer to each
    # other, so the former starts out with None and is patched at the end.
    sequence_calculator = SequenceLikelihoodCalculator(None)

    collocation_calculator = InterpolatingLikelihoodCalculator(
        ContextParsingLikelihoodCalculator(
            index_builder, context_counter, frequency_smoother, sequence_calculator))
    distribution_calculator = ContextlessDistributionCalculator(
        index_builder, context_counter, CachedContextlessDistributionSmoother())

    contextful_calculator = ContextfulLikelihoodCalculator(collocation_calculator, distribution_calculator)
    sequence_calculator._contextful_likelihood_calculator = contextful_calculator

    return ContextfulMorphologicalParser(contextless_parser, contextful_calculator)
def create_calculator(cls, parseset_index):
    """
    Build a ContextfulLikelihoodCalculator backed by the n-gram collections of
    the 'trnltk' database on the MongoDB server at 127.0.0.1.

    Side effects: stores the generated root map on C{cls.root_map} and the
    contextless parser on C{cls.contextless_parser}.

    @param parseset_index: value formatted into the collection names
        'wordUnigrams{}', 'wordBigrams{}' and 'wordTrigrams{}'
    @return: the fully wired ContextfulLikelihoodCalculator
    """
    dictionary_path = os.path.join(os.path.dirname(__file__), '../../../../resources/master_dictionary.txt')
    all_roots = [
        root
        for lexeme in LexiconLoader.load_from_file(dictionary_path)
        for root in RootGenerator.generate(lexeme)
    ]
    cls.root_map = RootMapGenerator().generate(all_roots)

    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    root_finders = [
        WordRootFinder(cls.root_map),
        DigitNumeralRootFinder(),
        TextNumeralRootFinder(cls.root_map),
        ProperNounFromApostropheRootFinder(),
        ProperNounWithoutApostropheRootFinder(),
    ]
    cls.contextless_parser = UpperCaseSupportingContextlessMorphologicalParser(
        suffix_graph, predefined_paths, root_finders)

    # pymongo.Connection was deprecated in PyMongo 2.4 and removed in 3.0.
    # Prefer MongoClient when the installed driver has it; fall back to
    # Connection only on drivers that predate MongoClient.
    client_factory = getattr(pymongo, 'MongoClient', None) or pymongo.Connection
    mongodb_connection = client_factory(host='127.0.0.1')
    database = mongodb_connection['trnltk']

    # Keys are the n-gram orders (1 = unigram, 2 = bigram, 3 = trigram).
    collection_map = {
        1: database['wordUnigrams{}'.format(parseset_index)],
        2: database['wordBigrams{}'.format(parseset_index)],
        3: database['wordTrigrams{}'.format(parseset_index)]
    }

    database_index_builder = DatabaseIndexBuilder(collection_map)
    target_form_given_context_counter = InMemoryCachingTargetFormGivenContextCounter(collection_map)
    ngram_frequency_smoother = CachedSimpleGoodTuringNGramFrequencySmoother()

    # The sequence calculator and the contextful calculator refer to each
    # other, so the former is created with None and patched below.
    sequence_likelihood_calculator = SequenceLikelihoodCalculator(None)

    collocation_metric_calculator = ContextParsingLikelihoodCalculator(
        database_index_builder, target_form_given_context_counter,
        ngram_frequency_smoother, sequence_likelihood_calculator)
    interpolating_collocation_metric_calculator = InterpolatingLikelihoodCalculator(
        collocation_metric_calculator)

    # NOTE(review): the `create` factory visible alongside this method passes a
    # third (smoother) argument to ContextlessDistributionCalculator -- confirm
    # that the two-argument call is still valid.
    contextless_distribution_metric_calculator = ContextlessDistributionCalculator(
        database_index_builder, target_form_given_context_counter)

    contextful_likelihood_calculator = ContextfulLikelihoodCalculator(
        interpolating_collocation_metric_calculator, contextless_distribution_metric_calculator)
    sequence_likelihood_calculator._contextful_likelihood_calculator = contextful_likelihood_calculator

    return contextful_likelihood_calculator
def create_calculator(cls, parseset_index):
    """
    Wire up and return a ContextfulLikelihoodCalculator that reads its n-gram
    statistics from the 'trnltk' database of the MongoDB server at 127.0.0.1.

    As a side effect the generated root map is stored on C{cls.root_map} and
    the contextless parser on C{cls.contextless_parser}.

    @param parseset_index: value formatted into the collection names
        'wordUnigrams{}', 'wordBigrams{}' and 'wordTrigrams{}'
    @return: the ContextfulLikelihoodCalculator
    """
    lexemes = LexiconLoader.load_from_file(
        os.path.join(os.path.dirname(__file__),
                     '../../../../resources/master_dictionary.txt'))
    all_roots = [root for di in lexemes for root in RootGenerator.generate(di)]

    root_map_generator = RootMapGenerator()
    cls.root_map = root_map_generator.generate(all_roots)

    suffix_graph = CopulaSuffixGraph(
        NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    word_root_finder = WordRootFinder(cls.root_map)
    digit_numeral_root_finder = DigitNumeralRootFinder()
    text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
    proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
    proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

    cls.contextless_parser = UpperCaseSupportingContextlessMorphologicalParser(
        suffix_graph, predefined_paths, [
            word_root_finder,
            digit_numeral_root_finder,
            text_numeral_root_finder,
            proper_noun_from_apostrophe_root_finder,
            proper_noun_without_apostrophe_root_finder
        ])

    # pymongo.Connection no longer exists as of PyMongo 3.0 (deprecated since
    # 2.4). Use MongoClient when the driver provides it and keep Connection
    # only as a fallback for older drivers.
    if hasattr(pymongo, 'MongoClient'):
        mongodb_connection = pymongo.MongoClient(host='127.0.0.1')
    else:
        mongodb_connection = pymongo.Connection(host='127.0.0.1')

    # One collection per n-gram order: 1 = unigrams, 2 = bigrams, 3 = trigrams.
    collection_map = {
        1: mongodb_connection['trnltk']['wordUnigrams{}'.format(parseset_index)],
        2: mongodb_connection['trnltk']['wordBigrams{}'.format(parseset_index)],
        3: mongodb_connection['trnltk']['wordTrigrams{}'.format(parseset_index)]
    }

    database_index_builder = DatabaseIndexBuilder(collection_map)
    target_form_given_context_counter = InMemoryCachingTargetFormGivenContextCounter(
        collection_map)
    ngram_frequency_smoother = CachedSimpleGoodTuringNGramFrequencySmoother()

    # Created with None because the contextful calculator it needs is built
    # from objects that themselves depend on this instance; patched below.
    sequence_likelihood_calculator = SequenceLikelihoodCalculator(None)

    collocation_metric_calculator = ContextParsingLikelihoodCalculator(
        database_index_builder, target_form_given_context_counter,
        ngram_frequency_smoother, sequence_likelihood_calculator)
    interpolating_collocation_metric_calculator = InterpolatingLikelihoodCalculator(
        collocation_metric_calculator)

    # NOTE(review): ContextlessDistributionCalculator is called with two
    # arguments here; verify against its constructor whether a smoother
    # argument is also expected.
    contextless_distribution_metric_calculator = ContextlessDistributionCalculator(
        database_index_builder, target_form_given_context_counter)

    contextful_likelihood_calculator = ContextfulLikelihoodCalculator(
        interpolating_collocation_metric_calculator,
        contextless_distribution_metric_calculator)

    # Close the circular reference opened above.
    sequence_likelihood_calculator._contextful_likelihood_calculator = contextful_likelihood_calculator

    return contextful_likelihood_calculator