def parse(self, input):
    """Parse *input*, additionally trying a lowercased-first-letter variant.

    Delegates to the contextless parser. If the word starts with an
    uppercase letter (e.g. sentence-initial word or proper noun), the
    results for the variant whose first letter is lowercased via
    ``TurkishAlphabet.lower`` (which handles Turkish dotted/dotless I)
    are appended as well.

    @param input: surface form to parse
    @return: list of parse results (possibly empty)
    """
    parse_results = super(UpperCaseSupportingContextlessMorphologicalParser, self).parse(input)
    # Guard against empty input: the previous code indexed input[0]
    # unconditionally, which raised IndexError for "".
    if input and input[0].isupper():
        lowered_variant = TurkishAlphabet.lower(input[0]) + input[1:]
        parse_results += super(UpperCaseSupportingContextlessMorphologicalParser, self).parse(lowered_variant)
    return parse_results
def save_parse_result_for_word(self, word_id, parse_result_uuid):
    """Persist the chosen parse result for a word.

    Looks the parse result up in the session, verifies it belongs to the
    word (exact or lowercased surface match), and stores it via the db
    manager.

    @type word_id: ObjectId
    @type parse_result_uuid: str or unicode
    @raise Exception: if no word exists for word_id
    """
    parse_result = self.sessionmanager.get_parse_result(parse_result_uuid)
    # BUG FIX: the message previously formatted parse_result (None at this
    # point), producing a useless message; format the uuid that was looked up.
    assert parse_result, "No parse result found with id {}".format(parse_result_uuid)
    word = self.dbmanager.get_word(word_id)
    if not word:
        raise Exception("Word not found for setting the correct parse result! {}".format(word_id))
    # check if the parse result belongs to the given word
    assert word['surface'] == parse_result.get_surface() or \
           TurkishAlphabet.lower(word['surface']) == parse_result.get_surface()
    self.dbmanager.set_parse_result_for_word(
        word,
        formatter.format_morpheme_container_for_parseset(parse_result),
        parse_result)
def create_word_binding_from_morpheme_container(self, word_str, morpheme_container):
    """Build a WordBinding (root binding + suffix chain) from a morpheme container.

    @param word_str: surface of the word as observed in the corpus
    @param morpheme_container: parse result whose accumulated surface must
        equal word_str, or word_str with only its first letter lowercased
    @return: WordBinding with one suffix binding per non-free transition
    """
    assert (word_str == morpheme_container.get_surface_so_far()) or \
           (TurkishAlphabet.lower(word_str[0]) + word_str[1:] == morpheme_container.get_surface_so_far())

    # Hoist the repeatedly-navigated attribute chains into locals.
    lexeme = morpheme_container.get_root().lexeme
    root_str = morpheme_container.get_root().str

    root = RootBinding(root_str, lexeme.lemma, lexeme.root,
                       lexeme.syntactic_category, lexeme.secondary_syntactic_category)

    parse_result = formatter.format_morpheme_container_for_parseset(morpheme_container)
    word = WordBinding(word_str, parse_result, root,
                       morpheme_container.get_surface_syntactic_category(),
                       morpheme_container.get_surface_secondary_syntactic_category())

    if morpheme_container.get_transitions():
        so_far = root_str
        for transition in morpheme_container.get_transitions():
            # Free transitions add no visible suffix; skip them.
            if isinstance(transition.suffix_form_application.suffix_form.suffix, FreeTransitionSuffix):
                continue

            application = transition.suffix_form_application
            suffix_name = application.suffix_form.suffix.name
            suffix_pretty_name = application.suffix_form.suffix.pretty_name
            suffix_form = application.suffix_form.form
            suffix_application = application.fitting_suffix_form
            suffix_actual_application = application.actual_suffix_form

            # When the applied suffix is still contained in the root surface,
            # build the intermediate word from the lexeme root instead of the
            # accumulated surface. (Removed a dead `= None` initialization:
            # both branches always assign the value.)
            if (so_far + suffix_actual_application) == root_str:
                word_with_suffix_application = lexeme.root + suffix_application
            else:
                word_with_suffix_application = so_far + suffix_application

            so_far += suffix_actual_application

            if transition.is_derivational():
                suffix = DerivationalSuffixBinding(
                    suffix_name, suffix_pretty_name, suffix_form, suffix_application,
                    suffix_actual_application, word_with_suffix_application, so_far,
                    transition.to_state.syntactic_category)
            else:
                suffix = InflectionalSuffixBinding(
                    suffix_name, suffix_pretty_name, suffix_form, suffix_application,
                    suffix_actual_application, word_with_suffix_application, so_far,
                    transition.to_state.syntactic_category)
            word.suffixes.append(suffix)

    return word
def save_parse_result_for_word(self, word_id, parse_result_uuid):
    """Persist the chosen parse result for a word.

    Fetches the parse result from the session, checks it matches the
    word's surface (directly or after lowercasing), then stores it.

    @type word_id: ObjectId
    @type parse_result_uuid: str or unicode
    @raise Exception: if no word exists for word_id
    """
    parse_result = self.sessionmanager.get_parse_result(parse_result_uuid)
    # BUG FIX: message used to interpolate parse_result (falsy here) rather
    # than the uuid that failed to resolve.
    assert parse_result, "No parse result found with id {}".format(parse_result_uuid)
    word = self.dbmanager.get_word(word_id)
    if not word:
        raise Exception(
            "Word not found for setting the correct parse result! {}".format(word_id))
    # check if the parse result belongs to the given word
    assert word['surface'] == parse_result.get_surface() or \
           TurkishAlphabet.lower(word['surface']) == parse_result.get_surface()
    self.dbmanager.set_parse_result_for_word(
        word,
        formatter.format_morpheme_container_for_parseset(parse_result),
        parse_result)
def _test_should_parse_simple_parse_set(self, set_number, start_index=0):
    """Run the parser over one simple-parse-set fixture and check each line.

    Lines have the form ``word=expected_parse``; ``#`` lines are comments.
    In STATS_MODE, failures are counted and a success rate is logged
    instead of failing the test.

    @param set_number: number of the simpleparseset{N}.txt file to load
    @param start_index: 0-based line index to start from (earlier lines skipped)
    """
    path = os.path.join(
        os.path.dirname(__file__),
        '../../../../testresources/simpleparsesets/simpleparseset{}.txt'.format(set_number))
    logger.info("Parsing simple parse set {}".format(path))

    # Normalizations applied to the expected parse result before comparison.
    # Order matters: 'Postp+PCNom' must be rewritten before the other PC cases.
    # TODO: the Prog/Inf/WithoutHavingDoneSo and Hastily normalizations are
    # stop-gaps inherited from the original code.
    replacements = [
        ('Prog1', 'Prog'), ('Prog2', 'Prog'),
        ('Inf1', 'Inf'), ('Inf2', 'Inf'), ('Inf3', 'Inf'),
        ('WithoutHavingDoneSo1', 'WithoutHavingDoneSo'),
        ('WithoutHavingDoneSo2', 'WithoutHavingDoneSo'),
        ('Hastily', 'Hastily+Pos'),
        ('Postp+PCNom', 'Part'),
        ('Postp+PCDat', 'Postp'), ('Postp+PCAcc', 'Postp'),
        ('Postp+PCLoc', 'Postp'), ('Postp+PCAbl', 'Postp'),
        ('Postp+PCIns', 'Postp'), ('Postp+PCGen', 'Postp'),
    ]

    skipped = 0
    unparsable = 0
    comment = 0
    with codecs.open(path, 'r', 'utf-8-sig') as parse_set_file:
        index = 0
        for line in parse_set_file:
            if start_index > index:
                index += 1
                continue
            if line.startswith('#'):
                comment += 1
                index += 1
                continue
            line = line.strip()
            (word, parse_result) = line.split('=')
            if any(case_to_skip in parse_result for case_to_skip in cases_to_skip) \
                    or word in words_to_skip:
                if self.LOG_SKIPPED:
                    logger.info(u'Skipped : {} {} {}'.format(index, word, parse_result))
                skipped += 1
                index += 1
                continue
            for old, new in replacements:
                parse_result = parse_result.replace(old, new)
            if self.STATS_MODE:
                try:
                    self.assert_parse_correct(word, index, parse_result)
                except Exception:
                    unparsable += 1
                    logger.info(u'Unparsable : {} {} {}'.format(index, word, parse_result))
            else:
                self.assert_parse_correct(TurkishAlphabet.lower(word), index, parse_result)
            index += 1

    if self.STATS_MODE:
        logger.info("Finished simple parse set {}".format(path))
        logger.info("Found {} lines, with {} lines of comments".format(index, comment))
        logger.info("Skipped {}, unparsable {}".format(skipped, unparsable))
        parsable = index - comment
        logger.info("Words that should be parsable : {}".format(parsable))
        # Guard: a file consisting solely of comments previously raised
        # ZeroDivisionError here.
        if parsable:
            logger.info("Parse success rate : {}".format(
                float(parsable - skipped - unparsable) / float(parsable)))
def create_word_binding_from_morpheme_container(self, word_str, morpheme_container):
    """Create a WordBinding for word_str from its parsed morpheme container.

    The container's accumulated surface must match word_str exactly, or
    match it after lowercasing only the first letter.

    @param word_str: observed surface of the word
    @param morpheme_container: parse result to convert
    @return: WordBinding carrying the root binding and suffix bindings
    """
    assert (word_str == morpheme_container.get_surface_so_far()) or (
        TurkishAlphabet.lower(word_str[0]) + word_str[1:]
        == morpheme_container.get_surface_so_far())

    # Cache the deep attribute chains that were re-navigated repeatedly.
    lexeme = morpheme_container.get_root().lexeme
    root_str = morpheme_container.get_root().str

    root = RootBinding(root_str, lexeme.lemma, lexeme.root,
                       lexeme.syntactic_category,
                       lexeme.secondary_syntactic_category)

    parse_result = formatter.format_morpheme_container_for_parseset(morpheme_container)
    word = WordBinding(word_str, parse_result, root,
                       morpheme_container.get_surface_syntactic_category(),
                       morpheme_container.get_surface_secondary_syntactic_category())

    if morpheme_container.get_transitions():
        so_far = root_str
        for transition in morpheme_container.get_transitions():
            # Free transitions contribute no visible suffix.
            if isinstance(transition.suffix_form_application.suffix_form.suffix,
                          FreeTransitionSuffix):
                continue

            application = transition.suffix_form_application
            suffix_name = application.suffix_form.suffix.name
            suffix_pretty_name = application.suffix_form.suffix.pretty_name
            suffix_form = application.suffix_form.form
            suffix_application = application.fitting_suffix_form
            suffix_actual_application = application.actual_suffix_form

            # Fused-into-root case: build the intermediate word from the
            # lexeme root rather than the accumulated surface. (Dropped the
            # redundant `= None` pre-assignment — both branches assign.)
            if (so_far + suffix_actual_application) == root_str:
                word_with_suffix_application = lexeme.root + suffix_application
            else:
                word_with_suffix_application = so_far + suffix_application

            so_far += suffix_actual_application

            if transition.is_derivational():
                suffix = DerivationalSuffixBinding(
                    suffix_name, suffix_pretty_name, suffix_form,
                    suffix_application, suffix_actual_application,
                    word_with_suffix_application, so_far,
                    transition.to_state.syntactic_category)
            else:
                suffix = InflectionalSuffixBinding(
                    suffix_name, suffix_pretty_name, suffix_form,
                    suffix_application, suffix_actual_application,
                    word_with_suffix_application, so_far,
                    transition.to_state.syntactic_category)
            word.suffixes.append(suffix)

    return word
def parse(self, input):
    """Parse *input*; for capitalized words also parse the decapitalized form.

    Results from the contextless superclass parser are returned; when the
    first character is uppercase, the parses of the word with its first
    letter lowercased through ``TurkishAlphabet`` (Turkish-aware I/i
    handling) are concatenated onto them.

    @param input: surface form to parse
    @return: list of parse results (possibly empty)
    """
    parse_results = super(UpperCaseSupportingContextlessMorphologicalParser,
                          self).parse(input)
    # Robustness fix: previously input[0] was read without checking for an
    # empty string, raising IndexError on "".
    if input and input[0].isupper():
        decapitalized = TurkishAlphabet.lower(input[0]) + input[1:]
        parse_results += super(UpperCaseSupportingContextlessMorphologicalParser,
                               self).parse(decapitalized)
    return parse_results
def _test_should_parse_simple_parse_set(self, set_number, start_index=0):
    """Parse every ``word=expected`` line of a simple-parse-set fixture.

    ``#`` lines are comments; lines matching the module-level skip lists
    are counted but not asserted. In STATS_MODE parse failures are logged
    and tallied instead of raising, and summary statistics are emitted.

    @param set_number: number of the simpleparseset{N}.txt fixture
    @param start_index: 0-based index of the first line to process
    """
    path = os.path.join(
        os.path.dirname(__file__),
        "../../../../testresources/simpleparsesets/simpleparseset{}.txt".format(set_number),
    )
    logger.info("Parsing simple parse set {}".format(path))

    # Ordered expected-value normalizations; "Postp+PCNom" must precede the
    # remaining Postp+PC* rewrites.
    # TODO: these rewrites (Prog/Inf/WithoutHavingDoneSo/Hastily) are
    # inherited stop-gaps.
    normalizations = (
        ("Prog1", "Prog"), ("Prog2", "Prog"),
        ("Inf1", "Inf"), ("Inf2", "Inf"), ("Inf3", "Inf"),
        ("WithoutHavingDoneSo1", "WithoutHavingDoneSo"),
        ("WithoutHavingDoneSo2", "WithoutHavingDoneSo"),
        ("Hastily", "Hastily+Pos"),
        ("Postp+PCNom", "Part"),
        ("Postp+PCDat", "Postp"), ("Postp+PCAcc", "Postp"),
        ("Postp+PCLoc", "Postp"), ("Postp+PCAbl", "Postp"),
        ("Postp+PCIns", "Postp"), ("Postp+PCGen", "Postp"),
    )

    skipped = 0
    unparsable = 0
    comment = 0
    with codecs.open(path, "r", "utf-8-sig") as parse_set_file:
        index = 0
        for line in parse_set_file:
            if start_index > index:
                index += 1
                continue
            if line.startswith("#"):
                comment += 1
                index += 1
                continue
            line = line.strip()
            (word, parse_result) = line.split("=")
            if any(case_to_skip in parse_result for case_to_skip in cases_to_skip) \
                    or word in words_to_skip:
                if self.LOG_SKIPPED:
                    logger.info("Skipped : {} {} {}".format(index, word, parse_result))
                skipped += 1
                index += 1
                continue
            for old, new in normalizations:
                parse_result = parse_result.replace(old, new)
            if self.STATS_MODE:
                try:
                    self.assert_parse_correct(word, index, parse_result)
                except Exception:
                    unparsable += 1
                    logger.info("Unparsable : {} {} {}".format(index, word, parse_result))
            else:
                self.assert_parse_correct(TurkishAlphabet.lower(word), index, parse_result)
            index += 1

    if self.STATS_MODE:
        logger.info("Finished simple parse set {}".format(path))
        logger.info("Found {} lines, with {} lines of comments".format(index, comment))
        logger.info("Skipped {}, unparsable {}".format(skipped, unparsable))
        parsable = index - comment
        logger.info("Words that should be parsable : {}".format(parsable))
        # Avoid ZeroDivisionError for a fixture containing only comments.
        if parsable:
            logger.info(
                "Parse success rate : {}".format(
                    float(parsable - skipped - unparsable) / float(parsable)))