def find_roots_for_partial_input(self, partial_input, whole_surface=None):
    """
    Builds brute-force NOUN root candidates for a prefix of a surface word.

    @type partial_input: unicode
    @type whole_surface: unicode
    @rtype: list of Root
    """
    assert partial_input and whole_surface
    assert len(partial_input) <= len(whole_surface)
    assert whole_surface.startswith(partial_input)
    if len(whole_surface) == len(partial_input):
        assert whole_surface == partial_input

    # Build a dynamic NOUN lexeme whose lemma and root are the partial input itself.
    root = partial_input
    lemma = root
    lemma_root = lemma
    syntactic_category = SyntacticCategory.NOUN
    secondary_syntactic_category = None
    lexeme_attributes = set()
    lexeme = DynamicLexeme(lemma, lemma_root, syntactic_category, secondary_syntactic_category, lexeme_attributes)

    phonetic_expectations = set()
    phonetic_attributes = Phonetics.calculate_phonetic_attributes(partial_input, lexeme_attributes)

    # Candidate root with no orthographic change applied.
    no_orthographics_root = DynamicRoot(root, lexeme, phonetic_expectations, phonetic_attributes)

    # A one-char root cannot be the root of a strictly longer surface.
    if len(partial_input) < 2 <= len(whole_surface):
        return []

    if whole_surface == partial_input or len(partial_input) < 2:
        return [no_orthographics_root]

    last_vowel = Phonetics.get_last_vowel(partial_input)
    if not last_vowel:
        # no vowel at all: no orthographic-change candidates are attempted
        return [no_orthographics_root]

    last_char = partial_input[-1]
    first_char_after_partial_input = whole_surface[len(partial_input)]

    # Uppercase at the boundary: skip orthographic-change candidates.
    if last_char.isupper() or first_char_after_partial_input.isupper():
        return [no_orthographics_root]

    # Candidates assuming consonant voicing and/or doubling happened at the boundary.
    roots = self._get_voicing_and_doubling_roots(partial_input, last_char, first_char_after_partial_input, no_orthographics_root)

    first_vowel_letter_after_partial_input = self._get_first_vowel(whole_surface[len(partial_input) - 1:])
    if first_vowel_letter_after_partial_input:
        if last_vowel.frontal != first_vowel_letter_after_partial_input.frontal:
            # The next vowel breaks frontal harmony with the root's last vowel,
            # so mark every candidate lexeme with InverseHarmony.
            for r in roots:
                r.lexeme.attributes = set(r.lexeme.attributes)
                r.lexeme.attributes.add(LexemeAttribute.InverseHarmony)

    # Recompute phonetic attributes now that lexeme attributes may have changed.
    for r in roots:
        phonetic_attributes = Phonetics.calculate_phonetic_attributes(r.str, r.lexeme.attributes)
        r.phonetic_attributes = phonetic_attributes

    return roots
def __init__(self, abbr):
    """Creates a dynamic NOUN/ABBREVIATION root for the given abbreviation string."""
    lexeme = DynamicLexeme(abbr, abbr, SyntacticCategory.NOUN,
        SecondarySyntacticCategory.ABBREVIATION, None)

    final_letter = TurkishAlphabet.get_letter_for_char(abbr[-1])
    if final_letter.vowel:
        attribute_sequence = abbr
    else:
        # consonant-final: compute attributes as if followed by an 'E' sound
        attribute_sequence = abbr + u"E"
    phonetic_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(attribute_sequence)

    super(AbbreviationRoot, self).__init__(abbr, lexeme, None, phonetic_attributes)
def transition_allowed_for_suffix_form(morpheme_container, suffix_form):
    """
    Checks whether the given suffix form may be applied to the morpheme container.

    Returns False when the form's precondition is not satisfied, when the
    container's phonetic expectations are not satisfied by the form, or when
    the form is not phonetically applicable to the surface built so far.
    """
    if suffix_form.precondition and not suffix_form.precondition.is_satisfied_by(morpheme_container):
        if logger.isEnabledFor(logging.DEBUG):
            # Bug fix: precondition and form arguments were swapped relative to
            # the format string ("Precondition %s of suffix form %s").
            logger.debug(' Precondition "%s" of suffix form "%s" is not satisfied with transitions %s, skipping.',
                suffix_form.precondition, suffix_form.form, morpheme_container)
        return False

    if suffix_form.form and not Phonetics.expectations_satisfied(morpheme_container.get_phonetic_expectations(), suffix_form.form):
        # Guarded like the first check for consistency (lazy %-args are cheap,
        # but the guard avoids building the expectations repr when disabled).
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(' Suffix form "%s" does not satisfy phonetic expectations %s, skipping.',
                suffix_form.form, morpheme_container.get_phonetic_expectations())
        return False

    if not Phonetics.is_suffix_form_applicable(morpheme_container.get_surface_so_far(), suffix_form.form):
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(' Suffix form "%s" is not phonetically applicable to "%s", skipping.',
                suffix_form.form, morpheme_container.get_surface_so_far())
        return False

    return True
def _get_possible_passive_roots(self, last_letter, partial_input, whole_surface, no_attr_root):
    # Generates candidate verb roots assuming the remainder of the surface is a
    # passive suffix (-Il, -In, -InIl). Each candidate gets exactly one passive
    # lexeme attribute; the surface evidence decides which ones are possible.

    # -Il after a consonant (il/ıl/ul/ül), plain -l after a vowel
    might_have_Passive_Il = (not last_letter.vowel and any([whole_surface.startswith(partial_input+s) for s in [u'il', u'ıl', u'ul', u'ül']])) or\
                            (last_letter.vowel and whole_surface.startswith(partial_input + u'l'))

    # -In after a consonant (in/ın/un/ün), plain -n after a vowel
    might_have_Passive_In = (not last_letter.vowel and any([whole_surface.startswith(partial_input+s) for s in [u'in', u'ın', u'un', u'ün']])) or\
                            (last_letter.vowel and whole_surface.startswith(partial_input + u'n'))

    # -InIl after a consonant (inil/...), -nIl after a vowel (nil/...)
    might_have_Passive_InIl = (not last_letter.vowel and any([whole_surface.startswith(partial_input+s) for s in [u'inil', u'ınıl', u'unul', u'ünül']])) or\
                              (last_letter.vowel and any([whole_surface.startswith(partial_input+s) for s in [u'nil', u'nıl', u'nul', u'nül']]))

    might_have_passives = {(LexemeAttribute.Passive_Il, might_have_Passive_Il),
                           (LexemeAttribute.Passive_In, might_have_Passive_In),
                           (LexemeAttribute.Passive_InIl, might_have_Passive_InIl)}

    # keep only the passives whose surface evidence is present
    might_have_passives = filter(lambda t : t[1], might_have_passives)

    passive_roots = set()
    for passive_attr, might_have_happened in might_have_passives:
        # cannot have other passives at the same time
        # cannot have any other causative at the same time
        # cannot have progressive vowel drop at the same time
        # cannot have aorist_A or aorist_I at the same time
        generated_root = no_attr_root._clone(True)
        generated_root.lexeme.attributes = {passive_attr} if passive_attr else set()
        generated_root.lexeme.phonetic_attributes = Phonetics.calculate_phonetic_attributes(partial_input, generated_root.lexeme.attributes)
        passive_roots.add(generated_root)

    return passive_roots
def _set_lemma(self, generated_roots):
    """Recomputes each root's lemma as its infinitive form (root + 'mAk')."""
    for generated_root in generated_roots:
        lexeme = generated_root.lexeme
        word, applied_suffix_form = Phonetics.apply(
            lexeme.root, generated_root.phonetic_attributes, u'mAk', lexeme.attributes)
        assert word and applied_suffix_form
        lexeme.lemma = word + applied_suffix_form
def __init__(self, numeral):
    """Creates a dynamic NUMERAL/DIGITS root for a digit sequence."""
    lexeme = DynamicLexeme(numeral, numeral, SyntacticCategory.NUMERAL,
        SecondarySyntacticCategory.DIGITS, None)
    # Phonetic attributes are derived from the number spelled out in words.
    spelled_out = DigitsToNumberConverter.convert_digits_to_words(numeral)
    phonetic_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(spelled_out)
    super(NumeralRoot, self).__init__(numeral, lexeme, None, phonetic_attributes)
def get_phonetic_attributes(self):
    """
    Returns the phonetic attributes of the surface built so far, falling back
    to the bare root's attributes when nothing alphanumeric was appended yet.
    """
    if not self.has_transitions():
        return self._root.phonetic_attributes

    appended = self.get_surface_so_far()[len(self._root.str):]
    # Only recompute when something alphanumeric was actually appended.
    if appended and not appended.isspace() and appended.isalnum():
        return Phonetics.calculate_phonetic_attributes(
            self.get_surface_so_far(), self.get_lexeme_attributes())
    return self._root.phonetic_attributes
def __init__(self, numeral):
    """Builds a NUMERAL/DIGITS root; phonetics follow the number read out in words."""
    as_words = DigitsToNumberConverter.convert_digits_to_words(numeral)
    attrs = Phonetics.calculate_phonetic_attributes_of_plain_sequence(as_words)
    dynamic_lexeme = DynamicLexeme(numeral, numeral, SyntacticCategory.NUMERAL,
        SecondarySyntacticCategory.DIGITS, None)
    super(NumeralRoot, self).__init__(numeral, dynamic_lexeme, None, attrs)
def __init__(self, abbr):
    """Builds a NOUN/ABBREVIATION root; consonant-final forms append u'E' for phonetics."""
    ends_with_vowel = TurkishAlphabet.get_letter_for_char(abbr[-1]).vowel
    sequence = abbr if ends_with_vowel else abbr + u'E'
    attrs = Phonetics.calculate_phonetic_attributes_of_plain_sequence(sequence)
    lexeme = DynamicLexeme(abbr, abbr, SyntacticCategory.NOUN,
        SecondarySyntacticCategory.ABBREVIATION, None)
    super(AbbreviationRoot, self).__init__(abbr, lexeme, None, attrs)
def __init__(self, noun):
    """Creates a dynamic NOUN/PROPER_NOUN root for the given proper noun."""
    attrs = Phonetics.calculate_phonetic_attributes_of_plain_sequence(noun)
    lexeme = DynamicLexeme(noun, noun, SyntacticCategory.NOUN,
        SecondarySyntacticCategory.PROPER_NOUN, None)
    super(ProperNounRoot, self).__init__(noun, lexeme, None, attrs)
def generate(cls, lexeme): if any(x in lexeme.attributes for x in RootGenerator._modifiers): try: return RootGenerator._generate_modified_root_nodes(lexeme) except: print u'Error generating roots for lexeme : {}'.format(lexeme) raise else: phonetic_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(lexeme.root) root = Root(lexeme.root, lexeme, None, phonetic_attributes) return [root]
def _set_lexeme_and_phonetic_attributes(self, generated_roots):
    # Recomputes the phonetic attributes of each generated root and reconciles
    # the Voicing/NoVoicing lexeme attributes with what actually happened to
    # the root surface.
    for r in generated_roots:
        r.phonetic_attributes = Phonetics.calculate_phonetic_attributes(r.str, r.lexeme.attributes)
        if r.str.endswith(u'd') and r.lexeme.root.endswith(u't'):
            # Root surface was voiced (t -> d): the lexeme must carry Voicing.
            if LexemeAttribute.NoVoicing in r.lexeme.attributes:
                r.lexeme.attributes.remove(LexemeAttribute.NoVoicing)
            r.lexeme.attributes.add(LexemeAttribute.Voicing)
        else:
            # No voicing happened: the lexeme must carry NoVoicing.
            if LexemeAttribute.Voicing in r.lexeme.attributes:
                r.lexeme.attributes.remove(LexemeAttribute.Voicing)
            r.lexeme.attributes.add(LexemeAttribute.NoVoicing)
def get_phonetic_attributes(self):
    """Phonetic attributes for this container, recomputed once real suffix text exists."""
    if self.has_transitions():
        appended = self.get_surface_so_far()[len(self._root.str):]
        nothing_meaningful_appended = (not appended) or appended.isspace() or (not appended.isalnum())
        if nothing_meaningful_appended:
            return self._root.phonetic_attributes
        return Phonetics.calculate_phonetic_attributes(
            self.get_surface_so_far(), self.get_lexeme_attributes())
    return self._root.phonetic_attributes
def generate(cls, lexeme): if any(x in lexeme.attributes for x in RootGenerator._modifiers): try: return RootGenerator._generate_modified_root_nodes(lexeme) except: print u'Error generating roots for lexeme : {}'.format(lexeme) raise else: phonetic_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence( lexeme.root) root = Root(lexeme.root, lexeme, None, phonetic_attributes) return [root]
def _set_lexeme_and_phonetic_attributes(self, generated_roots):
    # Recomputes the phonetic attributes of each generated root and reconciles
    # the Voicing/NoVoicing lexeme attributes with what actually happened to
    # the root surface.
    for r in generated_roots:
        r.phonetic_attributes = Phonetics.calculate_phonetic_attributes(r.str, r.lexeme.attributes)
        if r.str.endswith(u'd') and r.lexeme.root.endswith(u't'):
            # Root surface was voiced (t -> d): the lexeme must carry Voicing.
            if LexemeAttribute.NoVoicing in r.lexeme.attributes:
                r.lexeme.attributes.remove(LexemeAttribute.NoVoicing)
            r.lexeme.attributes.add(LexemeAttribute.Voicing)
        else:
            # No voicing happened: the lexeme must carry NoVoicing.
            if LexemeAttribute.Voicing in r.lexeme.attributes:
                r.lexeme.attributes.remove(LexemeAttribute.Voicing)
            r.lexeme.attributes.add(LexemeAttribute.NoVoicing)
def _get_possible_causative_roots(self, partial_input, whole_surface, no_attr_root):
    # Generates candidate verb roots assuming the remainder of the surface is a
    # causative suffix (-t, -Ir, -It, -Ar, -dIr). Each candidate carries exactly
    # one causative lexeme attribute.

    # no voicing can happen on causative_t
    might_have_Causative_t = whole_surface.startswith(partial_input + u't')

    might_have_Causative_Ir = any([
        whole_surface.startswith(partial_input + s)
        for s in [u'ir', u'ır', u'ur', u'ür']
    ])

    # no voicing can happen on causative_It
    might_have_Causative_It = any([
        whole_surface.startswith(partial_input + s)
        for s in [u'it', u'ıt', u'ut', u'üt']
    ])

    might_have_Causative_Ar = any([
        whole_surface.startswith(partial_input + s) for s in [u'ar', u'er']
    ])

    # -dIr also surfaces as -tIr after voiceless consonants
    might_have_Causative_dIr = any([whole_surface.startswith(partial_input+s) for s in [u'dir', u'dır', u'dur', u'dür']]) or\
                               any([whole_surface.startswith(partial_input+s) for s in [u'tir', u'tır', u'tur', u'tür']])

    might_have_causatives = {
        (LexemeAttribute.Causative_t, might_have_Causative_t),
        (LexemeAttribute.Causative_Ir, might_have_Causative_Ir),
        (LexemeAttribute.Causative_It, might_have_Causative_It),
        (LexemeAttribute.Causative_Ar, might_have_Causative_Ar),
        (LexemeAttribute.Causative_dIr, might_have_Causative_dIr)
    }

    # keep only the causatives whose surface evidence is present
    might_have_causatives = filter(lambda t: t[1], might_have_causatives)

    causative_roots = set()
    for causative_attr, might_have_happened in might_have_causatives:
        # cannot have other causatives at the same time
        # cannot have any other passive at the same time
        # cannot have progressive vowel drop at the same time
        # cannot have aorist_A or aorist_I at the same time
        generated_root = no_attr_root._clone(True)
        generated_root.lexeme.attributes = {causative_attr} if causative_attr else set()
        generated_root.lexeme.phonetic_attributes = Phonetics.calculate_phonetic_attributes(
            partial_input, generated_root.lexeme.attributes)
        causative_roots.add(generated_root)

    return causative_roots
def try_suffix_form(morpheme_container, suffix_form, to_state, word):
    """
    Tries to apply the given suffix form to the morpheme container while parsing `word`.

    Returns a cloned container with the new transition added when the phonetic
    application matches `word` and all pre/post conditions hold; otherwise None.
    """
    state_before_suffix_form_application = morpheme_container.get_last_state()

    if not transition_allowed_for_suffix_form(morpheme_container, suffix_form):
        return None

    so_far = morpheme_container.get_surface_so_far()
    morpheme_container_lexeme_attributes = morpheme_container.get_lexeme_attributes()
    morpheme_container_phonetic_attributes = morpheme_container.get_phonetic_attributes()

    # Apply the suffix form phonetically to the surface built so far.
    modified_word, fitting_suffix_form = Phonetics.apply(so_far, morpheme_container_phonetic_attributes, suffix_form.form, morpheme_container_lexeme_attributes)
    applied_str = modified_word + fitting_suffix_form

    # Third arg relaxes matching when heading to the verb root state.
    if Phonetics.application_matches(word, applied_str, to_state.name!='VERB_ROOT'):
        # The actual surface text consumed by this suffix application.
        actual_suffix_form_str = word[len(so_far):len(applied_str)]
        logger.debug(' Word "%s" starts with applied str "%s" (%s), adding to current morpheme container', word, applied_str, actual_suffix_form_str)
        clone = morpheme_container.clone()
        clone.add_transition(SuffixFormApplication(suffix_form, actual_suffix_form_str, fitting_suffix_form), to_state)

        # The postcondition of the previous transition's suffix form must still
        # hold after adding the new transition.
        if morpheme_container.has_transitions() and morpheme_container.get_last_transition().suffix_form_application.suffix_form.postcondition and not morpheme_container.get_last_transition().suffix_form_application.suffix_form.postcondition.is_satisfied_by(clone):
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(' Suffix does not satisfy the postcondition "%s" of last transition suffix form "%s", skipping.', morpheme_container.get_last_transition().suffix_form_application.suffix_form.postcondition, formatter.format_transition(clone.get_last_transition()))
            return None

        # When coming from a derivational state, re-check the post-derivation
        # conditions of every suffix applied since the last derivation.
        if morpheme_container.has_transitions() and state_before_suffix_form_application.type==State.DERIVATIONAL:
            logger.debug(' Suffix is derivative, checking the post derivation conditions of suffixes from previous derivation.')
            for transition in morpheme_container.get_transitions_from_derivation_suffix():
                application_suffix_form = transition.suffix_form_application.suffix_form
                if application_suffix_form.post_derivation_condition:
                    matches = application_suffix_form.post_derivation_condition.is_satisfied_by(clone)
                    if not matches:
                        logger.debug(' Post derivation condition "%s" of suffix "%s" is not satisfied, skipping.', application_suffix_form.post_derivation_condition, application_suffix_form.suffix)
                        return None

        return clone
    else:
        logger.debug(' Word "%s" does not start with applied str "%s" (%s), skipping', word, applied_str, applied_str)
        return None
def transition_allowed_for_suffix_form(morpheme_container, suffix_form):
    """
    Checks whether the given suffix form may be applied to the morpheme container.

    Returns False when the form's precondition is not satisfied, when the
    container's phonetic expectations are not satisfied by the form, or when
    the form is not phonetically applicable to the surface built so far.
    """
    if suffix_form.precondition and not suffix_form.precondition.is_satisfied_by(morpheme_container):
        if logger.isEnabledFor(logging.DEBUG):
            # Bug fix: precondition and form arguments were swapped relative to
            # the format string ("Precondition %s of suffix form %s").
            logger.debug(' Precondition "%s" of suffix form "%s" is not satisfied with transitions %s, skipping.',
                suffix_form.precondition, suffix_form.form, morpheme_container)
        return False

    if suffix_form.form and not Phonetics.expectations_satisfied(morpheme_container.get_phonetic_expectations(), suffix_form.form):
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(' Suffix form "%s" does not satisfy phonetic expectations %s, skipping.',
                suffix_form.form, morpheme_container.get_phonetic_expectations())
        return False

    if not Phonetics.is_suffix_form_applicable(morpheme_container.get_surface_so_far(), suffix_form.form):
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(' Suffix form "%s" is not phonetically applicable to "%s", skipping.',
                suffix_form.form, morpheme_container.get_surface_so_far())
        return False

    return True
def _get_possible_causative_roots(self, partial_input, whole_surface, no_attr_root):
    # Generates candidate verb roots assuming the remainder of the surface is a
    # causative suffix (-t, -Ir, -It, -Ar, -dIr). Each candidate carries exactly
    # one causative lexeme attribute.

    # no voicing can happen on causative_t
    might_have_Causative_t = whole_surface.startswith(partial_input + u't')

    might_have_Causative_Ir = any([whole_surface.startswith(partial_input+s) for s in [u'ir', u'ır', u'ur', u'ür']])

    # no voicing can happen on causative_It
    might_have_Causative_It = any([whole_surface.startswith(partial_input+s) for s in [u'it', u'ıt', u'ut', u'üt']])

    might_have_Causative_Ar = any([whole_surface.startswith(partial_input+s) for s in [u'ar', u'er']])

    # -dIr also surfaces as -tIr after voiceless consonants
    might_have_Causative_dIr = any([whole_surface.startswith(partial_input+s) for s in [u'dir', u'dır', u'dur', u'dür']]) or\
                               any([whole_surface.startswith(partial_input+s) for s in [u'tir', u'tır', u'tur', u'tür']])

    might_have_causatives = {(LexemeAttribute.Causative_t, might_have_Causative_t),
                             (LexemeAttribute.Causative_Ir, might_have_Causative_Ir),
                             (LexemeAttribute.Causative_It, might_have_Causative_It),
                             (LexemeAttribute.Causative_Ar, might_have_Causative_Ar),
                             (LexemeAttribute.Causative_dIr, might_have_Causative_dIr)}

    # keep only the causatives whose surface evidence is present
    might_have_causatives = filter(lambda t : t[1], might_have_causatives)

    causative_roots = set()
    for causative_attr, might_have_happened in might_have_causatives:
        # cannot have other causatives at the same time
        # cannot have any other passive at the same time
        # cannot have progressive vowel drop at the same time
        # cannot have aorist_A or aorist_I at the same time
        generated_root = no_attr_root._clone(True)
        generated_root.lexeme.attributes = {causative_attr} if causative_attr else set()
        generated_root.lexeme.phonetic_attributes = Phonetics.calculate_phonetic_attributes(partial_input, generated_root.lexeme.attributes)
        causative_roots.add(generated_root)

    return causative_roots
def _get_possible_passive_roots(self, last_letter, partial_input, whole_surface, no_attr_root):
    # Generates candidate verb roots assuming the remainder of the surface is a
    # passive suffix (-Il, -In, -InIl). Each candidate gets exactly one passive
    # lexeme attribute; the surface evidence decides which ones are possible.

    # -Il after a consonant (il/ıl/ul/ül), plain -l after a vowel
    might_have_Passive_Il = (not last_letter.vowel and any([whole_surface.startswith(partial_input+s) for s in [u'il', u'ıl', u'ul', u'ül']])) or\
                            (last_letter.vowel and whole_surface.startswith(partial_input + u'l'))

    # -In after a consonant (in/ın/un/ün), plain -n after a vowel
    might_have_Passive_In = (not last_letter.vowel and any([whole_surface.startswith(partial_input+s) for s in [u'in', u'ın', u'un', u'ün']])) or\
                            (last_letter.vowel and whole_surface.startswith(partial_input + u'n'))

    # -InIl after a consonant (inil/...), -nIl after a vowel (nil/...)
    might_have_Passive_InIl = (not last_letter.vowel and any([whole_surface.startswith(partial_input+s) for s in [u'inil', u'ınıl', u'unul', u'ünül']])) or\
                              (last_letter.vowel and any([whole_surface.startswith(partial_input+s) for s in [u'nil', u'nıl', u'nul', u'nül']]))

    might_have_passives = {
        (LexemeAttribute.Passive_Il, might_have_Passive_Il),
        (LexemeAttribute.Passive_In, might_have_Passive_In),
        (LexemeAttribute.Passive_InIl, might_have_Passive_InIl)
    }

    # keep only the passives whose surface evidence is present
    might_have_passives = filter(lambda t: t[1], might_have_passives)

    passive_roots = set()
    for passive_attr, might_have_happened in might_have_passives:
        # cannot have other passives at the same time
        # cannot have any other causative at the same time
        # cannot have progressive vowel drop at the same time
        # cannot have aorist_A or aorist_I at the same time
        generated_root = no_attr_root._clone(True)
        generated_root.lexeme.attributes = {passive_attr} if passive_attr else set()
        generated_root.lexeme.phonetic_attributes = Phonetics.calculate_phonetic_attributes(
            partial_input, generated_root.lexeme.attributes)
        passive_roots.add(generated_root)

    return passive_roots
def ap(word, form_str, lexeme_attributes=None):
    """Applies the suffix form string to the word and returns the resulting surface."""
    attrs = Phonetics.calculate_phonetic_attributes(word, lexeme_attributes)
    modified_word, application = Phonetics.apply(word, attrs, form_str, lexeme_attributes)
    return modified_word + application
def try_suffix_form(morpheme_container, suffix_form, to_state, word):
    """
    Tries to apply the given suffix form to the morpheme container while parsing `word`.

    Returns a cloned container with the new transition added when the phonetic
    application matches `word` and all pre/post conditions hold; otherwise None.
    """
    state_before_suffix_form_application = morpheme_container.get_last_state()

    if not transition_allowed_for_suffix_form(morpheme_container, suffix_form):
        return None

    so_far = morpheme_container.get_surface_so_far()
    morpheme_container_lexeme_attributes = morpheme_container.get_lexeme_attributes()
    morpheme_container_phonetic_attributes = morpheme_container.get_phonetic_attributes()

    # Apply the suffix form phonetically to the surface built so far.
    modified_word, fitting_suffix_form = Phonetics.apply(
        so_far, morpheme_container_phonetic_attributes, suffix_form.form,
        morpheme_container_lexeme_attributes)
    applied_str = modified_word + fitting_suffix_form

    # Third arg relaxes matching when heading to the verb root state.
    if Phonetics.application_matches(word, applied_str, to_state.name != 'VERB_ROOT'):
        # The actual surface text consumed by this suffix application.
        actual_suffix_form_str = word[len(so_far):len(applied_str)]
        logger.debug(
            ' Word "%s" starts with applied str "%s" (%s), adding to current morpheme container',
            word, applied_str, actual_suffix_form_str)
        clone = morpheme_container.clone()
        clone.add_transition(
            SuffixFormApplication(suffix_form, actual_suffix_form_str, fitting_suffix_form), to_state)

        # The postcondition of the previous transition's suffix form must still
        # hold after adding the new transition.
        if morpheme_container.has_transitions() and morpheme_container.get_last_transition().suffix_form_application.suffix_form.postcondition and not morpheme_container.get_last_transition().suffix_form_application.suffix_form.postcondition.is_satisfied_by(clone):
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(
                    ' Suffix does not satisfy the postcondition "%s" of last transition suffix form "%s", skipping.',
                    morpheme_container.get_last_transition().suffix_form_application.suffix_form.postcondition,
                    formatter.format_transition(clone.get_last_transition()))
            return None

        # When coming from a derivational state, re-check the post-derivation
        # conditions of every suffix applied since the last derivation.
        if morpheme_container.has_transitions() and state_before_suffix_form_application.type == State.DERIVATIONAL:
            logger.debug(
                ' Suffix is derivative, checking the post derivation conditions of suffixes from previous derivation.')
            for transition in morpheme_container.get_transitions_from_derivation_suffix():
                application_suffix_form = transition.suffix_form_application.suffix_form
                if application_suffix_form.post_derivation_condition:
                    matches = application_suffix_form.post_derivation_condition.is_satisfied_by(clone)
                    if not matches:
                        logger.debug(
                            ' Post derivation condition "%s" of suffix "%s" is not satisfied, skipping.',
                            application_suffix_form.post_derivation_condition,
                            application_suffix_form.suffix)
                        return None

        return clone
    else:
        logger.debug(
            ' Word "%s" does not start with applied str "%s" (%s), skipping',
            word, applied_str, applied_str)
        return None
def find_roots_for_partial_input(self, partial_input, whole_surface=None):
    """
    Builds brute-force VERB root candidates for a prefix of a surface word,
    considering progressive vowel drop, aorist -Ar/-Ir, causatives, passives
    and final-consonant voicing.

    @type partial_input: unicode
    @type whole_surface: unicode
    @rtype: list of Root
    """
    assert partial_input and whole_surface
    assert len(partial_input) <= len(whole_surface)
    assert whole_surface.startswith(partial_input)
    if len(whole_surface) == len(partial_input):
        assert whole_surface == partial_input

    if len(partial_input) < 2:
        # not possible except (d,diyor) and (y,yiyor). but they are already in the dictionary
        return []

    last_vowel = Phonetics.get_last_vowel(partial_input)
    if not last_vowel:
        return []

    # Build a dynamic VERB lexeme whose lemma and root are the partial input itself.
    root = partial_input
    lemma = root
    lemma_root = lemma
    syntactic_category = SyntacticCategory.VERB
    secondary_syntactic_category = None
    lexeme_attributes = set()
    lexeme = DynamicLexeme(lemma, lemma_root, syntactic_category, secondary_syntactic_category, lexeme_attributes)

    phonetic_expectations = set()
    phonetic_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(partial_input)

    no_attr_root = DynamicRoot(root, lexeme, phonetic_expectations, phonetic_attributes)

    # Normalize attributes and lemma of the plain candidate.
    self._set_lexeme_and_phonetic_attributes([no_attr_root])
    self._set_lemma([no_attr_root])

    last_char = partial_input[-1]
    last_letter = TurkishAlphabet.get_letter_for_char(last_char)

    partial_surface_can_be_root_of_a_verb = self._seems_like_a_valid_verb_root(partial_input)

    if whole_surface == partial_input:
        return [no_attr_root] if partial_surface_can_be_root_of_a_verb else []

    first_char_after_partial_input = whole_surface[len(partial_input)]
    if first_char_after_partial_input.isupper():
        return []
    first_letter_after_partial_input = TurkishAlphabet.get_letter_for_char(first_char_after_partial_input)

    # -Iyor after a consonant implies the root may have dropped its last vowel.
    might_have_ProgressiveVowelDrop = not last_letter.vowel and\
        any([whole_surface.startswith(partial_input+s) for s in [u'iyor', u'ıyor', u'uyor', u'üyor']])

    might_have_Aorist_A = not last_letter.vowel and \
        (whole_surface.startswith(partial_input + u'ar') or whole_surface.startswith(partial_input + u'er'))

    # no Aorist_I for -ur, -ür
    might_have_Aorist_I = not last_letter.vowel and\
        (whole_surface.startswith(partial_input + u'ır') or whole_surface.startswith(partial_input + u'ir'))

    # for other letters, no voicing in verbs. {git+er->gider} vs {yapar, açar, diker}
    voicing_might_have_happened = last_letter == TurkishAlphabet.L_d and first_letter_after_partial_input.vowel

    possible_progressive_vowel_drop_roots = self._get_progressive_vowel_drop_roots(partial_input, whole_surface, no_attr_root, last_vowel) if might_have_ProgressiveVowelDrop else set()
    possible_aorist_A_roots = self._get_aorist_A_roots(no_attr_root) if might_have_Aorist_A else set()
    possible_aorist_I_roots = self._get_aorist_I_roots(no_attr_root) if might_have_Aorist_I else set()
    possible_causative_roots = self._get_possible_causative_roots(partial_input, whole_surface, no_attr_root)
    possible_passive_roots = self._get_possible_passive_roots(last_letter, partial_input, whole_surface, no_attr_root)

    if voicing_might_have_happened:
        # add a voiced variant of every candidate alongside the original
        possible_progressive_vowel_drop_roots = possible_progressive_vowel_drop_roots.union(set([self._get_possible_voicing_root(r) for r in possible_progressive_vowel_drop_roots]))
        possible_aorist_A_roots = possible_aorist_A_roots.union(set([self._get_possible_voicing_root(r) for r in possible_aorist_A_roots]))
        possible_aorist_I_roots = possible_aorist_I_roots.union(set([self._get_possible_voicing_root(r) for r in possible_aorist_I_roots]))
        possible_causative_roots = possible_causative_roots.union(set([self._get_possible_voicing_root(r) for r in possible_causative_roots]))
        possible_passive_roots = possible_passive_roots.union(set([self._get_possible_voicing_root(r) for r in possible_passive_roots]))

    generated_roots = set()

    generated_roots.add(no_attr_root)
    if voicing_might_have_happened:
        generated_roots.add(self._get_possible_voicing_root(no_attr_root))

    generated_roots = generated_roots.union(possible_progressive_vowel_drop_roots)
    generated_roots = generated_roots.union(possible_aorist_A_roots)
    generated_roots = generated_roots.union(possible_aorist_I_roots)
    generated_roots = generated_roots.union(possible_causative_roots)
    generated_roots = generated_roots.union(possible_passive_roots)

    self._set_lexeme_and_phonetic_attributes(generated_roots)
    self._set_lemma(generated_roots)

    generated_roots = list(generated_roots)
    # discard candidates that do not look like valid verb roots
    generated_roots = filter(lambda r: self._seems_like_a_valid_verb_root(r.lexeme.root), generated_roots)

    return generated_roots
def _set_lemma(self, generated_roots):
    """Sets each generated root's lemma to its infinitive form (root + 'mAk')."""
    for candidate in generated_roots:
        stem, infinitive_suffix = Phonetics.apply(
            candidate.lexeme.root, candidate.phonetic_attributes,
            u'mAk', candidate.lexeme.attributes)
        assert stem and infinitive_suffix
        candidate.lexeme.lemma = stem + infinitive_suffix
def _handle_special_roots(cls, lexeme):
    """
    Builds the roots of lexemes marked with RootChange, whose roots change
    irregularly (e.g. ben->ban, demek->di).

    Removes the RootChange attribute from the lexeme and returns the list of
    hand-built roots. Raises for an unhandled lexeme.
    """
    lexeme.attributes.remove(LexemeAttribute.RootChange)

    if lexeme.lemma == u'ben':
        root_ben = Root(u'ben', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'ben'))
        root_ban = Root(u'ban', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'ban'))
        return [root_ben, root_ban]
    elif lexeme.lemma == u'sen':
        # Bug fix: phonetic attributes were computed from u'ben'/u'ban'
        # (copy-paste from the branch above); use the actual root strings.
        root_sen = Root(u'sen', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'sen'))
        root_san = Root(u'san', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'san'))
        return [root_sen, root_san]
    elif lexeme.lemma == u'demek':
        root_di = Root(u'di', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'di'))
        root_de = Root(u'de', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'de'))
        return [root_di, root_de]
    elif lexeme.lemma == u'yemek':
        root_yi = Root(u'yi', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'yi'))
        root_ye = Root(u'ye', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'ye'))
        return [root_yi, root_ye]
    elif lexeme.lemma == u'hepsi':
        root_hep = Root(u'hep', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'hep'))
        root_hepsi = Root(u'hepsi', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'hepsi'))
        return [root_hep, root_hepsi]
    elif lexeme.lemma == u'ora':
        root_or = Root(u'or', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'or'))
        root_ora = Root(u'ora', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'ora'))
        return [root_or, root_ora]
    elif lexeme.lemma == u'bura':
        root_bur = Root(u'bur', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'bur'))
        root_bura = Root(u'bura', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'bura'))
        return [root_bur, root_bura]
    elif lexeme.lemma == u'şura':
        root_sur = Root(u'şur', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'şur'))
        root_sura = Root(u'şura', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'şura'))
        return [root_sur, root_sura]
    elif lexeme.lemma == u'nere':
        # Dead duplicate 'nere' branch removed (it was unreachable).
        root_ner = Root(u'ner', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'ner'))
        root_nere = Root(u'nere', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'nere'))
        return [root_ner, root_nere]
    elif lexeme.lemma == u'içeri':
        root_icer = Root(u'içer', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'içer'))
        root_iceri = Root(u'içeri', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'içeri'))
        return [root_icer, root_iceri]
    elif lexeme.lemma == u'dışarı':
        root_disar = Root(u'dışar', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'dışar'))
        root_disari = Root(u'dışarı', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'dışarı'))
        return [root_disar, root_disari]
    elif lexeme.lemma == u'birbiri':
        root_birbir = Root(u'birbir', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'birbir'))
        root_birbiri = Root(u'birbiri', lexeme, None, Phonetics.calculate_phonetic_attributes_of_plain_sequence(u'birbiri'))
        return [root_birbir, root_birbiri]
    else:
        raise Exception('Unhandled root change : {} !'.format(lexeme))
def _generate_modified_root_nodes(cls, lexeme):
    """
    Builds the original root plus (when different) a modified root for a lexeme
    whose attributes demand orthographic changes (voicing, doubling, last vowel
    drop, inverse harmony, progressive vowel drop).
    """
    if LexemeAttribute.RootChange in lexeme.attributes:
        # Fully irregular roots are handled case by case.
        special_roots = cls._handle_special_roots(lexeme)
        if special_roots:
            return special_roots

    modified_seq = lexeme.root

    original_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(lexeme.root)
    modified_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(lexeme.root)
    original_phonetic_expectations = set()
    modified_phonetic_expectations = set()

    if LexemeAttribute.Voicing in lexeme.attributes or LexemeAttribute.VoicingOpt in lexeme.attributes:
        # Voice the final consonant of the modified root.
        last_letter = TurkishAlphabet.get_letter_for_char(modified_seq[-1])
        modified_letter = TurkishAlphabet.voice(last_letter)
        assert modified_letter is not None
        if lexeme.lemma.endswith(u"nk"):
            # -nk lemmas voice to 'g' (e.g. as handled by this special case)
            modified_letter = TurkishAlphabet.L_g
        modified_seq = modified_seq[:-1] + modified_letter.char_value
        if PhoneticAttributes.LastLetterVoicelessStop in modified_attributes:
            modified_attributes.remove(PhoneticAttributes.LastLetterVoicelessStop)
        # Keep the continuant/not-continuant attribute in sync with the new letter.
        if modified_letter.continuant:
            if PhoneticAttributes.LastLetterNotContinuant in modified_attributes:
                modified_attributes.remove(PhoneticAttributes.LastLetterNotContinuant)
            modified_attributes.add(PhoneticAttributes.LastLetterContinuant)
        else:
            if PhoneticAttributes.LastLetterContinuant in modified_attributes:
                modified_attributes.remove(PhoneticAttributes.LastLetterContinuant)
            modified_attributes.add(PhoneticAttributes.LastLetterNotContinuant)
        if LexemeAttribute.VoicingOpt not in lexeme.attributes:
            # Mandatory voicing: original root only before consonants,
            # modified root only before vowels.
            original_phonetic_expectations.add(PhoneticExpectation.ConsonantStart)
            modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

    if LexemeAttribute.Doubling in lexeme.attributes:
        # Double the final consonant of the modified root.
        modified_seq = modified_seq + modified_seq[-1]
        original_phonetic_expectations.add(PhoneticExpectation.ConsonantStart)
        modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

    if LexemeAttribute.LastVowelDrop in lexeme.attributes:
        # Drop the vowel before the last letter in the modified root.
        modified_seq = modified_seq[:-2] + modified_seq[-1]
        if lexeme.syntactic_category != SyntacticCategory.VERB:
            original_phonetic_expectations.add(PhoneticExpectation.ConsonantStart)
        modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

    if LexemeAttribute.InverseHarmony in lexeme.attributes:
        # Both roots behave as frontal-vowel-final regardless of the actual vowel.
        original_attributes.add(PhoneticAttributes.LastVowelFrontal)
        if PhoneticAttributes.LastVowelBack in original_attributes:
            original_attributes.remove(PhoneticAttributes.LastVowelBack)
        modified_attributes.add(PhoneticAttributes.LastVowelFrontal)
        if PhoneticAttributes.LastVowelBack in modified_attributes:
            modified_attributes.remove(PhoneticAttributes.LastVowelBack)

    if LexemeAttribute.ProgressiveVowelDrop in lexeme.attributes:
        # Drop the final vowel of the modified root (used before -Iyor).
        modified_seq = modified_seq[:-1]
        if RootGenerator._has_vowel(modified_seq):
            modified_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(modified_seq)
        modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

    # Empty expectation sets are normalized to None.
    original_phonetic_expectations = original_phonetic_expectations or None
    modified_phonetic_expectations = modified_phonetic_expectations or None

    original = Root(lexeme.root, lexeme, original_phonetic_expectations, original_attributes)
    modified = Root(modified_seq, lexeme, modified_phonetic_expectations, modified_attributes)

    if original == modified:
        return [original]
    else:
        return [original, modified]
def ap(word, form_str, lexeme_attributes=None): phonetic_attributes = Phonetics.calculate_phonetic_attributes(word, lexeme_attributes) word, application = Phonetics.apply(word, phonetic_attributes, form_str, lexeme_attributes) return word + application
def _handle_special_roots(cls, lexeme):
    """
    Build the two root variants for lexemes with irregular root changes
    (e.g. ben -> ban, demek -> di-/de-).

    Side effect: removes LexemeAttribute.RootChange from lexeme.attributes,
    as in the original implementation.

    Fixes over the previous version:
    - the 'sen' branch computed phonetic attributes from u'ben'/u'ban'
      (copy-paste error; correct only because the sequences are
      phonetically equivalent) — now computed from the actual roots;
    - a verbatim duplicate of the 'nere' branch (dead code) is removed.

    @type lexeme: Lexeme
    @rtype: list of Root
    @raise Exception: if the lemma is not in the known special-root table.
    """
    lexeme.attributes.remove(LexemeAttribute.RootChange)

    # lemma -> the two root surfaces it produces
    special_root_pairs = {
        u'ben': (u'ben', u'ban'),
        u'sen': (u'sen', u'san'),
        u'demek': (u'di', u'de'),
        u'yemek': (u'yi', u'ye'),
        u'hepsi': (u'hep', u'hepsi'),
        u'ora': (u'or', u'ora'),
        u'bura': (u'bur', u'bura'),
        u'şura': (u'şur', u'şura'),
        u'nere': (u'ner', u'nere'),
        u'içeri': (u'içer', u'içeri'),
        u'dışarı': (u'dışar', u'dışarı'),
        u'birbiri': (u'birbir', u'birbiri'),
    }

    if lexeme.lemma not in special_root_pairs:
        raise Exception('Unhandled root change : {} !'.format(lexeme))

    return [
        Root(root_str, lexeme, None,
             Phonetics.calculate_phonetic_attributes_of_plain_sequence(root_str))
        for root_str in special_root_pairs[lexeme.lemma]
    ]
def _generate_modified_root_nodes(cls, lexeme):
    """
    Produce the root nodes for *lexeme*: the plain root and, if any
    root-modifying lexeme attribute applies, a second modified root whose
    surface, phonetic attributes and phonetic expectations are adapted.

    @type lexeme: Lexeme
    @rtype: list of Root
    """
    if LexemeAttribute.RootChange in lexeme.attributes:
        special = cls._handle_special_roots(lexeme)
        if special:
            return special

    def swap_attr(attr_set, unwanted, wanted):
        # replace one phonetic attribute with another, tolerating absence
        attr_set.discard(unwanted)
        attr_set.add(wanted)

    attributes = lexeme.attributes
    seq = lexeme.root
    orig_attrs = Phonetics.calculate_phonetic_attributes_of_plain_sequence(lexeme.root)
    mod_attrs = Phonetics.calculate_phonetic_attributes_of_plain_sequence(lexeme.root)
    orig_expectations = set()
    mod_expectations = set()

    if LexemeAttribute.Voicing in attributes or LexemeAttribute.VoicingOpt in attributes:
        last = TurkishAlphabet.get_letter_for_char(seq[-1])
        voiced = TurkishAlphabet.voice(last)
        assert voiced is not None
        if lexeme.lemma.endswith(u"nk"):
            # '-nk' lemmas voice to 'g' (e.g. renk -> reng-)
            voiced = TurkishAlphabet.L_g
        seq = seq[:-1] + voiced.char_value

        mod_attrs.discard(PhoneticAttributes.LastLetterVoicelessStop)
        if voiced.continuant:
            swap_attr(mod_attrs,
                      PhoneticAttributes.LastLetterNotContinuant,
                      PhoneticAttributes.LastLetterContinuant)
        else:
            swap_attr(mod_attrs,
                      PhoneticAttributes.LastLetterContinuant,
                      PhoneticAttributes.LastLetterNotContinuant)

        # only mandatory voicing constrains the suffix onset
        if LexemeAttribute.VoicingOpt not in attributes:
            orig_expectations.add(PhoneticExpectation.ConsonantStart)
            mod_expectations.add(PhoneticExpectation.VowelStart)

    if LexemeAttribute.Doubling in attributes:
        seq += seq[-1]
        orig_expectations.add(PhoneticExpectation.ConsonantStart)
        mod_expectations.add(PhoneticExpectation.VowelStart)

    if LexemeAttribute.LastVowelDrop in attributes:
        # drop the vowel before the last consonant
        seq = seq[:-2] + seq[-1]
        if lexeme.syntactic_category != SyntacticCategory.VERB:
            orig_expectations.add(PhoneticExpectation.ConsonantStart)
        mod_expectations.add(PhoneticExpectation.VowelStart)

    if LexemeAttribute.InverseHarmony in attributes:
        swap_attr(orig_attrs,
                  PhoneticAttributes.LastVowelBack,
                  PhoneticAttributes.LastVowelFrontal)
        swap_attr(mod_attrs,
                  PhoneticAttributes.LastVowelBack,
                  PhoneticAttributes.LastVowelFrontal)

    if LexemeAttribute.ProgressiveVowelDrop in attributes:
        seq = seq[:-1]
        if RootGenerator._has_vowel(seq):
            mod_attrs = Phonetics.calculate_phonetic_attributes_of_plain_sequence(seq)
        mod_expectations.add(PhoneticExpectation.VowelStart)

    # normalize empty expectation sets to None
    orig_expectations = orig_expectations or None
    mod_expectations = mod_expectations or None

    original = Root(lexeme.root, lexeme, orig_expectations, orig_attrs)
    modified = Root(seq, lexeme, mod_expectations, mod_attrs)

    if original == modified:
        return [original]
    return [original, modified]
def find_roots_for_partial_input(self, partial_input, whole_surface=None):
    """
    Brute-force candidate verb roots for *partial_input*, a prefix of
    *whole_surface*. Generates the plain root plus variants produced by
    the progressive vowel drop, aorist -A/-I, causative, passive and
    voicing heuristics, then keeps only those that look like valid verb
    roots.

    @type partial_input: unicode
    @type whole_surface: unicode
    @rtype: list of Root
    """
    assert partial_input and whole_surface
    assert len(partial_input) <= len(whole_surface)
    assert whole_surface.startswith(partial_input)
    if len(whole_surface) == len(partial_input):
        assert whole_surface == partial_input

    # not possible except (d,diyor) and (y,yiyor), but they are already in the dictionary
    if len(partial_input) < 2:
        return []

    last_vowel = Phonetics.get_last_vowel(partial_input)
    if not last_vowel:
        return []

    root = partial_input
    lexeme = DynamicLexeme(root, root, SyntacticCategory.VERB, None, set())
    no_attr_root = DynamicRoot(
        root, lexeme, set(),
        Phonetics.calculate_phonetic_attributes_of_plain_sequence(partial_input))
    self._set_lexeme_and_phonetic_attributes([no_attr_root])
    self._set_lemma([no_attr_root])

    last_char = partial_input[-1]
    last_letter = TurkishAlphabet.get_letter_for_char(last_char)

    partial_surface_can_be_root_of_a_verb = self._seems_like_a_valid_verb_root(partial_input)

    if whole_surface == partial_input:
        return [no_attr_root] if partial_surface_can_be_root_of_a_verb else []

    first_char_after_partial_input = whole_surface[len(partial_input)]
    if first_char_after_partial_input.isupper():
        return []
    first_letter_after_partial_input = TurkishAlphabet.get_letter_for_char(first_char_after_partial_input)

    consonant_final = not last_letter.vowel
    might_have_ProgressiveVowelDrop = consonant_final and any(
        whole_surface.startswith(partial_input + s)
        for s in (u'iyor', u'ıyor', u'uyor', u'üyor'))
    might_have_Aorist_A = consonant_final and whole_surface.startswith(
        (partial_input + u'ar', partial_input + u'er'))
    # no Aorist_I for -ur, -ür
    might_have_Aorist_I = consonant_final and whole_surface.startswith(
        (partial_input + u'ır', partial_input + u'ir'))
    # for other letters, no voicing in verbs. {git+er->gider} vs {yapar, açar, diker}
    voicing_might_have_happened = (
        last_letter == TurkishAlphabet.L_d and first_letter_after_partial_input.vowel)

    possible_progressive_vowel_drop_roots = (
        self._get_progressive_vowel_drop_roots(partial_input, whole_surface, no_attr_root, last_vowel)
        if might_have_ProgressiveVowelDrop else set())
    possible_aorist_A_roots = (
        self._get_aorist_A_roots(no_attr_root) if might_have_Aorist_A else set())
    possible_aorist_I_roots = (
        self._get_aorist_I_roots(no_attr_root) if might_have_Aorist_I else set())
    possible_causative_roots = self._get_possible_causative_roots(
        partial_input, whole_surface, no_attr_root)
    possible_passive_roots = self._get_possible_passive_roots(
        last_letter, partial_input, whole_surface, no_attr_root)

    if voicing_might_have_happened:
        def with_voiced_variants(roots):
            # each candidate also contributes its voicing counterpart
            return roots | set(self._get_possible_voicing_root(r) for r in roots)

        possible_progressive_vowel_drop_roots = with_voiced_variants(possible_progressive_vowel_drop_roots)
        possible_aorist_A_roots = with_voiced_variants(possible_aorist_A_roots)
        possible_aorist_I_roots = with_voiced_variants(possible_aorist_I_roots)
        possible_causative_roots = with_voiced_variants(possible_causative_roots)
        possible_passive_roots = with_voiced_variants(possible_passive_roots)

    generated_roots = set([no_attr_root])
    if voicing_might_have_happened:
        generated_roots.add(self._get_possible_voicing_root(no_attr_root))
    generated_roots |= possible_progressive_vowel_drop_roots
    generated_roots |= possible_aorist_A_roots
    generated_roots |= possible_aorist_I_roots
    generated_roots |= possible_causative_roots
    generated_roots |= possible_passive_roots

    self._set_lexeme_and_phonetic_attributes(generated_roots)
    self._set_lemma(generated_roots)

    return [r for r in generated_roots
            if self._seems_like_a_valid_verb_root(r.lexeme.root)]
def __init__(self, noun): root = noun lexeme = DynamicLexeme(noun, noun, SyntacticCategory.NOUN, SecondarySyntacticCategory.PROPER_NOUN, None) phonetic_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(noun) phonetic_expectations = None super(ProperNounRoot, self).__init__(root, lexeme, phonetic_expectations, phonetic_attributes)