def _expectation_satisfied(cls, phonetic_expectation, form_str):
    """
    Checks whether a suffix form string can satisfy the given phonetic
    expectation (vowel start or consonant start) of a root.

    @type phonetic_expectation: PhoneticExpectation
    @type form_str: unicode
    @rtype: bool
    """
    if phonetic_expectation == PhoneticExpectation.VowelStart:
        first_char = form_str[0]
        if first_char == '+':
            # '+X...' marks an optional first letter: the expectation is
            # satisfied if either keeping or dropping the optional letter works
            return cls._expectation_satisfied(phonetic_expectation, form_str[1:]) or cls._expectation_satisfied(phonetic_expectation, form_str[2:])
        else:
            return TurkishAlphabet.get_letter_for_char(first_char).vowel
    elif phonetic_expectation == PhoneticExpectation.ConsonantStart:
        first_char = form_str[0]
        if first_char == '+':
            # same optional-letter handling as above
            return cls._expectation_satisfied(phonetic_expectation, form_str[1:]) or cls._expectation_satisfied(phonetic_expectation, form_str[2:])
        else:
            return not TurkishAlphabet.get_letter_for_char(first_char).vowel
    else:
        raise Exception('Unknown phonetic_expectation', phonetic_expectation)
def _get_voicing_and_doubling_roots(self, partial_input, last_char, first_char_after_partial_input, no_orthographics_root):
    """
    Builds candidate roots for a partial surface by undoing possible voicing
    and consonant-doubling orthographic changes.

    @param partial_input: Surface prefix being inspected
    @param last_char: Last char of the partial input
    @param first_char_after_partial_input: Char right after the partial input in the whole surface
    @param no_orthographics_root: Candidate root assuming no orthographic change happened
    @rtype: list of Root
    """
    last_letter = TurkishAlphabet.get_letter_for_char(last_char)
    first_letter_after_partial_input = TurkishAlphabet.get_letter_for_char(first_char_after_partial_input)

    # a voiceable last letter followed by a vowel, yet still unvoiced on the
    # surface, means the lexeme must carry NoVoicing
    no_voicing_rule_applies = last_letter in TurkishAlphabet.Voicing_Map and first_letter_after_partial_input.vowel
    # a voiced last letter before a vowel may be the voiced form of an
    # originally voiceless letter
    voicing_might_have_happened = last_letter in TurkishAlphabet.Inverse_Voicing_Map and first_letter_after_partial_input.vowel
    # two identical word-final consonants before a vowel may come from doubling
    doubling_might_have_happened = len(partial_input) > 2 and\
                                   not last_letter.vowel and\
                                   partial_input[-1] == partial_input[-2] and\
                                   first_letter_after_partial_input.vowel

    if doubling_might_have_happened:
        if no_voicing_rule_applies:
            doubling_root = self._create_doubling_root(no_orthographics_root, last_char)
            # NOTE(review): this replaces the whole attribute set rather than
            # adding to it -- confirm that is intended
            no_orthographics_root.lexeme.attributes = {LexemeAttribute.NoVoicing}
            doubling_root.lexeme.attributes.add(LexemeAttribute.NoVoicing)
            return [no_orthographics_root, doubling_root]
        elif voicing_might_have_happened:
            # consider devoiced variants, plus doubling applied on top of each
            inverse_devoicing_roots = self._inverse_devoice_last_letter(no_orthographics_root, last_letter)
            devoicing_doubling_roots = [self._create_doubling_root(r, r.lexeme.root[-1]) for r in inverse_devoicing_roots]
            doubling_root = self._create_doubling_root(no_orthographics_root, last_char)
            return [no_orthographics_root] + [doubling_root] + devoicing_doubling_roots
        else:
            return [no_orthographics_root] + [self._create_doubling_root(no_orthographics_root, last_char)]
    else:
        if no_voicing_rule_applies:
            # NOTE(review): replaces the whole attribute set -- confirm intended
            no_orthographics_root.lexeme.attributes = {LexemeAttribute.NoVoicing}
            return [no_orthographics_root]
        elif voicing_might_have_happened:
            return [no_orthographics_root] + self._inverse_devoice_last_letter(no_orthographics_root, last_letter)
        else:
            return [no_orthographics_root]
def application_matches(cls, word, applied_str, voicing_allowed):
    """
    Checks if a suffix applied word is matched by a surface.

    >>> Phonetics.application_matches(u'armudunu', u'armut', True)
    True
    >>> Phonetics.application_matches(u'armudunu', u'armut', False)
    False
    >>> Phonetics.application_matches(u'armudunu', u'armudu', True)
    True
    >>> Phonetics.application_matches(u'armudunu', u'armudu', False)
    True

    @param word: The full word (surface)
    @param applied_str: Suffix applied part of the word
    @param voicing_allowed: If voicing should be considered or ignored
    @type word: unicode
    @type applied_str: unicode
    @type voicing_allowed: bool
    @rtype: L{bool}
    """
    # an empty or too-long application can never match
    if not applied_str or len(applied_str) > len(word):
        return False

    # a plain prefix match (this also covers exact equality)
    if word.startswith(applied_str):
        return True

    # otherwise only a voicing difference on the last char may be tolerated
    if not (voicing_allowed and word.startswith(applied_str[:-1])):
        return False

    application_last = TurkishAlphabet.get_letter_for_char(applied_str[-1])
    surface_counterpart = TurkishAlphabet.get_letter_for_char(word[len(applied_str) - 1])
    return TurkishAlphabet.voice(application_last) == surface_counterpart
def _handle_phonetics(cls, word, phonetic_attributes, form_str, lexeme_attributes=None):
    """
    Attaches a suffix form to a word, applying voicing/devoicing and
    resolving the vowel-harmony meta characters of the form.

    Meta characters in form_str: uppercase vowels 'A', 'I', 'O' are resolved
    by vowel harmony against the word's phonetic attributes; '!' forces the
    unrounded variant of the preceding 'I' and is itself never emitted.

    @param word: Surface so far
    @param phonetic_attributes: Phonetic attributes of the surface
    @param form_str: Suffix form, possibly containing meta characters
    @param lexeme_attributes: Lexeme attributes of the root of the surface
    @return: Tuple (possibly voiced word, resolved suffix form)
    """
    lexeme_attributes = lexeme_attributes or []
    phonetic_attributes = phonetic_attributes or []

    first_letter_of_form = TurkishAlphabet.get_letter_for_char(form_str[0])

    # first apply voicing if possible
    if LexemeAttribute.NoVoicing not in lexeme_attributes and PhoneticAttributes.LastLetterVoicelessStop in phonetic_attributes and first_letter_of_form.vowel:
        voiced_letter = TurkishAlphabet.voice(TurkishAlphabet.get_letter_for_char(word[-1]))
        if voiced_letter:
            word = word[:-1] + voiced_letter.char_value

    # then try devoicing
    if PhoneticAttributes.LastLetterVoiceless in phonetic_attributes and TurkishAlphabet.devoice(first_letter_of_form):
        form_str = TurkishAlphabet.devoice(first_letter_of_form).char_value + form_str[1:]

    applied = u''
    for i in range(len(form_str)):
        c = form_str[i]
        # one-char lookahead, used for the '!' marker after 'I'
        next_c = form_str[i + 1] if i + 1 < len(form_str) else None
        if c == '!':
            # the marker itself is never emitted
            continue
        letter = TurkishAlphabet.get_letter_for_char(c)
        if letter.vowel and letter.upper_case_char_value == c:
            # uppercase vowel meta characters, resolved by vowel harmony
            if c == u'A':
                if PhoneticAttributes.LastVowelBack in phonetic_attributes:
                    applied += u'a'
                else:
                    applied += u'e'
            elif c == u'I':
                if PhoneticAttributes.LastVowelBack in phonetic_attributes:
                    # a following '!' forces the unrounded form
                    if PhoneticAttributes.LastVowelUnrounded in phonetic_attributes or next_c == '!':
                        applied += u'ı'
                    else:
                        applied += u'u'
                else:
                    if PhoneticAttributes.LastVowelUnrounded in phonetic_attributes or next_c == '!':
                        applied += u'i'
                    else:
                        applied += u'ü'
            elif c == u'O':
                if PhoneticAttributes.LastVowelBack in phonetic_attributes:
                    applied += u'o'
                else:
                    applied += u'ö'
        else:
            # plain characters are copied as-is
            applied = applied + c
    return word, applied
def apply(cls, word, phonetic_attributes, form_str, lexeme_attributes=None):
    """
    Applies a suffix form to a word, considering the phonetics and root
    attributes given.

    @param word: Surface
    @type word: unicode
    @param phonetic_attributes: Provided phonetics of the surface
    @type phonetic_attributes: set of unicode
    @param form_str: Suffix form
    @type form_str: unicode
    @param lexeme_attributes: Provided lexeme attributes of the root of surface
    @type lexeme_attributes: set of unicode
    @return: Tuple (word, applied suffix form)
    @rtype: tuple
    """
    # an empty form applies trivially; an empty word cannot take a form
    if not form_str or not form_str.strip():
        return word, u''
    if not word or not word.strip():
        return None, None

    # ci, dik, +yacak, +iyor, +ar, +yi, +im, +yla
    leading_letter = TurkishAlphabet.get_letter_for_char(form_str[0])
    if leading_letter.char_value != '+':
        return cls._handle_phonetics(word, phonetic_attributes, form_str, lexeme_attributes)

    # '+X...' : X is an optional letter. It is dropped when it is a vowel
    # after a vowel-final word (ata, dana) or a consonant after a
    # consonant-final word (yap, kitap); otherwise it is kept.
    optional_letter = TurkishAlphabet.get_letter_for_char(form_str[1])
    word_ends_with_vowel = PhoneticAttributes.LastLetterVowel in phonetic_attributes
    if bool(optional_letter.vowel) == word_ends_with_vowel:
        # drop the optional letter and apply the rest of the form
        return cls.apply(word, phonetic_attributes, form_str[2:], lexeme_attributes)
    # keep the optional letter
    return cls._handle_phonetics(word, phonetic_attributes, form_str[1:], lexeme_attributes)
def _seems_like_a_valid_verb_root(self, seq):
    """
    Heuristically decides whether seq could be a Turkish verb root, by
    looking at its last two letters. Assumes len(seq) >= 2.

    A candidate is accepted when it ends with a vowel, when the letter
    before the last is a vowel, or when the letter before the last is one
    of 'l', 'r', 'n' and the last letter is a non-continuant consonant.

    @type seq: unicode
    @rtype: bool
    """
    last_letter = TurkishAlphabet.get_letter_for_char(seq[-1])
    previous_letter = TurkishAlphabet.get_letter_for_char(seq[-2])

    # membership test replaces the former any([... == l for l in [...]]) construct
    return last_letter.vowel or previous_letter.vowel or \
           (previous_letter in [TurkishAlphabet.L_l, TurkishAlphabet.L_r, TurkishAlphabet.L_n]
            and not last_letter.continuant)
def _handle_phonetics(cls, word, phonetic_attributes, form_str, lexeme_attributes=None):
    """
    Attaches a suffix form to a word, resolving voicing/devoicing and the
    vowel-harmony meta characters of the form.

    Meta characters in form_str: uppercase vowels 'A', 'I', 'O' are resolved
    by vowel harmony against the word's phonetic attributes; '!' forces the
    unrounded variant of the preceding 'I' and is itself never emitted.

    @param word: Surface so far
    @param phonetic_attributes: Phonetic attributes of the surface
    @param form_str: Suffix form with possible meta characters
    @param lexeme_attributes: Lexeme attributes of the root of the surface
    @return: Tuple (possibly voiced word, resolved suffix form)
    """
    lexeme_attributes = lexeme_attributes or []
    phonetic_attributes = phonetic_attributes or []

    form_head = TurkishAlphabet.get_letter_for_char(form_str[0])

    # voice the word's last letter when a vowel-initial form follows a
    # voiceless stop and the lexeme does not forbid voicing
    voicing_applies = (LexemeAttribute.NoVoicing not in lexeme_attributes
                       and PhoneticAttributes.LastLetterVoicelessStop in phonetic_attributes
                       and form_head.vowel)
    if voicing_applies:
        voiced = TurkishAlphabet.voice(TurkishAlphabet.get_letter_for_char(word[-1]))
        if voiced:
            word = word[:-1] + voiced.char_value

    # devoice the form's first letter after a voiceless word-final letter
    if PhoneticAttributes.LastLetterVoiceless in phonetic_attributes:
        devoiced = TurkishAlphabet.devoice(form_head)
        if devoiced:
            form_str = devoiced.char_value + form_str[1:]

    applied = u''
    for idx, ch in enumerate(form_str):
        lookahead = form_str[idx + 1] if idx + 1 < len(form_str) else None
        if ch == '!':
            # marker only; never emitted
            continue
        letter = TurkishAlphabet.get_letter_for_char(ch)
        if not (letter.vowel and letter.upper_case_char_value == ch):
            # plain characters are copied as-is
            applied = applied + ch
            continue
        # uppercase vowel meta characters, resolved by vowel harmony
        is_back = PhoneticAttributes.LastVowelBack in phonetic_attributes
        if ch == u'A':
            applied += u'a' if is_back else u'e'
        elif ch == u'I':
            # a following '!' forces the unrounded form
            unrounded = (PhoneticAttributes.LastVowelUnrounded in phonetic_attributes
                         or lookahead == '!')
            if is_back:
                applied += u'ı' if unrounded else u'u'
            else:
                applied += u'i' if unrounded else u'ü'
        elif ch == u'O':
            applied += u'o' if is_back else u'ö'
    return word, applied
def is_suffix_form_applicable(cls, word, form_str):
    """
    Calculates the phonetics of the word and a suffix form and determines if the suffix form is applicable.
    @type word: unicode or None
    @type form_str: unicode or None
    @rtype: bool
    """
    # an empty suffix form is applicable to anything
    if not form_str or not form_str.strip():
        return True
    # ...but a non-empty form cannot be applied to an empty word
    if not word or not word.strip():
        return False

    word = word.strip()
    form_str = form_str.strip()

    phonetic_attributes = cls.calculate_phonetic_attributes_of_plain_sequence(word)

    # ci, dik, +yacak, +iyor, +ar, +yi, +im, +yla
    first_form_letter = TurkishAlphabet.get_letter_for_char(form_str[0])
    if first_form_letter.char_value == '+':
        # +yacak, +iyor, +ar, +yi, +im, +yla : first letter is optional
        optional_letter = TurkishAlphabet.get_letter_for_char(form_str[1])
        if optional_letter.vowel:
            #+iyor, +ar, +im
            if PhoneticAttributes.LastLetterVowel in phonetic_attributes:
                # ata, dana : the optional vowel is dropped, check the rest
                return cls.is_suffix_form_applicable(word, form_str[2:])
            else:
                # yap, kitap
                return True
        else:
            # +yacak, +yi, +yla
            if PhoneticAttributes.LastLetterVowel in phonetic_attributes:
                #ata, dana
                return True
            else:
                # yap, kitap : the optional consonant is dropped, check the rest
                return cls.is_suffix_form_applicable(word, form_str[2:])
    else:
        if first_form_letter.vowel:
            # a vowel-initial suffix cannot follow a vowel-final word
            return PhoneticAttributes.LastLetterVowel not in phonetic_attributes
        else:
            return True
def is_suffix_form_applicable(cls, word, form_str):
    """
    Calculates the phonetics of the word and determines whether the given
    suffix form can be attached to it.

    @type word: unicode or None
    @type form_str: unicode or None
    @rtype: bool
    """
    # an empty form is always applicable; nothing applies to an empty word
    if not form_str or not form_str.strip():
        return True
    if not word or not word.strip():
        return False

    word = word.strip()
    form_str = form_str.strip()

    phonetic_attributes = cls.calculate_phonetic_attributes_of_plain_sequence(word)
    word_ends_with_vowel = PhoneticAttributes.LastLetterVowel in phonetic_attributes

    # ci, dik, +yacak, +iyor, +ar, +yi, +im, +yla
    leading_letter = TurkishAlphabet.get_letter_for_char(form_str[0])
    if leading_letter.char_value != '+':
        # plain forms: a vowel-initial suffix cannot follow a vowel-final word
        if leading_letter.vowel:
            return not word_ends_with_vowel
        return True

    # '+X...' : X is optional. It is dropped when it is a vowel after a
    # vowel-final word or a consonant after a consonant-final word; then the
    # remainder of the form must itself be applicable.
    optional_letter = TurkishAlphabet.get_letter_for_char(form_str[1])
    if bool(optional_letter.vowel) == word_ends_with_vowel:
        return cls.is_suffix_form_applicable(word, form_str[2:])
    return True
def _vowel_count(cls, seq): vowel_count = 0 for c in seq: if TurkishAlphabet.get_letter_for_char(c).vowel: vowel_count += 1 return vowel_count
def _get_first_vowel(self, seq): for s in seq: letter = TurkishAlphabet.get_letter_for_char(s) if letter and letter.vowel: return letter return None
def _infer_morphemic_attributes(cls, lexeme):
    """
    Infers missing lexeme attributes (aorist vowel, causative form, passive
    form, voicing) from the shape of the root, for entries that do not state
    them explicitly.

    @type lexeme: Lexeme
    """
    item_root = lexeme.root
    root_vowel_count = cls._vowel_count(item_root)
    last_letter = TurkishAlphabet.get_letter_for_char(item_root[-1])

    if lexeme.syntactic_category==SyntacticCategory.VERB:
        if last_letter.vowel:
            # vowel-final verbs get progressive vowel drop and the -In passive
            lexeme.attributes.add(LexemeAttribute.ProgressiveVowelDrop)
            lexeme.attributes.add(LexemeAttribute.Passive_In)

        # default aorist vowel: Aorist_I for multi-syllable roots, Aorist_A
        # for single-syllable roots, unless the other one is explicitly set
        if root_vowel_count>1 and LexemeAttribute.Aorist_A not in lexeme.attributes:
            lexeme.attributes.add(LexemeAttribute.Aorist_I)
        if root_vowel_count==1 and LexemeAttribute.Aorist_I not in lexeme.attributes:
            lexeme.attributes.add(LexemeAttribute.Aorist_A)

        # 'l'-final verbs also take the -In passive
        if last_letter==TurkishAlphabet.L_l:
            lexeme.attributes.add(LexemeAttribute.Passive_In)

        # pick a default causative form if none is set explicitly
        if all(a not in lexeme.attributes for a in LexemeAttribute.CAUSATIVES):
            # NOTE(review): operator precedence makes this read as
            # "vowel or (ends with l/r AND multi-syllable)" -- confirm intended
            if last_letter.vowel or (last_letter in [TurkishAlphabet.L_l, TurkishAlphabet.L_r]) and root_vowel_count>1:
                lexeme.attributes.add(LexemeAttribute.Causative_t)
            elif last_letter==TurkishAlphabet.L_t and root_vowel_count<2:
                lexeme.attributes.add(LexemeAttribute.Causative_Ir)
            else:
                lexeme.attributes.add(LexemeAttribute.Causative_dIr)

        if LexemeAttribute.ProgressiveVowelDrop in lexeme.attributes:
            lexeme.attributes.add(LexemeAttribute.NoVoicing)

        # verbs default to NoVoicing unless Voicing is explicitly marked
        if LexemeAttribute.Voicing not in lexeme.attributes and LexemeAttribute.NoVoicing not in lexeme.attributes:
            lexeme.attributes.add(LexemeAttribute.NoVoicing)

    elif lexeme.syntactic_category==SyntacticCategory.NOUN and LexemeAttribute.CompoundP3sg in lexeme.attributes:
        if LexemeAttribute.VoicingOpt in lexeme.attributes:
            # optional voicing: drop the explicit markers
            if LexemeAttribute.Voicing in lexeme.attributes:
                lexeme.attributes.remove(LexemeAttribute.Voicing)
            if LexemeAttribute.NoVoicing in lexeme.attributes:
                lexeme.attributes.remove(LexemeAttribute.NoVoicing)
        elif LexemeAttribute.Voicing not in lexeme.attributes:
            lexeme.attributes.add(LexemeAttribute.NoVoicing)

    elif lexeme.syntactic_category in [SyntacticCategory.NOUN, SyntacticCategory.ADJECTIVE]:
        if LexemeAttribute.VoicingOpt in lexeme.attributes:
            # optional voicing: drop the explicit markers
            if LexemeAttribute.Voicing in lexeme.attributes:
                lexeme.attributes.remove(LexemeAttribute.Voicing)
            if LexemeAttribute.NoVoicing in lexeme.attributes:
                lexeme.attributes.remove(LexemeAttribute.NoVoicing)
        else:
            # multi-syllable words ending with a voiceless stop voice by
            # default, unless marked otherwise
            if root_vowel_count>1 and last_letter.voiceless and not last_letter.continuant and LexemeAttribute.NoVoicing not in lexeme.attributes \
               and LexemeAttribute.InverseHarmony not in lexeme.attributes:
                lexeme.attributes.add(LexemeAttribute.Voicing)
            elif item_root.endswith('nk') or item_root.endswith('og') or item_root.endswith('rt'):
                # special-cased endings that voice regardless of syllable count
                lexeme.attributes.add(LexemeAttribute.Voicing)
            elif LexemeAttribute.Voicing not in lexeme.attributes:
                lexeme.attributes.add(LexemeAttribute.NoVoicing)
def _expectation_satisfied(cls, phonetic_expectation, form_str):
    """
    Decides whether a suffix form string satisfies a phonetic expectation
    (vowel start / consonant start) of a root.
    """
    if phonetic_expectation == PhoneticExpectation.VowelStart:
        leading = form_str[0]
        if leading == '+':
            # optional leading letter: either keeping or dropping it may
            # satisfy the expectation (short-circuit keeps recursion safe)
            return (cls._expectation_satisfied(phonetic_expectation, form_str[1:])
                    or cls._expectation_satisfied(phonetic_expectation, form_str[2:]))
        return TurkishAlphabet.get_letter_for_char(leading).vowel

    if phonetic_expectation == PhoneticExpectation.ConsonantStart:
        leading = form_str[0]
        if leading == '+':
            return (cls._expectation_satisfied(phonetic_expectation, form_str[1:])
                    or cls._expectation_satisfied(phonetic_expectation, form_str[2:]))
        return not TurkishAlphabet.get_letter_for_char(leading).vowel

    raise Exception('Unknown phonetic_expectation', phonetic_expectation)
def print_verbs_with_double_consonant_ending():
    """
    Scans the master dictionary and prints the root of every verb
    ("-mak"/"-mek" infinitive) whose last two letters are both consonants.
    """
    dictionary_file_path = os.path.join(os.path.dirname(__file__), '../resources/master_dictionary.txt')
    with codecs.open(dictionary_file_path, mode='r', encoding='utf-8') as dictionary_file:
        for raw_line in dictionary_file:
            entry = raw_line.strip()
            # skip comment lines
            if entry.startswith('#'):
                continue
            # strip the bracketed metadata part, if any
            if u'[' in entry:
                entry, _ = entry.split(u'[')
                entry = entry.strip()
            if not (entry.endswith(u'mak') or entry.endswith(u'mek')):
                continue
            stem = entry[:-3]
            last_is_consonant = not TurkishAlphabet.get_letter_for_char(stem[-1]).vowel
            previous_is_consonant = not TurkishAlphabet.get_letter_for_char(stem[-2]).vowel
            if last_is_consonant and previous_is_consonant:
                print(stem)
def __init__(self, abbr):
    """
    Creates a root for an abbreviation.

    @type abbr: unicode
    """
    lexeme = DynamicLexeme(abbr, abbr, SyntacticCategory.NOUN, SecondarySyntacticCategory.ABBREVIATION, None)

    # for consonant-final abbreviations the phonetic attributes are computed
    # as if the abbreviation ended with the vowel 'E' -- TODO confirm rationale
    ends_with_vowel = TurkishAlphabet.get_letter_for_char(abbr[-1]).vowel
    basis = abbr if ends_with_vowel else abbr + u"E"
    phonetic_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(basis)

    # abbreviations carry no phonetic expectations
    super(AbbreviationRoot, self).__init__(abbr, lexeme, None, phonetic_attributes)
def application_matches(cls, word, applied_str, voicing_allowed):
    """
    Checks if a suffix applied word is matched by a surface.

    >>> Phonetics.application_matches(u'armudunu', u'armut', True)
    True
    >>> Phonetics.application_matches(u'armudunu', u'armut', False)
    False
    >>> Phonetics.application_matches(u'armudunu', u'armudu', True)
    True
    >>> Phonetics.application_matches(u'armudunu', u'armudu', False)
    True

    @param word: The full word (surface)
    @param applied_str: Suffix applied part of the word
    @param voicing_allowed: If voicing should be considered or ignored
    @type word: unicode
    @type applied_str: unicode
    @type voicing_allowed: bool
    @rtype: L{bool}
    """
    if not applied_str or len(applied_str) > len(word):
        return False
    elif word == applied_str or word.startswith(applied_str):
        # plain prefix match, no orthographic change involved
        return True

    if voicing_allowed and word.startswith(applied_str[:-1]):
        # all but the last char match; accept if the surface char is the
        # voiced counterpart of the application's last char (e.g. armut/armud)
        last_letter_of_application = TurkishAlphabet.get_letter_for_char(applied_str[-1])
        last_letter_of_word_part = TurkishAlphabet.get_letter_for_char(word[len(applied_str) - 1])
        return TurkishAlphabet.voice(last_letter_of_application) == last_letter_of_word_part
    else:
        return False
def print_verbs_with_double_consonant_ending():
    """
    Scans the master dictionary file and prints the root of every verb
    ("-mak"/"-mek" infinitive) whose last two letters are both consonants.
    """
    dictionary_file_path = os.path.join(os.path.dirname(__file__), '../resources/master_dictionary.txt')
    with codecs.open(dictionary_file_path, mode='r', encoding='utf-8') as dictionary_file:
        for line in dictionary_file:
            line = line.strip()
            # skip comment lines
            if line.startswith('#'):
                continue
            item = line
            # strip the bracketed metadata part, if any
            if u'[' in line:
                item,meta = line.split(u'[')
                item = item.strip()
            if item.endswith(u'mak') or item.endswith(u'mek'):
                verb_root = item[:-3]
                # both the last letter and the one before it must be consonants
                if not TurkishAlphabet.get_letter_for_char(verb_root[-1]).vowel and not TurkishAlphabet.get_letter_for_char(verb_root[-2]).vowel:
                    print verb_root
def __init__(self, abbr):
    """
    Creates a root for an abbreviation.

    @type abbr: unicode
    """
    root = abbr
    lexeme = DynamicLexeme(abbr, abbr, SyntacticCategory.NOUN, SecondarySyntacticCategory.ABBREVIATION, None)

    phonetic_attributes = None
    last_letter = TurkishAlphabet.get_letter_for_char(abbr[-1])
    if last_letter.vowel:
        phonetic_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(abbr)
    else:
        # consonant-final abbreviations: compute the attributes as if the
        # abbreviation ended with the vowel 'E' -- TODO confirm rationale
        phonetic_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(abbr + u'E')

    # abbreviations carry no phonetic expectations
    phonetic_expectations = None
    super(AbbreviationRoot, self).__init__(root, lexeme, phonetic_expectations, phonetic_attributes)
def calculate_phonetic_attributes_of_plain_sequence(cls, seq):
    """
    Calculates the phonetic attributes of a word, without the root attributes of it.
    @type seq: unicode
    @rtype: set
    """
    attrs = []

    last_vowel = cls.get_last_vowel(seq)
    last_letter = TurkishAlphabet.get_letter_for_char(seq[-1])

    # roundedness and frontness of the last vowel drive vowel harmony
    if last_vowel:
        if last_vowel.rounded:
            attrs.append(PhoneticAttributes.LastVowelRounded)
        else:
            attrs.append(PhoneticAttributes.LastVowelUnrounded)
        if last_vowel.frontal:
            attrs.append(PhoneticAttributes.LastVowelFrontal)
        else:
            attrs.append(PhoneticAttributes.LastVowelBack)

    if last_letter.vowel:
        attrs.append(PhoneticAttributes.LastLetterVowel)
    else:
        attrs.append(PhoneticAttributes.LastLetterConsonant)

    if last_letter.voiceless:
        attrs.append(PhoneticAttributes.LastLetterVoiceless)
        if not last_letter.continuant:
            # voiceless non-continuants are the letters subject to voicing
            attrs.append(PhoneticAttributes.LastLetterVoicelessStop)
    else:
        attrs.append(PhoneticAttributes.LastLetterNotVoiceless)
        if not last_letter.continuant and not last_letter.vowel:
            attrs.append(PhoneticAttributes.LastLetterVoicedStop)

    if last_letter.continuant:
        attrs.append(PhoneticAttributes.LastLetterContinuant)
    else:
        attrs.append(PhoneticAttributes.LastLetterNotContinuant)

    return set(attrs)
def find_roots_for_partial_input(self, partial_input, whole_surface=None):
    """
    Finds possible verb roots for a partial surface by brute force, undoing
    possible orthographic changes (voicing, aorist vowels, progressive vowel
    drop, causative and passive forms).

    @type partial_input: unicode
    @type whole_surface: unicode
    @rtype: list of Root
    """
    assert partial_input and whole_surface
    assert len(partial_input) <= len(whole_surface)
    assert whole_surface.startswith(partial_input)
    if len(whole_surface) == len(partial_input):
        assert whole_surface == partial_input

    if len(partial_input) < 2:
        # not possible except (d,diyor) and (y,yiyor). but they are already in the dictionary
        return []

    last_vowel = Phonetics.get_last_vowel(partial_input)
    if not last_vowel:
        return []

    root = partial_input
    lemma = root
    lemma_root = lemma
    syntactic_category = SyntacticCategory.VERB
    secondary_syntactic_category = None
    lexeme_attributes = set()
    lexeme = DynamicLexeme(lemma, lemma_root, syntactic_category, secondary_syntactic_category, lexeme_attributes)
    phonetic_expectations = set()
    phonetic_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(partial_input)

    # the candidate that assumes no orthographic change at all
    no_attr_root = DynamicRoot(root, lexeme, phonetic_expectations, phonetic_attributes)
    self._set_lexeme_and_phonetic_attributes([no_attr_root])
    self._set_lemma([no_attr_root])

    last_char = partial_input[-1]
    last_letter = TurkishAlphabet.get_letter_for_char(last_char)

    partial_surface_can_be_root_of_a_verb = self._seems_like_a_valid_verb_root(partial_input)

    if whole_surface==partial_input:
        return [no_attr_root] if partial_surface_can_be_root_of_a_verb else []

    first_char_after_partial_input = whole_surface[len(partial_input)]
    if first_char_after_partial_input.isupper():
        return []
    first_letter_after_partial_input = TurkishAlphabet.get_letter_for_char(first_char_after_partial_input)

    # consonant-final input directly followed by an "-Iyor" variant:
    # presumably the root's final vowel dropped before "+Iyor"
    might_have_ProgressiveVowelDrop = not last_letter.vowel and\
                                      any([whole_surface.startswith(partial_input+s) for s in [u'iyor', u'ıyor', u'uyor', u'üyor']])

    might_have_Aorist_A = not last_letter.vowel and \
                          (whole_surface.startswith(partial_input + u'ar') or whole_surface.startswith(partial_input + u'er'))

    # no Aorist_I for -ur, -ür
    might_have_Aorist_I = not last_letter.vowel and\
                          (whole_surface.startswith(partial_input + u'ır') or whole_surface.startswith(partial_input + u'ir'))

    # for other letters, no voicing in verbs. {git+er->gider} vs {yapar, açar, diker}
    voicing_might_have_happened = last_letter==TurkishAlphabet.L_d and first_letter_after_partial_input.vowel

    possible_progressive_vowel_drop_roots = self._get_progressive_vowel_drop_roots(partial_input, whole_surface, no_attr_root, last_vowel) if might_have_ProgressiveVowelDrop else set()
    possible_aorist_A_roots = self._get_aorist_A_roots(no_attr_root) if might_have_Aorist_A else set()
    possible_aorist_I_roots = self._get_aorist_I_roots(no_attr_root) if might_have_Aorist_I else set()
    possible_causative_roots = self._get_possible_causative_roots(partial_input, whole_surface, no_attr_root)
    possible_passive_roots = self._get_possible_passive_roots(last_letter, partial_input, whole_surface, no_attr_root)

    if voicing_might_have_happened:
        # extend every candidate set with variants accounting for voicing
        possible_progressive_vowel_drop_roots = possible_progressive_vowel_drop_roots.union(set([self._get_possible_voicing_root(r) for r in possible_progressive_vowel_drop_roots]))
        possible_aorist_A_roots = possible_aorist_A_roots.union(set([self._get_possible_voicing_root(r) for r in possible_aorist_A_roots]))
        possible_aorist_I_roots = possible_aorist_I_roots.union(set([self._get_possible_voicing_root(r) for r in possible_aorist_I_roots]))
        possible_causative_roots = possible_causative_roots.union(set([self._get_possible_voicing_root(r) for r in possible_causative_roots]))
        possible_passive_roots = possible_passive_roots.union(set([self._get_possible_voicing_root(r) for r in possible_passive_roots]))

    generated_roots = set()

    generated_roots.add(no_attr_root)
    if voicing_might_have_happened:
        generated_roots.add(self._get_possible_voicing_root(no_attr_root))

    generated_roots = generated_roots.union(possible_progressive_vowel_drop_roots)
    generated_roots = generated_roots.union(possible_aorist_A_roots)
    generated_roots = generated_roots.union(possible_aorist_I_roots)
    generated_roots = generated_roots.union(possible_causative_roots)
    generated_roots = generated_roots.union(possible_passive_roots)

    self._set_lexeme_and_phonetic_attributes(generated_roots)
    self._set_lemma(generated_roots)

    generated_roots = list(generated_roots)

    # drop candidates that cannot be a verb root at all.
    # NOTE(review): filter() returns an iterator on Python 3; the declared
    # 'list of Root' return type holds on Python 2 only
    generated_roots = filter(lambda r: self._seems_like_a_valid_verb_root(r.lexeme.root), generated_roots)

    return generated_roots
def _generate_modified_root_nodes(cls, lexeme):
    """
    Generates the original root and a modified root for a lexeme, applying
    the orthographic changes implied by its attributes (voicing, doubling,
    last vowel drop, inverse harmony, progressive vowel drop).

    @type lexeme: Lexeme
    @rtype: list of Root
    """
    if LexemeAttribute.RootChange in lexeme.attributes:
        # irregular roots are handled case by case
        special_roots = cls._handle_special_roots(lexeme)
        if special_roots:
            return special_roots

    modified_seq = lexeme.root

    original_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(lexeme.root)
    modified_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(lexeme.root)
    original_phonetic_expectations = set()
    modified_phonetic_expectations = set()

    if LexemeAttribute.Voicing in lexeme.attributes or LexemeAttribute.VoicingOpt in lexeme.attributes:
        last_letter = TurkishAlphabet.get_letter_for_char(modified_seq[-1])
        modified_letter = TurkishAlphabet.voice(last_letter)
        assert modified_letter is not None
        if lexeme.lemma.endswith(u"nk"):
            # 'nk' endings voice their 'k' to 'g' instead of the default counterpart
            modified_letter = TurkishAlphabet.L_g
        modified_seq = modified_seq[:-1] + modified_letter.char_value

        # fix up the phonetic attributes of the voiced variant
        if PhoneticAttributes.LastLetterVoicelessStop in modified_attributes:
            modified_attributes.remove(PhoneticAttributes.LastLetterVoicelessStop)
        if modified_letter.continuant:
            if PhoneticAttributes.LastLetterNotContinuant in modified_attributes:
                modified_attributes.remove(PhoneticAttributes.LastLetterNotContinuant)
            modified_attributes.add(PhoneticAttributes.LastLetterContinuant)
        else:
            if PhoneticAttributes.LastLetterContinuant in modified_attributes:
                modified_attributes.remove(PhoneticAttributes.LastLetterContinuant)
            modified_attributes.add(PhoneticAttributes.LastLetterNotContinuant)

        if LexemeAttribute.VoicingOpt not in lexeme.attributes:
            # obligatory voicing: original form only before consonants,
            # modified form only before vowels
            original_phonetic_expectations.add(PhoneticExpectation.ConsonantStart)
            modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

    if LexemeAttribute.Doubling in lexeme.attributes:
        # duplicate the final consonant; doubled form is used before vowels
        modified_seq = modified_seq + modified_seq[-1]
        original_phonetic_expectations.add(PhoneticExpectation.ConsonantStart)
        modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

    if LexemeAttribute.LastVowelDrop in lexeme.attributes:
        # drop the character before the last one (the last vowel)
        modified_seq = modified_seq[:-2] + modified_seq[-1]
        if lexeme.syntactic_category != SyntacticCategory.VERB:
            original_phonetic_expectations.add(PhoneticExpectation.ConsonantStart)
        modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

    if LexemeAttribute.InverseHarmony in lexeme.attributes:
        # force frontal harmony on both variants
        original_attributes.add(PhoneticAttributes.LastVowelFrontal)
        if PhoneticAttributes.LastVowelBack in original_attributes:
            original_attributes.remove(PhoneticAttributes.LastVowelBack)
        modified_attributes.add(PhoneticAttributes.LastVowelFrontal)
        if PhoneticAttributes.LastVowelBack in modified_attributes:
            modified_attributes.remove(PhoneticAttributes.LastVowelBack)

    if LexemeAttribute.ProgressiveVowelDrop in lexeme.attributes:
        # drop the final vowel; recompute attributes only if a vowel remains
        modified_seq = modified_seq[:-1]
        if RootGenerator._has_vowel(modified_seq):
            modified_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(modified_seq)
        modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

    # empty expectation sets are normalized to None
    original_phonetic_expectations = original_phonetic_expectations or None
    modified_phonetic_expectations = modified_phonetic_expectations or None

    original = Root(lexeme.root, lexeme, original_phonetic_expectations, original_attributes)
    modified = Root(modified_seq, lexeme, modified_phonetic_expectations, modified_attributes)

    if original == modified:
        return [original]
    else:
        return [original, modified]
def _has_vowel(cls, seq): for s in seq: if TurkishAlphabet.get_letter_for_char(s).vowel: return True return False
def _generate_modified_root_nodes(cls, lexeme):
    """
    Generates the original root and a modified root for a lexeme, applying
    the orthographic changes implied by its attributes (voicing, doubling,
    last vowel drop, inverse harmony, progressive vowel drop).

    @type lexeme: Lexeme
    @rtype: list of Root
    """
    if LexemeAttribute.RootChange in lexeme.attributes:
        # irregular roots are handled case by case
        special_roots = cls._handle_special_roots(lexeme)
        if special_roots:
            return special_roots

    modified_seq = lexeme.root

    original_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(lexeme.root)
    modified_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(lexeme.root)
    original_phonetic_expectations = set()
    modified_phonetic_expectations = set()

    if LexemeAttribute.Voicing in lexeme.attributes or LexemeAttribute.VoicingOpt in lexeme.attributes:
        last_letter = TurkishAlphabet.get_letter_for_char(modified_seq[-1])
        modified_letter = TurkishAlphabet.voice(last_letter)
        assert modified_letter is not None
        if lexeme.lemma.endswith(u"nk"):
            # 'nk' endings voice their 'k' to 'g' instead of the default counterpart
            modified_letter = TurkishAlphabet.L_g
        modified_seq = modified_seq[:-1] + modified_letter.char_value

        # fix up the phonetic attributes of the voiced variant
        if PhoneticAttributes.LastLetterVoicelessStop in modified_attributes:
            modified_attributes.remove(PhoneticAttributes.LastLetterVoicelessStop)
        if modified_letter.continuant:
            if PhoneticAttributes.LastLetterNotContinuant in modified_attributes :
                modified_attributes.remove(PhoneticAttributes.LastLetterNotContinuant)
            modified_attributes.add(PhoneticAttributes.LastLetterContinuant)
        else:
            if PhoneticAttributes.LastLetterContinuant in modified_attributes:
                modified_attributes.remove(PhoneticAttributes.LastLetterContinuant)
            modified_attributes.add(PhoneticAttributes.LastLetterNotContinuant)

        if LexemeAttribute.VoicingOpt not in lexeme.attributes:
            # obligatory voicing: original form only before consonants,
            # modified form only before vowels
            original_phonetic_expectations.add(PhoneticExpectation.ConsonantStart)
            modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

    if LexemeAttribute.Doubling in lexeme.attributes:
        # duplicate the final consonant; doubled form is used before vowels
        modified_seq = modified_seq + modified_seq[-1]
        original_phonetic_expectations.add(PhoneticExpectation.ConsonantStart)
        modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

    if LexemeAttribute.LastVowelDrop in lexeme.attributes:
        # drop the character before the last one (the last vowel)
        modified_seq = modified_seq[:-2] + modified_seq[-1]
        if lexeme.syntactic_category!=SyntacticCategory.VERB:
            original_phonetic_expectations.add(PhoneticExpectation.ConsonantStart)
        modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

    if LexemeAttribute.InverseHarmony in lexeme.attributes:
        # force frontal harmony on both variants
        original_attributes.add(PhoneticAttributes.LastVowelFrontal)
        if PhoneticAttributes.LastVowelBack in original_attributes:
            original_attributes.remove(PhoneticAttributes.LastVowelBack)
        modified_attributes.add(PhoneticAttributes.LastVowelFrontal)
        if PhoneticAttributes.LastVowelBack in modified_attributes:
            modified_attributes.remove(PhoneticAttributes.LastVowelBack)

    if LexemeAttribute.ProgressiveVowelDrop in lexeme.attributes:
        # drop the final vowel; recompute attributes only if a vowel remains
        modified_seq = modified_seq[:-1]
        if RootGenerator._has_vowel(modified_seq):
            modified_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(modified_seq)
        modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

    # empty expectation sets are normalized to None
    original_phonetic_expectations = original_phonetic_expectations or None
    modified_phonetic_expectations = modified_phonetic_expectations or None

    original = Root(lexeme.root, lexeme, original_phonetic_expectations, original_attributes)
    modified = Root(modified_seq, lexeme, modified_phonetic_expectations, modified_attributes)

    if original==modified:
        return [original]
    else:
        return [original, modified]
def get_last_vowel(cls, seq):
    """
    Returns the last vowel letter of the sequence, or None when the sequence
    has no vowel.

    @type seq: unicode
    @rtype: TurkishLetter or None
    """
    for ch in reversed(seq):
        candidate = TurkishAlphabet.get_letter_for_char(ch)
        if candidate.vowel:
            return candidate
    return None
def find_roots_for_partial_input(self, partial_input, whole_surface=None):
    """Brute-force finder for compound-P3sg noun roots (e.g. "suborusu").

    Delegates to the plain noun root finder with the trailing possessive
    marker chopped off, then rewrites the clones' root/lemma to the full
    partial input and tags them CompoundP3sg.

    @type partial_input: unicode
    @type whole_surface: unicode
    @rtype: list of Root
    """
    assert partial_input and whole_surface
    assert len(partial_input) <= len(whole_surface)
    assert whole_surface.startswith(partial_input)
    if len(whole_surface) == len(partial_input):
        assert whole_surface == partial_input

    # no compound should be found an input shorter than sth like "atsu-yu". even that doesn't make sense
    if len(partial_input) < 5:
        return []

    # A compound candidate must be a strict prefix of the surface (suffix must follow).
    if whole_surface == partial_input:
        return []

    last_char = partial_input[-1]
    previous_char = partial_input[-2]

    # Uppercase near the split point suggests a proper noun; skip.
    if last_char.isupper() or previous_char.isupper():
        return []

    # The P3sg possessive ends in one of the four close vowels i/u/ı/ü.
    last_letter = TurkishAlphabet.get_letter_for_char(last_char)
    if last_letter!=TurkishAlphabet.L_i and last_letter!=TurkishAlphabet.L_u and\
       last_letter!=TurkishAlphabet.L_ii and last_letter!=TurkishAlphabet.L_uu:
        return []

    first_char_after_partial_input = whole_surface[len(partial_input)]

    if first_char_after_partial_input.isupper():
        return []

    # CompoundP3sg roots are only detectable when the buffer letter 'n' follows.
    first_letter_after_partial_input = TurkishAlphabet.get_letter_for_char(first_char_after_partial_input)
    if first_letter_after_partial_input != TurkishAlphabet.L_n:
        return []

    if len(whole_surface) < len(partial_input) + 2:  # need a char after char 'n'
        return []

    compound_results = []

    results_with_partial_input_one_char_missing = self.brute_force_noun_root_finder.find_roots_for_partial_input(partial_input[:-1], whole_surface)

    # illustrate:
    # partial_input = suborusu, whole_surface = suborusuna
    # results_with_partial_input_one_char_missing : <'suborus','suborus'>
    # partial_input = bacakkalemi, whole_surface = bacakkalemini
    # results_with_partial_input_one_char_missing : <'bacakkalem','bacakkalem'>
    for normal_noun_result in results_with_partial_input_one_char_missing:
        clone_result = normal_noun_result._clone(True)
        clone_result.str = clone_result.lexeme.root
        clone_result.lexeme.root = partial_input
        clone_result.lexeme.lemma = partial_input
        compound_results.append(clone_result)

    # If the char before the final vowel is 's', the 's' itself may be the
    # possessive buffer consonant: retry with two chars chopped (suborusu -> suboru).
    previous_letter = TurkishAlphabet.get_letter_for_char(previous_char)
    if previous_letter==TurkishAlphabet.L_s:
        results_with_partial_input_two_chars_missing = self.brute_force_noun_root_finder.find_roots_for_partial_input(partial_input[:-2], whole_surface)

        # illustrate:
        # partial_input = suborusu, whole_surface = suborusuna
        # results_with_partial_input_two_chars_missing : <'suboru','suboru'>
        for normal_noun_result in results_with_partial_input_two_chars_missing:
            clone_result = normal_noun_result._clone(True)
            # NOTE(review): unlike the one-char-missing loop above, clone_result.str
            # is NOT rewritten here — confirm whether that asymmetry is intentional.
            clone_result.lexeme.root = partial_input
            clone_result.lexeme.lemma = partial_input
            compound_results.append(clone_result)

    for compound_result in compound_results:
        compound_result.lexeme.attributes.add(LexemeAttribute.CompoundP3sg)

    return compound_results
def find_roots_for_partial_input(self, partial_input, whole_surface=None):
    """Brute-force finder for verb roots within a surface form.

    Treats ``partial_input`` as a hypothetical verb root, then generates
    candidate variants for progressive vowel drop, aorist -A/-I, causative,
    passive, and final-'d' voicing, keeping only those that still look like
    valid verb roots.

    @type partial_input: unicode
    @type whole_surface: unicode
    @rtype: list of Root
    """
    assert partial_input and whole_surface
    assert len(partial_input) <= len(whole_surface)
    assert whole_surface.startswith(partial_input)
    if len(whole_surface) == len(partial_input):
        assert whole_surface == partial_input

    if len(partial_input) < 2:  # not possible except (d,diyor) and (y,yiyor). but they are already in the dictionary
        return []

    last_vowel = Phonetics.get_last_vowel(partial_input)

    # A verb root must contain at least one vowel.
    if not last_vowel:
        return []

    # Build the baseline candidate: the partial input itself as a VERB lexeme.
    root = partial_input
    lemma = root
    lemma_root = lemma
    syntactic_category = SyntacticCategory.VERB
    secondary_syntactic_category = None
    lexeme_attributes = set()

    lexeme = DynamicLexeme(lemma, lemma_root, syntactic_category, secondary_syntactic_category, lexeme_attributes)

    phonetic_expectations = set()
    phonetic_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(partial_input)

    no_attr_root = DynamicRoot(root, lexeme, phonetic_expectations, phonetic_attributes)

    self._set_lexeme_and_phonetic_attributes([no_attr_root])
    self._set_lemma([no_attr_root])

    last_char = partial_input[-1]
    last_letter = TurkishAlphabet.get_letter_for_char(last_char)

    partial_surface_can_be_root_of_a_verb = self._seems_like_a_valid_verb_root(partial_input)

    # Whole surface consumed: only the plain candidate (if plausible) can apply.
    if whole_surface == partial_input:
        return [no_attr_root] if partial_surface_can_be_root_of_a_verb else []

    first_char_after_partial_input = whole_surface[len(partial_input)]
    if first_char_after_partial_input.isupper():
        return []

    first_letter_after_partial_input = TurkishAlphabet.get_letter_for_char(first_char_after_partial_input)

    # Suffix heuristics: which phonetic events could explain the following chars?
    might_have_ProgressiveVowelDrop = not last_letter.vowel and\
        any([whole_surface.startswith(partial_input + s) for s in [u'iyor', u'ıyor', u'uyor', u'üyor']])

    might_have_Aorist_A = not last_letter.vowel and \
        (whole_surface.startswith(partial_input + u'ar') or whole_surface.startswith(partial_input + u'er'))

    # no Aorist_I for -ur, -ür
    might_have_Aorist_I = not last_letter.vowel and\
        (whole_surface.startswith(partial_input + u'ır') or whole_surface.startswith(partial_input + u'ir'))

    # for other letters, no voicing in verbs. {git+er->gider} vs {yapar, açar, diker}
    voicing_might_have_happened = last_letter == TurkishAlphabet.L_d and first_letter_after_partial_input.vowel

    possible_progressive_vowel_drop_roots = self._get_progressive_vowel_drop_roots(partial_input, whole_surface, no_attr_root, last_vowel) if might_have_ProgressiveVowelDrop else set()
    possible_aorist_A_roots = self._get_aorist_A_roots(no_attr_root) if might_have_Aorist_A else set()
    possible_aorist_I_roots = self._get_aorist_I_roots(no_attr_root) if might_have_Aorist_I else set()
    possible_causative_roots = self._get_possible_causative_roots(partial_input, whole_surface, no_attr_root)
    possible_passive_roots = self._get_possible_passive_roots(last_letter, partial_input, whole_surface, no_attr_root)

    # Final 'd' may be a voiced 't' (git+er -> gider): add a devoiced twin of every candidate.
    if voicing_might_have_happened:
        possible_progressive_vowel_drop_roots = possible_progressive_vowel_drop_roots.union(set([self._get_possible_voicing_root(r) for r in possible_progressive_vowel_drop_roots]))
        possible_aorist_A_roots = possible_aorist_A_roots.union(set([self._get_possible_voicing_root(r) for r in possible_aorist_A_roots]))
        possible_aorist_I_roots = possible_aorist_I_roots.union(set([self._get_possible_voicing_root(r) for r in possible_aorist_I_roots]))
        possible_causative_roots = possible_causative_roots.union(set([self._get_possible_voicing_root(r) for r in possible_causative_roots]))
        possible_passive_roots = possible_passive_roots.union(set([self._get_possible_voicing_root(r) for r in possible_passive_roots]))

    generated_roots = set()

    generated_roots.add(no_attr_root)
    if voicing_might_have_happened:
        generated_roots.add(self._get_possible_voicing_root(no_attr_root))

    generated_roots = generated_roots.union(possible_progressive_vowel_drop_roots)
    generated_roots = generated_roots.union(possible_aorist_A_roots)
    generated_roots = generated_roots.union(possible_aorist_I_roots)
    generated_roots = generated_roots.union(possible_causative_roots)
    generated_roots = generated_roots.union(possible_passive_roots)

    self._set_lexeme_and_phonetic_attributes(generated_roots)
    self._set_lemma(generated_roots)

    generated_roots = list(generated_roots)

    # NOTE(review): Python 2 semantics assumed here — under Python 3 this filter()
    # would return a lazy iterator rather than a list; confirm target runtime.
    generated_roots = filter(lambda r: self._seems_like_a_valid_verb_root(r.lexeme.root), generated_roots)

    return generated_roots
def _infer_morphemic_attributes(cls, lexeme):
    """Infer and attach default morphophonemic attributes for a lexeme in place.

    Fills in aorist, passive, causative and (no-)voicing attributes based on
    the root's final letter, vowel count and syntactic category, unless the
    dictionary already specified them.

    @type lexeme: Lexeme
    """
    item_root = lexeme.root
    root_vowel_count = cls._vowel_count(item_root)
    last_letter = TurkishAlphabet.get_letter_for_char(item_root[-1])

    if lexeme.syntactic_category == SyntacticCategory.VERB:
        # Vowel-final verb roots drop that vowel before -Iyor and take passive -In.
        if last_letter.vowel:
            lexeme.attributes.add(LexemeAttribute.ProgressiveVowelDrop)
            lexeme.attributes.add(LexemeAttribute.Passive_In)

        # Default aorist: multi-syllable roots take -I, monosyllabic take -A,
        # unless the dictionary explicitly marked the opposite.
        if root_vowel_count > 1 and LexemeAttribute.Aorist_A not in lexeme.attributes:
            lexeme.attributes.add(LexemeAttribute.Aorist_I)

        if root_vowel_count == 1 and LexemeAttribute.Aorist_I not in lexeme.attributes:
            lexeme.attributes.add(LexemeAttribute.Aorist_A)

        if last_letter == TurkishAlphabet.L_l:
            lexeme.attributes.add(LexemeAttribute.Passive_In)

        # Pick a default causative only when none was specified.
        if all(a not in lexeme.attributes for a in LexemeAttribute.CAUSATIVES):
            # NOTE(review): due to Python precedence this reads as
            # vowel OR ((l-or-r) AND multi-syllable) — i.e. the vowel-count
            # guard applies only to the l/r case; confirm that is intended.
            if last_letter.vowel or (last_letter in [TurkishAlphabet.L_l, TurkishAlphabet.L_r]) and root_vowel_count > 1:
                lexeme.attributes.add(LexemeAttribute.Causative_t)
            elif last_letter == TurkishAlphabet.L_t and root_vowel_count < 2:
                lexeme.attributes.add(LexemeAttribute.Causative_Ir)
            else:
                lexeme.attributes.add(LexemeAttribute.Causative_dIr)

        if LexemeAttribute.ProgressiveVowelDrop in lexeme.attributes:
            lexeme.attributes.add(LexemeAttribute.NoVoicing)

        # Verbs default to NoVoicing when the dictionary stated neither.
        if LexemeAttribute.Voicing not in lexeme.attributes and LexemeAttribute.NoVoicing not in lexeme.attributes:
            lexeme.attributes.add(LexemeAttribute.NoVoicing)

    elif lexeme.syntactic_category == SyntacticCategory.NOUN and LexemeAttribute.CompoundP3sg in lexeme.attributes:
        # Optional voicing overrides any explicit (No)Voicing marks.
        if LexemeAttribute.VoicingOpt in lexeme.attributes:
            if LexemeAttribute.Voicing in lexeme.attributes:
                lexeme.attributes.remove(LexemeAttribute.Voicing)
            if LexemeAttribute.NoVoicing in lexeme.attributes:
                lexeme.attributes.remove(LexemeAttribute.NoVoicing)
        elif LexemeAttribute.Voicing not in lexeme.attributes:
            lexeme.attributes.add(LexemeAttribute.NoVoicing)

    elif lexeme.syntactic_category in [SyntacticCategory.NOUN, SyntacticCategory.ADJECTIVE]:
        if LexemeAttribute.VoicingOpt in lexeme.attributes:
            # Optional voicing overrides any explicit (No)Voicing marks.
            if LexemeAttribute.Voicing in lexeme.attributes:
                lexeme.attributes.remove(LexemeAttribute.Voicing)
            if LexemeAttribute.NoVoicing in lexeme.attributes:
                lexeme.attributes.remove(LexemeAttribute.NoVoicing)
        else:
            # Multi-syllable roots ending in a voiceless stop voice by default
            # (kitap -> kitabı), unless explicitly marked otherwise.
            if root_vowel_count>1 and last_letter.voiceless and not last_letter.continuant and LexemeAttribute.NoVoicing not in lexeme.attributes \
               and LexemeAttribute.InverseHarmony not in lexeme.attributes:
                lexeme.attributes.add(LexemeAttribute.Voicing)
            # NOTE(review): the 'og' literal looks suspicious next to 'nk'/'rt'
            # — possibly intended to be u'oğ'; confirm against the dictionary data.
            elif item_root.endswith('nk') or item_root.endswith('og') or item_root.endswith('rt'):
                lexeme.attributes.add(LexemeAttribute.Voicing)
            elif LexemeAttribute.Voicing not in lexeme.attributes:
                lexeme.attributes.add(LexemeAttribute.NoVoicing)