def _expectation_satisfied(cls, phonetic_expectation, form_str):
    """
    Checks whether a suffix form string can satisfy the given phonetic
    expectation (vowel start or consonant start) of a root.

    @type phonetic_expectation: PhoneticExpectation
    @type form_str: unicode
    @rtype: bool
    """
    if phonetic_expectation == PhoneticExpectation.VowelStart:
        first_char = form_str[0]
        if first_char == '+':
            # '+X...' marks an optional first letter: the expectation is
            # satisfied if either keeping or dropping the optional letter works
            return cls._expectation_satisfied(phonetic_expectation, form_str[1:]) or cls._expectation_satisfied(phonetic_expectation, form_str[2:])
        else:
            return TurkishAlphabet.get_letter_for_char(first_char).vowel
    elif phonetic_expectation == PhoneticExpectation.ConsonantStart:
        first_char = form_str[0]
        if first_char == '+':
            # same optional-letter handling as above
            return cls._expectation_satisfied(phonetic_expectation, form_str[1:]) or cls._expectation_satisfied(phonetic_expectation, form_str[2:])
        else:
            return not TurkishAlphabet.get_letter_for_char(first_char).vowel
    else:
        raise Exception('Unknown phonetic_expectation', phonetic_expectation)
def _get_voicing_and_doubling_roots(self, partial_input, last_char, first_char_after_partial_input, no_orthographics_root):
    """
    Builds candidate roots for a partial surface by undoing possible voicing
    and consonant-doubling orthographic changes.

    @param partial_input: Surface prefix being inspected
    @param last_char: Last char of the partial input
    @param first_char_after_partial_input: Char right after the partial input in the whole surface
    @param no_orthographics_root: Candidate root assuming no orthographic change happened
    @rtype: list of Root
    """
    last_letter = TurkishAlphabet.get_letter_for_char(last_char)
    first_letter_after_partial_input = TurkishAlphabet.get_letter_for_char(first_char_after_partial_input)

    # a voiceable last letter followed by a vowel, yet still unvoiced on the
    # surface, means the lexeme must carry NoVoicing
    no_voicing_rule_applies = last_letter in TurkishAlphabet.Voicing_Map and first_letter_after_partial_input.vowel
    # a voiced last letter before a vowel may be the voiced form of an
    # originally voiceless letter
    voicing_might_have_happened = last_letter in TurkishAlphabet.Inverse_Voicing_Map and first_letter_after_partial_input.vowel
    # two identical word-final consonants before a vowel may come from doubling
    doubling_might_have_happened = len(partial_input) > 2 and\
                                   not last_letter.vowel and\
                                   partial_input[-1] == partial_input[-2] and\
                                   first_letter_after_partial_input.vowel

    if doubling_might_have_happened:
        if no_voicing_rule_applies:
            doubling_root = self._create_doubling_root(no_orthographics_root, last_char)
            # NOTE(review): this replaces the whole attribute set rather than
            # adding to it -- confirm that is intended
            no_orthographics_root.lexeme.attributes = {LexemeAttribute.NoVoicing}
            doubling_root.lexeme.attributes.add(LexemeAttribute.NoVoicing)
            return [no_orthographics_root, doubling_root]
        elif voicing_might_have_happened:
            # consider devoiced variants, plus doubling applied on top of each
            inverse_devoicing_roots = self._inverse_devoice_last_letter(no_orthographics_root, last_letter)
            devoicing_doubling_roots = [self._create_doubling_root(r, r.lexeme.root[-1]) for r in inverse_devoicing_roots]
            doubling_root = self._create_doubling_root(no_orthographics_root, last_char)
            return [no_orthographics_root] + [doubling_root] + devoicing_doubling_roots
        else:
            return [no_orthographics_root] + [self._create_doubling_root(no_orthographics_root, last_char)]
    else:
        if no_voicing_rule_applies:
            # NOTE(review): replaces the whole attribute set -- confirm intended
            no_orthographics_root.lexeme.attributes = {LexemeAttribute.NoVoicing}
            return [no_orthographics_root]
        elif voicing_might_have_happened:
            return [no_orthographics_root] + self._inverse_devoice_last_letter(no_orthographics_root, last_letter)
        else:
            return [no_orthographics_root]
def application_matches(cls, word, applied_str, voicing_allowed):
    """
    Checks if a suffix applied word is matched by a surface.

    >>> Phonetics.application_matches(u'armudunu', u'armut', True)
    True
    >>> Phonetics.application_matches(u'armudunu', u'armut', False)
    False
    >>> Phonetics.application_matches(u'armudunu', u'armudu', True)
    True
    >>> Phonetics.application_matches(u'armudunu', u'armudu', False)
    True

    @param word: The full word (surface)
    @param applied_str: Suffix applied part of the word
    @param voicing_allowed: If voicing should be considered or ignored
    @type word: unicode
    @type applied_str: unicode
    @type voicing_allowed: bool
    @rtype: L{bool}
    """
    # an empty or too-long application can never match
    if not applied_str or len(applied_str) > len(word):
        return False

    # a plain prefix match (this also covers exact equality)
    if word.startswith(applied_str):
        return True

    # otherwise only a voicing difference on the last char may be tolerated
    if not (voicing_allowed and word.startswith(applied_str[:-1])):
        return False

    application_last = TurkishAlphabet.get_letter_for_char(applied_str[-1])
    surface_counterpart = TurkishAlphabet.get_letter_for_char(word[len(applied_str) - 1])
    return TurkishAlphabet.voice(application_last) == surface_counterpart
def _handle_phonetics(cls, word, phonetic_attributes, form_str, lexeme_attributes=None):
    """
    Attaches a suffix form to a word, applying voicing/devoicing and
    resolving the vowel-harmony meta characters of the form.

    Meta characters in form_str: uppercase vowels 'A', 'I', 'O' are resolved
    by vowel harmony against the word's phonetic attributes; '!' forces the
    unrounded variant of the preceding 'I' and is itself never emitted.

    @param word: Surface so far
    @param phonetic_attributes: Phonetic attributes of the surface
    @param form_str: Suffix form, possibly containing meta characters
    @param lexeme_attributes: Lexeme attributes of the root of the surface
    @return: Tuple (possibly voiced word, resolved suffix form)
    """
    lexeme_attributes = lexeme_attributes or []
    phonetic_attributes = phonetic_attributes or []

    first_letter_of_form = TurkishAlphabet.get_letter_for_char(form_str[0])

    # first apply voicing if possible
    if LexemeAttribute.NoVoicing not in lexeme_attributes and PhoneticAttributes.LastLetterVoicelessStop in phonetic_attributes and first_letter_of_form.vowel:
        voiced_letter = TurkishAlphabet.voice(TurkishAlphabet.get_letter_for_char(word[-1]))
        if voiced_letter:
            word = word[:-1] + voiced_letter.char_value

    # then try devoicing
    if PhoneticAttributes.LastLetterVoiceless in phonetic_attributes and TurkishAlphabet.devoice(first_letter_of_form):
        form_str = TurkishAlphabet.devoice(first_letter_of_form).char_value + form_str[1:]

    applied = u''
    for i in range(len(form_str)):
        c = form_str[i]
        # one-char lookahead, used for the '!' marker after 'I'
        next_c = form_str[i + 1] if i + 1 < len(form_str) else None
        if c == '!':
            # the marker itself is never emitted
            continue
        letter = TurkishAlphabet.get_letter_for_char(c)
        if letter.vowel and letter.upper_case_char_value == c:
            # uppercase vowel meta characters, resolved by vowel harmony
            if c == u'A':
                if PhoneticAttributes.LastVowelBack in phonetic_attributes:
                    applied += u'a'
                else:
                    applied += u'e'
            elif c == u'I':
                if PhoneticAttributes.LastVowelBack in phonetic_attributes:
                    # a following '!' forces the unrounded form
                    if PhoneticAttributes.LastVowelUnrounded in phonetic_attributes or next_c == '!':
                        applied += u'ı'
                    else:
                        applied += u'u'
                else:
                    if PhoneticAttributes.LastVowelUnrounded in phonetic_attributes or next_c == '!':
                        applied += u'i'
                    else:
                        applied += u'ü'
            elif c == u'O':
                if PhoneticAttributes.LastVowelBack in phonetic_attributes:
                    applied += u'o'
                else:
                    applied += u'ö'
        else:
            # plain characters are copied as-is
            applied = applied + c
    return word, applied
def apply(cls, word, phonetic_attributes, form_str, lexeme_attributes=None):
    """
    Applies a suffix form to a word, considering the phonetics and root
    attributes given.

    @param word: Surface
    @type word: unicode
    @param phonetic_attributes: Provided phonetics of the surface
    @type phonetic_attributes: set of unicode
    @param form_str: Suffix form
    @type form_str: unicode
    @param lexeme_attributes: Provided lexeme attributes of the root of surface
    @type lexeme_attributes: set of unicode
    @return: Tuple (word, applied suffix form)
    @rtype: tuple
    """
    # an empty form applies trivially; an empty word cannot take a form
    if not form_str or not form_str.strip():
        return word, u''
    if not word or not word.strip():
        return None, None

    # ci, dik, +yacak, +iyor, +ar, +yi, +im, +yla
    leading_letter = TurkishAlphabet.get_letter_for_char(form_str[0])
    if leading_letter.char_value != '+':
        return cls._handle_phonetics(word, phonetic_attributes, form_str, lexeme_attributes)

    # '+X...' : X is an optional letter. It is dropped when it is a vowel
    # after a vowel-final word (ata, dana) or a consonant after a
    # consonant-final word (yap, kitap); otherwise it is kept.
    optional_letter = TurkishAlphabet.get_letter_for_char(form_str[1])
    word_ends_with_vowel = PhoneticAttributes.LastLetterVowel in phonetic_attributes
    if bool(optional_letter.vowel) == word_ends_with_vowel:
        # drop the optional letter and apply the rest of the form
        return cls.apply(word, phonetic_attributes, form_str[2:], lexeme_attributes)
    # keep the optional letter
    return cls._handle_phonetics(word, phonetic_attributes, form_str[1:], lexeme_attributes)
def _seems_like_a_valid_verb_root(self, seq):
    """
    Heuristically decides whether seq could be a Turkish verb root, by
    looking at its last two letters. Assumes len(seq) >= 2.

    A candidate is accepted when it ends with a vowel, when the letter
    before the last is a vowel, or when the letter before the last is one
    of 'l', 'r', 'n' and the last letter is a non-continuant consonant.

    @type seq: unicode
    @rtype: bool
    """
    last_letter = TurkishAlphabet.get_letter_for_char(seq[-1])
    previous_letter = TurkishAlphabet.get_letter_for_char(seq[-2])

    # membership test replaces the former any([... == l for l in [...]]) construct
    return last_letter.vowel or previous_letter.vowel or \
           (previous_letter in [TurkishAlphabet.L_l, TurkishAlphabet.L_r, TurkishAlphabet.L_n]
            and not last_letter.continuant)
def _handle_phonetics(cls, word, phonetic_attributes, form_str, lexeme_attributes=None):
    """
    Attaches a suffix form to a word, resolving voicing/devoicing and the
    vowel-harmony meta characters of the form.

    Meta characters in form_str: uppercase vowels 'A', 'I', 'O' are resolved
    by vowel harmony against the word's phonetic attributes; '!' forces the
    unrounded variant of the preceding 'I' and is itself never emitted.

    @param word: Surface so far
    @param phonetic_attributes: Phonetic attributes of the surface
    @param form_str: Suffix form with possible meta characters
    @param lexeme_attributes: Lexeme attributes of the root of the surface
    @return: Tuple (possibly voiced word, resolved suffix form)
    """
    lexeme_attributes = lexeme_attributes or []
    phonetic_attributes = phonetic_attributes or []

    form_head = TurkishAlphabet.get_letter_for_char(form_str[0])

    # voice the word's last letter when a vowel-initial form follows a
    # voiceless stop and the lexeme does not forbid voicing
    voicing_applies = (LexemeAttribute.NoVoicing not in lexeme_attributes
                       and PhoneticAttributes.LastLetterVoicelessStop in phonetic_attributes
                       and form_head.vowel)
    if voicing_applies:
        voiced = TurkishAlphabet.voice(TurkishAlphabet.get_letter_for_char(word[-1]))
        if voiced:
            word = word[:-1] + voiced.char_value

    # devoice the form's first letter after a voiceless word-final letter
    if PhoneticAttributes.LastLetterVoiceless in phonetic_attributes:
        devoiced = TurkishAlphabet.devoice(form_head)
        if devoiced:
            form_str = devoiced.char_value + form_str[1:]

    applied = u''
    for idx, ch in enumerate(form_str):
        lookahead = form_str[idx + 1] if idx + 1 < len(form_str) else None
        if ch == '!':
            # marker only; never emitted
            continue
        letter = TurkishAlphabet.get_letter_for_char(ch)
        if not (letter.vowel and letter.upper_case_char_value == ch):
            # plain characters are copied as-is
            applied = applied + ch
            continue
        # uppercase vowel meta characters, resolved by vowel harmony
        is_back = PhoneticAttributes.LastVowelBack in phonetic_attributes
        if ch == u'A':
            applied += u'a' if is_back else u'e'
        elif ch == u'I':
            # a following '!' forces the unrounded form
            unrounded = (PhoneticAttributes.LastVowelUnrounded in phonetic_attributes
                         or lookahead == '!')
            if is_back:
                applied += u'ı' if unrounded else u'u'
            else:
                applied += u'i' if unrounded else u'ü'
        elif ch == u'O':
            applied += u'o' if is_back else u'ö'
    return word, applied
def is_suffix_form_applicable(cls, word, form_str):
    """
    Calculates the phonetics of the word and a suffix form and determines if the suffix form is applicable.
    @type word: unicode or None
    @type form_str: unicode or None
    @rtype: bool
    """
    # an empty suffix form is applicable to anything
    if not form_str or not form_str.strip():
        return True
    # ...but a non-empty form cannot be applied to an empty word
    if not word or not word.strip():
        return False

    word = word.strip()
    form_str = form_str.strip()

    phonetic_attributes = cls.calculate_phonetic_attributes_of_plain_sequence(word)

    # ci, dik, +yacak, +iyor, +ar, +yi, +im, +yla
    first_form_letter = TurkishAlphabet.get_letter_for_char(form_str[0])
    if first_form_letter.char_value == '+':
        # +yacak, +iyor, +ar, +yi, +im, +yla : first letter is optional
        optional_letter = TurkishAlphabet.get_letter_for_char(form_str[1])
        if optional_letter.vowel:
            #+iyor, +ar, +im
            if PhoneticAttributes.LastLetterVowel in phonetic_attributes:
                # ata, dana : the optional vowel is dropped, check the rest
                return cls.is_suffix_form_applicable(word, form_str[2:])
            else:
                # yap, kitap
                return True
        else:
            # +yacak, +yi, +yla
            if PhoneticAttributes.LastLetterVowel in phonetic_attributes:
                #ata, dana
                return True
            else:
                # yap, kitap : the optional consonant is dropped, check the rest
                return cls.is_suffix_form_applicable(word, form_str[2:])
    else:
        if first_form_letter.vowel:
            # a vowel-initial suffix cannot follow a vowel-final word
            return PhoneticAttributes.LastLetterVowel not in phonetic_attributes
        else:
            return True
def is_suffix_form_applicable(cls, word, form_str):
    """
    Calculates the phonetics of the word and determines whether the given
    suffix form can be attached to it.

    @type word: unicode or None
    @type form_str: unicode or None
    @rtype: bool
    """
    # an empty form is always applicable; nothing applies to an empty word
    if not form_str or not form_str.strip():
        return True
    if not word or not word.strip():
        return False

    word = word.strip()
    form_str = form_str.strip()

    phonetic_attributes = cls.calculate_phonetic_attributes_of_plain_sequence(word)
    word_ends_with_vowel = PhoneticAttributes.LastLetterVowel in phonetic_attributes

    # ci, dik, +yacak, +iyor, +ar, +yi, +im, +yla
    leading_letter = TurkishAlphabet.get_letter_for_char(form_str[0])
    if leading_letter.char_value != '+':
        # plain forms: a vowel-initial suffix cannot follow a vowel-final word
        if leading_letter.vowel:
            return not word_ends_with_vowel
        return True

    # '+X...' : X is optional. It is dropped when it is a vowel after a
    # vowel-final word or a consonant after a consonant-final word; then the
    # remainder of the form must itself be applicable.
    optional_letter = TurkishAlphabet.get_letter_for_char(form_str[1])
    if bool(optional_letter.vowel) == word_ends_with_vowel:
        return cls.is_suffix_form_applicable(word, form_str[2:])
    return True
def _vowel_count(cls, seq): vowel_count = 0 for c in seq: if TurkishAlphabet.get_letter_for_char(c).vowel: vowel_count += 1 return vowel_count
def _get_first_vowel(self, seq): for s in seq: letter = TurkishAlphabet.get_letter_for_char(s) if letter and letter.vowel: return letter return None
def _infer_morphemic_attributes(cls, lexeme):
    """
    Infers missing lexeme attributes (aorist vowel, causative form, passive
    form, voicing) from the shape of the root, for entries that do not state
    them explicitly.

    @type lexeme: Lexeme
    """
    item_root = lexeme.root
    root_vowel_count = cls._vowel_count(item_root)
    last_letter = TurkishAlphabet.get_letter_for_char(item_root[-1])

    if lexeme.syntactic_category==SyntacticCategory.VERB:
        if last_letter.vowel:
            # vowel-final verbs get progressive vowel drop and the -In passive
            lexeme.attributes.add(LexemeAttribute.ProgressiveVowelDrop)
            lexeme.attributes.add(LexemeAttribute.Passive_In)

        # default aorist vowel: Aorist_I for multi-syllable roots, Aorist_A
        # for single-syllable roots, unless the other one is explicitly set
        if root_vowel_count>1 and LexemeAttribute.Aorist_A not in lexeme.attributes:
            lexeme.attributes.add(LexemeAttribute.Aorist_I)
        if root_vowel_count==1 and LexemeAttribute.Aorist_I not in lexeme.attributes:
            lexeme.attributes.add(LexemeAttribute.Aorist_A)

        # 'l'-final verbs also take the -In passive
        if last_letter==TurkishAlphabet.L_l:
            lexeme.attributes.add(LexemeAttribute.Passive_In)

        # pick a default causative form if none is set explicitly
        if all(a not in lexeme.attributes for a in LexemeAttribute.CAUSATIVES):
            # NOTE(review): operator precedence makes this read as
            # "vowel or (ends with l/r AND multi-syllable)" -- confirm intended
            if last_letter.vowel or (last_letter in [TurkishAlphabet.L_l, TurkishAlphabet.L_r]) and root_vowel_count>1:
                lexeme.attributes.add(LexemeAttribute.Causative_t)
            elif last_letter==TurkishAlphabet.L_t and root_vowel_count<2:
                lexeme.attributes.add(LexemeAttribute.Causative_Ir)
            else:
                lexeme.attributes.add(LexemeAttribute.Causative_dIr)

        if LexemeAttribute.ProgressiveVowelDrop in lexeme.attributes:
            lexeme.attributes.add(LexemeAttribute.NoVoicing)

        # verbs default to NoVoicing unless Voicing is explicitly marked
        if LexemeAttribute.Voicing not in lexeme.attributes and LexemeAttribute.NoVoicing not in lexeme.attributes:
            lexeme.attributes.add(LexemeAttribute.NoVoicing)

    elif lexeme.syntactic_category==SyntacticCategory.NOUN and LexemeAttribute.CompoundP3sg in lexeme.attributes:
        if LexemeAttribute.VoicingOpt in lexeme.attributes:
            # optional voicing: drop the explicit markers
            if LexemeAttribute.Voicing in lexeme.attributes:
                lexeme.attributes.remove(LexemeAttribute.Voicing)
            if LexemeAttribute.NoVoicing in lexeme.attributes:
                lexeme.attributes.remove(LexemeAttribute.NoVoicing)
        elif LexemeAttribute.Voicing not in lexeme.attributes:
            lexeme.attributes.add(LexemeAttribute.NoVoicing)

    elif lexeme.syntactic_category in [SyntacticCategory.NOUN, SyntacticCategory.ADJECTIVE]:
        if LexemeAttribute.VoicingOpt in lexeme.attributes:
            # optional voicing: drop the explicit markers
            if LexemeAttribute.Voicing in lexeme.attributes:
                lexeme.attributes.remove(LexemeAttribute.Voicing)
            if LexemeAttribute.NoVoicing in lexeme.attributes:
                lexeme.attributes.remove(LexemeAttribute.NoVoicing)
        else:
            # multi-syllable words ending with a voiceless stop voice by
            # default, unless marked otherwise
            if root_vowel_count>1 and last_letter.voiceless and not last_letter.continuant and LexemeAttribute.NoVoicing not in lexeme.attributes \
               and LexemeAttribute.InverseHarmony not in lexeme.attributes:
                lexeme.attributes.add(LexemeAttribute.Voicing)
            elif item_root.endswith('nk') or item_root.endswith('og') or item_root.endswith('rt'):
                # special-cased endings that voice regardless of syllable count
                lexeme.attributes.add(LexemeAttribute.Voicing)
            elif LexemeAttribute.Voicing not in lexeme.attributes:
                lexeme.attributes.add(LexemeAttribute.NoVoicing)
def _expectation_satisfied(cls, phonetic_expectation, form_str):
    """
    Decides whether a suffix form string satisfies a phonetic expectation
    (vowel start / consonant start) of a root.
    """
    if phonetic_expectation == PhoneticExpectation.VowelStart:
        leading = form_str[0]
        if leading == '+':
            # optional leading letter: either keeping or dropping it may
            # satisfy the expectation (short-circuit keeps recursion safe)
            return (cls._expectation_satisfied(phonetic_expectation, form_str[1:])
                    or cls._expectation_satisfied(phonetic_expectation, form_str[2:]))
        return TurkishAlphabet.get_letter_for_char(leading).vowel

    if phonetic_expectation == PhoneticExpectation.ConsonantStart:
        leading = form_str[0]
        if leading == '+':
            return (cls._expectation_satisfied(phonetic_expectation, form_str[1:])
                    or cls._expectation_satisfied(phonetic_expectation, form_str[2:]))
        return not TurkishAlphabet.get_letter_for_char(leading).vowel

    raise Exception('Unknown phonetic_expectation', phonetic_expectation)
def print_verbs_with_double_consonant_ending():
    """
    Scans the master dictionary and prints the root of every verb
    ("-mak"/"-mek" infinitive) whose last two letters are both consonants.
    """
    dictionary_file_path = os.path.join(os.path.dirname(__file__), '../resources/master_dictionary.txt')
    with codecs.open(dictionary_file_path, mode='r', encoding='utf-8') as dictionary_file:
        for raw_line in dictionary_file:
            entry = raw_line.strip()
            # skip comment lines
            if entry.startswith('#'):
                continue
            # strip the bracketed metadata part, if any
            if u'[' in entry:
                entry, _ = entry.split(u'[')
                entry = entry.strip()
            if not (entry.endswith(u'mak') or entry.endswith(u'mek')):
                continue
            stem = entry[:-3]
            last_is_consonant = not TurkishAlphabet.get_letter_for_char(stem[-1]).vowel
            previous_is_consonant = not TurkishAlphabet.get_letter_for_char(stem[-2]).vowel
            if last_is_consonant and previous_is_consonant:
                print(stem)
def __init__(self, abbr):
    """
    Creates a root for an abbreviation.

    @type abbr: unicode
    """
    lexeme = DynamicLexeme(abbr, abbr, SyntacticCategory.NOUN, SecondarySyntacticCategory.ABBREVIATION, None)

    # for consonant-final abbreviations the phonetic attributes are computed
    # as if the abbreviation ended with the vowel 'E' -- TODO confirm rationale
    ends_with_vowel = TurkishAlphabet.get_letter_for_char(abbr[-1]).vowel
    basis = abbr if ends_with_vowel else abbr + u"E"
    phonetic_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(basis)

    # abbreviations carry no phonetic expectations
    super(AbbreviationRoot, self).__init__(abbr, lexeme, None, phonetic_attributes)
def application_matches(cls, word, applied_str, voicing_allowed):
    """
    Checks if a suffix applied word is matched by a surface.

    >>> Phonetics.application_matches(u'armudunu', u'armut', True)
    True
    >>> Phonetics.application_matches(u'armudunu', u'armut', False)
    False
    >>> Phonetics.application_matches(u'armudunu', u'armudu', True)
    True
    >>> Phonetics.application_matches(u'armudunu', u'armudu', False)
    True

    @param word: The full word (surface)
    @param applied_str: Suffix applied part of the word
    @param voicing_allowed: If voicing should be considered or ignored
    @type word: unicode
    @type applied_str: unicode
    @type voicing_allowed: bool
    @rtype: L{bool}
    """
    if not applied_str or len(applied_str) > len(word):
        return False
    elif word == applied_str or word.startswith(applied_str):
        # plain prefix match, no orthographic change involved
        return True

    if voicing_allowed and word.startswith(applied_str[:-1]):
        # all but the last char match; accept if the surface char is the
        # voiced counterpart of the application's last char (e.g. armut/armud)
        last_letter_of_application = TurkishAlphabet.get_letter_for_char(applied_str[-1])
        last_letter_of_word_part = TurkishAlphabet.get_letter_for_char(word[len(applied_str) - 1])
        return TurkishAlphabet.voice(last_letter_of_application) == last_letter_of_word_part
    else:
        return False
def print_verbs_with_double_consonant_ending():
    """
    Scans the master dictionary file and prints the root of every verb
    ("-mak"/"-mek" infinitive) whose last two letters are both consonants.
    """
    dictionary_file_path = os.path.join(os.path.dirname(__file__), '../resources/master_dictionary.txt')
    with codecs.open(dictionary_file_path, mode='r', encoding='utf-8') as dictionary_file:
        for line in dictionary_file:
            line = line.strip()
            # skip comment lines
            if line.startswith('#'):
                continue
            item = line
            # strip the bracketed metadata part, if any
            if u'[' in line:
                item,meta = line.split(u'[')
                item = item.strip()
            if item.endswith(u'mak') or item.endswith(u'mek'):
                verb_root = item[:-3]
                # both the last letter and the one before it must be consonants
                if not TurkishAlphabet.get_letter_for_char(verb_root[-1]).vowel and not TurkishAlphabet.get_letter_for_char(verb_root[-2]).vowel:
                    print verb_root
def __init__(self, abbr):
    """
    Creates a root for an abbreviation.

    @type abbr: unicode
    """
    root = abbr
    lexeme = DynamicLexeme(abbr, abbr, SyntacticCategory.NOUN, SecondarySyntacticCategory.ABBREVIATION, None)

    phonetic_attributes = None
    last_letter = TurkishAlphabet.get_letter_for_char(abbr[-1])
    if last_letter.vowel:
        phonetic_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(abbr)
    else:
        # consonant-final abbreviations: compute the attributes as if the
        # abbreviation ended with the vowel 'E' -- TODO confirm rationale
        phonetic_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(abbr + u'E')

    # abbreviations carry no phonetic expectations
    phonetic_expectations = None
    super(AbbreviationRoot, self).__init__(root, lexeme, phonetic_expectations, phonetic_attributes)
def calculate_phonetic_attributes_of_plain_sequence(cls, seq):
    """
    Calculates the phonetic attributes of a word, without the root attributes of it.
    @type seq: unicode
    @rtype: set
    """
    attrs = []

    last_vowel = cls.get_last_vowel(seq)
    last_letter = TurkishAlphabet.get_letter_for_char(seq[-1])

    # roundedness and frontness of the last vowel drive vowel harmony
    if last_vowel:
        if last_vowel.rounded:
            attrs.append(PhoneticAttributes.LastVowelRounded)
        else:
            attrs.append(PhoneticAttributes.LastVowelUnrounded)
        if last_vowel.frontal:
            attrs.append(PhoneticAttributes.LastVowelFrontal)
        else:
            attrs.append(PhoneticAttributes.LastVowelBack)

    if last_letter.vowel:
        attrs.append(PhoneticAttributes.LastLetterVowel)
    else:
        attrs.append(PhoneticAttributes.LastLetterConsonant)

    if last_letter.voiceless:
        attrs.append(PhoneticAttributes.LastLetterVoiceless)
        if not last_letter.continuant:
            # voiceless non-continuants are the letters subject to voicing
            attrs.append(PhoneticAttributes.LastLetterVoicelessStop)
    else:
        attrs.append(PhoneticAttributes.LastLetterNotVoiceless)
        if not last_letter.continuant and not last_letter.vowel:
            attrs.append(PhoneticAttributes.LastLetterVoicedStop)

    if last_letter.continuant:
        attrs.append(PhoneticAttributes.LastLetterContinuant)
    else:
        attrs.append(PhoneticAttributes.LastLetterNotContinuant)

    return set(attrs)
def find_roots_for_partial_input(self, partial_input, whole_surface=None):
    """
    Finds possible verb roots for a partial surface by brute force, undoing
    possible orthographic changes (voicing, aorist vowels, progressive vowel
    drop, causative and passive forms).

    @type partial_input: unicode
    @type whole_surface: unicode
    @rtype: list of Root
    """
    assert partial_input and whole_surface
    assert len(partial_input) <= len(whole_surface)
    assert whole_surface.startswith(partial_input)
    if len(whole_surface) == len(partial_input):
        assert whole_surface == partial_input

    if len(partial_input) < 2:
        # not possible except (d,diyor) and (y,yiyor). but they are already in the dictionary
        return []

    last_vowel = Phonetics.get_last_vowel(partial_input)
    if not last_vowel:
        return []

    root = partial_input
    lemma = root
    lemma_root = lemma
    syntactic_category = SyntacticCategory.VERB
    secondary_syntactic_category = None
    lexeme_attributes = set()
    lexeme = DynamicLexeme(lemma, lemma_root, syntactic_category, secondary_syntactic_category, lexeme_attributes)
    phonetic_expectations = set()
    phonetic_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(partial_input)

    # the candidate that assumes no orthographic change at all
    no_attr_root = DynamicRoot(root, lexeme, phonetic_expectations, phonetic_attributes)
    self._set_lexeme_and_phonetic_attributes([no_attr_root])
    self._set_lemma([no_attr_root])

    last_char = partial_input[-1]
    last_letter = TurkishAlphabet.get_letter_for_char(last_char)

    partial_surface_can_be_root_of_a_verb = self._seems_like_a_valid_verb_root(partial_input)

    if whole_surface==partial_input:
        return [no_attr_root] if partial_surface_can_be_root_of_a_verb else []

    first_char_after_partial_input = whole_surface[len(partial_input)]
    if first_char_after_partial_input.isupper():
        return []
    first_letter_after_partial_input = TurkishAlphabet.get_letter_for_char(first_char_after_partial_input)

    # consonant-final input directly followed by an "-Iyor" variant:
    # presumably the root's final vowel dropped before "+Iyor"
    might_have_ProgressiveVowelDrop = not last_letter.vowel and\
                                      any([whole_surface.startswith(partial_input+s) for s in [u'iyor', u'ıyor', u'uyor', u'üyor']])

    might_have_Aorist_A = not last_letter.vowel and \
                          (whole_surface.startswith(partial_input + u'ar') or whole_surface.startswith(partial_input + u'er'))

    # no Aorist_I for -ur, -ür
    might_have_Aorist_I = not last_letter.vowel and\
                          (whole_surface.startswith(partial_input + u'ır') or whole_surface.startswith(partial_input + u'ir'))

    # for other letters, no voicing in verbs. {git+er->gider} vs {yapar, açar, diker}
    voicing_might_have_happened = last_letter==TurkishAlphabet.L_d and first_letter_after_partial_input.vowel

    possible_progressive_vowel_drop_roots = self._get_progressive_vowel_drop_roots(partial_input, whole_surface, no_attr_root, last_vowel) if might_have_ProgressiveVowelDrop else set()
    possible_aorist_A_roots = self._get_aorist_A_roots(no_attr_root) if might_have_Aorist_A else set()
    possible_aorist_I_roots = self._get_aorist_I_roots(no_attr_root) if might_have_Aorist_I else set()
    possible_causative_roots = self._get_possible_causative_roots(partial_input, whole_surface, no_attr_root)
    possible_passive_roots = self._get_possible_passive_roots(last_letter, partial_input, whole_surface, no_attr_root)

    if voicing_might_have_happened:
        # extend every candidate set with variants accounting for voicing
        possible_progressive_vowel_drop_roots = possible_progressive_vowel_drop_roots.union(set([self._get_possible_voicing_root(r) for r in possible_progressive_vowel_drop_roots]))
        possible_aorist_A_roots = possible_aorist_A_roots.union(set([self._get_possible_voicing_root(r) for r in possible_aorist_A_roots]))
        possible_aorist_I_roots = possible_aorist_I_roots.union(set([self._get_possible_voicing_root(r) for r in possible_aorist_I_roots]))
        possible_causative_roots = possible_causative_roots.union(set([self._get_possible_voicing_root(r) for r in possible_causative_roots]))
        possible_passive_roots = possible_passive_roots.union(set([self._get_possible_voicing_root(r) for r in possible_passive_roots]))

    generated_roots = set()

    generated_roots.add(no_attr_root)
    if voicing_might_have_happened:
        generated_roots.add(self._get_possible_voicing_root(no_attr_root))

    generated_roots = generated_roots.union(possible_progressive_vowel_drop_roots)
    generated_roots = generated_roots.union(possible_aorist_A_roots)
    generated_roots = generated_roots.union(possible_aorist_I_roots)
    generated_roots = generated_roots.union(possible_causative_roots)
    generated_roots = generated_roots.union(possible_passive_roots)

    self._set_lexeme_and_phonetic_attributes(generated_roots)
    self._set_lemma(generated_roots)

    generated_roots = list(generated_roots)

    # drop candidates that cannot be a verb root at all.
    # NOTE(review): filter() returns an iterator on Python 3; the declared
    # 'list of Root' return type holds on Python 2 only
    generated_roots = filter(lambda r: self._seems_like_a_valid_verb_root(r.lexeme.root), generated_roots)

    return generated_roots
def _generate_modified_root_nodes(cls, lexeme):
    """
    Generates the original root and a modified root for a lexeme, applying
    the orthographic changes implied by its attributes (voicing, doubling,
    last vowel drop, inverse harmony, progressive vowel drop).

    @type lexeme: Lexeme
    @rtype: list of Root
    """
    if LexemeAttribute.RootChange in lexeme.attributes:
        # irregular roots are handled case by case
        special_roots = cls._handle_special_roots(lexeme)
        if special_roots:
            return special_roots

    modified_seq = lexeme.root

    original_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(lexeme.root)
    modified_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(lexeme.root)
    original_phonetic_expectations = set()
    modified_phonetic_expectations = set()

    if LexemeAttribute.Voicing in lexeme.attributes or LexemeAttribute.VoicingOpt in lexeme.attributes:
        last_letter = TurkishAlphabet.get_letter_for_char(modified_seq[-1])
        modified_letter = TurkishAlphabet.voice(last_letter)
        assert modified_letter is not None
        if lexeme.lemma.endswith(u"nk"):
            # 'nk' endings voice their 'k' to 'g' instead of the default counterpart
            modified_letter = TurkishAlphabet.L_g
        modified_seq = modified_seq[:-1] + modified_letter.char_value

        # fix up the phonetic attributes of the voiced variant
        if PhoneticAttributes.LastLetterVoicelessStop in modified_attributes:
            modified_attributes.remove(PhoneticAttributes.LastLetterVoicelessStop)
        if modified_letter.continuant:
            if PhoneticAttributes.LastLetterNotContinuant in modified_attributes:
                modified_attributes.remove(PhoneticAttributes.LastLetterNotContinuant)
            modified_attributes.add(PhoneticAttributes.LastLetterContinuant)
        else:
            if PhoneticAttributes.LastLetterContinuant in modified_attributes:
                modified_attributes.remove(PhoneticAttributes.LastLetterContinuant)
            modified_attributes.add(PhoneticAttributes.LastLetterNotContinuant)

        if LexemeAttribute.VoicingOpt not in lexeme.attributes:
            # obligatory voicing: original form only before consonants,
            # modified form only before vowels
            original_phonetic_expectations.add(PhoneticExpectation.ConsonantStart)
            modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

    if LexemeAttribute.Doubling in lexeme.attributes:
        # duplicate the final consonant; doubled form is used before vowels
        modified_seq = modified_seq + modified_seq[-1]
        original_phonetic_expectations.add(PhoneticExpectation.ConsonantStart)
        modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

    if LexemeAttribute.LastVowelDrop in lexeme.attributes:
        # drop the character before the last one (the last vowel)
        modified_seq = modified_seq[:-2] + modified_seq[-1]
        if lexeme.syntactic_category != SyntacticCategory.VERB:
            original_phonetic_expectations.add(PhoneticExpectation.ConsonantStart)
        modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

    if LexemeAttribute.InverseHarmony in lexeme.attributes:
        # force frontal harmony on both variants
        original_attributes.add(PhoneticAttributes.LastVowelFrontal)
        if PhoneticAttributes.LastVowelBack in original_attributes:
            original_attributes.remove(PhoneticAttributes.LastVowelBack)
        modified_attributes.add(PhoneticAttributes.LastVowelFrontal)
        if PhoneticAttributes.LastVowelBack in modified_attributes:
            modified_attributes.remove(PhoneticAttributes.LastVowelBack)

    if LexemeAttribute.ProgressiveVowelDrop in lexeme.attributes:
        # drop the final vowel; recompute attributes only if a vowel remains
        modified_seq = modified_seq[:-1]
        if RootGenerator._has_vowel(modified_seq):
            modified_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(modified_seq)
        modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

    # empty expectation sets are normalized to None
    original_phonetic_expectations = original_phonetic_expectations or None
    modified_phonetic_expectations = modified_phonetic_expectations or None

    original = Root(lexeme.root, lexeme, original_phonetic_expectations, original_attributes)
    modified = Root(modified_seq, lexeme, modified_phonetic_expectations, modified_attributes)

    if original == modified:
        return [original]
    else:
        return [original, modified]
def _has_vowel(cls, seq): for s in seq: if TurkishAlphabet.get_letter_for_char(s).vowel: return True return False
def _generate_modified_root_nodes(cls, lexeme):
    """
    Generates the original root and a modified root for a lexeme, applying
    the orthographic changes implied by its attributes (voicing, doubling,
    last vowel drop, inverse harmony, progressive vowel drop).

    @type lexeme: Lexeme
    @rtype: list of Root
    """
    if LexemeAttribute.RootChange in lexeme.attributes:
        # irregular roots are handled case by case
        special_roots = cls._handle_special_roots(lexeme)
        if special_roots:
            return special_roots

    modified_seq = lexeme.root

    original_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(lexeme.root)
    modified_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(lexeme.root)
    original_phonetic_expectations = set()
    modified_phonetic_expectations = set()

    if LexemeAttribute.Voicing in lexeme.attributes or LexemeAttribute.VoicingOpt in lexeme.attributes:
        last_letter = TurkishAlphabet.get_letter_for_char(modified_seq[-1])
        modified_letter = TurkishAlphabet.voice(last_letter)
        assert modified_letter is not None
        if lexeme.lemma.endswith(u"nk"):
            # 'nk' endings voice their 'k' to 'g' instead of the default counterpart
            modified_letter = TurkishAlphabet.L_g
        modified_seq = modified_seq[:-1] + modified_letter.char_value

        # fix up the phonetic attributes of the voiced variant
        if PhoneticAttributes.LastLetterVoicelessStop in modified_attributes:
            modified_attributes.remove(PhoneticAttributes.LastLetterVoicelessStop)
        if modified_letter.continuant:
            if PhoneticAttributes.LastLetterNotContinuant in modified_attributes :
                modified_attributes.remove(PhoneticAttributes.LastLetterNotContinuant)
            modified_attributes.add(PhoneticAttributes.LastLetterContinuant)
        else:
            if PhoneticAttributes.LastLetterContinuant in modified_attributes:
                modified_attributes.remove(PhoneticAttributes.LastLetterContinuant)
            modified_attributes.add(PhoneticAttributes.LastLetterNotContinuant)

        if LexemeAttribute.VoicingOpt not in lexeme.attributes:
            # obligatory voicing: original form only before consonants,
            # modified form only before vowels
            original_phonetic_expectations.add(PhoneticExpectation.ConsonantStart)
            modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

    if LexemeAttribute.Doubling in lexeme.attributes:
        # duplicate the final consonant; doubled form is used before vowels
        modified_seq = modified_seq + modified_seq[-1]
        original_phonetic_expectations.add(PhoneticExpectation.ConsonantStart)
        modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

    if LexemeAttribute.LastVowelDrop in lexeme.attributes:
        # drop the character before the last one (the last vowel)
        modified_seq = modified_seq[:-2] + modified_seq[-1]
        if lexeme.syntactic_category!=SyntacticCategory.VERB:
            original_phonetic_expectations.add(PhoneticExpectation.ConsonantStart)
        modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

    if LexemeAttribute.InverseHarmony in lexeme.attributes:
        # force frontal harmony on both variants
        original_attributes.add(PhoneticAttributes.LastVowelFrontal)
        if PhoneticAttributes.LastVowelBack in original_attributes:
            original_attributes.remove(PhoneticAttributes.LastVowelBack)
        modified_attributes.add(PhoneticAttributes.LastVowelFrontal)
        if PhoneticAttributes.LastVowelBack in modified_attributes:
            modified_attributes.remove(PhoneticAttributes.LastVowelBack)

    if LexemeAttribute.ProgressiveVowelDrop in lexeme.attributes:
        # drop the final vowel; recompute attributes only if a vowel remains
        modified_seq = modified_seq[:-1]
        if RootGenerator._has_vowel(modified_seq):
            modified_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(modified_seq)
        modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

    # empty expectation sets are normalized to None
    original_phonetic_expectations = original_phonetic_expectations or None
    modified_phonetic_expectations = modified_phonetic_expectations or None

    original = Root(lexeme.root, lexeme, original_phonetic_expectations, original_attributes)
    modified = Root(modified_seq, lexeme, modified_phonetic_expectations, modified_attributes)

    if original==modified:
        return [original]
    else:
        return [original, modified]
def get_last_vowel(cls, seq):
    """
    Returns the last vowel letter of the sequence, or None when the sequence
    has no vowel.

    @type seq: unicode
    @rtype: TurkishLetter or None
    """
    for ch in reversed(seq):
        candidate = TurkishAlphabet.get_letter_for_char(ch)
        if candidate.vowel:
            return candidate
    return None
def find_roots_for_partial_input(self, partial_input, whole_surface=None):
    """Brute-force finder for compound-P3sg noun roots (e.g. "suborusu").

    Delegates to the plain noun root finder with the trailing possessive
    marker chopped off, then rewrites the clones' root/lemma to the full
    partial input and tags them CompoundP3sg.

    @type partial_input: unicode
    @type whole_surface: unicode
    @rtype: list of Root
    """
    assert partial_input and whole_surface
    assert len(partial_input) <= len(whole_surface)
    assert whole_surface.startswith(partial_input)
    if len(whole_surface) == len(partial_input):
        assert whole_surface == partial_input

    # no compound should be found an input shorter than sth like "atsu-yu". even that doesn't make sense
    if len(partial_input) < 5:
        return []

    # A compound candidate must be a strict prefix of the surface (suffix must follow).
    if whole_surface == partial_input:
        return []

    last_char = partial_input[-1]
    previous_char = partial_input[-2]

    # Uppercase near the split point suggests a proper noun; skip.
    if last_char.isupper() or previous_char.isupper():
        return []

    # The P3sg possessive ends in one of the four close vowels i/u/ı/ü.
    last_letter = TurkishAlphabet.get_letter_for_char(last_char)
    if last_letter!=TurkishAlphabet.L_i and last_letter!=TurkishAlphabet.L_u and\
       last_letter!=TurkishAlphabet.L_ii and last_letter!=TurkishAlphabet.L_uu:
        return []

    first_char_after_partial_input = whole_surface[len(partial_input)]

    if first_char_after_partial_input.isupper():
        return []

    # CompoundP3sg roots are only detectable when the buffer letter 'n' follows.
    first_letter_after_partial_input = TurkishAlphabet.get_letter_for_char(first_char_after_partial_input)
    if first_letter_after_partial_input != TurkishAlphabet.L_n:
        return []

    if len(whole_surface) < len(partial_input) + 2:  # need a char after char 'n'
        return []

    compound_results = []

    results_with_partial_input_one_char_missing = self.brute_force_noun_root_finder.find_roots_for_partial_input(partial_input[:-1], whole_surface)

    # illustrate:
    # partial_input = suborusu, whole_surface = suborusuna
    # results_with_partial_input_one_char_missing : <'suborus','suborus'>
    # partial_input = bacakkalemi, whole_surface = bacakkalemini
    # results_with_partial_input_one_char_missing : <'bacakkalem','bacakkalem'>
    for normal_noun_result in results_with_partial_input_one_char_missing:
        clone_result = normal_noun_result._clone(True)
        clone_result.str = clone_result.lexeme.root
        clone_result.lexeme.root = partial_input
        clone_result.lexeme.lemma = partial_input
        compound_results.append(clone_result)

    # If the char before the final vowel is 's', the 's' itself may be the
    # possessive buffer consonant: retry with two chars chopped (suborusu -> suboru).
    previous_letter = TurkishAlphabet.get_letter_for_char(previous_char)
    if previous_letter==TurkishAlphabet.L_s:
        results_with_partial_input_two_chars_missing = self.brute_force_noun_root_finder.find_roots_for_partial_input(partial_input[:-2], whole_surface)

        # illustrate:
        # partial_input = suborusu, whole_surface = suborusuna
        # results_with_partial_input_two_chars_missing : <'suboru','suboru'>
        for normal_noun_result in results_with_partial_input_two_chars_missing:
            clone_result = normal_noun_result._clone(True)
            # NOTE(review): unlike the one-char-missing loop above, clone_result.str
            # is NOT rewritten here — confirm whether that asymmetry is intentional.
            clone_result.lexeme.root = partial_input
            clone_result.lexeme.lemma = partial_input
            compound_results.append(clone_result)

    for compound_result in compound_results:
        compound_result.lexeme.attributes.add(LexemeAttribute.CompoundP3sg)

    return compound_results
def find_roots_for_partial_input(self, partial_input, whole_surface=None):
    """Brute-force finder for verb roots within a surface form.

    Treats ``partial_input`` as a hypothetical verb root, then generates
    candidate variants for progressive vowel drop, aorist -A/-I, causative,
    passive, and final-'d' voicing, keeping only those that still look like
    valid verb roots.

    @type partial_input: unicode
    @type whole_surface: unicode
    @rtype: list of Root
    """
    assert partial_input and whole_surface
    assert len(partial_input) <= len(whole_surface)
    assert whole_surface.startswith(partial_input)
    if len(whole_surface) == len(partial_input):
        assert whole_surface == partial_input

    if len(partial_input) < 2:  # not possible except (d,diyor) and (y,yiyor). but they are already in the dictionary
        return []

    last_vowel = Phonetics.get_last_vowel(partial_input)

    # A verb root must contain at least one vowel.
    if not last_vowel:
        return []

    # Build the baseline candidate: the partial input itself as a VERB lexeme.
    root = partial_input
    lemma = root
    lemma_root = lemma
    syntactic_category = SyntacticCategory.VERB
    secondary_syntactic_category = None
    lexeme_attributes = set()

    lexeme = DynamicLexeme(lemma, lemma_root, syntactic_category, secondary_syntactic_category, lexeme_attributes)

    phonetic_expectations = set()
    phonetic_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(partial_input)

    no_attr_root = DynamicRoot(root, lexeme, phonetic_expectations, phonetic_attributes)

    self._set_lexeme_and_phonetic_attributes([no_attr_root])
    self._set_lemma([no_attr_root])

    last_char = partial_input[-1]
    last_letter = TurkishAlphabet.get_letter_for_char(last_char)

    partial_surface_can_be_root_of_a_verb = self._seems_like_a_valid_verb_root(partial_input)

    # Whole surface consumed: only the plain candidate (if plausible) can apply.
    if whole_surface == partial_input:
        return [no_attr_root] if partial_surface_can_be_root_of_a_verb else []

    first_char_after_partial_input = whole_surface[len(partial_input)]
    if first_char_after_partial_input.isupper():
        return []

    first_letter_after_partial_input = TurkishAlphabet.get_letter_for_char(first_char_after_partial_input)

    # Suffix heuristics: which phonetic events could explain the following chars?
    might_have_ProgressiveVowelDrop = not last_letter.vowel and\
        any([whole_surface.startswith(partial_input + s) for s in [u'iyor', u'ıyor', u'uyor', u'üyor']])

    might_have_Aorist_A = not last_letter.vowel and \
        (whole_surface.startswith(partial_input + u'ar') or whole_surface.startswith(partial_input + u'er'))

    # no Aorist_I for -ur, -ür
    might_have_Aorist_I = not last_letter.vowel and\
        (whole_surface.startswith(partial_input + u'ır') or whole_surface.startswith(partial_input + u'ir'))

    # for other letters, no voicing in verbs. {git+er->gider} vs {yapar, açar, diker}
    voicing_might_have_happened = last_letter == TurkishAlphabet.L_d and first_letter_after_partial_input.vowel

    possible_progressive_vowel_drop_roots = self._get_progressive_vowel_drop_roots(partial_input, whole_surface, no_attr_root, last_vowel) if might_have_ProgressiveVowelDrop else set()
    possible_aorist_A_roots = self._get_aorist_A_roots(no_attr_root) if might_have_Aorist_A else set()
    possible_aorist_I_roots = self._get_aorist_I_roots(no_attr_root) if might_have_Aorist_I else set()
    possible_causative_roots = self._get_possible_causative_roots(partial_input, whole_surface, no_attr_root)
    possible_passive_roots = self._get_possible_passive_roots(last_letter, partial_input, whole_surface, no_attr_root)

    # Final 'd' may be a voiced 't' (git+er -> gider): add a devoiced twin of every candidate.
    if voicing_might_have_happened:
        possible_progressive_vowel_drop_roots = possible_progressive_vowel_drop_roots.union(set([self._get_possible_voicing_root(r) for r in possible_progressive_vowel_drop_roots]))
        possible_aorist_A_roots = possible_aorist_A_roots.union(set([self._get_possible_voicing_root(r) for r in possible_aorist_A_roots]))
        possible_aorist_I_roots = possible_aorist_I_roots.union(set([self._get_possible_voicing_root(r) for r in possible_aorist_I_roots]))
        possible_causative_roots = possible_causative_roots.union(set([self._get_possible_voicing_root(r) for r in possible_causative_roots]))
        possible_passive_roots = possible_passive_roots.union(set([self._get_possible_voicing_root(r) for r in possible_passive_roots]))

    generated_roots = set()

    generated_roots.add(no_attr_root)
    if voicing_might_have_happened:
        generated_roots.add(self._get_possible_voicing_root(no_attr_root))

    generated_roots = generated_roots.union(possible_progressive_vowel_drop_roots)
    generated_roots = generated_roots.union(possible_aorist_A_roots)
    generated_roots = generated_roots.union(possible_aorist_I_roots)
    generated_roots = generated_roots.union(possible_causative_roots)
    generated_roots = generated_roots.union(possible_passive_roots)

    self._set_lexeme_and_phonetic_attributes(generated_roots)
    self._set_lemma(generated_roots)

    generated_roots = list(generated_roots)

    # NOTE(review): Python 2 semantics assumed here — under Python 3 this filter()
    # would return a lazy iterator rather than a list; confirm target runtime.
    generated_roots = filter(lambda r: self._seems_like_a_valid_verb_root(r.lexeme.root), generated_roots)

    return generated_roots
def _infer_morphemic_attributes(cls, lexeme):
    """Infer and attach default morphophonemic attributes for a lexeme in place.

    Fills in aorist, passive, causative and (no-)voicing attributes based on
    the root's final letter, vowel count and syntactic category, unless the
    dictionary already specified them.

    @type lexeme: Lexeme
    """
    item_root = lexeme.root
    root_vowel_count = cls._vowel_count(item_root)
    last_letter = TurkishAlphabet.get_letter_for_char(item_root[-1])

    if lexeme.syntactic_category == SyntacticCategory.VERB:
        # Vowel-final verb roots drop that vowel before -Iyor and take passive -In.
        if last_letter.vowel:
            lexeme.attributes.add(LexemeAttribute.ProgressiveVowelDrop)
            lexeme.attributes.add(LexemeAttribute.Passive_In)

        # Default aorist: multi-syllable roots take -I, monosyllabic take -A,
        # unless the dictionary explicitly marked the opposite.
        if root_vowel_count > 1 and LexemeAttribute.Aorist_A not in lexeme.attributes:
            lexeme.attributes.add(LexemeAttribute.Aorist_I)

        if root_vowel_count == 1 and LexemeAttribute.Aorist_I not in lexeme.attributes:
            lexeme.attributes.add(LexemeAttribute.Aorist_A)

        if last_letter == TurkishAlphabet.L_l:
            lexeme.attributes.add(LexemeAttribute.Passive_In)

        # Pick a default causative only when none was specified.
        if all(a not in lexeme.attributes for a in LexemeAttribute.CAUSATIVES):
            # NOTE(review): due to Python precedence this reads as
            # vowel OR ((l-or-r) AND multi-syllable) — i.e. the vowel-count
            # guard applies only to the l/r case; confirm that is intended.
            if last_letter.vowel or (last_letter in [TurkishAlphabet.L_l, TurkishAlphabet.L_r]) and root_vowel_count > 1:
                lexeme.attributes.add(LexemeAttribute.Causative_t)
            elif last_letter == TurkishAlphabet.L_t and root_vowel_count < 2:
                lexeme.attributes.add(LexemeAttribute.Causative_Ir)
            else:
                lexeme.attributes.add(LexemeAttribute.Causative_dIr)

        if LexemeAttribute.ProgressiveVowelDrop in lexeme.attributes:
            lexeme.attributes.add(LexemeAttribute.NoVoicing)

        # Verbs default to NoVoicing when the dictionary stated neither.
        if LexemeAttribute.Voicing not in lexeme.attributes and LexemeAttribute.NoVoicing not in lexeme.attributes:
            lexeme.attributes.add(LexemeAttribute.NoVoicing)

    elif lexeme.syntactic_category == SyntacticCategory.NOUN and LexemeAttribute.CompoundP3sg in lexeme.attributes:
        # Optional voicing overrides any explicit (No)Voicing marks.
        if LexemeAttribute.VoicingOpt in lexeme.attributes:
            if LexemeAttribute.Voicing in lexeme.attributes:
                lexeme.attributes.remove(LexemeAttribute.Voicing)
            if LexemeAttribute.NoVoicing in lexeme.attributes:
                lexeme.attributes.remove(LexemeAttribute.NoVoicing)
        elif LexemeAttribute.Voicing not in lexeme.attributes:
            lexeme.attributes.add(LexemeAttribute.NoVoicing)

    elif lexeme.syntactic_category in [SyntacticCategory.NOUN, SyntacticCategory.ADJECTIVE]:
        if LexemeAttribute.VoicingOpt in lexeme.attributes:
            # Optional voicing overrides any explicit (No)Voicing marks.
            if LexemeAttribute.Voicing in lexeme.attributes:
                lexeme.attributes.remove(LexemeAttribute.Voicing)
            if LexemeAttribute.NoVoicing in lexeme.attributes:
                lexeme.attributes.remove(LexemeAttribute.NoVoicing)
        else:
            # Multi-syllable roots ending in a voiceless stop voice by default
            # (kitap -> kitabı), unless explicitly marked otherwise.
            if root_vowel_count>1 and last_letter.voiceless and not last_letter.continuant and LexemeAttribute.NoVoicing not in lexeme.attributes \
               and LexemeAttribute.InverseHarmony not in lexeme.attributes:
                lexeme.attributes.add(LexemeAttribute.Voicing)
            # NOTE(review): the 'og' literal looks suspicious next to 'nk'/'rt'
            # — possibly intended to be u'oğ'; confirm against the dictionary data.
            elif item_root.endswith('nk') or item_root.endswith('og') or item_root.endswith('rt'):
                lexeme.attributes.add(LexemeAttribute.Voicing)
            elif LexemeAttribute.Voicing not in lexeme.attributes:
                lexeme.attributes.add(LexemeAttribute.NoVoicing)