Пример #1
0
def get_intstruct_from_trans_str( _src, dstring_object ):
    """
        function get_intstruct_from_trans_str()

        Read <_src> through a Sanskrit DString and convert it, character by
        character, into a list of InternalStructure objects.

        _src            : (str) the string to be read, e.g. "क". It is parsed by
                          the DString type returned by
                          new_dstring(language='संस्कृतम्',
                                      transliteration_method="iso15919").
        dstring_object  : object whose .options mapping is read for the keys
                          "anonymize the unknown characters" and
                          "san2bod quality"; it is also handed to every
                          InternalStructure created here.

        Conversions applied according to options["san2bod quality"] :
            "normal" and "low"  : visarga omitted, VA > BA, O > AU
            "low" only          : long vowels > short vowels,
                                  retroflex consonants > non-retroflex,
                                  aspirated consonants > non-aspirated,
                                  geminate subjoined consonant dropped.
        Any other value leaves the characters unchanged and records the visarga
        as .rnam_bcad on the last structure.

        Return a ListOfInternalStructures object, in which an InternalStructure
        holding 'MARK INTERSYLLABIC TSHEG' follows every structure whose
        .consonant is not None.
    """
    # both options are constant for the whole call : read them once.
    anonymize = dstring_object.options["anonymize the unknown characters"] == 'yes'
    quality = dstring_object.options["san2bod quality"]

    # conversion tables used when "san2bod quality" == "low" :
    long_to_short = {'AA' : 'A',
                     'II' : 'I',
                     'UU' : 'U'}
    retroflex_to_plain = {'TTA'  : "TA",
                          'TTHA' : "TA",
                          'DDA'  : "DA",
                          'DDHA' : "DA",
                          'NNA'  : "NA"}
    # 'TTHA' and 'DDHA' are not listed here : they are caught first by
    # <retroflex_to_plain>. The membership test is done on this table itself so
    # that every entry (including 'DHA') is really converted.
    aspirated_to_plain = {'KHA' : "KA",
                          'GHA' : "GA",
                          'THA' : "TA",
                          'CHA' : "CA",
                          'JHA' : "JA",
                          'DHA' : "DA",
                          'PHA' : "PA",
                          'BHA' : "BA"}

    # list of InternalStructure objects.
    istructs = ListOfInternalStructures(anonymize_the_unknown_chars = anonymize)

    # we read <_src> through a DSTRING_SAN object :
    dstring_san = new_dstring(language='संस्कृतम्',
                              transliteration_method="iso15919")
    dstring_san = dstring_san(_src)

    # In Sanskrit, if a consonant is followed by a virama, it means that the following
    # consonants are part of a cluster of consonants.
    #
    # E.g. in कर्म (0915=ka, 0930=ra, 094D=virama, 092E=ma) we have something like kar+ma,
    # the -m- having no vowel.
    #
    place_consonant_among_subjc = False

    for dchar_san in dstring_san:

        if dchar_san.unknown_char:
            istructs.append( InternalStructure( dstring_object = dstring_object,
                                                unknown_character = True ) )
            # nothing else is read from an unknown character.
            continue

        base_char = dchar_san.base_char

        # punctuation symbol :
        if base_char in SAN__SYMB_PUNCTUATION:
            unicode_symb = SAN__SYMB_PUNCTUATION.get_default_symbol(base_char)
            istructs.append( InternalStructure(
                dstring_object = dstring_object,
                punctuation_or_other_symbol = PUNCTUATION_INVERSED[unicode_symb] ) )

            place_consonant_among_subjc = False

        # other symbol :
        elif base_char in SAN__SYMB_OTHER_SYMBOLS:
            unicode_symb = SAN__SYMB_OTHER_SYMBOLS.get_default_symbol(base_char)
            istructs.append( InternalStructure(
                dstring_object = dstring_object,
                punctuation_or_other_symbol = OTHER_SYMBOLS_INVERSED[unicode_symb] ) )

            place_consonant_among_subjc = False

        # independent vowel :
        elif base_char in SAN__SYMB_INDEPENDENT_VOWELS:

            #...................................................................
            # _independent_vowel will be added as an independent vowel :
            #...................................................................
            if quality in ("normal", "low") and base_char == 'O':
                #====================
                # @@BOD2SAN-NORM-004 / @@BOD2SAN-LOW-004
                # (independent vowel) ओ(ō) > औ(au)
                #====================
                _independent_vowel = "AU"

            elif quality == "low" and base_char in long_to_short:
                #====================
                # @@BOD2SAN-LOW-006
                # (independent vowel) long vowels > short vowels
                #====================
                _independent_vowel = long_to_short[base_char]

            else:
                _independent_vowel = base_char

            unicode_symb = SAN__SYMB_INDEPENDENT_VOWELS.get_default_symbol(_independent_vowel)
            # an independent vowel is stored on the consonant "A" :
            istructs.append( InternalStructure(
                dstring_object = dstring_object,
                consonant = "A",
                vowel1 = INDEPENDENT_VOWELS_INVERSED[unicode_symb] ) )

            place_consonant_among_subjc = False

        # consonant :
        elif base_char in SAN__SYMB_CONSONANTS:

            if base_char == 'DEVANAGARI SIGN VISARGA':
                # special case : the visarga symbol is placed among consonants in Sanskrit,
                # among diacritics in Tibetan.
                #
                #====================
                # @@BOD2SAN-NORM-001 / @@BOD2SAN-LOW-001
                # the visarga is omitted if "san2bod quality" is "normal" or "low"
                #====================
                if quality not in ("normal", "low"):
                    # NOTE(review): assumes a structure has already been appended;
                    # confirm that a visarga can never be the first character.
                    istructs[-1].rnam_bcad = True

                    place_consonant_among_subjc = False

            elif not place_consonant_among_subjc:
                # consonant to be placed as a main consonant
                # (and not among subjoined consonants) :

                #...............................................................
                # _base_char will be added as a main consonant :
                #...............................................................
                if quality in ("normal", "low") and base_char == 'VA':
                    #====================
                    # @@BOD2SAN-NORM-002 / @@BOD2SAN-LOW-002
                    # the व(va) becomes ब(ba) if "san2bod quality" is "normal" or "low"
                    #====================
                    _base_char = "BA"

                elif quality == "low" and base_char in retroflex_to_plain:
                    #===================
                    # @@BOD2SAN-LOW-007
                    # retroflex consonant > non-retroflex consonant
                    # retroflex consonant + aspiration > non-retroflex
                    # consonant without aspiration
                    #===================
                    _base_char = retroflex_to_plain[base_char]

                elif quality == "low" and base_char in aspirated_to_plain:
                    #===================
                    # @@BOD2SAN-LOW-008
                    # consonant + aspiration > consonant without aspiration
                    #===================
                    _base_char = aspirated_to_plain[base_char]

                else:
                    # general case :
                    _base_char = base_char

                unicode_symb = SAN__SYMB_CONSONANTS.get_default_symbol(_base_char)
                bod_consonant = CONSONANTS_INVERSED[unicode_symb]

                istructs.append( InternalStructure( dstring_object = dstring_object,
                                                    consonant = bod_consonant ) )

                # a virama opens a cluster : the next consonant(s) will be subjoined.
                if dchar_san.virama:
                    place_consonant_among_subjc = True

            else:
                # consonant to be placed among subjoined consonants
                # (and not as a main consonant) :
                if istructs[-1].subfix is None:
                    istructs[-1].subfix = []

                unicode_symb = SAN__SYMB_CONSONANTS.get_default_symbol(base_char)
                cons = CONSONANTS_INVERSED[unicode_symb]

                if quality == "low" and \
                   istructs[-1].subfix == [] and \
                   istructs[-1].consonant == cons:
                    #===================
                    # @@BOD2SAN-LOW-008
                    # geminate consonant > 0
                    #===================
                    # no more subjoined consonant : the following ones will be
                    # treated like main consonants.
                    place_consonant_among_subjc = False

                else:
                    istructs[-1].subfix.append( cons )

                    # without a virama, the cluster ends with this consonant.
                    if not dchar_san.virama:
                        place_consonant_among_subjc = False

            # dependent vowel (the inherent "A" is not written) :
            dependent_vowel = dchar_san.dependentvowel
            if dependent_vowel is not None and dependent_vowel != "A":

                #...............................................................
                # _dependent_vowel will be added as a dependent vowel :
                #...............................................................
                if quality in ("normal", "low") and dependent_vowel == 'O':
                    #====================
                    # @@BOD2SAN-NORM-003 / @@BOD2SAN-LOW-003
                    # (dependent vowel) ओ(ō) > औ(au)
                    #====================
                    _dependent_vowel = "AU"

                elif quality == "low" and dependent_vowel in long_to_short:
                    #====================
                    # @@BOD2SAN-LOW-005
                    # (dependent vowel) long vowels > short vowels
                    #====================
                    _dependent_vowel = long_to_short[dependent_vowel]

                else:
                    _dependent_vowel = dependent_vowel

                unicode_symb = \
                  SAN__SYMB_DEPENDENT_VOWELS.get_default_symbol(_dependent_vowel)

                istructs[-1].vowel1 = DEPENDENT_VOWELS_INVERSED[unicode_symb]

        # anusvara/candrabindu (read for every known character) :
        if dchar_san.anusvara_candrabindu is not None:
            unicode_symb = \
              SAN__SYMB_DIACRITICS.get_default_symbol(dchar_san.anusvara_candrabindu)

            # NOTE(review): assumes a structure has already been appended.
            istructs[-1].anusvara_candrabindu = DIACRITICS_INVERSED[unicode_symb]

    res = ListOfInternalStructures(anonymize_the_unknown_chars = anonymize)

    # we add a tsheg after a "real" syllable (id est, not a punctuation sign, ...)
    for istruct in istructs:
        res.append(istruct)

        if istruct.consonant is not None:
            res.append( InternalStructure(
                dstring_object = dstring_object,
                punctuation_or_other_symbol = 'MARK INTERSYLLABIC TSHEG' ))

    return res
Пример #2
0
    def get_sourcestr_representation(self):
        """
                DCharacterSAN.get_sourcestr_representation

                Build the source string of <self> : the default symbol of the
                base character, followed by the symbols of the dependent vowel,
                nukta, accent, virama, anudatta and anusvara/candrabindu that
                are set on <self>; the result is then composed (NFC).

                For an unknown character, return UNKNOWN_CHAR_SYMBOL if the
                option "anonymize the unknown characters" is "yes", otherwise
                <self.base_char> unchanged.

                Return a string.
        """
        # .......................................................................
        # unknown character : no analysis, the answer only depends on the option.
        # .......................................................................
        if self.unknown_char:
            anonymize = \
                self.dstring_object.options["anonymize the unknown characters"] == "yes"
            return UNKNOWN_CHAR_SYMBOL if anonymize else self.base_char

        # .......................................................................
        # known character : the symbols are collected in writing order.
        # .......................................................................
        symbols = []
        base = self.base_char

        if base is not None:

            if self.punctuation:
                # punctuation symbol : no dependent vowel is looked for.
                symbols.append(SYMB_PUNCTUATION.get_default_symbol(base))

            else:
                # choice of the table the base character belongs to :
                if base in SYMB_OTHER_SYMBOLS:
                    # "other symbol" : not punctuation nor consonant nor
                    # independent vowel.
                    table = SYMB_OTHER_SYMBOLS
                elif self.is_an_independent_vowel:
                    table = SYMB_INDEPENDENT_VOWELS
                else:
                    table = SYMB_CONSONANTS
                symbols.append(table.get_default_symbol(base))

                # dependent vowel, if any :
                vowel = self.dependentvowel
                if vowel is not None:
                    symbols.append(SYMB_DEPENDENT_VOWELS.get_default_symbol(vowel))

        # marks following the base character, always in this order :
        if self.nukta:
            symbols.append(DEFAULTSYMB__NUKTA)

        accent = self.accent
        if accent is not None:
            symbols.append(SYMB_DIACRITICS.get_default_symbol(accent))

        if self.virama:
            symbols.append(DEFAULTSYMB__VIRAMA)

        if self.anudatta:
            symbols.append(DEFAULTSYMB__ANUDATTA)

        nasal_mark = self.anusvara_candrabindu
        if nasal_mark is not None:
            symbols.append(SYMB_DIACRITICS.get_default_symbol(nasal_mark))

        # the fake symbol standing for 'a' is deleted, since there's no symbol in
        # devanagari for the vowel 'a'.
        text = "".join(symbols).replace(FAKE_A__SYMBOL, "")

        # composition, step 1/3 : replacements listed in PRE_NORMALIZE_NFC
        for before, after in PRE_NORMALIZE_NFC:
            text = text.replace(before, after)

        # composition, step 2/3 : unicodedata.normalize
        text = unicodedata.normalize("NFC", text)

        # composition, step 3/3 : replacements listed in POST_NORMALIZE_NFC
        for before, after in POST_NORMALIZE_NFC:
            text = text.replace(before, after)

        return text