示例#1
0
    def get_sourcestr_representation(self):
        """
                DCharacterANG.get_sourcestr_representation

                Return the string built from the attributes of <self>.

                * unknown character : UNKNOWN_CHAR_SYMBOL is returned if the option
                  "anonymize the unknown characters" of <self.dstring_object> is
                  'yes'; otherwise <self.base_char> is returned unchanged.
                * any other character : the symbol of the base character, followed
                  by the default symbols of the stress, of the makron and of the
                  upperdot; the result is then composed (NFC, then the
                  replacements listed in COMPLETE_NORMALIZE_NFC).
        """
        # An unknown character is never analysed : it is either anonymized or
        # given back as it is.
        if self.unknown_char:
            anonymize = self.dstring_object.options["anonymize the unknown characters"]
            return UNKNOWN_CHAR_SYMBOL if anonymize == 'yes' else self.base_char

        pieces = []

        # base character : a punctuation symbol is stored as it is, a letter is
        # converted through the table matching its case.
        if self.base_char is not None:
            if self.punctuation:
                pieces.append(self.base_char)
            else:
                case_symbols = SYMB_UPPER_CASE if self.capital_letter else SYMB_LOWER_CASE
                pieces.append(case_symbols.get_default_symbol(self.base_char))

        # stress : at most one symbol; any other value adds nothing.
        if self.stress == -1:
            pieces.append(DEFAULTSYMB__STRESS_MINUS1)
        elif self.stress == 1:
            pieces.append(DEFAULTSYMB__STRESS1)
        elif self.stress == 2:
            pieces.append(DEFAULTSYMB__STRESS2)

        # remaining diacritics, always in the same order : makron, upperdot.
        if self.makron:
            pieces.append(DEFAULTSYMB__MAKRON)
        if self.upperdot:
            pieces.append(DEFAULTSYMB__UPPERDOT)

        # composition, step 1 : unicodedata.normalize
        composed = unicodedata.normalize('NFC', "".join(pieces))

        # composition, step 2 : the extra replacements of COMPLETE_NORMALIZE_NFC,
        # applied one after the other in the order of that table.
        for before, after in COMPLETE_NORMALIZE_NFC:
            composed = composed.replace(before, after)

        return composed
示例#2
0
    def init_from_str(self, str_src):
        """
                DStringANG.init_from_str

                Fill <self> with the DCharacterANG objects read from <str_src>.
                Nothing is explicitly returned.

                str_src : str

                HOW IT WORKS :
                * (1) <str_src> is decomposed by unicodedata.normalize('NFD', ...)
                * (2) replace_by_the_default_symbols() is applied to the result,
                *     successively for the punctuation, the lower case letters,
                *     the upper case letters and the diacritics.
                * (3) each match of DStringANG.pattern (groups 'letter' and
                *     'diacritics') gives one recognized character :
                *     (3.1) base_char
                *     (3.2) makron
                *     (3.3) stress
                *     (3.4) upperdot
                *     (3.5) the new character is appended to <self>
                * (4) each position lying before, between or after the matches
                *     gives one character flagged "unknown_char".

                A DCharsError is raised if, among the diacritics of a match, the
                makron, the upperdot or one of the stress marks is counted more
                than once, or if more than one stress mark is counted.
        """
        #.......................................................................
        # (1) decomposition
        #.......................................................................
        text = unicodedata.normalize('NFD', str_src)

        #.......................................................................
        # (2) default symbols, one family of symbols after the other
        #.......................................................................
        for symbols in (SYMB_PUNCTUATION, SYMB_LOWER_CASE, SYMB_UPPER_CASE, SYMB_DIACRITICS):
            text = symbols.replace_by_the_default_symbols(text)

        #.......................................................................
        # local helpers
        #.......................................................................
        def add_unknown_chars(first, stop):
            # one "unknown" character for every position of text[first:stop]
            for position in range(first, stop):
                self.append(DCharacterANG(dstring_object=self,
                                          unknown_char=True,
                                          base_char=text[position]))

        def fail(match, template):
            # <template> receives the searched string and the span of <match>
            raise DCharsError(context="DStringANG.init_from_str",
                              message=template.format(match.string,
                                                      match.start(),
                                                      match.end()),)

        def count(diacritics, symbols):
            return number_of_occurences(source_string=diacritics, symbols=symbols)

        def read_diacritics(match, diacritics):
            # return (makron, stress, upperdot) for the diacritics of <match>
            if diacritics is None:
                return False, 0, False

            # (3.2) makron
            if count(diacritics, SYMB_DIACRITICS__MAKRON) > 1:
                fail(match, "In '{0}' (start={1}, end={2}), makron defined several times.")
            makron = SYMB_DIACRITICS.are_these_symbols_in_a_string("makron", diacritics)

            # (3.3) stress : the three marks are counted first, then checked
            # one by one and finally together.
            minus1_nbr = count(diacritics, SYMB_DIACRITICS__STRESS_MINUS1)
            one_nbr = count(diacritics, SYMB_DIACRITICS__STRESS1)
            two_nbr = count(diacritics, SYMB_DIACRITICS__STRESS2)

            if minus1_nbr > 1:
                fail(match, "In '{0}' (start={1}, end={2}), stressM1(-1) defined several times.")
            if one_nbr > 1:
                fail(match, "In '{0}' (start={1}, end={2}), stress1 defined several times.")
            if two_nbr > 1:
                fail(match, "In '{0}' (start={1}, end={2}), stress2 defined several times.")
            if minus1_nbr + one_nbr + two_nbr > 1:
                fail(match,
                     "In '{0}' (start={1}, end={2}), stressM1, stress1 and stress2 " \
                     "simultaneously defined.")

            if SYMB_DIACRITICS.are_these_symbols_in_a_string('stressM1', diacritics):
                stress = -1
            else:
                stress = 0
            if SYMB_DIACRITICS.are_these_symbols_in_a_string('stress1', diacritics):
                stress = 1
            elif SYMB_DIACRITICS.are_these_symbols_in_a_string('stress2', diacritics):
                stress = 2

            # (3.4) upperdot
            if count(diacritics, SYMB_DIACRITICS__UPPERDOT) > 1:
                fail(match, "In '{0}' (start={1}, end={2}), upperdot defined several times.")
            upperdot = SYMB_DIACRITICS.are_these_symbols_in_a_string("upperdot", diacritics)

            return makron, stress, upperdot

        #.......................................................................
        # (3) recognized characters, (4) unknown characters around them
        #.......................................................................
        cursor = 0      # first position of <text> not handled yet
        for match in re.finditer(DStringANG.pattern, text):

            # unknown character(s) standing before the current match :
            add_unknown_chars(cursor, match.start())

            # position following the last index covered by the match; for a
            # match which is not empty this is match.end().
            cursor = max(match.start(), match.end() - 1) + 1

            groups = match.groupdict()
            letter = groups['letter']
            diacritics = groups['diacritics']

            punctuation = letter in SYMB_PUNCTUATION.symbol2name
            capital_letter = letter in SYMB_UPPER_CASE.symbol2name

            # (3.1) base_char : punctuation has the priority over the case.
            if punctuation:
                name_table = SYMB_PUNCTUATION
            elif capital_letter:
                name_table = SYMB_UPPER_CASE
            else:
                name_table = SYMB_LOWER_CASE
            base_char = name_table.get_the_name_for_this_symbol(letter)

            # (3.2), (3.3), (3.4)
            makron, stress, upperdot = read_diacritics(match, diacritics)

            # (3.5) we add the new character
            self.append(DCharacterANG(dstring_object=self,
                                      unknown_char=False,
                                      base_char=base_char,
                                      punctuation=punctuation,
                                      capital_letter=capital_letter,
                                      makron=makron,
                                      stress=stress,
                                      upperdot=upperdot))

        # unknown character(s) standing after the last match (or the whole
        # string if nothing has been matched) :
        add_unknown_chars(cursor, len(text))