Exemplo n.º 1
0
    def get_sourcestr_representation(self):
        """
                DCharacterFRO.get_sourcestr_representation

                Build and return the string representing <self>.

                * unknown character : return UNKNOWN_CHAR_SYMBOL if the option
                  "anonymize the unknown characters" of the parent string is
                  set to "yes", otherwise return <self.base_char> unchanged.
                * known character : concatenate the default symbol of the base
                  character, the stress symbol (if any) and the cedilla symbol
                  (if any); the result is then composed (NFC) and completed
                  with the replacements stored in COMPLETE_NORMALIZE_NFC.
        """
        # .......................................................................
        # unknown character : no analysis, the answer is immediate.
        # .......................................................................
        if self.unknown_char:
            anonymize = self.dstring_object.options["anonymize the unknown characters"]
            return UNKNOWN_CHAR_SYMBOL if anonymize == "yes" else self.base_char

        # .......................................................................
        # known character : the symbols are collected one after the other.
        # .......................................................................
        pieces = []

        # base character :
        if self.base_char is not None:
            if self.punctuation:
                # a punctuation symbol is stored as it is :
                pieces.append(self.base_char)
            elif self.capital_letter:
                # upper case :
                pieces.append(SYMB_UPPER_CASE.get_default_symbol(self.base_char))
            else:
                # lower case :
                pieces.append(SYMB_LOWER_CASE.get_default_symbol(self.base_char))

        # stress : at most one symbol, chosen by the value of <self.stress>.
        stress = self.stress
        if stress == 1:
            pieces.append(DEFAULTSYMB__STRESS1)
        elif stress == 2:
            pieces.append(DEFAULTSYMB__STRESS2)
        elif stress == 3:
            pieces.append(DEFAULTSYMB__STRESS12)
        elif stress == 4:
            pieces.append(DEFAULTSYMB__STRESS3)

        # cedilla : only the value True (and no other truthy value) is accepted.
        if self.cedilla is True:
            pieces.append(DEFAULTSYMB__CEDILLA)

        # (1/2) composition with unicodedata.normalize :
        composed = unicodedata.normalize("NFC", "".join(pieces))

        # (2/2) composition with COMPLETE_NORMALIZE_NFC :
        for src, dest in COMPLETE_NORMALIZE_NFC:
            composed = composed.replace(src, dest)

        return composed
Exemplo n.º 2
0
    def init_from_str(self, str_src):
        """
                DStringFRO.init_from_str

                Function called by __init__() (according to the original
                documentation) : fill <self> with one DCharacterFRO object
                per character read in <str_src>. The characters are added
                through self.append(); nothing is explicitly returned.

                str_src : str

                Raise a DCharsError if, for one character, the same stress
                mark or the cedilla is defined several times, or if more than
                one stress mark is defined.

                HOW IT WORKS :
                * (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
                * (2) = normalized_src -> (default symbols required) :
                *     replace_by_the_default_symbols() -> normalized_src
                * (3) initialisation from the recognized characters.
                *     re.finditer(DStringFRO.pattern) give the symbols{letter+diacritics}
                *     Every part of the string lying before, between or after
                *     the matches is stored as a sequence of unknown characters.
                *     (3.1) base_char
                *     (3.2) stress
                *     (3.3) cedilla
                *     (3.4) we add the new character
        """
        # names of the stress marks; the position in the tuple (starting at 1)
        # is the value stored in the <stress> attribute of the character.
        stress_names = ("stress1", "stress2", "stress12", "stress3")

        def append_unknown_chars(first, stop):
            """Append normalized_src[first:stop] to <self>, one unknown character each."""
            for index in range(first, stop):
                self.append( DCharacterFRO(dstring_object = self,
                                           unknown_char = True,
                                           base_char = normalized_src[index]) )

        def error(match, problem):
            """Return the DCharsError describing <problem> for the match <match>."""
            err_msg = "In '{0}' (start={1}, end={2}), {3}"
            return DCharsError( context = "DStringFRO.init_from_str",
                                message = err_msg.format(match.string,
                                                         match.start(),
                                                         match.end(),
                                                         problem) )

        #.......................................................................
        # (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
        #.......................................................................
        normalized_src = unicodedata.normalize('NFD', str_src)

        #.......................................................................
        # (2) = normalized_src -> (default symbols required) :
        #     replace_by_the_default_symbols() -> normalized_src
        #.......................................................................
        normalized_src = SYMB_PUNCTUATION.replace_by_the_default_symbols(normalized_src)
        normalized_src = SYMB_LOWER_CASE.replace_by_the_default_symbols(normalized_src)
        normalized_src = SYMB_UPPER_CASE.replace_by_the_default_symbols(normalized_src)
        normalized_src = SYMB_DIACRITICS.replace_by_the_default_symbols(normalized_src)

        #.......................................................................
        # (3) initialisation from the recognized characters.
        #     re.finditer(DStringFRO.pattern) give the symbols{letter+diacritics}
        #.......................................................................
        # index of the first character of <normalized_src> not yet stored in <self> :
        next_index = 0

        for element in re.finditer(DStringFRO.pattern, normalized_src):

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # the characters lying between the previous match (or the
            # beginning of the string) and this match are unknown characters.
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            append_unknown_chars(next_index, element.start())

            # Same arithmetic as the historical max((start, end-1)) + 1 :
            # it is equal to element.end() for every non-empty match.
            next_index = max(element.start(), element.end() - 1) + 1

            data = element.groupdict()
            letter     = data['letter']
            diacritics = data['diacritics']

            punctuation = letter in SYMB_PUNCTUATION.symbol2name
            capital_letter = letter in SYMB_UPPER_CASE.symbol2name

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.1) base_char
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            if punctuation:
                # punctuation symbol :
                base_char = SYMB_PUNCTUATION.get_the_name_for_this_symbol(letter)
            elif not capital_letter:
                # lower case :
                base_char = SYMB_LOWER_CASE.get_the_name_for_this_symbol(letter)
            else:
                # upper case :
                base_char = SYMB_UPPER_CASE.get_the_name_for_this_symbol(letter)

            stress = 0
            cedilla = False
            if diacritics is not None:

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.2) stress
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # the four numbers are computed before any check, in the
                # same order as <stress_names> :
                stress_nbrs = [number_of_occurences( source_string = diacritics,
                                                     symbols = symbols )
                               for symbols in (SYMB_DIACRITICS__STRESS1,
                                               SYMB_DIACRITICS__STRESS2,
                                               SYMB_DIACRITICS__STRESS12,
                                               SYMB_DIACRITICS__STRESS3)]

                # a stress mark can't be repeated ...
                for stress_name, stress_nbr in zip(stress_names, stress_nbrs):
                    if stress_nbr > 1:
                        raise error(element, stress_name + " defined several times.")

                # ... and two different stress marks can't be mixed :
                if sum(stress_nbrs) > 1:
                    raise error(element,
                                "more than one stress mark (stress1, stress2, stress12, "
                                "stress3) simultaneously defined.")

                # the first stress mark found gives the value of <stress> :
                for stress_value, stress_name in enumerate(stress_names, start = 1):
                    if SYMB_DIACRITICS.are_these_symbols_in_a_string(stress_name, diacritics):
                        stress = stress_value
                        break

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.3) cedilla
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                cedilla_nbr = number_of_occurences( source_string = diacritics,
                                                    symbols = SYMB_DIACRITICS__CEDILLA)
                if cedilla_nbr > 1:
                    raise error(element, "cedilla defined several times.")

                if SYMB_DIACRITICS.are_these_symbols_in_a_string('cedilla', diacritics):
                    cedilla = True

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.4) we add the new character
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            new_character = DCharacterFRO(dstring_object = self,
                                          unknown_char = False,
                                          base_char = base_char,
                                          punctuation = punctuation,
                                          capital_letter = capital_letter,
                                          cedilla = cedilla,
                                          stress = stress)

            self.append( new_character )

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # we add the final unknown characters : everything after the last
        # match, or the whole string if nothing has been recognized.
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        append_unknown_chars(next_index, len(normalized_src))