def get_sourcestr_representation(self):
    """
        DCharacterFRO.get_sourcestr_representation

        Return the source string corresponding to <self>.

        * unknown character : UNKNOWN_CHAR_SYMBOL if the option
          "anonymize the unknown characters" is set to "yes", otherwise the
          raw base character; nothing else is computed.
        * any other character : base symbol (if any), then the default
          symbols of the stress and of the cedilla, the whole being composed
          through unicodedata.normalize("NFC") and the replacements listed in
          COMPLETE_NORMALIZE_NFC.
    """
    # .......................................................................
    # unknown character : no analysis, the answer is immediate.
    # .......................................................................
    if self.unknown_char:
        anonymize = self.dstring_object.options["anonymize the unknown characters"]
        return UNKNOWN_CHAR_SYMBOL if anonymize == "yes" else self.base_char

    # .......................................................................
    # known character : the pieces are collected, then composed.
    # .......................................................................
    pieces = []

    base = self.base_char
    if base is not None:
        if self.punctuation:
            # punctuation : <base_char> is added unchanged.
            pieces.append(base)
        elif self.capital_letter:
            # upper case :
            pieces.append(SYMB_UPPER_CASE.get_default_symbol(base))
        else:
            # lower case :
            pieces.append(SYMB_LOWER_CASE.get_default_symbol(base))

    # stress value -> default symbol; at most one of them can apply.
    stress_table = (
        (1, DEFAULTSYMB__STRESS1),
        (2, DEFAULTSYMB__STRESS2),
        (3, DEFAULTSYMB__STRESS12),
        (4, DEFAULTSYMB__STRESS3),
    )
    for value, symbol in stress_table:
        if self.stress == value:
            pieces.append(symbol)
            break

    # strict identity test : only the True object adds a cedilla.
    if self.cedilla is True:
        pieces.append(DEFAULTSYMB__CEDILLA)

    # (1/2) composition with unicodedata.normalize :
    composed = unicodedata.normalize("NFC", "".join(pieces))

    # (2/2) composition with COMPLETE_NORMALIZE_NFC :
    for before, after in COMPLETE_NORMALIZE_NFC:
        composed = composed.replace(before, after)

    return composed
def init_from_str(self, str_src):
    """
        DStringFRO.init_from_str

        Function called by __init__() : fill <self> with the DCharacterFRO
        objects read from <str_src>.

        str_src : str

        Nothing is returned : <self> is modified in place through
        self.append(). Every character of the normalized string which is not
        covered by a match of DStringFRO.pattern is stored as an unknown
        character (DCharacterFRO(unknown_char=True)), at its original place.

        Raise DCharsError if one stress or the cedilla is defined several
        times for one character, or if several stresses are defined for it.

        HOW IT WORKS :
        * (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
        * (2) normalized_src -> (default symbols required) :
        *     replace_by_the_default_symbols() -> normalized_src
        * (3) initialisation from the recognized characters.
        *     re.finditer(DStringFRO.pattern) give the symbols{letter+diacritics}
        *     (3.1) base_char
        *     (3.2) stress
        *     (3.3) cedilla
        *     (3.4) we add the new character
    """
    #.......................................................................
    # (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
    #.......................................................................
    normalized_src = unicodedata.normalize('NFD', str_src)

    #.......................................................................
    # (2) normalized_src -> (default symbols required) :
    #     replace_by_the_default_symbols() -> normalized_src
    #.......................................................................
    for symbols in (SYMB_PUNCTUATION, SYMB_LOWER_CASE, SYMB_UPPER_CASE, SYMB_DIACRITICS):
        normalized_src = symbols.replace_by_the_default_symbols(normalized_src)

    def add_unknown_characters(start, stop):
        """Append normalized_src[start:stop] to <self>, one unknown character each."""
        for index in range(start, stop):
            self.append(DCharacterFRO(dstring_object=self,
                                      unknown_char=True,
                                      base_char=normalized_src[index]))

    def error(element, what):
        """Build the DCharsError describing a problem found in the match <element>."""
        message = "In '{0}' (start={1}, end={2}), {3}"
        return DCharsError(context="DStringFRO.init_from_str",
                           message=message.format(element.string,
                                                  element.start(),
                                                  element.end(),
                                                  what))

    # names of the stresses, in the order of the value stored in
    # DCharacterFRO.stress : 'stress1' -> 1, 'stress2' -> 2, 'stress12' -> 3,
    # 'stress3' -> 4. <stress_symbols> follows the same order.
    stress_names = ('stress1', 'stress2', 'stress12', 'stress3')
    stress_symbols = (SYMB_DIACRITICS__STRESS1,
                      SYMB_DIACRITICS__STRESS2,
                      SYMB_DIACRITICS__STRESS12,
                      SYMB_DIACRITICS__STRESS3)

    #.......................................................................
    # (3) initialisation from the recognized characters.
    #     re.finditer(DStringFRO.pattern) give the symbols{letter+diacritics}
    #.......................................................................
    # offset just after the last recognized substring (0 : nothing recognized yet).
    # NOTE(review): assumes DStringFRO.pattern never matches an empty string.
    last_end = 0

    for element in re.finditer(DStringFRO.pattern, normalized_src):

        # unknown character(s) placed before the first match or between the
        # previous match and the current one :
        add_unknown_characters(last_end, element.start())
        last_end = element.end()

        data = element.groupdict()
        letter = data['letter']
        diacritics = data['diacritics']

        punctuation = letter in SYMB_PUNCTUATION.symbol2name
        capital_letter = letter in SYMB_UPPER_CASE.symbol2name

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # (3.1) base_char
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        if punctuation:
            # punctuation symbol :
            base_char = SYMB_PUNCTUATION.get_the_name_for_this_symbol(letter)
        elif not capital_letter:
            # lower case :
            base_char = SYMB_LOWER_CASE.get_the_name_for_this_symbol(letter)
        else:
            # upper case :
            base_char = SYMB_UPPER_CASE.get_the_name_for_this_symbol(letter)

        stress = 0
        cedilla = False
        if diacritics is not None:
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.2) stress
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            stress_counts = [number_of_occurences(source_string=diacritics,
                                                  symbols=symbols)
                             for symbols in stress_symbols]

            # one stress can't be written twice ...
            for name, count in zip(stress_names, stress_counts):
                if count > 1:
                    raise error(element, name + " defined several times.")

            # ... and two different stresses can't be mixed. The message
            # names stress3 too, since its count is part of the sum.
            if sum(stress_counts) > 1:
                raise error(element,
                            "stress1, stress2, stress12 and stress3 "
                            "simultaneously defined.")

            for value, name in enumerate(stress_names, start=1):
                if SYMB_DIACRITICS.are_these_symbols_in_a_string(name, diacritics):
                    stress = value
                    break

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.3) cedilla
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            cedilla_nbr = number_of_occurences(source_string=diacritics,
                                               symbols=SYMB_DIACRITICS__CEDILLA)
            if cedilla_nbr > 1:
                raise error(element, "cedilla defined several times.")

            # <cedilla> is kept as a real boolean, whatever the method returns.
            if SYMB_DIACRITICS.are_these_symbols_in_a_string('cedilla', diacritics):
                cedilla = True

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # (3.4) we add the new character
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        self.append(DCharacterFRO(dstring_object=self,
                                  unknown_char=False,
                                  base_char=base_char,
                                  punctuation=punctuation,
                                  capital_letter=capital_letter,
                                  cedilla=cedilla,
                                  stress=stress))

    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    # we add the final unknown characters : everything after the last match,
    # or the whole string if nothing has been recognized.
    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    add_unknown_characters(last_end, len(normalized_src))