def get_sourcestr_representation(self):
    """
        DCharacterANG.get_sourcestr_representation

        Return the string built from the attributes of <self> : the
        default symbol of the base character followed by the default
        symbols of the stress, the makron and the upperdot, the whole
        being composed (NFC) before it is returned.

        An unknown character is never analysed : the function returns
        either UNKNOWN_CHAR_SYMBOL or the raw base character, depending on
        the "anonymize the unknown characters" option of the parent
        string object.
    """
    #.......................................................................
    # unknown character : nothing to analyse, early exit.
    #.......................................................................
    if self.unknown_char:
        anonymize = self.dstring_object.options["anonymize the unknown characters"]
        return UNKNOWN_CHAR_SYMBOL if anonymize == 'yes' else self.base_char

    #.......................................................................
    # known character : the symbols are gathered in <pieces>.
    #.......................................................................
    pieces = []

    # base character : a punctuation symbol is stored as it is, a letter
    # is translated into its default (lower/upper case) symbol.
    base = self.base_char
    if base is not None:
        if self.punctuation:
            pieces.append(base)
        elif self.capital_letter:
            pieces.append(SYMB_UPPER_CASE.get_default_symbol(base))
        else:
            pieces.append(SYMB_LOWER_CASE.get_default_symbol(base))

    # stress : the three values are mutually exclusive.
    stress = self.stress
    if stress == -1:
        pieces.append(DEFAULTSYMB__STRESS_MINUS1)
    elif stress == 1:
        pieces.append(DEFAULTSYMB__STRESS1)
    elif stress == 2:
        pieces.append(DEFAULTSYMB__STRESS2)

    if self.makron:
        pieces.append(DEFAULTSYMB__MAKRON)

    if self.upperdot:
        pieces.append(DEFAULTSYMB__UPPERDOT)

    # (1/2) composition with unicodedata.normalize :
    composed = unicodedata.normalize('NFC', "".join(pieces))

    # (2/2) composition with COMPLETE_NORMALIZE_NFC : every (src, dest)
    # pair is applied in the order given by COMPLETE_NORMALIZE_NFC.
    for src, dest in COMPLETE_NORMALIZE_NFC:
        composed = composed.replace(src, dest)

    return composed
def init_from_str(self, str_src):
    """
        DStringANG.init_from_str

        Function called by __init__() : fill <self> with one DCharacterANG
        object per character found in <str_src>. Nothing is explicitly
        returned.

        str_src : str

        HOW IT WORKS :
        * (1) str_src is decomposed with unicodedata.normalize('NFD', ...)
        * (2) the symbols are replaced by the default ones
              (replace_by_the_default_symbols(), called for the punctuation,
              the lower case letters, the upper case letters and the
              diacritics, in that order)
        * (3) re.finditer(DStringANG.pattern) gives the recognized
              symbols {letter + diacritics} :
              * (3.1) base_char
              * (3.2) makron
              * (3.3) stress
              * (3.4) upperdot
              * (3.5) the new character is added to <self>
          Every character of the normalized string lying outside a match
          (before the first one, between two matches, after the last one)
          is added as an unknown character.

        Raise DCharsError if a diacritic is defined several times for one
        letter or if several stress marks are defined simultaneously.
    """
    #.......................................................................
    # (1) decomposition of <str_src>
    #.......................................................................
    text = unicodedata.normalize('NFD', str_src)

    #.......................................................................
    # (2) the default symbols are required
    #.......................................................................
    text = SYMB_PUNCTUATION.replace_by_the_default_symbols(text)
    text = SYMB_LOWER_CASE.replace_by_the_default_symbols(text)
    text = SYMB_UPPER_CASE.replace_by_the_default_symbols(text)
    text = SYMB_DIACRITICS.replace_by_the_default_symbols(text)

    def add_unknown_characters(first, stop):
        """Append to <self> one unknown character per index in [first, stop)."""
        for position in range(first, stop):
            self.append(DCharacterANG(dstring_object=self,
                                      unknown_char=True,
                                      base_char=text[position]))

    def parse_error(match, template):
        """Return the DCharsError describing a problem found in <match>."""
        return DCharsError(context="DStringANG.init_from_str",
                           message=template.format(match.string,
                                                   match.start(),
                                                   match.end()))

    #.......................................................................
    # (3) initialisation from the recognized characters
    #.......................................................................
    # <cursor> is the index of the first character not yet stored in <self>.
    cursor = 0

    for element in re.finditer(DStringANG.pattern, text):

        # unknown character(s) lying before the current match :
        add_unknown_characters(cursor, element.start())

        # index following the last character of the match; for any non-empty
        # match this is simply element.end().
        cursor = max(element.start(), element.end() - 1) + 1

        groups = element.groupdict()
        letter = groups['letter']
        diacritics = groups['diacritics']

        punctuation = letter in SYMB_PUNCTUATION.symbol2name
        capital_letter = letter in SYMB_UPPER_CASE.symbol2name

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # (3.1) base_char : name of the symbol, read in the matching table
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        if punctuation:
            symbols_table = SYMB_PUNCTUATION
        elif capital_letter:
            symbols_table = SYMB_UPPER_CASE
        else:
            symbols_table = SYMB_LOWER_CASE
        base_char = symbols_table.get_the_name_for_this_symbol(letter)

        # values used when the letter has no diacritics :
        makron = False
        stress = 0
        upperdot = False

        if diacritics is not None:
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.2) makron
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            makron_count = number_of_occurences(source_string=diacritics,
                                                symbols=SYMB_DIACRITICS__MAKRON)
            if makron_count > 1:
                raise parse_error(
                    element,
                    "In '{0}' (start={1}, end={2}), makron defined several times.")

            makron = SYMB_DIACRITICS.are_these_symbols_in_a_string("makron", diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.3) stress
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            minus1_count = number_of_occurences(source_string=diacritics,
                                                symbols=SYMB_DIACRITICS__STRESS_MINUS1)
            stress1_count = number_of_occurences(source_string=diacritics,
                                                 symbols=SYMB_DIACRITICS__STRESS1)
            stress2_count = number_of_occurences(source_string=diacritics,
                                                 symbols=SYMB_DIACRITICS__STRESS2)

            # the checks are done in this order, the first failure wins :
            stress_checks = (
                (minus1_count,
                 "In '{0}' (start={1}, end={2}), stressM1(-1) defined several times."),
                (stress1_count,
                 "In '{0}' (start={1}, end={2}), stress1 defined several times."),
                (stress2_count,
                 "In '{0}' (start={1}, end={2}), stress2 defined several times."),
                (minus1_count + stress1_count + stress2_count,
                 "In '{0}' (start={1}, end={2}), stressM1, stress1 and stress2 "
                 "simultaneously defined."),
            )
            for count, template in stress_checks:
                if count > 1:
                    raise parse_error(element, template)

            if SYMB_DIACRITICS.are_these_symbols_in_a_string('stressM1', diacritics):
                stress = -1

            if SYMB_DIACRITICS.are_these_symbols_in_a_string('stress1', diacritics):
                stress = 1
            elif SYMB_DIACRITICS.are_these_symbols_in_a_string('stress2', diacritics):
                stress = 2

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.4) upperdot
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            upperdot_count = number_of_occurences(source_string=diacritics,
                                                  symbols=SYMB_DIACRITICS__UPPERDOT)
            if upperdot_count > 1:
                raise parse_error(
                    element,
                    "In '{0}' (start={1}, end={2}), upperdot defined several times.")

            upperdot = SYMB_DIACRITICS.are_these_symbols_in_a_string("upperdot", diacritics)

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # (3.5) we add the new character
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        self.append(DCharacterANG(dstring_object=self,
                                  unknown_char=False,
                                  base_char=base_char,
                                  punctuation=punctuation,
                                  capital_letter=capital_letter,
                                  makron=makron,
                                  stress=stress,
                                  upperdot=upperdot))

    #.......................................................................
    # final unknown character(s) : everything after the last match, or the
    # whole string if nothing has been recognized.
    #.......................................................................
    add_unknown_characters(cursor, len(text))