def init_from_str(self, str_src):
    """
        DStringLAT.init_from_str

        Called by __init__() : fill <self> with one DCharacterLAT object per
        symbol read in <str_src>.

        str_src : str

        The method has no return statement : its result is the sequence of
        characters appended to <self>. Every character of the normalized
        string lying outside a match of DStringLAT.pattern is kept as a
        DCharacterLAT(unknown_char=True).

        A DCharsError is raised as soon as number_of_occurences() reports
        more than one length, stress or diaeresis symbol for one letter.

        STEPS :
        * (1) NFD decomposition of <str_src>
        * (2) every symbol set replaces its variants by its default symbols
        * (3) one DCharacterLAT per match of DStringLAT.pattern :
              * (3.1) base_char
              * (3.2) length
              * (3.3) stress
              * (3.4) diaeresis
              * (3.5) the new character is appended
    """
    # (1) decomposition :
    text = unicodedata.normalize('NFD', str_src)

    # (2) default symbols, one symbol set after the other :
    for symbols in (SYMB_PUNCTUATION, SYMB_LOWER_CASE, SYMB_UPPER_CASE, SYMB_DIACRITICS):
        text = symbols.replace_by_the_default_symbols(text)

    def add_unknown_chars(first, stop):
        """ append text[first:stop], character by character, as unknown characters """
        for position in range(first, stop):
            self.append(DCharacterLAT(dstring_object=self,
                                      unknown_char=True,
                                      base_char=text[position]))

    def reject(found, template):
        """ raise a DCharsError whose message locates the match <found> """
        raise DCharsError(context="DStringLAT.init_from_str",
                          message=template.format(found.string,
                                                  found.start(),
                                                  found.end()))

    # (3) recognized characters.
    # <resume_at> is the index following the last analyzed substring (0 as long
    # as nothing has been analyzed).
    resume_at = 0
    for found in re.finditer(DStringLAT.pattern, text):

        # unknown character(s) before the first match or between two matches :
        add_unknown_chars(resume_at, found.start())
        resume_at = max(found.start(), found.end() - 1) + 1

        groups = found.groupdict()
        letter = groups['letter']
        marks = groups['diacritics']

        punctuation = letter in SYMB_PUNCTUATION.symbol2name
        capital_letter = letter in SYMB_UPPER_CASE.symbol2name

        # (3.1) base_char : punctuation wins over upper case, lower case is
        # the fallback.
        if punctuation:
            names = SYMB_PUNCTUATION
        elif capital_letter:
            names = SYMB_UPPER_CASE
        else:
            names = SYMB_LOWER_CASE
        base_char = names.get_the_name_for_this_symbol(letter)

        length, stress, diaeresis = None, False, False

        if marks is not None:
            # (3.2) length
            if number_of_occurences(source_string=marks,
                                    symbols=SYMB_DIACRITICS__LENGTH) > 1:
                reject(found,
                       "In '{0}' (start={1}, end={2}), length defined several times.")

            if SYMB_DIACRITICS.are_these_symbols_in_a_string('short', marks):
                length = "short"
            elif SYMB_DIACRITICS.are_these_symbols_in_a_string('long', marks):
                length = "long"

            # (3.3) stress
            if number_of_occurences(source_string=marks,
                                    symbols=SYMB_DIACRITICS__STRESS) > 1:
                reject(found,
                       "In '{0}' (start={1}, end={2}), stress defined several times.")

            stress = SYMB_DIACRITICS.are_these_symbols_in_a_string("stress", marks)

            # (3.4) diaeresis
            if number_of_occurences(source_string=marks,
                                    symbols=SYMB_DIACRITICS__DIAERESIS) > 1:
                reject(found,
                       "In '{0}' (start={1}, end={2}), diaeresis defined several times.")

            diaeresis = SYMB_DIACRITICS.are_these_symbols_in_a_string("diaeresis", marks)

        # (3.5) the new character :
        self.append(DCharacterLAT(dstring_object=self,
                                  unknown_char=False,
                                  base_char=base_char,
                                  punctuation=punctuation,
                                  capital_letter=capital_letter,
                                  length=length,
                                  stress=stress,
                                  diaeresis=diaeresis))

    # unknown character(s) after the last match (or the whole string if
    # nothing matched) :
    add_unknown_chars(resume_at, len(text))
def init_from_str(self, str_src):
    """
        DStringANG.init_from_str

        Called by __init__() : fill <self> with one DCharacterANG object per
        symbol read in <str_src>.

        str_src : str

        The method has no return statement : its result is the sequence of
        characters appended to <self>. Every character of the normalized
        string lying outside a match of DStringANG.pattern is kept as a
        DCharacterANG(unknown_char=True).

        A DCharsError is raised as soon as number_of_occurences() reports
        more than one makron, more than one symbol of a given stress, more
        than one stress symbol altogether, or more than one upperdot for one
        letter.

        STEPS :
        * (1) NFD decomposition of <str_src>
        * (2) every symbol set replaces its variants by its default symbols
        * (3) one DCharacterANG per match of DStringANG.pattern :
              * (3.1) base_char
              * (3.2) makron
              * (3.3) stress (0, -1, 1 or 2)
              * (3.4) upperdot
              * (3.5) the new character is appended
    """
    # (1) decomposition :
    text = unicodedata.normalize('NFD', str_src)

    # (2) default symbols, one symbol set after the other :
    for symbols in (SYMB_PUNCTUATION, SYMB_LOWER_CASE, SYMB_UPPER_CASE, SYMB_DIACRITICS):
        text = symbols.replace_by_the_default_symbols(text)

    def add_unknown_chars(first, stop):
        """ append text[first:stop], character by character, as unknown characters """
        for position in range(first, stop):
            self.append(DCharacterANG(dstring_object=self,
                                      unknown_char=True,
                                      base_char=text[position]))

    def reject(found, template):
        """ raise a DCharsError whose message locates the match <found> """
        raise DCharsError(context="DStringANG.init_from_str",
                          message=template.format(found.string,
                                                  found.start(),
                                                  found.end()))

    # the three kinds of stress with the message used if one of them is repeated :
    stress_checks = (
        (SYMB_DIACRITICS__STRESS_MINUS1,
         "In '{0}' (start={1}, end={2}), stressM1(-1) defined several times."),
        (SYMB_DIACRITICS__STRESS1,
         "In '{0}' (start={1}, end={2}), stress1 defined several times."),
        (SYMB_DIACRITICS__STRESS2,
         "In '{0}' (start={1}, end={2}), stress2 defined several times."),
    )

    # (3) recognized characters.
    # <resume_at> is the index following the last analyzed substring (0 as long
    # as nothing has been analyzed).
    resume_at = 0
    for found in re.finditer(DStringANG.pattern, text):

        # unknown character(s) before the first match or between two matches :
        add_unknown_chars(resume_at, found.start())
        resume_at = max(found.start(), found.end() - 1) + 1

        groups = found.groupdict()
        letter = groups['letter']
        marks = groups['diacritics']

        punctuation = letter in SYMB_PUNCTUATION.symbol2name
        capital_letter = letter in SYMB_UPPER_CASE.symbol2name

        # (3.1) base_char : punctuation wins over upper case, lower case is
        # the fallback.
        if punctuation:
            names = SYMB_PUNCTUATION
        elif capital_letter:
            names = SYMB_UPPER_CASE
        else:
            names = SYMB_LOWER_CASE
        base_char = names.get_the_name_for_this_symbol(letter)

        makron, stress, upperdot = False, 0, False

        if marks is not None:
            # (3.2) makron
            if number_of_occurences(source_string=marks,
                                    symbols=SYMB_DIACRITICS__MAKRON) > 1:
                reject(found,
                       "In '{0}' (start={1}, end={2}), makron defined several times.")

            makron = SYMB_DIACRITICS.are_these_symbols_in_a_string("makron", marks)

            # (3.3) stress : the three counts are computed first, then checked
            # one by one, then checked together.
            stress_counts = [number_of_occurences(source_string=marks, symbols=stress_symbols)
                             for stress_symbols, _ in stress_checks]

            for count, (_, template) in zip(stress_counts, stress_checks):
                if count > 1:
                    reject(found, template)

            if sum(stress_counts) > 1:
                reject(found,
                       "In '{0}' (start={1}, end={2}), stressM1, stress1 and stress2 "
                       "simultaneously defined.")

            # stress1 and stress2 take precedence over stressM1 :
            has_stress_minus1 = SYMB_DIACRITICS.are_these_symbols_in_a_string('stressM1', marks)
            if SYMB_DIACRITICS.are_these_symbols_in_a_string('stress1', marks):
                stress = 1
            elif SYMB_DIACRITICS.are_these_symbols_in_a_string('stress2', marks):
                stress = 2
            elif has_stress_minus1:
                stress = -1
            else:
                stress = 0

            # (3.4) upperdot
            if number_of_occurences(source_string=marks,
                                    symbols=SYMB_DIACRITICS__UPPERDOT) > 1:
                reject(found,
                       "In '{0}' (start={1}, end={2}), upperdot defined several times.")

            upperdot = SYMB_DIACRITICS.are_these_symbols_in_a_string("upperdot", marks)

        # (3.5) the new character :
        self.append(DCharacterANG(dstring_object=self,
                                  unknown_char=False,
                                  base_char=base_char,
                                  punctuation=punctuation,
                                  capital_letter=capital_letter,
                                  makron=makron,
                                  stress=stress,
                                  upperdot=upperdot))

    # unknown character(s) after the last match (or the whole string if
    # nothing matched) :
    add_unknown_chars(resume_at, len(text))
def init_from_str(self, str_src):
    """
        DStringJPN.init_from_str

        Called by __init__() : fill <self> with one DCharacterJPN object per
        symbol read in <str_src>.

        str_src : str

        The method has no return statement : its result is the sequence of
        characters appended to <self>. Every character of the normalized
        string lying outside a match of DStringJPN.pattern is kept as a
        DCharacterJPN(unknown_char=True).

        A DCharsError is raised as soon as number_of_occurences() reports
        more than one dakuten, more than one handakuten, or at least one of
        each for the same letter.

        STEPS :
        * (1) NFD decomposition of <str_src>
        * (2) every symbol set replaces its variants by its default symbols
        * (3) one DCharacterJPN per match of DStringJPN.pattern :
              * (3.1) base_char, chartype, smallsize
              * (3.2) diacritic (None, "dakuten" or "handakuten")
              * (3.3) the new character is appended
    """
    # (1) decomposition :
    text = unicodedata.normalize('NFD', str_src)

    # (2) default symbols, one symbol set after the other :
    for symbols in (SYMB_PUNCTUATION,
                    SYMB_CHOONPU,
                    SYMB_DIACRITICS,
                    SYMB_HIRAGANA,
                    SYMB_SMALL_HIRAGANA,
                    SYMB_KATAKANA,
                    SYMB_SMALL_KATAKANA):
        text = symbols.replace_by_the_default_symbols(text)

    def add_unknown_chars(first, stop):
        """ append text[first:stop], character by character, as unknown characters """
        for position in range(first, stop):
            self.append(DCharacterJPN(dstring_object=self,
                                      unknown_char=True,
                                      base_char=text[position]))

    def reject(found, template):
        """ raise a DCharsError whose message locates the match <found> """
        raise DCharsError(context="DStringJPN.init_from_str",
                          message=template.format(found.string,
                                                  found.start(),
                                                  found.end()))

    # (3) recognized characters.
    # <resume_at> is the index following the last analyzed substring (0 as long
    # as nothing has been analyzed).
    resume_at = 0
    for found in re.finditer(DStringJPN.pattern, text):

        # unknown character(s) before the first match or between two matches :
        add_unknown_chars(resume_at, found.start())
        resume_at = max(found.start(), found.end() - 1) + 1

        groups = found.groupdict()
        letter = groups['letter']
        marks = groups['diacritics']

        punctuation = letter in SYMB_PUNCTUATION.symbol2name

        # (3.1) base_char, chartype, smallsize.
        # Kana are all named through SYMB_HIRAGANA : small kana and katakana
        # are first converted with the *_TO_* tables.
        smallsize = False
        if punctuation:
            chartype = "other"
            base_char = SYMB_PUNCTUATION.get_the_name_for_this_symbol(letter)

        elif letter in SYMB_CHOONPU.symbol2name:
            # "ー" (the chōonpu 長音符 symbol)
            # confer http://en.wikipedia.org/wiki/Ch%C5%8Donpu
            chartype = "choonpu"
            base_char = SYMB_CHOONPU.get_the_name_for_this_symbol(letter)

        elif letter in SYMB_HIRAGANA.symbol2name:
            chartype = "hiragana"
            base_char = SYMB_HIRAGANA.get_the_name_for_this_symbol(letter)

        elif letter in SYMB_SMALL_HIRAGANA.symbol2name:
            chartype = "hiragana"
            smallsize = True
            normal_size = SMALL_HIRAGANA_TO_HIRAGANA[letter]
            base_char = SYMB_HIRAGANA.get_the_name_for_this_symbol(normal_size)

        elif letter in SYMB_KATAKANA.symbol2name:
            chartype = "katakana"
            katakana_name = SYMB_KATAKANA.get_the_name_for_this_symbol(letter)
            base_char = SYMB_HIRAGANA.get_the_name_for_this_symbol(
                KATAKANA_TO_HIRAGANA[katakana_name])

        elif letter in SYMB_SMALL_KATAKANA.symbol2name:
            chartype = "katakana"
            smallsize = True
            normal_size = SMALL_KATAKANA_TO_KATAKANA[letter]
            base_char = SYMB_HIRAGANA.get_the_name_for_this_symbol(
                KATAKANA_TO_HIRAGANA[normal_size])

        elif letter in SYMB_KANJI.symbol2name:
            chartype = "kanji"
            base_char = SYMB_KANJI.get_the_name_for_this_symbol(letter)

        else:
            # anything else is stored as it is :
            chartype = "other"
            base_char = letter

        # (3.2) diacritic
        diacritic = None
        if marks is not None:
            # (3.2.1) dakuten
            dakuten_count = number_of_occurences(source_string=marks,
                                                 symbols=SYMB_DIACRITICS__DAKUTEN)
            if dakuten_count > 1:
                reject(found,
                       "In '{0}' (start={1}, end={2}), dakuten defined several times.")

            if SYMB_DIACRITICS.are_these_symbols_in_a_string('dakuten', marks):
                diacritic = "dakuten"

            # (3.2.2) handakuten
            handakuten_count = number_of_occurences(source_string=marks,
                                                    symbols=SYMB_DIACRITICS__HANDAKUTEN)
            if handakuten_count > 1:
                reject(found,
                       "In '{0}' (start={1}, end={2}), handakuten defined several times.")

            if SYMB_DIACRITICS.are_these_symbols_in_a_string('handakuten', marks):
                diacritic = "handakuten"

            # both of them on the same letter ? error.
            if dakuten_count >= 1 and handakuten_count >= 1:
                reject(found,
                       "In '{0}' (start={1}, end={2}), dakuten and handakuten "
                       "defined simultaneously")

        # (3.3) the new character :
        self.append(DCharacterJPN(dstring_object=self,
                                  unknown_char=False,
                                  base_char=base_char,
                                  diacritic=diacritic,
                                  punctuation=punctuation,
                                  chartype=chartype,
                                  smallsize=smallsize))

    # unknown character(s) after the last match (or the whole string if
    # nothing matched) :
    add_unknown_chars(resume_at, len(text))
def init_from_str(self, str_src):
    """
        DStringSAN.init_from_str

        Called by __init__() : fill <self> with one DCharacterSAN object per
        symbol read in <str_src>.

        str_src : str

        The method has no return statement : its result is the sequence of
        characters appended to <self>. Every character of the normalized
        string lying outside a match of DStringSAN.pattern is kept as a
        DCharacterSAN(unknown_char=True).

        A DCharsError is raised as soon as number_of_occurences() reports
        more than one virama, accent, nukta, anusvara/candrabindu or anudatta
        symbol for one base character.

        STEPS :
        * (1) NFD decomposition of <str_src>
        * (2) every symbol set replaces its variants by its default symbols
        * (3) one DCharacterSAN per match of DStringSAN.pattern :
              * (3.1) virama
              * (3.2) base_char, punctuation, dependentvowel,
                      is_an_independent_vowel
              * (3.3) accent
              * (3.4) nukta
              * (3.5) anusvara_candrabindu
              * (3.6) anudatta
              * (3.7) the new character is appended
    """
    # (1) decomposition :
    text = unicodedata.normalize('NFD', str_src)

    # (2) default symbols, one symbol set after the other :
    for symbols in (SYMB_CONSONANTS,
                    SYMB_INDEPENDENT_VOWELS,
                    SYMB_DEPENDENT_VOWELS,
                    SYMB_DIACRITICS,
                    SYMB_PUNCTUATION):
        text = symbols.replace_by_the_default_symbols(text)

    def add_unknown_chars(first, stop):
        """ append text[first:stop], character by character, as unknown characters """
        for position in range(first, stop):
            self.append(DCharacterSAN(dstring_object=self,
                                      unknown_char=True,
                                      base_char=text[position]))

    def reject(found, template):
        """ raise a DCharsError whose message locates the match <found> """
        raise DCharsError(context="DStringSAN.init_from_str",
                          message=template.format(found.string,
                                                  found.start(),
                                                  found.end()))

    def first_named_mark(candidates, marks):
        """ name of the first symbol of <candidates> detected in <marks>, or None """
        for symbol in candidates:
            mark_name = SYMB_DIACRITICS.defaultsymbol2name[symbol]
            if SYMB_DIACRITICS.are_these_symbols_in_a_string(name=mark_name,
                                                             string=marks):
                return mark_name
        return None

    # (3) recognized characters.
    # <resume_at> is the index following the last analyzed substring (0 as long
    # as nothing has been analyzed).
    resume_at = 0
    for found in re.finditer(DStringSAN.pattern, text):

        # unknown character(s) before the first match or between two matches :
        add_unknown_chars(resume_at, found.start())
        resume_at = max(found.start(), found.end() - 1) + 1

        groups = found.groupdict()
        base_char = groups['basechar']
        dependentvowel = groups['dependentvowel']
        marks = groups['diacritics']

        # the symbol is looked up in every symbol set (e.g. "क" becomes "KA") :
        name_as_punctuation = SYMB_PUNCTUATION.get_the_name_for_this_symbol(base_char)
        name_as_other_symbol = SYMB_OTHER_SYMBOLS.get_the_name_for_this_symbol(base_char)
        name_as_consonant = SYMB_CONSONANTS.get_the_name_for_this_symbol(base_char)
        name_as_ivowel = SYMB_INDEPENDENT_VOWELS.get_the_name_for_this_symbol(base_char)
        name_as_dvowel = SYMB_DEPENDENT_VOWELS.get_the_name_for_this_symbol(dependentvowel)

        # (3.1) virama : needed by (3.2), hence read before the other diacritics.
        virama = False
        if marks is not None:
            if number_of_occurences(source_string=marks,
                                    symbols=SYMB_DIACRITICS__VIRAMA) > 1:
                reject(found,
                       "In '{0}' (start={1}, end={2}), 'virama' defined several times.")

            virama = SYMB_DIACRITICS.are_these_symbols_in_a_string('DEVANAGARI SIGN VIRAMA',
                                                                   marks)

        # (3.2) base_char, punctuation, dependentvowel, is_an_independent_vowel.
        # <is_an_independent_vowel> is only True in the last branch; in the
        # first two branches <dependentvowel> keeps the value read in the match.
        is_an_independent_vowel = False
        punctuation = name_as_punctuation is not None

        if punctuation:
            base_char = name_as_punctuation

        elif name_as_other_symbol is not None:
            # neither punctuation nor consonant nor independent vowel :
            base_char = name_as_other_symbol

        elif name_as_consonant is not None:
            base_char = name_as_consonant

            # special case : a normal consonant (visarga being a
            # pseudo-consonant) written without virama and without any vowel
            # symbol carries the vowel 'A'. E.g. 'क' stands for 'ka', not for 'k'.
            implicit_a = (base_char != 'DEVANAGARI SIGN VISARGA' and
                          not virama and
                          dependentvowel is None)
            dependentvowel = "A" if implicit_a else name_as_dvowel

        else:
            # independent vowel :
            is_an_independent_vowel = True
            dependentvowel = None
            base_char = name_as_ivowel

        accent, nukta, anusvara_candrabindu, anudatta = None, False, None, False

        if marks is not None:
            # (3.3) accent
            if number_of_occurences(source_string=marks,
                                    symbols=SYMB_DIACRITICS__ACCENTS) > 1:
                reject(found,
                       "In '{0}' (start={1}, end={2}), 'accent' defined several times.")

            accent = first_named_mark(SYMB_DIACRITICS__ACCENTS, marks)

            # (3.4) nukta
            if number_of_occurences(source_string=marks,
                                    symbols=SYMB_DIACRITICS__NUKTA) > 1:
                reject(found,
                       "In '{0}' (start={1}, end={2}), 'nukta' defined several times.")

            nukta = SYMB_DIACRITICS.are_these_symbols_in_a_string('DEVANAGARI SIGN NUKTA',
                                                                  marks)

            # (3.5) anusvara_candrabindu
            if number_of_occurences(source_string=marks,
                                    symbols=SYMB_DIACRITICS__ANUSVARA_CANDRABINDU) > 1:
                reject(found,
                       "In '{0}' (start={1}, end={2}), "
                       "'anusvara_candrabindu' defined several times.")

            anusvara_candrabindu = first_named_mark(SYMB_DIACRITICS__ANUSVARA_CANDRABINDU,
                                                    marks)

            # (3.6) anudatta
            if number_of_occurences(source_string=marks,
                                    symbols=SYMB_DIACRITICS__ANUDATTA) > 1:
                reject(found,
                       "In '{0}' (start={1}, end={2}), 'anudatta' defined several times.")

            anudatta = SYMB_DIACRITICS.are_these_symbols_in_a_string(
                'DEVANAGARI STRESS SIGN ANUDATTA', marks)

        # (3.7) the new character :
        self.append(DCharacterSAN(dstring_object=self,
                                  unknown_char=False,
                                  base_char=base_char,
                                  accent=accent,
                                  punctuation=punctuation,
                                  nukta=nukta,
                                  anusvara_candrabindu=anusvara_candrabindu,
                                  virama=virama,
                                  anudatta=anudatta,
                                  is_an_independent_vowel=is_an_independent_vowel,
                                  dependentvowel=dependentvowel))

    # unknown character(s) after the last match (or the whole string if
    # nothing matched) :
    add_unknown_chars(resume_at, len(text))
def init_from_str(self, str_src):
    """
        DStringGRC.init_from_str

        Called by __init__() : fill <self> with one DCharacterGRC per symbol
        read in <str_src>. Nothing is explicitly returned.

        str_src : str

        Raise a DCharsError if, for one symbol, τόνος, μῆκος, πνεῦμα,
        ὑπογεγραμμένη or διαλυτικά is counted more than once.

        STEPS :
        * (1) NFD decomposition of <str_src> (unicodedata.normalize)
        * (2) every symbol is replaced by its default symbol
              (replace_by_the_default_symbols(), one call per symbol table)
        * (3) re.finditer(DStringGRC.pattern) gives the groups 'letter' and
              'diacritics'; the characters lying outside the matches are
              appended as "unknown" characters.
              * (3.1) base_char
              * (3.2) contextual_form
              * (3.3) tonos (τόνος)
              * (3.4) mekos (μῆκος)
              * (3.5) pneuma (πνεῦμα)
              * (3.6) hypogegrammene (ὑπογεγραμμένη)
              * (3.7) dialutika (διαλυτικά)
              * (3.8) the new character is appended to <self>
    """
    def check_unique(found, marks, symbols, template):
        """
            Raise a DCharsError built from <template> if more than one
            symbol of <symbols> is counted in <marks>.
        """
        if number_of_occurences(source_string=marks, symbols=symbols) > 1:
            raise DCharsError(context="DStringGRC.init_from_str",
                              message=template.format(found.string,
                                                      found.start(),
                                                      found.end()))

    def first_present(candidates, marks):
        """
            Return the value paired with the first name of <candidates>
            reported by SYMB_DIACRITICS as present in <marks>, or None.
        """
        for name, value in candidates:
            if SYMB_DIACRITICS.are_these_symbols_in_a_string(name, marks):
                return value
        return None

    # lower case base_char -> (base_char to be stored, contextual form)
    lowercase_forms = {
        'β': ('β', "initial"),
        'ϐ': ('β', "medium+final"),
        'σ': ('σ', "initial+medium"),
        'ς': ('σ', "final"),
    }

    #.......................................................................
    # (1) NFD decomposition
    #.......................................................................
    text = unicodedata.normalize('NFD', str_src)

    #.......................................................................
    # (2) default symbols, one symbol table after the other
    #.......................................................................
    for symbols in (SYMB_PUNCTUATION,
                    SYMB_LOWER_CASE,
                    SYMB_UPPER_CASE,
                    SYMB_OTHER_SYMBOLS,
                    SYMB_DIACRITICS):
        text = symbols.replace_by_the_default_symbols(text)

    #.......................................................................
    # (3) one character per match; <resume_at> is the index following the
    #     last match (0 as long as nothing has been matched).
    #.......................................................................
    resume_at = 0
    for found in re.finditer(DStringGRC.pattern, text):

        # unknown characters lying before this match :
        for position in range(resume_at, found.start()):
            self.append(DCharacterGRC(dstring_object=self,
                                      unknown_char=True,
                                      base_char=text[position]))
        resume_at = max(found.start(), found.end() - 1) + 1

        groups = found.groupdict()
        letter = groups['letter']
        diacritics = groups['diacritics']

        punctuation = letter in SYMB_PUNCTUATION.symbol2name
        capital_letter = letter in SYMB_UPPER_CASE.symbol2name

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # (3.1) base_char : punctuation, lower case, upper case, other symbols
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        if punctuation:
            table = SYMB_PUNCTUATION
        elif letter in SYMB_LOWER_CASE.symbol2name:
            table = SYMB_LOWER_CASE
        elif capital_letter:
            table = SYMB_UPPER_CASE
        else:
            table = SYMB_OTHER_SYMBOLS
        base_char = table.get_the_name_for_this_symbol(letter)

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # (3.2) contextual_form (only β/ϐ and σ/ς, lower case, are special)
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        if not capital_letter and base_char in lowercase_forms:
            base_char, contextual_form = lowercase_forms[base_char]
        else:
            contextual_form = "initial+medium+final"

        tonos = mekos = pneuma = None
        hypogegrammene = dialutika = False

        if diacritics is not None:
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.3) tonos (τόνος)
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_unique(found, diacritics, SYMB_DIACRITICS__TONOS,
                         "In '{0}' (start={1}, end={2}), τόνος defined several times.")
            tonos = first_present((('τόνος.βαρεῖα', "βαρεῖα"),
                                   ('τόνος.ὀξεῖα', "ὀξεῖα"),
                                   ('τόνος.περισπωμένη', "περισπωμένη")),
                                  diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.4) mekos (μῆκος)
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_unique(found, diacritics, SYMB_DIACRITICS__MEKOS,
                         "In '{0}' (start={1}, end={2}), μῆκος defined several times.")
            mekos = first_present((('μῆκος.μακρόν', "μακρόν"),
                                   ('μῆκος.βραχύ', "βραχύ")),
                                  diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.5) pneuma (πνεῦμα)
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_unique(found, diacritics, SYMB_DIACRITICS__PNEUMA,
                         "In '{0}' (start={1}, end={2}), πνεῦμα defined several times.")
            pneuma = first_present((('πνεῦμα.ψιλὸν', "ψιλὸν"),
                                    ('πνεῦμα.δασὺ', "δασὺ")),
                                   diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.6) hypogegrammene (ὑπογεγραμμένη)
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_unique(found, diacritics, SYMB_DIACRITICS['ὑπογεγραμμένη'],
                         "In '{0}' (start={1}, end={2}), ὑπογεγραμμένη defined several times.")
            hypogegrammene = SYMB_DIACRITICS.are_these_symbols_in_a_string('ὑπογεγραμμένη',
                                                                           diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.7) dialutika (διαλυτικά)
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_unique(found, diacritics, SYMB_DIACRITICS['διαλυτικά'],
                         "In '{0}' (start={1}, end={2}), διαλυτικά defined several times.")
            dialutika = SYMB_DIACRITICS.are_these_symbols_in_a_string('διαλυτικά',
                                                                      diacritics)

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # (3.8) the new character is appended
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        self.append(DCharacterGRC(dstring_object=self,
                                  unknown_char=False,
                                  base_char=base_char,
                                  contextual_form=contextual_form,
                                  punctuation=punctuation,
                                  capital_letter=capital_letter,
                                  tonos=tonos,
                                  pneuma=pneuma,
                                  hypogegrammene=hypogegrammene,
                                  dialutika=dialutika,
                                  mekos=mekos))

    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    # unknown characters lying after the last match (or the whole string
    # if nothing has been matched)
    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    for position in range(resume_at, len(text)):
        self.append(DCharacterGRC(dstring_object=self,
                                  unknown_char=True,
                                  base_char=text[position]))
def init_from_str(self, str_src):
    """
        DStringHBO.init_from_str

        Called by __init__() : fill <self> with one DCharacterHBO per symbol
        read in <str_src>. Nothing is explicitly returned.

        str_src : str

        Raise a DCharsError if, for one symbol, a kind of diacritic is
        counted more than once (more than twice for the cantillation marks).

        STEPS :
        * (1) NFD decomposition of <str_src> (unicodedata.normalize)
        * (2) every symbol is replaced by its default symbol
              (replace_by_the_default_symbols(), one call per symbol table)
        * (3) re.finditer(DStringHBO.pattern) gives the groups 'basechar' and
              'diacritics'; the characters lying outside the matches are
              appended as "unknown" characters.
              * (3.1) contextual_form
              * (3.2) shin_sin_dot
              * (3.3) daghesh_mapiq
              * (3.4) methegh
              * (3.5) specialpoint
              * (3.6) vowel
              * (3.7) raphe
              * (3.8) cantillation_mark
              * (3.9) the new character is appended to <self>
    """
    def check_count(found, marks, symbols, template, limit=1):
        """
            Raise a DCharsError built from <template> if more than <limit>
            symbols of <symbols> are counted in <marks>.
        """
        if number_of_occurences(source_string=marks, symbols=symbols) > limit:
            raise DCharsError(context="DStringHBO.init_from_str",
                              message=template.format(found.string,
                                                      found.start(),
                                                      found.end()))

    def first_named(table, chars, marks):
        """
            Return the name of the first character of <chars> that <table>
            reports as present in <marks>, or None.
        """
        for char in chars:
            name = table.defaultsymbol2name[char]
            if table.are_these_symbols_in_a_string(name=name, string=marks):
                return name
        return None

    # final form of a letter -> letter stored as base_char
    final_forms = {
        "ך": "כ",
        "ם": "מ",
        "ן": "נ",
        "ף": "פ",
        "ץ": "צ",
    }

    #.......................................................................
    # (1) NFD decomposition
    #.......................................................................
    text = unicodedata.normalize('NFD', str_src)

    #.......................................................................
    # (2) default symbols, one symbol table after the other
    #.......................................................................
    for symbols in (SYMB_LETTERS,
                    SYMB_OTHER_SYMBOLS,
                    SYMB_PUNCTUATION,
                    SYMB_VOWELS,
                    SYMB_POINTS,
                    SYMB_SPECIALPOINTS,
                    SYMB_CANTILLATION_MARKS):
        text = symbols.replace_by_the_default_symbols(text)

    #.......................................................................
    # (3) one character per match; <resume_at> is the index following the
    #     last match (0 as long as nothing has been matched).
    #.......................................................................
    resume_at = 0
    for found in re.finditer(DStringHBO.pattern, text):

        # unknown characters lying before this match :
        for position in range(resume_at, found.start()):
            self.append(DCharacterHBO(dstring_object=self,
                                      unknown_char=True,
                                      base_char=text[position]))
        resume_at = max(found.start(), found.end() - 1) + 1

        groups = found.groupdict()
        base_char = groups['basechar']
        diacritics = groups['diacritics']

        # computed before <base_char> is possibly replaced in (3.1) :
        punctuation = base_char in SYMB_PUNCTUATION.symbol2name

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # (3.1) contextual_form
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        if base_char in final_forms:
            base_char = final_forms[base_char]
            contextual_form = "final"
        elif not punctuation:
            contextual_form = "initial+medium+final"
        else:
            contextual_form = None

        shin_sin_dot = specialpoint = vowel = cantillation_mark = None
        daghesh_mapiq = methegh = raphe = False

        if diacritics is not None:
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.2) shin_sin_dot
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_count(found, diacritics, SYMB_DIACRITICS__SHIN_SIN_DOT,
                        "In '{0}' (start={1}, end={2}), shin_sin_dot defined several times.")
            for dot_name in ("HEBREW POINT SHIN DOT", "HEBREW POINT SIN DOT"):
                if SYMB_POINTS.are_these_symbols_in_a_string(dot_name, diacritics):
                    shin_sin_dot = dot_name
                    break

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.3) daghesh_mapiq
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_count(found, diacritics, SYMB_DIACRITICS__DAGHESH_MAPIQ,
                        "In '{0}' (start={1}, end={2}), daghesh_mapiq defined several times.")
            daghesh_mapiq = SYMB_POINTS.are_these_symbols_in_a_string(
                "HEBREW POINT DAGESH OR MAPIQ", diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.4) methegh
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_count(found, diacritics, SYMB_DIACRITICS__METHEGH,
                        "In '{0}' (start={1}, end={2}), methegh defined several times.")
            methegh = SYMB_POINTS.are_these_symbols_in_a_string("HEBREW POINT METEG",
                                                                diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.5) specialpoint
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_count(found, diacritics, SYMB_DIACRITICS__SPECIALPOINTS,
                        "In '{0}' (start={1}, end={2}), specialpoint defined several times.")
            specialpoint = first_named(SYMB_SPECIALPOINTS,
                                       SYMB_DIACRITICS__SPECIALPOINTS,
                                       diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.6) vowel
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_count(found, diacritics, SYMB_DIACRITICS__VOWELS,
                        "In '{0}' (start={1}, end={2}), vowel defined several times.")
            vowel = first_named(SYMB_VOWELS, SYMB_DIACRITICS__VOWELS, diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.7) raphe
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_count(found, diacritics, SYMB_DIACRITICS__RAPHE,
                        "In '{0}' (start={1}, end={2}), raphe defined several times.")
            raphe = SYMB_POINTS.are_these_symbols_in_a_string("HEBREW POINT RAFE",
                                                              diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.8) cantillation_mark : up to two marks are accepted; every
            #       mark found is kept, None if there is no mark at all.
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_count(found, diacritics, SYMB_DIACRITICS__CANTILLATION_MARKS,
                        ("In '{0}' (start={1}, end={2}), "
                         "cantillation marks defined more than two times."),
                        limit=2)
            cmark_names = (SYMB_CANTILLATION_MARKS.defaultsymbol2name[cmark_char]
                           for cmark_char in SYMB_DIACRITICS__CANTILLATION_MARKS)
            cantillation_mark = [
                cmark_name for cmark_name in cmark_names
                if SYMB_CANTILLATION_MARKS.are_these_symbols_in_a_string(name=cmark_name,
                                                                         string=diacritics)
            ] or None

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # (3.9) the new character is appended
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        self.append(DCharacterHBO(dstring_object=self,
                                  unknown_char=False,
                                  base_char=base_char,
                                  contextual_form=contextual_form,
                                  punctuation=punctuation,
                                  shin_sin_dot=shin_sin_dot,
                                  daghesh_mapiq=daghesh_mapiq,
                                  methegh=methegh,
                                  specialpoint=specialpoint,
                                  vowel=vowel,
                                  raphe=raphe,
                                  cantillation_mark=cantillation_mark))

    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    # unknown characters lying after the last match (or the whole string
    # if nothing has been matched)
    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    for position in range(resume_at, len(text)):
        self.append(DCharacterHBO(dstring_object=self,
                                  unknown_char=True,
                                  base_char=text[position]))