def init_from_str(self, str_src):
    """
        DStringHBO.init_from_str

        Function called by __init__(), initialize <self> from <str_src>.

        str_src : str

        Every part of the (normalized) source string matched by
        DStringHBO.pattern becomes a recognized DCharacterHBO; every
        character lying before, between or after the matches is appended as
        an "unknown" DCharacterHBO.

        Raise a DCharsError if, for one recognized character,
        number_of_occurences() reports more than one shin/sin dot,
        daghesh/mapiq, methegh, special point, vowel or raphe, or more than
        two cantillation marks.

        HOW IT WORKS :
        * (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
        * (2) = normalized_src -> (default symbols required) :
        *     replace_by_the_default_symbols() -> normalized_src
        * (3) initialisation from the recognized characters.
        *     re.finditer(DStringHBO.pattern) give the symbols{basechar, diacritics}
        *     (3.1) contextual_form
        *     (3.2) shin_sin_dot
        *     (3.3) daghesh_mapiq
        *     (3.4) methegh
        *     (3.5) specialpoint
        *     (3.6) vowel
        *     (3.7) raphe
        *     (3.8) cantillation_mark
        *     (3.9) we add the new character
    """
    # final letter -> (non-final) base letter, written with escapes so that
    # the key/value order cannot be confused by bidirectional rendering.
    final_forms = {
        "\u05DA": "\u05DB",     # FINAL KAF   -> KAF
        "\u05DD": "\u05DE",     # FINAL MEM   -> MEM
        "\u05DF": "\u05E0",     # FINAL NUN   -> NUN
        "\u05E3": "\u05E4",     # FINAL PE    -> PE
        "\u05E5": "\u05E6",     # FINAL TSADI -> TSADI
    }

    def add_unknown_characters(start, stop):
        """Append normalized_src[start:stop] to <self>, one unknown character each."""
        for index in range(start, stop):
            self.append(DCharacterHBO(dstring_object=self,
                                      unknown_char=True,
                                      base_char=normalized_src[index]))

    def check_occurrences(element, diacritics, symbols, maximum, complaint):
        """Raise a DCharsError if number_of_occurences() returns more than <maximum>."""
        if number_of_occurences(source_string=diacritics, symbols=symbols) > maximum:
            err_msg = "In '{0}' (start={1}, end={2}), " + complaint
            raise DCharsError(context="DStringHBO.init_from_str",
                              message=err_msg.format(element.string,
                                                     element.start(),
                                                     element.end()))

    def names_found(symbols, default_chars, diacritics):
        """Yield, in the order of <default_chars>, the names detected in <diacritics>."""
        for default_char in default_chars:
            name = symbols.defaultsymbol2name[default_char]
            if symbols.are_these_symbols_in_a_string(name=name, string=diacritics):
                yield name

    #.......................................................................
    # (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
    #.......................................................................
    normalized_src = unicodedata.normalize('NFD', str_src)

    #.......................................................................
    # (2) = normalized_src -> (default symbols required) :
    #     replace_by_the_default_symbols() -> normalized_src
    #     (the order of the groups is kept as it is)
    #.......................................................................
    for symbols_group in (SYMB_LETTERS,
                          SYMB_OTHER_SYMBOLS,
                          SYMB_PUNCTUATION,
                          SYMB_VOWELS,
                          SYMB_POINTS,
                          SYMB_SPECIALPOINTS,
                          SYMB_CANTILLATION_MARKS):
        normalized_src = symbols_group.replace_by_the_default_symbols(normalized_src)

    #.......................................................................
    # (3) initialisation from the recognized characters.
    #     re.finditer(DStringHBO.pattern) give the symbols{basechar, diacritics}
    #.......................................................................
    cursor = 0      # index of the first character not yet turned into a DCharacterHBO

    for element in re.finditer(DStringHBO.pattern, normalized_src):

        # unknown character(s) before the first match or between two matches :
        add_unknown_characters(cursor, element.start())
        cursor = element.end()

        data = element.groupdict()
        base_char = data['basechar']
        diacritics = data['diacritics']

        # computed on the character as written, before any final form is
        # replaced by its base letter :
        punctuation = base_char in SYMB_PUNCTUATION.symbol2name

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # (3.1) contextual_form
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        if base_char in final_forms:
            base_char = final_forms[base_char]
            contextual_form = "final"
        elif not punctuation:
            contextual_form = "initial+medium+final"
        else:
            contextual_form = None

        shin_sin_dot = None
        daghesh_mapiq = False
        methegh = False
        specialpoint = None
        vowel = None
        raphe = False
        cantillation_mark = None

        if diacritics is not None:
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.2) shin_sin_dot
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_occurrences(element, diacritics,
                              SYMB_DIACRITICS__SHIN_SIN_DOT, 1,
                              "shin_sin_dot defined several times.")
            for dot_name in ("HEBREW POINT SHIN DOT", "HEBREW POINT SIN DOT"):
                if SYMB_POINTS.are_these_symbols_in_a_string(dot_name, diacritics):
                    shin_sin_dot = dot_name
                    break

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.3) daghesh_mapiq
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_occurrences(element, diacritics,
                              SYMB_DIACRITICS__DAGHESH_MAPIQ, 1,
                              "daghesh_mapiq defined several times.")
            daghesh_mapiq = SYMB_POINTS.are_these_symbols_in_a_string(
                "HEBREW POINT DAGESH OR MAPIQ", diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.4) methegh
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_occurrences(element, diacritics,
                              SYMB_DIACRITICS__METHEGH, 1,
                              "methegh defined several times.")
            methegh = SYMB_POINTS.are_these_symbols_in_a_string(
                "HEBREW POINT METEG", diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.5) specialpoint : the first one found, if any
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_occurrences(element, diacritics,
                              SYMB_DIACRITICS__SPECIALPOINTS, 1,
                              "specialpoint defined several times.")
            specialpoint = next(names_found(SYMB_SPECIALPOINTS,
                                            SYMB_DIACRITICS__SPECIALPOINTS,
                                            diacritics),
                                None)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.6) vowel : the first one found, if any
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_occurrences(element, diacritics,
                              SYMB_DIACRITICS__VOWELS, 1,
                              "vowel defined several times.")
            vowel = next(names_found(SYMB_VOWELS,
                                     SYMB_DIACRITICS__VOWELS,
                                     diacritics),
                         None)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.7) raphe
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_occurrences(element, diacritics,
                              SYMB_DIACRITICS__RAPHE, 1,
                              "raphe defined several times.")
            raphe = SYMB_POINTS.are_these_symbols_in_a_string(
                "HEBREW POINT RAFE", diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.8) cantillation_mark : up to two marks are accepted; the
            #       result is a list of names, or None if no mark is found.
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_occurrences(element, diacritics,
                              SYMB_DIACRITICS__CANTILLATION_MARKS, 2,
                              "cantillation marks defined more than two times.")
            cantillation_mark = list(names_found(SYMB_CANTILLATION_MARKS,
                                                 SYMB_DIACRITICS__CANTILLATION_MARKS,
                                                 diacritics)) or None

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # (3.9) we add the new character
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        self.append(DCharacterHBO(dstring_object=self,
                                  unknown_char=False,
                                  base_char=base_char,
                                  contextual_form=contextual_form,
                                  punctuation=punctuation,
                                  shin_sin_dot=shin_sin_dot,
                                  daghesh_mapiq=daghesh_mapiq,
                                  methegh=methegh,
                                  specialpoint=specialpoint,
                                  vowel=vowel,
                                  raphe=raphe,
                                  cantillation_mark=cantillation_mark))

    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    # we add the final unknown characters : everything after the last match,
    # or the whole string if nothing has been recognized.
    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    add_unknown_characters(cursor, len(normalized_src))
def get_sourcestr_representation(self):
    """
        DCharacterHBO.get_sourcestr_representation

        Return a string built from the attributes of <self>.

        * unknown character : return UNKNOWN_CHAR_SYMBOL if the option
          "anonymize the unknown characters" of the parent DString is 'yes',
          otherwise return <self.base_char> unchanged.
        * recognized character : return the base character (replaced by its
          final form if <self.contextual_form> is "final") followed by the
          default symbols of its diacritics, in this order : shin/sin dot,
          daghesh/mapiq, vowel, methegh, raphe, special point, cantillation
          mark(s). The replacements listed in COMPLETE_NORMALIZE_NFC are then
          applied to the result.
    """
    # base letter -> final letter, written with escapes so that the
    # key/value order cannot be confused by bidirectional rendering.
    final_forms = {
        "\u05DB": "\u05DA",     # KAF   -> FINAL KAF
        "\u05DE": "\u05DD",     # MEM   -> FINAL MEM
        "\u05E0": "\u05DF",     # NUN   -> FINAL NUN
        "\u05E4": "\u05E3",     # PE    -> FINAL PE
        "\u05E6": "\u05E5",     # TSADI -> FINAL TSADI
    }

    #.......................................................................
    # unknown char ? Nothing to do :
    #.......................................................................
    if self.unknown_char:
        if self.dstring_object.options["anonymize the unknown characters"] == 'yes':
            return UNKNOWN_CHAR_SYMBOL
        return self.base_char

    #.......................................................................
    # ok, the function can analyse <self> :
    #.......................................................................
    parts = []

    if self.base_char is not None:
        base_char = self.base_char
        if self.contextual_form == "final":
            # letters without a dedicated final form are kept as they are.
            base_char = final_forms.get(base_char, base_char)
        parts.append(base_char)

    if self.shin_sin_dot is not None:
        parts.append(SYMB_POINTS.get_default_symbol(self.shin_sin_dot))

    if self.daghesh_mapiq:
        parts.append(DEFAULTSYMB__DAGHESHMAPIQ)

    if self.vowel is not None:
        parts.append(SYMB_VOWELS.get_default_symbol(self.vowel))

    if self.methegh:
        parts.append(DEFAULTSYMB__METEG)

    if self.raphe:
        parts.append(DEFAULTSYMB__RAFE)

    if self.specialpoint is not None:
        parts.append(SYMB_SPECIALPOINTS.get_default_symbol(self.specialpoint))

    if self.cantillation_mark is not None:
        for cmark in self.cantillation_mark:
            parts.append(SYMB_CANTILLATION_MARKS.get_default_symbol(cmark))

    res = "".join(parts)

    # composition with COMPLETE_NORMALIZE_NFC
    # we don't use NFC which "shuffle" the elements in an order incompatible
    # with our code.
    for src, dest in COMPLETE_NORMALIZE_NFC:
        res = res.replace(src, dest)

    return res