################################################################################ # transliteration's patterns : # PATTERN is used to cut one complex characters into its elements. # PATTERN2 is used to cut several complex characters into a list of complex characters. ################################################################################ # in order to build the pattern strings for the regexes we have to SORT the # result : (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will. # We delete the possible duplicates in the resulting string. PNEUMA = isort_a_lstrings_bylen_nodup([re.escape(DIACRITICS["ψιλὸν"]), re.escape(DIACRITICS["δασὺ"])]) TONOS = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS["βαρεῖα"]), re.escape(DIACRITICS["ὀξεῖα"]), re.escape(DIACRITICS["περισπωμένη"])] ) LETTERS = isort_a_lstrings_bylen_nodup( regexstring_list(tuple(LOWER_CASE_INVERSED.keys())) + regexstring_list(tuple(UPPER_CASE_INVERSED.keys())) + regexstring_list(tuple(OTHER_SYMBOLS_INVERSED.keys())) + regexstring_list(tuple(PUNCTUATION_INVERSED.keys())) ) HYPOGEGRAMMENE = isort_a_lstrings_bylen_nodup([re.escape(DIACRITICS["ὑπογεγραμμένη"])]) DIALUTIKA = isort_a_lstrings_bylen_nodup([re.escape(DIACRITICS["διαλυτικά"])]) MEKOS = isort_a_lstrings_bylen_nodup([re.escape(DIACRITICS["μακρόν"]), re.escape(DIACRITICS["βραχύ"])]) PATTERN_TXT = ( "((?P<trans_pneuma>({0}))?" "(?P<trans_tonos>({1}))?" "(?P<base_char>({2}))" "(?P<trans_hypogegrammene>({3}))?" "(?P<trans_dialutika>({4}))?" "(?P<trans_mekos>({5}))?)".format(
# PATTERN2 is used to split a run of complex characters into a list of complex characters.
################################################################################
# The alternatives of every regex group have to be SORTED before being joined:
# (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will.
# Possible duplicates are removed from the resulting lists as well.

# single-alternative lists : the transliterations of the upper dot and of the makron.
UPPERDOT = isort_a_lstrings_bylen_nodup([re.escape(DIACRITICS["upperdot"])])
MAKRON = isort_a_lstrings_bylen_nodup([re.escape(DIACRITICS["makron"])])

# the three stress marks known to DIACRITICS.
STRESS = isort_a_lstrings_bylen_nodup(
    list(
        map(
            re.escape,
            (DIACRITICS["stressM1"], DIACRITICS["stress1"], DIACRITICS["stress2"]),
        )
    )
)

# base characters : keys of the inverted lower case, upper case and punctuation tables.
LETTERS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(LOWER_CASE_INVERSED.keys()))
    + regexstring_list(tuple(UPPER_CASE_INVERSED.keys()))
    + regexstring_list(tuple(PUNCTUATION_INVERSED.keys()))
)

# One complex character = a mandatory base character followed by an optional
# stress, an optional makron and an optional upper dot, in that order.
PATTERN_TXT = (
    "((?P<base_char>({0}))"
    "(?P<trans_stress>({1}))?"
    "(?P<trans_makron>({2}))?"
    "(?P<trans_upperdot>({3}))?"
    ")"
).format(
    *(
        "|".join(prepare_list_to_strformat(alternatives))
        for alternatives in (LETTERS, STRESS, MAKRON, UPPERDOT)
    )
)

# we revert the effect of prepare_list_to_strformat() : doubled braces are
# collapsed back into single ones, opening braces first.
PATTERN_TXT = PATTERN_TXT.replace("{{", "{").replace("}}", "}")
# Inverted copies of the symbol tables, built with invertdict(); the keys of
# the inverted punctuation / other-symbol tables feed the regex lists below.
OTHER_SYMBOLS_INVERSED = invertdict(OTHER_SYMBOLS)
PUNCTUATION_INVERSED = invertdict(PUNCTUATION)
DIACRITICS_INVERSED = invertdict(DIACRITICS)

################################################################################
# transliteration's patterns :
# PATTERN is used to cut one complex character into its elements.
# PATTERN2 is used to cut several complex characters into a list of complex characters.
################################################################################
# In order to build the pattern strings for the regexes we have to SORT the
# result : (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will.
# We delete the possible duplicates in the resulting string.

# Base characters: keys of the inverted consonant, consonant-with-nukta, vowel,
# vowel-in-hiatus, other-symbol and punctuation tables, merged into one list.
BASE_CHAR = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(CONSONANTS_INVERSED.keys())) + \
    regexstring_list(tuple(CONSONANTS_WITH_NUKTA_INVERSED.keys())) + \
    regexstring_list(tuple(VOWELS_INVERSED.keys())) + \
    regexstring_list(tuple(VOWELS_IN_HIATUS_INVERSED.keys())) + \
    regexstring_list(tuple(OTHER_SYMBOLS_INVERSED.keys())) + \
    regexstring_list(tuple(PUNCTUATION_INVERSED.keys())) )

# Single-alternative lists: the escaped transliterations of the anudatta and
# udatta stress signs.
ANUDATTA = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS['DEVANAGARI STRESS SIGN ANUDATTA']),])
ACCENT = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS['DEVANAGARI STRESS SIGN UDATTA']), ])
# NOTE(review): the rest of this list lies outside this excerpt; only the
# anusvara entry is visible here.
ANUSVARA_CANDRABINDU = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS['DEVANAGARI SIGN ANUSVARA']),
# E.g. CANTILATIONMARKS["HEBREW ACCENT ZAQEF GADOL"] = <HEBREW ACCENT ZAQEF GADOL> CANTILATIONMARKS = { cmark:"<"+cmark+">" for cmark in symbols.SYMB_CANTILLATION_MARKS.keys() } CANTILATIONMARKS_INVERSED = invertdict( CANTILATIONMARKS ) ################################################################################ # transliteration's patterns : # PATTERN is used to cut one complex characters into its elements. # PATTERN2 is used to cut several complex characters into a list of complex characters. ################################################################################ # in order to build the pattern strings for the regexes we have to SORT the # result : (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will. # We delete the possible duplicates in the resulting string. T_BASECHARS = isort_a_lstrings_bylen_nodup( regexstring_list(tuple(LETTERS_INVERSED.keys())) + \ regexstring_list(tuple(OTHER_SYMBOLS_INVERSED.keys())) + \ regexstring_list(tuple(PUNCTUATION_INVERSED.keys())) ) T_VOWELS = isort_a_lstrings_bylen_nodup( regexstring_list(tuple(VOWELS_INVERSED.keys()))) T_METHEGH = isort_a_lstrings_bylen_nodup( [re.escape(POINTS["HEBREW POINT METEG"]),] ) T_RAPHE = isort_a_lstrings_bylen_nodup( [re.escape(POINTS["HEBREW POINT RAFE"]),] ) T_SPECIALPOINTS = isort_a_lstrings_bylen_nodup( regexstring_list(tuple(SPECIALPOINTS_INVERSED.keys())) ) T_CMARKS = isort_a_lstrings_bylen_nodup( regexstring_list(tuple(CANTILATIONMARKS_INVERSED.keys())) ) PATTERN_TXT = "((?P<base_char>({0}))" \ "(?P<trans_vowel>({1}))?" \
# +1 (text->transliteration)
# -1 (transliteration->text)
#
################################################################################
AVAILABLE_DIRECTIONS = (-1, +1)

################################################################################
# transliteration's patterns :
################################################################################
# In order to build the pattern strings for the regexes we have to SORT the
# result : (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will.
# We delete the possible duplicates in the resulting string.

# the two literal separators '+' and '.', escaped for use in a regex.
TRANS_DOT_OR_PLUS = isort_a_lstrings_bylen_nodup(
    regexstring_list(('+', '.'))
)

# keys of the inverted other-symbol and punctuation tables.
TRANS_PUNCTUATION_AND_OTHER_SYMBOL = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(OTHER_SYMBOLS_INVERSED.keys()))
    + regexstring_list(tuple(PUNCTUATION_INVERSED.keys()))
)

# keys of the inverted consonant and vowel tables.
TRANS_CONSONANTS_AND_VOWELS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(CONSONANTS_INVERSED.keys()))
    + regexstring_list(tuple(VOWELS_INVERSED.keys()))
)

# A one-element tuple needs its trailing comma : "(x)" is just "x". Without
# the comma regexstring_list() received a bare string instead of a tuple of
# strings like every other call in this section; if it iterates over its
# argument (assumed - confirm against its definition), a transliteration longer
# than one character would have been split into single characters.
TRANS_RNAM_BCAD = isort_a_lstrings_bylen_nodup(
    regexstring_list((DIACRITICS['SIGN RNAM BCAD'],))
)

TRANS_HALANTA = isort_a_lstrings_bylen_nodup(
    regexstring_list((DIACRITICS['MARK HALANTA'],))
)

# the three signs grouped under the anusvara/candrabindu heading.
TRANS_ANUSVARA_CANDRABINDU = isort_a_lstrings_bylen_nodup(
    regexstring_list(
        (
            DIACRITICS['SIGN RJES SU NGA RO'],
            DIACRITICS['SIGN NYI ZLA NAA DA'],
            DIACRITICS['SIGN SNA LDAN'],
        )
    )
)
("[-]tsub" , "bb"), ("[-]tsup" , "pp"), ))

################################################################################
# transliteration's patterns :
# PATTERN is used to cut one complex character into its elements.
# PATTERN2 is used to cut several complex characters into a list of complex characters.
################################################################################
# In order to build the pattern strings for the regexes we have to SORT the
# result : (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will.
# We delete the possible duplicates in the resulting string.

# Every transliterated unit that may appear as a base character: the keys of
# CHOONPU, of the inverted hiragana/katakana tables (plain, dakuten and
# handakuten variants) and of the inverted other-symbol and punctuation tables.
# NOTE(review): CHOONPU is used directly whereas every other table is an
# *_INVERSED one - confirm that this is intended.
LETTERS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(CHOONPU.keys())) + \
    regexstring_list(tuple(HIRAGANA_INVERSED.keys())) + \
    regexstring_list(tuple(HIRAGANA_DAKUTEN_INVERSED.keys())) + \
    regexstring_list(tuple(HIRAGANA_HANDAKUTEN_INVERSED.keys())) + \
    regexstring_list(tuple(KATAKANA_INVERSED.keys())) + \
    regexstring_list(tuple(KATAKANA_DAKUTEN_INVERSED.keys())) + \
    regexstring_list(tuple(KATAKANA_HANDAKUTEN_INVERSED.keys())) + \
    regexstring_list(tuple(OTHER_SYMBOLS_INVERSED.keys())) + \
    regexstring_list(tuple(PUNCTUATION_INVERSED.keys())) )

# A single named group, <base_char>, whose alternatives are the LETTERS joined
# with "|".
PATTERN_TXT = "(?P<base_char>({0}))".format("|".join(prepare_list_to_strformat(LETTERS)),)

# we revert the effect of prepare_list_to_strformat() : '{{' and '}}' are
# collapsed back into '{' and '}'.
PATTERN_TXT = PATTERN_TXT.replace('{{', '{')
PATTERN_TXT = PATTERN_TXT.replace('}}', '}')

# compiled once at import time.
PATTERN = re.compile(PATTERN_TXT)