}
# Reverse lookup table built from DIACRITICS by invertdict().
DIACRITICS_INVERSED = invertdict(DIACRITICS)

################################################################################
# transliteration's patterns :
#   PATTERN is used to cut one complex character into its elements.
#   PATTERN2 is used to cut several complex characters into a list of
#   complex characters.
################################################################################
# In order to build the pattern strings for the regexes we have to SORT the
# result : (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will.
# We delete the possible duplicates in the resulting string.
#
# Every constant below is the result of isort_a_lstrings_bylen_nodup() applied
# to a list of regex-ready strings; each list is meant to be joined into one
# alternation group of the transliteration pattern.

# re.escape()'d DIACRITICS entries 'ψιλὸν' and 'δασὺ'.
PNEUMA = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS['ψιλὸν']),
                                        re.escape(DIACRITICS['δασὺ'])] )

# re.escape()'d DIACRITICS entries 'βαρεῖα', 'ὀξεῖα' and 'περισπωμένη'.
TONOS = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS['βαρεῖα']),
                                       re.escape(DIACRITICS['ὀξεῖα']),
                                       re.escape(DIACRITICS['περισπωμένη'])] )

# Keys of the four *_INVERSED tables, each passed through regexstring_list()
# (presumably a per-item re.escape -- confirm against its definition).
LETTERS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(LOWER_CASE_INVERSED.keys())) + \
    regexstring_list(tuple(UPPER_CASE_INVERSED.keys())) + \
    regexstring_list(tuple(OTHER_SYMBOLS_INVERSED.keys())) + \
    regexstring_list(tuple(PUNCTUATION_INVERSED.keys())) )

# Single-entry lists: one DIACRITICS value each.
HYPOGEGRAMMENE = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS['ὑπογεγραμμένη']),] )

DIALUTIKA = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS['διαλυτικά']),] )

MEKOS = isort_a_lstrings_bylen_nodup(
"διαλυτικά": ":",
}
# Reverse lookup table built from DIACRITICS by invertdict().
DIACRITICS_INVERSED = invertdict(DIACRITICS)

################################################################################
# transliteration's patterns :
#   PATTERN is used to cut one complex character into its elements.
#   PATTERN2 is used to cut several complex characters into a list of
#   complex characters.
################################################################################
# In order to build the pattern strings for the regexes we have to SORT the
# result : (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will.
# We delete the possible duplicates in the resulting string.
#
# Every constant below is the result of isort_a_lstrings_bylen_nodup() applied
# to a list of regex-ready strings; each list is meant to be joined into one
# alternation group of the transliteration pattern.

# re.escape()'d DIACRITICS entries "ψιλὸν" and "δασὺ".
PNEUMA = isort_a_lstrings_bylen_nodup([re.escape(DIACRITICS["ψιλὸν"]), re.escape(DIACRITICS["δασὺ"])])

# re.escape()'d DIACRITICS entries "βαρεῖα", "ὀξεῖα" and "περισπωμένη".
TONOS = isort_a_lstrings_bylen_nodup(
    [re.escape(DIACRITICS["βαρεῖα"]), re.escape(DIACRITICS["ὀξεῖα"]), re.escape(DIACRITICS["περισπωμένη"])]
)

# Keys of the four *_INVERSED tables, each passed through regexstring_list()
# (presumably a per-item re.escape -- confirm against its definition).
LETTERS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(LOWER_CASE_INVERSED.keys()))
    + regexstring_list(tuple(UPPER_CASE_INVERSED.keys()))
    + regexstring_list(tuple(OTHER_SYMBOLS_INVERSED.keys()))
    + regexstring_list(tuple(PUNCTUATION_INVERSED.keys()))
)

# Single-entry lists: one DIACRITICS value each ("διαλυτικά" is ":" per the
# dict entry at the top of this excerpt).
HYPOGEGRAMMENE = isort_a_lstrings_bylen_nodup([re.escape(DIACRITICS["ὑπογεγραμμένη"])])
DIALUTIKA = isort_a_lstrings_bylen_nodup([re.escape(DIACRITICS["διαλυτικά"])])

# re.escape()'d DIACRITICS entries "μακρόν" and "βραχύ".
MEKOS = isort_a_lstrings_bylen_nodup([re.escape(DIACRITICS["μακρόν"]), re.escape(DIACRITICS["βραχύ"])])

# Pattern text; the visible part opens the outer group and an optional named
# group "trans_pneuma" whose alternatives are substituted for {0}.
PATTERN_TXT = (
    "((?P<trans_pneuma>({0}))?"
# Reverse lookup table built from DIACRITICS by invertdict().
DIACRITICS_INVERSED = invertdict(DIACRITICS)

################################################################################
# transliteration's patterns :
#   PATTERN is used to cut one complex character into its elements.
#   PATTERN2 is used to cut several complex characters into a list of
#   complex characters.
################################################################################
# The alternatives of a regex group are tried from left to right, so the lists
# below have to be SORTED before being joined with "|" :
# (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will.
# Possible duplicates are removed from the result as well.

# Every base character : keys of all the *_INVERSED tables, in this order.
BASE_CHAR = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(CONSONANTS_INVERSED.keys()))
    + regexstring_list(tuple(CONSONANTS_WITH_NUKTA_INVERSED.keys()))
    + regexstring_list(tuple(VOWELS_INVERSED.keys()))
    + regexstring_list(tuple(VOWELS_IN_HIATUS_INVERSED.keys()))
    + regexstring_list(tuple(OTHER_SYMBOLS_INVERSED.keys()))
    + regexstring_list(tuple(PUNCTUATION_INVERSED.keys()))
)

# One-entry list : the anudatta stress sign.
ANUDATTA = isort_a_lstrings_bylen_nodup(
    [re.escape(DIACRITICS["DEVANAGARI STRESS SIGN ANUDATTA"])]
)

# One-entry list : the udatta stress sign.
ACCENT = isort_a_lstrings_bylen_nodup(
    [re.escape(DIACRITICS["DEVANAGARI STRESS SIGN UDATTA"])]
)

# Two entries : anusvara and candrabindu.
ANUSVARA_CANDRABINDU = isort_a_lstrings_bylen_nodup(
    [
        re.escape(DIACRITICS["DEVANAGARI SIGN ANUSVARA"]),
        re.escape(DIACRITICS["DEVANAGARI SIGN CANDRABINDU"]),
    ]
)
"stress2" : "/",
"makron" : "_",
"upperdot" : "+",
}
# Reverse lookup table built from DIACRITICS by invertdict().
DIACRITICS_INVERSED = invertdict(DIACRITICS)

################################################################################
# transliteration's patterns :
#   PATTERN is used to cut one complex character into its elements.
#   PATTERN2 is used to cut several complex characters into a list of
#   complex characters.
################################################################################
# In order to build the pattern strings for the regexes we have to SORT the
# result : (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will.
# We delete the possible duplicates in the resulting string.
#
# Every constant below is the result of isort_a_lstrings_bylen_nodup() applied
# to a list of regex-ready strings; each list is meant to be joined into one
# alternation group of the transliteration pattern.

# re.escape()'d DIACRITICS['upperdot'] ("+" per the dict tail above).
UPPERDOT = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS['upperdot'])] )

# re.escape()'d DIACRITICS['makron'] ("_" per the dict tail above).
MAKRON = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS['makron']), ])

# re.escape()'d DIACRITICS entries 'stressM1', 'stress1' and 'stress2'.
STRESS = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS['stressM1']),
                                        re.escape(DIACRITICS['stress1']),
                                        re.escape(DIACRITICS['stress2']), ])

# Keys of the three *_INVERSED tables, each passed through regexstring_list()
# (presumably a per-item re.escape -- confirm against its definition).
LETTERS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(LOWER_CASE_INVERSED.keys())) + \
    regexstring_list(tuple(UPPER_CASE_INVERSED.keys())) + \
    regexstring_list(tuple(PUNCTUATION_INVERSED.keys())) )

# Pattern text : a mandatory "base_char" group followed by the optional named
# groups "trans_stress", "trans_makron" and "trans_upperdot". The placeholders
# {0}..{3} are presumably filled by the continuation of this statement, which
# lies beyond this excerpt.
PATTERN_TXT = "((?P<base_char>({0}))" \
              "(?P<trans_stress>({1}))?" \
              "(?P<trans_makron>({2}))?" \
              "(?P<trans_upperdot>({3}))?" \
# E.g. CANTILATIONMARKS["HEBREW ACCENT ZAQEF GADOL"] = <HEBREW ACCENT ZAQEF GADOL> CANTILATIONMARKS = { cmark:"<"+cmark+">" for cmark in symbols.SYMB_CANTILLATION_MARKS.keys() } CANTILATIONMARKS_INVERSED = invertdict( CANTILATIONMARKS ) ################################################################################ # transliteration's patterns : # PATTERN is used to cut one complex characters into its elements. # PATTERN2 is used to cut several complex characters into a list of complex characters. ################################################################################ # in order to build the pattern strings for the regexes we have to SORT the # result : (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will. # We delete the possible duplicates in the resulting string. T_BASECHARS = isort_a_lstrings_bylen_nodup( regexstring_list(tuple(LETTERS_INVERSED.keys())) + \ regexstring_list(tuple(OTHER_SYMBOLS_INVERSED.keys())) + \ regexstring_list(tuple(PUNCTUATION_INVERSED.keys())) ) T_VOWELS = isort_a_lstrings_bylen_nodup( regexstring_list(tuple(VOWELS_INVERSED.keys()))) T_METHEGH = isort_a_lstrings_bylen_nodup( [re.escape(POINTS["HEBREW POINT METEG"]),] ) T_RAPHE = isort_a_lstrings_bylen_nodup( [re.escape(POINTS["HEBREW POINT RAFE"]),] ) T_SPECIALPOINTS = isort_a_lstrings_bylen_nodup( regexstring_list(tuple(SPECIALPOINTS_INVERSED.keys())) ) T_CMARKS = isort_a_lstrings_bylen_nodup( regexstring_list(tuple(CANTILATIONMARKS_INVERSED.keys())) ) PATTERN_TXT = "((?P<base_char>({0}))" \ "(?P<trans_vowel>({1}))?" \ "(?P<trans_methegh>({2}))?" \
# Diacritic name -> transliteration string.
DIACRITICS = {"stress1": "\\", "stress2": "/", "stress12": "/\\", "stress3": "+:", "cedilla": "+c"}
# Reverse lookup table built from DIACRITICS by invertdict().
DIACRITICS_INVERSED = invertdict(DIACRITICS)

################################################################################
# transliteration's patterns :
#   PATTERN is used to cut one complex character into its elements.
#   PATTERN2 is used to cut several complex characters into a list of
#   complex characters.
################################################################################
# In order to build the pattern strings for the regexes we have to SORT the
# result : (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will.
# We delete the possible duplicates in the resulting string.

# The ordering issue described above applies here : "stress12" ("/\\") begins
# with the same character as "stress2" ("/").
STRESS = isort_a_lstrings_bylen_nodup(
    [
        re.escape(DIACRITICS["stress1"]),
        re.escape(DIACRITICS["stress2"]),
        re.escape(DIACRITICS["stress12"]),
        re.escape(DIACRITICS["stress3"]),
    ]
)

# Single-entry list : re.escape()'d "+c".
CEDILLA = isort_a_lstrings_bylen_nodup([re.escape(DIACRITICS["cedilla"])])

# Keys of the three *_INVERSED tables, each passed through regexstring_list()
# (presumably a per-item re.escape -- confirm against its definition).
LETTERS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(LOWER_CASE_INVERSED.keys()))
    + regexstring_list(tuple(UPPER_CASE_INVERSED.keys()))
    + regexstring_list(tuple(PUNCTUATION_INVERSED.keys()))
)

# Pattern text : a mandatory "base_char" group followed by the optional named
# groups "trans_stress" and "trans_cedilla"; {0}..{2} are filled by the
# .format() call whose arguments lie beyond this excerpt.
PATTERN_TXT = (
    "((?P<base_char>({0}))"
    "(?P<trans_stress>({1}))?"
    "(?P<trans_cedilla>({2}))?"
    ")".format(
"short" : "*",
"long" : "_",
"diaeresis" : "+",
}
# Reverse lookup table built from DIACRITICS by invertdict().
DIACRITICS_INVERSED = invertdict(DIACRITICS)

################################################################################
# transliteration's patterns :
#   PATTERN is used to cut one complex character into its elements.
#   PATTERN2 is used to cut several complex characters into a list of
#   complex characters.
################################################################################
# In order to build the pattern strings for the regexes we have to SORT the
# result : (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will.
# We delete the possible duplicates in the resulting string.
#
# Every constant below is the result of isort_a_lstrings_bylen_nodup() applied
# to a list of regex-ready strings; each list is meant to be joined into one
# alternation group of the transliteration pattern.

# re.escape()'d DIACRITICS['diaeresis'] ("+" per the dict tail above).
DIAERESIS = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS['diaeresis'])] )

# re.escape()'d DIACRITICS entries 'short' ("*") and 'long' ("_").
LENGTH = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS['short']),
                                        re.escape(DIACRITICS['long'])] )

# re.escape()'d DIACRITICS['stress'] (its value is defined outside this excerpt).
STRESS = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS['stress']),])

# Keys of the three *_INVERSED tables, each passed through regexstring_list()
# (presumably a per-item re.escape -- confirm against its definition).
LETTERS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(LOWER_CASE_INVERSED.keys())) + \
    regexstring_list(tuple(UPPER_CASE_INVERSED.keys())) + \
    regexstring_list(tuple(PUNCTUATION_INVERSED.keys())) )

# Pattern text : a mandatory "base_char" group followed by the optional named
# groups "trans_stress", "trans_length" and "trans_diaeresis". {0} receives the
# LETTERS alternatives joined with "|"; the remaining .format() arguments lie
# beyond this excerpt.
PATTERN_TXT = "((?P<base_char>({0}))" \
              "(?P<trans_stress>({1}))?" \
              "(?P<trans_length>({2}))?" \
              "(?P<trans_diaeresis>({3}))?" \
              ")".format("|".join(prepare_list_to_strformat(LETTERS)),
#
#   +1 (text->transliteration)
#   -1 (transliteration->text)
#
################################################################################
AVAILABLE_DIRECTIONS = (-1, +1)

################################################################################
# transliteration's patterns :
################################################################################
# In order to build the pattern strings for the regexes we have to SORT the
# result : (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will.
# We delete the possible duplicates in the resulting string.
#
# Every TRANS_* constant below is the result of isort_a_lstrings_bylen_nodup()
# applied to the output of regexstring_list(), which is always given a tuple
# of transliteration strings.

# The two literal symbols "+" and ".".
TRANS_DOT_OR_PLUS = isort_a_lstrings_bylen_nodup(
    regexstring_list(('+', '.'))
)

# Keys of OTHER_SYMBOLS_INVERSED and PUNCTUATION_INVERSED.
TRANS_PUNCTUATION_AND_OTHER_SYMBOL = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(OTHER_SYMBOLS_INVERSED.keys()))
    + regexstring_list(tuple(PUNCTUATION_INVERSED.keys()))
)

# Keys of CONSONANTS_INVERSED and VOWELS_INVERSED.
TRANS_CONSONANTS_AND_VOWELS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(CONSONANTS_INVERSED.keys()))
    + regexstring_list(tuple(VOWELS_INVERSED.keys()))
)

# NOTE: the trailing comma is required. "(x)" is just x, so without it
# regexstring_list() receives a bare string instead of a 1-tuple; assuming it
# iterates over its argument, a multi-character transliteration would then be
# split into single characters. For a one-character value the result is the
# same as before.
TRANS_RNAM_BCAD = isort_a_lstrings_bylen_nodup(
    regexstring_list((DIACRITICS['SIGN RNAM BCAD'],))
)

# Same 1-tuple form as TRANS_RNAM_BCAD.
TRANS_HALANTA = isort_a_lstrings_bylen_nodup(
    regexstring_list((DIACRITICS['MARK HALANTA'],))
)

# Three DIACRITICS entries grouped into one alternation list.
TRANS_ANUSVARA_CANDRABINDU = isort_a_lstrings_bylen_nodup(
    regexstring_list(
        (
            DIACRITICS['SIGN RJES SU NGA RO'],
            DIACRITICS['SIGN NYI ZLA NAA DA'],
            DIACRITICS['SIGN SNA LDAN'],
        )
    )
)
################################################################################
# transliteration's patterns :
#   PATTERN is used to cut one complex character into its elements.
#   PATTERN2 is used to cut several complex characters into a list of
#   complex characters.
################################################################################
# The alternatives of a regex group are tried from left to right, so the list
# below has to be SORTED before being joined with "|" :
# (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will.
# Possible duplicates are removed from the result as well.

# Every transliterated symbol : keys of CHOONPU and of all the *_INVERSED
# tables, in this order.
LETTERS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(CHOONPU.keys()))
    + regexstring_list(tuple(HIRAGANA_INVERSED.keys()))
    + regexstring_list(tuple(HIRAGANA_DAKUTEN_INVERSED.keys()))
    + regexstring_list(tuple(HIRAGANA_HANDAKUTEN_INVERSED.keys()))
    + regexstring_list(tuple(KATAKANA_INVERSED.keys()))
    + regexstring_list(tuple(KATAKANA_DAKUTEN_INVERSED.keys()))
    + regexstring_list(tuple(KATAKANA_HANDAKUTEN_INVERSED.keys()))
    + regexstring_list(tuple(OTHER_SYMBOLS_INVERSED.keys()))
    + regexstring_list(tuple(PUNCTUATION_INVERSED.keys()))
)

# One named group, "base_char", holding every alternative of LETTERS. The two
# chained replace() calls reverse the effect of prepare_list_to_strformat() by
# turning "{{" back into "{" and "}}" back into "}".
PATTERN_TXT = (
    "(?P<base_char>({0}))"
    .format("|".join(prepare_list_to_strformat(LETTERS)))
    .replace("{{", "{")
    .replace("}}", "}")
)
PATTERN = re.compile(PATTERN_TXT)

# Same alternatives inside a plain (unnamed) group.
PATTERN_TXT2 = "({0})".format("|".join(prepare_list_to_strformat(LETTERS)))
# we reverse the effect of prepare_list_to_strformat()