Пример #1
0
                regexstring_list(tuple(OTHER_SYMBOLS_INVERSED.keys())) + \
                regexstring_list(tuple(PUNCTUATION_INVERSED.keys())) )
# Escaped regex alternatives for each optional diacritic slot of the pattern.
# isort_a_lstrings_bylen_nodup() presumably orders them by length and drops
# duplicates (going by its name) -- confirm against its definition.
HYPOGEGRAMMENE = isort_a_lstrings_bylen_nodup(
    [re.escape(DIACRITICS['ὑπογεγραμμένη'])])
DIALUTIKA = isort_a_lstrings_bylen_nodup(
    [re.escape(DIACRITICS['διαλυτικά'])])
MEKOS = isort_a_lstrings_bylen_nodup(
    [re.escape(DIACRITICS[name]) for name in ('μακρόν', 'βραχύ')])

# One named group per component; every component except the base character is
# optional. Each "{n}" placeholder receives the "|"-joined alternatives of the
# matching list, in the order given below.
PATTERN_TXT = (
    "((?P<trans_pneuma>({0}))?"
    "(?P<trans_tonos>({1}))?"
    "(?P<base_char>({2}))"
    "(?P<trans_hypogegrammene>({3}))?"
    "(?P<trans_dialutika>({4}))?"
    "(?P<trans_mekos>({5}))?)"
).format(
    *(
        "|".join(prepare_list_to_strformat(alternatives))
        for alternatives in (PNEUMA, TONOS, LETTERS,
                             HYPOGEGRAMMENE, DIALUTIKA, MEKOS)
    )
)
# Undo the brace doubling introduced by prepare_list_to_strformat().
PATTERN_TXT = PATTERN_TXT.replace('{{', '{').replace('}}', '}')
PATTERN = re.compile(PATTERN_TXT)

PATTERN_TXT2 = "(({0})?" \
               "({1})?" \
               "({2})" \
               "({3})?" \
Пример #2
0
    + regexstring_list(tuple(UPPER_CASE_INVERSED.keys()))
    + regexstring_list(tuple(OTHER_SYMBOLS_INVERSED.keys()))
    + regexstring_list(tuple(PUNCTUATION_INVERSED.keys()))
)
# Escaped regex alternatives for each optional diacritic slot of the pattern.
# isort_a_lstrings_bylen_nodup() presumably orders them by length and drops
# duplicates (going by its name) -- confirm against its definition.
HYPOGEGRAMMENE = isort_a_lstrings_bylen_nodup(
    [re.escape(DIACRITICS[key]) for key in ("ὑπογεγραμμένη",)]
)
DIALUTIKA = isort_a_lstrings_bylen_nodup(
    [re.escape(DIACRITICS[key]) for key in ("διαλυτικά",)]
)
MEKOS = isort_a_lstrings_bylen_nodup(
    [re.escape(DIACRITICS[key]) for key in ("μακρόν", "βραχύ")]
)

# Template pieces, one named group per component; only base_char is mandatory.
PATTERN_TXT = "".join(
    [
        "((?P<trans_pneuma>({0}))?",
        "(?P<trans_tonos>({1}))?",
        "(?P<base_char>({2}))",
        "(?P<trans_hypogegrammene>({3}))?",
        "(?P<trans_dialutika>({4}))?",
        "(?P<trans_mekos>({5}))?)",
    ]
).format(
    *(
        "|".join(prepare_list_to_strformat(group))
        for group in (PNEUMA, TONOS, LETTERS, HYPOGEGRAMMENE, DIALUTIKA, MEKOS)
    )
)
# Undo the brace doubling introduced by prepare_list_to_strformat().
PATTERN_TXT = PATTERN_TXT.replace("{{", "{").replace("}}", "}")
PATTERN = re.compile(PATTERN_TXT)

PATTERN_TXT2 = (
    "(({0})?"
    "({1})?"
Пример #3
0
                [re.escape(DIACRITICS['DEVANAGARI STRESS SIGN ANUDATTA']),])

# Escaped regex alternatives for the accent slot (udatta sign only).
ACCENT = isort_a_lstrings_bylen_nodup(
    [re.escape(DIACRITICS['DEVANAGARI STRESS SIGN UDATTA'])])

# Escaped regex alternatives for the anusvara / candrabindu slot.
ANUSVARA_CANDRABINDU = isort_a_lstrings_bylen_nodup(
    [re.escape(DIACRITICS[sign_name])
     for sign_name in ('DEVANAGARI SIGN ANUSVARA',
                       'DEVANAGARI SIGN CANDRABINDU')])

# A mandatory base character followed by three optional, named diacritic
# groups. Each "{n}" placeholder receives the "|"-joined alternatives of the
# matching list, in the order given below.
PATTERN_TXT = (
    "((?P<base_char>({0}))"
    "(?P<accent>({1}))?"
    "(?P<anudatta>({2}))?"
    "(?P<anusvara_candrabindu>({3}))?)"
).format(
    *(
        "|".join(prepare_list_to_strformat(alternatives))
        for alternatives in (BASE_CHAR, ACCENT, ANUDATTA, ANUSVARA_CANDRABINDU)
    )
)

# Undo the brace doubling introduced by prepare_list_to_strformat().
PATTERN_TXT = PATTERN_TXT.replace('{{', '{').replace('}}', '}')
PATTERN = re.compile(PATTERN_TXT)

PATTERN_TXT2 = "(({0})"  \
               "({1})?" \
               "({2})?" \
               "({3})?)".format(
                             "|".join(prepare_list_to_strformat(BASE_CHAR)),
Пример #4
0
# Escaped regex alternatives for the makron slot.
MAKRON = isort_a_lstrings_bylen_nodup([re.escape(DIACRITICS['makron'])])
# Escaped regex alternatives for the stress slot.
STRESS = isort_a_lstrings_bylen_nodup(
    [re.escape(DIACRITICS[stress_name])
     for stress_name in ('stressM1', 'stress1', 'stress2')])
# Every base symbol: lower case, upper case and punctuation keys.
LETTERS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(LOWER_CASE_INVERSED.keys()))
    + regexstring_list(tuple(UPPER_CASE_INVERSED.keys()))
    + regexstring_list(tuple(PUNCTUATION_INVERSED.keys()))
)

# A mandatory base character followed by three optional, named diacritic
# groups. Each "{n}" placeholder receives the "|"-joined alternatives of the
# matching list, in the order given below.
PATTERN_TXT = "".join([
    "((?P<base_char>({0}))",
    "(?P<trans_stress>({1}))?",
    "(?P<trans_makron>({2}))?",
    "(?P<trans_upperdot>({3}))?",
    ")",
]).format(
    *(
        "|".join(prepare_list_to_strformat(alternatives))
        for alternatives in (LETTERS, STRESS, MAKRON, UPPERDOT)
    )
)
# Undo the brace doubling introduced by prepare_list_to_strformat().
PATTERN_TXT = PATTERN_TXT.replace('{{', '{').replace('}}', '}')
PATTERN = re.compile(PATTERN_TXT)

PATTERN_TXT2 = "(({0})" \
                "({1})?" \
                "({2})?" \
                "({3})?" \
                ")".format("|".join(prepare_list_to_strformat(LETTERS)),
                           "|".join(prepare_list_to_strformat(STRESS)),
Пример #5
0
# Escaped regex alternatives for the single-point slots.
T_METHEGH = isort_a_lstrings_bylen_nodup(
    [re.escape(POINTS["HEBREW POINT METEG"])])
T_RAPHE = isort_a_lstrings_bylen_nodup(
    [re.escape(POINTS["HEBREW POINT RAFE"])])
# Alternatives taken from the keys of the inverse lookup tables.
T_SPECIALPOINTS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(SPECIALPOINTS_INVERSED.keys())))
T_CMARKS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(CANTILATIONMARKS_INVERSED.keys())))

# A mandatory base character followed by optional, named groups; the last
# group (trans_cmark) accepts one or more consecutive marks.
PATTERN_TXT = (
    "((?P<base_char>({0}))"
    "(?P<trans_vowel>({1}))?"
    "(?P<trans_methegh>({2}))?"
    "(?P<trans_raphe>({3}))?"
    "(?P<trans_specialpoint>({4}))?"
    "(?P<trans_cmark>({5})+)?)"
).format(
    *(
        "|".join(prepare_list_to_strformat(alternatives))
        for alternatives in (T_BASECHARS, T_VOWELS, T_METHEGH,
                             T_RAPHE, T_SPECIALPOINTS, T_CMARKS)
    )
)
# Undo the brace doubling introduced by prepare_list_to_strformat().
PATTERN_TXT = PATTERN_TXT.replace('{{', '{').replace('}}', '}')
PATTERN = re.compile(PATTERN_TXT)

PATTERN_TXT2 = "(({0})" \
               "({1})?" \
               "({2})?" \
               "({3})?" \
Пример #6
0
        re.escape(DIACRITICS["stress3"]),
    ]
)
# Escaped regex alternatives for the cedilla slot.
CEDILLA = isort_a_lstrings_bylen_nodup(
    [re.escape(DIACRITICS[key]) for key in ("cedilla",)]
)
# Every base symbol: lower case, upper case and punctuation keys.
LETTERS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(LOWER_CASE_INVERSED.keys()))
    + regexstring_list(tuple(UPPER_CASE_INVERSED.keys()))
    + regexstring_list(tuple(PUNCTUATION_INVERSED.keys()))
)

# A mandatory base character followed by two optional, named diacritic groups.
PATTERN_TXT = "".join(
    [
        "((?P<base_char>({0}))",
        "(?P<trans_stress>({1}))?",
        "(?P<trans_cedilla>({2}))?",
        ")",
    ]
).format(
    *(
        "|".join(prepare_list_to_strformat(group))
        for group in (LETTERS, STRESS, CEDILLA)
    )
)
# Undo the brace doubling introduced by prepare_list_to_strformat().
PATTERN_TXT = PATTERN_TXT.replace("{{", "{").replace("}}", "}")
PATTERN = re.compile(PATTERN_TXT)

PATTERN_TXT2 = (
    "(({0})"
    "({1})?"
    "({2})?"
    ")".format(
        "|".join(prepare_list_to_strformat(LETTERS)),
Пример #7
0
                 [re.escape(DIACRITICS['diaeresis'])] )
# Escaped regex alternatives for the vowel-length slot.
LENGTH = isort_a_lstrings_bylen_nodup(
    [re.escape(DIACRITICS[length_name]) for length_name in ('short', 'long')])
# Escaped regex alternatives for the stress slot.
STRESS = isort_a_lstrings_bylen_nodup([re.escape(DIACRITICS['stress'])])
# Every base symbol: lower case, upper case and punctuation keys.
LETTERS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(LOWER_CASE_INVERSED.keys()))
    + regexstring_list(tuple(UPPER_CASE_INVERSED.keys()))
    + regexstring_list(tuple(PUNCTUATION_INVERSED.keys()))
)

# A mandatory base character followed by three optional, named diacritic
# groups. Each "{n}" placeholder receives the "|"-joined alternatives of the
# matching list, in the order given below.
PATTERN_TXT = (
    "((?P<base_char>({0}))"
    "(?P<trans_stress>({1}))?"
    "(?P<trans_length>({2}))?"
    "(?P<trans_diaeresis>({3}))?"
    ")"
).format(
    *(
        "|".join(prepare_list_to_strformat(alternatives))
        for alternatives in (LETTERS, STRESS, LENGTH, DIAERESIS)
    )
)
# Undo the brace doubling introduced by prepare_list_to_strformat().
PATTERN_TXT = PATTERN_TXT.replace('{{', '{').replace('}}', '}')
PATTERN = re.compile(PATTERN_TXT)

PATTERN_TXT2 = "(({0})" \
                "({1})?" \
                "({2})?" \
                "({3})?" \
                ")".format("|".join(prepare_list_to_strformat(LETTERS)),
                           "|".join(prepare_list_to_strformat(STRESS)),
Пример #8
0
                      regexstring_list(tuple(VOWELS_INVERSED.keys())) )
# regexstring_list() is given a tuple of strings by the other calls in this
# section. "(x)" is just x, so a trailing comma is required to build a
# one-element tuple; without it a bare string is passed instead (and would
# presumably be iterated character by character -- confirm against
# regexstring_list()).
TRANS_RNAM_BCAD = isort_a_lstrings_bylen_nodup(
                      regexstring_list( (DIACRITICS['SIGN RNAM BCAD'],)) )
TRANS_HALANTA = isort_a_lstrings_bylen_nodup(
                      regexstring_list( (DIACRITICS['MARK HALANTA'],)) )
TRANS_ANUSVARA_CANDRABINDU = isort_a_lstrings_bylen_nodup(
                      regexstring_list( (DIACRITICS['SIGN RJES SU NGA RO'],
                                         DIACRITICS['SIGN NYI ZLA NAA DA'],
                                         DIACRITICS['SIGN SNA LDAN'] )))

# An optional prefix group, a mandatory base character, then three optional
# named groups. Each "{n}" placeholder receives "|"-joined alternatives; the
# base character alternatives combine two lists.
TRANS_PATTERN_TXT = "(?P<dotpointorplus>({0}))?" \
              "(?P<base_char>({1}))" \
              "(?P<halanta>({2}))?" \
              "(?P<anusvara_candrabindu>({3}))?" \
              "(?P<rnam_bcad>({4}))?".format(
    "|".join(prepare_list_to_strformat(TRANS_DOT_OR_PLUS)),
    "|".join(prepare_list_to_strformat(TRANS_CONSONANTS_AND_VOWELS) + \
             prepare_list_to_strformat(TRANS_PUNCTUATION_AND_OTHER_SYMBOL)),
    "|".join(prepare_list_to_strformat(TRANS_HALANTA)),
    "|".join(prepare_list_to_strformat(TRANS_ANUSVARA_CANDRABINDU)),
    "|".join(prepare_list_to_strformat(TRANS_RNAM_BCAD)),
             )

# Collapse the doubled braces left in the formatted text back to single ones.
TRANS_PATTERN_TXT = TRANS_PATTERN_TXT.replace('{{', '{')
TRANS_PATTERN_TXT = TRANS_PATTERN_TXT.replace('}}', '}')
TRANS_PATTERN = re.compile(TRANS_PATTERN_TXT)

#///////////////////////////////////////////////////////////////////////////////
def get_intstruct_from_trans_str( _src, dstring_object ):
    """
        function get_intstruct_from_trans_str()
Пример #9
0
# in order to build the pattern strings for the regexes we have to SORT the
# result : (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will.
# We delete the possible duplicates in the resulting string.

# Every symbol the patterns may match: the keys of each lookup table, escaped
# by regexstring_list(), then sorted/deduplicated as explained above.
LETTERS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(CHOONPU.keys()))
    + regexstring_list(tuple(HIRAGANA_INVERSED.keys()))
    + regexstring_list(tuple(HIRAGANA_DAKUTEN_INVERSED.keys()))
    + regexstring_list(tuple(HIRAGANA_HANDAKUTEN_INVERSED.keys()))
    + regexstring_list(tuple(KATAKANA_INVERSED.keys()))
    + regexstring_list(tuple(KATAKANA_DAKUTEN_INVERSED.keys()))
    + regexstring_list(tuple(KATAKANA_HANDAKUTEN_INVERSED.keys()))
    + regexstring_list(tuple(OTHER_SYMBOLS_INVERSED.keys()))
    + regexstring_list(tuple(PUNCTUATION_INVERSED.keys()))
)

# Pattern with a single named group holding the matched symbol.
PATTERN_TXT = "(?P<base_char>({0}))".format(
    "|".join(prepare_list_to_strformat(LETTERS)))

# Undo the brace doubling introduced by prepare_list_to_strformat().
PATTERN_TXT = PATTERN_TXT.replace('{{', '{').replace('}}', '}')
PATTERN = re.compile(PATTERN_TXT)

# Same alternatives, without the named group.
PATTERN_TXT2 = "({0})".format(
    "|".join(prepare_list_to_strformat(LETTERS)))

# Undo the brace doubling introduced by prepare_list_to_strformat().
PATTERN_TXT2 = PATTERN_TXT2.replace('{{', '{').replace('}}', '}')

PATTERN2 = re.compile(PATTERN_TXT2)

#///////////////////////////////////////////////////////////////////////////////