Пример #1
0
def __add_case_fold(sm, Flags, trigger_set, start_state_idx, target_state_idx):
    """Extend 'trigger_set' by the case folds of the code points it contains.

       For every code point in 'trigger_set' the fold set is requested from
       ucs_case_fold.get_fold_set(). Each element of the fold set is either

         -- a list: handed to __add_intermediate_states() together with
                    'sm', 'start_state_idx' and 'target_state_idx'.

         -- anything else: treated as a single code point 'x' and added
                           to 'trigger_set' as Interval(x, x+1).

       sm               -- state machine the transition belongs to; only
                           passed on to __add_intermediate_states().
       Flags            -- case fold options; passed on unchanged to
                           ucs_case_fold.get_fold_set().
       trigger_set      -- character set of the transition; modified in place.
       start_state_idx,
       target_state_idx -- the two ends of the transition; only passed on.

       Returns nothing.
    """
    # Snapshot the code point ranges before folding: 'add_interval()' below
    # changes 'trigger_set', and with 'PromiseToTreatWellF=True' the returned
    # intervals are presumably the set's own objects (verify in NumberSet).
    # Walking that list while it grows or merges may skip or repeat intervals.
    range_list = [(interval.begin, interval.end)
                  for interval in trigger_set.get_intervals(PromiseToTreatWellF=True)]

    for begin, end in range_list:
        for code_point in range(begin, end):
            for x in ucs_case_fold.get_fold_set(code_point, Flags):
                if isinstance(x, list):
                    __add_intermediate_states(sm, x, start_state_idx, target_state_idx)
                else:
                    trigger_set.add_interval(Interval(x, x + 1))
Пример #2
0
def __add_case_fold(sm, Flags, trigger_set, start_state_idx, target_state_idx):
    """Add the case folds of all code points in 'trigger_set'.

       The fold set of each code point is obtained from
       ucs_case_fold.get_fold_set(code_point, Flags). Its elements are
       handled as follows:

         list element  => __add_intermediate_states() is called with 'sm',
                          the element, 'start_state_idx' and
                          'target_state_idx'.

         other element => taken as a single code point 'x'; the interval
                          Interval(x, x + 1) is added to 'trigger_set'.

       'trigger_set' is modified in place. Nothing is returned.
    """
    # The ranges are copied up front, because 'add_interval()' modifies the
    # very set that is being walked. 'PromiseToTreatWellF=True' presumably
    # returns the internal interval objects -- TODO confirm in NumberSet.
    # Iterating them while the set changes could skip or revisit intervals.
    intervals = trigger_set.get_intervals(PromiseToTreatWellF=True)
    range_list = [(interval.begin, interval.end) for interval in intervals]

    for begin, end in range_list:
        for code_point in range(begin, end):
            fold = ucs_case_fold.get_fold_set(code_point, Flags)
            for x in fold:
                if isinstance(x, list):
                    __add_intermediate_states(sm, x, start_state_idx,
                                              target_state_idx)
                else:
                    trigger_set.add_interval(Interval(x, x + 1))
Пример #3
0
def do(sh, PatternDict, snap_expression=None, snap_set_expression=None):
    r"""Parse a case fold expression of the form \C(..){ R } or \C{ R }.
       Assume that '\C' has been snapped already from the stream.

       See function ucs_case_fold_parser.get_fold_set() for details
       about case folding.

       sh          -- input stream; must support 'tell()' and 'seek()'.
       PatternDict -- passed on unchanged to the snap function.

       snap_expression != None, then snap_expression is the function
                                to parse a RE and the caller
                                expects a state machine.

       snap_set_expression != None, then snap_set_expression is the
                                    function to parse a character
                                    set and caller expects a
                                    NumberSet object.

       If both are given, 'snap_set_expression' takes precedence.

       Options in '(' ')' may consist of the letters 's', 'm', and 't'.
       Without options, 's' is used for set expressions and 'sm' otherwise.

       RETURNS: The object delivered by the snap function, extended by
                the case folds.

       Errors are reported through error_msg(); before most of them the
       stream is set back to where it stood on entry. error_msg() is
       presumably not expected to return -- verify against its definition.
    """
    set_expression_f = snap_set_expression is not None

    pos = sh.tell()
    skip_whitespace(sh)

    # -- parse the optional options in '(' ')' brackets
    if not check(sh, "("):
        # Default: sets only get the 'single' character folds, because they
        # cannot absorb character sequences. Expressions get 'single' and
        # 'multi' character folds.
        flag_txt = "s" if set_expression_f else "sm"

    else:
        flag_txt = read_until_character(sh, ")")

        if flag_txt == "":
            sh.seek(pos)
            error_msg("Missing closing ')' in case fold expression.", sh)

        flag_txt = flag_txt.replace(" ", "").replace("\t", "").replace("\n", "")

        for letter in flag_txt:
            if letter not in "smt":
                sh.seek(pos)
                error_msg(("Letter '%s' not permitted as case fold option.\n" % letter)
                          + "Options are:  's' for simple case fold.\n"
                          + "              'm' for multi character sequence case fold.\n"
                          + "              't' for special turkish case fold rules.", sh)

            if set_expression_f and letter == "m":
                sh.seek(pos)
                error_msg("Option 'm' not permitted as case fold option in set expression.\n"
                          + "Set expressions cannot absorb multi character sequences.", sh)

        skip_whitespace(sh)

    # -- parse the expression in '{' '}' which is subject to case folding
    if not check(sh, "{"):
        sh.seek(pos)
        error_msg("Missing '{' for case fold expression.", sh)

    skip_whitespace(sh)
    if set_expression_f:
        trigger_set = snap_set_expression(sh, PatternDict)
        if trigger_set is None:
            error_msg("Missing character set for case fold in set expression.\n"
                      + "The content in '\\C{content}' should start with '[' or '[:'.",
                      sh)

        # -- perform the case fold for Sets!
        #    The ranges are copied first: 'add_interval()' changes the set
        #    which is being walked, and 'PromiseToTreatWellF=True' presumably
        #    delivers the set's own interval objects (TODO confirm).
        range_list = [(interval.begin, interval.end)
                      for interval in trigger_set.get_intervals(PromiseToTreatWellF=True)]
        for begin, end in range_list:
            for code_point in range(begin, end):
                for x in ucs_case_fold.get_fold_set(code_point, flag_txt):
                    # Option 'm' was rejected above => no sequences expected.
                    assert not isinstance(x, list)
                    trigger_set.add_interval(Interval(x, x + 1))

        result = trigger_set

    else:
        sm = snap_expression(sh, PatternDict)
        if sm is None:
            # NOTE(review): the second line of this message reads like a copy
            #               from the set expression case -- confirm wording.
            error_msg("Missing expression for case fold '\\C'.\n"
                      + "The content in '\\C{content}' should start with '[' or '[:'.",
                      sh)

        # -- perform the case fold for State Machines!
        #    Iterate over copies of the item lists: __add_case_fold() may call
        #    __add_intermediate_states(), which presumably adds states and
        #    transitions. A dictionary which changes size while its 'items()'
        #    view is iterated raises a RuntimeError in Python 3.
        for state_idx, state in list(sm.states.items()):
            transition_map = state.transitions().get_map()
            for target_state_idx, trigger_set in list(transition_map.items()):
                __add_case_fold(sm, flag_txt, trigger_set, state_idx,
                                target_state_idx)

        result = sm

    if not check(sh, "}"):
        sh.seek(pos)
        error_msg("Missing '}' for case fold expression.", sh)

    return result
Пример #4
0
def do(sh, PatternDict, snap_expression=None, snap_set_expression=None):
    r"""Parse a case fold expression of the form \C(..){ R } or \C{ R }.
       Assume that '\C' has been snapped already from the stream.

       See function ucs_case_fold_parser.get_fold_set() for details
       about case folding.

       snap_expression != None, then snap_expression is the function
                                to parse a RE and the caller
                                expects a state machine.

       snap_set_expression != None, then snap_set_expression is the
                                    function to parse a character
                                    set and caller expects a
                                    NumberSet object.

       'snap_set_expression' wins if both functions are passed.

       The optional '(..)' holds option letters: 's' (simple case fold),
       'm' (multi character sequence case fold, not allowed for sets) and
       't' (special turkish case fold rules). Default is 's' for sets and
       'sm' for expressions.

       RETURNS: What the snap function returned, with the case folds added.

       All errors go through error_msg(). It is presumably expected to
       abort rather than return -- verify against its definition.
    """
    is_set_f = snap_set_expression is not None

    pos = sh.tell()
    skip_whitespace(sh)
    # -- parse the optional options in '(' ')' brackets
    if check(sh, "("):
        flag_txt = read_until_character(sh, ")")

        if flag_txt == "":
            sh.seek(pos)
            error_msg("Missing closing ')' in case fold expression.", sh)

        flag_txt = flag_txt.replace(" ", "").replace("\t", "").replace("\n", "")

        for letter in flag_txt:
            if letter not in "smt":
                sh.seek(pos)
                error_msg("Letter '%s' not permitted as case fold option.\n" % letter + \
                          "Options are:  's' for simple case fold.\n" + \
                          "              'm' for multi character sequence case fold.\n" + \
                          "              't' for special turkish case fold rules.", sh)

            if is_set_f and letter == "m":
                sh.seek(pos)
                error_msg("Option 'm' not permitted as case fold option in set expression.\n" + \
                          "Set expressions cannot absorb multi character sequences.", sh)

        skip_whitespace(sh)

    elif is_set_f:
        # Sets cannot absorb multi character sequences => 'single' only.
        flag_txt = "s"

    else:
        # By default 'single' and 'multi' character case folds are active
        flag_txt = "sm"

    # -- parse the expression in '{' '}' which is subject to case folding
    if not check(sh, "{"):
        sh.seek(pos)
        error_msg("Missing '{' for case fold expression.", sh)

    skip_whitespace(sh)
    if is_set_f:
        trigger_set = snap_set_expression(sh, PatternDict)
        if trigger_set is None:
            error_msg("Missing character set for case fold in set expression.\n" +
                      "The content in '\\C{content}' should start with '[' or '[:'.",
                      sh)

        # -- perform the case fold for Sets!
        #    'add_interval()' modifies the set under iteration. Thus, the
        #    ranges to be folded are fixed beforehand. With
        #    'PromiseToTreatWellF=True' the intervals are presumably the
        #    set's own objects (TODO confirm in NumberSet).
        intervals = trigger_set.get_intervals(PromiseToTreatWellF=True)
        range_list = [(interval.begin, interval.end) for interval in intervals]
        for begin, end in range_list:
            for code_point in range(begin, end):
                fold = ucs_case_fold.get_fold_set(code_point, flag_txt)
                for x in fold:
                    # 'm' has been refused above => only single code points.
                    assert not isinstance(x, list)
                    trigger_set.add_interval(Interval(x, x + 1))

        result = trigger_set

    else:
        sm = snap_expression(sh, PatternDict)
        if sm is None:
            # NOTE(review): the hint about '[' or '[:' looks copied from the
            #               set expression case -- confirm the wording.
            error_msg("Missing expression for case fold '\\C'.\n" +
                      "The content in '\\C{content}' should start with '[' or '[:'.",
                      sh)

        # -- perform the case fold for State Machines!
        #    The item lists are copied before iteration. __add_case_fold()
        #    can end up in __add_intermediate_states(), which presumably
        #    inserts states and transitions; Python 3 raises a RuntimeError
        #    if a dictionary changes size while it is iterated.
        for state_idx, state in list(sm.states.items()):
            transitions = state.transitions()
            for target_state_idx, trigger_set in list(transitions.get_map().items()):
                __add_case_fold(sm, flag_txt, trigger_set, state_idx, target_state_idx)

        result = sm

    if not check(sh, "}"):
        sh.seek(pos)
        error_msg("Missing '}' for case fold expression.", sh)

    return result