def get_setup(L0, L1, FSM0, FSM1, FSM2):
    """Build a test setup: one count-info list plus three state machines.

    [L0,L1)          -- codec range counted as COLUMN (count 0).
    FSM0, FSM1, FSM2 -- NumberSet-s for the first transition of sm0/sm1/sm2.

    RETURNS: (ci_list, [sm0, sm1, sm2])
    """
    # SPECIALITIES: -- sm0 and sm1 have an intersection between their second
    #                  transition.
    #               -- sm1 transits further upon acceptance.
    #               -- sm2 has only one transition.
    ci_list = [
        CountInfo(dial_db.new_incidence_id(), NumberSet.from_range(L0, L1),
                  CountAction(E_CharacterCountType.COLUMN, 0)),
    ]

    # Generate State Machine that does not have any intersection with
    # the loop transitions.
    # sm0: init --FSM0--> s --NS_A--> ((accept))
    sm0 = StateMachine()
    si = sm0.add_transition(sm0.init_state_index, FSM0)
    si = sm0.add_transition(si, NS_A, AcceptanceF=True)
    sm0.states[si].mark_acceptance_id(dial_db.new_incidence_id())

    # sm1: init --FSM1--> s0 --NS_A--> ((accept)) --NS_B--> back to s0
    # Both acceptance marks carry the same incidence id 'iid1'.
    sm1 = StateMachine()
    si0 = sm1.add_transition(sm1.init_state_index, FSM1)
    si = sm1.add_transition(si0, NS_A, AcceptanceF=True)
    iid1 = dial_db.new_incidence_id()
    sm1.states[si].mark_acceptance_id(iid1)
    si = sm1.add_transition(si, NS_B, si0)
    sm1.states[si].mark_acceptance_id(iid1)

    # sm2: init --FSM2--> ((accept)); a single transition only.
    sm2 = StateMachine()
    si = sm2.add_transition(sm2.init_state_index, FSM2, AcceptanceF=True)
    sm2.states[si].mark_acceptance_id(dial_db.new_incidence_id())

    return ci_list, [sm0, sm1, sm2]
def seal(self):
    """Finalize the indentation setup: fill in defaults for anything the
    user did not specify.

    -- If neither spaces nor grids are defined, try ' ' (space, count 1)
       and '\\t' (grid 4), unless they are listed as 'bad' characters.
    -- If still nothing is defined, report an error.
    -- If no newline pattern is defined, install '(\\r\\n)|(\\n)'.
    """
    if len(self.space_db) == 0 and len(self.grid_db) == 0:
        default_space = ord(' ')
        default_tab = ord('\t')
        bad = self.bad_character_set
        # Only install a default if it is not declared 'bad'.
        if bad.get().contains(default_space) == False:
            self.specify_space("[ ]", NumberSet(default_space), 1, self.fh)
        if bad.get().contains(default_tab) == False:
            self.specify_grid("[\\t]", NumberSet(default_tab), 4, self.fh)

        # Both defaults were 'bad' => nothing to count indentation with.
        if len(self.space_db) == 0 and len(self.grid_db) == 0:
            error_msg("No space or grid defined for indentation counting. Default\n"
                      "values ' ' and '\\t' could not be used since they are specified as 'bad'.",
                      bad.file_name, bad.line_n)

    if self.newline_state_machine.get() is None:
        # Default newline DFA: '\n' accepts directly; '\r' requires a
        # following '\n' (DOS style), re-using the same accepting state.
        sm = StateMachine()
        end_idx = sm.add_transition(sm.init_state_index, NumberSet(ord('\n')), AcceptanceF=True)
        mid_idx = sm.add_transition(sm.init_state_index, NumberSet(ord('\r')), AcceptanceF=False)
        sm.add_transition(mid_idx, NumberSet(ord('\n')), end_idx, AcceptanceF=False)
        self.specify_newline("(\\r\\n)|(\\n)", sm, self.fh)
class X: def __init__(self, Name): sh = StringIO("[:\\P{Script=%s}:]" % Name) self.name = Name self.charset = regex.snap_set_expression(sh, {}) self.sm = StateMachine() self.sm.add_transition(self.sm.init_state_index, self.charset, AcceptanceF=True) self.id = self.sm.get_id() def check(self, SM): """This function throws an exception as soon as one single value is not matched according to the expectation. """ print "Name = " + self.name, for interval in self.charset.get_intervals(PromiseToTreatWellF=True): for i in range(interval.begin, interval.end): utf8_seq = unicode_to_utf8(i) # Apply sequence to state machine s_idx = result.init_state_index for byte in utf8_seq: s_idx = result.states[s_idx].target_map.get_resulting_target_state_index(byte) # All acceptance flags must belong to the original state machine for cmd in result.states[s_idx].single_entry: if cmd.__class__ != SeAccept: continue # HERE: As soon as something is wrong --> fire an exception assert cmd.acceptance_id() == self.id print " (OK=%i)" % self.id
class X: def __init__(self, Name): sh = StringIO("[:\\P{Script=%s}:]" % Name) self.name = Name self.charset = regex.snap_set_expression(sh, {}) self.sm = StateMachine() self.sm.add_transition(self.sm.init_state_index, self.charset, AcceptanceF=True) self.id = self.sm.get_id() def check(self, SM): """This function throws an exception as soon as one single value is not matched according to the expectation. """ print "Name = " + self.name, for interval in self.charset.get_intervals(PromiseToTreatWellF=True): for i in range(interval.begin, interval.end): utf8_seq = unicode_to_utf8(i) # Apply sequence to state machine s_idx = result.init_state_index for byte in utf8_seq: s_idx = result.states[ s_idx].target_map.get_resulting_target_state_index( byte) # All acceptance flags must belong to the original state machine for cmd in result.states[s_idx].single_entry: if cmd.__class__ != SeAccept: continue # HERE: As soon as something is wrong --> fire an exception assert cmd.acceptance_id() == self.id print " (OK=%i)" % self.id
def test(TestString): print "expression = \"" + TestString + "\"" sm = StateMachine() try: trigger_set = character_set.do(StringIO(TestString + "]")) sm.add_transition(sm.init_state_index, trigger_set, AcceptanceF=True) print "state machine\n", sm except RegularExpressionException, x: print repr(x)
def snap_non_control_character(stream, PatternDict):
    """Read one plain (non-control) character from 'stream' and return a
    state machine that accepts exactly that character.
    """
    __debug_entry("non-control characters", stream)

    # (*) read first character
    char_code = utf8.__read_one_utf8_code_from_stream(stream)
    if char_code is None:
        error_msg("Character could not be interpreted as UTF8 code or End of File reached prematurely.",
                  stream)

    # Single transition: init --char_code--> ((accept))
    sm = StateMachine()
    sm.add_transition(sm.init_state_index, char_code, AcceptanceF=True)
    return __debug_exit(sm, stream)
def get_any():
    """RETURNS: A state machine that 'eats' any character, but only one.

       (0)--- \Any --->(( 0 ))
    """
    sm = StateMachine()
    any_char = NumberSet(Interval(-sys.maxint, sys.maxint))
    sm.add_transition(sm.init_state_index, any_char, AcceptanceF=True)
    return sm
def create_ALL_BUT_NEWLINE_state_machine():
    """RETURNS: A state machine accepting any single character except '\\n',
    clipped to the configured character value limit (if any).
    """
    global Setup
    machine = StateMachine()
    # NOTE: Buffer control characters are supposed to be filtered out by the
    #       code generator.
    trigger_set = NumberSet(Interval(ord("\n")).inverse())

    limit = Setup.get_character_value_limit()
    if limit != sys.maxint:
        trigger_set.intersect_with(Interval(0, limit))

    machine.add_transition(machine.init_state_index, trigger_set, AcceptanceF=True)
    return machine
def create_ALL_BUT_NEWLINE_state_machine(stream):
    """RETURNS: A state machine accepting any single source-set character
    except '\\n'. Reports an error if that set is empty.
    """
    global Setup
    sm = StateMachine()
    # NOTE: Buffer control characters are supposed to be filtered out by the
    #       code generator.
    all_but_newline = NumberSet(Interval(ord("\n"))).get_complement(Setup.buffer_codec.source_set)

    if all_but_newline.is_empty():
        error_msg("The set of admissible characters contains only newline.\n"
                  "The '.' for 'all but newline' is an empty set.",
                  SourceRef.from_FileHandle(stream))

    sm.add_transition(sm.init_state_index, all_but_newline, AcceptanceF=True)
    return sm
def do(stream, PatternDict):
    """Parse a character-set expression from 'stream' and wrap it into a
    one-transition state machine.

    RAISES: RegularExpressionException on a malformed or empty set.
    """
    trigger_set = snap_set_expression(stream, PatternDict)

    if trigger_set is None:
        raise RegularExpressionException("Regular Expression: character_set_expression called for something\nthat does not start with '[:', '[' or '\\P'")
    if trigger_set.is_empty():
        raise RegularExpressionException("Regular Expression: Character set expression results in empty set.")

    # Create state machine that triggers with the trigger set to SUCCESS.
    # NOTE: The default for the ELSE transition is FAIL.
    result = StateMachine()
    result.add_transition(result.init_state_index, trigger_set, AcceptanceF=True)
    return __debug_exit(result, stream)
def create_ALL_BUT_NEWLINE_state_machine():
    """Build the state machine for '.': any one character but newline,
    restricted to [0, character-value-limit) when a limit is configured.
    """
    global Setup
    # NOTE: Buffer control characters are supposed to be filtered out by the
    #       code generator.
    not_newline = NumberSet(Interval(ord("\n")).inverse())
    if Setup.get_character_value_limit() != sys.maxint:
        not_newline.intersect_with(Interval(0, Setup.get_character_value_limit()))

    result = StateMachine()
    result.add_transition(result.init_state_index, not_newline, AcceptanceF=True)
    return result
def do(sh):
    """Convert a unicode string (read from stream 'sh') into a state machine
    that matches its letters sequentially. Only the state reached after the
    last letter is an acceptance state; bailing out earlier means 'not
    accepted'.

    Example: "hey" becomes

          (0)-- 'h' -->(1)-- 'e' -->(2)-- 'y' --> ACCEPTANCE
           |            |            |
          FAIL         FAIL         FAIL

    Note: State indices are globally unique, not necessarily 0, 1, 2, ...
    """
    assert sh.__class__.__name__ in ("StringIO", "file")

    sm = StateMachine()
    tip = sm.init_state_index

    # Only \" is a special character '"'; any other backslashed character
    # remains as the sequence 'backslash' + character.
    for char_code in get_character_code_sequence(sh):
        tip = sm.add_transition(tip, char_code)

    # The state reached after the last letter is the acceptance state.
    sm.states[tip].set_acceptance()
    return sm
def test_on_UCS_range(Trafo, Source, Drain, CharacterBackwardTrafo):
    """Transform a DFA over the UCS range 'Source' with 'Trafo' and verify
    that every drain-range character 'y' in [Drain.begin, Drain.end) leads
    to the acceptance id expected for its back-translated source character
    'x = CharacterBackwardTrafo(y)'.
    """
    sm = StateMachine()
    # acc_db: source character --> expected acceptance id.
    acc_db = {}
    for x in range(Source.begin, Source.end):
        ti = sm.add_transition(sm.init_state_index, x, AcceptanceF=True)
        acc_id = len(acc_db)
        sm.states[ti].mark_acceptance_id(acc_id)
        acc_db[x] = acc_id

    # 'None' entry: expectation for characters with no source counterpart.
    if Setup.bad_lexatom_detection_f:
        acc_db[None] = E_IncidenceIDs.BAD_LEXATOM
    else:
        acc_db[None] = None

    state_n_before, result = transform(Trafo, sm)
    # assert state_n_before == len(result.states)

    init_state = result.get_init_state()
    count = 0
    for y in range(Drain.begin, Drain.end):
        # Translate character into
        x = CharacterBackwardTrafo(y)
        # Transit on the translated charater
        ti = init_state.target_map.get_resulting_target_state_index(y)
        # Compare resulting state with the expected state's acceptance
        assert_only_acceptance_id(sm.states, ti, acc_db, x, y)
        count += 1

    print "<terminated: %i transitions ok>" % count
def do(stream, PatternDict):
    """Snap a character-set expression from 'stream'; on success return a
    state machine with a single accepting transition on that set.

    RAISES: RegularExpressionException if nothing parseable was found or
            the parsed set is empty.
    """
    trigger_set = snap_set_expression(stream, PatternDict)

    if trigger_set is None:
        raise RegularExpressionException(
            "Regular Expression: character_set_expression called for something\n"
            "that does not start with '[:', '[' or '\\P'")
    elif trigger_set.is_empty():
        raise RegularExpressionException(
            "Regular Expression: Character set expression results in empty set.")

    # Create state machine that triggers with the trigger set to SUCCESS.
    # NOTE: The default for the ELSE transition is FAIL.
    machine = StateMachine()
    machine.add_transition(machine.init_state_index, trigger_set, AcceptanceF=True)
    return __debug_exit(machine, stream)
def snap_character_set_expression(stream, PatternDict):
    # GRAMMAR:
    #
    # set_expression:
    #                 [: set_term :]     # traditional character set
    #                 \P '{' property string '}'
    #                 '{' identifier '}'
    #
    # set_term:
    #                 "alnum" | "alpha" | "blank" | "cntrl" | "digit"
    #                 "graph" | "lower" | "print" | "punct" | "space"
    #                 "upper" | "xdigit"
    #                 "union"        '(' set_term [ ',' set_term ]+ ')'
    #                 "intersection" '(' set_term [ ',' set_term ]+ ')'
    #                 "difference"   '(' set_term [ ',' set_term ]+ ')'
    #                 "inverse"      '(' set_term ')'
    #                 set_expression
    #
    trigger_set = snap_set_expression(stream, PatternDict)

    if trigger_set is None:
        error.log("Regular Expression: snap_character_set_expression called for something\n"
                  "that does not start with '[:', '[' or '\\P'", stream)
    elif trigger_set.is_empty():
        error.warning("Regular Expression: Character set expression results in empty set.",
                      stream)

    # Create state machine that triggers with the trigger set to SUCCESS.
    # NOTE: The default for the ELSE transition is FAIL.
    result = StateMachine()
    result.add_transition(result.init_state_index, trigger_set, AcceptanceF=True)
    return __debug_exit(result, stream)
def snap_character_set_expression(stream, PatternDict):
    # GRAMMAR:
    #
    # set_expression:
    #                 [: set_term :]     # traditional character set
    #                 \P '{' property string '}'
    #                 '{' identifier '}'
    #
    # set_term:
    #                 "alnum" | "alpha" | "blank" | "cntrl" | "digit"
    #                 "graph" | "lower" | "print" | "punct" | "space"
    #                 "upper" | "xdigit"
    #                 "union"        '(' set_term [ ',' set_term ]+ ')'
    #                 "intersection" '(' set_term [ ',' set_term ]+ ')'
    #                 "difference"   '(' set_term [ ',' set_term ]+ ')'
    #                 "inverse"      '(' set_term ')'
    #                 set_expression
    #
    charset = snap_set_expression(stream, PatternDict)

    if charset is None:
        error_msg("Regular Expression: snap_character_set_expression called for something\n"
                  "that does not start with '[:', '[' or '\\P'", stream)
    elif charset.is_empty():
        # Empty set is reported but does not abort (DontExitF=True).
        error_msg("Regular Expression: Character set expression results in empty set.",
                  stream, DontExitF=True)

    # Create state machine that triggers with the trigger set to SUCCESS.
    # NOTE: The default for the ELSE transition is FAIL.
    machine = StateMachine()
    machine.add_transition(machine.init_state_index, charset, AcceptanceF=True)
    return __debug_exit(machine, stream)
def seal(self):
    """Complete the indentation specification with defaults.

    Missing space/grid counters default to ' ' (space, 1) and '\\t'
    (grid, 4) unless declared 'bad'; if both are 'bad' an error is
    reported. A missing newline pattern defaults to '(\\r\\n)|(\\n)'.
    """
    if len(self.space_db) == 0 and len(self.grid_db) == 0:
        default_space = ord(' ')
        default_tab = ord('\t')
        bad = self.bad_character_set
        # A default is only usable if it is not a 'bad' character.
        if bad.get().contains(default_space) == False:
            self.specify_space("[ ]", NumberSet(default_space), 1, self.fh)
        if bad.get().contains(default_tab) == False:
            self.specify_grid("[\\t]", NumberSet(default_tab), 4, self.fh)

        if len(self.space_db) == 0 and len(self.grid_db) == 0:
            error_msg("No space or grid defined for indentation counting. Default\n"
                      "values ' ' and '\\t' could not be used since they are specified as 'bad'.",
                      bad.file_name, bad.line_n)

    if self.newline_state_machine.get() is None:
        # '\n' accepts immediately; '\r' needs a subsequent '\n' which
        # leads to the same accepting state ('\r\n' DOS newline).
        sm = StateMachine()
        end_idx = sm.add_transition(sm.init_state_index, NumberSet(ord('\n')), AcceptanceF=True)
        mid_idx = sm.add_transition(sm.init_state_index, NumberSet(ord('\r')), AcceptanceF=False)
        sm.add_transition(mid_idx, NumberSet(ord('\n')), end_idx, AcceptanceF=False)
        self.specify_newline("(\\r\\n)|(\\n)", sm, self.fh)
class X:
    """Checker comparing a transformed state machine against the reference
    character set '[:\\P{Script=Name}:]' (script complement).
    """
    def __init__(self, Name):
        sh = StringIO("[:\\P{Script=%s}:]" % Name)
        self.name = Name
        # Expected character set of the script complement.
        self.charset = regex.snap_set_expression(sh, {})
        # Reference DFA: one accepting transition on 'charset'.
        self.sm = StateMachine()
        self.sm.add_transition(self.sm.init_state_index, self.charset, AcceptanceF=True)
        self.id = self.sm.get_id()

    def check(self, SM, TransformFunc):
        """This function throws an exception as soon as one single value
           is not matched according to the expectation.

        SM            -- transformed state machine under test.
        TransformFunc -- maps a unicode code point to a lexatom sequence.
        """
        print "## [%i] Name = %s" % (self.id, self.name),
        interval_list = self.charset.get_intervals(PromiseToTreatWellF=True)
        interval_count = len(interval_list)
        for interval in interval_list:
            for i in range(interval.begin, interval.end):
                lexatom_seq = TransformFunc(i)

                # Apply sequence to state machine; 'None' => no path.
                state = SM.apply_sequence(lexatom_seq)
                if state is None:
                    error(self.sm, SM, lexatom_seq)

                # All acceptance flags must belong to the original state machine
                acceptance_id_list = [
                    cmd.acceptance_id()
                    for cmd in state.single_entry.get_iterable(SeAccept)
                ]
                if acceptance_id_list and self.id not in acceptance_id_list:
                    # Dump the offending code point, its sequence and ids.
                    print eval("u'\U%08X'" % i)
                    print "#Seq:  ", ["%02X" % x for x in lexatom_seq]
                    print "#acceptance-ids:", acceptance_id_list
                    error(self.sm, SM, lexatom_seq)

        print " (OK=%i)" % interval_count
class X: def __init__(self, Name): sh = StringIO("[:\\P{Script=%s}:]" % Name) self.name = Name self.charset = regex.snap_set_expression(sh, {}) self.sm = StateMachine() self.sm.add_transition(self.sm.init_state_index, self.charset, AcceptanceF=True) self.id = self.sm.get_id() def check(self, SM): """This function throws an exception as soon as one single value is not matched according to the expectation. """ print "Name = " + self.name, for interval in self.charset.get_intervals(PromiseToTreatWellF=True): for i in range(interval.begin, interval.end): utf16_seq = unicode_to_utf16(i) # Apply sequence to state machine s_idx = result.init_state_index for word in utf16_seq: s_idx = result.states[ s_idx].target_map.get_resulting_target_state_index( word) assert s_idx is not None, \ "No acceptance for %X in [%X,%X] --> %s" % \ (i, interval.begin, interval.end - 1, repr(map(lambda x: "%04X." % x, utf16_seq))) # All acceptance flags must belong to the original state machine for cmd in result.states[s_idx].single_entry.get_iterable( SeAccept): # HERE: As soon as something is wrong --> fire an exception assert cmd.acceptance_id() == self.id print " (OK=%i)" % self.id
def StateMachine_Newline():
    """Creates a state machine matching newline according to what has been
    specified in the setup (Setup.dos_carriage_return_newline_f). That is,
    if DOS newline is enabled the state machine matches '\\r\\n' in addition
    to the plain '\\n'; both variants live in the machine in parallel.

    RETURNS: StateMachine
    """
    UnixF = True
    DosF = Setup.dos_carriage_return_newline_f

    line_feed = ord('\n')  # (pure) newline, i.e. line feed
    carriage_return = ord('\r')

    result = StateMachine()
    if UnixF:
        result.add_transition(result.init_state_index, line_feed, AcceptanceF=True)
    if DosF:
        mid = result.add_transition(result.init_state_index, carriage_return, AcceptanceF=False)
        result.add_transition(mid, line_feed, AcceptanceF=True)

    return beautifier.do(result)
ci_list, sm_list = get_setup(0x10, 0x60, NumberSet.from_range(0x10, 0x40), NumberSet.from_range(0x20, 0x50), NumberSet.from_range(0x30, 0x60)) # Test for each 'sm' in 'sm_list' is superfluous. # It is done in 'AppendixNoI'. test(ci_list, sm_list) elif "Split" in sys.argv: # A first transition of a state machine is separated into two, because # it is covered by more than one different count action. NS1 = NumberSet.from_range(0x10, 0x20) NS2 = NumberSet.from_range(0x20, 0x30) NS3 = NumberSet.from_range(0x30, 0x40) NS4 = NumberSet.from_range(0x40, 0x50) ci_list = [ CountInfo(dial_db.new_incidence_id(), NS1, CountAction(E_CharacterCountType.COLUMN, 1)), CountInfo(dial_db.new_incidence_id(), NS2, CountAction(E_CharacterCountType.COLUMN, 2)), CountInfo(dial_db.new_incidence_id(), NS3, CountAction(E_CharacterCountType.COLUMN, 3)), CountInfo(dial_db.new_incidence_id(), NS4, CountAction(E_CharacterCountType.COLUMN, 4)) ] sm = StateMachine() si = sm.init_state_index iid = dial_db.new_incidence_id() ti0 = sm.add_transition(si, NumberSet.from_range(0x1A, 0x4B)) ac0 = sm.add_transition(ti0, NS_A, AcceptanceF=True) test(ci_list, [sm])
def __parse_option(fh, new_mode):
    """Parse one mode option '<name ...>' from file handle 'fh' and enter
    its effect into 'new_mode'.

    Handles the special options 'skip', 'skip_range', 'skip_nested_range'
    and 'indentation' by registering generated-code pattern/actions; any
    other option is stored verbatim via 'new_mode.add_option()'.

    RETURNS: True  -- an option was parsed.
             False -- no option start was found.
    """
    def get_pattern_object(SM):
        # Normalize a state machine: make it a DFA and minimize it.
        if not SM.is_DFA_compliant(): result = nfa_to_dfa.do(SM)
        else:                         result = SM
        result = hopcroft.do(result, CreateNewStateMachineF=False)
        return Pattern(result, AllowStateMachineTrafoF=True)

    identifier = read_option_start(fh)
    if identifier is None: return False

    verify_word_in_list(identifier, mode_option_info_db.keys(),
                        "mode option", fh.name, get_current_line_info_number(fh))

    if identifier == "skip":
        # A skipper 'eats' characters at the beginning of a pattern that belong
        # to a specified set of characters. A useful application is most probably
        # the whitespace skipper '[ \t\n]'. The skipper definition allows quex to
        # implement a very effective way to skip these regions.
        pattern_str, trigger_set = regular_expression.parse_character_set(fh, PatternStringF=True)
        skip_whitespace(fh)

        if fh.read(1) != ">":
            error_msg("missing closing '>' for mode option '%s'." % identifier, fh)

        if trigger_set.is_empty():
            # BUG FIX: the original applied '%' to a string without a
            # conversion specifier ("..." % identifier) which raises a
            # TypeError instead of reporting the error.
            error_msg("Empty trigger set for skipper '%s'." % identifier, fh)

        # TriggerSet skipping is implemented the following way: As soon as one element of the
        # trigger set appears, the state machine enters the 'trigger set skipper section'.
        # Enter the skipper as if the opener pattern was a normal pattern and the 'skipper' is the action.
        # NOTE: The correspondent CodeFragment for skipping is created in 'implement_skippers(...)'
        pattern_sm = StateMachine()
        pattern_sm.add_transition(pattern_sm.init_state_index, trigger_set, AcceptanceF=True)

        # Skipper code is to be generated later
        action = GeneratedCode(skip_character_set.do,
                               FileName = fh.name,
                               LineN    = get_current_line_info_number(fh))
        action.data["character_set"] = trigger_set

        new_mode.add_match(pattern_str, action, get_pattern_object(pattern_sm),
                           Comment=E_SpecialPatterns.SKIP)
        return True

    elif identifier in ["skip_range", "skip_nested_range"]:
        # A non-nesting skipper can contain a full fledged regular expression as
        # opener, since it only effects the trigger. Not so the nested range
        # skipper -- see below.

        # -- opener
        skip_whitespace(fh)
        if identifier == "skip_nested_range":
            # Nested range state machines only accept 'strings' not state machines
            opener_str, opener_sequence = __parse_string(fh, "Opener pattern for 'skip_nested_range'")
            opener_sm = StateMachine.from_sequence(opener_sequence)
        else:
            opener_str, opener_pattern = regular_expression.parse(fh)
            opener_sm = opener_pattern.sm
            # For 'range skipping' the opener sequence is not needed, only the opener state
            # machine is webbed into the pattern matching state machine.
            opener_sequence = None

        skip_whitespace(fh)

        # -- closer
        closer_str, closer_sequence = __parse_string(fh, "Closing pattern for 'skip_range' or 'skip_nested_range'")
        skip_whitespace(fh)
        if fh.read(1) != ">":
            error_msg("missing closing '>' for mode option '%s'" % identifier, fh)

        # Skipper code is to be generated later
        generator_function, comment = {
            "skip_range":        (skip_range.do,        E_SpecialPatterns.SKIP_RANGE),
            "skip_nested_range": (skip_nested_range.do, E_SpecialPatterns.SKIP_NESTED_RANGE),
        }[identifier]
        action = GeneratedCode(generator_function,
                               FileName = fh.name,
                               LineN    = get_current_line_info_number(fh))

        action.data["opener_sequence"] = opener_sequence
        action.data["closer_sequence"] = closer_sequence
        action.data["mode_name"]       = new_mode.name

        new_mode.add_match(opener_str, action, get_pattern_object(opener_sm), Comment=comment)
        return True

    elif identifier == "indentation":
        value = indentation_setup.do(fh)

        # Enter 'Newline' and 'Suppressed Newline' as matches into the engine.
        # Similar to skippers, the indentation count is then triggered by the newline.

        # -- Suppressed Newline = Suppressor followed by Newline,
        #    then newline does not trigger indentation counting.
        suppressed_newline_pattern_str = ""
        if value.newline_suppressor_state_machine.get() is not None:
            suppressed_newline_pattern_str = \
                  "(" + value.newline_suppressor_state_machine.pattern_string() + ")" \
                + "(" + value.newline_state_machine.pattern_string() + ")"

            suppressed_newline_sm = \
                sequentialize.do([value.newline_suppressor_state_machine.get(),
                                  value.newline_state_machine.get()])

            FileName = value.newline_suppressor_state_machine.file_name
            LineN    = value.newline_suppressor_state_machine.line_n
            # Go back to start.
            code = UserCodeFragment("goto %s;" % get_label("$start", U=True), FileName, LineN)

            new_mode.add_match(suppressed_newline_pattern_str, code,
                               get_pattern_object(suppressed_newline_sm),
                               Comment=E_SpecialPatterns.SUPPRESSED_INDENTATION_NEWLINE)

        # When there is an empty line, then there shall be no indentation count on it.
        # Here comes the trick:
        #
        #      Let               newline
        #      be defined as:    newline ([space]* newline])*
        #
        # This way empty lines are eating away before the indentation count is activated.

        # -- 'space'
        x0 = StateMachine()
        x0.add_transition(x0.init_state_index, value.indentation_count_character_set(),
                          AcceptanceF=True)
        # -- '[space]*'
        x1 = repeat.do(x0)
        # -- '[space]* newline'
        x2 = sequentialize.do([x1, value.newline_state_machine.get()])
        # -- '([space]* newline)*'
        x3 = repeat.do(x2)
        # -- 'newline ([space]* newline)*'
        x4 = sequentialize.do([value.newline_state_machine.get(), x3])
        # -- nfa to dfa; hopcroft optimization
        sm = beautifier.do(x4)

        FileName = value.newline_state_machine.file_name
        LineN    = value.newline_state_machine.line_n
        action   = GeneratedCode(indentation_counter.do, FileName, LineN)
        action.data["indentation_setup"] = value

        new_mode.add_match(value.newline_state_machine.pattern_string(), action,
                           get_pattern_object(sm),
                           Comment=E_SpecialPatterns.INDENTATION_NEWLINE)

        # Announce the mode to which the setup belongs
        value.set_containing_mode_name(new_mode.name)
    else:
        value = read_option_value(fh)

    # The 'verify_word_in_list()' call must have ensured that the following holds
    assert mode_option_info_db.has_key(identifier)

    # Is the option of the appropriate value?
    option_info = mode_option_info_db[identifier]
    if option_info.domain is not None and value not in option_info.domain:
        error_msg("Tried to set value '%s' for option '%s'. " % (value, identifier) + \
                  "Though, possible for this option are only: %s." % repr(option_info.domain)[1:-1], fh)

    # Finally, set the option
    new_mode.add_option(identifier, value)
    return True
def snap_primary(stream, PatternDict):
    """primary:  " non_double_quote * "              = character string
                 [ non_rect_bracket_close ]          = set of characters
                 { identifier }                      = pattern replacement
                 ( expression )
                 non_control_character+              = lonely characters
                 primary repetition_cmd

    RETURNS: state machine for the parsed primary (beautified), or None
             if no primary starts at the current stream position.
    """
    __debug_entry("primary", stream)
    # Read a one-character lookahead; if both reads succeeded, push the
    # lookahead back so only 'x' is consumed.
    x = stream.read(1)
    lookahead = stream.read(1)
    if x != "" and lookahead != "": stream.seek(-1, 1)
    if x == "": return __debug_exit(None, stream)

    # -- 'primary' primary
    if x == "\"": result = snap_character_string.do(stream)
    elif x == "[":
        stream.seek(-1, 1)
        result = character_set_expression.do(stream, PatternDict)
    elif x == "{": result = snap_replacement(stream, PatternDict)
    elif x == ".": result = create_ALL_BUT_NEWLINE_state_machine()
    elif x == "(": result = snap_bracketed_expression(stream, PatternDict)

    elif x.isspace():
        # a lonestanding space ends the regular expression
        stream.seek(-1, 1)
        return __debug_exit(None, stream)

    elif x in ["*", "+", "?"]:
        raise RegularExpressionException(
            "lonely operator '%s' without expression proceeding." % x)

    elif x == "\\":
        if lookahead == "C":
            stream.read(1)
            result = snap_case_folded_pattern(stream, PatternDict)

        elif lookahead == "R":
            result = get_expression_in_brackets(stream, PatternDict,
                                                "reverse operator", "R").get_inverse()

        elif lookahead == "A":
            result = get_expression_in_brackets(stream, PatternDict,
                                                "anti-pattern operator", "A")
            result.transform_to_anti_pattern()

        else:
            # Either a property set '\P{...}' or a backslashed character.
            stream.seek(-1, 1)
            trigger_set = character_set_expression.snap_property_set(stream)
            if trigger_set is None:
                stream.seek(
                    1, 1)  # snap_property_set() leaves tream right before '\\'
                char_code = snap_backslashed_character.do(stream)
                if char_code is None:
                    raise RegularExpressionException(
                        "Backslash followed by unrecognized character code.")
                trigger_set = char_code
            result = StateMachine()
            result.add_transition(result.init_state_index, trigger_set,
                                  AcceptanceF=True)

    elif x not in CONTROL_CHARACTERS:
        # NOTE: The '\' is not inside the control characters---for a reason.
        #       It is used to define for example character codes using '\x' etc.
        stream.seek(-1, 1)
        result = snap_non_control_character(stream, PatternDict)

    else:
        # NOTE: This includes the '$' sign which means 'end of line'
        #       because the '$' sign is in CONTROL_CHARACTERS, but is not checked
        #       against. Thus, it it good to leave here on '$' because the
        #       '$' sign is handled on the very top level.
        # this is not a valid primary
        stream.seek(-1, 1)
        return __debug_exit(None, stream)

    # -- optional repetition command?
    result_repeated = __snap_repetition_range(result, stream)
    if result_repeated is not None: result = result_repeated

    return __debug_exit(beautifier.do(result), stream)
def do(SM_List):
    """Intersection: Only match on patterns which are matched by all state
    machines in 'SM_List'.

    (C) 2013 Frank-Rene Schaefer
    ________________________________________________________________________

    A lexeme which matches all patterns must reach an acceptance in each
    given state machine. That is, for each state machine there is a path
    from the init state to an acceptance state triggered along by the
    characters of the lexeme. We cannot go forward, since we cannot omit a
    path upon non-fit.

    Now, consider the super-state consisting of all acceptance states of
    all state machines. There must be a way backward from the
    super-acceptance-state to the init states. As soon as a path is
    interrupted, it can be thrown away. This can be achieved by reversed
    state machines which are combined into a single one.

    Reverse all state machines; the epsilon closure of the init state
    corresponds to the super acceptance state. The transitions in the
    super-state machine correspond to the way backwards in the state
    machine. For each feasible state in the super-state machine create a
    new state. The acceptance states of the reversed state machines
    correspond to the init states of the original state machines. If the
    super state contains an acceptance state of the original state, it can
    become an acceptance state of the intersection, because we now found a
    path. The found state machine must be reversed at the end.
    """
    for sm in SM_List:
        if special.is_none(sm):          # If one state machine is '\None'
            return special.get_none()    # then, the intersection is '\None'

    reverse_sm_list          = [ reverse.do(sm) for sm in SM_List ]
    state_id_set_list        = [ set(sm.states.iterkeys()) for sm in reverse_sm_list ]
    acceptance_state_id_list = [ set(sm.get_acceptance_state_index_list()) for sm in reverse_sm_list ]

    def has_one_from_each(StateIDSet_List, StateIDSet):
        """StateIDSet_List[i] is the set of state indices from state
        machine 'i' in 'reverse_sm_list'.

        RETURNS: True  -- If the StateIDSet has at least one state
                          from every state machine.
                 False -- If there is at least one state machine
                          that has no state in 'StateIDSet'.
        """
        for state_id_set in StateIDSet_List:
            if state_id_set.isdisjoint(StateIDSet): return False
        return True

    def get_merged_state(AcceptanceStateIndexList, EpsilonClosure):
        """Create the new target state in the state machine.
        Accept only if all accept.
        """
        acceptance_f = has_one_from_each(AcceptanceStateIndexList, EpsilonClosure)
        return State(AcceptanceF=acceptance_f)

    # Plain merge of all states of all state machines with an
    # epsilon transition from the init state to all init states
    # of the reverse_sm
    sm = StateMachine()
    for rsm in reverse_sm_list:
        sm.states.update(rsm.states)
        sm.add_epsilon_transition(sm.init_state_index, rsm.init_state_index)

    initial_state_epsilon_closure = sm.get_epsilon_closure(sm.init_state_index)
    InitState = get_merged_state(acceptance_state_id_list, initial_state_epsilon_closure)
    result = StateMachine(InitStateIndex=index.get(), InitState=InitState)

    # (*) prepare the initial worklist
    worklist = [ ( result.init_state_index, initial_state_epsilon_closure) ]

    epsilon_closure_db = sm.get_epsilon_closure_db()

    while len(worklist) != 0:
        # 'start_state_index' is the index of an **existing** state in the state machine.
        # It was either created above, in StateMachine's constructor, or as a target
        # state index.
        start_state_index, start_state_combination = worklist.pop()

        # (*) compute the elementary trigger sets together with the
        #     epsilon closure of target state combinations that they trigger to.
        #     In other words: find the ranges of characters where the state triggers to
        #     a unique state combination. E.g:
        #                Range        Target State Combination
        #                [0:23]   --> [ State1, State2, State10 ]
        #                [24:60]  --> [ State1 ]
        #                [61:123] --> [ State2, State10 ]
        #
        elementary_trigger_set_infos = sm.get_elementary_trigger_sets(start_state_combination,
                                                                      epsilon_closure_db)
        ## DEBUG_print(start_state_combination, elementary_trigger_set_infos)

        # (*) loop over all elementary trigger sets
        for epsilon_closure_of_target_state_combination, trigger_set in elementary_trigger_set_infos.iteritems():
            # -- if there is no trigger to the given target state combination, then drop it
            if trigger_set.is_empty():
                continue
            elif not has_one_from_each(state_id_set_list, epsilon_closure_of_target_state_combination):
                continue

            # -- add a new target state representing the state combination
            #    (if this did not happen yet)
            target_state_index = \
                 map_state_combination_to_index(epsilon_closure_of_target_state_combination)

            # -- if target state combination was not considered yet, then create
            #    a new state in the state machine
            if not result.states.has_key(target_state_index):
                result.states[target_state_index] = get_merged_state(acceptance_state_id_list,
                                                                     epsilon_closure_of_target_state_combination)
                worklist.append((target_state_index, epsilon_closure_of_target_state_combination))

            # -- add the transition 'start state to target state'
            result.add_transition(start_state_index, trigger_set, target_state_index)

    if not result.has_acceptance_states():
        return StateMachine()
    else:
        # Undo the initial reversal: reverse back and normalize.
        return beautifier.do(reverse.do(result))
def snap_primary(stream, PatternDict):
    """Parse one 'primary' expression from 'stream'.

       primary:  " non_double_quote * "    = character string
                 [ non_rect_bracket_close ] = set of characters
                 { identifier }             = pattern replacement
                 ( expression )
                 non_control_character+     = lonely characters
                 primary repetition_cmd

    RETURNS: state machine for the primary (beautified), or None if no
             primary could be parsed (stream position restored).
    """
    __debug_entry("primary", stream)
    # Read the current character plus one lookahead character; the lookahead
    # is only consumed by the '\C' case below, otherwise it is pushed back.
    x = stream.read(1); lookahead = stream.read(1);
    if x != "" and lookahead != "": stream.seek(-1, 1)
    if x == "": return __debug_exit(None, stream)

    # -- 'primary' primary
    if x == "\"": result = snap_character_string.do(stream)
    elif x == "[":
        # character_set_expression expects to see the opening '[' itself
        stream.seek(-1, 1); result = character_set_expression.do(stream, PatternDict)
    elif x == "{": result = snap_replacement(stream, PatternDict)
    elif x == ".": result = create_ALL_BUT_NEWLINE_state_machine()
    elif x == "(": result = snap_bracketed_expression(stream, PatternDict)
    elif x.isspace():
        # a lone standing space ends the regular expression
        stream.seek(-1, 1)
        return __debug_exit(None, stream)
    elif x in ["*", "+", "?"]:
        raise RegularExpressionException("lonely operator '%s' without expression proceeding." % x)
    elif x == "\\":
        if lookahead == "C":
            stream.read(1)
            result = snap_case_folded_pattern(stream, PatternDict)
        elif lookahead == "R":
            result = get_expression_in_brackets(stream, PatternDict, "reverse operator", "R").get_inverse()
        elif lookahead == "A":
            result = get_expression_in_brackets(stream, PatternDict, "anti-pattern operator", "A")
            result.transform_to_anti_pattern()
        else:
            # neither '\C', '\R' nor '\A': property set or backslashed character
            stream.seek(-1, 1)
            trigger_set = character_set_expression.snap_property_set(stream)
            if trigger_set is None:
                stream.seek(1, 1)  # snap_property_set() leaves stream right before '\\'
                char_code = snap_backslashed_character.do(stream)
                if char_code is None:
                    raise RegularExpressionException("Backslash followed by unrecognized character code.")
                trigger_set = char_code
            result = StateMachine()
            result.add_transition(result.init_state_index, trigger_set, AcceptanceF=True)
    elif x not in CONTROL_CHARACTERS:
        # NOTE: The '\' is not inside the control characters---for a reason.
        #       It is used to define for example character codes using '\x' etc.
        stream.seek(-1, 1)
        result = snap_non_control_character(stream, PatternDict)
    else:
        # NOTE: This includes the '$' sign which means 'end of line'
        #       because the '$' sign is in CONTROL_CHARACTERS, but is not checked
        #       against. Thus, it is good to leave here on '$' because the
        #       '$' sign is handled on the very top level.
        # this is not a valid primary
        stream.seek(-1, 1)
        return __debug_exit(None, stream)

    # -- optional repetition command? (e.g. '*', '+', '?', '{n,m}')
    result_repeated = __snap_repetition_range(result, stream)
    if result_repeated is not None: result = result_repeated

    return __debug_exit(beautifier.do(result), stream)
def __parse_option(fh, new_mode):
    """Parse one mode option '<identifier: ...>' from file handle 'fh' and
    register the result in 'new_mode'.

    Handles the special options 'skip', 'skip_range', 'skip_nested_range'
    and 'indentation' by constructing the corresponding state machines and
    actions; any other option is stored as a plain (identifier, value) pair.

    RETURNS: True  -- an option was parsed and registered.
             False -- no option start was found.
    """
    def get_pattern_object(SM):
        # Ensure DFA compliance, then minimize (hopcroft) before wrapping
        # the state machine into a Pattern object.
        if not SM.is_DFA_compliant(): result = nfa_to_dfa.do(SM)
        else:                         result = SM
        result = hopcroft.do(result, CreateNewStateMachineF=False)
        return Pattern(result, AllowStateMachineTrafoF=True)

    identifier = read_option_start(fh)
    if identifier is None: return False

    verify_word_in_list(identifier, mode_option_info_db.keys(),
                        "mode option", fh.name, get_current_line_info_number(fh))

    if identifier == "skip":
        # A skipper 'eats' characters at the beginning of a pattern that belong
        # to a specified set of characters. A useful application is most probably
        # the whitespace skipper '[ \t\n]'. The skipper definition allows quex to
        # implement a very effective way to skip these regions.
        pattern_str, trigger_set = regular_expression.parse_character_set(fh, PatternStringF=True)
        skip_whitespace(fh)

        if fh.read(1) != ">":
            error_msg("missing closing '>' for mode option '%s'." % identifier, fh)

        if trigger_set.is_empty():
            # BUG FIX: the original format string had no '%s' placeholder, so
            # applying '%' raised a TypeError instead of reporting the error.
            error_msg("Empty trigger set for skipper '%s'." % identifier, fh)

        # TriggerSet skipping is implemented the following way: As soon as one element of the
        # trigger set appears, the state machine enters the 'trigger set skipper section'.
        # Enter the skipper as if the opener pattern was a normal pattern and the 'skipper' is the action.
        # NOTE: The correspondent CodeFragment for skipping is created in 'implement_skippers(...)'
        pattern_sm = StateMachine()
        pattern_sm.add_transition(pattern_sm.init_state_index, trigger_set, AcceptanceF=True)

        # Skipper code is to be generated later
        action = GeneratedCode(skip_character_set.do,
                               FileName=fh.name,
                               LineN=get_current_line_info_number(fh))
        action.data["character_set"] = trigger_set

        new_mode.add_match(pattern_str, action, get_pattern_object(pattern_sm),
                           Comment=E_SpecialPatterns.SKIP)
        return True

    elif identifier in ["skip_range", "skip_nested_range"]:
        # A non-nesting skipper can contain a full fledged regular expression as opener,
        # since it only affects the trigger. Not so the nested range skipper -- see below.

        # -- opener
        skip_whitespace(fh)
        if identifier == "skip_nested_range":
            # Nested range state machines only accept 'strings' not state machines
            opener_str, opener_sequence = __parse_string(fh, "Opener pattern for 'skip_nested_range'")
            opener_sm = StateMachine.from_sequence(opener_sequence)
        else:
            opener_str, opener_pattern = regular_expression.parse(fh)
            opener_sm = opener_pattern.sm
            # For 'range skipping' the opener sequence is not needed, only the opener state
            # machine is webbed into the pattern matching state machine.
            opener_sequence = None

        skip_whitespace(fh)

        # -- closer
        closer_str, closer_sequence = __parse_string(fh, "Closing pattern for 'skip_range' or 'skip_nested_range'")
        skip_whitespace(fh)
        if fh.read(1) != ">":
            error_msg("missing closing '>' for mode option '%s'" % identifier, fh)

        # Skipper code is to be generated later
        generator_function, comment = {
            "skip_range":        (skip_range.do,        E_SpecialPatterns.SKIP_RANGE),
            "skip_nested_range": (skip_nested_range.do, E_SpecialPatterns.SKIP_NESTED_RANGE),
        }[identifier]
        action = GeneratedCode(generator_function,
                               FileName=fh.name,
                               LineN=get_current_line_info_number(fh))
        action.data["opener_sequence"] = opener_sequence
        action.data["closer_sequence"] = closer_sequence
        action.data["mode_name"]       = new_mode.name

        new_mode.add_match(opener_str, action, get_pattern_object(opener_sm), Comment=comment)
        return True

    elif identifier == "indentation":
        value = indentation_setup.do(fh)

        # Enter 'Newline' and 'Suppressed Newline' as matches into the engine.
        # Similar to skippers, the indentation count is then triggered by the newline.
        # -- Suppressed Newline = Suppressor followed by Newline,
        #    then newline does not trigger indentation counting.
        suppressed_newline_pattern_str = ""
        if value.newline_suppressor_state_machine.get() is not None:
            suppressed_newline_pattern_str = \
                  "(" + value.newline_suppressor_state_machine.pattern_string() + ")" \
                + "(" + value.newline_state_machine.pattern_string() + ")"

            suppressed_newline_sm = \
                sequentialize.do([value.newline_suppressor_state_machine.get(),
                                  value.newline_state_machine.get()])

            FileName = value.newline_suppressor_state_machine.file_name
            LineN    = value.newline_suppressor_state_machine.line_n
            # Go back to start.
            code = UserCodeFragment("goto %s;" % get_label("$start", U=True), FileName, LineN)

            new_mode.add_match(suppressed_newline_pattern_str, code,
                               get_pattern_object(suppressed_newline_sm),
                               Comment=E_SpecialPatterns.SUPPRESSED_INDENTATION_NEWLINE)

        # When there is an empty line, then there shall be no indentation count on it.
        # Here comes the trick:
        #
        #      Let  newline
        #      be defined as:  newline ([space]* newline])*
        #
        # This way empty lines are eaten away before the indentation count is activated.

        # -- 'space'
        x0 = StateMachine()
        x0.add_transition(x0.init_state_index, value.indentation_count_character_set(),
                          AcceptanceF=True)
        # -- '[space]*'
        x1 = repeat.do(x0)
        # -- '[space]* newline'
        x2 = sequentialize.do([x1, value.newline_state_machine.get()])
        # -- '([space]* newline)*'
        x3 = repeat.do(x2)
        # -- 'newline ([space]* newline)*'
        x4 = sequentialize.do([value.newline_state_machine.get(), x3])
        # -- nfa to dfa; hopcroft optimization
        sm = beautifier.do(x4)

        FileName = value.newline_state_machine.file_name
        LineN    = value.newline_state_machine.line_n
        action   = GeneratedCode(indentation_counter.do, FileName, LineN)
        action.data["indentation_setup"] = value

        new_mode.add_match(value.newline_state_machine.pattern_string(), action,
                           get_pattern_object(sm),
                           Comment=E_SpecialPatterns.INDENTATION_NEWLINE)

        # Announce the mode to which the setup belongs
        value.set_containing_mode_name(new_mode.name)
    else:
        value = read_option_value(fh)

    # The 'verify_word_in_list()' call must have ensured that the following holds
    assert mode_option_info_db.has_key(identifier)

    # Is the option of the appropriate value?
    option_info = mode_option_info_db[identifier]
    if option_info.domain is not None and value not in option_info.domain:
        error_msg("Tried to set value '%s' for option '%s'. " % (value, identifier) + \
                  "Though, possible for this option are only: %s." % repr(option_info.domain)[1:-1], fh)

    # Finally, set the option
    new_mode.add_option(identifier, value)

    return True
def snap_primary(stream, PatternDict):
    """Parse one 'primary' expression from 'stream'.

       primary:  " non_double_quote * "    = character string
                 [ non_rect_bracket_close ] = set of characters
                 { identifier }             = pattern replacement
                 ( expression )
                 non_control_character+     = lonely characters
                 primary repetition_cmd

    RETURNS: beautified state machine for the primary, or None if no primary
             could be parsed (stream position restored in that case).
    """
    global SPECIAL_TERMINATOR
    __debug_entry("primary", stream)

    first = stream.read(1)
    if first == "":
        return __debug_exit(None, stream)

    # Dispatch on the introducing character of the primary.
    if first == "\"":
        sm = snap_character_string.do(stream)
    elif first == "[":
        # the character set parser expects to see the '[' itself
        stream.seek(-1, 1)
        sm = snap_character_set_expression(stream, PatternDict)
    elif first == "{":
        sm = snap_replacement(stream, PatternDict)
    elif first == ".":
        sm = create_ALL_BUT_NEWLINE_state_machine(stream)
    elif first == "(":
        sm = snap_bracketed_expression(stream, PatternDict)
    elif first.isspace():
        # a free-standing whitespace character terminates the regular expression
        stream.seek(-1, 1)
        return __debug_exit(None, stream)
    elif first in ("*", "+", "?"):
        raise RegularExpressionException("lonely operator '%s' without expression proceeding." % first)
    elif first == "\\":
        sm = snap_command(stream, PatternDict)
        if sm is None:
            # Not a command: try a property set, then a backslashed character.
            stream.seek(-1, 1)
            char_set = snap_property_set(stream)
            if char_set is None:
                stream.read(1)  # snap the '\'
                code_point = snap_backslashed_character.do(stream)
                if code_point is None:
                    raise RegularExpressionException("Backslash followed by unrecognized character code.")
                char_set = code_point
            sm = StateMachine()
            sm.add_transition(sm.init_state_index, char_set, AcceptanceF=True)
    elif first not in CONTROL_CHARACTERS and first != SPECIAL_TERMINATOR:
        # NOTE: The '\' is deliberately NOT part of the control characters:
        #       it introduces character codes such as '\x..' etc.
        stream.seek(-1, 1)
        sm = snap_non_control_character(stream, PatternDict)
    else:
        # NOTE: This branch also catches '$' ('end of line'), which sits in
        #       CONTROL_CHARACTERS but is not tested above; it is handled on
        #       the very top level, so leaving here on '$' is correct.
        # not a valid primary
        stream.seek(-1, 1)
        return __debug_exit(None, stream)

    # -- optional repetition command?
    repeated = __snap_repetition_range(sm, stream)
    if repeated is not None:
        sm = repeated

    # There's something going wrong with pseudo-ambiguous post context
    # if we do not clean-up here. TODO: Investigate why?
    # See tests in generator/TEST directory.
    return __debug_exit(beautifier.do(sm), stream)
# Test for each 'sm' in 'sm_list' is superfluous. # It is done in 'AppendixNoI'. test(ci_list, sm_list) elif "Split" in sys.argv: # A first transition of a state machine is separated into two, because # it is covered by more than one different count action. NS1 = NumberSet.from_range(0x10, 0x20) NS2 = NumberSet.from_range(0x20, 0x30) NS3 = NumberSet.from_range(0x30, 0x40) NS4 = NumberSet.from_range(0x40, 0x50) ci_list = [ CountInfo(dial_db.new_incidence_id(), NS1, CountAction(E_CharacterCountType.COLUMN, 1)), CountInfo(dial_db.new_incidence_id(), NS2, CountAction(E_CharacterCountType.COLUMN, 2)), CountInfo(dial_db.new_incidence_id(), NS3, CountAction(E_CharacterCountType.COLUMN, 3)), CountInfo(dial_db.new_incidence_id(), NS4, CountAction(E_CharacterCountType.COLUMN, 4)) ] sm = StateMachine() si = sm.init_state_index iid = dial_db.new_incidence_id() ti0 = sm.add_transition(si, NumberSet.from_range(0x1A, 0x4B)) ac0 = sm.add_transition(ti0, NS_A, AcceptanceF=True) test(ci_list, [sm])