def get_setup(L0, L1, FSM0, FSM1, FSM2):
    """Build a test fixture: one count-info list over the lexatom range
    [L0, L1) plus three small state machines triggered by FSM0..FSM2.

    SPECIALITIES: -- sm0 and sm1 have an intersection between their second
                     transition.
                  -- sm1 transits further upon acceptance.
                  -- sm2 has only one transition.

    RETURNS: (ci_list, [sm0, sm1, sm2])
    """
    # Column-count action for the whole loop character range.
    ci_list = [
        CountInfo(dial_db.new_incidence_id(), NumberSet.from_range(L0, L1),
                  CountAction(E_CharacterCountType.COLUMN, 0)),
    ]

    # Generate State Machine that does not have any intersection with
    # the loop transitions.
    sm0 = StateMachine()
    si = sm0.add_transition(sm0.init_state_index, FSM0)
    si = sm0.add_transition(si, NS_A, AcceptanceF=True)
    sm0.states[si].mark_acceptance_id(dial_db.new_incidence_id())

    # sm1: accepts after FSM1 + NS_A; NS_B loops back towards 'si0', so the
    # machine may transit further even after acceptance.
    sm1 = StateMachine()
    si0 = sm1.add_transition(sm1.init_state_index, FSM1)
    si = sm1.add_transition(si0, NS_A, AcceptanceF=True)
    iid1 = dial_db.new_incidence_id()
    sm1.states[si].mark_acceptance_id(iid1)
    si = sm1.add_transition(si, NS_B, si0)
    # Both acceptance states of sm1 share the same incidence id.
    sm1.states[si].mark_acceptance_id(iid1)

    # sm2: a single transition straight into acceptance.
    sm2 = StateMachine()
    si = sm2.add_transition(sm2.init_state_index, FSM2, AcceptanceF=True)
    sm2.states[si].mark_acceptance_id(dial_db.new_incidence_id())

    return ci_list, [sm0, sm1, sm2]
def test_on_UCS_range(Trafo, Source, Drain, CharacterBackwardTrafo):
    """Check a character-encoding transformation over a whole code point range.

    Builds a state machine with one acceptance branch per source character in
    [Source.begin, Source.end), transforms it with 'Trafo', and then verifies
    for every drain character in [Drain.begin, Drain.end) that the transformed
    machine lands in the state whose acceptance id belongs to the back-
    translated source character.
    """
    sm = StateMachine()
    acc_db = {}
    for x in range(Source.begin, Source.end):
        ti = sm.add_transition(sm.init_state_index, x, AcceptanceF=True)
        # Each source character gets its own acceptance id (its rank).
        acc_id = len(acc_db)
        sm.states[ti].mark_acceptance_id(acc_id)
        acc_db[x] = acc_id

    # 'None' entry: expected outcome for characters with no valid translation.
    if Setup.bad_lexatom_detection_f:
        acc_db[None] = E_IncidenceIDs.BAD_LEXATOM
    else:
        acc_db[None] = None

    state_n_before, result = transform(Trafo, sm)
    # assert state_n_before == len(result.states)

    init_state = result.get_init_state()
    count = 0
    for y in range(Drain.begin, Drain.end):
        # Translate the drain character 'y' back into its source character 'x'.
        x = CharacterBackwardTrafo(y)
        # Transit on the translated character
        ti = init_state.target_map.get_resulting_target_state_index(y)
        # Compare resulting state with the expected state's acceptance
        # NOTE(review): 'sm.states' is the pre-transform machine while 'ti'
        # stems from 'result' -- confirm 'assert_only_acceptance_id' expects
        # this pairing.
        assert_only_acceptance_id(sm.states, ti, acc_db, x, y)
        count += 1

    print "<terminated: %i transitions ok>" % count
def do(sh):
    """Convert a unicode string (read from stream 'sh') into a state machine
    that parses its letters sequentially.  Each state in the sequence
    corresponds to the successful triggering of a letter.  Only the last
    state is an acceptance state; bailing out earlier means 'not accepted'.

    Example: "hey" is translated into the state machine:

           (0)-- 'h' -->(1)-- 'e' -->(2)-- 'y' --> ACCEPTANCE
            |            |            |
           FAIL         FAIL         FAIL

    Note: State indices are globally unique, not necessarily 0, 1, 2, ...
    """
    assert sh.__class__.__name__ in ("StringIO", "file")

    # Only \" is a special character '"'; any other backslashed character
    # remains as the sequence 'backslash' + character.
    sm = StateMachine()
    tip = sm.init_state_index
    # Chain one transition per character code: tip --code--> new tip.
    for code in get_character_code_sequence(sh):
        tip = sm.add_transition(tip, code)

    # The final state of the chain is the only acceptance state.
    sm.states[tip].set_acceptance()
    return sm
def do(the_state_machine_list, LeaveIntermediateAcceptanceStatesF=False,
       MountToFirstStateMachineF=False, CloneRemainingStateMachinesF=True):
    """Creates a state machine connecting all state machines in the array
    'state_machine_list'. When the flag 'LeaveIntermediateAcceptanceStatesF'
    is given as True, the connection points between the state machines will
    remain acceptances states. In any other case (e.g. the normal
    sequentialization) the connection points leave there acceptance status
    and only the last state machine in the list keeps its acceptance states.

    If MountToFirstStateMachineF is set, then the first state machine will
    contain the result of the concatination.
    """
    assert len(the_state_machine_list) != 0

    for sm in the_state_machine_list:  # DEBUG
        sm.assert_consistency()  # DEBUG

    # state machines with no states can be deleted from the list. they do not do anything
    # and do not introduce triggers.
    state_machine_list = filter(lambda sm: not sm.is_empty(),
                                the_state_machine_list)

    # Fewer than two non-empty machines: nothing to concatenate.
    if len(state_machine_list) < 2:
        if len(state_machine_list) < 1:
            return StateMachine()
        else:
            return state_machine_list[0]

    # (*) collect all transitions from both state machines into a single one
    #     (clone to ensure unique identifiers of states)
    result = state_machine_list[0]
    if not MountToFirstStateMachineF:
        result = result.clone()
    # (*) need to clone the state machines, i.e. provide their internal
    #     states with new ids, but the 'behavior' remains. This allows
    #     state machines to appear twice, or being used in 'larger'
    #     conglomerates.
    appended_sm_list = state_machine_list[1:]
    if CloneRemainingStateMachinesF:
        appended_sm_list = map(lambda sm: sm.clone(), appended_sm_list)

    # (*) all but last state machine enter the subsequent one, in case of SUCCESS
    #     NOTE: The start index is unique. Therefore, one can assume that each
    #           appended_sm_list's '.states' dictionary has different keys. One can simply
    #           take over all transitions of a start index into the result without
    #           considering interferences (see below)
    for appendix in appended_sm_list:
        appendix.assert_consistency()  # DEBUG
        # Mount on every acceptance state the initial state of the following state
        # machine via epsilon transition.
        result.mount_to_acceptance_states(
            appendix.init_state_index,
            CancelStartAcceptanceStateF=not LeaveIntermediateAcceptanceStatesF)
        for state_index, state in appendix.states.items():
            result.states[state_index] = state  # state is already cloned (if desired), so no deepcopy here

    # (*) double check for consistency (each target state is contained in state machine)
    result.assert_consistency()  # DEBUG

    return result
def seal(self):
    """Finalize the indentation setup: install defaults where the user gave
    no explicit specification.

    -- If neither spaces nor grids were specified, fall back to ' ' (count 1)
       and '\\t' (grid 4), unless those characters are marked as 'bad'.
    -- If no newline pattern was specified, install '(\\r\\n)|(\\n)'.
    """
    if len(self.space_db) == 0 and len(self.grid_db) == 0:
        default_space = ord(' ')
        default_tab = ord('\t')
        bad = self.bad_character_set
        # Only install a default if it is not a forbidden character.
        if bad.get().contains(default_space) == False:
            self.specify_space("[ ]", NumberSet(default_space), 1, self.fh)
        if bad.get().contains(default_tab) == False:
            self.specify_grid("[\\t]", NumberSet(default_tab), 4, self.fh)

        # Both defaults were rejected as 'bad' => no way to count indentation.
        if len(self.space_db) == 0 and len(self.grid_db) == 0:
            error_msg("No space or grid defined for indentation counting. Default\n"
                      "values ' ' and '\\t' could not be used since they are specified as 'bad'.",
                      bad.file_name, bad.line_n)

    if self.newline_state_machine.get() is None:
        # Default newline machine: '\n' accepts directly; '\r' must be
        # followed by '\n' to reach the same acceptance state.
        sm = StateMachine()
        end_idx = sm.add_transition(sm.init_state_index, NumberSet(ord('\n')), AcceptanceF=True)
        mid_idx = sm.add_transition(sm.init_state_index, NumberSet(ord('\r')), AcceptanceF=False)
        sm.add_transition(mid_idx, NumberSet(ord('\n')), end_idx, AcceptanceF=False)
        self.specify_newline("(\\r\\n)|(\\n)", sm, self.fh)
def __init__(self, SM_A, SM_B, StartingSM=None):
    """Tree walker over the product of two state machines.

    SM_A       -- the 'original' state machine.
    SM_B       -- the 'admissible' state machine.
    StartingSM -- optional pre-built result machine; if None, a fresh one
                  is created whose init state represents the combination
                  (SM_A.init, SM_B.init).
    """
    self.original = SM_A
    self.admissible = SM_B

    if StartingSM is None:
        # Init state index is derived deterministically from the pair of
        # init state indices of both machines.
        self.result = StateMachine(
            InitStateIndex=index.map_state_combination_to_index(
                (SM_A.init_state_index, SM_B.init_state_index)),
            InitState=self.get_state_core(SM_A.init_state_index,
                                          SM_B.init_state_index))
    else:
        self.result = StartingSM

    # TODO: Think if 'state_db' cannot be replaced by 'result'
    self.state_db = {}

    # Current walk path through the combined machine.
    self.path = []

    # Use 'operation_index' to get a unique index that allows to indicate
    # that 'SM_B' is no longer involved. Also, it ensures that the
    # generated state indices from (a_state_index, operation_index) are
    # unique.
    self.operation_index = index.get()

    TreeWalker.__init__(self)
def test(TestString):
    """Parse 'TestString' as a character-set expression and print the
    resulting state machine; print the exception if parsing fails.

    The trailing "]" closes the set expression for 'character_set.do'.
    """
    print "expression = \"" + TestString + "\""
    sm = StateMachine()
    try:
        trigger_set = character_set.do(StringIO(TestString + "]"))
        sm.add_transition(sm.init_state_index, trigger_set, AcceptanceF=True)
        print "state machine\n", sm
    except RegularExpressionException, x:
        print repr(x)
def __init__(self, Name):
    """Wrap a Unicode script property set.

    Name -- script name; its character set is derived from the regular
            expression '[:\\P{Script=Name}:]'.
    """
    sh = StringIO("[:\\P{Script=%s}:]" % Name)
    self.name = Name
    self.charset = regex.snap_set_expression(sh, {})
    # One-transition machine: init --charset--> acceptance.
    self.sm = StateMachine()
    self.sm.add_transition(self.sm.init_state_index, self.charset, AcceptanceF=True)
    self.id = self.sm.get_id()
def snap_non_control_character(stream, PatternDict): __debug_entry("non-control characters", stream) # (*) read first character char_code = utf8.__read_one_utf8_code_from_stream(stream) if char_code is None: error_msg("Character could not be interpreted as UTF8 code or End of File reached prematurely.", stream) result = StateMachine() result.add_transition(result.init_state_index, char_code, AcceptanceF=True) return __debug_exit(result, stream)
def generate_sm_for_boarders(Boarders, Trafo):
    """Build a state machine with one accepting branch per character in
    'Boarders', then run it through the encoding transformation 'Trafo'.

    RETURNS: the transformed state machine.  Asserts that the
    transformation verdict is positive.
    """
    sm = StateMachine()
    for ucs_char in Boarders:
        target_idx = index.get()
        # Draw a line from init to a fresh target state on 'ucs_char'.
        sms.line(sm, sm.init_state_index, (ucs_char, target_idx),
                 (ucs_char, target_idx))
        sm.states[target_idx].set_acceptance()
    # '-1' => no artificial range restriction -- TODO confirm semantics.
    Trafo.adapt_source_and_drain_range(-1)
    verdict_f, result = Trafo.do_state_machine(sm, beautifier)
    assert verdict_f
    return result
def create_state_machine(SM, Result, Class_StateMachine, Class_State):
    """Construct the minimized state machine from 'Result.state_set_list',
    where each state set of equivalent states collapses into one new state.
    """
    # If all states are of size one, this means, that there were no states that
    # could have been combined. In this case a simple copy of the original
    # state machine will do.
    if len(filter(lambda state_set: len(state_set) != 1,
                  Result.state_set_list)) == 0:
        return SM.clone()

    # Define a mapping from the state set to a new target state index
    #
    # map:  state_set_index  --->  index of the state that represents it
    #
    map_new_state_index = dict([(i, state_machine_index.get())
                                for i in xrange(len(Result.state_set_list))])

    # The state set that contains the initial state becomes the initial state of
    # the new state machine.
    state_set_containing_initial_state_i = Result.map[SM.init_state_index]
    new_init_state_index = map_new_state_index[state_set_containing_initial_state_i]

    result = StateMachine(new_init_state_index)

    # Ensure that each target state index has a state inside the state machine
    # Build up the state machine out of the state sets
    for state_set_idx, state_set in enumerate(Result.state_set_list):
        new_state_index = map_new_state_index[state_set_idx]
        # Merge all core information of the states inside the state set.
        # If one state set contains an acceptance state, then the result is 'acceptance'.
        # (Note: The initial split separates acceptance states from those that are not
        #  acceptance states. There can be no state set containing acceptance and
        #  non-acceptance states)
        # (Note, that the prototype's info has not been included yet, consider whole set)
        result.states[new_state_index] = Class_State.new_merged_core_state(
            SM.states[i] for i in state_set)

    for state_set_idx, state_set in enumerate(Result.state_set_list):
        # The prototype: States in one set behave all equivalent with respect to target state sets
        # thus only one state from the start set has to be considered.
        prototype = SM.states[state_set[0]]
        representative = result.states[map_new_state_index[state_set_idx]]

        # The representative must have all transitions that the prototype has
        for target_state_index, trigger_set in prototype.target_map.get_map().iteritems():
            # Redirect the transition to the representative of the target's set.
            target_state_set_index = Result.map[target_state_index]
            target_index = map_new_state_index[target_state_set_index]
            representative.add_transition(trigger_set, target_index)

    return result
def create_ALL_BUT_NEWLINE_state_machine(stream):
    """Return a one-transition state machine for '.' = 'all but newline',
    restricted to the current buffer codec's source set.

    Reports an error (via 'error_msg') if the codec admits nothing but
    newline, i.e. the '.' set would be empty.
    """
    global Setup
    result = StateMachine()
    # NOTE: Buffer control characters are supposed to be filtered out by the code
    #       generator.
    trigger_set = NumberSet(Interval(ord("\n"))).get_complement(Setup.buffer_codec.source_set)
    if trigger_set.is_empty():
        error_msg("The set of admissible characters contains only newline.\n"
                  "The '.' for 'all but newline' is an empty set.",
                  SourceRef.from_FileHandle(stream))

    result.add_transition(result.init_state_index, trigger_set, AcceptanceF=True)
    return result
def get_any():
    """RETURNS: A state machine that 'eats' any character, but only one.

               (0)--- \Any --->(( 0 ))
    """
    # The full lexatom range counts as 'any character'.
    any_char = NumberSet(Interval(-sys.maxint, sys.maxint))
    sm = StateMachine()
    sm.add_transition(sm.init_state_index, any_char, AcceptanceF=True)
    return sm
def do(SM):
    """Creates a state machine that matches the reverse of what 'SM' matches.
    """
    result = StateMachine(InitStateIndex=SM.init_state_index)
    original_acceptance_state_index_list = SM.get_acceptance_state_index_list()

    if len(original_acceptance_state_index_list) == 0:
        # If there is no acceptance state in a state machine, the state machine
        # cannot match any pattern, it is equivalent to '\None'. The reverse
        # of \None is \None.
        return special.get_none()

    # Ensure that each target state index has a state inside the state machine
    for state_index in SM.states.keys():
        result.create_new_state(StateIdx=state_index)

    # Flip the direction of every transition: A --set--> B becomes
    # B --set--> A (same for epsilon transitions).
    for state_index, state in SM.states.items():
        for target_state_index, trigger_set in state.target_map.get_map().items():
            result.states[target_state_index].add_transition(trigger_set.clone(),
                                                             state_index)

        for target_state_index in state.target_map.get_epsilon_target_state_index_list():
            result.states[target_state_index].target_map.add_epsilon_target_state(state_index)

    # -- copy all origins of the original state machine
    # -- We need to cancel any acceptance, because the inverted engine now starts
    #    from a combination of the acceptance states and ends at the initial state.
    for state_index, state in SM.states.items():
        original_origin_list = [origin.clone() for origin in state.origins()]
        for origin in original_origin_list:
            origin.set_input_position_restore_f(False)
            origin.set_pre_context_id(E_PreContextIDs.NONE)
            origin.set_acceptance_f(False)
        result.states[state_index].origins().set(original_origin_list)  # deepcopy implicit

    # -- only the ORIGINAL initial state becomes an acceptance state (end of inverse)
    result.states[SM.init_state_index].set_acceptance(True)

    # -- setup an epsilon transition from an new init state to all previous
    #    acceptance states.
    new_init_state_index = result.create_new_init_state()
    for state_index in original_acceptance_state_index_list:
        result.add_epsilon_transition(new_init_state_index, state_index)

    # -- for uniqueness of state ids, clone the result
    return result.clone()
def __prepare_incidence_id_map(self, IncidenceIdMap):
    """Build a state machine with one acceptance branch per entry of
    'IncidenceIdMap' (pairs of character set and incidence id).  Each
    non-empty character set triggers from the init state into its own
    acceptance state, which is marked as origin of its incidence id.
    """
    sm = StateMachine()
    for character_set, incidence_id in IncidenceIdMap:
        # An empty character set cannot trigger -- nothing to mount.
        if character_set.is_empty():
            continue
        branch_si = sm.add_transition(sm.init_state_index, character_set)
        branch = sm.states[branch_si]
        branch.mark_self_as_origin(incidence_id, branch_si)
        branch.set_acceptance(True)
    return sm
def test_plug_sequence(ByteSequenceDB): L = len(ByteSequenceDB[0]) for seq in ByteSequenceDB: assert len(seq) == L for x in seq: assert isinstance(x, Interval) first_different_byte_index = -1 for i in range(L): x0 = ByteSequenceDB[0][i] for seq in ByteSequenceDB[1:]: if not seq[i].is_equal(x0): first_different_byte_index = i break if first_different_byte_index != -1: break if first_different_byte_index == -1: first_different_byte_index = 0 print "# Best To be Displayed by:" print "#" print "# > " + sys.argv[0] + " " + sys.argv[1] + " | dot -Tsvg -o tmp.svg" print "#" print "# -------------------------" print "# Byte Sequences: " i = -1 for seq in ByteSequenceDB: i += 1 print "# (%i) " % i, for x in seq: print " " + x.get_string(Option="hex"), print print "# L = %i" % L print "# DIdx = %i" % first_different_byte_index sm = StateMachine() end_index = state_machine.index.get() sm.states[end_index] = State() trafo = EncodingTrafoUTF8() Setup.buffer_codec_set(trafo, 1) trafo._plug_interval_sequences(sm, sm.init_state_index, end_index, ByteSequenceDB, beautifier) if len(sm.get_orphaned_state_index_list()) != 0: print "Error: Orphaned States Detected!" show_graphviz(sm)
def create_ALL_BUT_NEWLINE_state_machine():
    """Return a one-transition state machine for '.' = 'all but newline'.

    Variant without a 'stream' argument: the trigger set is the inverse of
    '\\n', clipped to the configured character value limit if one is set.
    """
    global Setup
    result = StateMachine()
    # NOTE: Buffer control characters are supposed to be filtered out by the code
    #       generator.
    trigger_set = NumberSet(Interval(ord("\n")).inverse())

    if Setup.get_character_value_limit() != sys.maxint:
        # Restrict to the admissible character range.
        trigger_set.intersect_with(Interval(0, Setup.get_character_value_limit()))

    result.add_transition(result.init_state_index, trigger_set, AcceptanceF=True)
    return result
def do(stream, PatternDict):
    """Parse a character-set expression from 'stream' and return a state
    machine that accepts exactly one character of that set.

    RAISES: RegularExpressionException if the stream does not start a set
    expression or the resulting set is empty.
    """
    trigger_set = snap_set_expression(stream, PatternDict)

    if trigger_set is None:
        raise RegularExpressionException("Regular Expression: character_set_expression called for something\n" + \
                                         "that does not start with '[:', '[' or '\\P'")
    if trigger_set.is_empty():
        raise RegularExpressionException(
            "Regular Expression: Character set expression results in empty set.")

    # Create state machine that triggers with the trigger set to SUCCESS
    # NOTE: The default for the ELSE transition is FAIL.
    sm = StateMachine()
    sm.add_transition(sm.init_state_index, trigger_set, AcceptanceF=True)

    return __debug_exit(sm, stream)
def snap_character_set_expression(stream, PatternDict):
    """Parse a character-set expression and return a one-transition state
    machine accepting one character of the set.

    GRAMMAR:

       set_expression:
                      [: set_term :]     # traditional character set
                      \P '{' propperty string '}'
                      '{' identifier '}'

       set_term:
                      "alnum" | "alpha" | "blank" | "cntrl" | "digit"
                      "graph" | "lower" | "print" | "punct" | "space"
                      "upper" | "xdigit"
                      "union"        '(' set_term [ ',' set_term ]+ ')'
                      "intersection" '(' set_term [ ',' set_term ]+ ')'
                      "difference"   '(' set_term [ ',' set_term ]+ ')'
                      "inverse"      '(' set_term ')'
                      set_expression
    """
    trigger_set = snap_set_expression(stream, PatternDict)

    if trigger_set is None:
        error.log("Regular Expression: snap_character_set_expression called for something\n" + \
                  "that does not start with '[:', '[' or '\\P'", stream)
    elif trigger_set.is_empty():
        # Empty set is only a warning here (unlike the raising variant).
        error.warning("Regular Expression: Character set expression results in empty set.",
                      stream)

    # Create state machine that triggers with the trigger set to SUCCESS
    # NOTE: The default for the ELSE transition is FAIL.
    sm = StateMachine()
    sm.add_transition(sm.init_state_index, trigger_set, AcceptanceF=True)

    return __debug_exit(sm, stream)
def get_sm_shape_by_name(Name): sm = StateMachine(InitStateIndex=0L) if Name == "linear": sm, state_n, pic = get_linear(sm) elif Name == "butterfly": sm, state_n, pic = get_butterfly(sm) elif Name == "long_loop": sm, state_n, pic = get_long_loop(sm) elif Name == "nested_loop": sm, state_n, pic = get_nested_loop(sm) elif Name == "mini_loop": sm, state_n, pic = get_mini_loop(sm) elif Name == "fork": sm, state_n, pic = get_fork(sm) elif Name == "fork2": sm, state_n, pic = get_fork2(sm) elif Name == "fork3": sm, state_n, pic = get_fork3(sm) elif Name == "fork4": sm, state_n, pic = get_fork4(sm) elif Name == "mini_bubble": sm, state_n, pic = get_mini_bubble(sm) elif Name == "bubble": sm, state_n, pic = get_bubble(sm) elif Name == "bubble2": sm, state_n, pic = get_bubble2(sm) elif Name == "bubble2b": sm, state_n, pic = get_bubble2b(sm) elif Name == "bubble3": sm, state_n, pic = get_bubble3(sm) elif Name == "bubble4": sm, state_n, pic = get_bubble4(sm) else: sm, state_n, pic = get_tree(sm) return sm, state_n, pic
def get_all():
    """RETURNS: A state machine that 'eats' absolutely everything, i.e.

                 .--- \Any ---.
                 |            |
        (0)--- \Any --->(( 0 ))<--------'
    """
    sm = StateMachine()
    acc_si = index.get()
    # The acceptance state loops back into itself on any character.
    acc_state = State(AcceptanceF=True)
    acc_state.add_transition(NumberSet_All(), acc_si)
    sm.states[acc_si] = acc_state
    # The init state enters the acceptance state on any character.
    sm.get_init_state().add_transition(NumberSet_All(), acc_si)
    return sm
def __init__(self, SM_A, SM_B, result=None):
    """Tree walker over the product of two state machines.

    SM_A   -- the 'original' state machine.
    SM_B   -- the 'admissible' state machine.
    result -- optional pre-built result machine; if None, a fresh one is
              created whose init state index encodes the combination
              (SM_A.init, SM_B.init).
    """
    self.original = SM_A
    self.admissible = SM_B

    if result is None:
        init_state_index = index.map_state_combination_to_index(
            (SM_A.init_state_index, SM_B.init_state_index))
        # NOTE(review): here 'get_state_core' takes only SM_A's index --
        # unlike the sibling __init__ that passes both; confirm intended.
        state = self.get_state_core(SM_A.init_state_index)
        self.result = StateMachine(InitStateIndex=init_state_index,
                                   InitState=state)
    else:
        self.result = result

    # Current walk path through the combined machine.
    self.path = []

    # Use 'operation_index' to get a unique index that allows to indicate
    # that 'SM_B' is no longer involved. Also, it ensures that the
    # generated state indices from (a_state_index, operation_index) are
    # unique.
    self.operation_index = index.get()

    TreeWalker.__init__(self)
def StateMachine_Newline():
    """Creates a state machine matching newline according to what has been
    specified in the setup (Setup.dos_carriage_return_newline_f). That is,
    if is DOS newline then the state machine represents '\\r\\n' and if it
    is unix only, then it represents '\\n'. If both is required they are
    implemented in parallel.

    RETURNS: StateMachine
    """
    UnixF = True
    DosF = Setup.dos_carriage_return_newline_f

    NL = ord('\n')  # (pure) newline, i.e. line feed
    CR = ord('\r')  # carriage return

    sm = StateMachine()
    # '\n' alone accepts (unix newline is always enabled).
    if UnixF:
        sm.add_transition(sm.init_state_index, NL, AcceptanceF=True)
    # '\r' must be followed by '\n' (DOS newline), mounted in parallel.
    if DosF:
        idx = sm.add_transition(sm.init_state_index, CR, AcceptanceF=False)
        sm.add_transition(idx, NL, AcceptanceF=True)

    # nfa-to-dfa + hopcroft minimization before returning.
    return beautifier.do(sm)
def setup(EntryN, StateOperation):
    """Test fixture: build an examiner over a one-operation state machine
    and a predecessor recipe with a fixed acceptance/position scheme.
    (NOTE(review): 'EntryN' is not used in this visible part -- confirm
    against the remainder of the function/file.)
    """
    sm = StateMachine()
    examiner = Examiner(sm, RecipeAcceptance)

    si = 1111L
    setup_state_operation(sm, StateOperation, si)
    operation = sm.states[si].single_entry

    examiner.linear_db[sm.init_state_index] = LinearStateInfo()

    predecessor0_recipe = RecipeAcceptance(
        [SeAccept(0)],
        {
            E_IncidenceIDs.CONTEXT_FREE_MATCH: 0,
            10L: -1,                # same for both / no restore
            11L: -2,                # unequal for both
            12L: E_Values.RESTORE,  # same for both / restore same
            13L: E_Values.RESTORE,  # same for both / restore differs
            21L: 0,                 # no present in other
        },
        {
            (E_R.PositionRegister, 12L): 0,
            (E_R.PositionRegister, 13L): 0
        })
# Print each command of the state entry; acceptance commands are marked '*'.
print
for cmd in si.single_entry:
    acceptance_mark = " "
    if cmd.is_acceptance(): acceptance_mark = "*"
    print acceptance_mark + repr(cmd)
print "---------------------------------------------------------------------"

#----------------------------------------------------------------------------------------
# (*) setup the state machine origins
#
# -- the function 'filter dominated origins searches for the original acceptance
#    in the state machine => need to create to dummy state machines
# NOTE(review): the constructed machines are not bound to names -- presumably
# the StateMachine constructor registers the init state index globally as a
# side effect; confirm.
StateMachine(InitStateIndex=0L, AcceptanceF=True)
StateMachine(InitStateIndex=1L, AcceptanceF=True)
StateMachine(InitStateIndex=2L, AcceptanceF=True)
StateMachine(InitStateIndex=3L, AcceptanceF=True)
StateMachine(InitStateIndex=4L, AcceptanceF=True)
StateMachine(InitStateIndex=5L, AcceptanceF=True)
StateMachine(InitStateIndex=6L, AcceptanceF=True)
StateMachine(InitStateIndex=100L, AcceptanceF=False)
StateMachine(InitStateIndex=101L, AcceptanceF=False)
StateMachine(InitStateIndex=102L, AcceptanceF=False)
StateMachine(InitStateIndex=103L, AcceptanceF=False)
StateMachine(InitStateIndex=104L, AcceptanceF=False)
StateMachine(InitStateIndex=105L, AcceptanceF=False)
StateMachine(InitStateIndex=106L, AcceptanceF=False)

# (*) add priviledges
# Acceptance schemes of increasing complexity for the interference test:
# restore-only, plain accept, accept with restore, multi pre-context accept.
acceptance_scheme_0 = [ RecipeAcceptance.RestoreAcceptance ]
acceptance_scheme_1 = [ SeAccept(1111L, E_PreContextIDs.NONE, False) ]
acceptance_scheme_2 = [ SeAccept(2222L, E_PreContextIDs.NONE, True) ]
acceptance_scheme_3 = [
    SeAccept(3333L, 33L, True),
    SeAccept(4444L, 44L, True),
    SeAccept(5555L, E_PreContextIDs.NONE, True)
]

examiner = Examiner(StateMachine(), RecipeAcceptance)
# For the test, only 'examiner.mouth_db' and 'examiner.recipe_type'
# are important.
examiner.mouth_db[1L] = get_MouthStateInfoSnapshotMap(entry_n, acceptance_scheme_0, ip_offset_scheme_0)
examiner.mouth_db[2L] = get_MouthStateInfoSnapshotMap(entry_n, acceptance_scheme_1, ip_offset_scheme_1)
examiner.mouth_db[3L] = get_MouthStateInfoSnapshotMap(entry_n, acceptance_scheme_2, ip_offset_scheme_2)
examiner.mouth_db[4L] = get_MouthStateInfoSnapshotMap(entry_n, acceptance_scheme_3, ip_offset_scheme_3)

# Run interference over the four mouth states and show the result.
examiner._interference(set([1L, 2L, 3L, 4L]))

print_interference_result(examiner.mouth_db)
def snap_primary(stream, PatternDict):
    """primary:  " non_double_quote * "              = character string
                 [ non_rect_bracket_close ]          = set of characters
                 { identifier }                      = pattern replacement
                 ( expression )
                 non_control_character+              = lonely characters
                 primary repetition_cmd
    """
    __debug_entry("primary", stream)
    # One character of lookahead decides the branch; push it back if both
    # reads succeeded.
    x = stream.read(1)
    lookahead = stream.read(1)
    if x != "" and lookahead != "":
        stream.seek(-1, 1)
    if x == "":
        return __debug_exit(None, stream)

    # -- 'primary' primary
    if x == "\"":
        result = snap_character_string.do(stream)
    elif x == "[":
        stream.seek(-1, 1)
        result = character_set_expression.do(stream, PatternDict)
    elif x == "{":
        result = snap_replacement(stream, PatternDict)
    elif x == ".":
        result = create_ALL_BUT_NEWLINE_state_machine()
    elif x == "(":
        result = snap_bracketed_expression(stream, PatternDict)
    elif x.isspace():
        # a lonestanding space ends the regular expression
        stream.seek(-1, 1)
        return __debug_exit(None, stream)
    elif x in ["*", "+", "?"]:
        raise RegularExpressionException("lonely operator '%s' without expression proceeding." % x)
    elif x == "\\":
        if lookahead == "C":
            stream.read(1)
            result = snap_case_folded_pattern(stream, PatternDict)
        elif lookahead == "R":
            result = get_expression_in_brackets(stream, PatternDict,
                                                "reverse operator", "R").get_inverse()
        elif lookahead == "A":
            result = get_expression_in_brackets(stream, PatternDict,
                                                "anti-pattern operator", "A")
            result.transform_to_anti_pattern()
        else:
            # Either a property set '\P{...}' or a backslashed character code.
            stream.seek(-1, 1)
            trigger_set = character_set_expression.snap_property_set(stream)
            if trigger_set is None:
                stream.seek(1, 1)  # snap_property_set() leaves stream right before '\\'
                char_code = snap_backslashed_character.do(stream)
                if char_code is None:
                    raise RegularExpressionException("Backslash followed by unrecognized character code.")
                trigger_set = char_code
            result = StateMachine()
            result.add_transition(result.init_state_index, trigger_set, AcceptanceF=True)
    elif x not in CONTROL_CHARACTERS:
        # NOTE: The '\' is not inside the control characters---for a reason.
        #       It is used to define for example character codes using '\x' etc.
        stream.seek(-1, 1)
        result = snap_non_control_character(stream, PatternDict)
    else:
        # NOTE: This includes the '$' sign which means 'end of line'
        #       because the '$' sign is in CONTROL_CHARACTERS, but is not checked
        #       against. Thus, it it good to leave here on '$' because the
        #       '$' sign is handled on the very top level.
        # this is not a valid primary
        stream.seek(-1, 1)
        return __debug_exit(None, stream)

    # -- optional repetition command?
    result_repeated = __snap_repetition_range(result, stream)
    if result_repeated is not None:
        result = result_repeated

    return __debug_exit(beautifier.do(result), stream)
def __parse_option(fh, new_mode):
    """Parse one mode option from file handle 'fh' and install it on
    'new_mode'.  Handles the special options 'skip', 'skip_range',
    'skip_nested_range' and 'indentation'; any other identifier is read
    as a plain option value and checked against its admissible domain.

    RETURNS: True  -- option parsed and installed.
             False -- no option start found.
    """
    def get_pattern_object(SM):
        # Normalize a state machine into a DFA-compliant, minimized Pattern.
        if not SM.is_DFA_compliant():
            result = nfa_to_dfa.do(SM)
        else:
            result = SM
        result = hopcroft.do(result, CreateNewStateMachineF=False)
        return Pattern(result, AllowStateMachineTrafoF=True)

    identifier = read_option_start(fh)
    if identifier is None:
        return False

    verify_word_in_list(identifier, mode_option_info_db.keys(),
                        "mode option", fh.name, get_current_line_info_number(fh))

    if identifier == "skip":
        # A skipper 'eats' characters at the beginning of a pattern that belong
        # to a specified set of characters. A useful application is most probably
        # the whitespace skipper '[ \t\n]'. The skipper definition allows quex to
        # implement a very effective way to skip these regions.
        pattern_str, trigger_set = regular_expression.parse_character_set(
            fh, PatternStringF=True)
        skip_whitespace(fh)

        if fh.read(1) != ">":
            error_msg("missing closing '>' for mode option '%s'." % identifier, fh)

        if trigger_set.is_empty():
            # BUG FIX: the original applied '%' to a format string without a
            # conversion specifier ("... skipper." % identifier), which raises
            # TypeError instead of reporting the error.
            error_msg("Empty trigger set for skipper '%s'." % identifier, fh)

        # TriggerSet skipping is implemented the following way: As soon as one element of the
        # trigger set appears, the state machine enters the 'trigger set skipper section'.
        # Enter the skipper as if the opener pattern was a normal pattern and the 'skipper' is the action.
        # NOTE: The correspondent CodeFragment for skipping is created in 'implement_skippers(...)'
        pattern_sm = StateMachine()
        pattern_sm.add_transition(pattern_sm.init_state_index, trigger_set, AcceptanceF=True)

        # Skipper code is to be generated later
        action = GeneratedCode(skip_character_set.do,
                               FileName=fh.name,
                               LineN=get_current_line_info_number(fh))
        action.data["character_set"] = trigger_set

        new_mode.add_match(pattern_str, action, get_pattern_object(pattern_sm),
                           Comment=E_SpecialPatterns.SKIP)

        return True

    elif identifier in ["skip_range", "skip_nested_range"]:
        # A non-nesting skipper can contain a full fledged regular expression as opener,
        # since it only effects the trigger. Not so the nested range skipper -- see below.

        # -- opener
        skip_whitespace(fh)
        if identifier == "skip_nested_range":
            # Nested range state machines only accept 'strings' not state machines
            opener_str, opener_sequence = __parse_string(
                fh, "Opener pattern for 'skip_nested_range'")
            opener_sm = StateMachine.from_sequence(opener_sequence)
        else:
            opener_str, opener_pattern = regular_expression.parse(fh)
            opener_sm = opener_pattern.sm
            # For 'range skipping' the opener sequence is not needed, only the opener state
            # machine is webbed into the pattern matching state machine.
            opener_sequence = None

        skip_whitespace(fh)

        # -- closer
        closer_str, closer_sequence = __parse_string(
            fh, "Closing pattern for 'skip_range' or 'skip_nested_range'")
        skip_whitespace(fh)
        if fh.read(1) != ">":
            error_msg("missing closing '>' for mode option '%s'" % identifier, fh)

        # Skipper code is to be generated later
        generator_function, comment = {
            "skip_range":        (skip_range.do, E_SpecialPatterns.SKIP_RANGE),
            "skip_nested_range": (skip_nested_range.do, E_SpecialPatterns.SKIP_NESTED_RANGE),
        }[identifier]
        action = GeneratedCode(generator_function,
                               FileName=fh.name,
                               LineN=get_current_line_info_number(fh))

        action.data["opener_sequence"] = opener_sequence
        action.data["closer_sequence"] = closer_sequence
        action.data["mode_name"] = new_mode.name

        new_mode.add_match(opener_str, action, get_pattern_object(opener_sm),
                           Comment=comment)

        return True

    elif identifier == "indentation":
        value = indentation_setup.do(fh)

        # Enter 'Newline' and 'Suppressed Newline' as matches into the engine.
        # Similar to skippers, the indentation count is then triggered by the newline.
        # -- Suppressed Newline = Suppressor followed by Newline,
        #    then newline does not trigger indentation counting.
        suppressed_newline_pattern_str = ""
        if value.newline_suppressor_state_machine.get() is not None:
            suppressed_newline_pattern_str = \
                  "(" + value.newline_suppressor_state_machine.pattern_string() + ")" \
                + "(" + value.newline_state_machine.pattern_string() + ")"

            suppressed_newline_sm = \
                sequentialize.do([value.newline_suppressor_state_machine.get(),
                                  value.newline_state_machine.get()])

            FileName = value.newline_suppressor_state_machine.file_name
            LineN = value.newline_suppressor_state_machine.line_n
            # Go back to start.
            code = UserCodeFragment("goto %s;" % get_label("$start", U=True),
                                    FileName, LineN)

            new_mode.add_match(
                suppressed_newline_pattern_str, code,
                get_pattern_object(suppressed_newline_sm),
                Comment=E_SpecialPatterns.SUPPRESSED_INDENTATION_NEWLINE)

        # When there is an empty line, then there shall be no indentation count on it.
        # Here comes the trick:
        #
        #      Let newline
        #      be defined as:  newline ([space]* newline])*
        #
        # This way empty lines are eating away before the indentation count is activated.

        # -- 'space'
        x0 = StateMachine()
        x0.add_transition(x0.init_state_index,
                          value.indentation_count_character_set(),
                          AcceptanceF=True)
        # -- '[space]*'
        x1 = repeat.do(x0)
        # -- '[space]* newline'
        x2 = sequentialize.do([x1, value.newline_state_machine.get()])
        # -- '([space]* newline)*'
        x3 = repeat.do(x2)
        # -- 'newline ([space]* newline)*'
        x4 = sequentialize.do([value.newline_state_machine.get(), x3])
        # -- nfa to dfa; hopcroft optimization
        sm = beautifier.do(x4)

        FileName = value.newline_state_machine.file_name
        LineN = value.newline_state_machine.line_n
        action = GeneratedCode(indentation_counter.do, FileName, LineN)

        action.data["indentation_setup"] = value

        new_mode.add_match(value.newline_state_machine.pattern_string(), action,
                           get_pattern_object(sm),
                           Comment=E_SpecialPatterns.INDENTATION_NEWLINE)

        # Announce the mode to which the setup belongs
        value.set_containing_mode_name(new_mode.name)
    else:
        value = read_option_value(fh)

    # The 'verify_word_in_list()' call must have ensured that the following holds
    assert mode_option_info_db.has_key(identifier)

    # Is the option of the appropriate value?
    option_info = mode_option_info_db[identifier]
    if option_info.domain is not None and value not in option_info.domain:
        error_msg("Tried to set value '%s' for option '%s'. " % (value, identifier) + \
                  "Though, possible for this option are only: %s." % repr(option_info.domain)[1:-1], fh)

    # Finally, set the option
    new_mode.add_option(identifier, value)

    return True
def get_none():
    """Build a state machine that matches nothing.

    Returns a freshly constructed StateMachine in its default state
    (no transitions added, presumably non-accepting).
    """
    empty_sm = StateMachine()
    return empty_sm
def _do(the_state_machine, post_context_sm, EndOfLinePostContextF, SourceReference):
    """Appends a post context to the given state machine and changes
    state infos as required.

    NOTE: In case that

              post_context_sm is not None   or   EndOfLinePostContextF

          the function appends something to the state machine and it is
          therefore required to pass 'NFA to DFA'--better also Hopcroft
          Minimization.
    ________________________________________________________________________

    This process is very similar to sequentialization. There is a major
    difference, though: Given a state machine (e.g. a pattern) X with a
    post context Y, a match is only valid if X is followed by Y. Let Xn
    be an acceptance state of X and Ym an acceptance state of Y:

        ---(Xn-1)---->(Xn)---->(Y0)----> ... ---->((Ym))
                       store                      acceptance
                       input
                       position

    That is, it holds:

       -- The next input position is stored at the position of Xn, even
          though it is 'officially' not an acceptance state.

       -- Ym will be an acceptance state, but it will not store the
          input position!

    The analysis of the next pattern will start at the position where X
    stopped, even though Ym is required to state acceptance.

    RETURNS: (state machine, bipd state machine to be inverted or None)
    """
    # No post context and no end-of-line post context requested => no change.
    if post_context_sm is None and EndOfLinePostContextF == False:
        return the_state_machine, None

    # State machines with no states are senseless here.
    assert not the_state_machine.is_empty(), \
           "empty state machine can have no post context."
    assert post_context_sm is None or not post_context_sm.is_empty(), \
           "empty state machine cannot be a post-context."

    # State machines involved with post condition building are part of a
    # pattern, but not configured out of multiple patterns. Thus there should
    # be no origins.
    assert the_state_machine.has_origins() == False
    assert post_context_sm is None or not post_context_sm.has_origins()
    for state in the_state_machine.get_acceptance_state_list():
        for origin in state.origins():
            assert origin.pre_context_id() == E_PreContextIDs.NONE, \
                   "Post Contexts MUST be mounted BEFORE pre-contexts."

    if post_context_sm is None:
        assert EndOfLinePostContextF
        # Generate a new post context that just contains the 'newline'.
        post_context_sm = StateMachine(AcceptanceF=True)
        post_context_sm.mount_newline_to_acceptance_states(Setup.dos_carriage_return_newline_f)
    elif EndOfLinePostContextF:
        # Mount 'newline' to the existing post context.
        post_context_sm.mount_newline_to_acceptance_states(Setup.dos_carriage_return_newline_f)

    # A post context with an initial state that is acceptance is not really a
    # 'context' since it accepts anything. The state machine remains
    # un-post-contexted.
    if post_context_sm.get_init_state().is_acceptance():
        error_msg("Post context accepts anything--replaced by no post context.",
                  SourceReference, DontExitF=True)
        return the_state_machine, None

    # (*) Two ways of handling post-contexts:
    #
    #     -- Seldom Exception:
    #        Pseudo-Ambiguous Post Conditions (x+/x) -- detecting the end of
    #        the core pattern after the end of the post context has been
    #        reached.
    #
    if ambiguous_post_context.detect_forward(the_state_machine, post_context_sm):
        if ambiguous_post_context.detect_backward(the_state_machine, post_context_sm):
            # -- For post contexts that are forward and backward ambiguous
            #    a philosophical cut is necessary.
            error_msg("Post context requires philosophical cut--handle with care!\n"
                      "Proposal: Isolate pattern and ensure results are as expected!",
                      SourceReference, DontExitF=True)
            post_context_sm = ambiguous_post_context.philosophical_cut(the_state_machine, post_context_sm)

        # NOTE: May be, the_state_machine does contain now an epsilon
        #       transition. See comment at entry of this function.
        bipd_sm_to_be_inverted = ambiguous_post_context.mount(the_state_machine, post_context_sm)
        the_state_machine = beautifier.do(the_state_machine)
        return the_state_machine, bipd_sm_to_be_inverted

    # -- The 'normal' way: storing the input position at the end of the core
    #    pattern.
    #
    # (*) Need to clone the state machines, i.e. provide their internal
    #     states with new ids, but the 'behavior' remains. This allows
    #     state machines to appear twice, or being used in 'larger'
    #     conglomerates.
    post_clone = post_context_sm.clone()

    # -- Once an acceptance state is reached no further analysis is necessary.
    ## NO: acceptance_pruning.do(post_clone)
    ## BECAUSE: it may have to compete with a pseudo-ambiguous post context

    # (*) Collect all transitions from both state machines into a single one.
    #
    #     NOTE: The start index is unique. Therefore, one can assume that each
    #           clone_list '.states' dictionary has different keys. One can
    #           simply take over all transitions of a start index into the
    #           result without considering interferences (see below).
    #
    orig_acceptance_state_id_list = the_state_machine.get_acceptance_state_index_list()

    # -- Mount on every acceptance state the initial state of the following
    #    state machine via epsilon transition.
    the_state_machine.mount_to_acceptance_states(post_clone.init_state_index,
                                                 CancelStartAcceptanceStateF=True)
    for start_state_index, state in post_clone.states.iteritems():
        the_state_machine.states[start_state_index] = state  # states are already cloned

    # -- Raise at each old acceptance state the 'store input position' flag.
    # -- Set the post context flag for all acceptance states.
    for state_idx in orig_acceptance_state_id_list:
        state = the_state_machine.states[state_idx]
        state.set_input_position_store_f(True)

    # -- No acceptance state shall store the input position.
    # -- Set the post context flag for all acceptance states.
    for state in the_state_machine.get_acceptance_state_list():
        state.set_input_position_store_f(False)
        state.set_input_position_restore_f(True)

    # No input position backward search required.
    return beautifier.do(the_state_machine), None