Пример #1
0
    def import_from_fsm(self,
                        FileName="automaton.fsm",
                        SymbolFileName="automaton.sym"):
        """
            Load automaton from file in FSM format. Based on FSM man
            page: http://www2.research.att.com/~fsmtools/fsm/man4.html .
            This method must be updated if new symbol is added to Netbench.

            The attributes states, alphabet, start, transitions, final and
            Flags are reset before anything is read. Blank lines in both
            files are skipped. The first line of the fsm file must describe
            the start state (either as a transition or as a final state).
            Both files are closed even if the import fails.

            :param FileName: File name from which the fsm part will be imported.
            :type FileName: string
            :param SymbolFileName: File name from which the sym part will be imported.
            :type SymbolFileName: string
            :raises: nfa_data_import_exception if unknown symbol string type is found and corresponding class can not be determined.

        """

        # initialization
        self.states = dict()       # Finite set of states
        self.alphabet = dict()     # Symbols of alphabet
        self.start = -1            # ID of Start state
        self.transitions = set()   # Transitions
        self.final = set()         # Final states
        self.Flags = dict()        # Flags for specified properties

        # Maps symbol string (as written in both files) to its Netbench id.
        symbol_mapper = dict()

        # Load symbols from symbol file
        with open(SymbolFileName, 'r') as fs:
            for raw_line in fs.readlines():
                fields = raw_line.split()
                if not fields:
                    # Blank line - nothing to import.
                    continue
                symbol_text = fields[0]
                # Get symbol ID - subtract 1 (FSM Library use 0 for epsilon
                # symbol, Netbench use -1 for epsilon symbol)
                symbol_id = int(fields[1]) - 1
                symbol_mapper[symbol_text] = symbol_id

                # The first character of the symbol string selects the
                # name of the symbol class.
                type_char = symbol_text[0]
                try:
                    cls = b_symbol.io_reverse_mapper[type_char]
                except KeyError:
                    raise nfa_data_import_exception(type_char)

                # Create new (empty) object of selected class; its content
                # is filled in by import_symbol() below.
                if cls == "b_Sym_char":
                    symbol = sym_char.b_Sym_char("", "", 0)
                elif cls == "b_Sym_char_class":
                    symbol = sym_char_class.b_Sym_char_class("", set(), 0)
                elif cls == "b_Sym_string":
                    symbol = sym_string.b_Sym_string("", "", 0)
                elif cls == "b_Sym_kchar":
                    symbol = sym_kchar.b_Sym_kchar("", ("", ""), 0)
                elif cls == "DEF_SYMBOLS":
                    symbol = b_symbol.DEF_SYMBOLS("", 0)
                elif cls == "b_Sym_cnt_constr":
                    symbol = sym_cnt_constr.b_Sym_cnt_constr("", "", 0, 0, 0)
                else:
                    # Class name is known to the mapper but not handled here.
                    raise nfa_data_import_exception(type_char)

                # Import symbol and add it to alphabet
                symbol.import_symbol(symbol_text, symbol_id)
                self.alphabet[symbol_id] = symbol

        # Read the whole automaton file; the handle is released before the
        # lines are interpreted.
        with open(FileName, 'r') as fr:
            fsm_lines = [raw_line.split() for raw_line in fr.readlines()]

        # first line indicating start state
        self.start = int(fsm_lines[0][0])

        for fields in fsm_lines:
            if not fields:
                # Blank line - nothing to import.
                continue
            src = int(fields[0])
            if src not in self.states:
                self.states[src] = b_State(mid=src)
            if len(fields) > 1:
                # line is transition: source, destination, symbol string
                des = int(fields[1])
                if des not in self.states:
                    self.states[des] = b_State(mid=des)
                self.transitions.add((src, symbol_mapper[fields[2]], des))
            else:
                # line is final state
                self.final.add(src)
                self.states[src]._rnum = src

        self.Flags["ImportFromFsm"] = True
Пример #2
0
    def get_nfa(self):
        """
            Parse a current line and returns parsed nfa.

            The regular expression stored at self._text[self._position] is
            passed to the external pcre_parser program and the MSFM text
            it prints is converted into an nfa_data object. The parsed
            line (without trailing newline) is stored in self.last.

            :returns: Created automaton in nfa_data format. Returns None if failure happens.
            :rtype: nfa_data or None
        """
        # Check if some reg. exp. are set.
        if self._position < 0:
            return None

        # Get line and remove trailing \n (an empty line is left as is).
        line = self._text[self._position]
        if line.endswith('\n'):
            line = line[:-1]

        self.last = line

        # find where we are
        msfm_path = aux_func.getPatternMatchDir()

        # Build command line of the C regexp parser.
        # Create cnt_constr symbols if requested
        if self.create_cnt_constr == False:
            cmd = msfm_path + "/pcre_parser/parser -o STDOUT -s"
        else:
            cmd = msfm_path + "/pcre_parser/parser -o STDOUT -s -c"
        # Do not create eof symbols if requested
        if self.create_eof_symbols == False:
            cmd += " -E"

        # res[0] is the exit status, res[1] the MSFM output and res[2] the
        # stderr content of the parser.
        res = aux_func.getstatusoutput(cmd, line)
        # Print stderr if there is some content
        if len(res[2]) != 0:
            sys.stderr.write(res[2] + "\n")
        # If error, stop.
        if res[0] != 0:
            sys.stderr.write("PARSER ERROR:\n")
            sys.stderr.write("CMD: " + cmd + "\n")
            sys.stderr.write("PCRE: " + line + "\n")
            sys.stderr.write("MSFM:\n")
            sys.stderr.write(res[1] + "\n")
            return None

        try:
            # Create empty object
            nfa = nfa_data.nfa_data()

            # Preprocess automaton file
            FSMfile = res[1].split("\n")

            # Get start state of NFA and drop its line, so the remaining
            # lines follow the format described below.
            nfa.start = int(FSMfile[2])
            del FSMfile[2]

            # FORMAT of Automata file
            #  - Number of the states in the automaton
            #  - Number of the transitions in the automaton
            #  - Each transition is represented by one line in the file.
            #    Line is in format Source_State|Symbol|Target_State|Epsilon
            #  - End of the transition table is represented by line of #
            #  - Number of the end states
            #  - Line with identifiers of the end states. Every end state
            #    is followed by , (comma)
            #  - End of end state section is represented by line of #
            #  - Number of the symbols in symbol table
            #  - Every symbol is stored on its own line and it is
            #    represented as Symbol_Number:Character1|Character2|
            #  - End of the file
            transition_count = int(FSMfile[1])

            # Transition table is list of lists and represents the whole
            # transition table of the automaton. 2 is an index of the
            # first transition.
            TransitionTable = [x.split("|")
                               for x in FSMfile[2:transition_count + 2]]

            # List of the end states is stored after all transitions and
            # after 4 other lines (number of states, number of
            # transitions, line of #### and number of end states).
            # End states are separated by , (comma)
            Endstates = FSMfile[transition_count + 4].split(",")

            # Alphabet symbols start on the index transition_count + 7
            # (4 as before + line of end states, line of # and number of
            # symbols)
            Symbols = FSMfile[transition_count + 7:]

            # Creates end states objects.
            for state in Endstates:
                if state != "":
                    # State id is int(state); it is final for the
                    # currently parsed regular expression.
                    Tmp = b_State(int(state), set([self._position]))
                    nfa.states[Tmp.get_id()] = Tmp
                    nfa.final.add(Tmp.get_id())

            # Maps symbol content (set of chars, cnt constr triple or
            # "EOF") to the set of parser symbol ids with that content.
            all_msfm_syms = dict()

            # For every symbol in alphabet
            for ActSym in Symbols:
                # Separate symbol number and symbol data (done by first :)
                StartSym = ActSym.find(":")
                is_cnt_constr = ActSym[StartSym + 1] == '#'
                # Symbol number is written in hex.
                sym_id = int(ActSym[:StartSym], 16)

                if is_cnt_constr:
                    # Data are #m#n#hex|hex|...| ; split at #
                    sharp_split = ActSym[StartSym + 1:-1].split("#")
                    # Get m
                    m = int(sharp_split[1])
                    # Get n - empty string means infinite number of
                    # symbols can occur
                    if sharp_split[2] == '':
                        n = float("inf")
                    else:
                        n = int(sharp_split[2])
                    # Get symbol part of encoded cnt constr and convert
                    # hex to char
                    SymSym = ActSym.rfind("#")
                    symSetMod = set(chr(int(s, 16) & 255)
                                    for s in ActSym[SymSym + 1:-1].split("|"))

                    if not (m == 0 and n == 0):
                        count_text = "{" + str(m) + "," + str(n) + "}"
                        if len(symSetMod) == 1:
                            # Create char if number of symbols is 1.
                            char = symSetMod.pop()
                            symbol = char
                            text_info = char + count_text
                        else:
                            # Create char class otherwise.
                            symbol = symSetMod
                            text_info = "[" + "".join(symSetMod) + "]" + count_text
                        # Create sym_cnt_constr object
                        Tmp = sym_cnt_constr.b_Sym_cnt_constr(
                            text_info, symbol, m, n, sym_id)
                        nfa.alphabet[Tmp.get_id()] = Tmp
                        # Create mapping from symbol chars to their ids
                        all_msfm_syms.setdefault(
                            (m, n, frozenset(symbol)), set()).add(sym_id)
                        continue
                    # BUG: Workaround for bug in parser: when cnt constr
                    # symbols are generated, even constructions such as
                    # s+, d*, .+, ... are converted. They arrive here with
                    # m == 0 and n == 0 and are handled as plain symbols
                    # below.
                elif ActSym[StartSym + 1:] == "EOF|":
                    # Add EOF symbol into alphabet
                    Symbol = sym_eof.b_Sym_EOF("EOF", sym_id)
                    nfa.alphabet[Symbol.get_id()] = Symbol
                    # Create mapping from symbol chars to their ids
                    all_msfm_syms.setdefault("EOF", set()).add(sym_id)
                    continue
                else:
                    # Data are hex|hex|...| ; convert hex to char
                    symSetMod = set(chr(int(s, 16) & 255)
                                    for s in ActSym[StartSym + 1:-1].split("|"))

                # Plain symbol: create mapping from symbol chars to their
                # ids (must happen before the set is modified by pop()).
                all_msfm_syms.setdefault(
                    frozenset(symSetMod), set()).add(sym_id)

                if len(symSetMod) == 1:
                    # Create char if number of symbols is 1.
                    char = symSetMod.pop()
                    Symbol = sym_char.b_Sym_char(char, char, sym_id)
                    nfa.alphabet[Symbol.get_id()] = Symbol
                else:
                    # Create char class otherwise.
                    class_text = "[" + "".join(symSetMod) + "]"
                    Tmp = sym_char_class.b_Sym_char_class(
                        class_text, symSetMod, sym_id)
                    nfa.alphabet[Tmp.get_id()] = Tmp

            # TODO: use special class for Epsilon?
            # Epsilon is represented now as sym_char object with char ""
            # and index -1
            Tmp = sym_char.b_Sym_char("Epsilon", "", -1)
            nfa.alphabet[Tmp.get_id()] = Tmp

            # Symbols used on epsilon transitions / on other transitions.
            removeable_symbols = set()
            nonremoveable_symbols = set()
            # Add non final states to automaton.
            for transition in TransitionTable:
                src = int(transition[0])
                dst = int(transition[2])
                # if not in states, add start state of transition.
                if src not in nfa.states:
                    nfa.states[src] = b_State(src, set())
                # if not in states, add end state of transition.
                if dst not in nfa.states:
                    nfa.states[dst] = b_State(dst, set())

                # Handle epsilon transitions (Epsilon column is '1').
                if transition[3] == '1':
                    alphaNum = -1
                    removeable_symbols.add(int(transition[1], 16))
                else:
                    alphaNum = int(transition[1], 16)
                    nonremoveable_symbols.add(alphaNum)

                # Add transition to automaton.
                nfa.transitions.add((src, alphaNum, dst))

            # Correct the removeable symbols - symbol used by any non
            # epsilon transition must stay.
            removeable_symbols -= nonremoveable_symbols

            # Remove unused symbols
            for rsymbol in removeable_symbols:
                del nfa.alphabet[rsymbol]

            # Remove duplicate symbols.
            # Create mapping between current ids and the ids which will be
            # used. Only non removed id can be used as the target id.
            sym_mapping = dict()
            for key in all_msfm_syms:
                candidates = all_msfm_syms[key]
                sym = candidates.pop()
                if sym not in removeable_symbols:
                    candidates.add(sym)
                else:
                    # Pop ids until a non removed one is found; if there is
                    # none, the set stays empty and nothing is mapped.
                    popped = set([sym])
                    while candidates:
                        sym = candidates.pop()
                        popped.add(sym)
                        if sym not in removeable_symbols:
                            candidates.update(popped)
                            break

                for sid in candidates:
                    sym_mapping[sid] = sym

            sym_mapping[-1] = -1

            # Rewrite transitions to use the selected symbol ids.
            nfa.transitions = set(
                (transition[0], sym_mapping[transition[1]], transition[2])
                for transition in nfa.transitions)

            # Drop symbols which were replaced by another id.
            for sid in sym_mapping:
                if sid != sym_mapping[sid] and sid not in removeable_symbols:
                    del nfa.alphabet[sid]

            # Something is wrong with the msfm file, try autodetect the
            # start state
            if nfa.start < 0:
                # Dictionary mapping between states and their previous
                # states.
                StateInSymbols = dict()
                for state in nfa.states.keys():
                    StateInSymbols[state] = set()
                for transition in nfa.transitions:
                    StateInSymbols.setdefault(
                        transition[2], set()).add(transition[0])

                # Autodetection - start state has no incoming transition
                # or only one originating from itself
                # - problem /^(abc)+..../
                for state in StateInSymbols.keys():
                    previous = StateInSymbols[state]
                    if len(previous) == 0:
                        nfa.start = state
                    elif len(previous) == 1 and list(previous)[0] == state:
                        nfa.start = state

            return nfa
        except Exception as err:
            # Malformed parser output - report it and signal the failure
            # by None as documented.
            sys.stderr.write("ERROR while parsing msfm output of parser:\n")
            sys.stderr.write("EXCEPTION: " + repr(err) + "\n")
            sys.stderr.write("CMD: " + cmd + "\n")
            sys.stderr.write("PCRE: " + line + "\n")
            sys.stderr.write("MSFM:\n")
            sys.stderr.write(res[1] + "\n")
            return None
Пример #3
0
    def determinise(self, create_table=False, states_limit=0):
        """
            Determinisation of automaton.

            Epsilon transitions are removed first, then new states are
            built from sets of original states. Colliding symbols met on
            the way are split by their resolve_collision() method.

            :param create_table: If create_table = false than state representation table is not created and less memory is consumed.
            :type create_table: boolean

            :param states_limit: If num of states exceeds this limit, during determinization, then flag "Deterministic" is set to False and determinize stops; if nfa exceeds limit and is already deterministic, then nothing happens (this is because speed, not because logic); safe use is only if you want to stop algorithm if it exceeds limit; zero means no limit.
            :type states_limit: int

            :flags: Set Deterministic, Epsilon Free and Alphabet collision free.

            This method sets _compute to False, and get_compute() will return False until compute() is called.
        """

        # NOTE: the "Alphabet collision free" flag is intentionally not
        # required here - collisions are resolved during determinisation.

        # Automaton doesn't have any state = automaton is empty
        if self._automaton.is_empty() or self._automaton.start < 0:
            return

        self.remove_epsilons()  # check the Epsilon free flag ?

        # New state 0 represents the set containing only the start state.
        start_set = set([self._automaton.start])
        newStates = {0: start_set}                  # new id -> set of old ids
        newStatesRev = {frozenset(start_set): 0}    # set of old ids -> new id
        stack = [0]                                 # new states to process
        counter = 1                                 # id of next new state

        final = set()
        transitions = set()
        states = dict()
        states[0] = b_State(
            0,
            self._automaton.states[self._automaton.start].get_regexp_number())

        # transitions from each state: state -> set of (symbol, target)
        stateTrans = dict()
        for transition in self._automaton.transitions:
            stateTrans.setdefault(transition[0], set()).add(
                (transition[1], transition[2]))

        # copy alphabet, ID's 0,1,...
        alphCounter = 0
        alphabet = dict()      # symbol id -> symbol
        alphabetRev = dict()   # symbol -> symbol id
        mapId = dict()         # maps old id -> new id
        for old_id, sym in self._automaton.alphabet.items():
            sym.set_id(alphCounter)
            alphabet[alphCounter] = sym
            alphabetRev[sym] = alphCounter
            mapId[old_id] = alphCounter
            alphCounter += 1

        while stack:
            actState = stack.pop()
            if newStates[actState].intersection(self._automaton.final):
                final.add(actState)

            # transitions from actual state for each symbol
            outSymbols = dict()  # symbol id -> set of target state ids
            for state in newStates[actState]:
                if state not in stateTrans:
                    continue
                for t in stateTrans[state]:
                    outSymbols.setdefault(mapId[t[0]], set()).add(t[1])

            # resolve symbol collisions; repeat until no symbol is added
            symbolAdded = True
            while symbolAdded:
                symbolAdded = False
                # Iterate over snapshots - outSymbols grows inside.
                for sym1 in list(outSymbols.keys()):
                    toCompare = list(outSymbols.keys())
                    toCompare.remove(sym1)
                    for sym2 in toCompare:
                        if not (outSymbols[sym1] and outSymbols[sym2]):
                            continue  # no next state for one of the symbols
                        if not alphabet[sym1].collision([alphabet[sym2]]):
                            continue

                        # Target states for the three results of
                        # resolve_collision(): index 0 gets targets of
                        # sym1, index 2 targets of sym2 and index 1 the
                        # union of both.
                        first = outSymbols[sym1]
                        second = outSymbols[sym2]
                        symStates = [first, first | second, second]
                        outSymbols[sym1] = set()
                        outSymbols[sym2] = set()
                        ret = alphabet[sym1].resolve_collision(alphabet[sym2])

                        for i in range(3):
                            if not ret[i]:  # no symbol returned
                                continue

                            for new in ret[i]:
                                symbolAdded = True
                                if new not in alphabetRev:
                                    # add new symbol
                                    new.set_id(alphCounter)
                                    alphabet[alphCounter] = new
                                    alphabetRev[new] = alphCounter
                                    sym_id = alphCounter
                                    alphCounter += 1
                                else:
                                    sym_id = alphabetRev[new]
                                # update next states for symbol
                                outSymbols[sym_id] = \
                                    outSymbols.get(sym_id, set()) | symStates[i]

            # create new transitions
            for symbol, nextState in outSymbols.items():
                if not nextState:
                    continue  # no next states -> ignore symbol

                nextKey = frozenset(nextState)
                if nextKey not in newStatesRev:
                    # create a new state
                    newStatesRev[nextKey] = counter
                    newStates[counter] = nextState
                    stack.append(counter)

                    endVal = set()  # set of regular expression numbers
                    for state in nextState:
                        if self._automaton.states[state].is_final() == True:
                            endVal |= self._automaton.states[
                                state].get_regexp_number()

                    states[counter] = b_State(counter, endVal)

                    # NOTE(review): counter is the id of the state just
                    # created, so counter + 1 states exist here - confirm
                    # the comparison matches the documented limit. On this
                    # early return the symbols were already renumbered by
                    # set_id() above while the automaton itself is not
                    # updated.
                    if states_limit != 0 and counter > states_limit:
                        self.set_flag("Deterministic", False)
                        return

                    counter = counter + 1

                transitions.add((actState, symbol, newStatesRev[nextKey]))

        # remove unused symbols (those not used by any new transition)
        usedSymbols = set(trans[1] for trans in transitions)
        toRemove = [key for key in alphabet.keys() if key not in usedSymbols]
        self._automaton.alphabet = alphabet
        self._automaton.remove_symbols(toRemove)

        # set new symbol ID's
        mapId = dict()  # maps old id -> new id
        alphCounter = 0
        alphabet = dict()
        for old_id, sym in self._automaton.alphabet.items():
            sym.set_id(alphCounter)
            alphabet[alphCounter] = sym
            mapId[old_id] = alphCounter
            alphCounter += 1

        # correct symbol ID's in transitions
        newTrans = set()
        for trans in transitions:
            newTrans.add((trans[0], mapId[trans[1]], trans[2]))

        # update automaton
        self._automaton.start = 0
        self._automaton.alphabet = alphabet
        self._automaton.states = states
        self._automaton.transitions = newTrans
        self._automaton.final = final

        self.set_flag("Deterministic", True)
        self.set_flag("Epsilon Free", True)
        if len(self._automaton.alphabet) > 0:
            self.set_flag("Alphabet collision free", True)

        self._compute = False

        if create_table == True:
            # State representation: for every new state the set of
            # original states it was built from.
            for i in range(0, counter):
                self._state_representation.append(newStates[i])
Пример #4
0
    def minimise(self):
        """
            Minimalization of DFA automaton.

            :raises: ALPHABET_COLLISION_ERROR() if alphabet is not collision free.
            :raises: DETERMINISTIC_ERROR() if automaton is not deterministic.
            :flags: Sets Minimal flag to true.

            This method sets _compute to False, and get_compute() will return False until compute() is called.
        """

        if not self.has_flag("Alphabet collision free") \
        or self.get_flag("Alphabet collision free") == False:
            raise ALPHABET_COLLISION_FREE_ERROR
        if not self.has_flag("Deterministic") \
        or self.get_flag("Deterministic") != True:
            raise DETERMINISTIC_ERROR

        # variables
        a = self._automaton  # shortcut
        newClasses = dict()  # new indistinguishable states
        actualClasses = dict()  # actual indistinguishable states
        table = dict()  # table of "transitions", key is state, value is class

        # 1) *** Eliminate not available states. ***
        self.remove_unreachable()

        # 2) *** Compute not indistinguishable states. ***
        # set default table
        for state in a.states.keys():
            table[state] = dict()
        for t in a.transitions:
            table[t[0]][t[1]] = t[2]
        defaultTable = copy.deepcopy(table)
        # zero iteration:
        # set first class other then final states
        newClasses[0] = a.states.keys()
        for finalState in a.final:
            if finalState in newClasses[0]:
                newClasses[0].remove(finalState)

        if self.get_multilanguage() is True:
            # set final states into next classes
            # each final state will be set in class according
            # get_regexp_number()
            newClassIdOffset = len(newClasses)
            newClassIdMapper = dict()
            for finalStateKey in a.final:
                frozen_regexp = frozenset(
                    a.states[finalStateKey].get_regexp_number())
                if not newClassIdMapper.has_key(frozen_regexp):
                    newClassIdMapper[frozen_regexp] = newClassIdOffset
                    newClasses[newClassIdOffset] = list()
                    newClassIdOffset += 1
                newClasses[newClassIdMapper[frozen_regexp]].append(
                    finalStateKey)
        else:
            # all final states are in one class
            newClasses[1] = list(a.final)

        # indistinguishable iterations
        while newClasses != actualClasses:
            actualClasses = copy.deepcopy(newClasses)
            # recompute table for next iteration
            table = copy.deepcopy(defaultTable)
            for state in table.keys():
                for symbol in table[state].keys():
                    for ClassID in range(0, len(actualClasses), 1):
                        if table[state][symbol] in actualClasses[ClassID]:
                            table[state][symbol] = ClassID
                            break
            newClasses = dict()
            for ClassID in sorted(actualClasses.keys()):
                states_in_class = copy.deepcopy(actualClasses[ClassID])
                while states_in_class != []:
                    state = states_in_class[0]
                    states_in_class.remove(state)
                    states_in_new_class = []
                    states_in_new_class.append(state)
                    for other_state in list(states_in_class):
                        if table[state] == table[other_state]:
                            states_in_class.remove(other_state)
                            states_in_new_class.append(other_state)
                    newClassID = len(newClasses.keys())
                    newClasses[newClassID] = states_in_new_class
        # *** Change to Reduced DFA. ***
        # change STATES
        back = a.states
        a.states = dict()
        for ClassID in range(0, len(actualClasses), 1):
            finalIndication = set()  # indication of final states
            for state in actualClasses[ClassID]:
                if state in a.final:
                    finalIndication |= back[state].get_regexp_number()
            a.states[ClassID] = b_State(mid=ClassID, rnum=finalIndication)
        # change ALPHABET - nothing to change
        # change START STATE
        for ClassID in range(0, len(actualClasses), 1):
            if a.start in actualClasses[ClassID]:
                a.start = ClassID
                break
        # change TRANSITIONS
        newTran = set()  # re-computed transitions
        for t in a.transitions:
            sourceState = -1
            destinationState = -1
            # discover source state
            for ClassID in range(0, len(actualClasses), 1):
                if t[0] in actualClasses[ClassID]:
                    sourceState = ClassID
                    break
            # discover destination state
            for ClassID in range(0, len(actualClasses), 1):
                if t[2] in actualClasses[ClassID]:
                    destinationState = ClassID
                    break
            # add new transitions
            newTran.add((sourceState, t[1], destinationState))
        a.transitions = newTran
        # change FINAL STATES
        newFinal = set()
        for finalState in a.final:
            for ClassID in range(0, len(actualClasses), 1):
                if finalState in actualClasses[ClassID]:
                    newFinal.add(ClassID)
                    break
        a.final = newFinal

        # 3) *** Removal of surplus state that do not affect the adoption
        # of a string. ***
        self.set_flag("Minimal", True)
        self._compute = False
Пример #5
0
    def _determinise(self, create_table=False, states_limit=0):
        """
            Determinisation of automaton.

            Every state of the resulting automaton stands for a set of states of the original automaton. The new start state has id 0 and stands for the original start state. The alphabet is rebuilt from the character sets returned by __allIntersections().

            :param create_table: If create_table = false than state representation table is not created and less memory is consumed.
            :type create_table: boolean

            :param states_limit: If num of states exceeds this limit, during determinization, then flag "Deterministic" is set to False and determinize stops; if nfa exceeds limit and is already deterministic, then nothing happens (this is because speed, not because logic); safe use is only if you want to stop algorithm if it exceeds limit; zero means no limit.
            :type states_limit: int

            :raises: ALPHABET_COLLISION_FREE_ERROR() if alphabet is not collision free.
            :raises: Exception if a transition uses a symbol which is neither b_Sym_char nor b_Sym_char_class.
            :flags: Set Deterministic and Epsilon Free.

            This method sets _compute to False, and get_compute() will return False until compute() is called.
        """
        if self.has_flag("Deterministic") and self.get_flag(
                "Deterministic") == True:
            return
        if self.has_flag("Epsilon Free") == False or self.get_flag(
                "Epsilon Free") == False:
            self.remove_epsilons()

        if not self.has_flag("Alphabet collision free") \
           or self.get_flag("Alphabet collision free") == False:
            raise ALPHABET_COLLISION_FREE_ERROR

        # Automaton doesn't have any state = automaton is empty
        if self._automaton.is_empty() or self._automaton.start < 0:
            return

        old_alphabet = self._automaton.alphabet
        old_states = self._automaton.states
        old_final = self._automaton.final

        # Type tags are loop invariant, look them up only once.
        char_type = b_symbol.io_mapper["b_Sym_char"]
        char_class_type = b_symbol.io_mapper["b_Sym_char_class"]

        # newStates: new state id -> set of original states
        # newStatesBack: frozenset of original states -> new state id
        start_set = set()
        start_set.add(self._automaton.start)
        newStates = {0: start_set}
        newStatesBack = {frozenset(start_set): 0}
        # Citac (counter) is the id which the next new state will get.
        Citac = 1
        # Ids of new states whose outgoing transitions are not built yet.
        Stack = [0]
        EndStates = set()
        Transitions = set()
        # alphabetBack: frozenset of characters -> new symbol id
        alphabetCounter = 0
        alphabet = dict()
        alphabetBack = dict()
        states = dict()
        states[0] = b_State(
            0, old_states[self._automaton.start].get_regexp_number())

        # original state id -> set of (symbol id, target state id)
        StateOutSymbols = dict()
        for transition in self._automaton.transitions:
            StateOutSymbols.setdefault(transition[0], set()).add(
                (transition[1], transition[2]))

        while Stack:
            ActState = Stack.pop()
            if newStates[ActState].intersection(old_final):
                EndStates.add(ActState)

            # SymbolSetList[k] is the character set of the k-th outgoing
            # transition, translationTable[k] is the target of that
            # transition.
            SymbolSetList = list()
            translationTable = list()

            for source in newStates[ActState]:
                for sym in StateOutSymbols.get(source, ()):
                    symbol = old_alphabet[sym[0]]
                    symbol_type = symbol.get_type()
                    if symbol_type == char_type:
                        SymbolSetList.append(set(symbol.char))
                    elif symbol_type == char_class_type:
                        SymbolSetList.append(symbol.charClass)
                    else:
                        raise Exception(
                            "Determinisation supports only b_Sym_char and "
                            "b_Sym_char_class symbols.")
                    translationTable.append(sym[1])

            # res[0][i] holds indexes into SymbolSetList, res[1][i] holds
            # the characters belonging to that group of transitions.
            res = self.__allIntersections(SymbolSetList)

            # Translate transition indexes to sets of target states.
            translatedUsedList = [
                set([translationTable[target] for target in used])
                for used in res[0]
            ]

            for i, target_set in enumerate(translatedUsedList):
                target_key = frozenset(target_set)
                if target_key not in newStatesBack:
                    newStatesBack[target_key] = Citac
                    newStates[Citac] = target_set
                    Stack.append(Citac)

                    # New state inherits regexp numbers of all final states
                    # it stands for.
                    endVal = set()
                    for state in target_set:
                        if old_states[state].is_final() == True:
                            endVal |= old_states[state].get_regexp_number()

                    states[Citac] = b_State(Citac, endVal)

                    # NOTE(review): Citac is the id of the newest state, so
                    # states_limit + 1 states already exist when this test
                    # fires - confirm it is the intended limit.
                    if states_limit != 0 and Citac > states_limit:
                        self.set_flag("Deterministic", False)
                        return

                    Citac = Citac + 1

                symbol_chars = res[1][i]
                symbol_key = frozenset(symbol_chars)
                if symbol_key not in alphabetBack:
                    alphabetBack[symbol_key] = alphabetCounter
                    if len(symbol_chars) > 1:
                        strSymSetMod = "[" + "".join(symbol_chars) + "]"
                        Tmp = sym_char_class.b_Sym_char_class(
                            strSymSetMod, symbol_chars, alphabetCounter)
                        alphabet[Tmp.get_id()] = Tmp
                    else:
                        # The set has at most one member, take it.
                        for sym in symbol_chars:
                            char = sym
                        Symbol = sym_char.b_Sym_char(char, char,
                                                     alphabetCounter)
                        alphabet[Symbol.get_id()] = Symbol
                    alphabetCounter += 1

                Transitions.add(
                    (ActState, alphabetBack[symbol_key],
                     newStatesBack[target_key]))

        self._automaton.start = 0
        self._automaton.alphabet = alphabet
        self._automaton.states = states
        self._automaton.transitions = Transitions
        self._automaton.final = EndStates

        if create_table == True:
            for i in range(0, Citac):
                self._state_representation.append(newStates[i])

        self.set_flag("Deterministic", True)
        self.set_flag("Epsilon Free", True)
        self._compute = False
Пример #6
0
    def import_from_fsm(self, FileName="automaton.fsm", SymbolFileName = "automaton.sym"):
        """
            Load automaton from file in FSM format. Based on FSM man
            page: http://www2.research.att.com/~fsmtools/fsm/man4.html . This method must be updated if new symbol is added to Netbench. Raises Exception if unknown symbol string type is found and coresponding class can not be determinated.

            All previous content of the object (states, alphabet, start, transitions, final, Flags) is dropped before the import. Blank lines are ignored in both files. Source state of the first non blank line of the fsm file becomes the start state; if the fsm file contains no such line, start stays -1 and no state is created.

            :param FileName: File name from which the fsm part will be imported.
            :type FileName: string
            :param SymbolFileName: File name from which the sym part will be imported.
            :type SymbolFileName: string
            :raises: nfa_data_import_exception if unknown symbol string type is found and coresponding class can not be determinated.

        """

        # initialization
        self.states      = dict()       # Finite set of states
        self.alphabet    = dict()       # Symbols of alphabet
        self.start       = -1           # ID of Start state
        self.transitions = set()        # Transitions
        self.final       = set()        # Final states
        self.Flags       = dict()       # Flags for specified properties

        # Symbol class name -> factory of an empty symbol object. Lambdas
        # are used so that a symbol module is touched only if a symbol of
        # that class really occurs in the file.
        symbol_factories = {
            "b_Sym_char":
                lambda: sym_char.b_Sym_char("", "", 0),
            "b_Sym_char_class":
                lambda: sym_char_class.b_Sym_char_class("", set(), 0),
            "b_Sym_string":
                lambda: sym_string.b_Sym_string("", "", 0),
            "b_Sym_kchar":
                lambda: sym_kchar.b_Sym_kchar("", ("", ""), 0),
            "DEF_SYMBOLS":
                lambda: b_symbol.DEF_SYMBOLS("", 0),
            "b_Sym_cnt_constr":
                lambda: sym_cnt_constr.b_Sym_cnt_constr("", "", 0, 0, 0),
        }

        # maps symbol string to its id
        symbol_mapper = dict()

        # Load symbols from symbol file; "with" closes the file even if
        # import of some symbol fails.
        with open(SymbolFileName, 'r') as fs:
            for raw_line in fs:
                line = raw_line.split()
                if not line:
                    continue
                # Get symbol ID - subtract 1 (FSM Library use 0 for epsilon symbol, Netbench use -1 for epsilon symbol)
                symbol_id = int(line[1]) - 1
                symbol_mapper[line[0]] = symbol_id
                # First character of symbol string selects the symbol class
                try:
                    cls = b_symbol.io_reverse_mapper[line[0][0]]
                except KeyError:
                    raise nfa_data_import_exception(line[0][0])
                factory = symbol_factories.get(cls)
                if factory is None:
                    raise nfa_data_import_exception(line[0][0])
                # Create new object of selected class and import symbol
                symbol = factory()
                symbol.import_symbol(line[0], symbol_id)
                # Add to alphabet
                self.alphabet[symbol_id] = symbol

        with open(FileName, 'r') as fr:
            start_known = False
            for raw_line in fr:
                line = raw_line.split()
                if not line:
                    continue
                src = int(line[0])
                # first line indicating start state
                if not start_known:
                    self.start = src
                    start_known = True
                if src not in self.states:
                    self.states[src] = b_State(mid = src)
                # line is transition
                if len(line) > 1:
                    des = int(line[1])
                    if des not in self.states:
                        self.states[des] = b_State(mid = des)
                    self.transitions.add((src, symbol_mapper[line[2]], des))
                # line is final state
                else:
                    self.final.add(src)
                    # NOTE(review): an int is stored here while other code
                    # builds b_State with a set of regexp numbers - confirm
                    # that _rnum is meant to hold a plain id.
                    self.states[src]._rnum = src

        self.Flags["ImportFromFsm"] = True
Пример #7
0
    def get_nfa(self):
        """
            Parse a current line and returns parsed nfa.

            The regular expression stored at self._text[self._position] is passed to the external "pcre_parser/parser" program and the MSFM text printed by the parser is converted into nfa_data object. Diagnostic messages are written to stderr.

            :returns: Created automaton in nfa_data format. Returns None if failure happens (no line is selected, parser ends with non zero status or its output can not be converted).
            :rtype: nfa_data or None
        """
        # Check if some reg. exp. are set.
        if self._position < 0:
            return None

        def decode_chars(encoded):
            # Convert "41|42" like list of hex codes to set of characters.
            # Only the low byte of every code is used.
            chars = set()
            for code in set(encoded.split("|")):
                chars.add(chr(int(code, 16) & 255))
            return chars

        def build_char_symbol(chars, symbol_id):
            # Create char if number of symbols is 1, char class otherwise.
            if len(chars) == 1:
                char = chars.pop()
                return sym_char.b_Sym_char(char, char, symbol_id)
            text = "[" + "".join(chars) + "]"
            return sym_char_class.b_Sym_char_class(text, chars, symbol_id)

        # Get line.
        line = self._text[self._position]

        # Remove trailing \n (endswith is safe for an empty line too)
        if line.endswith('\n'):
            line = line[:-1]

        self.last = line

        # find where we are
        msfm_path = aux_func.getPatternMatchDir()

        # invoke C regexp parser
        # Create cnt_constr symbols if requested
        if self.create_cnt_constr == False:
            cmd = msfm_path + "/pcre_parser/parser -o STDOUT -s"
        else:
            cmd = msfm_path + "/pcre_parser/parser -o STDOUT -s -c"
        # Do not create eof symbols if requested
        if self.create_eof_symbols == False:
            cmd += " -E"

        # res[0] is used as exit status, res[1] as MSFM output and res[2]
        # as stderr content of the parser.
        res = aux_func.getstatusoutput(cmd, line)

        def report_failure(headline):
            # Write everything needed to reproduce the failure to stderr.
            sys.stderr.write(headline)
            sys.stderr.write("CMD: " + cmd + "\n")
            sys.stderr.write("PCRE: " + line + "\n")
            sys.stderr.write("MSFM:\n")
            sys.stderr.write(res[1] + "\n")

        # Print stderr if there is some content
        if len(res[2]) != 0:
            sys.stderr.write(res[2] + "\n")
        # If error, stop.
        if res[0] != 0:
            report_failure("PARSER ERROR:\n")
            return None

        try:
            # Create empty object
            nfa = nfa_data.nfa_data()

            # Preprocess automaton file
            FSMfile = res[1].split("\n")

            # Get start state of NFA
            nfa.start = int(FSMfile[2])
            del FSMfile[2]

            # FORMAT of Automata file
            #  - Number of the States in the automaton
            #  - Number of the transition in the automaton
            #  - Each transition is represenetd by one line in the file. Line
            #    is in format Source_State|Symbol|Target_State|Epsilon
            #  - End of the transition table is represented by line of #
            #  - Number of the end states
            #  - Line with identifikator of the endState. Every endstate is
            #    folowed by , (coma)
            #  - End of endState section is represented by line of #
            #  - Number of the symbols in symbol table
            #  - Every symbol is stored on its own line and it is represented
            #    as Symbol_Number:Character1|Character2|
            #  - End of the file

            transition_count = int(FSMfile[1])

            # Transition table is list of the list and represents the whole
            # transition table of the automata. 2 is an index of the first
            # transition.
            TransitionTable = [
                x.split("|") for x in FSMfile[2:transition_count + 2]
            ]

            # List of the endStates is stored after all transition and after
            # 4 other lines (number of states, number of transitions,
            # number of endstates, and the line of ####).
            # Endstates are isolated by , (coma)
            Endstates = FSMfile[transition_count + 4].split(",")

            # Alphabet symbols start on the index transition_count + 7
            # (4 as before + line of #, line of endstates and number of
            # symbols)
            Symbols = FSMfile[transition_count + 7:]

            # Creates end states objects.
            for state in Endstates:
                if state != "":
                    final_state = b_State(int(state), set([self._position]))
                    nfa.states[final_state.get_id()] = final_state
                    nfa.final.add(final_state.get_id())

            # Mapping from symbol content to set of ids of symbols with
            # that content. It is used for removal of duplicit symbols.
            all_msfm_syms = dict()

            # For every symbol in alphabet
            for ActSym in Symbols:
                # Blank line (e.g. after trailing newline) is not a symbol.
                if not ActSym:
                    continue
                # Separate symbol number and symbol data (done by first :)
                StartSym = ActSym.find(":")
                symbol_id = int(ActSym[:StartSym], 16)
                payload = ActSym[StartSym + 1:]

                if payload.startswith('#'):
                    # Counting constraint encoded as #m#n#chars|
                    sharp_split = payload[:-1].split("#")
                    m = int(sharp_split[1])
                    # Empty n means that infinite number of symbols can occure
                    if sharp_split[2] == '':
                        n = float("inf")
                    else:
                        n = int(sharp_split[2])
                    # Get symbol part of encoded cnt constr
                    SymSym = ActSym.rfind("#")
                    symSetMod = decode_chars(
                        ActSym[SymSym + 1:len(ActSym) - 1])

                    if not (m == 0 and n == 0):
                        if len(symSetMod) == 1:
                            # Single char is passed as the char itself.
                            symbol = symSetMod.pop()
                            text_info = symbol + "{" + str(m) + "," + str(
                                n) + "}"
                        else:
                            symbol = symSetMod
                            text_info = "[" + "".join(symSetMod) + "]" + \
                                "{" + str(m) + "," + str(n) + "}"
                        # Create sym_cnt_constr object
                        Tmp = sym_cnt_constr.b_Sym_cnt_constr(
                            text_info, symbol, m, n, symbol_id)
                        nfa.alphabet[Tmp.get_id()] = Tmp
                        # Create mapping from symbol chars to their ids
                        all_msfm_syms.setdefault(
                            (m, n, frozenset(symbol)), set()).add(symbol_id)
                    else:
                        #BUG: Workaround for bug in parser, when cnt constr symbols are generated even construction such as s+, d*, .+, ... are converted. This behaviaor is not OK, but fix of the parser would consume to mauch time. This workaround works OK.
                        # Create mapping from symbol chars to their ids
                        # (before build_char_symbol() can pop from the set)
                        all_msfm_syms.setdefault(
                            frozenset(symSetMod), set()).add(symbol_id)
                        Tmp = build_char_symbol(symSetMod, symbol_id)
                        nfa.alphabet[Tmp.get_id()] = Tmp
                elif payload == "EOF|":
                    # Add EOF symbol into alphabet
                    Symbol = sym_eof.b_Sym_EOF("EOF", symbol_id)
                    nfa.alphabet[Symbol.get_id()] = Symbol
                    # Create mapping from symbol chars to their ids
                    all_msfm_syms.setdefault("EOF", set()).add(symbol_id)
                else:
                    symSetMod = decode_chars(payload[:-1])
                    # Create mapping from symbol chars to their ids
                    # (before build_char_symbol() can pop from the set)
                    all_msfm_syms.setdefault(
                        frozenset(symSetMod), set()).add(symbol_id)
                    Tmp = build_char_symbol(symSetMod, symbol_id)
                    nfa.alphabet[Tmp.get_id()] = Tmp

            # TODO: use special class for Epsilon?
            # Epsilon is representad now as sym_char object with char "" and index -1
            Tmp = sym_char.b_Sym_char("Epsilon", "", -1)
            nfa.alphabet[Tmp.get_id()] = Tmp

            # Symbols seen on epsilon transitions / on other transitions
            removeable_symbols = set()
            nonremoveable_symbols = set()
            # Add non final states to automaton.
            for transition in TransitionTable:
                src = int(transition[0])
                # if not in states, add start state of transition.
                if src not in nfa.states:
                    nfa.states[src] = b_State(src, set())

                des = int(transition[2])
                # if not in states, add end state of transition.
                if des not in nfa.states:
                    nfa.states[des] = b_State(des, set())

                # Handle epsilon transitions.
                if transition[3] == '1':
                    alphaNum = -1
                    removeable_symbols.add(int(transition[1], 16))
                else:
                    alphaNum = int(transition[1], 16)
                    nonremoveable_symbols.add(alphaNum)

                # Add transition to automaton.
                nfa.transitions.add((src, alphaNum, des))

            # Corect the removeable symbols
            removeable_symbols -= nonremoveable_symbols

            # Remove unused symbols
            for rsymbol in removeable_symbols:
                del nfa.alphabet[rsymbol]

            # Remove duplicit symbols
            # Create mapping between current ids and the ids which will be used.
            # Only non removed id can be used as key
            sym_mapping = dict()
            for key in all_msfm_syms:
                ids = all_msfm_syms[key]
                popped = set()
                representative = None
                # Pop ids until the first non removed one is found.
                while ids:
                    candidate = ids.pop()
                    popped.add(candidate)
                    if candidate not in removeable_symbols:
                        representative = candidate
                        break
                if representative is None:
                    # Every id was removed; the set stays empty and
                    # nothing is mapped.
                    continue
                ids |= popped
                for sid in ids:
                    sym_mapping[sid] = representative

            sym_mapping[-1] = -1

            nfa.transitions = set(
                (transition[0], sym_mapping[transition[1]], transition[2])
                for transition in nfa.transitions)

            for sid in sym_mapping:
                if sid != sym_mapping[sid] \
                   and sid not in removeable_symbols:
                    del nfa.alphabet[sid]

            # Somethimg is wrong with the msfm file, try autodetect the start state
            if nfa.start < 0:
                # Dictionary mapping between states and their previous states.
                # Autodetect start state of NFA - remove when start state is aded to the msfm format
                StateInSymbols = dict()
                for state in nfa.states.keys():
                    StateInSymbols[state] = set()
                for transition in nfa.transitions:
                    StateInSymbols.setdefault(transition[2], set()).add(
                        transition[0])

                # Autodetection - start state can have only 0 or 1 in transition originating from itself - problem /^(abc)+..../
                for state in StateInSymbols.keys():
                    previous = StateInSymbols[state]
                    if len(previous) == 0 or previous == set([state]):
                        nfa.start = state

            return nfa
        except Exception:
            # Keep the reason of the failure visible, then report failure
            # by None as documented.
            import traceback
            report_failure("ERROR while parsing msfm output of parser:\n")
            sys.stderr.write(traceback.format_exc())
            return None