def __build_nfa(self):
    """Build the epsilon-transition graph for ``self.pat`` into ``self.nfa``.

    State ``i`` corresponds to pattern position ``i``; state
    ``self.end_state`` is the accepting state.  Only the operators
    ``(``, ``)``, ``|`` and ``*`` add edges; every other character adds
    none here.

    A ``|`` is an alternation operator only while a parenthesised group is
    open.  Outside any group it adds no edges.

    Raises:
        IndexError: if a ``)`` has no matching ``(``.
        AssertionError: if a ``(`` is never closed.
    """
    # Indices of the '(' and '|' operators of the groups still open.
    op_stack = []
    self.nfa = Digraph(len(self.pat) + 1)

    for i in range(self.end_state):
        ch = self.pat[i]
        if ch == '(':
            # Entering a group consumes no input.
            self.nfa.add_edge(i, i + 1)
            op_stack.append(i)
        elif ch == '|':
            # The alternative belongs to the innermost group that is still
            # open.  Take it from the stack: the most recently *seen* '('
            # may belong to an inner group that has already been closed.
            enclosing = next(
                (j for j in reversed(op_stack) if self.pat[j] == '('), -1
            )
            if enclosing >= 0:
                # The group may start at the first symbol after this '|'.
                self.nfa.add_edge(enclosing, i + 1)
                op_stack.append(i)
        elif ch == ')':
            # Each alternative of the group ends by jumping from its '|'
            # to this ')'.
            last_op = op_stack.pop()
            while self.pat[last_op] == '|':
                self.nfa.add_edge(last_op, i)
                last_op = op_stack.pop()
            # last_op is now the '(' that opened this group.
            self.nfa.add_edge(i, i + 1)
            # Handle (...)*: allow skipping the group and repeating it.
            if i < self.end_state - 1 and self.pat[i + 1] == '*':
                self.nfa.add_edge(last_op, i + 1)
                self.nfa.add_edge(i + 1, last_op)
        elif ch == '*':
            # The star can always be left without consuming input.
            self.nfa.add_edge(i, i + 1)
            # Star after a single symbol, e.g. ab*: loop back to the symbol
            # and allow skipping it entirely.
            if i > 0 and self.pat[i - 1] != ')':
                self.nfa.add_edge(i, i - 1)
                self.nfa.add_edge(i - 1, i + 1)

    assert len(op_stack) == 0
class SimpleRegEx:
    """Small regular-expression matcher driven by an NFA simulation.

    Supported syntax: literal characters, ``.`` (any single character),
    ``*`` after a single symbol or a parenthesised group, grouping with
    ``(`` and ``)``, and ``|`` between alternatives inside a group.  A ``|``
    outside any group is treated as a literal character.

    State ``i`` of the automaton corresponds to pattern position ``i``;
    ``end_state`` (``len(pattern)``) is the accepting state.  ``nfa`` holds
    only the transitions that consume no input.
    """

    def __init__(self, pattern: str) -> None:
        """Compile ``pattern``.

        Raises:
            IndexError: if a ``)`` has no matching ``(``.
            AssertionError: if a ``(`` is never closed.
        """
        self.pat = pattern
        self.end_state = len(pattern)
        self.__build_nfa()
        # One flag per state: was it reached by the last DFS pass?
        self._marked = [False] * self.nfa.V()

    def matches(self, text: str) -> bool:
        """Return True if the pattern matches a prefix of ``text``.

        The search stops as soon as the accepting state becomes reachable,
        so the empty prefix counts when the pattern can match nothing.
        """
        # States reachable from the start state without consuming input.
        self.__dfs_cycle([0])
        current_states = self.__get_reachable_states()
        if self.end_state in current_states:
            return True

        for ch in text:
            # Only non-operator positions consume a character.  Operator
            # positions are left through the graph edges, so a '(' or '*'
            # in the text must not "match" the operator itself.
            new_states = [
                state + 1
                for state in current_states
                if state not in self._operator_states
                and self.pat[state] in (ch, '.')
            ]
            self.__dfs_cycle(new_states)
            current_states = self.__get_reachable_states()
            if self.end_state in current_states:
                return True
            if not current_states:
                return False

        # Every exit that could report a match has already been taken.
        return False

    def __get_reachable_states(self):
        """Return the states marked by the last DFS pass, in ascending order."""
        return [i for i in range(self.end_state + 1) if self._marked[i]]

    def __build_nfa(self):
        """Build ``self.nfa`` and record which pattern positions are operators."""
        # Indices of the '(' and '|' operators of the groups still open.
        op_stack = []
        self.nfa = Digraph(len(self.pat) + 1)
        # Pattern positions that act as operators and never consume input.
        self._operator_states = set()

        for i in range(self.end_state):
            ch = self.pat[i]
            if ch == '(':
                # Entering a group consumes no input.
                self.nfa.add_edge(i, i + 1)
                op_stack.append(i)
                self._operator_states.add(i)
            elif ch == '|':
                # The alternative belongs to the innermost group that is
                # still open.  Take it from the stack: the most recently
                # *seen* '(' may belong to an inner group already closed.
                enclosing = next(
                    (j for j in reversed(op_stack) if self.pat[j] == '('), -1
                )
                if enclosing >= 0:
                    # The group may start at the first symbol after '|'.
                    self.nfa.add_edge(enclosing, i + 1)
                    op_stack.append(i)
                    self._operator_states.add(i)
                # Otherwise the '|' is outside any group: it stays a
                # literal character and gets no edges.
            elif ch == ')':
                # Each alternative of the group ends by jumping from its
                # '|' to this ')'.
                last_op = op_stack.pop()
                while self.pat[last_op] == '|':
                    self.nfa.add_edge(last_op, i)
                    last_op = op_stack.pop()
                # last_op is now the '(' that opened this group.
                self.nfa.add_edge(i, i + 1)
                self._operator_states.add(i)
                # Handle (...)*: allow skipping the group and repeating it.
                if i < self.end_state - 1 and self.pat[i + 1] == '*':
                    self.nfa.add_edge(last_op, i + 1)
                    self.nfa.add_edge(i + 1, last_op)
            elif ch == '*':
                # The star can always be left without consuming input.
                self.nfa.add_edge(i, i + 1)
                self._operator_states.add(i)
                # Star after a single symbol, e.g. ab*: loop back to the
                # symbol and allow skipping it entirely.
                if i > 0 and self.pat[i - 1] != ')':
                    self.nfa.add_edge(i, i - 1)
                    self.nfa.add_edge(i - 1, i + 1)

        assert len(op_stack) == 0

    def __dfs_cycle(self, states):
        """Reset the marks, then mark everything reachable from ``states``."""
        self._marked = [False] * self.nfa.V()
        for s in states:
            self.__dfs(s)

    def __dfs(self, v):
        """Mark ``v`` and every state reachable from it in ``self.nfa``."""
        # Explicit stack instead of recursion, so a long chain of edges
        # cannot hit the interpreter's recursion limit.
        self._marked[v] = True
        pending = [v]
        while pending:
            node = pending.pop()
            for w in self.nfa.edges(node):
                if not self._marked[w]:
                    self._marked[w] = True
                    pending.append(w)