def read_cfg(cls, text, start_symbol=cfg.Variable("S"), contains_regexes=False, track_variables=False):
    """Build a grammar of type ``cls`` from its textual description.

    Every non-blank line of ``text`` is one rule whose first space-separated
    token is the head. A line is handed to ``cls._read_line`` unless
    ``contains_regexes`` is set and the line looks like a regex rule, in which
    case its body is parsed with ``Regex.from_python_regex`` and expanded via
    ``cls._create_cfg_from_regex``.

    :param text: grammar description, one rule per line
    :param start_symbol: start symbol passed to the ``cls`` constructor
    :param contains_regexes: allow rule bodies containing ``*``, ``|``, ``+``
        or ``?`` to be interpreted as regular expressions
    :param track_variables: when true, the heads of the rules are recorded as
        variables up front and the variables that ``cls._read_line`` reports
        are discarded; the flag is also forwarded to
        ``cls._create_cfg_from_regex``
    :return: ``cls(variables=..., terminals=..., productions=...,
        start_symbol=start_symbol)``
    """
    variables = set()
    productions = set()
    terminals = set()
    lines = text.splitlines()

    if track_variables:
        for line in lines:
            stripped = line.strip()
            if not stripped:
                # A blank line has no head; without this guard it would
                # register a bogus variable with an empty name.
                continue
            variables.add(cfg.Variable(stripped.split(' ', 1)[0]))

    for line in lines:
        looks_like_regex = (
            contains_regexes
            and len(line.split()) > 1
            and len(line.strip().split(' ', 1)[1]) > 1
            and any(symb in line for symb in ['*', '|', '+', '?'])
        )
        if looks_like_regex:
            # The checks above guarantee the split yields exactly two parts.
            raw_head, raw_body = line.strip().split(' ', 1)
            regex = Regex.from_python_regex(raw_body)
            head = cfg.Variable(raw_head)
            cur_cfg = cls._create_cfg_from_regex(head, regex, track_variables)
            terminals.update(cur_cfg.terminals)
            productions.update(cur_cfg.productions)
            variables.update(cur_cfg.variables)
            continue

        line = line.strip()
        if not line:
            continue
        # When heads are tracked explicitly, variables found by _read_line
        # go into a throwaway set instead of the result.
        line_variables = set() if track_variables else variables
        cls._read_line(line, productions, terminals, line_variables)

    return cls(variables=variables,
               terminals=terminals,
               productions=productions,
               start_symbol=start_symbol)
def from_str(st, py=True):
    """Compile a regular expression string into a ``Regexp``.

    The expression is turned into an epsilon-NFA, determinized, minimized and
    passed through ``SimpleGraph.dfa_normalize_states``. The transitions of
    the resulting automaton are then collected as ``(source, label, target)``
    triples with integer endpoints and string labels.

    :param st: the regular expression
    :param py: parse ``st`` with ``Regex.from_python_regex`` when true,
        otherwise with the plain ``Regex`` constructor
    :return: ``Regexp(size, edges, dfa, states_map)`` where ``size`` is one
        more than the largest state number seen on any edge (``1`` when there
        are no edges)
    """
    parsed = Regex.from_python_regex(st) if py else Regex(st)
    automaton = parsed.to_epsilon_nfa().to_deterministic().minimize()
    automaton, states_map = SimpleGraph.dfa_normalize_states(automaton)

    transitions = []
    largest = 0
    for source, outgoing in automaton.to_dict().items():
        for symbol, target in outgoing.items():
            src = int(str(source))
            dst = int(str(target))
            largest = max(largest, src, dst)
            transitions.append((src, str(symbol), dst))

    return Regexp(largest + 1, transitions, automaton, states_map)
def from_text(cls, text: List[str], use_python_regexes_if_necessary=False, variables=None):
    """Build an instance of ``cls`` wrapping a ``CFG`` read from rule lines.

    Each non-blank line is ``HEAD`` optionally followed by a single space and
    a body. The head of the first non-blank line becomes the start variable.

    * If the body contains any of ``| . ? + -`` it is treated as a regular
      expression and expanded with ``cls._create_cfg_from_regex``. The
      expression is parsed with ``Regex.from_python_regex`` only when it
      contains ``-`` and ``use_python_regexes_if_necessary`` is set; otherwise
      the plain ``Regex`` constructor is used.
    * Otherwise the body is split on single spaces. ``eps`` tokens are
      dropped; a token is a variable when it is in ``variables`` (if that is
      given and non-empty) or, failing that, when it has an upper-case letter;
      every other token is a terminal.

    :param text: iterable of rule lines
    :param use_python_regexes_if_necessary: see above
    :param variables: optional collection of names to treat as variables
    :return: ``cls(CFG(...))``
    """
    regex_markers = ('|', '.', '?', '+', '-')
    seen_vars, seen_terms, seen_prods = set(), set(), set()
    start_var = None

    for raw_line in text:
        stripped = raw_line.strip()
        if not stripped:
            continue

        raw_head, *rest = stripped.split(' ', 1)
        head = Variable(raw_head)
        if start_var is None:
            start_var = head

        if rest and any(marker in rest[0] for marker in regex_markers):
            body_text = rest[0]
            if '-' in body_text and use_python_regexes_if_necessary:
                regex = Regex.from_python_regex(body_text)
            else:
                regex = Regex(body_text)
            sub_cfg = cls._create_cfg_from_regex(head, regex, variables)
            seen_vars.update(sub_cfg.variables)
            seen_terms.update(sub_cfg.terminals)
            seen_prods.update(sub_cfg.productions)
            continue

        # Plain production: classify each body token.
        seen_vars.add(head)
        symbols = rest[0].split(' ') if rest else ''
        body = []
        for symbol in symbols:
            if symbol == 'eps':
                continue
            if variables:
                is_variable = symbol in variables
            else:
                is_variable = any(ch.isupper() for ch in symbol)
            if is_variable:
                obj = Variable(symbol)
                seen_vars.add(obj)
            else:
                obj = Terminal(symbol)
                seen_terms.add(obj)
            body.append(obj)
        seen_prods.add(Production(head, body))

    return cls(CFG(seen_vars, seen_terms, start_var, seen_prods))
def from_regexp(path):
    """Build a ``Graph`` from the regular expression stored in a file.

    The file content is parsed with ``Regex.from_python_regex``, converted to
    an epsilon-NFA, determinized and minimized. Automaton states are numbered
    consecutively from zero in iteration order, and every transition is
    recorded in a per-symbol sparse boolean matrix in
    ``graph.label_dictionary``.

    :param path: path of the file holding the regular expression
    :return: the populated ``Graph``, with ``size``, ``vertices``,
        ``label_dictionary``, ``start_states`` and ``final_states`` filled in
    """
    graph = Graph()

    # The context manager closes the file; only the read needs it open.
    with open(path, 'r') as file:
        pattern = file.read()
    dfa = Regex.from_python_regex(
        pattern).to_epsilon_nfa().to_deterministic().minimize()

    # Map each automaton state to a dense integer index.
    dfa_states = {}
    for state in dfa._states:
        if state not in dfa_states:
            dfa_states[state] = len(dfa_states)
    graph.size = len(dfa_states)

    for state in dfa._states:
        src = dfa_states[state]
        for symbol in dfa._input_symbols:
            for out_state in dfa._transition_function(state, symbol):
                dst = dfa_states[out_state]
                # add all edges in boolean matrix
                if symbol not in graph.label_dictionary:
                    graph.label_dictionary[symbol] = Matrix.sparse(
                        BOOL, graph.size, graph.size)
                graph.label_dictionary[symbol][src, dst] = 1
                graph.vertices.add(src)
                graph.vertices.add(dst)

    # sync start and final states
    graph.start_states = [dfa_states[dfa.start_state]]
    for final_state in dfa._final_states:
        graph.final_states.append(dfa_states[final_state])
    return graph
def test_from_python_brackets(self):
    """``a[bc]`` accepts ``a`` followed by exactly one of ``b`` or ``c``."""
    regex = Regex.from_python_regex("a[bc]")
    for word in (["a", "b"], ["a", "c"]):
        self.assertTrue(regex.accepts(word))
    for word in (["a", "b", "c"], ["a", "a"]):
        self.assertFalse(regex.accepts(word))
def test_from_python_simple(self):
    """A literal pattern ``abc`` accepts only the exact symbol sequence."""
    regex = Regex.from_python_regex("abc")
    self.assertTrue(regex.accepts(["a", "b", "c"]))
    for word in (["a", "b", "b"], ["a", "b"]):
        self.assertFalse(regex.accepts(word))
def read_grammar(cls, name):
    """Read a context-free grammar from a text file and return it as a ``CFG``.

    Each non-blank line has the form ``HEAD <sep> token token | token ...``:
    the first whitespace-separated token is the head, the second token is
    ignored (presumably an arrow), and the remaining tokens are split into
    alternatives at every ``|`` token. The head of the first rule becomes the
    start symbol.

    An alternative in which every token contains ``*`` is treated as a regular
    expression and expanded with ``CFGrammar.read_production_regex``.
    Otherwise each token becomes ``Epsilon()`` (for ``epsilon``), a
    ``Variable`` (when ``str.isupper()`` holds) or a ``Terminal``.

    :param name: path of the grammar file
    :return: the resulting ``CFG``
    :raises ValueError: if a non-blank line has fewer than two tokens
    """
    next_id = 0
    terminals, variables, productions = set(), set(), set()
    start_symb = None

    with open(name, 'r') as file:
        for production_txt in file:
            tokens = production_txt.strip().split()
            if not tokens:
                # Blank lines (e.g. a trailing empty line) carry no rule;
                # unpacking them below would raise ValueError.
                continue

            head, _, *body_full = tokens
            if start_symb is None:
                start_symb = Variable(head)

            # Split the body into alternatives at the "|" separator tokens.
            bodies = [
                list(group)
                for is_separator, group
                in groupby(body_full, lambda x: x == "|")
                if not is_separator
            ]
            for body in bodies:
                if all('*' in value for value in body):
                    # NOTE(review): only the first token is used as the
                    # regex; confirm that regex bodies are always one token.
                    new_productions, new_variables, new_terminals, next_id = \
                        CFGrammar.read_production_regex(
                            head, Regex.from_python_regex(body[0]),
                            next_id, False)
                    productions |= new_productions
                    variables |= new_variables
                    terminals |= new_terminals
                    continue

                body_cfg = []
                for letter in body:
                    if letter == "epsilon":
                        body_cfg.append(Epsilon())
                    elif letter.isupper():
                        non_terminal = Variable(letter)
                        variables.add(non_terminal)
                        body_cfg.append(non_terminal)
                    else:
                        terminal = Terminal(letter)
                        terminals.add(terminal)
                        body_cfg.append(terminal)
                productions.add(Production(Variable(head), body_cfg))

    cfg = CFG(variables, terminals, start_symb, productions)
    return cfg
def from_regex(cls, regex: str, is_python_regex=True):
    """Wrap the minimized epsilon-NFA of ``regex`` in a ``RegexGraphWrapper``.

    :param regex: the regular expression
    :param is_python_regex: parse with ``Regex.from_python_regex`` when true,
        otherwise with the plain ``Regex`` constructor
    :return: ``RegexGraphWrapper`` built from
        ``to_epsilon_nfa().minimize()`` of the parsed expression
    """
    build = Regex.from_python_regex if is_python_regex else Regex
    automaton = build(regex).to_epsilon_nfa().minimize()
    return RegexGraphWrapper(automaton)