def test_get_first_set2(self):
    """ Tests the FIRST sets of a grammar whose variables are all nullable """
    # Example from:
    # https://www.geeksforgeeks.org/first-set-in-syntax-analysis/
    grammar_text = """
        S -> A C B | C b b | B a
        A -> d a | B C
        B -> g | Є
        C -> h | Є
    """
    parser = LLOneParser(CFG.from_text(grammar_text))
    computed = parser.get_first_set()
    # Terminals expected in each FIRST set; epsilon is expected in all of them.
    expected_terminals = [
        ("S", ["d", "g", "h", "b", "a"]),
        ("A", ["d", "g", "h"]),
        ("B", ["g"]),
        ("C", ["h"]),
    ]
    for variable_name, letters in expected_terminals:
        expected = {Terminal(letter) for letter in letters} | {Epsilon()}
        self.assertEqual(computed[Variable(variable_name)], expected)
def test_intersection_with_epsilon(self):
    """ Tests the intersection of a CFG using epsilon productions with a DFA """
    # DFA accepting exactly the word "a".
    first_state = State(0)
    second_state = State(1)
    letter = Symbol("a")
    automaton = DeterministicFiniteAutomaton(
        {first_state, second_state},
        {letter},
        start_state=first_state,
        final_states={second_state})
    automaton.add_transition(first_state, letter, second_state)
    self.assertTrue(automaton.accepts([letter]))

    # Grammar S -> L T, L -> epsilon, T -> a | epsilon.
    ter_a = Terminal("a")
    var_s = Variable("S")
    var_l = Variable("L")
    var_t = Variable("T")
    rules = {
        Production(var_s, [var_l, var_t]),
        Production(var_l, [Epsilon()]),
        Production(var_t, [ter_a]),
        Production(var_t, [Epsilon()]),
    }
    grammar = CFG(productions=rules, start_symbol=var_s)
    self.assertFalse(grammar.is_empty())
    self.assertTrue(grammar.contains([ter_a]))

    # Round trip through a PDA.
    round_trip = grammar.to_pda().to_cfg()
    self.assertFalse(round_trip.is_empty())
    self.assertTrue(round_trip.contains([ter_a]))

    # Round trip through final-state and empty-stack acceptance.
    round_trip = grammar.to_pda().to_final_state().to_empty_stack().to_cfg()
    self.assertFalse(round_trip.is_empty())
    self.assertTrue(round_trip.contains([ter_a]))

    intersection = grammar.intersection(automaton)
    self.assertFalse(intersection.is_empty())
def _get_first_set_production(production, first_set):
    """ Compute the FIRST set of a production body.

    The FIRST sets of the body symbols are accumulated from left to right,
    stopping after the first symbol whose FIRST set does not contain
    epsilon. Epsilon is kept in the result only when every symbol of the
    body had epsilon in its FIRST set.

    Parameters
    ----------
    production
        Object exposing the body symbols through ``body``
    first_set : dict
        Current FIRST set of each symbol; missing symbols count as empty

    Returns
    -------
    first_set_production : set
        A new set; the sets stored in ``first_set`` are not modified
    """
    body = production.body
    collected = set()
    nullable_prefix = 0
    for symbol in body:
        symbol_first = first_set.get(symbol, set())
        collected = collected.union(symbol_first)
        if Epsilon() not in symbol_first:
            break
        nullable_prefix += 1
    if nullable_prefix != len(body):
        # Some symbol blocks epsilon, so the body as a whole is not nullable.
        collected.discard(Epsilon())
    return collected
def read_production_regex(cls, head, regex, id, case_sens=True):
    """ Convert a regex body into productions rooted at ``head``.

    The regex is turned into a minimized automaton. Every state gets a
    fresh variable named ``Id<id>,<state>``, every transition
    ``p --x--> q`` becomes the production ``P -> x Q`` and every final
    state gets an empty production.

    Parameters
    ----------
    head
        Head of the production whose body is ``regex``
    regex
        Object providing ``to_epsilon_nfa()``
    id : int
        First free number for naming state variables
    case_sens : bool
        When True, upper-case symbols become variables and lower-case or
        digit symbols become terminals; when False every non-epsilon
        symbol becomes a terminal

    Returns
    -------
    productions, variables, terminals, id
        The generated productions, the variables and terminals met on
        the transitions, and the next free id

    Raises
    ------
    ValueError
        If ``case_sens`` is True and a symbol is neither upper case,
        lower case nor a digit string
    """
    var_by_state = {}
    terminals, variables, productions = set(), set(), set()
    enfa = regex.to_epsilon_nfa().minimize()
    if len(enfa.states) == 0:
        variables.add(head)
        productions.add(Production(head, [Epsilon()]))
        return productions, variables, terminals, id

    for state in enfa.states:
        var_by_state[state] = Variable(f'Id{id},{state}')
        id += 1

    for start_st in enfa.start_states:
        productions.add(Production(head, [var_by_state[start_st]]))

    for st_from, symb, st_to in enfa._transition_function:
        value = symb.value
        if value == 'eps':
            body_symbol = Epsilon()
        elif value.isupper() and case_sens:
            body_symbol = Variable(value)
            variables.add(body_symbol)
        elif value.isdigit() or value.islower() or not case_sens:
            body_symbol = Terminal(value)
            # Terminals belong in the terminal set; they used to be added
            # to ``variables``, which left ``terminals`` always empty.
            terminals.add(body_symbol)
        else:
            raise ValueError(
                f'Symbol "{value}" should be either lower or upper case')
        productions.add(
            Production(var_by_state[st_from],
                       [body_symbol, var_by_state[st_to]]))

    # Every final state may stop the derivation, including a final state
    # that is never the target of a transition (e.g. a final start state).
    for final_st in enfa.final_states:
        productions.add(Production(var_by_state[final_st], []))
    return productions, variables, terminals, id
def test_nullable_object(self):
    """ Tests the finding of nullable objects """
    start = Variable("S")
    var_a, var_b = Variable("A"), Variable("B")
    ter_a, ter_b = Terminal("a"), Terminal("b")
    # S -> A B, A -> a A A | epsilon, B -> b B B | epsilon
    rules = {
        Production(start, [var_a, var_b]),
        Production(var_a, [ter_a, var_a, var_a]),
        Production(var_a, [Epsilon()]),
        Production(var_b, [ter_b, var_b, var_b]),
        Production(var_b, [Epsilon()]),
    }
    grammar = CFG({var_a, var_b, start}, {ter_a, ter_b}, start, rules)
    nullable = grammar.get_nullable_symbols()
    self.assertEqual(nullable, {var_a, var_b, start})
def get_parse_tree(self, word, left=True):
    """
    Get a parse tree for a given word

    Parameters
    ----------
    word : list
        The word to parse; epsilon symbols in it are ignored
    left : bool
        Whether the recursive expansion proceeds from the left (the
        default) or from the right

    Returns
    -------
    parse_tree : :class:`~pyformlang.cfg.ParseTree`
        The parse tree

    Raises
    --------
    NotParsableException
        When the word cannot be parsed

    """
    epsilon = Epsilon()
    terminals = [to_terminal(symbol) for symbol in word if symbol != epsilon]
    root = ParseTree(self._cfg.start_symbol)
    pending = [(self._cfg.start_symbol, root)]
    if not self._get_parse_tree_sub(terminals, pending, left):
        raise NotParsableException
    return root
def _test_profiling_intersection(self):
    """ Profiling helper: intersects a grammar with a DFA for a^50 b^50 """
    size = 50
    states = [State(index) for index in range(size * 2 + 1)]
    symb_a = Symbol("a")
    symb_b = Symbol("b")
    dfa = DeterministicFiniteAutomaton(states,
                                       {symb_a, symb_b},
                                       start_state=states[0],
                                       final_states={states[-1]})
    # A chain of states reading `size` a's followed by `size` b's.
    for index in range(size * 2):
        symbol = symb_a if index < size else symb_b
        dfa.add_transition(states[index], symbol, states[index + 1])

    ter_a = Terminal("a")
    ter_b = Terminal("b")
    var_s = Variable("S")
    var_s1 = Variable("S1")
    var_l = Variable("L")
    rules = [
        Production(var_s, [var_l, var_s1]),
        Production(var_l, [Epsilon()]),
        Production(var_s1, [ter_a, var_s1, ter_b]),
        Production(var_s1, [ter_b, var_s1, ter_a]),
        Production(var_s1, []),
    ]
    grammar = CFG(productions=rules, start_symbol=var_s)
    intersection = grammar.intersection(dfa)
    self.assertFalse(intersection.is_empty())
    self.assertTrue(intersection.contains([ter_a] * size + [ter_b] * size))
    self.assertFalse(intersection.contains([]))
def from_cfg(cls, cfg: CFG):
    """ Create a recursive automaton from context-free grammar

    Each line of the grammar text (after
    ``remove_repetition_of_nonterminals_from_productions``) becomes one
    box: the body is read as a regex, converted to a minimized automaton
    and labelled with the head of the line.

    Parameters
    -----------
    cfg : :class:`~pyformlang.cfg.CFG`
        The context-free grammar

    Returns
    -----------
    rsa : :class:`~pyformlang.rsa.RecursiveAutomaton`
        The new recursive automaton built from context-free grammar

    """
    start_label = to_symbol(cfg.start_symbol)
    grammar_text = remove_repetition_of_nonterminals_from_productions(
        cfg.to_text())
    epsilon_text = Epsilon().to_text()

    labels = set()
    boxes = set()
    for line in grammar_text.splitlines():
        head, body = line.split(" -> ")
        head_label = to_symbol(head)
        labels.add(head_label)
        # An empty body stands for the empty word.
        body_regex = Regex(body if body != "" else epsilon_text)
        boxes.add(Box(body_regex.to_epsilon_nfa().minimize(), head_label))
    return RecursiveAutomaton(labels, start_label, boxes)
def test_intersection_dfa2(self):
    """ Tests the intersection of a CFG with a DFA accepting every word """
    only_state = State(0)
    symb_a = Symbol("a")
    symb_b = Symbol("b")
    dfa = DeterministicFiniteAutomaton({only_state},
                                       {symb_a, symb_b},
                                       start_state=only_state,
                                       final_states={only_state})
    for symbol in (symb_a, symb_b):
        dfa.add_transition(only_state, symbol, only_state)
    self.assertTrue(dfa.accepts([symb_a, symb_a, symb_b, symb_b]))

    ter_a = Terminal("a")
    ter_b = Terminal("b")
    var_s = Variable("S")
    var_s1 = Variable("S1")
    var_l = Variable("L")
    rules = {
        Production(var_s, [var_l, var_s1]),
        Production(var_l, [Epsilon()]),
        Production(var_s1, [ter_a, var_s1, ter_b]),
        Production(var_s1, [ter_b, var_s1, ter_a]),
        Production(var_s1, []),
    }
    grammar = CFG(productions=rules, start_symbol=var_s)
    balanced = [ter_a, ter_a, ter_b, ter_b]
    self.assertTrue(grammar.contains(balanced))
    self.assertFalse(grammar.contains([ter_a, ter_a, ter_b]))

    intersection = grammar.intersection(dfa)
    self.assertFalse(intersection.is_empty())
    self.assertTrue(intersection.contains(balanced))
    self.assertTrue(intersection.contains([]))
def test_to_pda(self):
    """ Tests the conversion to PDA """
    var_e = Variable("E")
    var_i = Variable("I")
    ter_a, ter_b = Terminal("a"), Terminal("b")
    ter_0, ter_1 = Terminal("0"), Terminal("1")
    ter_par_open, ter_par_close = Terminal("("), Terminal(")")
    ter_mult, ter_plus = Terminal("*"), Terminal("+")

    # Expression rules.
    rules = {
        Production(var_e, [var_i]),
        Production(var_e, [var_e, ter_plus, var_e]),
        Production(var_e, [var_e, ter_mult, var_e]),
        Production(var_e, [ter_par_open, var_e, ter_par_close]),
    }
    # Identifier rules: a single letter, or an identifier followed by
    # one more symbol (or by epsilon).
    rules.update(Production(var_i, [letter]) for letter in (ter_a, ter_b))
    rules.update(
        Production(var_i, [var_i, suffix])
        for suffix in (ter_a, ter_b, ter_0, ter_1, Epsilon()))

    terminals = {
        ter_a, ter_b, ter_0, ter_1,
        ter_par_open, ter_par_close, ter_mult, ter_plus,
    }
    grammar = CFG({var_e, var_i}, terminals, var_e, rules)
    automaton = grammar.to_pda()
    self.assertEqual(len(automaton.states), 1)
    self.assertEqual(len(automaton.final_states), 0)
    self.assertEqual(len(automaton.input_symbols), 8)
    self.assertEqual(len(automaton.stack_symbols), 10)
    self.assertEqual(automaton.get_number_transitions(), 19)
def test_generating_object(self):
    """ Test the finding of CFGObject """
    var_a = Variable("A")
    var_b = Variable("B")
    ter_a = Terminal("a")
    ter_b = Terminal("b")
    start = Variable("S")
    prod0 = Production(start, [var_a, var_b])
    prod1 = Production(start, [ter_a])
    prod2 = Production(var_a, [ter_b])
    prod3 = Production(var_b, [Epsilon()])

    # (productions, expected generating symbols); B only becomes
    # generating once it has the production B -> epsilon.
    scenarios = [
        ({prod0, prod1, prod2},
         {var_a, ter_a, ter_b, start}),
        ({prod0, prod1, prod2, prod3},
         {var_a, var_b, ter_a, ter_b, start}),
    ]
    for rules, generating in scenarios:
        grammar = CFG({var_a, var_b, start}, {ter_a, ter_b}, start, rules)
        self.assertEqual(len(grammar.variables), 3)
        self.assertEqual(len(grammar.terminals), 2)
        self.assertEqual(len(grammar.productions), len(rules))
        self.assertEqual(grammar.get_generating_symbols(), generating)
def test_derivation_empty(self):
    """ Tests the rightmost derivation of the empty word """
    start = Variable("S")
    grammar = CFG(productions=[Production(start, [Epsilon()])],
                  start_symbol=start)
    tree = grammar.get_cnf_parse_tree([])
    steps = tree.get_rightmost_derivation()
    self.assertEqual([[start], []], steps)
def _initialize_follow_set(self, first_set):
    """ Build the initial FOLLOW sets from the FIRST sets.

    The start symbol is followed by the end marker ``"$"``. For every
    symbol of every production body, the FIRST sets of the symbols to
    its right are added to its FOLLOW set, stopping after the first one
    whose FIRST set lacks epsilon. Epsilon itself is removed from the
    FOLLOW set.

    Parameters
    ----------
    first_set : dict
        FIRST set of each symbol; missing symbols count as empty

    Returns
    -------
    follow_set : dict
        The initial FOLLOW sets
    to_process : SetQueue
        The start symbol and each symbol whose FOLLOW set is non-empty
    """
    to_process = SetQueue()
    start = self._cfg.start_symbol
    follow_set = {start: {"$"}}
    to_process.append(start)
    for production in self._cfg.productions:
        body = production.body
        for position, symbol in enumerate(body):
            for successor in body[position + 1:]:
                successor_first = first_set.get(successor, set())
                follow_set[symbol] = follow_set.get(
                    symbol, set()).union(successor_first)
                if Epsilon() not in successor_first:
                    break
            current_follow = follow_set.get(symbol)
            if current_follow is None:
                # Symbol has no FOLLOW entry yet: nothing to clean or queue.
                continue
            current_follow.discard(Epsilon())
            if current_follow:
                to_process.append(symbol)
    return follow_set, to_process
def read_grammar(cls, name):
    """ Read a grammar from a text file and build a CFG.

    Each non-blank line has the form ``HEAD <sep> body | body | ...``
    where tokens are whitespace separated and the second token (the
    separator) is ignored. The head of the first production is the
    start symbol. A body whose tokens all contain ``*`` is treated as a
    regex: its first token is parsed with ``Regex.from_python_regex``
    and expanded through ``CFGrammar.read_production_regex``. In any
    other body ``epsilon`` is the empty word, upper-case tokens are
    variables and all other tokens are terminals.

    Parameters
    ----------
    name : str
        Path of the file to read

    Returns
    -------
    cfg : CFG
        The grammar built from the file
    """
    next_id = 0
    terminals, variables, productions = set(), set(), set()
    start_symb = None
    with open(name, 'r') as file:
        productions_txt = file.readlines()

    for production_txt in productions_txt:
        tokens = production_txt.split()
        if not tokens:
            # Blank lines used to crash on unpacking; skip them.
            continue
        head, _, *body_full = tokens
        if start_symb is None:
            start_symb = Variable(head)
        # Split the right-hand side on "|" into alternative bodies.
        bodies = [
            list(group)
            for is_separator, group in groupby(body_full,
                                               lambda x: x == "|")
            if not is_separator
        ]
        for body in bodies:
            is_regex = all('*' in value for value in body)
            if is_regex:
                new_productions, new_variables, new_terminals, next_id = \
                    CFGrammar.read_production_regex(
                        head, Regex.from_python_regex(body[0]),
                        next_id, False)
                productions |= new_productions
                variables |= new_variables
                terminals |= new_terminals
            else:
                body_cfg = []
                for letter in body:
                    if letter == "epsilon":
                        body_cfg.append(Epsilon())
                    elif letter.isupper():
                        non_terminal = Variable(letter)
                        variables.add(non_terminal)
                        body_cfg.append(non_terminal)
                    else:
                        terminal = Terminal(letter)
                        terminals.add(terminal)
                        body_cfg.append(terminal)
                productions.add(Production(Variable(head), body_cfg))
    return CFG(variables, terminals, start_symb, productions)
def test_get_first_set(self):
    """ Tests the FIRST sets of an arithmetic expression grammar """
    # Example from:
    # https://www.geeksforgeeks.org/first-set-in-syntax-analysis/
    grammar_text = """
        E -> T E’
        E’ -> + T E’ | Є
        T -> F T’
        T’ -> * F T’ | Є
        F -> ( E ) | id
    """
    parser = LLOneParser(CFG.from_text(grammar_text))
    computed = parser.get_first_set()
    opening = {Terminal("("), Terminal("id")}
    expectations = [
        ("E", opening),
        ("E’", {Terminal("+"), Epsilon()}),
        ("T", opening),
        ("T’", {Terminal("*"), Epsilon()}),
        ("F", opening),
    ]
    for variable_name, expected in expectations:
        self.assertEqual(computed[Variable(variable_name)], expected)
def test_membership(self):
    """ Tests the membership of a CFG """
    # pylint: disable=too-many-locals
    var_useless = Variable("USELESS")
    var_s = Variable("S")
    var_b = Variable("B")
    ter_a = Terminal("a")
    ter_b = Terminal("b")
    ter_c = Terminal("c")
    prod0 = Production(var_s, [ter_a, var_s, var_b])
    prod1 = Production(var_useless, [ter_a, var_s, var_b])
    prod2 = Production(var_s, [var_useless])
    prod4 = Production(var_b, [ter_b])
    prod5 = Production(var_useless, [])
    cfg0 = CFG({var_useless, var_s},
               {ter_a, ter_b},
               var_s,
               {prod0, prod1, prod2, prod4, prod5})
    accepted = [
        [Epsilon()],
        [ter_a, ter_b],
        [ter_a, ter_a, ter_b, ter_b],
        [ter_a, ter_a, ter_a, ter_b, ter_b, ter_b],
    ]
    rejected = [
        [ter_a, ter_b, ter_b],
        [ter_a, ter_b, ter_c, ter_b],
        [ter_a, ter_a, ter_a, ter_b, ter_b],
    ]
    for word in accepted:
        self.assertTrue(cfg0.contains(word))
    for word in rejected:
        self.assertFalse(cfg0.contains(word))

    # Without any empty production the empty word is not generated.
    prod3 = Production(var_s, [ter_c])
    cfg0 = CFG({var_s}, {ter_a, ter_b, ter_c}, var_s, {prod0, prod3})
    self.assertFalse(cfg0.contains([Epsilon()]))

    var_a = Variable("A")
    rules = {
        Production(var_s, [var_a, var_b]),
        Production(var_a, [var_a, var_b]),
        Production(var_a, [ter_a]),
        Production(var_b, [ter_b]),
    }
    cfg1 = CFG({var_a, var_b, var_s}, {ter_a, ter_b}, var_s, rules)
    self.assertTrue(cfg1.contains([ter_a, ter_b, ter_b]))

    # Same grammar and word, given as plain strings.
    cfg1 = CFG({"A", "B", "S"}, {"a", "b"}, "S", rules)
    self.assertTrue(cfg1.contains(["a", "b", "b"]))
def cyk(cfgrammar, w):
    """ Decide with a CYK-style table whether ``w`` is generated.

    ``w`` is split on whitespace and every token is looked up as a
    ``Terminal``. An empty word is delegated to
    ``cfgrammar.generate_epsilon()``.

    NOTE(review): only productions with a single-symbol body or a
    two-symbol body are used, and both symbols of a two-symbol body must
    be variables of the grammar (otherwise a ``KeyError`` is raised) —
    presumably the grammar is expected to be in Chomsky normal form;
    confirm against the callers.

    Parameters
    ----------
    cfgrammar
        Grammar exposing ``variables``, ``productions``,
        ``start_symbol`` and ``generate_epsilon()``
    w : str
        The word, as whitespace-separated symbols

    Returns
    -------
    bool
        True if the start symbol derives the whole word; for an empty
        word, whatever ``cfgrammar.generate_epsilon()`` returns
    """
    tokens = w.split()
    length = len(tokens)
    if length == 0:
        return cfgrammar.generate_epsilon()

    # Materialize once instead of on every inner-loop iteration.
    productions = list(cfgrammar.productions)
    number = len(cfgrammar.variables)
    var_index = dict(zip(cfgrammar.variables, range(number)))
    var_by_index = {index: var for var, index in var_index.items()}

    # matrix[v][i][j] is non-zero when variable v derives tokens[i..j].
    matrix = [[[0 for _ in range(length)] for _ in range(length)]
              for _ in range(number)]

    # Heads of the productions grouped by their single-symbol body.
    heads_by_body = defaultdict(list)
    for production in productions:
        heads_by_body[check_eps(production)].append(production.head)

    # Spans of length one. Tokens from str.split() never contain
    # whitespace, so each of them is a plain terminal.
    for position, token in enumerate(tokens):
        for head in heads_by_body.get(Terminal(token), ()):
            matrix[var_index[head]][position][position] = 1

    # Longer spans, from the shortest to the longest.
    for span in range(1, length):
        for i in range(length - span):
            j = i + span
            for n in range(number):
                head = var_by_index[n]
                cell = 0
                for production in productions:
                    body = production.body
                    if not (production.head == head and len(body) == 2):
                        continue
                    left, right = list(body)
                    left_table = matrix[var_index[left]]
                    right_table = matrix[var_index[right]]
                    for k in range(i, j):
                        cell = left_table[i][k] * right_table[k + 1][j]
                        if cell:
                            break
                    if cell:
                        break
                matrix[n][i][j] = cell
    return bool(matrix[var_index[cfgrammar.start_symbol]][0][length - 1])
def _get_triggers_follow_set(self, first_set):
    """ Map each production head to the body symbols at the nullable end.

    A body symbol is recorded for a head when every symbol to its right
    in that body has epsilon in its FIRST set (which is trivially true
    for the last symbol).

    Parameters
    ----------
    first_set : dict
        FIRST set of each symbol; missing symbols count as empty

    Returns
    -------
    triggers : dict
        For each production head, the set of such body symbols
    """
    triggers = {}
    for production in self._cfg.productions:
        triggered = triggers.setdefault(production.head, set())
        body = production.body
        for position, symbol in enumerate(body):
            tail_is_nullable = all(
                Epsilon() in first_set.get(successor, set())
                for successor in body[position + 1:])
            if tail_is_nullable:
                triggered.add(symbol)
    return triggers
def _initialize_first_set(self, triggers):
    """ Build the initial FIRST sets and the queue of symbols to process.

    Every terminal starts with itself as its FIRST set and every head of
    a production with an empty body starts with ``{epsilon}``. Whatever
    ``triggers`` lists for those symbols is queued for processing.

    Parameters
    ----------
    triggers : dict
        For a symbol, the symbols to queue once its FIRST set is known

    Returns
    -------
    first_set : dict
        The initial FIRST sets
    to_process : SetQueue
        The queued symbols
    """
    first_set = {}
    to_process = SetQueue()

    def enqueue_triggered_by(symbol):
        for triggered in triggers.get(symbol, []):
            to_process.append(triggered)

    # Initialisation: a terminal is its own FIRST set
    for terminal in self._cfg.terminals:
        first_set[terminal] = {terminal}
        enqueue_triggered_by(terminal)

    # Heads that generate only epsilon
    for production in self._cfg.productions:
        if production.body:
            continue
        first_set[production.head] = {Epsilon()}
        enqueue_triggered_by(production.head)
    return first_set, to_process
def regex_to_grammar_productions(regex, head, var_dict, terminal_dict):
    """ Convert a regex into productions rooted at ``head``.

    The regex is turned into a minimized automaton. Every state gets a
    fresh variable, ``head`` produces the variable of each start state,
    every transition ``p --x--> q`` becomes ``P -> x Q`` and every final
    state gets an empty production.

    Parameters
    ----------
    regex
        Object providing ``to_epsilon_nfa()``
    head
        Head variable of the production being expanded
    var_dict : dict
        Maps automaton symbols to the variables they stand for
    terminal_dict : dict
        Maps automaton symbols to the terminals they stand for

    Returns
    -------
    production_set : set
        The generated productions

    Raises
    ------
    ValueError
        If a transition symbol is in neither dictionary and is not the
        epsilon symbol
    """
    production_set = set()

    # Getting a minimized automaton from the regex
    enfa = regex.to_epsilon_nfa().minimize()
    transitions = enfa._transition_function._transitions

    # Producing one variable with a unique name per automaton state
    state_vars = {}
    for state in enfa.states:
        state_vars[state] = Variable(
            '%s#REGEX#%s' % (head.value, get_new_var_num()))

    # Productions from head to the start states. These do not depend on
    # the transitions, so they are added even when there are none.
    for start_state in enfa.start_states:
        production_set.add(Production(head, [state_vars[start_state]]))

    # Getting productions from the transitions
    for head_state in transitions:
        for sym in list(transitions[head_state]):
            body_state = transitions[head_state][sym]
            if sym in var_dict:
                body_symbol = var_dict[sym]
            elif sym in terminal_dict:
                body_symbol = terminal_dict[sym]
            elif sym == EPS_SYM:
                body_symbol = Epsilon()
            else:
                raise ValueError(
                    f'Symbol "{sym}" is not defined as '
                    'a terminal or a variable')
            production_set.add(
                Production(state_vars[head_state],
                           [body_symbol, state_vars[body_state]]))

    # Every final state may stop the derivation, including one that is
    # never the target of a transition (e.g. a final start state).
    for final_state in enfa.final_states:
        production_set.add(Production(state_vars[final_state], []))
    return production_set
def get_llone_parse_tree(self, word):
    """
    Get LL(1) parse Tree

    Parameters
    ----------
    word : list
        The word to parse; epsilon symbols in it are ignored

    Returns
    -------
    parse_tree : :class:`~pyformlang.cfg.ParseTree`
        The parse tree

    Raises
    --------
    NotParsableException
        When the word cannot be parsed

    """
    word = [to_terminal(x) for x in word if x != Epsilon()]
    word.append("$")
    # Reverse so that the next input symbol is word[-1] and can be popped.
    word = word[::-1]
    parsing_table = self.get_llone_parsing_table()
    parse_tree = ParseTree(self._cfg.start_symbol)
    stack = ["$", parse_tree]
    while stack:
        current = stack.pop()
        if current == "$":
            if word[-1] == "$":
                return parse_tree
            # The stack is exhausted but input remains. Falling through
            # would read ``.value`` on the "$" string sentinel.
            raise NotParsableException
        if current.value == word[-1]:
            word.pop()
        else:
            rule_applied = list(
                parsing_table.get(current.value, {}).get(word[-1], []))
            if len(rule_applied) != 1:
                raise NotParsableException
            # Push the body reversed so its first symbol is expanded
            # first, then restore left-to-right order of the sons.
            for component in rule_applied[0].body[::-1]:
                new_node = ParseTree(component)
                current.sons.append(new_node)
                stack.append(new_node)
            current.sons = current.sons[::-1]
    raise NotParsableException
def test_remove_epsilon(self):
    """ Tests the removal of epsilon """
    start = Variable("S")
    var_a, var_b = Variable("A"), Variable("B")
    ter_a, ter_b = Terminal("a"), Terminal("b")
    # The empty word is written both as [Epsilon()] and as an empty body.
    rules = {
        Production(start, [var_a, var_b]),
        Production(var_a, [ter_a, var_a, var_a]),
        Production(var_a, [Epsilon()]),
        Production(var_b, [ter_b, var_b, var_b]),
        Production(var_b, []),
    }
    cfg = CFG({var_a, var_b, start}, {ter_a, ter_b}, start, rules)
    without_epsilon = cfg.remove_epsilon()
    self.assertEqual(len(without_epsilon.variables), 3)
    self.assertEqual(len(without_epsilon.terminals), 2)
    self.assertEqual(len(set(without_epsilon.productions)), 9)
    self.assertEqual(len(without_epsilon.get_nullable_symbols()), 0)
    self.assertFalse(cfg.is_empty())
def regex_to_grammar_productions(regex, head):
    """ Convert a regex into productions rooted at ``head``.

    The regex is turned into a minimized automaton. Every state gets a
    fresh variable, ``head`` produces the variable of each start state,
    every transition ``p --x--> q`` becomes ``P -> x Q`` and every final
    state gets an empty production. A transition symbol whose value is
    ``EPS_SYM`` becomes epsilon, an upper-case one a variable and any
    other one a terminal.

    Parameters
    ----------
    regex
        Object providing ``to_epsilon_nfa()``
    head
        Head variable of the production being expanded

    Returns
    -------
    production_set : set
        The generated productions
    """
    production_set = set()

    # Getting a minimized automaton from the regex
    enfa = regex.to_epsilon_nfa().minimize()
    transitions = enfa._transition_function._transitions

    # Producing one variable with a unique name per automaton state
    state_vars = {}
    for state in enfa.states:
        state_vars[state] = Variable(
            '%s#REGEX#%s' % (head.value, get_new_var_num()))

    # Productions from head to the start states. These do not depend on
    # the transitions, so they are added even when there are none.
    for start_state in enfa.start_states:
        production_set.add(Production(head, [state_vars[start_state]]))

    # Getting productions from the transitions
    for head_state in transitions:
        for sym in list(transitions[head_state]):
            body_state = transitions[head_state][sym]
            if sym.value == EPS_SYM:
                body_symbol = Epsilon()
            elif sym.value.isupper():
                body_symbol = Variable(sym)
            else:
                body_symbol = Terminal(sym)
            production_set.add(
                Production(state_vars[head_state],
                           [body_symbol, state_vars[body_state]]))

    # Every final state may stop the derivation, including one that is
    # never the target of a transition (e.g. a final start state).
    for final_state in enfa.final_states:
        production_set.add(Production(state_vars[final_state], []))
    return production_set
def regex_to_production(regex, head):
    """ Convert a regex into productions rooted at ``head``.

    The regex is turned into a minimized automaton. Every state gets a
    fresh variable, ``head`` produces the variable of each start state,
    every transition ``p --x--> q`` becomes ``P -> x Q`` and every final
    state gets an empty production. A transition symbol whose value is
    ``EPS_SYM`` becomes epsilon, an upper-case one a variable and any
    other one a terminal.

    Parameters
    ----------
    regex
        Object providing ``to_epsilon_nfa()``
    head
        Head variable of the production being expanded

    Returns
    -------
    production_set : set
        The generated productions
    """
    production_set = set()
    enfa = regex.to_epsilon_nfa().minimize()
    transitions = enfa._transition_function._transitions

    # One variable with a unique name per automaton state.
    state_vars = {}
    for state in enfa.states:
        state_vars[state] = Variable(
            '%s#REGEX#%s' % (head.value, get_new_var_num()))

    # Productions from head to the start states. These do not depend on
    # the transitions, so they are added even when there are none.
    for start_state in enfa.start_states:
        production_set.add(Production(head, [state_vars[start_state]]))

    for head_state in transitions:
        for symbol in list(transitions[head_state]):
            body_state = transitions[head_state][symbol]
            if symbol.value == EPS_SYM:
                body_symbol = Epsilon()
            elif symbol.value.isupper():
                body_symbol = Variable(symbol)
            else:
                body_symbol = Terminal(symbol)
            production_set.add(
                Production(state_vars[head_state],
                           [body_symbol, state_vars[body_state]]))

    # Every final state may stop the derivation, including one that is
    # never the target of a transition (e.g. a final start state).
    for final_state in enfa.final_states:
        production_set.add(Production(state_vars[final_state], []))
    return production_set
def test_pda_object_creator(self):
    """ Tests that the CFG epsilon is converted to the PDA epsilon """
    creator = PDAObjectCreator([], [])
    as_input_symbol = creator.get_symbol_from(Epsilon())
    self.assertEqual(as_input_symbol, pda.Epsilon())
    as_stack_symbol = creator.get_stack_symbol_from(Epsilon())
    self.assertEqual(as_stack_symbol, pda.Epsilon())
def check_eps(p):
    """ Return the key describing the body of production ``p``.

    Returns epsilon for an empty body, the single symbol for a body of
    length one, and ``None`` for any longer body.
    """
    body = p.body
    if not body:
        return Epsilon()
    if len(body) != 1:
        return None
    return next(iter(body))