def induce_grammar_from(dsg, rec_par, decomp, labeling=(lambda x, y: str(x)), terminal_labeling=id,
                        terminal_labeling_lcfrs=None, start="START", normalize=True,
                        enforce_outputs=True):
    """Induce an LCFRS/DOG hybrid grammar from a single dominance-structured graph.

    Recursively extracts rules along the decomposition *decomp*, then adds a
    chain rule from *start* to the nonterminal produced for the topmost
    decomposition component.

    NOTE(review): the default ``terminal_labeling=id`` is the builtin ``id``
    (object address), not an identity function — confirm this is intended.

    :param dsg: dominance graph (provides ``.dog``)
    :param rec_par: recursive partitioning
    :param decomp: decomposition of the graph; ``decomp[0]`` is the top component
    :param labeling: nonterminal labeling function
    :param terminal_labeling: terminal labeling for the DOG part
    :param terminal_labeling_lcfrs: terminal labeling for the LCFRS part
        (defaults to *terminal_labeling*)
    :param start: start nonterminal of the resulting grammar
    :rtype: LCFRS
    """
    if terminal_labeling_lcfrs is None:
        terminal_labeling_lcfrs = terminal_labeling
    lcfrs = LCFRS(start=start)
    ordered_nodes = dsg.dog.ordered_nodes()
    rhs_nont = induce_grammar_rec(lcfrs, dsg, rec_par, decomp, labeling, terminal_labeling,
                                  terminal_labeling_lcfrs, normalize, enforce_outputs,
                                  ordered_nodes=ordered_nodes)
    rhs_top = dsg.dog.top(decomp[0])

    # Construct a chain rule from START to the initial nonterminal of the
    # decomposition.
    # LCFRS part:
    lcfrs_lhs = LCFRS_lhs(start)
    lcfrs_lhs.add_arg([LCFRS_var(0, 0)])

    # DOG part: a fresh graph with one node per top element of the
    # decomposition's root component.
    dog = DirectedOrderedGraph()
    assert len(dsg.dog.inputs) == 0
    assert not enforce_outputs or len(dsg.dog.outputs) > 0
    for node in range(len(rhs_top)):
        dog.add_node(node)
    for output in dsg.dog.outputs:
        dog.add_to_outputs(rhs_top.index(output))
    dog.add_nonterminal_edge([], list(range(len(rhs_top))), enforce_outputs)

    # no synchronization between LCFRS and DOG part of the chain rule
    sync = []

    lcfrs.add_rule(lcfrs_lhs, [rhs_nont], weight=1.0, dcp=[dog, sync])
    return lcfrs
def test_la_viterbi_parsing_3(self):
    """Parse "a a a" with a small ambiguous LCFRS in latent-Viterbi mode and
    compare the latent-Viterbi derivation against the first k-best derivation."""
    grammar = LCFRS("S")
    # (lhs nonterminal, lhs argument, rhs nonterminals, weight) — rules 0..4
    rule_table = [
        ("B", ["a"], [], 0.25),
        ("A", ["a"], [], 0.5),
        ("S", [LCFRS_var(0, 0)], ["B"], 1.0),
        ("A", [LCFRS_var(0, 0), LCFRS_var(1, 0)], ["A", "B"], 0.5),
        ("B", [LCFRS_var(0, 0), LCFRS_var(1, 0)], ["A", "B"], 0.75),
    ]
    for nont, argument, rhs, weight in rule_table:
        lhs = LCFRS_lhs(nont)
        lhs.add_arg(argument)
        grammar.add_rule(lhs, rhs, weight)
    grammar.make_proper()

    word = ["a"] * 3
    nontMap = Enumerator()
    gi = PyGrammarInfo(grammar, nontMap)
    sm = PyStorageManager()
    # note: object_index calls also enumerate the nonterminals as a side effect
    print(nontMap.object_index("S"))
    print(nontMap.object_index("B"))
    la = build_PyLatentAnnotation_initial(grammar, gi, sm)

    parser = DiscodopKbestParser(grammar, la=la, nontMap=nontMap, grammarInfo=gi,
                                 latent_viterbi_mode=True)
    parser.set_input(word)
    parser.parse()
    self.assertTrue(parser.recognized())
    der = parser.latent_viterbi_derivation(True)
    print(der)

    # the first derivation of the k-best enumeration
    der2 = None
    for weight, derivation in parser.k_best_derivation_trees():
        if der2 is None:
            der2 = derivation
        print(weight, derivation)
    print(der2)
def test_projection_based_parser_k_best_hack(self):
    """Parse "a a a" with a coarse-to-fine parser on top of a k-best base
    parser and inspect both the max-rule-product and the best derivation."""
    grammar = LCFRS("S")
    # (lhs nonterminal, lhs argument, rhs nonterminals, weight) — rules 0..4
    rule_table = [
        ("B", ["a"], [], 0.25),
        ("A", ["a"], [], 0.5),
        ("S", [LCFRS_var(0, 0)], ["B"], 1.0),
        ("A", [LCFRS_var(0, 0), LCFRS_var(1, 0)], ["A", "B"], 0.5),
        ("B", [LCFRS_var(0, 0), LCFRS_var(1, 0)], ["A", "B"], 0.75),
    ]
    for nont, argument, rhs, weight in rule_table:
        lhs = LCFRS_lhs(nont)
        lhs.add_arg(argument)
        grammar.add_rule(lhs, rhs, weight)
    grammar.make_proper()

    word = ["a"] * 3
    nontMap = Enumerator()
    gi = PyGrammarInfo(grammar, nontMap)
    sm = PyStorageManager()
    la = build_PyLatentAnnotation_initial(grammar, gi, sm)

    parser = Coarse_to_fine_parser(grammar, la, gi, nontMap,
                                   base_parser_type=GFParser_k_best)
    parser.set_input(word)
    parser.parse()
    self.assertTrue(parser.recognized())

    der = parser.max_rule_product_derivation()
    print(der)
    der = parser.best_derivation_tree()
    print(der)
    for node in der.ids():
        print(der.getRule(node), der.spanned_ranges(node))
def build_grammar(self):
    """Build a tiny LCFRS over {S, A} with terminal "a".

    :return: triple of the grammar, the index of the binary rule S -> S S,
        and the index of the terminal rule S -> a.
    """
    grammar = LCFRS("S")

    # S -> S S (concatenating the two spans)
    lhs1 = LCFRS_lhs("S")
    lhs1.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
    rule_1 = grammar.add_rule(lhs1, ["S", "S"])

    # S -> a
    lhs2 = LCFRS_lhs("S")
    lhs2.add_arg(["a"])
    rule_2 = grammar.add_rule(lhs2, [])

    # A -> a: added for its side effect on the grammar only; its index is not
    # returned (the original bound it to an unused local).
    lhs3 = LCFRS_lhs("A")
    lhs3.add_arg(["a"])
    grammar.add_rule(lhs3, [])

    return grammar, rule_1.get_idx(), rule_2.get_idx()
def direct_extract_lcfrs_from_prebinarized_corpus(tree,
                                                  term_labeling=PosTerminals(),
                                                  nont_labeling=BasicNonterminalLabeling(),
                                                  isolate_pos=True):
    """Directly extract an LCFRS/DCP grammar from one prebinarized hybrid tree.

    :param tree: prebinarized HybridTree
    :param term_labeling: terminal labeling strategy
    :param nont_labeling: nonterminal labeling strategy
    :param isolate_pos: whether POS tags get isolated unary rules
    :rtype: LCFRS
    """
    gram = LCFRS(start=START)
    root = tree.root[0]

    if root in tree.full_yield():
        # Degenerate tree consisting of a single lexical root:
        # emit a terminal rule directly from START.
        root_token = tree.node_token(root)
        lhs = LCFRS_lhs(START)
        lhs.add_arg([term_labeling.token_label(root_token)])
        dcp_rule = DCP_rule(DCP_var(-1, 0),
                            [DCP_term(DCP_index(0, edge_label=root_token.edge()), [])])
        gram.add_rule(lhs, [], dcp=[dcp_rule])
    else:
        # Extract recursively, then chain START to the root's nonterminal.
        first, _, _ = direct_extract_lcfrs_prebinarized_recur(tree, root, gram,
                                                              term_labeling, nont_labeling,
                                                              isolate_pos)
        lhs = LCFRS_lhs(START)
        lhs.add_arg([LCFRS_var(0, 0)])
        dcp_rule = DCP_rule(DCP_var(-1, 0), [DCP_var(0, 0)])
        gram.add_rule(lhs, [first], dcp=[dcp_rule])

    return gram
def induce_grammar(trees, nont_labelling, term_labelling, recursive_partitioning, start_nont='START'):
    """
    :rtype: LCFRS
    :param trees: corpus of HybridTree (i.e. list (or Generator for lazy IO))
    :type trees: __generator[HybridTree]
    :type nont_labelling: AbstractLabeling
    :param term_labelling: HybridTree, NodeId -> str
    :param recursive_partitioning: HybridTree -> RecursivePartitioning
    :type start_nont: str
    :rtype: int, LCFRS
    Top level method to induce an LCFRS/DCP-hybrid grammar for dependency parsing.
    """
    grammar = LCFRS(start_nont)
    n_trees = 0
    for tree in trees:
        n_trees += 1
        for rec_par in recursive_partitioning:
            # partitioning strategies tagged "no_new_nont" need access to the
            # nonterminals induced so far
            if 'no_new_nont' in rec_par.__name__:
                rec_par_int = rec_par(tree, grammar.nonts(), nont_labelling)
            else:
                rec_par_int = rec_par(tree)

            rec_par_nodes = tree.node_id_rec_par(rec_par_int)
            _, _, nont_name = add_rules_to_grammar_rec(tree, rec_par_nodes, grammar,
                                                       nont_labelling, term_labelling)

            # Add rule from top start symbol to top most nonterminal for the hybrid tree
            lhs = LCFRS_lhs(start_nont)
            lhs.add_arg([LCFRS_var(0, 0)])
            dcp_rule = DCP_rule(DCP_var(-1, 0), [DCP_var(0, 0)])
            grammar.add_rule(lhs, [nont_name], 1.0, [dcp_rule])

    grammar.make_proper()
    return n_trees, grammar
def __test_projection(self, split_weights, goal_weights, merge_method=False):
    """Project a latent annotation back onto the base grammar and check that
    the resulting rule weights match *goal_weights*.

    :param split_weights: per-rule weights of the split annotation
    :param goal_weights: expected projected weights for rules 0..2
    :param merge_method: if True, project directly; otherwise merge all splits
        into one coarse annotation first and project that.
    """
    grammar = LCFRS("S")
    # (lhs nonterminal, lhs argument, rhs nonterminals, optional weight) — rules 0..2
    rule_table = [
        ("S", [LCFRS_var(0, 0), LCFRS_var(1, 0)], ["A", "A"], None),
        ("A", ["a"], [], None),
        ("A", ["b"], [], 2.0),
    ]
    for nont, argument, rhs, weight in rule_table:
        lhs = LCFRS_lhs(nont)
        lhs.add_arg(argument)
        if weight is None:
            grammar.add_rule(lhs, rhs)
        else:
            grammar.add_rule(lhs, rhs, weight=weight)
    grammar.make_proper()

    nonterminal_map = Enumerator()
    grammarInfo = PyGrammarInfo(grammar, nonterminal_map)
    storageManager = PyStorageManager()
    la = build_PyLatentAnnotation([1, 2], [1.0], split_weights, grammarInfo, storageManager)

    if merge_method:
        la.project_weights(grammar, grammarInfo)
    else:
        # merge every split of every nonterminal into a single source,
        # yielding a coarse annotation, then project its weights
        splits, _, _ = la.serialize()
        merge_sources = [[list(range(splits[nont_idx]))]
                         for nont_idx in range(nonterminal_map.get_counter())]
        coarse_la = la.project_annotation_by_merging(grammarInfo, merge_sources, debug=False)
        coarse_la.project_weights(grammar, grammarInfo)

    for idx in range(3):
        self.assertAlmostEqual(grammar.rule_index(idx).weight(), goal_weights[idx])
def build_paper_grammar():
    """Build the small example LCFRS with rules B -> a, S -> B, B -> B B.

    :rtype: LCFRS
    """
    grammar = LCFRS("S")
    # (lhs nonterminal, lhs argument, rhs nonterminals) — rules 0..2
    rule_table = [
        ("B", ["a"], []),
        ("S", [LCFRS_var(0, 0)], ["B"]),
        ("B", [LCFRS_var(0, 0), LCFRS_var(1, 0)], ["B", "B"]),
    ]
    for nont, argument, rhs in rule_table:
        lhs = LCFRS_lhs(nont)
        lhs.add_arg(argument)
        grammar.add_rule(lhs, rhs)
    grammar.make_proper()
    return grammar
def build_nm_grammar():
    """Build an LCFRS whose S nonterminal interleaves the two components of
    the fan-out-2 nonterminals N (over a/c) and M (over b/d).

    :rtype: LCFRS
    """
    grammar = LCFRS("START")

    # rule 0: START -> S
    start_lhs = LCFRS_lhs("START")
    start_lhs.add_arg([LCFRS_var(0, 0)])
    grammar.add_rule(start_lhs, ["S"])

    # rule 1: S interleaves both components of N and M
    s_lhs = LCFRS_lhs("S")
    s_lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0), LCFRS_var(0, 1), LCFRS_var(1, 1)])
    grammar.add_rule(s_lhs, ["N", "M"])

    # rule 2 (x4): terminal rules A -> a, B -> b, C -> c, D -> d
    for nont, term in [("A", "a"), ("B", "b"), ("C", "c"), ("D", "d")]:
        term_lhs = LCFRS_lhs(nont)
        term_lhs.add_arg([term])
        grammar.add_rule(term_lhs, [])

    # recursive fan-out-2 rules for N (via N') and M (via M')
    for nont, nont_, c1, c2 in [("N", "N'", "A", "C"), ("M", "M'", "B", "D")]:
        # rule 3: base case, one c1 in the first component, one c2 in the second
        base_lhs = LCFRS_lhs(nont)
        base_lhs.add_arg([LCFRS_var(0, 0)])
        base_lhs.add_arg([LCFRS_var(1, 0)])
        grammar.add_rule(base_lhs, [c1, c2])

        # rule 4: extend the first component of nont_ with another c1
        step_lhs = LCFRS_lhs(nont)
        step_lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
        step_lhs.add_arg([LCFRS_var(0, 1)])
        grammar.add_rule(step_lhs, [nont_, c1])

        # rule 5: extend the second component of nont with another c2
        aux_lhs = LCFRS_lhs(nont_)
        aux_lhs.add_arg([LCFRS_var(0, 0)])
        aux_lhs.add_arg([LCFRS_var(0, 1), LCFRS_var(1, 0)])
        grammar.add_rule(aux_lhs, [nont, c2])

    grammar.make_proper()
    return grammar
def build_grammar():
    """Build a small ambiguous LCFRS over {a, b}, including duplicate S -> S S
    and S -> b rules that differ only in their DCP annotation or weight.

    :rtype: LCFRS
    """
    grammar = LCFRS("START")
    # (lhs nonterminal, lhs argument, rhs nonterminals, extra keyword args)
    # rules 0, 1, 1.5, 2, 3, 4, 5
    rule_table = [
        ("START", [LCFRS_var(0, 0)], ["S"], {}),
        ("S", [LCFRS_var(0, 0), LCFRS_var(1, 0)], ["S", "S"], {}),
        ("S", [LCFRS_var(0, 0), LCFRS_var(1, 0)], ["S", "S"], {"dcp": ["1.5"]}),
        ("S", ["a"], [], {}),
        ("S", ["b"], [], {"weight": 2.0}),
        ("S", ["b"], [], {"dcp": ["4"]}),
        ("A", ["a"], [], {}),
    ]
    for nont, argument, rhs, extra in rule_table:
        lhs = LCFRS_lhs(nont)
        lhs.add_arg(argument)
        grammar.add_rule(lhs, rhs, **extra)
    grammar.make_proper()
    return grammar