def test_la_viterbi_parsing_3(self):
    grammar = LCFRS("S")

    # rule 0
    lhs = LCFRS_lhs("B")
    lhs.add_arg(["a"])
    grammar.add_rule(lhs, [], 0.25)

    # rule 1
    lhs = LCFRS_lhs("A")
    lhs.add_arg(["a"])
    grammar.add_rule(lhs, [], 0.5)

    # rule 2
    lhs = LCFRS_lhs("S")
    lhs.add_arg([LCFRS_var(0, 0)])
    grammar.add_rule(lhs, ["B"], 1.0)

    # rule 3
    lhs = LCFRS_lhs("A")
    lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
    grammar.add_rule(lhs, ["A", "B"], 0.5)

    # rule 4
    lhs = LCFRS_lhs("B")
    lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
    grammar.add_rule(lhs, ["A", "B"], 0.75)

    grammar.make_proper()

    inp = ["a"] * 3
    nontMap = Enumerator()
    gi = PyGrammarInfo(grammar, nontMap)
    sm = PyStorageManager()
    print(nontMap.object_index("S"))
    print(nontMap.object_index("B"))

    la = build_PyLatentAnnotation_initial(grammar, gi, sm)
    parser = DiscodopKbestParser(grammar, la=la, nontMap=nontMap, grammarInfo=gi,
                                 latent_viterbi_mode=True)
    parser.set_input(inp)
    parser.parse()
    self.assertTrue(parser.recognized())
    der = parser.latent_viterbi_derivation(True)
    print(der)

    der2 = None
    for w, der_ in parser.k_best_derivation_trees():
        if der2 is None:
            der2 = der_
        print(w, der_)
    print(der2)
def test_la_viterbi_parsing_2(self):
    grammar = self.build_paper_grammar()
    inp = ["a"] * 3
    nontMap = Enumerator()
    gi = PyGrammarInfo(grammar, nontMap)
    sm = PyStorageManager()
    print(nontMap.object_index("S"))
    print(nontMap.object_index("B"))

    la = build_PyLatentAnnotation(
        [2, 1],
        [1.0],
        [[0.25, 1.0], [1.0, 0.0], [0.0, 0.5, 0.25, 0.0, 0.0, 0.0, 0.0, 0.0]],
        gi, sm)
    self.assertTrue(la.is_proper())

    parser = DiscodopKbestParser(grammar, la=la, nontMap=nontMap, grammarInfo=gi,
                                 latent_viterbi_mode=True)
    parser.set_input(inp)
    parser.parse()
    self.assertTrue(parser.recognized())
    der = parser.latent_viterbi_derivation(True)
    print(der)

    ranges = {der.spanned_ranges(idx)[0] for idx in der.ids()}
    self.assertSetEqual({(0, 3), (0, 2), (0, 1), (1, 2), (2, 3)}, ranges)
def initialize_parser(self):
    if "disco-dop" in self.parsing_mode:
        self.parser = DiscodopKbestParser(grammar=self.base_grammar,
                                          k=self.k_best,
                                          beam_beta=self.disco_dop_params["beam_beta"],
                                          beam_delta=self.disco_dop_params["beam_delta"],
                                          pruning_k=self.disco_dop_params["pruning_k"],
                                          cfg_ctf=self.disco_dop_params["cfg_ctf"])
    else:
        self.parser = GFParser_k_best(grammar=self.base_grammar,
                                      k=self.k_best,
                                      save_preprocessing=(self.directory, "gfgrammar"))
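# A minimal sketch of how the disco_dop_params dictionary consumed above could be
# populated before initialize_parser() runs. Only the four keys looked up there are
# used; the concrete values and the name example_disco_dop_params are illustrative
# assumptions, not defaults taken from this code base or from disco-dop.
example_disco_dop_params = {
    "beam_beta": 0.0,      # assumed placeholder for the beam threshold
    "beam_delta": 50,      # assumed placeholder for the beam width window
    "pruning_k": 10000,    # assumed placeholder for pruning during k-best extraction
    "cfg_ctf": True,       # assumed toggle for the CFG coarse-to-fine pass
}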
def test_discodop_kbest_parser(self):
    grammar = self.build_grammar()
    parser = DiscodopKbestParser(grammar)
    inp = ["a"] * 5
    parser.set_input(inp)
    parser.parse()
    self.assertTrue(parser.recognized())
    counter = 0
    for weight, der in parser.k_best_derivation_trees():
        # print(weight, der)
        self.assertTrue(der.check_integrity_recursive(der.root_id(), grammar.start()))
        counter += 1
    self.assertEqual(50, counter)
def initialize_parser(self):
    save_preprocess = (self.directory, "mygrammar")
    k = 1 if not self.organizer.disable_split_merge or self.oracle_parsing else self.k_best
    if "disco-dop" in self.parsing_mode:
        self.parser = DiscodopKbestParser(grammar=self.base_grammar,
                                          k=self.k_best,
                                          cfg_ctf=self.disco_dop_params["cfg_ctf"],
                                          pruning_k=self.disco_dop_params["pruning_k"],
                                          beam_beta=self.disco_dop_params["beam_beta"],
                                          beam_delta=self.disco_dop_params["beam_delta"])
    else:
        self.parser = GFParser_k_best(self.base_grammar,
                                      save_preprocessing=save_preprocess,
                                      k=k)
def test_la_viterbi_parsing(self):
    grammar = self.build_grammar()
    inp = ["a"] * 3
    nontMap = Enumerator()
    gi = PyGrammarInfo(grammar, nontMap)
    sm = PyStorageManager()
    la = build_PyLatentAnnotation_initial(grammar, gi, sm)

    parser = DiscodopKbestParser(grammar, la=la, nontMap=nontMap, grammarInfo=gi,
                                 latent_viterbi_mode=True)
    parser.set_input(inp)
    parser.parse()
    self.assertTrue(parser.recognized())
    der = parser.best_derivation_tree()
    print(der)
    for node in der.ids():
        print(node, der.getRule(node), der.spanned_ranges(node))
def test_copy_grammar(self):
    grammar = self.build_nm_grammar()
    for cfg_approx in [True, False]:
        parser = DiscodopKbestParser(grammar, cfg_ctf=cfg_approx)
        n = 2
        m = 3
        inp = ["a"] * n + ["b"] * m + ["c"] * n + ["d"] * m
        parser.set_input(inp)
        parser.parse()
        self.assertTrue(parser.recognized())
        counter = 0
        for weight, der in parser.k_best_derivation_trees():
            print(weight, der)
            self.assertTrue(der.check_integrity_recursive(der.root_id(), grammar.start()))
            self.assertEqual(inp, der.compute_yield())
            counter += 1
        self.assertEqual(1, counter)
def test_negra_dag_small_grammar(self):
    DAG_CORPUS = 'res/tiger/tiger_full_with_sec_edges.export'
    DAG_CORPUS_BIN = 'res/tiger/tiger_full_with_sec_edges_bin_h1_v1.export'
    names = list([str(i) for i in range(1, 101)])

    if not os.path.exists(DAG_CORPUS):
        print('run the following command to create an export corpus with dags:')
        print('\tPYTHONPATH=. util/tiger_dags_to_negra.py ' +
              'res/tiger/tiger_release_aug07.corrected.16012013.xml ' +
              DAG_CORPUS + ' 1 50474')
    self.assertTrue(os.path.exists(DAG_CORPUS))

    if not os.path.exists(DAG_CORPUS_BIN):
        print('run the following command to binarize the export corpus with dags:')
        print("discodop treetransforms --binarize -v 1 -h 1 " + DAG_CORPUS + " " + DAG_CORPUS_BIN)
        # _, DAG_CORPUS_BIN = tempfile.mkstemp(prefix='corpus_bin_', suffix='.export')
        # subprocess.call(["discodop", "treetransforms", "--binarize", "-v", "1", "-h", "1",
        #                  DAG_CORPUS, DAG_CORPUS_BIN])
    self.assertTrue(os.path.exists(DAG_CORPUS_BIN))

    corpus = np.sentence_names_to_hybridtrees(names, DAG_CORPUS, secedge=True)
    corpus_bin = np.sentence_names_to_hybridtrees(names, DAG_CORPUS_BIN, secedge=True)

    grammar = LCFRS(start="START")
    for hybrid_dag, hybrid_dag_bin in zip(corpus, corpus_bin):
        self.assertEqual(len(hybrid_dag.token_yield()), len(hybrid_dag_bin.token_yield()))
        dag_grammar = direct_extract_lcfrs_from_prebinarized_corpus(hybrid_dag_bin)
        grammar.add_gram(dag_grammar)
    grammar.make_proper()
    print("Extracted LCFRS/DCP-hybrid grammar with %i nonterminals and %i rules"
          % (len(grammar.nonts()), len(grammar.rules())))

    parser = DiscodopKbestParser(grammar, k=1)

    _, RESULT_FILE = tempfile.mkstemp(prefix='parser_results_', suffix='.export')
    with open(RESULT_FILE, 'w') as results:
        for hybrid_dag in corpus:
            poss = list(map(lambda x: x.pos(), hybrid_dag.token_yield()))
            parser.set_input(poss)
            parser.parse()
            self.assertTrue(parser.recognized())
            der = parser.best_derivation_tree()

            dcp_term = DCP_evaluator(der).getEvaluation()
            dag_eval = HybridDag(hybrid_dag.sent_label())
            dcp_to_hybriddag(dag_eval, dcp_term,
                             copy.deepcopy(hybrid_dag.token_yield()), False,
                             construct_token=construct_constituent_token)

            lines = np.serialize_hybridtrees_to_negra([dag_eval], 1, 500,
                                                      use_sentence_names=True)
            for line in lines:
                print(line, end='', file=results)
            parser.clear()
    print("Wrote results to %s" % RESULT_FILE)
def test_something(self):
    normal_corpus = 'res/tiger/tiger_8000.export'
    binarized_corpus = 'res/tiger/tiger_8000_bin.export'
    limit = 55000
    # limit = 30
    corpus_bin = sentence_names_to_hybridtrees({str(x) for x in range(limit)},
                                               binarized_corpus,
                                               disconnect_punctuation=False,
                                               add_vroot=True,
                                               mode="DISCO-DOP")
    corpus = sentence_names_to_hybridtrees({str(x) for x in range(limit)},
                                           normal_corpus,
                                           disconnect_punctuation=False,
                                           add_vroot=True,
                                           mode="DISCO-DOP")
    term_labeling = terminal_labeling(corpus, threshold=4)

    grammar = None
    for htree, htree_bin in zip(corpus, corpus_bin):
        # print(htree_bin)
        try:
            htree_grammar = direct_extract_lcfrs_from_prebinarized_corpus(
                htree_bin, term_labeling=term_labeling)
        except Exception as e:
            print(e)
            print(htree_bin)
            print(htree_bin.nodes())
            print(htree_bin.word_yield())
            raise e
        # print(htree_grammar)

        parser_input = term_labeling.prepare_parser_input(htree.token_yield())
        p = LCFRS_sDCP_Parser(htree_grammar, terminal_labelling=term_labeling)
        p.set_input(htree)
        p.parse()
        # p = LCFRS_parser(htree_grammar, parser_input)
        self.assertTrue(p.recognized())

        derivs = list(p.all_derivation_trees())
        # print("derivations:", len(derivs))
        for der in derivs:
            dcp = DCP_evaluator(der).getEvaluation()
            sys_tree = HybridTree(htree.sent_label())
            sys_tree = dcp_to_hybridtree(sys_tree, dcp,
                                         deepcopy(htree.token_yield()),
                                         ignore_punctuation=False,
                                         construct_token=construct_constituent_token)
            # print(sys_tree)
            # print(htree == sys_tree)
            # print(der)
            if htree != sys_tree:
                print(htree.sent_label())
                print(htree)
                print(sys_tree)
            self.assertEqual(htree, sys_tree)

        if grammar is None:
            grammar = htree_grammar
        else:
            grammar.add_gram(htree_grammar)

        htree_grammar.make_proper()
        try:
            disco_parser = DiscodopKbestParser(htree_grammar)
        except ValueError as ve:
            print(ve)
            print(htree.sent_label())
            print(htree)
            print(htree_bin)
            print(htree_grammar)
            raise ve

    grammar.make_proper()
    disco_parser = DiscodopKbestParser(grammar)
def prepare_sm_parser(self):
    last_la = self.organizer.latent_annotations[self.organizer.last_sm_cycle]
    if self.parsing_mode == "discodop-multi-method":
        if self.organizer.project_weights_before_parsing:
            self.project_weights()
        self.parser = DiscodopKbestParser(
            self.base_grammar,
            k=self.k_best,
            la=last_la,
            nontMap=self.organizer.nonterminal_map,
            variational=False,
            sum_op=False,
            cfg_ctf=self.disco_dop_params["cfg_ctf"],
            beam_beta=self.disco_dop_params["beam_beta"],
            beam_delta=self.disco_dop_params["beam_delta"],
            pruning_k=self.disco_dop_params["pruning_k"],
            grammarInfo=self.organizer.grammarInfo,
            projection_mode=False,
            latent_viterbi_mode=True,
            secondaries=["VARIATIONAL", "MAX-RULE-PRODUCT", "LATENT-RERANK"])
        self.parser.k_best_reranker = Coarse_to_fine_parser(
            self.base_grammar,
            last_la,
            self.organizer.grammarInfo,
            self.organizer.nonterminal_map,
            base_parser=self.parser)
    elif self.parsing_mode == "best-latent-derivation":
        grammar = build_sm_grammar(last_la,
                                   self.base_grammar,
                                   self.organizer.grammarInfo,
                                   rule_pruning=0.0001,
                                   rule_smoothing=0.1)
        self.parser = GFParser_k_best(grammar=grammar, k=1,
                                      save_preprocessing=(self.directory, "gfgrammar"))
    elif self.parsing_mode in {method + engine
                               for method in {"k-best-rerank", "latent-viterbi"}
                               for engine in {"-GF", "-disco-dop", ""}}:
        if self.organizer.project_weights_before_parsing:
            self.project_weights()
        if "disco-dop" in self.parsing_mode:
            engine = DiscodopKbestParser(
                grammar=self.base_grammar,
                k=self.k_best,
                la=last_la,
                nontMap=self.organizer.nonterminal_map,
                grammarInfo=self.organizer.grammarInfo,
                cfg_ctf=self.disco_dop_params["cfg_ctf"],
                beam_beta=self.disco_dop_params["beam_beta"],
                beam_delta=self.disco_dop_params["beam_delta"],
                pruning_k=self.disco_dop_params["pruning_k"],
                latent_viterbi_mode="latent-viterbi" in self.parsing_mode)
        else:
            engine = GFParser_k_best(grammar=self.base_grammar,
                                     k=self.k_best,
                                     heuristics=self.heuristics,
                                     save_preprocessing=(self.directory, "gfgrammar"))
        if "latent-viterbi" in self.parsing_mode:
            self.parser = engine
        else:
            self.parser = Coarse_to_fine_parser(self.base_grammar,
                                                last_la,
                                                self.organizer.grammarInfo,
                                                self.organizer.nonterminal_map,
                                                base_parser=engine)
    elif self.parsing_mode in {method + engine
                               for method in {"max-rule-prod", "max-rule-sum", "variational"}
                               for engine in {"-GF", "-disco-dop", ""}}:
        if self.organizer.project_weights_before_parsing:
            self.project_weights()
        if "GF" in self.parsing_mode:
            self.parser = Coarse_to_fine_parser(
                self.base_grammar,
                last_la,
                self.organizer.grammarInfo,
                nontMap=self.organizer.nonterminal_map,
                base_parser_type=GFParser_k_best,
                k=self.k_best,
                heuristics=self.heuristics,
                save_preprocessing=(self.directory, "gfgrammar"),
                mode=self.parsing_mode,
                variational="variational" in self.parsing_mode,
                sum_op="sum" in self.parsing_mode)
        else:
            self.parser = DiscodopKbestParser(
                self.base_grammar,
                k=self.k_best,
                la=last_la,
                nontMap=self.organizer.nonterminal_map,
                variational="variational" in self.parsing_mode,
                sum_op="sum" in self.parsing_mode,
                cfg_ctf=self.disco_dop_params["cfg_ctf"],
                beam_beta=self.disco_dop_params["beam_beta"],
                beam_delta=self.disco_dop_params["beam_delta"],
                pruning_k=self.disco_dop_params["pruning_k"],
                grammarInfo=self.organizer.grammarInfo,
                projection_mode=True)
    else:
        raise ValueError("Unknown parsing mode %s" % self.parsing_mode)
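# For orientation, a summary of the parsing_mode strings accepted by prepare_sm_parser(),
# reconstructed only from the branch conditions above (the optional "-GF" / "-disco-dop"
# suffix selects the parsing engine; no suffix falls back to the default branch):
#   "discodop-multi-method"
#   "best-latent-derivation"
#   "k-best-rerank", "latent-viterbi"                   (each optionally with "-GF" or "-disco-dop")
#   "max-rule-prod", "max-rule-sum", "variational"      (each optionally with "-GF" or "-disco-dop")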