def test_basic_split_merge(self):
    """Induce a small POS grammar from two hybrid trees, then run
    split/merge training and dump every refined grammar to stderr."""
    corpus = [hybrid_tree_1(), hybrid_tree_2()]
    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')
    nont_labeling = the_labeling_factory().create_simple_labeling_strategy(
        'empty', 'pos')
    _, grammar = induce_grammar(corpus, nont_labeling,
                                terminal_labeling.token_label, [cfg], 'START')
    for rule in grammar.rules():
        print(rule, file=stderr)

    print("call S/M Training", file=stderr)
    new_grammars = split_merge_training(grammar, terminal_labeling, corpus,
                                        3, 5, merge_threshold=0.5, debug=False)
    # One grammar per split/merge cycle; list its rules with their indices.
    for refined in new_grammars:
        for idx, rule in enumerate(refined.rules()):
            print(idx, rule, file=stderr)
        print(file=stderr)
    print("finished S/M Training", file=stderr)
def test_trace_serialization(self):
    """Serialize reducts to disk, reload them into a fresh trace manager,
    re-serialize, and check both files are byte-identical."""
    tree = hybrid_tree_1()
    tree2 = hybrid_tree_2()
    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')
    _, grammar = induce_grammar(
        [tree, tree2],
        the_labeling_factory().create_simple_labeling_strategy('empty', 'pos'),
        terminal_labeling.token_label, [cfg], 'START')
    for rule in grammar.rules():
        print(rule, file=stderr)

    trace = compute_reducts(grammar, [tree, tree2], terminal_labeling)
    trace.serialize(b"/tmp/reducts.p")

    grammar_load = grammar
    trace2 = PySDCPTraceManager(grammar_load, terminal_labeling)
    trace2.load_traces_from_file(b"/tmp/reducts.p")
    trace2.serialize(b"/tmp/reducts2.p")

    # Compare in binary mode: serialized traces are not guaranteed to be
    # valid text, so text mode ("r") could raise UnicodeDecodeError.
    # Comparing full contents (rather than zip-ping lines) also catches the
    # case where one file is a strict prefix of the other.
    with open(b"/tmp/reducts.p", "rb") as f1, \
            open(b"/tmp/reducts2.p", "rb") as f2:
        self.assertEqual(f1.read(), f2.read())
def test_basic_em_training(self):
    """Induce a grammar, compute its reducts, and run 10 epochs of EM
    training, printing the rules before and after."""
    tree = hybrid_tree_1()
    tree2 = hybrid_tree_2()
    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')
    _, grammar = induce_grammar(
        [tree, tree2],
        the_labeling_factory().create_simple_labeling_strategy('empty', 'pos'),
        terminal_labeling.token_label, [cfg], 'START')
    for rule in grammar.rules():
        print(rule, file=stderr)

    print("compute reducts", file=stderr)
    trace = compute_reducts(grammar, [tree, tree2], terminal_labeling)

    print("call em Training", file=stderr)
    em_trainer = PyEMTrainer(trace)
    em_trainer.em_training(grammar, n_epochs=10)
    print("finished em Training", file=stderr)

    # Rules again, now with EM-trained weights.
    for rule in grammar.rules():
        print(rule, file=stderr)
def test_grammar_export(self):
    """Induce a grammar, export it to GF, compile it, parse a POS string,
    and rebuild a hybrid tree from the DCP evaluation of the derivation."""
    tree = hybrid_tree_1()
    tree2 = hybrid_tree_2()
    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')
    nont_labeling = the_labeling_factory().create_simple_labeling_strategy(
        'empty', 'pos')
    _, grammar = induce_grammar([tree, tree2], nont_labeling,
                                terminal_labeling.token_label,
                                [direct_extraction], 'START')
    print(max(grammar.fanout(nont) for nont in grammar.nonts()))
    print(grammar)

    # Export to GF and make sure the generated grammar compiles cleanly.
    prefix = '/tmp/'
    name = 'tmpGrammar'
    name_ = export(grammar, prefix, name)
    self.assertEqual(0, compile_gf_grammar(prefix, name_))

    GFParser.preprocess_grammar(grammar)
    string = ["NP", "N", "V", "V", "V"]
    parser = GFParser(grammar, string)
    self.assertTrue(parser.recognized())

    der = parser.best_derivation_tree()
    self.assertTrue(
        der.check_integrity_recursive(der.root_id(), grammar.start()))
    print(der)
    print(derivation_to_hybrid_tree(der, string,
                                    "Piet Marie helpen lezen leren".split(),
                                    construct_conll_token))

    # Evaluate the sDCP component and turn the result back into a tree.
    dcp = DCP_evaluator(der).getEvaluation()
    h_tree_2 = HybridTree()
    token_sequence = [
        construct_conll_token(form, lemma)
        for form, lemma in zip('Piet Marie helpen lezen leren'.split(' '),
                               'NP N V V V'.split(' '))
    ]
    dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                      construct_conll_token)
    print(h_tree_2)
def test_dcp_evaluation_with_induced_dependency_grammar(self):
    """Induce a dependency grammar, parse a POS sequence, and evaluate the
    DCP of every successful derivation into hybrid trees."""
    tree = hybrid_tree_1()
    print(tree)
    tree2 = hybrid_tree_2()
    print(tree2)

    labeling = the_labeling_factory().create_simple_labeling_strategy(
        'child', 'pos')
    term_pos = the_terminal_labeling_factory().get_strategy('pos').token_label
    _, grammar = induce_grammar([tree, tree2], labeling, term_pos,
                                [direct_extraction], 'START')

    # well_formed() reports None when no defect is found.
    self.assertEqual(grammar.well_formed(), None)
    self.assertEqual(grammar.ordered()[0], True)
    print(grammar)

    parser = Parser(grammar, 'NP N V V'.split(' '))
    self.assertEqual(parser.recognized(), True)

    for item in parser.successful_root_items():
        der = Derivation()
        derivation_tree(der, item, None)
        print(der)

        hybrid_tree = derivation_to_hybrid_tree(
            der, 'NP N V V'.split(' '),
            'Piet Marie helpen lezen'.split(' '),
            construct_constituent_token)
        print(hybrid_tree)

        dcp = DCP_evaluator(der).getEvaluation()
        h_tree_2 = HybridTree()
        token_sequence = [
            construct_conll_token(form, lemma)
            for form, lemma in zip('Piet Marie helpen lezen'.split(' '),
                                   'NP N V V'.split(' '))
        ]
        dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                          construct_conll_token)
def test_basic_sdcp_parsing_dependency(self):
    """Parse a dependency tree with the LCFRS/sDCP parser and reconstruct
    the input tree from each derivation's DCP evaluation."""
    tree1 = hybrid_tree_1()
    tree2 = hybrid_tree_2()
    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')
    _, grammar = induce_grammar(
        [tree1, tree2],
        the_labeling_factory().create_simple_labeling_strategy('empty', 'pos'),
        terminal_labeling.token_label, [cfg], 'START')

    print("grammar induced. Printing rules...", file=stderr)
    for rule in grammar.rules():
        print(rule, file=stderr)

    parser_type = LCFRS_sDCP_Parser
    print("preprocessing grammar", file=stderr)
    parser_type.preprocess_grammar(grammar, terminal_labeling)
    print("invoking parser", file=stderr)
    parser = parser_type(grammar, tree1)

    print("listing derivations", file=stderr)
    for der in parser.all_derivation_trees():
        print(der)
        # Rebuild a hybrid tree from the derivation's DCP evaluation and
        # show it next to the original input tree.
        output_tree = HybridTree()
        tokens = tree1.token_yield()
        dcp_to_hybridtree(output_tree, DCP_evaluator(der).getEvaluation(),
                          tokens, False, construct_conll_token)
        print(tree1)
        print(output_tree)

    print("completed test", file=stderr)