def test_basic_sdcp_parsing_constituency(self):
    """Induce a grammar from two constituent trees and parse two inputs with it.

    The grammar is the union of the grammars extracted from ``tree1`` and
    ``tree2`` with a fanout-limited partitioning. Every derivation found for
    ``tree1`` and for the POS-stripped variant ``tree3`` is evaluated and the
    reconstructed tree is printed next to the input tree. Apart from running
    without errors, nothing is asserted.
    """
    tree1 = constituent_tree_1()
    tree2 = constituent_tree_2()
    tree3 = constituent_tree_1_pos_stripped()
    terminal_labeling = FormTerminals()
    fanout = 1

    grammar = LCFRS('START')
    for training_tree in (tree1, tree2):
        partitioning = fanout_limited_partitioning(
            training_tree.unlabelled_structure(), fanout)
        grammar.add_gram(
            fringe_extract_lcfrs(training_tree, partitioning, naming='child',
                                 term_labeling=terminal_labeling))
    grammar.make_proper()

    print("grammar induced. Printing rules...", file=stderr)
    for rule in grammar.rules():
        print(rule, file=stderr)

    parser_type = LCFRS_sDCP_Parser
    print("preprocessing grammar", file=stderr)
    parser_type.preprocess_grammar(grammar, terminal_labeling, debug=True)

    def print_reconstructions(parser, gold_tree):
        # Evaluate each derivation and print the rebuilt tree beside the input.
        for der in parser.all_derivation_trees():
            print(der)
            output_tree = ConstituentTree(gold_tree.sent_label())
            tokens = [
                construct_constituent_token(token.form(), '--', True)
                for token in gold_tree.token_yield()
            ]
            dcp_to_hybridtree(output_tree,
                              DCP_evaluator(der).getEvaluation(), tokens,
                              False, construct_constituent_token)
            print(gold_tree)
            print(output_tree)

    print("invoking parser", file=stderr)
    parser = parser_type(grammar, tree1)
    print("listing derivations", file=stderr)
    print_reconstructions(parser, tree1)

    parser = parser_type(grammar, tree3)
    print(parser.recognized())
    print_reconstructions(parser, tree3)

    print("completed test", file=stderr)
def test_minimum_risk_parsing(self):
    """Compare the minimum-risk tree with the first k-best tree per sentence.

    A grammar is induced from the first ``limit_train`` trees of the training
    file. Each of the first ``limit_test`` trees is parsed with a k-best
    parser (k=50), every derivation is turned into a hybrid tree, and the
    minimum-risk tree over those trees is computed. Both trees are printed
    whenever the minimum-risk tree is not equal to the first k-best tree.
    """
    limit_train = 20
    limit_test = 10
    train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
    test = train
    parser_type = GFParser_k_best
    # test = '../../res/dependency_conll/german/tiger/test/german_tiger_test.conll'

    trees = parse_conll_corpus(train, False, limit_train)
    primary_labelling = the_labeling_factory(
    ).create_simple_labeling_strategy("childtop", "deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    start = 'START'
    recursive_partitioning = [cfg]

    _, grammar_prim = induce_grammar(trees, primary_labelling,
                                     term_labelling.token_label,
                                     recursive_partitioning, start)
    parser_type.preprocess_grammar(grammar_prim)
    tree_yield = term_labelling.prepare_parser_input

    trees = parse_conll_corpus(test, False, limit_test)
    for i, tree in enumerate(trees):
        print("Parsing sentence ", i, file=stderr)

        parser = parser_type(grammar_prim,
                             tree_yield(tree.token_yield()),
                             k=50)
        self.assertTrue(parser.recognized())

        derivations = list(parser.k_best_derivation_trees())
        print("# derivations: ", len(derivations), file=stderr)

        h_trees = []
        weights = []
        derivation_list = []
        for weight, der in derivations:
            # The k-best list must not contain the same derivation twice.
            self.assertNotIn(der, derivation_list)
            derivation_list.append(der)

            dcp = DCP_evaluator(der).getEvaluation()
            h_tree = HybridTree()
            cleaned_tokens = copy.deepcopy(tree.full_token_yield())
            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                              construct_conll_token)
            h_trees.append(h_tree)
            weights.append(weight)

        min_risk_tree = compute_minimum_risk_tree(h_trees, weights)
        # __eq__ is called explicitly, as before, so the comparison
        # semantics of the tree class stay exactly the same.
        if not min_risk_tree.__eq__(h_trees[0]):
            print(h_trees[0])
            print(min_risk_tree)
def test_grammar_export(self):
    """Export an induced grammar, compile it with GF and parse a tag sequence.

    The grammar is induced from two hybrid trees via direct extraction,
    written to ``/tmp/`` and compiled; compilation must return 0. The
    sequence ``NP N V V V`` must be recognized and the best derivation must
    pass the integrity check. The derivation is then converted to hybrid
    trees in two ways, both of which are printed.
    """
    tree = hybrid_tree_1()
    tree2 = hybrid_tree_2()
    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')
    nont_labeling = the_labeling_factory().create_simple_labeling_strategy(
        'empty', 'pos')
    # alternative: create_simple_labeling_strategy('child', 'pos+deprel')
    induction_result = induce_grammar([tree, tree2], nont_labeling,
                                      terminal_labeling.token_label,
                                      [direct_extraction], 'START')
    _, grammar = induction_result

    print(max(grammar.fanout(nont) for nont in grammar.nonts()))
    print(grammar)

    prefix = '/tmp/'
    name = 'tmpGrammar'
    name_ = export(grammar, prefix, name)
    self.assertEqual(0, compile_gf_grammar(prefix, name_))

    GFParser.preprocess_grammar(grammar)
    string = ["NP", "N", "V", "V", "V"]
    parser = GFParser(grammar, string)
    self.assertTrue(parser.recognized())

    der = parser.best_derivation_tree()
    integrity_ok = der.check_integrity_recursive(der.root_id(),
                                                 grammar.start())
    self.assertTrue(integrity_ok)
    print(der)

    direct_tree = derivation_to_hybrid_tree(
        der, string, "Piet Marie helpen lezen leren".split(),
        construct_conll_token)
    print(direct_tree)

    dcp = DCP_evaluator(der).getEvaluation()
    h_tree_2 = HybridTree()
    forms = 'Piet Marie helpen lezen leren'.split(' ')
    tags = 'NP N V V V'.split(' ')
    token_sequence = [
        construct_conll_token(form, lemma) for form, lemma in zip(forms, tags)
    ]
    dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                      construct_conll_token)
    print(h_tree_2)
def test_induction_and_parsing_with_pos_recovery(self):
    """Check that forms and POS tags survive induction, parsing and evaluation.

    A grammar with isolated POS rules is extracted from ``self.tree`` using a
    left-branching partitioning and form terminals. The token forms are
    parsed, the best derivation is evaluated into a fresh constituent tree,
    and that tree must have the same number of tokens, with the same forms
    and POS tags, as the input tree.
    """
    naming = 'child'

    tree = self.tree
    tree.add_to_root("VP1")
    print(tree)

    partitioning = left_branching_partitioning(len(tree.id_yield()))
    grammar = fringe_extract_lcfrs(tree,
                                   partitioning,
                                   naming=naming,
                                   isolate_pos=True,
                                   term_labeling=FormTerminals())
    print(grammar)

    parser = LCFRS_parser(grammar)
    forms = [token.form() for token in tree.token_yield()]
    parser.set_input(forms)
    parser.parse()
    self.assertTrue(parser.recognized())

    dcp_term = DCP_evaluator(parser.best_derivation_tree()).getEvaluation()
    print(str(dcp_term[0]))

    # The fresh tokens carry the placeholder POS '--'; the POS comparison
    # below therefore checks what the evaluation itself puts into the tree.
    fresh_tokens = [
        construct_constituent_token(token.form(), '--', True)
        for token in tree.token_yield()
    ]
    t = ConstituentTree()
    dcp_to_hybridtree(t,
                      dcp_term,
                      fresh_tokens,
                      ignore_punctuation=False,
                      construct_token=construct_constituent_token)
    print(t)

    self.assertEqual(len(tree.token_yield()), len(t.token_yield()))
    for expected, actual in zip(tree.token_yield(), t.token_yield()):
        self.assertEqual(expected.form(), actual.form())
        self.assertEqual(expected.pos(), actual.pos())
def dcp_best_derivation(self):
    """Return the DCP evaluation of the best derivation.

    Returns:
        The result of ``DCP_evaluator(der).getEvaluation()`` for the
        derivation returned by ``self.best_derivation_tree()``, or an empty
        list when there is no such derivation.

    Raises:
        Exception: if the best derivation fails its recursive integrity
            check against the left-hand-side nonterminal of its root rule.
    """
    der = self.best_derivation_tree()
    if der is None:
        return []

    # todo: comment out the next integrity check
    root_nont = der.getRule(der.root_id()).lhs().nont()
    if not der.check_integrity_recursive(der.root_id(), root_nont):
        print(der)
        raise Exception(
            "best derivation failed the recursive integrity check")
    return DCP_evaluator(der).getEvaluation()
def test_dcp_evaluation_with_induced_dependency_grammar(self):
    """Induce a grammar from two hybrid trees and evaluate every parse.

    The induced grammar must be well formed and ordered, and the tag
    sequence ``NP N V V`` must be recognized. For every successful root item
    the derivation is built, converted to a hybrid tree directly, and also
    evaluated through the DCP evaluator into a second hybrid tree.
    """
    tree = hybrid_tree_1()
    print(tree)
    tree2 = hybrid_tree_2()
    print(tree2)

    labeling = the_labeling_factory().create_simple_labeling_strategy(
        'child', 'pos')
    pos_strategy = the_terminal_labeling_factory().get_strategy('pos')
    term_pos = pos_strategy.token_label
    induced = induce_grammar([tree, tree2], labeling, term_pos,
                             [direct_extraction], 'START')
    grammar = induced[1]

    self.assertEqual(grammar.well_formed(), None)
    self.assertEqual(grammar.ordered()[0], True)
    print(grammar)

    parser = Parser(grammar, 'NP N V V'.split(' '))
    self.assertEqual(parser.recognized(), True)

    for root_item in parser.successful_root_items():
        der = Derivation()
        derivation_tree(der, root_item, None)
        print(der)

        print(
            derivation_to_hybrid_tree(der, 'NP N V V'.split(' '),
                                      'Piet Marie helpen lezen'.split(' '),
                                      construct_constituent_token))

        dcp = DCP_evaluator(der).getEvaluation()
        h_tree_2 = HybridTree()
        word_tag_pairs = zip('Piet Marie helpen lezen'.split(' '),
                             'NP N V V'.split(' '))
        token_sequence = [
            construct_conll_token(form, lemma)
            for form, lemma in word_tag_pairs
        ]
        dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                          construct_conll_token)
def parsing_postprocess(self, sentence, derivation, label=None):
    """Turn a derivation into a constituent tree for the given sentence.

    Args:
        sentence: a 4-tuple ``(full_yield, id_yield, full_token_yield,
            token_yield)``; the last component is not used here.
        derivation: the derivation handed to ``DCP_evaluator``.
        label: passed to the ``ConstituentTree`` constructor.

    Returns:
        The constituent tree built from the evaluated derivation, after
        ``strip_vroot()`` has been called on it.
    """
    full_yield, id_yield, full_token_yield, _unused_token_yield = sentence
    dcp_tree = ConstituentTree(label)

    # 1-based positions of the entries of full_yield that are not in id_yield
    punctuation_positions = []
    for position, idx in enumerate(full_yield, start=1):
        if idx not in id_yield:
            punctuation_positions.append(position)

    # Work on a deep copy so the caller's tokens are left untouched.
    cleaned_tokens = copy.deepcopy(full_token_yield)
    dcp = DCP_evaluator(derivation).getEvaluation()
    dcp_to_hybridtree(dcp_tree,
                      dcp,
                      cleaned_tokens,
                      False,
                      construct_constituent_token,
                      punct_positions=punctuation_positions)

    # NOTE(review): `True or ...` short-circuits, so the vroot is always
    # stripped and self.strip_vroot is never read -- confirm this is intended.
    if True or self.strip_vroot:
        dcp_tree.strip_vroot()
    return dcp_tree
def test_cfg_parser(self):
    """Parse the same tag sequence with ``LCFRS_parser`` and ``CFGParser``.

    The grammar is induced from two hybrid trees with the ``cfg``
    partitioning. Each parser class must recognize ``NP N V V V`` and yield
    a best derivation that passes the integrity check. The derivation is
    converted to hybrid trees in two ways, both of which are printed.
    """
    tree = hybrid_tree_1()
    tree2 = hybrid_tree_2()
    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')
    nont_labeling = the_labeling_factory().create_simple_labeling_strategy(
        'empty', 'pos')
    induced = induce_grammar([tree, tree2], nont_labeling,
                             terminal_labeling.token_label, [cfg], 'START')
    grammar = induced[1]

    for parser_class in (LCFRS_parser, CFGParser):
        parser_class.preprocess_grammar(grammar)

        string = ["NP", "N", "V", "V", "V"]
        parser = parser_class(grammar, string)
        self.assertTrue(parser.recognized())

        der = parser.best_derivation_tree()
        integrity_ok = der.check_integrity_recursive(der.root_id(),
                                                     grammar.start())
        self.assertTrue(integrity_ok)
        print(der)

        direct_tree = derivation_to_hybrid_tree(
            der, string, "Piet Marie helpen lezen leren".split(),
            construct_conll_token)
        print(direct_tree)

        dcp = DCP_evaluator(der).getEvaluation()
        h_tree_2 = HybridTree()
        forms = 'Piet Marie helpen lezen leren'.split(' ')
        tags = 'NP N V V V'.split(' ')
        token_sequence = [
            construct_conll_token(form, lemma)
            for form, lemma in zip(forms, tags)
        ]
        dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                          construct_conll_token)
        print(h_tree_2)
def test_basic_sdcp_parsing_dependency(self):
    """Induce a grammar from two hybrid trees and parse the first tree.

    Every derivation that ``LCFRS_sDCP_Parser`` finds for ``tree1`` is
    evaluated into a new hybrid tree, which is printed next to ``tree1``.
    Apart from running without errors, nothing is asserted.
    """
    tree1 = hybrid_tree_1()
    tree2 = hybrid_tree_2()
    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')
    nont_labeling = the_labeling_factory().create_simple_labeling_strategy(
        'empty', 'pos')
    induced = induce_grammar([tree1, tree2], nont_labeling,
                             terminal_labeling.token_label, [cfg], 'START')
    grammar = induced[1]

    print("grammar induced. Printing rules...", file=stderr)
    for rule in grammar.rules():
        print(rule, file=stderr)

    parser_type = LCFRS_sDCP_Parser
    print("preprocessing grammar", file=stderr)
    parser_type.preprocess_grammar(grammar, terminal_labeling)

    print("invoking parser", file=stderr)
    parser = parser_type(grammar, tree1)

    print("listing derivations", file=stderr)
    for der in parser.all_derivation_trees():
        print(der)
        output_tree = HybridTree()
        tokens = tree1.token_yield()
        evaluation = DCP_evaluator(der).getEvaluation()
        dcp_to_hybridtree(output_tree, evaluation, tokens, False,
                          construct_conll_token)
        print(tree1)
        print(output_tree)

    print("completed test", file=stderr)
def test_negra_to_dag_parsing(self):
    """Round-trip one sentence through export, binarization, parsing and reducts.

    Steps:
      1. Serialize sentence 26954 as a hybrid DAG to a temporary export
         file and binarize it with ``discodop treetransforms``.
      2. Read both files back and check yield lengths and ``top``/``bottom``.
      3. Extract a grammar from the binarized DAG, parse its own yield and
         rebuild a DAG from the best derivation.
      4. Write grammar and corpus to ``/tmp``, run the Schick parser's
         ``reduct`` command, read the resulting RTG and check the integrity
         of every enumerated derivation.
    """
    names = list(map(str, [26954]))

    # mkstemp returns an open descriptor; wrap it so it is closed again.
    primary_fd, primary_file = tempfile.mkstemp(suffix='.export')
    with os.fdopen(primary_fd, mode='w') as pf:
        for s in names:
            dsg = tp.sentence_names_to_deep_syntax_graphs(
                ["s" + s],
                "res/tiger/tiger_s%s.xml" % s,
                hold=False,
                ignore_puntcuation=False)[0]
            dsg.set_label(dsg.label[1:])
            lines = np.serialize_hybrid_dag_to_negra(
                [dsg], 0, 500, use_sentence_names=True)
            print(''.join(lines), file=pf)

    # Only the path is needed; discodop writes the file itself.
    binarized_fd, binarized_file = tempfile.mkstemp(suffix='.export')
    os.close(binarized_fd)
    subprocess.call([
        "discodop", "treetransforms", "--binarize", "-v", "1", "-h", "1",
        primary_file, binarized_file
    ])

    print(primary_file)
    print(binarized_file)

    corpus = np.sentence_names_to_hybridtrees(names,
                                              primary_file,
                                              secedge=True)
    corpus2 = np.sentence_names_to_hybridtrees(names,
                                               binarized_file,
                                               secedge=True)
    dag = corpus[0]
    print(dag)

    assert isinstance(dag, HybridDag)
    self.assertEqual(8, len(dag.token_yield()))
    for token in dag.token_yield():
        print(token.form() + '/' + token.pos(), end=' ')
    print()

    dag_bin = corpus2[0]
    print(dag_bin)

    for token in dag_bin.token_yield():
        print(token.form() + '/' + token.pos(), end=' ')
    print()
    self.assertEqual(8, len(dag_bin.token_yield()))

    for node, token in zip(
            dag_bin.nodes(),
            list(map(str, map(dag_bin.node_token, dag_bin.nodes())))):
        print(node, token)

    print()
    print(top(dag_bin, {'500', '101', '102'}))
    self.assertSetEqual({'101', '500'}, top(dag_bin, {'500', '101', '102'}))
    print(bottom(dag_bin, {'500', '101', '102'}))
    self.assertSetEqual({'502'}, bottom(dag_bin, {'500', '101', '102'}))

    nont_labeling = BasicNonterminalLabeling()
    term_labeling = FormTerminals()  # PosTerminals()

    grammar = direct_extract_lcfrs_from_prebinarized_corpus(
        dag_bin, term_labeling, nont_labeling)

    for rule in grammar.rules():
        print(rule.get_idx(), rule)

    print("Testing LCFRS parsing and DCP evaluation".center(80, '='))

    parser = LCFRS_parser(grammar)
    parser_input = term_labeling.prepare_parser_input(dag_bin.token_yield())
    print(parser_input)
    parser.set_input(parser_input)
    parser.parse()
    self.assertTrue(parser.recognized())
    der = parser.best_derivation_tree()
    print(der)

    dcp_term = DCP_evaluator(der).getEvaluation()
    print(dcp_term[0])

    dag_eval = HybridDag(dag_bin.sent_label())
    dcp_to_hybriddag(dag_eval,
                     dcp_term,
                     copy.deepcopy(dag_bin.token_yield()),
                     False,
                     construct_token=construct_constituent_token)

    print(dag_eval)
    for node in dag_eval.nodes():
        token = dag_eval.node_token(node)
        if token.type() == "CONSTITUENT-CATEGORY":
            label = token.category()
        elif token.type() == "CONSTITUENT-TERMINAL":
            label = token.form(), token.pos()
        else:
            # Other token types used to leave `label` unbound or stale.
            label = None

        print(node, label, dag_eval.children(node),
              dag_eval.sec_children(node), dag_eval.sec_parents(node))

    lines = np.serialize_hybridtrees_to_negra([dag_eval],
                                              1,
                                              500,
                                              use_sentence_names=True)
    for line in lines:
        print(line, end='')

    print()

    with open(primary_file) as pcf:
        for line in pcf:
            print(line, end='')

    print('Testing reduct computation with Schick parser'.center(80, '='))

    grammar_path = '/tmp/lcfrs_dcp_grammar.gr'
    derivation_manager = PyDerivationManager(grammar)

    with open(grammar_path, 'w') as grammar_file:
        nonterminal_enc, terminal_enc = linearize(
            grammar,
            nont_labeling,
            term_labeling,
            grammar_file,
            delimiter=' : ',
            nonterminal_encoder=derivation_manager.get_nonterminal_map())

    print(np.negra_to_json(dag, terminal_enc, term_labeling))
    json_data = np.export_corpus_to_json([dag], terminal_enc, term_labeling)

    corpus_path = '/tmp/json_dags.json'
    with open(corpus_path, 'w') as data_file:
        json.dump(json_data, data_file)

    # Start from an empty reduct directory.
    reduct_dir = '/tmp/schick_parser_reducts'
    if os.path.isdir(reduct_dir):
        shutil.rmtree(reduct_dir)
    os.makedirs(reduct_dir)

    # Pass the arguments as a list (no shell), so that paths are never
    # re-interpreted by /bin/sh.
    command = [
        "java", "-jar",
        os.path.join("util", SCHICK_PARSER_JAR), 'reduct', '-g',
        grammar_path, '-t', corpus_path, "--input-format", "json", "-o",
        reduct_dir
    ]
    with subprocess.Popen(command,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT) as p:
        print("stdout", p.stdout.name)
        # Echo the parser's combined stdout/stderr until it closes the pipe.
        for nextline in p.stdout:
            print(nextline.decode('unicode_escape'), end='')
    # Leaving the `with` block closes the pipe and waits for the process.
    self.assertEqual(0, p.returncode)

    rtgs = []

    def decode_nonterminals(s):
        return derivation_manager.get_nonterminal_map().index_object(int(s))

    # One '<i>.gra' file per corpus entry is read from the reduct directory.
    for i in range(1, len(corpus) + 1):
        rtgs.append(
            read_rtg(os.path.join(reduct_dir,
                                  str(i) + '.gra'),
                     symbol_offset=-1,
                     rule_prefix='r',
                     process_nonterminal=decode_nonterminals))

    print("Reduct RTG")
    for rule in rtgs[0].rules:
        print(rule.lhs, "->", rule.symbol, rule.rhs)

    derivation_manager.get_nonterminal_map().print_index()
    derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
    derivation_manager.serialize(
        bytes('/tmp/reduct_manager.trace', encoding='utf8'))
    derivations = [
        LCFRSDerivationWrapper(der)
        for der in derivation_manager.enumerate_derivations(0, grammar)
    ]
    self.assertGreaterEqual(len(derivations), 1)

    if len(derivations) >= 1:
        print("Sentence", i)
        for der in derivations:
            print(der)
            self.assertTrue(
                der.check_integrity_recursive(der.root_id(),
                                              grammar.start()))
def test_fst_compilation_right(self):
    """Compile a right-branching grammar to a WFST and check its best path.

    The test is reported as skipped when ``test_pynini`` is false. Otherwise
    the tag sequence ``NP N V V V`` is composed with the compiled
    transducer. Every path is evaluated into a hybrid tree and compared
    with ``tree2`` (printed only). The shortest path must have the expected
    probability and rule sequence.
    """
    if not test_pynini:
        # Report a skip instead of a silent pass.
        self.skipTest("test_pynini is disabled")

    tree = hybrid_tree_1()
    tree2 = hybrid_tree_2()
    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

    (_, grammar) = induce_grammar(
        [tree, tree2],
        the_labeling_factory().create_simple_labeling_strategy(
            'empty', 'pos'), terminal_labeling.token_label,
        [right_branching], 'START')

    a, rules = compile_wfst_from_right_branching_grammar(grammar)
    print(repr(a))

    symboltable = a.input_symbols()

    string = 'NP N V V V'.split(' ')
    token_sequence = [
        construct_conll_token(form, lemma) for form, lemma in zip(
            'Piet Marie helpen leren lezen'.split(' '), string)
    ]

    fsa = fsa_from_list_of_symbols(string, symboltable)
    self.assertEqual(
        '0\t1\tNP\tNP\n1\t2\tN\tN\n2\t3\tV\tV\n3\t4\tV\tV\n4\t5\tV\tV\n5\n',
        fsa.text().decode('utf-8'))

    b = compose(fsa, a)

    print(b.input_symbols())
    for symbol in b.input_symbols():
        print(symbol)

    print("Input Composition")
    print(b.text(symboltable, symboltable).decode('utf-8'))

    for path_index, path in enumerate(paths(b)):
        print(path_index, "th path:", path, end=' ')
        r = list(map(rules.index_object, path))
        # The first entry of the path is dropped before building the derivation.
        d = PolishDerivation(r[1::])
        dcp = DCP_evaluator(d).getEvaluation()
        h = HybridTree()
        dcp_to_hybridtree(h, dcp, token_sequence, False,
                          construct_conll_token)
        h.reorder()
        if h == tree2:
            print("correct")
        else:
            print("incorrect")

    stats = defaultdict(lambda: 0)
    local_rule_stats(b, stats, 15)
    print(stats)

    print("Shortest path probability")
    best = shortestpath(b)
    best.topsort()
    # The shortest distance is negated and exponentiated before comparison.
    self.assertAlmostEqual(1.80844898756e-05,
                           pow(e, -float(shortestdistance(best)[-1])))
    print(best.text())

    polish_rules = retrieve_rules(best)
    self.assertSequenceEqual(polish_rules, [8, 7, 1, 6, 2, 5, 3, 10, 3, 3])

    polish_rules = list(map(rules.index_object, polish_rules))
    print(polish_rules)

    der = PolishDerivation(polish_rules[1::])
    print(der)

    print(
        derivation_to_hybrid_tree(der, string,
                                  "Piet Marie helpen lezen leren".split(),
                                  construct_conll_token))

    dcp = DCP_evaluator(der).getEvaluation()
    h_tree_2 = HybridTree()
    dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                      construct_conll_token)
    print(h_tree_2)
def generic_parsing_test(self, parser_type, limit_train, limit_test,
                         compare_order):
    """Induce a grammar, parse each test tree and check every derivation.

    Args:
        parser_type: parser class; it must provide ``preprocess_grammar``
            and be constructible as ``parser_type(grammar, tree)``.
        limit_train: passed to ``parse_conll_corpus`` for the training trees.
        limit_test: passed to ``parse_conll_corpus`` for the test trees.
        compare_order: forwarded to ``self.compare_hybrid_trees``.

    For every test tree the parser must recognize it. Each derivation must
    pass the integrity check and evaluate to a tree that
    ``compare_hybrid_trees`` accepts, and all derivations of a tree must be
    pairwise different.
    """
    # params
    train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
    test = train
    # test = 'res/dependency_conll/german/tiger/test/german_tiger_test.conll'

    trees = parse_conll_corpus(train, False, limit_train)
    primary_labelling = the_labeling_factory(
    ).create_simple_labeling_strategy("childtop", "deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    start = 'START'
    recursive_partitioning = [cfg]

    _, grammar_prim = induce_grammar(trees, primary_labelling,
                                     term_labelling.token_label,
                                     recursive_partitioning, start)
    parser_type.preprocess_grammar(grammar_prim, term_labelling)

    trees = parse_conll_corpus(test, False, limit_test)

    count_derivs = {}
    no_complete_match = 0

    for i, tree in enumerate(trees):
        print("Parsing tree for ", i, file=stderr)
        print(tree, file=stderr)

        parser = parser_type(grammar_prim, tree)
        self.assertTrue(parser.recognized())
        count_derivs[i] = 0

        print("Found derivations for ", i, file=stderr)
        derivations = []

        for der in parser.all_derivation_trees():
            self.assertTrue(
                der.check_integrity_recursive(der.root_id(), start))

            print(count_derivs[i], file=stderr)
            print(der, file=stderr)

            output_tree = HybridTree()
            # Build placeholder tokens ('_' as form) from the derivation's yield.
            tokens2 = [
                construct_conll_token('_', pos)
                for pos in der.compute_yield()
            ]
            dcp_to_hybridtree(output_tree,
                              DCP_evaluator(der).getEvaluation(),
                              tokens2,
                              False,
                              construct_conll_token,
                              reorder=False)
            print(tree, file=stderr)
            print(output_tree, file=stderr)

            self.compare_hybrid_trees(tree, output_tree, compare_order)
            count_derivs[i] += 1
            derivations.append(der)

        self.assertTrue(
            sDCPParserTest.pairwise_different(
                derivations, sDCPParserTest.compare_derivations))
        self.assertEqual(len(derivations), count_derivs[i])

        if count_derivs[i] == 0:
            no_complete_match += 1

    for key, count in count_derivs.items():
        print(key, count)
    print("# trees with no complete match:", no_complete_match)
def test_negra_to_dag_parsing(self):
    """Round-trip one sentence through export, binarization and parsing.

    Sentence 26954 is serialized as a hybrid DAG to a temporary export file
    and binarized with ``discodop treetransforms``. Both files are read
    back and checked (yield lengths, ``top``/``bottom``). A grammar
    extracted from the binarized DAG must recognize the DAG's POS sequence,
    and the DAG rebuilt from the best derivation is printed and serialized.
    """
    # Local import: this block did not previously reference `os`.
    import os

    names = list(map(str, [26954]))

    # mkstemp returns an open descriptor; wrap it so it is closed again.
    primary_fd, primary_file = tempfile.mkstemp(suffix='.export')
    with os.fdopen(primary_fd, mode='w') as pf:
        for s in names:
            dsg = tp.sentence_names_to_deep_syntax_graphs(
                [s],
                "res/tiger/tiger_s%s.xml" % s,
                hold=False,
                ignore_puntcuation=False)[0]
            dsg.set_label(dsg.label[1:])
            lines = np.serialize_hybrid_dag_to_negra(
                [dsg], 0, 500, use_sentence_names=True)
            print(''.join(lines), file=pf)

    # Only the path is needed; discodop writes the file itself.
    binarized_fd, binarized_file = tempfile.mkstemp(suffix='.export')
    os.close(binarized_fd)
    subprocess.call([
        "discodop", "treetransforms", "--binarize", "-v", "1", "-h", "1",
        primary_file, binarized_file
    ])

    print(primary_file)
    print(binarized_file)

    corpus = np.sentence_names_to_hybridtrees(names,
                                              primary_file,
                                              secedge=True)
    corpus2 = np.sentence_names_to_hybridtrees(names,
                                               binarized_file,
                                               secedge=True)
    dag = corpus[0]
    print(dag)

    assert isinstance(dag, HybridDag)
    self.assertEqual(8, len(dag.token_yield()))
    for token in dag.token_yield():
        print(token.form() + '/' + token.pos(), end=' ')
    print()

    dag_bin = corpus2[0]
    print(dag_bin)

    for token in dag_bin.token_yield():
        print(token.form() + '/' + token.pos(), end=' ')
    print()
    self.assertEqual(8, len(dag_bin.token_yield()))

    for node, token in zip(
            dag_bin.nodes(),
            list(map(str, map(dag_bin.node_token, dag_bin.nodes())))):
        print(node, token)

    print()
    print(top(dag_bin, {'500', '101', '102'}))
    self.assertSetEqual({'101', '500'}, top(dag_bin, {'500', '101', '102'}))
    print(bottom(dag_bin, {'500', '101', '102'}))
    self.assertSetEqual({'502'}, bottom(dag_bin, {'500', '101', '102'}))

    grammar = direct_extract_lcfrs_from_prebinarized_corpus(dag_bin)
    print(grammar)

    parser = LCFRS_parser(grammar)

    poss = [token.pos() for token in dag_bin.token_yield()]
    print(poss)
    parser.set_input(poss)
    parser.parse()
    self.assertTrue(parser.recognized())
    der = parser.best_derivation_tree()
    print(der)

    dcp_term = DCP_evaluator(der).getEvaluation()
    print(dcp_term[0])

    dag_eval = HybridDag(dag_bin.sent_label())
    dcp_to_hybriddag(dag_eval,
                     dcp_term,
                     copy.deepcopy(dag_bin.token_yield()),
                     False,
                     construct_token=construct_constituent_token)

    print(dag_eval)
    for node in dag_eval.nodes():
        token = dag_eval.node_token(node)
        if token.type() == "CONSTITUENT-CATEGORY":
            label = token.category()
        elif token.type() == "CONSTITUENT-TERMINAL":
            label = token.form(), token.pos()
        else:
            # Other token types used to leave `label` unbound or stale.
            label = None

        print(node, label, dag_eval.children(node),
              dag_eval.sec_children(node), dag_eval.sec_parents(node))

    lines = np.serialize_hybridtrees_to_negra([dag_eval],
                                              1,
                                              500,
                                              use_sentence_names=True)
    for line in lines:
        print(line, end='')

    print()

    with open(primary_file) as pcf:
        for line in pcf:
            print(line, end='')
def test_fst_compilation_left(self):
    """Compile a left-branching grammar to a WFST and check its best path.

    The test is reported as skipped when ``test_pynini`` is false. Otherwise
    the tag sequence ``NP N V V V`` is composed with the compiled
    transducer and the shortest path must yield the expected rule sequence.
    The derivation built from that path and the one returned by
    ``LeftBranchingFSTParser`` must both pass the integrity check. Both are
    converted to hybrid trees and printed.
    """
    if not test_pynini:
        # Report a skip instead of a silent pass.
        self.skipTest("test_pynini is disabled")

    tree = hybrid_tree_1()
    tree2 = hybrid_tree_2()
    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

    (_, grammar) = induce_grammar(
        [tree, tree2],
        the_labeling_factory().create_simple_labeling_strategy(
            'empty', 'pos'), terminal_labeling.token_label,
        [left_branching], 'START')

    fst, rules = compile_wfst_from_left_branching_grammar(grammar)
    print(repr(fst))

    symboltable = fst.input_symbols()

    string = ["NP", "N", "V", "V", "V"]

    fsa = fsa_from_list_of_symbols(string, symboltable)
    self.assertEqual(
        fsa.text().decode('utf-8'),
        '0\t1\tNP\tNP\n1\t2\tN\tN\n2\t3\tV\tV\n3\t4\tV\tV\n4\t5\tV\tV\n5\n'
    )

    b = compose(fsa, fst)
    print(b.text(symboltable, symboltable))

    print("Shortest path probability", end=' ')
    best = shortestpath(b)
    best.topsort()
    print(best.text())

    polish_rules = retrieve_rules(best)
    self.assertSequenceEqual(polish_rules, [1, 2, 3, 4, 5, 4, 9, 4, 7, 8])

    polish_rules = list(map(rules.index_object, polish_rules))
    for rule in polish_rules:
        print(rule)
    print()

    # The last entry of the rule sequence is dropped before building the
    # derivation.
    der = ReversePolishDerivation(polish_rules[0:-1])
    self.assertTrue(der.check_integrity_recursive(der.root_id()))
    print(der)

    LeftBranchingFSTParser.preprocess_grammar(grammar)
    parser = LeftBranchingFSTParser(grammar, string)
    der_ = parser.best_derivation_tree()
    print(der_)
    self.assertTrue(der_.check_integrity_recursive(der_.root_id()))

    print(
        derivation_to_hybrid_tree(der, string,
                                  "Piet Marie helpen lezen leren".split(),
                                  construct_conll_token))
    print(
        derivation_to_hybrid_tree(der_, string,
                                  "Piet Marie helpen lezen leren".split(),
                                  construct_conll_token))

    dcp = DCP_evaluator(der).getEvaluation()
    h_tree_2 = HybridTree()
    token_sequence = [
        construct_conll_token(form, lemma)
        for form, lemma in zip('Piet Marie helpen lezen leren'.split(' '),
                               'NP N V V V'.split(' '))
    ]
    dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                      construct_conll_token)
    print(h_tree_2)
def test_k_best_parsing(self):
    """Check that the k-best derivations of each sentence are distinct.

    A grammar is induced from the first ``limit_train`` trees of the
    training file. Each of the first ``limit_test`` trees is parsed with a
    k-best parser (k=50), and no derivation may occur twice in its list.
    For inspection, a matrix is written to stderr that marks which
    derivations evaluate to equal hybrid trees, one row per derivation,
    followed by that derivation's weight.
    """
    limit_train = 20
    limit_test = 10
    train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
    test = train
    parser_type = GFParser_k_best
    # test = '../../res/dependency_conll/german/tiger/test/german_tiger_test.conll'

    trees = parse_conll_corpus(train, False, limit_train)
    primary_labelling = the_labeling_factory(
    ).create_simple_labeling_strategy("childtop", "deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    start = 'START'
    recursive_partitioning = [cfg]

    _, grammar_prim = induce_grammar(trees, primary_labelling,
                                     term_labelling.token_label,
                                     recursive_partitioning, start)
    parser_type.preprocess_grammar(grammar_prim)
    tree_yield = term_labelling.prepare_parser_input

    trees = parse_conll_corpus(test, False, limit_test)
    for i, tree in enumerate(trees):
        print("Parsing sentence ", i, file=stderr)

        parser = parser_type(grammar_prim,
                             tree_yield(tree.token_yield()),
                             k=50)
        self.assertTrue(parser.recognized())

        derivations = list(parser.k_best_derivation_trees())
        print("# derivations: ", len(derivations), file=stderr)

        h_trees = []
        current_weight = 0
        weights = []
        derivation_list = []
        for weight, der in derivations:
            self.assertNotIn(der, derivation_list)
            derivation_list.append(der)

            # TODO this should hold, but it looks like a GF bug!
            # self.assertGreaterEqual(weight, current_weight)
            current_weight = weight

            dcp = DCP_evaluator(der).getEvaluation()
            h_tree = HybridTree()
            cleaned_tokens = copy.deepcopy(tree.full_token_yield())
            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                              construct_conll_token)
            h_trees.append(h_tree)
            weights.append(weight)

        # print a matrix indicating which derivations result
        # in the same hybrid tree
        for h_tree1, row_weight in zip(h_trees, weights):
            for h_tree2 in h_trees:
                if h_tree1 == h_tree2:
                    print("x", end=' ', file=stderr)
                else:
                    print("", end=' ', file=stderr)
            print(row_weight, file=stderr)
        print(file=stderr)
def test_negra_dag_small_grammar(self):
    """Extract a grammar from 100 DAGs and re-parse each of them.

    Both export corpora (plain and binarized) must exist; when one is
    missing, the command that creates it is printed before the assertion
    fails. A grammar is collected from the binarized DAGs of sentences
    1-100. Every plain DAG's POS sequence must then be recognized, and the
    DAG rebuilt from the best derivation is serialized to a temporary
    result file.
    """
    DAG_CORPUS = 'res/tiger/tiger_full_with_sec_edges.export'
    DAG_CORPUS_BIN = 'res/tiger/tiger_full_with_sec_edges_bin_h1_v1.export'
    names = [str(i) for i in range(1, 101)]

    if not os.path.exists(DAG_CORPUS):
        print(
            'run the following command to create an export corpus with dags:'
        )
        print('\tPYTHONPATH=. util/tiger_dags_to_negra.py ' +
              'res/tiger/tiger_release_aug07.corrected.16012013.xml ' +
              DAG_CORPUS + ' 1 50474')
    self.assertTrue(os.path.exists(DAG_CORPUS))

    if not os.path.exists(DAG_CORPUS_BIN):
        print(
            'run the following command to binarize the export corpus with dags:'
        )
        print("discodop treetransforms --binarize -v 1 -h 1 " + DAG_CORPUS +
              " " + DAG_CORPUS_BIN)
    self.assertTrue(os.path.exists(DAG_CORPUS_BIN))

    corpus = np.sentence_names_to_hybridtrees(names,
                                              DAG_CORPUS,
                                              secedge=True)
    corpus_bin = np.sentence_names_to_hybridtrees(names,
                                                  DAG_CORPUS_BIN,
                                                  secedge=True)

    grammar = LCFRS(start="START")

    for hybrid_dag, hybrid_dag_bin in zip(corpus, corpus_bin):
        self.assertEqual(len(hybrid_dag.token_yield()),
                         len(hybrid_dag_bin.token_yield()))

        dag_grammar = direct_extract_lcfrs_from_prebinarized_corpus(
            hybrid_dag_bin)
        grammar.add_gram(dag_grammar)

    grammar.make_proper()
    print(
        "Extracted LCFRS/DCP-hybrid grammar with %i nonterminals and %i rules"
        % (len(grammar.nonts()), len(grammar.rules())))

    parser = DiscodopKbestParser(grammar, k=1)

    # mkstemp returns an open descriptor; reuse it for writing so that it is
    # closed when the `with` block ends.
    result_fd, RESULT_FILE = tempfile.mkstemp(prefix='parser_results_',
                                              suffix='.export')

    with os.fdopen(result_fd, 'w') as results:
        for hybrid_dag in corpus:
            poss = [token.pos() for token in hybrid_dag.token_yield()]
            parser.set_input(poss)
            parser.parse()
            self.assertTrue(parser.recognized())
            der = parser.best_derivation_tree()

            dcp_term = DCP_evaluator(der).getEvaluation()
            dag_eval = HybridDag(hybrid_dag.sent_label())
            dcp_to_hybriddag(dag_eval,
                             dcp_term,
                             copy.deepcopy(hybrid_dag.token_yield()),
                             False,
                             construct_token=construct_constituent_token)
            lines = np.serialize_hybridtrees_to_negra(
                [dag_eval], 1, 500, use_sentence_names=True)
            for line in lines:
                print(line, end='', file=results)
            # Reset the parser before the next input is set.
            parser.clear()

    print("Wrote results to %s" % RESULT_FILE)
def test_stanford_unking_scheme(self):
    """Exercise grammar induction and token labeling with ``StanfordUNKing``.

    Part one mirrors the POS-recovery test: a grammar is extracted from
    ``self.tree`` with the UNKing terminal labeling, and the tree rebuilt
    from the best derivation must have the same forms and POS tags as the
    input. Part two combines the labeling's smoothed rules with the
    grammar's rules that have an empty right-hand side and prints the
    result. Part three checks the labels assigned to 'hat' and 'HAT',
    before and after ``test_mode`` is switched on.
    """
    naming = 'child'

    tree = self.tree
    tree.add_to_root("VP1")
    print(tree)

    terminal_labeling = StanfordUNKing([tree])
    partitioning = left_branching_partitioning(len(tree.id_yield()))
    grammar = fringe_extract_lcfrs(tree,
                                   partitioning,
                                   naming=naming,
                                   isolate_pos=True,
                                   term_labeling=terminal_labeling)
    print(grammar)

    parser = LCFRS_parser(grammar)
    forms = [token.form() for token in tree.token_yield()]
    parser.set_input(forms)
    parser.parse()
    self.assertTrue(parser.recognized())

    dcp_term = DCP_evaluator(parser.best_derivation_tree()).getEvaluation()
    print(str(dcp_term[0]))

    fresh_tokens = [
        construct_constituent_token(token.form(), '--', True)
        for token in tree.token_yield()
    ]
    t = ConstituentTree()
    dcp_to_hybridtree(t,
                      dcp_term,
                      fresh_tokens,
                      ignore_punctuation=False,
                      construct_token=construct_constituent_token)
    print(t)

    self.assertEqual(len(tree.token_yield()), len(t.token_yield()))
    for expected, actual in zip(tree.token_yield(), t.token_yield()):
        self.assertEqual(expected.form(), actual.form())
        self.assertEqual(expected.pos(), actual.pos())

    rules = terminal_labeling.create_smoothed_rules()
    print(rules)

    new_rules = {}
    for rule in grammar.rules():
        if rule.rhs() != []:
            continue
        # Rules with an empty right-hand side are expected to carry exactly
        # one DCP rule with a single term on its right-hand side.
        assert len(rule.dcp()) == 1
        dcp = rule.dcp()[0]
        assert len(dcp.rhs()) == 1
        pos = dcp.rhs()[0].head().pos()

        for tag, form in rules:
            if tag != pos:
                continue
            lhs = LCFRS_lhs(rule.lhs().nont())
            lhs.add_arg([form])
            new_rules[lhs, dcp] = rules[tag, form]

    for (lhs, dcp), value in new_rules.items():
        print(str(lhs), str(dcp), value)

    lower_token = construct_constituent_token('hat', '--', True)
    upper_token = construct_constituent_token('HAT', '--', True)
    tokens = [lower_token, upper_token]
    self.assertEqual(terminal_labeling.token_label(tokens[0]), 'hat')
    self.assertEqual(terminal_labeling.token_label(tokens[1]), '_UNK')

    terminal_labeling.test_mode = True
    self.assertEqual(terminal_labeling.token_label(tokens[0]), 'hat')
    self.assertEqual(terminal_labeling.token_label(tokens[1]), 'hat')
def test_best_trees(self):
    """Compare the Viterbi parse with the best tree of a k-best parse.

    A grammar is induced from the first sentences of the training corpus
    and every test sentence (the test file is the training file here) is
    parsed with ``GFParser_k_best`` and ``k=200``.  For each sentence the
    rank of the gold tree in the ``best_trees`` list is reported on stderr,
    as is any case where the Viterbi tree and the top tree differ in both
    structure and weight.
    """
    train_limit = 5000
    test_limit = 100
    train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
    test = train
    parser_type = GFParser_k_best

    trees = parse_conll_corpus(train, False, train_limit)
    labeling_factory = the_labeling_factory()
    primary_labelling = labeling_factory.create_simple_labeling_strategy(
        "child", "pos+deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    _, grammar_prim = induce_grammar(trees, primary_labelling,
                                     term_labelling.token_label, [cfg],
                                     'START')
    parser_type.preprocess_grammar(grammar_prim)

    tree_yield = term_labelling.prepare_parser_input

    # Switch for the (disabled) verbose dump of the differing trees.
    show_trees = False

    trees = parse_conll_corpus(test, False, test_limit)
    for i, tree in enumerate(trees):
        print("Parsing sentence ", i, file=stderr)

        parser = parser_type(grammar_prim,
                             tree_yield(tree.token_yield()),
                             k=200)
        self.assertTrue(parser.recognized())

        viterbi_weight = parser.viterbi_weight()
        viterbi_deriv = parser.viterbi_derivation()

        def der_to_tree(der):
            # Evaluate the derivation into a fresh tree over a copy of
            # the current sentence's tokens.
            return dcp_to_hybridtree(HybridTree(),
                                     DCP_evaluator(der).getEvaluation(),
                                     copy.deepcopy(tree.full_token_yield()),
                                     False, construct_conll_token)

        viterbi_tree = der_to_tree(viterbi_deriv)

        ranked = parser.best_trees(der_to_tree)
        best_tree, best_weight, best_witnesses = ranked[0]

        for rank, (parsed_tree, _, _) in enumerate(ranked, start=1):
            if parsed_tree.__eq__(tree):
                print("Gold tree is ", rank, " in best tree list",
                      file=stderr)
                break

        trees_agree = viterbi_tree.__eq__(best_tree)
        if not trees_agree and viterbi_weight != best_weight:
            print("viterbi and k-best tree differ", file=stderr)
            print("viterbi: ", viterbi_weight, file=stderr)
            print("k-best: ", best_weight, best_witnesses, file=stderr)
            if show_trees:
                print(viterbi_tree, file=stderr)
                print(tree_to_conll_str(viterbi_tree), file=stderr)
                print(best_tree, file=stderr)
                print(tree_to_conll_str(best_tree), file=stderr)
                print("gold tree", file=stderr)
                print(tree, file=stderr)
                print(tree_to_conll_str(tree), file=stderr)
def do_parsing(grammar_prim,
               limit,
               ignore_punctuation,
               recompile=True,
               preprocess_path=None):
    """Parse the test corpus, write CoNLL output and run ``eval.pl`` on it.

    NOTE(review): ``test``, ``result``, ``parser_type`` and ``tree_yield``
    are not parameters; they have to be provided by the surrounding scope.

    :param grammar_prim: grammar handed to the ``parser_type`` constructor.
    :param limit: passed to ``parse_conll_corpus`` as corpus limit and also
        used as the maximum sentence length (longer sentences are skipped).
    :param ignore_punctuation: if true, punctuation is disconnected from
        the trees before parsing; the flag is also forwarded to
        ``dcp_hybrid_tree_best_derivation``.
    :param recompile: if true, never load a stored preprocessing result.
    :param preprocess_path: location passed as ``save_preprocess``; it is
        also used as ``load_preprocess`` when ``recompile`` is false and
        the resolved path exists.
    """
    trees = parse_conll_corpus(test, False, limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)

    total_time = 0.0

    # Only try to load a preprocessed grammar if one can exist at all.
    # Without a path there is nothing to resolve, so resolve_path() is not
    # called with None.
    load_preprocess = preprocess_path
    if recompile or preprocess_path is None \
            or not os.path.isfile(parser_type.resolve_path(preprocess_path)):
        load_preprocess = None

    parser = parser_type(grammar_prim,
                         save_preprocess=preprocess_path,
                         load_preprocess=load_preprocess)

    viterbi_parser_types = (CFGParser, GFParser, LeftBranchingFSTParser,
                            RightBranchingFSTParser)

    with open(result, 'w') as result_file:
        failures = 0
        for tree in trees:
            if len(tree.id_yield()) > limit:
                continue

            # time.clock() was removed in Python 3.8; process_time() is the
            # documented replacement for measuring CPU time.
            time_stamp = time.process_time()
            parser.set_input(tree_yield(tree.token_yield()))
            parser.parse()
            total_time += time.process_time() - time_stamp

            cleaned_tokens = copy.deepcopy(tree.full_token_yield())
            for token in cleaned_tokens:
                token.set_edge_label('_')

            h_tree = HybridTree(tree.sent_label())

            if parser_type == GFParser_k_best and parser.recognized():
                der_to_tree = lambda der: dcp_to_hybridtree(
                    HybridTree(),
                    DCP_evaluator(der).getEvaluation(),
                    copy.deepcopy(tree.full_token_yield()), False,
                    construct_conll_token)
                h_tree = parser.best_trees(der_to_tree)[0][0]
            elif parser_type in viterbi_parser_types:
                h_tree = parser.dcp_hybrid_tree_best_derivation(
                    h_tree, cleaned_tokens, ignore_punctuation,
                    construct_conll_token)
            else:
                h_tree = None

            if h_tree:
                result_file.write(tree_to_conll_str(h_tree))
                result_file.write('\n\n')
            else:
                # No parse: emit a left-branching fall-back tree so the
                # result file still has one entry per sentence.
                failures += 1
                forms = [token.form() for token in tree.full_token_yield()]
                poss = [token.pos() for token in tree.full_token_yield()]
                result_file.write(
                    tree_to_conll_str(fall_back_left_branching(forms, poss)))
                result_file.write('\n\n')

            parser.clear()

    print("parse failures", failures)
    print("parse time", total_time)

    print("eval.pl", "no punctuation")
    p = subprocess.Popen(
        ["perl", "../util/eval.pl", "-g", test, "-s", result, "-q"])
    p.communicate()

    print("eval.pl", "punctation")
    p = subprocess.Popen(
        ["perl", "../util/eval.pl", "-g", test, "-s", result, "-q", "-p"])
    p.communicate()
def test_something(self):
    """Check grammar extraction from a pre-binarized corpus tree by tree.

    For every pair of (original, binarized) trees a grammar is extracted
    from the binarized tree, the original tree is parsed with it, and each
    derivation is required to evaluate back to the original tree.  Every
    per-tree grammar, and finally the accumulated grammar, must be accepted
    by ``DiscodopKbestParser``.
    """
    normal_corpus = 'res/tiger/tiger_8000.export'
    binarized_corpus = 'res/tiger/tiger_8000_bin.export'
    limit = 55000

    def load(path):
        # Both corpora are read with identical settings.
        return sentence_names_to_hybridtrees(
            {str(x) for x in range(limit)},
            path,
            disconnect_punctuation=False,
            add_vroot=True,
            mode="DISCO-DOP")

    corpus_bin = load(binarized_corpus)
    corpus = load(normal_corpus)
    term_labeling = terminal_labeling(corpus, threshold=4)

    grammar = None

    for htree, htree_bin in zip(corpus, corpus_bin):
        try:
            htree_grammar = direct_extract_lcfrs_from_prebinarized_corpus(
                htree_bin, term_labeling=term_labeling)
        except Exception as e:
            # Dump the offending tree before letting the error propagate.
            print(e)
            print(htree_bin)
            print(htree_bin.nodes())
            print(htree_bin.word_yield())
            raise

        # Result is not used below; the call itself is kept.
        term_labeling.prepare_parser_input(htree.token_yield())

        p = LCFRS_sDCP_Parser(htree_grammar,
                              terminal_labelling=term_labeling)
        p.set_input(htree)
        p.parse()
        self.assertTrue(p.recognized())

        derivs = list(p.all_derivation_trees())
        for der in derivs:
            dcp = DCP_evaluator(der).getEvaluation()
            sys_tree = dcp_to_hybridtree(
                HybridTree(htree.sent_label()),
                dcp,
                deepcopy(htree.token_yield()),
                ignore_punctuation=False,
                construct_token=construct_constituent_token)

            if htree != sys_tree:
                print(htree.sent_label())
                print(htree)
                print(sys_tree)

            self.assertEqual(htree, sys_tree)

        if grammar is not None:
            grammar.add_gram(htree_grammar)
        else:
            grammar = htree_grammar

        htree_grammar.make_proper()
        try:
            disco_parser = DiscodopKbestParser(htree_grammar)
        except ValueError as ve:
            print(ve)
            print(htree.sent_label())
            print(htree)
            print(htree_bin)
            print(htree_grammar)
            raise

    grammar.make_proper()
    disco_parser = DiscodopKbestParser(grammar)
def do_parsing(grammar,
               test_corpus,
               term_labelling,
               result,
               grammar_identifier,
               parser_type,
               k_best,
               minimum_risk=False,
               oracle_parse=False,
               recompile=True,
               reparse=False,
               dir=None,
               opt=None):
    """Parse ``test_corpus`` with ``grammar``, write results and evaluate.

    Up to three CoNLL files are produced: the best-derivation parse, and
    (for k-best style parsers) a minimum-risk parse and an oracle parse.
    Sentences without a parse get a left-branching fall-back tree in every
    requested file.  Parsing is skipped when all requested result files
    already exist and neither ``recompile`` nor ``reparse`` is set; the
    evaluation via ``eval_pl_call`` is run in either case.

    NOTE(review): ``ignore_punctuation`` is used below but is not a
    parameter; it has to be resolvable from the surrounding scope.

    :param grammar: grammar handed to the parser constructor.
    :param test_corpus: object providing ``get_trees()`` and ``_path``.
    :param term_labelling: provides ``prepare_parser_input``.
    :param result: callable mapping ``grammar_identifier`` (and an optional
        second argument ``'min_risk'`` / ``'oracle_file'``) to a file path.
    :param grammar_identifier: name used for result paths and for the
        preprocessing directory below ``dir``.
    :param parser_type: parser class to instantiate.
    :param k_best: ``k`` passed to the k-best style parsers.
    :param minimum_risk: also compute and write minimum-risk trees.
    :param oracle_parse: also compute and write oracle trees.
    :param recompile: never load stored preprocessing; forces re-parsing.
    :param reparse: force re-parsing even if result files exist.
    :param dir: base directory for preprocessing data; it is passed to
        ``os.path.join`` unconditionally, so the default ``None`` is not
        usable.
    :param opt: mapping with the keys ``"latentAnnotation"``,
        ``"grammarInfo"`` and ``"nontMap"``; only read for
        ``Coarse_to_fine_parser``.
    :return: the parser instance that was constructed.
    """
    tree_yield = term_labelling.prepare_parser_input

    result_path = result(grammar_identifier)
    minimum_risk_path = result(grammar_identifier, 'min_risk')
    oracle_parse_path = result(grammar_identifier, 'oracle_file')

    total_time = 0.0

    gf_parser_types = [GFParser, GFParser_k_best, Coarse_to_fine_parser]
    k_best_parser_types = [GFParser_k_best, Coarse_to_fine_parser]

    grammar_dir = os.path.join(dir, grammar_identifier)
    preprocess_path = [grammar_dir, "gf_grammar"]

    # Stored preprocessing is only loaded for GF-based parsers, when no
    # recompilation is requested and the stored file exists.
    load_preprocess = preprocess_path
    if parser_type not in gf_parser_types \
            or recompile \
            or (not os.path.isfile(GFParser.resolve_path(preprocess_path))):
        load_preprocess = None
    if parser_type in gf_parser_types and not os.path.isdir(grammar_dir):
        os.makedirs(grammar_dir)

    if parser_type == GFParser_k_best:
        parser = GFParser_k_best(grammar,
                                 save_preprocessing=preprocess_path,
                                 load_preprocessing=load_preprocess,
                                 k=k_best)
    elif parser_type == Coarse_to_fine_parser:
        parser = Coarse_to_fine_parser(grammar,
                                       base_parser_type=GFParser_k_best,
                                       la=opt["latentAnnotation"],
                                       grammarInfo=opt["grammarInfo"],
                                       nontMap=opt["nontMap"],
                                       save_preprocessing=preprocess_path,
                                       load_preprocessing=load_preprocess,
                                       k=k_best)
    else:
        parser = parser_type(grammar,
                             save_preprocess=preprocess_path,
                             load_preprocess=load_preprocess)

    if recompile or reparse or \
            not os.path.isfile(result_path) \
            or (minimum_risk and not os.path.isfile(minimum_risk_path)) \
            or (oracle_parse and not os.path.isfile(oracle_parse_path)):

        for path in (result_path, minimum_risk_path, oracle_parse_path):
            result_dir = os.path.split(path)[0]
            # A bare file name has an empty directory part, for which
            # os.makedirs('') would raise; nothing needs creating then.
            if result_dir and not os.path.isdir(result_dir):
                os.makedirs(result_dir)

        with open(result_path, 'w') as result_file, \
                open(minimum_risk_path, 'w') as minimum_risk_file, \
                open(oracle_parse_path, 'w') as oracle_parse_file:
            failures = 0
            for tree in test_corpus.get_trees():
                # time.clock() was removed in Python 3.8; process_time()
                # is the documented replacement for measuring CPU time.
                time_stamp = time.process_time()
                parser.set_input(tree_yield(tree.token_yield()))
                parser.parse()
                total_time += time.process_time() - time_stamp

                cleaned_tokens = copy.deepcopy(tree.full_token_yield())
                for token in cleaned_tokens:
                    token.set_edge_label('_')

                h_tree = HybridTree(tree.sent_label())

                if parser_type in k_best_parser_types \
                        and parser.recognized():
                    if minimum_risk or oracle_parse:
                        # Evaluate every k-best derivation into a tree.
                        h_trees = []
                        weights = []
                        for weight, der in parser.k_best_derivation_trees():
                            dcp = DCP_evaluator(der).getEvaluation()
                            candidate = HybridTree()
                            candidate_tokens = copy.deepcopy(
                                tree.full_token_yield())
                            dcp_to_hybridtree(candidate, dcp,
                                              candidate_tokens, False,
                                              construct_conll_token)
                            h_trees.append(candidate)
                            weights.append(weight)

                        if minimum_risk:
                            h_tree_min_risk = compute_minimum_risk_tree(
                                h_trees, weights)
                        if oracle_parse:
                            h_tree_oracle = compute_oracle_tree(h_trees, tree)

                    h_tree = parser.dcp_hybrid_tree_best_derivation(
                        h_tree, copy.deepcopy(tree.full_token_yield()),
                        ignore_punctuation, construct_conll_token)
                elif parser_type == CFGParser \
                        or parser_type == GFParser \
                        or parser_type == LeftBranchingFSTParser \
                        or parser_type == RightBranchingFSTParser:
                    h_tree = parser.dcp_hybrid_tree_best_derivation(
                        h_tree, cleaned_tokens, ignore_punctuation,
                        construct_conll_token)
                else:
                    h_tree = None

                if h_tree:
                    result_file.write(tree_to_conll_str(h_tree))
                    result_file.write('\n\n')
                    if minimum_risk and parser_type in k_best_parser_types:
                        minimum_risk_file.write(
                            tree_to_conll_str(h_tree_min_risk))
                        minimum_risk_file.write('\n\n')
                    if oracle_parse and parser_type in k_best_parser_types:
                        oracle_parse_file.write(
                            tree_to_conll_str(h_tree_oracle))
                        oracle_parse_file.write('\n\n')
                else:
                    # No parse: write the same left-branching fall-back
                    # tree to every requested result file.
                    failures += 1
                    forms = [
                        token.form() for token in tree.full_token_yield()
                    ]
                    poss = [token.pos() for token in tree.full_token_yield()]
                    fall_back = tree_to_conll_str(
                        fall_back_left_branching(forms, poss))
                    files = [result_file]
                    if minimum_risk:
                        files.append(minimum_risk_file)
                    if oracle_parse:
                        files.append(oracle_parse_file)
                    for file in files:
                        file.write(fall_back)
                        file.write('\n\n')

                parser.clear()

        print("parse failures", failures)
        print("parse time", total_time)

    if parser_type == GFParser_k_best:
        print("best parse results")
    else:
        print("viterbi parse results")
    eval_pl_call(test_corpus._path, result_path)
    if oracle_parse:
        print("\noracle parse results")
        eval_pl_call(test_corpus._path, oracle_parse_path)
    if minimum_risk:
        print("\nminimum risk results")
        eval_pl_call(test_corpus._path, minimum_risk_path)

    return parser
def build_score_validator(baseline_grammar, grammarInfo, nont_map,
                          storageManager, term_labelling, parser,
                          corpus_validation, validationMethod):
    """Build a candidate-score validator from labelled-span scores.

    Every tree of ``corpus_validation`` is parsed; each k-best derivation
    is evaluated into a ``ConstituentTree`` and scored against the gold
    tree by comparing their sets of labelled spans.

    :param baseline_grammar: grammar passed to ``PyDerivationManager``.
    :param grammarInfo: passed to ``PyCandidateScoreValidator``.
    :param nont_map: passed to ``PyDerivationManager``.
    :param storageManager: passed to ``PyCandidateScoreValidator``.
    :param term_labelling: provides ``prepare_parser_input``.
    :param parser: parser offering ``set_input``, ``parse``,
        ``k_best_derivation_trees`` and ``clear``.
    :param corpus_validation: iterable of gold trees.
    :param validationMethod: ``"F1"``, ``"Precision"`` or ``"Recall"``.
    :raises ValueError: if ``validationMethod`` is none of the above and at
        least one derivation has to be scored.
    :return: the populated validator.
    """
    validator = PyCandidateScoreValidator(grammarInfo, storageManager,
                                          validationMethod)

    tree_count = 0
    der_count = 0
    for gold_tree in corpus_validation:
        tree_count += 1
        parser.set_input(
            term_labelling.prepare_parser_input(gold_tree.token_yield()))
        parser.parse()
        derivations = [der for _, der in parser.k_best_derivation_trees()]
        manager = PyDerivationManager(baseline_grammar, nont_map)
        manager.convert_hypergraphs(derivations)

        scores = []
        relevant = {tuple(t) for t in gold_tree.labelled_spans()}
        for der in derivations:
            der_count += 1

            h_tree = ConstituentTree()
            cleaned_tokens = copy.deepcopy(gold_tree.full_token_yield())
            dcp = DCP_evaluator(der).getEvaluation()
            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                              construct_constituent_token)

            retrieved = {tuple(t) for t in h_tree.labelled_spans()}
            inters = retrieved & relevant

            # in case of parse failure there are two options here:
            #   - parse failure -> no spans at all, thus precision = 1
            #   - parse failure -> a dummy tree with all spans wrong,
            #     thus precision = 0
            # The code below scores an empty span set with 0.
            precision = 1.0 * len(inters) / len(retrieved) \
                if len(retrieved) > 0 else 0
            recall = 1.0 * len(inters) / len(relevant) \
                if len(relevant) > 0 else 0
            fmeasure = 2.0 * precision * recall / (precision + recall) \
                if precision + recall > 0 else 0

            if validationMethod == "F1":
                scores.append(fmeasure)
            elif validationMethod == "Precision":
                scores.append(precision)
            elif validationMethod == "Recall":
                scores.append(recall)
            else:
                # `raise ()` is not a valid raise target; report the actual
                # problem instead.
                raise ValueError("unsupported validationMethod: %r" %
                                 (validationMethod, ))

        validator.add_scored_candidates(manager, scores,
                                        1.0 if relevant else 0.0)
        parser.clear()

    # Guard the average against an empty validation corpus.
    average = der_count * 1.0 / tree_count if tree_count else 0.0
    print("trees used for validation ", tree_count, "with", average,
          "derivations on average")

    return validator
def build_score_validator(baseline_grammar, grammarInfo, nont_map,
                          storageManager, term_labelling, parser,
                          corpus_validation, validationMethod):
    """Build a candidate-score validator from attachment counts.

    Every tree returned by ``corpus_validation.get_trees()`` is parsed;
    each k-best derivation is evaluated into a ``HybridTree`` and compared
    position by position with the gold tree.  The score of a candidate is
    the number of positions with matching head and label (LAS), matching
    head (UAS) or matching label (LAC); the maximum score is the length of
    the gold tree's ``id_yield()``.

    :param baseline_grammar: grammar passed to ``PyDerivationManager``.
    :param grammarInfo: passed to ``PyCandidateScoreValidator``.
    :param nont_map: passed to ``PyDerivationManager``.
    :param storageManager: passed to ``PyCandidateScoreValidator``.
    :param term_labelling: provides ``prepare_parser_input``.
    :param parser: parser offering ``set_input``, ``parse``,
        ``k_best_derivation_trees`` and ``clear``.
    :param corpus_validation: object providing ``get_trees()``.
    :param validationMethod: ``"LAS"``, ``"UAS"`` or ``"LAC"``.
    :raises ValueError: if ``validationMethod`` is none of the above and at
        least one derivation has to be scored.
    :return: the populated validator.
    """
    validator = PyCandidateScoreValidator(grammarInfo, storageManager,
                                          validationMethod)

    tree_count = 0
    der_count = 0
    for gold_tree in corpus_validation.get_trees():
        tree_count += 1
        parser.set_input(
            term_labelling.prepare_parser_input(gold_tree.token_yield()))
        parser.parse()
        derivations = map(lambda x: x[1], parser.k_best_derivation_trees())
        manager = PyDerivationManager(baseline_grammar, nont_map)
        manager.convert_derivations_to_hypergraphs(derivations)
        scores = []

        # Gold label and head (1-based position in the yield, 0 for a
        # root) for every position of the gold tree.
        gold_ids = gold_tree.id_yield()
        gold_labels = {}
        gold_heads = {}
        for position, node_id in enumerate(gold_ids):
            parent_id = gold_tree.parent(node_id)
            gold_labels[position] = gold_tree.node_token(node_id).deprel()
            if parent_id is None:
                assert node_id in gold_tree.root
                gold_heads[position] = 0
            else:
                gold_heads[position] = gold_ids.index(parent_id) + 1

        # The iterator above has been handed to the manager, so the k-best
        # derivations are requested a second time for scoring.
        derivations = parser.k_best_derivation_trees()
        for _, der in derivations:
            der_count += 1

            h_tree = HybridTree()
            cleaned_tokens = copy.deepcopy(gold_tree.full_token_yield())
            dcp = DCP_evaluator(der).getEvaluation()
            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                              construct_conll_token)

            las, uas, lac = 0, 0, 0
            system_ids = h_tree.id_yield()
            for position, node_id in enumerate(system_ids):
                parent_id = h_tree.parent(node_id)
                if parent_id is None:
                    assert node_id in h_tree.root
                    head = 0
                else:
                    head = system_ids.index(parent_id) + 1
                label = h_tree.node_token(node_id).deprel()

                head_ok = gold_heads[position] == head
                label_ok = gold_labels[position] == label
                if head_ok:
                    uas += 1
                if label_ok:
                    lac += 1
                if head_ok and label_ok:
                    las += 1

            if validationMethod == "LAS":
                scores.append(las)
            elif validationMethod == "UAS":
                scores.append(uas)
            elif validationMethod == "LAC":
                scores.append(lac)
            else:
                # Without a score per derivation the score list would no
                # longer line up with the candidates held by the manager.
                raise ValueError("unsupported validationMethod: %r" %
                                 (validationMethod, ))

        max_score = len(gold_ids)
        validator.add_scored_candidates(manager, scores, max_score)
        print(tree_count, max_score, scores)
        parser.clear()

    # Guard the average against an empty validation corpus.
    average = der_count * 1.0 / tree_count if tree_count else 0.0
    print("trees used for validation ", tree_count, "with", average,
          "derivations on average")

    return validator