def dummy_constituent_tree(token_yield, full_token_yield, dummy_label, dummy_root, label=None): """ :param token_yield: connected yield of a parse tree :type token_yield: list[ConstituentTerminal] :param full_token_yield: full yield of the parse tree :type full_token_yield: list[ConstituentTerminal] :return: dummy constituent tree :rtype: ConstituentTree generates a dummy tree for a given yield using dummy_label as inner node symbol """ tree = ConstituentTree(label) # create all leaves and punctuation for token in full_token_yield: if token not in token_yield: tree.add_punct(full_token_yield.index(token), token.pos(), token.form()) else: tree.add_leaf(full_token_yield.index(token), token.pos(), token.form()) # generate root node root_id = 'n0' tree.add_node(root_id, ConstituentCategory(dummy_root)) tree.add_to_root(root_id) parent = root_id if len(token_yield) > 1: i = 1 # generate inner nodes of branching tree for token in token_yield[:-2]: node = ConstituentCategory(str(dummy_label)) tree.add_node('n' + str(i), node) tree.add_child(parent, 'n' + str(i)) tree.add_child(parent, full_token_yield.index(token)) parent = 'n' + str(i) i += 1 token = token_yield[len(token_yield) - 2] tree.add_child(parent, full_token_yield.index(token)) token = token_yield[len(token_yield) - 1] tree.add_child(parent, full_token_yield.index(token)) elif len(token_yield) == 1: tree.add_child(parent, full_token_yield.index(token_yield[0])) return tree
def main(): # train_path = '../res/SPMRL_SHARED_2014_NO_ARABIC/GERMAN_SPMRL/gold/xml/train/train.German.gold.xml' # corpus = sentence_names_to_hybridtrees(["s" + str(i) for i in range(1, 10)], file_name=train_path, hold=False) train_path = '../res/SPMRL_SHARED_2014_NO_ARABIC/GERMAN_SPMRL/gold/xml/dev/dev.German.gold.xml' names = ["s" + str(i) for i in range(40675, 40700)] names = ['s40564'] corpus = sentence_names_to_hybridtrees(names, path=train_path, hold=False) cp = TreeComparator() tree_sys = ConstituentTree() tree_sys.add_node('0', ConstituentCategory('PN')) tree_sys.add_node('1', corpus[0].token_yield()[0], order=True) tree_sys.add_punct("3", '$.', '.') tree_sys.add_to_root('0') tree_sys.add_child('0', '1') param = build_param() for i, hybridtree in enumerate(corpus): print(i) # discotree = convert_tree(hybridtree) tree, sent = convert_tree(hybridtree) tree2, sent2 = convert_tree(tree_sys) if i == 11: pass # print(discotree) # print(discotree.draw()) # print(DrawTree(discotree, discotree.sent)) print(DrawTree(tree, sent)) print(' '.join(map(lambda x: x.form(), hybridtree.full_token_yield()))) print(DrawTree(tree2, sent2)) print(tree[::-1]) print('POS', tree.pos()) result = TreePairResult(i, tree, sent, tree2, sent2, param) print(result.scores()) print("Comparator: ", cp.compare_hybridtrees(hybridtree, hybridtree))
def tree2(): tree = ConstituentTree("1") for i, t in enumerate(["a", "b", "d", "c"]): tree.add_leaf(str(i), "P" + t, t) tree.set_label('r0', 'C') tree.set_label('r1', 'A') tree.set_label('r2', 'B') tree.add_to_root('r0') tree.add_child('r0', 'r1') tree.add_child('r0', 'r2') tree.add_child('r1', '0') tree.add_child('r1', '3') tree.add_child('r2', '1') tree.add_child('r2', '2') print(tree, tree.word_yield()) return tree
def constituent_tree_1_pos_stripped(): tree = ConstituentTree("s1") tree.add_leaf("f1", "--", "hat") tree.add_leaf("f2", "--", "schnell") tree.add_leaf("f3", "--", "gearbeitet") tree.add_punct("f4", "--", ".") tree.set_label("V", "V") tree.add_child("V", "f1") tree.add_child("V", "f3") tree.set_label("ADV", "ADV") tree.add_child("ADV", "f2") tree.set_label("VP", "VP") tree.add_child("VP", "V") tree.add_child("VP", "ADV") tree.add_to_root("VP") return tree
def flat_dummy_constituent_tree(token_yield, full_token_yield, dummy_label, dummy_root, label=None, gold_pos=True): """ :param token_yield: connected yield of a parse tree :type token_yield: list[ConstituentTerminal] :param full_token_yield: full yield of the parse tree :type full_token_yield: list[ConstituentTerminal] :return: dummy constituent tree :rtype: ConstituentTree generates a flat dummy tree for a given yield where all nodes are attached under the root """ tree = ConstituentTree(label) # generate root node root_id = 'n_root' tree.add_node(root_id, ConstituentCategory(dummy_root)) tree.add_to_root(root_id) parent = root_id # create all leaves and punctuation for token in full_token_yield: pos = token.pos() if gold_pos else '--' if token not in token_yield: tree.add_punct(full_token_yield.index(token), pos, token.form()) else: idx = full_token_yield.index(token) tree.add_leaf(idx, pos, token.form(), morph=token.morph_feats(), lemma=token.lemma()) tree.add_child(parent, idx) return tree
def constituent_tree_2(): tree = ConstituentTree("s2") tree.add_leaf("l1", "N", "John") tree.add_leaf("l2", "V", "hit") tree.add_leaf("l3", "D", "the") tree.add_leaf("l4", "N", "Ball") tree.add_punct("l5", "PUNC", ".") tree.set_label("NP", "NP") tree.add_child("NP", "l3") tree.add_child("NP", "l4") tree.set_label("VP", "VP") tree.add_child("VP", "l2") tree.add_child("VP", "NP") tree.set_label("S", "S") tree.add_child("S", "l1") tree.add_child("S", "VP") tree.add_to_root("S") return tree
def setUp(self): tree = ConstituentTree("s1") tree.add_leaf("f1", "VAFIN", "hat", morph=[("number", "Sg"), ("person", "3"), ("tense", "Past"), ("mood", "Ind")]) tree.add_leaf("f2", "ADV", "schnell", morph=[("degree", "Pos")]) tree.add_leaf("f3", "VVPP", "gearbeitet") tree.add_punct("f4", "PUNC", ".") tree.add_child("VP2", "f1") tree.add_child("VP2", "f3") tree.add_child("ADVP", "f2") tree.add_child("VP1", "VP2") tree.add_child("VP1", "ADVP") tree.set_label("VP2", "VP") tree.set_label("VP1", "VP") tree.set_label("ADVP", "ADVP") self.tree = tree tree2 = ConstituentTree("s2") tree2.add_leaf("f1", "VAFIN", "haben", morph=[("number", "Pl"), ("person", "3"), ("tense", "Past"), ("mood", "Ind")]) tree2.add_leaf("f2", "ADV", "gut", morph=[("degree", "Pos")]) tree2.add_leaf("f3", "VVPP", "gekocht") tree2.add_punct("f4", "PUNC", ".") tree2.add_child("VP2", "f1") tree2.add_child("VP2", "f3") tree2.add_child("ADVP", "f2") tree2.add_child("VP1", "VP2") tree2.add_child("VP1", "ADVP") tree2.set_label("VP2", "VP") tree2.set_label("VP1", "VP") tree2.set_label("ADVP", "ADVP") tree2.add_to_root("VP1") self.tree2 = tree2 self.tree3 = ConstituentTree("s3") self.tree3.add_leaf("f1", "ADJA", "Allgemeiner", edge="NK", morph=[("number", "Sg")]) self.tree3.add_leaf("f2", "ADJA", "Deutscher", edge="NK", morph=[("degree", "Pos"), ("number", "Sg")]) self.tree3.add_leaf("f3", "NN", "Fahrrad", edge="NK", morph=[("number", "Sg"), ("gender", "Neut")]) self.tree3.add_leaf("f4", "NN", "Club", edge="NK", morph=[("number", "Sg"), ("gender", "Neut")]) for i in range(1, 5): self.tree3.add_child("NP", "f" + str(i)) self.tree3.set_label("NP", "NP") self.tree3.add_to_root("NP")
class ConstituentTreeTest(unittest.TestCase): def test_something(self): tree = self.tree print("rooted", tree.root) tree.add_to_root("VP1") print("rooted", tree.root) print(tree) print("sent label", tree.sent_label()) print("leaves", tree.leaves()) print("is leaf (leaves)", [(x, tree.is_leaf(x)) for (x, _, _) in tree.leaves()]) print("is leaf (internal)", [(x, tree.is_leaf(x)) for x in tree.ids()]) print("leaf index", [(x, tree.leaf_index(x)) for x in ["f1", "f2", "f3"]]) print("pos yield", tree.pos_yield()) print("word yield", tree.word_yield()) # reentrant # parent print("ids", tree.ids()) # reorder print("n nodes", tree.n_nodes()) print("n gaps", tree.n_gaps()) print("fringe VP", tree.fringe("VP")) print("fringe V", tree.fringe("V")) print("empty fringe", tree.empty_fringe()) print("complete?", tree.complete()) print("max n spans", tree.max_n_spans()) print("unlabelled structure", tree.unlabelled_structure()) print("labelled spans", tree.labelled_spans()) def test_induction(self): naming = 'child' def rec_part(tree): return left_branching_partitioning(len(tree.id_yield())) # return fanout_k_left_to_right(tree, 1) tree = self.tree tree.add_to_root("VP1") feature_log1 = defaultdict(lambda: 0) grammar = fringe_extract_lcfrs(tree, rec_part(tree), feature_logging=feature_log1, naming=naming) for key in feature_log1: print(key, feature_log1[key]) print(grammar) feats = defaultdict(lambda: 0) grammar_ = fringe_extract_lcfrs(tree, rec_part(tree), isolate_pos=True, feature_logging=feats, naming=naming) print(grammar_) for key in feats: print(key, feats[key]) print("Adding 2nd grammar to first") grammar.add_gram(grammar_, feature_logging=(feature_log1, feats)) for idx in range(0, len(grammar.rules())): print(idx, grammar.rule_index(idx)) print("Adding 3rd grammar to first") feats3 = defaultdict(lambda: 0) grammar3 = fringe_extract_lcfrs(self.tree2, rec_part(self.tree2), isolate_pos=True, feature_logging=feats3, naming=naming) grammar.add_gram(grammar3, feature_logging=(feature_log1, feats3)) print() for idx in range(0, len(grammar.rules())): print(idx, grammar.rule_index(idx)) print() print("New feature log") print() for key in feature_log1: print(key, feature_log1[key]) grammar.make_proper() build_nont_splits_dict(grammar, feature_log1, nonterminals=Enumerator()) print(grammar.rule_index(0)) print(grammar.rule_index(2)) def test_markovized_induction(self): naming = 'strict-markov-v-2-h-0' def rec_part(tree): return left_branching_partitioning(len(tree.id_yield())) # return fanout_k_left_to_right(tree, 1) tree = self.tree tree.add_to_root("VP1") print(tree) grammar = fringe_extract_lcfrs(tree, rec_part(tree), naming=naming, isolate_pos=True) print(grammar) def test_induction_2(self): def rec_part(tree): return left_branching_partitioning(len(tree.id_yield())) features = defaultdict(lambda: 0) grammar = fringe_extract_lcfrs(self.tree3, rec_part(self.tree3), naming="child", feature_logging=features, isolate_pos=True) grammar.make_proper() if False: for idx in range(0, len(grammar.rules())): print(grammar.rule_index(idx)) for key in features: if key[0] == idx: print(key, features[key]) print() for key in features: if type(key[0]) == int: continue print(key, features[key]) nont_splits, root_weights, rule_weights, _ = build_nont_splits_dict( grammar, features, nonterminals=Enumerator(), feat_function=pos_cat_feats, debug=True) print(nont_splits) print(root_weights) print(rule_weights) def setUp(self): tree = ConstituentTree("s1") tree.add_leaf("f1", "VAFIN", "hat", morph=[("number", "Sg"), ("person", "3"), ("tense", "Past"), ("mood", "Ind")]) tree.add_leaf("f2", "ADV", "schnell", morph=[("degree", "Pos")]) tree.add_leaf("f3", "VVPP", "gearbeitet") tree.add_punct("f4", "PUNC", ".") tree.add_child("VP2", "f1") tree.add_child("VP2", "f3") tree.add_child("ADVP", "f2") tree.add_child("VP1", "VP2") tree.add_child("VP1", "ADVP") tree.set_label("VP2", "VP") tree.set_label("VP1", "VP") tree.set_label("ADVP", "ADVP") self.tree = tree tree2 = ConstituentTree("s2") tree2.add_leaf("f1", "VAFIN", "haben", morph=[("number", "Pl"), ("person", "3"), ("tense", "Past"), ("mood", "Ind")]) tree2.add_leaf("f2", "ADV", "gut", morph=[("degree", "Pos")]) tree2.add_leaf("f3", "VVPP", "gekocht") tree2.add_punct("f4", "PUNC", ".") tree2.add_child("VP2", "f1") tree2.add_child("VP2", "f3") tree2.add_child("ADVP", "f2") tree2.add_child("VP1", "VP2") tree2.add_child("VP1", "ADVP") tree2.set_label("VP2", "VP") tree2.set_label("VP1", "VP") tree2.set_label("ADVP", "ADVP") tree2.add_to_root("VP1") self.tree2 = tree2 self.tree3 = ConstituentTree("s3") self.tree3.add_leaf("f1", "ADJA", "Allgemeiner", edge="NK", morph=[("number", "Sg")]) self.tree3.add_leaf("f2", "ADJA", "Deutscher", edge="NK", morph=[("degree", "Pos"), ("number", "Sg")]) self.tree3.add_leaf("f3", "NN", "Fahrrad", edge="NK", morph=[("number", "Sg"), ("gender", "Neut")]) self.tree3.add_leaf("f4", "NN", "Club", edge="NK", morph=[("number", "Sg"), ("gender", "Neut")]) for i in range(1, 5): self.tree3.add_child("NP", "f" + str(i)) self.tree3.set_label("NP", "NP") self.tree3.add_to_root("NP")
class ConstituentTreeTest(unittest.TestCase): def test_basic_tree_methods(self): tree = self.tree print("rooted", tree.root) tree.add_to_root("VP1") print("rooted", tree.root) print(tree) print("sent label", tree.sent_label()) print("leaves", tree.leaves()) print("is leaf (leaves)", [(x, tree.is_leaf(x)) for (x, _, _) in tree.leaves()]) print("is leaf (internal)", [(x, tree.is_leaf(x)) for x in tree.ids()]) print("leaf index", [(x, tree.leaf_index(x)) for x in ["f1", "f2", "f3"]]) print("pos yield", tree.pos_yield()) print("word yield", tree.word_yield()) # reentrant # parent print("ids", tree.ids()) # reorder print("n nodes", tree.n_nodes()) print("n gaps", tree.n_gaps()) print("fringe VP", tree.fringe("VP")) print("fringe V", tree.fringe("V")) print("empty fringe", tree.empty_fringe()) print("complete?", tree.complete()) print("max n spans", tree.max_n_spans()) print("unlabelled structure", tree.unlabelled_structure()) print("labelled spans", tree.labelled_spans()) def test_induction(self): naming = 'child' def rec_part(tree): return left_branching_partitioning(len(tree.id_yield())) # return fanout_k_left_to_right(tree, 1) tree = self.tree tree.add_to_root("VP1") feature_log1 = defaultdict(lambda: 0) grammar = fringe_extract_lcfrs(tree, rec_part(tree), feature_logging=feature_log1, naming=naming) for key in feature_log1: print(key, feature_log1[key]) print(grammar) feats = defaultdict(lambda: 0) grammar_ = fringe_extract_lcfrs(tree, rec_part(tree), isolate_pos=True, feature_logging=feats, naming=naming) print(grammar_) for key in feats: print(key, feats[key]) print("Adding 2nd grammar to first") grammar.add_gram(grammar_, feature_logging=(feature_log1, feats)) for idx in range(0, len(grammar.rules())): print(idx, grammar.rule_index(idx)) print("Adding 3rd grammar to first") feats3 = defaultdict(lambda: 0) grammar3 = fringe_extract_lcfrs(self.tree2, rec_part(self.tree2), isolate_pos=True, feature_logging=feats3, naming=naming) grammar.add_gram(grammar3, feature_logging=(feature_log1, feats3)) print() for idx in range(0, len(grammar.rules())): print(idx, grammar.rule_index(idx)) print() print("New feature log") print() for key in feature_log1: print(key, feature_log1[key]) grammar.make_proper() build_nont_splits_dict(grammar, feature_log1, nonterminals=Enumerator()) print(grammar.rule_index(0)) print(grammar.rule_index(2)) def test_markovized_induction(self): naming = 'strict-markov-v-2-h-0' def rec_part(tree): return left_branching_partitioning(len(tree.id_yield())) # return fanout_k_left_to_right(tree, 1) tree = self.tree tree.add_to_root("VP1") print(tree) grammar = fringe_extract_lcfrs(tree, rec_part(tree), naming=naming, isolate_pos=True) print(grammar) def test_induction_and_parsing_with_pos_recovery(self): naming = 'child' def rec_part(tree): return left_branching_partitioning(len(tree.id_yield())) tree = self.tree tree.add_to_root("VP1") print(tree) grammar = fringe_extract_lcfrs(tree, rec_part(tree), naming=naming, isolate_pos=True, term_labeling=FormTerminals()) print(grammar) parser = LCFRS_parser(grammar) parser.set_input([token.form() for token in tree.token_yield()]) parser.parse() self.assertTrue(parser.recognized()) derivation = parser.best_derivation_tree() e = DCP_evaluator(derivation) dcp_term = e.getEvaluation() print(str(dcp_term[0])) t = ConstituentTree() dcp_to_hybridtree( t, dcp_term, [ construct_constituent_token(token.form(), '--', True) for token in tree.token_yield() ], ignore_punctuation=False, construct_token=construct_constituent_token) print(t) self.assertEqual(len(tree.token_yield()), len(t.token_yield())) for tok1, tok2 in zip(tree.token_yield(), t.token_yield()): self.assertEqual(tok1.form(), tok2.form()) self.assertEqual(tok1.pos(), tok2.pos()) def test_stanford_unking_scheme(self): naming = 'child' def rec_part(tree): return left_branching_partitioning(len(tree.id_yield())) tree = self.tree tree.add_to_root("VP1") print(tree) terminal_labeling = StanfordUNKing([tree]) grammar = fringe_extract_lcfrs(tree, rec_part(tree), naming=naming, isolate_pos=True, term_labeling=terminal_labeling) print(grammar) parser = LCFRS_parser(grammar) parser.set_input([token.form() for token in tree.token_yield()]) parser.parse() self.assertTrue(parser.recognized()) derivation = parser.best_derivation_tree() e = DCP_evaluator(derivation) dcp_term = e.getEvaluation() print(str(dcp_term[0])) t = ConstituentTree() dcp_to_hybridtree( t, dcp_term, [ construct_constituent_token(token.form(), '--', True) for token in tree.token_yield() ], ignore_punctuation=False, construct_token=construct_constituent_token) print(t) self.assertEqual(len(tree.token_yield()), len(t.token_yield())) for tok1, tok2 in zip(tree.token_yield(), t.token_yield()): self.assertEqual(tok1.form(), tok2.form()) self.assertEqual(tok1.pos(), tok2.pos()) rules = terminal_labeling.create_smoothed_rules() print(rules) new_rules = {} for rule in grammar.rules(): if rule.rhs() == []: assert len(rule.dcp()) == 1 dcp = rule.dcp()[0] assert len(dcp.rhs()) == 1 term = dcp.rhs()[0] head = term.head() pos = head.pos() for tag, form in rules: if tag == pos: lhs = LCFRS_lhs(rule.lhs().nont()) lhs.add_arg([form]) new_rules[lhs, dcp] = rules[tag, form] for lhs, dcp in new_rules: print(str(lhs), str(dcp), new_rules[(lhs, dcp)]) tokens = [ construct_constituent_token('hat', '--', True), construct_constituent_token('HAT', '--', True) ] self.assertEqual(terminal_labeling.token_label(tokens[0]), 'hat') self.assertEqual(terminal_labeling.token_label(tokens[1]), '_UNK') terminal_labeling.test_mode = True self.assertEqual(terminal_labeling.token_label(tokens[0]), 'hat') self.assertEqual(terminal_labeling.token_label(tokens[1]), 'hat') def test_induction_with_spans(self): naming = 'child-spans' def rec_part(tree): return left_branching_partitioning(len(tree.id_yield())) # return fanout_k_left_to_right(tree, 1) tree = self.tree tree.add_to_root("VP1") print(tree) grammar = fringe_extract_lcfrs(tree, rec_part(tree), naming=naming, isolate_pos=True) print(grammar) def test_induction_2(self): def rec_part(tree): return left_branching_partitioning(len(tree.id_yield())) features = defaultdict(lambda: 0) grammar = fringe_extract_lcfrs(self.tree3, rec_part(self.tree3), naming="child", feature_logging=features, isolate_pos=True) grammar.make_proper() if False: for idx in range(0, len(grammar.rules())): print(grammar.rule_index(idx)) for key in features: if key[0] == idx: print(key, features[key]) print() for key in features: if type(key[0]) == int: continue print(key, features[key]) nont_splits, root_weights, rule_weights, _ = build_nont_splits_dict( grammar, features, nonterminals=Enumerator(), feat_function=pos_cat_feats, debug=True) print(nont_splits) print(root_weights) print(rule_weights) def setUp(self): tree = ConstituentTree("s1") tree.add_leaf("f1", "VAFIN", "hat", morph=[("number", "Sg"), ("person", "3"), ("tense", "Past"), ("mood", "Ind")]) tree.add_leaf("f2", "ADV", "schnell", morph=[("degree", "Pos")]) tree.add_leaf("f3", "VVPP", "gearbeitet") tree.add_punct("f4", "PUNC", ".") tree.add_child("VP2", "f1") tree.add_child("VP2", "f3") tree.add_child("ADVP", "f2") tree.add_child("VP1", "VP2") tree.add_child("VP1", "ADVP") tree.set_label("VP2", "VP") tree.set_label("VP1", "VP") tree.set_label("ADVP", "ADVP") self.tree = tree tree2 = ConstituentTree("s2") tree2.add_leaf("f1", "VAFIN", "haben", morph=[("number", "Pl"), ("person", "3"), ("tense", "Past"), ("mood", "Ind")]) tree2.add_leaf("f2", "ADV", "gut", morph=[("degree", "Pos")]) tree2.add_leaf("f3", "VVPP", "gekocht") tree2.add_punct("f4", "PUNC", ".") tree2.add_child("VP2", "f1") tree2.add_child("VP2", "f3") tree2.add_child("ADVP", "f2") tree2.add_child("VP1", "VP2") tree2.add_child("VP1", "ADVP") tree2.set_label("VP2", "VP") tree2.set_label("VP1", "VP") tree2.set_label("ADVP", "ADVP") tree2.add_to_root("VP1") self.tree2 = tree2 self.tree3 = ConstituentTree("s3") self.tree3.add_leaf("f1", "ADJA", "Allgemeiner", edge="NK", morph=[("number", "Sg")]) self.tree3.add_leaf("f2", "ADJA", "Deutscher", edge="NK", morph=[("degree", "Pos"), ("number", "Sg")]) self.tree3.add_leaf("f3", "NN", "Fahrrad", edge="NK", morph=[("number", "Sg"), ("gender", "Neut")]) self.tree3.add_leaf("f4", "NN", "Club", edge="NK", morph=[("number", "Sg"), ("gender", "Neut")]) for i in range(1, 5): self.tree3.add_child("NP", "f" + str(i)) self.tree3.set_label("NP", "NP") self.tree3.add_to_root("NP")