Example #1
    def test_basic_sdcp_parsing_constituency(self):
        tree1 = constituent_tree_1()
        tree2 = constituent_tree_2()
        tree3 = constituent_tree_1_pos_stripped()

        terminal_labeling = FormTerminals()
        fanout = 1

        grammar = LCFRS('START')
        for tree in [tree1, tree2]:
            tree_part = tree.unlabelled_structure()
            part = fanout_limited_partitioning(tree_part, fanout)
            tree_grammar = fringe_extract_lcfrs(tree, part, naming='child', term_labeling=terminal_labeling)
            grammar.add_gram(tree_grammar)
        grammar.make_proper()

        print("grammar induced. Printing rules...", file=stderr)

        for rule in grammar.rules():
            print(rule, file=stderr)

        parser_type = LCFRS_sDCP_Parser

        print("preprocessing grammar", file=stderr)

        parser_type.preprocess_grammar(grammar, terminal_labeling, debug=True)

        print("invoking parser", file=stderr)

        parser = parser_type(grammar, tree1)

        print("listing derivations", file=stderr)

        for der in parser.all_derivation_trees():
            print(der)
            output_tree = ConstituentTree(tree1.sent_label())
            tokens = [construct_constituent_token(token.form(), '--', True) for token in tree1.token_yield()]
            dcp_to_hybridtree(output_tree, DCP_evaluator(der).getEvaluation(), tokens, False,
                              construct_constituent_token)
            print(tree1)
            print(output_tree)

        parser = parser_type(grammar, tree3)
        print(parser.recognized())
        for der in parser.all_derivation_trees():
            print(der)
            output_tree = ConstituentTree(tree3.sent_label())
            tokens = [construct_constituent_token(token.form(), '--', True) for token in tree3.token_yield()]
            dcp_to_hybridtree(output_tree, DCP_evaluator(der).getEvaluation(), tokens, False,
                              construct_constituent_token)
            print(tree3)
            print(output_tree)

        print("completed test", file=stderr)
Example #2
    def parsing_postprocess(self, sentence, derivation, label=None):
        full_yield, id_yield, full_token_yield, token_yield = sentence

        dcp_tree = ConstituentTree(label)
        punctuation_positions = [i + 1 for i, idx in enumerate(full_yield)
                                 if idx not in id_yield]

        cleaned_tokens = copy.deepcopy(full_token_yield)
        dcp = DCP_evaluator(derivation).getEvaluation()
        dcp_to_hybridtree(dcp_tree, dcp, cleaned_tokens, False, construct_constituent_token,
                          punct_positions=punctuation_positions)

        # 'True or' forces vroot stripping regardless of self.strip_vroot
        if True or self.strip_vroot:
            dcp_tree.strip_vroot()

        return dcp_tree
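
A hedged sketch of how parsing_postprocess is typically driven from inside the same class; grammar and gold_tree are assumed names mirroring the test methods further below, not part of the original.

    parser = LCFRS_parser(grammar)
    parser.set_input([tok.form() for tok in gold_tree.token_yield()])
    parser.parse()
    if parser.recognized():
        der = parser.best_derivation_tree()
        # the sentence tuple layout expected by parsing_postprocess:
        sentence = (gold_tree.full_yield(), gold_tree.id_yield(),
                    gold_tree.full_token_yield(), gold_tree.token_yield())
        result = self.parsing_postprocess(sentence, der,
                                          label=gold_tree.sent_label())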
    def test_induction_and_parsing_with_pos_recovery(self):
        naming = 'child'

        def rec_part(tree):
            return left_branching_partitioning(len(tree.id_yield()))

        tree = self.tree
        tree.add_to_root("VP1")

        print(tree)

        grammar = fringe_extract_lcfrs(tree,
                                       rec_part(tree),
                                       naming=naming,
                                       isolate_pos=True,
                                       term_labeling=FormTerminals())
        print(grammar)

        parser = LCFRS_parser(grammar)
        parser.set_input([token.form() for token in tree.token_yield()])
        parser.parse()
        self.assertTrue(parser.recognized())
        derivation = parser.best_derivation_tree()
        e = DCP_evaluator(derivation)
        dcp_term = e.getEvaluation()
        print(str(dcp_term[0]))
        t = ConstituentTree()
        dcp_to_hybridtree(
            t,
            dcp_term, [
                construct_constituent_token(token.form(), '--', True)
                for token in tree.token_yield()
            ],
            ignore_punctuation=False,
            construct_token=construct_constituent_token)
        print(t)
        self.assertEqual(len(tree.token_yield()), len(t.token_yield()))
        for tok1, tok2 in zip(tree.token_yield(), t.token_yield()):
            self.assertEqual(tok1.form(), tok2.form())
            self.assertEqual(tok1.pos(), tok2.pos())
def tree2():
    tree = ConstituentTree("1")
    for i, t in enumerate(["a", "b", "d", "c"]):
        tree.add_leaf(str(i), "P" + t, t)
    tree.set_label('r0', 'C')
    tree.set_label('r1', 'A')
    tree.set_label('r2', 'B')
    tree.add_to_root('r0')
    tree.add_child('r0', 'r1')
    tree.add_child('r0', 'r2')
    tree.add_child('r1', '0')
    tree.add_child('r1', '3')
    tree.add_child('r2', '1')
    tree.add_child('r2', '2')
    print(tree, tree.word_yield())
    return tree
def constituent_tree_1():
    tree = ConstituentTree("s1")
    tree.add_leaf("f1", "VP", "hat")
    tree.add_leaf("f2", "ADV", "schnell")
    tree.add_leaf("f3", "VP", "gearbeitet")
    tree.add_punct("f4", "PUNC", ".")

    tree.set_label("V", "V")
    tree.add_child("V", "f1")
    tree.add_child("V", "f3")

    tree.set_label("ADV", "ADV")
    tree.add_child("ADV", "f2")

    tree.set_label("VP", "VP")
    tree.add_child("VP", "V")
    tree.add_child("VP", "ADV")

    tree.add_to_root("VP")

    return tree
def constituent_tree_2():
    tree = ConstituentTree("s2")
    tree.add_leaf("l1", "N", "John")
    tree.add_leaf("l2", "V", "hit")
    tree.add_leaf("l3", "D", "the")
    tree.add_leaf("l4", "N", "Ball")
    tree.add_punct("l5", "PUNC", ".")

    tree.set_label("NP", "NP")
    tree.add_child("NP", "l3")
    tree.add_child("NP", "l4")

    tree.set_label("VP", "VP")
    tree.add_child("VP", "l2")
    tree.add_child("VP", "NP")

    tree.set_label("S", "S")
    tree.add_child("S", "l1")
    tree.add_child("S", "VP")

    tree.add_to_root("S")

    return tree
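
Note that constituent_tree_1 is deliberately discontinuous: the V node dominates "hat" and "gearbeitet" while "schnell" sits in between, which is what the LCFRS fanout machinery in Example #1 has to handle. A small inspection sketch, assuming the tree methods exercised in the tests below:

    t = constituent_tree_1()
    print(t.word_yield())  # "hat schnell gearbeitet" (punctuation disconnected)
    print(t.fringe("V"))   # a gapped span: positions of "hat" and "gearbeitet"
    print(t.n_gaps())      # non-zero, since V spans a gap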
    def setUp(self):
        tree = ConstituentTree("s1")
        tree.add_leaf("f1",
                      "VAFIN",
                      "hat",
                      morph=[("number", "Sg"), ("person", "3"),
                             ("tense", "Past"), ("mood", "Ind")])
        tree.add_leaf("f2", "ADV", "schnell", morph=[("degree", "Pos")])
        tree.add_leaf("f3", "VVPP", "gearbeitet")
        tree.add_punct("f4", "PUNC", ".")

        tree.add_child("VP2", "f1")
        tree.add_child("VP2", "f3")
        tree.add_child("ADVP", "f2")

        tree.add_child("VP1", "VP2")
        tree.add_child("VP1", "ADVP")

        tree.set_label("VP2", "VP")
        tree.set_label("VP1", "VP")
        tree.set_label("ADVP", "ADVP")

        self.tree = tree

        tree2 = ConstituentTree("s2")
        tree2.add_leaf("f1",
                       "VAFIN",
                       "haben",
                       morph=[("number", "Pl"), ("person", "3"),
                              ("tense", "Past"), ("mood", "Ind")])
        tree2.add_leaf("f2", "ADV", "gut", morph=[("degree", "Pos")])
        tree2.add_leaf("f3", "VVPP", "gekocht")
        tree2.add_punct("f4", "PUNC", ".")

        tree2.add_child("VP2", "f1")
        tree2.add_child("VP2", "f3")
        tree2.add_child("ADVP", "f2")

        tree2.add_child("VP1", "VP2")
        tree2.add_child("VP1", "ADVP")

        tree2.set_label("VP2", "VP")
        tree2.set_label("VP1", "VP")
        tree2.set_label("ADVP", "ADVP")
        tree2.add_to_root("VP1")
        self.tree2 = tree2

        self.tree3 = ConstituentTree("s3")
        self.tree3.add_leaf("f1",
                            "ADJA",
                            "Allgemeiner",
                            edge="NK",
                            morph=[("number", "Sg")])
        self.tree3.add_leaf("f2",
                            "ADJA",
                            "Deutscher",
                            edge="NK",
                            morph=[("degree", "Pos"), ("number", "Sg")])
        self.tree3.add_leaf("f3",
                            "NN",
                            "Fahrrad",
                            edge="NK",
                            morph=[("number", "Sg"), ("gender", "Neut")])
        self.tree3.add_leaf("f4",
                            "NN",
                            "Club",
                            edge="NK",
                            morph=[("number", "Sg"), ("gender", "Neut")])
        for i in range(1, 5):
            self.tree3.add_child("NP", "f" + str(i))
        self.tree3.set_label("NP", "NP")
        self.tree3.add_to_root("NP")
class ConstituentTreeTest(unittest.TestCase):
    def test_something(self):
        tree = self.tree
        print("rooted", tree.root)
        tree.add_to_root("VP1")
        print("rooted", tree.root)

        print(tree)

        print("sent label", tree.sent_label())

        print("leaves", tree.leaves())

        print("is leaf (leaves)",
              [(x, tree.is_leaf(x)) for (x, _, _) in tree.leaves()])
        print("is leaf (internal)", [(x, tree.is_leaf(x)) for x in tree.ids()])
        print("leaf index",
              [(x, tree.leaf_index(x)) for x in ["f1", "f2", "f3"]])

        print("pos yield", tree.pos_yield())
        print("word yield", tree.word_yield())

        # reentrant
        # parent

        print("ids", tree.ids())

        # reorder
        print("n nodes", tree.n_nodes())
        print("n gaps", tree.n_gaps())

        print("fringe VP", tree.fringe("VP"))
        print("fringe V", tree.fringe("V"))

        print("empty fringe", tree.empty_fringe())

        print("complete?", tree.complete())

        print("max n spans", tree.max_n_spans())

        print("unlabelled structure", tree.unlabelled_structure())

        print("labelled spans", tree.labelled_spans())

    def test_induction(self):
        naming = 'child'

        def rec_part(tree):
            return left_branching_partitioning(len(tree.id_yield()))
            # return fanout_k_left_to_right(tree, 1)

        tree = self.tree
        tree.add_to_root("VP1")

        feature_log1 = defaultdict(lambda: 0)

        grammar = fringe_extract_lcfrs(tree,
                                       rec_part(tree),
                                       feature_logging=feature_log1,
                                       naming=naming)

        for key in feature_log1:
            print(key, feature_log1[key])

        print(grammar)

        feats = defaultdict(lambda: 0)
        grammar_ = fringe_extract_lcfrs(tree,
                                        rec_part(tree),
                                        isolate_pos=True,
                                        feature_logging=feats,
                                        naming=naming)

        print(grammar_)

        for key in feats:
            print(key, feats[key])

        print("Adding 2nd grammar to first")

        grammar.add_gram(grammar_, feature_logging=(feature_log1, feats))
        for idx in range(0, len(grammar.rules())):
            print(idx, grammar.rule_index(idx))

        print("Adding 3rd grammar to first")
        feats3 = defaultdict(lambda: 0)
        grammar3 = fringe_extract_lcfrs(self.tree2,
                                        rec_part(self.tree2),
                                        isolate_pos=True,
                                        feature_logging=feats3,
                                        naming=naming)
        grammar.add_gram(grammar3, feature_logging=(feature_log1, feats3))

        print()
        for idx in range(0, len(grammar.rules())):
            print(idx, grammar.rule_index(idx))
        print()
        print("New feature log")
        print()
        for key in feature_log1:
            print(key, feature_log1[key])
        grammar.make_proper()

        build_nont_splits_dict(grammar,
                               feature_log1,
                               nonterminals=Enumerator())

        print(grammar.rule_index(0))
        print(grammar.rule_index(2))

    def test_markovized_induction(self):
        naming = 'strict-markov-v-2-h-0'

        def rec_part(tree):
            return left_branching_partitioning(len(tree.id_yield()))
            # return fanout_k_left_to_right(tree, 1)

        tree = self.tree
        tree.add_to_root("VP1")

        print(tree)

        grammar = fringe_extract_lcfrs(tree,
                                       rec_part(tree),
                                       naming=naming,
                                       isolate_pos=True)
        print(grammar)

    def test_induction_2(self):
        def rec_part(tree):
            return left_branching_partitioning(len(tree.id_yield()))

        features = defaultdict(lambda: 0)
        grammar = fringe_extract_lcfrs(self.tree3,
                                       rec_part(self.tree3),
                                       naming="child",
                                       feature_logging=features,
                                       isolate_pos=True)
        grammar.make_proper()

        if False:  # flip to True to dump the per-rule feature log
            for idx in range(0, len(grammar.rules())):
                print(grammar.rule_index(idx))
                for key in features:
                    if key[0] == idx:
                        print(key, features[key])
                print()
            for key in features:
                if isinstance(key[0], int):
                    continue
                print(key, features[key])

        nont_splits, root_weights, rule_weights, _ = build_nont_splits_dict(
            grammar,
            features,
            nonterminals=Enumerator(),
            feat_function=pos_cat_feats,
            debug=True)
        print(nont_splits)
        print(root_weights)
        print(rule_weights)

    def setUp(self):
        tree = ConstituentTree("s1")
        tree.add_leaf("f1",
                      "VAFIN",
                      "hat",
                      morph=[("number", "Sg"), ("person", "3"),
                             ("tense", "Past"), ("mood", "Ind")])
        tree.add_leaf("f2", "ADV", "schnell", morph=[("degree", "Pos")])
        tree.add_leaf("f3", "VVPP", "gearbeitet")
        tree.add_punct("f4", "PUNC", ".")

        tree.add_child("VP2", "f1")
        tree.add_child("VP2", "f3")
        tree.add_child("ADVP", "f2")

        tree.add_child("VP1", "VP2")
        tree.add_child("VP1", "ADVP")

        tree.set_label("VP2", "VP")
        tree.set_label("VP1", "VP")
        tree.set_label("ADVP", "ADVP")

        self.tree = tree

        tree2 = ConstituentTree("s2")
        tree2.add_leaf("f1",
                       "VAFIN",
                       "haben",
                       morph=[("number", "Pl"), ("person", "3"),
                              ("tense", "Past"), ("mood", "Ind")])
        tree2.add_leaf("f2", "ADV", "gut", morph=[("degree", "Pos")])
        tree2.add_leaf("f3", "VVPP", "gekocht")
        tree2.add_punct("f4", "PUNC", ".")

        tree2.add_child("VP2", "f1")
        tree2.add_child("VP2", "f3")
        tree2.add_child("ADVP", "f2")

        tree2.add_child("VP1", "VP2")
        tree2.add_child("VP1", "ADVP")

        tree2.set_label("VP2", "VP")
        tree2.set_label("VP1", "VP")
        tree2.set_label("ADVP", "ADVP")
        tree2.add_to_root("VP1")
        self.tree2 = tree2

        self.tree3 = ConstituentTree("s3")
        self.tree3.add_leaf("f1",
                            "ADJA",
                            "Allgemeiner",
                            edge="NK",
                            morph=[("number", "Sg")])
        self.tree3.add_leaf("f2",
                            "ADJA",
                            "Deutscher",
                            edge="NK",
                            morph=[("degree", "Pos"), ("number", "Sg")])
        self.tree3.add_leaf("f3",
                            "NN",
                            "Fahrrad",
                            edge="NK",
                            morph=[("number", "Sg"), ("gender", "Neut")])
        self.tree3.add_leaf("f4",
                            "NN",
                            "Club",
                            edge="NK",
                            morph=[("number", "Sg"), ("gender", "Neut")])
        for i in range(1, 5):
            self.tree3.add_child("NP", "f" + str(i))
        self.tree3.set_label("NP", "NP")
        self.tree3.add_to_root("NP")
Example #9
def flat_dummy_constituent_tree(token_yield,
                                full_token_yield,
                                dummy_label,
                                dummy_root,
                                label=None,
                                gold_pos=True):
    """
    :param token_yield: connected yield of a parse tree
    :type token_yield: list[ConstituentTerminal]
    :param full_token_yield: full yield of the parse tree
    :type full_token_yield: list[ConstituentTerminal]
    :return: dummy constituent tree
    :rtype: ConstituentTree
    generates a flat dummy tree for a given yield where all nodes are attached under the root
    """
    tree = ConstituentTree(label)

    # generate root node
    root_id = 'n_root'
    tree.add_node(root_id, ConstituentCategory(dummy_root))
    tree.add_to_root(root_id)

    parent = root_id

    # create all leaves and punctuation
    for token in full_token_yield:
        pos = token.pos() if gold_pos else '--'
        if token not in token_yield:
            tree.add_punct(full_token_yield.index(token), pos, token.form())
        else:
            idx = full_token_yield.index(token)
            tree.add_leaf(idx,
                          pos,
                          token.form(),
                          morph=token.morph_feats(),
                          lemma=token.lemma())

            tree.add_child(parent, idx)

    return tree
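
A hedged usage sketch; the forms are made up, and construct_constituent_token is assumed to yield ConstituentTerminal-like tokens as in the tests above:

    tokens = [construct_constituent_token(w, '--', True)
              for w in ["hat", "schnell", "gearbeitet"]]
    # passing the same list twice means no token is treated as punctuation
    flat = flat_dummy_constituent_tree(tokens, tokens, "NP", "S", label="s1")
    # every leaf hangs directly under the 'S' root node 'n_root'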
Example #10
def dummy_constituent_tree(token_yield,
                           full_token_yield,
                           dummy_label,
                           dummy_root,
                           label=None):
    """
    :param token_yield: connected yield of a parse tree
    :type token_yield: list[ConstituentTerminal]
    :param full_token_yield: full yield of the parse tree
    :type full_token_yield: list[ConstituentTerminal]
    :return: dummy constituent tree
    :rtype: ConstituentTree
    generates a dummy tree for a given yield using dummy_label as inner node symbol
    """
    tree = ConstituentTree(label)

    # create all leaves and punctuation
    for token in full_token_yield:
        if token not in token_yield:
            tree.add_punct(full_token_yield.index(token), token.pos(),
                           token.form())
        else:
            tree.add_leaf(full_token_yield.index(token), token.pos(),
                          token.form())

    # generate root node
    root_id = 'n0'
    tree.add_node(root_id, ConstituentCategory(dummy_root))
    tree.add_to_root(root_id)

    parent = root_id

    if len(token_yield) > 1:
        i = 1
        # generate inner nodes of branching tree
        for token in token_yield[:-2]:
            node = ConstituentCategory(str(dummy_label))
            tree.add_node('n' + str(i), node)
            tree.add_child(parent, 'n' + str(i))
            tree.add_child(parent, full_token_yield.index(token))
            parent = 'n' + str(i)
            i += 1

        # attach the final two tokens to the deepest inner node
        tree.add_child(parent, full_token_yield.index(token_yield[-2]))
        tree.add_child(parent, full_token_yield.index(token_yield[-1]))
    elif len(token_yield) == 1:
        tree.add_child(parent, full_token_yield.index(token_yield[0]))

    return tree
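
And a matching sketch for the branching variant, under the same assumptions:

    tokens = [construct_constituent_token(w, '--', True)
              for w in ["John", "hit", "the", "Ball"]]
    dummy = dummy_constituent_tree(tokens, tokens, "NP", "S", label="s2")
    # builds a chain of 'NP' inner nodes below the 'S' root, one token per
    # level, with the final two tokens attached to the deepest inner node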
def main():
    # train_path = '../res/SPMRL_SHARED_2014_NO_ARABIC/GERMAN_SPMRL/gold/xml/train/train.German.gold.xml'
    # corpus = sentence_names_to_hybridtrees(["s" + str(i) for i in range(1, 10)], file_name=train_path, hold=False)

    train_path = '../res/SPMRL_SHARED_2014_NO_ARABIC/GERMAN_SPMRL/gold/xml/dev/dev.German.gold.xml'
    names = ["s" + str(i) for i in range(40675, 40700)]
    names = ['s40564']
    corpus = sentence_names_to_hybridtrees(names, path=train_path, hold=False)

    cp = TreeComparator()

    tree_sys = ConstituentTree()
    tree_sys.add_node('0', ConstituentCategory('PN'))
    tree_sys.add_node('1', corpus[0].token_yield()[0], order=True)
    tree_sys.add_punct("3", '$.', '.')
    tree_sys.add_to_root('0')
    tree_sys.add_child('0', '1')

    param = build_param()

    for i, hybridtree in enumerate(corpus):
        print(i)

        # discotree = convert_tree(hybridtree)
        tree, sent = convert_tree(hybridtree)
        tree2, sent2 = convert_tree(tree_sys)


        # print(discotree)

        # print(discotree.draw())

        # print(DrawTree(discotree, discotree.sent))
        print(DrawTree(tree, sent))

        print(' '.join(tok.form() for tok in hybridtree.full_token_yield()))

        print(DrawTree(tree2, sent2))

        print(tree[::-1])

        print('POS', tree.pos())

        result = TreePairResult(i, tree, sent, tree2, sent2, param)
        print(result.scores())

        print("Comparator: ", cp.compare_hybridtrees(hybridtree, hybridtree))
Example #12
def build_score_validator(baseline_grammar, grammarInfo, nont_map,
                          storageManager, term_labelling, parser,
                          corpus_validation, validationMethod):
    validator = PyCandidateScoreValidator(grammarInfo, storageManager,
                                          validationMethod)

    # parser = GFParser(baseline_grammar)
    tree_count = 0
    der_count = 0
    for gold_tree in corpus_validation:
        tree_count += 1
        parser.set_input(
            term_labelling.prepare_parser_input(gold_tree.token_yield()))
        parser.parse()
        derivations = [der for _, der in parser.k_best_derivation_trees()]
        manager = PyDerivationManager(baseline_grammar, nont_map)
        manager.convert_hypergraphs(derivations)
        scores = []

        relevant = set([tuple(t) for t in gold_tree.labelled_spans()])

        for der in derivations:
            der_count += 1

            h_tree = ConstituentTree()
            cleaned_tokens = copy.deepcopy(gold_tree.full_token_yield())
            dcp = DCP_evaluator(der).getEvaluation()
            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                              construct_constituent_token)

            retrieved = set([tuple(t) for t in h_tree.labelled_spans()])
            inters = retrieved & relevant

            # in case of parse failure there are two options here:
            #   - parse failure -> no spans at all, thus precision = 1
            #   - parse failure -> a dummy tree with all spans wrong, thus precision = 0

            precision = 1.0 * len(inters) / len(retrieved) \
                if len(retrieved) > 0 else 0
            recall = 1.0 * len(inters) / len(relevant) \
                if len(relevant) > 0 else 0
            fmeasure = 2.0 * precision * recall / (precision + recall) \
                if precision + recall > 0 else 0

            if validationMethod == "F1":
                scores.append(fmeasure)
            elif validationMethod == "Precision":
                scores.append(precision)
            elif validationMethod == "Recall":
                scores.append(recall)
            else:
                raise ValueError("unknown validation method: " + validationMethod)

        validator.add_scored_candidates(manager, scores,
                                        1.0 if len(relevant) > 0 else 0.0)
        # print(tree_count, scores)
        parser.clear()

    print("trees used for validation ", tree_count, "with",
          der_count * 1.0 / tree_count, "derivations on average")

    return validator
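
The scoring core above is plain labelled-span F1. A self-contained sketch of just that computation; the (label, start, end) tuples are a stand-in for what labelled_spans() returns:

    def span_f1(retrieved, relevant):
        inters = set(retrieved) & set(relevant)
        precision = len(inters) / len(retrieved) if retrieved else 0.0
        recall = len(inters) / len(relevant) if relevant else 0.0
        return (2 * precision * recall / (precision + recall)
                if precision + recall > 0 else 0.0)

    # span_f1({("S", 0, 4), ("VP", 1, 4)}, {("S", 0, 4), ("NP", 2, 4)}) == 0.5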
Example #13
def do_parsing(parser, corpus):
    accuracy = ParseAccuracyPenalizeFailures()
    system_trees = []

    start_at = time.time()

    n = 0

    for tree in corpus:

        if not tree.complete() \
                or tree.empty_fringe() \
                or not 0 < len(tree.word_yield()) <= max_length:
            continue

        parser.set_input(
            terminal_labeling.prepare_parser_input(tree.token_yield()))
        parser.parse()
        if not parser.recognized():
            relevant = tree.labelled_spans()
            accuracy.add_failure(relevant)

            system_trees.append(
                dummy_constituent_tree(tree.token_yield(),
                                       tree.full_token_yield(), "NP", "S"))
            # print('failure', tree.sent_label()) # for testing
        else:
            n += 1
            dcp_tree = ConstituentTree()
            punctuation_positions = [
                i + 1 for i, idx in enumerate(tree.full_yield())
                if idx not in tree.id_yield()
            ]
            dcp_tree = parser.dcp_hybrid_tree_best_derivation(
                dcp_tree,
                tree.full_token_yield(),
                False,
                construct_constituent_token,
                punctuation_positions=punctuation_positions)

            retrieved = dcp_tree.labelled_spans()
            relevant = tree.labelled_spans()
            accuracy.add_accuracy(retrieved, relevant)

            system_trees.append(dcp_tree)

        parser.clear()

    end_at = time.time()

    print('Parsed:', n)
    if accuracy.n() > 0:
        print('Recall:', accuracy.recall())
        print('Precision:', accuracy.precision())
        print('F-measure:', accuracy.fmeasure())
        print('Parse failures:', accuracy.n_failures())
    else:
        print('No successful parsing')
    print('time:', end_at - start_at)
    print('')

    name = parse_results
    # do not overwrite existing result files
    i = 1
    while os.path.isfile(
            os.path.join(parse_results_prefix, name + parse_results_suffix)):
        i += 1
        name = parse_results + '_' + str(i)

    path = os.path.join(parse_results_prefix, name + parse_results_suffix)
    #
    # with open(path, 'w') as result_file:
    #     print('Exporting parse trees of length <=', max_length, 'to', str(path))
    #     map(lambda x: x.strip_vroot(), system_trees)
    #     result_file.writelines(hybridtrees_to_sentence_names(system_trees, test_start, max_length))

    return accuracy
    def test_stanford_unking_scheme(self):
        naming = 'child'

        def rec_part(tree):
            return left_branching_partitioning(len(tree.id_yield()))

        tree = self.tree
        tree.add_to_root("VP1")

        print(tree)

        terminal_labeling = StanfordUNKing([tree])

        grammar = fringe_extract_lcfrs(tree,
                                       rec_part(tree),
                                       naming=naming,
                                       isolate_pos=True,
                                       term_labeling=terminal_labeling)
        print(grammar)

        parser = LCFRS_parser(grammar)
        parser.set_input([token.form() for token in tree.token_yield()])
        parser.parse()
        self.assertTrue(parser.recognized())
        derivation = parser.best_derivation_tree()
        e = DCP_evaluator(derivation)
        dcp_term = e.getEvaluation()
        print(str(dcp_term[0]))
        t = ConstituentTree()
        dcp_to_hybridtree(
            t,
            dcp_term, [
                construct_constituent_token(token.form(), '--', True)
                for token in tree.token_yield()
            ],
            ignore_punctuation=False,
            construct_token=construct_constituent_token)
        print(t)
        self.assertEqual(len(tree.token_yield()), len(t.token_yield()))
        for tok1, tok2 in zip(tree.token_yield(), t.token_yield()):
            self.assertEqual(tok1.form(), tok2.form())
            self.assertEqual(tok1.pos(), tok2.pos())

        rules = terminal_labeling.create_smoothed_rules()
        print(rules)

        new_rules = {}

        for rule in grammar.rules():
            if not rule.rhs():
                assert len(rule.dcp()) == 1
                dcp = rule.dcp()[0]
                assert len(dcp.rhs()) == 1
                term = dcp.rhs()[0]
                head = term.head()
                pos = head.pos()

                for tag, form in rules:
                    if tag == pos:
                        lhs = LCFRS_lhs(rule.lhs().nont())
                        lhs.add_arg([form])
                        new_rules[lhs, dcp] = rules[tag, form]

        for lhs, dcp in new_rules:
            print(str(lhs), str(dcp), new_rules[(lhs, dcp)])

        tokens = [
            construct_constituent_token('hat', '--', True),
            construct_constituent_token('HAT', '--', True)
        ]
        self.assertEqual(terminal_labeling.token_label(tokens[0]), 'hat')
        self.assertEqual(terminal_labeling.token_label(tokens[1]), '_UNK')
        terminal_labeling.test_mode = True
        self.assertEqual(terminal_labeling.token_label(tokens[0]), 'hat')
        self.assertEqual(terminal_labeling.token_label(tokens[1]), 'hat')
class ConstituentTreeTest(unittest.TestCase):
    def test_basic_tree_methods(self):
        tree = self.tree
        print("rooted", tree.root)
        tree.add_to_root("VP1")
        print("rooted", tree.root)

        print(tree)

        print("sent label", tree.sent_label())

        print("leaves", tree.leaves())

        print("is leaf (leaves)",
              [(x, tree.is_leaf(x)) for (x, _, _) in tree.leaves()])
        print("is leaf (internal)", [(x, tree.is_leaf(x)) for x in tree.ids()])
        print("leaf index",
              [(x, tree.leaf_index(x)) for x in ["f1", "f2", "f3"]])

        print("pos yield", tree.pos_yield())
        print("word yield", tree.word_yield())

        # reentrant
        # parent

        print("ids", tree.ids())

        # reorder
        print("n nodes", tree.n_nodes())
        print("n gaps", tree.n_gaps())

        print("fringe VP", tree.fringe("VP"))
        print("fringe V", tree.fringe("V"))

        print("empty fringe", tree.empty_fringe())

        print("complete?", tree.complete())

        print("max n spans", tree.max_n_spans())

        print("unlabelled structure", tree.unlabelled_structure())

        print("labelled spans", tree.labelled_spans())

    def test_induction(self):
        naming = 'child'

        def rec_part(tree):
            return left_branching_partitioning(len(tree.id_yield()))
            # return fanout_k_left_to_right(tree, 1)

        tree = self.tree
        tree.add_to_root("VP1")

        feature_log1 = defaultdict(lambda: 0)

        grammar = fringe_extract_lcfrs(tree,
                                       rec_part(tree),
                                       feature_logging=feature_log1,
                                       naming=naming)

        for key in feature_log1:
            print(key, feature_log1[key])

        print(grammar)

        feats = defaultdict(lambda: 0)
        grammar_ = fringe_extract_lcfrs(tree,
                                        rec_part(tree),
                                        isolate_pos=True,
                                        feature_logging=feats,
                                        naming=naming)

        print(grammar_)

        for key in feats:
            print(key, feats[key])

        print("Adding 2nd grammar to first")

        grammar.add_gram(grammar_, feature_logging=(feature_log1, feats))
        for idx in range(0, len(grammar.rules())):
            print(idx, grammar.rule_index(idx))

        print("Adding 3rd grammar to first")
        feats3 = defaultdict(lambda: 0)
        grammar3 = fringe_extract_lcfrs(self.tree2,
                                        rec_part(self.tree2),
                                        isolate_pos=True,
                                        feature_logging=feats3,
                                        naming=naming)
        grammar.add_gram(grammar3, feature_logging=(feature_log1, feats3))

        print()
        for idx in range(0, len(grammar.rules())):
            print(idx, grammar.rule_index(idx))
        print()
        print("New feature log")
        print()
        for key in feature_log1:
            print(key, feature_log1[key])
        grammar.make_proper()

        build_nont_splits_dict(grammar,
                               feature_log1,
                               nonterminals=Enumerator())

        print(grammar.rule_index(0))
        print(grammar.rule_index(2))

    def test_markovized_induction(self):
        naming = 'strict-markov-v-2-h-0'

        def rec_part(tree):
            return left_branching_partitioning(len(tree.id_yield()))
            # return fanout_k_left_to_right(tree, 1)

        tree = self.tree
        tree.add_to_root("VP1")

        print(tree)

        grammar = fringe_extract_lcfrs(tree,
                                       rec_part(tree),
                                       naming=naming,
                                       isolate_pos=True)
        print(grammar)

    def test_induction_and_parsing_with_pos_recovery(self):
        naming = 'child'

        def rec_part(tree):
            return left_branching_partitioning(len(tree.id_yield()))

        tree = self.tree
        tree.add_to_root("VP1")

        print(tree)

        grammar = fringe_extract_lcfrs(tree,
                                       rec_part(tree),
                                       naming=naming,
                                       isolate_pos=True,
                                       term_labeling=FormTerminals())
        print(grammar)

        parser = LCFRS_parser(grammar)
        parser.set_input([token.form() for token in tree.token_yield()])
        parser.parse()
        self.assertTrue(parser.recognized())
        derivation = parser.best_derivation_tree()
        e = DCP_evaluator(derivation)
        dcp_term = e.getEvaluation()
        print(str(dcp_term[0]))
        t = ConstituentTree()
        dcp_to_hybridtree(
            t,
            dcp_term, [
                construct_constituent_token(token.form(), '--', True)
                for token in tree.token_yield()
            ],
            ignore_punctuation=False,
            construct_token=construct_constituent_token)
        print(t)
        self.assertEqual(len(tree.token_yield()), len(t.token_yield()))
        for tok1, tok2 in zip(tree.token_yield(), t.token_yield()):
            self.assertEqual(tok1.form(), tok2.form())
            self.assertEqual(tok1.pos(), tok2.pos())

    def test_stanford_unking_scheme(self):
        naming = 'child'

        def rec_part(tree):
            return left_branching_partitioning(len(tree.id_yield()))

        tree = self.tree
        tree.add_to_root("VP1")

        print(tree)

        terminal_labeling = StanfordUNKing([tree])

        grammar = fringe_extract_lcfrs(tree,
                                       rec_part(tree),
                                       naming=naming,
                                       isolate_pos=True,
                                       term_labeling=terminal_labeling)
        print(grammar)

        parser = LCFRS_parser(grammar)
        parser.set_input([token.form() for token in tree.token_yield()])
        parser.parse()
        self.assertTrue(parser.recognized())
        derivation = parser.best_derivation_tree()
        e = DCP_evaluator(derivation)
        dcp_term = e.getEvaluation()
        print(str(dcp_term[0]))
        t = ConstituentTree()
        dcp_to_hybridtree(
            t,
            dcp_term, [
                construct_constituent_token(token.form(), '--', True)
                for token in tree.token_yield()
            ],
            ignore_punctuation=False,
            construct_token=construct_constituent_token)
        print(t)
        self.assertEqual(len(tree.token_yield()), len(t.token_yield()))
        for tok1, tok2 in zip(tree.token_yield(), t.token_yield()):
            self.assertEqual(tok1.form(), tok2.form())
            self.assertEqual(tok1.pos(), tok2.pos())

        rules = terminal_labeling.create_smoothed_rules()
        print(rules)

        new_rules = {}

        for rule in grammar.rules():
            if not rule.rhs():
                assert len(rule.dcp()) == 1
                dcp = rule.dcp()[0]
                assert len(dcp.rhs()) == 1
                term = dcp.rhs()[0]
                head = term.head()
                pos = head.pos()

                for tag, form in rules:
                    if tag == pos:
                        lhs = LCFRS_lhs(rule.lhs().nont())
                        lhs.add_arg([form])
                        new_rules[lhs, dcp] = rules[tag, form]

        for lhs, dcp in new_rules:
            print(str(lhs), str(dcp), new_rules[(lhs, dcp)])

        tokens = [
            construct_constituent_token('hat', '--', True),
            construct_constituent_token('HAT', '--', True)
        ]
        self.assertEqual(terminal_labeling.token_label(tokens[0]), 'hat')
        self.assertEqual(terminal_labeling.token_label(tokens[1]), '_UNK')
        terminal_labeling.test_mode = True
        self.assertEqual(terminal_labeling.token_label(tokens[0]), 'hat')
        self.assertEqual(terminal_labeling.token_label(tokens[1]), 'hat')

    def test_induction_with_spans(self):
        naming = 'child-spans'

        def rec_part(tree):
            return left_branching_partitioning(len(tree.id_yield()))
            # return fanout_k_left_to_right(tree, 1)

        tree = self.tree
        tree.add_to_root("VP1")

        print(tree)

        grammar = fringe_extract_lcfrs(tree,
                                       rec_part(tree),
                                       naming=naming,
                                       isolate_pos=True)
        print(grammar)

    def test_induction_2(self):
        def rec_part(tree):
            return left_branching_partitioning(len(tree.id_yield()))

        features = defaultdict(lambda: 0)
        grammar = fringe_extract_lcfrs(self.tree3,
                                       rec_part(self.tree3),
                                       naming="child",
                                       feature_logging=features,
                                       isolate_pos=True)
        grammar.make_proper()

        if False:  # flip to True to dump the per-rule feature log
            for idx in range(0, len(grammar.rules())):
                print(grammar.rule_index(idx))
                for key in features:
                    if key[0] == idx:
                        print(key, features[key])
                print()
            for key in features:
                if isinstance(key[0], int):
                    continue
                print(key, features[key])

        nont_splits, root_weights, rule_weights, _ = build_nont_splits_dict(
            grammar,
            features,
            nonterminals=Enumerator(),
            feat_function=pos_cat_feats,
            debug=True)
        print(nont_splits)
        print(root_weights)
        print(rule_weights)

    def setUp(self):
        tree = ConstituentTree("s1")
        tree.add_leaf("f1",
                      "VAFIN",
                      "hat",
                      morph=[("number", "Sg"), ("person", "3"),
                             ("tense", "Past"), ("mood", "Ind")])
        tree.add_leaf("f2", "ADV", "schnell", morph=[("degree", "Pos")])
        tree.add_leaf("f3", "VVPP", "gearbeitet")
        tree.add_punct("f4", "PUNC", ".")

        tree.add_child("VP2", "f1")
        tree.add_child("VP2", "f3")
        tree.add_child("ADVP", "f2")

        tree.add_child("VP1", "VP2")
        tree.add_child("VP1", "ADVP")

        tree.set_label("VP2", "VP")
        tree.set_label("VP1", "VP")
        tree.set_label("ADVP", "ADVP")

        self.tree = tree

        tree2 = ConstituentTree("s2")
        tree2.add_leaf("f1",
                       "VAFIN",
                       "haben",
                       morph=[("number", "Pl"), ("person", "3"),
                              ("tense", "Past"), ("mood", "Ind")])
        tree2.add_leaf("f2", "ADV", "gut", morph=[("degree", "Pos")])
        tree2.add_leaf("f3", "VVPP", "gekocht")
        tree2.add_punct("f4", "PUNC", ".")

        tree2.add_child("VP2", "f1")
        tree2.add_child("VP2", "f3")
        tree2.add_child("ADVP", "f2")

        tree2.add_child("VP1", "VP2")
        tree2.add_child("VP1", "ADVP")

        tree2.set_label("VP2", "VP")
        tree2.set_label("VP1", "VP")
        tree2.set_label("ADVP", "ADVP")
        tree2.add_to_root("VP1")
        self.tree2 = tree2

        self.tree3 = ConstituentTree("s3")
        self.tree3.add_leaf("f1",
                            "ADJA",
                            "Allgemeiner",
                            edge="NK",
                            morph=[("number", "Sg")])
        self.tree3.add_leaf("f2",
                            "ADJA",
                            "Deutscher",
                            edge="NK",
                            morph=[("degree", "Pos"), ("number", "Sg")])
        self.tree3.add_leaf("f3",
                            "NN",
                            "Fahrrad",
                            edge="NK",
                            morph=[("number", "Sg"), ("gender", "Neut")])
        self.tree3.add_leaf("f4",
                            "NN",
                            "Club",
                            edge="NK",
                            morph=[("number", "Sg"), ("gender", "Neut")])
        for i in range(1, 5):
            self.tree3.add_child("NP", "f" + str(i))
        self.tree3.set_label("NP", "NP")
        self.tree3.add_to_root("NP")
Example #16
def sentence_names_to_hybridtrees(names,
                                  path,
                                  enc="utf-8",
                                  disconnect_punctuation=True,
                                  add_vroot=False,
                                  mode="STANDARD",
                                  secedge=False):
    """
    :param names:  list of sentence identifiers
    :type names: list[str]
    :param path: path to corpus
    :type path: str
    :param enc: file encoding
    :type enc: str
    :param disconnect_punctuation: if True, punctuation is kept disconnected from the tree structure
    :type disconnect_punctuation: bool
    :param add_vroot: adds a virtual root node labelled 'VROOT'
    :type add_vroot: bool
    :param mode: either 'STANDARD' (no lemma field) or 'DISCO-DOP' (lemma field)
    :type mode: str
    :param secedge: add secondary edges
    :type secedge: bool
    :return: list of constituent structures (HybridTrees or HybridDags) from the file at path whose names are in names
    """
    negra = codecs.open(expanduser(path), encoding=enc)
    trees = []
    tree = None
    name = ''
    n_leaves = 0
    for line in negra:
        match_mode = DISCODOP_HEADER.match(line)
        if match_mode:
            mode = "DISCO-DOP"
            continue
        match_sent_start = BOS.search(line)
        match_sent_end = EOS.match(line)
        if mode == "STANDARD":
            match_nont = \
                STANDARD_NONTERMINAL.match(line)
            match_term = \
                STANDARD_TERMINAL.match(line)
        elif mode == "DISCO-DOP":
            match_nont = DISCODOP_NONTERMINAL.match(line)
            match_term = DISCODOP_TERMINAL.match(line)
        if match_sent_start:
            this_name = match_sent_start.group(1)
            if this_name in names:
                name = this_name
                if secedge:
                    tree = HybridDag(name)
                else:
                    tree = ConstituentTree(name)
                n_leaves = 0
                if add_vroot:
                    tree.set_label('0', 'VROOT')
                    tree.add_to_root('0')
        elif match_sent_end:
            this_name = match_sent_end.group(1)
            if name == this_name:
                tree.reorder()
                trees += [tree]
                tree = None
        elif tree:
            if match_nont:
                node_id = match_nont.group(1)
                if mode == "STANDARD":
                    OFFSET = 0
                else:
                    OFFSET = 1
                nont = match_nont.group(2 + OFFSET)
                edge = match_nont.group(4 + OFFSET)
                parent = match_nont.group(5 + OFFSET)
                secedges = [] if not secedge or match_nont.group(6 + OFFSET) is None else \
                    match_nont.group(6 + OFFSET).split()

                tree.add_node(node_id, ConstituentCategory(nont), False, True)

                tree.node_token(node_id).set_edge_label(edge)
                if parent == '0' and not add_vroot:
                    tree.add_to_root(node_id)
                else:
                    tree.add_child(parent, node_id)
                if secedge and secedges:
                    # secedges is a flat list of alternating (label, parent-id) pairs
                    for sei in range(0, len(secedges), 2):
                        assert secedges[sei] == edge
                        sec_parent = secedges[sei + 1]
                        tree.add_sec_child(sec_parent, node_id)
            elif match_term:
                if mode == "STANDARD":
                    OFFSET = 0
                else:
                    OFFSET = 1

                word = match_term.group(1)
                pos = match_term.group(2 + OFFSET)
                edge = match_term.group(4 + OFFSET)
                parent = match_term.group(5 + OFFSET)
                secedges = [] if not secedge or match_term.group(6 + OFFSET) is None else \
                    match_term.group(6 + OFFSET).split()

                n_leaves += 1
                leaf_id = str(100 + n_leaves)
                if parent == '0' and disconnect_punctuation:
                    tree.add_punct(leaf_id, pos, word)
                else:
                    if parent == '0' and not add_vroot:
                        tree.add_to_root(leaf_id)
                    else:
                        tree.add_child(parent, leaf_id)

                    token = ConstituentTerminal(word, pos, edge, None, '--')
                    tree.add_node(leaf_id, token, True, True)

                    tree.node_token(leaf_id).set_edge_label(edge)
                    if secedge and secedges:
                        # secedges is a flat list of alternating (label, parent-id) pairs
                        for sei in range(0, len(secedges), 2):
                            assert secedges[sei] == edge
                            sec_parent = secedges[sei + 1]
                            tree.add_sec_child(sec_parent, leaf_id)
    negra.close()
    return trees
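
A hedged usage sketch for the reader above; the corpus path and sentence names are placeholders, as in the main() example earlier:

    corpus = sentence_names_to_hybridtrees(
        ["s" + str(i) for i in range(1, 10)],
        path="path/to/corpus.export",
        disconnect_punctuation=True,
        mode="STANDARD")
    for t in corpus:
        print(t.sent_label(), t.word_yield())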