Example #1
    def test_minimum_risk_parsing(self):
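        """Induce an LCFRS from a small TIGER training sample, parse each test
        sentence with the k-best GF parser (k=50), and compare the minimum-risk
        tree against the hybrid tree of the best derivation, printing both
        when they differ."""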
        limit_train = 20
        limit_test = 10
        train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
        test = train
        parser_type = GFParser_k_best
        # test = '../../res/dependency_conll/german/tiger/test/german_tiger_test.conll'
        trees = parse_conll_corpus(train, False, limit_train)
        primary_labelling = the_labeling_factory(
        ).create_simple_labeling_strategy("childtop", "deprel")
        term_labelling = the_terminal_labeling_factory().get_strategy('pos')
        start = 'START'
        recursive_partitioning = [cfg]

        (n_trees, grammar_prim) = induce_grammar(trees, primary_labelling,
                                                 term_labelling.token_label,
                                                 recursive_partitioning, start)

        parser_type.preprocess_grammar(grammar_prim)
        tree_yield = term_labelling.prepare_parser_input

        trees = parse_conll_corpus(test, False, limit_test)

        for i, tree in enumerate(trees):
            print("Parsing sentence ", i, file=stderr)

            # print >>stderr, tree

            parser = parser_type(grammar_prim,
                                 tree_yield(tree.token_yield()),
                                 k=50)

            self.assertTrue(parser.recognized())

            derivations = list(parser.k_best_derivation_trees())
            print("# derivations: ", len(derivations), file=stderr)
            h_trees = []
            weights = []
            derivation_list = []
            for weight, der in derivations:

                self.assertNotIn(der, derivation_list)

                derivation_list.append(der)

                dcp = DCP_evaluator(der).getEvaluation()
                h_tree = HybridTree()
                cleaned_tokens = copy.deepcopy(tree.full_token_yield())
                dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                                  construct_conll_token)

                h_trees.append(h_tree)
                weights.append(weight)

            min_risk_tree = compute_minimum_risk_tree(h_trees, weights)
            if min_risk_tree != h_trees[0]:
                print(h_trees[0])
                print(min_risk_tree)
Example #2
    def test_basic_sdcp_parsing_constituency(self):
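        """Induce an LCFRS from two constituent trees and run the LCFRS/sDCP
        parser on a training tree, with and without POS tags, printing all
        derivations together with the reconstructed hybrid trees."""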
        tree1 = constituent_tree_1()
        tree2 = constituent_tree_2()
        tree3 = constituent_tree_1_pos_stripped()

        terminal_labeling = FormTerminals()
        fanout = 1

        grammar = LCFRS('START')
        for tree in [tree1, tree2]:
            tree_part = tree.unlabelled_structure()
            part = fanout_limited_partitioning(tree_part, fanout)
            tree_grammar = fringe_extract_lcfrs(tree, part, naming='child', term_labeling=terminal_labeling)
            grammar.add_gram(tree_grammar)
        grammar.make_proper()

        print("grammar induced. Printing rules...", file=stderr)

        for rule in grammar.rules():
            print(rule, file=stderr)

        parser_type = LCFRS_sDCP_Parser

        print("preprocessing grammar", file=stderr)

        parser_type.preprocess_grammar(grammar, terminal_labeling, debug=True)

        print("invoking parser", file=stderr)

        parser = parser_type(grammar, tree1)

        print("listing derivations", file=stderr)

        for der in parser.all_derivation_trees():
            print(der)
            output_tree = ConstituentTree(tree1.sent_label())
            tokens = [construct_constituent_token(token.form(), '--', True) for token in tree1.token_yield()]
            dcp_to_hybridtree(output_tree, DCP_evaluator(der).getEvaluation(), tokens, False,
                              construct_constituent_token)
            print(tree1)
            print(output_tree)

        parser = parser_type(grammar, tree3)
        print(parser.recognized())
        for der in parser.all_derivation_trees():
            print(der)
            output_tree = ConstituentTree(tree3.sent_label())
            tokens = [construct_constituent_token(token.form(), '--', True) for token in tree3.token_yield()]
            dcp_to_hybridtree(output_tree, DCP_evaluator(der).getEvaluation(), tokens, False,
                              construct_constituent_token)
            print(tree3)
            print(output_tree)

        print("completed test", file=stderr)
Example #3
    def test_grammar_export(self):
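        """Induce a dependency grammar, export it to GF format, compile it,
        and parse a POS string; the best derivation is checked for integrity
        and turned back into a hybrid tree via DCP evaluation."""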
        tree = hybrid_tree_1()
        tree2 = hybrid_tree_2()
        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

        _, grammar = induce_grammar(
            [tree, tree2],
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos'),
            # the_labeling_factory().create_simple_labeling_strategy('child', 'pos+deprel'),
            terminal_labeling.token_label,
            [direct_extraction],
            'START')
        print(max(grammar.fanout(nont) for nont in grammar.nonts()))
        print(grammar)

        prefix = '/tmp/'
        name = 'tmpGrammar'

        name_ = export(grammar, prefix, name)

        self.assertEqual(0, compile_gf_grammar(prefix, name_))

        GFParser.preprocess_grammar(grammar)

        string = ["NP", "N", "V", "V", "V"]

        parser = GFParser(grammar, string)

        self.assertTrue(parser.recognized())

        der = parser.best_derivation_tree()
        self.assertTrue(
            der.check_integrity_recursive(der.root_id(), grammar.start()))

        print(der)

        print(
            derivation_to_hybrid_tree(der, string,
                                      "Piet Marie helpen lezen leren".split(),
                                      construct_conll_token))

        dcp = DCP_evaluator(der).getEvaluation()

        h_tree_2 = HybridTree()
        token_sequence = [
            construct_conll_token(form, lemma)
            for form, lemma in zip('Piet Marie helpen lezen leren'.split(' '),
                                   'NP N V V V'.split(' '))
        ]
        dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                          construct_conll_token)

        print(h_tree_2)
Example #4
    def test_dcp_evaluation_with_induced_dependency_grammar(self):
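        """Induce a dependency grammar from two hybrid trees, parse a POS
        string, and evaluate the DCP component of each successful parse item
        to reconstruct hybrid trees."""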
        tree = hybrid_tree_1()

        print(tree)

        tree2 = hybrid_tree_2()

        print(tree2)
        # print tree.recursive_partitioning()

        labeling = the_labeling_factory().create_simple_labeling_strategy(
            'child', 'pos')
        term_pos = the_terminal_labeling_factory().get_strategy(
            'pos').token_label
        (_, grammar) = induce_grammar([tree, tree2], labeling, term_pos,
                                      [direct_extraction], 'START')

        # print grammar

        self.assertIsNone(grammar.well_formed())
        self.assertTrue(grammar.ordered()[0])
        # print max([grammar.fanout(nont) for nont in grammar.nonts()])
        print(grammar)

        parser = Parser(grammar, 'NP N V V'.split(' '))

        self.assertTrue(parser.recognized())

        for item in parser.successful_root_items():
            der = Derivation()
            derivation_tree(der, item, None)
            print(der)

            hybrid_tree = derivation_to_hybrid_tree(
                der, 'NP N V V'.split(' '),
                'Piet Marie helpen lezen'.split(' '),
                construct_constituent_token)
            print(hybrid_tree)

            dcp = DCP_evaluator(der).getEvaluation()
            h_tree_2 = HybridTree()
            token_sequence = [
                construct_conll_token(form, lemma)
                for form, lemma in zip('Piet Marie helpen lezen'.split(' '),
                                       'NP N V V'.split(' '))
            ]
            dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                              construct_conll_token)
Example #5
    def parsing_postprocess(self, sentence, derivation, label=None):
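        """Turn a derivation for a sentence into a ConstituentTree: evaluate
        its DCP component, re-attach punctuation positions, and strip the
        virtual root if configured."""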
        full_yield, id_yield, full_token_yield, token_yield = sentence

        dcp_tree = ConstituentTree(label)
        punctuation_positions = [i + 1 for i, idx in enumerate(full_yield)
                                 if idx not in id_yield]

        cleaned_tokens = copy.deepcopy(full_token_yield)
        dcp = DCP_evaluator(derivation).getEvaluation()
        dcp_to_hybridtree(dcp_tree, dcp, cleaned_tokens, False, construct_constituent_token,
                          punct_positions=punctuation_positions)

        if self.strip_vroot:
            dcp_tree.strip_vroot()

        return dcp_tree
Example #6
    def dcp_hybrid_tree_best_derivation(self,
                                        tree,
                                        tokens,
                                        ignore_punctuation,
                                        construct_token,
                                        punctuation_positions=None):
        """
        :param tree: tree to be filled with the evaluation result
        :type tree: GeneralHybridTree
        :param tokens: tokens of the sentence
        :type tokens: list[MonadicToken]
        :param ignore_punctuation:
        :type ignore_punctuation: bool
        :param construct_token: factory for the tokens of the output tree
        :param punctuation_positions: positions of punctuation in the sentence
        :return: The hybrid tree obtained through evaluation of the DCP component of the best parse, or None if there is no parse.
        :rtype: GeneralHybridTree
        """
        dcp_evaluation = self.dcp_best_derivation()
        if dcp_evaluation:
            return dcp_to_hybridtree(tree,
                                     dcp_evaluation,
                                     tokens,
                                     ignore_punctuation,
                                     construct_token,
                                     punct_positions=punctuation_positions)
        else:
            return None
Example #7
    def test_cfg_parser(self):
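        """Induce a grammar with CFG recursive partitioning and check that
        both LCFRS_parser and CFGParser recognize a POS string, produce a
        derivation of verified integrity, and reconstruct a hybrid tree via
        DCP evaluation."""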
        tree = hybrid_tree_1()
        tree2 = hybrid_tree_2()
        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

        (_, grammar) = induce_grammar(
            [tree, tree2],
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos'), terminal_labeling.token_label, [cfg], 'START')

        for parser_class in [LCFRS_parser, CFGParser]:

            parser_class.preprocess_grammar(grammar)

            string = ["NP", "N", "V", "V", "V"]

            parser = parser_class(grammar, string)

            self.assertTrue(parser.recognized())

            der = parser.best_derivation_tree()
            self.assertTrue(
                der.check_integrity_recursive(der.root_id(), grammar.start()))

            print(der)

            print(
                derivation_to_hybrid_tree(
                    der, string, "Piet Marie helpen lezen leren".split(),
                    construct_conll_token))

            dcp = DCP_evaluator(der).getEvaluation()

            h_tree_2 = HybridTree()
            token_sequence = [
                construct_conll_token(form, lemma) for form, lemma in zip(
                    'Piet Marie helpen lezen leren'.split(' '),
                    'NP N V V V'.split(' '))
            ]
            dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                              construct_conll_token)

            print(h_tree_2)
Example #8
    def test_induction_and_parsing_with_pos_recovery(self):
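        """Extract an LCFRS (with isolated POS rules) from a constituent tree
        using left-branching partitioning, parse the tree's own words, and
        check that forms and POS tags are recovered in the output tree."""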
        naming = 'child'

        def rec_part(tree):
            return left_branching_partitioning(len(tree.id_yield()))

        tree = self.tree
        tree.add_to_root("VP1")

        print(tree)

        grammar = fringe_extract_lcfrs(tree,
                                       rec_part(tree),
                                       naming=naming,
                                       isolate_pos=True,
                                       term_labeling=FormTerminals())
        print(grammar)

        parser = LCFRS_parser(grammar)
        parser.set_input([token.form() for token in tree.token_yield()])
        parser.parse()
        self.assertTrue(parser.recognized())
        derivation = parser.best_derivation_tree()
        e = DCP_evaluator(derivation)
        dcp_term = e.getEvaluation()
        print(str(dcp_term[0]))
        t = ConstituentTree()
        dcp_to_hybridtree(
            t,
            dcp_term, [
                construct_constituent_token(token.form(), '--', True)
                for token in tree.token_yield()
            ],
            ignore_punctuation=False,
            construct_token=construct_constituent_token)
        print(t)
        self.assertEqual(len(tree.token_yield()), len(t.token_yield()))
        for tok1, tok2 in zip(tree.token_yield(), t.token_yield()):
            self.assertEqual(tok1.form(), tok2.form())
            self.assertEqual(tok1.pos(), tok2.pos())
Example #9
    def test_basic_sdcp_parsing_dependency(self):
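        """Induce a dependency grammar with CFG recursive partitioning and run
        the LCFRS/sDCP parser on a training tree, printing every derivation
        together with its reconstructed hybrid tree."""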
        tree1 = hybrid_tree_1()
        tree2 = hybrid_tree_2()

        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

        (_, grammar) = induce_grammar(
            [tree1, tree2],
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos'), terminal_labeling.token_label, [cfg], 'START')

        print("grammar induced. Printing rules...", file=stderr)

        for rule in grammar.rules():
            print(rule, file=stderr)

        parser_type = LCFRS_sDCP_Parser

        print("preprocessing grammar", file=stderr)

        parser_type.preprocess_grammar(grammar, terminal_labeling)

        print("invoking parser", file=stderr)

        parser = parser_type(grammar, tree1)

        print("listing derivations", file=stderr)

        for der in parser.all_derivation_trees():
            print(der)
            output_tree = HybridTree()
            tokens = tree1.token_yield()
            dcp_to_hybridtree(output_tree,
                              DCP_evaluator(der).getEvaluation(), tokens,
                              False, construct_conll_token)
            print(tree1)
            print(output_tree)

        print("completed test", file=stderr)
Example #10
    def generic_parsing_test(self, parser_type, limit_train, limit_test,
                             compare_order):
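        """Induce a grammar from a limited TIGER sample and re-parse the
        training trees with the given parser type, checking that every
        derivation is well-formed, that the reconstructed trees match the
        gold trees, and that the derivations are pairwise different."""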
        def filter_by_id(n, trees):
            for j, tree in enumerate(trees):
                if j in n:
                    yield tree

        # params
        train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
        test = train
        # test = 'res/dependency_conll/german/tiger/test/german_tiger_test.conll'
        trees = parse_conll_corpus(train, False, limit_train)
        primary_labelling = the_labeling_factory(
        ).create_simple_labeling_strategy("childtop", "deprel")
        term_labelling = the_terminal_labeling_factory().get_strategy('pos')
        start = 'START'
        recursive_partitioning = [cfg]

        (n_trees, grammar_prim) = induce_grammar(trees, primary_labelling,
                                                 term_labelling.token_label,
                                                 recursive_partitioning, start)

        parser_type.preprocess_grammar(grammar_prim, term_labelling)

        trees = parse_conll_corpus(test, False, limit_test)

        count_derivs = {}
        no_complete_match = 0

        for i, tree in enumerate(trees):
            print("Parsing tree for ", i, file=stderr)

            print(tree, file=stderr)

            parser = parser_type(grammar_prim, tree)
            self.assertTrue(parser.recognized())
            count_derivs[i] = 0

            print("Found derivations for ", i, file=stderr)
            j = 0

            derivations = []

            for der in parser.all_derivation_trees():
                self.assertTrue(
                    der.check_integrity_recursive(der.root_id(), start))

                print(count_derivs[i], file=stderr)
                print(der, file=stderr)

                output_tree = HybridTree()
                tokens = tree.token_yield()

                the_yield = der.compute_yield()
                # print >>stderr, the_yield
                tokens2 = list(
                    map(lambda pos: construct_conll_token('_', pos),
                        the_yield))

                dcp_to_hybridtree(output_tree,
                                  DCP_evaluator(der).getEvaluation(),
                                  tokens2,
                                  False,
                                  construct_conll_token,
                                  reorder=False)
                print(tree, file=stderr)
                print(output_tree, file=stderr)

                self.compare_hybrid_trees(tree, output_tree, compare_order)
                count_derivs[i] += 1
                derivations.append(der)

            self.assertTrue(
                sDCPParserTest.pairwise_different(
                    derivations, sDCPParserTest.compare_derivations))
            self.assertEqual(len(derivations), count_derivs[i])

            if count_derivs[i] == 0:
                no_complete_match += 1

        for key in count_derivs:
            print(key, count_derivs[key])

        print("# trees with no complete match:", no_complete_match)
Example #11
def build_score_validator(baseline_grammar, grammarInfo, nont_map,
                          storageManager, term_labelling, parser,
                          corpus_validation, validationMethod):
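    """Parse each validation tree with the baseline grammar and score every
    k-best candidate derivation against the gold labelled spans (F1,
    precision, or recall), feeding the scores to a PyCandidateScoreValidator.
    """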
    validator = PyCandidateScoreValidator(grammarInfo, storageManager,
                                          validationMethod)

    # parser = GFParser(baseline_grammar)
    tree_count = 0
    der_count = 0
    for gold_tree in corpus_validation:
        tree_count += 1
        parser.set_input(
            term_labelling.prepare_parser_input(gold_tree.token_yield()))
        parser.parse()
        derivations = [der for _, der in parser.k_best_derivation_trees()]
        manager = PyDerivationManager(baseline_grammar, nont_map)
        manager.convert_hypergraphs(derivations)
        scores = []

        relevant = {tuple(t) for t in gold_tree.labelled_spans()}

        for der in derivations:
            der_count += 1

            h_tree = ConstituentTree()
            cleaned_tokens = copy.deepcopy(gold_tree.full_token_yield())
            dcp = DCP_evaluator(der).getEvaluation()
            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                              construct_constituent_token)

            retrieved = {tuple(t) for t in h_tree.labelled_spans()}
            inters = retrieved & relevant

            # in case of parse failure there are two options here:
            #   - parse failure -> no spans at all, thus precision = 1
            #   - parse failure -> a dummy tree with all spans wrong, thus precision = 0

            precision = 1.0 * len(inters) / len(retrieved) \
                if len(retrieved) > 0 else 0
            recall = 1.0 * len(inters) / len(relevant) \
                if len(relevant) > 0 else 0
            fmeasure = 2.0 * precision * recall / (precision + recall) \
                if precision + recall > 0 else 0

            if validationMethod == "F1":
                scores.append(fmeasure)
            elif validationMethod == "Precision":
                scores.append(precision)
            elif validationMethod == "Recall":
                scores.append(recall)
            else:
                raise ValueError("unknown validation method: " + validationMethod)

        validator.add_scored_candidates(manager, scores,
                                        1.0 if len(relevant) > 0 else 0.0)
        # print(tree_count, scores)
        parser.clear()

    print("trees used for validation ", tree_count, "with",
          der_count * 1.0 / tree_count, "derivations on average")

    return validator
Example #12
    def test_fst_compilation_left(self):
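        """Compile a left-branching dependency grammar into a WFST (pynini),
        compose it with a string acceptor, check the rule sequence of the
        shortest path, and compare the result against the
        LeftBranchingFSTParser's best derivation."""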
        if not test_pynini:
            return
        tree = hybrid_tree_1()
        tree2 = hybrid_tree_2()
        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

        (_, grammar) = induce_grammar(
            [tree, tree2],
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos'), terminal_labeling.token_label,
            [left_branching], 'START')

        fst, rules = compile_wfst_from_left_branching_grammar(grammar)

        print(repr(fst))

        symboltable = fst.input_symbols()

        string = ["NP", "N", "V", "V", "V"]

        fsa = fsa_from_list_of_symbols(string, symboltable)
        self.assertEqual(
            fsa.text().decode('utf-8'),
            '0\t1\tNP\tNP\n1\t2\tN\tN\n2\t3\tV\tV\n3\t4\tV\tV\n4\t5\tV\tV\n5\n'
        )

        b = compose(fsa, fst)

        print(b.text(symboltable, symboltable))

        print("Shortest path probability", end=' ')
        best = shortestpath(b)
        best.topsort()
        # self.assertAlmostEquals(pow(e, -float(shortestdistance(best)[-1])), 1.80844898756e-05)
        print(best.text())

        polish_rules = retrieve_rules(best)
        self.assertSequenceEqual(polish_rules, [1, 2, 3, 4, 5, 4, 9, 4, 7, 8])

        polish_rules = list(map(rules.index_object, polish_rules))

        for rule in polish_rules:
            print(rule)
        print()

        der = ReversePolishDerivation(polish_rules[0:-1])
        self.assertTrue(der.check_integrity_recursive(der.root_id()))

        print(der)

        LeftBranchingFSTParser.preprocess_grammar(grammar)
        parser = LeftBranchingFSTParser(grammar, string)
        der_ = parser.best_derivation_tree()

        print(der_)
        self.assertTrue(der_.check_integrity_recursive(der_.root_id()))

        print(
            derivation_to_hybrid_tree(der, string,
                                      "Piet Marie helpen lezen leren".split(),
                                      construct_conll_token))

        print(
            derivation_to_hybrid_tree(der_, string,
                                      "Piet Marie helpen lezen leren".split(),
                                      construct_conll_token))

        dcp = DCP_evaluator(der).getEvaluation()

        h_tree_2 = HybridTree()
        token_sequence = [
            construct_conll_token(form, lemma)
            for form, lemma in zip('Piet Marie helpen lezen leren'.split(' '),
                                   'NP N V V V'.split(' '))
        ]
        dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                          construct_conll_token)

        print(h_tree_2)
Example #13
    def test_fst_compilation_right(self):
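        """Compile a right-branching dependency grammar into a WFST (pynini),
        enumerate the paths of its composition with a string acceptor, and
        check the shortest-path probability and rule sequence."""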
        if not test_pynini:
            return
        tree = hybrid_tree_1()
        tree2 = hybrid_tree_2()
        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

        (_, grammar) = induce_grammar(
            [tree, tree2],
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos'), terminal_labeling.token_label,
            [right_branching], 'START')

        a, rules = compile_wfst_from_right_branching_grammar(grammar)

        print(repr(a))

        symboltable = a.input_symbols()

        string = 'NP N V V V'.split(' ')

        token_sequence = [
            construct_conll_token(form, lemma) for form, lemma in zip(
                'Piet Marie helpen leren lezen'.split(' '), string)
        ]

        fsa = fsa_from_list_of_symbols(string, symboltable)
        self.assertEqual(
            '0\t1\tNP\tNP\n1\t2\tN\tN\n2\t3\tV\tV\n3\t4\tV\tV\n4\t5\tV\tV\n5\n',
            fsa.text().decode('utf-8'))

        b = compose(fsa, a)

        print(b.input_symbols())
        for i in b.input_symbols():
            print(i)

        print("Input Composition")
        print(b.text(symboltable, symboltable).decode('utf-8'))

        for i, path in enumerate(paths(b)):
            print(i, "th path:", path, end=' ')
            r = list(map(rules.index_object, path))
            d = PolishDerivation(r[1::])
            dcp = DCP_evaluator(d).getEvaluation()
            h = HybridTree()
            dcp_to_hybridtree(h, dcp, token_sequence, False,
                              construct_conll_token)
            h.reorder()
            if h == tree2:
                print("correct")
            else:
                print("incorrect")

        stats = defaultdict(int)
        local_rule_stats(b, stats, 15)

        print(stats)

        print("Shortest path probability")
        best = shortestpath(b)
        best.topsort()
        self.assertAlmostEqual(1.80844898756e-05,
                               pow(e, -float(shortestdistance(best)[-1])))
        print(best.text())

        polish_rules = retrieve_rules(best)
        self.assertSequenceEqual(polish_rules, [8, 7, 1, 6, 2, 5, 3, 10, 3, 3])

        polish_rules = list(map(rules.index_object, polish_rules))

        print(polish_rules)

        der = PolishDerivation(polish_rules[1::])

        print(der)

        print(
            derivation_to_hybrid_tree(der, string,
                                      "Piet Marie helpen lezen leren".split(),
                                      construct_conll_token))

        dcp = DCP_evaluator(der).getEvaluation()

        h_tree_2 = HybridTree()
        dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                          construct_conll_token)

        print(h_tree_2)
Example #14
    def test_best_trees(self):
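        """Induce a grammar from a large TIGER sample, parse with the k-best
        GF parser (k=200), locate the gold tree in the ranked list of parse
        trees, and compare the Viterbi derivation's tree with the best tree
        of that list."""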
        limit_train = 5000
        limit_test = 100
        train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
        test = train
        parser_type = GFParser_k_best
        # test = '../../res/dependency_conll/german/tiger/test/german_tiger_test.conll'
        trees = parse_conll_corpus(train, False, limit_train)
        primary_labelling = the_labeling_factory(
        ).create_simple_labeling_strategy("child", "pos+deprel")
        term_labelling = the_terminal_labeling_factory().get_strategy('pos')
        start = 'START'
        recursive_partitioning = [cfg]

        (n_trees, grammar_prim) = induce_grammar(trees, primary_labelling,
                                                 term_labelling.token_label,
                                                 recursive_partitioning, start)

        parser_type.preprocess_grammar(grammar_prim)
        tree_yield = term_labelling.prepare_parser_input

        trees = parse_conll_corpus(test, False, limit_test)

        for i, tree in enumerate(trees):
            print("Parsing sentence ", i, file=stderr)

            parser = parser_type(grammar_prim,
                                 tree_yield(tree.token_yield()),
                                 k=200)

            self.assertTrue(parser.recognized())

            viterbi_weight = parser.viterbi_weight()
            viterbi_deriv = parser.viterbi_derivation()

            der_to_tree = lambda der: dcp_to_hybridtree(
                HybridTree(),
                DCP_evaluator(der).getEvaluation(),
                copy.deepcopy(tree.full_token_yield()), False,
                construct_conll_token)

            viterbi_tree = der_to_tree(viterbi_deriv)

            ordered_parse_trees = parser.best_trees(der_to_tree)

            best_tree, best_weight, best_witnesses = ordered_parse_trees[0]

            for rank, (parsed_tree, _, _) in enumerate(ordered_parse_trees):
                if parsed_tree == tree:
                    print("Gold tree is ",
                          rank + 1,
                          " in best tree list",
                          file=stderr)
                    break

            if viterbi_tree != best_tree and viterbi_weight != best_weight:
                print("viterbi and k-best tree differ", file=stderr)
                print("viterbi: ", viterbi_weight, file=stderr)
                print("k-best: ", best_weight, best_witnesses, file=stderr)
                if False:  # set to True to dump the trees for inspection
                    print(viterbi_tree, file=stderr)
                    print(tree_to_conll_str(viterbi_tree), file=stderr)
                    print(best_tree, file=stderr)
                    print(tree_to_conll_str(best_tree), file=stderr)
                    print("gold tree", file=stderr)
                    print(tree, file=stderr)
                    print(tree_to_conll_str(tree), file=stderr)
Example #15
    def test_k_best_parsing(self):
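        """Like test_minimum_risk_parsing: parse with the k-best GF parser
        (k=50), check that the derivations are pairwise distinct, and print a
        matrix indicating which derivations yield the same hybrid tree."""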
        limit_train = 20
        limit_test = 10
        train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
        test = train
        parser_type = GFParser_k_best
        # test = '../../res/dependency_conll/german/tiger/test/german_tiger_test.conll'
        trees = parse_conll_corpus(train, False, limit_train)
        primary_labelling = the_labeling_factory(
        ).create_simple_labeling_strategy("childtop", "deprel")
        term_labelling = the_terminal_labeling_factory().get_strategy('pos')
        start = 'START'
        recursive_partitioning = [cfg]

        (n_trees, grammar_prim) = induce_grammar(trees, primary_labelling,
                                                 term_labelling.token_label,
                                                 recursive_partitioning, start)

        parser_type.preprocess_grammar(grammar_prim)
        tree_yield = term_labelling.prepare_parser_input

        trees = parse_conll_corpus(test, False, limit_test)

        for i, tree in enumerate(trees):
            print("Parsing sentence ", i, file=stderr)

            # print >>stderr, tree

            parser = parser_type(grammar_prim,
                                 tree_yield(tree.token_yield()),
                                 k=50)

            self.assertTrue(parser.recognized())

            derivations = list(parser.k_best_derivation_trees())
            print("# derivations: ", len(derivations), file=stderr)
            h_trees = []
            current_weight = 0
            weights = []
            derivation_list = []
            for weight, der in derivations:
                # print >>stderr, exp(-weight)
                # print >>stderr, der

                self.assertNotIn(der, derivation_list)

                derivation_list.append(der)

                # TODO this should hold, but it looks like a GF bug!
                # self.assertGreaterEqual(weight, current_weight)
                current_weight = weight

                dcp = DCP_evaluator(der).getEvaluation()
                h_tree = HybridTree()
                cleaned_tokens = copy.deepcopy(tree.full_token_yield())
                dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                                  construct_conll_token)

                h_trees.append(h_tree)
                weights.append(weight)

                # print >>stderr, h_tree

            # print a matrix indicating which derivations result
            # in the same hybrid tree
            for row, h_tree1 in enumerate(h_trees):
                for h_tree2 in h_trees:
                    if h_tree1 == h_tree2:
                        print("x", end=' ', file=stderr)
                    else:
                        print("", end=' ', file=stderr)
                print(weights[row], file=stderr)
            print(file=stderr)
Example #16
    def test_stanford_unking_scheme(self):
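        """Extract an LCFRS with StanfordUNKing terminal labelling, check that
        parsing recovers forms and POS tags, derive smoothed lexical rules,
        and verify unknown-word handling before and after switching the
        labelling to test mode."""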
        naming = 'child'

        def rec_part(tree):
            return left_branching_partitioning(len(tree.id_yield()))

        tree = self.tree
        tree.add_to_root("VP1")

        print(tree)

        terminal_labeling = StanfordUNKing([tree])

        grammar = fringe_extract_lcfrs(tree,
                                       rec_part(tree),
                                       naming=naming,
                                       isolate_pos=True,
                                       term_labeling=terminal_labeling)
        print(grammar)

        parser = LCFRS_parser(grammar)
        parser.set_input([token.form() for token in tree.token_yield()])
        parser.parse()
        self.assertTrue(parser.recognized())
        derivation = parser.best_derivation_tree()
        e = DCP_evaluator(derivation)
        dcp_term = e.getEvaluation()
        print(str(dcp_term[0]))
        t = ConstituentTree()
        dcp_to_hybridtree(
            t,
            dcp_term, [
                construct_constituent_token(token.form(), '--', True)
                for token in tree.token_yield()
            ],
            ignore_punctuation=False,
            construct_token=construct_constituent_token)
        print(t)
        self.assertEqual(len(tree.token_yield()), len(t.token_yield()))
        for tok1, tok2 in zip(tree.token_yield(), t.token_yield()):
            self.assertEqual(tok1.form(), tok2.form())
            self.assertEqual(tok1.pos(), tok2.pos())

        rules = terminal_labeling.create_smoothed_rules()
        print(rules)

        new_rules = {}

        for rule in grammar.rules():
            if not rule.rhs():
                assert len(rule.dcp()) == 1
                dcp = rule.dcp()[0]
                assert len(dcp.rhs()) == 1
                term = dcp.rhs()[0]
                head = term.head()
                pos = head.pos()

                for tag, form in rules:
                    if tag == pos:
                        lhs = LCFRS_lhs(rule.lhs().nont())
                        lhs.add_arg([form])
                        new_rules[lhs, dcp] = rules[tag, form]

        for lhs, dcp in new_rules:
            print(str(lhs), str(dcp), new_rules[(lhs, dcp)])

        tokens = [
            construct_constituent_token('hat', '--', True),
            construct_constituent_token('HAT', '--', True)
        ]
        self.assertEqual(terminal_labeling.token_label(tokens[0]), 'hat')
        self.assertEqual(terminal_labeling.token_label(tokens[1]), '_UNK')
        terminal_labeling.test_mode = True
        self.assertEqual(terminal_labeling.token_label(tokens[0]), 'hat')
        self.assertEqual(terminal_labeling.token_label(tokens[1]), 'hat')
Example #17
def do_parsing(grammar_prim,
               limit,
               ignore_punctuation,
               recompile=True,
               preprocess_path=None):
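    """Parse a CoNLL test corpus with the given grammar and evaluate the
    output with eval.pl.  Note: this snippet relies on module-level names
    (test, result, parser_type, tree_yield) that are defined elsewhere."""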
    trees = parse_conll_corpus(test, False, limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)

    total_time = 0.0

    load_preprocess = preprocess_path
    if recompile or (not os.path.isfile(
            parser_type.resolve_path(preprocess_path))):
        load_preprocess = None

    parser = parser_type(grammar_prim,
                         save_preprocess=preprocess_path,
                         load_preprocess=load_preprocess)

    with open(result, 'w') as result_file:
        failures = 0
        for tree in trees:
            if len(tree.id_yield()) > limit:
                continue
            time_stamp = time.perf_counter()  # time.clock() was removed in Python 3.8

            parser.set_input(tree_yield(tree.token_yield()))
            parser.parse()
            # if not parser.recognized():
            #     parser = parser_type(grammar_second, tree_yield(tree.token_yield()))
            # if not parser.recognized():
            #     parser = parser_type(grammar_tern, tree_yield(tree.token_yield()))
            time_stamp = time.perf_counter() - time_stamp
            total_time += time_stamp

            cleaned_tokens = copy.deepcopy(tree.full_token_yield())
            for token in cleaned_tokens:
                token.set_edge_label('_')

            h_tree = HybridTree(tree.sent_label())

            if parser_type == GFParser_k_best and parser.recognized():
                der_to_tree = lambda der: dcp_to_hybridtree(
                    HybridTree(),
                    DCP_evaluator(der).getEvaluation(),
                    copy.deepcopy(tree.full_token_yield()), False,
                    construct_conll_token)
                h_tree = parser.best_trees(der_to_tree)[0][0]
            elif parser_type == CFGParser \
                     or parser_type == GFParser \
                     or parser_type == LeftBranchingFSTParser \
                     or parser_type == RightBranchingFSTParser:
                h_tree = parser.dcp_hybrid_tree_best_derivation(
                    h_tree, cleaned_tokens, ignore_punctuation,
                    construct_conll_token)
            else:
                h_tree = None

            if h_tree:
                result_file.write(tree_to_conll_str(h_tree))
                result_file.write('\n\n')
            else:
                failures += 1
                forms = [token.form() for token in tree.full_token_yield()]
                poss = [token.pos() for token in tree.full_token_yield()]
                result_file.write(
                    tree_to_conll_str(fall_back_left_branching(forms, poss)))
                result_file.write('\n\n')

            parser.clear()

    print("parse failures", failures)
    print("parse time", total_time)

    print("eval.pl", "no punctuation")
    p = subprocess.Popen(
        ["perl", "../util/eval.pl", "-g", test, "-s", result, "-q"])
    p.communicate()
    print("eval.pl", "punctation")
    p = subprocess.Popen(
        ["perl", "../util/eval.pl", "-g", test, "-s", result, "-q", "-p"])
    p.communicate()
Example #18
    def test_something(self):
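        """Extract an LCFRS from each tree of a pre-binarized TIGER export
        corpus, check that the LCFRS/sDCP parser reproduces the gold tree,
        and finally construct a DiscodopKbestParser from the merged grammar."""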
        normal_corpus = 'res/tiger/tiger_8000.export'
        binarized_corpus = 'res/tiger/tiger_8000_bin.export'
        limit = 55000
        # limit = 30
        corpus_bin = sentence_names_to_hybridtrees(
            {str(x)
             for x in range(limit)},
            binarized_corpus,
            disconnect_punctuation=False,
            add_vroot=True,
            mode="DISCO-DOP")

        corpus = sentence_names_to_hybridtrees({str(x)
                                                for x in range(limit)},
                                               normal_corpus,
                                               disconnect_punctuation=False,
                                               add_vroot=True,
                                               mode="DISCO-DOP")
        term_labeling = terminal_labeling(corpus, threshold=4)

        grammar = None

        for htree, htree_bin in zip(corpus, corpus_bin):
            # print(htree_bin)

            try:
                htree_grammar = direct_extract_lcfrs_from_prebinarized_corpus(
                    htree_bin, term_labeling=term_labeling)
            except Exception as e:
                print(e)
                print(htree_bin)
                print(htree_bin.nodes())
                print(htree_bin.word_yield())
                raise
            # print(htree_grammar)

            parser_input = term_labeling.prepare_parser_input(
                htree.token_yield())

            p = LCFRS_sDCP_Parser(htree_grammar,
                                  terminal_labelling=term_labeling)
            p.set_input(htree)
            p.parse()
            # p = LCFRS_parser(htree_grammar, parser_input)
            self.assertTrue(p.recognized())

            derivs = list(p.all_derivation_trees())
            # print("derivations:", len(derivs))

            for der in derivs:
                dcp = DCP_evaluator(der).getEvaluation()
                sys_tree = HybridTree(htree.sent_label())

                sys_tree = dcp_to_hybridtree(
                    sys_tree,
                    dcp,
                    deepcopy(htree.token_yield()),
                    ignore_punctuation=False,
                    construct_token=construct_constituent_token)
                # print(sys_tree)
                # print(htree == sys_tree)
                # print(der)
                if htree != sys_tree:
                    print(htree.sent_label())
                    print(htree)
                    print(sys_tree)

                self.assertEqual(htree, sys_tree)

            if grammar is None:
                grammar = htree_grammar
            else:
                grammar.add_gram(htree_grammar)

            htree_grammar.make_proper()
            try:
                disco_parser = DiscodopKbestParser(htree_grammar)
            except ValueError as ve:
                print(ve)
                print(htree.sent_label())
                print(htree)
                print(htree_bin)
                print(htree_grammar)
                raise

        grammar.make_proper()
        disco_parser = DiscodopKbestParser(grammar)
Example #19
def do_parsing(grammar,
               test_corpus,
               term_labelling,
               result,
               grammar_identifier,
               parser_type,
               k_best,
               minimum_risk=False,
               oracle_parse=False,
               recompile=True,
               reparse=False,
               dir=None,
               opt=None):
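    """Parse a test corpus, optionally computing minimum-risk and oracle
    trees from the k-best list, write CoNLL output (with a left-branching
    fallback on parse failure), and evaluate with eval.pl.  Note: this
    snippet relies on a module-level ignore_punctuation flag."""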
    tree_yield = term_labelling.prepare_parser_input

    result_path = result(grammar_identifier)
    minimum_risk_path = result(grammar_identifier, 'min_risk')
    oracle_parse_path = result(grammar_identifier, 'oracle_file')

    total_time = 0.0

    preprocess_path = [os.path.join(dir, grammar_identifier), "gf_grammar"]
    # print(preprocess_path)
    load_preprocess = preprocess_path
    if parser_type not in [GFParser, GFParser_k_best, Coarse_to_fine_parser] \
            or recompile \
            or (not os.path.isfile(GFParser.resolve_path(preprocess_path))):
        load_preprocess = None
    if parser_type in [GFParser, GFParser_k_best, Coarse_to_fine_parser] \
            and not os.path.isdir(os.path.join(dir, grammar_identifier)):
        os.makedirs(os.path.join(dir, grammar_identifier))

    if parser_type == GFParser_k_best:
        parser = GFParser_k_best(grammar,
                                 save_preprocessing=preprocess_path,
                                 load_preprocessing=load_preprocess,
                                 k=k_best)
    elif parser_type == Coarse_to_fine_parser:
        parser = Coarse_to_fine_parser(grammar,
                                       base_parser_type=GFParser_k_best,
                                       la=opt["latentAnnotation"],
                                       grammarInfo=opt["grammarInfo"],
                                       nontMap=opt["nontMap"],
                                       save_preprocessing=preprocess_path,
                                       load_preprocessing=load_preprocess,
                                       k=k_best)
    else:
        parser = parser_type(grammar,
                             save_preprocess=preprocess_path,
                             load_preprocess=load_preprocess)

    if recompile or reparse or \
            not os.path.isfile(result_path) \
            or (minimum_risk and not os.path.isfile(minimum_risk_path)) \
            or (oracle_parse and not os.path.isfile(oracle_parse_path)):

        result_dirs = map(lambda path: os.path.split(path)[0],
                          [result_path, minimum_risk_path, oracle_parse_path])
        for result_dir in result_dirs:
            if not os.path.isdir(result_dir):
                os.makedirs(result_dir)

        with open(result_path, 'w') as result_file, \
                open(minimum_risk_path, 'w') as minimum_risk_file, \
                open(oracle_parse_path, 'w') as oracle_parse_file:
            failures = 0
            for tree in test_corpus.get_trees():
                time_stamp = time.perf_counter()  # time.clock() was removed in Python 3.8

                parser.set_input(tree_yield(tree.token_yield()))
                parser.parse()
                # if not parser.recognized():
                #     parser = parser_type(grammar_second, tree_yield(tree.token_yield()))
                # if not parser.recognized():
                #     parser = parser_type(grammar_tern, tree_yield(tree.token_yield()))
                time_stamp = time.perf_counter() - time_stamp
                total_time += time_stamp

                cleaned_tokens = copy.deepcopy(tree.full_token_yield())
                for token in cleaned_tokens:
                    token.set_edge_label('_')

                h_tree = HybridTree(tree.sent_label())

                if parser_type in [GFParser_k_best, Coarse_to_fine_parser] \
                        and parser.recognized():
                    if minimum_risk or oracle_parse:
                        h_trees = []
                        weights = []

                        for weight, der in parser.k_best_derivation_trees():

                            dcp = DCP_evaluator(der).getEvaluation()
                            h_tree = HybridTree()
                            cleaned_tokens = copy.deepcopy(
                                tree.full_token_yield())
                            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens,
                                              False, construct_conll_token)

                            h_trees.append(h_tree)
                            weights.append(weight)

                        if minimum_risk:
                            h_tree_min_risk = compute_minimum_risk_tree(
                                h_trees, weights)
                        if oracle_parse:
                            h_tree_oracle = compute_oracle_tree(h_trees, tree)

                    der_to_tree = lambda der: dcp_to_hybridtree(
                        HybridTree(),
                        DCP_evaluator(der).getEvaluation(),
                        copy.deepcopy(tree.full_token_yield()), False,
                        construct_conll_token)
                    # h_tree = parser.best_trees(der_to_tree)[0][0]
                    h_tree = HybridTree(tree.sent_label())
                    h_tree = parser.dcp_hybrid_tree_best_derivation(
                        h_tree, copy.deepcopy(tree.full_token_yield()),
                        ignore_punctuation, construct_conll_token)
                elif parser_type == CFGParser \
                        or parser_type == GFParser \
                        or parser_type == LeftBranchingFSTParser \
                        or parser_type == RightBranchingFSTParser:
                    h_tree = parser.dcp_hybrid_tree_best_derivation(
                        h_tree, cleaned_tokens, ignore_punctuation,
                        construct_conll_token)
                else:
                    h_tree = None

                if h_tree:
                    result_file.write(tree_to_conll_str(h_tree))
                    result_file.write('\n\n')
                    if minimum_risk and parser_type in [
                            GFParser_k_best, Coarse_to_fine_parser
                    ]:
                        minimum_risk_file.write(
                            tree_to_conll_str(h_tree_min_risk))
                        minimum_risk_file.write('\n\n')
                    if oracle_parse and parser_type in [
                            GFParser_k_best, Coarse_to_fine_parser
                    ]:
                        oracle_parse_file.write(
                            tree_to_conll_str(h_tree_oracle))
                        oracle_parse_file.write('\n\n')
                else:
                    failures += 1
                    forms = [token.form() for token in tree.full_token_yield()]
                    poss = [token.pos() for token in tree.full_token_yield()]
                    fall_back = tree_to_conll_str(
                        fall_back_left_branching(forms, poss))
                    files = [result_file]
                    if minimum_risk:
                        files.append(minimum_risk_file)
                    if oracle_parse:
                        files.append(oracle_parse_file)
                    for file in files:
                        file.write(fall_back)
                        file.write('\n\n')

                parser.clear()

        print("parse failures", failures)
        print("parse time", total_time)

    if parser_type == GFParser_k_best:
        print("best parse results")
    else:
        print("viterbi parse results")
    eval_pl_call(test_corpus._path, result_path)
    if oracle_parse:
        print("\noracle parse results")
        eval_pl_call(test_corpus._path, oracle_parse_path)
    if minimum_risk:
        print("\nminimum risk results")
        eval_pl_call(test_corpus._path, minimum_risk_path)

    return parser
Example #20
def build_score_validator(baseline_grammar, grammarInfo, nont_map,
                          storageManager, term_labelling, parser,
                          corpus_validation, validationMethod):
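    """Parse each validation tree and score every k-best derivation against
    the gold dependency tree (LAS, UAS, or label accuracy), feeding the
    scores to a PyCandidateScoreValidator."""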
    validator = PyCandidateScoreValidator(grammarInfo, storageManager,
                                          validationMethod)

    # parser = GFParser(baseline_grammar)
    tree_count = 0
    der_count = 0
    for gold_tree in corpus_validation.get_trees():
        tree_count += 1
        parser.set_input(
            term_labelling.prepare_parser_input(gold_tree.token_yield()))
        parser.parse()
        derivations = map(lambda x: x[1], parser.k_best_derivation_trees())
        manager = PyDerivationManager(baseline_grammar, nont_map)
        manager.convert_derivations_to_hypergraphs(derivations)
        scores = []

        gold_labels = {}
        gold_heads = {}

        for position, node_id in enumerate(gold_tree.id_yield()):
            parent_id = gold_tree.parent(node_id)
            gold_labels[position] = gold_tree.node_token(node_id).deprel()
            if parent_id is None:
                assert node_id in gold_tree.root
                gold_heads[position] = 0
            else:
                gold_heads[position] = gold_tree.id_yield().index(
                    parent_id) + 1

        derivations = parser.k_best_derivation_trees()
        for _, der in derivations:
            der_count += 1
            h_tree = HybridTree()
            cleaned_tokens = copy.deepcopy(gold_tree.full_token_yield())
            dcp = DCP_evaluator(der).getEvaluation()
            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                              construct_conll_token)

            las, uas, lac = 0, 0, 0
            for position, node_id in enumerate(h_tree.id_yield()):
                parent_id = h_tree.parent(node_id)
                if parent_id is None:
                    assert node_id in h_tree.root
                    head = 0
                else:
                    head = h_tree.id_yield().index(parent_id) + 1
                label = h_tree.node_token(node_id).deprel()

                if gold_heads[position] == head:
                    uas += 1
                if gold_labels[position] == label:
                    lac += 1
                if gold_heads[position] == head and gold_labels[
                        position] == label:
                    las += 1

            if validationMethod == "LAS":
                scores.append(las)
            elif validationMethod == "UAS":
                scores.append(uas)
            elif validationMethod == "LAC":
                scores.append(lac)

        max_score = len(gold_tree.id_yield())
        validator.add_scored_candidates(manager, scores, max_score)
        print(tree_count, max_score, scores)
        parser.clear()

    print("trees used for validation ", tree_count, "with",
          der_count * 1.0 / tree_count, "derivations on average")

    return validator