Example #1
    def test_grammar_export(self):
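        # Induce an LCFRS from two example hybrid trees, export it to GF and
        # compile it, then parse a POS string with GFParser and reconstruct a
        # hybrid tree from the best derivation via DCP evaluation.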
        tree = hybrid_tree_1()
        tree2 = hybrid_tree_2()
        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

        _, grammar = induce_grammar(
            [tree, tree2],
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos'),
            # the_labeling_factory().create_simple_labeling_strategy('child', 'pos+deprel'),
            terminal_labeling.token_label,
            [direct_extraction],
            'START')
        print(max([grammar.fanout(nont) for nont in grammar.nonts()]))
        print(grammar)

        prefix = '/tmp/'
        name = 'tmpGrammar'

        name_ = export(grammar, prefix, name)

        self.assertEqual(0, compile_gf_grammar(prefix, name_))

        GFParser.preprocess_grammar(grammar)

        string = ["NP", "N", "V", "V", "V"]

        parser = GFParser(grammar, string)

        self.assertTrue(parser.recognized())

        der = parser.best_derivation_tree()
        self.assertTrue(
            der.check_integrity_recursive(der.root_id(), grammar.start()))

        print(der)

        print(
            derivation_to_hybrid_tree(der, string,
                                      "Piet Marie helpen lezen leren".split(),
                                      construct_conll_token))

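        # Evaluate the best derivation's DCP and convert the result back into
        # a hybrid tree over CoNLL tokens.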
        dcp = DCP_evaluator(der).getEvaluation()

        h_tree_2 = HybridTree()
        token_sequence = [
            construct_conll_token(form, lemma)
            for form, lemma in zip('Piet Marie helpen lezen leren'.split(' '),
                                   'NP N V V V'.split(' '))
        ]
        dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                          construct_conll_token)

        print(h_tree_2)
Example #2
    def test_dcp_evaluation_with_induced_dependency_grammar(self):
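        # Induce a dependency grammar from two hybrid trees, check that it is
        # well-formed and ordered, then evaluate the DCP of every successful
        # derivation for a short POS string.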
        tree = hybrid_tree_1()

        print(tree)

        tree2 = hybrid_tree_2()

        print(tree2)
        # print tree.recursive_partitioning()

        labeling = the_labeling_factory().create_simple_labeling_strategy(
            'child', 'pos')
        term_pos = the_terminal_labeling_factory().get_strategy(
            'pos').token_label
        (_, grammar) = induce_grammar([tree, tree2], labeling, term_pos,
                                      [direct_extraction], 'START')

        # print grammar

        self.assertEqual(grammar.well_formed(), None)
        self.assertEqual(grammar.ordered()[0], True)
        # print max([grammar.fanout(nont) for nont in grammar.nonts()])
        print(grammar)

        parser = Parser(grammar, 'NP N V V'.split(' '))

        self.assertEqual(parser.recognized(), True)

        for item in parser.successful_root_items():
            der = Derivation()
            derivation_tree(der, item, None)
            print(der)

            hybrid_tree = derivation_to_hybrid_tree(
                der, 'NP N V V'.split(' '),
                'Piet Marie helpen lezen'.split(' '),
                construct_constituent_token)
            print(hybrid_tree)

            dcp = DCP_evaluator(der).getEvaluation()
            h_tree_2 = HybridTree()
            token_sequence = [
                construct_conll_token(form, lemma)
                for form, lemma in zip('Piet Marie helpen lezen'.split(' '),
                                       'NP N V V'.split(' '))
            ]
            dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                              construct_conll_token)
Example #3
    def test_a4(self):
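        # Parse the word 'aaaa' with self.grammar_ab_copy (presumably a copy
        # grammar over {a, b}); exactly two derivations are expected.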
        word = ['a'] * 4
        parser = Parser(self.grammar_ab_copy, word)
        print("Parse", word)
        counter = 0
        print("Found items:")

        for passive_item in parser.successful_root_items():
            print(passive_item)
            derivation = print_derivation_tree(passive_item)
            print(derivation)
            poss = ['P' + str(i) for i in range(1, len(word) + 1)]
            tree = derivation_to_hybrid_tree(derivation, poss, word,
                                             construct_constituent_token)
            print(tree)
            counter += 1
        self.assertEqual(counter, 2)
        print()
Example #4
    def test_ambncmdn(self):
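        # Parse a word of the shape a^m b^n c^m d^n (here m=6, n=3) with the
        # Kaeshammer grammar; exactly one derivation is expected.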
        m = 6
        n = 3
        word = ['a'] * m + ['b'] * n + ['c'] * m + ['d'] * n
        print("Parse", word)
        parser = Parser(kaeshammer_grammar(), word)
        counter = 0
        print("Found items:")
        for passive_item in parser.successful_root_items():
            print(passive_item)
            counter += 1
            derivation = print_derivation_tree(passive_item)
            print(derivation)
            hybrid_tree = derivation_to_hybrid_tree(
                derivation, word, word, construct_constituent_token)
            # print hybrid_tree
        self.assertEqual(counter, 1)
        print()
Example #5
    def test_cfg_parser(self):
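        # Induce a grammar restricted to CFG recursive partitioning, then run
        # both LCFRS_parser and CFGParser on the same POS string and rebuild a
        # hybrid tree from each best derivation.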
        tree = hybrid_tree_1()
        tree2 = hybrid_tree_2()
        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

        (_, grammar) = induce_grammar(
            [tree, tree2],
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos'), terminal_labeling.token_label, [cfg], 'START')

        for parser_class in [LCFRS_parser, CFGParser]:

            parser_class.preprocess_grammar(grammar)

            string = ["NP", "N", "V", "V", "V"]

            parser = parser_class(grammar, string)

            self.assertTrue(parser.recognized())

            der = parser.best_derivation_tree()
            self.assertTrue(
                der.check_integrity_recursive(der.root_id(), grammar.start()))

            print(der)

            print(
                derivation_to_hybrid_tree(
                    der, string, "Piet Marie helpen lezen leren".split(),
                    construct_conll_token))

            dcp = DCP_evaluator(der).getEvaluation()

            h_tree_2 = HybridTree()
            token_sequence = [
                construct_conll_token(form, lemma) for form, lemma in zip(
                    'Piet Marie helpen lezen leren'.split(' '),
                    'NP N V V V'.split(' '))
            ]
            dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                              construct_conll_token)

            print(h_tree_2)
Example #6
def main():
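    # Full pipeline: induce a grammar from a CoNLL training corpus, collect
    # derivations, run split/merge (EM) refinement, and optionally parse a
    # test corpus with each refined grammar.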
    # induce grammar from a corpus
    trees = parse_conll_corpus(train, False, limit_train)
    nonterminal_labelling = the_labeling_factory().create_simple_labeling_strategy(
        "childtop", "deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    start = 'START'
    recursive_partitioning = [cfg]
    _, grammar = induce_grammar(trees, nonterminal_labelling,
                                term_labelling.token_label,
                                recursive_partitioning, start)

    # compute some derivations
    derivations = obtain_derivations(grammar, term_labelling)

    # create derivation manager and add derivations
    manager = PyDerivationManager(grammar)
    manager.convert_derivations_to_hypergraphs(derivations)
    manager.serialize(b"/tmp/derivations.txt")

    # build and configure split/merge trainer and supplementary objects

    rule_to_nonterminals = []
    for i in range(len(grammar.rule_index())):
        rule = grammar.rule_index(i)
        nonts = [
            manager.get_nonterminal_map().object_index(rule.lhs().nont())
        ] + [
            manager.get_nonterminal_map().object_index(nont)
            for nont in rule.rhs()
        ]
        rule_to_nonterminals.append(nonts)

    grammarInfo = PyGrammarInfo(grammar, manager.get_nonterminal_map())
    storageManager = PyStorageManager()
    builder = PySplitMergeTrainerBuilder(manager, grammarInfo)
    builder.set_em_epochs(20)
    builder.set_percent_merger(60.0)

    splitMergeTrainer = builder.build()

    latentAnnotation = [
        build_PyLatentAnnotation_initial(grammar, grammarInfo, storageManager)
    ]

    for i in range(max_cycles + 1):
        latentAnnotation.append(
            splitMergeTrainer.split_merge_cycle(latentAnnotation[-1]))
        # pickle.dump(map(lambda la: la.serialize(), latentAnnotation), open(sm_info_path, 'wb'))
        smGrammar = build_sm_grammar(latentAnnotation[i],
                                     grammar,
                                     grammarInfo,
                                     rule_pruning=0.0001,
                                     rule_smoothing=0.01)
        print("Cycle: ", i, "Rules: ", len(smGrammar.rules()))

        if parsing:
            parser = GFParser(smGrammar)

            trees = parse_conll_corpus(test, False, limit_test)
            for tree in trees:
                parser.set_input(
                    term_labelling.prepare_parser_input(tree.token_yield()))
                parser.parse()
                if parser.recognized():
                    print(
                        derivation_to_hybrid_tree(
                            parser.best_derivation_tree(),
                            [token.pos() for token in tree.token_yield()],
                            [token.form() for token in tree.token_yield()],
                            construct_constituent_token))
Example #7
    def test_fst_compilation_left(self):
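        # Compile the induced left-branching grammar into a WFST (pynini),
        # compose it with an input FSA, read rules off the shortest path, and
        # compare the resulting derivation with LeftBranchingFSTParser's.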
        if not test_pynini:
            return
        tree = hybrid_tree_1()
        tree2 = hybrid_tree_2()
        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

        (_, grammar) = induce_grammar(
            [tree, tree2],
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos'), terminal_labeling.token_label,
            [left_branching], 'START')

        fst, rules = compile_wfst_from_left_branching_grammar(grammar)

        print(repr(fst))

        symboltable = fst.input_symbols()

        string = ["NP", "N", "V", "V", "V"]

        fsa = fsa_from_list_of_symbols(string, symboltable)
        self.assertEqual(
            fsa.text().decode('utf-8'),
            '0\t1\tNP\tNP\n1\t2\tN\tN\n2\t3\tV\tV\n3\t4\tV\tV\n4\t5\tV\tV\n5\n'
        )

        b = compose(fsa, fst)

        print(b.text(symboltable, symboltable))

        print("Shortest path probability", end=' ')
        best = shortestpath(b)
        best.topsort()
        # self.assertAlmostEquals(pow(e, -float(shortestdistance(best)[-1])), 1.80844898756e-05)
        print(best.text())

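        # The rules along the best path arrive in reverse Polish order.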
        polish_rules = retrieve_rules(best)
        self.assertSequenceEqual(polish_rules, [1, 2, 3, 4, 5, 4, 9, 4, 7, 8])

        polish_rules = list(map(rules.index_object, polish_rules))

        for rule in polish_rules:
            print(rule)
        print()

        der = ReversePolishDerivation(polish_rules[0:-1])
        self.assertTrue(der.check_integrity_recursive(der.root_id()))

        print(der)

        LeftBranchingFSTParser.preprocess_grammar(grammar)
        parser = LeftBranchingFSTParser(grammar, string)
        der_ = parser.best_derivation_tree()

        print(der_)
        self.assertTrue(der_.check_integrity_recursive(der_.root_id()))

        print(
            derivation_to_hybrid_tree(der, string,
                                      "Piet Marie helpen lezen leren".split(),
                                      construct_conll_token))

        print(
            derivation_to_hybrid_tree(der_, string,
                                      "Piet Marie helpen lezen leren".split(),
                                      construct_conll_token))

        dcp = DCP_evaluator(der).getEvaluation()

        h_tree_2 = HybridTree()
        token_sequence = [
            construct_conll_token(form, lemma)
            for form, lemma in zip('Piet Marie helpen lezen leren'.split(' '),
                                   'NP N V V V'.split(' '))
        ]
        dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                          construct_conll_token)

        print(h_tree_2)
Example #8
    def test_fst_compilation_right(self):
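        # Compile the induced right-branching grammar into a WFST, enumerate
        # all paths of its composition with the input FSA, and check the best
        # path's weight and rule sequence (read in Polish order).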
        if not test_pynini:
            return
        tree = hybrid_tree_1()
        tree2 = hybrid_tree_2()
        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

        (_, grammar) = induce_grammar(
            [tree, tree2],
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos'), terminal_labeling.token_label,
            [right_branching], 'START')

        a, rules = compile_wfst_from_right_branching_grammar(grammar)

        print(repr(a))

        symboltable = a.input_symbols()

        string = 'NP N V V V'.split(' ')

        token_sequence = [
            construct_conll_token(form, lemma) for form, lemma in zip(
                'Piet Marie helpen leren lezen'.split(' '), string)
        ]

        fsa = fsa_from_list_of_symbols(string, symboltable)
        self.assertEqual(
            '0\t1\tNP\tNP\n1\t2\tN\tN\n2\t3\tV\tV\n3\t4\tV\tV\n4\t5\tV\tV\n5\n',
            fsa.text().decode('utf-8'))

        b = compose(fsa, a)

        print(b.input_symbols())
        for i in b.input_symbols():
            print(i)

        print("Input Composition")
        print(b.text(symboltable, symboltable).decode('utf-8'))

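        # Each accepting path of the composition yields a Polish derivation
        # whose evaluated DCP is compared against the second training tree.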
        for i, path in enumerate(paths(b)):
            print(i, "th path:", path, end=' ')
            r = list(map(rules.index_object, path))
            d = PolishDerivation(r[1::])
            dcp = DCP_evaluator(d).getEvaluation()
            h = HybridTree()
            dcp_to_hybridtree(h, dcp, token_sequence, False,
                              construct_conll_token)
            h.reorder()
            if h == tree2:
                print("correct")
            else:
                print("incorrect")

        stats = defaultdict(int)
        local_rule_stats(b, stats, 15)

        print(stats)

        print("Shortest path probability")
        best = shortestpath(b)
        best.topsort()
        self.assertAlmostEqual(1.80844898756e-05,
                               pow(e, -float(shortestdistance(best)[-1])))
        print(best.text())

        polish_rules = retrieve_rules(best)
        self.assertSequenceEqual(polish_rules, [8, 7, 1, 6, 2, 5, 3, 10, 3, 3])

        polish_rules = list(map(rules.index_object, polish_rules))

        print(polish_rules)

        der = PolishDerivation(polish_rules[1::])

        print(der)

        print(
            derivation_to_hybrid_tree(der, string,
                                      "Piet Marie helpen lezen leren".split(),
                                      construct_conll_token))

        dcp = DCP_evaluator(der).getEvaluation()

        h_tree_2 = HybridTree()
        dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                          construct_conll_token)

        print(h_tree_2)