예제 #1
0
    def test_la_viterbi_parsing_3(self):
        """Parse ``a a a`` in latent-Viterbi mode with a hand-built grammar.

        A five-rule LCFRS over the nonterminals S, A and B is constructed,
        made proper and equipped with an initial latent annotation.  The only
        assertion is that the parser recognizes the input; the latent Viterbi
        derivation and the k-best derivations are printed for inspection.
        """
        grammar = LCFRS("S")

        # One entry per rule, in rule-number order (0..4):
        # (lhs nonterminal, the single lhs argument, rhs nonterminals, weight)
        rule_table = [
            ("B", ["a"], [], 0.25),
            ("A", ["a"], [], 0.5),
            ("S", [LCFRS_var(0, 0)], ["B"], 1.0),
            ("A", [LCFRS_var(0, 0), LCFRS_var(1, 0)], ["A", "B"], 0.5),
            ("B", [LCFRS_var(0, 0), LCFRS_var(1, 0)], ["A", "B"], 0.75),
        ]
        for nont, argument, rhs, weight in rule_table:
            lhs = LCFRS_lhs(nont)
            lhs.add_arg(argument)
            grammar.add_rule(lhs, rhs, weight)

        grammar.make_proper()

        words = ["a"] * 3

        nont_map = Enumerator()
        grammar_info = PyGrammarInfo(grammar, nont_map)
        storage = PyStorageManager()
        for nont in ("S", "B"):
            print(nont_map.object_index(nont))

        annotation = build_PyLatentAnnotation_initial(grammar, grammar_info,
                                                      storage)
        parser = DiscodopKbestParser(grammar,
                                     la=annotation,
                                     nontMap=nont_map,
                                     grammarInfo=grammar_info,
                                     latent_viterbi_mode=True)
        parser.set_input(words)
        parser.parse()
        self.assertTrue(parser.recognized())
        print(parser.latent_viterbi_derivation(True))

        # Remember the first k-best derivation while printing all of them.
        first_tree = None
        for weight, tree in parser.k_best_derivation_trees():
            if first_tree is None:
                first_tree = tree
            print(weight, tree)

        print(first_tree)
예제 #2
0
    def test_la_viterbi_parsing_2(self):
        """Check the spans of the latent Viterbi derivation for ``a a a``.

        Uses the grammar from ``build_paper_grammar`` together with an
        explicitly specified latent annotation, asserts that the annotation is
        proper and the input is recognized, and finally compares the first
        spanned range of every derivation node against the expected set.
        """
        grammar = self.build_paper_grammar()
        words = ["a"] * 3

        nont_map = Enumerator()
        grammar_info = PyGrammarInfo(grammar, nont_map)
        storage = PyStorageManager()
        for nont in ("S", "B"):
            print(nont_map.object_index(nont))

        # NOTE(review): the three list arguments are presumably split counts,
        # root weights and per-rule weights -- confirm against
        # build_PyLatentAnnotation.
        annotation = build_PyLatentAnnotation(
            [2, 1],
            [1.0],
            [
                [0.25, 1.0],
                [1.0, 0.0],
                [0.0, 0.5, 0.25, 0.0, 0.0, 0.0, 0.0, 0.0],
            ],
            grammar_info,
            storage)
        self.assertTrue(annotation.is_proper())

        parser = DiscodopKbestParser(grammar,
                                     la=annotation,
                                     nontMap=nont_map,
                                     grammarInfo=grammar_info,
                                     latent_viterbi_mode=True)
        parser.set_input(words)
        parser.parse()
        self.assertTrue(parser.recognized())

        derivation = parser.latent_viterbi_derivation(True)
        print(derivation)

        # Collect the first spanned range of each node of the derivation.
        first_spans = set()
        for node in derivation.ids():
            first_spans.add(derivation.spanned_ranges(node)[0])
        self.assertSetEqual({(0, 3), (0, 2), (0, 1), (1, 2), (2, 3)},
                            first_spans)
예제 #3
0
 def initialize_parser(self):
     """Instantiate ``self.parser`` according to ``self.parsing_mode``.

     If the mode string contains ``"disco-dop"`` a ``DiscodopKbestParser`` is
     created from the base grammar, ``self.k_best`` and the four settings read
     from ``self.disco_dop_params``; otherwise a ``GFParser_k_best`` is
     created with preprocessing saved under ``(self.directory, "gfgrammar")``.
     """
     if "disco-dop" not in self.parsing_mode:
         self.parser = GFParser_k_best(
             grammar=self.base_grammar,
             k=self.k_best,
             save_preprocessing=(self.directory, "gfgrammar"))
         return

     # Assemble the keyword arguments in the same order as they are passed.
     options = {"grammar": self.base_grammar, "k": self.k_best}
     for key in ("beam_beta", "beam_delta", "pruning_k", "cfg_ctf"):
         options[key] = self.disco_dop_params[key]
     self.parser = DiscodopKbestParser(**options)
예제 #4
0
 def test_discodop_kbest_parser(self):
     """Parse five ``a`` tokens and validate every k-best derivation.

     Asserts that the input is recognized, that each derivation passes
     ``check_integrity_recursive`` from its root with the grammar's start
     symbol, and that exactly 50 derivations are enumerated.
     """
     grammar = self.build_grammar()
     parser = DiscodopKbestParser(grammar)
     words = ["a"] * 5
     parser.set_input(words)
     parser.parse()
     self.assertTrue(parser.recognized())

     derivation_count = 0
     for _weight, tree in parser.k_best_derivation_trees():
         is_consistent = tree.check_integrity_recursive(tree.root_id(),
                                                        grammar.start())
         self.assertTrue(is_consistent)
         derivation_count += 1
     self.assertEqual(50, derivation_count)
예제 #5
0
 def initialize_parser(self):
     """Instantiate ``self.parser`` according to ``self.parsing_mode``.

     A ``DiscodopKbestParser`` is used when the mode string contains
     ``"disco-dop"``; otherwise a ``GFParser_k_best`` whose preprocessing is
     saved under ``(self.directory, "mygrammar")``.
     """
     save_preprocess = (self.directory, "mygrammar")

     # k stays at self.k_best only when split/merge is disabled and oracle
     # parsing is off; in every other case a single derivation is requested.
     if self.organizer.disable_split_merge and not self.oracle_parsing:
         k = self.k_best
     else:
         k = 1

     if "disco-dop" not in self.parsing_mode:
         self.parser = GFParser_k_best(self.base_grammar,
                                       save_preprocessing=save_preprocess,
                                       k=k)
         return

     # NOTE(review): only the GF branch uses the `k` computed above; this
     # branch passes self.k_best -- confirm that the asymmetry is intended.
     options = {"grammar": self.base_grammar, "k": self.k_best}
     for key in ("cfg_ctf", "pruning_k", "beam_beta", "beam_delta"):
         options[key] = self.disco_dop_params[key]
     self.parser = DiscodopKbestParser(**options)
예제 #6
0
    def test_la_viterbi_parsing(self):
        """Parse ``a a a`` in latent-Viterbi mode with an initial annotation.

        Asserts only that the input is recognized; the best derivation and,
        for each of its nodes, the rule and the spanned ranges are printed.
        """
        grammar = self.build_grammar()
        words = ["a"] * 3

        nont_map = Enumerator()
        grammar_info = PyGrammarInfo(grammar, nont_map)
        storage = PyStorageManager()
        annotation = build_PyLatentAnnotation_initial(grammar, grammar_info,
                                                      storage)

        parser = DiscodopKbestParser(grammar,
                                     la=annotation,
                                     nontMap=nont_map,
                                     grammarInfo=grammar_info,
                                     latent_viterbi_mode=True)
        parser.set_input(words)
        parser.parse()
        self.assertTrue(parser.recognized())

        best = parser.best_derivation_tree()
        print(best)

        for node_id in best.ids():
            rule = best.getRule(node_id)
            spans = best.spanned_ranges(node_id)
            print(node_id, rule, spans)
예제 #7
0
 def test_copy_grammar(self):
     """Parse ``a^2 b^3 c^2 d^3`` with and without the ``cfg_ctf`` option.

     For both settings the input must be recognized and exactly one
     derivation must be enumerated; that derivation has to pass the
     integrity check and yield the input sequence again.
     """
     grammar = self.build_nm_grammar()
     for use_cfg_ctf in (True, False):
         parser = DiscodopKbestParser(grammar, cfg_ctf=use_cfg_ctf)

         n, m = 2, 3
         words = ["a"] * n + ["b"] * m + ["c"] * n + ["d"] * m
         parser.set_input(words)
         parser.parse()
         self.assertTrue(parser.recognized())

         derivation_count = 0
         for weight, tree in parser.k_best_derivation_trees():
             print(weight, tree)
             is_consistent = tree.check_integrity_recursive(tree.root_id(),
                                                            grammar.start())
             self.assertTrue(is_consistent)
             self.assertEqual(words, tree.compute_yield())
             derivation_count += 1
         self.assertEqual(1, derivation_count)
예제 #8
0
    def test_negra_dag_small_grammar(self):
        """Extract a grammar from 100 DAG sentences and re-parse them.

        Both the plain and the binarized export corpus must exist on disk; if
        one is missing, the command that creates it is printed and the test
        fails on the following existence assertion.  A grammar is extracted
        from the binarized DAGs, made proper, and used by a 1-best
        ``DiscodopKbestParser`` to parse the POS sequence of every sentence.
        Each best derivation is evaluated to a hybrid DAG and serialized into
        a temporary ``.export`` file whose path is printed at the end (the
        file is intentionally not removed).
        """
        DAG_CORPUS = 'res/tiger/tiger_full_with_sec_edges.export'
        DAG_CORPUS_BIN = 'res/tiger/tiger_full_with_sec_edges_bin_h1_v1.export'
        names = [str(i) for i in range(1, 101)]
        if not os.path.exists(DAG_CORPUS):
            print(
                'run the following command to create an export corpus with dags:'
            )
            print('\tPYTHONPATH=. util/tiger_dags_to_negra.py ' +
                  'res/tiger/tiger_release_aug07.corrected.16012013.xml ' +
                  DAG_CORPUS + ' 1 50474')
        self.assertTrue(os.path.exists(DAG_CORPUS))

        if not os.path.exists(DAG_CORPUS_BIN):
            print(
                'run the following command to binarize the export corpus with dags:'
            )
            print("discodop treetransforms --binarize -v 1 -h 1 " +
                  DAG_CORPUS + " " + DAG_CORPUS_BIN)
        self.assertTrue(os.path.exists(DAG_CORPUS_BIN))
        corpus = np.sentence_names_to_hybridtrees(names,
                                                  DAG_CORPUS,
                                                  secedge=True)
        corpus_bin = np.sentence_names_to_hybridtrees(names,
                                                      DAG_CORPUS_BIN,
                                                      secedge=True)

        grammar = LCFRS(start="START")

        for hybrid_dag, hybrid_dag_bin in zip(corpus, corpus_bin):
            # The binarized sentence must cover the same tokens as the plain one.
            self.assertEqual(len(hybrid_dag.token_yield()),
                             len(hybrid_dag_bin.token_yield()))

            dag_grammar = direct_extract_lcfrs_from_prebinarized_corpus(
                hybrid_dag_bin)
            grammar.add_gram(dag_grammar)

        grammar.make_proper()
        print(
            "Extracted LCFRS/DCP-hybrid grammar with %i nonterminals and %i rules"
            % (len(grammar.nonts()), len(grammar.rules())))

        parser = DiscodopKbestParser(grammar, k=1)

        # mkstemp returns an already-open OS-level descriptor.  Wrap that
        # descriptor instead of discarding it and re-opening the path, so it
        # is closed when the with-block exits rather than leaked.
        result_fd, RESULT_FILE = tempfile.mkstemp(prefix='parser_results_',
                                                  suffix='.export')

        with os.fdopen(result_fd, 'w') as results:
            for hybrid_dag in corpus:

                poss = [token.pos() for token in hybrid_dag.token_yield()]
                parser.set_input(poss)
                parser.parse()
                self.assertTrue(parser.recognized())
                der = parser.best_derivation_tree()

                dcp_term = DCP_evaluator(der).getEvaluation()
                dag_eval = HybridDag(hybrid_dag.sent_label())
                # dcp_to_hybriddag receives a deep copy of the token yield,
                # so the corpus entry's own tokens are not handed over.
                dcp_to_hybriddag(dag_eval,
                                 dcp_term,
                                 copy.deepcopy(hybrid_dag.token_yield()),
                                 False,
                                 construct_token=construct_constituent_token)
                lines = np.serialize_hybridtrees_to_negra(
                    [dag_eval], 1, 500, use_sentence_names=True)
                for line in lines:
                    print(line, end='', file=results)
                parser.clear()

        print("Wrote results to %s" % RESULT_FILE)
    def test_something(self):
        """Round-trip every corpus tree through its own extracted grammar.

        For each pair of plain and binarized tree, a grammar is extracted
        from the binarized tree, the plain tree is parsed with it, and every
        derivation must evaluate back to the plain tree.  Each per-tree
        grammar and the merged grammar are additionally handed to
        ``DiscodopKbestParser`` to make sure construction succeeds.
        """
        normal_corpus = 'res/tiger/tiger_8000.export'
        binarized_corpus = 'res/tiger/tiger_8000_bin.export'
        limit = 55000
        # limit = 30

        def load_corpus(path):
            # Every call gets a freshly built set of candidate sentence names.
            return sentence_names_to_hybridtrees(
                {str(x) for x in range(limit)},
                path,
                disconnect_punctuation=False,
                add_vroot=True,
                mode="DISCO-DOP")

        corpus_bin = load_corpus(binarized_corpus)
        corpus = load_corpus(normal_corpus)
        term_labeling = terminal_labeling(corpus, threshold=4)

        merged_grammar = None

        for tree, tree_bin in zip(corpus, corpus_bin):
            try:
                tree_grammar = direct_extract_lcfrs_from_prebinarized_corpus(
                    tree_bin, term_labeling=term_labeling)
            except Exception as e:
                # Dump the offending tree before propagating the failure.
                print(e)
                print(tree_bin)
                print(tree_bin.nodes())
                print(tree_bin.word_yield())
                raise e

            # The prepared input is not used further; the call is retained
            # as-is.
            parser_input = term_labeling.prepare_parser_input(
                tree.token_yield())

            tree_parser = LCFRS_sDCP_Parser(tree_grammar,
                                            terminal_labelling=term_labeling)
            tree_parser.set_input(tree)
            tree_parser.parse()
            self.assertTrue(tree_parser.recognized())

            derivations = list(tree_parser.all_derivation_trees())

            for derivation in derivations:
                dcp_term = DCP_evaluator(derivation).getEvaluation()
                rebuilt = HybridTree(tree.sent_label())
                rebuilt = dcp_to_hybridtree(
                    rebuilt,
                    dcp_term,
                    deepcopy(tree.token_yield()),
                    ignore_punctuation=False,
                    construct_token=construct_constituent_token)

                # Print both trees first so that a mismatch can be diagnosed.
                if tree != rebuilt:
                    print(tree.sent_label())
                    print(tree)
                    print(rebuilt)

                self.assertEqual(tree, rebuilt)

            if merged_grammar is None:
                merged_grammar = tree_grammar
            else:
                merged_grammar.add_gram(tree_grammar)

            tree_grammar.make_proper()
            try:
                disco_parser = DiscodopKbestParser(tree_grammar)
            except ValueError as ve:
                print(ve)
                print(tree.sent_label())
                print(tree)
                print(tree_bin)
                print(tree_grammar)
                raise ve

        merged_grammar.make_proper()
        disco_parser = DiscodopKbestParser(merged_grammar)
    def prepare_sm_parser(self):
        """Set ``self.parser`` for parsing with the latest latent annotation.

        The latent annotation of ``self.organizer.last_sm_cycle`` is used.
        Depending on ``self.parsing_mode`` one of the following is built:

        * ``"discodop-multi-method"``: a ``DiscodopKbestParser`` in latent
          Viterbi mode with three secondary objectives, plus a
          ``Coarse_to_fine_parser`` stored as its ``k_best_reranker``.
        * ``"best-latent-derivation"``: a 1-best ``GFParser_k_best`` over the
          grammar returned by ``build_sm_grammar``.
        * ``"k-best-rerank"`` / ``"latent-viterbi"`` (optionally suffixed with
          ``-GF`` or ``-disco-dop``): a base engine that is used directly for
          latent-viterbi and wrapped in a ``Coarse_to_fine_parser`` otherwise.
        * ``"max-rule-prod"`` / ``"max-rule-sum"`` / ``"variational"`` (same
          optional suffixes): a ``Coarse_to_fine_parser`` if the mode contains
          ``"GF"``, else a ``DiscodopKbestParser`` in projection mode.

        Raises:
            ValueError: if ``self.parsing_mode`` matches none of the above.
        """
        last_la = self.organizer.latent_annotations[
            self.organizer.last_sm_cycle]
        engine_suffixes = {"-GF", "-disco-dop", ""}

        if self.parsing_mode == "discodop-multi-method":
            if self.organizer.project_weights_before_parsing:
                self.project_weights()
            self.parser = DiscodopKbestParser(
                self.base_grammar,
                k=self.k_best,
                la=last_la,
                nontMap=self.organizer.nonterminal_map,
                variational=False,
                sum_op=False,
                cfg_ctf=self.disco_dop_params["cfg_ctf"],
                beam_beta=self.disco_dop_params["beam_beta"],
                beam_delta=self.disco_dop_params["beam_delta"],
                pruning_k=self.disco_dop_params["pruning_k"],
                grammarInfo=self.organizer.grammarInfo,
                projection_mode=False,
                latent_viterbi_mode=True,
                secondaries=[
                    "VARIATIONAL", "MAX-RULE-PRODUCT", "LATENT-RERANK"
                ])
            self.parser.k_best_reranker = Coarse_to_fine_parser(
                self.base_grammar,
                last_la,
                self.organizer.grammarInfo,
                self.organizer.nonterminal_map,
                base_parser=self.parser)

        elif self.parsing_mode == "best-latent-derivation":
            grammar = build_sm_grammar(last_la,
                                       self.base_grammar,
                                       self.organizer.grammarInfo,
                                       rule_pruning=0.0001,
                                       rule_smoothing=0.1)
            self.parser = GFParser_k_best(grammar=grammar,
                                          k=1,
                                          save_preprocessing=(self.directory,
                                                              "gfgrammar"))
        elif self.parsing_mode in {
                method + engine
                for method in {"k-best-rerank", "latent-viterbi"}
                for engine in engine_suffixes
        }:
            if self.organizer.project_weights_before_parsing:
                self.project_weights()
            if "disco-dop" in self.parsing_mode:
                engine = DiscodopKbestParser(
                    grammar=self.base_grammar,
                    k=self.k_best,
                    la=last_la,
                    nontMap=self.organizer.nonterminal_map,
                    grammarInfo=self.organizer.grammarInfo,
                    cfg_ctf=self.disco_dop_params["cfg_ctf"],
                    beam_beta=self.disco_dop_params["beam_beta"],
                    # Fixed: this previously read the "beam_beta" entry, so
                    # the configured beam_delta was silently ignored here.
                    beam_delta=self.disco_dop_params["beam_delta"],
                    pruning_k=self.disco_dop_params["pruning_k"],
                    latent_viterbi_mode="latent-viterbi" in self.parsing_mode)
            else:
                engine = GFParser_k_best(grammar=self.base_grammar,
                                         k=self.k_best,
                                         heuristics=self.heuristics,
                                         save_preprocessing=(self.directory,
                                                             "gfgrammar"))
            if "latent-viterbi" in self.parsing_mode:
                # Latent Viterbi parsing uses the base engine directly.
                self.parser = engine
            else:
                # k-best reranking wraps the engine in a coarse-to-fine parser.
                self.parser = Coarse_to_fine_parser(
                    self.base_grammar,
                    last_la,
                    self.organizer.grammarInfo,
                    self.organizer.nonterminal_map,
                    base_parser=engine)
        elif self.parsing_mode in {
                method + engine
                for method in {"max-rule-prod", "max-rule-sum", "variational"}
                for engine in engine_suffixes
        }:
            if self.organizer.project_weights_before_parsing:
                self.project_weights()
            if "GF" in self.parsing_mode:
                self.parser = Coarse_to_fine_parser(
                    self.base_grammar,
                    last_la,
                    self.organizer.grammarInfo,
                    nontMap=self.organizer.nonterminal_map,
                    base_parser_type=GFParser_k_best,
                    k=self.k_best,
                    heuristics=self.heuristics,
                    save_preprocessing=(self.directory, "gfgrammar"),
                    mode=self.parsing_mode,
                    variational="variational" in self.parsing_mode,
                    sum_op="sum" in self.parsing_mode)
            else:
                self.parser = DiscodopKbestParser(
                    self.base_grammar,
                    k=self.k_best,
                    la=last_la,
                    nontMap=self.organizer.nonterminal_map,
                    variational="variational" in self.parsing_mode,
                    sum_op="sum" in self.parsing_mode,
                    cfg_ctf=self.disco_dop_params["cfg_ctf"],
                    beam_beta=self.disco_dop_params["beam_beta"],
                    beam_delta=self.disco_dop_params["beam_delta"],
                    pruning_k=self.disco_dop_params["pruning_k"],
                    grammarInfo=self.organizer.grammarInfo,
                    projection_mode=True)

        else:
            raise ValueError("Unknown parsing mode %s" % self.parsing_mode)