Example #1
    def test_la_viterbi_parsing_2(self):
        grammar = self.build_paper_grammar()
        inp = ["a"] * 3
        nontMap = Enumerator()
        gi = PyGrammarInfo(grammar, nontMap)
        sm = PyStorageManager()
        print(nontMap.object_index("S"))
        print(nontMap.object_index("B"))
        la = build_PyLatentAnnotation(
            [2, 1], [1.0], [[0.25, 1.0], [1.0, 0.0],
                            [0.0, 0.5, 0.25, 0.0, 0.0, 0.0, 0.0, 0.0]], gi, sm)
        self.assertTrue(la.is_proper())

        parser = DiscodopKbestParser(grammar,
                                     la=la,
                                     nontMap=nontMap,
                                     grammarInfo=gi,
                                     latent_viterbi_mode=True)
        parser.set_input(inp)
        parser.parse()
        self.assertTrue(parser.recognized())
        der = parser.latent_viterbi_derivation(True)
        print(der)
        ranges = {der.spanned_ranges(idx)[0] for idx in der.ids()}
        self.assertSetEqual({(0, 3), (0, 2), (0, 1), (1, 2), (2, 3)}, ranges)
Example #2
    def test_json_grammar_export(self):
        dog = build_acyclic_dog()
        terminals = Enumerator()
        data = dog.export_graph_json(terminals)
        with open('/tmp/json_graph_1.json', 'w') as file:
            json.dump(data, file)

        dsg = build_dsg()
        data = dsg.export_bihypergraph_json(terminals)
        with open('/tmp/json_bigraph_1.json', 'w') as file:
            json.dump(data, file)

        rule_dog = dog_se()
        data2 = rule_dog.export_graph_json(terminals)
        with open('/tmp/json_nonterminal_graph_1.json', 'w') as file:
            json.dump(data2, file)

        terminals.print_index()

        dsg = build_dsg()
        rec_part_strategy = the_recursive_partitioning_factory().get_partitioning('right-branching')[0]
        rec_part = rec_part_strategy(dsg)
        dcmp = compute_decomposition(dsg, rec_part)

        grammar = induce_grammar_from(dsg, rec_part, dcmp, labeling=simple_labeling, terminal_labeling=str)

        print(grammar)
        data = export_dog_grammar_to_json(grammar, terminals)
        with open('/tmp/json_grammar.json', 'w') as file:
            json.dump(data, file)

        with open('/tmp/json_corpus.json', 'w') as file:
            json.dump(export_corpus_to_json([dsg], terminals), file)
Example #3
    def test_la_viterbi_parsing_3(self):
        grammar = LCFRS("S")

        # rule 0
        lhs = LCFRS_lhs("B")
        lhs.add_arg(["a"])
        grammar.add_rule(lhs, [], 0.25)

        # rule 1
        lhs = LCFRS_lhs("A")
        lhs.add_arg(["a"])
        grammar.add_rule(lhs, [], 0.5)

        # rule 2
        lhs = LCFRS_lhs("S")
        lhs.add_arg([LCFRS_var(0, 0)])
        grammar.add_rule(lhs, ["B"], 1.0)

        # rule 3
        lhs = LCFRS_lhs("A")
        lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
        grammar.add_rule(lhs, ["A", "B"], 0.5)

        # rule 4
        lhs = LCFRS_lhs("B")
        lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
        grammar.add_rule(lhs, ["A", "B"], 0.75)

        grammar.make_proper()

        inp = ["a"] * 3

        nontMap = Enumerator()
        gi = PyGrammarInfo(grammar, nontMap)
        sm = PyStorageManager()
        print(nontMap.object_index("S"))
        print(nontMap.object_index("B"))

        la = build_PyLatentAnnotation_initial(grammar, gi, sm)
        parser = DiscodopKbestParser(grammar,
                                     la=la,
                                     nontMap=nontMap,
                                     grammarInfo=gi,
                                     latent_viterbi_mode=True)
        parser.set_input(inp)
        parser.parse()
        self.assertTrue(parser.recognized())
        der = parser.latent_viterbi_derivation(True)
        print(der)

        der2 = None

        for w, der_ in parser.k_best_derivation_trees():
            if der2 is None:
                der2 = der_
            print(w, der_)

        print(der2)
Example #4
    def __test_projection(self,
                          split_weights,
                          goal_weights,
                          merge_method=False):
        grammar = LCFRS("S")
        # rule 0
        lhs = LCFRS_lhs("S")
        lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
        grammar.add_rule(lhs, ["A", "A"])

        # rule 1
        lhs = LCFRS_lhs("A")
        lhs.add_arg(["a"])
        grammar.add_rule(lhs, [])

        lhs = LCFRS_lhs("A")
        lhs.add_arg(["b"])
        grammar.add_rule(lhs, [], weight=2.0)

        grammar.make_proper()
        # print(grammar)

        nonterminal_map = Enumerator()
        grammarInfo = PyGrammarInfo(grammar, nonterminal_map)
        storageManager = PyStorageManager()

        la = build_PyLatentAnnotation([1, 2], [1.0], split_weights,
                                      grammarInfo, storageManager)

        # parser = LCFRS_parser(grammar)
        # parser.set_input(["a", "b"])
        # parser.parse()
        # der = parser.best_derivation_tree()

        # print(la.serialize())
        if merge_method:
            la.project_weights(grammar, grammarInfo)
        else:
            splits, _, _ = la.serialize()
            merge_sources = [[[
                split for split in range(0, splits[nont_idx])
            ]] for nont_idx in range(0, nonterminal_map.get_counter())]

            # print("Projecting to fine grammar LA", file=self.logger)
            coarse_la = la.project_annotation_by_merging(grammarInfo,
                                                         merge_sources,
                                                         debug=False)
            coarse_la.project_weights(grammar, grammarInfo)

        # print(grammar)
        for i in range(3):
            self.assertAlmostEqual(
                grammar.rule_index(i).weight(), goal_weights[i])
Example #5
    def test_induction_2(self):
        def rec_part(tree):
            return left_branching_partitioning(len(tree.id_yield()))

        features = defaultdict(lambda: 0)
        grammar = fringe_extract_lcfrs(self.tree3,
                                       rec_part(self.tree3),
                                       naming="child",
                                       feature_logging=features,
                                       isolate_pos=True)
        grammar.make_proper()

        if False:
            for idx in range(0, len(grammar.rules())):
                print(grammar.rule_index(idx))
                for key in features:
                    if key[0] == idx:
                        print(key, features[key])
                print()
            for key in features:
                if type(key[0]) == int:
                    continue
                print(key, features[key])

        nont_splits, root_weights, rule_weights, _ = build_nont_splits_dict(
            grammar,
            features,
            nonterminals=Enumerator(),
            feat_function=pos_cat_feats,
            debug=True)
        print(nont_splits)
        print(root_weights)
        print(rule_weights)
Example #6
    def test_something(self):
        grammar, r1, r2 = self.build_grammar()
        nont_map = Enumerator()
        grammarInfo = PyGrammarInfo(grammar, nont_map)

        def w(x):
            return "S", x

        rtg = RTG(w(3))
        rtg.construct_and_add_rule(w(3), r1, [w(1), w(2)])
        rtg.construct_and_add_rule(w(3), r1, [w(2), w(1)])
        rtg.construct_and_add_rule(w(2), r1, [w(1), w(1)])
        rtg.construct_and_add_rule(w(1), r2, [])

        rtg2 = RTG(("A", 3))

        rtg3 = RTG(w(3))
        rtg3.construct_and_add_rule(w(3), r1, [w(1), w(2)])
        rtg3.construct_and_add_rule(w(3), r1, [w(2), w(1)])
        rtg3.construct_and_add_rule(w(2), r2, [w(1), w(1)])
        rtg3.construct_and_add_rule(w(1), r2, [])

        traces = PyDerivationManager(grammar, nont_map)
        traces.convert_rtgs_to_hypergraphs([rtg, rtg2, rtg3])

        self.assertTrue(
            traces.is_consistent_with_grammar(grammarInfo, traceId=0))
        self.assertFalse(
            traces.is_consistent_with_grammar(grammarInfo, traceId=1))
        self.assertFalse(
            traces.is_consistent_with_grammar(grammarInfo, traceId=2))
Example #7
    def test_json_export(self):
        dog = build_acyclic_dog()
        terminals = Enumerator()
        data = dog.export_graph_json(terminals)
        with open('/tmp/json_graph_1.json', 'w') as file:
            json.dump(data, file)

        dsg = build_dsg()
        data = dsg.export_bihypergraph_json(terminals)
        with open('/tmp/json_bigraph_1.json', 'w') as file:
            json.dump(data, file)

        rule_dog = dog_se()
        data2 = rule_dog.export_graph_json(terminals)
        with open('/tmp/json_nonterminal_graph_1.json', 'w') as file:
            json.dump(data2, file)

        terminals.print_index()
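
All of these snippets use the same Enumerator pattern: on first lookup, object_index hands an object the next free integer index and reuses it afterwards, and the filled enumerator is then passed to the grammar, export, and training machinery. Below is a minimal stand-in that illustrates this behaviour under the assumptions visible in the tests (indices start at first_index, get_counter reports the number of objects seen, print_index dumps the mapping); it is only a sketch, not the library class.

import sys


class MiniEnumerator:
    """Illustrative stand-in for Enumerator; not the library implementation."""

    def __init__(self, first_index=0):
        self.first_index = first_index
        self._obj2idx = {}

    def object_index(self, obj):
        # assign the next free index on first encounter, afterwards reuse it
        if obj not in self._obj2idx:
            self._obj2idx[obj] = self.first_index + len(self._obj2idx)
        return self._obj2idx[obj]

    def get_counter(self):
        # number of distinct objects indexed so far
        return len(self._obj2idx)

    def print_index(self, to_file=sys.stdout):
        for obj, idx in sorted(self._obj2idx.items(), key=lambda kv: kv[1]):
            print(idx, obj, file=to_file)


enum = MiniEnumerator(first_index=1)
print(enum.object_index("S"), enum.object_index("B"), enum.object_index("S"))  # 1 2 1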
Example #8
    def test_projection_based_parser_k_best_hack(self):
        grammar = LCFRS("S")

        # rule 0
        lhs = LCFRS_lhs("B")
        lhs.add_arg(["a"])
        grammar.add_rule(lhs, [], 0.25)

        # rule 1
        lhs = LCFRS_lhs("A")
        lhs.add_arg(["a"])
        grammar.add_rule(lhs, [], 0.5)

        # rule 2
        lhs = LCFRS_lhs("S")
        lhs.add_arg([LCFRS_var(0, 0)])
        grammar.add_rule(lhs, ["B"], 1.0)

        # rule 3
        lhs = LCFRS_lhs("A")
        lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
        grammar.add_rule(lhs, ["A", "B"], 0.5)

        # rule 4
        lhs = LCFRS_lhs("B")
        lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
        grammar.add_rule(lhs, ["A", "B"], 0.75)

        grammar.make_proper()

        inp = ["a"] * 3
        nontMap = Enumerator()
        gi = PyGrammarInfo(grammar, nontMap)
        sm = PyStorageManager()
        la = build_PyLatentAnnotation_initial(grammar, gi, sm)

        parser = Coarse_to_fine_parser(grammar,
                                       la,
                                       gi,
                                       nontMap,
                                       base_parser_type=GFParser_k_best)
        parser.set_input(inp)
        parser.parse()
        self.assertTrue(parser.recognized())
        der = parser.max_rule_product_derivation()
        print(der)

        der = parser.best_derivation_tree()
        print(der)

        for node in der.ids():
            print(der.getRule(node), der.spanned_ranges(node))
Example #9
    def test_la_viterbi_parsing(self):
        grammar = self.build_grammar()
        inp = ["a"] * 3
        nontMap = Enumerator()
        gi = PyGrammarInfo(grammar, nontMap)
        sm = PyStorageManager()
        la = build_PyLatentAnnotation_initial(grammar, gi, sm)

        parser = DiscodopKbestParser(grammar, la=la, nontMap=nontMap, grammarInfo=gi, latent_viterbi_mode=True)
        parser.set_input(inp)
        parser.parse()
        self.assertTrue(parser.recognized())
        der = parser.best_derivation_tree()
        print(der)

        for node in der.ids():
            print(node, der.getRule(node), der.spanned_ranges(node))
Example #10
    def test_projection_based_parser_k_best_hack(self):
        grammar = self.build_grammar()
        inp = ["a"] * 3
        nontMap = Enumerator()
        gi = PyGrammarInfo(grammar, nontMap)
        sm = PyStorageManager()
        la = build_PyLatentAnnotation_initial(grammar, gi, sm)

        parser = Coarse_to_fine_parser(grammar,
                                       la,
                                       gi,
                                       nontMap,
                                       base_parser_type=GFParser_k_best)
        parser.set_input(inp)
        parser.parse()
        self.assertTrue(parser.recognized())
        der = parser.max_rule_product_derivation()
        print(der)

        der = parser.best_derivation_tree()
        print(der)

        for node in der.ids():
            print(der.getRule(node), der.spanned_ranges(node))
Example #11
    def test_induction(self):
        naming = 'child'

        def rec_part(tree):
            return left_branching_partitioning(len(tree.id_yield()))
            # return fanout_k_left_to_right(tree, 1)

        tree = self.tree
        tree.add_to_root("VP1")

        feature_log1 = defaultdict(lambda: 0)

        grammar = fringe_extract_lcfrs(tree,
                                       rec_part(tree),
                                       feature_logging=feature_log1,
                                       naming=naming)

        for key in feature_log1:
            print(key, feature_log1[key])

        print(grammar)

        feats = defaultdict(lambda: 0)
        grammar_ = fringe_extract_lcfrs(tree,
                                        rec_part(tree),
                                        isolate_pos=True,
                                        feature_logging=feats,
                                        naming=naming)

        print(grammar_)

        for key in feats:
            print(key, feats[key])

        print("Adding 2nd grammar to first")

        grammar.add_gram(grammar_, feature_logging=(feature_log1, feats))
        for idx in range(0, len(grammar.rules())):
            print(idx, grammar.rule_index(idx))

        print("Adding 3rd grammar to first")
        feats3 = defaultdict(lambda: 0)
        grammar3 = fringe_extract_lcfrs(self.tree2,
                                        rec_part(self.tree2),
                                        isolate_pos=True,
                                        feature_logging=feats3,
                                        naming=naming)
        grammar.add_gram(grammar3, feature_logging=(feature_log1, feats3))

        print()
        for idx in range(0, len(grammar.rules())):
            print(idx, grammar.rule_index(idx))
        print()
        print("New feature log")
        print()
        for key in feature_log1:
            print(key, feature_log1[key])
        grammar.make_proper()

        build_nont_splits_dict(grammar,
                               feature_log1,
                               nonterminals=Enumerator())

        print(grammar.rule_index(0))
        print(grammar.rule_index(2))
Example #12
    def test_json_corpus_grammar_export(self):
        start = 1
        stop = 50
        # path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
        path = "res/tiger/tiger_8000.xml"
        exclude = []
        dsgs = sentence_names_to_deep_syntax_graphs(
            ['s' + str(i) for i in range(start, stop + 1) if i not in exclude],
            path,
            hold=False)

        rec_part_strategy = the_recursive_partitioning_factory().get_partitioning('cfg')[0]

        def label_edge(edge):
            if isinstance(edge.label, ConstituentTerminal):
                return edge.label.pos()
            else:
                return edge.label

        nonterminal_labeling = lambda nodes, dsg: simple_labeling(nodes, dsg, label_edge)

        term_labeling_token = PosTerminals()

        def term_labeling(token):
            if isinstance(token, ConstituentTerminal):
                return term_labeling_token.token_label(token)
            else:
                return token

        grammar = induction_on_a_corpus(dsgs, rec_part_strategy, nonterminal_labeling, term_labeling)
        grammar.make_proper()

        terminals = Enumerator()

        data = export_dog_grammar_to_json(grammar, terminals)
        grammar_path = '/tmp/json_grammar.json'
        with open(grammar_path, 'w') as file:
            json.dump(data, file)

        corpus_path = '/tmp/json_corpus.json'
        with open(corpus_path, 'w') as file:
            json.dump(export_corpus_to_json(dsgs, terminals, terminal_labeling=term_labeling), file)

        with open('/tmp/enumerator.enum', 'w') as file:
            terminals.print_index(file)

        reduct_dir = '/tmp/reduct_grammars'
        if os.path.isdir(reduct_dir):
            shutil.rmtree(reduct_dir)
        os.makedirs(reduct_dir)
        p = subprocess.Popen([' '.join(
            ["java", "-jar", os.path.join("util", SCHICK_PARSER_JAR), 'dog-reduct', '-g', grammar_path, '-t',
             corpus_path, "-o", reduct_dir])], shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

        print("stdout", p.stdout.name)

        while True:
            nextline = p.stdout.readline()
            if nextline == b'' and p.poll() is not None:
                break
            print(nextline.decode('unicode_escape'), end='')
            # sys.stdout.write(nextline)
            # sys.stdout.flush()

        p.wait()
        p.stdout.close()
        self.assertEqual(0, p.returncode)

        rtgs = []
        for i in range(1, len(dsgs) + 1):
            rtgs.append(read_rtg('/tmp/reduct_grammars/' + str(i) + '.gra'))

        derivation_manager = PyDerivationManager(grammar)
        derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
        derivation_manager.serialize(bytes('/tmp/reduct_manager.trace', encoding='utf8'))

        f = lambda token: token.pos() if isinstance(token, ConstituentTerminal) else token

        for i, (rtg, dsg) in enumerate(zip(rtgs, dsgs)):
            derivations = [LCFRSDerivationWrapper(der) for der in derivation_manager.enumerate_derivations(i, grammar)]
            self.assertGreaterEqual(len(derivations), 1)
            if len(derivations) > 1:
                print("Sentence", i)
                for der in derivations:
                    print(der)

            for der in derivations:
                dog, sync = dog_evaluation(der)
                dsg2 = DeepSyntaxGraph(der.compute_yield(), dog, sync)
                dsg.dog.project_labels(f)
                dsg.sentence = list(map(f, dsg.sentence))
                self.assertEqual(dsg.sentence, dsg2.sentence)
                morphs = dsg.dog.compute_isomorphism(dsg2.dog)
                self.assertFalse(morphs is None)
                self.assertListEqual([[morphs[0].get(node, node) for node in syncs]
                                      for syncs in dsg.synchronization], dsg2.synchronization)
        pass
Example #13
def serialize_acyclic_dogs_to_negra(dsg, sec_edge_to_terminal=False):
    """
    converts a sequence of acyclic syntax graphs to the negra export format
    :type dsg: DeepSyntaxGraph
    :type sec_edge_to_terminal: bool
    :param sec_edge_to_terminal: if true, exports secondary edges with terminals as target
    """
    assert not dsg.dog.cyclic()
    assert len(dsg.sentence) < 500

    enum = Enumerator(first_index=500)
    # NB: contrary to the export standard, we index words starting from 1 (and not starting from 0)
    # NB: because 0 also refers to the virtual root (important for sec_edge_to_terminal == True)
    # NB: see http://www.coli.uni-saarland.de/~thorsten/publications/Brants-CLAUS98.pdf
    # NB: only relevant for TiGer s22084, probably annotation error
    synced_idxs = {idx: i + 1 for i, l in enumerate(dsg.synchronization) for idx in l}

    def idNum(tree_idx):
        if tree_idx in synced_idxs:
            return str(synced_idxs[tree_idx])
        else:
            return str(enum.object_index(tree_idx))

    # NB: here we enforce the indices to be topologically ordered as required by the export standard
    for idx in dsg.dog.topological_order():
        if idx not in synced_idxs:
            idNum(idx)

    lines = []

    for idx, token in enumerate(dsg.sentence):
        assert isinstance(token, ConstituentTerminal)
        # if not isinstance(token.form(), str):
        #     print(token.form(), type(token.form()))
        #     assert isinstance(token.form(), str)
        morph_order = ['person', 'case', 'number', 'tense', 'mood', 'gender', 'degree']
        morph = sorted(token.morph_feats(), key=lambda x: morph_order.index(x[0]))
        morph = '.'.join([str(x[1]) for x in morph if str(x[1]) != '--'])
        if morph == '':
            morph = u'--'
        line = [token.form(), token.pos(), morph]
        tree_idx = dsg.get_graph_position(idx)
        assert len(tree_idx) == 1
        tree_idx = tree_idx[0]

        parents = []
        if tree_idx in dsg.dog.outputs:
            parents.append(u'--')
            parents.append(u'0')

        for parent_idx in dsg.dog.parents:
            if not sec_edge_to_terminal and parent_idx in synced_idxs:
                continue
            edge = dsg.dog.incoming_edge(parent_idx)
            for j, child_idx in enumerate(edge.inputs):
                if child_idx == tree_idx:
                    if j in edge.primary_inputs:
                        parents = [edge.get_function(j), idNum(parent_idx)] + parents
                    else:
                        parents.append(edge.get_function(j))
                        parents.append(idNum(parent_idx))
        line += parents
        lines.append(u'\t'.join(line) + u'\n')

    category_lines = []
    for tree_idx in dsg.dog.nodes:
        token = dsg.dog.incoming_edge(tree_idx).label
        if isinstance(token, ConstituentTerminal):
            continue
        morph = u'--'

        line = ['#' + str(idNum(tree_idx)), token, morph]

        parents = []
        if tree_idx in dsg.dog.outputs:
            parents.append(u'--')
            parents.append(u'0')

        for parent_idx in dsg.dog.parents:
            if not sec_edge_to_terminal and parent_idx in synced_idxs:
                continue
            edge = dsg.dog.incoming_edge(parent_idx)
            for j, child_idx in enumerate(edge.inputs):
                if child_idx == tree_idx:
                    if j in edge.primary_inputs:
                        parents = [edge.get_function(j), idNum(parent_idx)] + parents
                    else:
                        parents.append(edge.get_function(j))
                        parents.append(idNum(parent_idx))
        line += parents

        category_lines.append(line)

    category_lines = sorted(category_lines, key=lambda x: x[0])

    for line in category_lines:
        lines.append(u'\t'.join(line) + u'\n')

    return lines
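
A minimal usage sketch for the function above (output path and sentence numbering are made up for illustration): it returns export lines for a single graph, so a corpus writer wraps each call in the usual #BOS/#EOS sentence brackets. It assumes dsgs holds deep syntax graphs with ConstituentTerminal tokens and acyclic DOGs, e.g. as read from TiGer in Example #12.

# hypothetical corpus writer; dsgs as in Example #12 (sentence_names_to_deep_syntax_graphs)
with open('/tmp/corpus.export', 'w') as export_file:
    for sent_id, dsg in enumerate(dsgs, start=1):
        export_file.write(u'#BOS %d\n' % sent_id)
        for line in serialize_acyclic_dogs_to_negra(dsg):
            export_file.write(line)
        export_file.write(u'#EOS %d\n' % sent_id)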
Example #14
def linearize(grammar,
              nonterminal_labeling,
              terminal_labeling,
              file,
              delimiter='::',
              nonterminal_encoder=None):
    """
    :type grammar: LCFRS
    :param nonterminal_labeling:
    :param terminal_labeling:
    :param file: file handle to write to
    :type delimiter: str
    :param delimiter: string used to join terminal symbol with edge label symbol
    :type nonterminal_encoder: Enumerator
    :param nonterminal_encoder: mapping that assigns unique non-negative integer to each nonterminal
    """
    print("Nonterminal Labeling: ", nonterminal_labeling, file=file)
    print("Terminal Labeling: ", terminal_labeling, file=file)
    print(file=file)

    terminals = Enumerator(first_index=1)
    if nonterminal_encoder is None:
        nonterminals = Enumerator()
    else:
        nonterminals = nonterminal_encoder
    num_inherited_args = {}
    num_synthesized_args = {}

    for rule in grammar.rules():
        rid = 'r%i' % (rule.get_idx() + 1)
        print(rid,
              'RTG   ',
              nonterminals.object_index(rule.lhs().nont()),
              '->',
              file=file,
              end=" ")
        print(list(
            map(lambda nont: nonterminals.object_index(nont), rule.rhs())),
              ';',
              file=file)

        print(rid, 'WEIGHT', rule.weight(), ';', file=file)

        sync_index = {}
        inh_args = defaultdict(lambda: 0)
        lhs_var_counter = CountLHSVars()
        synthesized_attributes = 0

        dcp_ordered = sorted(rule.dcp(),
                             key=lambda x: (x.lhs().mem(), x.lhs().arg()))

        for dcp in dcp_ordered:
            if dcp.lhs().mem() != -1:
                inh_args[dcp.lhs().mem()] += 1
            else:
                synthesized_attributes += 1
            lhs_var_counter.evaluate_list(dcp.rhs())
        num_inherited_args[nonterminals.object_index(
            rule.lhs().nont())] = inh_args[-1] = lhs_var_counter.get_number()
        num_synthesized_args[nonterminals.object_index(
            rule.lhs().nont())] = synthesized_attributes

        for dcp in dcp_ordered:
            printer = DcpPrinter(terminals.object_index,
                                 rule,
                                 sync_index,
                                 inh_args,
                                 delimiter=delimiter)
            printer.evaluate_list(dcp.rhs())
            var = dcp.lhs()
            if var.mem() == -1:
                var_string = 's<0,%i>' % (var.arg() + 1 - inh_args[-1])
            else:
                var_string = 's<%i,%i>' % (var.mem() + 1, var.arg() + 1)
            print('%s sDCP   %s == %s ;' % (rid, var_string, printer.string),
                  file=file)

        s = 0
        for j, arg in enumerate(rule.lhs().args()):
            print(rid, 'LCFRS  s<0,%i> == [' % (j + 1), end=' ', file=file)
            first = True
            for a in arg:
                if not first:
                    print(",", end=' ', file=file)
                if isinstance(a, LCFRS_var):
                    print("x<%i,%i>" % (a.mem + 1, a.arg + 1),
                          end=' ',
                          file=file)
                    pass
                else:
                    if s in sync_index:
                        print(str(terminals.object_index(a)) +
                              '^{%i}' % sync_index[s],
                              end=' ',
                              file=file)
                    else:
                        print(str(terminals.object_index(a)),
                              end=' ',
                              file=file)
                    s += 1
                first = False
            print('] ;', file=file)
        print(file=file)

    print("Terminals: ", file=file)
    terminals.print_index(to_file=file)
    print(file=file)

    print("Nonterminal ID, nonterminal name, fanout, #inh, #synth: ",
          file=file)
    max_fanout, max_inh, max_syn, max_args, fanouts, inherits, synths, args \
        = print_index_and_stats(nonterminals, grammar, num_inherited_args, num_synthesized_args, file=file)
    print(file=file)
    print("max fanout:", max_fanout, file=file)
    print("max inh:", max_inh, file=file)
    print("max synth:", max_syn, file=file)
    print("max args:", max_args, file=file)
    print(file=file)
    for s, d, m in [('fanout', fanouts, max_fanout),
                    ('inh', inherits, max_inh), ('syn', synths, max_syn),
                    ('args', args, max_args)]:
        for i in range(m + 1):
            print('# the number of nonterminals with %s = %i is %i' %
                  (s, i, d[i]),
                  file=file)
        print(file=file)
    print(file=file)

    print("Initial nonterminal: ",
          nonterminals.object_index(grammar.start()),
          file=file)
    print(file=file)
    return nonterminals, terminals
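
A minimal call sketch for linearize: the two labeling arguments are only echoed into the header of the output file, so any printable description works, and the returned enumerators map nonterminals and terminals back to the integer IDs used in the listing. The grammar and output path here are assumptions; the grammar needs sDCP annotations (rule.dcp()), e.g. a hybrid grammar induced with fringe_extract_lcfrs as in Example #11.

# hypothetical output path; grammar is an LCFRS/sDCP hybrid grammar as induced above
with open('/tmp/grammar_linearization.txt', 'w') as out:
    nonterminals, terminals = linearize(grammar,
                                        "child-naming",     # only printed into the header
                                        "pos-terminals",    # only printed into the header
                                        out)
print(nonterminals.object_index(grammar.start()))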
Example #15
def run_experiment(rec_part_strategy,
                   nonterminal_labeling,
                   exp,
                   reorder_children,
                   binarize=True):
    start = 1
    stop = 7000

    test_start = 7001
    test_stop = 7200

    # path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
    corpus_path = "res/tiger/tiger_8000.xml"
    exclude = []
    train_dsgs = sentence_names_to_deep_syntax_graphs(
        ['s' + str(i) for i in range(start, stop + 1) if i not in exclude],
        corpus_path,
        hold=False,
        reorder_children=reorder_children)
    test_dsgs = sentence_names_to_deep_syntax_graphs(
        [
            's' + str(i)
            for i in range(test_start, test_stop + 1) if i not in exclude
        ],
        corpus_path,
        hold=False,
        reorder_children=reorder_children)

    # Grammar induction
    term_labeling_token = PosTerminals()

    def term_labeling(token):
        if isinstance(token, ConstituentTerminal):
            return term_labeling_token.token_label(token)
        else:
            return token

    if binarize:

        def modify_token(token):
            if isinstance(token, ConstituentCategory):
                token_new = deepcopy(token)
                token_new.set_category(token.category() + '-BAR')
                return token_new
            elif isinstance(token, str):
                return token + '-BAR'
            else:
                assert False

        train_dsgs = [
            dsg.binarize(bin_modifier=modify_token) for dsg in train_dsgs
        ]

        def is_bin(token):
            if isinstance(token, ConstituentCategory):
                if token.category().endswith('-BAR'):
                    return True
            elif isinstance(token, str):
                if token.endswith('-BAR'):
                    return True
            return False

        def debinarize(dsg):
            return dsg.debinarize(is_bin=is_bin)

    else:
        debinarize = lambda dsg: dsg  # identity; the builtin id() would return an object id, not the graph

    grammar = induction_on_a_corpus(train_dsgs, rec_part_strategy,
                                    nonterminal_labeling, term_labeling)
    grammar.make_proper()

    print("Nonterminals", len(grammar.nonts()), "Rules", len(grammar.rules()))

    parser = GFParser_k_best(grammar, k=500)
    return do_parsing(parser,
                      test_dsgs,
                      term_labeling_token,
                      oracle=True,
                      debinarize=debinarize)
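    # NB: the early return above short-circuits the experiment; the reduct
    # computation and split/merge training below are currently not executed.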

    # Compute reducts, i.e., intersect grammar with each training dsg
    basedir = path.join('/tmp/dog_experiments', 'exp' + str(exp))
    reduct_dir = path.join(basedir, 'reduct_grammars')

    terminal_map = Enumerator()
    if not os.path.isdir(basedir):
        os.makedirs(basedir)
    data = export_dog_grammar_to_json(grammar, terminal_map)
    grammar_path = path.join(basedir, 'grammar.json')
    with open(grammar_path, 'w') as file:
        json.dump(data, file)

    corpus_path = path.join(basedir, 'corpus.json')
    with open(corpus_path, 'w') as file:
        json.dump(
            export_corpus_to_json(train_dsgs,
                                  terminal_map,
                                  terminal_labeling=term_labeling), file)

    with open(path.join(basedir, 'enumerator.enum'), 'w') as file:
        terminal_map.print_index(file)

    if os.path.isdir(reduct_dir):
        shutil.rmtree(reduct_dir)
    os.makedirs(reduct_dir)
    p = subprocess.Popen([
        ' '.join([
            "java", "-jar",
            os.path.join("util", SCHICK_PARSER_JAR), 'dog-reduct', '-g',
            grammar_path, '-t', corpus_path, "-o", reduct_dir
        ])
    ],
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT)

    while True:
        nextline = p.stdout.readline()
        if nextline == b'' and p.poll() is not None:
            break
        sys.stdout.write(nextline.decode('unicode_escape'))
        sys.stdout.flush()

    p.wait()
    p.stdout.close()

    rtgs = []
    for i in range(1, len(train_dsgs) + 1):
        rtgs.append(read_rtg(path.join(reduct_dir, str(i) + '.gra')))

    derivation_manager = PyDerivationManager(grammar)
    derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
    derivation_manager.serialize(path.join(basedir, 'reduct_manager.trace'))

    # Training
    ## prepare EM training
    em_epochs = 20
    seed = 0
    smoothing_factor = 0.01
    split_randomization = 0.01
    sm_cycles = 2
    merge_percentage = 50.0
    grammarInfo = PyGrammarInfo(grammar,
                                derivation_manager.get_nonterminal_map())
    storageManager = PyStorageManager()

    em_builder = PySplitMergeTrainerBuilder(derivation_manager, grammarInfo)
    em_builder.set_em_epochs(em_epochs)
    em_builder.set_simple_expector(threads=THREADS)
    emTrainer = em_builder.build()

    # randomize initial weights and do em training
    la_no_splits = build_PyLatentAnnotation_initial(grammar, grammarInfo,
                                                    storageManager)
    la_no_splits.add_random_noise(seed=seed)
    emTrainer.em_train(la_no_splits)
    la_no_splits.project_weights(grammar, grammarInfo)

    do_parsing(CFGParser(grammar), test_dsgs, term_labeling_token)
    return
    ## prepare SM training
    builder = PySplitMergeTrainerBuilder(derivation_manager, grammarInfo)
    builder.set_em_epochs(em_epochs)
    builder.set_split_randomization(1.0, seed + 1)
    builder.set_simple_expector(threads=THREADS)
    builder.set_smoothing_factor(smoothingFactor=smoothing_factor)
    builder.set_split_randomization(percent=split_randomization)
    # builder.set_scc_merger(-0.2)
    builder.set_percent_merger(merge_percentage)
    splitMergeTrainer = builder.build()

    # splitMergeTrainer.setMaxDrops(validationDropIterations, mode="smoothing")
    splitMergeTrainer.setEMepochs(em_epochs, mode="smoothing")

    # set initial latent annotation
    latentAnnotation = [la_no_splits]

    # carry out split/merge training and do parsing
    parsing_method = "filter-ctf"
    # parsing_method = "single-best-annotation"
    k_best = 50
    for i in range(1, sm_cycles + 1):
        splitMergeTrainer.reset_random_seed(seed + i + 1)
        latentAnnotation.append(
            splitMergeTrainer.split_merge_cycle(latentAnnotation[-1]))
        print("Cycle: ", i)
        if parsing_method == "single-best-annotation":
            smGrammar = latentAnnotation[i].build_sm_grammar(
                grammar, grammarInfo, rule_pruning=0.0001, rule_smoothing=0.1)
            print("Rules in smoothed grammar: ", len(smGrammar.rules()))
            parser = GFParser(smGrammar)
        elif parsing_method == "filter-ctf":
            latentAnnotation[-1].project_weights(grammar, grammarInfo)
            parser = Coarse_to_fine_parser(
                grammar,
                latentAnnotation[-1],
                grammarInfo,
                derivation_manager.get_nonterminal_map(),
                base_parser_type=GFParser_k_best,
                k=k_best)
        else:
            raise Exception()
        do_parsing(parser, test_dsgs, term_labeling_token)
        del parser