def setUp(self):
     """Build the 'Piet Marie helpen lezen' fixture tree used by the tests."""
     # Nodes are inserted in surface order; the trailing True marks the token
     # as part of the ordered yield.
     self.tree = HybridTree()
     self.tree.add_node("v1", construct_conll_token("Piet", "NP"), True)
     self.tree.add_node("v21", construct_conll_token("Marie", "N"), True)
     self.tree.add_node("v", construct_conll_token("helpen", "VP"), True)
     self.tree.add_node("v2", construct_conll_token("lezen", "V"), True)
     self.tree.add_child("v", "v2")
     self.tree.add_child("v", "v1")
     self.tree.add_child("v2", "v21")
     # Punctuation node: in the yield (True) but not connected to the tree
     # structure (False), mirroring how punctuation is handled elsewhere.
     self.tree.add_node("v3", construct_conll_token(".", "Punc"), True,
                        False)
     self.tree.add_to_root("v")
示例#2
0
    def test_multiroot(self):
        """Induction/parsing round-trip on a tree with multiple roots."""
        gold_tree = multi_dep_tree()
        pos_label = the_terminal_labeling_factory().get_strategy(
            'pos').token_label
        fanout_one = the_recursive_partitioning_factory().get_partitioning(
            'fanout-1')
        for naming in ['strict', 'child']:
            labeling = the_labeling_factory(
            ).create_simple_labeling_strategy(naming, 'pos+deprel')
            for partitioning in [[direct_extraction], fanout_one,
                                 [left_branching]]:
                _, grammar = induce_grammar([gold_tree], labeling, pos_label,
                                            partitioning, 'START')
                print(grammar)

                parser = LCFRS_parser(grammar, 'pA pB pC pD pE'.split(' '))
                print(parser.best_derivation_tree())

                # Strip edge labels before comparing with the parser output.
                stripped_tokens = copy.deepcopy(gold_tree.full_token_yield())
                for token in stripped_tokens:
                    token.set_edge_label('_')
                result_tree = parser.dcp_hybrid_tree_best_derivation(
                    HybridTree(), stripped_tokens, True, construct_conll_token)
                print(result_tree)
                self.assertEqual(gold_tree, result_tree)
示例#3
0
    def test_minimum_risk_parsing(self):
        """Compare the minimum-risk tree with the best derivation's tree.

        Induces a grammar from a prefix of the TIGER training corpus, parses
        a prefix of the same corpus with a k-best parser (k=50), checks that
        the k-best derivations are pairwise distinct, and prints both trees
        whenever the minimum-risk tree differs from the 1-best tree.
        """
        limit_train = 20
        limit_test = 10
        train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
        test = train
        parser_type = GFParser_k_best
        # test = '../../res/dependency_conll/german/tiger/test/german_tiger_test.conll'
        trees = parse_conll_corpus(train, False, limit_train)
        primary_labelling = the_labeling_factory(
        ).create_simple_labeling_strategy("childtop", "deprel")
        term_labelling = the_terminal_labeling_factory().get_strategy('pos')
        start = 'START'
        recursive_partitioning = [cfg]

        (n_trees, grammar_prim) = induce_grammar(trees, primary_labelling,
                                                 term_labelling.token_label,
                                                 recursive_partitioning, start)

        parser_type.preprocess_grammar(grammar_prim)
        tree_yield = term_labelling.prepare_parser_input

        trees = parse_conll_corpus(test, False, limit_test)

        for i, tree in enumerate(trees):
            print("Parsing sentence ", i, file=stderr)

            parser = parser_type(grammar_prim,
                                 tree_yield(tree.token_yield()),
                                 k=50)

            self.assertTrue(parser.recognized())

            derivations = list(parser.k_best_derivation_trees())
            print("# derivations: ", len(derivations), file=stderr)
            h_trees = []
            weights = []
            derivation_list = []
            for weight, der in derivations:
                # Derivations in the k-best list must be pairwise distinct.
                self.assertTrue(der not in derivation_list)
                derivation_list.append(der)

                dcp = DCP_evaluator(der).getEvaluation()
                h_tree = HybridTree()
                cleaned_tokens = copy.deepcopy(tree.full_token_yield())
                dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                                  construct_conll_token)

                h_trees.append(h_tree)
                weights.append(weight)

            # Use `!=` instead of calling __eq__ directly: the operator
            # handles reflected comparison and NotImplemented correctly.
            min_risk_tree = compute_minimum_risk_tree(h_trees, weights)
            if min_risk_tree != h_trees[0]:
                print(h_trees[0])
                print(min_risk_tree)
示例#4
0
def fall_back_left_branching_token(clean_tokens):
    """Build a left-branching dependency tree over *clean_tokens*.

    Each token becomes the child of its left neighbour; the first token is
    the root. Edge labels are reset to '_'.
    """
    fallback = HybridTree()
    for position, token in enumerate(clean_tokens):
        token.set_edge_label('_')
        fallback.add_node(position, token, True)
        if position:
            fallback.add_child(position - 1, position)
        else:
            fallback.add_to_root(position)
    return fallback
示例#5
0
def fall_back_left_branching(forms, poss):
    """Build a left-branching dependency tree from word forms and POS tags.

    Tokens are constructed pairwise from *forms* and *poss*; each token
    becomes the child of its left neighbour, and the first token is the root.
    """
    fallback = HybridTree()
    for position, (form, pos) in enumerate(zip(forms, poss)):
        token = construct_conll_token(form, pos)
        token.set_edge_label('_')
        fallback.add_node(position, token, True)
        if position:
            fallback.add_child(position - 1, position)
        else:
            fallback.add_to_root(position)
    return fallback
示例#6
0
    def test_grammar_export(self):
        """Induce a grammar, export it to GF, compile, parse and evaluate."""
        first_tree = hybrid_tree_1()
        second_tree = hybrid_tree_2()
        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

        _, grammar = induce_grammar(
            [first_tree, second_tree],
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos'),
            terminal_labeling.token_label,
            [direct_extraction],
            'START')
        print(max(grammar.fanout(nont) for nont in grammar.nonts()))
        print(grammar)

        prefix = '/tmp/'
        name = 'tmpGrammar'

        name_ = export(grammar, prefix, name)

        # The exported grammar must compile cleanly with GF.
        self.assertEqual(0, compile_gf_grammar(prefix, name_))

        GFParser.preprocess_grammar(grammar)

        pos_sequence = ["NP", "N", "V", "V", "V"]
        parser = GFParser(grammar, pos_sequence)
        self.assertTrue(parser.recognized())

        derivation = parser.best_derivation_tree()
        self.assertTrue(
            derivation.check_integrity_recursive(derivation.root_id(),
                                                 grammar.start()))

        print(derivation)
        print(
            derivation_to_hybrid_tree(derivation, pos_sequence,
                                      "Piet Marie helpen lezen leren".split(),
                                      construct_conll_token))

        # Evaluate the sDCP component and materialise it as a hybrid tree.
        evaluation = DCP_evaluator(derivation).getEvaluation()
        output_tree = HybridTree()
        tokens = [
            construct_conll_token(form, lemma)
            for form, lemma in zip('Piet Marie helpen lezen leren'.split(' '),
                                   'NP N V V V'.split(' '))
        ]
        dcp_to_hybridtree(output_tree, evaluation, tokens, False,
                          construct_conll_token)

        print(output_tree)
    def test_dcp_evaluation_with_induced_dependency_grammar(self):
        """Induce a grammar from two example trees and check that the sDCP
        evaluation of every successful derivation yields a hybrid tree.
        """
        tree = hybrid_tree_1()

        print(tree)

        tree2 = hybrid_tree_2()

        print(tree2)
        # print tree.recursive_partitioning()

        labeling = the_labeling_factory().create_simple_labeling_strategy(
            'child', 'pos')
        term_pos = the_terminal_labeling_factory().get_strategy(
            'pos').token_label
        (_, grammar) = induce_grammar([tree, tree2], labeling, term_pos,
                                      [direct_extraction], 'START')

        # print grammar

        # well_formed() is expected to report no defect (None); ordered()
        # returns a pair whose first component signals orderedness.
        self.assertEqual(grammar.well_formed(), None)
        self.assertEqual(grammar.ordered()[0], True)
        # print max([grammar.fanout(nont) for nont in grammar.nonts()])
        print(grammar)

        parser = Parser(grammar, 'NP N V V'.split(' '))

        self.assertEqual(parser.recognized(), True)

        # Rebuild a derivation tree for every successful parse item.
        for item in parser.successful_root_items():
            der = Derivation()
            derivation_tree(der, item, None)
            print(der)

            hybrid_tree = derivation_to_hybrid_tree(
                der, 'NP N V V'.split(' '),
                'Piet Marie helpen lezen'.split(' '),
                construct_constituent_token)
            print(hybrid_tree)

            # Evaluate the sDCP component and materialise it as a hybrid tree.
            dcp = DCP_evaluator(der).getEvaluation()
            h_tree_2 = HybridTree()
            token_sequence = [
                construct_conll_token(form, lemma)
                for form, lemma in zip('Piet Marie helpen lezen'.split(' '),
                                       'NP N V V'.split(' '))
            ]
            dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                              construct_conll_token)
示例#8
0
    def test_cfg_parser(self):
        """LCFRS and CFG parsers both produce an integral best derivation."""
        first_tree = hybrid_tree_1()
        second_tree = hybrid_tree_2()
        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

        _, grammar = induce_grammar(
            [first_tree, second_tree],
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos'), terminal_labeling.token_label, [cfg], 'START')

        for parser_class in [LCFRS_parser, CFGParser]:
            parser_class.preprocess_grammar(grammar)

            pos_sequence = ["NP", "N", "V", "V", "V"]
            parser = parser_class(grammar, pos_sequence)
            self.assertTrue(parser.recognized())

            derivation = parser.best_derivation_tree()
            self.assertTrue(
                derivation.check_integrity_recursive(derivation.root_id(),
                                                     grammar.start()))

            print(derivation)
            print(
                derivation_to_hybrid_tree(
                    derivation, pos_sequence,
                    "Piet Marie helpen lezen leren".split(),
                    construct_conll_token))

            # Evaluate the sDCP component and rebuild a hybrid tree from it.
            evaluation = DCP_evaluator(derivation).getEvaluation()
            result_tree = HybridTree()
            tokens = [
                construct_conll_token(form, lemma) for form, lemma in zip(
                    'Piet Marie helpen lezen leren'.split(' '),
                    'NP N V V V'.split(' '))
            ]
            dcp_to_hybridtree(result_tree, evaluation, tokens, False,
                              construct_conll_token)

            print(result_tree)
    def test_basic_sdcp_parsing_dependency(self):
        """Smoke test for the LCFRS/sDCP parser on dependency trees."""
        first_tree = hybrid_tree_1()
        second_tree = hybrid_tree_2()

        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

        _, grammar = induce_grammar(
            [first_tree, second_tree],
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos'), terminal_labeling.token_label, [cfg], 'START')

        print("grammar induced. Printing rules...", file=stderr)
        for rule in grammar.rules():
            print(rule, file=stderr)

        parser_type = LCFRS_sDCP_Parser

        print("preprocessing grammar", file=stderr)
        parser_type.preprocess_grammar(grammar, terminal_labeling)

        print("invoking parser", file=stderr)
        parser = parser_type(grammar, first_tree)

        print("listing derivations", file=stderr)
        for derivation in parser.all_derivation_trees():
            print(derivation)
            # Rebuild a hybrid tree from the sDCP evaluation and print it
            # next to the input tree for visual comparison.
            reconstructed = HybridTree()
            tokens = first_tree.token_yield()
            dcp_to_hybridtree(reconstructed,
                              DCP_evaluator(derivation).getEvaluation(),
                              tokens, False, construct_conll_token)
            print(first_tree)
            print(reconstructed)

        print("completed test", file=stderr)
示例#10
0
def parse_conll_corpus(path, ignore_punctuation, limit=sys.maxsize, start=0):
    """
    Lazily parse a dependency corpus (in CoNLL format) and generate hybrid trees.

    :param path: path to corpus
    :type path: str
    :param ignore_punctuation: exclude punctuation from tree structure
    :type ignore_punctuation: bool
    :param limit: stop generation after limit trees
    :type limit: int
    :param start: start generation with start'th tree
    :type start: int
    :return: a series of hybrid trees read from file
    :rtype: __generator[HybridTree]
    :raise Exception: unexpected input in corpus file
    """

    # print path
    with open(path) as file_content:
        tree_count = 0

        while tree_count < limit:
            tree = None

            # Skip comment lines preceding a sentence block; stop at EOF.
            try:
                line = next(file_content)
                while line.startswith('#'):
                    line = next(file_content)
            except StopIteration:
                break

            match = CONLL_LINE.match(line)
            while match:
                # Token id '1' marks the first token of a new sentence.
                if match.group(1) == '1':
                    tree_count += 1
                    tree = HybridTree('tree' + str(tree_count))

                # CoNLL columns: id, form, lemma, coarse POS, POS, feats,
                # head, dependency relation.
                node_id = match.group(1)
                form = match.group(2)
                lemma = match.group(3)
                cpos = match.group(4)
                pos = match.group(5)
                feats = match.group(6)
                parent = match.group(7)
                deprel = match.group(8)

                # We ignore information about multiple token's as present in the UD version of Prague Dep. TB
                if MULTI_TOKEN.search(node_id):
                  pass
                else:
                    # If punctuation is to be ignored, we
                    # remove it from the hybrid tree
                    # Punctuation according to definition
                    # cf. http://ilk.uvt.nl/conll/software.html#eval

                    # if not ignore_punctuation or form.translate(no_translation, string.punctuation):
                    tree.add_node(node_id, CoNLLToken(form, lemma, cpos, pos, feats, deprel), True, True)
                    if parent != '0':
                        tree.add_child(parent, node_id)
                    # else:
                    #    tree.add_node(node_id, CoNLLToken(form, lemma, pos, fine_grained_pos, feats, deprel), True, False)

                    # TODO: If punctuation is ignored and the root is punctuation,
                    # TODO: it is added to the tree anyhow.
                    # Head id '0' designates the sentence root.
                    if parent == '0':
                        tree.add_to_root(node_id)

                # Advance to the next token line, again skipping comments.
                # NOTE(review): this uses CONLL_LINE.search whereas the first
                # line used .match — confirm the pattern is anchored so both
                # behave identically.
                try:
                    line = next(file_content)
                    while line.startswith('#'):
                        line = next(file_content)
                    match = CONLL_LINE.search(line)
                except StopIteration:
                    line = ''
                    match = None

            # Assume empty line, otherwise raise exception
            match = EMPTY_LINE.match(line)
            if not match:
                raise Exception("Unexpected input in CoNLL corpus file.")

            if tree:
                # basic sanity checks
                if not tree.root:
                    # FIXME: ignoring punctuation may leads to malformed trees
                    print("non-rooted")
                    if ignore_punctuation:
                        continue
                    raise Exception
                    # elif root > 1:
                    # FIXME: turkish corpus contains trees with more than one root
                    # FIXME: currently, they are ignored
                    # continue
                elif tree.n_nodes() != len(tree.id_yield()) or len(tree.nodes()) != len(tree.full_yield()):
                    # FIXME: ignoring punctuation may leads to malformed trees
                    if ignore_punctuation:
                        continue
                    raise Exception(
                        '{4}: connected nodes: {0}, total nodes: {1}, full yield: {2}, connected yield: {3}'.format(
                            str(tree.n_nodes()), str(len(tree.nodes())), str(len(tree.full_yield())),
                            str(len(tree.id_yield())), tree.sent_label()))
                # Skip the first `start` trees of the corpus.
                if tree_count > start:
                    yield tree
def query_result_tree(connection, exp, tree_id):
    """Look up the stored parse result for (*exp*, *tree_id*).

    :param connection: open database connection (sqlite3-style API)
    :param exp: experiment id
    :param tree_id: id of the gold tree
    :rtype: str, HybridTree
    :return: the result status and the reconstructed tree; when no stored
        parse exists (or its status is not usable), a left-branching
        fallback tree is built from the gold tokens instead.
    """
    cursor = connection.cursor()
    result_tree_ids = cursor.execute(
        '''SELECT rt_id, status FROM result_trees WHERE exp_id = ? AND t_id = ?''',
        (exp, tree_id)).fetchall()

    # parse:
    if result_tree_ids:
        # At most one stored result per (experiment, tree) pair.
        assert len(result_tree_ids) == 1
        result_tree_id, status = result_tree_ids[0]
        if status in ["parse", "fallback"]:
            name = cursor.execute('''SELECT name FROM trees WHERE t_id = ?''',
                                  (tree_id, )).fetchall()[0][0]
            tree_nodes = cursor.execute((
                ' SELECT tree_nodes.sent_position, label, pos, result_tree_nodes.head, result_tree_nodes.deprel FROM result_tree_nodes\n'
                '                JOIN result_trees\n'
                '                  ON result_tree_nodes.rt_id = result_trees.rt_id\n'
                '                JOIN tree_nodes\n'
                '                  ON result_trees.t_id = tree_nodes.t_id\n'
                '                  AND result_tree_nodes.sent_position = tree_nodes.sent_position\n'
                '                WHERE result_tree_nodes.rt_id = ?'),
                                        (result_tree_id, ))
            tree = HybridTree(name)
            for i, label, pos, head, deprel in tree_nodes:
                if deprel is None:
                    deprel = 'UNKNOWN'
                token = CoNLLToken(label, '_', pos, pos, '_', deprel)
                tree.add_node(str(i), token, True, True)
                if head == 0:
                    tree.add_to_root(str(i))
                else:
                    tree.add_child(str(head), str(i))
            # bug fix: `tree.root is not []` compared identity against a
            # fresh list and was always True; assert a non-empty root instead.
            assert tree.root != []
            return status, tree
    # legacy: no entry found
    else:
        status = "simple_fallback"

    # Create a left branching tree without labels as default strategy
    tree_nodes = cursor.execute(
        ''' SELECT tree_nodes.sent_position, label, pos FROM tree_nodes
        WHERE tree_nodes.t_id = ?''', (tree_id, )).fetchall()

    # Named functions instead of lambda assignments (PEP 8 / E731).
    def left_branch(position):
        return position - 1

    def right_branch(position):
        return position + 1

    strategy = left_branch

    length = len(tree_nodes)
    tree = HybridTree()
    for i, label, pos in tree_nodes:
        token = CoNLLToken(label, '_', pos, pos, '_', '_')
        tree.add_node(str(i), token, True, True)
        parent = strategy(i)
        # The first (resp. last) node becomes the root under left
        # (resp. right) branching.
        if (parent == 0
                and strategy == left_branch) or (parent == length + 1
                                                 and strategy == right_branch):
            tree.add_to_root(str(i))
        else:
            tree.add_child(str(parent), str(i))
    # bug fix: same always-true identity assert as above.
    assert tree.root != []
    return status, tree
示例#12
0
def disconnect_punctuation(trees):
    """
    :param trees: corpus of hybrid trees
    :type trees: __generator[HybridTree]
    :return: corpus of hybrid trees
    :rtype: __generator[GeneralHybridTree]
    lazily disconnect punctuation from each hybrid tree in a corpus of hybrid trees
    """
    for tree in trees:
        tree2 = HybridTree(tree.sent_label())
        # Keep only non-punctuation roots.
        for root_id in tree.root:
            if not is_punctuation(tree.node_token(root_id).form()):
                tree2.add_to_root(root_id)
        for id in tree.full_yield():
            token = tree.node_token(id)
            if not is_punctuation(token.form()):
                # Climb over punctuation ancestors to find a non-punctuation
                # parent that is still attached.
                parent = tree.parent(id)
                while parent and parent not in tree.root and is_punctuation(
                        tree.node_token(parent).form()):
                    parent = tree.parent(parent)
                # NOTE(review): after the loop this is true only when the
                # parent is a root that is itself punctuation — the node is
                # then promoted to a root. Confirm intended.
                if parent and is_punctuation(tree.node_token(parent).form()):
                    tree2.add_to_root(id)
                else:
                    tree2.add_child(parent, id)
                tree2.add_node(id, token, True, True)
            else:
                # Punctuation stays in the yield (True) but is left
                # disconnected from the tree structure (False).
                tree2.add_node(id, token, True, False)

        if tree2:
            # basic sanity checks
            if not tree2.root \
                    and len(tree2.id_yield()) == 0 \
                    and len(tree2.nodes()) == len(tree2.full_yield()):
                # Tree consists only of punctuation
                continue
            elif not tree2.root \
                    or tree2.n_nodes() != len(tree2.id_yield()) \
                    or len(tree2.nodes()) != len(tree2.full_yield()):
                # Malformed result: dump diagnostics before failing.
                print(tree)

                print(tree2)
                print(tree2.sent_label())
                print("Root:", tree2.root)
                print("Nodes: ", tree2.n_nodes())
                print("Id_yield:", len(tree2.id_yield()), tree2.id_yield())
                print("Nodes: ", len(tree2.nodes()))
                print("full yield: ", len(tree2.full_yield()))
                raise Exception()
            yield tree2
示例#13
0
def hybrid_tree_2():
    """Return the second example tree ('Piet Marie helpen leren lezen')."""
    tree = HybridTree()
    # Nodes are added in the same order as before: surface order of the yield.
    for node_id, form, pos, deprel in [
            ('v1', 'Piet', 'NP', 'SBJ'),
            ('v211', 'Marie', 'N', 'OBJ'),
            ('v', 'helpen', 'V', 'ROOT'),
            ('v2', 'leren', 'V', 'VBI'),
            ('v21', 'lezen', 'V', 'VFIN')]:
        tree.add_node(node_id, CoNLLToken(form, '_', pos, pos, '_', deprel),
                      True)
    for parent, child in [('v', 'v2'), ('v', 'v1'), ('v2', 'v21'),
                          ('v21', 'v211')]:
        tree.add_child(parent, child)
    tree.add_to_root('v')
    tree.reorder()
    return tree
    def generic_parsing_test(self, parser_type, limit_train, limit_test,
                             compare_order):
        """Induce a grammar from the TIGER training corpus and verify that
        *parser_type* reconstructs every test tree from its derivations.

        :param parser_type: parser class under test
        :param limit_train: number of training trees used for induction
        :param limit_test: number of test trees to parse
        :param compare_order: whether tree comparison respects token order
        """
        # NOTE(review): filter_by_id is defined but never used in this method.
        def filter_by_id(n, trees):
            j = 0
            for tree in trees:
                if j in n:
                    yield tree
                j += 1

        #params
        train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
        test = train
        # test = 'res/dependency_conll/german/tiger/test/german_tiger_test.conll'
        trees = parse_conll_corpus(train, False, limit_train)
        primary_labelling = the_labeling_factory(
        ).create_simple_labeling_strategy("childtop", "deprel")
        term_labelling = the_terminal_labeling_factory().get_strategy('pos')
        start = 'START'
        recursive_partitioning = [cfg]

        (n_trees, grammar_prim) = induce_grammar(trees, primary_labelling,
                                                 term_labelling.token_label,
                                                 recursive_partitioning, start)

        parser_type.preprocess_grammar(grammar_prim, term_labelling)

        trees = parse_conll_corpus(test, False, limit_test)

        count_derivs = {}
        no_complete_match = 0

        for i, tree in enumerate(trees):
            print("Parsing tree for ", i, file=stderr)

            print(tree, file=stderr)

            parser = parser_type(grammar_prim, tree)
            self.assertTrue(parser.recognized())
            count_derivs[i] = 0

            print("Found derivations for ", i, file=stderr)
            j = 0

            derivations = []

            # Every derivation must be integral and must reconstruct the
            # input tree via DCP evaluation.
            for der in parser.all_derivation_trees():
                self.assertTrue(
                    der.check_integrity_recursive(der.root_id(), start))

                print(count_derivs[i], file=stderr)
                print(der, file=stderr)

                output_tree = HybridTree()
                tokens = tree.token_yield()

                # Rebuild tokens from the derivation's terminal yield.
                the_yield = der.compute_yield()
                # print >>stderr, the_yield
                tokens2 = list(
                    map(lambda pos: construct_conll_token('_', pos),
                        the_yield))

                dcp_to_hybridtree(output_tree,
                                  DCP_evaluator(der).getEvaluation(),
                                  tokens2,
                                  False,
                                  construct_conll_token,
                                  reorder=False)
                print(tree, file=stderr)
                print(output_tree, file=stderr)

                self.compare_hybrid_trees(tree, output_tree, compare_order)
                count_derivs[i] += 1
                derivations.append(der)

            # Derivations must be pairwise different.
            self.assertTrue(
                sDCPParserTest.pairwise_different(
                    derivations, sDCPParserTest.compare_derivations))
            self.assertEqual(len(derivations), count_derivs[i])

            if count_derivs[i] == 0:
                no_complete_match += 1

        for key in count_derivs:
            print(key, count_derivs[key])

        print("# trees with no complete match:", no_complete_match)
示例#15
0
def do_parsing(grammar_prim,
               limit,
               ignore_punctuation,
               recompile=True,
               preprocess_path=None):
    """Parse the test corpus with *grammar_prim*, write CoNLL output and
    evaluate with eval.pl.

    Relies on the module-level names ``test``, ``result``, ``parser_type``
    and ``tree_yield``.

    :param grammar_prim: induced LCFRS used for parsing
    :param limit: maximal number of trees to read; also used as length bound
    :param ignore_punctuation: disconnect punctuation before parsing
    :param recompile: force re-running grammar preprocessing
    :param preprocess_path: location for cached preprocessing results
    """
    trees = parse_conll_corpus(test, False, limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)

    total_time = 0.0

    # Reuse a cached preprocessed grammar only if it exists and no
    # recompilation was requested.
    load_preprocess = preprocess_path
    if recompile or (not os.path.isfile(
            parser_type.resolve_path(preprocess_path))):
        load_preprocess = None

    parser = parser_type(grammar_prim,
                         save_preprocess=preprocess_path,
                         load_preprocess=load_preprocess)

    with open(result, 'w') as result_file:
        failures = 0
        for tree in trees:
            # NOTE(review): `limit` doubles as corpus-size bound and sentence
            # length bound — confirm this is intended.
            if len(tree.id_yield()) > limit:
                continue
            # bug fix: time.clock() was removed in Python 3.8; perf_counter
            # provides monotonic wall-clock timing.
            time_stamp = time.perf_counter()

            parser.set_input(tree_yield(tree.token_yield()))
            parser.parse()
            time_stamp = time.perf_counter() - time_stamp
            total_time += time_stamp

            cleaned_tokens = copy.deepcopy(tree.full_token_yield())
            for token in cleaned_tokens:
                token.set_edge_label('_')

            h_tree = HybridTree(tree.sent_label())

            if parser_type == GFParser_k_best and parser.recognized():
                # Extract the best tree from the k-best list.
                def der_to_tree(der):
                    return dcp_to_hybridtree(
                        HybridTree(),
                        DCP_evaluator(der).getEvaluation(),
                        copy.deepcopy(tree.full_token_yield()), False,
                        construct_conll_token)
                h_tree = parser.best_trees(der_to_tree)[0][0]
            elif parser_type == CFGParser \
                    or parser_type == GFParser \
                    or parser_type == LeftBranchingFSTParser \
                    or parser_type == RightBranchingFSTParser:
                h_tree = parser.dcp_hybrid_tree_best_derivation(
                    h_tree, cleaned_tokens, ignore_punctuation,
                    construct_conll_token)
            else:
                h_tree = None

            if h_tree:
                result_file.write(tree_to_conll_str(h_tree))
                result_file.write('\n\n')
            else:
                # Left-branching fallback keeps the output aligned with gold.
                failures += 1
                forms = [token.form() for token in tree.full_token_yield()]
                poss = [token.pos() for token in tree.full_token_yield()]
                result_file.write(
                    tree_to_conll_str(fall_back_left_branching(forms, poss)))
                result_file.write('\n\n')

            parser.clear()

    print("parse failures", failures)
    print("parse time", total_time)

    print("eval.pl", "no punctuation")
    p = subprocess.Popen(
        ["perl", "../util/eval.pl", "-g", test, "-s", result, "-q"])
    p.communicate()
    # bug fix: label typo ("punctation").
    print("eval.pl", "punctuation")
    p = subprocess.Popen(
        ["perl", "../util/eval.pl", "-g", test, "-s", result, "-q", "-p"])
    p.communicate()
示例#16
0
    def test_recursive_partitioning_transformation(self):
        """A flat 4-node tree partitions into the root set over singletons."""
        tree = HybridTree("mytree")
        node_ids = ['a', 'b', 'c', 'd']
        for node_id in node_ids:
            tree.add_node(node_id, CoNLLToken(node_id, '_', '_', '_', '_', '_'),
                          True, True)
            if node_id != 'a':
                tree.add_child('a', node_id)
        tree.add_to_root('a')

        print(tree)
        self.assertEqual([token.form() for token in tree.token_yield()],
                         node_ids)
        self.assertEqual(tree.recursive_partitioning(),
                         ({0, 1, 2, 3}, [({0}, []), ({1}, []),
                                         ({2}, []), ({3}, [])]))
        print(tree.recursive_partitioning())

        [fanout_1] = the_recursive_partitioning_factory().get_partitioning(
            'fanout-1')

        print(fanout_1(tree))
def main(limit=100000, ignore_punctuation=False):
    """Induce primary, secondary and ternary grammars, parse the test corpus
    with fallback across the three grammars, write CoNLL output and run
    eval.pl with and without punctuation.

    :param limit: maximal number of training trees to read
    :param ignore_punctuation: disconnect punctuation in all corpora
    """
    if PARSER_TYPE.__name__ != 'GFParser':
        print('GFParser not found, using', PARSER_TYPE.__name__, 'instead!')
        print('Please install grammatical framework to reproduce experiments.')

    test_limit = 10000
    # The corpus generator is exhausted by each induction, so it is
    # re-created before every call to induce_grammar.
    trees = parse_conll_corpus(TRAIN, False, limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)
    (n_trees, grammar_prim) = d_i.induce_grammar(trees, PRIMARY_LABELLING, TERMINAL_LABELLING.token_label,
                                                 RECURSIVE_PARTITIONING, START)
    PARSER_TYPE.preprocess_grammar(grammar_prim)

    trees = parse_conll_corpus(TRAIN, False, limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)
    (n_trees, grammar_second) = d_i.induce_grammar(trees, SECONDARY_LABELLING, TERMINAL_LABELLING.token_label,
                                                   RECURSIVE_PARTITIONING, START)
    PARSER_TYPE.preprocess_grammar(grammar_second)

    trees = parse_conll_corpus(TRAIN, False, limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)
    (n_trees, grammar_tern) = d_i.induce_grammar(trees, TERNARY_LABELLING, TERMINAL_LABELLING.token_label,
                                                 RECURSIVE_PARTITIONING, START)
    PARSER_TYPE.preprocess_grammar(grammar_tern)

    trees = parse_conll_corpus(TEST, False, test_limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)

    total_time = 0.0

    with open(RESULT, 'w') as result_file:
        failures = 0
        for tree in trees:
            # bug fix: time.clock() was removed in Python 3.8; perf_counter
            # provides monotonic wall-clock timing.
            time_stamp = time.perf_counter()

            the_parser = PARSER_TYPE(grammar_prim, TREE_YIELD(tree.token_yield()))
            # Fall back to the secondary/ternary grammars when parsing fails.
            if not the_parser.recognized():
                the_parser = PARSER_TYPE(grammar_second, TREE_YIELD(tree.token_yield()))
            if not the_parser.recognized():
                the_parser = PARSER_TYPE(grammar_tern, TREE_YIELD(tree.token_yield()))
            time_stamp = time.perf_counter() - time_stamp
            total_time += time_stamp

            cleaned_tokens = copy.deepcopy(tree.full_token_yield())
            for token in cleaned_tokens:
                token.set_edge_label('_')
            h_tree = HybridTree(tree.sent_label())
            h_tree = the_parser.dcp_hybrid_tree_best_derivation(h_tree, cleaned_tokens, ignore_punctuation,
                                                                construct_conll_token)

            if h_tree:
                result_file.write(tree_to_conll_str(h_tree))
                result_file.write('\n\n')
            else:
                # Left-branching fallback keeps output aligned with gold.
                failures += 1
                forms = [token.form() for token in tree.full_token_yield()]
                poss = [token.pos() for token in tree.full_token_yield()]
                result_file.write(tree_to_conll_str(fall_back_left_branching(forms, poss)))
                result_file.write('\n\n')

    print("parse failures", failures)
    print("parse time", total_time)

    print("eval.pl", "no punctuation")
    p = subprocess.Popen(["perl", "util/eval.pl", "-g", TEST, "-s", RESULT, "-q"])
    p.communicate()
    print("eval.pl", "punctuation")
    p = subprocess.Popen(
        ["perl", "util/eval.pl", "-g", TEST, "-s", RESULT, "-q", "-p"])
    p.communicate()
示例#18
0
def parse_sentences_from_file(grammar,
                              parser_type,
                              experiment,
                              connection,
                              path,
                              tree_yield,
                              max_length=sys.maxsize,
                              limit=sys.maxsize,
                              quiet=False,
                              ignore_punctuation=True,
                              root_default_deprel=None,
                              disconnected_default_deprel=None):
    """
    Parse sentences from a CoNLL corpus, compare the derived dependency
    structures with the gold standard, store the results in the experiment
    database, and print accuracy statistics (UAS/LAS/UEM/LEM).

    :rtype: None
    :type grammar: LCFRS
    :param parser_type: parser class providing preprocess_grammar and
        dcp_hybrid_tree_best_derivation
    :param experiment: experiment id in the experiment database
    :param connection: open database connection
    :param path: file path for test corpus (dependency grammar in CoNLL format)
    :type path: str
    :param tree_yield: parse on words or POS or ..
    :type tree_yield: GeneralHybridTree -> list[str]
    :param max_length: don't parse sentences with yield > max_length
    :type max_length: int
    :param limit:      only parse the limit first sentences of the corpus
    :type limit: int
    :param quiet:      if True, suppress status information
    :type quiet: bool
    :param ignore_punctuation: exclude punctuation from parsing
    :type ignore_punctuation: bool
    :param root_default_deprel: default dependency relation for root nodes
    :param disconnected_default_deprel: default relation for disconnected tokens
    """
    if not quiet:
        print("Building lookahead tables for grammar")
    # Grammar preprocessing is required for parsing and must run regardless
    # of verbosity; previously it was (incorrectly) nested under `not quiet`
    # and was skipped entirely in quiet mode.
    parser_type.preprocess_grammar(grammar)

    experiment_database.set_experiment_test_corpus(connection, experiment,
                                                   path)

    if not quiet:
        if max_length != sys.maxsize:
            s = ', ignoring sentences with length > ' + str(max_length)
        else:
            s = ''
        print('Start parsing sentences' + s)

    trees = parse_conll_corpus(path, False, limit)
    trees = add_trees_to_db(path, connection, trees)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)

    # Accumulated (summed) per-sentence scores; averaged at the end.
    (UAS, LAS, UEM, LEM) = (0, 0, 0, 0)
    parse = 0
    no_parse = 0
    n_gaps_gold = 0
    n_gaps_test = 0
    skipped = 0
    # NOTE(review): time.clock() was removed in Python 3.8; kept here for
    # consistency with the rest of this module -- consider time.perf_counter().
    start_at = time.clock()
    for tree in trees:
        if len(tree.id_yield()) > max_length:
            skipped += 1
            continue
        time_stamp = time.clock()

        parser = parser_type(grammar, tree_yield(tree.token_yield()))
        time_stamp = time.clock() - time_stamp

        # Strip gold edge labels so the parser's output is not informed by them.
        cleaned_tokens = copy.deepcopy(tree.full_token_yield())
        for token in cleaned_tokens:
            token.set_edge_label('_')
        h_tree = HybridTree(tree.sent_label())
        h_tree = parser.dcp_hybrid_tree_best_derivation(
            h_tree, cleaned_tokens, ignore_punctuation, construct_conll_token)

        if h_tree:
            experiment_database.add_result_tree(connection, h_tree,
                                                path, experiment, 1,
                                                parser.best(), time_stamp,
                                                'parse', root_default_deprel,
                                                disconnected_default_deprel)
            n_gaps_gold += tree.n_gaps()
            n_gaps_test += h_tree.n_gaps()
            parse += 1
            (dUAS, dLAS, dUEM, dLEM) = score_cmp_dep_trees(tree, h_tree)
            UAS += dUAS
            LAS += dLAS
            UEM += dUEM
            LEM += dLEM
        else:
            # Parse failure: record it, no fallback tree is stored.
            experiment_database.no_parse_result(connection, tree.sent_label(),
                                                path, experiment, time_stamp,
                                                "no_parse")
            no_parse += 1

    end_at = time.clock()
    total = parse + no_parse
    if not quiet:
        print('Parsed ' + str(parse) + ' out of ' + str(total) + ' (skipped ' +
              str(skipped) + ')')
        print('fail: ', no_parse)
        if parse > 0:
            print('UAS: ', UAS / parse)
            print('LAS: ', LAS / parse)
            print('UEM: ', UEM / parse)
            print('LEM: ', LEM / parse)
            print('n gaps (gold): ', n_gaps_gold * 1.0 / parse)
            print('n gaps (test): ', n_gaps_test * 1.0 / parse)
        print('parse time: ', end_at - start_at, 's')
        print()
示例#19
0
def parse_with_pgf(grammar, forms, poss, bin):
    """
    Parse a POS sequence with a compiled PGF grammar and convert the
    resulting constituent parse into a dependency tree.

    :type grammar: PGF
    :param forms: word forms of the sentence
    :param poss: POS tags of the sentence (used as parser input)
    :param bin: name prefix of the concrete grammar inside the PGF
        (NOTE: shadows the builtin `bin`; kept for interface compatibility)
    :return: the derived dependency tree, or None on parse failure
    :rtype: HybridTree
    """
    lcfrs = grammar.languages[bin + 'grammargfconcrete']

    # sentence = "ADJD ADV _COMMA_ KOUS ADV PIS PROAV VVINF VMFIN _PUNCT_"
    sentence = ' '.join(map(escape, poss))

    try:
        i = lcfrs.parse(sentence, n=1)
        p, e = next(i)
    except (StopIteration, pgf.ParseError):
        # No parse found (or GF rejected the input): signal failure to caller.
        return None

    # Render the best parse as a graphviz string and rebuild a HybridTree
    # from its node and edge lines.
    # print_ast(gr, e, 0)
    s = lcfrs.graphvizParseTree(e)
    assert isinstance(s, str)

    tree = HybridTree()

    # print s
    i = 0
    for line in s.splitlines():
        match = re.search(r'^\s*(n\d+)\[label="([^\s]+)"\]\s*$', line)
        if match:
            node_id = match.group(1)
            label = match.group(2)
            # Node ids >= 100000 denote terminal (ordered leaf) nodes in the
            # graphviz output -- presumably a GF numbering convention; confirm.
            order = int(node_id[1:]) >= 100000
            if order:
                assert escape(poss[i]) == label
                tree.add_node(
                    node_id,
                    construct_constituent_token(form=forms[i],
                                                pos=poss[i],
                                                terminal=True), True)
                i += 1
            else:
                tree.add_node(
                    node_id,
                    construct_constituent_token(form=label,
                                                pos='_',
                                                terminal=False), False)
            # print node_id, label
            if label == 'VROOT1':
                tree.add_to_root(node_id)
            continue
        match = re.search(r'^  (n\d+) -- (n\d+)\s*$', line)
        if match:
            parent = match.group(1)
            child = match.group(2)
            tree.add_child(parent, child)
            # print line
            # print parent, child
            continue

    # print tree

    assert poss == [token.pos() for token in tree.token_yield()]
    # print the_yield

    # Convert the constituent tree into a dependency tree: for each leaf,
    # walk up to the first "real" category (one not matching \d+X\d+, which
    # appears to mark binarization artifacts) and use it as the edge label.
    dep_tree = HybridTree()
    head_table = defaultdict(lambda: None)
    attachment_point = defaultdict(lambda: None)
    for i, node in enumerate(tree.id_yield()):
        token = tree.node_token(node)
        dep_token = construct_conll_token(token.form(), un_escape(token.pos()))
        current = tree.parent(node)
        current = tree.parent(current)
        while current:
            current_label = tree.node_token(current).category()
            if not re.search(r'\d+X\d+$', current_label):
                s = un_escape(current_label)
                if s == 'TOP1':
                    s = 'ROOT1'
                # Category names carry a trailing fanout digit; strip it.
                dep_token.set_edge_label(s[:-1])
                head_table[current] = i + 1
                attachment_point[node] = current
                break
            else:
                current = tree.parent(current)
        dep_tree.add_node(i + 1, dep_token, order=True)

    # print head_table

    # Attach each dependency node to the head of the nearest ancestor
    # constituent that has one; nodes without such an ancestor become roots.
    for node, dep_node in zip(tree.id_yield(), dep_tree.id_yield()):
        node = tree.parent(attachment_point[node])
        while node:
            if head_table[node]:
                dep_tree.add_child(head_table[node], dep_node)
                break
            node = tree.parent(node)
        if not node:
            dep_tree.add_to_root(dep_node)

    # print "dep_tree"
    # print dep_tree
    # print ' '.join(['(' + token.form() + '/' + token.deprel() + ')' for token in dep_tree.token_yield()])
    return dep_tree
示例#20
0
 def test_recursive_partition(self):
     """An empty HybridTree partitions to the empty result (set(), [])."""
     builder = PartitionBuilder(choice_function=choose_min,
                                split_function=spans_split)
     empty_tree = HybridTree()
     self.assertEqual(builder.string_partition(tree=empty_tree),
                      (set(), []))
示例#21
0
def build_score_validator(baseline_grammar, grammarInfo, nont_map,
                          storageManager, term_labelling, parser,
                          corpus_validation, validationMethod):
    """
    Build a candidate-score validator: parse every validation tree, score
    each of the parser's k-best derivations against the gold dependency
    structure (LAS/UAS/LAC), and register the scored candidates.

    :param baseline_grammar: grammar used to build derivation hypergraphs
    :param grammarInfo: grammar info object for the validator
    :param nont_map: nonterminal map for the derivation manager
    :param storageManager: storage manager for the validator
    :param term_labelling: terminal labelling (prepare_parser_input)
    :param parser: a k-best parser (set_input/parse/k_best_derivation_trees)
    :param corpus_validation: corpus providing get_trees()
    :param validationMethod: one of "LAS", "UAS", "LAC"
    :return: the populated PyCandidateScoreValidator
    """
    validator = PyCandidateScoreValidator(grammarInfo, storageManager,
                                          validationMethod)

    # parser = GFParser(baseline_grammar)
    tree_count = 0
    der_count = 0
    for gold_tree in corpus_validation.get_trees():
        tree_count += 1
        parser.set_input(
            term_labelling.prepare_parser_input(gold_tree.token_yield()))
        parser.parse()
        # First pass over the k-best list: register the derivations as
        # hypergraphs with the derivation manager.
        derivations = map(lambda x: x[1], parser.k_best_derivation_trees())
        manager = PyDerivationManager(baseline_grammar, nont_map)
        manager.convert_derivations_to_hypergraphs(derivations)
        scores = []

        # Gold head position (1-based; 0 = root) and dependency label
        # indexed by token position.
        gold_labels = {}
        gold_heads = {}

        # `node` (was `id`, which shadows the builtin) is a tree node id.
        for position, node in enumerate(gold_tree.id_yield()):
            parent_id = gold_tree.parent(node)
            gold_labels[position] = gold_tree.node_token(node).deprel()
            if parent_id is None:
                assert node in gold_tree.root
                gold_heads[position] = 0
            else:
                gold_heads[position] = gold_tree.id_yield().index(
                    parent_id) + 1

        # Second pass: materialize each derivation as a hybrid tree and
        # score it against the gold heads/labels.
        derivations = parser.k_best_derivation_trees()
        for _, der in derivations:
            der_count += 1
            h_tree = HybridTree()
            cleaned_tokens = copy.deepcopy(gold_tree.full_token_yield())
            dcp = DCP_evaluator(der).getEvaluation()
            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                              construct_conll_token)

            las, uas, lac = 0, 0, 0
            for position, node in enumerate(h_tree.id_yield()):
                parent_id = h_tree.parent(node)
                if parent_id is None:
                    assert node in h_tree.root
                    head = 0
                else:
                    head = h_tree.id_yield().index(parent_id) + 1
                label = h_tree.node_token(node).deprel()

                if gold_heads[position] == head:
                    uas += 1
                if gold_labels[position] == label:
                    lac += 1
                if gold_heads[position] == head and gold_labels[
                        position] == label:
                    las += 1

            if validationMethod == "LAS":
                scores.append(las)
            elif validationMethod == "UAS":
                scores.append(uas)
            elif validationMethod == "LAC":
                scores.append(lac)

        # Maximum achievable score = sentence length (one point per token).
        max_score = len(gold_tree.id_yield())
        validator.add_scored_candidates(manager, scores, max_score)
        print(tree_count, max_score, scores)
        parser.clear()

    # Guard against an empty validation corpus (previously ZeroDivisionError).
    if tree_count > 0:
        print("trees used for validation ", tree_count, "with",
              der_count * 1.0 / tree_count, "derivations on average")
    else:
        print("trees used for validation ", 0)

    return validator
示例#22
0
def do_parsing(grammar,
               test_corpus,
               term_labelling,
               result,
               grammar_identifier,
               parser_type,
               k_best,
               minimum_risk=False,
               oracle_parse=False,
               recompile=True,
               reparse=False,
               dir=None,
               opt=None):
    """
    Parse a test corpus with the given grammar, write best parses (and
    optionally minimum-risk / oracle parses) to CoNLL result files, and
    evaluate all result files with eval.pl.

    :param grammar: grammar used for parsing
    :param test_corpus: corpus object providing get_trees() and _path
    :param term_labelling: terminal labelling; its prepare_parser_input maps
        a token yield to parser input
    :param result: callable mapping (grammar_identifier[, suffix]) to an
        output file path
    :param grammar_identifier: identifier used for result and preprocess paths
    :param parser_type: parser class (GFParser, GFParser_k_best,
        Coarse_to_fine_parser, CFGParser, Left-/RightBranchingFSTParser)
    :param k_best: number of derivations for k-best parsing
    :param minimum_risk: also compute minimum-risk trees (k-best parsers only)
    :param oracle_parse: also compute oracle trees (k-best parsers only)
    :param recompile: force recompilation of the preprocessed (GF) grammar
    :param reparse: force reparsing even when result files already exist
    :param dir: base directory for preprocessed grammars
        (NOTE: shadows the builtin `dir`)
    :param opt: options dict for Coarse_to_fine_parser
        (keys: latentAnnotation, grammarInfo, nontMap)
    :return: the constructed parser
    """
    tree_yield = term_labelling.prepare_parser_input

    result_path = result(grammar_identifier)
    minimum_risk_path = result(grammar_identifier, 'min_risk')
    oracle_parse_path = result(grammar_identifier, 'oracle_file')

    total_time = 0.0

    # GF-based parsers can persist their compiled grammar; reuse it unless
    # recompilation is requested or the compiled file is missing.
    preprocess_path = [os.path.join(dir, grammar_identifier), "gf_grammar"]
    # print(preprocess_path)
    load_preprocess = preprocess_path
    if parser_type not in [GFParser, GFParser_k_best, Coarse_to_fine_parser] \
            or recompile \
            or (not os.path.isfile(GFParser.resolve_path(preprocess_path))):
        load_preprocess = None
    if parser_type in [GFParser, GFParser_k_best, Coarse_to_fine_parser] \
            and not os.path.isdir(os.path.join(dir, grammar_identifier)):
        os.makedirs(os.path.join(dir, grammar_identifier))

    # Instantiate the parser according to its type.
    if parser_type == GFParser_k_best:
        parser = GFParser_k_best(grammar,
                                 save_preprocessing=preprocess_path,
                                 load_preprocessing=load_preprocess,
                                 k=k_best)
    elif parser_type == Coarse_to_fine_parser:
        parser = Coarse_to_fine_parser(grammar,
                                       base_parser_type=GFParser_k_best,
                                       la=opt["latentAnnotation"],
                                       grammarInfo=opt["grammarInfo"],
                                       nontMap=opt["nontMap"],
                                       save_preprocessing=preprocess_path,
                                       load_preprocessing=load_preprocess,
                                       k=k_best)
    else:
        parser = parser_type(grammar,
                             save_preprocess=preprocess_path,
                             load_preprocess=load_preprocess)

    # Only parse if forced or if any requested result file is missing.
    if recompile or reparse or \
            not os.path.isfile(result_path) \
            or (minimum_risk and not os.path.isfile(minimum_risk_path)) \
            or (oracle_parse and not os.path.isfile(oracle_parse_path)):

        result_dirs = map(lambda path: os.path.split(path)[0],
                          [result_path, minimum_risk_path, oracle_parse_path])
        for result_dir in result_dirs:
            if not os.path.isdir(result_dir):
                os.makedirs(result_dir)

        with open(result_path, 'w') as result_file, \
                open(minimum_risk_path, 'w') as minimum_risk_file, \
                open(oracle_parse_path, 'w') as oracle_parse_file:
            failures = 0
            for tree in test_corpus.get_trees():
                # NOTE(review): time.clock() was removed in Python 3.8;
                # consider time.perf_counter().
                time_stamp = time.clock()

                parser.set_input(tree_yield(tree.token_yield()))
                parser.parse()
                # if not parser.recognized():
                #     parser = parser_type(grammar_second, tree_yield(tree.token_yield()))
                # if not parser.recognized():
                #     parser = parser_type(grammar_tern, tree_yield(tree.token_yield()))
                time_stamp = time.clock() - time_stamp
                total_time += time_stamp

                # Strip gold edge labels before handing tokens to the parser.
                cleaned_tokens = copy.deepcopy(tree.full_token_yield())
                for token in cleaned_tokens:
                    token.set_edge_label('_')

                h_tree = HybridTree(tree.sent_label())

                if parser_type in [GFParser_k_best, Coarse_to_fine_parser
                                   ] and parser.recognized():
                    if minimum_risk or oracle_parse:
                        # Materialize all k-best trees with their weights.
                        h_trees = []
                        weights = []

                        for weight, der in parser.k_best_derivation_trees():

                            dcp = DCP_evaluator(der).getEvaluation()
                            h_tree = HybridTree()
                            cleaned_tokens = copy.deepcopy(
                                tree.full_token_yield())
                            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens,
                                              False, construct_conll_token)

                            h_trees.append(h_tree)
                            weights.append(weight)

                        if minimum_risk:
                            h_tree_min_risk = compute_minimum_risk_tree(
                                h_trees, weights)
                        if oracle_parse:
                            h_tree_oracle = compute_oracle_tree(h_trees, tree)

                    der_to_tree = lambda der: dcp_to_hybridtree(
                        HybridTree(),
                        DCP_evaluator(der).getEvaluation(),
                        copy.deepcopy(tree.full_token_yield()), False,
                        construct_conll_token)
                    # h_tree = parser.best_trees(der_to_tree)[0][0]
                    h_tree = HybridTree(tree.sent_label())
                    # NOTE(review): ignore_punctuation is not defined in this
                    # function -- presumably a module-level global; confirm.
                    h_tree = parser.dcp_hybrid_tree_best_derivation(
                        h_tree, copy.deepcopy(tree.full_token_yield()),
                        ignore_punctuation, construct_conll_token)
                elif parser_type == CFGParser \
                        or parser_type == GFParser \
                        or parser_type == LeftBranchingFSTParser \
                        or parser_type == RightBranchingFSTParser:
                    h_tree = parser.dcp_hybrid_tree_best_derivation(
                        h_tree, cleaned_tokens, ignore_punctuation,
                        construct_conll_token)
                else:
                    h_tree = None

                if h_tree:
                    result_file.write(tree_to_conll_str(h_tree))
                    result_file.write('\n\n')
                    if minimum_risk and parser_type in [
                            GFParser_k_best, Coarse_to_fine_parser
                    ]:
                        minimum_risk_file.write(
                            tree_to_conll_str(h_tree_min_risk))
                        minimum_risk_file.write('\n\n')
                    if oracle_parse and parser_type in [
                            GFParser_k_best, Coarse_to_fine_parser
                    ]:
                        oracle_parse_file.write(
                            tree_to_conll_str(h_tree_oracle))
                        oracle_parse_file.write('\n\n')
                else:
                    # Parse failure: write a left-branching fallback tree to
                    # every requested output file.
                    failures += 1
                    forms = [token.form() for token in tree.full_token_yield()]
                    poss = [token.pos() for token in tree.full_token_yield()]
                    fall_back = tree_to_conll_str(
                        fall_back_left_branching(forms, poss))
                    files = [result_file]
                    if minimum_risk:
                        files.append(minimum_risk_file)
                    if oracle_parse:
                        files.append(oracle_parse_file)
                    for file in files:
                        file.write(fall_back)
                        file.write('\n\n')

                parser.clear()

        print("parse failures", failures)
        print("parse time", total_time)

    if parser_type == GFParser_k_best:
        print("best parse results")
    else:
        print("viterbi parse results")
    eval_pl_call(test_corpus._path, result_path)
    if oracle_parse:
        print("\noracle parse results")
        eval_pl_call(test_corpus._path, oracle_parse_path)
    if minimum_risk:
        print("\nminimum risk results")
        eval_pl_call(test_corpus._path, minimum_risk_path)

    return parser
def multi_const_tree():
    """Build a small constituent HybridTree with two root categories (E, F),
    each dominating two interleaved terminals (A..D)."""
    tree = HybridTree("multi")
    # Terminals in linear order: 1.1, 2.1, 1.2, 2.2.
    leaves = [('1.1', 'A', 'pA'), ('2.1', 'B', 'pB'),
              ('1.2', 'C', 'pC'), ('2.2', 'D', 'pD')]
    for ident, form, pos in leaves:
        tree.add_node(ident, ConstituentTerminal(form, pos), True, True)
    tree.add_node('1', ConstituentCategory('E'), False, True)
    tree.add_node('2', ConstituentCategory('F'), False, True)
    for parent in ['2', '1']:
        tree.add_to_root(parent)
        for suffix in ['1', '2']:
            tree.add_child(parent, parent + '.' + suffix)
    return tree
示例#24
0
    def test_best_trees(self):
        """
        Induce a grammar from the TIGER training corpus, parse sentences with
        a k-best parser, and compare the Viterbi derivation's tree against the
        tree list returned by best_trees().
        """
        limit_train = 5000
        limit_test = 100
        train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
        test = train
        parser_type = GFParser_k_best
        # test = '../../res/dependency_conll/german/tiger/test/german_tiger_test.conll'
        trees = parse_conll_corpus(train, False, limit_train)
        primary_labelling = the_labeling_factory(
        ).create_simple_labeling_strategy("child", "pos+deprel")
        term_labelling = the_terminal_labeling_factory().get_strategy('pos')
        start = 'START'
        recursive_partitioning = [cfg]

        (n_trees, grammar_prim) = induce_grammar(trees, primary_labelling,
                                                 term_labelling.token_label,
                                                 recursive_partitioning, start)

        parser_type.preprocess_grammar(grammar_prim)
        tree_yield = term_labelling.prepare_parser_input

        trees = parse_conll_corpus(test, False, limit_test)

        for i, tree in enumerate(trees):
            print("Parsing sentence ", i, file=stderr)

            parser = parser_type(grammar_prim,
                                 tree_yield(tree.token_yield()),
                                 k=200)

            self.assertTrue(parser.recognized())

            viterbi_weight = parser.viterbi_weight()
            viterbi_deriv = parser.viterbi_derivation()

            # Convert a derivation into a hybrid tree via its DCP evaluation.
            der_to_tree = lambda der: dcp_to_hybridtree(
                HybridTree(),
                DCP_evaluator(der).getEvaluation(),
                copy.deepcopy(tree.full_token_yield()), False,
                construct_conll_token)

            viterbi_tree = der_to_tree(viterbi_deriv)

            ordered_parse_trees = parser.best_trees(der_to_tree)

            best_tree, best_weight, best_witnesses = ordered_parse_trees[0]

            # Report the rank of the gold tree in the k-best list, if present.
            # (Renamed from `i`, which shadowed the outer sentence index.)
            for rank, (parsed_tree, _, _) in enumerate(ordered_parse_trees):
                if parsed_tree.__eq__(tree):
                    print("Gold tree is ",
                          rank + 1,
                          " in best tree list",
                          file=stderr)
                    break

            if (not viterbi_tree.__eq__(best_tree)
                    and viterbi_weight != best_weight):
                print("viterbi and k-best tree differ", file=stderr)
                print("viterbi: ", viterbi_weight, file=stderr)
                print("k-best: ", best_weight, best_witnesses, file=stderr)
                if False:
                    print(viterbi_tree, file=stderr)
                    print(tree_to_conll_str(viterbi_tree), file=stderr)
                    print(best_tree, file=stderr)
                    print(tree_to_conll_str(best_tree), file=stderr)
                    print("gold tree", file=stderr)
                    print(tree, file=stderr)
                    print(tree_to_conll_str(tree), file=stderr)
def multi_dep_tree():
    """Build a dependency HybridTree with two roots ('1', '2') over the
    tokens A..E and reorder it into linear order."""
    tree = HybridTree('multi')
    # (node id, form, POS, dependency relation) in insertion order.
    token_specs = [('1', 'A', 'pA', 'dA'),
                   ('211', 'B', 'pB', 'dB'),
                   ('11', 'C', 'pC', 'dC'),
                   ('2', 'D', 'pD', 'dD'),
                   ('21', 'E', 'pE', 'dE')]
    for ident, form, pos, rel in token_specs:
        tree.add_node(ident, CoNLLToken(form, '_', pos, pos, '_', rel), True)
    tree.add_to_root('2')
    tree.add_to_root('1')
    tree.add_child('2', '21')
    tree.add_child('2', '211')
    tree.add_child('1', '11')
    tree.reorder()
    return tree
示例#26
0
    def test_k_best_parsing(self):
        """
        Induce a grammar from a small TIGER sample and check that k-best
        parsing yields pairwise distinct derivations; print a matrix showing
        which derivations map to the same hybrid tree.
        """
        limit_train = 20
        limit_test = 10
        train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
        test = train
        parser_type = GFParser_k_best
        # test = '../../res/dependency_conll/german/tiger/test/german_tiger_test.conll'
        trees = parse_conll_corpus(train, False, limit_train)
        primary_labelling = the_labeling_factory(
        ).create_simple_labeling_strategy("childtop", "deprel")
        term_labelling = the_terminal_labeling_factory().get_strategy('pos')
        start = 'START'
        recursive_partitioning = [cfg]

        (n_trees, grammar_prim) = induce_grammar(trees, primary_labelling,
                                                 term_labelling.token_label,
                                                 recursive_partitioning, start)

        parser_type.preprocess_grammar(grammar_prim)
        tree_yield = term_labelling.prepare_parser_input

        trees = parse_conll_corpus(test, False, limit_test)

        for i, tree in enumerate(trees):
            print("Parsing sentence ", i, file=stderr)

            # print >>stderr, tree

            parser = parser_type(grammar_prim,
                                 tree_yield(tree.token_yield()),
                                 k=50)

            self.assertTrue(parser.recognized())

            derivations = [der for der in parser.k_best_derivation_trees()]
            print("# derivations: ", len(derivations), file=stderr)
            h_trees = []
            current_weight = 0
            weights = []
            derivation_list = []
            for weight, der in derivations:
                # print >>stderr, exp(-weight)
                # print >>stderr, der

                # Each derivation in the k-best list must be distinct.
                self.assertNotIn(der, derivation_list)

                derivation_list.append(der)

                # TODO this should hold, but it looks like a GF bug!
                # self.assertGreaterEqual(weight, current_weight)
                current_weight = weight

                dcp = DCP_evaluator(der).getEvaluation()
                h_tree = HybridTree()
                cleaned_tokens = copy.deepcopy(tree.full_token_yield())
                dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                                  construct_conll_token)

                h_trees.append(h_tree)
                weights.append(weight)

                # print >>stderr, h_tree

            # print a matrix indicating which derivations result
            # in the same hybrid tree
            # (inner index renamed from `i`, which shadowed the sentence index)
            if True:
                for idx, h_tree1 in enumerate(h_trees):
                    for h_tree2 in h_trees:
                        if h_tree1 == h_tree2:
                            print("x", end=' ', file=stderr)
                        else:
                            print("", end=' ', file=stderr)
                    print(weights[idx], file=stderr)
                print(file=stderr)
示例#27
0
def derivation_to_hybrid_tree(der,
                              poss,
                              ordered_labels,
                              construct_token,
                              disconnected=None):
    """
    Turn a derivation tree into a hybrid tree.

    Assumes poss and ordered_labels have equal length.

    :param der:
    :type der: LCFRSDerivation
    :param poss: list of POS-tags
    :type poss: list[str]
    :param ordered_labels: list of words
    :type ordered_labels: list[str]
    :param construct_token: factory (label, pos, is_terminal) -> token
    :param disconnected: list of positions in ordered_labels that are disconnected
    :type disconnected: list[object]
    :rtype: GeneralHybridTree
    """
    if not disconnected:
        disconnected = []
    tree = HybridTree()
    # Add one leaf per input position; connected leaves get ids c1, c2, ...
    # matching the derivation's terminal positions, disconnected ones d<i>.
    j = 1
    for i in range(len(ordered_labels)):
        token = construct_token(ordered_labels[i], poss[i], True)
        if i in disconnected:
            tree.add_node("d" + str(i), token, True, False)
        else:
            tree.add_node("c" + str(j), token, True, True)
            j += 1
    # Mirror the derivation's internal structure as non-terminal nodes.
    # (Loop variable renamed from `id`, which shadows the builtin.)
    for node_id in der.ids():
        token = construct_token(der.getRule(node_id).lhs().nont(), '_', False)
        tree.add_node(node_id, token)
        for child in der.child_ids(node_id):
            tree.add_child(node_id, child)
        for position in der.terminal_positions(node_id):
            tree.add_child(node_id, "c" + str(position))
    tree.add_to_root(der.root_id())
    tree.reorder()
    return tree
示例#28
0
    def test_single_root_induction(self):
        """
        Induce an LCFRS from two small hybrid trees with an 'empty'/'pos'
        labelling strategy, parse a POS sequence with it, reconstruct a
        hybrid tree from the best derivation, and exercise DCP_string edge
        labels and grammar linearization.
        """
        tree = hybrid_tree_1()
        # print tree.children("v")
        # print tree
        #
        # for id_set in ['v v1 v2 v21'.split(' '), 'v1 v2'.split(' '),
        # 'v v21'.split(' '), ['v'], ['v1'], ['v2'], ['v21']]:
        # print id_set, 'top:', top(tree, id_set), 'bottom:', bottom(tree, id_set)
        # print id_set, 'top_max:', max(tree, top(tree, id_set)), 'bottom_max:', max(tree, bottom(tree, id_set))
        #
        # print "some rule"
        # for mem, arg in [(-1, 0), (0,0), (1,0)]:
        # print create_DCP_rule(mem, arg, top_max(tree, ['v','v1','v2','v21']), bottom_max(tree, ['v','v1','v2','v21']),
        # [(top_max(tree, l), bottom_max(tree, l)) for l in [['v1', 'v2'], ['v', 'v21']]])
        #
        #
        # print "some other rule"
        # for mem, arg in [(-1,1),(1,0)]:
        # print create_DCP_rule(mem, arg, top_max(tree, ['v1','v2']), bottom_max(tree, ['v1','v2']),
        # [(top_max(tree, l), bottom_max(tree, l)) for l in [['v1'], ['v2']]])
        #
        # print 'strict:' , strict_labeling(tree, top_max(tree, ['v','v21']), bottom_max(tree, ['v','v21']))
        # print 'child:' , child_labeling(tree, top_max(tree, ['v','v21']), bottom_max(tree, ['v','v21']))
        # print '---'
        # print 'strict: ', strict_labeling(tree, top_max(tree, ['v1','v21']), bottom_max(tree, ['v1','v21']))
        # print 'child: ', child_labeling(tree, top_max(tree, ['v1','v21']), bottom_max(tree, ['v1','v21']))
        # print '---'
        # print 'strict:' , strict_labeling(tree, top_max(tree, ['v','v1', 'v21']), bottom_max(tree, ['v','v1', 'v21']))
        # print 'child:' , child_labeling(tree, top_max(tree, ['v','v1', 'v21']), bottom_max(tree, ['v','v1', 'v21']))

        tree2 = hybrid_tree_2()

        # print tree2.children("v")
        # print tree2
        #
        # print 'siblings v211', tree2.siblings('v211')
        # print top(tree2, ['v','v1', 'v211'])
        # print top_max(tree2, ['v','v1', 'v211'])
        #
        # print '---'
        # print 'strict:' , strict_labeling(tree2, top_max(tree2, ['v','v1', 'v211']), bottom_max(tree2, ['v','v11', 'v211']))
        # print 'child:' , child_labeling(tree2, top_max(tree2, ['v','v1', 'v211']), bottom_max(tree2, ['v','v11', 'v211']))

        # rec_par = ('v v1 v2 v21'.split(' '),
        # [('v1 v2'.split(' '), [(['v1'],[]), (['v2'],[])])
        #                ,('v v21'.split(' '), [(['v'],[]), (['v21'],[])])
        #            ])
        #
        # grammar = LCFRS(nonterminal_str(tree, top_max(tree, rec_par[0]), bottom_max(tree, rec_par[0]), 'strict'))
        #
        # add_rules_to_grammar_rec(tree, rec_par, grammar, 'child')
        #
        # grammar.make_proper()
        # print grammar

        print(tree.recursive_partitioning())

        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

        # Induce a grammar from both trees using direct extraction.
        (_, grammar) = induce_grammar(
            [tree, tree2],
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos'),
            # the_labeling_factory().create_simple_labeling_strategy('child', 'pos+deprel'),
            terminal_labeling.token_label,
            [direct_extraction],
            'START')
        # Maximal fanout over all nonterminals of the induced grammar.
        print(max([grammar.fanout(nont) for nont in grammar.nonts()]))
        print(grammar)

        parser = LCFRS_parser(grammar, 'NP N V V'.split(' '))
        print(parser.best_derivation_tree())

        # Rebuild a hybrid tree from the best derivation for the POS input.
        tokens = [
            construct_conll_token(form, pos) for form, pos in zip(
                'Piet Marie helpen lezen'.split(' '), 'NP N V V'.split(' '))
        ]
        hybrid_tree = HybridTree()
        hybrid_tree = parser.dcp_hybrid_tree_best_derivation(
            hybrid_tree, tokens, True, construct_conll_token)
        print(list(map(str, hybrid_tree.full_token_yield())))
        print(hybrid_tree)

        # Smoke-test DCP_string edge labels.
        string = "foo"
        dcp_string = DCP_string(string)
        dcp_string.set_edge_label("bar")
        print(dcp_string, dcp_string.edge_label())

        # Smoke-test grammar linearization to stdout.
        linearize(
            grammar,
            the_labeling_factory().create_simple_labeling_strategy(
                'child', 'pos+deprel'),
            the_terminal_labeling_factory().get_strategy('pos'), sys.stdout)
示例#29
0
    def test_fst_compilation_left(self):
        """
        Compile a left-branching LCFRS into a WFST (pynini), compose it with
        an input FSA, recover the rule sequence from the shortest path, and
        check the resulting derivation against the LeftBranchingFSTParser.

        Skipped silently when pynini is unavailable (test_pynini is False).
        """
        if not test_pynini:
            return
        tree = hybrid_tree_1()
        tree2 = hybrid_tree_2()
        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

        (_, grammar) = induce_grammar(
            [tree, tree2],
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos'), terminal_labeling.token_label,
            [left_branching], 'START')

        fst, rules = compile_wfst_from_left_branching_grammar(grammar)

        print(repr(fst))

        symboltable = fst.input_symbols()

        string = ["NP", "N", "V", "V", "V"]

        # The FSA must be a straight-line automaton over the input symbols.
        fsa = fsa_from_list_of_symbols(string, symboltable)
        self.assertEqual(
            fsa.text().decode('utf-8'),
            '0\t1\tNP\tNP\n1\t2\tN\tN\n2\t3\tV\tV\n3\t4\tV\tV\n4\t5\tV\tV\n5\n'
        )

        b = compose(fsa, fst)

        print(b.text(symboltable, symboltable))

        print("Shortest path probability", end=' ')
        best = shortestpath(b)
        best.topsort()
        # self.assertAlmostEquals(pow(e, -float(shortestdistance(best)[-1])), 1.80844898756e-05)
        print(best.text())

        # The shortest path encodes the rule sequence in reverse Polish order.
        polish_rules = retrieve_rules(best)
        self.assertSequenceEqual(polish_rules, [1, 2, 3, 4, 5, 4, 9, 4, 7, 8])

        polish_rules = list(map(rules.index_object, polish_rules))

        for rule in polish_rules:
            print(rule)
        print()

        # Last element is dropped before building the derivation -- presumably
        # a final/accepting marker; confirm against retrieve_rules.
        der = ReversePolishDerivation(polish_rules[0:-1])
        self.assertTrue(der.check_integrity_recursive(der.root_id()))

        print(der)

        # Cross-check: the dedicated FST parser should produce a structurally
        # sound derivation for the same input.
        LeftBranchingFSTParser.preprocess_grammar(grammar)
        parser = LeftBranchingFSTParser(grammar, string)
        der_ = parser.best_derivation_tree()

        print(der_)
        self.assertTrue(der_.check_integrity_recursive(der_.root_id()))

        print(
            derivation_to_hybrid_tree(der, string,
                                      "Piet Marie helpen lezen leren".split(),
                                      construct_conll_token))

        print(
            derivation_to_hybrid_tree(der_, string,
                                      "Piet Marie helpen lezen leren".split(),
                                      construct_conll_token))

        dcp = DCP_evaluator(der).getEvaluation()

        # Evaluate the DCP of the WFST-derived derivation into a hybrid tree.
        h_tree_2 = HybridTree()
        token_sequence = [
            construct_conll_token(form, lemma)
            for form, lemma in zip('Piet Marie helpen lezen leren'.split(' '),
                                   'NP N V V V'.split(' '))
        ]
        dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                          construct_conll_token)

        print(h_tree_2)
def trainAndEval(strategy,
                 labelling1,
                 labelling2,
                 fanout,
                 parser_type,
                 train,
                 test,
                 cDT,
                 parseStrings,
                 ignore_punctuation=False):
    """Induce an LCFRS from the *train* corpus and append statistics to
    'results.txt'.

    Optionally counts derivation trees per training hypergraph (``cDT``)
    and parses the *test* corpus, scoring the result with eval.pl
    (``parseStrings``).

    :param strategy: recursive-partitioning suffix ('' = right-to-left;
        otherwise e.g. '-left', '-random-<seed>', '-no-new-nont-...-<fb>')
    :param labelling1: first argument of the simple labeling strategy
    :param labelling2: second argument of the simple labeling strategy
    :param fanout: maximal fanout of the induced grammar (int or str)
    :param parser_type: parser class used when ``parseStrings`` is True
    :param train: path to the CoNLL training corpus
    :param test: path to the CoNLL test corpus
    :param cDT: if True, count derivation trees for each hypergraph
    :param parseStrings: if True, parse *test* and compute attachment scores
    :param ignore_punctuation: if True, disconnect punctuation in the trees
    """
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    recursive_partitioning = d_i.the_recursive_partitioning_factory(
    ).get_partitioning('fanout-' + str(fanout) + strategy)
    primary_labelling = d_l.the_labeling_factory(
    ).create_simple_labeling_strategy(labelling1, labelling2)

    trees = parse_conll_corpus(train, False, train_limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)
    (n_trees, grammar) = d_i.induce_grammar(trees, primary_labelling,
                                            term_labelling.token_label,
                                            recursive_partitioning, start)

    # 'with' guarantees the results file is closed even when counting,
    # parsing, or evaluation raises (the original leaked the handle).
    with open('results.txt', 'a') as results:
        # write current transformation strategy and hyperparameters
        results.write(
            _strategy_description(strategy, labelling1, labelling2, fanout))
        results.write('\n')

        results.write('#nonts:' + str(len(grammar.nonts())))
        results.write(' #rules:' + str(len(grammar.rules())))

        # count the number of derivations per hypergraph (tree parser required)
        if cDT:
            der_count, der_max = _derivation_statistics(
                grammar, term_labelling, train, ignore_punctuation)
            results.write("\n#derivation trees:  average: " +
                          str(1.0 * der_count / n_trees))
            results.write(" maximal: " + str(der_max))

        # string parsing of the test corpus plus attachment-score evaluation
        if parseStrings:
            results.write(
                _parse_and_score(parser_type, grammar, test,
                                 ignore_punctuation))

        results.write('\n\n\n')


def _strategy_description(strategy, labelling1, labelling2, fanout):
    """Return a one-line description of the transformation strategy and its
    hyperparameters for the results file.

    ``fanout`` is stringified here: the original concatenated it raw, which
    raised TypeError whenever an int was passed (the grammar induction above
    already uses str(fanout)).
    """
    fanout = str(fanout)
    if strategy == '':
        return ('rtl ' + labelling1 + ' ' + labelling2 +
                '    maximal fanout:' + fanout)
    parts = strategy.split('-')
    if parts[1] == 'left':
        return ('ltr ' + labelling1 + ' ' + labelling2 +
                '    maximal fanout:' + fanout)
    if parts[1] == 'random':
        return ('random seed:' + parts[2] + ' ' + labelling1 + ' ' +
                labelling2 + ' maximal fanout:' + fanout)
    if parts[1] == 'no':
        # "no new nonterminals" strategies carry their fallback at index 4
        if parts[4] == 'random':
            return ('nnont fallback:random seed:' + parts[5] + ' ' +
                    labelling1 + ' ' + labelling2 + ' maximal fanout:' +
                    fanout)
        if parts[4] == 'ltr':
            return ('nnont fallback:ltr' + ' ' + labelling1 + ' ' +
                    labelling2 + ' maximal fanout:' + fanout)
        if parts[4] == 'rtl':
            return ('nnont fallback:rtl' + ' ' + labelling1 + ' ' +
                    labelling2 + ' maximal fanout:' + fanout)
        return ('nnont fallback:argmax' + ' ' + labelling1 + ' ' +
                labelling2 + ' maximal fanout:' + fanout)
    # argmax
    return ('argmax ' + labelling1 + ' ' + labelling2 +
            ' maximal fanout:' + fanout)


def _derivation_statistics(grammar, term_labelling, train, ignore_punctuation):
    """Count derivation trees for every training hypergraph.

    :return: ``(total, maximum)`` number of derivation trees over the corpus.
    """
    tree_parser.preprocess_grammar(grammar, term_labelling)

    trees = parse_conll_corpus(train, False, train_limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)

    der_count = 0
    der_max = 0
    for tree in trees:
        parser = tree_parser(grammar, tree)  # if tree parser is used
        der = parser.count_derivation_trees()
        der_max = max(der_max, der)
        der_count += der
    return der_count, der_max


def _parse_and_score(parser_type, grammar, test, ignore_punctuation):
    """Parse the *test* corpus, write CoNLL output to the module-level
    ``result`` file, run eval.pl, and return a score-summary string."""
    parser_type.preprocess_grammar(grammar)

    trees = parse_conll_corpus(test, False, test_limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)

    total_time = 0.0
    with open(result, 'w') as result_file:
        failures = 0
        for tree in trees:
            # time.clock() was removed in Python 3.8; perf_counter()
            # measures the same elapsed interval.
            time_stamp = time.perf_counter()

            parser = parser_type(grammar, tree_yield(tree.token_yield()))

            total_time += time.perf_counter() - time_stamp

            cleaned_tokens = copy.deepcopy(tree.full_token_yield())
            for token in cleaned_tokens:
                token.set_edge_label('_')
            h_tree = HybridTree(tree.sent_label())
            h_tree = parser.dcp_hybrid_tree_best_derivation(
                h_tree, cleaned_tokens, ignore_punctuation,
                construct_conll_token)

            if h_tree:
                result_file.write(tree_to_conll_str(h_tree))
                result_file.write('\n\n')
            else:
                # fall back to a left-branching tree so that eval.pl still
                # sees an analysis for every sentence
                failures += 1
                result_file.write(
                    tree_to_conll_str(
                        fall_back_left_branching_token(cleaned_tokens)))
                result_file.write('\n\n')

    res = "\nattachment scores:\nno punctuation: "
    res += _attachment_scores(test, [])
    res += "\npunctation: "
    res += _attachment_scores(test, ["-p"])
    res += "\nparse time: " + str(total_time)
    return res


def _attachment_scores(test, extra_args):
    """Run the CoNLL eval.pl script and extract its attachment scores.

    :param extra_args: additional eval.pl flags (e.g. ``["-p"]`` to score
        punctuation as well)
    :return: string of the form ' labelled:<LAS> unlabelled:<UAS>'
    """
    out = subprocess.check_output(
        ["perl", "../util/eval.pl", "-g", test, "-s", result, "-q"] +
        extra_args)
    # check_output returns bytes on Python 3 — decode before regex matching
    # (the original passed bytes to re.search with a str pattern: TypeError).
    match = re.search(r'[^=]*= (\d+\.\d+)[^=]*= (\d+\.\d+).*',
                      out.decode('utf-8'))
    return (' labelled:' + match.group(1) +      # labeled attachment score
            ' unlabelled:' + match.group(2))     # unlabeled attachment score