Exemplo n.º 1
0
def test_upos_retag(pipeline):
    """
    Check that retag_trees with xpos=False replaces the placeholder tags

    Every preterminal in the input is tagged X.  After retagging with
    the given pipeline, the trees must equal the ones read from the
    expected text, in which each word carries a upos tag.
    """
    untagged_text = "((S (VP (X Find)) (NP (X Mox) (X Opal))))   ((S (NP (X Ragavan)) (VP (X steals) (NP (X important) (X cards)))))"
    gold_text = "((S (VP (VERB Find)) (NP (PROPN Mox) (PROPN Opal)))) ((S (NP (PROPN Ragavan)) (VP (VERB steals) (NP (ADJ important) (NOUN cards)))))"

    untagged_trees = tree_reader.read_trees(untagged_text)
    retagged = utils.retag_trees(untagged_trees, pipeline, xpos=False)

    gold_trees = tree_reader.read_trees(gold_text)
    assert retagged == gold_trees
Exemplo n.º 2
0
def test_xpos_retag(pipeline):
    """
    Check that retag_trees with xpos=True replaces the placeholder tags

    Every preterminal in the input is tagged X.  After retagging with
    the given pipeline, the trees must equal the ones read from the
    expected text, in which each word carries an xpos tag.
    """
    untagged_text = "((S (VP (X Find)) (NP (X Mox) (X Opal))))   ((S (NP (X Ragavan)) (VP (X steals) (NP (X important) (X cards)))))"
    gold_text = "((S (VP (VB Find)) (NP (NNP Mox) (NNP Opal)))) ((S (NP (NNP Ragavan)) (VP (VBZ steals) (NP (JJ important) (NNS cards)))))"

    untagged_trees = tree_reader.read_trees(untagged_text)
    retagged = utils.retag_trees(untagged_trees, pipeline, xpos=True)

    gold_trees = tree_reader.read_trees(gold_text)
    assert retagged == gold_trees
Exemplo n.º 3
0
def test_simplify_labels():
    """
    Check that simplify_labels removes the -FOO, -BAR, #ASDF and =1 suffixes

    The preterminal whose tag and word are both "-" must come through unchanged.
    """
    decorated = "( (SBARQ-FOO (WHNP-BAR (WP Who)) (SQ#ASDF (VP=1 (VBZ sits) (PP (IN in) (NP (DT this) (- -))))) (. ?)))"
    simplified = "(ROOT (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (- -))))) (. ?)))"

    results = [tree.simplify_labels() for tree in tree_reader.read_trees(decorated)]
    assert len(results) == 1
    assert str(results[0]) == simplified
Exemplo n.º 4
0
def test_newlines():
    """
    Two trees separated by newlines should still be read as two trees
    """
    parsed = tree_reader.read_trees("(VB Unban)\n\n(NNP Opal)")
    assert len(parsed) == 2
Exemplo n.º 5
0
def test_yield_preterminals():
    """
    Check that preterminals() returns the three tag/word nodes of the sample tree
    """
    first_tree = tree_reader.read_trees("((S (VP (VB Unban)) (NP (NNP Mox) (NNP Opal))))")[0]

    nodes = first_tree.preterminals()
    assert len(nodes) == 3
    assert str(nodes) == "[(VB Unban), (NNP Mox), (NNP Opal)]"
Exemplo n.º 6
0
def check_reproduce_tree(transition_scheme):
    """
    Build the transition sequence for a sample tree and replay it

    Each transition produced for the given scheme is checked for
    legality and applied, starting from the initial state built from
    the gold tree.  The final state's bookkeeping is then verified,
    along with the rebuilt tree being equal to the original.
    """
    sample = "((SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))"
    trees = tree_reader.read_trees(sample)
    gold_tree = trees[0]

    model = SimpleModel(transition_scheme)
    sequence = transition_sequence.build_sequence(gold_tree, transition_scheme)
    initial_states = parse_transitions.initial_state_from_gold_trees(trees, model)
    assert len(initial_states) == 1
    current = initial_states[0]
    assert current.num_transitions() == 0

    for transition in sequence:
        assert transition.is_legal(current, model)
        current = transition.apply(current, model)

    # two constituents: the final tree and the sentinel at the end
    assert len(current.constituents) == 2
    # all of the words should have moved from the buffer onto the tree,
    # leaving one spot in the queue for the sentinel value
    assert len(current.word_queue) == 7
    assert current.sentence_length == 6
    assert current.word_position == current.sentence_length
    assert len(current.transitions) == len(sequence) + 1

    rebuilt = current.constituents.value
    assert rebuilt == gold_tree
def test_build_tree():
    """
    Convert each sample tree to a protobuf tree and check the result

    Two trees are read from the text.  Each one is passed to
    java_protobuf_requests.build_tree along with the value 1.0
    (presumably a score - confirm against build_tree), and the
    returned proto tree is verified with check_tree.
    """
    text="((S (VP (VB Unban)) (NP (NNP Mox) (NNP Opal))))\n( (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))"
    trees = tree_reader.read_trees(text)
    assert len(trees) == 2

    for tree in trees:
        # use the loop variable so that both trees are exercised,
        # rather than checking the first tree twice
        proto_tree = java_protobuf_requests.build_tree(tree, 1.0)
        check_tree(proto_tree, tree, 1.0)
Exemplo n.º 8
0
def build_one_tree_treebank():
    """
    Build a treebank with a single entry

    Returns a list with one (gold, predictions) pair, where predictions
    is a list holding the gold tree itself paired with 1.0
    """
    trees = tree_reader.read_trees("((S (VP (VB Unban)) (NP (NNP Mox) (NNP Opal))))")
    assert len(trees) == 1
    gold = trees[0]
    return [(gold, [(gold, 1.0)])]
Exemplo n.º 9
0
def test_remap_constituent_words():
    """
    Check that remap_words swaps each mapped word and leaves the others alone

    The "?" is not in the map and is expected to stay as is.
    """
    original = "(ROOT (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in)))) (. ?)))"
    remapped = "(ROOT (SBARQ (WHNP (WP unban)) (SQ (VP (VBZ mox) (PP (IN opal)))) (. ?)))"
    replacements = {"Who": "unban", "sits": "mox", "in": "opal"}

    results = [tree.remap_words(replacements) for tree in tree_reader.read_trees(original)]
    assert len(results) == 1
    assert str(results[0]) == remapped
Exemplo n.º 10
0
def test_root_labels():
    """
    Check Tree.get_root_labels on one tree, on a repeated tree, and on two one node trees
    """
    single = "( (SBARQ-FOO (WHNP-BAR (WP Who)) (SQ#ASDF (VP=1 (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))"
    single_trees = tree_reader.read_trees(single)
    assert ["ROOT"] == Tree.get_root_labels(single_trees)

    # the same tree three times in a row, with nothing in between
    repeated_trees = tree_reader.read_trees(single * 3)
    assert ["ROOT"] == Tree.get_root_labels(repeated_trees)

    # two one node trees: the expected labels list BAR before FOO,
    # the reverse of their order in the text
    leaf_trees = tree_reader.read_trees("(FOO) (BAR)")
    assert ["BAR", "FOO"] == Tree.get_root_labels(leaf_trees)
Exemplo n.º 11
0
def test_remap_constituent_labels():
    """
    Check that remap_constituent_labels renames SBARQ to FOO and nothing else
    """
    original = "(ROOT (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in)))) (. ?)))"
    relabeled = "(ROOT (FOO (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in)))) (. ?)))"
    renames = {"SBARQ": "FOO"}

    results = [tree.remap_constituent_labels(renames) for tree in tree_reader.read_trees(original)]
    assert len(results) == 1
    assert str(results[0]) == relabeled
Exemplo n.º 12
0
def test_compound_constituents():
    """
    Check Tree.get_compound_constituents on a skinny tree, a fuller tree, and both together
    """
    # TODO: add skinny trees like this to the various transition tests
    skinny = "((VP (VB Unban)))"
    skinny_expected = [('ROOT', 'VP')]
    assert Tree.get_compound_constituents(tree_reader.read_trees(skinny)) == skinny_expected

    fuller = "(ROOT (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in)))) (. ?)))"
    fuller_expected = [
        ('PP',),
        ('ROOT', 'SBARQ'),
        ('SQ', 'VP'),
        ('WHNP',),
    ]
    assert Tree.get_compound_constituents(tree_reader.read_trees(fuller)) == fuller_expected

    # both trees in one text, three spaces apart
    combined = "   ".join([skinny, fuller])
    combined_expected = [
        ('PP',),
        ('ROOT', 'SBARQ'),
        ('ROOT', 'VP'),
        ('SQ', 'VP'),
        ('WHNP',),
    ]
    assert Tree.get_compound_constituents(tree_reader.read_trees(combined)) == combined_expected
Exemplo n.º 13
0
def test_equals():
    """
    Check == on one tree taken from the actual dataset

    This did not work when built with compound Open because of a silly bug.
    The tree is compared with itself and with a separately read copy.
    """
    sentence = "(ROOT (S (NP (DT The) (NNP Arizona) (NNPS Corporations) (NNP Commission)) (VP (VBD authorized) (NP (NP (DT an) (ADJP (CD 11.5)) (NN %) (NN rate) (NN increase)) (PP (IN at) (NP (NNP Tucson) (NNP Electric) (NNP Power) (NNP Co.))) (, ,) (UCP (ADJP (ADJP (RB substantially) (JJR lower)) (SBAR (IN than) (S (VP (VBN recommended) (NP (JJ last) (NN month)) (PP (IN by) (NP (DT a) (NN commission) (NN hearing) (NN officer))))))) (CC and) (NP (NP (QP (RB barely) (PDT half)) (DT the) (NN rise)) (VP (VBN sought) (PP (IN by) (NP (DT the) (NN utility)))))))) (. .)))"

    first_read = tree_reader.read_trees(sentence)
    assert len(first_read) == 1
    tree = first_read[0]

    # a tree must be equal to itself
    assert tree == tree

    # a second read gives a distinct object which must still compare equal
    tree2 = tree_reader.read_trees(sentence)[0]
    assert tree is not tree2
    assert tree == tree2
Exemplo n.º 14
0
def test_replace_words():
    """
    Check that replace_words substitutes a new list of words, one per leaf
    """
    original = "(ROOT (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in)))) (. ?)))"
    replaced = "(ROOT (SBARQ (WHNP (WP unban)) (SQ (VP (VBZ mox) (PP (IN opal)))) (. ?)))"
    substitutes = ["unban", "mox", "opal", "?"]

    parsed = tree_reader.read_trees(original)
    assert len(parsed) == 1
    result = parsed[0].replace_words(substitutes)
    assert str(result) == replaced
Exemplo n.º 15
0
def test_unique_tags():
    """
    Check the tags returned by Tree.get_unique_tags for one sample tree
    """
    sample = "((SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))"
    parsed = tree_reader.read_trees(sample)

    found = Tree.get_unique_tags(parsed)
    assert found == ['.', 'DT', 'IN', 'NN', 'VBZ', 'WP']
Exemplo n.º 16
0
def test_rare_words():
    """
    Check the words returned by Tree.get_rare_words with a threshold of 0.5

    In the sample text, "this", "seat" and "?" each occur twice, whereas
    the three expected words each occur once.
    """
    sample = "((SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))  ((SBARQ (NP (DT this) (NN seat)) (. ?)))"
    parsed = tree_reader.read_trees(sample)

    rare = Tree.get_rare_words(parsed, 0.5)
    assert rare == ['Who', 'in', 'sits']
Exemplo n.º 17
0
def main():
    """
    Read a sample tree, print it, then print the transitions built from it
    """
    # a smaller alternative for quick experiments: "(WP Who)"
    sample = "( (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))"

    parsed = read_trees(sample)
    tree = parsed[0]
    print(tree)

    print(build_sequence(tree))
Exemplo n.º 18
0
def load_trees(filename, pipeline):
    """
    Read the trees in filename and return the ones which survive filtering

    The file is loaded via load_without_asterisks, trying utf-8 first
    and latin-1 if that fails to decode.  A tree is skipped, with a
    message printed, when its first child has no label, when
    leaf_labels() raises a ValueError, when a preterminal label
    contains 'www.facebook', or when it still has a label listed in
    NODES_TO_ELIMINATE after processing.  A tree for which split_mwe
    returns None is skipped silently.

    The remaining trees are pruned, simplified, and remapped with
    REMAP_NODES and REMAP_WORDS before being returned as a list.
    """
    # not all of the files are utf-8; some are in latin-1 encoding instead
    try:
        raw_text = load_without_asterisks(filename, "utf-8")
    except UnicodeDecodeError:
        raw_text = load_without_asterisks(filename, "latin-1")

    # some of the trees have messed up validation (it will be logged),
    # which is why broken_ok=True is passed
    candidates = tree_reader.read_trees("".join(raw_text), broken_ok=True)

    kept = []
    for tree in candidates:
        if tree.children[0].label is None:
            message = "Skipping a broken tree (missing label) in {}: {}".format(filename, tree)
            print(message)
            continue

        # the leaves themselves are not needed here;
        # the call is made to find trees where it raises
        try:
            tuple(tree.leaf_labels())
        except ValueError:
            message = "Skipping a broken tree (missing preterminal) in {}: {}".format(filename, tree)
            print(message)
            continue

        if any('www.facebook' in pt.label for pt in tree.preterminals()):
            message = "Skipping a tree with a weird preterminal label in {}: {}".format(filename, tree)
            print(message)
            continue

        tree = tree.prune_none().simplify_labels(CONSTITUENT_SPLIT)
        tree = tree.remap_constituent_labels(REMAP_NODES)
        tree = tree.remap_words(REMAP_WORDS)

        tree = split_mwe(tree, pipeline)
        if tree is None:
            continue

        # find the first label, if any, which disqualifies the tree
        labels = set(parse_tree.Tree.get_unique_constituent_labels(tree))
        weird_label = next((label for label in NODES_TO_ELIMINATE if label in labels), None)
        if weird_label is not None:
            message = "Skipping a tree with a weird label {} in {}: {}".format(weird_label, filename, tree)
            print(message)
            continue

        kept.append(tree)

    return kept
Exemplo n.º 19
0
def build_trainer(pt, *args):
    """
    Build a trainer from the TREEBANK trees and check that it wraps an LSTMModel

    pt is handed to trainer.build_trainer unchanged.  The remaining
    positional arguments are parsed with constituency_parser.parse_args.
    Returns the first of the three values from trainer.build_trainer.
    """
    # TODO: build a fake embedding some other way?
    train_trees = tree_reader.read_trees(TREEBANK)
    # the last training tree doubles as the dev set
    dev_trees = train_trees[-1:]

    parsed_args = constituency_parser.parse_args(args)
    forward_charlm = trainer.load_charlm(parsed_args['charlm_forward_file'])
    backward_charlm = trainer.load_charlm(parsed_args['charlm_backward_file'])

    built, _, _ = trainer.build_trainer(
        parsed_args,
        train_trees,
        dev_trees,
        pt,
        forward_charlm,
        backward_charlm,
    )
    assert isinstance(built.model, lstm_model.LSTMModel)
    return built
Exemplo n.º 20
0
def test_unique_labels():
    """
    Check the labels returned by Tree.get_unique_constituent_labels

    The same tree appears twice in the text, and each label is expected once.
    This relies on tree_reader working, which is tested elsewhere.
    """
    sample = "((SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?))) ((SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))"
    parsed = tree_reader.read_trees(sample)

    found = Tree.get_unique_constituent_labels(parsed)
    assert found == ['NP', 'PP', 'ROOT', 'SBARQ', 'SQ', 'VP', 'WHNP']
Exemplo n.º 21
0
def test_simple():
    """
    Read two simple trees from one text and check the tag and word of each
    """
    parsed = tree_reader.read_trees("(VB Unban) (NNP Opal)")
    assert len(parsed) == 2

    verb_tree = parsed[0]
    assert verb_tree.is_preterminal()
    assert verb_tree.label == 'VB'
    assert verb_tree.children[0].label == 'Unban'

    noun_tree = parsed[1]
    assert noun_tree.is_preterminal()
    assert noun_tree.label == 'NNP'
    assert noun_tree.children[0].label == 'Opal'
Exemplo n.º 22
0
def test_replace_tags():
    """
    Test the underlying replace_tags method

    With three tags for three words, the tags are replaced in order.
    With two or with four tags, a ValueError is expected.
    """
    untagged = "((S (VP (X Find)) (NP (X Mox) (X Opal))))"
    tagged = "((S (VP (A Find)) (NP (B Mox) (C Opal))))"

    tree = tree_reader.read_trees(untagged)[0]

    result = utils.replace_tags(tree, ["A", "B", "C"])
    assert result == tree_reader.read_trees(tagged)[0]

    # too few tags
    too_few = ["A", "B"]
    with pytest.raises(ValueError):
        utils.replace_tags(tree, too_few)

    # too many tags
    too_many = ["A", "B", "C", "D"]
    with pytest.raises(ValueError):
        utils.replace_tags(tree, too_many)
Exemplo n.º 23
0
def test_one_word():
    """
    Check that trees with just one node are read as leaves

    This is probably not super relevant for the parsing use case.
    """
    parsed = tree_reader.read_trees("(FOO) (BAR)")
    assert len(parsed) == 2

    foo = parsed[0]
    assert foo.is_leaf()
    assert foo.label == 'FOO'

    bar = parsed[1]
    assert bar.is_leaf()
    assert bar.label == 'BAR'
Exemplo n.º 24
0
def test_complicated():
    """
    Read a more complicated tree and check the top of its structure

    The unlabeled outer bracket is expected to be read as ROOT, with
    SBARQ as its only child.
    """
    sample = "( (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))"
    parsed = tree_reader.read_trees(sample)
    assert len(parsed) == 1

    root = parsed[0]
    assert not root.is_leaf()
    assert not root.is_preterminal()
    assert root.label == 'ROOT'
    assert len(root.children) == 1

    sbarq = root.children[0]
    assert sbarq.label == 'SBARQ'
    assert len(sbarq.children) == 3
    assert [child.label for child in sbarq.children] == ['WHNP', 'SQ', '.']
Exemplo n.º 25
0
def test_prune_none():
    """
    Check that prune_none removes -NONE- nodes, along with any parents left empty
    """
    cases = [
        # test one dead node
        ("((SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (-NONE- in) (NP (DT this) (NN seat))))) (. ?)))",
         "(ROOT (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (NP (DT this) (NN seat))))) (. ?)))"),
        # test recursive dead nodes
        ("((SBARQ (WHNP (-NONE- Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))",
         "(ROOT (SBARQ (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))"),
        # test all children dead
        ("((SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (-NONE- this) (-NONE- seat))))) (. ?)))",
         "(ROOT (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in)))) (. ?)))"),
    ]

    for raw, pruned_text in cases:
        parsed = tree_reader.read_trees(raw)
        assert len(parsed) == 1
        pruned = parsed[0].prune_none()
        assert str(pruned) == pruned_text
Exemplo n.º 26
0
def test_all_transitions():
    """
    Check the list which all_transitions produces for the sample treebank

    The transitions come from build_treebank on a single tree.
    """
    sample = "((SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))"
    parsed = tree_reader.read_trees(sample)
    # NOTE(review): model is not used below - confirm whether it can be removed
    model = SimpleModel()
    treebank_transitions = transition_sequence.build_treebank(parsed)

    unary_labels = ("ROOT", "SQ", "WHNP")
    open_labels = ("NP", "PP", "SBARQ", "VP")
    expected = [Shift(), CloseConstituent()]
    expected.extend(CompoundUnary(label) for label in unary_labels)
    expected.extend(OpenConstituent(label) for label in open_labels)

    assert transition_sequence.all_transitions(treebank_transitions) == expected
Exemplo n.º 27
0
def test_top_down_compound_unary():
    """
    Replay the TOP_DOWN_COMPOUND transitions for a tree from the dataset

    Each transition must be legal when applied, and the top constituent
    of the final state must equal the original tree.
    """
    sentence = "(ROOT (S (NP (DT The) (NNP Arizona) (NNPS Corporations) (NNP Commission)) (VP (VBD authorized) (NP (NP (DT an) (ADJP (CD 11.5)) (NN %) (NN rate) (NN increase)) (PP (IN at) (NP (NNP Tucson) (NNP Electric) (NNP Power) (NNP Co.))) (, ,) (UCP (ADJP (ADJP (RB substantially) (JJR lower)) (SBAR (IN than) (S (VP (VBN recommended) (NP (JJ last) (NN month)) (PP (IN by) (NP (DT a) (NN commission) (NN hearing) (NN officer))))))) (CC and) (NP (NP (QP (RB barely) (PDT half)) (DT the) (NN rise)) (VP (VBN sought) (PP (IN by) (NP (DT the) (NN utility)))))))) (. .)))"

    trees = tree_reader.read_trees(sentence)
    assert len(trees) == 1
    gold_tree = trees[0]

    model = SimpleModel()
    sequence = transition_sequence.build_sequence(
        gold_tree,
        transition_scheme=TransitionScheme.TOP_DOWN_COMPOUND,
    )

    initial_states = parse_transitions.initial_state_from_gold_trees(trees, model)
    assert len(initial_states) == 1
    current = initial_states[0]

    for transition in sequence:
        assert transition.is_legal(current, model)
        current = transition.apply(current, model)

    rebuilt = model.get_top_constituent(current.constituents)
    assert gold_tree == rebuilt
Exemplo n.º 28
0
def test_depth():
    """
    Check depth() on a one node tree and on a small sentence
    """
    parsed = tree_reader.read_trees("(foo) ((S (VP (VB Unban)) (NP (NNP Mox) (NNP Opal))))")
    one_node = parsed[0]
    sentence = parsed[1]
    assert one_node.depth() == 0
    assert sentence.depth() == 4