Пример #1
0
def full_model_accuracy():
    """Train a classifier on the training split and score it on the dev split.

    For every dev sentence the heads are cleared, the parser is run with the
    trained classifier choosing the actions, and each entry whose predicted
    head equals the original head counts as correct.

    Returns:
        The number of correctly restored heads divided by the total number
        of entries in the development set.
    """
    # fit the classifier on every parse in the training file
    classifier = depparse.Classifier(
        depparse.read_conllu("UD_English-EWT/en_ewt-ud-train.conllu"))

    # running tallies over the whole development set
    n_matching = 0
    n_entries = 0
    dev_parses = depparse.read_conllu("UD_English-EWT/en_ewt-ud-dev.conllu")
    for sentence in dev_parses:
        n_entries += len(sentence)

        # wipe the heads, keeping what clear_heads returns as the reference
        gold_heads = clear_heads(sentence)

        # let the classifier pick the parser actions for this sentence
        depparse.parse(sentence, classifier)

        # tally the entries whose head matches the reference again
        n_matching += sum(
            1 for dep, gold in zip(sentence, gold_heads) if dep.head == gold)

    # accuracy = matching heads / all entries
    return n_matching / n_entries
def test_read_conllu():
    """Check that read_conllu yields the full training set as Dep objects.

    The first pass checks the overall sentence and entry counts, the type of
    every entry, and that each sentence has exactly one "root" relation.
    The second pass compares selected entries of one sentence field by field
    against hand-built Dep objects.
    """
    # first pass: walk the whole training file, counting as we go
    n_sentences = 0
    n_deps = 0
    train_path = "UD_English-EWT/en_ewt-ud-train.conllu"
    for n_sentences, sentence in enumerate(depparse.read_conllu(train_path), 1):
        n_deps += len(sentence)

        # every element of a sentence has to be a Dep
        assert all(isinstance(dep, depparse.Dep) for dep in sentence)

        # a sentence has one and only one entry with the "root" relation
        assert sum(1 for dep in sentence if dep.deprel == "root") == 1

    # the totals must match the known size of the training data
    assert n_sentences == 12543
    assert n_deps == 204607

    # second pass: look closely at one particular sentence, shown below as
    # it appears in the training file

    # 1	Over	over	ADV	RB	_	2	advmod	2:advmod	_
    # 2	300	300	NUM	CD	NumType=Card	3	nummod	3:nummod	_
    # 3	Iraqis	Iraqis	PROPN	NNPS	Number=Plur	5	nsubj:pass	5:nsubj:pass|6:nsubj:xsubj|8:nsubj:pass	_
    # 4	are	be	AUX	VBP	Mood=Ind|Tense=Pres|VerbForm=Fin	5	aux:pass	5:aux:pass	_
    # 5	reported	report	VERB	VBN	Tense=Past|VerbForm=Part|Voice=Pass	0	root	0:root	_
    # 6	dead	dead	ADJ	JJ	Degree=Pos	5	xcomp	5:xcomp	_
    # 7	and	and	CCONJ	CC	_	8	cc	8:cc|8.1:cc	_
    # 8	500	500	NUM	CD	NumType=Card	5	conj	5:conj:and|8.1:nsubj:pass|9:nsubj:xsubj	_
    # 8.1	reported	report	VERB	VBN	Tense=Past|VerbForm=Part|Voice=Pass	_	_	5:conj:and	CopyOf=5
    # 9	wounded	wounded	ADJ	JJ	Degree=Pos	8	orphan	8.1:xcomp	_
    # 10	in	in	ADP	IN	_	11	case	11:case	_
    # 11	Fallujah	Fallujah	PROPN	NNP	Number=Sing	5	obl	5:obl:in	_
    # 12	alone	alone	ADV	RB	_	11	advmod	11:advmod	SpaceAfter=No
    # 13	.	.	PUNCT	.	_	5	punct	5:punct	_
    sentences = depparse.read_conllu("UD_English-EWT/en_ewt-ud-train.conllu")
    (parse,) = itertools.islice(sentences, 61, 62)

    # (position in the sentence, Dep expected at that position)
    expected_entries = [
        (0, depparse.Dep(
            "1", "Over", "over", "ADV", "RB", [], "2", "advmod",
            ["2:advmod"], None)),
        (2, depparse.Dep(
            "3", "Iraqis", "Iraqis", "PROPN", "NNPS", ["Number=Plur"], "5",
            "nsubj:pass",
            ["5:nsubj:pass", "6:nsubj:xsubj", "8:nsubj:pass"], None)),
        (3, depparse.Dep(
            "4", "are", "be", "AUX", "VBP",
            ["Mood=Ind", "Tense=Pres", "VerbForm=Fin"],
            "5", "aux:pass", ["5:aux:pass"], None)),
        (4, depparse.Dep(
            "5", "reported", "report", "VERB", "VBN",
            ["Tense=Past", "VerbForm=Part", "Voice=Pass"],
            "0", "root", ["0:root"], None)),
        # the "8.1" entry has no head and no relation of its own
        (8, depparse.Dep(
            "8.1", "reported", "report", "VERB", "VBN",
            ["Tense=Past", "VerbForm=Part", "Voice=Pass"],
            None, None, ["5:conj:and"], "CopyOf=5")),
    ]
    for position, expected in expected_entries:
        assert parse[position] == expected
Пример #3
0
def test_parse():
    """Replay a fixed action sequence through the parser for one sentence.

    The heads of a known training sentence are cleared, the parser is driven
    by a hand-written list of actions, and the resulting heads must equal
    the ones the sentence had before clearing.
    """
    # the sentence used here, as it appears in the training data:

    # # sent_id = weblog-blogspot.com_alaindewitt_20040929103700_ENG_20040929_103700-0026
    # # text = The future president joined the Guard in May 1968.
    # 1    The    the    DET    DT    Definite=Def|PronType=Art    3    det    3:det    _
    # 2    future    future    ADJ    JJ    Degree=Pos    3    amod    3:amod    _
    # 3    president    president    NOUN    NN    Number=Sing    4    nsubj    4:nsubj    _
    # 4    joined    join    VERB    VBD    Mood=Ind|Tense=Past|VerbForm=Fin    0    root    0:root    _
    # 5    the    the    DET    DT    Definite=Def|PronType=Art    6    det    6:det    _
    # 6    Guard    Guard    PROPN    NNP    Number=Sing    4    obj    4:obj    _
    # 7    in    in    ADP    IN    _    8    case    8:case    _
    # 8    May    May    PROPN    NNP    Number=Sing    4    obl    4:obl:in    _
    # 9    1968    1968    NUM    CD    NumType=Card    8    nummod    8:nummod    SpaceAfter=No
    # 10    .    .    PUNCT    .    _    4    punct    4:punct    _
    sentences = depparse.read_conllu("UD_English-EWT/en_ewt-ud-train.conllu")
    (deps,) = itertools.islice(sentences, 352, 353)

    # wipe the heads, keeping what clear_heads returns as the reference
    gold_heads = clear_heads(deps)

    # the oracle action sequence for this sentence, written out by hand
    shift, left_arc, right_arc = (
        Action.SHIFT, Action.LEFT_ARC, Action.RIGHT_ARC)
    oracle_actions = [
        shift, shift, shift,
        left_arc, left_arc,
        shift, left_arc,
        shift, shift, left_arc, right_arc,
        shift, shift, left_arc,
        shift, right_arc, right_arc,
        shift, right_arc,
    ]

    # drive the parser with exactly those actions
    depparse.parse(deps, IterActions(oracle_actions))

    # the parser must have put every head back where it was
    restored_heads = [dep.head for dep in deps]
    assert restored_heads == gold_heads
Пример #4
0
def test_oracle():
    """Check the oracle's individual decisions and its full action sequence.

    A known training sentence is used twice: first the oracle is called
    directly on hand-built (stack, queue) configurations and must return the
    expected action for each; then a fresh oracle drives a complete parse and
    the actions it recorded in ``oracle.actions`` are compared against the
    expected sequence.
    """
    # consider a specific sentence from the training data

    # # sent_id = answers-20111108085734AATXy0E_ans-0004
    # # text = Plaster of Paris does two things
    # 1    Plaster    plaster    NOUN    NN    Number=Sing    4    nsubj    4:nsubj    _
    # 2    of    of    ADP    IN    _    3    case    3:case    _
    # 3    Paris    Paris    PROPN    NNP    Number=Sing    1    nmod    1:nmod:of    _
    # 4    does    do    VERB    VBZ    Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin    0    root    0:root    _
    # 5    two    two    NUM    CD    NumType=Card    6    nummod    6:nummod    _
    # 6    things    thing    NOUN    NNS    Number=Plur    4    obj    4:obj    _
    parses = depparse.read_conllu("UD_English-EWT/en_ewt-ud-train.conllu")
    [deps] = itertools.islice(parses, 7475, 7476)

    # create an oracle for the sentence and try a few actions; each call
    # passes the stack first and the remaining queue second
    oracle = depparse.Oracle(deps)
    # shift on an empty stack
    assert oracle([], deps) == Action.SHIFT
    # shift on a stack with only one entry
    assert oracle(deps[:1], deps[1:]) == Action.SHIFT
    # shift because "Plaster" and "of" are not in a head-dependent relation
    assert oracle(deps[:2], deps[2:]) == Action.SHIFT
    # left-arc because "Paris" is the head of "of"
    assert oracle(deps[:3], deps[3:]) == Action.LEFT_ARC
    # right-arc because "Plaster" is the head of "Paris"
    assert oracle(deps[:1] + deps[2:3], deps[3:]) == Action.RIGHT_ARC

    # create a new oracle for the same sentence and extract all the actions
    # (a fresh instance, so the probing calls above are not part of its record)
    oracle = depparse.Oracle(deps)
    depparse.parse(deps, oracle)

    assert oracle.actions == [
        Action.SHIFT,  # 1
        Action.SHIFT,  # 2
        Action.SHIFT,  # 3
        Action.LEFT_ARC,  # 4
        Action.RIGHT_ARC,  # 5
        Action.SHIFT,  # 6
        Action.LEFT_ARC,  # 7
        Action.SHIFT,  # 8
        Action.SHIFT,  # 9
        Action.LEFT_ARC,  # 10
        Action.RIGHT_ARC,  # 11
    ]
Пример #5
0
def test_oracle_round_trip():
    """Check that replaying the oracle's actions reproduces the heads.

    For each of the first 50 training sentences (minus the ones listed as
    non-projective), the oracle drives one parse to record its actions; the
    heads are then cleared and the recorded actions are replayed, after which
    every head must equal its original value.
    """
    # positions, within the first 50 sentences, of the non-projective parses
    non_projective = {4, 21, 25, 31}

    sentences = depparse.read_conllu("UD_English-EWT/en_ewt-ud-train.conllu")
    first_fifty = itertools.islice(sentences, 50)
    for position, deps in enumerate(first_fifty):

        # non-projective parses are left out of this check
        if position in non_projective:
            continue

        # remember each word's head before anything touches the sentence
        gold_heads = [dep.head for dep in deps]

        # first parse: the oracle decides, and records, every action
        oracle = depparse.Oracle(deps)
        depparse.parse(deps, oracle)

        # wipe the heads so the replay has to rebuild them
        clear_heads(deps)

        # second parse: replay the recorded actions one by one
        depparse.parse(deps, IterActions(oracle.actions))

        # the replay must have restored every original head
        replayed_heads = [dep.head for dep in deps]
        assert replayed_heads == gold_heads