예제 #1
0
def test_load_from_file_and_url_equivalence():
    """
    Test that loading the same CoNLL source from a URL and from a file gives
    equivalent Conll objects.

    The contents of the 'long.conll' fixture are registered through
    `responses.add` as the body returned for a test URL. The object loaded
    from that URL is then compared, sentence by sentence and token by token,
    with the object loaded directly from the fixture file.
    """
    TEST_CONLL_URL = 'https://myconllrepo.com/english/train'
    fixture_path = fixture_location('long.conll')

    with open(fixture_path) as f:
        contents = f.read()
    responses.add(responses.GET, TEST_CONLL_URL, body=contents)

    url_c = load_from_url(TEST_CONLL_URL)
    file_c = load_from_file(fixture_path)

    # The lengths are compared first so that the lockstep iteration below
    # cannot silently ignore trailing sentences on either side.
    assert len(url_c) == len(file_c)
    for url_sent, file_sent in zip(url_c, file_c):
        assert url_sent.id == file_sent.id
        assert url_sent.text == file_sent.text

        for url_token in url_sent:
            # Look the counterpart up by token id rather than by position.
            file_token = file_sent[url_token.id]
            assert_token_members(url_token, file_token.id, file_token.form,
                                 file_token.lemma, file_token.upos,
                                 file_token.xpos, file_token.feats,
                                 file_token.head, file_token.deprel,
                                 file_token.deps, file_token.misc)
예제 #2
0
def test_append_contains():
    """
    Test that membership checks still work after appending a Sentence.

    A Sentence that was appended and a Sentence that was already present must
    both be reported as contained, while a Sentence built from the same source
    but given a different id must not.
    """
    source = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1	Les	le	DET	_	Definite=Def|Gender=Fem|Number=Plur|PronType=Art	2	det	_	_\n'
        '2	études	étude	NOUN	_	Gender=Fem|Number=Plur	3	nsubj	_	_\n'
        '3	durent	durer	VERB	_	Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin	0	root	_	_\n'
        '4	six	six	NUM	_	_	5	nummod	_	_\n'
        '5	ans	an	NOUN	_	Gender=Masc|Number=Plur	3	obj	_	_\n'
        '6	mais	mais	CCONJ	_	_	9	cc	_	_\n'
        '7	leur	son	DET	_	Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs	8	det	_	_\n'
        '8	contenu	contenu	NOUN	_	Gender=Masc|Number=Sing	9	nsubj	_	_\n'
        '9	diffère	différer	VERB	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	3	conj	_	_\n'
        '10	donc	donc	ADV	_	_	9	advmod	_	_\n'
        '11	selon	selon	ADP	_	_	13	case	_	_\n'
        '12	les	le	DET	_	Definite=Def|Number=Plur|PronType=Art	13	det	_	_\n'
        '13	Facultés	Facultés	PROPN	_	_	9	obl	_	SpaceAfter=No\n'
        '14	.	.	PUNCT	_	_	3	punct	_	_')

    with open(fixture_location('long.conll')) as fixture:
        conll = Conll(fixture)

    # A sentence that is part of the fixture before anything is appended.
    existing = conll[6]

    appended = Sentence(source)
    # Same source as the appended sentence, but with a different id.
    renamed = Sentence(source)
    renamed.id = 'xyz'

    conll.append(appended)

    assert appended in conll
    assert existing in conll
    assert renamed not in conll
예제 #3
0
def test_setitem():
    """
    Test that assigning a Sentence to an index stores that Sentence.

    After the assignment, the entry at that index must serialize back to the
    source it was built from and expose the id given in that source.
    """
    source = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1	Les	le	DET	_	Definite=Def|Gender=Fem|Number=Plur|PronType=Art	2	det	_	_\n'
        '2	études	étude	NOUN	_	Gender=Fem|Number=Plur	3	nsubj	_	_\n'
        '3	durent	durer	VERB	_	Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin	0	root	_	_\n'
        '4	six	six	NUM	_	_	5	nummod	_	_\n'
        '5	ans	an	NOUN	_	Gender=Masc|Number=Plur	3	obj	_	_\n'
        '6	mais	mais	CCONJ	_	_	9	cc	_	_\n'
        '7	leur	son	DET	_	Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs	8	det	_	_\n'
        '8	contenu	contenu	NOUN	_	Gender=Masc|Number=Sing	9	nsubj	_	_\n'
        '9	diffère	différer	VERB	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	3	conj	_	_\n'
        '10	donc	donc	ADV	_	_	9	advmod	_	_\n'
        '11	selon	selon	ADP	_	_	13	case	_	_\n'
        '12	les	le	DET	_	Definite=Def|Number=Plur|PronType=Art	13	det	_	_\n'
        '13	Facultés	Facultés	PROPN	_	_	9	obl	_	SpaceAfter=No\n'
        '14	.	.	PUNCT	_	_	3	punct	_	_')
    replacement = Sentence(source)

    with open(fixture_location('basic.conll')) as fixture:
        conll = Conll(fixture)

    conll[1] = replacement

    stored = conll[1]
    assert stored.conll() == source
    assert stored.id == 'fr-ud-dev_00002'
예제 #4
0
def test_contains_non_existent_id():
    """
    Test that contains properly executes when the sentence id is unknown.

    A Sentence is built from a source whose id is not expected to be present
    in the 'basic.conll' fixture, and the membership check must report it as
    absent.
    """
    with open(fixture_location('basic.conll')) as f:
        conll = Conll(f)

    # Each comment line must end with a newline; without it the sent_id and
    # text comments are concatenated into a single line of the source.
    source = (
        '# sent_id = fr-ud-dev_00037\n'
        '# text = Thionville et Congerville furent créée en 1793 avec leur nom actuel et fusionnèrent en 1973.\n'
        '1	Thionville	Thionville	PROPN	_	_	5	nsubj:pass	_	_\n'
        '2	et	et	CCONJ	_	_	3	cc	_	_\n'
        '3	Congerville	Congerville	PROPN	_	_	1	conj	_	_\n'
        '4	furent	être	AUX	_	Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin	5	aux:pass	_	_\n'
        '5	créée	créer	VERB	_	Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part	0	root	_	_\n'
        '6	en	en	ADP	_	_	7	case	_	_\n'
        '7	1793	1793	NUM	_	_	5	obl	_	_\n'
        '8	avec	avec	ADP	_	_	10	case	_	_\n'
        '9	leur	son	DET	_	Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs	10	det	_	_\n'
        '10	nom	nom	NOUN	_	Gender=Masc|Number=Sing	5	obl:mod	_	_\n'
        '11	actuel	actuel	ADJ	_	Gender=Masc|Number=Sing	10	amod	_	_\n'
        '12	et	et	CCONJ	_	_	13	cc	_	_\n'
        '13	fusionnèrent	fusionner	VERB	_	Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin	5	conj	_	_\n'
        '14	en	en	ADP	_	_	15	case	_	_\n'
        '15	1973	1973	NUM	_	_	13	obl	_	SpaceAfter=No\n'
        '16	.	.	PUNCT	_	_	5	punct	_	_\n')
    sentence = Sentence(source)

    assert sentence not in conll
예제 #5
0
def test_contains_true():
    """
    Test that a Conll object can test for membership presence properly.

    The appended Sentence must be reported as contained both right after the
    append and after one of its tokens has been modified.
    """
    with open(fixture_location('basic.conll')) as f:
        conll = Conll(f)

    source = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1	Les	le	DET	_	Definite=Def|Gender=Fem|Number=Plur|PronType=Art	2	det	_	_\n'
        '2	études	étude	NOUN	_	Gender=Fem|Number=Plur	3	nsubj	_	_\n'
        '3	durent	durer	VERB	_	Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin	0	root	_	_\n'
        '4	six	six	NUM	_	_	5	nummod	_	_\n'
        '5	ans	an	NOUN	_	Gender=Masc|Number=Plur	3	obj	_	_\n'
        '6	mais	mais	CCONJ	_	_	9	cc	_	_\n'
        '7	leur	son	DET	_	Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs	8	det	_	_\n'
        '8	contenu	contenu	NOUN	_	Gender=Masc|Number=Sing	9	nsubj	_	_\n'
        '9	diffère	différer	VERB	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	3	conj	_	_\n'
        '10	donc	donc	ADV	_	_	9	advmod	_	_\n'
        '11	selon	selon	ADP	_	_	13	case	_	_\n'
        '12	les	le	DET	_	Definite=Def|Number=Plur|PronType=Art	13	det	_	_\n'
        '13	Facultés	Facultés	PROPN	_	_	9	obl	_	SpaceAfter=No\n'
        '14	.	.	PUNCT	_	_	3	punct	_	_')
    sentence = Sentence(source)
    conll.append(sentence)

    assert sentence in conll

    # Modify the token's universal POS field (`upos`, the attribute name the
    # other tests in this file read) so that the second check runs against a
    # sentence whose token data actually changed.
    # NOTE(review): the original assigned to `pos`; confirm `upos` is the
    # intended field.
    sentence['1'].upos = 'NOUN'

    assert sentence in conll
예제 #6
0
def test_insert():
    """
    Test that a Sentence can be inserted into a Conll object.

    Inserting must grow the object by exactly one sentence, and the inserted
    Sentence must be retrievable at the index it was inserted at.
    """
    source = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1	Les	le	DET	_	Definite=Def|Gender=Fem|Number=Plur|PronType=Art	2	det	_	_\n'
        '2	études	étude	NOUN	_	Gender=Fem|Number=Plur	3	nsubj	_	_\n'
        '3	durent	durer	VERB	_	Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin	0	root	_	_\n'
        '4	six	six	NUM	_	_	5	nummod	_	_\n'
        '5	ans	an	NOUN	_	Gender=Masc|Number=Plur	3	obj	_	_\n'
        '6	mais	mais	CCONJ	_	_	9	cc	_	_\n'
        '7	leur	son	DET	_	Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs	8	det	_	_\n'
        '8	contenu	contenu	NOUN	_	Gender=Masc|Number=Sing	9	nsubj	_	_\n'
        '9	diffère	différer	VERB	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	3	conj	_	_\n'
        '10	donc	donc	ADV	_	_	9	advmod	_	_\n'
        '11	selon	selon	ADP	_	_	13	case	_	_\n'
        '12	les	le	DET	_	Definite=Def|Number=Plur|PronType=Art	13	det	_	_\n'
        '13	Facultés	Facultés	PROPN	_	_	9	obl	_	SpaceAfter=No\n'
        '14	.	.	PUNCT	_	_	3	punct	_	_')
    inserted = Sentence(source)

    with open(fixture_location('basic.conll')) as fixture:
        corpus = Conll(fixture)
    size_before = len(corpus)

    corpus.insert(2, inserted)

    assert len(corpus) == size_before + 1

    at_position = corpus[2]
    assert at_position.id == 'fr-ud-dev_00002'
    assert len(at_position) == 14
예제 #7
0
def test_invalid_conll():
    """
    Test that constructing a Conll object from the 'invalid.conll' fixture
    raises a ValueError.
    """
    # The constructed object is never used; only the raised error matters.
    with open(fixture_location('invalid.conll')) as f, \
            pytest.raises(ValueError):
        Conll(f)
예제 #8
0
def test_getitem_raises_typeerror():
    """
    Test that a non integer or slice key raises a TypeError.
    """
    with open(fixture_location('basic.conll')) as f:
        c = Conll(f)

    # The lookup result is irrelevant; only the raised TypeError is checked.
    with pytest.raises(TypeError):
        c['error']
예제 #9
0
def test_writing_output():
    """
    Test that CoNLL files are properly created.

    The 'basic.conll' fixture is parsed, written back out to a temporary
    'output.conll' file next to the fixtures, and the written text is
    compared with the original fixture text.
    """
    with open(fixture_location('basic.conll')) as f:
        contents_basic = f.read()
        # Rewind so the same handle can be parsed after reading its raw text.
        f.seek(0)
        conll = Conll(f)

    output_loc = fixture_location('output.conll')
    try:
        with open(output_loc, 'w') as f:
            conll.write(f)

        with open(output_loc) as f:
            contents_write = f.read()
    finally:
        # Always clean up so a failing write or read does not leave a stray
        # file in the fixtures directory.
        if os.path.exists(output_loc):
            os.remove(output_loc)

    assert contents_basic == contents_write
예제 #10
0
def test_ngram_first_word_match():
    """
    Test that matching only the first word of an ngram does not count as a
    match: the search must yield nothing.
    """
    ngram = 'un cabinet'.split()
    conll = load_from_file(fixture_location('long.conll'))
    matches = find_ngrams(conll, ngram)

    # An exhausted iterator on the very first `next` means no match at all.
    with pytest.raises(StopIteration):
        next(matches)
예제 #11
0
def test_ngram_standard():
    """
    Test that find_ngrams works in a standard situation, yielding the
    matching sentence together with the expected index.
    """
    conll = load_from_file(fixture_location('basic.conll'))
    matches = find_ngrams(conll, 'un film sur la'.split())

    sentence, index = next(matches)

    assert sentence.id == 'fr-ud-dev_00001'
    assert index == 2
예제 #12
0
def test_ngram_none():
    """
    Test that no ngram is identified when none exists.
    """
    ngram = 'cabinet'.split()
    conll = load_from_file(fixture_location('long.conll'))
    matches = find_ngrams(conll, ngram)

    # The search must be exhausted immediately.
    with pytest.raises(StopIteration):
        next(matches)
예제 #13
0
def test_no_nonprojectivities():
    """
    Test that no dependencies are reported for a sentence with no
    non-projective dependencies.
    """
    conll = load_from_file(fixture_location('projectivities.conll'))
    found = find_nonprojective_deps(conll[0])

    assert not found
예제 #14
0
def test_numeric_indexing():
    """
    Test that sentences can be retrieved through their numeric position.
    """
    with open(fixture_location('basic.conll')) as fixture:
        corpus = Conll(fixture)

    first = corpus[0]

    assert len(first) == 10
    assert first.id == 'fr-ud-dev_00001'
예제 #15
0
def test_delitem_single_int():
    """
    Test that deleting by integer index removes that Sentence from the Conll
    object, so that the following Sentence takes over the index.
    """
    with open(fixture_location('basic.conll')) as fixture:
        corpus = Conll(fixture)

    del corpus[2]

    assert len(corpus) == 3
    assert corpus[2].id == 'fr-ud-dev_00004'
예제 #16
0
def test_multiword_ignore():
    """
    Test that multiword tokens are ignored and do not cause errors when
    searching for non-projective dependencies.
    """
    conll = load_from_file(fixture_location('projectivities.conll'))
    sentence = conll[3]

    expected = [(sentence['16'], sentence['4'])]
    found = find_nonprojective_deps(sentence)

    assert found == expected
예제 #17
0
def test_string_output():
    """
    Test that the string produced by a Conll object matches the text of the
    file it was parsed from.
    """
    with open(fixture_location('basic.conll')) as fixture:
        raw_text = fixture.read()
        # Rewind so the same handle can be parsed after reading its raw text.
        fixture.seek(0)
        corpus = Conll(fixture)

    serialized = corpus.conll()

    assert raw_text == serialized
예제 #18
0
def test_load_from_file():
    """
    Test that a CoNLL file can be loaded properly when given a filename.
    """
    corpus = load_from_file(fixture_location('basic.conll'))
    second = corpus[1]

    assert len(corpus) == 4
    assert len(second) == 14
    assert second['10'].form == 'donc'
예제 #19
0
def test_iter_from_file():
    """
    Test that the sentences of a CoNLL file can be iterated over given the
    filename, yielding the expected sentence ids in order.
    """
    path = fixture_location('basic.conll')

    seen_ids = []
    for sentence in iter_from_file(path):
        seen_ids.append(sentence.id)

    wanted_ids = ['fr-ud-dev_0000{}'.format(n) for n in range(1, 5)]

    assert wanted_ids == seen_ids
예제 #20
0
def test_iter_from_string():
    """
    Test that the sentences of a CoNLL source in string form can be iterated
    over, yielding the expected sentence ids in order.
    """
    with open(fixture_location('basic.conll')) as fixture:
        text = fixture.read()

    seen_ids = []
    for sentence in iter_from_string(text):
        seen_ids.append(sentence.id)

    wanted_ids = ['fr-ud-dev_0000{}'.format(n) for n in range(1, 5)]

    assert wanted_ids == seen_ids
예제 #21
0
def test_multiple_nonprojectivities():
    """
    Test that multiple disjoint non-projectivities are properly identified.
    """
    conll = load_from_file(fixture_location('projectivities.conll'))
    sentence = conll[5]

    # Order of the reported dependencies is not significant, so compare sets.
    expected = {
        (sentence['22'], sentence['3']),
        (sentence['22'], sentence['21']),
        (sentence['28'], sentence['25']),
    }
    found = find_nonprojective_deps(sentence)

    assert set(found) == expected
예제 #22
0
def test_overlapping_nonprojectivities():
    """
    Test that multiple non-projectivities are identified when they overlap.
    """
    conll = load_from_file(fixture_location('projectivities.conll'))
    sentence = conll[4]

    # Order of the reported dependencies is not significant, so compare sets.
    expected = {
        (sentence['16'], sentence['4']),
        (sentence['16'], sentence['11']),
    }
    found = find_nonprojective_deps(sentence)

    assert set(found) == expected
예제 #23
0
def test_delitem_contains():
    """
    Test that membership checks still work after a deletion: a Sentence is
    contained before it is deleted and no longer contained afterwards.
    """
    with open(fixture_location('long.conll')) as fixture:
        corpus = Conll(fixture)

    removed = corpus[1]
    assert removed in corpus

    del corpus[1]
    assert removed not in corpus
예제 #24
0
def test_par_and_doc_id_basic():
    """
    Test that the document ids are properly associated with the Sentences of
    the 'par_doc_ids_basic.conll' fixture.

    Only the `doc_id` of each Sentence is checked here.
    """
    with open(fixture_location('par_doc_ids_basic.conll')) as fixture:
        corpus = Conll(fixture)

    seen_doc_ids = []
    for sentence in corpus:
        seen_doc_ids.append(sentence.doc_id)

    wanted_doc_ids = ['2', '2', '1', '1']

    assert wanted_doc_ids == seen_doc_ids
예제 #25
0
def test_iter_from_network():
    """
    Test that a CoNLL file served from a URL can be iterated over, yielding
    the expected sentence ids in order.
    """
    TEST_CONLL_URL = 'https://myconllrepo.com/english/train'

    # Register the fixture text as the body returned for the test URL.
    with open(fixture_location('basic.conll')) as fixture:
        responses.add(responses.GET, TEST_CONLL_URL, body=fixture.read())

    seen_ids = []
    for sentence in iter_from_url(TEST_CONLL_URL):
        seen_ids.append(sentence.id)

    wanted_ids = ['fr-ud-dev_0000{}'.format(n) for n in range(1, 5)]

    assert wanted_ids == seen_ids
예제 #26
0
def test_iter_from_network_fail():
    """
    Test that iterating over a URL for which no response was registered
    raises a requests RequestException.
    """
    TEST_CONLL_URL = 'https://myconllrepo.com/english/train'
    WRONG_URL = 'https://myconllrepo.com/english/gibberish'

    # Only the correct URL gets a registered response.
    with open(fixture_location('basic.conll')) as fixture:
        responses.add(responses.GET, TEST_CONLL_URL, body=fixture.read())

    with pytest.raises(requests.exceptions.RequestException):
        # Drain the iterator; the yielded sentences themselves are irrelevant.
        for _ in iter_from_url(WRONG_URL):
            pass
예제 #27
0
def test_sentence_line_numbers():
    """
    Test that the CoNLL files properly associate line numbers.

    Each Sentence parsed from the 'basic.conll' fixture must report the
    expected start and end line numbers.
    """
    # Expected (start_line_number, end_line_number) pair for each sentence.
    sentence_bounds = [(1, 12), (14, 29), (31, 41), (43, 96)]

    with open(fixture_location('basic.conll')) as f:
        c = Conll(f)

    # Guard against a vacuous pass: without this check a parse yielding fewer
    # sentences than expected would leave some bounds unverified.
    assert len(c) == len(sentence_bounds)

    for sent, (start, end) in zip(c, sentence_bounds):
        assert sent.start_line_number == start
        assert sent.end_line_number == end
예제 #28
0
def test_load_from_string():
    """
    Test that a CoNLL source can be loaded properly when given as a string.
    """
    with open(fixture_location('basic.conll')) as fixture:
        text = fixture.read()

    corpus = load_from_string(text)
    second = corpus[1]

    assert len(corpus) == 4
    assert len(second) == 14
    assert second['10'].form == 'donc'
예제 #29
0
def test_simple_nonprojectivities():
    """
    Test the logic on sentences that each contain a single non-projectivity.
    """
    conll = load_from_file(fixture_location('projectivities.conll'))

    first = conll[3]
    first_found = find_nonprojective_deps(first)

    second = conll[2]
    second_found = find_nonprojective_deps(second)

    first_expected = [(first['16'], first['4'])]
    second_expected = [(second['8'], second['5'])]

    assert first_found == first_expected
    assert second_found == second_expected
예제 #30
0
def test_ngram_multiword_split():
    """
    Test that ngram searches still work when the ngram goes over a multiword
    token, and that exactly one match is produced.
    """
    ngram = 'de " décentrement de le Sujet "'.split()
    conll = load_from_file(fixture_location('long.conll'))
    matches = find_ngrams(conll, ngram)

    sentence, index = next(matches)

    assert sentence.id == 'fr-ud-test_00002'
    assert index == 8

    # There must be no second match.
    with pytest.raises(StopIteration):
        next(matches)