def test_invalid_comment_modification():
    """
    Check that set_meta refuses to overwrite the 'text' comment.

    Calling set_meta with the 'text' key is expected to raise a ValueError.
    """
    conllu = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    parsed = Sentence(conllu)

    with pytest.raises(ValueError):
        parsed.set_meta('text', 'Qualcosa differente alla frase')
def test_text_readonly():
    """
    Check that Sentence.text reflects the 'text' comment and rejects
    assignment with an AttributeError.
    """
    conllu = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    parsed = Sentence(conllu)

    # Assignment must fail ...
    with pytest.raises(AttributeError):
        parsed.text = 'error causing text'

    # ... and the value read back must still be the original text comment.
    expected_text = 'Les études durent six ans mais leur contenu diffère donc selon les Facultés.'
    assert parsed.text == expected_text
def test_tree_no_extra_nodes():
    """
    Test that there are the right amount of nodes in the tree.

    The tree returned by to_tree() is walked iteratively and the number of
    visited nodes is compared with len(sentence).
    """
    source = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    sentence = Sentence(source)
    st = sentence.to_tree()

    # Depth-first walk with an explicit stack; every popped node counts once.
    count = 0
    nodes = [st]
    while nodes:
        count += 1
        node = nodes.pop()
        # Iterating a node yields its children, so push them all at once.
        nodes.extend(node)

    assert len(sentence) == count
def test_append_contains():
    """
    Check that membership tests on a Conll still work after appending a
    Sentence.

    The appended sentence and a sentence already in the fixture must be
    reported as present; a second sentence built from the same source but
    given a different id must not.
    """
    with open(fixture_location('long.conll')) as f:
        c = Conll(f)
        existing = c[6]

    conllu = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    appended = Sentence(conllu)
    impostor = Sentence(conllu)
    impostor.id = 'xyz'

    c.append(appended)

    assert appended in c
    assert existing in c
    assert impostor not in c
def test_tree_empty_sentence():
    """
    Test that an empty sentence is properly parsed.

    The tree built from an empty source is expected to have no data, no
    parent and no children.
    """
    source = ''
    sentence = Sentence(source)
    st = sentence.to_tree()

    # Identity checks against None cannot be fooled by a custom __eq__.
    assert st.data is None
    assert st.parent is None
    assert len(st) == 0
def test_modified_output():
    """
    Test if the sentence is properly outputted after changing the annotation.

    The sentence id and the lemma, upos and Number feature of token '13' are
    modified, and the serialised result is compared with the expected text.
    """
    source = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    sentence = Sentence(source)

    sentence.id = 'fr-ud-dev_00231'

    sentence['13'].lemma = 'facultés'
    sentence['13'].upos = 'NOUN'
    # Start from an empty value set so that 'Fem' is the only Number value.
    sentence['13'].feats['Number'] = set()
    sentence['13'].feats['Number'].add('Fem')

    output = (
        '# sent_id = fr-ud-dev_00231\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés facultés NOUN _ Number=Fem 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')

    assert sentence.conll() == output
def test_int_indexing():
    """
    Check that a Sentence can be indexed by integer position.

    Position 7 is expected to be the token whose id is '8'.
    """
    conllu = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    parsed = Sentence(conllu)

    picked = parsed[7]
    expected_feats = {'Gender': {'Masc'}, 'Number': {'Sing'}}
    assert_token_members(picked, '8', 'contenu', 'contenu', 'NOUN', None,
                         expected_feats, '9', 'nsubj', {}, {})
def test_invalid_sentence_by_token():
    """
    Test that an invalid token results in an invalid sentence.

    Token 7 carries a feature entry ('Gender') with no value, and building
    the Sentence is expected to raise a ValueError.
    """
    source = (
        '# newpar id\n'
        '# sent_id =\n'
        '# text =\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')

    # Only the constructor call matters; its result is never used.
    with pytest.raises(ValueError):
        Sentence(source)
def test_token_line_numbers():
    """
    Check the line_number reported by tokens when the Sentence is given
    explicit start and end line numbers.

    With a start line of 14, tokens '1' and '7' are expected on lines 16
    and 22 respectively.
    """
    first_line, last_line = 14, 29
    conllu = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    parsed = Sentence(conllu,
                      _start_line_number=first_line,
                      _end_line_number=last_line)

    assert parsed['1'].line_number == 16
    assert parsed['7'].line_number == 22
def test_setitem():
    """
    Check that assigning a Sentence to an index of a Conll stores it.

    After the assignment, the element at that index must serialise back to
    the source text and expose the expected id.
    """
    with open(fixture_location('basic.conll')) as f:
        c = Conll(f)

    conllu = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    replacement = Sentence(conllu)

    c[1] = replacement

    assert c[1].conll() == conllu
    assert c[1].id == 'fr-ud-dev_00002'
def test_contains_non_existent_id():
    """
    Test that contains properly executes when the sentence id is unknown.

    A sentence that is built independently of the fixture is expected not
    to be reported as a member of the Conll object.
    """
    with open(fixture_location('basic.conll')) as f:
        conll = Conll(f)

    # Every comment line ends with '\n' so that the sent_id and text
    # comments stay on separate lines instead of fusing into one.
    source = (
        '# sent_id = fr-ud-dev_00037\n'
        '# text = Thionville et Congerville furent créée en 1793 avec leur nom actuel et fusionnèrent en 1973.\n'
        '1 Thionville Thionville PROPN _ _ 5 nsubj:pass _ _\n'
        '2 et et CCONJ _ _ 3 cc _ _\n'
        '3 Congerville Congerville PROPN _ _ 1 conj _ _\n'
        '4 furent être AUX _ Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin 5 aux:pass _ _\n'
        '5 créée créer VERB _ Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part 0 root _ _\n'
        '6 en en ADP _ _ 7 case _ _\n'
        '7 1793 1793 NUM _ _ 5 obl _ _\n'
        '8 avec avec ADP _ _ 10 case _ _\n'
        '9 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 10 det _ _\n'
        '10 nom nom NOUN _ Gender=Masc|Number=Sing 5 obl:mod _ _\n'
        '11 actuel actuel ADJ _ Gender=Masc|Number=Sing 10 amod _ _\n'
        '12 et et CCONJ _ _ 13 cc _ _\n'
        '13 fusionnèrent fusionner VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin 5 conj _ _\n'
        '14 en en ADP _ _ 15 case _ _\n'
        '15 1973 1973 NUM _ _ 13 obl _ SpaceAfter=No\n'
        '16 . . PUNCT _ _ 5 punct _ _\n')
    sentence = Sentence(source)

    assert sentence not in conll
def test_contains_true():
    """
    Test that a Conll object can test for membership presence properly.

    The appended sentence must be reported as a member both right after
    the append and after one of its tokens has been modified.
    """
    with open(fixture_location('basic.conll')) as f:
        conll = Conll(f)

    source = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    sentence = Sentence(source)

    conll.append(sentence)
    assert sentence in conll

    # Change an actual annotation field so the second check really covers a
    # modified sentence.
    # NOTE(review): assumes the token's part-of-speech field is `upos` -- confirm.
    sentence['1'].upos = 'NOUN'
    assert sentence in conll
def test_insert():
    """
    Check that a Sentence can be inserted into a Conll at a given index.

    The length must grow by one, and the element at the insertion index
    must have the inserted sentence's id and token count.
    """
    with open(fixture_location('basic.conll')) as f:
        conll = Conll(f)
        size_before = len(conll)

    conllu = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    inserted = Sentence(conllu)

    conll.insert(2, inserted)

    assert len(conll) == size_before + 1
    assert conll[2].id == 'fr-ud-dev_00002'
    assert len(conll[2]) == 14
def test_no_text_singleton():
    """
    Check that a Sentence whose 'text' comment has no value can be built,
    and that its text attribute is then None.
    """
    conllu = (
        '# newpar id\n'
        '# sent_id =\n'
        '# text =\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    parsed = Sentence(conllu)

    assert parsed.text is None
def test_proper_slice_type():
    """
    Test that the type provided to a slice must be an int, str, or slice.

    Indexing the sentence with a float is expected to raise a ValueError.
    """
    source = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    sentence = Sentence(source)

    # Only the lookup itself matters; its result is never used.
    with pytest.raises(ValueError):
        sentence[7.8]
def test_new_pardoc_comment_parsing():
    """
    Check that a bare '# newpar id' comment (keyword without a value) does
    not give the sentence a paragraph id: par_id must be None.
    """
    conllu = (
        '# newpar id\n'
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    parsed = Sentence(conllu)

    assert parsed.par_id is None
def test_remove_comments():
    """
    Test that comments can be removed from the sentence (other than text), and
    removing non-existent comments throws a KeyError.

    Removing 'x-coord' must drop that comment from the output, removing
    'text' must raise a ValueError, and removing 'x-coord' a second time
    must raise a KeyError.
    """
    source = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '# x-coord = 2\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    expected = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    sentence = Sentence(source)

    sentence.remove_meta('x-coord')
    assert sentence.conll() == expected

    # The text comment is protected from removal.
    with pytest.raises(ValueError):
        sentence.remove_meta('text')

    # 'x-coord' was already removed above, so it no longer exists.
    with pytest.raises(KeyError):
        sentence.remove_meta('x-coord')
def test_len_empty():
    """
    Check that a Sentence built from an empty string has length zero.
    """
    empty = Sentence('')

    assert len(empty) == 0
def test_id_updating():
    """
    Check that assigning to Sentence.id is reflected in the 'sent_id'
    metadata value.
    """
    conllu = (
        '# sent_id = fr-ud-dev_00003\n'
        '# newdoc id = test id\n'
        '# text = Mais comment faire ?\n'
        '# text_en = But how is it done ?\n'
        '# translit = tat yathānuśrūyate.\n'
        '1 Mais mais CCONJ _ _ 3 cc _ _\n'
        '2 comment comment ADV _ _ 3 advmod _ _\n'
        '3 faire faire VERB _ VerbForm=Inf 0 root _ _\n'
        '4 ? ? PUNCT _ _ 3 punct _ _\n')
    parsed = Sentence(conllu)

    new_id = 'fr-ud-train_00123'
    parsed.id = new_id

    assert parsed.meta_value('sent_id') == new_id
def test_add_comments():
    """
    Test that comment values (other than text or id) can be added through the
    meta api.

    After set_meta('x-coord', '2'), the serialised sentence is expected to
    contain a '# x-coord = 2' comment after the text comment.
    """
    source = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    expected = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '# x-coord = 2\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    sentence = Sentence(source)

    sentence.set_meta('x-coord', '2')

    assert sentence.conll() == expected
def test_metadata_error():
    """
    Check that asking meta_value for a comment that is not present in the
    source ('newpar') raises a KeyError.
    """
    conllu = (
        '# sent_id = fr-ud-dev_00003\n'
        '# newdoc\n'
        '# text = Mais comment faire ?\n'
        '# text_en = But how is it done ?\n'
        '# translit = tat yathānuśrūyate.\n'
        '1 Mais mais CCONJ _ _ 3 cc _ _\n'
        '2 comment comment ADV _ _ 3 advmod _ _\n'
        '3 faire faire VERB _ VerbForm=Inf 0 root _ _\n'
        '4 ? ? PUNCT _ _ 3 punct _ _\n')
    parsed = Sentence(conllu)

    with pytest.raises(KeyError):
        parsed.meta_value('newpar')
def test_to_tree_standard_sentence():
    """
    Check the tree built from a short sentence with a single root.

    The root is expected to hold the third token, with the remaining three
    tokens as its direct children in sentence order.
    """
    conllu = (
        '# sent_id = fr-ud-dev_00003\n'
        '# text = Mais comment faire ?\n'
        '1 Mais mais CCONJ _ _ 3 cc _ _\n'
        '2 comment comment ADV _ _ 3 advmod _ _\n'
        '3 faire faire VERB _ VerbForm=Inf 0 root _ _\n'
        '4 ? ? PUNCT _ _ 3 punct _ _\n')
    parsed = Sentence(conllu)
    tree = parsed.to_tree()

    # Keys are child-index paths from the root; () is the root itself.
    expected_layout = {
        (): parsed[2],
        (0, ): parsed[0],
        (1, ): parsed[1],
        (2, ): parsed[3]
    }
    assert_tree_structure(tree, expected_layout)
def test_to_tree_multi_level():
    """
    Check the tree built from a sentence whose dependencies nest several
    levels deep.
    """
    conllu = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    parsed = Sentence(conllu)
    tree = parsed.to_tree()

    # Keys are child-index paths from the root; () is the root itself.
    expected_layout = {
        (): parsed[2],
        (0, ): parsed[1],
        (1, ): parsed[4],
        (2, ): parsed[8],
        (3, ): parsed[13],
        (0, 0): parsed[0],
        (1, 0): parsed[3],
        (2, 0): parsed[5],
        (2, 1): parsed[7],
        (2, 2): parsed[9],
        (2, 3): parsed[12],
        (2, 1, 0): parsed[6],
        (2, 3, 0): parsed[10],
        (2, 3, 1): parsed[11]
    }
    assert_tree_structure(tree, expected_layout)
def test_cannot_assign_tokens():
    """
    Check that assigning a token to a Sentence by id raises a TypeError.
    """
    conllu = (
        '# sent_id = fr-ud-dev_00003\n'
        '# text = Mais comment faire ?\n'
        '1 Mais mais CCONJ _ _ 3 cc _ _\n'
        '2 comment comment ADV _ _ 3 advmod _ _\n'
        '3 faire faire VERB _ VerbForm=Inf 0 root _ _\n'
        '4 ? ? PUNCT _ _ 3 punct _ _\n')
    parsed = Sentence(conllu)

    with pytest.raises(TypeError):
        parsed['1'] = parsed['2']
def test_metadata_parsing():
    """
    Check that every comment in the source is exposed through meta_value,
    and that meta_present reports present and absent keys correctly.
    """
    conllu = (
        '# sent_id = fr-ud-dev_00003\n'
        '# newdoc id = test id\n'
        '# text = Mais comment faire ?\n'
        '# text_en = But how is it done ?\n'
        '# translit = tat yathānuśrūyate.\n'
        '1 Mais mais CCONJ _ _ 3 cc _ _\n'
        '2 comment comment ADV _ _ 3 advmod _ _\n'
        '3 faire faire VERB _ VerbForm=Inf 0 root _ _\n'
        '4 ? ? PUNCT _ _ 3 punct _ _\n')
    parsed = Sentence(conllu)

    # Values of the comments.
    assert parsed.meta_value('sent_id') == 'fr-ud-dev_00003'
    assert parsed.meta_value('newdoc id') == 'test id'
    assert parsed.meta_value('text') == 'Mais comment faire ?'
    assert parsed.meta_value('text_en') == 'But how is it done ?'
    assert parsed.meta_value('translit') == 'tat yathānuśrūyate.'

    # Presence checks, including a key that was never defined.
    assert parsed.meta_present('text') is True
    assert parsed.meta_present('translit') is True
    assert parsed.meta_present('fake') is False
def test_output():
    """
    Check that an unmodified Sentence serialises back to exactly the text
    it was built from.
    """
    conllu = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    parsed = Sentence(conllu)

    assert parsed.conll() == conllu
def _create_sentence(sent_lines):
    """
    Build a Sentence from the lines collected for it.

    Args:
        sent_lines: An iterable of the lines that make up the source. The
            lines are joined with newline characters before parsing.

    Returns:
        The created Sentence.

    Raises:
        ParseError: If the sentence source is not valid.
    """
    return Sentence('\n'.join(sent_lines))
def test_int_slice_indexing_missing_value_start():
    """
    Check integer slicing of a Sentence when the slice start is omitted.

    The slice [:3] is expected to yield the first three tokens in order.
    """
    conllu = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    parsed = Sentence(conllu)

    head_tokens = parsed[:3]

    feats_les = {
        'Definite': {'Def'},
        'Gender': {'Fem'},
        'Number': {'Plur'},
        'PronType': {'Art'}
    }
    assert_token_members(head_tokens[0], '1', 'Les', 'le', 'DET', None,
                         feats_les, '2', 'det', {}, {})

    feats_etudes = {'Gender': {'Fem'}, 'Number': {'Plur'}}
    assert_token_members(head_tokens[1], '2', 'études', 'étude', 'NOUN', None,
                         feats_etudes, '3', 'nsubj', {}, {})

    feats_durent = {
        'Mood': {'Ind'},
        'Number': {'Plur'},
        'Person': {'3'},
        'Tense': {'Pres'},
        'VerbForm': {'Fin'}
    }
    assert_token_members(head_tokens[2], '3', 'durent', 'durer', 'VERB', None,
                         feats_durent, '0', 'root', {}, {})
def _create_sentence(sent_lines, start, end):
    """
    Build a Sentence from the collected lines and reset the line buffer.

    Args:
        sent_lines: A list of the lines that make up the source. It is
            emptied in place once the Sentence has been created.
        start: The line number for the start of the Sentence.
        end: The line number for the end of the Sentence.

    Returns:
        The created Sentence.
    """
    joined = '\n'.join(sent_lines)
    created = Sentence(joined,
                       _start_line_number=start,
                       _end_line_number=end)

    # Emptied only after a successful construction, ready for the caller to
    # collect the next sentence's lines into the same list.
    sent_lines.clear()

    return created
def test_str_slice_indexing():
    """
    Check slicing a Sentence with string token ids as the bounds.

    The slice ['8':'11'] is expected to start at token '8' and to contain
    tokens '8', '9' and '10' as its first three elements.
    """
    conllu = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    parsed = Sentence(conllu)

    window = parsed['8':'11']

    feats_contenu = {'Gender': {'Masc'}, 'Number': {'Sing'}}
    assert_token_members(window[0], '8', 'contenu', 'contenu', 'NOUN', None,
                         feats_contenu, '9', 'nsubj', {}, {})

    feats_differe = {
        'Mood': {'Ind'},
        'Number': {'Sing'},
        'Person': {'3'},
        'Tense': {'Pres'},
        'VerbForm': {'Fin'}
    }
    assert_token_members(window[1], '9', 'diffère', 'différer', 'VERB', None,
                         feats_differe, '3', 'conj', {}, {})

    assert_token_members(window[2], '10', 'donc', 'donc', 'ADV', None, {},
                         '9', 'advmod', {}, {})