def test_int_indexing():
    """ Verify that a Sentence token can be looked up by integer position. """
    source = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    sent = Sentence(source)

    # Position 7 (zero-based) is the token with CoNLL id '8'.
    token = sent[7]
    assert_token_members(token, '8', 'contenu', 'contenu', 'NOUN', None, {
        'Gender': {'Masc'},
        'Number': {'Sing'}
    }, '9', 'nsubj', {}, {})
def test_multiple_features_modify():
    """ Verify that a Token's feature sets can be mutated after parsing. """
    token_line = ('28 une un DET _ '
                  'Definite=Ind|Gender=Fem|Number=Sing|PronType=Art 30 det _ _\n')
    token = Token(token_line)
    assert_token_members(
        token, '28', 'une', 'un', 'DET', None, {
            'Definite': {'Ind'},
            'Gender': {'Fem'},
            'Number': {'Sing'},
            'PronType': {'Art'}
        }, '30', 'det', {}, {})

    # Somehow this word is definite and indefinite!
    token.feats['Definite'].add('Def')
    assert_token_members(
        token, '28', 'une', 'un', 'DET', None, {
            'Definite': {'Ind', 'Def'},
            'Gender': {'Fem'},
            'Number': {'Sing'},
            'PronType': {'Art'}
        }, '30', 'det', {}, {})
@responses.activate
def test_load_from_file_and_url_equivalence():
    """
    Test that the Conll objects created from a url and from a file are the
    same when the underlying source is the same.

    The url is mocked with the responses library; @responses.activate is
    required so that the registered mock actually intercepts the request
    instead of hitting the network.
    """
    TEST_CONLL_URL = 'https://myconllrepo.com/english/train'
    with open(fixture_location('long.conll')) as f:
        contents = f.read()
    responses.add(responses.GET, TEST_CONLL_URL, body=contents)

    url_c = load_from_url(TEST_CONLL_URL)
    file_c = load_from_file(fixture_location('long.conll'))

    assert len(url_c) == len(file_c)
    for i in range(len(url_c)):
        assert url_c[i].id == file_c[i].id
        assert url_c[i].text == file_c[i].text
        # Every token parsed from the url must match the corresponding
        # token (looked up by id) parsed from the file.
        for url_token in url_c[i]:
            file_token = file_c[i][url_token.id]
            assert_token_members(url_token, file_token.id, file_token.form,
                                 file_token.lemma, file_token.upos,
                                 file_token.xpos, file_token.feats,
                                 file_token.head, file_token.deprel,
                                 file_token.deps, file_token.misc)
def test_only_form_and_lemma():
    """ Verify parsing of a token line where only the form and lemma are set. """
    token = Token('10.1 micro-pays micro-pays _ _ _ _ _ _ _\n')
    assert_token_members(token, '10.1', 'micro-pays', 'micro-pays', None,
                         None, {}, None, None, {}, {})
def test_empty_lemma_empty_form_with_assumption():
    """
    Verify that underscore form and lemma parse to None when the empty
    assumption is enabled.
    """
    line = '33 _ _ SYM _ _ 30 punct _ SpaceAfter=No'
    token = Token(line, empty=True)
    assert_token_members(token, '33', None, None, 'SYM', None, {}, '30',
                         'punct', {}, {'SpaceAfter': {'No'}})
def test_empty_lemma_present_form():
    """
    Verify parsing of a token line, without the empty assumption, whose form
    is underscore but whose lemma is present.
    """
    line = '33 _ hate VERB _ _ 30 nmod _ SpaceAfter=No'
    token = Token(line)
    assert_token_members(token, '33', None, 'hate', 'VERB', None, {}, '30',
                         'nmod', {}, {'SpaceAfter': {'No'}})
def test_underscore_construction():
    """
    Verify that, without the empty assumption, an underscore form and lemma
    are kept as literal underscores.
    """
    line = '33 _ _ PUN _ _ 30 nmod _ SpaceAfter=No'
    token = Token(line)
    assert_token_members(token, '33', '_', '_', 'PUN', None, {}, '30',
                         'nmod', {}, {'SpaceAfter': {'No'}})
def test_multiword_construction():
    """ Verify that a multiword token line is parsed and flagged as such. """
    token = Token('8-9 du _ _ _ _ _ _ _ _')
    assert_token_members(token, '8-9', 'du', None, None, None, {}, None,
                         None, {}, {})
    assert token.is_multiword()
def test_construction():
    """ Verify all members of a typical, fully specified token line. """
    line = '7 vie vie NOUN _ Gender=Fem|Number=Sing 4 nmod _ SpaceAfter=No\n'
    token = Token(line)
    assert_token_members(token, '7', 'vie', 'vie', 'NOUN', None, {
        'Gender': {'Fem'},
        'Number': {'Sing'}
    }, '4', 'nmod', {}, {'SpaceAfter': {'No'}})
def test_construction_no_newline():
    """ Verify that a token line lacking a trailing newline still parses. """
    line = '7 vie vie NOUN _ Gender=Fem|Number=Sing 4 nmod _ _'
    token = Token(line)
    assert_token_members(token, '7', 'vie', 'vie', 'NOUN', None, {
        'Gender': {'Fem'},
        'Number': {'Sing'}
    }, '4', 'nmod', {}, {})
def test_deps_construction():
    """ Verify parsing of a token line with a populated deps column. """
    line = '1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj _\n'
    token = Token(line)
    assert_token_members(token, '1', 'They', 'they', 'PRON', 'PRP', {
        'Case': {'Nom'},
        'Number': {'Plur'}
    }, '2', 'nsubj', {
        '2': ('nsubj', None, None, None),
        '4': ('nsubj', None, None, None)
    }, {})
def test_load_from_file_and_string_equivalence():
    """
    Test that the Conll object created from a string and file is the same if
    the underlying source is the same.
    """
    with open(fixture_location('long.conll')) as f:
        contents = f.read()
    str_c = load_from_string(contents)
    file_c = load_from_file(fixture_location('long.conll'))

    assert len(str_c) == len(file_c)
    for i in range(len(str_c)):
        assert str_c[i].id == file_c[i].id
        assert str_c[i].text == file_c[i].text
        # Every token parsed from the string must match the corresponding
        # token (looked up by id) parsed from the file.
        for str_token in str_c[i]:
            file_token = file_c[i][str_token.id]
            assert_token_members(str_token, file_token.id, file_token.form,
                                 file_token.lemma, file_token.upos,
                                 file_token.xpos, file_token.feats,
                                 file_token.head, file_token.deprel,
                                 file_token.deps, file_token.misc)
def test_int_slice_indexing_missing_value_stop():
    """ Verify integer slicing of a Sentence when the stop value is omitted. """
    source = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    sent = Sentence(source)

    # An open-ended slice from position 10 covers the last four tokens.
    tail = sent[10:]
    assert_token_members(tail[0], '11', 'selon', 'selon', 'ADP', None, {},
                         '13', 'case', {}, {})
    assert_token_members(
        tail[1], '12', 'les', 'le', 'DET', None, {
            'Definite': {'Def'},
            'Number': {'Plur'},
            'PronType': {'Art'}
        }, '13', 'det', {}, {})
    assert_token_members(tail[2], '13', 'Facultés', 'Facultés', 'PROPN',
                         None, {}, '9', 'obl', {}, {'SpaceAfter': {'No'}})
    assert_token_members(tail[3], '14', '.', '.', 'PUNCT', None, {}, '3',
                         'punct', {}, {})
def test_simple_sentence_construction():
    """ Verify the comments and tokens of a small parsed sentence. """
    source = ('# sent_id = fr-ud-dev_00003\n'
              '# text = Mais comment faire ?\n'
              '1 Mais mais CCONJ _ _ 3 cc _ _\n'
              '2 comment comment ADV _ _ 3 advmod _ _\n'
              '3 faire faire VERB _ VerbForm=Inf 0 root _ _\n'
              '4 ? ? PUNCT _ _ 3 punct _ _\n')
    sent = Sentence(source)

    assert sent.id == 'fr-ud-dev_00003'
    assert sent.text == 'Mais comment faire ?'
    assert len(sent) == 4

    # (id, form, lemma, upos, feats, head, deprel) per token.
    expected = [
        ('1', 'Mais', 'mais', 'CCONJ', {}, '3', 'cc'),
        ('2', 'comment', 'comment', 'ADV', {}, '3', 'advmod'),
        ('3', 'faire', 'faire', 'VERB', {'VerbForm': {'Inf'}}, '0', 'root'),
        ('4', '?', '?', 'PUNCT', {}, '3', 'punct'),
    ]
    for tid, form, lemma, upos, feats, head, deprel in expected:
        assert_token_members(sent[tid], tid, form, lemma, upos, None, feats,
                             head, deprel, {}, {})
def test_str_slice_indexing_step():
    """ Verify Sentence slicing by string token ids with a step size. """
    source = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ _\n'
        '2 études étude NOUN _ Gender=Fem|Number=Plur 3 nsubj _ _\n'
        '3 durent durer VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _\n'
        '4 six six NUM _ _ 5 nummod _ _\n'
        '5 ans an NOUN _ Gender=Masc|Number=Plur 3 obj _ _\n'
        '6 mais mais CCONJ _ _ 9 cc _ _\n'
        '7 leur son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _\n'
        '8 contenu contenu NOUN _ Gender=Masc|Number=Sing 9 nsubj _ _\n'
        '9 diffère différer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 conj _ _\n'
        '10 donc donc ADV _ _ 9 advmod _ _\n'
        '11 selon selon ADP _ _ 13 case _ _\n'
        '12 les le DET _ Definite=Def|Number=Plur|PronType=Art 13 det _ _\n'
        '13 Facultés Facultés PROPN _ _ 9 obl _ SpaceAfter=No\n'
        '14 . . PUNCT _ _ 3 punct _ _')
    sent = Sentence(source)

    # Slicing from id '1' to id '6' with step 2 selects tokens 1, 3, and 5.
    selected = sent['1':'6':2]
    assert_token_members(
        selected[0], '1', 'Les', 'le', 'DET', None, {
            'Definite': {'Def'},
            'Gender': {'Fem'},
            'Number': {'Plur'},
            'PronType': {'Art'}
        }, '2', 'det', {}, {})
    assert_token_members(
        selected[1], '3', 'durent', 'durer', 'VERB', None, {
            'Mood': {'Ind'},
            'Number': {'Plur'},
            'Person': {'3'},
            'Tense': {'Pres'},
            'VerbForm': {'Fin'}
        }, '0', 'root', {}, {})
    assert_token_members(selected[2], '5', 'ans', 'an', 'NOUN', None, {
        'Gender': {'Masc'},
        'Number': {'Plur'},
    }, '3', 'obj', {}, {})