def test_txt(tmp_path):
    """Round-trip plain-text lines through write_file/read_file."""
    original = ["text"] * 10
    path = tmp_path / 'data.txt'
    pipe(original).write_file(path, 'txt')
    restored = pipe().read_file(path, 'txt').collect()
    # The txt reader keeps the trailing newline on each line.
    assert restored == [line + '\n' for line in original]
def test_merge_feats(data2):
    """merge_feats reconstructs 'feats' after unwind_feats + remove_fields."""
    def feats_of(sents):
        return [[token.get('feats') for token in sent] for sent in sents]

    unwound = pipe().read_conllu(data2).unwind_feats().remove_fields(
        'feats').collect()
    expected = pipe().read_conllu(data2).collect()
    assert feats_of(pipe(unwound).merge_feats()) == feats_of(expected)
def test_only_fields(data2):
    """only_fields keeps just the requested fields, given as a set or varargs."""
    # Exercise both supported calling conventions with the same expectation.
    for args in (({'id', 'form'},), ('id', 'form')):
        sentences = pipe().read_conllu(data2).only_fields(*args).collect()
        assert [[t.keys() for t in s] for s in sentences] \
            == [[{'id', 'form'}] * len(s) for s in sentences]
def test_pipe():
    """pipe() composes sub-pipes as stages and refuses to run without a source."""
    chained = pipe(range(10),
                   pipe().filter(lambda x: x < 5),
                   pipe(range(10)).map(lambda x: x * 2))
    assert chained.collect() == [0, 2, 4, 6, 8]
    # Collecting a source-less pipe is an error.
    with pytest.raises(RuntimeError):
        pipe().collect()
def test_to_instance(data2):
    """to_instance followed by to_sentence is the identity on sentences."""
    sentences = pipe().read_conllu(data2).collect()
    # Index every field except ID and HEAD.
    index = pipe(sentences).create_index(fields=set(FIELDS) - {ID, HEAD})
    inverse = create_inverse_index(index)
    instances = pipe().read_conllu(data2).to_instance(index)
    restored = pipe(instances).to_sentence(inverse)
    assert restored.collect() == sentences
def test_only_projective(data2, data5):
    """only_projective keeps (or, with False, drops) projective sentences."""
    projective = pipe().read_conllu(data2).only_projective().collect()
    assert [s.is_projective() for s in projective] == [True, True]
    # data5 contains no projective sentences at all.
    assert pipe().read_conllu(data5).only_projective().collect() == []
    inverted = pipe().read_conllu(data5).only_projective(False).collect()
    assert [s.is_projective() for s in inverted] == [False]
def test_batch():
    """batch groups items, honoring streaming and a per-item size function."""
    streamed = pipe(range(10)).filter(lambda x: x < 5).stream(10).batch(3)
    assert streamed.collect() == [[0, 1, 2], [3, 4, 0], [1, 2, 3], [4]]
    # A size function of 2 halves the effective batch capacity of 3.
    assert pipe(range(5)).batch(3, size=lambda _: 2).collect() \
        == [[0, 1], [2, 3], [4]]
    # A size function of 3 means every single item fills a batch.
    assert pipe(range(5)).batch(3, size=lambda _: 3).collect() \
        == [[0], [1], [2], [3], [4]]
def test_replace_missing(data4):
    """replace_missing writes a substitute into a new field for absent values."""
    sentences = pipe().read_conllu(data4).collect()
    del sentences[0][0].form
    pipe(sentences).replace_missing('form', '__missing__', 'new').collect()
    assert [t.get('new') for t in sentences[0]] \
        == ['__missing__'] + [None] * 6
    # A None substitute leaves the target field unset everywhere.
    pipe(sentences).replace_missing('form', None, 'new').collect()
    assert [t.get('new') for t in sentences[0]] == [None] * 7
def test_from_conllu():
    """from_conllu parses an inline CoNLL-U string; only valid on a fresh pipe."""
    parsed = pipe().from_conllu(_DATA1_CONLLU).collect()
    assert [[token.form for token in sentence] for sentence in parsed] == [
        ['vámonos', 'vamos', 'nos', 'al', 'a', 'el', 'mar'],
        ['Sue', 'likes', 'coffee', 'and', 'Bill', 'likes', 'tea'],
    ]
    # from_conllu cannot follow other operations ...
    with pytest.raises(RuntimeError):
        pipe().map(lambda x: x).from_conllu(_DATA1_CONLLU)
    # ... nor be attached to a pipe that already has a source.
    with pytest.raises(RuntimeError):
        pipe(range(10)).from_conllu(_DATA1_CONLLU)
def test_replace(data4):
    """replace rewrites regex-matching values (or missing ones) into a new field."""
    sentences = pipe().read_conllu(data4).replace(
        'form', r"[0-9]+|[0-9]+\.[0-9]+|[0-9]+[0-9,]+", '__number__',
        'new').collect()
    expected = ['Posledná', 'revízia', 'vyšla', 'v', 'roku', '__number__', '.']
    assert [t.get('new') for t in sentences[0]] == expected
    # A None pattern targets tokens whose field is missing entirely.
    del sentences[0][0].form
    pipe(sentences).replace('form', None, '__missing__', 'new').collect()
    expected[0] = '__missing__'
    assert [t.get('new') for t in sentences[0]] == expected
def test_remove_fields(data2):
    """remove_fields drops the named fields, given as a set or as varargs."""
    # Exercise both supported calling conventions with the same expectation.
    for args in (({'id', 'form'},), ('id', 'form')):
        sentences = pipe().read_conllu(data2).remove_fields(*args).collect()
        for field in ('id', 'form'):
            assert [[field in t.keys() for t in s] for s in sentences] \
                == [[False] * len(s) for s in sentences]
def test_flatten_tokens(data1):
    """flatten yields a token stream that word filtering and lowercasing apply to."""
    tokens = (pipe().read_conllu(data1)
              .flatten().only_words().lowercase('form').collect())
    assert [token.form for token in tokens] == [
        'vamos', 'nos', 'a', 'el', 'mar',
        'sue', 'likes', 'coffee', 'and', 'bill', 'tea',
    ]
def test_upos_feats(data2):
    """upos_feats joins UPOS and FEATS into one pipe-separated new field."""
    sentences = pipe().read_conllu(data2).upos_feats('new').collect()
    assert [token.get('new') for token in sentences[0]] == [
        'POS=PRON|Case=Nom|Number=Plur',
        'POS=VERB|Number=Plur|Person=3|Tense=Pres',
        'POS=CONJ',
        'POS=VERB|Number=Plur|Person=3|Tense=Pres',
        'POS=NOUN|Number=Plur',
        'POS=PUNCT',
    ]
def test_only_universal_deprel(data4):
    """only_universal_deprel strips language-specific subtypes from deprel/deps."""
    # Case 1: deps kept as raw strings (parse_deps off).
    sentences = pipe().read_conllu(data4).collect()
    sentences[0][0].deprel = 'test:test'
    sentences[0][0].deps = '1:test1:test:test|2:test2'
    sentences = pipe(sentences).only_universal_deprel().collect()
    assert sentences[0][0].deprel == 'test'
    assert sentences[0][0].deps == '1:test1|2:test2'
    assert sentences[0][-1].deps == '3:punct'
    # Case 2: deps parsed into sets of (head, relation) pairs.
    sentences = pipe().read_conllu(data4, parse_deps=True).collect()
    sentences[0][0].deps = {(1, 'test1:test'), (2, 'test2')}
    sentences = pipe(sentences).only_universal_deprel().collect()
    assert sentences[0][0].deprel == 'amod'
    assert sentences[0][0].deps == {(1, 'test1'), (2, 'test2')}
    assert sentences[0][-1].deps == {(3, 'punct')}
def test_unwind_feats(data2):
    """unwind_feats expands FEATS into per-feature 'feats:<Name>' fields."""
    sentences = pipe().read_conllu(data2).unwind_feats().collect()

    def values(field):
        return [[t.get(field) for t in s] for s in sentences]

    assert values('feats:Number') == [
        ['Plur', 'Plur', None, 'Plur', 'Plur', None],
        ['Sing', 'Sing', None, 'Sing', None],
    ]
    assert values('feats:Case') == [
        ['Nom', None, None, None, None, None],
        ['Nom', None, None, None, None],
    ]
def test_hdf5(data2, data3, tmp_path):
    """Round-trip instances through the HDF5 writer/reader for two datasets."""

    def _roundtrip(data):
        # Index, convert to instances, serialize, reload, and compare.
        index = pipe().read_conllu(data).create_index()
        instances1 = pipe().read_conllu(data).to_instance(index).collect()
        filename = tmp_path / 'data.hdf5'
        pipe(instances1).write_file(filename, 'hdf5')
        instances2 = pipe().read_file(filename, 'hdf5').collect()
        # zip() would silently truncate (and pass vacuously on an empty
        # read-back), so pin the instance count first.
        assert len(instances1) == len(instances2)
        for ins1, ins2 in zip(instances1, instances2):
            equal_instance(ins1, ins2)

    _roundtrip(data2)
    _roundtrip(data3)
def test_to_flatten():
    """flatten unrolls batched items and passes a flat stream through unchanged."""
    assert pipe(range(10)).batch(3).flatten().collect() == list(range(10))
    assert pipe(range(10)).flatten().collect() == list(range(10))
def test_collu(data2, tmp_path):
    """Round-trip sentences through the CoNLL-U writer/reader."""
    # NOTE(review): name looks like a typo for test_conllu — kept as-is.
    original = pipe().read_conllu(data2).collect()
    path = tmp_path / 'data.conllu'
    pipe(original).write_file(path, 'conllu')
    assert pipe().read_file(path, 'conllu').collect() == original
def test_read_write_file():
    """read_file rejects an unknown format name."""
    with pytest.raises(ValueError):
        pipe().read_file('temp', 'unknown').collect()
def test_collect():
    """collect appends into a caller-provided list and honors an item limit."""
    assert pipe(range(5)).collect(l=[-1]) == [-1, 0, 1, 2, 3, 4]
    # Positional form: limit of 3 items, seeded accumulator.
    assert pipe(range(5)).collect(3, [-1]) == [-1, 0, 1, 2]
def test_first():
    """first returns the initial item, or the default on an empty pipe."""
    assert pipe(range(5)).first() == 0
    # Identity comparison (`is None`) per PEP 8, not `== None`.
    assert pipe([]).first() is None
    assert pipe([]).first(0) == 0
def test_count():
    """count tallies the items flowing through the pipe."""
    assert pipe([]).count() == 0
    assert pipe(range(5)).count() == 5
def test_only_words(data1):
    """only_words keeps word tokens before rendering sentence text."""
    texts = pipe().read_conllu(data1).only_words().text().collect()
    assert texts == [
        'vamos nos a el mar ',
        'Sue likes coffee and Bill tea ',
    ]
def test_map():
    """map applies the given function to every item."""
    doubled = pipe(range(10)).map(lambda n: n * 2)
    assert doubled.collect() == list(range(0, 20, 2))
def test_filter():
    """filter keeps only the items satisfying the predicate."""
    small = pipe(range(10)).filter(lambda n: n < 5)
    assert small.collect() == list(range(5))
def test_split_chars(data2):
    """split_chars stores each form's characters as a tuple in 'form:chars'."""
    sentences = pipe().read_conllu(data2).split_chars('form').collect()
    expected = [[tuple(t.form) for t in s] for s in sentences]
    assert [[t['form:chars'] for t in s] for s in sentences] == expected
def test_uppercase(data1):
    """uppercase transforms the given field on every word token."""
    texts = pipe().read_conllu(data1).only_words().uppercase('form').text()
    assert texts.collect() == [
        'VAMOS NOS A EL MAR ',
        'SUE LIKES COFFEE AND BILL TEA ',
    ]
def test_filter_field(data1):
    """filter_field with an always-false predicate removes the field everywhere."""
    tokens = pipe().read_conllu(data1).flatten().filter_field(
        'form', lambda value: False).collect()
    assert all('form' not in token for token in tokens)
def test_map_token(data1):
    """Mapping every token to None drops it, leaving empty sentences."""
    result = pipe().read_conllu(data1).map_token(lambda token: None).collect()
    assert result == [[], []]
def test_filter_token(data1):
    """An always-false token predicate empties every sentence."""
    result = pipe().read_conllu(data1).filter_token(
        lambda token: False).collect()
    assert result == [[], []]