def test_gh715():
    """Regression test for GH#715: non-ASCII UTF-8 text survives a read_text round trip."""
    payload = u'\u20ac'.encode('utf-8')  # Euro sign, a multi-byte UTF-8 character
    with tmpfile() as fn:
        with open(fn, 'wb') as fh:
            fh.write(payload)
        bag = db.read_text(fn)
        assert bag.compute()[0] == payload.decode('utf-8')
def test_to_textfiles_endlines():
    """to_textfiles joins elements with newlines but adds no trailing newline."""
    bag = db.from_sequence(['a', 'b', 'c'], npartitions=1)
    with tmpfile() as fn:
        bag.to_textfiles([fn])
        with open(fn, 'r') as fh:
            lines = fh.readlines()
        # The final element has no line terminator.
        assert lines == ['a\n', 'b\n', 'c']
def test_gh715():
    """Regression test for GH#715 using the from_filenames API."""
    # NOTE(review): this duplicates the name of an earlier test_gh715 in this
    # file (the read_text variant); under pytest only one definition is
    # collected. Consider renaming one of them -- confirm before changing.
    payload = u'\u20ac'.encode('utf-8')  # Euro sign, a multi-byte UTF-8 character
    with tmpfile() as fn:
        with open(fn, 'wb') as fh:
            fh.write(payload)
        bag = db.from_filenames(fn)
        assert bag.compute()[0] == payload.decode('utf-8')
def test_read_text_large():
    """Blocked reads (blocksize=) produce the same lines as a single-partition read."""
    # NOTE(review): another test_read_text_large appears later in this file;
    # under pytest only one definition is collected -- confirm which is wanted.
    with tmpfile() as fn:
        with open(fn, 'wb') as fh:
            fh.write(('Hello, world!' + os.linesep).encode() * 100)
        blocked = db.read_text(fn, blocksize=100)
        whole = db.read_text(fn)
        # A 100-byte blocksize over ~1500 bytes must yield several partitions.
        assert len(blocked.dask) > 5
        assert (list(map(str, blocked.str.strip())) ==
                list(map(str, whole.str.strip())))
        as_list = db.read_text([fn], blocksize=100)
        # Passing [fn] (a list) must be equivalent to passing fn directly.
        assert list(blocked) == list(as_list)
def test_from_filenames_encoding():
    """Chunked and unchunked from_filenames agree on non-UTF-8 (gb18030) data."""
    with tmpfile() as fn:
        with open(fn, 'wb') as fh:
            fh.write((u'你好!' + os.linesep).encode('gb18030') * 100)
        chunked = db.from_filenames(fn, chunkbytes=100, encoding='gb18030')
        whole = db.from_filenames(fn, encoding='gb18030')
        # A 100-byte chunk size over this file must yield several partitions.
        assert len(chunked.dask) > 5
        # Compare re-encoded to UTF-8 so the comparison is byte-exact.
        assert (list(map(lambda x: x.encode('utf-8'), chunked)) ==
                list(map(lambda x: x.encode('utf-8'), whole)))
        as_list = db.from_filenames([fn], chunkbytes=100, encoding='gb18030')
        # Passing [fn] (a list) must be equivalent to passing fn directly.
        assert list(chunked) == list(as_list)
def test_from_filenames_large():
    """Chunked reads (chunkbytes=) match a single-partition from_filenames read."""
    with tmpfile() as fn:
        with open(fn, 'wb') as fh:
            fh.write(('Hello, world!' + os.linesep).encode() * 100)
        chunked = db.from_filenames(fn, chunkbytes=100)
        whole = db.from_filenames(fn)
        # A 100-byte chunk size over ~1500 bytes must yield several partitions.
        assert len(chunked.dask) > 5
        assert list(map(str, chunked)) == list(map(str, whole))
        as_list = db.from_filenames([fn], chunkbytes=100)
        # Passing [fn] (a list) must be equivalent to passing fn directly.
        assert list(chunked) == list(as_list)
def test_read_text_encoding():
    """Blocked and unblocked read_text agree on non-UTF-8 (gb18030) data."""
    with tmpfile() as fn:
        with open(fn, 'wb') as fh:
            fh.write((u'你好!' + os.linesep).encode('gb18030') * 100)
        blocked = db.read_text(fn, blocksize=100, encoding='gb18030')
        whole = db.read_text(fn, encoding='gb18030')
        # A 100-byte blocksize over this file must yield several partitions.
        assert len(blocked.dask) > 5
        # Compare re-encoded to UTF-8 so the comparison is byte-exact.
        assert (list(map(lambda x: x.encode('utf-8'), blocked)) ==
                list(map(lambda x: x.encode('utf-8'), whole)))
        as_list = db.read_text([fn], blocksize=100, encoding='gb18030')
        # Passing [fn] (a list) must be equivalent to passing fn directly.
        assert list(blocked) == list(as_list)
def test_read_text_large():
    """Blocked reads (blocksize=) match a single-partition read_text read."""
    # NOTE(review): this duplicates the name of an earlier test_read_text_large
    # in this file; under pytest only one definition is collected -- confirm
    # which variant is intended to survive.
    with tmpfile() as fn:
        with open(fn, 'wb') as fh:
            fh.write(('Hello, world!' + os.linesep).encode() * 100)
        blocked = db.read_text(fn, blocksize=100)
        whole = db.read_text(fn)
        # A 100-byte blocksize over ~1500 bytes must yield several partitions.
        assert len(blocked.dask) > 5
        assert list(map(str, blocked)) == list(map(str, whole))
        as_list = db.read_text([fn], blocksize=100)
        # Passing [fn] (a list) must be equivalent to passing fn directly.
        assert list(blocked) == list(as_list)
def test_stream_decompress():
    """stream_decompress handles raw, bz2, and gzip input streams."""
    data = 'abc\ndef\n123'.encode()
    # NOTE(review): the first two cases expect text ('abc', ...) while the gz
    # case expects bytes (b'abc', ...) -- looks inconsistent; presumably one of
    # them reflects the actual stream_decompress contract. Confirm before
    # "fixing" either side.
    raw_lines = [s.strip() for s in stream_decompress('', data)]
    assert raw_lines == ['abc', 'def', '123']
    bz2_lines = [s.strip() for s in stream_decompress('bz2', bz2.compress(data))]
    assert bz2_lines == ['abc', 'def', '123']
    with tmpfile() as fn:
        with GzipFile(fn, 'wb') as fh:
            fh.write(data)
        with open(fn, 'rb') as fh:
            compressed = fh.read()
        gz_lines = [s.strip() for s in stream_decompress('gz', compressed)]
        assert gz_lines == [b'abc', b'def', b'123']