def test_empty(blocksize):
    """A BlockBuffer over an empty file reports size 0, reads b"", and never touches raw."""
    backing = io.BytesIO()
    recorder = _ReadRecordWrapper(backing)
    buf = BlockBuffer(recorder, blocksize)
    assert buf.size == 0
    assert buf.read() == b""
    # No data exists, so no read should have reached the underlying file.
    assert recorder.records == []
def test_caching_chunks2(raw):
    """Blocks already fetched are cached; a full read only fetches the missing ones."""
    buf = BlockBuffer(raw, 2)
    buf.seek(3)
    buf.read(1)
    # Reading offset 3 pulls in the block covering bytes 2-3.
    assert raw.records == [(2, 2)]
    buf.seek(0)
    buf.read()
    # Block (2, 2) is served from cache; only blocks 0 and 4 are newly fetched.
    assert raw.records == [(2, 2), (0, 2), (4, 2)]
def test_giga():
    """A huge (100 GB) virtual file: only the two touched blocks are ever fetched."""
    total_size = 100 * 1024 ** 3  # 100 GB of zeros, never materialized
    zeros = _ZeroFile(total_size)
    recorder = _ReadRecordWrapper(zeros)
    block = 4 * 1024 ** 2  # 4 MB
    buf = BlockBuffer(recorder, block)
    assert buf.size == total_size
    assert buf.read(10) == b"\0" * 10
    assert buf.seek(-10, 2) == total_size - 10
    assert buf.read(10) == b"\0" * 10
    # Only the first and the last block were read from the backing file.
    assert recorder.records == [(0, block), (total_size - block, block)]
def test_caching_reuse(raw):
    """A second read inside an already-fetched block triggers no new raw read."""
    buf = BlockBuffer(raw, 3)
    buf.read(1)
    assert raw.records == [(0, 3)]
    buf.seek(0)
    buf.read(2)
    # Still only the single original fetch: the cached block satisfied the read.
    assert raw.records == [(0, 3)]
def get_physical_partition_stats(metapartitions, store):
    """
    Get statistics for partition.

    .. hint::
        To get the metapartitions pre-aligned, use
        ``concat_partitions_on_primary_index=True`` during dispatch.

    Parameters
    ----------
    metapartitions: Iterable[kartothek.io_components.metapartition.MetaPartition]
        Iterable of metapartitions belonging to the same physical partition.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        KV store.

    Returns
    -------
    stats: Dict[str, int]
        Statistics for the current partition.
    """
    # A store factory is resolved lazily so it can be created per worker.
    store = store() if callable(store) else store

    n_files = 0
    total_blob_bytes = 0
    n_rows = 0
    for metapartition in metapartitions:
        for key in metapartition.files.values():
            n_files += 1
            buf = BlockBuffer(store.open(key))
            try:
                parquet_file = pq.ParquetFile(buf)
                n_rows += parquet_file.metadata.num_rows
                total_blob_bytes += buf.size
            finally:
                # Always release the underlying store handle, even on parse errors.
                buf.close()

    return {
        "blobsize": total_blob_bytes,
        "files": n_files,
        "partitions": 1,
        "rows": n_rows,
    }
def test_init_fails_not_readable():
    """The constructor rejects raw objects whose readable() reports False."""
    unreadable = io.BytesIO()
    unreadable.readable = lambda: False
    with pytest.raises(ValueError, match="raw must be readable"):
        BlockBuffer(unreadable)
def example_buffer(raw, blocksize):
    """Construct a BlockBuffer over ``raw`` using the given ``blocksize``."""
    buf = BlockBuffer(raw, blocksize)
    return buf
def test_real_file(tmpdir, blocksize):
    """BlockBuffer works on top of an actual on-disk file object."""
    path = tmpdir.join("test_real_file.bin").strpath
    with open(path, "wb") as fp:
        fp.write(b"foxbar")

    real_file = open(path, "rb")
    buf = BlockBuffer(real_file, blocksize)

    assert not buf.closed
    assert buf.size == 6
    assert buf.seekable() is True
    assert buf.readable() is True
    assert buf.tell() == 0
    assert buf.seek(1) == 1
    assert buf.read() == b"oxbar"

    # final close
    buf.close()

    # closing twice works
    buf.close()
def test_closed(blocksize):
    """After close(), the raw file is closed too and every operation raises."""
    raw = io.BytesIO()
    buf = BlockBuffer(raw, blocksize)
    buf.close()

    assert buf.closed
    # Closing the buffer also closes the wrapped raw file.
    assert raw.closed

    # Every public operation must refuse to work on a closed buffer.
    operations = [
        lambda: buf.size,
        lambda: buf.seekable(),
        lambda: buf.readable(),
        lambda: buf.tell(),
        lambda: buf.seek(0),
        lambda: buf.read(),
    ]
    for op in operations:
        with pytest.raises(ValueError, match="I/O operation on closed file."):
            op()

    # closing twice works
    buf.close()
def test_caching_remainder(raw):
    """The trailing partial block is fetched with its actual (short) length."""
    buf = BlockBuffer(raw, 4)
    buf.seek(5)
    buf.read()
    # Only 2 bytes remain past offset 4, so the last fetch is (4, 2), not (4, 4).
    assert raw.records == [(4, 2)]
def test_init_fails_closed(blocksize):
    """The constructor rejects an already-closed raw file."""
    closed_raw = io.BytesIO()
    closed_raw.close()
    with pytest.raises(ValueError, match="Cannot use closed file object"):
        BlockBuffer(closed_raw, blocksize)
def test_init_fails_blocksize(blocksize):
    """The constructor rejects non-positive blocksizes."""
    # NOTE(review): the blocksize fixture/parametrization is outside this view;
    # presumably it supplies invalid (< 1) values here — confirm against the file.
    backing = io.BytesIO()
    with pytest.raises(ValueError, match="blocksize must be at least 1"):
        BlockBuffer(backing, blocksize)