def test_dataframe_object_dtype():
    """Check that object-dtype pandas containers are charged for their strings."""
    pd = pytest.importorskip('pandas')

    # 1000 one-character cells: the frame must cost more than the strings
    # alone, but less than twice as much.
    frame = pd.DataFrame({'x': ['a'] * 1000})
    per_string = sizeof('a')
    assert per_string * 1000 < sizeof(frame) < 2 * per_string * 1000

    # NOTE(review): this second check builds a Series rather than a
    # DataFrame -- confirm whether that is intentional.
    long_strings = pd.Series(['a' * 1000] * 1000)
    assert sizeof(long_strings) > 1000000
def test_serires_object_dtype():
    """Check that an object-dtype Series is charged for the strings it holds."""
    pd = pytest.importorskip('pandas')

    # Short strings: total lies strictly between 1x and 2x the raw string cost.
    short = pd.Series(['a'] * 1000)
    per_string = sizeof('a')
    assert per_string * 1000 < sizeof(short) < 2 * per_string * 1000

    # 1000 strings of 1000 characters each must exceed one million bytes.
    wide = pd.Series(['a' * 1000] * 1000)
    assert sizeof(wide) > 1000000
def test_empty():
    """Zero-row pandas objects must still report a strictly positive size."""
    pd = pytest.importorskip('pandas')

    columns = {'x': [1, 2, 3], 'y': ['a' * 100, 'b' * 100, 'c' * 100]}
    populated = pd.DataFrame(columns, index=[10, 20, 30])
    no_rows = populated.head(0)

    # The frame itself, each of its columns and its index.
    for piece in (no_rows, no_rows.x, no_rows.y, no_rows.index):
        assert sizeof(piece) > 0
def safe_sizeof(obj, default_size=1e6):
    """
    Safe variant of sizeof that captures and logs exceptions

    Returns ``sizeof(obj)`` when that call succeeds. If it raises any
    ``Exception``, a warning (with traceback) is logged and
    ``int(default_size)`` is returned instead; ``default_size`` is 1e6 unless
    the caller overrides it.
    """
    try:
        measured = sizeof(obj)
    except Exception:
        # NOTE(review): the message always says "1MB", even when the caller
        # passed a different default_size.
        logger.warning('Sizeof calculation failed. Defaulting to 1MB', exc_info=True)
        return int(default_size)
    else:
        return measured
def test_sparse_matrix():
    """Each scipy.sparse format of a 10x10 identity meets a minimum size."""
    sparse = pytest.importorskip('scipy.sparse')
    identity = sparse.eye(10)

    # (conversion method, smallest acceptable sizeof result)
    minimum_sizes = (
        ('todia', 152),
        ('tobsr', 232),
        ('tocoo', 252),
        ('tocsc', 232),
        ('tocsr', 260),
        ('todok', 260),
        ('tolil', 324),
    )
    for converter, floor in minimum_sizes:
        converted = getattr(identity, converter)()
        assert sizeof(converted) >= floor
def test_sparse_matrix():
    """Each scipy.sparse format of a 10x10 identity meets a minimum size."""
    sparse = pytest.importorskip('scipy.sparse')
    identity = sparse.eye(10)

    # These are the 32-bit Python 2.7 values.
    lower_bounds = {
        'todia': 152,
        'tobsr': 232,
        'tocoo': 240,
        'tocsc': 232,
        'tocsr': 232,
        'todok': 192,
        'tolil': 204,
    }
    for method_name, floor in lower_bounds.items():
        converted = getattr(identity, method_name)()
        assert sizeof(converted) >= floor
def test_pandas_repeated_column():
    """Selecting one column three times must yield a larger size than the source."""
    pd = pytest.importorskip('pandas')
    single = pd.DataFrame({'x': [1, 2, 3]})
    tripled = single[['x', 'x', 'x']]
    assert sizeof(tripled) > sizeof(single)
def test_pandas_repeated_column():
    """A frame that repeats a column is reported as larger than the original."""
    pd = pytest.importorskip('pandas')
    base = pd.DataFrame({'x': [1, 2, 3]})
    repeated_names = ['x', 'x', 'x']
    assert sizeof(base[repeated_names]) > sizeof(base)
def test_spillbuffer(tmpdir):
    """Drive a SpillBuffer through inserts, reads, deletes and updates,
    checking the fast/slow bookkeeping after each step.
    """
    buf = SpillBuffer(str(tmpdir), target=300)

    def assert_slow(**spilled):
        # Slow-side keys, per-key weights and running total must all agree
        # with the given key -> value mapping.
        assert set(buf.slow) == set(spilled)
        assert buf.slow.weight_by_key == {
            key: psize(value) for key, value in spilled.items()
        }
        assert buf.slow.total_weight == psize(*spilled.values())

    # Convenience aliases
    assert buf.memory is buf.fast
    assert buf.disk is buf.slow

    assert not buf.slow.weight_by_key
    assert buf.slow.total_weight == (0, 0)
    assert buf.spilled_total == (0, 0)

    a, b, c, d = "a" * 100, "b" * 99, "c" * 98, "d" * 97

    # Test assumption made by this test, mostly for non CPython implementations
    assert 100 < sizeof(a) < 200
    assert psize(a)[0] != psize(a)[1]

    buf["a"] = a
    assert not buf.slow
    assert buf.fast.weights == {"a": sizeof(a)}
    assert buf.fast.total_weight == sizeof(a)
    assert buf.slow.weight_by_key == {}
    assert buf.slow.total_weight == (0, 0)
    assert buf["a"] == a

    buf["b"] = b
    assert not buf.slow
    assert not buf.slow.weight_by_key
    assert buf.slow.total_weight == (0, 0)

    buf["c"] = c
    assert_slow(a=a)

    assert buf["a"] == a
    assert_slow(b=b)

    buf["d"] = d
    assert_slow(b=b, c=c)

    # Deleting an in-memory key does not automatically move spilled keys back to memory
    del buf["a"]
    assert_slow(b=b, c=c)
    with pytest.raises(KeyError):
        buf["a"]

    # Deleting a spilled key updates the metadata
    del buf["b"]
    assert_slow(c=c)
    with pytest.raises(KeyError):
        buf["b"]

    # Updating a spilled key moves it to the top of the LRU and to memory
    buf["c"] = c * 2
    assert_slow(d=d)

    # Single key is larger than target and goes directly into slow
    e = "e" * 500
    buf["e"] = e
    assert_slow(d=d, e=e)

    # Updating a spilled key with another larger than target updates slow directly
    d = "d" * 500
    buf["d"] = d
    assert_slow(d=d, e=e)
def test_bytes_like():
    """Bytes-like buffers are sized between 1x and 2x their payload length."""
    thousand_byte_buffers = (
        bytes(1000),
        bytearray(1000),
        memoryview(bytes(1000)),
    )
    for buffer in thousand_byte_buffers:
        assert 1000 <= sizeof(buffer) <= 2000

    # 1000 doubles at 8 bytes each.
    doubles = array("d", range(1000))
    assert 8000 <= sizeof(doubles) <= 9000
def test_pyarrow_table():
    """pyarrow tables and their columns report integer, non-trivial sizes."""
    pd = pytest.importorskip("pandas")
    pa = pytest.importorskip("pyarrow")

    frame = pd.DataFrame(
        {"x": [1, 2, 3], "y": ["a" * 100, "b" * 100, "c" * 100]},
        index=[10, 20, 30],
    )

    # Populated table: larger than its schema metadata, integer sizes throughout.
    table = pa.Table.from_pandas(frame)
    assert sizeof(table) > sizeof(table.schema.metadata)
    assert isinstance(sizeof(table), int)
    for position in range(3):
        assert isinstance(sizeof(table.columns[position]), int)

    # Zero-row table: still larger than its metadata, columns still positive.
    empty = pa.Table.from_pandas(frame.head(0))
    assert sizeof(empty) > sizeof(empty.schema.metadata)
    for position in range(3):
        assert sizeof(empty.columns[position]) > 0
def test_sizeof(dtype):
    """sizeof of a cupy array equals the array's ``nbytes``."""
    shape = (2, 3, 4)
    device_array = cupy.random.random(shape, dtype=dtype)
    assert sizeof(device_array) == device_array.nbytes
def test_pandas():
    """Sanity-check sizeof for a small DataFrame, its columns and its index."""
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame(
        {"x": [1, 2, 3], "y": ["a" * 100, "b" * 100, "c" * 100]},
        index=[10, 20, 30],
    )

    # The frame is at least the sum of its columns, less one copy of the index.
    assert sizeof(frame) >= sizeof(frame.x) + sizeof(frame.y) - sizeof(frame.index)
    assert sizeof(frame.x) >= sizeof(frame.index)
    # Three 100-character strings.
    assert sizeof(frame.y) >= 100 * 3
    assert sizeof(frame.index) >= 20

    # Results are plain ints.
    for piece in (frame, frame.x, frame.index):
        assert isinstance(sizeof(piece), int)
def test_safe_sizeof(obj):
    """safe_sizeof must agree with sizeof for the given object."""
    guarded = safe_sizeof(obj)
    assert guarded == sizeof(obj)
def psize(*objs) -> tuple[int, int]:
    """Return a pair of totals for ``objs``.

    The first element is the sum of ``sizeof`` over all objects; the second is
    the summed length of every frame produced by ``serialize_bytelist`` for
    each object. With no arguments the result is ``(0, 0)``.
    """
    in_memory = 0
    for obj in objs:
        in_memory += sizeof(obj)

    serialized = 0
    for obj in objs:
        for frame in serialize_bytelist(obj):
            serialized += len(frame)

    return (in_memory, serialized)
def test_pandas():
    """Sanity-check sizeof for a small DataFrame, its columns and its index.

    The check on the string column is only applied for pandas 0.17.1 and
    later.
    """
    import re  # local import so this block does not rely on file-level imports

    pd = pytest.importorskip('pandas')
    df = pd.DataFrame(
        {'x': [1, 2, 3], 'y': ['a' * 100, 'b' * 100, 'c' * 100]},
        index=[10, 20, 30],
    )

    assert sizeof(df) >= sizeof(df.x) + sizeof(df.y) - sizeof(df.index)
    assert sizeof(df.x) >= sizeof(df.index)

    # Compare release numbers numerically. As plain strings the comparison is
    # lexicographic, so e.g. '0.9.0' would wrongly sort after '0.17.1'.
    release = tuple(int(part) for part in re.findall(r'\d+', pd.__version__)[:3])
    if release >= (0, 17, 1):
        assert sizeof(df.y) >= 100 * 3
    assert sizeof(df.index) >= 20

    assert isinstance(sizeof(df), int)
    assert isinstance(sizeof(df.x), int)
    assert isinstance(sizeof(df.index), int)
def test_numpy():
    """numpy arrays are sized by their data; dtypes fall back to sys.getsizeof."""
    np = pytest.importorskip('numpy')

    # 1000 float64 values: 8000 bytes of data plus bounded overhead.
    floats = np.empty(1000, dtype='f8')
    assert 8000 <= sizeof(floats) <= 9000

    float_dtype = np.dtype('f8')
    assert sizeof(float_dtype) == sys.getsizeof(float_dtype)
def test_numpy_0_strided():
    """A scalar broadcast to (100, 100, 100) is sized at no more than 8 bytes."""
    np = pytest.importorskip("numpy")
    target_shape = (100, 100, 100)
    broadcast_view = np.broadcast_to(1, target_shape)
    assert sizeof(broadcast_view) <= 8
def __init__(self, obj: object):
    """Record the identity and measured size of ``obj``.

    Stores ``id(obj)`` as ``self.id`` and ``sizeof(obj)`` as ``self.nbytes``.
    """
    identity = id(obj)
    self.id = identity
    measured = sizeof(obj)
    self.nbytes = measured
def test_spillbuffer(tmpdir):
    """Drive a SpillBuffer through inserts, reads, deletes and updates,
    checking the spilled-key bookkeeping after each step.
    """
    buf = SpillBuffer(str(tmpdir), target=300)

    def assert_spilled(**weights):
        # Keys on disk, per-key spilled sizes and the spilled total must all
        # agree with the given key -> size mapping.
        assert set(buf.disk) == set(weights)
        assert buf.spilled_by_key == weights
        assert buf.spilled_total == sum(weights.values())

    # Convenience aliases
    assert buf.memory is buf.fast
    assert buf.disk is buf.slow

    assert not buf.spilled_by_key
    assert buf.spilled_total == 0

    a, b, c, d = "a" * 100, "b" * 100, "c" * 100, "d" * 100
    small = sizeof(a)
    # Test assumption made by this test, mostly for non CPython implementations
    assert 100 < small < 200

    buf["a"] = a
    assert not buf.disk
    assert not buf.spilled_by_key
    assert buf.spilled_total == 0
    assert buf["a"] == a

    buf["b"] = b
    assert not buf.disk
    assert not buf.spilled_by_key
    assert buf.spilled_total == 0

    buf["c"] = c
    assert_spilled(a=small)

    assert buf["a"] == a
    assert_spilled(b=small)

    buf["d"] = d
    assert_spilled(b=small, c=small)

    # Deleting an in-memory key does not automatically move spilled keys back to memory
    del buf["a"]
    assert_spilled(b=small, c=small)
    with pytest.raises(KeyError):
        buf["a"]

    # Deleting a spilled key updates the metadata
    del buf["b"]
    assert_spilled(c=small)
    with pytest.raises(KeyError):
        buf["b"]

    # Updating a spilled key moves it to the top of the LRU and to memory
    buf["c"] = c * 2
    assert_spilled(d=small)

    # Single key is larger than target and goes directly into slow
    e = "e" * 500
    large = sizeof(e)
    buf["e"] = e
    assert_spilled(d=small, e=large)

    # Updating a spilled key with another larger than target updates slow directly
    buf["d"] = "d" * 500
    assert_spilled(d=large, e=large)
def test_dict():
    """Dicts are sized recursively: an array nested inside still counts."""
    np = pytest.importorskip("numpy")
    payload = np.ones(10000)

    # The array sits one, two and three containers deep respectively.
    nestings = (
        {"x": payload},
        {"x": [payload]},
        {"x": [{"y": payload}]},
    )
    for container in nestings:
        assert sizeof(container) > payload.nbytes
def test_numpy():
    """Check sizeof on a float64 array and on a numpy dtype object."""
    np = pytest.importorskip("numpy")

    # 1000 eight-byte floats, allowing up to 1000 bytes of overhead.
    buffer = np.empty(1000, dtype="f8")
    assert 8000 <= sizeof(buffer) <= 9000

    # A dtype object is expected to match sys.getsizeof exactly.
    dtype_obj = np.dtype("f8")
    assert sizeof(dtype_obj) == sys.getsizeof(dtype_obj)
def test_base():
    """For a plain int, sizeof matches getsizeof."""
    number = 1
    assert sizeof(number) == getsizeof(number)
def test_pandas_repeated_column():
    """Repeating a column in a selection must increase the reported size."""
    pd = pytest.importorskip("pandas")
    original = pd.DataFrame({"x": [1, 2, 3]})
    with_repeats = original[["x", "x", "x"]]
    assert sizeof(with_repeats) > sizeof(original)
def test_containers():
    """A nested list costs more than three ints plus one empty list."""
    nested = [1, 2, [3]]
    measured = sizeof(nested)
    shallow_floor = getsizeof(3) * 3 + getsizeof([])
    assert measured > shallow_floor
def test_base():
    """For a plain int, sizeof matches sys.getsizeof."""
    sample = 1
    assert sizeof(sample) == sys.getsizeof(sample)
def test_numpy():
    """Check sizeof on a 1000-element float64 array and on its dtype object."""
    np = pytest.importorskip('numpy')

    array_of_floats = np.empty(1000, dtype='f8')
    assert 8000 <= sizeof(array_of_floats) <= 9000

    f8 = np.dtype('f8')
    assert sizeof(f8) == sys.getsizeof(f8)
def test_containers():
    """A nested list costs more than three ints plus one empty list."""
    nested = [1, 2, [3]]
    measured = sizeof(nested)
    shallow_floor = sys.getsizeof(3) * 3 + sys.getsizeof([])
    assert measured > shallow_floor
def test_pandas():
    """Sanity-check sizeof for a small DataFrame, its columns and its index."""
    pd = pytest.importorskip('pandas')
    data = {'x': [1, 2, 3], 'y': ['a' * 100, 'b' * 100, 'c' * 100]}
    frame = pd.DataFrame(data, index=[10, 20, 30])

    # The frame is at least the sum of its columns, less one copy of the index.
    assert sizeof(frame) >= sizeof(frame.x) + sizeof(frame.y) - sizeof(frame.index)
    assert sizeof(frame.x) >= sizeof(frame.index)
    # Three 100-character strings.
    assert sizeof(frame.y) >= 100 * 3
    assert sizeof(frame.index) >= 20

    # Results are plain ints.
    for piece in (frame, frame.x, frame.index):
        assert isinstance(sizeof(piece), int)
def test_sizeof(dtype):
    """sizeof of a random cupy array of the given dtype equals its ``nbytes``."""
    sample = cupy.random.random((2, 3, 4), dtype=dtype)
    expected_bytes = sample.nbytes
    assert sizeof(sample) == expected_bytes
def test_spillbuffer_maxlim(tmpdir):
    """Exercise a SpillBuffer whose slow side is capped by ``max_spill``,
    checking where each key lands and that capacity warnings are logged.
    """
    buf = SpillBuffer(str(tmpdir), target=200, max_spill=600, min_log_interval=0)
    spill_logger = logging.getLogger("distributed.spill")

    def store_expecting_capacity_warning(key, value):
        # Write the key while capturing the spill logger, then require the
        # capacity message to have been emitted.
        with captured_logger(spill_logger) as logs:
            buf[key] = value
        assert "disk reached capacity" in logs.getvalue()

    def assert_slow(**spilled):
        # Slow-side keys, per-key weights and running total must all agree
        # with the given key -> value mapping.
        assert set(buf.slow) == set(spilled)
        assert buf.slow.weight_by_key == {
            key: psize(value) for key, value in spilled.items()
        }
        assert buf.slow.total_weight == psize(*spilled.values())

    a, b, c, d, e = "a" * 200, "b" * 100, "c" * 99, "d" * 199, "e" * 98

    # size of a is bigger than target and is smaller than max_spill;
    # key should be in slow
    buf["a"] = a
    assert not buf.fast
    assert not buf.fast.weights
    assert_slow(a=a)
    assert buf["a"] == a

    # size of b is smaller than target key should be in fast
    buf["b"] = b
    assert set(buf.fast) == {"b"}
    assert buf.fast.weights == {"b": sizeof(b)}
    assert buf["b"] == b
    assert buf.fast.total_weight == sizeof(b)

    # size of c is smaller than target but b+c > target, c should stay in fast and b
    # move to slow since the max_spill limit has not been reached yet
    buf["c"] = c
    assert set(buf.fast) == {"c"}
    assert buf.fast.weights == {"c": sizeof(c)}
    assert buf["c"] == c
    assert buf.fast.total_weight == sizeof(c)
    assert_slow(a=a, b=b)

    # size of e < target but e+c > target, this will trigger movement of c to slow
    # but the max spill limit prevents it. Resulting in e remaining in fast
    store_expecting_capacity_warning("e", e)
    assert set(buf.fast) == {"c", "e"}
    assert buf.fast.weights == {"c": sizeof(c), "e": sizeof(e)}
    assert buf["e"] == e
    assert buf.fast.total_weight == sizeof(c) + sizeof(e)
    assert_slow(a=a, b=b)

    # size of d > target, d should go to slow but slow reached the max_spill limit then
    # d will end up on fast with c (which can't be move to slow because it won't fit
    # either)
    store_expecting_capacity_warning("d", d)
    assert set(buf.fast) == {"c", "d", "e"}
    assert buf.fast.weights == {"c": sizeof(c), "d": sizeof(d), "e": sizeof(e)}
    assert buf["d"] == d
    assert buf.fast.total_weight == sizeof(c) + sizeof(d) + sizeof(e)
    assert_slow(a=a, b=b)

    # Overwrite a key that was in slow, but the size of the new key is larger than
    # max_spill
    a_large = "a" * 500
    assert psize(a_large)[1] > 600  # size of max_spill
    store_expecting_capacity_warning("a", a_large)
    assert set(buf.fast) == {"a", "d", "e"}
    assert set(buf.slow) == {"b", "c"}
    assert buf.fast.total_weight == sizeof(d) + sizeof(a_large) + sizeof(e)
    assert buf.slow.total_weight == psize(b, c)

    # Overwrite a key that was in fast, but the size of the new key is larger than
    # max_spill
    d_large = "d" * 501
    store_expecting_capacity_warning("d", d_large)
    assert set(buf.fast) == {"a", "d", "e"}
    assert set(buf.slow) == {"b", "c"}
    assert buf.fast.total_weight == sizeof(a_large) + sizeof(d_large) + sizeof(e)
    assert buf.slow.total_weight == psize(b, c)