def test_cupy_matmul():
    """Check that ``@`` gives the same result when operands are proxied"""
    cupy = pytest.importorskip("cupy")
    lhs = cupy.arange(10)
    rhs = cupy.arange(10)
    expected = lhs @ rhs

    # Proxy on the left-hand side, on the right-hand side, and on both sides
    assert expected == proxy_object.asproxy(lhs) @ rhs
    assert expected == lhs @ proxy_object.asproxy(rhs)
    assert expected == proxy_object.asproxy(lhs) @ proxy_object.asproxy(rhs)
def test_pandas():
    """Check that dask's concat() accepts proxied pandas objects"""
    pandas = pytest.importorskip("pandas")
    concat = dask.dataframe.methods.concat
    asproxy = proxy_object.asproxy

    # DataFrames: plain/plain, proxy/plain and plain/proxy must all agree
    left = pandas.DataFrame({"a": range(10)})
    right = pandas.DataFrame({"a": range(10)})
    expected = concat([left, right])
    actual = concat([left, right])
    assert_frame_equal(expected, actual)
    actual = concat([asproxy(left), right])
    assert_frame_equal(expected, actual)
    actual = concat([left, asproxy(right)])
    assert_frame_equal(expected, actual)

    # Series: same three combinations, compared element-wise
    left = pandas.Series(range(10))
    right = pandas.Series(range(10))
    expected = concat([left, right])
    actual = concat([left, right])
    assert all(expected == actual)
    actual = concat([asproxy(left), right])
    assert all(expected == actual)
    actual = concat([left, asproxy(right)])
    assert all(expected == actual)
def test_proxy_object_serializer():
    """Check the ``serializers`` argument of ``asproxy()``

    Two things are asserted:

    * a ``DummyObj`` wrapped with ``serializers=("dask", "pickle")`` reports
      ``"pickle"`` as its serializer, both through ``_pxy_get()`` and in its
      ``repr()``;
    * wrapping a list while specifying ``serializers`` raises ``ValueError``
      with a message that contains "Cannot wrap a collection".
    """
    pxy = proxy_object.asproxy(DummyObj(), serializers=("dask", "pickle"))
    assert pxy._pxy_get().serializer == "pickle"
    assert "DummyObj (serialized='pickle')" in repr(pxy)

    with pytest.raises(ValueError) as excinfo:
        proxy_object.asproxy([42], serializers=("dask", "pickle"))
    # The message must be inspected after the `with` block: statements placed
    # after the raising call inside the block are never executed.
    assert "Cannot wrap a collection" in str(excinfo.value)
def test_concatenate3_of_proxied_cupy_arrays():
    """Check that concatenate3() treats proxied cupy arrays like plain ones"""
    from dask.array.core import concatenate3

    cupy = pytest.importorskip("cupy")
    base = cupy.arange(10)
    first = proxy_object.asproxy(base.copy())
    second = proxy_object.asproxy(base.copy())

    got = concatenate3([first, second])
    expected = concatenate3([base.copy(), base.copy()])
    assert all(got == expected)
def test_tensordot_of_proxied_cupy_arrays():
    """Check that tensordot() treats proxied cupy arrays like plain ones"""
    cupy = pytest.importorskip("cupy")
    base = cupy.arange(9).reshape((3, 3))
    lhs = proxy_object.asproxy(base.copy())
    rhs = proxy_object.asproxy(base.copy())

    got = dask.array.tensordot(lhs, rhs).flatten()
    expected = dask.array.tensordot(base.copy(), base.copy()).flatten()
    assert all(got == expected)
def test_double_proxy_object(serializers_first, serializers_second):
    """Check asproxy() when it is handed an object that is already a proxy"""
    data = list(range(10))
    first = proxy_object.asproxy(data, serializers=serializers_first)
    assert first._obj_pxy["serializers"] == serializers_first

    second = proxy_object.asproxy(first, serializers=serializers_second)
    # `serializers=None` must leave the initial serializers untouched;
    # any other value replaces them.
    expected = (
        serializers_first if serializers_second is None else serializers_second
    )
    assert second._obj_pxy["serializers"] == expected
    assert first is second
def test_cupy_imatmul():
    """Check in-place ``@=`` with a proxy on either side of the operator"""
    cupy = pytest.importorskip("cupy")
    base = cupy.arange(9).reshape(3, 3)

    # Reference result computed on plain arrays
    expected = base.copy()
    expected @= base

    # Plain target, proxied operand
    plain_target = base.copy()
    plain_target @= proxy_object.asproxy(base)
    assert (plain_target == expected).all()

    # Proxied target, plain operand
    proxied_target = proxy_object.asproxy(base.copy())
    proxied_target @= base
    assert (proxied_target == expected).all()
def test_double_proxy_object(serializers_first, serializers_second):
    """Check asproxy() when it is handed an object that is already a proxy"""

    def leading(serializers):
        # The first listed serializer, or None when none were given
        return serializers[0] if serializers else None

    expected_first = leading(serializers_first)
    expected_second = leading(serializers_second)

    data = bytearray(range(10))
    first = proxy_object.asproxy(data, serializers=serializers_first)
    assert first._pxy_get().serializer == expected_first

    second = proxy_object.asproxy(first, serializers=serializers_second)
    # `serializers=None` must leave the initial serializer untouched
    expected = expected_first if serializers_second is None else expected_second
    assert second._pxy_get().serializer == expected
    assert first is second
def test_communicating_disk_objects(protocol, shared_fs):
    """Check disk serialization of a cuDF dataframe sent from client to worker"""
    cudf = pytest.importorskip("cudf")
    ProxifyHostFile._spill_shared_filesystem = shared_fs

    def task(x):
        # Check that the subclass survives the trip from client to worker
        assert isinstance(x, _PxyObjTest)
        # With a shared filesystem the "disk" serializer is expected on
        # arrival, otherwise "dask".
        expected_serializer = "disk" if shared_fs else "dask"
        assert x._pxy_get().serializer == expected_serializer

    cluster_kwargs = dict(
        n_workers=1,
        protocol=protocol,
        enable_tcp_over_ucx=(protocol == "ucx"),
    )
    with dask_cuda.LocalCUDACluster(**cluster_kwargs) as cluster:
        with Client(cluster) as client:
            frame = cudf.DataFrame({"a": range(10)})
            frame = proxy_object.asproxy(
                frame, serializers=("disk",), subclass=_PxyObjTest
            )
            frame._pxy_get().assert_on_deserializing = False
            future = client.scatter(frame)
            client.submit(task, future).result()
            client.shutdown()  # Avoids a UCX shutdown error
def test_one_item_host_limit(capsys):
    """Walk four arrays through a ProxifyHostFile and check where each one lives

    The host file is created with ``device_memory_limit=one_item_nbytes`` and a
    ``memory_limit`` equal to ``sizeof()`` of a proxy of one array created with
    the ``("dask", "pickle")`` serializers. After every insertion or access the
    test validates the manager and asserts which proxies are reported by the
    manager's ``_disk``, ``_host`` and ``_dev`` containers.

    The statement order is significant: each step depends on the placement
    left behind by the previous one.
    """
    # NOTE(review): `capsys` is not used in the body -- confirm whether the
    # fixture is still needed.
    memory_limit = sizeof(asproxy(one_item_array(), serializers=("dask", "pickle")))
    dhf = ProxifyHostFile(
        device_memory_limit=one_item_nbytes, memory_limit=memory_limit
    )

    a1 = one_item_array() + 1
    a2 = one_item_array() + 2
    dhf["k1"] = a1
    dhf["k2"] = a2
    dhf.manager.validate()

    # Check k1 is spilled because of the newer k2
    k1 = dhf["k1"]
    k2 = dhf["k2"]
    assert k1._pxy_get().is_serialized()
    assert not k2._pxy_get().is_serialized()
    dhf.manager.validate()
    assert is_proxies_equal(dhf.manager._disk.get_proxies(), [])
    assert is_proxies_equal(dhf.manager._host.get_proxies(), [k1])
    assert is_proxies_equal(dhf.manager._dev.get_proxies(), [k2])

    # Check k1 is spilled to disk and k2 is spilled to host
    dhf["k3"] = one_item_array() + 3
    k3 = dhf["k3"]
    dhf.manager.validate()
    assert is_proxies_equal(dhf.manager._disk.get_proxies(), [k1])
    assert is_proxies_equal(dhf.manager._host.get_proxies(), [k2])
    assert is_proxies_equal(dhf.manager._dev.get_proxies(), [k3])
    dhf.manager.validate()

    # Accessing k2 spills k3 to host and unspills k2 to device
    k2_val = k2[0]
    assert k2_val == 2
    dhf.manager.validate()
    assert is_proxies_equal(dhf.manager._disk.get_proxies(), [k1])
    assert is_proxies_equal(dhf.manager._host.get_proxies(), [k3])
    assert is_proxies_equal(dhf.manager._dev.get_proxies(), [k2])

    # Adding a new array spills k3 to disk and k2 to host
    dhf["k4"] = one_item_array() + 4
    k4 = dhf["k4"]
    dhf.manager.validate()
    assert is_proxies_equal(dhf.manager._disk.get_proxies(), [k1, k3])
    assert is_proxies_equal(dhf.manager._host.get_proxies(), [k2])
    assert is_proxies_equal(dhf.manager._dev.get_proxies(), [k4])

    # Accessing k1 unspills k1 directly to device and spills k4 to host
    # (the assertions below also expect k2 to have moved from host to disk)
    k1_val = k1[0]
    assert k1_val == 1
    dhf.manager.validate()
    assert is_proxies_equal(dhf.manager._disk.get_proxies(), [k2, k3])
    assert is_proxies_equal(dhf.manager._host.get_proxies(), [k4])
    assert is_proxies_equal(dhf.manager._dev.get_proxies(), [k1])

    # Clean up: drop the local references first, then empty the host file and
    # check that the manager reports no remaining entries
    del k1, k2, k3, k4
    dhf.clear()
    assert len(dhf.manager) == 0
def test_cupy_broadcast_to():
    """Check np.broadcast_to() on a proxied cupy array"""
    cupy = pytest.importorskip("cupy")
    base = cupy.arange(10)
    target_shape = (10, 10)

    expected = np.broadcast_to(base, target_shape)
    got = np.broadcast_to(proxy_object.asproxy(base), target_shape)

    assert expected.shape == got.shape
    assert (expected == got).all()
def test_serializing_to_disk(obj):
    """Check serializing to disk

    Parameters
    ----------
    obj
        Either the object to proxy, or the name of an array module as a
        string; in the latter case the module is imported (the test is
        skipped when it is unavailable) and ``arange(100)`` is used instead.
    """
    if isinstance(obj, str):
        backend = pytest.importorskip(obj)
        obj = backend.arange(100)

    def check_disk_roundtrip(pxy):
        # Spill the proxy to disk in place, then compare the unproxied
        # result with the original object.
        ProxifyHostFile.serialize_proxy_to_disk_inplace(pxy)
        assert pxy._pxy_get().serializer == "disk"
        same = obj == proxy_object.unproxy(pxy)
        # Array backends compare element-wise; asserting a multi-element
        # result directly raises "truth value ... is ambiguous", so reduce
        # it with `.all()` when it is available.
        assert same.all() if hasattr(same, "all") else same

    # Serialize from device to disk
    check_disk_roundtrip(proxy_object.asproxy(obj))

    # Serialize from host to disk
    check_disk_roundtrip(proxy_object.asproxy(obj, serializers=("pickle",)))
def test_einsum_of_proxied_cupy_arrays():
    """Check einsum of cupy arrays"""
    cupy = pytest.importorskip("cupy")
    base = cupy.arange(25).reshape(5, 5)

    expected = dask.array.einsum("ii", base)
    proxied = proxy_object.asproxy(base.copy())
    got = dask.array.einsum("ii", proxied)

    assert all(expected.flatten() == got.flatten())
def test_pickle_proxy_object(array_module, serializers):
    """Check that a proxy object survives a pickle round trip"""
    xp = pytest.importorskip(array_module)
    original = xp.arange(10)
    proxied = proxy_object.asproxy(original, serializers=serializers)

    payload = pickle.dumps(proxied)
    roundtripped = pickle.loads(payload)

    # Exercise repr() on the restored proxy; the result itself is unused
    repr(roundtripped)
    assert all(original == roundtripped)
def test_proxy_object_parquet(tmp_path):
    """Check writing a proxied dataframe to parquet and reading it back"""
    cudf = pytest.importorskip("cudf")
    parquet_file = tmp_path / "proxy_test.parquet"
    df = cudf.DataFrame({"a": range(10)})

    proxied = proxy_object.asproxy(df)
    proxied.to_parquet(str(parquet_file), engine="pyarrow")

    loaded = dask.dataframe.read_parquet(parquet_file)
    assert_frame_equal(df.to_pandas(), loaded.compute())
def test_fixed_attribute_name():
    """Test fixed attribute `x.name` access

    Notice, accessing fixed attributes shouldn't de-serialize the proxied object
    """
    obj_without_name = SimpleNamespace()
    obj_with_name = SimpleNamespace(name="I have a name")

    # Access `name` of an object that doesn't define it
    pxy = proxy_object.asproxy(obj_without_name, serializers=("pickle",))
    with pytest.raises(AttributeError) as excinfo:
        pxy.name
    # The message must be inspected after the `with` block: statements placed
    # after the raising access inside the block are never executed.
    assert "has no attribute 'name'" in str(excinfo.value)
    assert pxy._obj_pxy_is_serialized()

    # Access `name` of an object that defines it
    pxy = proxy_object.asproxy(obj_with_name, serializers=("pickle",))
    assert pxy.name == "I have a name"
    assert pxy._obj_pxy_is_serialized()
def test_fixed_attribute_length(backend):
    """Test fixed attribute `x.__len__` access

    Notice, accessing fixed attributes shouldn't de-serialize the proxied object

    Parameters
    ----------
    backend
        Name of the array module to import; the test is skipped when the
        module is unavailable.
    """
    np = pytest.importorskip(backend)

    # Access `len()` of an array
    pxy = proxy_object.asproxy(np.arange(10), serializers=("dask",))
    assert len(pxy) == 10
    # Accessing the length shouldn't de-serialize the proxied object
    assert pxy._obj_pxy_is_serialized()

    # Access `len()` of a scalar
    pxy = proxy_object.asproxy(np.array(10), serializers=("dask",))
    with pytest.raises(TypeError) as excinfo:
        len(pxy)
    # The message must be inspected after the `with` block: statements placed
    # after the raising call inside the block are never executed.
    assert "len() of unsized object" in str(excinfo.value)
    assert pxy._obj_pxy_is_serialized()
def test_serializing_array_to_disk(backend, serializers, size):
    """Check that an array proxy can be serialized to disk and restored"""
    xp = pytest.importorskip(backend)
    original = xp.arange(size)

    # Serialize from host to disk
    proxied = proxy_object.asproxy(original, serializers=serializers)
    ProxifyHostFile.serialize_proxy_to_disk_inplace(proxied)
    assert proxied._pxy_get().serializer == "disk"

    expected_items = list(original)
    restored_items = list(proxy_object.unproxy(proxied))
    assert expected_items == restored_items
def test_from_cudf_of_proxy_object():
    """Check dask_cudf.from_cudf() when it is given a proxied dataframe"""
    cudf = pytest.importorskip("cudf")
    frame = cudf.DataFrame({"a": range(10)})
    proxied = proxy_object.asproxy(frame)
    assert has_parallel_type(proxied)

    ddf = dask_cudf.from_cudf(proxied, npartitions=1)
    assert has_parallel_type(ddf)

    # Notice, the output is a dask-cudf dataframe and not a proxy object
    assert type(ddf) is dask_cudf.core.DataFrame
def test_serialize_of_proxied_cudf(proxy_serializers, dask_serializers):
    """Check serialization of a proxied cudf dataframe

    The proxy is created with `proxy_serializers`, so it might be serialized
    already when it is handed to `serialize()`.
    """
    cudf = pytest.importorskip("cudf")
    df = cudf.DataFrame({"a": range(10)})
    proxied = proxy_object.asproxy(df, serializers=proxy_serializers)

    serialized = serialize(proxied, serializers=dask_serializers, on_error="raise")
    restored = deserialize(*serialized)

    assert_frame_equal(df.to_pandas(), restored.to_pandas())
def test_sizeof_cupy():
    """Check sizeof() of a proxied cupy array, before and after serialization"""
    cupy = pytest.importorskip("cupy")
    cupy.cuda.set_allocator(None)
    array = cupy.arange(1e7)
    expected_size = sizeof(array)
    pxy = proxy_object.asproxy(array)

    def assert_size_matches():
        assert expected_size == pytest.approx(sizeof(pxy))

    # Not serialized yet
    assert_size_matches()

    # Serialized
    pxy._pxy_serialize(serializers=("dask",))
    assert_size_matches()
    assert pxy._pxy_get().is_serialized()

    # Serialized, with the proxy's cache reset
    pxy._pxy_cache = {}
    assert_size_matches()
    assert pxy._pxy_get().is_serialized()
def test_sizeof_cudf():
    """Check sizeof() of a proxied cudf dataframe, before and after serialization"""
    cudf = pytest.importorskip("cudf")
    frame = cudf.datasets.timeseries().reset_index()
    expected_size = sizeof(frame)
    pxy = proxy_object.asproxy(frame)

    # Not serialized yet
    assert expected_size == pytest.approx(sizeof(pxy))

    # Serialized
    pxy._pxy_serialize(serializers=("dask",))
    assert expected_size == pytest.approx(sizeof(pxy))
    assert pxy._pxy_get().is_serialized()

    # By clearing the cache, `sizeof(pxy)` now measures the serialized data,
    # thus we have to increase the tolerance.
    pxy._pxy_cache = {}
    relaxed = pytest.approx(sizeof(pxy), rel=1e-2)
    assert expected_size == relaxed
    assert pxy._pxy_get().is_serialized()
def test_proxy_object(serializers):
    """Check that a proxied list is "transparent", i.e. acts like the list"""
    proxy_tag = "dask_cuda.proxy_object.ProxyObject at "
    original = list(range(10))
    proxied = proxy_object.asproxy(original, serializers=serializers)

    # Container protocol and string conversion
    assert len(original) == len(proxied)
    assert original[0] == proxied[0]
    assert 1 in proxied
    assert -1 not in proxied
    assert str(original) == str(proxied)

    # repr() as created by asproxy()
    assert proxy_tag in repr(proxied)
    assert "list at " in repr(proxied)

    # repr() after explicit serialization
    proxied._obj_pxy_serialize(serializers=["dask", "pickle"])
    assert proxy_tag in repr(proxied)
    assert "list (serialized=['dask', 'pickle'])" in repr(proxied)

    # unproxy() accepts both proxied and plain objects
    assert original == proxy_object.unproxy(proxied)
    assert original == proxy_object.unproxy(original)
def test_proxy_object(serializers):
    """Check that a proxied bytearray is "transparent", i.e. acts like it"""
    proxy_tag = "dask_cuda.proxy_object.ProxyObject at "
    original = bytearray(range(10))
    proxied = proxy_object.asproxy(original, serializers=serializers)

    # Container protocol and string conversion
    assert len(original) == len(proxied)
    assert original[0] == proxied[0]
    assert 1 in proxied
    assert 10 not in proxied
    assert str(original) == str(proxied)

    # repr() as created by asproxy()
    assert proxy_tag in repr(proxied)
    assert "bytearray at " in repr(proxied)

    # repr() after explicit serialization
    proxied._pxy_serialize(serializers=("dask", "pickle"))
    assert proxy_tag in repr(proxied)
    assert "bytearray (serialized='dask')" in repr(proxied)

    # unproxy() accepts both proxied and plain objects
    assert original == proxy_object.unproxy(proxied)
    assert original == proxy_object.unproxy(original)
def test_communicating_proxy_objects(protocol, send_serializers):
    """Check serialization of a proxied cuDF dataframe sent from client to worker"""
    cudf = pytest.importorskip("cudf")

    def task(x):
        # Check that the subclass survives the trip from client to worker
        assert isinstance(x, _PxyObjTest)
        serializers_used = list(x._obj_pxy["serializers"])

        # Check that `x` is serialized with the expected serializers
        if protocol != "ucx":
            expected = ["dask", "pickle"]
        elif send_serializers is None:
            expected = ["cuda", "dask", "pickle"]
        else:
            expected = send_serializers
        assert serializers_used == expected

    cluster_kwargs = dict(
        n_workers=1,
        protocol=protocol,
        enable_tcp_over_ucx=(protocol == "ucx"),
    )
    with dask_cuda.LocalCUDACluster(**cluster_kwargs) as cluster:
        with Client(cluster) as client:
            frame = cudf.DataFrame({"a": range(10)})
            frame = proxy_object.asproxy(
                frame, serializers=send_serializers, subclass=_PxyObjTest
            )

            # Notice, in one case we expect deserialization when communicating.
            # Since "tcp" cannot send device memory directly, it will be
            # re-serialized using the default dask serializers that spill the
            # data to main memory.
            expect_deserializing = protocol == "tcp" and send_serializers == ["cuda"]
            frame.assert_on_deserializing = not expect_deserializing

            future = client.scatter(frame)
            client.submit(task, future).result()
            client.shutdown()  # Avoids a UCX shutdown error
def test_proxy_object_of_numpy(serializers):
    """Check that a proxied numpy array behaves as a regular array"""
    np = pytest.importorskip("numpy")

    def make_pair():
        # A fresh array [1..10] together with a proxy of a copy of it
        values = np.arange(10) + 1
        return values, proxy_object.asproxy(values.copy(), serializers=serializers)

    # Make sure that equality works, which we use to test the other operators
    plain, proxied = make_pair()
    assert all(plain == proxied)
    assert all(plain + 1 != proxied)

    # Check unary scalar operators
    for convert in (int, float, complex, operator.index, oct, hex):
        plain = np.int64(42)
        proxied = proxy_object.asproxy(plain.copy(), serializers=serializers)
        want = convert(plain)
        have = convert(proxied)
        assert type(want) is type(have)
        assert want == have

    # Check unary operators
    for unary in (operator.neg, operator.pos, operator.abs, operator.inv):
        plain, proxied = make_pair()
        want = unary(plain)
        have = unary(proxied)
        assert type(want) is type(have)
        assert all(want == have)

    # Check binary operators that takes a scalar as second argument
    for scalar_op in (operator.rshift, operator.lshift, operator.pow):
        plain, proxied = make_pair()
        want = scalar_op(plain, 2)
        have = scalar_op(proxied, 2)
        assert type(want) is type(have)
        assert all(want == have)

    # Check binary operators
    binary_names = (
        "add",
        "eq",
        "floordiv",
        "ge",
        "gt",
        "le",
        "lshift",
        "lt",
        "mod",
        "mul",
        "ne",
        "or_",
        "sub",
        "truediv",
        "xor",
        "iadd",
        "ior",
        "iand",
        "ifloordiv",
        "ilshift",
        "irshift",
        "ipow",
        "imod",
        "imul",
        "isub",
        "ixor",
    )
    for name in binary_names:
        binary = getattr(operator, name)
        plain, proxied = make_pair()

        # Proxy as the second operand
        want = binary(plain.copy(), plain)
        have = binary(plain.copy(), proxied)
        assert isinstance(have, type(want))
        assert all(want == have)

        # Proxy as the first operand
        want = binary(plain.copy(), plain)
        have = binary(proxied, plain)
        assert isinstance(have, type(want))
        assert all(want == have)

        # Check proxy-proxy operations
        if not name.startswith("i"):  # Skip in-place operators
            want = binary(plain.copy(), plain)
            have = binary(proxied, proxy_object.asproxy(plain.copy()))
            assert all(want == have)

    # Check unary truth operators
    for truth_op in (operator.not_, operator.truth):
        plain = np.arange(1) + 1
        proxied = proxy_object.asproxy(plain.copy(), serializers=serializers)
        want = truth_op(plain)
        have = truth_op(proxied)
        assert type(want) is type(have)
        assert want == have

    # Check reflected methods
    reflected_names = (
        "__radd__",
        "__rsub__",
        "__rmul__",
        "__rtruediv__",
        "__rfloordiv__",
        "__rmod__",
        "__rpow__",
        "__rlshift__",
        "__rrshift__",
        "__rxor__",
        "__ror__",
    )
    for name in reflected_names:
        plain, proxied = make_pair()
        want = getattr(plain, name)(plain)
        have = getattr(plain, name)(proxied)
        assert isinstance(have, type(want))
        assert all(want == have)
def test_proxy_object_of_cudf(serializers):
    """Check that a proxied cudf dataframe converts to pandas like a plain one"""
    cudf = pytest.importorskip("cudf")
    frame = cudf.DataFrame({"a": range(10)})
    proxied = proxy_object.asproxy(frame, serializers=serializers)

    expected = frame.to_pandas()
    assert_frame_equal(expected, proxied.to_pandas())
def test_assignments():
    """Check that an attribute can be assigned on a proxied dataframe"""
    cudf = pytest.importorskip("cudf")
    frame = cudf.DataFrame({"a": range(10)})
    proxied = proxy_object.asproxy(frame)

    # Assign a shallow copy of column "a" as the new index
    new_index = proxied["a"].copy(deep=False)
    proxied.index = new_index