def create_subdevice_queue():
    """
    Partition a CPU sycl device into sub-devices.
    Create a multi-device sycl context.
    """
    cpu_d = dpctl.SyclDevice("cpu")
    cpu_count = cpu_d.max_compute_units
    sub_devs = cpu_d.create_sub_devices(partition=cpu_count // 2)
    multidevice_ctx = dpctl.SyclContext(sub_devs)
    # create a SyclQueue for each sub-device, using common
    # multi-device context
    q0, q1 = [dpctl.SyclQueue(multidevice_ctx, d) for d in sub_devs]
    # for each sub-device allocate 26 bytes
    m0 = dpctl.memory.MemoryUSMDevice(26, queue=q0)
    m1 = dpctl.memory.MemoryUSMDevice(26, queue=q1)
    # prepare a host buffer of 26 spaces
    hostmem = bytearray(b" " * 26)
    # copy spaces into m1
    m1.copy_from_host(hostmem)
    for i in range(26):
        hostmem[i] = ord("a") + i
    # copy character sequence into m0
    m0.copy_from_host(hostmem)
    # copy from m0 to m1. Due to using multi-device context,
    # copying can be done directly
    m1.copy_from_device(m0)
    return bytes(m1.copy_to_host())
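# Hypothetical usage check (not part of the original suite): the alphabet
# written into m0 travels through the multi-device context into m1 intact,
# so the helper should return the lowercase alphabet as bytes.
def test_create_subdevice_queue_roundtrip():
    assert create_subdevice_queue() == b"abcdefghijklmnopqrstuvwxyz"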
def test_with_dppy_context_cpu(self):
    @njit
    def nested_func(a, b):
        np.sin(a, b)

    @njit
    def func(b):
        a = np.ones((64), dtype=np.float64)
        nested_func(a, b)

    config.DEBUG = 1
    expected = np.ones((64), dtype=np.float64)
    got_cpu = np.ones((64), dtype=np.float64)

    with captured_stdout() as got_cpu_message:
        device = dpctl.SyclDevice("opencl:cpu")
        with dppy.offload_to_sycl_device(device):
            func(got_cpu)
        config.DEBUG = 0

    func(expected)

    np.testing.assert_array_equal(expected, got_cpu)
    self.assertTrue(
        "Parfor offloaded to opencl:cpu" in got_cpu_message.getvalue()
    )
def test_mix_argtype(offload_device, input_arrays):
    usm_type = "device"

    a, b, expected = input_arrays
    got = np.ones_like(a)

    device = dpctl.SyclDevice(offload_device)
    queue = dpctl.SyclQueue(device)

    da = dpt.usm_ndarray(
        a.shape,
        dtype=a.dtype,
        buffer=usm_type,
        buffer_ctor_kwargs={"queue": queue},
    )
    da.usm_data.copy_from_host(a.reshape((-1)).view("|u1"))

    dc = dpt.usm_ndarray(
        got.shape,
        dtype=got.dtype,
        buffer=usm_type,
        buffer_ctor_kwargs={"queue": queue},
    )

    with pytest.raises(TypeError) as error_msg:
        sum_kernel[global_size, local_size](da, b, dc)

    assert mix_datatype_err_msg in str(error_msg.value)
def select_device_SUAI(N):
    usm_type = "device"

    a = np.array(np.random.random(N), np.float32)
    b = np.array(np.random.random(N), np.float32)
    got = np.ones_like(a)

    device = dpctl.SyclDevice("opencl:gpu")
    queue = dpctl.SyclQueue(device)

    # We allocate the data on the OpenCL GPU, and this device
    # will be selected for compute.
    da, db, dc = allocate_SUAI_data(a, b, got, usm_type, queue)

    # Users need not specify where the computation will
    # take place; it is inferred from the data.
    sum_kernel[N, 1](da, db, dc)

    dc.usm_data.copy_to_host(got.reshape((-1)).view("|u1"))

    expected = a + b
    assert np.array_equal(got, expected)
    print(
        "Correct result when array with __sycl_usm_array_interface__ is passed!"
    )
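# allocate_SUAI_data is referenced above but not defined in this section.
# A minimal sketch of such a helper, assuming it mirrors the dpt.usm_ndarray
# construction used in test_mix_argtype (hypothetical implementation):
def allocate_SUAI_data(a, b, got, usm_type, queue):
    def to_usm(host_arr):
        # allocate a USM-backed ndarray on the target queue
        d_arr = dpt.usm_ndarray(
            host_arr.shape,
            dtype=host_arr.dtype,
            buffer=usm_type,
            buffer_ctor_kwargs={"queue": queue},
        )
        # copy host data into the USM buffer as raw bytes
        d_arr.usm_data.copy_from_host(host_arr.reshape((-1)).view("|u1"))
        return d_arr

    return to_usm(a), to_usm(b), to_usm(got)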
def test_eig(filter_str, eig_input, capfd):
    if skip_test(filter_str):
        pytest.skip()

    a = eig_input
    fn = get_fn("linalg.eig", 1)
    f = njit(fn)

    device = dpctl.SyclDevice(filter_str)
    with dppy.offload_to_sycl_device(device), dpnp_debug():
        actual_val, actual_vec = f(a)
        captured = capfd.readouterr()
        assert "dpnp implementation" in captured.out

    expected_val, expected_vec = fn(a)

    # sort val/vec by abs value
    vvsort(actual_val, actual_vec)
    vvsort(expected_val, expected_vec)

    # NumPy may return eigenvectors with flipped signs; align them
    for i in range(expected_vec.shape[1]):
        if expected_vec[0, i] * actual_vec[0, i] < 0:
            expected_vec[:, i] = -expected_vec[:, i]

    assert np.allclose(actual_val, expected_val)
    assert np.allclose(actual_vec, expected_vec)
def test_kernel_arg_accessor(filter_str, input_arrays, kernel):
    a, b, actual = input_arrays
    expected = a + b
    device = dpctl.SyclDevice(filter_str)
    with dpctl.device_context(device):
        call_kernel(global_size, local_size, a, b, actual, kernel)
    np.testing.assert_allclose(actual, expected, rtol=1e-5, atol=0)
def test_unary_ops(filter_str, unary_op, input_arrays, get_shape, capfd):
    a = input_arrays[0]
    op, name = unary_op
    if name != "argsort" and name != "copy":
        a = np.reshape(a, get_shape)
    if name == "cumprod" and (
        filter_str == "opencl:cpu:0"
        or a.dtype == np.int32
        or is_gen12(filter_str)
    ):
        pytest.skip()
    if name == "cumsum" and (
        filter_str == "opencl:cpu:0"
        or a.dtype == np.int32
        or is_gen12(filter_str)
    ):
        pytest.skip()
    if name == "mean" and is_gen12(filter_str):
        pytest.skip()
    if name == "argmax" and is_gen12(filter_str):
        pytest.skip()

    actual = np.empty(shape=a.shape, dtype=a.dtype)
    expected = np.empty(shape=a.shape, dtype=a.dtype)

    f = njit(op)
    device = dpctl.SyclDevice(filter_str)
    with dpctl.device_context(device), dpnp_debug():
        actual = f(a)
        captured = capfd.readouterr()
        assert "dpnp implementation" in captured.out

    expected = op(a)
    np.testing.assert_allclose(actual, expected, rtol=1e-3, atol=0)
def test_asarray_change_usm_type(src_usm_type, dst_usm_type):
    d = dpctl.SyclDevice()
    if d.is_host:
        pytest.skip(
            "Skip test of host device, which only "
            "supports host USM allocations"
        )
    X = dpt.empty(10, dtype="u1", usm_type=src_usm_type)
    Y = dpt.asarray(X, usm_type=dst_usm_type)
    assert X.shape == Y.shape
    assert X.usm_type == src_usm_type
    assert Y.usm_type == dst_usm_type
    with pytest.raises(ValueError):
        # zero copy is not possible
        dpt.asarray(X, usm_type=dst_usm_type, copy=False)
    Y = dpt.asarray(X, usm_type=dst_usm_type, sycl_queue=X.sycl_queue)
    assert X.shape == Y.shape
    assert Y.usm_type == dst_usm_type
    Y = dpt.asarray(
        X,
        usm_type=dst_usm_type,
        sycl_queue=X.sycl_queue,
        device=d.get_filter_string(),
    )
    assert X.shape == Y.shape
    assert Y.usm_type == dst_usm_type
def test_dppy_fallback_false(self):
    @numba.jit
    def fill_value(i):
        return i

    def inner_call_fallback():
        x = 10
        a = np.empty(shape=x, dtype=np.float32)

        for i in numba.prange(x):
            a[i] = fill_value(i)

        return a

    try:
        config.DEBUG = 1
        config.FALLBACK_ON_CPU = 0
        with warnings.catch_warnings(record=True) as w:
            device = dpctl.SyclDevice("opencl:gpu")
            with numba_dppy.offload_to_sycl_device(device):
                dppy = numba.njit(parallel=True)(inner_call_fallback)
                dppy_fallback_false = dppy()

    finally:
        ref_result = inner_call_fallback()
        config.FALLBACK_ON_CPU = 1
        config.DEBUG = 0

        # with fallback disabled, the offloaded result must not
        # match the reference computed on the CPU
        assert not np.array_equal(dppy_fallback_false, ref_result)
        self.assertNotIn("Failed to offload parfor", str(w[-1].message))
def test_multiple_prange(self):
    @njit
    def f(a, b):
        # dimensions must be provided as scalar
        m, n = a.shape
        for i in prange(m):
            val = 10
            for j in prange(n):
                b[i, j] = a[i, j] * val

        for i in prange(m):
            for j in prange(n):
                a[i, j] = a[i, j] * 10

    m = 8
    n = 8
    a = np.ones((m, n))
    b = np.ones((m, n))

    device = dpctl.SyclDevice("opencl:gpu")
    with assert_auto_offloading(parfor_offloaded=2), dpctl.device_context(
        device
    ):
        f(a, b)

    assert np.all(b == 10)
    assert np.all(a == 10)
def test_kernel_atomic_simple(filter_str, input_arrays, kernel_result_pair):
    a, dtype = input_arrays
    kernel, expected = kernel_result_pair
    device = dpctl.SyclDevice(filter_str)
    with dpctl.device_context(device):
        kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](a)
    assert a[0] == expected
def test_njit(filter_str):
    if _helper.platform_not_supported(filter_str):
        pytest.skip()

    if _helper.skip_test(filter_str):
        pytest.skip()

    @vectorize(nopython=True)
    def axy(a, x, y):
        return a * x + y

    def f(a0, a1):
        return np.cos(axy(a0, np.sin(a1) - 1.0, 1.0))

    A = np.random.random(10)
    B = np.random.random(10)

    device = dpctl.SyclDevice(filter_str)
    with dppy.offload_to_sycl_device(device), assert_auto_offloading():
        f_njit = njit(f)
        expected = f_njit(A, B)
        actual = f(A, B)

        max_abs_err = abs(expected.sum() - actual.sum())
        assert max_abs_err < 1e-5
def test_context_multi_device():
    try:
        d = dpctl.SyclDevice("cpu")
    except ValueError:
        pytest.skip()
    if d.default_selector_score < 0:
        pytest.skip()
    n = d.max_compute_units
    n1 = n // 2
    n2 = n - n1
    if n1 == 0 or n2 == 0:
        pytest.skip()
    d1, d2 = d.create_sub_devices(partition=(n1, n2))
    ctx = dpctl.SyclContext((d1, d2))
    assert ctx.device_count == 2
    assert type(repr(ctx)) is str

    q1 = dpctl.SyclQueue(ctx, d1)
    q2 = dpctl.SyclQueue(ctx, d2)

    import dpctl.memory as dpmem

    shmem_1 = dpmem.MemoryUSMShared(256, queue=q1)
    shmem_2 = dpmem.MemoryUSMDevice(256, queue=q2)
    shmem_2.copy_from_device(shmem_1)

    # create context for single sub-device
    ctx1 = dpctl.SyclContext(d1)
    q1 = dpctl.SyclQueue(ctx1, d1)
    shmem_1 = dpmem.MemoryUSMShared(256, queue=q1)
    cap = ctx1._get_capsule()
    cap2 = ctx1._get_capsule()
    del ctx1
    del cap2  # exercise deleter of non-renamed capsule
    ctx2 = dpctl.SyclContext(cap)
    q2 = dpctl.SyclQueue(ctx2, d1)
    shmem_2 = dpmem.MemoryUSMDevice(256, queue=q2)
    shmem_2.copy_from_device(shmem_1)
def test_sum_reduction(self):
    # This test only works when N is even
    N = 1024
    assert N % 2 == 0

    A = np.array(np.random.random(N), dtype=np.float32)
    A_copy = A.copy()
    # at most half the size of A is needed to store partial sums
    R = np.array(np.random.random(math.ceil(N / 2)), dtype=np.float32)

    device = dpctl.SyclDevice("opencl:gpu")
    with dpctl.device_context(device):
        total = N

        while total > 1:
            # call kernel
            global_size = total // 2
            reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](
                A, R, global_size
            )
            total = total // 2

        result = A_copy.sum()
        max_abs_err = abs(result - R[0])
        assert max_abs_err < 1e-4
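# reduction_kernel is not defined in this section. A minimal sketch of a
# pairwise tree-reduction kernel consistent with the driver loop above,
# which passes the current half-size as the stride (hypothetical):
@dppy.kernel
def reduction_kernel(A, R, stride):
    i = dppy.get_global_id(0)
    # add the element one stride away to the element at i
    R[i] = A[i] + A[i + stride]
    # write the partial sum back into A for the next halving step
    A[i] = R[i]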
def test_dpctl_api(filter_str):
    device = dpctl.SyclDevice(filter_str)
    with dpctl.device_context(device):
        dpctl.lsplatform()
        dpctl.get_current_queue()
        dpctl.get_num_activated_queues()
        dpctl.is_in_device_context()
def test_dppy_func_ndarray(self):
    @dppy.func
    def g(a):
        return a + 1

    @dppy.kernel
    def f(a, b):
        i = dppy.get_global_id(0)
        b[i] = g(a[i])

    @dppy.kernel
    def h(a, b):
        i = dppy.get_global_id(0)
        b[i] = g(a[i]) + 1

    a = np.ones(self.N)
    b = np.ones(self.N)

    device = dpctl.SyclDevice("opencl:gpu")
    with dpctl.device_context(device):
        f[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b)
        assert np.all(b == 2)

        h[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b)
        assert np.all(b == 3)
def test_kernel_arg_types(filter_str, input_arrays):
    kernel = dppy.kernel(mul_kernel)
    a, actual, c = input_arrays
    expected = a * c
    device = dpctl.SyclDevice(filter_str)
    with dpctl.device_context(device):
        kernel[global_size, local_size](a, actual, c)
    np.testing.assert_allclose(actual, expected, rtol=1e-5, atol=0)
def test_valid_filter_selectors(valid_filter, check):
    """
    Tests if we can create a SyclDevice using a supported
    filter selector string.
    """
    device = None
    try:
        device = dpctl.SyclDevice(valid_filter)
    except ValueError:
        pytest.skip("Failed to create device with supported filter")
    check(device)
def test_hashing_of_device():
    """
    Test that a :class:`dpctl.SyclDevice` object can be used
    as a dictionary key.
    """
    device_dict = {dpctl.SyclDevice(): "default_device"}
    assert device_dict
def test_kernel_atomic_local(filter_str, input_arrays, return_list_of_op):
    a, dtype = input_arrays
    op_type, expected = return_list_of_op
    f = get_func_local(op_type, dtype)
    kernel = dppy.kernel(f)
    device = dpctl.SyclDevice(filter_str)
    with dpctl.device_context(device):
        kernel[global_size, global_size](a)
    assert a[0] == expected
def test_return(filter_str, sig):
    a = np.array(np.random.random(122), np.int32)

    with pytest.raises(TypeError):
        kernel = dppy.kernel(sig)(f)
        device = dpctl.SyclDevice(filter_str)
        with dpctl.device_context(device):
            kernel[a.size, dppy.DEFAULT_LOCAL_SIZE](a)
def test_context_can_be_used_in_queue2(valid_filter):
    try:
        d = dpctl.SyclDevice(valid_filter)
    except ValueError:
        pytest.skip()
    if d.default_selector_score < 0:
        # skip test for devices rejected by default selector
        pytest.skip()
    ctx = dpctl.SyclContext(d)
    dpctl.SyclQueue(ctx, d)
def test_filter_string_property():
    """
    Test that filter_string reconstructs the same device.
    """
    devices = dpctl.get_devices()
    for d in devices:
        if d.default_selector_score >= 0:
            dev_id = d.filter_string
            d_r = dpctl.SyclDevice(dev_id)
            assert d == d_r
def test_dpctl_api(filter_str):
    if skip_test(filter_str):
        pytest.skip()

    device = dpctl.SyclDevice(filter_str)
    with dppy.offload_to_sycl_device(device):
        dpctl.lsplatform()
        dpctl.get_current_queue()
        dpctl.get_num_activated_queues()
        dpctl.is_in_device_context()
def test_kernel_arg_accessor(filter_str, input_arrays, kernel):
    if skip_test(filter_str):
        pytest.skip()

    a, b, actual = input_arrays
    expected = a + b
    device = dpctl.SyclDevice(filter_str)
    with dppy.offload_to_sycl_device(device):
        call_kernel(global_size, local_size, a, b, actual, kernel)
    np.testing.assert_allclose(actual, expected, rtol=1e-5, atol=0)
def test_bool_type(filter_str):
    kernel = dppy.kernel(check_bool_kernel)
    a = np.array([2], np.int64)

    device = dpctl.SyclDevice(filter_str)
    with dpctl.device_context(device):
        kernel[a.size, dppy.DEFAULT_LOCAL_SIZE](a, True)
        assert a[0] == 111
        kernel[a.size, dppy.DEFAULT_LOCAL_SIZE](a, False)
        assert a[0] == 222
def test_kernel_atomic_simple(filter_str, input_arrays, kernel_result_pair):
    if atomic_skip_test(filter_str):
        pytest.skip()

    a, dtype = input_arrays
    kernel, expected = kernel_result_pair
    device = dpctl.SyclDevice(filter_str)
    with dppy.offload_to_sycl_device(device):
        kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](a)
    assert a[0] == expected
def test_kernel_atomic_multi_dim(
    filter_str, return_list_of_op, return_list_of_dim, return_dtype
):
    op_type, expected = return_list_of_op
    dim = return_list_of_dim
    kernel = get_kernel_multi_dim(op_type, len(dim))
    a = np.zeros(dim, return_dtype)
    device = dpctl.SyclDevice(filter_str)
    with dpctl.device_context(device):
        kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](a)
    assert a[0] == expected
def test_parfor_message(self):
    device = dpctl.SyclDevice("opencl:gpu")
    with dpctl.device_context(device):
        config.DEBUG = 1
        jitted = njit(prange_example)

        with captured_stdout() as got:
            jitted()

        config.DEBUG = 0
        assert "Parfor offloaded " in got.getvalue()
def test_kernel_arg_types(filter_str, input_arrays):
    if skip_test(filter_str):
        pytest.skip()

    kernel = dppy.kernel(mul_kernel)
    a, actual, c = input_arrays
    expected = a * c
    device = dpctl.SyclDevice(filter_str)
    with dppy.offload_to_sycl_device(device):
        kernel[global_size, local_size](a, actual, c)
    np.testing.assert_allclose(actual, expected, rtol=1e-5, atol=0)