def test_local_memory(filter_str): if skip_test(filter_str): pytest.skip() blocksize = 10 if skip_if_win(): pytest.skip() @dppy.kernel("void(float32[::1])") def reverse_array(A): lm = dppy.local.array(shape=10, dtype=np.float32) i = dppy.get_global_id(0) # preload lm[i] = A[i] # barrier local or global will both work as we only have one work group dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # local mem fence # write A[i] += lm[blocksize - 1 - i] arr = np.arange(blocksize).astype(np.float32) orig = arr.copy() with dppy.offload_to_sycl_device(filter_str): reverse_array[blocksize, blocksize](arr) expected = orig[::-1] + orig np.testing.assert_allclose(expected, arr)
def test_caching_kernel_using_same_context(filter_str): """Test kernel caching for the scenario where different SYCL queues that share a SYCL context are used to submit a kernel. Args: filter_str: SYCL filter selector string """ if skip_test(filter_str): pytest.skip() global_size = 10 N = global_size def data_parallel_sum(a, b, c): i = dppy.get_global_id(0) c[i] = a[i] + b[i] a = np.array(np.random.random(N), dtype=np.float32) b = np.array(np.random.random(N), dtype=np.float32) c = np.ones_like(a) # Set the global queue to the default device so that the cached_kernel gets # created for that device dpctl.set_global_queue(filter_str) func = dppy.kernel(data_parallel_sum) default_queue = dpctl.get_current_queue() cached_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize( func._get_argtypes(a, b, c), default_queue) for i in range(0, 10): # Each iteration create a fresh queue that will share the same context with dppy.offload_to_sycl_device(filter_str) as gpu_queue: _kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize( func._get_argtypes(a, b, c), gpu_queue) assert _kernel == cached_kernel
def test_consuming_array_from_dpnp(offload_device, dtype): if not ensure_dpnp(): pytest.skip("No DPNP") import dpnp if skip_test(offload_device): pytest.skip("No device for " + offload_device) if ("opencl" not in dpctl.get_current_queue().sycl_device.filter_string and "opencl" in offload_device): pytest.skip("Bug in DPNP. See: IntelPython/dpnp#723") @dppy.kernel def data_parallel_sum(a, b, c): """ Vector addition using the ``kernel`` decorator. """ i = dppy.get_global_id(0) c[i] = a[i] + b[i] global_size = 1021 with dppy.offload_to_sycl_device(offload_device): a = dppy.asarray(dpnp.arange(global_size, dtype=dtype)) b = dppy.asarray(dpnp.arange(global_size, dtype=dtype)) c = dppy.asarray(dpnp.ones_like(a)) data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c)
def test_caching_kernel_using_same_queue(filter_str): """Test kernel caching when the same queue is used to submit a kernel multiple times. Args: filter_str: SYCL filter selector string """ if skip_test(filter_str): pytest.skip() global_size = 10 N = global_size def data_parallel_sum(a, b, c): i = dppy.get_global_id(0) c[i] = a[i] + b[i] a = np.array(np.random.random(N), dtype=np.float32) b = np.array(np.random.random(N), dtype=np.float32) c = np.ones_like(a) with dppy.offload_to_sycl_device(filter_str) as gpu_queue: func = dppy.kernel(data_parallel_sum) cached_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize( func._get_argtypes(a, b, c), gpu_queue) for i in range(10): _kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize( func._get_argtypes(a, b, c), gpu_queue) assert _kernel == cached_kernel
def test_trigonometric_fn(filter_str, trig_op, input_arrays): if skip_test(filter_str): pytest.skip() # FIXME: Why does archcosh fail on Gen12 discrete graphics card? if trig_op == "arccosh" and is_gen12(filter_str): pytest.skip() a, b = input_arrays trig_fn = getattr(np, trig_op) actual = np.empty(shape=a.shape, dtype=a.dtype) expected = np.empty(shape=a.shape, dtype=a.dtype) if trig_op == "arctan2": @njit def f(a, b): return trig_fn(a, b) device = dpctl.SyclDevice(filter_str) with dppy.offload_to_sycl_device(device), assert_auto_offloading(): actual = f(a, b) expected = trig_fn(a, b) else: @njit def f(a): return trig_fn(a) device = dpctl.SyclDevice(filter_str) with dppy.offload_to_sycl_device(device), assert_auto_offloading(): actual = f(a) expected = trig_fn(a) np.testing.assert_allclose(actual, expected, rtol=1e-5, atol=0)
def test_proper_lowering(filter_str): if skip_test(filter_str): pytest.skip() # We perform eager compilation at the site of # @dppy.kernel. This takes the default dpctl # queue which is level_zero backed. Level_zero # is not yet supported on Windows platform and # hence we skip these tests if the platform is # Windows regardless of which backend filter_str # specifies. if skip_if_win(): pytest.skip() # This will trigger eager compilation @dppy.kernel("void(float32[::1])") def twice(A): i = dppy.get_global_id(0) d = A[i] dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # local mem fence A[i] = d * 2 N = 256 arr = np.random.random(N).astype(np.float32) orig = arr.copy() with dppy.offload_to_sycl_device(filter_str): twice[N, N // 2](arr) # The computation is correct? np.testing.assert_allclose(orig * 2, arr)
def test_usm_ndarray_type(offload_device, dtype, usm_type): if skip_test(offload_device): pytest.skip() a = np.array(np.random.random(10), dtype) da = dpt.usm_ndarray(a.shape, dtype=a.dtype, buffer=usm_type) assert isinstance(typeof(da), USMNdArrayType) assert da.usm_type == usm_type
def test_kernel_arg_accessor(filter_str, input_arrays, kernel): if skip_test(filter_str): pytest.skip() a, b, actual = input_arrays expected = a + b device = dpctl.SyclDevice(filter_str) with dppy.offload_to_sycl_device(device): call_kernel(global_size, local_size, a, b, actual, kernel) np.testing.assert_allclose(actual, expected, rtol=1e-5, atol=0)
def test_dpctl_api(filter_str): if skip_test(filter_str): pytest.skip() device = dpctl.SyclDevice(filter_str) with dppy.offload_to_sycl_device(device): dpctl.lsplatform() dpctl.get_current_queue() dpctl.get_num_activated_queues() dpctl.is_in_device_context()
def atomic_skip_test(device_type): skip = False if skip_test(device_type): skip = True if not skip: if not dppy.ocl.atomic_support_present(): skip = True return skip
def dpnp_skip_test(device_type): skip = False if skip_test(device_type): skip = True if not skip: if not ensure_dpnp(): skip = True return skip
def test_kernel_arg_types(filter_str, input_arrays): if skip_test(filter_str): pytest.skip() kernel = dppy.kernel(mul_kernel) a, actual, c = input_arrays expected = a * c device = dpctl.SyclDevice(filter_str) with dppy.offload_to_sycl_device(device): kernel[global_size, local_size](a, actual, c) np.testing.assert_allclose(actual, expected, rtol=1e-5, atol=0)
def test_return(offload_device, sig): if skip_test(offload_device): pytest.skip() a = np.array(np.random.random(122), np.int32) with pytest.raises(TypeError): kernel = dppy.kernel(sig)(f) device = dpctl.SyclDevice(offload_device) with dppy.offload_to_sycl_device(device): kernel[a.size, dppy.DEFAULT_LOCAL_SIZE](a)
def test_bool_type(filter_str): if skip_test(filter_str): pytest.skip() kernel = dppy.kernel(check_bool_kernel) a = np.array([2], np.int64) device = dpctl.SyclDevice(filter_str) with dppy.offload_to_sycl_device(device): kernel[a.size, dppy.DEFAULT_LOCAL_SIZE](a, True) assert a[0] == 111 kernel[a.size, dppy.DEFAULT_LOCAL_SIZE](a, False) assert a[0] == 222
def test_consuming_usm_ndarray(offload_device, dtype, usm_type): if skip_test(offload_device): pytest.skip() @dppy.kernel def data_parallel_sum(a, b, c): """ Vector addition using the ``kernel`` decorator. """ i = dppy.get_global_id(0) c[i] = a[i] + b[i] global_size = 1021 N = global_size a = np.array(np.random.random(N), dtype=dtype) b = np.array(np.random.random(N), dtype=dtype) got = np.ones_like(a) with dppy.offload_to_sycl_device(offload_device) as gpu_queue: da = dpt.usm_ndarray( a.shape, dtype=a.dtype, buffer=usm_type, buffer_ctor_kwargs={"queue": gpu_queue}, ) da.usm_data.copy_from_host(a.reshape((-1)).view("|u1")) db = dpt.usm_ndarray( b.shape, dtype=b.dtype, buffer=usm_type, buffer_ctor_kwargs={"queue": gpu_queue}, ) db.usm_data.copy_from_host(b.reshape((-1)).view("|u1")) dc = dpt.usm_ndarray( got.shape, dtype=got.dtype, buffer=usm_type, buffer_ctor_kwargs={"queue": gpu_queue}, ) data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](da, db, dc) dc.usm_data.copy_to_host(got.reshape((-1)).view("|u1")) expected = a + b assert np.array_equal(got, expected)
def test_dpnp_create_array_in_context(offload_device, dtype): if not ensure_dpnp(): pytest.skip("No DPNP") import dpnp if skip_test(offload_device): pytest.skip("No device for " + offload_device) if ("opencl" not in dpctl.get_current_queue().sycl_device.filter_string and "opencl" in offload_device): pytest.skip("Bug in DPNP. See: IntelPython/dpnp#723") with dpctl.device_context(offload_device): a = dpnp.arange(1024, dtype=dtype) # noqa
def test_strided_array_kernel(offload_device): if skip_test(offload_device): pytest.skip() global_size = 606 a = np.arange(global_size * 2, dtype="i4")[::2] b = np.arange(global_size, dtype="i4")[::-1] got = np.zeros(global_size, dtype="i4") expected = a + b with dpctl.device_context(offload_device): sum[global_size, numba_dppy.DEFAULT_LOCAL_SIZE](a, b, got) assert np.array_equal(expected, got)
def test_print(filter_str, input_arrays, capfd): if skip_test(filter_str): pytest.skip() @dppy.kernel def f(a): print("test", a[0]) a = input_arrays global_size = 3 device = dpctl.SyclDevice(filter_str) with dppy.offload_to_sycl_device(device): f[global_size, dppy.DEFAULT_LOCAL_SIZE](a) captured = capfd.readouterr() assert "test" in captured.out
def test_as_usm_obj(offload_device): if skip_test(offload_device): pytest.skip() a = np.ones(1023, dtype=np.float32) b = a * 3 with dpctl.device_context(offload_device) as queue: a_copy = np.empty_like(a) usm_mem = as_usm_obj(a, queue=queue) copy_to_numpy_from_usm_obj(usm_mem, a_copy) assert np.all(a == a_copy) b_copy = np.empty_like(b) usm_mem = as_usm_obj(b, queue=queue, copy=False) copy_to_numpy_from_usm_obj(usm_mem, b_copy) assert np.any(np.not_equal(b, b_copy))
def test_unary_ops(filter_str, unary_op, input_arrays): if skip_test(filter_str): pytest.skip() a = input_arrays[0] uop = getattr(np, unary_op) actual = np.empty(shape=a.shape, dtype=a.dtype) expected = np.empty(shape=a.shape, dtype=a.dtype) @njit def f(a): return uop(a) device = dpctl.SyclDevice(filter_str) with dppy.offload_to_sycl_device(device), assert_auto_offloading(): actual = f(a) expected = uop(a) np.testing.assert_allclose(actual, expected, rtol=1e-5, atol=0)
def test_binary_ops(filter_str, unary_op, input_arrays): if skip_test(filter_str): pytest.skip() a, actual = input_arrays uop = getattr(math, unary_op) np_uop = getattr(np, unary_op) @dppy.kernel def f(a, b): i = dppy.get_global_id(0) b[i] = uop(a[i]) device = dpctl.SyclDevice(filter_str) with dppy.offload_to_sycl_device(device): f[a.size, dppy.DEFAULT_LOCAL_SIZE](a, actual) expected = np_uop(a) np.testing.assert_allclose(actual, expected, rtol=1e-5, atol=0)
def test_has_usm_memory(offload_device): if skip_test(offload_device): pytest.skip() a = np.ones(1023, dtype=np.float32) with dpctl.device_context(offload_device): # test usm_ndarray da = dpt.usm_ndarray(a.shape, dtype=a.dtype, buffer="shared") usm_mem = has_usm_memory(da) assert da.usm_data._pointer == usm_mem._pointer # test usm allocated numpy.ndarray buf = dpctl_mem.MemoryUSMShared(a.size * a.dtype.itemsize) ary_buf = np.ndarray(a.shape, buffer=buf, dtype=a.dtype) usm_mem = has_usm_memory(ary_buf) assert buf._pointer == usm_mem._pointer usm_mem = has_usm_memory(a) assert usm_mem is None
def test_unary_ops(filter_str, unary_op, input_arrays): if skip_test(filter_str): pytest.skip() # FIXME: Why does sign fail on Gen12 discrete graphics card? skip_ops = ["sign", "log", "log2", "log10", "expm1"] if unary_op in skip_ops and is_gen12(filter_str): pytest.skip() a = input_arrays[0] uop = getattr(np, unary_op) actual = np.empty(shape=a.shape, dtype=a.dtype) expected = np.empty(shape=a.shape, dtype=a.dtype) @njit def f(a): return uop(a) device = dpctl.SyclDevice(filter_str) with dppy.offload_to_sycl_device(device), assert_auto_offloading(): actual = f(a) expected = uop(a) np.testing.assert_allclose(actual, expected, rtol=1e-5, atol=0)
def test_no_arg_barrier_support(filter_str): if skip_test(filter_str): pytest.skip() if skip_if_win(): pytest.skip() @dppy.kernel("void(float32[::1])") def twice(A): i = dppy.get_global_id(0) d = A[i] # no argument defaults to global mem fence dppy.barrier() A[i] = d * 2 N = 256 arr = np.random.random(N).astype(np.float32) orig = arr.copy() with dppy.offload_to_sycl_device(filter_str): twice[N, dppy.DEFAULT_LOCAL_SIZE](arr) # The computation is correct? np.testing.assert_allclose(orig * 2, arr)