def lower_parfor_rollback(lowerer, parfor): try: _lower_parfor_gufunc(lowerer, parfor) if config.DEBUG: device_filter_str = ( dpctl.get_current_queue().get_sycl_device().filter_string) msg = "Parfor offloaded to " + device_filter_str print(msg, parfor.loc) except Exception as e: device_filter_str = ( dpctl.get_current_queue().get_sycl_device().filter_string) msg = ( "Failed to offload parfor to " + device_filter_str + ". Falling " "back to default CPU parallelization. Please file a bug report " "at https://github.com/IntelPython/numba-dppy. To help us debug " "the issue, please add the traceback to the bug report.") if not config.DEBUG: msg += " Set the environment variable NUMBA_DPPY_DEBUG to 1 to " msg += "generate a traceback." warnings.warn(NumbaPerformanceWarning(msg, parfor.loc)) raise e
def test_dpctl_api(filter_str): device = dpctl.SyclDevice(filter_str) with dpctl.device_context(device): dpctl.lsplatform() dpctl.get_current_queue() dpctl.get_num_activated_queues() dpctl.is_in_device_context()
def test_dpctl_api(filter_str): if skip_test(filter_str): pytest.skip() device = dpctl.SyclDevice(filter_str) with dppy.offload_to_sycl_device(device): dpctl.lsplatform() dpctl.get_current_queue() dpctl.get_num_activated_queues() dpctl.is_in_device_context()
def test_create_program_from_source(self): oclSrc = " \ kernel void axpy(global int* a, global int* b, global int* c, int d) { \ size_t index = get_global_id(0); \ c[index] = d*a[index] + b[index]; \ }" with dpctl.device_context("opencl:gpu:0"): q = dpctl.get_current_queue() prog = dpctl_prog.create_program_from_source(q, oclSrc) axpyKernel = prog.get_sycl_kernel("axpy") abuf = dpctl_mem.MemoryUSMShared(1024 * np.dtype("i").itemsize) bbuf = dpctl_mem.MemoryUSMShared(1024 * np.dtype("i").itemsize) cbuf = dpctl_mem.MemoryUSMShared(1024 * np.dtype("i").itemsize) a = np.ndarray((1024), buffer=abuf, dtype="i") b = np.ndarray((1024), buffer=bbuf, dtype="i") c = np.ndarray((1024), buffer=cbuf, dtype="i") a[:] = np.arange(1024) b[:] = np.arange(1024, 0, -1) c[:] = 0 d = 2 args = [] args.append(a.base) args.append(b.base) args.append(c.base) args.append(ctypes.c_int(d)) r = [1024] q.submit(axpyKernel, args, r) self.assertTrue(np.allclose(c, a * d + b))
def test_caching_kernel_using_same_context(filter_str): """Test kernel caching for the scenario where different SYCL queues that share a SYCL context are used to submit a kernel. Args: filter_str: SYCL filter selector string """ global_size = 10 N = global_size def data_parallel_sum(a, b, c): i = dppy.get_global_id(0) c[i] = a[i] + b[i] a = np.array(np.random.random(N), dtype=np.float32) b = np.array(np.random.random(N), dtype=np.float32) c = np.ones_like(a) # Set the global queue to the default device so that the cached_kernel gets # created for that device dpctl.set_global_queue(filter_str) func = dppy.kernel(data_parallel_sum) default_queue = dpctl.get_current_queue() cached_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize( func._get_argtypes(a, b, c), default_queue) for i in range(0, 10): # Each iteration create a fresh queue that will share the same context with dpctl.device_context(filter_str) as gpu_queue: _kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize( func._get_argtypes(a, b, c), gpu_queue) assert _kernel == cached_kernel
def test_debug_info_locals_vars_on_no_opt(): """ Check llvm debug tag DILocalVariable is emitting to IR for all variables if debug parameter is set to True and optimization is O0 """ @dppy.kernel def foo(var_a, var_b, var_c): i = dppy.get_global_id(0) var_c[i] = var_a[i] + var_b[i] ir_tags = [ '!DILocalVariable(name: "var_a"', '!DILocalVariable(name: "var_b"', '!DILocalVariable(name: "var_c"', '!DILocalVariable(name: "i"', ] sycl_queue = dpctl.get_current_queue() sig = ( npytypes_array_to_dppy_array(types.float32[:]), npytypes_array_to_dppy_array(types.float32[:]), npytypes_array_to_dppy_array(types.float32[:]), ) with override_config("OPT", 0): kernel_ir = get_kernel_ir(sycl_queue, foo, sig, debug=True) for tag in ir_tags: assert tag in kernel_ir
def test_create_program_from_source(self): oclSrc = " \ kernel void add(global int* a, global int* b, global int* c) { \ size_t index = get_global_id(0); \ c[index] = a[index] + b[index]; \ } \ kernel void axpy(global int* a, global int* b, global int* c, int d) { \ size_t index = get_global_id(0); \ c[index] = a[index] + d*b[index]; \ }" with dpctl.device_context("opencl:gpu:0"): q = dpctl.get_current_queue() prog = dpctl_prog.create_program_from_source(q, oclSrc) self.assertIsNotNone(prog) self.assertTrue(prog.has_sycl_kernel("add")) self.assertTrue(prog.has_sycl_kernel("axpy")) addKernel = prog.get_sycl_kernel("add") axpyKernel = prog.get_sycl_kernel("axpy") self.assertEqual(addKernel.get_function_name(), "add") self.assertEqual(axpyKernel.get_function_name(), "axpy") self.assertEqual(addKernel.get_num_args(), 3) self.assertEqual(axpyKernel.get_num_args(), 4)
def test_env_var_generates_ir_with_debuginfo_for_func(debug_option): """ Check debug info is emitting to IR if NUMBA_DPPY_DEBUGINFO is set to 1 """ @dppy.func def func_sum(a, b): result = a + b return result @dppy.kernel def data_parallel_sum(a, b, c): i = dppy.get_global_id(0) c[i] = func_sum(a[i], b[i]) ir_tags = [ r'\!DISubprogram\(name: ".*func_sum"', r'\!DISubprogram\(name: ".*data_parallel_sum"', ] sycl_queue = dpctl.get_current_queue() sig = ( npytypes_array_to_dppy_array(types.float32[:]), npytypes_array_to_dppy_array(types.float32[:]), npytypes_array_to_dppy_array(types.float32[:]), ) with override_config("DEBUGINFO_DEFAULT", int(debug_option)): kernel_ir = get_kernel_ir(sycl_queue, data_parallel_sum, sig) for tag in ir_tags: assert debug_option == make_check(kernel_ir, tag)
def test_debug_flag_generates_ir_with_debuginfo_for_func(debug_option): """ Check debug info is emitting to IR if debug parameter is set to True """ @dppy.func(debug=debug_option) def func_sum(a, b): result = a + b return result @dppy.kernel(debug=debug_option) def data_parallel_sum(a, b, c): i = dppy.get_global_id(0) c[i] = func_sum(a[i], b[i]) ir_tags = [ r'\!DISubprogram\(name: ".*func_sum"', r'\!DISubprogram\(name: ".*data_parallel_sum"', ] sycl_queue = dpctl.get_current_queue() sig = ( npytypes_array_to_dppy_array(types.float32[:]), npytypes_array_to_dppy_array(types.float32[:]), npytypes_array_to_dppy_array(types.float32[:]), ) kernel_ir = get_kernel_ir(sycl_queue, data_parallel_sum, sig, debug=debug_option) for tag in ir_tags: assert debug_option == make_check(kernel_ir, tag)
def test_consuming_array_from_dpnp(offload_device, dtype): if not ensure_dpnp(): pytest.skip("No DPNP") import dpnp if skip_test(offload_device): pytest.skip("No device for " + offload_device) if ("opencl" not in dpctl.get_current_queue().sycl_device.filter_string and "opencl" in offload_device): pytest.skip("Bug in DPNP. See: IntelPython/dpnp#723") @dppy.kernel def data_parallel_sum(a, b, c): """ Vector addition using the ``kernel`` decorator. """ i = dppy.get_global_id(0) c[i] = a[i] + b[i] global_size = 1021 with dppy.offload_to_sycl_device(offload_device): a = dppy.asarray(dpnp.arange(global_size, dtype=dtype)) b = dppy.asarray(dpnp.arange(global_size, dtype=dtype)) c = dppy.asarray(dpnp.ones_like(a)) data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c)
def test_create_program_from_spirv(self): CURR_DIR = os.path.dirname(os.path.abspath(__file__)) spirv_file = os.path.join(CURR_DIR, "input_files/multi_kernel.spv") with open(spirv_file, "rb") as fin: spirv = fin.read() with dpctl.device_context("level_zero:gpu:0"): q = dpctl.get_current_queue() prog = dpctl_prog.create_program_from_spirv(q, spirv)
def test_current_device(check): """Test is the device for the current queue is valid.""" try: q = dpctl.get_current_queue() except Exception: pytest.fail("Encountered an exception inside get_current_queue().") device = q.get_sycl_device() check(device)
def is_gen12(device_type): with dpctl.device_context(device_type): q = dpctl.get_current_queue() device = q.get_sycl_device() name = device.name if "Gen12" in name: return True return False
def test_memory_gpu_context(self): mobj = self._create_memory() # GPU context with dpctl.device_context("opencl:gpu:0"): usm_type = mobj._usm_type() self.assertEqual(usm_type, "shared") current_queue = dpctl.get_current_queue() usm_type = mobj._usm_type(current_queue) self.assertTrue(usm_type in ["unknown", "shared"])
def test_get_max_work_item_sizes(self): try: q = dpctl.get_current_queue() except Exception: self.fail("Encountered an exception inside get_current_queue().") try: max_work_item_sizes = q.get_sycl_device().get_max_work_item_sizes() except Exception: self.fail( "Encountered an exception inside get_max_work_item_sizes().") self.assertNotEqual(max_work_item_sizes, (None, None, None))
def test_get_max_num_sub_groups(self): try: q = dpctl.get_current_queue() except Exception: self.fail("Encountered an exception inside get_current_queue().") try: max_num_sub_groups = q.get_sycl_device().get_max_num_sub_groups() except Exception: self.fail( "Encountered an exception inside get_max_num_sub_groups().") self.assertTrue(max_num_sub_groups > 0)
def test_get_max_work_group_size(self): try: q = dpctl.get_current_queue() except Exception: self.fail("Encountered an exception inside get_current_queue().") try: max_work_group_size = q.get_sycl_device().get_max_work_group_size() except Exception: self.fail( "Encountered an exception inside get_max_work_group_size().") self.assertTrue(max_work_group_size > 0)
def _wrapped(pyfunc): current_queue = dpctl.get_current_queue() ordered_arg_access_types = get_ordered_arg_access_types( pyfunc, access_types ) # We create an instance of JitDPPYKernel to make sure at call time # we are going through the caching mechanism. dppy_kernel = JitDPPYKernel(pyfunc, debug, ordered_arg_access_types) # This will make sure we are compiling eagerly. dppy_kernel.specialize(argtypes, current_queue) return dppy_kernel
def test_has_int64_base_atomics(self): try: q = dpctl.get_current_queue() except Exception: self.fail("Encountered an exception inside get_current_queue().") try: aspects_base_atomics = q.get_sycl_device().has_int64_base_atomics() except Exception: self.fail( "Encountered an exception inside has_int64_base_atomics().") self.assertNotEqual(aspects_base_atomics, False)
def __call__(self, *args, **kwargs): assert not kwargs, "Keyword Arguments are not supported" try: current_queue = dpctl.get_current_queue() except: _raise_no_device_found_error() argtypes = self._get_argtypes(*args) kernel = self.specialize(argtypes, current_queue) cfg = kernel.configure(self.sycl_queue, self.global_size, self.local_size) cfg(*args)
def test_current_device(check): """ Test is the device for the current queue is valid. """ try: q = dpctl.get_current_queue() except Exception: pytest.fail("Encountered an exception inside get_current_queue().") ctx = q.get_sycl_context() devs = ctx.get_devices() # add check that device is among devs check(devs[0])
def test_memcpy_copy_usm_to_usm(self): mobj1 = self._create_memory() mobj2 = self._create_memory() q = dpctl.get_current_queue() mv1 = memoryview(mobj1) mv2 = memoryview(mobj2) mv1[:3] = b"123" q.memcpy(mobj2, mobj1, 3) self.assertEqual(mv2[:3], b"123")
def test_create_program_from_source(self): oclSrc = " \ kernel void add(global int* a, global int* b, global int* c) { \ size_t index = get_global_id(0); \ c[index] = a[index] + b[index]; \ } \ kernel void axpy(global int* a, global int* b, global int* c, int d) { \ size_t index = get_global_id(0); \ c[index] = a[index] + d*b[index]; \ }" with dpctl.device_context("level0:gpu:0"): q = dpctl.get_current_queue() prog = dpctl_prog.create_program_from_source(q, oclSrc)
def test_dpnp_create_array_in_context(offload_device, dtype): if not ensure_dpnp(): pytest.skip("No DPNP") import dpnp if skip_test(offload_device): pytest.skip("No device for " + offload_device) if ("opencl" not in dpctl.get_current_queue().sycl_device.filter_string and "opencl" in offload_device): pytest.skip("Bug in DPNP. See: IntelPython/dpnp#723") with dpctl.device_context(offload_device): a = dpnp.arange(1024, dtype=dtype) # noqa
def test_memcpy_type_error(self): mobj = self._create_memory() q = dpctl.get_current_queue() with self.assertRaises(TypeError) as cm: q.memcpy(None, mobj, 3) self.assertEqual(type(cm.exception), TypeError) self.assertEqual(str(cm.exception), "Parameter dest should be Memory.") with self.assertRaises(TypeError) as cm: q.memcpy(mobj, None, 3) self.assertEqual(type(cm.exception), TypeError) self.assertEqual(str(cm.exception), "Parameter src should be Memory.")
def test_memory_cpu_context(self): mobj = self._create_memory() # CPU context with dpctl.device_context("opencl:cpu:0"): # type respective to the context in which # memory was created usm_type = mobj._usm_type() self.assertEqual(usm_type, "shared") current_queue = dpctl.get_current_queue() # type as view from current queue usm_type = mobj._usm_type(current_queue) # type can be unknown if current queue is # not in the same SYCL context self.assertTrue(usm_type in ["unknown", "shared"])
def atomic_sub_tuple(context, builder, sig, args): device_type = dpctl.get_current_queue().sycl_device.device_type dtype = sig.args[0].dtype if dtype == types.float32 or dtype == types.float64: if (device_type == dpctl.device_type.gpu and config.NATIVE_FP_ATOMICS == 1): return atomic_sub_wrapper(context, builder, sig, args) else: # Currently, DPCPP only supports native floating point # atomics for GPUs. return atomic_add(context, builder, sig, args, "sub") elif dtype == types.int32 or dtype == types.int64: return atomic_sub_wrapper(context, builder, sig, args) else: raise TypeError("Atomic operation on unsupported type %s" % dtype)
def __call__(self, *args, **kwargs): assert not kwargs, "Keyword Arguments are not supported" argtypes = self._get_argtypes(*args) compute_queue = None # Get the array type and whether all array are of same type or not array_type, uniform = self._datatype_is_same(argtypes) if not uniform: _raise_datatype_mixed_error(argtypes) if type(array_type) == USMNdArrayType: if dpctl.is_in_device_context(): warnings.warn(cfd_ctx_mgr_wrng_msg) queues = [] for i, argtype in enumerate(argtypes): if type(argtype) == USMNdArrayType: memory = dpctl.memory.as_usm_memory(args[i]) if dpctl_version < (0, 12): queue = memory._queue else: queue = memory.sycl_queue queues.append(queue) # dpctl.utils.get_exeuction_queue() checks if the queues passed are equivalent and returns a # SYCL queue if they are equivalent and None if they are not. compute_queue = dpctl.utils.get_execution_queue(queues) if compute_queue is None: raise IndeterminateExecutionQueueError( "Data passed as argument are not equivalent. Please " "create dpctl.tensor.usm_ndarray with equivalent SYCL queue." ) if compute_queue is None: try: compute_queue = dpctl.get_current_queue() except: _raise_no_device_found_error() kernel = self.specialize(argtypes, compute_queue) cfg = kernel.configure( kernel.sycl_queue, self.global_size, self.local_size ) cfg(*args)
def __getitem__(self, args): """Mimick CUDA python's square-bracket notation for configuration. This assumes the argument to be: `global size, local size` """ ls = None nargs = len(args) # Check if the kernel enquing arguments are sane if nargs < 1 or nargs > 2: _raise_invalid_kernel_enqueue_args sycl_queue = dpctl.get_current_queue() gs = _ensure_valid_work_item_grid(args[0], sycl_queue) # If the optional local size argument is provided if nargs == 2 and args[1] != []: ls = _ensure_valid_work_group_size(args[1], gs) return self.configure(sycl_queue, gs, ls)
def test_debug_flag_generates_ir_with_debuginfo(debug_option): """ Check debug info is emitting to IR if debug parameter is set to True """ @dppy.kernel def foo(x): x = 1 # noqa sycl_queue = dpctl.get_current_queue() sig = (types.int32, ) kernel_ir = get_kernel_ir(sycl_queue, foo, sig, debug=debug_option) tag = "!dbg" if debug_option: assert tag in kernel_ir else: assert tag not in kernel_ir