예제 #1
0
def lower_parfor_rollback(lowerer, parfor):
    try:
        _lower_parfor_gufunc(lowerer, parfor)
        if config.DEBUG:

            device_filter_str = (
                dpctl.get_current_queue().get_sycl_device().filter_string)

            msg = "Parfor offloaded to " + device_filter_str
            print(msg, parfor.loc)
    except Exception as e:

        device_filter_str = (
            dpctl.get_current_queue().get_sycl_device().filter_string)
        msg = (
            "Failed to offload parfor to " + device_filter_str + ". Falling "
            "back to default CPU parallelization. Please file a bug report "
            "at https://github.com/IntelPython/numba-dppy. To help us debug "
            "the issue, please add the traceback to the bug report.")
        if not config.DEBUG:
            msg += " Set the environment variable NUMBA_DPPY_DEBUG to 1 to "
            msg += "generate a traceback."

        warnings.warn(NumbaPerformanceWarning(msg, parfor.loc))
        raise e
예제 #2
0
def test_dpctl_api(filter_str):
    device = dpctl.SyclDevice(filter_str)
    with dpctl.device_context(device):
        dpctl.lsplatform()
        dpctl.get_current_queue()
        dpctl.get_num_activated_queues()
        dpctl.is_in_device_context()
예제 #3
0
def test_dpctl_api(filter_str):
    if skip_test(filter_str):
        pytest.skip()

    device = dpctl.SyclDevice(filter_str)
    with dppy.offload_to_sycl_device(device):
        dpctl.lsplatform()
        dpctl.get_current_queue()
        dpctl.get_num_activated_queues()
        dpctl.is_in_device_context()
    def test_create_program_from_source(self):
        oclSrc = "                                                             \
        kernel void axpy(global int* a, global int* b, global int* c, int d) { \
            size_t index = get_global_id(0);                                   \
            c[index] = d*a[index] + b[index];                                  \
        }"

        with dpctl.device_context("opencl:gpu:0"):
            q = dpctl.get_current_queue()
            prog = dpctl_prog.create_program_from_source(q, oclSrc)
            axpyKernel = prog.get_sycl_kernel("axpy")

            abuf = dpctl_mem.MemoryUSMShared(1024 * np.dtype("i").itemsize)
            bbuf = dpctl_mem.MemoryUSMShared(1024 * np.dtype("i").itemsize)
            cbuf = dpctl_mem.MemoryUSMShared(1024 * np.dtype("i").itemsize)
            a = np.ndarray((1024), buffer=abuf, dtype="i")
            b = np.ndarray((1024), buffer=bbuf, dtype="i")
            c = np.ndarray((1024), buffer=cbuf, dtype="i")
            a[:] = np.arange(1024)
            b[:] = np.arange(1024, 0, -1)
            c[:] = 0
            d = 2
            args = []

            args.append(a.base)
            args.append(b.base)
            args.append(c.base)
            args.append(ctypes.c_int(d))

            r = [1024]

            q.submit(axpyKernel, args, r)
            self.assertTrue(np.allclose(c, a * d + b))
예제 #5
0
def test_caching_kernel_using_same_context(filter_str):
    """Test kernel caching for the scenario where different SYCL queues that
    share a SYCL context are used to submit a kernel.

    Args:
        filter_str: SYCL filter selector string
    """
    global_size = 10
    N = global_size

    def data_parallel_sum(a, b, c):
        i = dppy.get_global_id(0)
        c[i] = a[i] + b[i]

    a = np.array(np.random.random(N), dtype=np.float32)
    b = np.array(np.random.random(N), dtype=np.float32)
    c = np.ones_like(a)

    # Set the global queue to the default device so that the cached_kernel gets
    # created for that device
    dpctl.set_global_queue(filter_str)
    func = dppy.kernel(data_parallel_sum)
    default_queue = dpctl.get_current_queue()
    cached_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(
        func._get_argtypes(a, b, c), default_queue)
    for i in range(0, 10):
        # Each iteration create a fresh queue that will share the same context
        with dpctl.device_context(filter_str) as gpu_queue:
            _kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(
                func._get_argtypes(a, b, c), gpu_queue)
            assert _kernel == cached_kernel
예제 #6
0
def test_debug_info_locals_vars_on_no_opt():
    """
    Check llvm debug tag DILocalVariable is emitting to IR for all variables if debug parameter is set to True
    and optimization is O0
    """
    @dppy.kernel
    def foo(var_a, var_b, var_c):
        i = dppy.get_global_id(0)
        var_c[i] = var_a[i] + var_b[i]

    ir_tags = [
        '!DILocalVariable(name: "var_a"',
        '!DILocalVariable(name: "var_b"',
        '!DILocalVariable(name: "var_c"',
        '!DILocalVariable(name: "i"',
    ]

    sycl_queue = dpctl.get_current_queue()
    sig = (
        npytypes_array_to_dppy_array(types.float32[:]),
        npytypes_array_to_dppy_array(types.float32[:]),
        npytypes_array_to_dppy_array(types.float32[:]),
    )

    with override_config("OPT", 0):
        kernel_ir = get_kernel_ir(sycl_queue, foo, sig, debug=True)

    for tag in ir_tags:
        assert tag in kernel_ir
예제 #7
0
    def test_create_program_from_source(self):
        oclSrc = "                                                             \
        kernel void add(global int* a, global int* b, global int* c) {         \
            size_t index = get_global_id(0);                                   \
            c[index] = a[index] + b[index];                                    \
        }                                                                      \
        kernel void axpy(global int* a, global int* b, global int* c, int d) { \
            size_t index = get_global_id(0);                                   \
            c[index] = a[index] + d*b[index];                                  \
        }"
        with dpctl.device_context("opencl:gpu:0"):
            q = dpctl.get_current_queue()
            prog = dpctl_prog.create_program_from_source(q, oclSrc)
            self.assertIsNotNone(prog)

            self.assertTrue(prog.has_sycl_kernel("add"))
            self.assertTrue(prog.has_sycl_kernel("axpy"))

            addKernel = prog.get_sycl_kernel("add")
            axpyKernel = prog.get_sycl_kernel("axpy")

            self.assertEqual(addKernel.get_function_name(), "add")
            self.assertEqual(axpyKernel.get_function_name(), "axpy")
            self.assertEqual(addKernel.get_num_args(), 3)
            self.assertEqual(axpyKernel.get_num_args(), 4)
예제 #8
0
def test_env_var_generates_ir_with_debuginfo_for_func(debug_option):
    """
    Check debug info is emitting to IR if NUMBA_DPPY_DEBUGINFO is set to 1
    """
    @dppy.func
    def func_sum(a, b):
        result = a + b
        return result

    @dppy.kernel
    def data_parallel_sum(a, b, c):
        i = dppy.get_global_id(0)
        c[i] = func_sum(a[i], b[i])

    ir_tags = [
        r'\!DISubprogram\(name: ".*func_sum"',
        r'\!DISubprogram\(name: ".*data_parallel_sum"',
    ]

    sycl_queue = dpctl.get_current_queue()
    sig = (
        npytypes_array_to_dppy_array(types.float32[:]),
        npytypes_array_to_dppy_array(types.float32[:]),
        npytypes_array_to_dppy_array(types.float32[:]),
    )

    with override_config("DEBUGINFO_DEFAULT", int(debug_option)):
        kernel_ir = get_kernel_ir(sycl_queue, data_parallel_sum, sig)

    for tag in ir_tags:
        assert debug_option == make_check(kernel_ir, tag)
예제 #9
0
def test_debug_flag_generates_ir_with_debuginfo_for_func(debug_option):
    """
    Check debug info is emitting to IR if debug parameter is set to True
    """
    @dppy.func(debug=debug_option)
    def func_sum(a, b):
        result = a + b
        return result

    @dppy.kernel(debug=debug_option)
    def data_parallel_sum(a, b, c):
        i = dppy.get_global_id(0)
        c[i] = func_sum(a[i], b[i])

    ir_tags = [
        r'\!DISubprogram\(name: ".*func_sum"',
        r'\!DISubprogram\(name: ".*data_parallel_sum"',
    ]

    sycl_queue = dpctl.get_current_queue()
    sig = (
        npytypes_array_to_dppy_array(types.float32[:]),
        npytypes_array_to_dppy_array(types.float32[:]),
        npytypes_array_to_dppy_array(types.float32[:]),
    )

    kernel_ir = get_kernel_ir(sycl_queue,
                              data_parallel_sum,
                              sig,
                              debug=debug_option)

    for tag in ir_tags:
        assert debug_option == make_check(kernel_ir, tag)
예제 #10
0
def test_consuming_array_from_dpnp(offload_device, dtype):
    if not ensure_dpnp():
        pytest.skip("No DPNP")

    import dpnp

    if skip_test(offload_device):
        pytest.skip("No device for " + offload_device)

    if ("opencl" not in dpctl.get_current_queue().sycl_device.filter_string
            and "opencl" in offload_device):
        pytest.skip("Bug in DPNP. See: IntelPython/dpnp#723")

    @dppy.kernel
    def data_parallel_sum(a, b, c):
        """
        Vector addition using the ``kernel`` decorator.
        """
        i = dppy.get_global_id(0)
        c[i] = a[i] + b[i]

    global_size = 1021

    with dppy.offload_to_sycl_device(offload_device):
        a = dppy.asarray(dpnp.arange(global_size, dtype=dtype))
        b = dppy.asarray(dpnp.arange(global_size, dtype=dtype))
        c = dppy.asarray(dpnp.ones_like(a))

        data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c)
예제 #11
0
 def test_create_program_from_spirv(self):
     CURR_DIR = os.path.dirname(os.path.abspath(__file__))
     spirv_file = os.path.join(CURR_DIR, "input_files/multi_kernel.spv")
     with open(spirv_file, "rb") as fin:
         spirv = fin.read()
         with dpctl.device_context("level_zero:gpu:0"):
             q = dpctl.get_current_queue()
             prog = dpctl_prog.create_program_from_spirv(q, spirv)
예제 #12
0
def test_current_device(check):
    """Test is the device for the current queue is valid."""
    try:
        q = dpctl.get_current_queue()
    except Exception:
        pytest.fail("Encountered an exception inside get_current_queue().")
    device = q.get_sycl_device()
    check(device)
예제 #13
0
def is_gen12(device_type):
    with dpctl.device_context(device_type):
        q = dpctl.get_current_queue()
        device = q.get_sycl_device()
        name = device.name
        if "Gen12" in name:
            return True

        return False
예제 #14
0
    def test_memory_gpu_context(self):
        mobj = self._create_memory()

        # GPU context
        with dpctl.device_context("opencl:gpu:0"):
            usm_type = mobj._usm_type()
            self.assertEqual(usm_type, "shared")
            current_queue = dpctl.get_current_queue()
            usm_type = mobj._usm_type(current_queue)
            self.assertTrue(usm_type in ["unknown", "shared"])
예제 #15
0
 def test_get_max_work_item_sizes(self):
     try:
         q = dpctl.get_current_queue()
     except Exception:
         self.fail("Encountered an exception inside get_current_queue().")
     try:
         max_work_item_sizes = q.get_sycl_device().get_max_work_item_sizes()
     except Exception:
         self.fail(
             "Encountered an exception inside get_max_work_item_sizes().")
     self.assertNotEqual(max_work_item_sizes, (None, None, None))
예제 #16
0
 def test_get_max_num_sub_groups(self):
     try:
         q = dpctl.get_current_queue()
     except Exception:
         self.fail("Encountered an exception inside get_current_queue().")
     try:
         max_num_sub_groups = q.get_sycl_device().get_max_num_sub_groups()
     except Exception:
         self.fail(
             "Encountered an exception inside get_max_num_sub_groups().")
     self.assertTrue(max_num_sub_groups > 0)
예제 #17
0
 def test_get_max_work_group_size(self):
     try:
         q = dpctl.get_current_queue()
     except Exception:
         self.fail("Encountered an exception inside get_current_queue().")
     try:
         max_work_group_size = q.get_sycl_device().get_max_work_group_size()
     except Exception:
         self.fail(
             "Encountered an exception inside get_max_work_group_size().")
     self.assertTrue(max_work_group_size > 0)
예제 #18
0
 def _wrapped(pyfunc):
     current_queue = dpctl.get_current_queue()
     ordered_arg_access_types = get_ordered_arg_access_types(
         pyfunc, access_types
     )
     # We create an instance of JitDPPYKernel to make sure at call time
     # we are going through the caching mechanism.
     dppy_kernel = JitDPPYKernel(pyfunc, debug, ordered_arg_access_types)
     # This will make sure we are compiling eagerly.
     dppy_kernel.specialize(argtypes, current_queue)
     return dppy_kernel
예제 #19
0
 def test_has_int64_base_atomics(self):
     try:
         q = dpctl.get_current_queue()
     except Exception:
         self.fail("Encountered an exception inside get_current_queue().")
     try:
         aspects_base_atomics = q.get_sycl_device().has_int64_base_atomics()
     except Exception:
         self.fail(
             "Encountered an exception inside has_int64_base_atomics().")
     self.assertNotEqual(aspects_base_atomics, False)
예제 #20
0
    def __call__(self, *args, **kwargs):
        assert not kwargs, "Keyword Arguments are not supported"
        try:
            current_queue = dpctl.get_current_queue()
        except:
            _raise_no_device_found_error()

        argtypes = self._get_argtypes(*args)
        kernel = self.specialize(argtypes, current_queue)
        cfg = kernel.configure(self.sycl_queue, self.global_size,
                               self.local_size)
        cfg(*args)
예제 #21
0
def test_current_device(check):
    """
    Test is the device for the current queue is valid.
    """
    try:
        q = dpctl.get_current_queue()
    except Exception:
        pytest.fail("Encountered an exception inside get_current_queue().")
    ctx = q.get_sycl_context()
    devs = ctx.get_devices()
    # add check that device is among devs
    check(devs[0])
예제 #22
0
    def test_memcpy_copy_usm_to_usm(self):
        mobj1 = self._create_memory()
        mobj2 = self._create_memory()
        q = dpctl.get_current_queue()

        mv1 = memoryview(mobj1)
        mv2 = memoryview(mobj2)

        mv1[:3] = b"123"

        q.memcpy(mobj2, mobj1, 3)

        self.assertEqual(mv2[:3], b"123")
예제 #23
0
 def test_create_program_from_source(self):
     oclSrc = "                                                             \
     kernel void add(global int* a, global int* b, global int* c) {         \
         size_t index = get_global_id(0);                                   \
         c[index] = a[index] + b[index];                                    \
     }                                                                      \
     kernel void axpy(global int* a, global int* b, global int* c, int d) { \
         size_t index = get_global_id(0);                                   \
         c[index] = a[index] + d*b[index];                                  \
     }"
     with dpctl.device_context("level0:gpu:0"):
         q = dpctl.get_current_queue()
         prog = dpctl_prog.create_program_from_source(q, oclSrc)
예제 #24
0
def test_dpnp_create_array_in_context(offload_device, dtype):
    if not ensure_dpnp():
        pytest.skip("No DPNP")

    import dpnp

    if skip_test(offload_device):
        pytest.skip("No device for " + offload_device)

    if ("opencl" not in dpctl.get_current_queue().sycl_device.filter_string
            and "opencl" in offload_device):
        pytest.skip("Bug in DPNP. See: IntelPython/dpnp#723")

    with dpctl.device_context(offload_device):
        a = dpnp.arange(1024, dtype=dtype)  # noqa
예제 #25
0
    def test_memcpy_type_error(self):
        mobj = self._create_memory()
        q = dpctl.get_current_queue()

        with self.assertRaises(TypeError) as cm:
            q.memcpy(None, mobj, 3)

        self.assertEqual(type(cm.exception), TypeError)
        self.assertEqual(str(cm.exception), "Parameter dest should be Memory.")

        with self.assertRaises(TypeError) as cm:
            q.memcpy(mobj, None, 3)

        self.assertEqual(type(cm.exception), TypeError)
        self.assertEqual(str(cm.exception), "Parameter src should be Memory.")
예제 #26
0
    def test_memory_cpu_context(self):
        mobj = self._create_memory()

        # CPU context
        with dpctl.device_context("opencl:cpu:0"):
            # type respective to the context in which
            # memory was created
            usm_type = mobj._usm_type()
            self.assertEqual(usm_type, "shared")

            current_queue = dpctl.get_current_queue()
            # type as view from current queue
            usm_type = mobj._usm_type(current_queue)
            # type can be unknown if current queue is
            # not in the same SYCL context
            self.assertTrue(usm_type in ["unknown", "shared"])
예제 #27
0
def atomic_sub_tuple(context, builder, sig, args):
    device_type = dpctl.get_current_queue().sycl_device.device_type
    dtype = sig.args[0].dtype

    if dtype == types.float32 or dtype == types.float64:
        if (device_type == dpctl.device_type.gpu
                and config.NATIVE_FP_ATOMICS == 1):
            return atomic_sub_wrapper(context, builder, sig, args)
        else:
            # Currently, DPCPP only supports native floating point
            # atomics for GPUs.
            return atomic_add(context, builder, sig, args, "sub")
    elif dtype == types.int32 or dtype == types.int64:
        return atomic_sub_wrapper(context, builder, sig, args)
    else:
        raise TypeError("Atomic operation on unsupported type %s" % dtype)
예제 #28
0
    def __call__(self, *args, **kwargs):
        assert not kwargs, "Keyword Arguments are not supported"

        argtypes = self._get_argtypes(*args)
        compute_queue = None

        # Get the array type and whether all array are of same type or not
        array_type, uniform = self._datatype_is_same(argtypes)
        if not uniform:
            _raise_datatype_mixed_error(argtypes)

        if type(array_type) == USMNdArrayType:
            if dpctl.is_in_device_context():
                warnings.warn(cfd_ctx_mgr_wrng_msg)

            queues = []
            for i, argtype in enumerate(argtypes):
                if type(argtype) == USMNdArrayType:
                    memory = dpctl.memory.as_usm_memory(args[i])
                    if dpctl_version < (0, 12):
                        queue = memory._queue
                    else:
                        queue = memory.sycl_queue
                    queues.append(queue)

            # dpctl.utils.get_exeuction_queue() checks if the queues passed are equivalent and returns a
            # SYCL queue if they are equivalent and None if they are not.
            compute_queue = dpctl.utils.get_execution_queue(queues)
            if compute_queue is None:
                raise IndeterminateExecutionQueueError(
                    "Data passed as argument are not equivalent. Please "
                    "create dpctl.tensor.usm_ndarray with equivalent SYCL queue."
                )

        if compute_queue is None:
            try:
                compute_queue = dpctl.get_current_queue()
            except:
                _raise_no_device_found_error()

        kernel = self.specialize(argtypes, compute_queue)
        cfg = kernel.configure(
            kernel.sycl_queue, self.global_size, self.local_size
        )
        cfg(*args)
예제 #29
0
    def __getitem__(self, args):
        """Mimick CUDA python's square-bracket notation for configuration.
        This assumes the argument to be:
            `global size, local size`
        """
        ls = None
        nargs = len(args)
        # Check if the kernel enquing arguments are sane
        if nargs < 1 or nargs > 2:
            _raise_invalid_kernel_enqueue_args

        sycl_queue = dpctl.get_current_queue()

        gs = _ensure_valid_work_item_grid(args[0], sycl_queue)
        # If the optional local size argument is provided
        if nargs == 2 and args[1] != []:
            ls = _ensure_valid_work_group_size(args[1], gs)

        return self.configure(sycl_queue, gs, ls)
예제 #30
0
def test_debug_flag_generates_ir_with_debuginfo(debug_option):
    """
    Check debug info is emitting to IR if debug parameter is set to True
    """
    @dppy.kernel
    def foo(x):
        x = 1  # noqa

    sycl_queue = dpctl.get_current_queue()
    sig = (types.int32, )

    kernel_ir = get_kernel_ir(sycl_queue, foo, sig, debug=debug_option)

    tag = "!dbg"

    if debug_option:
        assert tag in kernel_ir
    else:
        assert tag not in kernel_ir