Example #1
def lower_parfor_rollback(lowerer, parfor):
    try:
        _lower_parfor_gufunc(lowerer, parfor)
        if config.DEBUG:
            device_filter_str = (
                dpctl.get_current_queue().get_sycl_device().filter_string)
            msg = "Parfor offloaded to " + device_filter_str
            print(msg, parfor.loc)
    except Exception as e:
        device_filter_str = (
            dpctl.get_current_queue().get_sycl_device().filter_string)
        msg = (
            "Failed to offload parfor to " + device_filter_str + ". Falling "
            "back to default CPU parallelization. Please file a bug report "
            "at https://github.com/IntelPython/numba-dppy. To help us debug "
            "the issue, please add the traceback to the bug report.")
        if not config.DEBUG:
            msg += " Set the environment variable NUMBA_DPPY_DEBUG to 1 to "
            msg += "generate a traceback."

        warnings.warn(NumbaPerformanceWarning(msg, parfor.loc))
        raise e
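The fallback path above only warns, so offload failures can slip by unnoticed; the standard warnings machinery can escalate or silence them. A minimal sketch, assuming NumbaPerformanceWarning is importable from numba.core.errors (older releases also re-exported it from numba.errors):

import warnings

from numba.core.errors import NumbaPerformanceWarning

# Treat silent CPU fallbacks as hard errors while testing offload;
# use "ignore" instead of "error" to suppress the message entirely.
warnings.simplefilter("error", category=NumbaPerformanceWarning)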
Example #2
def auto_device(obj, stream=0, copy=True, user_explicit=False):
    """
    Create a DeviceRecord or DeviceArray like obj and optionally copy data from
    host to device. If obj already represents device memory, it is returned and
    no copy is made.
    """
    if _driver.is_device_memory(obj):
        return obj, False
    elif hasattr(obj, '__cuda_array_interface__'):
        return numba.cuda.as_cuda_array(obj), False
    else:
        if isinstance(obj, np.void):
            devobj = from_record_like(obj, stream=stream)
        else:
            # This allows non-array objects such as constants and objects
            # implementing the array interface
            # (https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.interface.html)
            # to be passed into this function, with no copy overhead for
            # `obj`s that are already `ndarray`s.
            obj = np.array(obj, copy=False, subok=True)
            sentry_contiguous(obj)
            devobj = from_array_like(obj, stream=stream)
        if copy:
            if config.CUDA_WARN_ON_IMPLICIT_COPY:
                if (not user_explicit and (not isinstance(obj, DeviceNDArray)
                                           and isinstance(obj, np.ndarray))):
                    msg = ("Host array used in CUDA kernel will incur "
                           "copy overhead to/from device.")
                    warn(NumbaPerformanceWarning(msg))
            devobj.copy_to_device(obj, stream=stream)
        return devobj, True
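The implicit-copy branch is easiest to see from a kernel launch: a host ndarray passed directly goes through auto_device and, with the CUDA_WARN_ON_IMPLICIT_COPY option enabled, triggers the warning, while an explicit cuda.to_device transfer does not. A rough sketch, assuming a CUDA-capable GPU; the kernel and names are illustrative only:

import numpy as np
from numba import cuda

@cuda.jit
def scale(x):
    i = cuda.grid(1)
    if i < x.shape[0]:
        x[i] *= 2.0

host = np.ones(1024, dtype=np.float32)

# Host array passed directly: auto_device() copies it to and from the
# device, which is exactly what the warning above flags.
scale[128, 8](host)

# Explicit transfers avoid the implicit-copy warning and the round trip.
dev = cuda.to_device(host)
scale[128, 8](dev)
result = dev.copy_to_host()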
Example #3
    def matmul_typer(self, a, b, out=None):
        """
        Typer function for Numpy matrix multiplication.
        """
        if not isinstance(a, types.Array) or not isinstance(b, types.Array):
            return
        if not all(x.ndim in (1, 2) for x in (a, b)):
            raise TypingError("%s only supported on 1-D and 2-D arrays" %
                              (self.func_name, ))
        # Output dimensionality
        ndims = set([a.ndim, b.ndim])
        if ndims == set([2]):
            # M * M
            out_ndim = 2
        elif ndims == set([1, 2]):
            # M * V and V * M
            out_ndim = 1
        elif ndims == set([1]):
            # V * V
            out_ndim = 0

        if out is not None:
            if out_ndim == 0:
                raise TypeError(
                    "explicit output unsupported for vector * vector")
            elif out.ndim != out_ndim:
                raise TypeError("explicit output has incorrect dimensionality")
            if not isinstance(out, types.Array) or out.layout != "C":
                raise TypeError("output must be a C-contiguous array")
            all_args = (a, b, out)
        else:
            all_args = (a, b)

        if not (config.DISABLE_PERFORMANCE_WARNINGS or all(x.layout in "CF"
                                                           for x in (a, b))):
            msg = "%s is faster on contiguous arrays, called on %s" % (
                self.func_name,
                (a, b),
            )
            warnings.warn(NumbaPerformanceWarning(msg))
        if not all(x.dtype == a.dtype for x in all_args):
            raise TypingError("%s arguments must all have the same dtype" %
                              (self.func_name, ))
        if not isinstance(a.dtype, (types.Float, types.Complex)):
            raise TypingError("%s only supported on float and complex arrays" %
                              (self.func_name, ))
        if out:
            return out
        elif out_ndim > 0:
            return types.Array(a.dtype, out_ndim, "C")
        else:
            return a.dtype
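The layout check above is what produces messages like "np.dot() is faster on contiguous arrays, called on ..." when a strided view reaches a compiled dot product. A small sketch of how to trigger it, assuming SciPy is available for the BLAS bindings; the function and array names are illustrative:

import numpy as np
from numba import njit

@njit
def matvec(a, b):
    return np.dot(a, b)

mat = np.ones((64, 64))
vec = np.ones(32)

# Every other column is an "A"-layout (non-contiguous) view, so the typer
# above emits a NumbaPerformanceWarning; the call still works, just less
# efficiently than with contiguous operands.
matvec(mat[:, ::2], vec)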
Example #4
    def __init__(self, dispatcher, griddim, blockdim, stream, sharedmem):
        self.dispatcher = dispatcher
        self.griddim = griddim
        self.blockdim = blockdim
        self.stream = stream
        self.sharedmem = sharedmem

        if config.CUDA_LOW_OCCUPANCY_WARNINGS:
            ctx = get_context()
            smcount = ctx.device.MULTIPROCESSOR_COUNT
            grid_size = griddim[0] * griddim[1] * griddim[2]
            if grid_size < 2 * smcount:
                msg = ("Grid size ({grid}) < 2 * SM count ({sm}) "
                       "will likely result in GPU under utilization due "
                       "to low occupancy.")
                msg = msg.format(grid=grid_size, sm=2 * smcount)
                warn(NumbaPerformanceWarning(msg))
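The threshold in this version is device-dependent, so whether a given grid size warns varies between GPUs; the multiprocessor count behind it can be inspected directly. A quick sketch, assuming a CUDA-capable GPU:

from numba import cuda

# The check above warns when the grid has fewer blocks than twice the
# device's multiprocessor count.
device = cuda.get_current_device()
sm_count = device.MULTIPROCESSOR_COUNT
print("Low-occupancy threshold: 2 * SM count =", 2 * sm_count)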
Example #5
    def __init__(self, dispatcher, griddim, blockdim, stream, sharedmem):
        self.dispatcher = dispatcher
        self.griddim = griddim
        self.blockdim = blockdim
        self.stream = stream
        self.sharedmem = sharedmem

        if config.CUDA_LOW_OCCUPANCY_WARNINGS:
            # Warn when the grid has fewer than 128 blocks. This number is
            # chosen somewhat heuristically - ideally the minimum is 2 times
            # the number of SMs, but the number of SMs varies between devices -
            # some very small GPUs might only have 4 SMs, but an H100-SXM5 has
            # 132. In general kernels should be launched with large grids
            # (hundreds or thousands of blocks), so warning when fewer than 128
            # blocks are used will likely catch most beginner errors, where the
            # grid tends to be very small (single-digit or low tens of blocks).
            min_grid_size = 128
            grid_size = griddim[0] * griddim[1] * griddim[2]
            if grid_size < min_grid_size:
                msg = (f"Grid size {grid_size} will likely result in GPU "
                       "under-utilization due to low occupancy.")
                warn(NumbaPerformanceWarning(msg))
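With the fixed 128-block threshold, any launch that uses only a handful of blocks warns regardless of the device. A minimal sketch, assuming a CUDA-capable GPU; the kernel is illustrative only:

import numpy as np
from numba import cuda

@cuda.jit
def add_one(x):
    i = cuda.grid(1)
    if i < x.shape[0]:
        x[i] += 1.0

data = cuda.to_device(np.zeros(256, dtype=np.float32))

# Grid size 1 is well below the 128-block threshold, so this launch emits
# the low-occupancy NumbaPerformanceWarning.
add_one[1, 256](data)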