Пример #1
0
    def __init__(self, context, builder):
        """
        Note: Maybe called multiple times when lowering a function
        """
        from numba.targets import boxing
        self.context = context
        self.builder = builder

        self.module = builder.basic_block.function.module
        # A unique mapping of serialized objects in this module
        try:
            self.module.__serialized
        except AttributeError:
            self.module.__serialized = {}

        # Initialize types
        self.pyobj = self.context.get_argument_type(types.pyobject)
        self.voidptr = Type.pointer(Type.int(8))
        self.long = Type.int(ctypes.sizeof(ctypes.c_long) * 8)
        self.ulonglong = Type.int(ctypes.sizeof(ctypes.c_ulonglong) * 8)
        self.longlong = self.ulonglong
        self.double = Type.double()
        self.py_ssize_t = self.context.get_value_type(types.intp)
        self.cstring = Type.pointer(Type.int(8))
        self.gil_state = Type.int(_helperlib.py_gil_state_size * 8)
        self.py_buffer_t = ir.ArrayType(ir.IntType(8), _helperlib.py_buffer_size)
Пример #2
0
    def __init__(self, context, builder):
        """
        Note: Maybe called multiple times when lowering a function
        """
        from numba.targets import boxing
        self.context = context
        self.builder = builder

        self.module = builder.basic_block.function.module
        # A unique mapping of serialized objects in this module
        try:
            self.module.__serialized
        except AttributeError:
            self.module.__serialized = {}

        # Initialize types
        self.pyobj = self.context.get_argument_type(types.pyobject)
        self.voidptr = Type.pointer(Type.int(8))
        self.long = Type.int(ctypes.sizeof(ctypes.c_long) * 8)
        self.ulong = self.long
        self.longlong = Type.int(ctypes.sizeof(ctypes.c_ulonglong) * 8)
        self.ulonglong = self.longlong
        self.double = Type.double()
        self.py_ssize_t = self.context.get_value_type(types.intp)
        self.cstring = Type.pointer(Type.int(8))
        self.gil_state = Type.int(_helperlib.py_gil_state_size * 8)
        self.py_buffer_t = ir.ArrayType(ir.IntType(8), _helperlib.py_buffer_size)
Пример #3
0
    def __call__(self, *args):
        ctx, symbol, kernargs, kernarg_region = self.bind()

        # Unpack pyobject values into ctypes scalar values
        expanded_values = []
        for ty, val in zip(self.argument_types, args):
            _unpack_argument(ty, val, expanded_values)

        # Insert kernel arguments
        base = 0
        for av in expanded_values:
            # Adjust for alignemnt
            align = ctypes.sizeof(av)
            pad = _calc_padding_for_alignment(align, base)
            base += pad
            # Move to offset
            offseted = ctypes.addressof(kernargs) + base
            asptr = ctypes.cast(offseted, ctypes.POINTER(type(av)))
            # Assign value
            asptr[0] = av
            # Increment offset
            base += align

        assert base <= ctypes.sizeof(
            kernargs), "Kernel argument size is invalid"

        # Actual Kernel launch
        qq = ctx.default_queue

        # Dispatch
        qq.dispatch(symbol, kernargs, workgroup_size=self.local_size,
                    grid_size=self.global_size)

        # Free kernel region
        kernarg_region.free(kernargs)
Пример #4
0
    def __call__(self, *args):
        self._sentry_resource_limit()

        ctx, symbol, kernargs, kernarg_region = self.bind()

        # Unpack pyobject values into ctypes scalar values
        expanded_values = []

        # contains lambdas to execute on return
        retr = []
        for ty, val in zip(self.argument_types, args):
            _unpack_argument(ty, val, expanded_values, retr)

        # Insert kernel arguments
        base = 0
        for av in expanded_values:
            # Adjust for alignemnt
            align = ctypes.sizeof(av)
            pad = _calc_padding_for_alignment(align, base)
            base += pad
            # Move to offset
            offseted = kernargs.value + base
            asptr = ctypes.cast(offseted, ctypes.POINTER(type(av)))
            # Assign value
            asptr[0] = av
            # Increment offset
            base += align

        # Actual Kernel launch
        qq = ctx.default_queue

        if self.stream is None:
            hsa.implicit_sync()

        # Dispatch
        signal = None
        if self.stream is not None:
            signal = hsa.create_signal(1)
            qq.insert_barrier(self.stream._get_last_signal())

        qq.dispatch(symbol,
                    kernargs,
                    workgroup_size=self.local_size,
                    grid_size=self.global_size,
                    signal=signal)

        if self.stream is not None:
            self.stream._add_signal(signal)

        # retrieve auto converted arrays
        for wb in retr:
            wb()

        # Free kernel region
        if kernargs is not None:
            if self.stream is None:
                kernarg_region.free(kernargs)
            else:
                self.stream._add_callback(
                    lambda: kernarg_region.free(kernargs))
Пример #5
0
    def __init__(self, context, builder):
        """
        Note: Maybe called multiple times when lowering a function
        """
        fix_python_api()
        self.context = context
        self.builder = builder

        self.module = builder.basic_block.function.module
        # Initialize types
        self.pyobj = self.context.get_argument_type(types.pyobject)
        self.long = Type.int(ctypes.sizeof(ctypes.c_long) * 8)
        self.ulonglong = Type.int(ctypes.sizeof(ctypes.c_ulonglong) * 8)
        self.longlong = self.ulonglong
        self.double = Type.double()
        self.py_ssize_t = self.context.get_value_type(types.intp)
        self.cstring = Type.pointer(Type.int(8))
Пример #6
0
    def __init__(self, context, builder):
        """
        Note: Maybe called multiple times when lowering a function
        """
        fix_python_api()
        self.context = context
        self.builder = builder

        self.module = builder.basic_block.function.module
        # Initialize types
        self.pyobj = self.context.get_argument_type(types.pyobject)
        self.long = Type.int(ctypes.sizeof(ctypes.c_long) * 8)
        self.ulonglong = Type.int(ctypes.sizeof(ctypes.c_ulonglong) * 8)
        self.longlong = self.ulonglong
        self.double = Type.double()
        self.py_ssize_t = self.context.get_value_type(types.intp)
        self.cstring = Type.pointer(Type.int(8))
Пример #7
0
    def __call__(self, *args):
        self._sentry_resource_limit()

        ctx, symbol, kernargs, kernarg_region = self.bind()

        # Unpack pyobject values into ctypes scalar values
        expanded_values = []

        # contains lambdas to execute on return
        retr = []
        for ty, val in zip(self.argument_types, args):
            _unpack_argument(ty, val, expanded_values, retr)

        # Insert kernel arguments
        base = 0
        for av in expanded_values:
            # Adjust for alignemnt
            align = ctypes.sizeof(av)
            pad = _calc_padding_for_alignment(align, base)
            base += pad
            # Move to offset
            offseted = kernargs.value + base
            asptr = ctypes.cast(offseted, ctypes.POINTER(type(av)))
            # Assign value
            asptr[0] = av
            # Increment offset
            base += align

        # Actual Kernel launch
        qq = ctx.default_queue

        if self.stream is None:
            hsa.implicit_sync()

        # Dispatch
        signal = None
        if self.stream is not None:
            signal = hsa.create_signal(1)
            qq.insert_barrier(self.stream._get_last_signal())

        qq.dispatch(symbol, kernargs, workgroup_size=self.local_size,
                    grid_size=self.global_size, signal=signal)

        if self.stream is not None:
            self.stream._add_signal(signal)

        # retrieve auto converted arrays
        for wb in retr:
            wb()

        # Free kernel region
        if kernargs is not None:
            if self.stream is None:
                kernarg_region.free(kernargs)
            else:
                self.stream._add_callback(lambda: kernarg_region.free(kernargs))
Пример #8
0
    def allocate(self, nd):
        arraytype = make_array_ctype(nd)
        sizeof = ctypes.sizeof(arraytype)
        # Oversized or insufficient space
        if sizeof > self.elemsize or not self.queue:
            return _allocate_head(nd)

        mem = self.queue.popleft()
        self.allocated.add(mem)
        return mem
Пример #9
0
    def _kernel_call(self, args, griddim, blockdim, stream=0, sharedmem=0):
        # Prepare kernel
        cufunc = self._func.get()

        if self.debug:
            excname = cufunc.name + "__errcode__"
            excmem, excsz = cufunc.module.get_global_symbol(excname)
            assert excsz == ctypes.sizeof(ctypes.c_int)
            excval = ctypes.c_int()
            excmem.memset(0, stream=stream)

        # Prepare arguments
        retr = []  # hold functors for writeback

        kernelargs = []
        for t, v in zip(self.argument_types, args):
            self._prepare_args(t, v, stream, retr, kernelargs)

        # Configure kernel
        cu_func = cufunc.configure(griddim,
                                   blockdim,
                                   stream=stream,
                                   sharedmem=sharedmem)
        # Invoke kernel
        cu_func(*kernelargs)

        if self.debug:
            driver.device_to_host(ctypes.addressof(excval), excmem, excsz)
            if excval.value != 0:
                # An error occurred
                def load_symbol(name):
                    mem, sz = cufunc.module.get_global_symbol(
                        "%s__%s__" % (cufunc.name, name))
                    val = ctypes.c_int()
                    driver.device_to_host(ctypes.addressof(val), mem, sz)
                    return val.value

                tid = [load_symbol("tid" + i) for i in 'zyx']
                ctaid = [load_symbol("ctaid" + i) for i in 'zyx']
                code = excval.value
                exccls, exc_args = self.call_helper.get_exception(code)
                # Prefix the exception message with the thread position
                prefix = "tid=%s ctaid=%s" % (tid, ctaid)
                if exc_args:
                    exc_args = ("%s: %s" %
                                (prefix, exc_args[0]), ) + exc_args[1:]
                else:
                    exc_args = prefix,
                raise exccls(*exc_args)

        # retrieve auto converted arrays
        for wb in retr:
            wb()
Пример #10
0
    def _kernel_call(self, args, griddim, blockdim, stream=0, sharedmem=0):
        # Prepare kernel
        cufunc = self._func.get()

        if self.debug:
            excname = cufunc.name + "__errcode__"
            excmem, excsz = cufunc.module.get_global_symbol(excname)
            assert excsz == ctypes.sizeof(ctypes.c_int)
            excval = ctypes.c_int()
            excmem.memset(0, stream=stream)

        # Prepare arguments
        retr = []                       # hold functors for writeback

        kernelargs = []
        for t, v in zip(self.argument_types, args):
            self._prepare_args(t, v, stream, retr, kernelargs)

        # Configure kernel
        cu_func = cufunc.configure(griddim, blockdim,
                                   stream=stream,
                                   sharedmem=sharedmem)
        # Invoke kernel
        cu_func(*kernelargs)

        if self.debug:
            driver.device_to_host(ctypes.addressof(excval), excmem, excsz)
            if excval.value != 0:
                # An error occurred
                def load_symbol(name):
                    mem, sz = cufunc.module.get_global_symbol("%s__%s__" %
                                                              (cufunc.name,
                                                               name))
                    val = ctypes.c_int()
                    driver.device_to_host(ctypes.addressof(val), mem, sz)
                    return val.value

                tid = [load_symbol("tid" + i) for i in 'zyx']
                ctaid = [load_symbol("ctaid" + i) for i in 'zyx']
                code = excval.value
                exccls, exc_args = self.call_helper.get_exception(code)
                # Prefix the exception message with the thread position
                prefix = "tid=%s ctaid=%s" % (tid, ctaid)
                if exc_args:
                    exc_args = ("%s: %s" % (prefix, exc_args[0]),) + exc_args[1:]
                else:
                    exc_args = prefix,
                raise exccls(*exc_args)

        # retrieve auto converted arrays
        for wb in retr:
            wb()
Пример #11
0
    def bind(self):
        """
        Bind kernel to device
        """
        ctx, entry = self._cacheprog.get()
        if entry.symbol.kernarg_segment_size > 0:
            sz = ctypes.sizeof(ctypes.c_byte) *\
                entry.symbol.kernarg_segment_size
            kernargs = entry.kernarg_region.allocate(sz)
        else:
            kernargs = None

        return ctx, entry.symbol, kernargs, entry.kernarg_region
Пример #12
0
    def bind(self):
        """
        Bind kernel to device
        """
        ctx, entry = self._cacheprog.get()
        if entry.symbol.kernarg_segment_size > 0:
            sz = ctypes.sizeof(ctypes.c_byte) *\
                entry.symbol.kernarg_segment_size
            kernargs = entry.kernarg_region.allocate(sz)
        else:
            kernargs = None

        return ctx, entry.symbol, kernargs, entry.kernarg_region
Пример #13
0
def make_array_ctype(ndim):
    """Create a array header type for a given dimension.
    """
    c_intp = ctypes.c_ssize_t

    class c_array(ctypes.Structure):
        _fields_ = [('parent', ctypes.c_void_p),
                    ('data', ctypes.c_void_p),
                    ('shape', c_intp * ndim),
                    ('strides', c_intp * ndim)]

    assert ctypes.sizeof(c_array) == _calc_array_sizeof(ndim), \
        "sizeof(CUDA array struct) != sizeof(CPU array struct)"
    return c_array
Пример #14
0
def ndarray_populate_head(gpu_head, gpu_data, shape, strides, stream=0):
    """
    Populate the array header
    """
    nd = len(shape)
    assert nd > 0, "0 or negative dimension"

    arraytype = make_array_ctype(nd)
    struct = arraytype(data=driver.device_pointer(gpu_data),
                       shape=shape,
                       strides=strides)

    driver.host_to_device(gpu_head, struct, ctypes.sizeof(struct),
                          stream=stream)
    driver.device_memory_depends(gpu_head, gpu_data)
Пример #15
0
def ndarray_populate_head(gpu_mem, gpu_data, shape, strides, stream=0):
    """
    Populate the array header
    """
    nd = len(shape)
    assert nd > 0, "0 or negative dimension"

    arraytype = make_array_ctype(nd)
    struct = arraytype(data=driver.device_pointer(gpu_data),
                       shape=shape,
                       strides=strides)

    gpu_head = gpu_mem.allocate(nd)
    databytes = np.ndarray(shape=ctypes.sizeof(struct), dtype=np.byte,
                           buffer=struct)
    gpu_mem.write(databytes, gpu_head, stream=stream)
    driver.device_memory_depends(gpu_head, gpu_data)
    return gpu_head
Пример #16
0
def ndarray_populate_head(gpu_mem, gpu_data, shape, strides, stream=0):
    """
    Populate the array header
    """
    nd = len(shape)
    assert nd > 0, "0 or negative dimension"

    arraytype = make_array_ctype(nd)
    struct = arraytype(parent=None,
                       data=driver.device_pointer(gpu_data),
                       shape=shape,
                       strides=strides)

    gpu_head = gpu_mem.allocate(nd)
    databytes = np.ndarray(shape=ctypes.sizeof(struct), dtype=np.byte,
                           buffer=struct)
    gpu_mem.write(databytes, gpu_head, stream=stream)
    driver.device_memory_depends(gpu_head, gpu_data)
    return gpu_head
Пример #17
0
def _allocate_head(nd):
    """Allocate the metadata structure
    """
    arraytype = make_array_ctype(nd)
    gpu_head = devices.get_context().memalloc(ctypes.sizeof(arraytype))
    return gpu_head
Пример #18
0
 def _test_array(self, ndim):
     c_array = make_array_ctype(ndim)
     self.assertEqual(ctypes.sizeof(c_array),
                      self._cpu_array_sizeof(ndim))
Пример #19
0
 def _test_array(self, ndim):
     c_array = make_array_ctype(ndim)
     self.assertEqual(ctypes.sizeof(c_array), self._cpu_array_sizeof(ndim))