Example #1
    def test_d2d(self):
        hst = np.arange(100, dtype=np.uint32)
        hst2 = np.empty_like(hst)
        sz = hst.size * hst.dtype.itemsize
        dev1 = self.context.memalloc(sz)
        dev2 = self.context.memalloc(sz)
        driver.host_to_device(dev1, hst, sz)
        driver.device_to_device(dev2, dev1, sz)
        driver.device_to_host(hst2, dev2, sz)
        self.assertTrue(np.all(hst == hst2))
Example #2
    def test_d2d(self):
        hst = np.arange(100, dtype=np.uint32)
        hst2 = np.empty_like(hst)
        sz = hst.size * hst.dtype.itemsize
        dev1 = self.context.memalloc(sz)
        dev2 = self.context.memalloc(sz)
        driver.host_to_device(dev1, hst, sz)
        driver.device_to_device(dev2, dev1, sz)
        driver.device_to_host(hst2, dev2, sz)
        self.assertTrue(np.all(hst == hst2))
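Examples #1 and #2 exercise the raw driver wrappers. For comparison, a minimal sketch of the same device-to-device round trip through the public numba.cuda API (assumes a working CUDA context; copy_to_device with a device-array argument performs a D->D copy):

    import numpy as np
    from numba import cuda

    host = np.arange(100, dtype=np.uint32)
    d_src = cuda.to_device(host)                 # host -> device
    d_dst = cuda.device_array_like(d_src)        # uninitialized device buffer
    d_dst.copy_to_device(d_src)                  # device -> device copy
    assert np.all(d_dst.copy_to_host() == host)  # device -> host, then compare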
Example #3
def gpu_slice(arr, col):
    """
    Copy a single column of a device array back to the host.
    (Workaround for a slicing feature missing in NumbaPro.)
    """
    from numba.cuda.cudadrv.driver import device_to_host

    # gpu_slice_view (defined elsewhere) returns a device-side view of the
    # column together with its size in bytes.
    view, size = gpu_slice_view(arr, col)
    host = np.empty(shape=arr.shape[0], dtype=arr.dtype)
    device_to_host(host, view, size)
    return host
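The helper above dates from NumbaPro, where column slicing of device arrays was not available. A hedged sketch of the modern equivalent: numba.cuda device arrays support slicing directly, and the resulting strided view can be copied back with copy_to_host:

    import numpy as np
    from numba import cuda

    d_arr = cuda.to_device(np.arange(12, dtype=np.float32).reshape(4, 3))
    col = d_arr[:, 1].copy_to_host()  # device-side view, then a D->H copy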
Example #4
    def test_memcpy(self):
        hstary = np.arange(100, dtype=np.uint32)
        hstary2 = np.arange(100, dtype=np.uint32)
        sz = hstary.size * hstary.dtype.itemsize
        devary = self.context.memalloc(sz)

        driver.host_to_device(devary, hstary, sz)
        driver.device_to_host(hstary2, devary, sz)

        self.assertTrue(np.all(hstary == hstary2))
Example #5
    def test_memcpy(self):
        hstary = np.arange(100, dtype=np.uint32)
        hstary2 = np.arange(100, dtype=np.uint32)
        sz = hstary.size * hstary.dtype.itemsize
        devary = self.context.memalloc(sz)

        driver.host_to_device(devary, hstary, sz)
        driver.device_to_host(hstary2, devary, sz)

        self.assertTrue(np.all(hstary == hstary2))
Example #6
    def test_memset(self):
        dtype = np.dtype('uint32')
        n = 10
        sz = dtype.itemsize * n
        devary = self.context.memalloc(sz)
        driver.device_memset(devary, 0xab, sz)

        hstary = np.empty(n, dtype=dtype)
        driver.device_to_host(hstary, devary, sz)

        hstary2 = np.array([0xabababab] * n, dtype=np.dtype('uint32'))
        self.assertTrue(np.all(hstary == hstary2))
Example #7
    def test_memset(self):
        dtype = np.dtype('uint32')
        n = 10
        sz = dtype.itemsize * n
        devary = self.context.memalloc(sz)
        driver.device_memset(devary, 0xab, sz)

        hstary = np.empty(n, dtype=dtype)
        driver.device_to_host(hstary, devary, sz)

        hstary2 = np.array([0xabababab] * n, dtype=np.dtype('uint32'))
        self.assertTrue(np.all(hstary == hstary2))
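A quick sanity check of the expected value in the two tests above: device_memset, like C memset, writes a single byte pattern, so filling a uint32 buffer with the byte 0xab makes each 32-bit word read back as 0xabababab:

    import numpy as np

    word = np.frombuffer(b"\xab" * 4, dtype=np.uint32)[0]
    assert word == 0xABABABAB  # four 0xab bytes per 32-bit word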
Example #8
    def copy_to_host(self, ary=None, stream=0):
        """Copy ``self`` to ``ary`` or create a new Numpy ndarray
        if ``ary`` is ``None``.

        If a CUDA ``stream`` is given, then the transfer will be made
        asynchronously as part of the given stream.  Otherwise, the transfer is
        synchronous: the function returns after the copy is finished.

        Always returns the host array.

        Example::

            import numpy as np
            from numba import cuda

            arr = np.arange(1000)
            d_arr = cuda.to_device(arr)

            my_kernel[100, 100](d_arr)

            result_array = d_arr.copy_to_host()
        """
        if any(s < 0 for s in self.strides):
            msg = "D->H copy not implemented for negative strides: {}"
            raise NotImplementedError(msg.format(self.strides))
        assert self.alloc_size >= 0, "Negative memory size"
        stream = self._default_stream(stream)
        if ary is None:
            hostary = np.empty(shape=self.alloc_size, dtype=np.byte)
        else:
            check_array_compatibility(self, ary)
            hostary = ary

        if self.alloc_size != 0:
            _driver.device_to_host(hostary,
                                   self,
                                   self.alloc_size,
                                   stream=stream)

        if ary is None:
            if self.size == 0:
                hostary = np.ndarray(shape=self.shape,
                                     dtype=self.dtype,
                                     buffer=hostary)
            else:
                hostary = np.ndarray(
                    shape=self.shape,
                    dtype=self.dtype,
                    strides=self.strides,
                    buffer=hostary,
                )
        return hostary
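A small usage sketch of the asynchronous path the docstring describes. One caveat (an assumption worth stating): with pageable host memory the stream-tagged copy degrades to a synchronous one, so a pinned buffer is needed for real compute/copy overlap:

    import numpy as np
    from numba import cuda

    stream = cuda.stream()
    d_arr = cuda.to_device(np.arange(1000), stream=stream)
    host = d_arr.copy_to_host(stream=stream)  # enqueued on the stream
    stream.synchronize()                      # only now is `host` safe to read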
Example #9
    def test_cuda_driver_basic(self):
        module = self.context.create_module_ptx(self.ptx)
        function = module.get_function('_Z10helloworldPi')

        array = (c_int * 100)()

        memory = self.context.memalloc(sizeof(array))

        host_to_device(memory, array, sizeof(array))

        function = function.configure((1, ), (100, ))
        function(memory)

        device_to_host(array, memory, sizeof(array))
        for i, v in enumerate(array):
            self.assertEqual(i, v)

        module.unload()
Example #10
    def test_cuda_driver_basic(self):
        module = self.context.create_module_ptx(self.ptx)
        function = module.get_function('_Z10helloworldPi')

        array = (c_int * 100)()

        memory = self.context.memalloc(sizeof(array))

        host_to_device(memory, array, sizeof(array))

        function = function.configure((1,), (100,))
        function(memory)

        device_to_host(array, memory, sizeof(array))
        for i, v in enumerate(array):
            self.assertEqual(i, v)

        module.unload()
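The mangled symbol '_Z10helloworldPi' is the Itanium C++ name for helloworld(int*), which is why get_function() must be given the mangled form. For contrast, a minimal sketch of the same round trip through the public kernel API, which hides module loading and name mangling behind @cuda.jit (passing a host array triggers an implicit H->D copy and D->H writeback):

    import numpy as np
    from numba import cuda

    @cuda.jit
    def helloworld(a):
        i = cuda.grid(1)
        if i < a.size:
            a[i] = i

    arr = np.zeros(100, dtype=np.int32)
    helloworld[1, 100](arr)  # one block of 100 threads
    assert all(i == v for i, v in enumerate(arr))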
Example #11
    def test_cuda_driver_stream(self):
        module = self.context.create_module_ptx(self.ptx)
        function = module.get_function('_Z10helloworldPi')

        array = (c_int * 100)()

        stream = self.context.create_stream()

        with stream.auto_synchronize():
            memory = self.context.memalloc(sizeof(array))
            host_to_device(memory, array, sizeof(array), stream=stream)

            function = function.configure((1, ), (100, ), stream=stream)
            function(memory)

        device_to_host(array, memory, sizeof(array), stream=stream)

        for i, v in enumerate(array):
            self.assertEqual(i, v)
Example #12
    def test_cuda_driver_stream(self):
        module = self.context.create_module_ptx(self.ptx)
        function = module.get_function('_Z10helloworldPi')

        array = (c_int * 100)()

        stream = self.context.create_stream()

        with stream.auto_synchronize():
            memory = self.context.memalloc(sizeof(array))
            host_to_device(memory, array, sizeof(array), stream=stream)

            function = function.configure((1,), (100,), stream=stream)
            function(memory)

        device_to_host(array, memory, sizeof(array), stream=stream)

        for i, v in enumerate(array):
            self.assertEqual(i, v)
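One subtlety in the stream tests above: the ctypes array is pageable host memory, so the stream-tagged copies fall back to synchronous behavior even though a stream is passed. A hedged sketch of the truly asynchronous variant using a page-locked buffer from cuda.pinned_array:

    import numpy as np
    from numba import cuda

    stream = cuda.stream()
    pinned = cuda.pinned_array(100, dtype=np.uint32)  # page-locked host buffer
    pinned[:] = np.arange(100, dtype=np.uint32)

    with stream.auto_synchronize():                   # syncs on exiting the block
        d_arr = cuda.to_device(pinned, stream=stream)
        d_arr.copy_to_host(pinned, stream=stream)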
Example #13
    def test_host_alloc_driver(self):
        n = 32
        mem = cuda.current_context().memhostalloc(n, mapped=True)

        dtype = np.dtype(np.uint8)
        ary = np.ndarray(shape=n // dtype.itemsize, dtype=dtype, buffer=mem)

        magic = 0xab
        driver.device_memset(mem, magic, n)

        self.assertTrue(np.all(ary == magic))

        ary.fill(n)

        recv = np.empty_like(ary)

        # ary.size equals the byte count here because uint8 itemsize is 1
        driver.device_to_host(recv, mem, ary.size)

        self.assertTrue(np.all(ary == recv))
        self.assertTrue(np.all(recv == n))
Example #14
    def test_host_alloc_driver(self):
        n = 32
        mem = cuda.current_context().memhostalloc(n, mapped=True)

        dtype = np.dtype(np.uint8)
        ary = np.ndarray(shape=n // dtype.itemsize, dtype=dtype, buffer=mem)

        magic = 0xAB
        driver.device_memset(mem, magic, n)

        self.assertTrue(np.all(ary == magic))

        ary.fill(n)

        recv = np.empty_like(ary)

        # ary.size equals the byte count here because uint8 itemsize is 1
        driver.device_to_host(recv, mem, ary.size)

        self.assertTrue(np.all(ary == recv))
        self.assertTrue(np.all(recv == n))
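The public-API counterpart of memhostalloc(..., mapped=True) is cuda.mapped_array, which allocates zero-copy host memory visible to both host and device. A minimal sketch (assumes the device supports mapped host memory):

    import numpy as np
    from numba import cuda

    ary = cuda.mapped_array(32, dtype=np.uint8)
    ary[:] = 0xAB  # a host write; kernels see the same memory without a copy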
Example #15
    def test_cuda_driver_basic(self):
        module = self.context.create_module_ptx(self.ptx)
        function = module.get_function('_Z10helloworldPi')

        array = (c_int * 100)()

        memory = self.context.memalloc(sizeof(array))

        host_to_device(memory, array, sizeof(array))

        launch_kernel(function.handle,  # Kernel
                      1,   1, 1,        # gx, gy, gz
                      100, 1, 1,        # bx, by, bz
                      0,                # dynamic shared mem
                      0,                # stream
                      [memory])         # arguments

        device_to_host(array, memory, sizeof(array))
        for i, v in enumerate(array):
            self.assertEqual(i, v)

        module.unload()
Example #16
    def _do_getitem(self, item, stream=0):
        stream = self._default_stream(stream)
        typ, offset = self.dtype.fields[item]
        newdata = self.gpu_data.view(offset)

        if typ.shape == ():
            if typ.names is not None:
                return DeviceRecord(dtype=typ, stream=stream,
                                    gpu_data=newdata)
            else:
                hostary = np.empty(1, dtype=typ)
                _driver.device_to_host(dst=hostary, src=newdata,
                                       size=typ.itemsize,
                                       stream=stream)
            return hostary[0]
        else:
            shape, strides, dtype = \
                prepare_shape_strides_dtype(typ.shape,
                                            None,
                                            typ.subdtype[0], 'C')
            return DeviceNDArray(shape=shape, strides=strides,
                                 dtype=dtype, gpu_data=newdata,
                                 stream=stream)
Example #17
    def test_cuda_driver_stream_operations(self):
        module = self.context.create_module_ptx(self.ptx)
        function = module.get_function('_Z10helloworldPi')

        array = (c_int * 100)()

        stream = self.context.create_stream()

        with stream.auto_synchronize():
            memory = self.context.memalloc(sizeof(array))
            host_to_device(memory, array, sizeof(array), stream=stream)

            launch_kernel(function.handle,  # Kernel
                          1,   1, 1,        # gx, gy, gz
                          100, 1, 1,        # bx, by, bz
                          0,                # dynamic shared mem
                          stream.handle,    # stream
                          [memory])         # arguments

        device_to_host(array, memory, sizeof(array), stream=stream)

        for i, v in enumerate(array):
            self.assertEqual(i, v)
Example #18
    def _do_getitem(self, item, stream=0):
        stream = self._default_stream(stream)

        arr = self._dummy.__getitem__(item)
        extents = list(arr.iter_contiguous_extent())
        cls = type(self)
        if len(extents) == 1:
            newdata = self.gpu_data.view(*extents[0])

            if not arr.is_array:
                # Element indexing
                hostary = np.empty(1, dtype=self.dtype)
                _driver.device_to_host(dst=hostary, src=newdata,
                                       size=self._dummy.itemsize,
                                       stream=stream)
                return hostary[0]
            else:
                return cls(shape=arr.shape, strides=arr.strides,
                           dtype=self.dtype, gpu_data=newdata, stream=stream)
        else:
            newdata = self.gpu_data.view(*arr.extent)
            return cls(shape=arr.shape, strides=arr.strides,
                       dtype=self.dtype, gpu_data=newdata, stream=stream)
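A usage sketch of the two paths in _do_getitem above: scalar indexing triggers a small device_to_host copy, while slicing stays on the device as a view:

    import numpy as np
    from numba import cuda

    d_arr = cuda.to_device(np.arange(10, dtype=np.int32))
    elem = d_arr[3]    # element indexing: a 4-byte D->H copy; returns 3
    view = d_arr[2:5]  # contiguous extent: returns a device-side view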
Example #19
    def load_symbol(name):
        mem, sz = cufunc.module.get_global_symbol(
            "%s__%s__" % (cufunc.name, name))
        val = ctypes.c_int()
        driver.device_to_host(ctypes.addressof(val), mem, sz)
        return val.value
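Note that device_to_host accepts a raw host address, not just an ndarray. A generalized sketch of the pattern in isolation (the module handle and symbol name passed in are hypothetical):

    import ctypes
    from numba.cuda.cudadrv import driver

    def read_device_int(module, symbol):
        # `module` is a loaded CUDA module; `symbol` names a device global.
        mem, sz = module.get_global_symbol(symbol)
        val = ctypes.c_int()
        driver.device_to_host(ctypes.addressof(val), mem, sz)
        return val.value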
Example #20
    def launch(self, args, griddim, blockdim, stream=0, sharedmem=0):
        # Prepare kernel
        cufunc = self._codelibrary.get_cufunc()

        if self.debug:
            excname = cufunc.name + "__errcode__"
            excmem, excsz = cufunc.module.get_global_symbol(excname)
            assert excsz == ctypes.sizeof(ctypes.c_int)
            excval = ctypes.c_int()
            excmem.memset(0, stream=stream)

        # Prepare arguments
        retr = []  # hold functors for writeback

        kernelargs = []
        for t, v in zip(self.argument_types, args):
            self._prepare_args(t, v, stream, retr, kernelargs)

        if driver.USE_NV_BINDING:
            zero_stream = driver.binding.CUstream(0)
        else:
            zero_stream = None

        stream_handle = stream and stream.handle or zero_stream

        # Invoke kernel
        driver.launch_kernel(cufunc.handle,
                             *griddim,
                             *blockdim,
                             sharedmem,
                             stream_handle,
                             kernelargs,
                             cooperative=self.cooperative)

        if self.debug:
            driver.device_to_host(ctypes.addressof(excval), excmem, excsz)
            if excval.value != 0:
                # An error occurred
                def load_symbol(name):
                    mem, sz = cufunc.module.get_global_symbol(
                        "%s__%s__" % (cufunc.name, name))
                    val = ctypes.c_int()
                    driver.device_to_host(ctypes.addressof(val), mem, sz)
                    return val.value

                tid = [load_symbol("tid" + i) for i in 'zyx']
                ctaid = [load_symbol("ctaid" + i) for i in 'zyx']
                code = excval.value
                exccls, exc_args, loc = self.call_helper.get_exception(code)
                # Prefix the exception message with the source location
                if loc is None:
                    locinfo = ''
                else:
                    sym, filepath, lineno = loc
                    filepath = os.path.abspath(filepath)
                    locinfo = 'In function %r, file %s, line %s, ' % (
                        sym,
                        filepath,
                        lineno,
                    )
                # Prefix the exception message with the thread position
                prefix = "%stid=%s ctaid=%s" % (locinfo, tid, ctaid)
                if exc_args:
                    exc_args = ("%s: %s" % (prefix, exc_args[0]),) + \
                        exc_args[1:]
                else:
                    exc_args = prefix,
                raise exccls(*exc_args)

        # Retrieve auto-converted arrays (writeback to the original host arrays)
        for wb in retr:
            wb()
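A hedged usage sketch of the debug path in launch() above: compiling with debug=True makes the kernel record an error code and the failing thread's tid/ctaid in module globals, which launch() reads back with device_to_host and re-raises on the host. opt=False is assumed here because optimized debug kernels are only partially supported:

    from numba import cuda

    @cuda.jit(debug=True, opt=False)
    def checked(arr):
        i = cuda.grid(1)
        if i >= arr.size:
            # With debug=True this raise sets the __errcode__ global that
            # launch() copies back and turns into a host-side exception.
            raise IndexError("thread index out of range")
        arr[i] += 1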