Example No. 1
    def __init__(self,
                 shape,
                 strides,
                 dtype,
                 stream=0,
                 writeback=None,
                 gpu_data=None):
        """
        Args
        ----

        shape
            array shape.
        strides
            array strides.
        dtype
            data type as np.dtype coercible object.
        stream
            cuda stream.
        writeback
            Deprecated.
        gpu_data
            user provided device memory for the ndarray data buffer
        """
        if isinstance(shape, int):
            shape = (shape, )
        if isinstance(strides, int):
            strides = (strides, )
        dtype = np.dtype(dtype)
        self.ndim = len(shape)
        if len(strides) != self.ndim:
            raise ValueError('strides do not match ndim')
        self._dummy = dummyarray.Array.from_desc(0, shape, strides,
                                                 dtype.itemsize)
        self.shape = tuple(shape)
        self.strides = tuple(strides)
        self.dtype = dtype
        self.size = int(functools.reduce(operator.mul, self.shape, 1))
        # prepare gpu memory
        if self.size > 0:
            if gpu_data is None:
                self.alloc_size = _driver.memory_size_from_info(
                    self.shape, self.strides, self.dtype.itemsize)
                gpu_data = devices.get_context().memalloc(self.alloc_size)
            else:
                self.alloc_size = _driver.device_memory_size(gpu_data)
        else:
            # Make NULL pointer for empty allocation
            gpu_data = _driver.MemoryPointer(context=devices.get_context(),
                                             pointer=c_void_p(0),
                                             size=0)
            self.alloc_size = 0

        self.gpu_data = gpu_data

        self.__writeback = writeback  # should deprecate the use of this
        self.stream = stream
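
For context, here is a minimal usage sketch of the public numba.cuda API that ends up constructing a DeviceNDArray like the one above (assumes Numba and a CUDA-capable GPU are available):

import numpy as np
from numba import cuda

# cuda.device_array forwards shape/strides/dtype to the constructor above and
# lets it allocate gpu_data from the current context.
d_arr = cuda.device_array((4, 3), dtype=np.float32)
print(d_arr.shape, d_arr.strides, d_arr.dtype, d_arr.alloc_size)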
Example No. 2
    def open(self):
        """
        Returns a new *DeviceNDArray* that shares the allocation from the
        original process.  It must not be used in the original process.
        """
        dptr = self._ipc_handle.open(devices.get_context())
        return DeviceNDArray(gpu_data=dptr, **self._array_desc)
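
A hedged sketch of the consumer side: a child process receives the serialized handle, opens it to view the parent's allocation, and releases it afterwards (Linux only; assumes IpcArrayHandle also exposes close(), as in recent Numba versions):

import pickle

def read_shared_array(handle_bytes):
    # Must run in a different process from the one that created the handle.
    handle = pickle.loads(handle_bytes)
    darr = handle.open()             # DeviceNDArray backed by the parent's memory
    try:
        return darr.copy_to_host()
    finally:
        handle.close()               # drop the imported IPC mapping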
Example No. 3
    def _do_setitem(self, key, value, stream=0):

        stream = self._default_stream(stream)

        # If the record didn't have a default stream, and the user didn't
        # provide a stream, then we will use the default stream for the
        # assignment kernel and synchronize on it.
        synchronous = not stream
        if synchronous:
            ctx = devices.get_context()
            stream = ctx.get_default_stream()

        # (1) prepare LHS

        typ, offset = self.dtype.fields[key]
        newdata = self.gpu_data.view(offset)

        lhs = type(self)(dtype=typ, stream=stream, gpu_data=newdata)

        # (2) prepare RHS

        rhs, _ = auto_device(lhs.dtype.type(value), stream=stream)

        # (3) do the copy

        _driver.device_to_device(lhs, rhs, rhs.dtype.itemsize, stream)

        if synchronous:
            stream.synchronize()
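
A minimal sketch of the public behaviour this backs: assigning to a field of a device record from the host (assumes a recent Numba version in which DeviceRecord supports item assignment):

import numpy as np
from numba import cuda

rec_dtype = np.dtype([('x', np.float64), ('y', np.int32)])
host_rec = np.zeros(1, dtype=rec_dtype)[0]   # a NumPy structured scalar
d_rec = cuda.to_device(host_rec)             # DeviceRecord on the device
d_rec['x'] = 3.14                            # routed through _do_setitem above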
Example No. 4
def ndarray_device_allocate_data(ary):
    """
    Allocate gpu data buffer
    """
    datasize = driver.host_memory_size(ary)
    # allocate
    gpu_data = devices.get_context().memalloc(datasize)
    return gpu_data
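
A short usage sketch of the helper above: allocate a device buffer sized to hold an existing host array (assumes an active CUDA context; the result is a MemoryPointer owned by that context):

import numpy as np

host = np.arange(1024, dtype=np.float32)
gpu_buf = ndarray_device_allocate_data(host)   # buffer large enough for host's data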
Example No. 5
    def setUp(self):
        self.assertTrue(len(devices.gpus) > 0)
        self.context = devices.get_context()
        device = self.context.device
        ccmajor, _ = device.compute_capability
        if ccmajor >= 2:
            self.ptx = ptx2
        else:
            self.ptx = ptx1
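
The same query is available through the public API; a short sketch, assuming an active CUDA context:

from numba import cuda

cc_major, cc_minor = cuda.get_current_device().compute_capability
print(f"compute capability {cc_major}.{cc_minor}")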
Example No. 6
    def get_ipc_handle(self):
        """
        Returns an *IpcArrayHandle* object that is safe to serialize and transfer
        to another process to share the local allocation.

        Note: this feature is only available on Linux.
        """
        ipch = devices.get_context().get_ipc_handle(self.gpu_data)
        desc = dict(shape=self.shape, strides=self.strides, dtype=self.dtype)
        return IpcArrayHandle(ipc_handle=ipch, array_desc=desc)
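
A hedged sketch of the producer side on Linux: create the handle for a device array and serialize it for another process (pairs with the open() example earlier):

import pickle
import numpy as np
from numba import cuda

d_arr = cuda.to_device(np.arange(16, dtype=np.float64))
handle = d_arr.get_ipc_handle()        # IpcArrayHandle for the allocation
handle_bytes = pickle.dumps(handle)    # send these bytes to the other process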
Example No. 7
    def _do_setitem(self, key, value, stream=0):

        stream = self._default_stream(stream)

        # If the array didn't have a default stream, and the user didn't provide
        # a stream, then we will use the default stream for the assignment
        # kernel and synchronize on it.
        synchronous = not stream
        if synchronous:
            ctx = devices.get_context()
            stream = ctx.get_default_stream()

        # (1) prepare LHS

        arr = self._dummy.__getitem__(key)
        newdata = self.gpu_data.view(*arr.extent)

        if isinstance(arr, dummyarray.Element):
            # convert to a 0d array
            shape = ()
            strides = ()
        else:
            shape = arr.shape
            strides = arr.strides

        lhs = type(self)(
            shape=shape,
            strides=strides,
            dtype=self.dtype,
            gpu_data=newdata,
            stream=stream)

        # (2) prepare RHS

        rhs, _ = auto_device(value, stream=stream, user_explicit=True)
        if rhs.ndim > lhs.ndim:
            raise ValueError("Can't assign %s-D array to %s-D self" % (
                rhs.ndim,
                lhs.ndim))
        rhs_shape = np.ones(lhs.ndim, dtype=np.int64)
        # negative indices would not work if rhs.ndim == 0
        rhs_shape[lhs.ndim - rhs.ndim:] = rhs.shape
        rhs = rhs.reshape(*rhs_shape)
        for i, (l, r) in enumerate(zip(lhs.shape, rhs.shape)):
            if r != 1 and l != r:
                raise ValueError("Can't copy sequence with size %d to array "
                                 "axis %d with dimension %d" % ( r, i, l))

        # (3) do the copy

        n_elements = functools.reduce(operator.mul, lhs.shape, 1)
        _assign_kernel(lhs.ndim).forall(n_elements, stream=stream)(lhs, rhs)
        if synchronous:
            stream.synchronize()
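
A minimal sketch of the public behaviour this implements: broadcasting host values into a device array with __setitem__, optionally on an explicit stream via the setitem wrapper (assumes a recent Numba version with device-array assignment support):

import numpy as np
from numba import cuda

d = cuda.to_device(np.zeros((3, 4), dtype=np.float32))
d[0, 2] = 1.5     # single-element assignment
d[1] = 7.0        # broadcast a scalar along row 1

stream = cuda.stream()
d.setitem(2, np.arange(4, dtype=np.float32), stream=stream)  # stream-aware variant
stream.synchronize()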
Example No. 8
    def __init__(self, dispatcher, griddim, blockdim, stream, sharedmem):
        self.dispatcher = dispatcher
        self.griddim = griddim
        self.blockdim = blockdim
        self.stream = stream
        self.sharedmem = sharedmem

        if config.CUDA_LOW_OCCUPANCY_WARNINGS:
            ctx = get_context()
            smcount = ctx.device.MULTIPROCESSOR_COUNT
            grid_size = griddim[0] * griddim[1] * griddim[2]
            if grid_size < 2 * smcount:
                msg = ("Grid size ({grid}) < 2 * SM count ({sm}) "
                       "will likely result in GPU under utilization due "
                       "to low occupancy.")
                msg = msg.format(grid=grid_size, sm=2 * smcount)
                warn(NumbaPerformanceWarning(msg))
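
A sketch of how a caller can size the grid to stay clear of this warning, using the same device attribute (the sizing heuristic here is illustrative):

from numba import cuda

n_items = 1_000_000
threads_per_block = 256
sm_count = cuda.get_current_device().MULTIPROCESSOR_COUNT
min_blocks = 2 * sm_count                      # threshold used by the warning above
blocks = max(min_blocks, (n_items + threads_per_block - 1) // threads_per_block)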
Example No. 9
    def _compute_thread_per_block(self, kernel):
        tpb = self.thread_per_block
        # Prefer user-specified config
        if tpb != 0:
            return tpb
        # Else, ask the driver to give a good config
        else:
            ctx = get_context()
            # Kernel is specialized, so there's only one definition - get it so
            # we can get the cufunc from the code library
            defn = next(iter(kernel.overloads.values()))
            kwargs = dict(
                func=defn._codelibrary.get_cufunc(),
                b2d_func=0,  # dynamic shared memory is constant w.r.t. block size
                memsize=self.sharedmem,
                blocksizelimit=1024,
            )
            _, tpb = ctx.get_max_potential_block_size(**kwargs)
            return tpb
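
This path is exercised by the public forall launcher; a short sketch in which no block size is given, so the driver picks one via get_max_potential_block_size:

import numpy as np
from numba import cuda

@cuda.jit
def inc_one(x):
    i = cuda.grid(1)
    if i < x.size:
        x[i] += 1

d_x = cuda.to_device(np.zeros(1_000_000, dtype=np.float32))
inc_one.forall(d_x.size)(d_x)   # thread_per_block left at its default of 0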
Example No. 10
    def test_device_get_uuid(self):
        # A device UUID looks like:
        #
        #     GPU-e6489c45-5b68-3b03-bab7-0e7c8e809643
        #
        # To test, we construct an RE that matches this form and verify that
        # the returned UUID matches.
        #
        # Device UUIDs may not conform to parts of the UUID specification (RFC
        # 4122) pertaining to versions and variants, so we do not extract and
        # validate the values of these bits.

        h = '[0-9a-f]{%d}'
        h4 = h % 4
        h8 = h % 8
        h12 = h % 12
        uuid_format = f'^GPU-{h8}-{h4}-{h4}-{h4}-{h12}$'

        dev = devices.get_context().device
        self.assertRegex(dev.uuid, uuid_format)
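
The attribute under test is also reachable from the public device object; a one-line sketch:

from numba import cuda

print(cuda.get_current_device().uuid)   # e.g. 'GPU-e6489c45-5b68-3b03-bab7-0e7c8e809643'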
Example No. 11
    def max_cooperative_grid_blocks(self, blockdim, dynsmemsize=0):
        '''
        Calculates the maximum number of blocks that can be launched for this
        kernel in a cooperative grid in the current context, for the given block
        and dynamic shared memory sizes.

        :param blockdim: Block dimensions, either as a scalar for a 1D block, or
                         a tuple for 2D or 3D blocks.
        :param dynsmemsize: Dynamic shared memory size in bytes.
        :return: The maximum number of blocks in the grid.
        '''
        ctx = get_context()
        cufunc = self._codelibrary.get_cufunc()

        if isinstance(blockdim, tuple):
            blockdim = functools.reduce(lambda x, y: x * y, blockdim)
        active_per_sm = ctx.get_active_blocks_per_multiprocessor(
            cufunc, blockdim, dynsmemsize)
        sm_count = ctx.device.MULTIPROCESSOR_COUNT
        return active_per_sm * sm_count
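
A hedged sketch of calling this from user code, following the documented cooperative-groups pattern: the kernel is compiled eagerly for one signature so a single overload exists (the kernel body and names are illustrative):

from numba import cuda, int32

sig = (int32[::1],)

@cuda.jit(sig)
def coop_kernel(x):
    g = cuda.cg.this_grid()
    i = cuda.grid(1)
    if i < x.size:
        x[i] = i
    g.sync()

blockdim = 256
overload = coop_kernel.overloads[sig]
max_blocks = overload.max_cooperative_grid_blocks(blockdim)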
Example No. 12
def cc_X_or_above(major, minor):
    if not config.ENABLE_CUDASIM:
        cc = devices.get_context().device.compute_capability
        return cc >= (major, minor)
    else:
        return True
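
A hedged sketch of how such a helper is typically used: guarding tests that need a minimum compute capability (the decorator and test names are illustrative):

import unittest

skip_unless_cc_60 = unittest.skipUnless(cc_X_or_above(6, 0),
                                        "requires compute capability 6.0 or above")

class TestDoubleAtomics(unittest.TestCase):
    @skip_unless_cc_60
    def test_atomic_add_double(self):
        ...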
Example No. 13
    def setUp(self):
        super().setUp()
        self.context = devices.get_context()
Example No. 14
    def setUp(self):
        self.context = devices.get_context()