def copy_to_device(self, ary, stream=0):
    """Copy `ary` to `self`.

    If `ary` is a CUDA memory, perform a device-to-device transfer.
    Otherwise, perform a host-to-device transfer.

    :param ary: source array; either device memory or a host array-like.
    :param stream: CUDA stream for the transfer; 0 selects the default
        stream (resolved via ``self._default_stream``).
    """
    if ary.size == 0:
        # Nothing to do
        return

    # The destination must be contiguous for a flat memcpy of alloc_size.
    sentry_contiguous(self)
    stream = self._default_stream(stream)

    # Work on the underlying cores so views/strided wrappers are handled
    # uniformly for the compatibility check below.
    self_core, ary_core = array_core(self), array_core(ary)
    if _driver.is_device_memory(ary):
        # Device source: must also be contiguous, then copy D2D.
        sentry_contiguous(ary)
        check_array_compatibility(self_core, ary_core)
        _driver.device_to_device(self, ary, self.alloc_size, stream=stream)
    else:
        # Ensure same contiguity. Only makes a host-side copy if necessary
        # (i.e., in order to materialize a writable strided view)
        ary_core = np.array(
            ary_core,
            order='C' if self_core.flags['C_CONTIGUOUS'] else 'F',
            subok=True,
            copy=not ary_core.flags['WRITEABLE'])
        check_array_compatibility(self_core, ary_core)
        _driver.host_to_device(self, ary_core, self.alloc_size,
                               stream=stream)
def auto_device(obj, stream=0, copy=True):
    """
    Create a DeviceRecord or DeviceArray like obj and optionally copy data from
    host to device. If obj already represents device memory, it is returned and
    no copy is made.

    :param obj: object to place on the device (device memory, an object
        exposing ``__cuda_array_interface__``, a NumPy scalar record, an
        ndarray, or anything coercible to an ndarray).
    :param stream: CUDA stream used for allocation/copy; 0 selects default.
    :param copy: when True, copy host data into the new device object.
    :return: 2-tuple ``(devobj, new_alloc)`` where ``new_alloc`` is True iff
        a new device allocation was made here.
    """
    if _driver.is_device_memory(obj):
        # Already device memory - return as-is, no new allocation.
        return obj, False
    elif hasattr(obj, '__cuda_array_interface__'):
        # Foreign device array (e.g. another CUDA library) - wrap, no copy.
        return numba.cuda.as_cuda_array(obj), False
    else:
        if isinstance(obj, np.void):
            devobj = from_record_like(obj, stream=stream)
        else:
            # This allows you to pass non-array objects like constants and
            # objects implementing the array interface
            # https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.interface.html
            # into this function (with no overhead -- copies -- for `obj`s
            # that are already `ndarray`s.
            #
            # np.asanyarray avoids a copy where possible and preserves
            # ndarray subclasses; it replaces np.array(obj, copy=False,
            # subok=True), which raises ValueError under NumPy >= 2.0
            # whenever a copy would actually be required.
            obj = np.asanyarray(obj)
            sentry_contiguous(obj)
            devobj = from_array_like(obj, stream=stream)
        if copy:
            devobj.copy_to_device(obj, stream=stream)
        return devobj, True
def auto_device(obj, stream=0, copy=True, user_explicit=False):
    """
    Create a DeviceRecord or DeviceArray like obj and optionally copy data from
    host to device. If obj already represents device memory, it is returned and
    no copy is made.

    :param obj: object to place on the device (device memory, an object
        exposing ``__cuda_array_interface__``, a NumPy scalar record, an
        ndarray, or anything coercible to an ndarray).
    :param stream: CUDA stream used for allocation/copy; 0 selects default.
    :param copy: when True, copy host data into the new device object.
    :param user_explicit: when True, suppress the implicit-copy performance
        warning (the user deliberately requested the transfer).
    :return: 2-tuple ``(devobj, new_alloc)`` where ``new_alloc`` is True iff
        a new device allocation was made here.
    """
    if _driver.is_device_memory(obj):
        # Already device memory - return as-is, no new allocation.
        return obj, False
    elif hasattr(obj, '__cuda_array_interface__'):
        # Foreign device array (e.g. another CUDA library) - wrap, no copy.
        return numba.cuda.as_cuda_array(obj), False
    else:
        if isinstance(obj, np.void):
            devobj = from_record_like(obj, stream=stream)
        else:
            # This allows you to pass non-array objects like constants and
            # objects implementing the array interface
            # https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.interface.html
            # into this function (with no overhead -- copies -- for `obj`s
            # that are already `ndarray`s.
            #
            # np.asanyarray avoids a copy where possible and preserves
            # ndarray subclasses; it replaces np.array(obj, copy=False,
            # subok=True), which raises ValueError under NumPy >= 2.0
            # whenever a copy would actually be required.
            obj = np.asanyarray(obj)
            sentry_contiguous(obj)
            devobj = from_array_like(obj, stream=stream)
        if copy:
            if config.CUDA_WARN_ON_IMPLICIT_COPY:
                # Warn only for genuine host ndarrays copied implicitly
                # (not for device arrays, and not when the user asked).
                if (not user_explicit and
                        (not isinstance(obj, DeviceNDArray)
                         and isinstance(obj, np.ndarray))):
                    msg = ("Host array used in CUDA kernel will incur "
                           "copy overhead to/from device.")
                    warn(NumbaPerformanceWarning(msg))
            devobj.copy_to_device(obj, stream=stream)
        return devobj, True
def _template(self, obj):
    """Assert that `obj` is device memory whose ctypes pointer has the
    class appropriate for the active driver binding."""
    self.assertTrue(driver.is_device_memory(obj))
    driver.require_device_memory(obj)
    # The pointer class depends on whether the NVIDIA CUDA Python binding
    # or Numba's own ctypes binding is in use.
    if driver.USE_NV_BINDING:
        expected_class = driver.binding.CUdeviceptr
    else:
        expected_class = drvapi.cu_device_ptr
    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)) which only reports "False is not true".
    self.assertIsInstance(obj.device_ctypes_pointer, expected_class)
def _get_params(self):
    """Build and return the KernelParams structure describing this kernel
    launch (grid/block dims, shared memory, function handle and marshalled
    kernel arguments) for use as a CUDA graph kernel node.
    """
    params = KernelParams()
    # Launch configuration: default every dimension to 1 and shared memory
    # to 0 when not supplied by the caller.
    params.blockDimX = self.params.get('blockDimX', 1)
    params.blockDimY = self.params.get('blockDimY', 1)
    params.blockDimZ = self.params.get('blockDimZ', 1)
    params.gridDimX = self.params.get('gridDimX', 1)
    params.gridDimY = self.params.get('gridDimY', 1)
    params.gridDimZ = self.params.get('gridDimZ', 1)
    params.sharedMemBytes = self.params.get('sharedMemBytes', 0)
    # Resolve the concrete kernel: an auto-jit kernel is specialized for
    # the actual argument types; an already-compiled kernel is used as-is.
    if isinstance(self.kernel, AutoJitCUDAKernel):
        kernel = self.kernel.specialize(*self.args)
    elif isinstance(self.kernel, CUDAKernel):
        kernel = self.kernel
    else:
        raise Exception('invalid kernel type "%s"' %
                        type(self.kernel).__name__)
    params.func = kernel._func.get().handle
    # Marshal each Python argument into its low-level representation.
    # `retr` collects post-launch writeback callbacks (host arrays).
    retr, kernel_args = [], []
    for t, v in zip(kernel.argument_types, self.args):
        kernel._prepare_args(t, v, 0, retr, kernel_args)
    # TODO: take care of retr after graph launched
    if len(retr):
        raise Exception('host array as kernel node args not supported yet')
    # The driver expects an array of pointers to the argument values;
    # device memory contributes the address of its device pointer.
    param_vals = []
    for arg in kernel_args:
        if is_device_memory(arg):
            param_vals.append(addressof(device_ctypes_pointer(arg)))
        else:
            param_vals.append(addressof(arg))
    # NULL kernelParams is used for kernels taking no arguments.
    params.kernelParams = (c_void_p * len(param_vals))(
        *param_vals) if len(param_vals) else None
    params.extra = self.params.get('extra', None)
    return params
def _template(self, obj):
    """Assert that `obj` is device memory exposing a ctypes
    ``cu_device_ptr`` pointer."""
    self.assertTrue(driver.is_device_memory(obj))
    driver.require_device_memory(obj)
    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)) which only reports "False is not true".
    self.assertIsInstance(obj.device_ctypes_pointer,
                          drvapi.cu_device_ptr)
return args # Create the list of arguments - we compiled for float32[:], int32, float32[:], # float32[:] args = [] args += make_array_args(d_r) args += [ctypes.c_int(13)] args += make_array_args(d_x) args += make_array_args(d_y) # Make a list of pointers to the arguments param_vals = [] for arg in args: if is_device_memory(arg): param_vals.append(addressof(device_ctypes_pointer(arg))) else: param_vals.append(addressof(arg)) params = (c_void_p * len(param_vals))(*param_vals) # (see cudadrv.launch_kernel) # CUresult cuLaunchKernel(CUfunction f, # unsigned int gridDimX, # unsigned int gridDimY, # unsigned int gridDimZ, # unsigned int blockDimX, # unsigned int blockDimY, # unsigned int blockDimZ,