def test_cuda_driver_basic(self):
    """Launch the ``_Z10helloworldPi`` kernel through ``launch_kernel``.

    A zero-initialised 100-element ``c_int`` buffer is copied to the
    device, the kernel is launched with a (1, 1, 1) grid and a
    (100, 1, 1) block, and the buffer is copied back.  The test passes
    when element ``i`` of the buffer equals ``i``.
    """
    module = self.context.create_module_ptx(self.ptx)
    try:
        function = module.get_function('_Z10helloworldPi')

        array = (c_int * 100)()
        nbytes = sizeof(array)

        memory = self.context.memalloc(nbytes)
        host_to_device(memory, array, nbytes)

        launch_kernel(
            function.handle,  # kernel
            1, 1, 1,          # gx, gy, gz
            100, 1, 1,        # bx, by, bz
            0,                # dynamic shared mem
            0,                # stream
            [memory],         # arguments
        )

        device_to_host(array, memory, nbytes)
        for i, v in enumerate(array):
            self.assertEqual(i, v)
    finally:
        # Unload even when a driver call or an assertion raises, so a
        # failing run does not leave the module loaded in the context.
        module.unload()
def test_cuda_driver_basic(self):
    """Launch the ``_Z10helloworldPi`` kernel through ``launch_kernel``.

    A zero-initialised 100-element ``c_int`` buffer is copied to the
    device, the kernel is launched with a (1, 1, 1) grid and a
    (100, 1, 1) block, and the buffer is copied back.  The test passes
    when element ``i`` of the buffer equals ``i``.

    When ``_driver.USE_NV_BINDING`` is set, the kernel argument is
    re-wrapped as a ``c_void_p`` and the stream as a
    ``_driver.binding.CUstream`` before the launch.
    """
    module = self.context.create_module_ptx(self.ptx)
    try:
        function = module.get_function('_Z10helloworldPi')

        array = (c_int * 100)()
        nbytes = sizeof(array)

        memory = self.context.memalloc(nbytes)
        host_to_device(memory, array, nbytes)

        ptr = memory.device_ctypes_pointer
        stream = 0

        if _driver.USE_NV_BINDING:
            # NOTE(review): presumably the NVIDIA binding needs its own
            # argument/stream types rather than the ctypes ones -- confirm.
            ptr = c_void_p(int(ptr))
            stream = _driver.binding.CUstream(stream)

        launch_kernel(
            function.handle,  # kernel
            1, 1, 1,          # gx, gy, gz
            100, 1, 1,        # bx, by, bz
            0,                # dynamic shared mem
            stream,           # stream
            [ptr],            # arguments
        )

        device_to_host(array, memory, nbytes)
        for i, v in enumerate(array):
            self.assertEqual(i, v)
    finally:
        # Unload even when a driver call or an assertion raises, so a
        # failing run does not leave the module loaded in the context.
        module.unload()
def test_cuda_driver_stream_operations(self):
    """Run the ``_Z10helloworldPi`` kernel on an explicitly created stream.

    Both transfers and the launch are issued against the stream returned
    by ``self.context.create_stream()``; afterwards element ``i`` of the
    host buffer must equal ``i``.
    """
    ptx_module = self.context.create_module_ptx(self.ptx)
    kernel = ptx_module.get_function('_Z10helloworldPi')

    host_buf = (c_int * 100)()
    nbytes = sizeof(host_buf)
    stream = self.context.create_stream()

    grid_dims = (1, 1, 1)     # gx, gy, gz
    block_dims = (100, 1, 1)  # bx, by, bz

    # NOTE(review): auto_synchronize() presumably synchronizes the stream
    # when the block exits, so the host buffer is only read afterwards.
    with stream.auto_synchronize():
        dev_buf = self.context.memalloc(nbytes)
        host_to_device(dev_buf, host_buf, nbytes, stream=stream)
        launch_kernel(
            kernel.handle,
            *grid_dims,
            *block_dims,
            0,              # dynamic shared mem
            stream.handle,  # stream
            [dev_buf],      # arguments
        )
        device_to_host(host_buf, dev_buf, nbytes, stream=stream)

    for index, value in enumerate(host_buf):
        self.assertEqual(index, value)
def copy_to_device(self, ary, stream=0):
    """Copy `ary` to `self`.

    If `ary` is a CUDA memory, perform a device-to-device transfer.
    Otherwise, perform a host-to-device transfer.

    Parameters
    ----------
    ary
        Source array.  Nothing is transferred when ``ary.size`` is 0.
    stream
        Stream for the transfer; it is passed through
        ``self._default_stream`` before use.
    """
    if ary.size == 0:
        # Nothing to do
        return

    sentry_contiguous(self)
    stream = self._default_stream(stream)

    self_core, ary_core = array_core(self), array_core(ary)
    if _driver.is_device_memory(ary):
        sentry_contiguous(ary)
        check_array_compatibility(self_core, ary_core)
        _driver.device_to_device(self, ary, self.alloc_size, stream=stream)
        return

    # Ensure same contiguity.  Only makes a host-side copy if necessary
    # (i.e., in order to materialize a writable strided view).
    order = 'C' if self_core.flags['C_CONTIGUOUS'] else 'F'
    if ary_core.flags['WRITEABLE']:
        # asanyarray copies only when the requested layout needs it, on
        # both NumPy 1.x and 2.x.  np.array(..., copy=False) would raise
        # ValueError on NumPy 2 whenever such a copy is required.
        ary_core = np.asanyarray(ary_core, order=order)
    else:
        # Read-only sources are always copied, as before.
        ary_core = np.array(ary_core, order=order, subok=True, copy=True)
    check_array_compatibility(self_core, ary_core)
    _driver.host_to_device(self, ary_core, self.alloc_size, stream=stream)
def test_d2d(self):
    """Round-trip data host -> device -> device -> host and compare.

    The values go into a first device allocation, are copied to a second
    one with ``driver.device_to_device``, and are read back from the
    second allocation into a separate host array.
    """
    source = np.arange(100, dtype=np.uint32)
    result = np.empty_like(source)
    nbytes = source.nbytes

    first = self.context.memalloc(nbytes)
    second = self.context.memalloc(nbytes)

    driver.host_to_device(first, source, nbytes)
    driver.device_to_device(second, first, nbytes)
    driver.device_to_host(result, second, nbytes)

    elementwise_equal = source == result
    self.assertTrue(np.all(elementwise_equal))
def test_memcpy(self):
    """Round-trip an array host -> device -> host and compare.

    The destination host array starts out as zeros, so the final
    comparison can only succeed if the data really travelled through
    device memory.
    """
    hstary = np.arange(100, dtype=np.uint32)
    # Must differ from ``hstary`` before the copy: a destination that is
    # pre-filled with the expected values would let the test pass even
    # if neither transfer moved any data.
    hstary2 = np.zeros_like(hstary)
    sz = hstary.size * hstary.dtype.itemsize
    devary = self.context.memalloc(sz)

    driver.host_to_device(devary, hstary, sz)
    driver.device_to_host(hstary2, devary, sz)

    self.assertTrue(np.all(hstary == hstary2))
def test_cuda_driver_basic(self):
    """Run the ``_Z10helloworldPi`` kernel via a configured function.

    A zero-initialised 100-element ``c_int`` buffer is copied to the
    device, the kernel is invoked on it, and the buffer is copied back.
    The test passes when element ``i`` of the buffer equals ``i``.
    """
    module = self.context.create_module_ptx(self.ptx)
    try:
        function = module.get_function('_Z10helloworldPi')

        array = (c_int * 100)()
        nbytes = sizeof(array)

        memory = self.context.memalloc(nbytes)
        host_to_device(memory, array, nbytes)

        # NOTE(review): presumably (grid, block) dimensions -- confirm
        # against configure()'s signature.
        function = function.configure((1, ), (100, ))
        function(memory)

        device_to_host(array, memory, nbytes)
        for i, v in enumerate(array):
            self.assertEqual(i, v)
    finally:
        # Unload even when a driver call or an assertion raises, so a
        # failing run does not leave the module loaded in the context.
        module.unload()
def test_cuda_driver_basic(self):
    """Run the ``_Z10helloworldPi`` kernel via a configured function.

    The kernel operates on a 100-element ``c_int`` buffer that is copied
    to the device beforehand and copied back afterwards; each element of
    the result is expected to equal its own index.
    """
    module = self.context.create_module_ptx(self.ptx)
    try:
        function = module.get_function('_Z10helloworldPi')

        array = (c_int * 100)()
        size = sizeof(array)

        memory = self.context.memalloc(size)
        host_to_device(memory, array, size)

        # NOTE(review): presumably (grid, block) dimensions -- confirm
        # against configure()'s signature.
        function = function.configure((1,), (100,))
        function(memory)

        device_to_host(array, memory, size)
        for i, v in enumerate(array):
            self.assertEqual(i, v)
    finally:
        # Always release the module, including when the launch, a copy
        # or one of the assertions above raises.
        module.unload()
def test_cuda_driver_stream(self):
    """Run the ``_Z10helloworldPi`` kernel on an explicitly created stream.

    The transfers and the configured kernel all receive the stream from
    ``self.context.create_stream()``; afterwards element ``i`` of the
    host buffer must equal ``i``.
    """
    ptx_module = self.context.create_module_ptx(self.ptx)
    kernel = ptx_module.get_function('_Z10helloworldPi')

    host_buf = (c_int * 100)()
    nbytes = sizeof(host_buf)
    stream = self.context.create_stream()

    # NOTE(review): auto_synchronize() presumably synchronizes the stream
    # when the block exits, so the host buffer is only read afterwards.
    with stream.auto_synchronize():
        dev_buf = self.context.memalloc(nbytes)
        host_to_device(dev_buf, host_buf, nbytes, stream=stream)

        configured = kernel.configure((1, ), (100, ), stream=stream)
        configured(dev_buf)

        device_to_host(host_buf, dev_buf, nbytes, stream=stream)

    for index, value in enumerate(host_buf):
        self.assertEqual(index, value)
def test_cuda_driver_stream(self):
    """Exercise kernel launch and copies bound to a non-default stream.

    A stream is created from the test context and handed to both memory
    transfers and to ``configure``; the result buffer is then checked
    element by element against its indices.
    """
    stream = self.context.create_stream()
    loaded = self.context.create_module_ptx(self.ptx)
    hello = loaded.get_function('_Z10helloworldPi')

    buf = (c_int * 100)()
    buf_size = sizeof(buf)

    # NOTE(review): the context manager presumably waits for the stream
    # on exit; the assertions below rely on the copy-back being finished.
    with stream.auto_synchronize():
        device_mem = self.context.memalloc(buf_size)
        host_to_device(device_mem, buf, buf_size, stream=stream)

        launch = hello.configure((1,), (100,), stream=stream)
        launch(device_mem)

        device_to_host(buf, device_mem, buf_size, stream=stream)

    for position, element in enumerate(buf):
        self.assertEqual(position, element)