def pairwise_difference(self, in_gpu, N):
    """Run the kernel built by ``self.pairwised`` into a new N x N GPU array.

    Parameters
    ----------
    in_gpu : GPU array
        Input buffer; its ``gpudata`` pointer is passed to the kernel and
        its dtype is reused for the output array.
        (presumably holds N elements -- TODO confirm against the kernel)
    N : int
        Side length of the square output array.

    Returns
    -------
    GPU array
        The ``(N, N)`` array allocated here and written by the kernel.
    """
    result = gpuarray.empty((N, N), in_gpu.dtype)
    launch_block, launch_grid = misc.select_block_grid_sizes(
        self.device, (N, N))

    # 8-byte elements pick the 'double' kernel variant; any other element
    # size falls back to 'float'.
    c_type = 'double' if in_gpu.dtype.itemsize == 8 else 'float'

    kernel = self.pairwised(N, c_type)
    kernel(
        result.gpudata,
        in_gpu.gpudata,
        np.uint32(result.size),  # total element count of the output
        block=launch_block,
        grid=launch_grid,
    )
    return result
def gamma_exponential_correlation_cuda(self, image, l, y):
    """Fill an N x N GPU matrix using the kernel from ``self.gamma_exp_kernel``.

    Parameters
    ----------
    image : pycuda.gpuarray.GPUArray or 2-D host array
        If already a GPUArray it is used as-is and ``N`` is its first
        dimension.  Otherwise it is treated as a 2-D host array, flattened
        to an ``(N, 1)`` column with ``N = rows * cols`` and uploaded.
    l, y : scalar
        Packed, in that order, into a two-element parameter array (cast to
        the image dtype) that is handed to the kernel.
        (presumably the length scale and exponent of the gamma-exponential
        model -- TODO confirm against the kernel source)

    Returns
    -------
    pycuda.gpuarray.GPUArray
        The ``(N, N)`` array, with the image dtype, written by the kernel.
    """
    if isinstance(image, pycuda.gpuarray.GPUArray):
        N = image.shape[0]
        image_g = image
    else:
        N = image.shape[0] * image.shape[1]
        image = image.ravel().reshape(N, 1)
        image_g = gpuarray.to_gpu(image)

    # Kernel parameters share the image dtype so that the compiled
    # 'float'/'double' variant reads them with the matching element size.
    params = gpuarray.to_gpu(np.array([l, y]).astype(image.dtype))

    C = gpuarray.empty((N, N), image_g.dtype)
    block_dim, grid_dim = misc.select_block_grid_sizes(self.device, (N, N))

    # 8-byte elements pick the 'double' kernel variant; any other element
    # size falls back to 'float'.
    dtype = 'double' if image.dtype.itemsize == 8 else 'float'

    kernel_func = self.gamma_exp_kernel(N, dtype)
    kernel_func(C.gpudata, image_g.gpudata, params.gpudata,
                np.uint64(C.size),
                block=block_dim, grid=grid_dim)

    # C was allocated in this call and is referenced nowhere else, so it is
    # returned directly.  The previous ``C.copy()`` allocated a second N x N
    # device buffer and performed a device-to-device copy for no benefit.
    return C
unsigned int a = idx/(${B}*${C}*${D}); unsigned int b = (idx%(${B}*${C}*${D}))/(${C}*${D}); unsigned int c = ((idx%(${B}*${C}*${D}))%(${C}*${D}))/${D}; unsigned int d = ((idx%(${B}*${C}*${D}))%(${C}*${D}))%${D}; // Use the subscripts to access the array: if (idx < N) { if (c == 0) x[INDEX(a,b,c,d)] = 100; } } """) max_threads_per_block, max_block_dim, max_grid_dim = misc.get_dev_attrs( pycuda.autoinit.device) block_dim, grid_dim = misc.select_block_grid_sizes(pycuda.autoinit.device, x.shape) max_blocks_per_grid = max(max_grid_dim) func_mod = \ SourceModule(func_mod_template.substitute(max_threads_per_block=max_threads_per_block, max_blocks_per_grid=max_blocks_per_grid, A=A, B=B, C=C, D=D)) func = func_mod.get_function('func') x_gpu = gpuarray.to_gpu(x) func(x_gpu.gpudata, np.uint32(x_gpu.size), block=block_dim, grid=grid_dim) x_np = x.copy() x_np[:, :, 0, :] = 100 print 'Success status: ', np.allclose(x_np, x_gpu.get())
// Convert the linear index to subscripts: unsigned int a = idx/(${B}*${C}); unsigned int b = (idx%(${B}*${C}))/${C}; unsigned int c = (idx%(${B}*${C}))%${C}; // Use the subscripts to access the array: if (idx < N) { if (b == 0) x[INDEX(a,b,c)] = 100; } } """) max_threads_per_block, max_block_dim, max_grid_dim = misc.get_dev_attrs(pycuda.autoinit.device) block_dim, grid_dim = misc.select_block_grid_sizes(pycuda.autoinit.device, x.shape) max_blocks_per_grid = max(max_grid_dim) func_mod = \ SourceModule(func_mod_template.substitute(max_threads_per_block=max_threads_per_block, max_blocks_per_grid=max_blocks_per_grid, A=A, B=B, C=C)) func = func_mod.get_function('func') x_gpu = gpuarray.to_gpu(x) func(x_gpu.gpudata, np.uint32(x_gpu.size), block=block_dim, grid=grid_dim) x_np = x.copy() x_np[:, 0, :] = 100 print('Success status: ', np.allclose(x_np, x_gpu.get()))