def to_tex3d(data):
    """
    Source: https://wiki.tiker.net/PyCUDA/Examples/Demo3DSurface
    """
    d, h, w = data.shape

    descr = cuda.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = cuda.dtype_to_array_format(data.dtype)
    descr.num_channels = 1
    descr.flags = 0

    if isinstance(data, gpuarray.GPUArray):
        data = data.get()

    device_array = cuda.Array(descr)

    copy = cuda.Memcpy3D()
    copy.set_src_host(data)
    copy.set_dst_array(device_array)
    copy.width_in_bytes = copy.src_pitch = data.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d
    copy()

    return device_array
def create_3d_texture(a, module, variable, point_sampling=False):
    a = numpy.asfortranarray(a)
    w, h, d = a.shape

    descr = cuda.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = cuda.dtype_to_array_format(a.dtype)
    descr.num_channels = 1
    descr.flags = 0
    ary = cuda.Array(descr)

    copy = cuda.Memcpy3D()
    copy.set_src_host(a)
    copy.set_dst_array(ary)
    copy.width_in_bytes = copy.src_pitch = a.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d
    copy()

    out_texref = module.get_texref(variable)
    out_texref.set_array(ary)
    if point_sampling:
        out_texref.set_filter_mode(cuda.filter_mode.POINT)
    else:
        out_texref.set_filter_mode(cuda.filter_mode.LINEAR)
    return out_texref
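# Hedged usage sketch for create_3d_texture (not from the original source).
# The kernel, the texture name "vol_tex" and the array size are illustrative
# assumptions; the pattern mirrors the test_3d_texture example further below.
import numpy
import pycuda.autoinit  # noqa: F401  -- creates a context
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
texture<float, 3, cudaReadModeElementType> vol_tex;

__global__ void sample_volume(float *dest)
{
    int x = threadIdx.x, y = threadIdx.y, z = threadIdx.z;
    int i = (z * blockDim.y + y) * blockDim.x + x;
    dest[i] = tex3D(vol_tex, x, y, z);
}
""")

vol = numpy.random.rand(2, 3, 4).astype(numpy.float32)  # interpreted as (w, h, d)
texref = create_3d_texture(vol, mod, "vol_tex", point_sampling=True)

dest = numpy.zeros(vol.shape, dtype=numpy.float32, order="F")
mod.get_function("sample_volume")(cuda.Out(dest), block=vol.shape, texrefs=[texref])
assert numpy.allclose(dest, vol)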
def copyTexture(self, data, pixelSize=None, extent=None, isocenter=None):
    # Convert data to a float32 array in contiguous (C) ordering.
    self.arrIn = np.array(data, dtype=np.float32, order='C')
    d, h, w = self.arrIn.shape

    descr = cuda.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = cuda.dtype_to_array_format(self.arrIn.dtype)
    descr.num_channels = 1
    # descr.flags = 0
    self.gpuTexture = cuda.Array(descr)

    # Copy the host data across into the 3D CUDA array on the GPU.
    copy = cuda.Memcpy3D()
    copy.set_src_host(self.arrIn)
    copy.set_dst_array(self.gpuTexture)
    copy.src_pitch = self.arrIn.strides[1]
    copy.width_in_bytes = self.arrIn.strides[1]
    copy.height = h
    copy.depth = d
    copy.src_height = h
    copy()

    self.pixelSize = pixelSize
    self.isocenter = isocenter
    # Extent [l, r, b, t, f, b]
    self.extent = extent
    # Trigger for setting bottom left corner as 0,0,0.
    self.zeroExtent = False
def numpy3d_to_array(np_array, allow_surface_bind=False, layered=True):
    d, h, w = np_array.shape

    descr = cuda.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = cuda.dtype_to_array_format(np_array.dtype)
    descr.num_channels = 1
    descr.flags = 0
    if allow_surface_bind:
        descr.flags = cuda.array3d_flags.SURFACE_LDST
    if layered:
        # note: this assignment replaces SURFACE_LDST when both options are requested
        descr.flags = cuda.array3d_flags.ARRAY3D_LAYERED

    device_array = cuda.Array(descr)

    copy = cuda.Memcpy3D()
    copy.set_src_host(np_array)
    copy.set_dst_array(device_array)
    copy.width_in_bytes = copy.src_pitch = np_array.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d
    copy()

    return device_array
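# Hedged sketch of how an array created with allow_surface_bind=True might be
# bound to a surface reference and written from a kernel. The kernel and the
# surface name "out_surf" are illustrative assumptions, not part of the source.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
surface<void, cudaSurfaceType3D> out_surf;

__global__ void fill()
{
    int x = threadIdx.x, y = threadIdx.y, z = threadIdx.z;
    surf3Dwrite(1.0f, out_surf, x * sizeof(float), y, z);
}
""")

data = np.zeros((4, 4, 4), dtype=np.float32)
dev_arr = numpy3d_to_array(data, allow_surface_bind=True, layered=False)
mod.get_surfref("out_surf").set_array(dev_arr)
mod.get_function("fill")(block=(4, 4, 4))  # dev_arr now holds ones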
def gpuArray3DtocudaArray(gpuArray, allowSurfaceBind=False, precision='float'):
    #import pycuda.autoinit
    d, h, w = gpuArray.shape

    descr3D = cuda.ArrayDescriptor3D()
    descr3D.width = w
    descr3D.height = h
    descr3D.depth = d
    if precision == 'float':
        descr3D.format = cuda.dtype_to_array_format(gpuArray.dtype)
        descr3D.num_channels = 1
    elif precision == 'double':
        descr3D.format = cuda.array_format.SIGNED_INT32
        descr3D.num_channels = 2
    else:
        print("ERROR: CUDA_ARRAY incompatible precision")
        sys.exit()
    descr3D.flags = 0
    if allowSurfaceBind:
        descr3D.flags = cuda.array3d_flags.SURFACE_LDST

    cudaArray = cuda.Array(descr3D)

    copy3D = cuda.Memcpy3D()
    copy3D.set_src_device(gpuArray.ptr)
    copy3D.set_dst_array(cudaArray)
    copy3D.width_in_bytes = copy3D.src_pitch = gpuArray.strides[1]
    copy3D.src_height = copy3D.height = h
    copy3D.depth = d
    copy3D()
    return cudaArray, copy3D
def _copy_local_to_global(self, local_array, global_array, dtype=np.float64):
    nz, ny, nx = self.local_dims
    sw = self.stencil_width
    typesize = global_array.dtype.itemsize

    copier = cuda.Memcpy3D()
    copier.set_src_device(local_array.gpudata)
    copier.set_dst_device(global_array.gpudata)

    # offsets
    copier.src_x_in_bytes = sw * typesize
    copier.src_y = sw
    copier.src_z = sw

    copier.src_pitch = local_array.strides[1]
    copier.dst_pitch = global_array.strides[1]
    copier.src_height = ny + 2 * sw
    copier.dst_height = ny

    copier.width_in_bytes = nx * typesize
    copier.height = ny
    copier.depth = nz

    copier()
def numpy3d_to_array(np_array, allow_surface_bind=True):
    import pycuda.autoinit

    d, h, w = np_array.shape

    descr = drv.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = drv.dtype_to_array_format(np_array.dtype)
    descr.num_channels = 1
    descr.flags = 0
    if allow_surface_bind:
        descr.flags = drv.array3d_flags.SURFACE_LDST

    device_array = drv.Array(descr)

    copy = drv.Memcpy3D()
    copy.set_src_host(np_array)
    copy.set_dst_array(device_array)
    copy.width_in_bytes = copy.src_pitch = np_array.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d
    copy()

    return device_array
def array_to_numpy3d(cuda_array):
    import pycuda.autoinit

    descriptor = cuda_array.get_descriptor_3d()

    w = descriptor.width
    h = descriptor.height
    d = descriptor.depth
    shape = d, h, w
    dtype = array_format_to_dtype(descriptor.format)

    numpy_array = np.zeros(shape, dtype)

    copy = drv.Memcpy3D()
    copy.set_src_array(cuda_array)
    copy.set_dst_host(numpy_array)

    itemsize = numpy_array.dtype.itemsize
    copy.width_in_bytes = copy.dst_pitch = w * itemsize
    copy.dst_height = copy.height = h
    copy.depth = d
    copy()

    return numpy_array
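# Hedged round-trip sketch combining numpy3d_to_array and array_to_numpy3d
# (assumes both helpers and array_format_to_dtype live in the same module).
import numpy as np

a = np.arange(2 * 3 * 4, dtype=np.float32).reshape(2, 3, 4)  # (d, h, w)
dev_arr = numpy3d_to_array(a, allow_surface_bind=False)      # host -> cuda.Array
b = array_to_numpy3d(dev_arr)                                # cuda.Array -> host
assert np.array_equal(a, b)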
def numpy3d_to_array(np_array):
    '''Copy a 3D (d,h,w) numpy array into a 3D pycuda array that can be used
    to set a texture. (For some reason, gpuarrays can't be used to do that
    directly). A transpose happens implicitly; the CUDA array has dim (w,h,d).'''
    import pycuda.autoinit

    d, h, w = np_array.shape

    descr = driver.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    if np_array.dtype == np.float64:
        descr.format = driver.array_format.SIGNED_INT32
        descr.num_channels = 2
    else:
        descr.format = driver.dtype_to_array_format(np_array.dtype)
        descr.num_channels = 1
    descr.flags = 0

    device_array = driver.Array(descr)

    copy = driver.Memcpy3D()
    copy.set_src_host(np_array)
    copy.set_dst_array(device_array)
    copy.width_in_bytes = copy.src_pitch = np_array.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d
    copy()

    return device_array
def numpy3d_to_array(np_array, allow_surface_bind=True):
    """Converts a 3D numpy array to a 3D device array."""
    # numpy3d_to_array
    # taken from the pycuda mailing list (stripped down for C ordering only)
    d, h, w = np_array.shape

    descr = cuda.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = cuda.dtype_to_array_format(np_array.dtype)
    descr.num_channels = 1
    descr.flags = 0
    if allow_surface_bind:
        descr.flags = cuda.array3d_flags.SURFACE_LDST

    device_array = cuda.Array(descr)

    copy = cuda.Memcpy3D()
    copy.set_src_host(np_array)
    copy.set_dst_array(device_array)
    copy.width_in_bytes = copy.src_pitch = np_array.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d
    copy()

    return device_array
def copy3D_device_to_numpy(dst, src, type_sz, width, height, depth):
    copy = cuda.Memcpy3D()
    copy.set_src_device(src)
    copy.set_dst_host(dst)
    copy.height = height
    copy.depth = depth
    copy.dst_pitch = copy.src_pitch = copy.width_in_bytes = width * type_sz
    copy()
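# Hedged usage sketch for copy3D_device_to_numpy, assuming the source is a
# contiguous linear device allocation holding a C-ordered (depth, height, width) block.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda

depth, height, width = 4, 8, 16
host_src = np.random.rand(depth, height, width).astype(np.float32)
dev_src = cuda.to_device(host_src)          # linear device memory

host_dst = np.empty_like(host_src)
copy3D_device_to_numpy(host_dst, dev_src, host_src.dtype.itemsize,
                       width, height, depth)
assert np.allclose(host_dst, host_src)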
def test_3d_texture(self):
    # adapted from code by Nicolas Pinto
    w = 2
    h = 4
    d = 8
    shape = (w, h, d)

    a = numpy.asarray(numpy.random.randn(*shape), dtype=numpy.float32, order="F")

    descr = drv.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = drv.dtype_to_array_format(a.dtype)
    descr.num_channels = 1
    descr.flags = 0

    ary = drv.Array(descr)

    copy = drv.Memcpy3D()
    copy.set_src_host(a)
    copy.set_dst_array(ary)
    copy.width_in_bytes = copy.src_pitch = a.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d
    copy()

    mod = SourceModule("""
    texture<float, 3, cudaReadModeElementType> mtx_tex;

    __global__ void copy_texture(float *dest)
    {
        int x = threadIdx.x;
        int y = threadIdx.y;
        int z = threadIdx.z;
        int dx = blockDim.x;
        int dy = blockDim.y;
        int i = (z*dy + y)*dx + x;

        dest[i] = tex3D(mtx_tex, x, y, z);
        //dest[i] = x;
    }
    """)

    copy_texture = mod.get_function("copy_texture")
    mtx_tex = mod.get_texref("mtx_tex")
    mtx_tex.set_array(ary)

    dest = numpy.zeros(shape, dtype=numpy.float32, order="F")
    copy_texture(drv.Out(dest), block=shape, texrefs=[mtx_tex])

    assert la.norm(dest - a) == 0
def malloc_gpu_arrays(nx, ny, nz, cex, cey, cez):
    print('rank= %d, (%d, %d, %d)' % (rank, nx, ny, nz), end=' ')
    total_bytes = nx * ny * nz * 4 * 9
    if total_bytes // (1024**3) == 0:
        print('%d MB' % (total_bytes // (1024**2)))
    else:
        print('%1.2f GB' % (float(total_bytes) / (1024**3)))

    if nz % Dx != 0:
        print("Error: nz is not multiple of %d" % (Dx))
        sys.exit()
    if (nx * ny) % Dy != 0:
        print("Error: nx*ny is not multiple of %d" % (Dy))
        sys.exit()

    f = np.zeros((nx, ny, nz), 'f')

    ex_gpu = cuda.to_device(f)
    ey_gpu = cuda.to_device(f)
    ez_gpu = cuda.to_device(f)
    hx_gpu = cuda.to_device(f)
    hy_gpu = cuda.to_device(f)
    hz_gpu = cuda.to_device(f)

    descr = cuda.ArrayDescriptor3D()
    descr.width = nz
    descr.height = ny
    descr.depth = nx
    descr.format = cuda.dtype_to_array_format(f.dtype)
    descr.num_channels = 1
    descr.flags = 0
    tcex_gpu = cuda.Array(descr)
    tcey_gpu = cuda.Array(descr)
    tcez_gpu = cuda.Array(descr)

    mcopy = cuda.Memcpy3D()
    mcopy.width_in_bytes = mcopy.src_pitch = f.strides[1]
    mcopy.src_height = mcopy.height = ny
    mcopy.depth = nx

    mcopy.set_src_host(cex)
    mcopy.set_dst_array(tcex_gpu)
    mcopy()
    mcopy.set_src_host(cey)
    mcopy.set_dst_array(tcey_gpu)
    mcopy()
    mcopy.set_src_host(cez)
    mcopy.set_dst_array(tcez_gpu)
    mcopy()

    eh_fields = [ex_gpu, ey_gpu, ez_gpu, hx_gpu, hy_gpu, hz_gpu]
    tex_fields = [tcex_gpu, tcey_gpu, tcez_gpu]

    return eh_fields, tex_fields
def texCubeToGPU(texCube):
    descr = cuda.ArrayDescriptor3D()
    descr.width = texCube.shape[2]
    descr.height = texCube.shape[1]
    descr.depth = 6
    descr.format = cuda.dtype_to_array_format(texCube.dtype)
    descr.num_channels = 4
    descr.flags = cuda.array3d_flags.CUBEMAP

    texCubeArray = cuda.Array(descr)

    copy = cuda.Memcpy3D()
    copy.set_src_host(texCube)
    copy.set_dst_array(texCubeArray)
    copy.width_in_bytes = copy.src_pitch = texCube.strides[1]  # d*h*w*c
    copy.src_height = copy.height = texCube.shape[1]
    copy.depth = 6
    copy()

    return texCubeArray
def _copy_halo_to_array(self, halo, array, copy_dims, copy_offsets, dtype=np.float64):
    # copy from 2-d halo to 3-d array
    #
    # Parameters:
    #   halo, array: gpuarrays involved in the copy
    #   copy_dims: number of elements to copy in (z, y, x) directions
    #   copy_offsets: offsets at the destination in (z, y, x) directions
    nz, ny, nx = self.local_dims
    sw = self.stencil_width
    d, h, w = copy_dims
    z_offs, y_offs, x_offs = copy_offsets
    typesize = array.dtype.itemsize

    copier = cuda.Memcpy3D()
    copier.set_src_device(halo.gpudata)
    copier.set_dst_device(array.gpudata)

    # this time, offsets are at the destination:
    copier.dst_x_in_bytes = x_offs * typesize
    copier.dst_y = y_offs
    copier.dst_z = z_offs

    copier.src_pitch = halo.strides[1]
    copier.dst_pitch = array.strides[1]
    copier.src_height = h
    copier.dst_height = ny + 2 * sw

    copier.width_in_bytes = w * typesize
    copier.height = h
    copier.depth = d

    # perform the copy:
    copier()
def np3DtoCudaArray(npArray, allowSurfaceBind=False):
    #import pycuda.autoinit
    d, h, w = npArray.shape

    descr3D = cuda.ArrayDescriptor3D()
    descr3D.width = w
    descr3D.height = h
    descr3D.depth = d
    descr3D.format = cuda.dtype_to_array_format(npArray.dtype)
    descr3D.num_channels = 1
    descr3D.flags = 0
    if allowSurfaceBind:
        descr3D.flags = cuda.array3d_flags.SURFACE_LDST

    cudaArray = cuda.Array(descr3D)

    copy3D = cuda.Memcpy3D()
    copy3D.set_src_host(npArray)
    copy3D.set_dst_array(cudaArray)
    copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[1]
    copy3D.src_height = copy3D.height = h
    copy3D.depth = d
    copy3D()
    return cudaArray, copy3D
def get_Memcpy3D_d2d(src, dst, src_pitch, dst_pitch, dim_args, itemsize,
                     src_height, dst_height):
    '''
    Wrapper for the pycuda.driver.Memcpy3D() function (same args).
    Returns a callable object which copies the arrays on invocation of ().

    dim_args: list, [depth, height, width] (not width_in_bytes)
    '''
    depth, height, width = dim_args
    width_in_bytes = width * itemsize

    src_ptr = getattr(src, 'gpudata', 0)  # set to NULL if no valid ptr
    dst_ptr = getattr(dst, 'gpudata', 0)  # set to NULL if no valid ptr

    cpy = drv.Memcpy3D()
    cpy.set_src_device(src_ptr)
    cpy.set_dst_device(dst_ptr)
    cpy.height = np.int64(height)
    cpy.width_in_bytes = np.int64(width_in_bytes)
    cpy.depth = np.int64(depth)
    cpy.src_pitch = src_pitch
    cpy.dst_pitch = dst_pitch
    cpy.src_height = np.int64(src_height)
    cpy.dst_height = np.int64(dst_height)
    return cpy
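# Hedged usage sketch for get_Memcpy3D_d2d: a full device-to-device copy between
# two contiguous gpuarrays of the same shape (argument values are illustrative).
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray

depth, height, width = 4, 8, 16
src = gpuarray.to_gpu(np.random.rand(depth, height, width).astype(np.float32))
dst = gpuarray.zeros_like(src)

cpy = get_Memcpy3D_d2d(src, dst,
                       src_pitch=src.strides[1], dst_pitch=dst.strides[1],
                       dim_args=[depth, height, width],
                       itemsize=src.dtype.itemsize,
                       src_height=height, dst_height=height)
cpy()  # perform the copy
assert np.allclose(dst.get(), src.get())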
def _prepare_F_texture(self):
    descr = drv.ArrayDescriptor3D()
    descr.width = self.side
    descr.height = self.side
    descr.depth = self.side
    descr.format = drv.dtype_to_array_format(self.F_gpu.dtype)
    descr.num_channels = 1
    descr.flags = 0

    F_array = drv.Array(descr)

    copy = drv.Memcpy3D()
    copy.set_src_device(self.F_gpu.gpudata)
    copy.set_dst_array(F_array)
    copy.width_in_bytes = copy.src_pitch = self.F_gpu.strides[1]
    copy.src_height = copy.height = self.side
    copy.depth = self.side

    self.F_gpu_to_array_copy = copy
    self.F_gpu_to_array_copy()
    self.F_texture.set_array(F_array)
def gpu3D(src):
    """Copy a 3D host array (shape interpreted as (w, h, d)) into a CUDA 3D array."""
    w, h, d = src.shape

    descr = cuda.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = cuda.dtype_to_array_format(src.dtype)
    descr.num_channels = 1
    descr.flags = 0

    dst = cuda.Array(descr)

    copy = cuda.Memcpy3D()
    copy.set_src_host(src)
    copy.set_dst_array(dst)
    copy.width_in_bytes = copy.src_pitch = src.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d
    copy()
    return dst
def _copy_array_to_halo(self, array, halo, copy_dims, copy_offsets, dtype=np.float64):
    # copy from 3-d array to 2-d halo
    #
    # Parameters:
    #   array, halo: gpuarrays involved in the copy.
    #   copy_dims: number of elements to copy in (z, y, x) directions
    #   copy_offsets: offsets at the source in (z, y, x) directions
    nz, ny, nx = self.local_dims
    d, h, w = copy_dims
    z_offs, y_offs, x_offs = copy_offsets
    typesize = array.dtype.itemsize

    copier = cuda.Memcpy3D()
    copier.set_src_device(array.gpudata)
    copier.set_dst_device(halo.gpudata)

    copier.src_x_in_bytes = x_offs * typesize
    copier.src_y = y_offs
    copier.src_z = z_offs

    copier.src_pitch = array.strides[1]
    copier.dst_pitch = halo.strides[1]
    copier.src_height = ny
    copier.dst_height = h

    copier.width_in_bytes = w * typesize
    copier.height = h
    copier.depth = d

    # perform the copy:
    copier()
def alloc_coeff_arrays(s):
    f = np.zeros((s.nx, s.ny, s.nz), 'f')

    s.cex = np.ones_like(f) * 0.5
    s.cex[:, -1, :] = 0
    s.cex[:, :, -1] = 0
    s.cey = np.ones_like(f) * 0.5
    s.cey[:, :, -1] = 0
    s.cey[-1, :, :] = 0
    s.cez = np.ones_like(f) * 0.5
    s.cez[-1, :, :] = 0
    s.cez[:, -1, :] = 0

    descr = cuda.ArrayDescriptor3D()
    descr.width = s.nz
    descr.height = s.ny
    descr.depth = s.nx
    descr.format = cuda.dtype_to_array_format(f.dtype)
    descr.num_channels = 1
    descr.flags = 0
    s.tcex_gpu = cuda.Array(descr)
    s.tcey_gpu = cuda.Array(descr)
    s.tcez_gpu = cuda.Array(descr)

    mcpy = cuda.Memcpy3D()
    mcpy.width_in_bytes = mcpy.src_pitch = f.strides[1]
    mcpy.src_height = mcpy.height = s.ny
    mcpy.depth = s.nx

    mcpy.set_src_host(s.cex)
    mcpy.set_dst_array(s.tcex_gpu)
    mcpy()
    mcpy.set_src_host(s.cey)
    mcpy.set_dst_array(s.tcey_gpu)
    mcpy()
    mcpy.set_src_host(s.cez)
    mcpy.set_dst_array(s.tcez_gpu)
    mcpy()
descr = cuda.ArrayDescriptor3D()
descr.width = nx
descr.height = ny
descr.depth = nz
descr.format = cuda.dtype_to_array_format(f.dtype)
descr.num_channels = 1
descr.flags = 0
cex_gpu = cuda.Array(descr)
cey_gpu = cuda.Array(descr)
cez_gpu = cuda.Array(descr)
chx_gpu = cuda.Array(descr)
chy_gpu = cuda.Array(descr)
chz_gpu = cuda.Array(descr)

mcopy = cuda.Memcpy3D()
mcopy.width_in_bytes = mcopy.src_pitch = f.strides[1]
mcopy.src_height = mcopy.height = ny
mcopy.depth = nz

arrcopy(mcopy, set_c(f, (None, -1, -1)), cex_gpu)
arrcopy(mcopy, set_c(f, (-1, None, -1)), cey_gpu)
arrcopy(mcopy, set_c(f, (-1, -1, None)), cez_gpu)
arrcopy(mcopy, set_c(f, (None, 0, 0)), chx_gpu)
arrcopy(mcopy, set_c(f, (0, None, 0)), chy_gpu)
arrcopy(mcopy, set_c(f, (0, 0, None)), chz_gpu)

# prepare kernels
from pycuda.compiler import SourceModule
mod = SourceModule(kernels)
update_e = mod.get_function("update_e")
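# The helpers arrcopy() and set_c() are not shown in the fragment above; what
# follows is only a hedged guess at their behaviour, consistent with the
# alloc_coeff_arrays example earlier: reuse one preconfigured Memcpy3D object
# for several host -> cuda.Array copies, and build a 0.5-valued coefficient
# array zeroed on the boundary slices selected by pt.
def arrcopy(mcopy, src, dst):
    mcopy.set_src_host(src)
    mcopy.set_dst_array(dst)
    mcopy()

def set_c(f, pt):
    f[:, :, :] = 0.5
    if pt[0] is not None:
        f[pt[0], :, :] = 0
    if pt[1] is not None:
        f[:, pt[1], :] = 0
    if pt[2] is not None:
        f[:, :, pt[2]] = 0
    return f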
def __init__(self, stream, nx, ny, nz, x_halo, y_halo, z_halo,
             cpu_data=None, dtype=np.float32):
    self.logger = logging.getLogger(__name__)
    self.nx = nx
    self.ny = ny
    self.nz = nz
    self.x_halo = x_halo
    self.y_halo = y_halo
    self.z_halo = z_halo

    nx_halo = nx + 2 * x_halo
    ny_halo = ny + 2 * y_halo
    nz_halo = nz + 2 * z_halo

    #self.logger.debug("Allocating [%dx%dx%d] buffer", self.nx, self.ny, self.nz)
    #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
    self.data = pycuda.gpuarray.zeros((nz_halo, ny_halo, nx_halo), dtype)

    #For returning to download
    self.memorypool = PageLockedMemoryPool()

    #If we don't have any data, just allocate and return
    if cpu_data is None:
        return

    #Make sure data is in proper format
    assert cpu_data.shape == (nz_halo, ny_halo, nx_halo) or cpu_data.shape == (
        self.nz, self.ny, self.nx), "Wrong shape of data %s vs %s / %s" % (
            str(cpu_data.shape), str((self.nz, self.ny, self.nx)),
            str((nz_halo, ny_halo, nx_halo)))
    assert cpu_data.itemsize == 4, "Wrong size of data type"
    assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"

    #Create copy object from host to device
    copy = cuda.Memcpy3D()
    copy.set_src_host(cpu_data)
    copy.set_dst_device(self.data.gpudata)

    #Set offsets of destination
    x_offset = (nx_halo - cpu_data.shape[2]) // 2
    y_offset = (ny_halo - cpu_data.shape[1]) // 2
    z_offset = (nz_halo - cpu_data.shape[0]) // 2
    copy.dst_x_in_bytes = x_offset * self.data.strides[1]
    copy.dst_y = y_offset
    copy.dst_z = z_offset

    #Set pitch of destination
    copy.dst_pitch = self.data.strides[0]

    #Set width in bytes to copy for each row and
    #number of rows to copy
    width = max(self.nx, cpu_data.shape[2])
    height = max(self.ny, cpu_data.shape[1])
    depth = max(self.nz, cpu_data.shape[0])
    copy.width_in_bytes = width * cpu_data.itemsize
    copy.height = height
    copy.depth = depth

    #Perform the copy
    copy(stream)
        if async:
            drv.memcpy_dtoh_async(dst, src.gpudata, stream=stream)
        else:
            drv.memcpy_dtoh(dst, src.gpudata)
    else:
        src = _as_strided(src, shape=(src.size,), strides=(src.dtype.itemsize,))
        if async:
            drv.memcpy_htod_async(dst.gpudata, src, stream=stream)
        else:
            drv.memcpy_htod(dst.gpudata, src)
    return

if len(shape) == 2:
    copy = drv.Memcpy2D()
elif len(shape) == 3:
    copy = drv.Memcpy3D()
else:
    raise ValueError("more than 2 discontiguous axes not supported %s"
                     % (tuple(sorted(axes)),))

if isinstance(src, GPUArray):
    copy.set_src_device(src.gpudata)
else:
    copy.set_src_host(src)

if isinstance(dst, GPUArray):
    copy.set_dst_device(dst.gpudata)
else:
    copy.set_dst_host(dst)

copy.width_in_bytes = src.dtype.itemsize*shape[0]
def register_multiple_images_subpix_cuda(stack, template): import pycuda.autoinit import pycuda.gpuarray as gpuarray import pycuda.driver as drv import pycuda.cumath as cumath import skcuda.fft as cu_fft import skcuda.linalg as lin import skcuda.cublas as cub from numpy import pi, newaxis, floor import cmath from pycuda.elementwise import ElementwiseKernel from pycuda.compiler import SourceModule from numpy import conj, abs, arctan2, sqrt, real, imag, shape, zeros, trunc, ceil, floor, fix from numpy.fft import fftshift, ifftshift fft2, ifft2 = fftn, ifftn = fast_ffts.get_ffts(nthreads=1, use_numpy_fft=False) mod = SourceModule(""" #include <pycuda-complex.hpp>" __global__ void load_convert(unsigned short *a, float *b,int f, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; int offset = f * imlen; if (idx <imlen) { b[idx] = (float)a[offset+idx]; } } __global__ void convert_export(float *a, unsigned short *b,int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { b[idx] = (unsigned short)(a[idx]>0 ? a[idx] : 0) ; } } __global__ void multiply_comp_float(pycuda::complex<float> *x, pycuda::complex<float> *y, pycuda::complex<float> *z, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { z[idx] = x[idx] * y[idx]; } } __global__ void calc_conj(pycuda::complex<float> *x, pycuda::complex<float> *y, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { y[idx]._M_re = x[idx]._M_re; y[idx]._M_im = -x[idx]._M_im; } } __global__ void convert_multiply(float *x, pycuda::complex<float> *y, float sx, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { y[idx]._M_re = 0; y[idx]._M_im = x[idx] * sx; } } __global__ void transfer_array(pycuda::complex<float> *x, pycuda::complex<float> *y, int imlenl, int imlen, int nlargeh, int nh) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; int offset = imlenl*3/4; if (idx<imlen) { int target_ind = (offset+(idx/nh)*nlargeh + (idx % nh))%imlenl; x[target_ind] = y[idx]; } } __global__ void calc_shiftmatrix(float *x, float *y, pycuda::complex<float> *z, float sx, float sy,float dg, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { z[idx]._M_re = 0; z[idx]._M_im = x[idx] * sx + y[idx] * sy + dg; } } __global__ void sub_float(float *x, float *y, float sv, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { x[idx] = y[idx]-sv; } } """) load_convert_kernel = mod.get_function('load_convert') convert_export_kernel = mod.get_function('convert_export') convert_multiply_kernel = mod.get_function('convert_multiply') multiply_float_kernel = mod.get_function('multiply_comp_float') transfer_array_kernel = mod.get_function('transfer_array') calc_shiftmatrix_kernel = mod.get_function('calc_shiftmatrix') conj_kernel = mod.get_function('calc_conj') sub_float_kernel = mod.get_function('sub_float') Z = stack.shape[0] M = stack.shape[1] N = stack.shape[2] max_memsize = 4200000000 imlen = M * N half_imlen = M * (N // 2 + 1) grid_dim = (64, int(imlen / (512 * 64)) + 1, 1) block_dim = (512, 1, 1) #512 threads per block stack_bin = int(max_memsize / (M * N * stack.itemsize)) stack_ite = int(Z / stack_bin) + 1 usfac = 
100 ## needs to be bigger than 10 if not template.shape == stack.shape[1:]: raise ValueError("Images must have same shape.") if np.any(np.isnan(template)): template = template.copy() template[template != template] = 0 if np.any(np.isnan(stack)): stack = stack.copy() stack[stack != stack] = 0 mlarge = M * 2 nlarge = N * 2 t = time.time() plan_forward = cu_fft.Plan((M, N), np.float32, np.complex64) plan_inverse = cu_fft.Plan((M, N), np.complex64, np.float32) plan_inverse_big = cu_fft.Plan((mlarge, nlarge), np.complex64, np.float32) cub_h = cub.cublasCreate() template_gpu = gpuarray.to_gpu(template.astype('float32')) source_gpu = gpuarray.empty((M, N), np.float32) ifft_gpu = gpuarray.empty((M, N), np.float32) result_gpu = gpuarray.empty((M, N), np.uint16) templatef_gpu = gpuarray.empty((M, N // 2 + 1), np.complex64) sourcef_gpu = gpuarray.empty((M, N // 2 + 1), np.complex64) prod_gpu1 = gpuarray.empty((M, N // 2 + 1), np.complex64) prod_gpu2 = gpuarray.empty((M, N // 2 + 1), np.complex64) shiftmatrix = gpuarray.empty((M, N // 2 + 1), np.complex64) cu_fft.fft(template_gpu, templatef_gpu, plan_forward, scale=True) templatef_gpu = templatef_gpu.conj() move_list = np.zeros((Z, 2)) largearray1_gpu = gpuarray.zeros((mlarge, nlarge // 2 + 1), np.complex64) largearray2_gpu = gpuarray.empty((mlarge, nlarge), np.float32) imlenl = mlarge * (nlarge // 2 + 1) zoom_factor = 1.5 dftshiftG = trunc(ceil(usfac * zoom_factor) / 2) #% Center of output array at dftshift+1 upsample_dim = int(ceil(usfac * zoom_factor)) term1c = (ifftshift(np.arange(N, dtype='float') - floor(N / 2)). T[:, newaxis]) / N # fftfreq # output points term2c = ((np.arange(upsample_dim, dtype='float')) / usfac)[newaxis, :] term1r = (np.arange(upsample_dim, dtype='float').T)[:, newaxis] term2r = (ifftshift(np.arange(M, dtype='float')) - floor(M / 2))[newaxis, :] # fftfreq term1c_gpu = gpuarray.to_gpu(term1c[:int(floor(N / 2) + 1), :].astype('float32')) term2c_gpu = gpuarray.to_gpu(term2c.astype('float32')) term1r_gpu = gpuarray.to_gpu(term1r.astype('float32')) term2r_gpu = gpuarray.to_gpu(term2r.astype('float32')) term2c_gpu_ori = gpuarray.to_gpu(term2c.astype('float32')) term1r_gpu_ori = gpuarray.to_gpu(term1r.astype('float32')) kernc_gpu = gpuarray.zeros((N // 2 + 1, upsample_dim), np.float32) kernr_gpu = gpuarray.zeros((upsample_dim, M), np.float32) kernc_gpuc = gpuarray.zeros((N // 2 + 1, upsample_dim), np.complex64) kernr_gpuc = gpuarray.zeros((upsample_dim, M), np.complex64) Nr = np.fft.ifftshift(np.linspace(-np.fix(M / 2), np.ceil(M / 2) - 1, M)) Nc = np.fft.ifftshift(np.linspace(-np.fix(N / 2), np.ceil(N / 2) - 1, N)) [Nc, Nr] = np.meshgrid(Nc, Nr) Nc_gpu = gpuarray.to_gpu((Nc[:, :N // 2 + 1] / N).astype('float32')) Nr_gpu = gpuarray.to_gpu((Nr[:, :N // 2 + 1] / M).astype('float32')) upsampled1 = gpuarray.empty((upsample_dim, N // 2 + 1), np.complex64) upsampled2 = gpuarray.empty((upsample_dim, upsample_dim), np.complex64) source_stack = gpuarray.empty((stack_bin, M, N), dtype=stack.dtype) copy = drv.Memcpy3D() copy.set_src_host(stack.data) copy.set_dst_device(source_stack.gpudata) copy.width_in_bytes = copy.src_pitch = stack.strides[1] copy.src_height = copy.height = M for zb in range(stack_ite): zrange = np.arange(zb * stack_bin, min((stack_bin * (zb + 1)), Z)) copy.depth = len(zrange) copy.src_z = int(zrange[0]) copy() for i in range(len(zrange)): t = zb * stack_bin + i load_convert_kernel(source_stack, source_gpu.gpudata, np.int32(i), np.int32(imlen), block=block_dim, grid=grid_dim) cu_fft.fft(source_gpu, sourcef_gpu, 
plan_forward, scale=True) multiply_float_kernel(sourcef_gpu, templatef_gpu, prod_gpu1, np.int32(half_imlen), block=block_dim, grid=grid_dim) transfer_array_kernel(largearray1_gpu, prod_gpu1, np.int32(imlenl), np.int32(half_imlen), np.int32(nlarge // 2 + 1), np.int32(N // 2 + 1), block=block_dim, grid=grid_dim) cu_fft.ifft(largearray1_gpu, largearray2_gpu, plan_inverse_big, scale=True) peakind = cub.cublasIsamax(cub_h, largearray2_gpu.size, largearray2_gpu.gpudata, 1) rloc, cloc = np.unravel_index(peakind, largearray2_gpu.shape) md2 = trunc(mlarge / 2) nd2 = trunc(nlarge / 2) if rloc > md2: row_shift2 = rloc - mlarge else: row_shift2 = rloc if cloc > nd2: col_shift2 = cloc - nlarge else: col_shift2 = cloc row_shiftG = row_shift2 / 2. col_shiftG = col_shift2 / 2. # Initial shift estimate in upsampled grid row_shiftG0 = round(row_shiftG * usfac) / usfac col_shiftG0 = round(col_shiftG * usfac) / usfac # Matrix multiply DFT around the current shift estimate roffG = dftshiftG - row_shiftG0 * usfac coffG = dftshiftG - col_shiftG0 * usfac sub_float_kernel(term2c_gpu, term2c_gpu_ori, np.float32(coffG / usfac), np.int32(term2c_gpu.size), block=block_dim, grid=grid_dim) sub_float_kernel(term1r_gpu, term1r_gpu_ori, np.float32(roffG), np.int32(term1r_gpu.size), block=block_dim, grid=grid_dim) lin.dot(term1c_gpu, term2c_gpu, handle=cub_h, out=kernc_gpu) lin.dot(term1r_gpu, term2r_gpu, handle=cub_h, out=kernr_gpu) convert_multiply_kernel(kernc_gpu, kernc_gpuc, np.float32(-2 * pi), np.int32(kernc_gpu.size), block=block_dim, grid=grid_dim) convert_multiply_kernel(kernr_gpu, kernr_gpuc, np.float32(-2 * pi / (M * usfac)), np.int32(kernr_gpu.size), block=block_dim, grid=grid_dim) cumath.exp(kernc_gpuc, out=kernc_gpuc) cumath.exp(kernr_gpuc, out=kernr_gpuc) conj_kernel(prod_gpu1, prod_gpu2, np.int32(half_imlen), block=block_dim, grid=grid_dim) lin.dot(kernr_gpuc, prod_gpu2, handle=cub_h, out=upsampled1) lin.dot(upsampled1, kernc_gpuc, handle=cub_h, out=upsampled2) CCG = conj(upsampled2.get()) / (md2 * nd2 * usfac**2) rlocG, clocG = np.unravel_index(abs(CCG).argmax(), CCG.shape) CCGmax = CCG[rlocG, clocG] rlocG = rlocG - dftshiftG #+ 1 # +1 # questionable/failed hack + 1; clocG = clocG - dftshiftG #+ 1 # -1 # questionable/failed hack - 1; row_shiftG = row_shiftG0 + rlocG / usfac col_shiftG = col_shiftG0 + clocG / usfac diffphaseG = arctan2(imag(CCGmax), real(CCGmax)) # Compute registered version of source stack calc_shiftmatrix_kernel(Nr_gpu, Nc_gpu, shiftmatrix, np.float32(row_shiftG * 2 * np.pi), np.float32(col_shiftG * 2 * np.pi), np.float32(diffphaseG), np.int32(half_imlen), block=block_dim, grid=grid_dim) cumath.exp(shiftmatrix, out=shiftmatrix) multiply_float_kernel(sourcef_gpu, shiftmatrix, prod_gpu1, np.int32(half_imlen), block=block_dim, grid=grid_dim) cu_fft.ifft(prod_gpu1, ifft_gpu, plan_inverse) convert_export_kernel(ifft_gpu, result_gpu, np.int32(imlen), block=block_dim, grid=grid_dim) move_list[t, :] = (row_shiftG, col_shiftG) stack[t, :, :] = result_gpu.get() cub.cublasDestroy(cub_h) return (stack, move_list)
def copy_non_contiguous(dst, src):
    """Copy ``src`` array to ``dst`` array.

    A gpu-array may have a non-contiguous block of memory, i.e. it may have
    substantial pitches/strides. However, a cpu-array must have a contiguous
    block of memory. All four directions are allowed.
    """
    assert src.dtype == dst.dtype, \
        "src ({}) and dst ({}) must have the same datatype.".format(str(src.dtype), str(dst.dtype))
    assert dst.shape == src.shape, \
        "Shapes do not match: " + str(dst.shape) + " <-> " + str(src.shape)

    itemsize = np.dtype(src.dtype).itemsize
    copy = cuda.Memcpy2D()
    src_on_gpu = isinstance(src, pycuda.gpuarray.GPUArray)
    dst_on_gpu = isinstance(dst, pycuda.gpuarray.GPUArray)
    if src_on_gpu:
        copy.set_src_device(src.gpudata)
    else:
        copy.set_src_host(src)
    if dst_on_gpu:
        copy.set_dst_device(dst.gpudata)
    else:
        copy.set_dst_host(dst)

    if len(src.shape) == 1:
        copy.src_pitch = src.strides[0] if src_on_gpu else itemsize
        copy.dst_pitch = dst.strides[0] if dst_on_gpu else itemsize
        copy.width_in_bytes = itemsize
        copy.height = src.shape[0]
        copy(aligned=False)

    elif len(src.shape) == 2:
        if (itemsize != src.strides[1] if src_on_gpu else False) or \
           (itemsize != dst.strides[1] if dst_on_gpu else False):
            # arrays have to be copied column by column, because there are two
            # substantial pitches/strides, which is not supported by cuda.
            copy.src_pitch = src.strides[0] if src_on_gpu else itemsize
            copy.dst_pitch = dst.strides[0] if dst_on_gpu else itemsize
            copy.width_in_bytes = itemsize
            copy.height = src.shape[0]

            for col in range(src.shape[1]):
                copy.src_x_in_bytes = col * src.strides[1] if src_on_gpu else col * itemsize
                copy.dst_x_in_bytes = col * dst.strides[1] if dst_on_gpu else col * itemsize
                copy(aligned=False)
        else:
            # both arrays have a contiguous block of memory for each row
            copy.src_pitch = src.strides[0] if src_on_gpu else itemsize * src.shape[1]
            copy.dst_pitch = dst.strides[0] if dst_on_gpu else itemsize * src.shape[1]
            copy.width_in_bytes = itemsize * src.shape[1]
            copy.height = src.shape[0]
            copy(aligned=False)

    elif len(src.shape) == 3:
        if (src.strides[0] != src.shape[1] * src.strides[1] if src_on_gpu else False) or \
           (dst.strides[0] != dst.shape[1] * dst.strides[1] if dst_on_gpu else False):
            # arrays have to be copied plane by plane, because there is a substantial
            # pitch/stride for the z-axis which is not supported by cuda.
            for plane in range(src.shape[0]):
                copy_non_contiguous(dst[plane, :, :], src[plane, :, :])
            return

        copy = cuda.Memcpy3D()
        if src_on_gpu:
            copy.set_src_device(src.gpudata)
        else:
            copy.set_src_host(src)
        if dst_on_gpu:
            copy.set_dst_device(dst.gpudata)
        else:
            copy.set_dst_host(dst)

        copy.src_pitch = src.strides[1] if src_on_gpu else itemsize * src.shape[2]
        copy.dst_pitch = dst.strides[1] if dst_on_gpu else itemsize * src.shape[2]
        copy.width_in_bytes = itemsize * src.shape[2]
        copy.height = copy.src_height = copy.dst_height = src.shape[1]
        copy.depth = src.shape[0]
        copy()
    else:
        raise RuntimeError("dimension %d is not supported." % len(src.shape))
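# Hedged usage sketch for copy_non_contiguous, exercising the 3-D Memcpy3D
# branch with a contiguous device array copied back to a host array.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray

a_host = np.arange(4 * 6 * 8, dtype=np.float32).reshape(4, 6, 8)
a_gpu = gpuarray.to_gpu(a_host)

out = np.empty_like(a_host)
copy_non_contiguous(out, a_gpu)     # device -> host
assert np.array_equal(out, a_host)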
def np3DtoCudaArray(npArray, prec, order="C", allowSurfaceBind=False):
    ''' Some parameters like stride are explained in PyCUDA:
        driver.py test_driver.py gpuarray.py '''
    # For 1D-2D CUDA arrays the descriptor is the same, just putting LAYERED flags
    # if order != "C": raise LogicError("Just implemented for C order")
    dimension = len(npArray.shape)
    case = order in ["C", "F"]
    if not case:
        raise LogicError("order must be either F or C")

    # if dimension == 1:
    #     w = npArray.shape[0]
    #     h, d = 0, 0
    if dimension == 2:
        if order == "C":
            stride = 0
        if order == "F":
            stride = -1
        h, w = npArray.shape
        d = 1
        if allowSurfaceBind:
            descrArr = cuda.ArrayDescriptor3D()
            descrArr.width = w
            descrArr.height = h
            descrArr.depth = d
        else:
            descrArr = cuda.ArrayDescriptor()
            descrArr.width = w
            descrArr.height = h
            # descrArr.depth = d
    elif dimension == 3:
        if order == "C":
            stride = 1
        if order == "F":
            stride = 1
        d, h, w = npArray.shape
        descrArr = cuda.ArrayDescriptor3D()
        descrArr.width = w
        descrArr.height = h
        descrArr.depth = d
    else:
        raise LogicError("CUDArray dimensions 2 and 3 supported at the moment ...")

    if prec == 'float':
        descrArr.format = cuda.dtype_to_array_format(npArray.dtype)
        descrArr.num_channels = 1
    elif prec == 'cfloat':  # Hack for complex64 = (float32, float32) == (re, im)
        descrArr.format = cuda.array_format.SIGNED_INT32  # Reading data as int2 (hi=re, lo=im) structure
        descrArr.num_channels = 2
    elif prec == 'double':  # Hack for doubles
        descrArr.format = cuda.array_format.SIGNED_INT32  # Reading data as int2 (hi, lo) structure
        descrArr.num_channels = 2
    elif prec == 'cdouble':  # Hack for complex doubles
        descrArr.format = cuda.array_format.SIGNED_INT32  # Reading data as int4 (re=(hi,lo), im=(hi,lo)) structure
        descrArr.num_channels = 4
    else:
        descrArr.format = cuda.dtype_to_array_format(npArray.dtype)
        descrArr.num_channels = 1

    if allowSurfaceBind:
        if dimension == 2:
            descrArr.flags |= cuda.array3d_flags.ARRAY3D_LAYERED
        descrArr.flags |= cuda.array3d_flags.SURFACE_LDST

    cudaArray = cuda.Array(descrArr)
    if allowSurfaceBind or dimension == 3:
        copy3D = cuda.Memcpy3D()
        copy3D.set_src_host(npArray)
        copy3D.set_dst_array(cudaArray)
        copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[stride]
        # if dimension == 3: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[1]  # Just C order support
        # if dimension == 2: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[0]  # Just C order support
        copy3D.src_height = copy3D.height = h
        copy3D.depth = d
        copy3D()
        return cudaArray, copy3D
    else:
        # if dimension == 3:
        #     copy3D = cuda.Memcpy3D()
        #     copy3D.set_src_host(npArray)
        #     copy3D.set_dst_array(cudaArray)
        #     copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[stride]
        #     copy3D.src_height = copy3D.height = h
        #     copy3D.depth = d
        #     copy3D()
        #     return cudaArray, copy3D
        # if dimension == 2:
        cudaArray = cuda.Array(descrArr)
        copy2D = cuda.Memcpy2D()
        copy2D.set_src_host(npArray)
        copy2D.set_dst_array(cudaArray)
        copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[stride]
        # copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[0]  # Just C order support
        copy2D.src_height = copy2D.height = h
        copy2D(aligned=True)
        return cudaArray, copy2D