def __init__(self, m=None, n=None, k=None): super(BMM, self).__init__() self.m = m self.n = n self.k = k with open("kernels/bmm_kernel.cu", 'r') as f: ### self.kernel = f.read() self.kernel = (self.kernel.replace("_M_", str(m) if m else "M").replace( "_N_", str(n) if n else "N").replace( "_K_", str(k) if k else "K")) self._fn_tt = cp.RawKernel(code=self.kernel, name="bmm_tt", backend='nvcc', options=('--maxrregcount=128', '--use_fast_math')) self._fn_nn = cp.RawKernel(code=self.kernel, name="bmm_nn", backend='nvcc', options=('--maxrregcount=128', '--use_fast_math')) self._fn_tn = cp.RawKernel(code=self.kernel, name="bmm_tn", backend='nvcc', options=('--maxrregcount=128', '--use_fast_math')) self._fn_nt = cp.RawKernel(code=self.kernel, name="bmm_nt", backend='nvcc', options=('--maxrregcount=128', '--use_fast_math'))
def mexThSpkPC(Params, dataRAW, wPCA, iC): code, constants = get_cuda('mexThSpkPC') Nthreads = constants.Nthreads maxFR = constants.maxFR NT, Nchan, NchanNear, nt0, nt0min, spkTh, NrankPC = Params NT = int(NT) Nchan = int(Nchan) # Input GPU arrays. d_Params = cp.asarray(Params, dtype=np.float64, order='F') d_data = cp.asarray(dataRAW, dtype=np.float32, order='F') d_W = cp.asarray(wPCA, dtype=np.float32, order='F') d_iC = cp.asarray(iC, dtype=np.int32, order='F') # New GPU arrays. d_dout = cp.zeros((Nchan, NT), dtype=np.float32, order='F') d_dmax = cp.zeros((Nchan, NT), dtype=np.float32, order='F') d_st = cp.zeros(maxFR, dtype=np.int32, order='F') d_id = cp.zeros(maxFR, dtype=np.int32, order='F') d_counter = cp.zeros(1, dtype=np.int32, order='F') # filter the data with the temporal templates Conv1D = cp.RawKernel(code, 'Conv1D') Conv1D((Nchan, ), (Nthreads, ), (d_Params, d_data, d_W, d_dout)) # get the max of the data max1D = cp.RawKernel(code, 'max1D') max1D((Nchan, ), (Nthreads, ), (d_Params, d_dout, d_dmax)) # take max across nearby channels maxChannels = cp.RawKernel(code, 'maxChannels') maxChannels((int(NT // Nthreads), ), (Nthreads, ), (d_Params, d_dout, d_dmax, d_iC, d_st, d_id, d_counter)) # move d_x to the CPU minSize = 1 minSize = min(maxFR, int(d_counter[0])) d_featPC = cp.zeros((NrankPC * NchanNear, minSize), dtype=np.float32, order='F') d_id2 = cp.zeros(minSize, dtype=np.int32, order='F') if (minSize > 0): computeProjections = cp.RawKernel(code, 'computeProjections') computeProjections((minSize, ), (NchanNear, NrankPC), (d_Params, d_data, d_iC, d_st, d_id, d_W, d_featPC)) # TODO: check that the copy occurs on the GPU only d_id2[:] = d_id[:minSize] # Free memory. # TODO: unclear - does this do something special for cupy objects? # - all of these go out of scope in the next line anyway del d_st, d_id, d_counter, d_Params, d_dmax, d_dout # free_gpu_memory() return d_featPC, d_id2
def __enter__(self): """Return self at start of a with-block.""" # Call the __enter__ methods for any composed operators. # Allocate special memory objects. self.scatter_kernel = cp.RawKernel(_cu_source, "scatter") self.gather_kernel = cp.RawKernel(_cu_source, "gather") return self
def _bspline_prefilter(volume): code = f''' #include "helper_math.h" #include "bspline.h" ''' incl_path = str((Path(__file__).parent / 'kernels').absolute()) prefilter_x = cp.RawKernel(code=code, name='SamplesToCoefficients3DX', options=('-I', incl_path)) prefilter_y = cp.RawKernel(code=code, name='SamplesToCoefficients3DY', options=('-I', incl_path)) prefilter_z = cp.RawKernel(code=code, name='SamplesToCoefficients3DZ', options=('-I', incl_path)) slice_stride = volume.strides[1] dim_grid, dim_block = utils.compute_prefilter_workgroup_dims(volume.shape) dims = cp.asarray(volume.shape[::-1], dtype=cp.int32) prefilter_x(dim_grid[0], dim_block[0], (volume, slice_stride, dims)) prefilter_y(dim_grid[1], dim_block[1], (volume, slice_stride, dims)) prefilter_z(dim_grid[2], dim_block[2], (volume, slice_stride, dims)) return volume
def mexDistances2(Params, Ws, W, iMatch, iC, Wh, mus, mu): code, _ = get_cuda('mexDistances2') Nspikes = int(Params[0]) Nfilters = int(Params[2]) d_Params = cp.asarray(Params, dtype=np.float64, order='F') d_Ws = cp.asarray(Ws, dtype=np.float32, order='F') d_W = cp.asarray(W, dtype=np.float32, order='F') d_iMatch = cp.asarray(iMatch, dtype=np.bool, order='F') d_iC = cp.asarray(iC, dtype=np.int32, order='F') d_Wh = cp.asarray(Wh, dtype=np.int32, order='F') d_mu = cp.asarray(mu, dtype=np.float32, order='F') d_mus = cp.asarray(mus, dtype=np.float32, order='F') d_cmax = cp.zeros(Nspikes * Nfilters, dtype=np.float32, order='F') d_id = cp.zeros(Nspikes, dtype=np.int32, order='F') d_x = cp.zeros(Nspikes, dtype=np.float32, order='F') # get list of cmaxes for each combination of neuron and filter computeCost = cp.RawKernel(code, 'computeCost') computeCost( (Nfilters, ), (1024, ), (d_Params, d_Ws, d_mus, d_W, d_mu, d_iMatch, d_iC, d_Wh, d_cmax)) # loop through cmax to find best template bestFilter = cp.RawKernel(code, 'bestFilter') bestFilter((40, ), (256, ), (d_Params, d_iMatch, d_Wh, d_cmax, d_mus, d_id, d_x)) del d_Params, d_cmax return d_id, d_x
def Q_inner_product_cupy(Q, A, start_indices, window_size): num_time_points, num_lms = Q.shape num_extrinsic_samples, _ = A.shape assert not cupy.isfortran(Q) assert not cupy.isfortran(A) out = cupy.empty( (num_extrinsic_samples, window_size), dtype=cupy.complex128, order="C", ) global _cuda_code if _cuda_code is None: # it's assumed that cuda_Q_inner_product.cu is placed in the same folder as this code path = os.path.join(os.path.dirname(__file__), 'cuda_Q_inner_product.cu') # alternative to deal with packaging in another directory if not (os.path.isfile(path)): path = os.path.join( os.path.split(os.path.dirname(__file__))[0], 'cuda_Q_inner_product.cu') with open(path, 'r') as f: _cuda_code = f.read() Q_prod_fn = cupy.RawKernel(_cuda_code, "Q_inner") else: Q_prod_fn = cupy.RawKernel(_cuda_code, "Q_inner") float_prec = 16 num_threads_x = 4 num_threads_y = 1024 // 4 block_size = num_threads_x, num_threads_y, 0 grid_size = ( (num_extrinsic_samples + num_threads_x - 1) // num_threads_x, 0, 0, ) args = ( Q, A, start_indices, window_size, num_time_points, num_extrinsic_samples, num_lms, out, ) Q_prod_fn( grid_size, block_size, args, shared_mem=cupy.int32(num_threads_x * num_lms * float_prec), ) return out
def mexSVDsmall2(Params, dWU, W, iC, iW, Ka, Kb): code, constants = get_cuda('mexSVDsmall2') Nthreads = constants.Nthreads Nfilt = int(Params[1]) nt0 = int(Params[4]) Nrank = int(Params[6]) Nchan = int(Params[9]) d_Params = cp.asarray(Params, dtype=np.float64, order='F') d_dWU = cp.asarray(dWU, dtype=np.float64, order='F') d_iC = cp.asarray(iC, dtype=np.int32, order='F') d_iW = cp.asarray(iW, dtype=np.int32, order='F') d_A = cp.asarray(Ka, dtype=np.float64, order='F') d_B = cp.asarray(Kb, dtype=np.float64, order='F') d_U = cp.zeros((Nchan, Nfilt, Nrank), dtype=np.float64, order='F') d_mu = cp.zeros(Nfilt, dtype=np.float64, order='F') d_W = cp.asarray(W, dtype=np.float64, order='F') d_wtw = cp.zeros((nt0, nt0, Nfilt), dtype=np.float64, order='F') d_dWUb = cp.zeros((nt0, Nchan, Nfilt), dtype=np.float64, order='F') tpS = (nt0, int(Nthreads // nt0)) tpK = (Nrank, int(Nthreads // Nrank)) blankdWU = cp.RawKernel(code, 'blankdWU') blankdWU((Nfilt, ), tpS, (d_Params, d_dWU, d_iC, d_iW, d_dWUb)) # compute dWU * dWU' getwtw = cp.RawKernel(code, 'getwtw') getwtw((Nfilt, ), tpS, (d_Params, d_dWUb, d_wtw)) # get W by power svd iterations getW = cp.RawKernel(code, 'getW') getW((Nfilt, ), (nt0, ), (d_Params, d_wtw, d_W)) # compute U by W' * dWU getU = cp.RawKernel(code, 'getU') getU((Nfilt, ), tpK, (d_Params, d_dWUb, d_W, d_U)) # normalize U, get S, get mu, renormalize W reNormalize = cp.RawKernel(code, 'reNormalize') reNormalize((Nfilt, ), (nt0, ), (d_Params, d_A, d_B, d_W, d_U, d_mu)) del d_wtw, d_Params, d_dWUb return d_W, d_U, d_mu
def GPU_Bilateral(src, sigmaS=3.0, sigmaR=0.02, sigma=0): half_kernel_size = round(sigmaS * 2) blksize = (32, 8) fast = False snn = int(sigma > 0) # whether to use SNN sampling strategy if src.format.id != vs.GRAYS: raise vs.Error("Bilateral: Only 32-bit float grayscale is supported!") w, h = src.width, src.height # source code of CUDA kernel with open( os.path.join(os.path.dirname(Path(__file__).resolve()), 'bilateral.cu'), 'r') as f: kernel_source_code = f.read() kernel_source_code = Template(kernel_source_code) kernel_source_code = kernel_source_code.substitute( width=w, height=h, sigma_s=-0.5 / (sigmaS**2), sigma_r=-0.5 / (sigmaR**2), sigma=sigma, snn=snn, half_kernel_size=half_kernel_size) if fast: kernel = cp.RawKernel(kernel_source_code, 'bilateral', options=('--use_fast_math', )) else: kernel = cp.RawKernel(kernel_source_code, 'bilateral') # create NumPy function def bilateral_core(h_img, kernel): # h_img must be a 2-D image d_img = cp.asarray(h_img) d_out = cp.empty_like(d_img) kernel(((w + blksize[0] - 1) // blksize[0], (h + blksize[1] - 1) // blksize[1]), blksize, (d_img, d_out)) h_out = cp.asnumpy(d_out) return h_out # process return mufnp.numpy_process(src, bilateral_core, kernel=kernel)
def __init__(self, m=None, n=None, k=None, write_float8=True, share_mask=False): super(MBMM, self).__init__() assert type(write_float8) == bool assert type(share_mask) == bool self.m = m self.n = n self.k = k self.write_float8 = write_float8 self.share_mask = share_mask with open("kernels/mbmm_kernel.cu", 'r') as f: ### self.kernel = f.read() self.kernel = (self.kernel.replace( "_M_", str(m) if m else "M").replace( "_N_", str(n) if n else "N").replace( "_K_", str(k) if k else "K").replace( "__WRITE_FLOAT8__", "true" if write_float8 else "false").replace( "__MASK_BATCH__", "0" if share_mask else "bid")) self._fn_tt = cp.RawKernel(code=self.kernel, name="mbmm_tt", backend='nvcc', options=('--maxrregcount=128', '--use_fast_math')) self._fn_nn = cp.RawKernel(code=self.kernel, name="mbmm_nn", backend='nvcc', options=('--maxrregcount=128', '--use_fast_math')) self._fn_tn = cp.RawKernel(code=self.kernel, name="mbmm_tn", backend='nvcc', options=('--maxrregcount=128', '--use_fast_math')) self._fn_nt = cp.RawKernel(code=self.kernel, name="mbmm_nt", backend='nvcc', options=('--maxrregcount=128', '--use_fast_math'))
def __init__( self, ta=1, tpb=256, sm_size=48 * 256 * 4, ): super(GetDivOfAddressV2CUDA, self).__init__() self.ta = ta # how many clusters each thread is responsible of self.tpb = tpb self.sm_size = sm_size assert ta * tpb * 8 <= sm_size with open(get_absolute_path("kernels", "GetDivOfAddressV2Kernel.cu"), "r") as f: self.kernel = f.read() kernel = (self.kernel.replace("_TA_", str(ta)).replace("_TPB_", str(tpb))) self.fn = cp.RawKernel( kernel, 'get_div_of_address', backend='nvcc', # options=('--maxrregcount=255',), ) self.fn.max_dynamic_shared_size_bytes = ta * tpb * 8
def nn_gpu(ref, query): import cupy with open(cu_file) as f: kernel = cupy.RawKernel(f.read(), "cuComputeDistanceGlobal") ref_nb, ref_dim = ref.shape query_nb, query_dim = query.shape assert ref_dim == query_dim dim = ref_dim ref = ref.transpose(1, 0) query = query.transpose(1, 0) ref = cupy.ascontiguousarray(ref) query = cupy.ascontiguousarray(query) dist = cupy.empty((ref_nb, query_nb), dtype=cupy.float32) BLOCK_DIM = 16 grid = ( int(math.ceil(query_nb / BLOCK_DIM)), int(math.ceil(ref_nb / BLOCK_DIM)), 1, ) block = (16, 16, 1) args = (ref, ref_nb, query, query_nb, dim, dist) shared_mem = BLOCK_DIM * BLOCK_DIM + BLOCK_DIM * BLOCK_DIM + 5 kernel(grid, block, args=args, shared_mem=shared_mem) indices = cupy.argmin(dist, axis=0) return indices
def Overlap_GPU(image, frames, coord_x, coord_y, k_r): n_angles = coord_x.shape[1] n_rays = coord_x.shape[0] img_x = image.shape[0] img_y = image.shape[1] nthreads = 128 nblocks = ((n_rays * n_angles) / nthreads) + 1 import cupy as cp image = image.astype(cp.complex64) frames = frames.astype(cp.complex64) coord_x = coord_x.astype(cp.int32) coord_y = coord_y.astype(cp.int32) cp.RawKernel(convolve_raw_kernel, "Convolve") \ ((int(nblocks),), (int(nthreads),), \ (image, frames, coord_x, coord_y, k_r, n_angles, n_rays, img_x, img_y)) return image
def __init__( self, de=16, dk=16, sm_size=48 * 256 * 4, ): super(ComputeCentroidsCUDA, self).__init__() self.de = de self.dk = dk assert dk * (de + 1) * 4 <= sm_size self.tpb = 256 self.sm_size = sm_size with open( get_absolute_path("kmeans", "kernels", "ComputeCentroidsKernel.cu"), "r") as f: self.kernel = f.read() kernel = (self.kernel.replace("_DE_", str(de)).replace( "_DK_", str(dk)).replace("_TPB_", str(self.tpb)).replace( "_NITERS_", str(math.ceil(dk / self.tpb)))) self.fn = cp.RawKernel( kernel, 'compute_centroids', # options=('--maxrregcount=255',), # backend='nvcc', ) self.fn.max_dynamic_shared_size_bytes = sm_size
def setUp(self): if hasattr(self, 'clean_up'): if cupy.cuda.runtime.is_hip: # Clearing memo triggers recompiling kernels using name # expressions in other tests, e.g. dot and matmul, which # hits a nvrtc bug. See #5843, #5945 and #6725. self.skipTest('Clearing memo hits a nvrtc bug in other tests') _util.clear_memo() self.dev = cupy.cuda.runtime.getDevice() assert self.dev != 1 if not hasattr(self, 'jitify'): self.jitify = False if cupy.cuda.runtime.is_hip and self.jitify: self.skipTest('Jitify does not support ROCm/HIP') self.temporary_cache_dir_context = use_temporary_cache_dir() self.in_memory_context = compile_in_memory(self.in_memory) self.cache_dir = self.temporary_cache_dir_context.__enter__() self.in_memory_context.__enter__() self.kern = cupy.RawKernel(_test_source1, 'test_sum', backend=self.backend, jitify=self.jitify) self.mod2 = cupy.RawModule(code=_test_source2, backend=self.backend, jitify=self.jitify) self.mod3 = cupy.RawModule(code=_test_source3, options=('-DPRECISION=2', ), backend=self.backend, jitify=self.jitify)
def bool_spike_to_float(spike_b: torch.Tensor, s_dtype: torch.dtype, s_shape: torch.Size, s_padding: int = 0): device_id = spike_b.get_device() spike = torch.zeros(spike_b.numel() * 8, device=spike_b.device, dtype=s_dtype) if s_dtype == torch.float: kernel_codes = DataTypeConvertCUDACode.bool2float kernel_name = 'bool2float' elif s_dtype == torch.half: kernel_codes = DataTypeConvertCUDACode.bool2half kernel_name = 'bool2half' else: raise NotImplementedError with cuda_utils.DeviceEnvironment(device_id): numel = spike_b.numel() blocks = cuda_utils.cal_blocks(numel) numel = cupy.asarray(numel) spike_b, spike, numel = cuda_utils.get_contiguous(spike_b, spike, numel) kernel_args = [spike_b, spike, numel] kernel = cupy.RawKernel( kernel_codes, kernel_name, options=configure.cuda_compiler_options, backend=configure.cuda_compiler_backend ) kernel( (blocks,), (configure.cuda_threads,), cuda_utils.wrap_args_to_raw_kernel( device_id, *kernel_args ) ) if s_padding != 0 and s_padding != 8: spike = spike[0: spike.numel() - s_padding] return spike.reshape(s_shape)
def setUp(self): if hasattr(self, 'clean_up'): _util.clear_memo() self.dev = cupy.cuda.runtime.getDevice() assert self.dev != 1 if not hasattr(self, 'jitify'): self.jitify = False if cupy.cuda.runtime.is_hip and self.jitify: self.skipTest('Jitify does not support ROCm/HIP') self.temporary_cache_dir_context = use_temporary_cache_dir() self.in_memory_context = compile_in_memory(self.in_memory) self.cache_dir = self.temporary_cache_dir_context.__enter__() self.in_memory_context.__enter__() self.kern = cupy.RawKernel( _test_source1, 'test_sum', backend=self.backend, jitify=self.jitify) self.mod2 = cupy.RawModule( code=_test_source2, backend=self.backend, jitify=self.jitify) self.mod3 = cupy.RawModule( code=_test_source3, options=('-DPRECISION=2',), backend=self.backend, jitify=self.jitify)
def sgemm(A, B, dim_x=16, dim_y=16, blk_m=64, blk_n=64, blk_k=4, dim_xa=64, dim_ya=4, dim_xb=4, dim_yb=64): assert A.dtype == cp.float32 assert B.dtype == cp.float32 assert(dim_x * dim_y == dim_xa * dim_ya == dim_xb * dim_yb) m, k = A.shape k, n = B.shape # Inputs matrices need to be in Fortran order. A = cp.asfortranarray(A) B = cp.asfortranarray(B) C = cp.empty((m, n), dtype=cp.float32, order='F') config = {'DIM_X': dim_x, 'DIM_Y': dim_y, 'BLK_M': blk_m, 'BLK_N': blk_n, 'BLK_K': blk_k, 'DIM_XA': dim_xa, 'DIM_YA': dim_ya, 'DIM_XB': dim_xb, 'DIM_YB': dim_yb, 'THR_M': blk_m // dim_x, 'THR_N': blk_n // dim_y} code = read_code(sgemm_file, params=config) kern = cp.RawKernel(code, 'sgemm') grid = (int(math.ceil(m / blk_m)), int(math.ceil(n / blk_n)), 1) block = (dim_x, dim_y, 1) args = (m, n, k, A, B, C) shared_mem = blk_k * (blk_m + 1) * 4 + blk_n * (blk_k + 1) * 4 kern(grid, block, args=args, shared_mem=shared_mem) return C
def setup_cupy(): global cupy global cupy_stream global square_diff_kernel global mix_channels_kernel global gray_scale_kernel import cupy as cupy cupy_stream = cupy.cuda.Stream() square_diff_kernel = cupy.ElementwiseKernel('T x, T y', 'T z', 'z = x*x - y*y', 'square_diff') mix_channels_kernel = cupy.ElementwiseKernel('uint8 x, uint8 y', 'uint8 z', 'z = (i % 3) ? x : y', 'mix_channels') gray_scale_kernel = cupy.RawKernel( r''' extern "C" __global__ void gray_scale(float *output, const unsigned char *input, long long height, long long width) { int tidx = blockIdx.x * blockDim.x + threadIdx.x; int tidy = blockIdx.y * blockDim.y + threadIdx.y; if (tidx < width && tidy < height) { float r = input[tidy * width + tidx] / 255.; float g = input[tidy * width + tidx + 1] / 255.; float b = input[tidy * width + tidx + 2] / 255.; output[tidy * width + tidx] = 0.299 * r + 0.59 * g + 0.11 * b; } } ''', 'gray_scale')
def mexWtW2(Params, W1, W2, UtU): code, constants = get_cuda('mexWtW2') nblock = constants.nblock Nfilt = int(Params[1]) nt0 = int(Params[9]) d_Params = cp.asarray(Params, dtype=np.float64, order='F') d_W1 = cp.asarray(W1, dtype=np.float32, order='F') d_W2 = cp.asarray(W2, dtype=np.float32, order='F') d_UtU = cp.asarray(UtU, dtype=np.float32, order='F') d_WtW = cp.zeros((Nfilt, Nfilt, 2 * nt0 - 1), dtype=np.float32, order='F') grid = (1 + int(Nfilt // nblock), 1 + int(Nfilt // nblock)) block = (nblock, nblock) crossFilter = cp.RawKernel(code, 'crossFilter') crossFilter(grid, block, (d_Params, d_W1, d_W2, d_UtU, d_WtW)) del d_Params, d_W1, d_W2, d_UtU return d_WtW
def _generate_kernel(parallactic_angles, feed_type): dtype = parallactic_angles.dtype # Block sizes if dtype == np.float32: block = (1024, 1, 1) elif dtype == np.float64: block = (512, 1, 1) else: raise TypeError("Unhandled type %s" % dtype) # Create template render = jinja_env.get_template(_TEMPLATE_PATH).render name = "feed_rotation" code = render(kernel_name=name, feed_type=feed_type, sincos_fn=cuda_function('sincos', dtype), pa_type=_get_typename(dtype), out_type=_get_typename(dtype)) code = code.encode('utf-8') # Complex output type out_dtype = np.result_type(dtype, np.complex64) return cp.RawKernel(code, name), block, out_dtype
def get_raw_spline1d_kernel(axis, ndim, mode, order, index_type="int", data_type="double", pole_type="double", block_size=128): """Generate a kernel for applying a spline prefilter along a given axis.""" poles = get_poles(order) # determine number of samples for the boundary approximation # (SciPy uses n_boundary = n_samples but this is excessive) largest_pole = max([abs(p) for p in poles]) # tol < 1e-7 fails test cases comparing to SciPy at atol = rtol = 1e-5 tol = 1e-10 if pole_type == "float" else 1e-18 n_boundary = math.ceil(math.log(tol, largest_pole)) # headers and general utility function for extracting rows of data code = _FILTER_GENERAL.format(index_type=index_type, data_type=data_type, pole_type=pole_type) # generate source for a 1d function for a given boundary mode and poles code += _get_spline1d_code(mode, poles, n_boundary) # generate code handling batch operation of the 1d filter code += _batch_spline1d_strided_template.format(ndim=ndim, axis=axis, block_size=block_size) return cupy.RawKernel(code, "cupyx_spline_filter")
def _generate_kernel(lm, uvw, frequency): # Floating point output type out_dtype = np.result_type(lm, uvw, frequency) # Block sizes blockdimx = 32 if frequency.dtype == np.float32 else 16 blockdimy = 32 if uvw.dtype == np.float32 else 16 block = (blockdimx, blockdimy, 1) # Create template render = jinja_env.get_template(_TEMPLATE_PATH).render name = "phase_delay" code = render(kernel_name=name, lm_type=_get_typename(lm.dtype), uvw_type=_get_typename(uvw.dtype), freq_type=_get_typename(frequency.dtype), out_type=_get_typename(out_dtype), sqrt_fn=cuda_function('sqrt', lm.dtype), sincos_fn=cuda_function('sincos', out_dtype), minus_two_pi_over_c=minus_two_pi_over_c, blockdimx=blockdimx, blockdimy=blockdimy).encode('utf-8') # Complex output type out_dtype = np.result_type(out_dtype, np.complex64) return cp.RawKernel(code, name), block, out_dtype
def get_raw_spline1d_kernel(axis, ndim, mode, order, index_type='int', data_type='double', pole_type='double', block_size=128): """Generate a kernel for applying a spline prefilter along a given axis.""" poles = get_poles(order) # determine number of samples for the boundary approximation # (SciPy uses n_boundary = n_samples but this is excessive) largest_pole = max([abs(p) for p in poles]) # tol < 1e-7 fails test cases comparing to SciPy at atol = rtol = 1e-5 tol = 1e-10 if pole_type == 'float' else 1e-18 n_boundary = math.ceil(math.log(tol, largest_pole)) # headers and general utility function for extracting rows of data code = _FILTER_GENERAL.format(index_type=index_type, data_type=data_type, pole_type=pole_type) # generate source for a 1d function for a given boundary mode and poles code += _get_spline1d_code(mode, poles, n_boundary) # generate code handling batch operation of the 1d filter mode_str = mode.replace('-', '_') # cannot have '-' in kernel name kernel_name = (f'cupyx_scipy_ndimage_spline_filter_{ndim}d_ord{order}_' f'axis{axis}_{mode_str}') code += _batch_spline1d_strided_template.format(ndim=ndim, axis=axis, block_size=block_size, kernel_name=kernel_name) return cupy.RawKernel(code, kernel_name)
def _generate_kernel(inputs, input_schema, output_schema): mapping, in_shape, out_shape, out_dtype = stokes_convert_setup( inputs, input_schema, output_schema) # Flatten input and output shapes # Check that number elements are the same in_elems = reduce(mul, in_shape, 1) out_elems = reduce(mul, out_shape, 1) if in_elems != out_elems: raise ValueError("Number of input_schema elements %s " "and output schema elements %s " "must match for CUDA kernel." % (in_shape, out_shape)) # Infer the output data type if out_dtype == "real": if np.iscomplexobj(inputs): out_dtype = inputs.real.dtype else: out_dtype = inputs.dtype elif out_dtype == "complex": if np.iscomplexobj(inputs): out_dtype = inputs.dtype else: out_dtype = np.result_type(inputs.dtype, np.complex64) else: raise ValueError("Invalid setup dtype %s" % out_dtype) cuda_out_dtype = cuda_type(out_dtype) assign_exprs = [] # Render the assignment expression for each element for (c1, c1i), (c2, c2i), outi, template_fn in mapping: # Flattened indices flat_outi = np.ravel_multi_index(outi, out_shape) render = jinja_env.from_string(template_fn).render kwargs = { c1: "in[%d]" % np.ravel_multi_index(c1i, in_shape), c2: "in[%d]" % np.ravel_multi_index(c2i, in_shape), "out_type": cuda_out_dtype } expr_str = render(**kwargs) assign_exprs.append("out[%d] = %s;" % (flat_outi, expr_str)) # Now render the main template render = jinja_env.get_template(_TEMPLATE_PATH).render name = "stokes_convert" code = render(kernel_name=name, input_type=cuda_type(inputs.dtype), output_type=cuda_type(out_dtype), assign_exprs=assign_exprs, elements=in_elems).encode("utf-8") # cuda block, flatten non-schema dims into a single source dim blockdimx = 512 block = (blockdimx, 1, 1) return (cp.RawKernel(code, name), block, in_shape, out_shape, out_dtype)
def mexClustering2(Params, uproj, W, mu, call, iMatch, iC): code, _ = get_cuda('mexClustering2') Nspikes = int(Params[0]) NrankPC = int(Params[1]) Nfilters = int(Params[2]) NchanNear = int(Params[6]) Nchan = int(Params[7]) d_Params = cp.asarray(Params, dtype=np.float64, order='F') d_uproj = cp.asarray(uproj, dtype=np.float32, order='F') d_W = cp.asarray(W, dtype=np.float32, order='F') d_mu = cp.asarray(mu, dtype=np.float32, order='F') d_call = cp.asarray(call, dtype=np.int32, order='F') d_iC = cp.asarray(iC, dtype=np.int32, order='F') d_iMatch = cp.asarray(iMatch, dtype=np.bool, order='F') d_dWU = cp.zeros((NrankPC * Nchan, Nfilters), dtype=np.float32, order='F') d_cmax = cp.zeros((Nspikes, Nfilters), dtype=np.float32, order='F') d_id = cp.zeros(Nspikes, dtype=np.int32, order='F') d_x = cp.zeros(Nspikes, dtype=np.float32, order='F') d_nsp = cp.zeros(Nfilters, dtype=np.int32, order='F') d_V = cp.zeros(Nfilters, dtype=np.float32, order='F') # get list of cmaxes for each combination of neuron and filter computeCost = cp.RawKernel(code, 'computeCost') computeCost((Nfilters, ), (1024, ), (d_Params, d_uproj, d_mu, d_W, d_iMatch, d_iC, d_call, d_cmax)) # loop through cmax to find best template bestFilter = cp.RawKernel(code, 'bestFilter') bestFilter((40, ), (256, ), (d_Params, d_iMatch, d_iC, d_call, d_cmax, d_id, d_x)) # average all spikes for same template -- ORIGINAL average_snips_v2 = cp.RawKernel(code, 'average_snips_v2') average_snips_v2((Nfilters, ), (NrankPC, NchanNear), (d_Params, d_iC, d_call, d_id, d_uproj, d_cmax, d_dWU)) count_spikes = cp.RawKernel(code, 'count_spikes') count_spikes((7, ), (256, ), (d_Params, d_id, d_nsp, d_x, d_V)) del d_Params, d_V return d_dWU, d_id, d_x, d_nsp, d_cmax
def __getitem__(self, key): try: return super().__getitem__(key) except KeyError: # lazy load kernel definition, will NOT compile until called kernel = cp.RawKernel(self._source, key) super().__setitem__(key, kernel) return kernel
def load_cupy_func(fname, name, **kwargs): try: fname = str((Path(__file__).parent / fname).resolve()) except: pass with open(fname) as f: code = f.read() macros = ['#define {!s} {!s}'.format(k, v) for k,v in kwargs.items()] code = '\n'.join(macros + [code]) return add_checks(cp.RawKernel(code, name))
def test_dynamical_parallelism(self): ker = cupy.RawKernel(_test_source4, 'test_kernel', options=('-dc',), backend=self.backend, jitify=self.jitify) N = 169 inner_chunk = 13 x = cupy.zeros((N,), dtype=cupy.float32) ker((1,), (N//inner_chunk,), (x, N, inner_chunk)) assert (x == 1.0).all()
def conv_forward_pass(image, filt, bias): # print("CUDA cODE") (n_f, n_c_f, f, _) = filt.shape # filter dimensions n_c, in_dim, _ = image.shape # image dimensions convfilter_kernel = cp.RawKernel(r''' extern "C" __global__ void my_conv(const float* img, const float * filt, const float * bias, float * out, const int depth, const int img_dim, const int f, const int out_dim ) { const unsigned int j = blockDim.x*blockIdx.x + threadIdx.x; const unsigned int i = blockDim.y*blockIdx.y + threadIdx.y; const unsigned int pp = blockIdx.z; const unsigned int dp = depth; //printf("%d,%d ::: ",i,j); if(i<img_dim && j<img_dim){ unsigned int oPixelPos = i*out_dim + j+ pp*out_dim*out_dim; out[oPixelPos] = 0; for(int h=0;h<dp;h++){ for(int k=0;k<f;k++){ for(int l=0;l<f;l++){ unsigned int iPixelPos = ((i+k)*img_dim + (j+l)) + img_dim*img_dim*h; unsigned int filtPos = (k*f+l) + f*f*h + f*f*dp*pp; out[oPixelPos] += img[iPixelPos] * filt[filtPos] + bias[pp]; } } } } } ''', 'my_conv') out_dim = int(in_dim - f)+1 # calculate output dimensions img_gpu = cp.asarray(image, dtype=cp.float32) #28X28 img_dim_gpu = cp.int32(in_dim) #28 filt_size_gpu = cp.int32(f) # 5 out_dim_gpu = cp.int32(out_dim) #24 depth_gpu = cp.int32(n_c) threads_per_block = f num_blocks = (in_dim//f) +1 filt_gpu = cp.asarray(filt, dtype=cp.float32) #5X5 # print(filt) # filt_gpu = cp.zeros((n_f, n_c_f, f, f), dtype=cp.float32) #5X5 filt_gpu = cp.asarray(filt_gpu.flatten(), dtype=cp.float32) # print(filt_gpu) # out_gpu = cp.zeros((out_dim,out_dim,n_f), dtype=cp.float32) # 24 X 24 out_gpu = cp.zeros((n_f,out_dim,out_dim), dtype=cp.float32) # 24 X 24 out_gpu = out_gpu.flatten() bias_gpu = cp.asarray(bias, dtype=cp.float32) #5X5 bias_gpu = bias_gpu.flatten() convfilter_kernel((num_blocks,num_blocks,8), (threads_per_block,threads_per_block), (img_gpu, filt_gpu, bias_gpu, out_gpu, depth_gpu, img_dim_gpu, filt_size_gpu, out_dim_gpu)) # # convfilter_kernel((576,), (24,5), (img_gpu, filt_gpu, out_gpu, depth_gpu, img_dim_gpu, filt_size_gpu, out_dim_gpu)) out_gpu = cp.reshape(out_gpu,(n_f,out_dim,out_dim)) output = cp.asnumpy(out_gpu) return output
def _get_transform_kernel(interpolation: str = 'linear'): if interpolation not in AVAILABLE_INTERPOLATIONS: raise ValueError(f'Interpolation must be one of {interpolation}') code = f''' #include "helper_math.h" #include "bspline.h" #include "helper_interpolation.h" extern "C" {{ inline __device__ int get_z_idx(const int i, const uint4* const dims) {{ return i / (dims[0].y * dims[0].z); }} inline __device__ int get_y_idx(const int i, const uint4* const dims) {{ return (i % (dims[0].y * dims[0].z)) / dims[0].z; }} inline __device__ int get_x_idx(const int i, const uint4* const dims) {{ return (i % (dims[0].y * dims[0].z)) % dims[0].z; }} __global__ void transform(float* const volume, cudaTextureObject_t texture, const float4* const xform, const uint4* const dims) {{ unsigned tid = threadIdx.x; unsigned total_threads = gridDim.x*blockDim.x; unsigned cta_start = blockDim.x*blockIdx.x; unsigned i; unsigned n = dims[0].x * dims[0].y * dims[0].z; for (i = cta_start + tid; i < n; i += total_threads) {{ int z = get_x_idx(i, dims); int y = get_y_idx(i, dims); int x = get_z_idx(i, dims); float4 voxf = make_float4((float)x, (float)y, (float)z, 1.0f); float3 ndx; ndx.z = dot(voxf, xform[0]) + .5f; ndx.y = dot(voxf, xform[1]) + .5f; ndx.x = dot(voxf, xform[2]) + .5f; if (ndx.x < 0 || ndx.y < 0 || ndx.z < 0 || ndx.x >= dims[0].z || ndx.y >= dims[0].y || ndx.z >= dims[0].x) {{ continue; }} volume[i] = {_INTERPOLATIONS[interpolation]}(texture, ndx); }} }} }} ''' incl_path = str((Path(__file__).parent / 'kernels').absolute()) kernel = cp.RawKernel(code=code, name='transform', options=('-I', incl_path)) return kernel