def setup_cupy(): global cupy global cupy_stream global square_diff_kernel global mix_channels_kernel global gray_scale_kernel import cupy as cupy cupy_stream = cupy.cuda.Stream() square_diff_kernel = cupy.ElementwiseKernel('T x, T y', 'T z', 'z = x*x - y*y', 'square_diff') mix_channels_kernel = cupy.ElementwiseKernel('uint8 x, uint8 y', 'uint8 z', 'z = (i % 3) ? x : y', 'mix_channels') gray_scale_kernel = cupy.RawKernel( r''' extern "C" __global__ void gray_scale(float *output, const unsigned char *input, long long height, long long width) { int tidx = blockIdx.x * blockDim.x + threadIdx.x; int tidy = blockIdx.y * blockDim.y + threadIdx.y; if (tidx < width && tidy < height) { float r = input[tidy * width + tidx] / 255.; float g = input[tidy * width + tidx + 1] / 255.; float b = input[tidy * width + tidx + 2] / 255.; output[tidy * width + tidx] = 0.299 * r + 0.59 * g + 0.11 * b; } } ''', 'gray_scale')
def sum_duplicates(self): """Eliminate duplicate matrix entries by adding them together. .. seealso:: :func:`scipy.sparse.coo_matrix.sum_duplicates` """ if self._has_canonical_format: return if self.data.size == 0: self._has_canonical_format = True return keys = cupy.stack([self.row, self.col]) order = cupy.lexsort(keys) src_data = self.data[order] src_row = self.row[order] src_col = self.col[order] diff = cupy.ElementwiseKernel( 'raw int32 row, raw int32 col', 'int32 diff', ''' int index; if (i == 0 || row[i - 1] == row[i] && col[i - 1] == col[i]) { diff = 0; } else { diff = 1; } ''', 'sum_duplicates_diff' )(src_row, src_col, size=self.row.size) if diff[1:].all(): # All elements have different indices. data = src_data row = src_row col = src_col else: index = cupy.cumsum(diff, dtype='i') size = int(index[-1]) + 1 data = cupy.zeros(size, dtype=self.data.dtype) row = cupy.empty(size, dtype='i') col = cupy.empty(size, dtype='i') cupy.ElementwiseKernel( 'T src_data, int32 src_row, int32 src_col, int32 index', 'raw T data, raw int32 row, raw int32 col', ''' atomicAdd(&data[index], src_data); row[index] = src_row; col[index] = src_col; ''', 'sum_duplicates_assign', preamble=util._preamble_atomic_add )(src_data, src_row, src_col, index, data, row, col) self.data = data self.row = row self.col = col self._has_canonical_format = True
def fit(X, n_clusters, max_iter, use_custom_kernel): assert X.ndim == 2 xp = cupy.get_array_module(X) pred = xp.zeros(len(X), dtype=np.int32) initial_indexes = np.random.choice(len(X), n_clusters, replace=False).astype(np.int32) centers = X[initial_indexes] data_num = X.shape[0] data_dim = X.shape[1] for _ in six.moves.range(max_iter): # calculate distances and label if not use_custom_kernel or xp == np: distances = xp.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2) else: distances = xp.zeros((data_num, n_clusters), dtype=np.float32) cupy.ElementwiseKernel( 'S data, raw S centers, int32 n_clusters, int32 dim', 'raw S dist', ''' for (int j = 0; j < n_clusters; j++){ int cent_ind[] = {j, i % dim}; int dist_ind[] = {i / dim, j}; double diff = centers[cent_ind] - data; atomicAdd(&dist[dist_ind], diff * diff); } ''', 'calc_distances')(X, centers, n_clusters, data_dim, distances) new_pred = xp.argmin(distances, axis=1).astype(np.int32) if xp.all(new_pred == pred): break pred = new_pred # calculate centers if not use_custom_kernel or xp == np: centers = xp.stack([ X[pred == i].mean(axis=0) for i in six.moves.range(n_clusters) ]) else: centers = xp.zeros((n_clusters, data_dim), dtype=np.float32) group = xp.zeros(n_clusters, dtype=np.float32) label = pred[:, None] cupy.ElementwiseKernel( 'S data, T label, int32 dim', 'raw S centers, raw S group', ''' int cent_ind[] = {label, i % dim}; atomicAdd(¢ers[cent_ind], data); atomicAdd(&group[label], 1); ''', 'calc_center')(X, label, data_dim, centers, group) group /= data_dim centers /= group[:, None] return centers, pred
def deformation(self, prm): """ Apply 2D Gaussian and Planar deformation. Computation is parallelized on GPU using cupy. """ import cupy as cp xy_cp = cp.asarray(prm.xy) a_cp = cp.asarray(self.a) b_cp = cp.asarray(self.b) c_cp = cp.asarray(self.c) d_cp = cp.asarray(self.d) sigma_cp = cp.asarray(self.sigma) e_cp = cp.asarray(self.e) f_cp = cp.asarray(self.f) g_cp = cp.asarray(self.g) z_cp = cp.asarray(prm.z) func_planar = cp.ElementwiseKernel( in_params='T x, T y, T e, T f, T g', out_params='T z', operation= \ ''' z = e + f*x + g*y; ''', name='func_planar' ) func_gauss2d = cp.ElementwiseKernel( in_params='T x, T y, T b, T c, T d, T sigma', out_params='T z', operation= \ ''' z = b*expf(-(powf(x-c,2) + powf(y-d,2))/(2*powf(sigma,2))); ''', name='func_gauss2d' ) gauss_2d_cp = cp.zeros_like(xy_cp[:, 0]) for i in range(len(self.b)): gauss_2d_cp += func_gauss2d(xy_cp[:, 0], xy_cp[:, 1], b_cp[i], c_cp[i], d_cp[i], sigma_cp[i]) s1_cp = a_cp + (1.5 / z_cp) * cp.outer(cp.transpose(gauss_2d_cp), z_cp) s2_cp = func_planar(xy_cp[:, 0], xy_cp[:, 1], e_cp, f_cp, g_cp) refl_cp = cp.asarray(self.refl) for i in range(prm.nxy_tr): s = s1_cp[i, :] + s2_cp[i] + z_cp mat = cp.tile(z_cp, (len(s), 1)) - cp.tile(cp.expand_dims(s, 1), (1, len(z_cp))) refl_cp[i, :] = cp.dot(refl_cp[i, :], cp.sinc(mat)) return np.reshape(cp.asnumpy(refl_cp), [prm.nxy_tr, prm.nz_tr])
def bincount(x, weights=None, minlength=None): """Count number of occurrences of each value in array of non-negative ints. Args: x (cupy.ndarray): Input array. weights (cupy.ndarray): Weights array which has the same shape as ``x``. minlength (int): A minimum number of bins for the output array. Returns: cupy.ndarray: The result of binning the input array. The length of output is equal to ``max(cupy.max(x) + 1, minlength)``. .. seealso:: :func:`numpy.bincount` """ if x.ndim > 1: raise ValueError('object too deep for desired array') if x.ndim < 1: raise ValueError('object of too small depth for desired array') if x.dtype.kind == 'f': raise TypeError('x must be int array') if (x < 0).any(): raise ValueError('The first argument of bincount must be non-negative') if weights is not None and x.shape != weights.shape: raise ValueError('The weights and list don\'t have the same length.') if minlength is not None: minlength = int(minlength) if minlength < 0: raise ValueError('minlength must be non-negative') size = int(cupy.max(x)) + 1 if minlength is not None: size = max(size, minlength) if weights is None: # atomicAdd for int64 is not provided b = cupy.zeros((size, ), dtype=cupy.int32) cupy.ElementwiseKernel('S x', 'raw U bin', 'atomicAdd(&bin[x], 1)', 'bincount_kernel')(x, b) b = b.astype(numpy.intp) else: # atomicAdd for float64 is not provided b = cupy.zeros((size, ), dtype=cupy.float32) cupy.ElementwiseKernel('S x, T w', 'raw U bin', 'atomicAdd(&bin[x], w)', 'bincount_with_weight_kernel')(x, weights, b) b = b.astype(cupy.float64) return b
def bincount(X, B, weights=None): if weights is None: b = cp.zeros((B, ), dtype=cp.int32) startin = time.time() cp.ElementwiseKernel('S x', 'raw U bin', 'atomicAdd(&bin[x], 1)', 'bincount_kernel')(X, b) b = b.astype(np.intp) else: b = cp.zeros((B, ), dtype=cp.float32) cp.ElementwiseKernel('S x, T w', 'raw U bin', 'atomicAdd(&bin[x], w)', 'bincount_with_weight_kernel')(X, weights, b) b = b.astype(cp.float64) return b
def col2im_gpu(col, sy, sx, ph, pw, h, w, dy=1, dx=1): n, c, kh, kw, out_h, out_w = col.shape img = cp.empty((n, c, h, w), dtype=col.dtype) cp.ElementwiseKernel( 'raw T col, int32 h, int32 w, int32 out_h, int32 out_w,' 'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw,' 'int32 dx, int32 dy', 'T img', ''' int c0 = i / (h * w); int y = i / w % h; int x = i % w; T val = 0; for (int ky = 0; ky < kh; ++ky) { int out_y = (y + ph - ky * dy); if (0 > out_y || out_y >= out_h * sy) continue; if (out_y % sy != 0) continue; out_y /= sy; for (int kx = 0; kx < kw; ++kx) { int out_x = (x + pw - kx * dx); if (0 > out_x || out_x >= out_w * sx) continue; if (out_x % sx != 0) continue; out_x /= sx; int k = out_y + out_h * (kx + kw * (ky + kh * c0)); val = val + col[out_x + out_w * k]; } } img = val; ''', 'col2im')(col.reduced_view(), h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, dx, dy, img) return img
def forward(ctx, data, masks, default_value): # PyTorch to CuPy device = data.device data_in = cp.asarray(data) masks = cp.asarray(masks) data_out = data_in.copy() dim = data_in.size / masks.size # distribute masks = cp.ascontiguousarray(masks) data_out = cp.ascontiguousarray(data_out) kernel = cp.ElementwiseKernel( 'raw S data_out, int64 mask', '', string.Template(''' if (mask == 0) { ${dtype}* p = (${dtype}*)&data_out[i * ${dim}]; for (int j = 0; j < ${dim}; j++) *p++ = ${default_value}; } ''').substitute( dim=dim, dtype=utils.get_dtype_in_cuda(data_out.dtype), default_value=default_value, ), 'function', ) kernel(data_out, masks) # CuPy to PyTorch data_out = torch.as_tensor(data_out, device=device) return data_out
def euclid_filter_cupy(image, distance, grad=True): cp.cuda.set_allocator(cp.cuda.MemoryPool().malloc) img = cp.asarray(image).astype(cp.float32) new_img = cp.zeros_like(img) height, width = img.shape diameter = distance * 2 + 1 eucfilter = cp.zeros((diameter, diameter), dtype=cp.float32) for i in range(diameter): for j in range(diameter): euclid = euclidean((i, j), (distance, distance)) if euclid <= distance: if grad: if euclid == 0: eucfilter[i, j] = 1 else: eucfilter[i, j] = 1 / euclid else: eucfilter[i, j] = 1 get_smooth_image = cp.ElementwiseKernel( in_params='raw float32 img, raw float32 eucfilter, uint16 height, uint16 width, uint16 distance, uint16 diameter', out_params='float32 output', preamble=\ ''' __device__ int get_x_idx(int i, int width) { return i % width; } __device__ int get_y_idx(int i, int height) { return i / height; } ''', operation=\ ''' int x = get_x_idx(i, width); int y = get_y_idx(i, height); float minimum = 0.00001; float sum = 0; float length = 0; if ( ((x >= distance) && (x < width - distance)) && ((y >= distance) && (y < height - distance)) && (img[i] > minimum) ) { for (int k=0; k<diameter; k++) { for (int l=0; l<diameter; l++) { float pixel_img = img[i + (k-distance)*height + l - distance]; float pixel_filter = eucfilter[k*diameter + l]; if (pixel_img > minimum) { sum += pixel_img * pixel_filter; length += pixel_filter; } } } output = sum / length; } else { output = 0; } ''', name='get_smooth_image' ) get_smooth_image(img, eucfilter, height, width, distance, diameter, new_img) return cp.asnumpy(new_img)
def compile_kernel(self, lower, upper): return cp.ElementwiseKernel( "uint8 r, uint8 g, uint8 b", "uint8 out", f"if(r >= {lower[0]} && r <= {upper[0]} && g >= {lower[1]} && g <= {upper[1]} && b >= {lower[2]} && b <= {upper[2]}) {{ out = 255; }} else {{ out = 0; }}", "threshold", )
def binKernel( num_bins ): binK = cp.ElementwiseKernel( 'float64 x, raw float64 bins', 'float64 z', ''' min_index = -9; for( j=0; j < num_bins - 1; ++j ) { if( bins[j] < x ) { min_index = j; } } if( bins[num_bins -1] < x) { min_index = -9; } z = min_index; ''', 'distance_arg', loop_prep=''' int j = 0; int min_index = -9; int num_bins = {}; '''.format(num_bins) ) return binK
def _get_zoom_kernel(ndim, large_int, yshape, mode, cval=0.0, order=1, integer_output=False, grid_mode=False, nprepad=0): in_params = 'raw X x, raw W zoom' out_params = 'Y y' operation, name = _generate_interp_custom( coord_func=_get_coord_zoom_grid if grid_mode else _get_coord_zoom, ndim=ndim, large_int=large_int, yshape=yshape, mode=mode, cval=cval, order=order, name="zoom_grid" if grid_mode else "zoom", integer_output=integer_output, nprepad=nprepad, ) return cupy.ElementwiseKernel(in_params, out_params, operation, name, preamble=math_constants_preamble)
def _kernel_init(): return cupy.ElementwiseKernel( "X x", "Y y", "if (x == 0) { y = -1; } else { y = i; }", "cucim_nd_label_init", )
def _get_correlete_kernel(ndim, mode, cval, xshape, wshape, origin): # weights is always casted to float64 in order to get an output compatible # with SciPy, thought float32 might be sufficient when input dtype is low # precision. in_params, out_params, operation, name = _generate_correlete_kernel( ndim, mode, cval, xshape, wshape, origin) return cupy.ElementwiseKernel(in_params, out_params, operation, name)
def test_manual_indexing(self, n=100): in1 = cupy.random.uniform(-1, 1, n).astype(cupy.float32) in2 = cupy.random.uniform(-1, 1, n).astype(cupy.float32) uesr_kernel_1 = cupy.ElementwiseKernel( 'T x, T y', 'T z', ''' z = x + y; ''', 'uesr_kernel_1') out1 = uesr_kernel_1(in1, in2) uesr_kernel_2 = cupy.ElementwiseKernel( 'raw T x, raw T y', 'raw T z', ''' z[i] = x[i] + y[i]; ''', 'uesr_kernel_2') out2 = uesr_kernel_2(in1, in2, size=n) testing.assert_array_equal(out1, out2)
def tri(N, M=None, k=0, dtype=float): """Creates an array with ones at and below the given diagonal. Args: N (int): Number of rows. M (int): Number of columns. M == N by default. k (int): The sub-diagonal at and below which the array is filled. Zero is the main diagonal, a positive value is above it, and a negative value is below. dtype: Data type specifier. Returns: cupy.ndarray: An array with ones at and below the given diagonal. .. seealso:: :func:`numpy.tri` """ if M is None: M = N out = cupy.empty((N, M), dtype=dtype) return cupy.ElementwiseKernel( 'int32 m, int32 k', 'T out', ''' int row = i % m; int col = i / m; out = (row <= col + k); ''', 'tri', )(M, k, out)
def test_strides(self): x = cupy.arange(6).reshape((2, 3)).astype('i') y = cupy.ElementwiseKernel( 'raw int32 x', 'int32 y', 'y = x.strides()[i]', 'test_carray_strides', )(x, size=2) testing.assert_array_equal(y, (12, 4))
def test_scalar(self, xp, dtype): x = testing.shaped_arange((2, 3, 4), xp, dtype) if xp is numpy: return x + numpy.dtype(dtype).type(self.value) else: kernel = cupy.ElementwiseKernel('T x, T y', 'T z', 'z = x + y') return kernel(x, self.value)
def _get_affine_kernel( ndim, large_int, yshape, mode, cval=0.0, order=1, integer_output=False, nprepad=0, ): in_params = "raw X x, raw W mat" out_params = "Y y" operation, name = _generate_interp_custom( in_params=in_params, coord_func=_get_coord_affine, ndim=ndim, large_int=large_int, yshape=yshape, mode=mode, cval=cval, order=order, name="affine", integer_output=integer_output, nprepad=nprepad, ) return cupy.ElementwiseKernel(in_params, out_params, operation, name, preamble=math_constants_preamble)
def _get_map_kernel( ndim, large_int, yshape, mode, cval=0.0, order=1, integer_output=False, nprepad=0, ): in_params = "raw X x, raw W coords" out_params = "Y y" operation, name = _generate_interp_custom( in_params=in_params, coord_func=_get_coord_map, ndim=ndim, large_int=large_int, yshape=yshape, mode=mode, cval=cval, order=order, name="map_coordinates", integer_output=integer_output, nprepad=nprepad, ) return cupy.ElementwiseKernel(in_params, out_params, operation, name)
def _kernel_labels(): return cupy.ElementwiseKernel( '', 'raw Y y, raw int32 count, raw int32 labels', ''' if (y[i] != i) continue; int j = atomicAdd(&count[1], 1); labels[j] = i; ''', 'cupyx_nd_label_labels')
def cupy_multiply_by_dense(): return cupy.ElementwiseKernel(''' raw S SP_DATA, raw I SP_INDPTR, raw I SP_INDICES, int32 SP_M, int32 SP_N, raw D DN_DATA, int32 DN_M, int32 DN_N, raw I OUT_INDPTR, int32 OUT_M, int32 OUT_N ''', 'O OUT_DATA, I OUT_INDICES', ''' int i_out = i; int m_out = get_row_id(i_out, 0, OUT_M - 1, &(OUT_INDPTR[0])); int i_sp = i_out; if (OUT_M > SP_M && SP_M == 1) { i_sp -= OUT_INDPTR[m_out]; } if (OUT_N > SP_N && SP_N == 1) { i_sp /= OUT_N; } int n_out = SP_INDICES[i_sp]; if (OUT_N > SP_N && SP_N == 1) { n_out = i_out - OUT_INDPTR[m_out]; } int m_dn = m_out; if (OUT_M > DN_M && DN_M == 1) { m_dn = 0; } int n_dn = n_out; if (OUT_N > DN_N && DN_N == 1) { n_dn = 0; } OUT_DATA = (O)(SP_DATA[i_sp] * DN_DATA[n_dn + (DN_N * m_dn)]); OUT_INDICES = n_out; ''', 'cupyx_scipy_sparse_csr_multiply_by_dense', preamble=_GET_ROW_ID_)
def _get_affine_kernel(ndim, large_int, yshape, mode, cval=0.0, order=1, integer_output=False): in_params = 'raw X x, raw W mat' out_params = 'Y y' operation, name = _generate_interp_custom( coord_func=_get_coord_affine, ndim=ndim, large_int=large_int, yshape=yshape, mode=mode, cval=cval, order=order, name='affine', integer_output=integer_output, ) return cupy.ElementwiseKernel(in_params, out_params, operation, name, preamble=math_constants_preamble)
def _get_map_kernel(ndim, large_int, yshape, mode, cval=0.0, order=1, integer_output=False, nprepad=0): in_params = 'raw X x, raw W coords' out_params = 'Y y' operation, name = _generate_interp_custom( coord_func=_get_coord_map, ndim=ndim, large_int=large_int, yshape=yshape, mode=mode, cval=cval, order=order, name='shift', integer_output=integer_output, nprepad=nprepad, omit_in_coord=True, # input image coordinates are not needed ) return cupy.ElementwiseKernel(in_params, out_params, operation, name, preamble=math_constants_preamble)
def cupy_binopt_csr_step2(op_name): name = 'cupyx_scipy_sparse_csr_binopt' + op_name + 'step2' return cupy.ElementwiseKernel( ''' raw I A_INFO, raw B A_VALID, raw I A_TMP_INDICES, raw O A_TMP_DATA, int32 A_NNZ, raw I B_INFO, raw B B_VALID, raw I B_TMP_INDICES, raw O B_TMP_DATA, int32 B_NNZ ''', 'raw I C_INDICES, raw O C_DATA', ''' if (i < A_NNZ) { int j = i; if (A_VALID[j]) { C_INDICES[A_INFO[j]] = A_TMP_INDICES[j]; C_DATA[A_INFO[j]] = A_TMP_DATA[j]; } } else if (i < A_NNZ + B_NNZ) { int j = i - A_NNZ; if (B_VALID[j]) { C_INDICES[B_INFO[j]] = B_TMP_INDICES[j]; C_DATA[B_INFO[j]] = B_TMP_DATA[j]; } } ''', name, )
def test_invalid_shape(self): with six.assertRaisesRegex(self, ValueError, 'Out shape is mismatched'): f = cupy.ElementwiseKernel('T x', 'T y', 'y += x') x = cupy.arange(12).reshape(3, 4) y = cupy.arange(4) f(x, y)
def gumbel(loc=0.0, scale=1.0, size=None, dtype=float): """Returns an array of samples drawn from a Gumbel distribution. The samples are drawn from a Gumbel distribution with location ``loc`` and scale ``scale``. Its probability density function is defined as .. math:: f(x) = \\frac{1}{\\eta} \ \\exp\\left\\{ - \\frac{x - \\mu}{\\eta} \\right\\} \ \\exp\\left[-\\exp\\left\\{-\\frac{x - \\mu}{\\eta} \ \\right\\}\\right], where :math:`\\mu` is ``loc`` and :math:`\\eta` is ``scale``. Args: loc (float): The location of the mode :math:`\\mu`. scale (float): The scale parameter :math:`\\eta`. size (int or tuple of ints): The shape of the array. If ``None``, a zero-dimensional array is generated. dtype: Data type specifier. Only :class:`numpy.float32` and :class:`numpy.float64` types are allowed. Returns: cupy.ndarray: Samples drawn from the Gumbel destribution. .. seealso:: :func:`numpy.random.gumbel` """ rs = uniform(size=size, dtype=dtype) # We use `1 - x` as input of `log` method to prevent overflow. # It obeys numpy implementation. return cupy.ElementwiseKernel('T x, T loc, T scale', 'T y', 'y = loc - log(-log(1 - x)) * scale', 'gumbel_kernel')(rs, loc, scale, rs) return rs
def get_label_lengths(self, labels): if self.xp == numpy: label_lengths = self.xp.zeros(len(labels)) for i in range(len(labels)): for j in range(len(labels[i])): if labels.data[i][j] == self.blank_symbol: label_lengths[i] = j break else: import cupy label_length_kernel = cupy.ElementwiseKernel( 'raw T labels, int32 blank_symbol, int32 num_labels', 'T length', ''' for (int j = 0; j < num_labels; ++j) { T label_value = labels[i * num_labels + j]; if (label_value == blank_symbol) { length = j; break; } } ''', 'get_label_lengths') label_lengths = label_length_kernel(labels.data, self.blank_symbol, labels.shape[1], size=len(labels)) return label_lengths
def interpolate_bilinear_gpu(x, v, u, vw, uw): B, H, W = x.shape out_H, out_W = v.shape y = cp.empty((B, out_H, out_W), dtype=x.dtype) cp.ElementwiseKernel( 'raw T x, S v, S u, T vw, T uw, S H, S W, S outsize', 'T y', ''' // indices S v0 = v; S v1 = min(v + 1, (S)(H - 1)); S u0 = u; S u1 = min(u + 1, (S)(W - 1)); // weights T w0 = (1 - vw) * (1 - uw); T w1 = (1 - vw) * uw; T w2 = vw * (1 - uw); T w3 = vw * uw; // fetch S offset = i / outsize * H * W; T px0 = x[offset + v0 * W + u0]; T px1 = x[offset + v0 * W + u1]; T px2 = x[offset + v1 * W + u0]; T px3 = x[offset + v1 * W + u1]; // interpolate y = (w0 * px0 + w1 * px1) + (w2 * px2 + w3 * px3); ''', 'resize_images_interpolate_bilinear')(x, v, u, vw, uw, H, W, out_H * out_W, y) return y
def test_getitem_int(self): x = cupy.arange(24).reshape((2, 3, 4)).astype('i') y = cupy.empty_like(x) y = cupy.ElementwiseKernel( 'raw T x', 'int32 y', 'y = x[i]', 'test_carray_getitem_int', )(x, y) testing.assert_array_equal(y, x)