def copy(a): """Creates a copy of a given array on the current device. This function allocates the new array on the current device. If the given array is allocated on the different device, then this function tries to copy the contents over the devices. Args: a (cupy.ndarray): The source array. Returns: cupy.ndarray: The copy of ``a`` on the current device. See: :func:`numpy.copy`, :meth:`cupy.ndarray.copy` """ # If the current device is different from the device of ``a``, then this # function allocates a new array on the current device, and copies the # contents over the devices. # TODO(beam2d): Support ordering option if a.size == 0: return cupy.empty_like(a) if not a.flags.c_contiguous: a = ascontiguousarray(a) if a.data.device == cuda.Device(): return a newarray = cupy.empty_like(a) newarray.data.copy_from(a.data, a.nbytes) return newarray
def _batched_inv(a): assert a.ndim >= 3 _util._assert_cupy_array(a) _util._assert_stacked_square(a) dtype, out_dtype = _util.linalg_common_type(a) if dtype == cupy.float32: getrf = cupy.cuda.cublas.sgetrfBatched getri = cupy.cuda.cublas.sgetriBatched elif dtype == cupy.float64: getrf = cupy.cuda.cublas.dgetrfBatched getri = cupy.cuda.cublas.dgetriBatched elif dtype == cupy.complex64: getrf = cupy.cuda.cublas.cgetrfBatched getri = cupy.cuda.cublas.cgetriBatched elif dtype == cupy.complex128: getrf = cupy.cuda.cublas.zgetrfBatched getri = cupy.cuda.cublas.zgetriBatched else: msg = ('dtype must be float32, float64, complex64 or complex128' ' (actual: {})'.format(a.dtype)) raise ValueError(msg) if 0 in a.shape: return cupy.empty_like(a, dtype=out_dtype) a_shape = a.shape # copy is necessary to present `a` to be overwritten. a = a.astype(dtype, order='C').reshape(-1, a_shape[-2], a_shape[-1]) handle = device.get_cublas_handle() batch_size = a.shape[0] n = a.shape[1] lda = n step = n * lda * a.itemsize start = a.data.ptr stop = start + step * batch_size a_array = cupy.arange(start, stop, step, dtype=cupy.uintp) pivot_array = cupy.empty((batch_size, n), dtype=cupy.int32) info_array = cupy.empty((batch_size, ), dtype=cupy.int32) getrf(handle, n, a_array.data.ptr, lda, pivot_array.data.ptr, info_array.data.ptr, batch_size) cupy.linalg._util._check_cublas_info_array_if_synchronization_allowed( getrf, info_array) c = cupy.empty_like(a) ldc = lda step = n * ldc * c.itemsize start = c.data.ptr stop = start + step * batch_size c_array = cupy.arange(start, stop, step, dtype=cupy.uintp) getri(handle, n, a_array.data.ptr, lda, pivot_array.data.ptr, c_array.data.ptr, ldc, info_array.data.ptr, batch_size) cupy.linalg._util._check_cublas_info_array_if_synchronization_allowed( getri, info_array) return c.reshape(a_shape).astype(out_dtype, copy=False)
def _init_params(self, n): # CuPy setup self.x = cp.linspace(0, 2, n, dtype=cp.double) self.y = cp.linspace(0, 2, n, dtype=cp.double) self.X, self.Y = cp.meshgrid(self.x, self.y) self.u = cp.zeros((n, n), dtype=cp.double) self.v = cp.zeros((n, n), dtype=cp.double) self.p = cp.zeros((n, n), dtype=cp.double) self.un = cp.empty_like(self.u, dtype=cp.double) self.vn = cp.empty_like(self.v, dtype=cp.double) self.pn = cp.empty_like(self.p, dtype=cp.double) self.b = cp.zeros((n, n), dtype=cp.double)
def _ndimage_mean_kernel_2(input, labels, index, batch_size=4, return_count=False): sum_val = cupy.empty_like(index, dtype=cupy.float64) count = cupy.empty_like(index, dtype=cupy.uint64) for i in range(0, index.size, batch_size): matched = labels == index[i:i + batch_size].reshape( (-1,) + (1,) * input.ndim) mean_axes = tuple(range(1, 1 + input.ndim)) count[i:i + batch_size] = matched.sum(axis=mean_axes) sum_val[i:i + batch_size] = cupy.where(matched, input, 0).sum( axis=mean_axes) if return_count: return sum_val / count, count return sum_val / count
def precompute_cc_factors(ad, bd, radius, mode="constant"): # factors = cp.zeros((5,) + ad.shape, dtype=ad.dtype) factors = [None] * 5 sum_h = cp.ones((2 * radius + 1, ), dtype=ad.dtype) h_tuple = (sum_h, ) * ad.ndim kwargs = dict(mode=mode) sum_a = convolve_separable(ad, h_tuple, **kwargs) sum_b = convolve_separable(bd, h_tuple, **kwargs) sum_ab = convolve_separable(ad * bd, h_tuple, **kwargs) sum_aa = convolve_separable(ad * ad, h_tuple, **kwargs) sum_bb = convolve_separable(bd * bd, h_tuple, **kwargs) if mode != "constant": cnt = (2 * radius + 1)**ad.ndim else: cnt = convolve_separable(cp.ones_like(ad), (sum_h, ) * ad.ndim, **kwargs).astype(cp.int32) if True: factors[0] = cp.empty_like(ad) factors[1] = cp.empty_like(ad) factors[2] = cp.empty_like(ad) factors[3] = cp.empty_like(ad) factors[4] = cp.empty_like(ad) _cc_precompute( ad, bd, sum_a, sum_b, sum_ab, sum_aa, sum_bb, cnt, factors[0], factors[1], factors[2], factors[3], factors[4], ) else: a_mean = sum_a / cnt b_mean = sum_b / cnt factors[0] = ad - a_mean factors[1] = bd - b_mean factors[2] = sum_ab - b_mean * sum_a - a_mean * sum_b + sum_a * b_mean factors[3] = sum_aa - (a_mean + a_mean) * sum_a + sum_a * a_mean factors[4] = sum_bb - (b_mean + b_mean) * sum_b + sum_b * b_mean return factors
def test_getitem_int(self): x = cupy.arange(24).reshape((2, 3, 4)).astype('i') y = cupy.empty_like(x) y = cupy.ElementwiseKernel( 'raw T x', 'int32 y', 'y = x[i]', 'test_carray_getitem_int', )(x, y) testing.assert_array_equal(y, x)
def clip(self, indices, in_place=False, remove=False): ''' Removes the items of list, header, and data that do not have index in the given indices. Parameters ---------- indices : array-like indices of items to leave in_place : bool, default False If True, perform in-place array manipulations for ndarray. This saves memory, but the values of the original ndarray are not preserved. ''' if remove: indices = sorted(set(range(len(self))) - set(indices)) else: indices = sorted(indices) list = self.list head = self.header data = self.data self.list = (list[i] for i in indices) self.header = (head[i] for i in indices) tmpd = data[:len(indices)] if not in_place: tmpd = cp.empty_like(tmpd) for dst, j in zip(tmpd, indices): cp.copyto(dst, data[j]) self.data = tmpd
def apply_diffusion(in_field, out_field, alpha, num_halo, num_iter=1): """ Integrate 4th-order diffusion equation by a certain number of iterations. Parameters ---------- in_field : array-like Input field (nz x ny x nx with halo in x- and y-direction). lap_field : array-like Result (must be same size as ``in_field``). alpha : float Diffusion coefficient (dimensionless). num_iter : `int`, optional Number of iterations to execute. """ tmp_field = xp.empty_like(in_field) for n in range(num_iter): halo_update(in_field, num_halo) laplacian(in_field, tmp_field, num_halo=num_halo, extend=1) laplacian(tmp_field, out_field, num_halo=num_halo, extend=0) out_field[:, num_halo:-num_halo, num_halo:-num_halo] = ( in_field[:, num_halo:-num_halo, num_halo:-num_halo] - alpha * out_field[:, num_halo:-num_halo, num_halo:-num_halo]) if n < num_iter - 1: in_field, out_field = out_field, in_field else: halo_update(out_field, num_halo)
def _pack(binary): data_size = binary.dtype.itemsize * binary.shape[0] out_size = data_size out = cp.empty_like(binary, dtype=cp.ubyte, shape=out_size) threadsperblock, blockspergrid = _get_tpb_bpg() k_type = "pack" _populate_kernel_cache(out.dtype, k_type) kernel = _get_backend_kernel( out.dtype, blockspergrid, threadsperblock, k_type, ) kernel(out_size, binary, out) _print_atts(kernel) # Remove binary data del binary return out
def _reshuffle_dct3(y, n, axis, dst): """Reorder entries to allow computation of DCT/DST-II via FFT.""" x = cupy.empty_like(y) n_half = (n + 1) // 2 # Store first half of y in the even entries of the output sl_even = [slice(None)] * y.ndim sl_even[axis] = slice(0, None, 2) sl_even = tuple(sl_even) sl_half = [slice(None)] * y.ndim sl_half[axis] = slice(0, n_half) x[sl_even] = y[tuple(sl_half)] # Store the second half of y in the odd entries of the output sl_odd = [slice(None)] * y.ndim sl_odd[axis] = slice(1, None, 2) sl_odd = tuple(sl_odd) sl_half[axis] = slice(-1, n_half - 1, -1) if dst: x[sl_odd] = -y[tuple(sl_half)] else: x[sl_odd] = y[tuple(sl_half)] return x
def test_static_array(self): code = ''' struct double5 { double value[5]; __device__ const double& operator[](size_t i) const { return value[i]; } }; extern "C" __global__ void test_kernel(const double* a, double5 b, double* x) { int i = threadIdx.x; x[i] = a[i] + b[0] + b[1] + b[2] + b[3] + b[4]; } ''' a_cpu = numpy.arange(24, dtype=numpy.float64) a = cupy.array(a_cpu) x = cupy.empty_like(a) func = _compile_func('test_kernel', code) # We cannot pass np.ndarray kernel arguments of size > 1 b = numpy.arange(5).astype(numpy.float64) with pytest.raises(TypeError): func.linear_launch(a.size, (a, b, x)) double5 = numpy.dtype({ 'names': ['dummy'], 'formats': [(numpy.float64, (5, ))] }) func.linear_launch(a.size, (a, b.view(double5), x)) expected = a_cpu + b.sum() testing.assert_array_equal(x, expected)
def copy(array, out=None, out_device=None, stream=None): """Copies a cupy.ndarray object using the default stream. This function can copy the device array to the destination array on another device. Args: array (cupy.ndarray): Array to be copied. out (cupy.ndarray): Destination array. If it is not ``None``, then ``out_device`` argument is ignored. out_device: Destination device specifier. Actual device object is obtained by passing this value to :func:`get_device`. stream (cupy.cuda.Stream): CUDA stream. Returns: cupy.ndarray: Copied array. If ``out`` is not specified, then the array is allocated on the device specified by ``out_device`` argument. """ check_cuda_available() assert stream is None # TODO(beam2d): FIX IT if out is None: if out_device is None: out_device = array with get_device(out_device): out = cupy.empty_like(array) with get_device(array): cupy.copyto(out, array) return out
def test_uniform_filter1d(self): d = cp.random.randn(5000) os = cp.empty((4, d.size)) ot = cp.empty_like(os) self.check_func_serial(4, sndi.uniform_filter1d, (d, 5), os) self.check_func_thread(4, sndi.uniform_filter1d, (d, 5), ot) assert_array_equal(os, ot)
def activation_backward(x, y, gy, mode): x = cupy.ascontiguousarray(x) gy = cupy.ascontiguousarray(gy) gx = cupy.empty_like(x) dtype = "d" if x.dtype == "d" else "f" one = numpy.array(1, dtype=dtype).ctypes zero = numpy.array(0, dtype=dtype).ctypes handle = get_handle() y_mat = _as4darray(y) desc = create_tensor_descriptor(y_mat) cudnn.activationBackward_v3( handle, mode, one.data, desc.value, y.data.ptr, desc.value, gy.data.ptr, desc.value, x.data.ptr, zero.data, desc.value, gx.data.ptr, ) return gx
def to_gpu(array, device=None, stream=None): """Copies the given CPU array to the specified device. Args: array: Array to be sent to GPU. device: Device specifier. stream (~cupy.cuda.Stream): *(deprecated since v3.0.0)* CUDA stream. If not ``None``, the copy runs asynchronously. Returns: cupy.ndarray: Array on GPU. If ``array`` is already on the GPU device specified by ``device``, this function just returns ``array`` without performing any copy. """ if stream is not None: warnings.warn( 'The stream option is deprecated in chainer.cuda.to_gpu. ' 'Please remove it.', DeprecationWarning) check_cuda_available() if not isinstance(array, (cupy.ndarray, numpy.ndarray)): raise TypeError( 'The array sent to gpu must be numpy.ndarray or cupy.ndarray.' '\nActual type: {0}.'.format(type(array))) with _get_device(device): array_dev = get_device_from_array(array) if array_dev.id == cupy.cuda.device.get_device_id(): return array if stream is not None and stream.ptr != 0: ret = cupy.empty_like(array) if array_dev.id == -1: # cpu to gpu mem = cupy.cuda.alloc_pinned_memory(array.nbytes) src = numpy.frombuffer(mem, array.dtype, array.size).reshape(array.shape) src[...] = array ret.set(src, stream) cupy.cuda.pinned_memory._add_to_watch_list( stream.record(), mem) else: # gpu to gpu with array_dev: src = array.copy() event = Stream.null.record() stream.wait_event(event) ret.data.copy_from_device_async(src.data, src.nbytes, stream) # to hold a reference until the end of the asynchronous # memcpy stream.add_callback(lambda *x: None, (src, ret)) return ret if array_dev.id == -1: return cupy.asarray(array) # Need to make a copy when an array is copied to another device return cupy.array(array, copy=True)
def test_backward_fft(self, dtype): t = dtype idtype = odtype = edtype = cupy.dtype(t) shape = self.shape length = cupy.core.internal.prod(shape[1:]) a = testing.shaped_random(shape, cupy, dtype) out = cupy.empty_like(a) plan = cufft.XtPlanNd(shape[1:], shape[1:], 1, length, idtype, shape[1:], 1, length, odtype, shape[0], edtype, order='C', last_axis=-1, last_size=None) plan.fft(a, out, cufft.CUFFT_INVERSE) if len(shape) <= 2: out_cp = cupy.fft.ifft(a) else: out_cp = cupy.fft.ifftn(a, axes=(-1, -2)) testing.assert_allclose(out / length, out_cp)
def _terrain_gpu(height_map, seed, x_range=(0, 1), y_range=(0, 1)): NOISE_LAYERS = ((1 / 2**i, (2**i, 2**i)) for i in range(16)) noise = cupy.empty_like(height_map, dtype=np.float32) griddim, blockdim = cuda_args(height_map.shape) for i, (m, (xfreq, yfreq)) in enumerate(NOISE_LAYERS): # cupy.random.seed(seed+i) # p = cupy.random.permutation(2**20) # use numpy.random then transfer data to GPU to ensure the same result # when running numpy backed and cupy backed data array. np.random.seed(seed + i) p = cupy.asarray(np.random.permutation(2**20)) p = cupy.append(p, p) _perlin_gpu[griddim, blockdim](p, x_range[0] * xfreq, x_range[1] * xfreq, y_range[0] * yfreq, y_range[1] * yfreq, m, noise) height_map += noise height_map /= (1.00 + 0.50 + 0.25 + 0.13 + 0.06 + 0.03) height_map = height_map**3 return height_map
def test_cross_correlate_masked_over_axes(): """Masked normalized cross-correlation over axes should be equivalent to a loop over non-transform axes.""" # See random number generator for reproducible results np.random.seed(23) arr1 = np.random.random((8, 8, 5)) arr2 = np.random.random((8, 8, 5)) m1 = np.random.choice([True, False], arr1.shape) m2 = np.random.choice([True, False], arr2.shape) arr1 = cp.asarray(arr1) arr2 = cp.asarray(arr2) m1 = cp.asarray(m1) m2 = cp.asarray(m2) # Loop over last axis with_loop = cp.empty_like(arr1, dtype=np.complex128) for index in range(arr1.shape[-1]): with_loop[:, :, index] = cross_correlate_masked(arr1[:, :, index], arr2[:, :, index], m1[:, :, index], m2[:, :, index], axes=(0, 1), mode='same') over_axes = cross_correlate_masked( arr1, arr2, m1, m2, axes=(0, 1), mode='same') cp.testing.assert_array_almost_equal(with_loop, over_axes)
def _unpack(binary, dtype, endianness): data_size = cp.dtype(dtype).itemsize // binary.dtype.itemsize out_size = binary.shape[0] // data_size out = cp.empty_like(binary, dtype=dtype, shape=out_size) if endianness == "B": little = False else: little = True threadsperblock, blockspergrid = _get_tpb_bpg() k_type = "unpack" _populate_kernel_cache(out.dtype, k_type) kernel = _get_backend_kernel( out.dtype, blockspergrid, threadsperblock, k_type, ) kernel(out_size, little, binary, out) _print_atts(kernel) # Remove binary data del binary return out
def activation_backward(x, y, gy, mode): x = cupy.ascontiguousarray(x) gy = cupy.ascontiguousarray(gy) gx = cupy.empty_like(x) dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=dtype).ctypes zero = numpy.array(0, dtype=dtype).ctypes handle = get_handle() y_mat = _as4darray(y) desc = create_tensor_descriptor(y_mat) if _cudnn_version >= 4000: act_desc = Descriptor(cudnn.createActivationDescriptor(), cudnn.destroyActivationDescriptor) cudnn.setActivationDescriptor( act_desc.value, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, 0.0) cudnn.activationBackward_v4( handle, act_desc.value, one.data, desc.value, y.data.ptr, desc.value, gy.data.ptr, desc.value, x.data.ptr, zero.data, desc.value, gx.data.ptr) else: cudnn.activationBackward_v3( handle, mode, one.data, desc.value, y.data.ptr, desc.value, gy.data.ptr, desc.value, x.data.ptr, zero.data, desc.value, gx.data.ptr) return gx
def main(): N = 8 module = cupy.RawModule(code=code, options=('-std=c++11', ), name_expressions=('kernel<float>', 'kernel<double>')) # The kernel computes out = A*B+C where A, B and C are 4x4 matrices. # A and B are arrays of N such matrices and C is a matrix kernel parameter. for (ctype, dtype) in zip(('float', 'double'), (numpy.float32, numpy.float64)): A = cupy.random.rand(16 * N, dtype=dtype).reshape(N, 4, 4) B = cupy.random.rand(16 * N, dtype=dtype).reshape(N, 4, 4) C = numpy.random.rand(16).astype(dtype).reshape(4, 4) out = cupy.empty_like(A) Matrix = numpy.dtype({ 'names': ['value'], 'formats': [(dtype, (4, 4))] }) kernel = module.get_function('kernel<{}>'.format(ctype)) args = (A, B, C.ravel().view(Matrix), out) kernel((1, ), (N, ), args) expected = cupy.matmul(A, B) + cupy.asarray(C[None, :, :]) cupy.testing.assert_array_almost_equal(expected, out) print("Kernel output matches expected value for " "type '{}'.".format(ctype))
def ascontiguousarray(a, dtype=None): """Returns a C-contiguous array. Args: a (cupy.ndarray): Source array. dtype: Data type specifier. Returns: cupy.ndarray: If no copy is required, it returns ``a``. Otherwise, it returns a copy of ``a``. .. seealso:: :func:`numpy.ascontiguousarray` """ if dtype is None: dtype = a.dtype else: dtype = numpy.dtype(dtype) if dtype == a.dtype and a.flags.c_contiguous: return a else: newarray = cupy.empty_like(a, dtype) elementwise.copy(a, newarray) return newarray
def copy(array, out=None, out_device=None, stream=None): """Copies a :class:`cupy.ndarray` object using the default stream. This function can copy the device array to the destination array on another device. Args: array (cupy.ndarray): Array to be copied. out (cupy.ndarray): Destination array. If it is not ``None``, then ``out_device`` argument is ignored. out_device: Destination device specifier. Actual device object is obtained by passing this value to :func:`get_device`. stream (cupy.cuda.Stream): CUDA stream. Returns: cupy.ndarray: Copied array. If ``out`` is not specified, then the array is allocated on the device specified by ``out_device`` argument. """ check_cuda_available() assert stream is None # TODO(beam2d): FIX IT if out is None: if out_device is None: out_device = array with get_device(out_device): out = cupy.empty_like(array) with get_device(array): cupy.copyto(out, array) return out
def inv_gpu(b): # We do a batched LU decomposition on the GPU to compute the inverse # Change the shape of the array to be size=1 minibatch if necessary # Also copy the matrix as the elments will be modified in-place a = _as_batch_mat(b).copy() n = a.shape[1] n_matrices = len(a) # Pivot array p = cupy.empty((n, n_matrices), dtype=np.int32) # Output array c = cupy.empty_like(a) # These arrays hold information on the execution success # or if the matrix was singular info = cupy.empty(n_matrices, dtype=np.int32) ap = _mat_ptrs(a) cp = _mat_ptrs(c) _, lda = _get_ld(a) _, ldc = _get_ld(c) handle = cuda.Device().cublas_handle cuda.cublas.sgetrfBatched( handle, n, ap.data.ptr, lda, p.data.ptr, info.data.ptr, n_matrices) cuda.cublas.sgetriBatched( handle, n, ap.data.ptr, lda, p.data.ptr, cp.data.ptr, ldc, info.data.ptr, n_matrices) return c
def pressure_poisson(u,v,p,b, nit, dx, dy): pn = cp.empty_like(p) pn = p.copy() # pn has all boundary related elemnts after 1st time loop # Below loop helps us to achieve pressure terms from boundary to whole surface via differential schemes in time. # Indeed we used boundary conditions and get the whole surface discretely for q in range(nit): # (n-1)th time presssure part is used to calculate nth time pressure. pn = p.copy() # optimized python code for our PP equation p[1:-1, 1:-1] = (((pn[1:-1, 2:] + pn[1:-1, 0:-2]) * dy**2 + (pn[2:, 1:-1] + pn[0:-2, 1:-1]) * dx**2) / (2 * (dx**2 + dy**2)) - dx**2 * dy**2 / (2 * (dx**2 + dy**2)) * b[1:-1,1:-1]) # boundary conditions # utilized backward difference scheme and equated it to 0 p[:, -1] = p[:, -2] # dp/dx = 0 at x = 2 p[0, :] = p[1, :] # dp/dy = 0 at y = 0 p[:, 0] = p[:, 1] # dp/dx = 0 at x = 0 p[-1, :] = 0 # p = 0 at y = 2 return p
def test_correlate1d(self): d = cp.random.randn(5000) os = cp.empty((4, d.size)) ot = cp.empty_like(os) self.check_func_serial(4, sndi.correlate1d, (d, cp.arange(5)), os) self.check_func_thread(4, sndi.correlate1d, (d, cp.arange(5)), ot) assert_array_equal(os, ot)
def cavity_flow(nt, u, v, p, b, vis, rho, nx, ny, nit, dt, dx, dy): un = cp.empty_like(u) vn = cp.empty_like(v) b = cp.zeros((ny, nx), dtype = np.float64) for n in range(nt): un = u.copy() vn = v.copy() #b = build_up_b(b, rho, dt, u, v, dx, dy) #p = pressure_poisson(p, dx, dy, b) b = build_up_b(b, u, v, rho, dt, dx, dy) p = pressure_poisson(un,vn,p,b, nit, dx, dy) u[1:-1, 1:-1] = (un[1:-1, 1:-1]- un[1:-1, 1:-1] * dt / dx * (un[1:-1, 1:-1] - un[1:-1, 0:-2]) - vn[1:-1, 1:-1] * dt / dy * (un[1:-1, 1:-1] - un[0:-2, 1:-1]) - dt / (2 * rho * dx) * (p[1:-1, 2:] - p[1:-1, 0:-2]) + vis * (dt / dx**2 * (un[1:-1, 2:] - 2 * un[1:-1, 1:-1] + un[1:-1, 0:-2]) + dt / dy**2 * (un[2:, 1:-1] - 2 * un[1:-1, 1:-1] + un[0:-2, 1:-1]))) v[1:-1,1:-1] = (vn[1:-1, 1:-1] - un[1:-1, 1:-1] * dt / dx * (vn[1:-1, 1:-1] - vn[1:-1, 0:-2]) - vn[1:-1, 1:-1] * dt / dy * (vn[1:-1, 1:-1] - vn[0:-2, 1:-1]) - dt / (2 * rho * dy) * (p[2:, 1:-1] - p[0:-2, 1:-1]) + vis * (dt / dx**2 * (vn[1:-1, 2:] - 2 * vn[1:-1, 1:-1] + vn[1:-1, 0:-2]) + dt / dy**2 * (vn[2:, 1:-1] - 2 * vn[1:-1, 1:-1] + vn[0:-2, 1:-1]))) u[0, :] = 0 u[:, 0] = 0 u[:, -1] = 0 u[-1, :] = 1 # set velocity on cavity lid equal to 1 v[0, :] = 0 v[-1, :] = 0 v[:, 0] = 0 v[:, -1] = 0 return u,v,p
def solve(a, b): """Solves a linear matrix equation. It computes the exact solution of ``x`` in ``ax = b``, where ``a`` is a square and full rank matrix. Args: a (cupy.ndarray): The matrix with dimension ``(..., M, M)``. b (cupy.ndarray): The matrix with dimension ``(...,M)`` or ``(..., M, K)``. Returns: cupy.ndarray: The matrix with dimension ``(..., M)`` or ``(..., M, K)``. .. warning:: This function calls one or more cuSOLVER routine(s) which may yield invalid results if input conditions are not met. To detect these invalid results, you can set the `linalg` configuration to a value that is not `ignore` in :func:`cupyx.errstate` or :func:`cupyx.seterr`. .. seealso:: :func:`numpy.linalg.solve` """ if a.ndim > 2 and a.shape[-1] <= get_batched_gesv_limit(): # Note: There is a low performance issue in batched_gesv when matrix is # large, so it is not used in such cases. return batched_gesv(a, b) # TODO(kataoka): Move the checks to the beginning _util._assert_cupy_array(a, b) _util._assert_stacked_2d(a) _util._assert_stacked_square(a) if not ((a.ndim == b.ndim or a.ndim == b.ndim + 1) and a.shape[:-1] == b.shape[:a.ndim - 1]): raise ValueError( 'a must have (..., M, M) shape and b must have (..., M) ' 'or (..., M, K)') dtype, out_dtype = _util.linalg_common_type(a, b) if a.ndim == 2: # prevent 'a' and 'b' to be overwritten a = a.astype(dtype, copy=True, order='F') b = b.astype(dtype, copy=True, order='F') cupyx.lapack.gesv(a, b) return b.astype(out_dtype, copy=False) # prevent 'a' to be overwritten a = a.astype(dtype, copy=True, order='C') x = cupy.empty_like(b, dtype=out_dtype) shape = a.shape[:-2] for i in range(numpy.prod(shape)): index = numpy.unravel_index(i, shape) # prevent 'b' to be overwritten bi = b[index].astype(dtype, copy=True, order='F') cupyx.lapack.gesv(a[index], bi) x[index] = bi return x
def matvec(x): y = cupy.empty_like(x) desc_x = cusparse.DnVecDescriptor.create(x) desc_y = cusparse.DnVecDescriptor.create(y) _cusparse.spMV(handle, op_a, alpha.ctypes.data, desc_A.desc, desc_x.desc, beta.ctypes.data, desc_y.desc, cuda_dtype, alg, buff.data.ptr) return y
def _compute_cc_step(grad_static, factors, radius, forward=True, zero_borders=True, coord_axis=-1): out = cp.empty_like(grad_static) ndim = out.ndim - 1 Ii = factors[0] Ji = factors[1] sfm = factors[2] sff = factors[3] smm = factors[4] result = cp.empty_like(sfm) _cc_local_correlation(sfm, sff, smm, 1e-5, result) energy = result.sum() if forward: cc_kernel = _cc_compute_forward else: cc_kernel = _cc_compute_backward # can reuse result for the output array cc_kernel(sfm, sff, smm, Ji, Ii, result) if coord_axis == -1: result = result[..., cp.newaxis] else: result = result[cp.newaxis, :] result = result * grad_static if zero_borders and radius > 0: if coord_axis not in [0, -1]: raise ValueError("coord_axis must be 0 or -1.") slices = [slice(None)] * (ndim + 1) if coord_axis == -1: axes = range(ndim) else: axes = range(1, ndim + 1) for ax in axes: slices[ax] = slice(0, radius) result[tuple(slices)] = 0 slices[ax] = slice(-radius, None) result[tuple(slices)] = 0 slices[ax] = slice(None) return result, energy
def test_correlate(self): d = cp.random.randn(500, 500) k = cp.random.randn(10, 10) os = cp.empty([4] + list(d.shape)) ot = cp.empty_like(os) self.check_func_serial(4, sndi.correlate, (d, k), os) self.check_func_thread(4, sndi.correlate, (d, k), ot) assert_array_equal(os, ot)
def test_empty_like_reshape_cupy_only(self, dtype, order): a = testing.shaped_arange((2, 3, 4), cupy, dtype) b = cupy.empty_like(a, shape=self.shape) b.fill(0) c = cupy.empty(self.shape, order=order, dtype=dtype) c.fill(0) testing.assert_array_equal(b, c)
def to_gpu(array, device=None, stream=None): """Copies the given CPU array to specified device. Args: array: Array to be sent to GPU. device: Device specifier. stream (cupy.cuda.Stream): CUDA stream. If not ``None``, the copy runs asynchronously. Returns: cupy.ndarray: Array on GPU. If ``array`` is already on GPU, then this function just returns ``array`` without performing any copy. Note that this function does not copy :class:`cupy.ndarray` into specified device. """ check_cuda_available() with _get_device(device): array_dev = get_device_from_array(array) if array_dev.id == cupy.cuda.device.get_device_id(): return array if stream is not None: warnings.warn( 'The stream option is deprecated in chainer.cuda.to_gpu. ' 'Please remove it.', DeprecationWarning) if stream.ptr != 0: ret = cupy.empty_like(array) if array_dev.id == -1: # cpu to gpu mem = cupy.cuda.alloc_pinned_memory(array.nbytes) src = numpy.frombuffer( mem, array.dtype, array.size).reshape(array.shape) src[...] = array ret.set(src, stream) cupy.cuda.pinned_memory._add_to_watch_list( stream.record(), mem) else: # gpu to gpu with array_dev: src = array.copy() event = Stream.null.record() stream.wait_event(event) ret.data.copy_from_device_async( src.data, src.nbytes, stream) # to hold a reference until the end of the asynchronous # memcpy stream.add_callback(lambda *x: None, (src, ret)) return ret if array_dev.id == -1: return cupy.asarray(array) # Need to make a copy when an array is copied to another device return cupy.array(array, copy=True)
def test_getitem_idx(self): x = cupy.arange(24).reshape((2, 3, 4)).astype('i') y = cupy.empty_like(x) y = cupy.ElementwiseKernel( 'raw T x', 'int32 y', 'int idx[] = {i / 12, i / 4 % 3, i % 4}; y = x[idx]', 'test_carray_getitem_idx', )(x, y) testing.assert_array_equal(y, x)
def activation_forward(x, mode): x = cupy.ascontiguousarray(x) y = cupy.empty_like(x) dtype = "d" if x.dtype == "d" else "f" one = numpy.array(1, dtype=dtype).ctypes zero = numpy.array(0, dtype=dtype).ctypes handle = get_handle() x_mat = _as4darray(x) desc = create_tensor_descriptor(x_mat) cudnn.activationForward_v3(handle, mode, one.data, desc.value, x_mat.data.ptr, zero.data, desc.value, y.data.ptr) return y
def _array_to_gpu(array, device, stream): assert device is DummyDevice or isinstance(device, Device) if array is None: return None if isinstance(array, (numpy.number, numpy.bool_)): array = numpy.asarray(array) elif isinstance(array, intel64.mdarray): array = numpy.asarray(array) if not isinstance(array, (cupy.ndarray, numpy.ndarray)): raise TypeError( 'The array sent to gpu must be an array or a NumPy scalar.' '\nActual type: {0}.'.format(type(array))) array_dev = get_device_from_array(array) if array_dev.id == cupy.cuda.device.get_device_id(): return array if stream is not None and stream.ptr != 0: ret = cupy.empty_like(array) if array_dev.id == -1: # cpu to gpu mem = cupy.cuda.alloc_pinned_memory(array.nbytes) src = numpy.frombuffer( mem, array.dtype, array.size).reshape(array.shape) src[...] = array ret.set(src, stream) cupy.cuda.pinned_memory._add_to_watch_list( stream.record(), mem) else: # gpu to gpu with array_dev: src = array.copy() event = Stream.null.record() stream.wait_event(event) ret.data.copy_from_device_async( src.data, src.nbytes, stream) # to hold a reference until the end of the asynchronous # memcpy stream.add_callback(lambda *x: None, (src, ret)) return ret if array_dev.id == -1: return cupy.asarray(array) # Need to make a copy when an array is copied to another device return cupy.array(array, copy=True)
def roll(a, shift, axis=None): """Roll array elements along a given axis. Args: a (~cupy.ndarray): Array to be rolled. shift (int): The number of places by which elements are shifted. axis (int or None): The axis along which elements are shifted. If ``axis`` is ``None``, the array is flattened before shifting, and after that it is reshaped to the original shape. Returns: ~cupy.ndarray: Output array. .. seealso:: :func:`numpy.roll` """ if axis is None: if a.size == 0: return a size = a.size ra = a.ravel() shift %= size res = cupy.empty((size,), a.dtype) res[:shift] = ra[size - shift:] res[shift:] = ra[:size - shift] return res.reshape(a.shape) else: axis = int(axis) if axis < 0: axis += a.ndim if not 0 <= axis < a.ndim: raise ValueError('axis must be >= %d and < %d' % (-a.ndim, a.ndim)) size = a.shape[axis] if size == 0: return a shift %= size prev = (slice(None),) * axis rest = (slice(None),) * (a.ndim - axis - 1) # Roll only the dimensiont at the given axis # ind1 is [:, ..., size-shift:, ..., :] # ind2 is [:, ..., :size-shift, ..., :] ind1 = prev + (slice(size - shift, None, None),) + rest ind2 = prev + (slice(None, size - shift, None),) + rest r_ind1 = prev + (slice(None, shift, None),) + rest r_ind2 = prev + (slice(shift, None, None),) + rest res = cupy.empty_like(a) res[r_ind1] = a[ind1] res[r_ind2] = a[ind2] return res
def empty_like(array): """Creates an uninitialized GPU array like the given one. Args: array (cupy.ndarray or numpy.ndarray): Base array. Returns: cupy.ndarray: GPU array of the same shape and dtype as `array`. """ warnings.warn("chainer.cuda.empty_like is deprecated. Use cupy.empty_like instead.", DeprecationWarning) check_cuda_available() if isinstance(array, cupy.ndarray): return cupy.empty_like(array) return cupy.empty(array.shape, dtype=array.dtype)
def to_gpu(array, device=None, stream=None): """Copies the given CPU array to specified device. Args: array: Array to be sent to GPU. device: Device specifier. stream (cupy.cuda.Stream): CUDA stream. If not ``None``, the copy runs asynchronously. Returns: cupy.ndarray: Array on GPU. If ``array`` is already on GPU, then this function just returns ``array`` without performing any copy. Note that this function does not copy :class:`cupy.ndarray` into specified device. """ check_cuda_available() with get_device(device): array_dev = get_device(array) if array_dev.id == cupy.cuda.device.get_device_id(): return array if stream is not None: ret = cupy.empty_like(array) if array_dev.id == -1: # cpu to gpu src = array.copy(order='C') ret.set(src, stream) else: # gpu to gpu with array_dev: src = array.copy() ret.data.copy_from_device_async(src.data, src.nbytes, stream) # to hold a reference until the end of the asynchronous memcpy stream.add_callback(lambda *x: None, (src, ret)) return ret if array_dev.id == -1: return cupy.asarray(array) # Need to make a copy when an array is copied to another device return cupy.array(array, copy=True)
def packbits(myarray): """Packs the elements of a binary-valued array into bits in a uint8 array. This function currently does not support ``axis`` option. Args: myarray (cupy.ndarray): Input array. Returns: cupy.ndarray: The packed array. .. note:: When the input array is empty, this function returns a copy of it, i.e., the type of the output array is not necessarily always uint8. This exactly follows the NumPy's behaviour (as of version 1.11), alghough this is inconsistent to the documentation. .. seealso:: :func:`numpy.packbits` """ if myarray.dtype.kind not in 'biu': raise TypeError( 'Expected an input array of integer or boolean data type') if myarray.size == 0: return cupy.empty_like(myarray) myarray = myarray.ravel() packed_size = (myarray.size + 7) // 8 packed = cupy.zeros((packed_size,), dtype=cupy.uint8) cupy.ElementwiseKernel( 'raw T myarray, raw int32 myarray_size', 'uint8 packed', '''for (int j = 0; j < 8; ++j) { int k = i * 8 + j; int bit = k < myarray_size && myarray[k] != 0; packed |= bit << (7 - j); }''', 'packbits_kernel' )(myarray, myarray.size, packed) return packed
def _array_to_gpu(array, device, stream): if array is None: return None if isinstance(array, chainerx.ndarray): # TODO(niboshi): Update this logic once both CuPy and ChainerX support # the array interface. if array.device.backend.name == 'cuda': # Convert to cupy.ndarray on the same device as source array array = cupy.ndarray( array.shape, array.dtype, cupy.cuda.MemoryPointer( cupy.cuda.UnownedMemory( array.data_ptr + array.offset, array.data_size, array, array.device.index), 0), strides=array.strides) else: array = chainerx.to_numpy(array) elif isinstance(array, (numpy.number, numpy.bool_)): array = numpy.asarray(array) elif isinstance(array, intel64.mdarray): array = numpy.asarray(array) if isinstance(array, ndarray): if array.device == device: return array is_numpy = False elif isinstance(array, numpy.ndarray): is_numpy = True else: raise TypeError( 'The array sent to gpu must be an array or a NumPy scalar.' '\nActual type: {0}.'.format(type(array))) if stream is not None and stream.ptr != 0: ret = cupy.empty_like(array) if is_numpy: # cpu to gpu mem = cupy.cuda.alloc_pinned_memory(array.nbytes) src = numpy.frombuffer( mem, array.dtype, array.size).reshape(array.shape) src[...] = array ret.set(src, stream) cupy.cuda.pinned_memory._add_to_watch_list( stream.record(), mem) else: # gpu to gpu with array.device: src = array.copy() event = Stream.null.record() stream.wait_event(event) ret.data.copy_from_device_async( src.data, src.nbytes, stream) # to hold a reference until the end of the asynchronous # memcpy stream.add_callback(lambda *x: None, (src, ret)) return ret with device: if is_numpy: return cupy.asarray(array) # Need to make a copy when an array is copied to another device return cupy.array(array, copy=True)