def cholesky(a):
    '''Cholesky decomposition.

    Decompose a given two-dimensional square matrix into ``L * L.T``,
    where ``L`` is a lower-triangular matrix and ``.T`` is a conjugate
    transpose operator. Note that in the current implementation ``a`` must be
    a real matrix, and only float32 and float64 are supported.

    Args:
        a (cupy.ndarray): The input matrix with dimension ``(N, N)``

    Returns:
        cupy.ndarray: A new float32 or float64 array holding the result of
        the cusolver factorization after ``_tril`` has been applied to it.

    Raises:
        RuntimeError: If cusolver is not enabled.
        TypeError: If the dtype of ``a`` does not promote to float32 or
            float64 (for example complex input).
        linalg.LinAlgError: If cusolver reports a non-zero status.

    .. seealso:: :func:`numpy.linalg.cholesky`
    '''
    if not cuda.cusolver_enabled:
        raise RuntimeError('Current cupy only supports cusolver in CUDA 8.0')

    # TODO(Saito): Current implementation only accepts two-dimensional arrays
    _assert_cupy_array(a)
    _assert_rank2(a)
    _assert_nd_squareness(a)

    # Cast to float32 or float64
    if a.dtype.char == 'f' or a.dtype.char == 'd':
        dtype = a.dtype.char
    else:
        dtype = numpy.find_common_type((a.dtype.char, 'f'), ()).char

    # Promotion can yield something other than 'f'/'d' (e.g. complex input).
    # Such data used to be handed to dpotrf as if it were float64, which
    # silently produced garbage; fail loudly instead.
    if dtype not in ('f', 'd'):
        raise TypeError(
            'cholesky only supports real matrices that can be cast to '
            'float32 or float64 (actual dtype: {})'.format(a.dtype))

    x = a.astype(dtype, copy=True)
    n = len(a)
    handle = device.get_cusolver_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    # Select the single/double precision routines once instead of
    # duplicating the whole call sequence per dtype.
    if dtype == 'f':
        potrf_buffer_size = cusolver.spotrf_bufferSize
        potrf = cusolver.spotrf
        work_dtype = numpy.float32
    else:  # dtype == 'd'
        potrf_buffer_size = cusolver.dpotrf_bufferSize
        potrf = cusolver.dpotrf
        work_dtype = numpy.float64

    # NOTE(review): the UPPER fill mode is presumably chosen because cusolver
    # is column-major while ``x`` is row-major -- confirm.
    buffersize = potrf_buffer_size(
        handle, cublas.CUBLAS_FILL_MODE_UPPER, n, x.data.ptr, n)
    workspace = cupy.empty(buffersize, dtype=work_dtype)
    potrf(
        handle, cublas.CUBLAS_FILL_MODE_UPPER, n, x.data.ptr, n,
        workspace.data.ptr, buffersize, dev_info.data.ptr)

    # Reading dev_info transfers the status code to the host.
    status = int(dev_info[0])
    if status > 0:
        raise linalg.LinAlgError(
            'The leading minor of order {} '
            'is not positive definite'.format(status))
    elif status < 0:
        raise linalg.LinAlgError(
            'Parameter error (maybe caused by a bug in cupy.linalg?)')
    _tril(x, k=0)
    return x
def tile(A, reps):
    """Construct an array by repeating A the number of times given by reps.

    Args:
        A (cupy.ndarray): Array to transform.
        reps (int or tuple): The number of repeats.

    Returns:
        cupy.ndarray: Transformed array with repeats.

    .. seealso:: :func:`numpy.tile`

    """
    try:
        reps_tuple = tuple(reps)
    except TypeError:
        # A scalar repeat count is treated as a one-element tuple.
        reps_tuple = (reps,)
    n_reps = len(reps_tuple)

    if all(r == 1 for r in reps_tuple) and isinstance(A, cupy.ndarray):
        # With every repetition equal to 1 the generic path below would not
        # copy an array input, so make the copy explicitly.
        return cupy.array(A, copy=True, ndmin=n_reps)

    # Note that no copy of zero-sized arrays is made. However since they
    # have no data there is no risk of an inadvertent overwrite.
    src = cupy.array(A, copy=False, ndmin=n_reps)
    if n_reps < src.ndim:
        # Missing leading repeat counts default to 1.
        reps_tuple = (1,) * (src.ndim - n_reps) + reps_tuple
    shape_out = tuple(
        extent * repeat for extent, repeat in zip(src.shape, reps_tuple))
    if src.size == 0:
        return cupy.empty(shape_out, dtype=src.dtype)

    # Build broadcast-compatible shapes: an axis that is really tiled becomes
    # a (repeat, extent) pair in the result and a (1, extent) pair in the
    # source view.
    src_view_shape = []
    tiled_shape = []
    for extent, repeat in zip(src.shape, reps_tuple):
        if repeat == 1 or extent == 1:
            src_view_shape.append(extent)
            tiled_shape.append(extent if repeat == 1 else repeat)
        else:
            src_view_shape.extend((1, extent))
            tiled_shape.extend((repeat, extent))

    tiled = cupy.empty(tiled_shape, dtype=src.dtype)
    if tiled.size:
        tiled[...] = src.reshape(src_view_shape)
    return tiled.reshape(shape_out)
def _generate_normal(self, func, size, dtype, *args):
    """Fill a new array by calling a curand normal-family generator.

    Args:
        func: Generator called as
            ``func(generator, pointer, count, *args)``.
        size: Requested output shape, normalized with ``core.get_size``.
        dtype: Element type of the output array.
        *args: Extra arguments forwarded to ``func``.

    Returns:
        cupy.ndarray: Array of shape ``size`` filled by ``func``.
    """
    # curand functions below don't support odd size.
    # * curand.generateNormal
    # * curand.generateNormalDouble
    # * curand.generateLogNormal
    # * curand.generateLogNormalDouble
    shape = core.get_size(size)
    count = six.moves.reduce(operator.mul, shape, 1)

    if count % 2:
        # Odd count: generate one extra element and drop it afterwards.
        padded = cupy.empty((count + 1,), dtype=dtype)
        func(self._generator, padded.data.ptr, padded.size, *args)
        return padded[:count].reshape(shape)

    result = cupy.empty(shape, dtype=dtype)
    func(self._generator, result.data.ptr, result.size, *args)
    return result
def _get_crossentropyloss_gpu(probs, t):
    """Launch the cross-entropy loss kernel and return its one-element output.

    Args:
        probs: Two-dimensional array; its shape is unpacked as ``(N, M)``.
        t: Passed to the kernel unchanged (presumably the targets -- confirm
            against the kernel source).

    Returns:
        A float32 array of shape ``(1,)`` that was handed to the kernel.
    """
    kernel = _crossentropyloss_kernel()
    n_rows, n_cols = probs.shape

    # NOTE(review): ``loss`` is allocated uninitialized. If the kernel
    # accumulates into it rather than overwriting it, it would need zeroing
    # first -- confirm against the kernel source.
    loss = cp.empty((1,), dtype=np.float32)

    # One block of 32 threads per row of ``probs``.
    grid = (n_rows, 1, 1)
    block = (32, 1, 1)
    kernel_args = (probs, t, loss, np.int32(n_rows), np.int32(n_cols))
    kernel(grid=grid, block=block, args=kernel_args)
    return loss
def check_copy(self, dtype, src_id, dst_id):
    """Check that ``core.elementwise_copy`` copies between two devices.

    Args:
        dtype: Element type of both arrays.
        src_id: Device on which the source array is created.
        dst_id: Device on which the destination array is allocated.
    """
    shape = (2, 3, 4)
    with cuda.Device(src_id):
        source = testing.shaped_arange(shape, dtype=dtype)
    with cuda.Device(dst_id):
        destination = cupy.empty(shape, dtype=dtype)

    core.elementwise_copy(source, destination)
    testing.assert_allclose(source, destination)
def test_copy_orders(self, order):
    """Check that ``cupy.copy`` yields the same strides as ``numpy.copy``."""
    shape = (2, 3, 4)

    gpu_source = cupy.empty(shape)
    gpu_copy = cupy.copy(gpu_source, order)

    cpu_source = numpy.empty(shape)
    cpu_copy = numpy.copy(cpu_source, order)

    self.assertEqual(gpu_copy.strides, cpu_copy.strides)
def _pyfftw_rfftn_empty_aligned(shape, axes, dtype, order='C', n=None):
    """Patched version of :func:`sporco.linalg.pyfftw_rfftn_empty_aligned`.

    Allocates an uninitialized array whose extent along the last entry of
    ``axes`` is ``shape[axis] // 2 + 1`` and whose dtype is the result of
    ``_complex_dtype(dtype)``. The ``n`` argument is accepted but unused.
    """
    out_shape = list(shape)
    last_axis = axes[-1]
    out_shape[last_axis] = out_shape[last_axis] // 2 + 1
    complex_dtype = _complex_dtype(dtype)
    buf = cp.empty(out_shape, complex_dtype, order)
    return buf
def take(a, indices, axis=None, out=None):
    """Takes elements of an array at specified indices along an axis.

    This is an implementation of "fancy indexing" at single axis.

    This function does not support ``mode`` option.

    Args:
        a (cupy.ndarray): Array to extract elements.
        indices (int or array-like): Indices of elements that this function
            takes.
        axis (int): The axis along which to select indices. The flattened
            input is used by default. Negative values count from the last
            axis.
        out (cupy.ndarray): Output array. If provided, it should be of
            appropriate shape and dtype.

    Returns:
        cupy.ndarray: The result of fancy indexing.

    Raises:
        ValueError: If ``axis`` is out of range, or ``out`` has the wrong
            shape.
        TypeError: If ``out`` has a dtype different from ``a``.

    .. seealso:: :func:`numpy.take`

    """
    if axis is None:
        # Work on the flattened array; its only axis is axis 0. Setting the
        # axis here also keeps the scalar-index path below valid.
        a = a.ravel()
        axis = 0
    else:
        # Normalize negative axes so the shape slices below are correct.
        if axis < 0:
            axis += a.ndim
        if not 0 <= axis < a.ndim:
            raise ValueError('Axis overrun')

    lshape = a.shape[:axis]
    rshape = a.shape[axis + 1:]

    if numpy.isscalar(indices):
        # A scalar index selects one slice: move the axis to the front and
        # index it directly.
        a = cupy.rollaxis(a, axis)
        if out is None:
            return a[indices].copy()
        else:
            # ``...`` also works when ``out`` is zero-dimensional.
            out[...] = a[indices]
            return out
    elif not isinstance(indices, cupy.ndarray):
        indices = cupy.array(indices, dtype=int)

    out_shape = lshape + indices.shape + rshape
    if out is None:
        out = cupy.empty(out_shape, dtype=a.dtype)
    else:
        if out.dtype != a.dtype:
            raise TypeError('Output dtype mismatch')
        if out.shape != out_shape:
            raise ValueError('Output shape mismatch')

    cdim = indices.size
    rdim = internal.prod(rshape)
    # Give the indices singleton axes on both sides so they line up with the
    # output shape.
    indices = cupy.reshape(
        indices, (1,) * len(lshape) + indices.shape + (1,) * len(rshape))
    return _take_kernel(a, indices, cdim, rdim, out)
def _get_out_args(out_args, out_types, out_shape):
    """Return validated output arrays, allocating them when none are given.

    Args:
        out_args: Caller-supplied output arrays (may be empty).
        out_types: Dtypes used to allocate outputs when ``out_args`` is empty.
        out_shape: Shape every output array must have.

    Returns:
        ``out_args`` itself when it is non-empty, otherwise a list of newly
        allocated uninitialized arrays.

    Raises:
        TypeError: If a supplied output is not a ``cupy.ndarray``.
        ValueError: If a supplied output has a shape other than ``out_shape``.
    """
    if out_args:
        for candidate in out_args:
            if not isinstance(candidate, cupy.ndarray):
                raise TypeError(
                    'Output arguments type must be cupy.ndarray')
            if candidate.shape != out_shape:
                raise ValueError('Out shape is mismatched')
        return out_args

    return [cupy.empty(out_shape, out_type) for out_type in out_types]
def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None):
    """Returns an array with evenly-spaced values within a given interval.

    Instead of specifying the step width like :func:`cupy.arange`, this
    function requires the total number of elements specified.

    Args:
        start: Start of the interval.
        stop: End of the interval.
        num: Number of elements.
        endpoint (bool): If True, the stop value is included as the last
            element. Otherwise, the stop value is omitted.
        retstep (bool): If True, this function returns (array, step).
            Otherwise, it returns only the array.
        dtype: Data type specifier. ``float`` is used when it is omitted.

    Returns:
        cupy.ndarray: The 1-D array of ranged values. With ``retstep`` a
        ``(array, step)`` tuple is returned; ``step`` is NaN when ``num`` is
        0 or 1.

    Raises:
        ValueError: If ``num`` is negative.

    """
    if num < 0:
        raise ValueError('linspace with num<0 is not supported')

    if dtype is None:
        # In actual implementation, only float is used
        dtype = float

    ret = cupy.empty((num,), dtype=dtype)

    if num == 0 or num == 1:
        # No spacing can be defined for fewer than two samples.
        step = float('nan')
        if num == 1:
            ret.fill(start)
    else:
        div = (num - 1) if endpoint else num
        step = float(stop - start) / div
        stop = float(stop)

        if step == 0.0:
            # for underflow
            _linspace_ufunc_underflow(start, stop - start, div, ret,
                                      casting='unsafe')
        else:
            _linspace_ufunc(start, step, ret, casting='unsafe')

        if endpoint:
            # Pin the last sample to ``stop`` exactly.
            ret[-1] = stop

    return (ret, step) if retstep else ret
def concatenate(tup, axis=0):
    """Joins arrays along an axis.

    Args:
        tup (sequence of arrays): Arrays to be joined. All of these should
            have same dimensionalities except the specified axis.
        axis (int): The axis to join arrays along.

    Returns:
        cupy.ndarray: Joined array.

    Raises:
        TypeError: If an element is not a ``cupy.ndarray`` or is
            zero-dimensional.
        ValueError: If ``tup`` is empty or the arrays have incompatible
            shapes.

    .. seealso:: :func:`numpy.concatenate`

    """
    ndim = None
    out_shape = None
    for arr in tup:
        if not isinstance(arr, cupy.ndarray):
            raise TypeError('Only cupy arrays can be concatenated')
        if arr.ndim == 0:
            raise TypeError('zero-dimensional arrays cannot be concatenated')

        if ndim is None:
            # The first array fixes the reference ndim and shape, and lets
            # the axis be normalized.
            ndim = arr.ndim
            out_shape = list(arr.shape)
            axis = _get_positive_axis(arr.ndim, axis)
            continue

        if arr.ndim != ndim:
            raise ValueError(
                'All arrays to concatenate must have the same ndim')
        for dim in six.moves.range(ndim):
            if dim != axis and out_shape[dim] != arr.shape[dim]:
                raise ValueError(
                    'All arrays must have same shape except the axis to '
                    'concatenate')
        out_shape[axis] += arr.shape[axis]

    if ndim is None:
        raise ValueError('Cannot concatenate from empty tuple')

    common_dtype = numpy.find_common_type([arr.dtype for arr in tup], [])
    joined = cupy.empty(out_shape, dtype=common_dtype)

    # Full slices for every axis in front of the concatenation axis.
    leading = (slice(None),) * axis
    offset = 0
    for arr in tup:
        width = arr.shape[axis]
        joined[leading + (slice(offset, offset + width),)] = arr
        offset += width

    return joined
def arange(start, stop=None, step=1, dtype=None):
    """Returns an array with evenly spaced values within a given interval.

    Values are generated within the half-open interval [start, stop). The
    first three arguments are mapped like the ``range`` built-in function,
    i.e. start and step are optional.

    Args:
        start: Start of the interval.
        stop: End of the interval.
        step: Step width between each pair of consecutive values.
        dtype: Data type specifier. It is inferred from other arguments by
            default.

    Returns:
        cupy.ndarray: The 1-D array of range values.

    .. seealso:: :func:`numpy.arange`

    """
    if dtype is None:
        # Any floating-point argument makes the result floating point.
        has_float = any(
            numpy.dtype(type(val)).kind == 'f'
            for val in (start, stop, step))
        dtype = float if has_float else int

    if stop is None:
        # Single-argument form: ``arange(stop)``.
        start, stop = 0, start

    length = int(numpy.ceil((stop - start) / step))
    if length <= 0:
        return cupy.empty((0,), dtype=dtype)

    values = cupy.empty((length,), dtype=dtype)
    scalar_type = numpy.dtype(dtype).type
    _arange_ufunc(scalar_type(start), scalar_type(step), values, dtype=dtype)
    return values
def _get_out_args_with_params(out_args, out_types, out_shape, out_params):
    """Return validated output arrays, honoring ``raw`` output parameters.

    Args:
        out_args: Caller-supplied output arrays (may be empty).
        out_types: Dtypes used to allocate outputs when ``out_args`` is empty.
        out_shape: Shape required of every non-raw output.
        out_params: Parameter descriptors; only their ``raw`` attribute is
            read here.

    Returns:
        ``out_args`` itself when it is non-empty, otherwise a list of newly
        allocated uninitialized arrays.

    Raises:
        ValueError: If outputs must be allocated but a parameter is raw, or
            a non-raw output has a shape other than ``out_shape``.
        TypeError: If a supplied output is not a ``cupy.ndarray``.
    """
    if out_args:
        for candidate, param in six_zip(out_args, out_params):
            if not isinstance(candidate, cupy.ndarray):
                raise TypeError(
                    'Output arguments type must be cupy.ndarray')
            # Raw parameters are exempt from the shape check.
            if not (candidate.shape == out_shape or param.raw):
                raise ValueError('Out shape is mismatched')
        return out_args

    # Raw outputs have no implied shape, so they cannot be allocated here.
    if any(param.raw for param in out_params):
        raise ValueError('Output array size is Undecided')
    return [cupy.empty(out_shape, out_type) for out_type in out_types]
def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None):
    """Returns an array with evenly-spaced values within a given interval.

    Instead of specifying the step width like :func:`cupy.arange`, this
    function requires the total number of elements specified.

    Args:
        start: Start of the interval.
        stop: End of the interval.
        num: Number of elements.
        endpoint (bool): If True, the stop value is included as the last
            element. Otherwise, the stop value is omitted.
        retstep (bool): If True, this function returns (array, step).
            Otherwise, it returns only the array.
        dtype: Data type specifier. It is inferred from the start and stop
            arguments by default.

    Returns:
        cupy.ndarray: The 1-D array of ranged values. With ``retstep`` a
        ``(array, step)`` tuple is returned; ``step`` is NaN when ``num`` is
        1 because no spacing is defined for a single sample.

    Raises:
        ValueError: If ``num`` is not positive.

    """
    if num <= 0:
        # TODO(beam2d): Return zero-sized array
        raise ValueError('linspace with num<=0 is not supported')

    if dtype is None:
        if any(numpy.dtype(type(val)).kind == 'f' for val in (start, stop)):
            dtype = float
        else:
            dtype = int

    ret = cupy.empty((num,), dtype=dtype)

    if num == 1:
        ret.fill(start)
        # Previously this branch returned the bare array even when
        # ``retstep`` was requested; report an undefined (NaN) step instead.
        step = float('nan')
    else:
        if endpoint:
            step = (stop - start) / (num - 1)
        else:
            step = (stop - start) / num
            # Without the endpoint the last sample sits one step short of
            # ``stop``.
            stop = start + step * (num - 1)

        typ = numpy.dtype(dtype).type
        _linspace_ufunc(typ(start), stop - start, num - 1, ret)

    if retstep:
        return ret, step
    else:
        return ret
def empty(shape, dtype=numpy.float32):
    """Creates an uninitialized cupy.ndarray object.

    Deprecated: use :func:`cupy.empty` instead.

    Args:
        shape (tuple of ints): The shape of array.
        dtype (numpy.dtype): Element type.

    Returns:
        cupy.ndarray: Uninitialized GPU array allocated by the memory pool.

    """
    # stacklevel=2 attributes the warning to the caller of this wrapper, so
    # the deprecated call site is the location that gets reported.
    warnings.warn("chainer.cuda.empty is deprecated. Use cupy.empty instead.",
                  DeprecationWarning, stacklevel=2)
    check_cuda_available()
    return cupy.empty(shape, dtype)
def roll(a, shift, axis=None):
    """Roll array elements along a given axis.

    Args:
        a (~cupy.ndarray): Array to be rolled.
        shift (int): The number of places by which elements are shifted.
        axis (int or None): The axis along which elements are shifted.
            If ``axis`` is ``None``, the array is flattened before shifting,
            and after that it is reshaped to the original shape.

    Returns:
        ~cupy.ndarray: Output array. When the rolled extent is zero, ``a``
        itself is returned.

    Raises:
        ValueError: If ``axis`` is out of range.

    .. seealso:: :func:`numpy.roll`

    """
    if axis is None:
        if a.size == 0:
            return a
        total = a.size
        flat = a.ravel()
        shift %= total
        rolled = cupy.empty((total,), a.dtype)
        # The last ``shift`` elements wrap around to the front.
        rolled[:shift] = flat[total - shift:]
        rolled[shift:] = flat[:total - shift]
        return rolled.reshape(a.shape)

    axis = int(axis)
    if axis < 0:
        axis += a.ndim
    if axis < 0 or axis >= a.ndim:
        raise ValueError('axis must be >= %d and < %d' % (-a.ndim, a.ndim))

    extent = a.shape[axis]
    if extent == 0:
        return a
    shift %= extent
    pivot = extent - shift

    # Full slices for the axes before and after the rolled one.
    before = (slice(None),) * axis
    after = (slice(None),) * (a.ndim - axis - 1)

    # Roll only the dimension at the given axis:
    #   src_tail is [:, ..., extent-shift:, ..., :]
    #   src_head is [:, ..., :extent-shift, ..., :]
    src_tail = before + (slice(pivot, None, None),) + after
    src_head = before + (slice(None, pivot, None),) + after
    dst_head = before + (slice(None, shift, None),) + after
    dst_tail = before + (slice(shift, None, None),) + after

    rolled = cupy.empty_like(a)
    rolled[dst_head] = a[src_tail]
    rolled[dst_tail] = a[src_head]
    return rolled
def diagonal(a, offset=0, axis1=0, axis2=1):
    """Returns specified diagonals.

    This function extracts the diagonals along two specified axes. The other
    axes are not changed. This function returns a writable view of this array
    as NumPy 1.10 will do.

    Args:
        a (cupy.ndarray): Array from which the diagonals are taken.
        offset (int): Index of the diagonals. Zero indicates the main
            diagonals, a positive value upper diagonals, and a negative value
            lower diagonals.
        axis1 (int): The first axis to take diagonals from.
        axis2 (int): The second axis to take diagonals from.

    Returns:
        cupy.ndarray: A view of the diagonals of ``a``. When the diagonal is
        empty, a newly allocated empty array is returned instead.

    .. seealso:: :func:`numpy.diagonal`

    """
    low_axis, high_axis = (
        (axis1, axis2) if axis1 < axis2 else (axis2, axis1))

    # Axes that are kept; remove the higher index first so the lower index
    # still refers to the intended entry.
    kept_axes = list(six.moves.range(a.ndim))
    kept_axes.pop(high_axis)
    kept_axes.pop(low_axis)

    # Move the two diagonal axes to the end. A negative offset is handled by
    # swapping them and using the positive offset.
    if offset < 0:
        a = cupy.transpose(a, kept_axes + [axis2, axis1])
        offset = -offset
    else:
        a = cupy.transpose(a, kept_axes + [axis1, axis2])

    diag_size = max(0, min(a.shape[-2], a.shape[-1] - offset))
    ret_shape = a.shape[:-2] + (diag_size,)
    if diag_size == 0:
        return cupy.empty(ret_shape, dtype=a.dtype)

    a = a[..., :diag_size, offset:offset + diag_size]

    # Patch the view's metadata directly: stepping along the diagonal
    # advances both trailing axes at once, hence the summed stride.
    view = a.view()
    view._shape = a.shape[:-2] + (diag_size,)
    view._strides = a.strides[:-2] + (a.strides[-1] + a.strides[-2],)
    view._size = internal.prod(view._shape)
    view._c_contiguous = -1
    view._f_contiguous = -1
    return view
def empty_like(array):
    """Creates an uninitialized GPU array like the given one.

    Deprecated: use :func:`cupy.empty_like` instead.

    Args:
        array (cupy.ndarray or numpy.ndarray): Base array.

    Returns:
        cupy.ndarray: GPU array of the same shape and dtype as `array`.

    """
    # stacklevel=2 attributes the warning to the caller of this wrapper, so
    # the deprecated call site is the location that gets reported.
    warnings.warn(
        "chainer.cuda.empty_like is deprecated. Use cupy.empty_like instead.",
        DeprecationWarning, stacklevel=2)
    check_cuda_available()
    if isinstance(array, cupy.ndarray):
        return cupy.empty_like(array)
    # Non-cupy inputs only provide the shape and dtype of the new array.
    return cupy.empty(array.shape, dtype=array.dtype)
def _get_out_args(in_args, out_args, out_types, out_shape, out_params=None):
    """Return validated output arrays, allocating them when none are given.

    Args:
        in_args: Accepted but not used by this function.
        out_args: Caller-supplied output arrays (may be empty).
        out_types: Dtypes used to allocate outputs when ``out_args`` is empty.
        out_shape: Shape required of every non-raw output.
        out_params: Optional parameter descriptors; only their ``raw``
            attribute is read here.

    Returns:
        ``out_args`` itself when it is non-empty, otherwise a list of newly
        allocated uninitialized arrays.

    Raises:
        ValueError: If outputs must be allocated but a parameter is raw, or
            a non-raw output has a shape other than ``out_shape``.
        TypeError: If a supplied output is not a ``cupy.ndarray``.
    """
    if len(out_args) == 0:
        # Raw outputs have no implied shape, so they cannot be allocated.
        if out_params is not None and any(p.raw for p in out_params):
            raise ValueError('Output array size is Undecided')
        return [cupy.empty(shape=out_shape, dtype=t) for t in out_types]

    assert len(out_args) == len(out_types)
    for index, candidate in enumerate(out_args):
        if not isinstance(candidate, cupy.ndarray):
            raise TypeError(
                'Output arguments type must be cupy.ndarray')
        # A shape mismatch is tolerated only for raw parameters.
        if candidate.shape != out_shape and (
                out_params is None or not out_params[index].raw):
            raise ValueError('Out shape is mismatched')
    return out_args
def lognormal(self, mean=0.0, sigma=1.0, size=None, dtype=float):
    """Returns an array of samples drawn from a log normal distribution.

    Args:
        mean: Forwarded to the curand log-normal generator.
        sigma: Forwarded to the curand log-normal generator.
        size: Shape of the returned array.
        dtype: Data type specifier, validated by ``_check_and_get_dtype``.

    Returns:
        cupy.ndarray: Samples with shape ``size``. For an odd number of
        elements the result is a view into a buffer one element larger.

    .. seealso::
        :func:`cupy.random.lognormal` for full documentation,
        :meth:`numpy.random.RandomState.lognormal`

    """
    dtype = _check_and_get_dtype(dtype)
    out = cupy.empty(size, dtype=dtype)
    if dtype.char == 'f':
        func = curand.generateLogNormal
    else:
        func = curand.generateLogNormalDouble

    if out.size % 2 == 0:
        func(self._generator, out.data.ptr, out.size, mean, sigma)
        return out

    # The curand normal-family generators do not support an odd element
    # count, so generate one extra sample and drop it.
    padded = cupy.empty((out.size + 1,), dtype=dtype)
    func(self._generator, padded.data.ptr, padded.size, mean, sigma)
    return padded[:out.size].reshape(out.shape)
def normal(self, loc=0.0, scale=1.0, size=None, dtype=float):
    """Returns an array of normally distributed samples.

    Args:
        loc: Forwarded to the curand normal generator.
        scale: Forwarded to the curand normal generator.
        size: Shape of the returned array.
        dtype: Data type specifier, validated by ``_check_and_get_dtype``.

    Returns:
        cupy.ndarray: Samples with shape ``size``. For an odd number of
        elements the result is a view into a buffer one element larger.

    .. seealso::
        :func:`cupy.random.normal` for full documentation,
        :meth:`numpy.random.RandomState.normal`

    """
    dtype = _check_and_get_dtype(dtype)
    out = cupy.empty(size, dtype=dtype)
    if dtype.char == 'f':
        func = curand.generateNormal
    else:
        func = curand.generateNormalDouble

    if out.size % 2 == 0:
        func(self._generator, out.data.ptr, out.size, loc, scale)
        return out

    # The curand normal-family generators do not support an odd element
    # count, so generate one extra sample and drop it.
    padded = cupy.empty((out.size + 1,), dtype=dtype)
    func(self._generator, padded.data.ptr, padded.size, loc, scale)
    return padded[:out.size].reshape(out.shape)
def asfortranarray(a, dtype=None):
    """Return an array laid out in Fortran order in memory.

    Args:
        a (~cupy.ndarray): The input array.
        dtype (str or dtype object, optional): By default, the data-type is
            inferred from the input data.

    Returns:
        ~cupy.ndarray: The input `a` in Fortran, or column-major, order.

    .. seealso:: :func:`numpy.asfortranarray`

    """
    out_dtype = a.dtype if dtype is None else dtype
    # Allocating with the reversed shape and transposing gives an array of
    # the original shape whose memory layout is column-major.
    reversed_buffer = cupy.empty(a.shape[::-1], out_dtype)
    ret = reversed_buffer.T
    ret[...] = a
    return ret
def random_sample(self, size=None, dtype=float):
    """Returns an array of random values over the interval ``[0, 1)``.

    Args:
        size: Shape of the returned array.
        dtype: Data type specifier, validated by ``_check_and_get_dtype``.

    Returns:
        cupy.ndarray: The generated samples after ``RandomState._1m_kernel``
        has been applied to them.

    .. seealso::
        :func:`cupy.random.random_sample` for full documentation,
        :meth:`numpy.random.RandomState.random_sample`

    """
    dtype = _check_and_get_dtype(dtype)
    samples = cupy.empty(size, dtype=dtype)
    generate = (curand.generateUniform if dtype.char == 'f'
                else curand.generateUniformDouble)
    generate(self._generator, samples.data.ptr, samples.size)
    # NOTE(review): ``_1m_kernel`` presumably maps x -> 1 - x so that the
    # interval becomes [0, 1) -- confirm against its definition.
    RandomState._1m_kernel(samples)
    return samples
def forward_gpu(self, inputs):
    """Compute the product of ``x`` and ``W`` with one sgemm call.

    Args:
        inputs: Sequence ``(x, W)`` or ``(x, W, b)``. ``x`` is indexed as a
            three-dimensional array and ``W`` as a two-dimensional one.
            NOTE(review): the sgemm call assumes float32 data -- confirm the
            callers guarantee it.

    Returns:
        tuple: One-element tuple holding the float32 result of shape
        ``(x.shape[0], x.shape[1], W.shape[1])``, with ``b`` added in place
        when it is given.
    """
    x = inputs[0]
    W = inputs[1]

    # Prepare BLAS call
    handle = cuda.Device().cublas_handle
    k, m = W.shape
    n = x.shape[0] * x.shape[1]
    in_size = x.shape[2]  # read as in the original; not used further
    lda = max(1, x.shape[-1])
    ldb = max(1, W.strides[0] // W.dtype.itemsize)
    ldc = max(1, m)

    out_shape = (x.shape[0], x.shape[1], W.shape[1])
    Wx = cupy.empty(out_shape, dtype=numpy.float32)
    sgemm(handle, False, False, m, n, k, 1, W.data.ptr, ldb,
          x.data.ptr, lda, 0, Wx.data.ptr, ldc)

    if len(inputs) > 2:
        Wx += inputs[2]

    return (Wx,)
def _forward_gpu(x):
    """Run the forward kernel on ``x`` and return the new output array.

    Args:
        x: Array whose first three extents are read as ``(T, N, M)``.

    Returns:
        A float32 array of shape ``(N, T, M)`` that was handed to the kernel.
    """
    T, N, M = x.shape[0], x.shape[1], x.shape[2]
    y = cp.empty((N, T, M), dtype=np.float32)

    # Pick the launch configuration from the relative sizes of N and M.
    if N == 1:
        launch_dims = gpu.utils.Get_bdim_and_gdimRowVec(M)
    elif M >= (N * 4):
        launch_dims = gpu.utils.Get_bdim_and_gdimSmallNBigM(N, M)
    else:
        launch_dims = gpu.utils.Get_bdim_and_gdim2D(N, M)
    bdim, gdim = launch_dims

    # NOTE(review): T, N and M are passed as plain Python ints -- confirm
    # this matches the integer width the kernel expects.
    forward_kernel = _GetForward_kernel()
    forward_kernel(grid=gdim, block=bdim, args=(x, y, T, N, M))
    return y
def hotdot(a, indices, out=None, dont_add=False):
    """Launch the HotDot kernel for ``a`` and a K-hot index matrix.

    Per the original notes, the result is ``x.dot(a.T)`` where ``x`` is the
    K-hot encoded matrix described by ``indices``.

    Args:
        a: Two-dimensional array; its shape is unpacked as ``(H, D)``.
        indices: Two-dimensional array of hot indices; its shape is unpacked
            as ``(N, K)``.
        out: Optional output array. A float32 array of shape ``(N, H)`` is
            allocated when it is omitted.
        dont_add: When true, or when ``out`` is omitted, the kernel receives
            the flag value 1, otherwise 0. NOTE(review): 1 presumably means
            "overwrite instead of accumulate" -- confirm against the kernel.

    Returns:
        The output array.
    """
    HotDot1, HotDot2 = _get_HotDot_kernels()
    H, D = a.shape
    N, K = indices.shape

    # Pick the launch configuration from the relative sizes of N and H.
    if N == 1:
        launch_dims = Get_bdim_and_gdimRowVec(H)
    elif H >= (N * 4):
        launch_dims = Get_bdim_and_gdimSmallNBigM(N, H)
    else:
        launch_dims = Get_bdim_and_gdim2D(N, H)
    bdim, gdim = launch_dims

    # A freshly allocated output always forces the flag to 1.
    allocate = out is None
    B = np.int32(1) if (dont_add or allocate) else np.int32(0)
    if allocate:
        out = cp.empty((N, H), dtype=np.float32)

    if K > 1:
        kernel = HotDot1
        kernel_args = (a, out, indices, np.int32(K), np.int32(N),
                       np.int32(H), np.int32(D), np.int32(B))
    else:
        # The single-index kernel does not take K.
        kernel = HotDot2
        kernel_args = (a, out, indices, np.int32(N), np.int32(H),
                       np.int32(D), np.int32(B))
    kernel(grid=gdim, block=bdim, args=kernel_args)
    return out
def asfortranarray(a, dtype=None):
    """Return an array laid out in Fortran order in memory.

    A C-contiguous two-dimensional float32/float64 input with no dtype
    conversion is transposed with a single cuBLAS ``geam`` call; everything
    else goes through a generic element-wise copy.

    Args:
        a (~cupy.ndarray): The input array.
        dtype (str or dtype object, optional): By default, the data-type is
            inferred from the input data.

    Returns:
        ~cupy.ndarray: The input `a` in Fortran, or column-major, order.

    .. seealso:: :func:`numpy.asfortranarray`

    """
    out_dtype = a.dtype if dtype is None else dtype
    ret = cupy.empty(a.shape[::-1], out_dtype).T

    use_geam = (
        a.flags.c_contiguous and
        (a.dtype == numpy.float32 or a.dtype == numpy.float64) and
        a.ndim == 2 and dtype is None)
    if not use_geam:
        ret[...] = a
        return ret

    m, n = a.shape
    # The fast path is only entered for float32 or float64 input.
    if a.dtype == numpy.float32:
        geam = cupy.cuda.cublas.sgeam
    else:
        geam = cupy.cuda.cublas.dgeam
    geam(
        cupy.cuda.Device().cublas_handle,
        1,  # transpose a
        1,  # transpose ret
        m, n, 1., a.data.ptr, n, 0., a.data.ptr, n,
        ret.data.ptr, m)
    return ret
def _backward_gpu(gy):
    """Run the backward kernel on ``gy`` and return the new gradient array.

    Args:
        gy: Array whose first three extents are read as ``(N, T, M)``.

    Returns:
        A float32 array of shape ``(T, N, M)`` that was handed to the kernel.
    """
    N, T, M = gy.shape[0], gy.shape[1], gy.shape[2]
    gx = cp.empty((T, N, M), dtype=np.float32)

    # Pick the launch configuration from the relative sizes of N and M.
    if N == 1:
        launch_dims = gpu.utils.Get_bdim_and_gdimRowVec(M)
    elif M >= (N * 4):
        launch_dims = gpu.utils.Get_bdim_and_gdimSmallNBigM(N, M)
    else:
        launch_dims = gpu.utils.Get_bdim_and_gdim2D(N, M)
    bdim, gdim = launch_dims

    # NOTE(review): T, N and M are passed as plain Python ints -- confirm
    # this matches the integer width the kernel expects.
    Backward_kernel = _GetBackward_kernel()
    Backward_kernel(grid=gdim, block=bdim, args=(gy, gx, T, N, M))
    return gx
def _empty_aligned(shape, dtype, order='C', n=None):
    """Patched version of :func:`sporco.fft.empty_aligned`.

    Returns an uninitialized array from ``cp.empty``; the ``n`` argument is
    accepted but unused.
    """
    buf = cp.empty(shape, dtype, order)
    return buf
def add_buffers_gpu(species, float_recv_left, float_recv_right,
                    uint_recv_left, uint_recv_right):
    """
    Add the particles stored in recv_left and recv_right
    to the existing particle in species.

    Parameters
    ----------
    species: a Particles object
        Contain the particles that stayed on the present processors

    float_recv_left, float_recv_right, uint_recv_left, uint_recv_right:
        arrays of shape (n_float,Nptcl) and (n_int,Nptcl) where Nptcl
        is the number of particles that are received to the left
        proc and right proc respectively, and where n_float and n_int
        are the number of float and integer quantities respectively
        These arrays are always on the CPU (since they were used for MPI)
    """
    # Particle counts: already present, received from each side, and total
    n_stay = species.Ntot
    n_left = float_recv_left.shape[1]
    n_right = float_recv_right.shape[1]
    n_total = n_stay + n_left + n_right

    # Launch configurations for the three copy kernels
    left_grid, left_block = cuda_tpb_bpg_1d(n_left)
    right_grid, right_block = cuda_tpb_bpg_1d(n_right)
    stay_grid, stay_block = cuda_tpb_bpg_1d(n_stay)

    def merge_attributes(attributes, recv_left, recv_right, dtype):
        # For each (owner, name) pair, build the merged array laid out as
        # [left | stay | right] and store it back on the owner.
        for row, (owner, name) in enumerate(attributes):
            # Copy the proper buffers to the GPU
            from_left = cupy.asarray(recv_left[row])
            from_right = cupy.asarray(recv_right[row])
            # Initialize the new particle array
            merged = cupy.empty((n_total,), dtype=dtype)
            # Merge the arrays on the GPU
            staying = getattr(owner, name)
            if n_left != 0:
                copy_particles[left_grid, left_block](
                    n_left, from_left, 0, merged, 0)
            if n_stay != 0:
                copy_particles[stay_grid, stay_block](
                    n_stay, staying, 0, merged, n_left)
            if n_right != 0:
                copy_particles[right_grid, right_block](
                    n_right, from_right, 0, merged, n_left + n_stay)
            # Replace the attribute with the merged array
            setattr(owner, name, merged)

    # Float quantities
    float_attributes = [
        (species, name)
        for name in ('x', 'y', 'z', 'ux', 'uy', 'uz', 'inv_gamma', 'w')]
    if species.ionizer is not None:
        float_attributes += [(species.ionizer, 'w_times_level')]
    merge_attributes(float_attributes, float_recv_left, float_recv_right,
                     np.float64)

    # Integer quantities
    uint_attributes = []
    if species.tracker is not None:
        uint_attributes.append((species.tracker, 'id'))
    if species.ionizer is not None:
        uint_attributes.append((species.ionizer, 'ionization_level'))
    merge_attributes(uint_attributes, uint_recv_left, uint_recv_right,
                     np.uint64)

    # Adapt the total number of particles
    species.Ntot = n_total
def eigsh(a, k=6, *, which='LM', ncv=None, maxiter=None, tol=0,
          return_eigenvectors=True):
    """Finds ``k`` eigenvalues and eigenvectors of the real symmetric matrix.

    Solves ``Ax = wx``, the standard eigenvalue problem for ``w`` eigenvalues
    with corresponding eigenvectors ``x``.

    Args:
        a (ndarray, spmatrix or LinearOperator): A symmetric square matrix with
            dimension ``(n, n)``. ``a`` must be :class:`cupy.ndarray`,
            :class:`cupyx.scipy.sparse.spmatrix` or
            :class:`cupyx.scipy.sparse.linalg.LinearOperator`.
        k (int): The number of eigenvalues and eigenvectors to compute. Must be
            ``1 <= k < n``.
        which (str): 'LM' or 'LA'. 'LM': finds ``k`` largest (in magnitude)
            eigenvalues. 'LA': finds ``k`` largest (algebraic) eigenvalues.
        ncv (int): The number of Lanczos vectors generated. Must be
            ``k + 1 < ncv < n``. If ``None``, default value is used. A given
            value is clamped into ``[k + 2, n - 1]``.
        maxiter (int): Maximum number of Lanczos update iterations.
            If ``None``, ``10 * n`` is used.
        tol (float): Tolerance for residuals ``||Ax - wx||``. If ``0``, machine
            precision is used.
        return_eigenvectors (bool): If ``True``, returns eigenvectors in
            addition to eigenvalues.

    Returns:
        tuple:
            If ``return_eigenvectors is True``, it returns ``w`` and ``x``
            where ``w`` is eigenvalues and ``x`` is eigenvectors. Otherwise,
            it returns only ``w``. The eigenvalues are sorted in ascending
            order.

    Raises:
        ValueError: If ``a`` is not a square matrix, ``k`` is outside
            ``1 <= k < n``, or ``which`` is not 'LM' or 'LA'.
        TypeError: If the dtype of ``a`` is not one of 'f', 'd', 'F', 'D'.

    .. seealso:: :func:`scipy.sparse.linalg.eigsh`

    .. note::
        This function uses the thick-restart Lanczos methods
        (https://sdm.lbl.gov/~kewu/ps/trlan.html).

    """
    n = a.shape[0]
    if a.ndim != 2 or a.shape[0] != a.shape[1]:
        raise ValueError('expected square matrix (shape: {})'.format(a.shape))
    if a.dtype.char not in 'fdFD':
        # NOTE(review): the message below misspells "unsupported".
        raise TypeError('unsupprted dtype (actual: {})'.format(a.dtype))
    if k <= 0:
        raise ValueError('k must be greater than 0 (actual: {})'.format(k))
    if k >= n:
        raise ValueError('k must be smaller than n (actual: {})'.format(k))
    if which not in ('LM', 'LA'):
        raise ValueError('which must be \'LM\' or \'LA\' (actual: {})'
                         ''.format(which))
    # Number of Lanczos vectors, always capped at n - 1.
    if ncv is None:
        ncv = min(max(2 * k, k + 32), n - 1)
    else:
        ncv = min(max(ncv, k + 2), n - 1)
    if maxiter is None:
        maxiter = 10 * n
    if tol == 0:
        tol = numpy.finfo(a.dtype).eps

    # alpha/beta are filled by the Lanczos routine; beta uses the real
    # counterpart of the dtype of ``a``. V holds one Lanczos vector per row.
    alpha = cupy.zeros((ncv, ), dtype=a.dtype)
    beta = cupy.zeros((ncv, ), dtype=a.dtype.char.lower())
    V = cupy.empty((ncv, n), dtype=a.dtype)

    # Set initial vector
    u = cupy.random.random((n, )).astype(a.dtype)
    V[0] = u / cublas.nrm2(u)

    # Choose Lanczos implementation, unconditionally use 'fast' for now
    upadte_impl = 'fast'
    if upadte_impl == 'fast':
        lanczos = _lanczos_fast(a, n, ncv)
    else:
        lanczos = _lanczos_asis

    # Lanczos iteration
    lanczos(a, V, u, alpha, beta, 0, ncv)

    # ``iter`` counts the Lanczos steps performed so far (it shadows the
    # builtin of the same name within this function).
    iter = ncv
    w, s = _eigsh_solve_ritz(alpha, beta, None, k, which)
    x = V.T @ s

    # Compute residual
    beta_k = beta[-1] * s[-1, :]
    res = cublas.nrm2(beta_k)

    while res > tol and iter < maxiter:
        # Setup for thick-restart: the first k rows of V are replaced by the
        # current Ritz vectors and alpha[:k] by the Ritz values.
        beta[:k] = 0
        alpha[:k] = w
        V[:k] = x.T

        # Orthogonalize u against the kept vectors and use it as row k.
        u -= u.T @ V[:k].conj().T @ V[:k]
        V[k] = u / cublas.nrm2(u)

        # One explicit step producing alpha[k], beta[k] and V[k + 1] before
        # the generic Lanczos routine takes over from index k + 1.
        u[...] = a @ V[k]
        cublas.dotc(V[k], u, out=alpha[k])
        u -= alpha[k] * V[k]
        u -= V[:k].T @ beta_k
        cublas.nrm2(u, out=beta[k])
        V[k + 1] = u / beta[k]

        # Lanczos iteration
        lanczos(a, V, u, alpha, beta, k + 1, ncv)

        iter += ncv - k
        w, s = _eigsh_solve_ritz(alpha, beta, beta_k, k, which)
        x = V.T @ s

        # Compute residual
        beta_k = beta[-1] * s[-1, :]
        res = cublas.nrm2(beta_k)

    if return_eigenvectors:
        # Sort the eigenpairs by ascending eigenvalue.
        idx = cupy.argsort(w)
        return w[idx], x[:, idx]
    else:
        return cupy.sort(w)
def aux(A, V, u, alpha, beta, i_start, i_end): assert A is outer_A # Get ready for spmv if enabled if cusparse_handle is not None: # Note: I would like to reuse descriptors and working buffer # on the next update, but I gave it up because it sometimes # caused illegal memory access error. spmv_desc_A = cusparse.SpMatDescriptor.create(A) spmv_desc_v = cusparse.DnVecDescriptor.create(v) spmv_desc_u = cusparse.DnVecDescriptor.create(u) buff_size = _cusparse.spMV_bufferSize( cusparse_handle, spmv_op_a, spmv_alpha.ctypes.data, spmv_desc_A.desc, spmv_desc_v.desc, spmv_beta.ctypes.data, spmv_desc_u.desc, spmv_cuda_dtype, spmv_alg) spmv_buff = cupy.empty(buff_size, cupy.int8) v[...] = V[i_start] for i in range(i_start, i_end): # Matrix-vector multiplication if cusparse_handle is None: u[...] = A @ v else: _cusparse.spMV(cusparse_handle, spmv_op_a, spmv_alpha.ctypes.data, spmv_desc_A.desc, spmv_desc_v.desc, spmv_beta.ctypes.data, spmv_desc_u.desc, spmv_cuda_dtype, spmv_alg, spmv_buff.data.ptr) # Call dotc _cublas.setPointerMode(cublas_handle, _cublas.CUBLAS_POINTER_MODE_DEVICE) try: dotc(cublas_handle, n, v.data.ptr, 1, u.data.ptr, 1, alpha.data.ptr + i * alpha.itemsize) finally: _cublas.setPointerMode(cublas_handle, cublas_pointer_mode) # Orthogonalize gemm(cublas_handle, _cublas.CUBLAS_OP_C, _cublas.CUBLAS_OP_N, 1, i + 1, n, one.ctypes.data, u.data.ptr, n, V.data.ptr, n, zero.ctypes.data, uu.data.ptr, 1) gemm(cublas_handle, _cublas.CUBLAS_OP_N, _cublas.CUBLAS_OP_C, n, 1, i + 1, mone.ctypes.data, V.data.ptr, n, uu.data.ptr, 1, one.ctypes.data, u.data.ptr, n) # Call nrm2 _cublas.setPointerMode(cublas_handle, _cublas.CUBLAS_POINTER_MODE_DEVICE) try: nrm2(cublas_handle, n, u.data.ptr, 1, beta.data.ptr + i * beta.itemsize) finally: _cublas.setPointerMode(cublas_handle, cublas_pointer_mode) # Break here as the normalization below touches V[i+1] if i >= i_end - 1: break # Normalize _kernel_normalize(u, beta, i, n, v, V)
def _lanczos_fast(A, n, ncv):
    """Build a Lanczos update routine specialized for ``A``.

    The cuBLAS routines are selected from the dtype of ``A``, and cuSPARSE
    SpMV is enabled when ``A`` is a CSR matrix and the 'spmv' feature is
    available. The returned closure reuses the work buffers allocated here.

    Args:
        A: The operator. Its dtype char must be one of 'f', 'd', 'F', 'D'.
        n: Length of the Lanczos vectors.
        ncv: Number of Lanczos vectors; used as the size of the ``uu`` work
            buffer.

    Returns:
        callable: ``aux(A, V, u, alpha, beta, i_start, i_end)``, which
        updates its array arguments in place and returns ``None``.

    Raises:
        TypeError: If the dtype of ``A`` is not supported.
    """
    cublas_handle = device.get_cublas_handle()
    # Remember the current pointer mode so it can be restored after the
    # calls that temporarily switch to device pointer mode.
    cublas_pointer_mode = _cublas.getPointerMode(cublas_handle)
    if A.dtype.char == 'f':
        dotc = _cublas.sdot
        nrm2 = _cublas.snrm2
        gemm = _cublas.sgemm
    elif A.dtype.char == 'd':
        dotc = _cublas.ddot
        nrm2 = _cublas.dnrm2
        gemm = _cublas.dgemm
    elif A.dtype.char == 'F':
        dotc = _cublas.cdotc
        nrm2 = _cublas.scnrm2
        gemm = _cublas.cgemm
    elif A.dtype.char == 'D':
        dotc = _cublas.zdotc
        nrm2 = _cublas.dznrm2
        gemm = _cublas.zgemm
    else:
        raise TypeError('invalid dtype ({})'.format(A.dtype))

    # cusparse_handle stays None unless the cuSPARSE SpMV path is usable;
    # the spmv_* values are only defined in that case.
    cusparse_handle = None
    if csr.isspmatrix_csr(A) and cusparse.check_availability('spmv'):
        cusparse_handle = device.get_cusparse_handle()
        spmv_op_a = _cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        spmv_alpha = numpy.array(1.0, A.dtype)
        spmv_beta = numpy.array(0.0, A.dtype)
        spmv_cuda_dtype = cusparse._dtype_to_DataType(A.dtype)
        spmv_alg = _cusparse.CUSPARSE_MV_ALG_DEFAULT

    # Device work buffers and host-side scalar constants shared by every
    # call of the closure.
    v = cupy.empty((n, ), dtype=A.dtype)
    uu = cupy.empty((ncv, ), dtype=A.dtype)
    one = numpy.array(1.0, dtype=A.dtype)
    zero = numpy.array(0.0, dtype=A.dtype)
    mone = numpy.array(-1.0, dtype=A.dtype)

    outer_A = A

    def aux(A, V, u, alpha, beta, i_start, i_end):
        """Run Lanczos steps ``i_start`` .. ``i_end - 1``, updating in place.

        ``A`` must be the object this closure was built for. ``alpha[i]`` and
        ``beta[i]`` receive the ``dotc`` and ``nrm2`` results of step ``i``;
        ``u`` and ``V`` are modified as well.
        """
        assert A is outer_A

        # Get ready for spmv if enabled
        if cusparse_handle is not None:
            # Note: I would like to reuse descriptors and working buffer
            # on the next update, but I gave it up because it sometimes
            # caused illegal memory access error.
            spmv_desc_A = cusparse.SpMatDescriptor.create(A)
            spmv_desc_v = cusparse.DnVecDescriptor.create(v)
            spmv_desc_u = cusparse.DnVecDescriptor.create(u)
            buff_size = _cusparse.spMV_bufferSize(
                cusparse_handle, spmv_op_a, spmv_alpha.ctypes.data,
                spmv_desc_A.desc, spmv_desc_v.desc, spmv_beta.ctypes.data,
                spmv_desc_u.desc, spmv_cuda_dtype, spmv_alg)
            spmv_buff = cupy.empty(buff_size, cupy.int8)

        v[...] = V[i_start]
        for i in range(i_start, i_end):
            # Matrix-vector multiplication
            if cusparse_handle is None:
                u[...] = A @ v
            else:
                _cusparse.spMV(
                    cusparse_handle, spmv_op_a, spmv_alpha.ctypes.data,
                    spmv_desc_A.desc, spmv_desc_v.desc,
                    spmv_beta.ctypes.data, spmv_desc_u.desc,
                    spmv_cuda_dtype, spmv_alg, spmv_buff.data.ptr)

            # Call dotc
            # The result is written straight into alpha[i] on the device, so
            # the pointer mode is switched for the call and restored after.
            _cublas.setPointerMode(
                cublas_handle, _cublas.CUBLAS_POINTER_MODE_DEVICE)
            try:
                dotc(cublas_handle, n, v.data.ptr, 1, u.data.ptr, 1,
                     alpha.data.ptr + i * alpha.itemsize)
            finally:
                _cublas.setPointerMode(cublas_handle, cublas_pointer_mode)

            # Orthogonalize
            # The first gemm writes into ``uu``; the second one accumulates
            # into ``u`` with the scalar ``mone``. NOTE(review): presumably
            # this subtracts from ``u`` its projection onto the first i + 1
            # rows of ``V`` -- confirm.
            gemm(cublas_handle,
                 _cublas.CUBLAS_OP_C, _cublas.CUBLAS_OP_N,
                 1, i + 1, n,
                 one.ctypes.data, u.data.ptr, n, V.data.ptr, n,
                 zero.ctypes.data, uu.data.ptr, 1)
            gemm(cublas_handle,
                 _cublas.CUBLAS_OP_N, _cublas.CUBLAS_OP_C,
                 n, 1, i + 1,
                 mone.ctypes.data, V.data.ptr, n, uu.data.ptr, 1,
                 one.ctypes.data, u.data.ptr, n)

            # Call nrm2
            # As with dotc, the result goes directly into beta[i] on the
            # device.
            _cublas.setPointerMode(
                cublas_handle, _cublas.CUBLAS_POINTER_MODE_DEVICE)
            try:
                nrm2(cublas_handle, n, u.data.ptr, 1,
                     beta.data.ptr + i * beta.itemsize)
            finally:
                _cublas.setPointerMode(cublas_handle, cublas_pointer_mode)

            # Break here as the normalization below touches V[i+1]
            if i >= i_end - 1:
                break

            # Normalize
            _kernel_normalize(u, beta, i, n, v, V)

    return aux
def remove_particles_gpu(species, fld, n_guard, left_proc, right_proc):
    """
    Remove the particles that are outside of the physical domain (i.e.
    in the guard cells). Store them in sending buffers, which are returned.

    Parameters
    ----------
    species: a Particles object
        Contains the data of this species

    fld: a Fields object
        Contains information about the dimension of the grid,
        and the prefix sum (when using the GPU)

    n_guard: int
        Number of guard cells

    left_proc, right_proc: int or None
        Indicate whether there is a left or right processor or if the
        boundary is open (None).

    Returns
    -------
    float_send_left, float_send_right, uint_send_left, uint_send_right:
        arrays of shape (n_float,Nptcl) and (n_int,Nptcl) where Nptcl
        is the number of particles that are sent to the left
        proc and right proc respectively, and where n_float and n_int
        are the number of float and integer quantities respectively.
        When the corresponding neighbor is None, Nptcl is 0.
    """
    # Check if particles are sorted
    # (The particles are usually expected to be sorted from the previous
    # iteration at this point - except at the first iteration of `step`.)
    if not species.sorted:
        species.sort_particles(fld=fld)
        species.sorted = True

    # Get the particle indices between which to remove the particles
    # (Take into account the fact that the moving window may have
    # shifted the grid since the particles were last sorted: prefix_sum_shift)
    prefix_sum = species.prefix_sum
    Nz = fld.Nz
    Nr = fld.Nr
    # Find the z index of the first cell for which particles are kept
    iz_min = max(n_guard + species.prefix_sum_shift, 0)
    # Find the z index of the first cell for which particles are removed again
    iz_max = min(Nz - n_guard + species.prefix_sum_shift + 1, Nz)
    # Find the corresponding indices in the particle array
    # Reminder: prefix_sum[i] is the cumulative sum of the number of particles
    # in cells 0 to i (where cell i is included)
    if iz_min * (Nr + 1) - 1 >= 0:
        i_min = int(prefix_sum[iz_min * (Nr + 1) - 1])
    else:
        i_min = 0
    i_max = int(prefix_sum[iz_max * (Nr + 1) - 1])

    # Total number of particles in each particle group
    N_send_l = i_min
    new_Ntot = i_max - i_min
    N_send_r = species.Ntot - i_max

    # Allocate the sending buffers on the CPU
    # (zero-width when there is no neighbor on that side)
    n_float = species.n_float_quantities
    n_int = species.n_integer_quantities
    n_left = N_send_l if left_proc is not None else 0
    n_right = N_send_r if right_proc is not None else 0
    float_send_left = np.empty((n_float, n_left), dtype=np.float64)
    uint_send_left = np.empty((n_int, n_left), dtype=np.uint64)
    float_send_right = np.empty((n_float, n_right), dtype=np.float64)
    uint_send_right = np.empty((n_int, n_right), dtype=np.uint64)

    # Get the threads per block and the blocks per grid
    dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d(species.Ntot)

    # Float quantities:
    # Build list of float attributes to copy
    attr_list = [(species, 'x'), (species, 'y'), (species, 'z'),
                 (species, 'ux'), (species, 'uy'), (species, 'uz'),
                 (species, 'inv_gamma'), (species, 'w')]
    if species.ionizer is not None:
        attr_list.append((species.ionizer, 'w_times_level'))
    # Loop through the float attributes
    for i_attr in range(n_float):
        owner, attr_name = attr_list[i_attr]
        # Allocate fresh buffers on the GPU for every attribute:
        # `stay_buffer` becomes the new particle array below, so it
        # cannot be shared between attributes.
        left_buffer = cupy.empty((N_send_l,), dtype=np.float64)
        right_buffer = cupy.empty((N_send_r,), dtype=np.float64)
        stay_buffer = cupy.empty((new_Ntot,), dtype=np.float64)
        # Check that the buffers are still on GPU
        # (safeguard against automatic memory management)
        assert not isinstance(left_buffer, np.ndarray)
        assert not isinstance(right_buffer, np.ndarray)
        assert not isinstance(stay_buffer, np.ndarray)
        # Split the particle array into the 3 buffers on the GPU
        particle_array = getattr(owner, attr_name)
        split_particles_to_buffers[dim_grid_1d, dim_block_1d](
            particle_array, left_buffer, stay_buffer, right_buffer,
            i_min, i_max)
        # Assign the stay_buffer to the initial particle data array
        # and fill the sending buffers (if needed for MPI)
        setattr(owner, attr_name, stay_buffer)
        if left_proc is not None:
            left_buffer.get(out=float_send_left[i_attr])
        if right_proc is not None:
            right_buffer.get(out=float_send_right[i_attr])

    # Integer quantities:
    # (Always start from an empty list so that float attributes can never
    # leak into the integer pass.)
    attr_list = []
    if species.tracker is not None:
        attr_list.append((species.tracker, 'id'))
    if species.ionizer is not None:
        attr_list.append((species.ionizer, 'ionization_level'))
    for i_attr in range(n_int):
        owner, attr_name = attr_list[i_attr]
        # Fresh buffers per attribute, for the same reason as above
        left_buffer = cupy.empty((N_send_l,), dtype=np.uint64)
        right_buffer = cupy.empty((N_send_r,), dtype=np.uint64)
        stay_buffer = cupy.empty((new_Ntot,), dtype=np.uint64)
        # Split the particle array into the 3 buffers on the GPU
        particle_array = getattr(owner, attr_name)
        split_particles_to_buffers[dim_grid_1d, dim_block_1d](
            particle_array, left_buffer, stay_buffer, right_buffer,
            i_min, i_max)
        # Assign the stay_buffer to the initial particle data array
        # and fill the sending buffers (if needed for MPI)
        setattr(owner, attr_name, stay_buffer)
        if left_proc is not None:
            left_buffer.get(out=uint_send_left[i_attr])
        if right_proc is not None:
            right_buffer.get(out=uint_send_right[i_attr])

    # Change the new total number of particles
    species.Ntot = new_Ntot

    # Return the sending buffers
    return (float_send_left, float_send_right,
            uint_send_left, uint_send_right)
def geam(transa, transb, alpha, a, beta, b, out=None):
    """Computes alpha * op(a) + beta * op(b)

    op(a) = a if transa is 'N', op(a) = a.T if transa is 'T',
    op(a) = a.T.conj() if transa is 'H'.
    op(b) = b if transb is 'N', op(b) = b.T if transb is 'T',
    op(b) = b.T.conj() if transb is 'H'.

    ``a`` and ``b`` must be two-dimensional and share one dtype among
    float32, float64, complex64 and complex128. When ``out`` is omitted a
    new Fortran-ordered array is allocated; otherwise ``out`` must already
    have the result shape and dtype. The cuBLAS pointer mode that was
    active on entry is restored before returning.
    """
    assert a.ndim == b.ndim == 2
    assert a.dtype == b.dtype

    # Map the dtype character onto the cuBLAS routine prefix.
    dtype = a.dtype.char
    prefix = {'f': 's', 'd': 'd', 'F': 'c', 'D': 'z'}.get(dtype)
    if prefix is None:
        raise TypeError('invalid dtype')
    func = getattr(cublas, prefix + 'geam')

    transa = _trans_to_cublas_op(transa)
    transb = _trans_to_cublas_op(transb)

    # (m, n) is the shape of op(a), and therefore of the result.
    if transa == cublas.CUBLAS_OP_N:
        m, n = a.shape
    else:
        n, m = a.shape
    expected_b_shape = (m, n) if transb == cublas.CUBLAS_OP_N else (n, m)
    assert b.shape == expected_b_shape

    if out is None:
        out = cupy.empty((m, n), dtype=dtype, order='F')
    else:
        assert out.ndim == 2
        assert out.shape == (m, n)
        assert out.dtype == dtype

    alpha, alpha_ptr = _get_scalar_ptr(alpha, a.dtype)
    beta, beta_ptr = _get_scalar_ptr(beta, a.dtype)
    handle = device.get_cublas_handle()
    orig_mode = cublas.getPointerMode(handle)

    # If either scalar lives on the device, both must, because cuBLAS has a
    # single pointer mode for the whole call.
    alpha_on_device = isinstance(alpha, cupy.ndarray)
    beta_on_device = isinstance(beta, cupy.ndarray)
    if alpha_on_device or beta_on_device:
        if not alpha_on_device:
            alpha = cupy.array(alpha)
            alpha_ptr = alpha.data.ptr
        if not beta_on_device:
            beta = cupy.array(beta)
            beta_ptr = beta.data.ptr
        pointer_mode = cublas.CUBLAS_POINTER_MODE_DEVICE
    else:
        pointer_mode = cublas.CUBLAS_POINTER_MODE_HOST
    cublas.setPointerMode(handle, pointer_mode)

    lda, transa = _decide_ld_and_trans(a, transa)
    ldb, transb = _decide_ld_and_trans(b, transb)

    # Fast paths: both inputs usable as they are and `out` is contiguous.
    if lda is not None and ldb is not None:
        if out._f_contiguous:
            try:
                func(handle, transa, transb, m, n,
                     alpha_ptr, a.data.ptr, lda,
                     beta_ptr, b.data.ptr, ldb,
                     out.data.ptr, m)
            finally:
                cublas.setPointerMode(handle, orig_mode)
            return out
        elif out._c_contiguous:
            # Computes alpha * a.T + beta * b.T
            try:
                func(handle, 1 - transa, 1 - transb, n, m,
                     alpha_ptr, a.data.ptr, lda,
                     beta_ptr, b.data.ptr, ldb,
                     out.data.ptr, n)
            finally:
                cublas.setPointerMode(handle, orig_mode)
            return out

    # General path: fix the input layouts and compute into a
    # Fortran-ordered buffer, copying back into `out` when needed.
    a, lda = _change_order_if_necessary(a, lda)
    b, ldb = _change_order_if_necessary(b, ldb)
    c = out
    if not out._f_contiguous:
        c = out.copy(order='F')
    try:
        func(handle, transa, transb, m, n,
             alpha_ptr, a.data.ptr, lda,
             beta_ptr, b.data.ptr, ldb,
             c.data.ptr, m)
    finally:
        cublas.setPointerMode(handle, orig_mode)
    if not out._f_contiguous:
        out[...] = c
    return out
def create_dropout_states(handle):
    """Allocate an uninitialized buffer for cuDNN dropout states.

    Args:
        handle: Handle passed to ``cudnn.dropoutGetStatesSize``.

    Returns:
        cupy.ndarray: One-dimensional array of dtype ``'b'`` whose length is
        the size reported by ``cudnn.dropoutGetStatesSize``.
    """
    n_state_bytes = cudnn.dropoutGetStatesSize(handle)
    states = cupy.empty((n_state_bytes,), dtype='b')
    return states
def test_empty_zero_sized_array_strides(self, order):
    # A zero-sized array must report the same strides in CuPy as in NumPy
    # for the given memory order.
    shape = (1, 0, 2)
    expected = numpy.empty(shape, dtype='d', order=order)
    actual = cupy.empty(shape, dtype='d', order=order)
    self.assertEqual(actual.strides, expected.strides)
def solve(a, b):
    '''Solves a linear matrix equation.

    It computes the exact solution of ``x`` in ``ax = b``,
    where ``a`` is a square and full rank matrix.

    The system is solved with a QR factorization of ``a`` followed by a
    triangular solve, because cuSOLVER in CUDA 8.0 has no ``gesv``.
    See http://docs.nvidia.com/cuda/cusolver/index.html#qr_examples

    Args:
        a (cupy.ndarray): The matrix with dimension ``(M, M)``
        b (cupy.ndarray): The vector with ``M`` elements, or
            the matrix with dimension ``(M, K)``

    Returns:
        cupy.ndarray:
            The vector with ``M`` elements, or the matrix with dimension
            ``(M, K)``.

    Raises:
        RuntimeError: If cuSOLVER is not enabled.
        numpy.linalg.LinAlgError: If ``b`` has more than two dimensions or
            its length differs from that of ``a``.

    .. note::
       The working precision is derived from ``a`` only. Every dtype other
       than float32 takes the double-precision routines.
       NOTE(review): a complex ``a`` therefore also reaches the ``d``
       routines -- confirm that complex input is rejected upstream.

    .. seealso:: :func:`numpy.linalg.solve`
    '''
    if not cuda.cusolver_enabled:
        raise RuntimeError('Current cupy only supports cusolver in CUDA 8.0')

    # TODO(Saito): Current implementation only accepts two-dimensional arrays
    util._assert_cupy_array(a, b)
    util._assert_rank2(a)
    util._assert_nd_squareness(a)
    if b.ndim > 2:
        raise linalg.LinAlgError(
            '{}-dimensional array given. Array must be '
            'one or two-dimensional'.format(b.ndim))
    if len(a) != len(b):
        raise linalg.LinAlgError(
            'The number of rows of array a must be '
            'the same as that of array b')

    # Cast to float32 or float64
    char = a.dtype.char
    if char in ('f', 'd'):
        dtype = char
    else:
        dtype = numpy.find_common_type((char, 'f'), ()).char

    # A vector right-hand side is treated as a single column.
    if b.ndim == 1:
        m, k = b.size, 1
    else:
        m, k = b.shape

    # Transposed C-contiguous copies, so the buffers are laid out
    # column-major for the original matrices and the inputs stay untouched.
    a = a.transpose().astype(dtype, order='C', copy=True)
    b = b.transpose().astype(dtype, order='C', copy=True)

    cusolver_handle = device.get_cusolver_handle()
    cublas_handle = device.get_cublas_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    if dtype == 'f':
        geqrf, geqrf_bufferSize, ormqr, trsm = (
            cusolver.sgeqrf, cusolver.sgeqrf_bufferSize,
            cusolver.sormqr, cublas.strsm)
    else:
        geqrf, geqrf_bufferSize, ormqr, trsm = (
            cusolver.dgeqrf, cusolver.dgeqrf_bufferSize,
            cusolver.dormqr, cublas.dtrsm)

    # 1. QR decomposition (A = Q * R)
    buffersize = geqrf_bufferSize(cusolver_handle, m, m, a.data.ptr, m)
    workspace = cupy.empty(buffersize, dtype=dtype)
    tau = cupy.empty(m, dtype=dtype)
    geqrf(
        cusolver_handle, m, m, a.data.ptr, m, tau.data.ptr,
        workspace.data.ptr, buffersize, dev_info.data.ptr)
    _check_status(dev_info)

    # 2. ormqr (Q^T * B), reusing the workspace allocated for geqrf
    ormqr(
        cusolver_handle, cublas.CUBLAS_SIDE_LEFT, cublas.CUBLAS_OP_T,
        m, k, m, a.data.ptr, m, tau.data.ptr, b.data.ptr, m,
        workspace.data.ptr, buffersize, dev_info.data.ptr)
    _check_status(dev_info)

    # 3. trsm (X = R^{-1} * (Q^T * B))
    trsm(
        cublas_handle, cublas.CUBLAS_SIDE_LEFT, cublas.CUBLAS_FILL_MODE_UPPER,
        cublas.CUBLAS_OP_N, cublas.CUBLAS_DIAG_NON_UNIT,
        m, k, 1, a.data.ptr, m, b.data.ptr, m)

    # Undo the transpose applied before the factorization.
    return b.transpose()
def warp_coords(coord_map, shape, dtype=np.float64):
    """Build the source coordinates for the output of a 2-D image warp.

    Parameters
    ----------
    coord_map : callable like GeometricTransform.inverse
        Return input coordinates for given output coordinates.
        Coordinates are in the shape (P, 2), where P is the number
        of coordinates and each element is a ``(row, col)`` pair.
    shape : tuple
        Shape of output image ``(rows, cols[, bands])``.
    dtype : np.dtype or string
        dtype for return value (sane choices: float32 or float64).

    Returns
    -------
    coords : (ndim, rows, cols[, bands]) array of dtype `dtype`
        Coordinates for `scipy.ndimage.map_coordinates`, that will yield
        an image of shape (rows, cols[, bands]) by drawing from source
        points according to `coord_map`.

    Notes
    -----
    This is a lower-level routine that produces the source coordinates for 2-D
    images used by `warp()`.

    It is provided separately from `warp` to give additional flexibility to
    users who would like, for example, to re-use a particular coordinate
    mapping, to use specific dtypes at various points along the
    image-warping process, or to implement different post-processing logic
    than `warp` performs after the call to `ndi.map_coordinates`.

    Examples
    --------
    Produce a coordinate map that shifts an image up and to the right:

    >>> import cupy as cp
    >>> from skimage import data
    >>> from scipy.ndimage import map_coordinates
    >>>
    >>> def shift_up10_left20(xy):
    ...     return xy - cp.array([-20, 10])[None, :]
    >>>
    >>> image = data.astronaut().astype(np.float32)
    >>> coords = warp_coords(shift_up10_left20, image.shape)
    >>> warped_image = map_coordinates(image, coords)

    """
    shape = safe_as_int(shape)
    ndim = len(shape)
    has_bands = ndim == 3
    rows = shape[0]
    cols = shape[1]

    coords_shape = [ndim, rows, cols]
    if has_bands:
        coords_shape.append(shape[2])
    coords = cp.empty(coords_shape, dtype=dtype)

    # Flatten the output grid into a (P, 2) array of coordinate pairs
    grid_pairs = cp.indices((cols, rows), dtype=dtype).reshape(2, -1).T

    # Ask the user-provided mapping where each output point comes from
    source_pairs = coord_map(grid_pairs)

    # Fold the pairs back into a (2, rows, cols) coordinate grid
    source_grid = source_pairs.T.reshape((-1, cols, rows)).swapaxes(1, 2)

    # Place the y-coordinate mapping
    _stackcopy(coords[1, ...], source_grid[0, ...])

    # Place the x-coordinate mapping
    _stackcopy(coords[0, ...], source_grid[1, ...])

    # The band axis, when present, maps onto itself
    if has_bands:
        coords[2, ...] = cp.arange(shape[2], dtype=coords.dtype)

    return coords
def binopt_csr(a, b, op_name):
    """Apply an element-wise binary operation to two CSR matrices.

    The operands may have different shapes as long as
    ``check_shape_for_pointwise_op`` accepts them; the result has shape
    ``(max(a_m, b_m), max(a_n, b_n))``.

    Args:
        a: Left operand (CSR matrix).
        b: Right operand (CSR matrix).
        op_name (str): One of ``'_maximum_'``, ``'_minimum_'``, ``'_eq_'``,
            ``'_ne_'``, ``'_lt_'``, ``'_gt_'``, ``'_le_'`` or ``'_ge_'``.

    Returns:
        csr_matrix: The result. Its dtype is the promoted input dtype for
        maximum/minimum and boolean for the comparison operators.

    Raises:
        ValueError: If ``op_name`` is not one of the supported names.
    """
    check_shape_for_pointwise_op(a.shape, b.shape)
    a_m, a_n = a.shape
    b_m, b_n = b.shape
    m, n = max(a_m, b_m), max(a_n, b_n)
    # Number of stored entries of each operand once it is repeated to the
    # output shape
    a_nnz = a.nnz * (m // a_m) * (n // a_n)
    b_nnz = b.nnz * (m // b_m) * (n // b_n)

    a_info = cupy.zeros(a_nnz + 1, dtype=a.indices.dtype)
    b_info = cupy.zeros(b_nnz + 1, dtype=b.indices.dtype)
    a_valid = cupy.zeros(a_nnz, dtype=numpy.int8)
    b_valid = cupy.zeros(b_nnz, dtype=numpy.int8)
    c_indptr = cupy.zeros(m + 1, dtype=a.indptr.dtype)
    in_dtype = numpy.promote_types(a.dtype, b.dtype)
    a_data = a.data.astype(in_dtype, copy=False)
    b_data = b.data.astype(in_dtype, copy=False)

    # Select the kernel preamble and the output dtype for the operator.
    # `numpy.bool_` is used instead of the `numpy.bool` alias, which was
    # removed in NumPy 1.24.
    funcs = _GET_ROW_ID_
    if op_name == '_maximum_':
        funcs += _BINOPT_MAX_
        out_dtype = in_dtype
    elif op_name == '_minimum_':
        funcs += _BINOPT_MIN_
        out_dtype = in_dtype
    elif op_name == '_eq_':
        funcs += _BINOPT_EQ_
        out_dtype = numpy.bool_
    elif op_name == '_ne_':
        funcs += _BINOPT_NE_
        out_dtype = numpy.bool_
    elif op_name == '_lt_':
        funcs += _BINOPT_LT_
        out_dtype = numpy.bool_
    elif op_name == '_gt_':
        funcs += _BINOPT_GT_
        out_dtype = numpy.bool_
    elif op_name == '_le_':
        funcs += _BINOPT_LE_
        out_dtype = numpy.bool_
    elif op_name == '_ge_':
        funcs += _BINOPT_GE_
        out_dtype = numpy.bool_
    else:
        raise ValueError('invalid op_name: {}'.format(op_name))

    a_tmp_data = cupy.empty(a_nnz, dtype=out_dtype)
    b_tmp_data = cupy.empty(b_nnz, dtype=out_dtype)
    a_tmp_indices = cupy.empty(a_nnz, dtype=a.indices.dtype)
    b_tmp_indices = cupy.empty(b_nnz, dtype=b.indices.dtype)
    _size = a_nnz + b_nnz

    # Step 1 fills the per-entry temporaries, the validity flags and the
    # per-row counters.
    cupy_binopt_csr_step1(op_name, preamble=funcs)(
        m, n,
        a.indptr, a.indices, a_data, a_m, a_n, a.nnz, a_nnz,
        b.indptr, b.indices, b_data, b_m, b_n, b.nnz, b_nnz,
        a_info, a_valid, a_tmp_indices, a_tmp_data,
        b_info, b_valid, b_tmp_indices, b_tmp_data,
        c_indptr, size=_size)

    # Turn the counters into offsets; the last entry of c_indptr is the
    # number of stored entries of the result.
    a_info = cupy.cumsum(a_info, dtype=a_info.dtype)
    b_info = cupy.cumsum(b_info, dtype=b_info.dtype)
    c_indptr = cupy.cumsum(c_indptr, dtype=c_indptr.dtype)
    c_nnz = int(c_indptr[-1])
    c_indices = cupy.empty(c_nnz, dtype=a.indices.dtype)
    c_data = cupy.empty(c_nnz, dtype=out_dtype)

    # Step 2 writes the output indices and data.
    cupy_binopt_csr_step2(op_name)(
        a_info, a_valid, a_tmp_indices, a_tmp_data, a_nnz,
        b_info, b_valid, b_tmp_indices, b_tmp_data, b_nnz,
        c_indices, c_data, size=_size)

    return csr_matrix((c_data, c_indices, c_indptr), shape=(m, n))
def cross(a, b, axisa=-1, axisb=-1, axisc=-1, axis=None):
    """Returns the cross product of two vectors.

    The cross product of ``a`` and ``b`` in :math:`R^3` is a vector
    perpendicular to both ``a`` and ``b``. If ``a`` and ``b`` are arrays
    of vectors, the vectors are defined by the last axis of ``a`` and ``b``
    by default, and these axes can have dimensions 2 or 3. Where the
    dimension of either ``a`` or ``b`` is 2, the third component of the input
    vector is assumed to be zero and the cross product calculated accordingly.
    In cases where both input vectors have dimension 2, the z-component of
    the cross product is returned.

    Args:
        a (cupy.ndarray): Components of the first vector(s).
        b (cupy.ndarray): Components of the second vector(s).
        axisa (int, optional):
            Axis of ``a`` that defines the vector(s).
            By default, the last axis.
        axisb (int, optional):
            Axis of ``b`` that defines the vector(s).
            By default, the last axis.
        axisc (int, optional):
            Axis of ``c`` containing the cross product vector(s). Ignored if
            both input vectors have dimension 2, as the return is scalar.
            By default, the last axis.
        axis (int, optional):
            If defined, the axis of ``a``, ``b`` and ``c``
            that defines the vector(s) and cross product(s).
            Overrides ``axisa``, ``axisb`` and ``axisc``.

    Returns:
        cupy.ndarray :
            Vector cross product(s).

    Raises:
        ValueError: If the vector axis of ``a`` or ``b`` does not have
            length 2 or 3.

    .. seealso:: :func:`numpy.cross`

    """
    if axis is not None:
        axisa = axisb = axisc = axis
    a = cupy.asarray(a)
    b = cupy.asarray(b)

    # Check axisa and axisb are within bounds
    axisa = internal._normalize_axis_index(axisa, a.ndim)
    axisb = internal._normalize_axis_index(axisb, b.ndim)

    # Move working axis to the end of the shape
    a = cupy.moveaxis(a, axisa, -1)
    b = cupy.moveaxis(b, axisb, -1)
    a_len = a.shape[-1]
    b_len = b.shape[-1]
    if not (a_len in (2, 3) and b_len in (2, 3)):
        raise ValueError('incompatible dimensions for cross product\n'
                         '(dimension must be 2 or 3)')

    # Create the output array
    out_shape = cupy.broadcast(a[..., 0], b[..., 0]).shape
    if 3 in (a_len, b_len):
        out_shape += (3,)
        # Check axisc is within bounds
        axisc = internal._normalize_axis_index(axisc, len(out_shape))
    cp = cupy.empty(out_shape, cupy.promote_types(a.dtype, b.dtype))

    # Component views of the inputs and, where it applies, of the output
    a0, a1 = a[..., 0], a[..., 1]
    if a_len == 3:
        a2 = a[..., 2]
    b0, b1 = b[..., 0], b[..., 1]
    if b_len == 3:
        b2 = b[..., 2]
    if cp.ndim != 0 and cp.shape[-1] == 3:
        cp0, cp1, cp2 = cp[..., 0], cp[..., 1], cp[..., 2]

    if a_len == 2 and b_len == 2:
        # Only the z-component exists: a0 * b1 - a1 * b0
        cupy.multiply(a0, b1, out=cp)
        cp -= a1 * b0
        return cp

    if a_len == 2:
        # b has three components and a2 is taken as 0:
        # cp0 = a1 * b2 - 0
        # cp1 = 0 - a0 * b2
        # cp2 = a0 * b1 - a1 * b0
        cupy.multiply(a1, b2, out=cp0)
        cupy.multiply(a0, b2, out=cp1)
        cupy.negative(cp1, out=cp1)
        cupy.multiply(a0, b1, out=cp2)
        cp2 -= a1 * b0
    elif b_len == 3:
        # Full three-dimensional product:
        # cp0 = a1 * b2 - a2 * b1
        # cp1 = a2 * b0 - a0 * b2
        # cp2 = a0 * b1 - a1 * b0
        # `tmp` is reused as scratch space for each subtracted term.
        cupy.multiply(a1, b2, out=cp0)
        tmp = a2 * b1
        cp0 -= tmp
        cupy.multiply(a2, b0, out=cp1)
        cupy.multiply(a0, b2, out=tmp)
        cp1 -= tmp
        cupy.multiply(a0, b1, out=cp2)
        cupy.multiply(a1, b0, out=tmp)
        cp2 -= tmp
    else:
        # a has three components, b has two and b2 is taken as 0:
        # cp0 = 0 - a2 * b1
        # cp1 = a2 * b0 - 0
        # cp2 = a0 * b1 - a1 * b0
        cupy.multiply(a2, b1, out=cp0)
        cupy.negative(cp0, out=cp0)
        cupy.multiply(a2, b0, out=cp1)
        cupy.multiply(a0, b1, out=cp2)
        cp2 -= a1 * b0

    return cupy.moveaxis(cp, -1, axisc)
def sum_duplicates(self):
    """Eliminate duplicate matrix entries by adding them together.

    The entries are sorted with ``cupy.lexsort`` on ``(row, col)`` and
    entries sharing both indices are merged. Nothing is done when the
    matrix is already flagged as canonical; an empty matrix is only
    flagged. On success ``data``, ``row`` and ``col`` are replaced and the
    canonical-format flag is set.

    Boolean data is merged with a logical OR, floating-point and complex
    data with a sum.

    Raises:
        TypeError: If duplicates have to be merged for a data type that is
            not boolean, floating-point or complex. The matrix is left
            unchanged in that case.

    .. seealso::
       :meth:`scipy.sparse.coo_matrix.sum_duplicates`

    """
    if self._has_canonical_format:
        return
    if self.data.size == 0:
        self._has_canonical_format = True
        return

    keys = cupy.stack([self.row, self.col])
    order = cupy.lexsort(keys)
    src_data = self.data[order]
    src_row = self.row[order]
    src_col = self.col[order]

    # diff[i] is 0 when entry i repeats the indices of entry i - 1 (and for
    # i == 0), otherwise 1.
    diff = cupy.ElementwiseKernel(
        'raw int32 row, raw int32 col',
        'int32 diff',
        '''
        int index;
        if (i == 0 || row[i - 1] == row[i] && col[i - 1] == col[i]) {
          diff = 0;
        } else {
          diff = 1;
        }
        ''',
        'sum_duplicates_diff'
    )(src_row, src_col, size=self.row.size)

    if diff[1:].all():
        # All elements have different indices.
        data = src_data
        row = src_row
        col = src_col
    else:
        # The running sum of diff is the output slot of every sorted entry.
        index = cupy.cumsum(diff, dtype='i')
        size = int(index[-1]) + 1
        data = cupy.zeros(size, dtype=self.data.dtype)
        row = cupy.empty(size, dtype='i')
        col = cupy.empty(size, dtype='i')
        kind = self.data.dtype.kind
        if kind == 'b':
            # Adding booleans is a logical OR; every writer stores the same
            # value, so no atomic operation is needed.
            cupy.ElementwiseKernel(
                'T src_data, int32 src_row, int32 src_col, int32 index',
                'raw T data, raw int32 row, raw int32 col',
                '''
                if (src_data) data[index] = true;
                row[index] = src_row;
                col[index] = src_col;
                ''',
                'sum_duplicates_assign_bool'
            )(src_data, src_row, src_col, index, data, row, col)
        elif kind == 'f':
            cupy.ElementwiseKernel(
                'T src_data, int32 src_row, int32 src_col, int32 index',
                'raw T data, raw int32 row, raw int32 col',
                '''
                atomicAdd(&data[index], src_data);
                row[index] = src_row;
                col[index] = src_col;
                ''',
                'sum_duplicates_assign'
            )(src_data, src_row, src_col, index, data, row, col)
        elif kind == 'c':
            # Real and imaginary parts are accumulated separately.
            cupy.ElementwiseKernel(
                'T src_real, T src_imag, int32 src_row, int32 src_col, '
                'int32 index',
                'raw T real, raw T imag, raw int32 row, raw int32 col',
                '''
                atomicAdd(&real[index], src_real);
                atomicAdd(&imag[index], src_imag);
                row[index] = src_row;
                col[index] = src_col;
                ''',
                'sum_duplicates_assign_complex'
            )(src_data.real, src_data.imag, src_row, src_col, index,
              data.real, data.imag, row, col)
        else:
            # Without a merge kernel `row` and `col` would stay
            # uninitialized; refuse instead of storing garbage.
            raise TypeError(
                'sum_duplicates does not support dtype {}'.format(
                    self.data.dtype))

    self.data = data
    self.row = row
    self.col = col
    self._has_canonical_format = True
def inv(a):
    """Computes the inverse of a matrix.

    This function computes matrix ``a_inv`` from n-dimensional regular matrix
    ``a`` such that ``dot(a, a_inv) == eye(n)``.

    Inputs with three or more dimensions are forwarded to ``_batched_inv``.
    Two-dimensional inputs are inverted with an LU factorization (``getrf``)
    followed by a solve against the identity (``getrs``). float32, float64,
    complex64 and complex128 are used as they are; any other dtype is first
    converted to the common type of that dtype and float32.

    Args:
        a (cupy.ndarray): The regular matrix

    Returns:
        cupy.ndarray: The inverse of a matrix.

    Raises:
        RuntimeError: If cuSOLVER is not enabled.

    .. note::
       The status written by cuSOLVER is not inspected here, so a singular
       input is not reported.

    .. seealso:: :func:`numpy.linalg.inv`
    """
    if a.ndim >= 3:
        return _batched_inv(a)

    if not cuda.cusolver_enabled:
        raise RuntimeError('Current cupy only supports cusolver in CUDA 8.0')

    util._assert_cupy_array(a)
    util._assert_rank2(a)
    util._assert_nd_squareness(a)

    # support float32, float64, complex64, and complex128
    if a.dtype.char in 'fdFD':
        dtype = a.dtype.char
    else:
        dtype = numpy.find_common_type((a.dtype.char, 'f'), ()).char

    # Work on a C-contiguous copy so the caller's array is not overwritten,
    # and convert it to the dtype of the routines selected below. Without
    # the conversion, e.g. an integer buffer would be read as floats.
    a = a.astype(dtype, order='C', copy=True)

    cusolver_handle = device.get_cusolver_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    ipiv = cupy.empty((a.shape[0], 1), dtype=numpy.intc)

    if dtype == 'f':
        getrf = cusolver.sgetrf
        getrf_bufferSize = cusolver.sgetrf_bufferSize
        getrs = cusolver.sgetrs
    elif dtype == 'd':
        getrf = cusolver.dgetrf
        getrf_bufferSize = cusolver.dgetrf_bufferSize
        getrs = cusolver.dgetrs
    elif dtype == 'F':
        getrf = cusolver.cgetrf
        getrf_bufferSize = cusolver.cgetrf_bufferSize
        getrs = cusolver.cgetrs
    elif dtype == 'D':
        getrf = cusolver.zgetrf
        getrf_bufferSize = cusolver.zgetrf_bufferSize
        getrs = cusolver.zgetrs
    else:
        raise ValueError('unsupported dtype')

    m = a.shape[0]

    buffersize = getrf_bufferSize(cusolver_handle, m, m, a.data.ptr, m)
    workspace = cupy.empty(buffersize, dtype=dtype)

    # LU factorization
    getrf(cusolver_handle, m, m, a.data.ptr, m, workspace.data.ptr,
          ipiv.data.ptr, dev_info.data.ptr)

    b = cupy.eye(m, dtype=dtype)

    # solve for the inverse (0 selects the non-transposed operation)
    getrs(cusolver_handle, 0, m, m, a.data.ptr, m,
          ipiv.data.ptr, b.data.ptr, m, dev_info.data.ptr)

    return b
def sinkhorn_knopp(a, b, M, reg, numItermax=1000, stopThr=1e-9,
                   verbose=False, log=False, to_numpy=True, **kwargs):
    r"""
    Solve the entropic regularization optimal transport on GPU

    If the input matrix are in numpy format, they will be uploaded to the
    GPU first which can incur significant time overhead.

    The function solves the following optimization problem:

    .. math::
        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)

        s.t. \gamma 1 = a

             \gamma^T 1= b

             \gamma\geq 0
    where :

    - M is the (ns,nt) metric cost matrix
    - :math:`\Omega` is the entropic regularization term
      :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
    - a and b are source and target weights (sum to 1)

    The algorithm used for solving the problem is the Sinkhorn-Knopp matrix
    scaling algorithm as proposed in [2]_

    Parameters
    ----------
    a : np.ndarray (ns,)
        samples weights in the source domain. An empty array means uniform
        weights.
    b : np.ndarray (nt,) or np.ndarray (nt,nbb)
        samples in the target domain, compute sinkhorn with multiple targets
        and fixed M if b is a matrix (return OT loss + dual variables in log).
        An empty array means uniform weights.
    M : np.ndarray (ns,nt)
        loss matrix
    reg : float
        Regularization term >0
    numItermax : int, optional
        Max number of iterations
    stopThr : float, optional
        Stop threshold on error (>0)
    verbose : bool, optional
        Print information along iterations
    log : bool, optional
        record log if True
    to_numpy : boolean, optional (default True)
        If true convert back the GPU array result to numpy format.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    gamma : (ns x nt) ndarray
        Optimal transportation matrix for the given parameters. When ``b``
        is a matrix, a vector with one transport loss per column of ``b``
        is returned instead.
    log : dict
        log dictionary return only if log==True in parameters. It holds the
        error history under ``'err'`` and the scaling vectors under ``'u'``
        and ``'v'``.

    Examples
    --------

    >>> import ot
    >>> a=[.5,.5]
    >>> b=[.5,.5]
    >>> M=[[0.,1.],[1.,0.]]
    >>> ot.sinkhorn(a,b,M,1)
    array([[ 0.36552929,  0.13447071],
           [ 0.13447071,  0.36552929]])

    References
    ----------

    .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal
       Transport, Advances in Neural Information Processing Systems
       (NIPS) 26, 2013

    See Also
    --------
    ot.lp.emd : Unregularized OT
    ot.optim.cg : General regularized OT

    """
    # NOTE(review): `np` is presumably bound to the GPU array module in this
    # file (the routine is documented as running on GPU) -- confirm against
    # the module imports.
    a = cp.asarray(a)
    b = cp.asarray(b)
    M = cp.asarray(M)

    # Empty weights stand for the uniform distribution
    if len(a) == 0:
        a = np.ones((M.shape[0],)) / M.shape[0]
    if len(b) == 0:
        b = np.ones((M.shape[1],)) / M.shape[1]

    # init data
    Nini = len(a)
    Nfin = len(b)

    # Number of target distributions (0 when b is a plain vector)
    if len(b.shape) > 1:
        nbb = b.shape[1]
    else:
        nbb = 0

    if log:
        log = {'err': []}

    # we assume that no distances are null except those of the diagonal of
    # distances
    if nbb:
        u = np.ones((Nini, nbb)) / Nini
        v = np.ones((Nfin, nbb)) / Nfin
    else:
        u = np.ones(Nini) / Nini
        v = np.ones(Nfin) / Nfin

    # Next 3 lines equivalent to K= np.exp(-M/reg), but faster to compute
    K = np.empty(M.shape, dtype=M.dtype)
    np.divide(M, -reg, out=K)
    np.exp(K, out=K)

    Kp = (1 / a).reshape(-1, 1) * K
    cpt = 0
    err = 1
    while err > stopThr and cpt < numItermax:
        uprev, vprev = u, v

        KtransposeU = np.dot(K.T, u)
        v = np.divide(b, KtransposeU)
        u = 1. / np.dot(Kp, v)

        if (np.any(KtransposeU == 0)
                or np.any(np.isnan(u)) or np.any(np.isnan(v))
                or np.any(np.isinf(u)) or np.any(np.isinf(v))):
            # we have reached the machine precision
            # come back to previous solution and quit loop
            print('Warning: numerical errors at iteration', cpt)
            u, v = uprev, vprev
            break
        if cpt % 10 == 0:
            # we can speed up the process by checking for the error only all
            # the 10th iterations
            if nbb:
                err = (np.sum((u - uprev) ** 2) / np.sum(u ** 2)
                       + np.sum((v - vprev) ** 2) / np.sum(v ** 2))
            else:
                # right marginal of the current plan: (diag(u) K diag(v))^T 1
                marginal = np.sum(u[:, None] * K * v[None, :], 0)
                err = np.linalg.norm(marginal - b) ** 2  # marginal violation
            if log:
                log['err'].append(err)

            if verbose:
                if cpt % 200 == 0:
                    print(
                        '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
                print('{:5d}|{:8e}|'.format(cpt, err))
        cpt = cpt + 1

    if log:
        log['u'] = u
        log['v'] = v

    if nbb:
        # return only loss
        # (an einsum over 'ik,ij,jk,ij->k' explodes cupy memory, hence the
        # loop; K * M does not depend on the target, so it is built once)
        KM = K * M
        res = np.empty(nbb)
        for i in range(nbb):
            res[i] = np.sum(u[:, None, i] * KM * v[None, :, i])
    else:
        # return OT matrix
        res = u.reshape((-1, 1)) * K * v.reshape((1, -1))

    if to_numpy:
        res = utils.to_np(res)
    if log:
        return res, log
    return res
def f3():
    # Allocate one uninitialized byte buffer per entry of `sizes`. Every
    # buffer stays referenced until all of them have been allocated, so
    # none is released while the others are still being created.
    buffers = []
    for s in sizes:
        buffers.append(cupy.empty((s,), dtype='b'))
def test_sum_out_wrong_shape(self):
    # Reducing a (2, 3, 4) array over axis 1 yields shape (2, 4), so a
    # (2, 3) `out` array must be rejected.
    source = testing.shaped_arange((2, 3, 4))
    wrong_out = cupy.empty((2, 3))
    with self.assertRaises(ValueError):
        source.sum(axis=1, out=wrong_out)
def make_data(self, parse=False):
    """Generate ``self.num_data`` sparse photon frames and write them to
    ``self.out_file``.

    The masks are prepared first if needed (parsed when ``parse`` is true,
    generated otherwise). Existing output datasets are then deleted and
    recreated, one random angle and one scale factor are drawn per frame,
    and for each frame the one-photon positions, the multi-photon
    positions and the multi-photon counts are stored.

    NOTE(review): with ``self.fluence == 'gamma'`` an existing ``'scale'``
    dataset is deleted but the new scale factors are not written back here
    -- confirm whether that is intended.
    """
    def _prepare_mask(**options):
        if parse:
            self.parse_mask(**options)
        else:
            self.make_mask(**options)

    if self.mask_sum == 0.:
        _prepare_mask()
    if self.bg_count is not None:
        _prepare_mask(bg=True)

    with h5py.File(self.out_file, 'a') as fptr:
        # Drop whatever a previous run left behind
        for stale in ('ones', 'multi', 'place_ones', 'place_multi',
                      'count_multi', 'num_pix', 'true_angles', 'bg'):
            if stale in fptr:
                del fptr[stale]

        if self.bgmask_sum > 0:
            fptr['bg'] = self.bgmask.get()
        fptr['num_pix'] = np.array([self.size**2])

        # Variable-length int32 rows for the per-frame pixel lists
        dtype = h5py.special_dtype(vlen=np.dtype('i4'))
        n_frames = (self.num_data,)
        place_ones = fptr.create_dataset('place_ones', n_frames, dtype=dtype)
        place_multi = fptr.create_dataset('place_multi', n_frames,
                                          dtype=dtype)
        count_multi = fptr.create_dataset('count_multi', n_frames,
                                          dtype=dtype)
        ones = fptr.create_dataset('ones', n_frames, dtype='i4')
        multi = fptr.create_dataset('multi', n_frames, dtype='i4')

        # One angle in [0, 2*pi) per frame
        ang = np.random.rand(self.num_data).astype('f8') * 2. * cp.pi
        fptr['true_angles'] = ang

        if self.fluence == 'gamma':
            if 'scale' in fptr:
                del fptr['scale']
            scale = np.random.gamma(2., 0.5, self.num_data)
        else:
            scale = np.ones(self.num_data, dtype='f8')

        # Device buffer reused for every frame
        rot_mask = cp.empty(self.size**2, dtype='f8')
        bsize_model = int(np.ceil(self.size / 32.))
        grid = (bsize_model,) * 2
        block = (32,) * 2

        stime = time.time()
        for i in range(self.num_data):
            kernels.slice_gen(
                grid, block,
                (self.mask, ang[i], scale[i], self.size,
                 self.bgmask, 0, rot_mask))
            frame = cp.random.poisson(rot_mask, dtype='i4').ravel()

            multi_hit = frame > 1
            place_ones[i] = cp.where(frame == 1)[0].get()
            place_multi[i] = cp.where(multi_hit)[0].get()
            count_multi[i] = frame[multi_hit].get()
            ones[i] = place_ones[i].shape[0]
            multi[i] = place_multi[i].shape[0]

            progress = (i + 1, self.num_data, int(frame.sum()))
            sys.stderr.write('\rWritten %d/%d frames (%d) ' % progress)
        etime = time.time()
        sys.stderr.write('\nTime taken (make_data): %f s\n' % (etime - stime))
def interp(x, xp, fp, left=None, right=None, period=None):
    """ One-dimensional linear interpolation.

    Args:
        x (cupy.ndarray): a 1D array of points on which the interpolation
            is performed.
        xp (cupy.ndarray): a 1D array of points on which the function values
            (``fp``) are known.
        fp (cupy.ndarray): a 1D array containing the function values at the
            points ``xp``.
        left (float or complex): value to return if ``x < xp[0]``. Default is
            ``fp[0]``.
        right (float or complex): value to return if ``x > xp[-1]``. Default is
            ``fp[-1]``.
        period (None or float): a period for the x-coordinates. Parameters
            ``left`` and ``right`` are ignored if ``period`` is specified.
            Default is ``None``.

    Returns:
        cupy.ndarray: The interpolated values, same shape as ``x``. The dtype
        is complex128 when ``fp`` is complex and float64 otherwise.

    Raises:
        ValueError: If ``xp`` or ``fp`` is not 1D, if their sizes differ,
            if ``xp`` is empty, or if ``period`` is zero.
        NotImplementedError: If ``x`` is not C-contiguous.
        TypeError: If the common type of ``x`` and ``xp`` cannot be cast
            to float64.

    .. note::
        This function may synchronize if ``left`` or ``right`` is not already
        on the device.

    .. seealso:: :func:`numpy.interp`

    """
    # --- argument validation -------------------------------------------
    if not (xp.ndim == 1 and fp.ndim == 1):
        raise ValueError('xp and fp must be 1D arrays')
    n_samples = xp.size
    if n_samples != fp.size:
        raise ValueError('fp and xp are not of the same length')
    if n_samples == 0:
        raise ValueError('array of sample points is empty')
    if not x.flags.c_contiguous:
        raise NotImplementedError('Non-C-contiguous x is currently not '
                                  'supported')
    x_dtype = cupy.common_type(x, xp)
    if not cupy.can_cast(x_dtype, cupy.float64):
        raise TypeError('Cannot cast array data from'
                        ' {} to {} according to the rule \'safe\''.format(
                            x_dtype, cupy.float64))

    # --- periodic case ---------------------------------------------------
    if period is not None:
        # The handling of "period" below is modified from NumPy's
        if period == 0:
            raise ValueError("period must be a non-zero value")
        period = abs(period)
        # Out-of-range fill values make no sense for periodic data
        left = right = None

        x = x.astype(cupy.float64)
        xp = xp.astype(cupy.float64)

        # Fold both coordinate sets into one period, sort the samples, and
        # pad them with one wrapped sample at each end.
        x %= period
        xp %= period
        sort_order = cupy.argsort(xp)
        xp = xp[sort_order]
        fp = fp[sort_order]
        xp = cupy.concatenate((xp[-1:] - period, xp, xp[0:1] + period))
        fp = cupy.concatenate((fp[-1:], fp, fp[0:1]))
        assert xp.flags.c_contiguous
        assert fp.flags.c_contiguous

    # --- interpolation -----------------------------------------------------
    # NumPy always returns float64 or complex128, so we upcast all values
    # on the fly in the kernel
    is_complex = fp.dtype.kind == 'c'
    out_dtype = 'D' if is_complex else 'd'
    output = cupy.empty(x.shape, dtype=out_dtype)
    idx = cupy.searchsorted(xp, x, side='right')
    if left is None:
        left = fp[0]
    else:
        left = cupy.array(left, fp.dtype)
    if right is None:
        right = fp[-1]
    else:
        right = cupy.array(right, fp.dtype)
    kern = _get_interp_kernel(is_complex)
    kern(x, idx, xp, fp, xp.size, left, right, output)
    return output
def add_buffers_to_particles(species, float_recv_left, float_recv_right,
                             uint_recv_left, uint_recv_right):
    """
    Add the particles stored in recv_left and recv_right
    to the existing particles in species.

    Resize the auxiliary arrays of the particles Ex, Ey, Ez, Bx, By, Bz,
    as well as cell_idx, sorted_idx and sorting_buffer (the sorting arrays
    are only reallocated on the GPU path).

    Parameters
    ----------
    species: a Particles object
        Contains the particles that stayed on the present processor.
        Its attributes are modified in place.

    float_recv_left, float_recv_right, uint_recv_left, uint_recv_right:
        arrays of shape (n_float,Nptcl) and (n_int,Nptcl)
        where Nptcl is the number of particles that are received from the
        left proc and right proc respectively, and where n_float and n_int
        are the number of float and integer quantities respectively.
        These arrays are always on the CPU (since they were used for MPI).
    """
    # Copy the buffers to an enlarged array
    if species.use_cuda:
        add_buffers_gpu(species, float_recv_left, float_recv_right,
                        uint_recv_left, uint_recv_right)
    else:
        add_buffers_cpu(species, float_recv_left, float_recv_right,
                        uint_recv_left, uint_recv_right)

    # Reallocate the particles auxiliary arrays. This needs to be done,
    # as the total number of particles in this domain has changed.
    # (Ntot is read only after the buffers were merged above.)
    shape = (species.Ntot, )
    field_names = ('Ex', 'Ey', 'Ez', 'Bx', 'By', 'Bz')
    if species.use_cuda:
        # Reallocate empty field-on-particle arrays on the GPU; each array
        # is allocated exactly once.
        for name in field_names:
            setattr(species, name, cupy.empty(shape, dtype=np.float64))
        # Reallocate empty auxiliary sorting arrays on the GPU
        species.cell_idx = cupy.empty(shape, dtype=np.int32)
        species.sorted_idx = cupy.empty(shape, dtype=np.intp)
        species.sorting_buffer = cupy.empty(shape, dtype=np.float64)
        if species.n_integer_quantities > 0:
            species.int_sorting_buffer = cupy.empty(shape, dtype=np.uint64)
    else:
        # Reallocate empty field-on-particle arrays on the CPU
        for name in field_names:
            setattr(species, name, np.empty(shape, dtype=np.float64))

    # The particles are unsorted after adding new particles.
    species.sorted = False
def sum_duplicates(self):
    """Merge entries that share the same ``(row, col)`` by summing them.

    The matrix is modified in place: ``data``, ``row`` and ``col`` are
    replaced and ``has_canonical_format`` is set to ``True``. Nothing is
    done when the matrix is already in canonical format.

    .. warning::
        When sorting the indices, CuPy follows the convention of cuSPARSE,
        which is different from that of SciPy. Therefore, the order of the
        output indices may differ:

        .. code-block:: python

            >>> #     1 0 0
            >>> # A = 1 1 0
            >>> #     1 1 1
            >>> data = cupy.array([1, 1, 1, 1, 1, 1], 'f')
            >>> row = cupy.array([0, 1, 1, 2, 2, 2], 'i')
            >>> col = cupy.array([0, 0, 1, 0, 1, 2], 'i')
            >>> A = cupyx.scipy.sparse.coo_matrix((data, (row, col)),
            ...                                   shape=(3, 3))
            >>> a = A.get()
            >>> A.sum_duplicates()
            >>> a.sum_duplicates()  # a is scipy.sparse.coo_matrix
            >>> A.row
            array([0, 1, 1, 2, 2, 2], dtype=int32)
            >>> a.row
            array([0, 1, 2, 1, 2, 2], dtype=int32)
            >>> A.col
            array([0, 0, 1, 0, 1, 2], dtype=int32)
            >>> a.col
            array([0, 0, 0, 1, 1, 2], dtype=int32)

    .. warning::
        Calling this function might synchronize the device.

    .. seealso::
       :meth:`scipy.sparse.coo_matrix.sum_duplicates`

    """
    if self.has_canonical_format:
        return

    # Note: The sorting order below follows the cuSPARSE convention (first
    # row then col, so-called row-major) and differs from that of SciPy, as
    # the cuSPARSE functions such as cusparseSpMV() assume this sorting
    # order.
    # See https://docs.nvidia.com/cuda/cusparse/index.html#coo-format
    order = cupy.lexsort(cupy.stack([self.col, self.row]))
    sorted_data = self.data[order]
    sorted_row = self.row[order]
    sorted_col = self.col[order]
    diff = self._sum_duplicates_diff(
        sorted_row, sorted_col, size=self.row.size)

    if diff[1:].all():
        # Every entry already has a distinct (row, col) pair, so the
        # sorted arrays are the canonical form.
        merged_data = sorted_data
        merged_row = sorted_row
        merged_col = sorted_col
    else:
        # TODO(leofang): move the kernels outside this method
        # ``index`` maps each sorted entry to its slot in the output.
        index = cupy.cumsum(diff, dtype='i')
        size = int(index[-1]) + 1
        merged_data = cupy.zeros(size, dtype=self.data.dtype)
        merged_row = cupy.empty(size, dtype='i')
        merged_col = cupy.empty(size, dtype='i')
        kind = self.data.dtype.kind
        if kind == 'f':
            assign = cupy.ElementwiseKernel(
                'T src_data, int32 src_row, int32 src_col, int32 index',
                'raw T data, raw int32 row, raw int32 col',
                ' atomicAdd(&data[index], src_data);'
                ' row[index] = src_row;'
                ' col[index] = src_col; ',
                'sum_duplicates_assign'
            )
            assign(sorted_data, sorted_row, sorted_col, index,
                   merged_data, merged_row, merged_col)
        elif kind == 'c':
            # Real and imaginary parts are accumulated separately.
            assign = cupy.ElementwiseKernel(
                'T src_real, T src_imag, int32 src_row, int32 src_col, '
                'int32 index',
                'raw T real, raw T imag, raw int32 row, raw int32 col',
                ' atomicAdd(&real[index], src_real);'
                ' atomicAdd(&imag[index], src_imag);'
                ' row[index] = src_row;'
                ' col[index] = src_col; ',
                'sum_duplicates_assign_complex'
            )
            assign(sorted_data.real, sorted_data.imag, sorted_row,
                   sorted_col, index, merged_data.real, merged_data.imag,
                   merged_row, merged_col)

    self.data = merged_data
    self.row = merged_row
    self.col = merged_col
    self.has_canonical_format = True
def test_concatenate_wrong_shape(self):
    """Concatenating arrays whose non-joined extents differ must fail."""
    shapes = ((2, 3, 4), (3, 3, 4), (4, 4, 4))
    operands = tuple(cupy.empty(shape) for shape in shapes)
    with self.assertRaises(ValueError):
        cupy.concatenate(operands)
def batched_gesv(a, b):
    """Solves multiple linear matrix equations using cublas<t>getr[fs]Batched().

    Computes the solution to system of linear equation ``ax = b``.

    Args:
        a (cupy.ndarray): The matrix with dimension ``(..., M, M)``.
        b (cupy.ndarray): The matrix with dimension ``(..., M)`` or
            ``(..., M, K)``.

    Returns:
        cupy.ndarray:
            The matrix with dimension ``(..., M)`` or ``(..., M, K)``.

    Raises:
        ValueError: if the shapes of ``a`` and ``b`` do not match the
            layouts above.
        TypeError: if the common compute dtype is none of
            ``'f'``, ``'d'``, ``'F'``, ``'D'``.
        linalg.LinAlgError: if the batched solve reports a non-zero info.
    """
    _util._assert_cupy_array(a, b)
    _util._assert_nd_squareness(a)

    ndim_matches = a.ndim == b.ndim or a.ndim == b.ndim + 1
    if not (ndim_matches and a.shape[:-1] == b.shape[:a.ndim - 1]):
        raise ValueError(
            'a must have (..., M, M) shape and b must have (..., M) '
            'or (..., M, K)')

    dtype, out_dtype = _util.linalg_common_type(a, b)
    # Map the compute dtype to the cuBLAS routine prefix.
    for type_char, prefix in (('f', 's'), ('d', 'd'), ('F', 'c'), ('D', 'z')):
        if dtype == type_char:
            break
    else:
        raise TypeError('invalid dtype')
    getrf = getattr(cublas, prefix + 'getrfBatched')
    getrs = getattr(cublas, prefix + 'getrsBatched')

    batch = numpy.prod(a.shape[:-2]) if a.ndim > 2 else 1
    n = a.shape[-1]
    nrhs = b.shape[-1] if a.ndim == b.ndim else 1
    b_shape = b.shape

    # Remember where the caller's buffers live: if the layout conversion
    # below ends up pointing at the same memory, an explicit copy is made
    # so the in-place factorization/solve never overwrites the inputs.
    a_ptr_before = a.data.ptr
    b_ptr_before = b.data.ptr
    a = cupy.ascontiguousarray(
        a.reshape(batch, n, n).transpose(0, 2, 1), dtype=dtype)
    b = cupy.ascontiguousarray(
        b.reshape(batch, n, nrhs).transpose(0, 2, 1), dtype=dtype)
    if a.data.ptr == a_ptr_before:
        a = a.copy()
    if b.data.ptr == b_ptr_before:
        b = b.copy()

    if n > get_batched_gesv_limit():
        warnings.warn(
            'The matrix size ({}) exceeds the set limit ({})'.format(
                n, get_batched_gesv_limit()))

    handle = device.get_cublas_handle()

    # Arrays of per-matrix device pointers, as the batched API expects.
    lda = n
    a_stride = lda * n * a.itemsize
    a_ptrs = cupy.arange(
        a.data.ptr, a.data.ptr + a_stride * batch, a_stride,
        dtype=cupy.uintp)
    ldb = n
    b_stride = ldb * nrhs * b.itemsize
    b_ptrs = cupy.arange(
        b.data.ptr, b.data.ptr + b_stride * batch, b_stride,
        dtype=cupy.uintp)

    pivot = cupy.empty((batch, n), dtype=numpy.int32)
    dev_info = cupy.empty((batch, ), dtype=numpy.int32)
    host_info = numpy.empty((1, ), dtype=numpy.int32)

    # LU factorization (A = L * U)
    getrf(handle, n, a_ptrs.data.ptr, lda, pivot.data.ptr,
          dev_info.data.ptr, batch)
    _util._check_cublas_info_array_if_synchronization_allowed(getrf, dev_info)

    # Solves Ax = b
    getrs(handle, cublas.CUBLAS_OP_N, n, nrhs, a_ptrs.data.ptr, lda,
          pivot.data.ptr, b_ptrs.data.ptr, ldb, host_info.ctypes.data, batch)
    if host_info[0] != 0:
        msg = 'Error reported by {} in cuBLAS. '.format(getrs.__name__)
        if host_info[0] < 0:
            msg += 'The {}-th parameter had an illegal value.'.format(
                -host_info[0])
        raise linalg.LinAlgError(msg)

    solution = b.transpose(0, 2, 1).reshape(b_shape)
    return solution.astype(out_dtype, copy=False)
def interval(self, mx, size):
    """Generate multiple integers independently sampled uniformly from ``[0, mx]``.

    Args:
        mx (int): Upper bound of the interval
        size (None or int or tuple): Shape of the array or the scalar
            returned.

    Returns:
        cupy.ndarray: If ``size`` is ``None``, an :class:`cupy.ndarray`
        with shape ``()`` is returned.
        If ``int``, 1-D array of length size is returned.
        If ``tuple``, multi-dimensional array with shape
        ``size`` is returned. A ``size`` containing a zero extent yields
        an empty array of that shape.
        Currently, only 32 bit integers can be sampled.
        If ``0 <= mx <= 0x7fffffff``, a ``numpy.int32`` array is returned.
        If ``0x80000000 <= mx <= 0xffffffff``, a ``numpy.uint32`` array
        is returned.

    Raises:
        ValueError: if ``mx`` is negative or exceeds ``0xffffffff``.
    """
    if size is None:
        return self.interval(mx, 1).reshape(())
    elif isinstance(size, int):
        size = (size, )

    if mx == 0:
        return cupy.zeros(size, dtype=numpy.int32)

    if mx < 0:
        raise ValueError(
            'mx must be non-negative (actual: {})'.format(mx))
    elif mx <= 0x7fffffff:
        dtype = numpy.int32
    elif mx <= 0xffffffff:
        dtype = numpy.uint32
    else:
        raise ValueError(
            'mx must be within uint32 range (actual: {})'.format(mx))

    n = functools.reduce(operator.mul, size, 1)
    if n == 0:
        # Nothing to sample. Without this guard the loop below never runs,
        # ``ret`` stays ``None`` and the final reshape fails.
        return cupy.empty(size, dtype=dtype)

    # Smallest all-ones bit mask covering ``mx``; masking the raw 32-bit
    # samples with it keeps the rejection rate below one half.
    mask = (1 << mx.bit_length()) - 1
    mask = cupy.array(mask, dtype=dtype)

    sample = cupy.empty((n,), dtype=dtype)
    n_rem = n  # The number of remaining elements to sample
    ret = None
    while n_rem > 0:
        curand.generate(
            self._generator, sample.data.ptr, sample.size)
        # Drop the samples that exceed the upper limit
        sample &= mask
        success = sample <= mx

        if ret is None:
            # If the sampling has finished in the first iteration,
            # just return the sample.
            if success.all():
                n_rem = 0
                ret = sample
                break

            # Allocate the return array.
            ret = cupy.empty((n,), dtype=dtype)

        # Append the accepted samples after the ones gathered so far.
        n_succ = min(n_rem, int(success.sum()))
        ret[n - n_rem:n - n_rem + n_succ] = sample[success][:n_succ]
        n_rem -= n_succ

    assert n_rem == 0
    return ret.reshape(size)
def test_vstack_wrong_ndim(self):
    """Stacking a ``(3,)`` array with a ``(3, 1)`` array must fail."""
    operands = tuple(cupy.empty(shape) for shape in ((3, ), (3, 1)))
    with self.assertRaises(ValueError):
        cupy.vstack(operands)
def create_dropout_states(handle):
    """Allocate an uninitialized buffer for dropout states.

    The buffer is a 1-D ``"b"``-typed array whose length is the value
    reported by ``cudnn.dropoutGetStatesSize`` for ``handle``.
    """
    return cupy.empty((cudnn.dropoutGetStatesSize(handle),), dtype="b")
def f4():
    """Allocate one byte buffer per entry of ``sizes``, releasing some early.

    Every tenth iteration (``i = 0, 10, 20, ...``) the reference held in
    slot ``i // 10`` is replaced by ``None``, so allocations and releases
    are interleaved.
    """
    held = []
    for position, nbytes in enumerate(sizes):
        held.append(cupy.empty((nbytes,), dtype='b'))
        slot, remainder = divmod(position, 10)
        if remainder == 0:
            held[slot] = None
def _tensordot_core(a, b, out, n, m, k, ret_shape):
    """Contract ``a`` viewed as ``(k, n)`` with ``b`` viewed as ``(k, m)``.

    The product ``A^T * B`` is written into an ``(n, m)`` view of the
    output buffer, using element-wise multiplication when ``k == 1`` with
    a unit ``n`` or ``m``, and cuBLAS ``ger``/``dot``/``gemv``/``gemm``
    otherwise.

    Args:
        a: Left operand; reshaped to ``(k, n)`` if it does not already
            have that shape.
        b: Right operand; reshaped to ``(k, m)`` if it does not already
            have that shape.
        out: Array receiving the result, or ``None`` to allocate one.
        n, m, k: Extents of the free axes of ``a`` and ``b`` and of the
            contracted axis, respectively.
        ret_shape: Shape used when allocating result arrays.

    Returns:
        The result array: ``out`` itself when it was given, otherwise a
        newly allocated array of shape ``ret_shape``.

    Raises:
        ValueError: if exactly one of ``a`` and ``b`` is zero-sized.
    """
    # NOTE(review): numpy.find_common_type is deprecated in recent NumPy
    # releases -- confirm against the supported NumPy range.
    ret_dtype = a.dtype.char
    if ret_dtype != b.dtype.char:
        ret_dtype = numpy.find_common_type((ret_dtype, b.dtype), ()).char

    # Choose the compute dtype: float32/float64 are used as they are, any
    # other result dtype is promoted against float32.
    if ret_dtype == 'f' or ret_dtype == 'd':
        dtype = ret_dtype
    else:
        dtype = numpy.find_common_type((ret_dtype, 'f'), ()).char

    a = a.astype(dtype, copy=False)
    b = b.astype(dtype, copy=False)

    # Zero-sized operands: either both are empty (the result is all
    # zeros) or the combination is rejected.
    if not a.size or not b.size:
        if a.size or b.size:
            raise ValueError('cannot dot zero-sized and non-zero-sized arrays')
        if out is None:
            return cupy.zeros(ret_shape, dtype=ret_dtype)
        else:
            out.fill(0)
            return out

    # From here on ``out`` is the buffer of the compute dtype that the
    # kernels write to, and ``ret`` is what gets returned. They are the
    # same object unless a final dtype conversion is needed.
    if out is None:
        out = cupy.empty(ret_shape, dtype)
        if dtype == ret_dtype:
            ret = out
        else:
            ret = cupy.empty(ret_shape, ret_dtype)
    else:
        ret = out
        if out.dtype != dtype:
            out = cupy.empty(ret_shape, dtype)

    # It copies the operands if needed
    if a.shape != (k, n):
        a = cupy.reshape(a, (k, n))
    if b.shape != (k, m):
        b = cupy.reshape(b, (k, m))
    # ``c`` is an (n, m) view onto ``out``.
    c = out
    if c.shape != (n, m):
        c = c.view()
        c.shape = (n, m)

    # Be careful that cuBLAS uses the FORTRAN-order matrix representation.
    # NOTE(review): the routine lookups below only cover dtype 'f' and
    # 'd'; for any other compute dtype the routine name stays unbound.
    if k == 1:
        if n == 1:
            # Scalar-vector product
            cupy.multiply(a, b, c)
        elif m == 1:
            # Scalar-vector product
            cupy.multiply(a.T, b, c)
        else:
            # Outer product A^T * B
            # c is C-contiguous while cuBLAS requires F-contiguous arrays, so
            # we compute C^T = B^T * A here.
            handle = cuda.Device().cublas_handle
            # ``c`` is zeroed before calling ger; presumably ger
            # accumulates into its output -- confirm against cuBLAS docs.
            c.fill(0)
            a, inca = _to_cublas_vector(a, 1)
            b, incb = _to_cublas_vector(b, 1)
            if dtype == 'f':
                ger = cublas.sger
            elif dtype == 'd':
                ger = cublas.dger
            ger(handle, m, n, 1, b.data.ptr, incb, a.data.ptr, inca,
                c.data.ptr, m)

        # Convert to the requested result dtype when the computation ran
        # in a different one.
        if dtype != ret_dtype:
            elementwise.copy(out, ret)
        return ret

    handle = cuda.Device().cublas_handle
    if n == 1:
        if m == 1:
            # Inner product
            a, inca = _to_cublas_vector(a, 0)
            b, incb = _to_cublas_vector(b, 0)
            # The result pointer passed to dot is device memory
            # (c.data.ptr), so the handle is temporarily switched to
            # device pointer mode and restored afterwards.
            mode = cublas.getPointerMode(handle)
            cublas.setPointerMode(handle,
                                  cublas.CUBLAS_POINTER_MODE_DEVICE)
            if dtype == 'f':
                dot = cublas.sdot
            elif dtype == 'd':
                dot = cublas.ddot
            try:
                dot(handle, k, a.data.ptr, inca, b.data.ptr, incb, c.data.ptr)
            finally:
                cublas.setPointerMode(handle, mode)
        else:
            # Matrix-vector product B^T * A
            a, inca = _to_cublas_vector(a, 0)
            b, transb, ldb = _mat_to_cublas_contiguous(b, 1)
            if transb:
                # gemv requires (m, k) as the original matrix dimensions
                # rather than the transposed dimensions.
                m, k = k, m
            if dtype == 'f':
                gemv = cublas.sgemv
            elif dtype == 'd':
                gemv = cublas.dgemv
            gemv(handle, transb, m, k, 1, b.data.ptr, ldb, a.data.ptr, inca,
                 0, c.data.ptr, 1)
    elif m == 1:
        # Matrix-vector product A^T * B
        a, transa, lda = _mat_to_cublas_contiguous(a, 1)
        b, incb = _to_cublas_vector(b, 0)
        if transa:
            # gemv requires (n, k) as the original matrix dimensions rather
            # than the transposed dimensions.
            n, k = k, n
        if dtype == 'f':
            gemv = cublas.sgemv
        elif dtype == 'd':
            gemv = cublas.dgemv
        gemv(handle, transa, n, k, 1, a.data.ptr, lda, b.data.ptr, incb, 0,
             c.data.ptr, 1)
    else:
        # Matrix-Matrix product A^T * B
        # c is C-contiguous while cuBLAS assumes F-contiguous inputs, so we
        # compute C^T = B^T * A here.
        a, transa, lda = _mat_to_cublas_contiguous(a, 0)
        b, transb, ldb = _mat_to_cublas_contiguous(b, 1)
        if dtype == 'f':
            gemm = cublas.sgemm
        elif dtype == 'd':
            gemm = cublas.dgemm
        gemm(handle, transb, transa, m, n, k, 1, b.data.ptr, ldb, a.data.ptr,
             lda, 0, c.data.ptr, m)

    # Convert to the requested result dtype when the computation ran in a
    # different one.
    if dtype != ret_dtype:
        elementwise.copy(out, ret)
    return ret
def test_concatenate_wrong_ndim(self):
    """Concatenating a 2-D array with a 1-D array must fail."""
    operands = tuple(cupy.empty(shape) for shape in ((2, 3), (2, )))
    with self.assertRaises(ValueError):
        cupy.concatenate(operands)
def _batched_inv(a): assert (a.ndim >= 3) util._assert_cupy_array(a) util._assert_nd_squareness(a) if a.dtype == cupy.float32: getrf = cupy.cuda.cublas.sgetrfBatched getri = cupy.cuda.cublas.sgetriBatched elif a.dtype == cupy.float64: getrf = cupy.cuda.cublas.dgetrfBatched getri = cupy.cuda.cublas.dgetriBatched elif a.dtype == cupy.complex64: getrf = cupy.cuda.cublas.cgetrfBatched getri = cupy.cuda.cublas.cgetriBatched elif a.dtype == cupy.complex128: getrf = cupy.cuda.cublas.zgetrfBatched getri = cupy.cuda.cublas.zgetriBatched else: msg = ('dtype must be float32, float64, complex64 or float128' ' (actual: {})'.format(a.dtype)) raise ValueError(msg) if 0 in a.shape: return cupy.empty_like(a) a_shape = a.shape # copy is necessary to present `a` to be overwritten. a = a.copy().reshape(-1, a_shape[-2], a_shape[-1]) handle = device.get_cublas_handle() batch_size = a.shape[0] n = a.shape[1] lda = n step = n * lda * a.itemsize start = a.data.ptr stop = start + step * batch_size a_array = cupy.arange(start, stop, step, dtype=cupy.uintp) pivot_array = cupy.empty((batch_size, n), dtype=cupy.int32) info_array = cupy.empty((batch_size, ), dtype=cupy.int32) getrf(handle, n, a_array.data.ptr, lda, pivot_array.data.ptr, info_array.data.ptr, batch_size) err = False err_detail = '' for i in range(batch_size): info = info_array[i] if info < 0: err = True err_detail += ('\tmatrix[{}]: illegal value at {}-the parameter.' 
'\n'.format(i, -info)) if info > 0: err = True err_detail += '\tmatrix[{}]: matrix is singular.\n'.format(i) if err: raise RuntimeError('matrix inversion failed at getrf.\n' + err_detail) c = cupy.empty_like(a) ldc = lda step = n * ldc * c.itemsize start = c.data.ptr stop = start + step * batch_size c_array = cupy.arange(start, stop, step, dtype=cupy.uintp) getri(handle, n, a_array.data.ptr, lda, pivot_array.data.ptr, c_array.data.ptr, ldc, info_array.data.ptr, batch_size) for i in range(batch_size): info = info_array[i] if info > 0: err = True err_detail += '\tmatrix[{}]: matrix is singular.\n'.format(i) if err: raise RuntimeError('matrix inversion failed at getri.\n' + err_detail) return c.reshape(a_shape)
def unique(ar, return_index=False, return_inverse=False,
           return_counts=False, axis=None):
    """Find the unique elements of an array.

    Returns the sorted unique elements of an array. There are three optional
    outputs in addition to the unique elements:

    * the indices of the input array that give the unique values
    * the indices of the unique array that reconstruct the input array
    * the number of times each unique value comes up in the input array

    Args:
        ar(array_like): Input array. This will be flattened if it is not
            already 1-D. An empty input yields empty outputs.
        return_index(bool, optional): If True, also return the indices of `ar`
            (along the specified axis, if provided, or in the flattened array)
            that result in the unique array.
        return_inverse(bool, optional): If True, also return the indices of the
            unique array (for the specified axis, if provided) that can be used
            to reconstruct `ar`.
        return_counts(bool, optional): If True, also return the number of times
            each unique item appears in `ar`.
        axis(int or None, optional): Not supported yet.

    Returns:
        cupy.ndarray or tuple:
            If there are no optional outputs, it returns the
            :class:`cupy.ndarray` of the sorted unique values. Otherwise, it
            returns the tuple which contains the sorted unique values and
            followings.

            * The indices of the first occurrences of the unique values in the
              original array. Only provided if `return_index` is True.
            * The indices to reconstruct the original array from the
              unique array. Only provided if `return_inverse` is True.
            * The number of times each of the unique values comes up in the
              original array. Only provided if `return_counts` is True.

    Raises:
        NotImplementedError: if ``axis`` is not ``None``.

    .. warning::

        This function may synchronize the device.

    .. seealso:: :func:`numpy.unique`
    """
    if axis is not None:
        raise NotImplementedError('axis option is not supported yet.')

    # ``flatten`` returns a copy, so the in-place sort below never touches
    # the caller's array.
    ar = cupy.asarray(ar).flatten()

    if return_index or return_inverse:
        perm = ar.argsort()
        aux = ar[perm]
    else:
        ar.sort()
        aux = ar

    # mask[i] is True where the sorted array starts a new run of equal
    # values. The slice assignment (rather than ``mask[0] = True``) keeps
    # this valid for empty input.
    mask = cupy.empty(aux.shape, dtype=cupy.bool_)
    mask[:1] = True
    mask[1:] = aux[1:] != aux[:-1]

    ret = aux[mask]
    if not return_index and not return_inverse and not return_counts:
        return ret

    ret = (ret,)
    if return_index:
        ret += (perm[mask],)
    if return_inverse:
        # Running count of run starts gives each sorted element the index
        # of its unique value; scatter that back to the original order.
        imask = cupy.cumsum(mask) - 1
        inv_idx = cupy.empty(mask.shape, dtype=cupy.intp)
        inv_idx[perm] = imask
        ret += (inv_idx,)
    if return_counts:
        # Counts are the distances between consecutive run starts, with
        # the total length appended as a sentinel.
        nonzero = cupy.nonzero(mask)[0]  # may synchronize
        idx = cupy.empty((nonzero.size + 1,), nonzero.dtype)
        idx[:-1] = nonzero
        idx[-1] = mask.size
        ret += (idx[1:] - idx[:-1],)
    return ret