def dotc(x, y, out=None):
    """Computes the dot product of x.conj() and y.

    Real inputs (dtype chars ``'f'``/``'d'``) are delegated to :func:`dot`;
    complex inputs are dispatched to ``cublas.cdotc`` / ``cublas.zdotc``.

    Args:
        x (cupy.ndarray): First vector; its dtype selects the routine.
        y (cupy.ndarray): Second vector.
        out (cupy.ndarray, optional): Array that receives the result.

    Returns:
        cupy.ndarray: ``out`` when it was given, otherwise the result array
        produced by ``_setup_result_ptr``.

    Raises:
        TypeError: If the dtype char of ``x`` is not one of ``'fdFD'``.
    """
    dtype = x.dtype.char
    if dtype in 'fd':
        return dot(x, y, out=out)
    elif dtype == 'F':
        func = cublas.cdotc
    elif dtype == 'D':
        func = cublas.zdotc
    else:
        raise TypeError('invalid dtype')
    _check_two_vectors(x, y)

    handle = device.get_cublas_handle()
    result_dtype = dtype
    result_ptr, result, orig_mode = _setup_result_ptr(
        handle, out, result_dtype)
    try:
        func(handle, x.size, x.data.ptr, 1, y.data.ptr, 1, result_ptr)
    finally:
        # Always put the handle back into the pointer mode it had before,
        # even when the cuBLAS call raises.
        cublas.setPointerMode(handle, orig_mode)

    if out is None:
        out = result
    elif out.dtype != result_dtype:
        # ``out`` has a different dtype than the computed result, so the
        # value is copied (and cast) into it.
        _core.elementwise_copy(result, out)
    return out
def _iamaxmin(x, out, name):
    """Runs a cuBLAS ``i<t><name>`` index-search routine on a 1D array.

    Args:
        x (cupy.ndarray): Input vector; must be one-dimensional.
        out (cupy.ndarray or None): Array that receives the result.
        name (str): Suffix of the cuBLAS routine. The routine looked up is
            ``'i' + <type letter> + name`` on the ``cublas`` module.

    Returns:
        cupy.ndarray: ``out`` when it was given, otherwise the result array
        produced by ``_setup_result_ptr`` (requested with dtype ``'i'``).

    Raises:
        ValueError: If ``x`` is not one-dimensional.
        TypeError: If the dtype char of ``x`` is not one of ``'fdFD'``.
    """
    if x.ndim != 1:
        raise ValueError('x must be a 1D array (actual: {})'.format(x.ndim))

    dtype = x.dtype.char
    if dtype == 'f':
        t = 's'
    elif dtype == 'd':
        t = 'd'
    elif dtype == 'F':
        t = 'c'
    elif dtype == 'D':
        t = 'z'
    else:
        raise TypeError('invalid dtype')
    func = getattr(cublas, 'i' + t + name)

    handle = device.get_cublas_handle()
    result_dtype = 'i'
    result_ptr, result, orig_mode = _setup_result_ptr(
        handle, out, result_dtype)
    try:
        func(handle, x.size, x.data.ptr, 1, result_ptr)
    finally:
        # Restore the caller's pointer mode whether or not the call failed.
        cublas.setPointerMode(handle, orig_mode)

    if out is None:
        out = result
    elif out.dtype != result_dtype:
        _core.elementwise_copy(result, out)
    return out
def _run_1d_filters(filters, input, args, output, mode, cval, origin=0):
    """
    Runs a series of 1D filters forming an nd filter. The filters must be a
    list of callables that take input, arg, axis, output, mode, cval, origin.
    The args is a list of values that are passed for the arg value to the
    filter. Individual filters can be None causing that axis to be skipped.

    Returns the array holding the final result. When ``output`` was given as
    an array, the result is copied into it (if it is not already there) and
    that array is returned.
    """
    output_orig = output
    output = _util._get_output(output, input)
    # NOTE(review): the validator argument was lost from the source; it is
    # assumed to be ``_util._check_mode`` -- confirm against ``_util``.
    modes = _util._fix_sequence_arg(mode, input.ndim, 'mode',
                                    _util._check_mode)
    # for filters, "wrap" is a synonym for "grid-wrap".
    modes = ['grid-wrap' if m == 'wrap' else m for m in modes]
    origins = _util._fix_sequence_arg(origin, input.ndim, 'origin', int)
    n_filters = sum(filter is not None for filter in filters)
    if n_filters == 0:
        _core.elementwise_copy(input, output)
        return output
    # We can't operate in-place efficiently, so use a 2-buffer system
    temp = _util._get_output(output.dtype, input) if n_filters > 1 else None
    first = True
    iterator = zip(filters, args, modes, origins)
    for axis, (fltr, arg, axis_mode, axis_origin) in enumerate(iterator):
        if fltr is None:
            # No filter for this axis: leave the data untouched.
            continue
        fltr(input, arg, axis, output, axis_mode, cval, axis_origin)
        # Ping-pong the buffers: what was just written becomes the next
        # input. After the first pass the spare buffer is ``temp`` (the
        # caller's input must not be overwritten); afterwards the previous
        # input buffer is reused.
        input, output = output, temp if first else input
        first = False
    if isinstance(output_orig, cupy.ndarray) and input is not output_orig:
        # The final result landed in the scratch buffer; move it into the
        # array the caller supplied.
        _core.elementwise_copy(input, output_orig)
        input = output_orig
    return input
def nrm2(x, out=None):
    """Computes the Euclidean norm of vector x.

    Args:
        x (cupy.ndarray): Input vector; must be one-dimensional.
        out (cupy.ndarray, optional): Array that receives the result.

    Returns:
        cupy.ndarray: ``out`` when it was given, otherwise the result array
        produced by ``_setup_result_ptr``. The result is requested with the
        lower-cased dtype char of ``x`` (``'F'`` -> ``'f'``,
        ``'D'`` -> ``'d'``).

    Raises:
        ValueError: If ``x`` is not one-dimensional.
        TypeError: If the dtype char of ``x`` is not one of ``'fdFD'``.
    """
    if x.ndim != 1:
        raise ValueError('x must be a 1D array (actual: {})'.format(x.ndim))

    dtype = x.dtype.char
    if dtype == 'f':
        func = cublas.snrm2
    elif dtype == 'd':
        func = cublas.dnrm2
    elif dtype == 'F':
        func = cublas.scnrm2
    elif dtype == 'D':
        func = cublas.dznrm2
    else:
        raise TypeError('invalid dtype')

    handle = device.get_cublas_handle()
    result_dtype = dtype.lower()
    result_ptr, result, orig_mode = _setup_result_ptr(
        handle, out, result_dtype)
    try:
        func(handle, x.size, x.data.ptr, 1, result_ptr)
    finally:
        # Restore the caller's pointer mode whether or not the call failed.
        cublas.setPointerMode(handle, orig_mode)

    if out is None:
        out = result
    elif out.dtype != result_dtype:
        _core.elementwise_copy(result, out)
    return out
def gerc(alpha, x, y, a):
    """Computes a += alpha * x @ y.T.conj()

    Note: ''a'' will be updated.

    Real dtypes (``'f'``/``'d'``) are delegated to :func:`ger`; complex
    dtypes are dispatched to ``cublas.cgerc`` / ``cublas.zgerc``.

    Args:
        alpha: Scalar multiplier, passed through ``_setup_scalar_ptr``.
        x (cupy.ndarray): 1D array of length ``a.shape[0]``.
        y (cupy.ndarray): 1D array of length ``a.shape[1]``.
        a (cupy.ndarray): 2D array that is updated in place.

    Raises:
        TypeError: If the dtype char of ``a`` is not one of ``'fdFD'``.
    """
    dtype = a.dtype.char
    if dtype in 'fd':
        return ger(alpha, x, y, a)
    elif dtype == 'F':
        func = cublas.cgerc
    elif dtype == 'D':
        func = cublas.zgerc
    else:
        raise TypeError('invalid dtype')
    assert a.ndim == 2
    assert x.ndim == y.ndim == 1
    assert a.dtype == x.dtype == y.dtype
    m, n = a.shape
    assert x.shape[0] == m
    assert y.shape[0] == n

    handle = device.get_cublas_handle()
    alpha, alpha_ptr, orig_mode = _setup_scalar_ptr(handle, alpha, dtype)
    x_ptr, y_ptr = x.data.ptr, y.data.ptr
    try:
        if a._f_contiguous:
            func(handle, m, n, alpha_ptr, x_ptr, 1, y_ptr, 1, a.data.ptr, m)
        else:
            # The routine is called with leading dimension ``m``, i.e. on a
            # column-major buffer: work on an F-ordered copy and write the
            # updated values back into ``a`` afterwards.
            aa = a.copy(order='F')
            func(handle, m, n, alpha_ptr, x_ptr, 1, y_ptr, 1, aa.data.ptr, m)
            _core.elementwise_copy(aa, a)
    finally:
        # Restore the caller's pointer mode whether or not the call failed.
        cublas.setPointerMode(handle, orig_mode)
 def check_copy(self, dtype, src_id, dst_id):
     """Check ``_core.elementwise_copy`` between two devices.

     An array is built while device ``src_id`` is active, an empty array of
     the same shape and dtype is allocated while device ``dst_id`` is
     active, and the copied destination is compared against the source.
     """
     shape = (2, 3, 4)
     with cuda.Device(src_id):
         source = testing.shaped_arange(shape, dtype=dtype)
     with cuda.Device(dst_id):
         target = cupy.empty(shape, dtype=dtype)
     _core.elementwise_copy(source, target)
     testing.assert_allclose(source, target)
def copyto(dst, src, casting='same_kind', where=None):
    """Copies values from one array to another with broadcasting.

    This function can be called for arrays on different devices. In this case,
    casting, ``where``, and broadcasting is not supported, and an exception is
    raised if these are used.

    Args:
        dst (cupy.ndarray): Target array.
        src (cupy.ndarray): Source array.
        casting (str): Casting rule. See :func:`numpy.can_cast` for detail.
        where (cupy.ndarray of bool): If specified, this array acts as a mask,
            and an element is copied only if the corresponding element of
            ``where`` is True.

    Raises:
        TypeError: If ``src`` cannot be cast to ``dst.dtype`` under
            ``casting``.

    .. seealso:: :func:`numpy.copyto`

    """

    src_type = type(src)
    # NOTE(review): the tail of this tuple was lost from the source; only
    # ``fusion._FusionVarScalar`` is assumed here -- confirm.
    src_is_python_scalar = src_type in (int, bool, float, complex,
                                        fusion._FusionVarScalar)
    if src_is_python_scalar:
        src_dtype = numpy.dtype(type(src))
        can_cast = numpy.can_cast(src, dst.dtype, casting)
    else:
        src_dtype = src.dtype
        can_cast = numpy.can_cast(src_dtype, dst.dtype, casting)

    if not can_cast:
        raise TypeError('Cannot cast %s to %s in %s casting mode' %
                        (src_dtype, dst.dtype, casting))
    if fusion._is_fusing():
        if where is None:
            _core.elementwise_copy(src, dst)
        else:
            fusion._call_ufunc(search._where_ufunc, where, src, dst, dst)
        return

    if dst.size == 0:
        # Nothing to write.
        return

    if src_is_python_scalar and where is None:
        dst.fill(src)
        return

    if where is None:
        if _can_memcpy(dst, src):
            dst.data.copy_from_async(src.data, src.nbytes)
        else:
            device = dst.device
            with device:
                if src.device != device:
                    # Bring the source onto the destination's device first.
                    src = src.copy()
                _core.elementwise_copy(src, dst)
    else:
        _core.elementwise_copy_where(src, where, dst)
def dgmm(side, a, x, out=None, incx=1):
    """Computes diag(x) @ a or a @ diag(x)

    Computes diag(x) @ a if side is 'L', a @ diag(x) if side is 'R'.

    Args:
        side: ``'L'``, ``'R'``, ``cublas.CUBLAS_SIDE_LEFT`` or
            ``cublas.CUBLAS_SIDE_RIGHT``.
        a (cupy.ndarray): 2D input matrix.
        x (cupy.ndarray): Array holding the diagonal entries (0 to 2
            dimensions), with the same dtype as ``a``.
        out (cupy.ndarray, optional): 2D output with the shape and dtype of
            ``a``. Allocated when omitted, C-ordered if ``a`` is
            C-contiguous and F-ordered otherwise.
        incx (int): Stride between consecutive elements of ``x``.

    Returns:
        cupy.ndarray: ``out``.

    Raises:
        TypeError: If the dtype char of ``a`` is not one of ``'fdFD'``.
        ValueError: If ``side`` is not a recognized value.
    """
    assert a.ndim == 2
    assert 0 <= x.ndim <= 2
    assert a.dtype == x.dtype
    dtype = a.dtype.char
    if dtype == 'f':
        func = cublas.sdgmm
    elif dtype == 'd':
        func = cublas.ddgmm
    elif dtype == 'F':
        func = cublas.cdgmm
    elif dtype == 'D':
        func = cublas.zdgmm
    else:
        raise TypeError('invalid dtype')
    if side == 'L' or side == cublas.CUBLAS_SIDE_LEFT:
        side = cublas.CUBLAS_SIDE_LEFT
    elif side == 'R' or side == cublas.CUBLAS_SIDE_RIGHT:
        side = cublas.CUBLAS_SIDE_RIGHT
    else:
        raise ValueError('invalid side (actual: {})'.format(side))
    m, n = a.shape
    # ``x`` must hold one diagonal entry per row (left) or column (right),
    # spaced ``incx`` elements apart.
    if side == cublas.CUBLAS_SIDE_LEFT:
        assert x.size >= (m - 1) * abs(incx) + 1
    else:
        assert x.size >= (n - 1) * abs(incx) + 1
    if out is None:
        if a._c_contiguous:
            order = 'C'
        else:
            order = 'F'
        out = cupy.empty((m, n), dtype=dtype, order=order)
    else:
        assert out.ndim == 2
        assert out.shape == a.shape
        assert out.dtype == a.dtype

    handle = device.get_cublas_handle()
    if out._c_contiguous:
        if not a._c_contiguous:
            a = a.copy(order='C')
        # A C-ordered (m, n) buffer is handed over as an (n, m) matrix with
        # leading dimension ``n``, so the side is flipped (``1 - side``).
        func(handle, 1 - side, n, m, a.data.ptr, n, x.data.ptr, incx,
             out.data.ptr, n)
    else:
        if not a._f_contiguous:
            a = a.copy(order='F')
        c = out
        if not out._f_contiguous:
            # Compute into an F-ordered scratch copy and write back below.
            c = out.copy(order='F')
        func(handle, side, m, n, a.data.ptr, m, x.data.ptr, incx,
             c.data.ptr, m)
        if not out._f_contiguous:
            _core.elementwise_copy(c, out)
    return out
def fourier_gaussian(input, sigma, n=-1, axis=-1, output=None):
    """Multidimensional Gaussian shift filter.

    The array is multiplied with the Fourier transform of a (separable)
    Gaussian kernel.

    Args:
        input (cupy.ndarray): The input array.
        sigma (float or sequence of float):  The sigma of the Gaussian kernel.
            If a float, `sigma` is the same for all axes. If a sequence,
            `sigma` has to contain one value for each axis.
        n (int, optional):  If `n` is negative (default), then the input is
            assumed to be the result of a complex fft. If `n` is larger than or
            equal to zero, the input is assumed to be the result of a real fft,
            and `n` gives the length of the array before transformation along
            the real transform direction.
        axis (int, optional): The axis of the real transform (only used when
            ``n > -1``).
        output (cupy.ndarray, optional):
            If given, the result of shifting the input is placed in this array.

    Returns:
        output (cupy.ndarray): The filtered output.
    """
    ndim = input.ndim
    output = _get_output_fourier(output, input)
    axis = internal._normalize_axis_index(axis, ndim)
    sigmas = _util._fix_sequence_arg(sigma, ndim, 'sigma')

    _core.elementwise_copy(input, output)
    for ax, (sigmak, ax_size) in enumerate(zip(sigmas, output.shape)):

        # compute the frequency grid in Hz
        if ax == axis and n > 0:
            # real-transform axis: frequencies are k / n
            arr = cupy.arange(ax_size, dtype=output.real.dtype)
            arr /= n
        else:
            arr = cupy.fft.fftfreq(ax_size)
        arr = arr.astype(output.real.dtype, copy=False)

        # compute the Gaussian weights
        arr *= arr
        scale = sigmak * sigmak / -2
        arr *= (4 * numpy.pi * numpy.pi) * scale
        cupy.exp(arr, out=arr)

        # reshape for broadcasting
        arr = _reshape_nd(arr, ndim=ndim, axis=ax)
        output *= arr

    return output
def tile(A, reps):
    """Construct an array by repeating A the number of times given by reps.

    Args:
        A (cupy.ndarray): Array to transform.
        reps (int or tuple): The number of repeats.

    Returns:
        cupy.ndarray: Transformed array with repeats.

    .. seealso:: :func:`numpy.tile`

    """
    try:
        tup = tuple(reps)
    except TypeError:
        # ``reps`` is a single number rather than a sequence.
        tup = (reps,)
    d = len(tup)
    if tup.count(1) == len(tup) and isinstance(A, cupy.ndarray):
        # Fixes the problem that the function does not make a copy if A is a
        # array and the repetitions are 1 in all dimensions
        return cupy.array(A, copy=True, ndmin=d)
    else:
        # Note that no copy of zero-sized arrays is made. However since they
        # have no data there is no risk of an inadvertent overwrite.
        c = cupy.array(A, copy=False, ndmin=d)
    if d < c.ndim:
        tup = (1,) * (c.ndim - d) + tup
    shape_out = tuple(s * t for s, t in zip(c.shape, tup))
    if c.size == 0:
        return cupy.empty(shape_out, dtype=c.dtype)
    # Build a pair of broadcast-compatible shapes so that one elementwise
    # copy performs all the repetitions at once.
    c_shape = []
    ret_shape = []
    for dim_in, nrep in zip(c.shape, tup):
        if nrep == 1:
            # No repetition along this axis.
            c_shape.append(dim_in)
            ret_shape.append(dim_in)
        elif dim_in == 1:
            # A length-1 axis broadcasts directly to ``nrep``.
            c_shape.append(dim_in)
            ret_shape.append(nrep)
        else:
            # Insert a leading length-1 axis that broadcasts to ``nrep``.
            c_shape.append(1)
            c_shape.append(dim_in)
            ret_shape.append(nrep)
            ret_shape.append(dim_in)
    ret = cupy.empty(ret_shape, dtype=c.dtype)
    if ret.size:
        _core.elementwise_copy(c.reshape(c_shape), ret)
    return ret.reshape(shape_out)
def fourier_shift(input, shift, n=-1, axis=-1, output=None):
    """Multidimensional Fourier shift filter.

    The array is multiplied with the Fourier transform of a shift operation.

    Args:
        input (cupy.ndarray): The input array. This should be in the Fourier
            domain.
        shift (float or sequence of float):  The size of shift. If a float,
            `shift` is the same for all axes. If a sequence, `shift` has to
            contain one value for each axis.
        n (int, optional):  If `n` is negative (default), then the input is
            assumed to be the result of a complex fft. If `n` is larger than or
            equal to zero, the input is assumed to be the result of a real fft,
            and `n` gives the length of the array before transformation along
            the real transform direction.
        axis (int, optional): The axis of the real transform (only used when
            ``n > -1``).
        output (cupy.ndarray, optional):
            If given, the result of shifting the input is placed in this array.

    Returns:
        output (cupy.ndarray): The shifted output (in the Fourier domain).
    """
    ndim = input.ndim
    output = _get_output_fourier(output, input, complex_only=True)
    axis = internal._normalize_axis_index(axis, ndim)
    shifts = _util._fix_sequence_arg(shift, ndim, 'shift')

    _core.elementwise_copy(input, output)
    for ax, (shiftk, ax_size) in enumerate(zip(shifts, output.shape)):
        if shiftk == 0:
            # A zero shift multiplies by exp(0) == 1; skip the axis.
            continue
        if ax == axis and n > 0:
            # cp.fft.rfftfreq(ax_size) * (-2j * numpy.pi * shiftk *  ax_size/n)
            arr = cupy.arange(ax_size, dtype=output.dtype)
            arr *= -2j * numpy.pi * shiftk / n
        else:
            arr = cupy.fft.fftfreq(ax_size)
            arr = arr * (-2j * numpy.pi * shiftk)
        cupy.exp(arr, out=arr)

        # reshape for broadcasting
        arr = _reshape_nd(arr, ndim=ndim, axis=ax)
        output *= arr

    return output
    def sum(self, axis=None, dtype=None, out=None):
        """Sums the matrix elements over a given axis.

        Args:
            axis (int or ``None``): Axis along which the sum is computed.
                If it is ``None``, it computes the sum of all the elements.
                Select from ``{None, 0, 1, -2, -1}``.
            dtype: The type of returned matrix. If it is not specified, type
                of the array is used.
            out (cupy.ndarray): Output matrix.

        Returns:
            cupy.ndarray: Summed array.

        Raises:
            ValueError: If ``out`` is given and its shape differs from the
                shape of the result.

        .. seealso::
           :meth:`scipy.sparse.spmatrix.sum`

        """
        # This implementation uses multiplication, though it is not efficient
        # for some matrix types. These should override this function.

        m, n = self.shape

        if axis is None:
            # Row sums via a product with a ones vector, then reduce those.
            return self.dot(cupy.ones(n, dtype=self.dtype)).sum(dtype=dtype,
                                                                out=out)

        if axis < 0:
            axis += 2

        if axis == 0:
            ret = self.T.dot(cupy.ones(m, dtype=self.dtype)).reshape(1, n)
        else:  # axis == 1
            ret = self.dot(cupy.ones(n, dtype=self.dtype)).reshape(m, 1)

        if out is not None:
            if out.shape != ret.shape:
                raise ValueError('dimensions do not match')
            _core.elementwise_copy(ret, out)
            return out
        elif dtype is not None:
            return ret.astype(dtype, copy=False)
        else:
            return ret
# NOTE(review): everything after ``kernel`` in the signature was lost from the
# source. The parameters below are reconstructed from the names the body and
# docstring use -- confirm against the call sites.
def _call_kernel(kernel, input, weights, output, structure=None,
                 weights_dtype=numpy.float64, structure_dtype=numpy.float64):
    """
    Calls a constructed ElementwiseKernel. The kernel must take an input image,
    an optional array of weights, an optional array for the structure, and an
    output array.

    weights and structure can be given as None (structure defaults to None) in
    which case they are not passed to the kernel at all. If the output is given
    as None then it will be allocated in this function.

    This function deals with making sure that the weights and structure are
    contiguous and float64 (or bool for weights that are footprints)*, that the
    output is allocated and appropriately shaped. This also deals with the
    situation that the input and output arrays overlap in memory.

    * weights is always cast to float64 or bool in order to get an output
    compatible with SciPy, though float32 might be sufficient when input dtype
    is low precision. If weights_dtype is passed as weights.dtype then no
    dtype conversion will occur. The input and output are never converted.
    """
    args = [input]
    complex_output = input.dtype.kind == 'c'
    if weights is not None:
        weights = cupy.ascontiguousarray(weights, weights_dtype)
        complex_output = complex_output or weights.dtype.kind == 'c'
        args.append(weights)
    if structure is not None:
        structure = cupy.ascontiguousarray(structure, structure_dtype)
        args.append(structure)
    output = _util._get_output(output, input, None, complex_output)
    needs_temp = cupy.shares_memory(output, input, 'MAY_SHARE_BOUNDS')
    if needs_temp:
        # ``temp`` keeps the caller-visible array; the kernel writes into a
        # fresh buffer so it never reads input it has already overwritten.
        output, temp = _util._get_output(output.dtype, input), output
    args.append(output)
    kernel(*args)
    if needs_temp:
        # Copy the computed result (src) into the caller-visible array (dst).
        # The original passed the arguments the other way round, which
        # overwrote the result and returned an array that was never filled.
        _core.elementwise_copy(output, temp)
        output = temp
    return output
def spline_filter(input, order=3, output=cupy.float64, mode='mirror'):
    """Multidimensional spline filter.

    Args:
        input (cupy.ndarray): The input array.
        order (int): The order of the spline interpolation, default is 3. Must
            be in the range 2-5.
        output (cupy.ndarray or dtype, optional): The array in which to place
            the output, or the dtype of the returned array. Default is
            ``numpy.float64``.
        mode (str): Points outside the boundaries of the input are filled
            according to the given mode (``'constant'``, ``'nearest'``,
            ``'mirror'``, ``'reflect'``, ``'wrap'``, ``'grid-mirror'``,
            ``'grid-wrap'``, ``'grid-constant'`` or ``'opencv'``).

    Returns:
        cupy.ndarray: The result of prefiltering the input.

    Raises:
        RuntimeError: If ``order`` is smaller than 2 or larger than 5.

    .. seealso:: :func:`scipy.ndimage.spline_filter`
    """
    if order < 2 or order > 5:
        raise RuntimeError('spline order not supported')

    x = input
    temp, _, output_dtype = _get_spline_output(x, output)
    # ``order`` is already known to be >= 2 here, so only 0-dim inputs skip
    # the per-axis filtering.
    if input.ndim > 0:
        for axis in range(x.ndim):
            spline_filter1d(x, order, axis, output=temp, mode=mode)
            # After the first axis the workspace holds the running result.
            x = temp
    if isinstance(output, cupy.ndarray):
        _core.elementwise_copy(temp, output)
    else:
        output = temp
    if output.dtype != output_dtype:
        output = output.astype(output_dtype)
    return output
def _get_spline_output(input, output):
    """Create workspace array, temp, and the final dtype for the output.

    Differs from SciPy by not always forcing the internal floating point dtype
    to be double precision.

    Args:
        input (cupy.ndarray): The array to be filtered.
        output (cupy.ndarray, dtype or None): Requested output array or
            dtype. ``None`` means "same dtype as ``input``".

    Returns:
        tuple: ``(temp, float_dtype, output_dtype)`` where ``temp`` is a
        C-contiguous workspace of dtype ``float_dtype`` holding the values
        of ``input``, and ``output_dtype`` is the dtype requested for the
        final result.

    Raises:
        ValueError: If ``input`` is complex and ``output`` is a non-complex
            array.
    """
    complex_data = input.dtype.kind == 'c'
    if complex_data:
        min_float_dtype = cupy.complex64
    else:
        min_float_dtype = cupy.float32
    if isinstance(output, cupy.ndarray):
        if complex_data and output.dtype.kind != 'c':
            raise ValueError(
                'output must have complex dtype for complex inputs')
        float_dtype = cupy.promote_types(output.dtype, min_float_dtype)
        output_dtype = output.dtype
    else:
        if output is None:
            output = output_dtype = input.dtype
        else:
            output_dtype = cupy.dtype(output)
        float_dtype = cupy.promote_types(output, min_float_dtype)

    if (isinstance(output, cupy.ndarray)
            and output.dtype == float_dtype == output_dtype
            and output.flags.c_contiguous):
        # The caller's array can serve as the workspace directly.
        if output is not input:
            _core.elementwise_copy(input, output)
        temp = output
    else:
        temp = input.astype(float_dtype, copy=False)
        temp = cupy.ascontiguousarray(temp)
        if cupy.shares_memory(temp, input, 'MAY_SHARE_BOUNDS'):
            # Neither conversion above made a copy; copy now so the
            # workspace does not alias the caller's input.
            temp = temp.copy()
    return temp, float_dtype, output_dtype
def copyto(dst, src, casting='same_kind', where=None):
    """Copies values from one array to another with broadcasting.

    This function can be called for arrays on different devices. In this case,
    casting, ``where``, and broadcasting is not supported, and an exception is
    raised if these are used.

    Args:
        dst (cupy.ndarray): Target array.
        src (cupy.ndarray): Source array.
        casting (str): Casting rule. See :func:`numpy.can_cast` for detail.
        where (cupy.ndarray of bool): If specified, this array acts as a mask,
            and an element is copied only if the corresponding element of
            ``where`` is True.

    Raises:
        TypeError: If ``src`` cannot be cast to ``dst.dtype`` under
            ``casting``.
        ValueError: If ``src`` is a NumPy array with more than one element,
            or if the shape of ``src`` cannot be broadcast to ``dst``.

    .. seealso:: :func:`numpy.copyto`

    """
    src_is_numpy_scalar = False

    src_type = type(src)
    src_is_python_scalar = src_type in (
        int, bool, float, complex,
        fusion._FusionVarScalar, _fusion_interface._ScalarProxy)
    if src_is_python_scalar:
        src_dtype = numpy.dtype(type(src))
        can_cast = numpy.can_cast(src, dst.dtype, casting)
    elif isinstance(src, numpy.ndarray) or numpy.isscalar(src):
        if src.size != 1:
            raise ValueError(
                'non-scalar numpy.ndarray cannot be used for copyto')
        src_dtype = src.dtype
        can_cast = numpy.can_cast(src, dst.dtype, casting)
        # From here on a one-element NumPy value is treated as a scalar.
        src = src.item()
        src_is_numpy_scalar = True
    else:
        src_dtype = src.dtype
        can_cast = numpy.can_cast(src_dtype, dst.dtype, casting)

    if not can_cast:
        raise TypeError('Cannot cast %s to %s in %s casting mode' %
                        (src_dtype, dst.dtype, casting))

    if fusion._is_fusing():
        # TODO(kataoka): NumPy allows stripping leading unit dimensions.
        # But fusion array proxy does not currently support
        # `shape` and `squeeze`.

        if where is None:
            _core.elementwise_copy(src, dst)
        else:
            fusion._call_ufunc(search._where_ufunc, where, src, dst, dst)
        return

    if not src_is_python_scalar and not src_is_numpy_scalar:
        # Check broadcast condition
        # - for fast-paths and
        # - for a better error message (than ufunc's).
        # NumPy allows stripping leading unit dimensions.
        if not all(
            s in (d, 1)
            for s, d in itertools.zip_longest(
                reversed(src.shape), reversed(dst.shape), fillvalue=1)
        ):
            raise ValueError(
                "could not broadcast input array "
                f"from shape {src.shape} into shape {dst.shape}")
        squeeze_ndim = src.ndim - dst.ndim
        if squeeze_ndim > 0:
            # always succeeds because broadcast condition is checked.
            src = src.squeeze(tuple(range(squeeze_ndim)))

    if where is not None:
        _core.elementwise_copy(src, dst, _where=where)
        return

    if dst.size == 0:
        # Nothing to write.
        return

    if src_is_python_scalar or src_is_numpy_scalar:
        _core.elementwise_copy(src, dst)
        return

    if _can_memcpy(dst, src):
        dst.data.copy_from_async(src.data, src.nbytes)
        return

    device = dst.device
    prev_device = runtime.getDevice()
    try:
        # Run the copy on the destination's device, and always switch back
        # to the previously active device afterwards.
        runtime.setDevice(device.id)
        if src.device != device:
            src = src.copy()
        _core.elementwise_copy(src, dst)
    finally:
        runtime.setDevice(prev_device)
def label(input, structure=None, output=None):
    """Labels features in an array.

    Args:
        input (cupy.ndarray): The input array.
        structure (array_like or None): A structuring element that defines
            feature connections. ``structure`` must be centersymmetric. If
            None, structure is automatically generated with a squared
            connectivity equal to one.
        output (cupy.ndarray, dtype or None): The array in which to place the
            output.

    Returns:
        label (cupy.ndarray): An integer array where each unique feature in
        ``input`` has a unique label in the array. Only returned when
        ``output`` was not given as an array.

        num_features (int): Number of features found.

    Raises:
        TypeError: If ``input`` is not a ``cupy.ndarray`` or is complex.
        RuntimeError: If ``structure`` and ``input`` differ in rank.
        ValueError: If any dimension of ``structure`` is not 3, or if the
            given ``output`` array has the wrong shape.

    .. warning::

        This function may synchronize the device.

    .. seealso:: :func:`scipy.ndimage.label`
    """
    if not isinstance(input, cupy.ndarray):
        raise TypeError('input must be cupy.ndarray')
    if input.dtype.char in 'FD':
        raise TypeError('Complex type not supported')
    if structure is None:
        structure = _generate_binary_structure(input.ndim, 1)
    elif isinstance(structure, cupy.ndarray):
        structure = cupy.asnumpy(structure)
    structure = numpy.array(structure, dtype=bool)
    if structure.ndim != input.ndim:
        raise RuntimeError('structure and input must have equal rank')
    if any(extent != 3 for extent in structure.shape):
        raise ValueError('structure dimensions must be equal to 3')

    if isinstance(output, cupy.ndarray):
        if output.shape != input.shape:
            raise ValueError("output shape not correct")
        caller_provided_output = True
    else:
        caller_provided_output = False
        if output is None:
            output = cupy.empty(input.shape, numpy.int32)
        else:
            # ``output`` is a dtype here.
            output = cupy.empty(input.shape, output)

    if input.size == 0:
        # empty
        maxlabel = 0
    elif input.ndim == 0:
        # 0-dim array
        maxlabel = 0 if input.item() == 0 else 1
        # The single element is either background (0) or feature 1;
        # without this write ``output`` would stay uninitialized.
        output.fill(maxlabel)
    else:
        # ``_label`` is given an int32 array; use a scratch one when the
        # requested output has another dtype and cast afterwards.
        if output.dtype != numpy.int32:
            y = cupy.empty(input.shape, numpy.int32)
        else:
            y = output
        maxlabel = _label(input, structure, y)
        if output.dtype != numpy.int32:
            _core.elementwise_copy(y, output)

    if caller_provided_output:
        return maxlabel
    else:
        return output, maxlabel
# NOTE(review): the parameter list after ``input`` was lost from the source.
# It is reconstructed from the docstring and the names the body uses;
# confirm the defaults against the call sites.
def spline_filter1d(input, order=3, axis=-1, output=cupy.float64,
                    mode='mirror'):
    """
    Calculate a 1-D spline filter along the given axis.

    The lines of the array along the given axis are filtered by a
    spline filter. The order of the spline must be >= 2 and <= 5.

    Args:
        input (cupy.ndarray): The input array.
        order (int): The order of the spline interpolation, default is 3. Must
            be in the range 0-5.
        axis (int): The axis along which the spline filter is applied. Default
            is the last axis.
        output (cupy.ndarray or dtype, optional): The array in which to place
            the output, or the dtype of the returned array. Default is
            ``numpy.float64``.
        mode (str): Points outside the boundaries of the input are filled
            according to the given mode (``'constant'``, ``'nearest'``,
            ``'mirror'``, ``'reflect'``, ``'wrap'``, ``'grid-mirror'``,
            ``'grid-wrap'``, ``'grid-constant'`` or ``'opencv'``).

    Returns:
        cupy.ndarray: The result of prefiltering the input.

    Raises:
        RuntimeError: If ``order`` is negative or larger than 5.

    .. seealso:: :func:`scipy.ndimage.spline_filter1d`
    """
    if order < 0 or order > 5:
        raise RuntimeError('spline order not supported')
    x = input
    ndim = x.ndim
    axis = internal._normalize_axis_index(axis, ndim)

    # order 0, 1 don't require reshaping as no CUDA kernel will be called
    # scalar or size 1 arrays also don't need to be filtered
    run_kernel = not (order < 2 or x.ndim == 0 or x.shape[axis] == 1)
    if not run_kernel:
        output = _util._get_output(output, input)
        _core.elementwise_copy(x, output)
        return output

    temp, _, output_dtype = _get_spline_output(x, output)
    data_type = cupy._core._scalar.get_typename(temp.dtype)
    pole_type = cupy._core._scalar.get_typename(temp.real.dtype)

    index_type = _util._get_inttype(input)
    index_dtype = cupy.int32 if index_type == 'int' else cupy.int64

    n_samples = x.shape[axis]
    n_signals = x.size // n_samples
    info = cupy.array((n_signals, n_samples) + x.shape, dtype=index_dtype)

    # empirical choice of block size that seemed to work well
    block_size = max(2**math.ceil(numpy.log2(n_samples / 32)), 8)
    # NOTE(review): the arguments of this call were lost from the source;
    # they are reconstructed from the values prepared above -- confirm
    # against ``get_raw_spline1d_kernel``.
    kern = _spline_prefilter_core.get_raw_spline1d_kernel(
        axis,
        ndim,
        mode,
        order=order,
        index_type=index_type,
        data_type=data_type,
        pole_type=pole_type,
        block_size=block_size,
    )

    # Due to recursive nature, a given line of data must be processed by a
    # single thread. n_signals lines will be processed in total.
    block = (block_size, )
    grid = ((n_signals + block[0] - 1) // block[0], )

    # apply prefilter gain
    poles = _spline_prefilter_core.get_poles(order=order)
    temp *= _spline_prefilter_core.get_gain(poles)

    # apply causal + anti-causal IIR spline filters
    kern(grid, block, (temp, info))

    if isinstance(output, cupy.ndarray) and temp is not output:
        # copy kernel output into the user-provided output array
        _core.elementwise_copy(temp, output)
        return output
    return temp.astype(output_dtype, copy=False)
def geam(transa, transb, alpha, a, beta, b, out=None):
    """Computes alpha * op(a) + beta * op(b)

    op(a) = a if transa is 'N', op(a) = a.T if transa is 'T',
    op(a) = a.T.conj() if transa is 'H'.
    op(b) = b if transb is 'N', op(b) = b.T if transb is 'T',
    op(b) = b.T.conj() if transb is 'H'.

    Args:
        transa: Operation applied to ``a`` (see above).
        transb: Operation applied to ``b`` (see above).
        alpha: Scalar multiplier for ``op(a)``.
        a (cupy.ndarray): 2D input matrix.
        beta: Scalar multiplier for ``op(b)``.
        b (cupy.ndarray): 2D input matrix with the same dtype as ``a``.
        out (cupy.ndarray, optional): 2D output with the shape of ``op(a)``
            and the dtype of ``a``. An F-ordered array is allocated when
            omitted.

    Returns:
        cupy.ndarray: ``out``.

    Raises:
        TypeError: If the dtype char of ``a`` is not one of ``'fdFD'``.
    """
    assert a.ndim == b.ndim == 2
    assert a.dtype == b.dtype
    dtype = a.dtype.char
    if dtype == 'f':
        func = cublas.sgeam
    elif dtype == 'd':
        func = cublas.dgeam
    elif dtype == 'F':
        func = cublas.cgeam
    elif dtype == 'D':
        func = cublas.zgeam
    else:
        raise TypeError('invalid dtype')

    transa = _trans_to_cublas_op(transa)
    transb = _trans_to_cublas_op(transb)
    # (m, n) is the shape of op(a), which op(b) and ``out`` must match.
    if transa == cublas.CUBLAS_OP_N:
        m, n = a.shape
    else:
        n, m = a.shape
    if transb == cublas.CUBLAS_OP_N:
        assert b.shape == (m, n)
    else:
        assert b.shape == (n, m)
    if out is None:
        out = cupy.empty((m, n), dtype=dtype, order='F')
    else:
        assert out.ndim == 2
        assert out.shape == (m, n)
        assert out.dtype == dtype

    alpha, alpha_ptr = _get_scalar_ptr(alpha, a.dtype)
    beta, beta_ptr = _get_scalar_ptr(beta, a.dtype)
    handle = device.get_cublas_handle()
    orig_mode = cublas.getPointerMode(handle)
    if isinstance(alpha, cupy.ndarray) or isinstance(beta, cupy.ndarray):
        # If either scalar is a device array, both are passed as device
        # pointers: wrap the other one in an array as well.
        if not isinstance(alpha, cupy.ndarray):
            alpha = cupy.array(alpha)
            alpha_ptr = alpha.data.ptr
        if not isinstance(beta, cupy.ndarray):
            beta = cupy.array(beta)
            beta_ptr = beta.data.ptr
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_DEVICE)
    else:
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_HOST)

    # One try/finally covers every path below, so the handle's pointer mode
    # is restored even if a layout helper or a copy raises before the cuBLAS
    # call is reached.
    try:
        lda, transa = _decide_ld_and_trans(a, transa)
        ldb, transb = _decide_ld_and_trans(b, transb)
        if not (lda is None or ldb is None):
            if out._f_contiguous:
                func(handle, transa, transb, m, n, alpha_ptr, a.data.ptr,
                     lda, beta_ptr, b.data.ptr, ldb, out.data.ptr, m)
                return out
            elif out._c_contiguous:
                # Computes alpha * a.T + beta * b.T
                func(handle, 1-transa, 1-transb, n, m, alpha_ptr, a.data.ptr,
                     lda, beta_ptr, b.data.ptr, ldb, out.data.ptr, n)
                return out

        # General path: fix up the input layouts where needed and compute
        # into an F-ordered buffer.
        a, lda = _change_order_if_necessary(a, lda)
        b, ldb = _change_order_if_necessary(b, ldb)
        c = out
        if not out._f_contiguous:
            c = out.copy(order='F')
        func(handle, transa, transb, m, n, alpha_ptr, a.data.ptr, lda,
             beta_ptr, b.data.ptr, ldb, c.data.ptr, m)
    finally:
        cublas.setPointerMode(handle, orig_mode)
    if not out._f_contiguous:
        # The result was computed in the scratch buffer; write it back.
        _core.elementwise_copy(c, out)
    return out
def _binary_erosion(input, structure, iterations, mask, output, border_value,
                    origin, invert, brute_force=True):
        iterations = operator.index(iterations)
    except TypeError:
        raise TypeError('iterations parameter should be an integer')

    if input.dtype.kind == 'c':
        raise TypeError('Complex type not supported')
    if structure is None:
        structure = generate_binary_structure(input.ndim, 1)
        all_weights_nonzero = input.ndim == 1
        center_is_true = True
        default_structure = True
        structure = structure.astype(dtype=bool, copy=False)
        # transfer to CPU for use in determining if it is fully dense
        # structure_cpu = cupy.asnumpy(structure)
        default_structure = False
    if structure.ndim != input.ndim:
        raise RuntimeError('structure and input must have same dimensionality')
    if not structure.flags.c_contiguous:
        structure = cupy.ascontiguousarray(structure)
    if structure.size < 1:
        raise RuntimeError('structure must not be empty')

    if mask is not None:
        if mask.shape != input.shape:
            raise RuntimeError('mask and input must have equal sizes')
        if not mask.flags.c_contiguous:
            mask = cupy.ascontiguousarray(mask)
        masked = True
        masked = False
    origin = _util._fix_sequence_arg(origin, input.ndim, 'origin', int)

    if isinstance(output, cupy.ndarray):
        if output.dtype.kind == 'c':
            raise TypeError('Complex output type not supported')
        output = bool
    output = _util._get_output(output, input)
    temp_needed = cupy.shares_memory(output, input, 'MAY_SHARE_BOUNDS')
    if temp_needed:
        # input and output arrays cannot share memory
        temp = output
        output = _util._get_output(output.dtype, input)
    if structure.ndim == 0:
        # kernel doesn't handle ndim=0, so special case it here
        if float(structure):
            output[...] = cupy.asarray(input, dtype=bool)
            output[...] = ~cupy.asarray(input, dtype=bool)
        return output
    origin = tuple(origin)
    int_type = _util._get_inttype(input)
    offsets = _filters_core._origins_to_offsets(origin, structure.shape)
    if not default_structure:
        # synchronize required to determine if all weights are non-zero
        nnz = int(cupy.count_nonzero(structure))
        all_weights_nonzero = nnz == structure.size
        if all_weights_nonzero:
            center_is_true = True
            center_is_true = _center_is_true(structure, origin)

    erode_kernel = _get_binary_erosion_kernel(
        structure.shape, int_type, offsets, center_is_true, border_value,
        invert, masked, all_weights_nonzero,

    if iterations == 1:
        if masked:
            output = erode_kernel(input, structure, mask, output)
            output = erode_kernel(input, structure, output)
    elif center_is_true and not brute_force:
        raise NotImplementedError(
            'only brute_force iteration has been implemented'
        if cupy.shares_memory(output, input, 'MAY_SHARE_BOUNDS'):
            raise ValueError('output and input may not overlap in memory')
        tmp_in = cupy.empty_like(input, dtype=output.dtype)
        tmp_out = output
        if iterations >= 1 and not iterations & 1:
            tmp_in, tmp_out = tmp_out, tmp_in
        if masked:
            tmp_out = erode_kernel(input, structure, mask, tmp_out)
            tmp_out = erode_kernel(input, structure, tmp_out)
        # TODO: kernel doesn't return the changed status, so determine it here
        changed = not (input == tmp_out).all()  # synchronize!
        ii = 1
        while ii < iterations or ((iterations < 1) and changed):
            tmp_in, tmp_out = tmp_out, tmp_in
            if masked:
                tmp_out = erode_kernel(tmp_in, structure, mask, tmp_out)
                tmp_out = erode_kernel(tmp_in, structure, tmp_out)
            changed = not (tmp_in == tmp_out).all()
            ii += 1
            if not changed and (not ii & 1):  # synchronize!
                # can exit early if nothing changed
                # (only do this after even number of tmp_in/out swaps)
        output = tmp_out
    if temp_needed:
        _core.elementwise_copy(output, temp)
        output = temp
    return output
def fourier_ellipsoid(input, size, n=-1, axis=-1, output=None):
    """Multidimensional ellipsoid Fourier filter.

    The array is multiplied with the Fourier transform of an ellipsoid of
    given sizes.

    Args:
        input (cupy.ndarray): The input array.
        size (float or sequence of float):  The size of the ellipsoid used
            for filtering. If a float, `size` is the same for all axes. If a
            sequence, `size` has to contain one value for each axis.
        n (int, optional):  If `n` is negative (default), then the input is
            assumed to be the result of a complex fft. Otherwise the input is
            assumed to be the result of a real fft, and `n` gives the length
            of the array before transformation along the real transform
            direction. Note that this implementation only switches to the
            real-transform frequency grid when ``n > 0``.
        axis (int, optional): The axis of the real transform (only used when
            ``n > 0``).
        output (cupy.ndarray, optional):
            If given, the result of filtering the input is placed in this
            array.

    Returns:
        output (cupy.ndarray): The filtered output.

    Raises:
        NotImplementedError: If ``input`` has more than 3 dimensions.
    """
    ndim = input.ndim
    if ndim == 1:
        # a 1-D ellipsoid is handled by the uniform filter
        return fourier_uniform(input, size, n, axis, output)

    if ndim > 3:
        # Note: SciPy currently does not do any filtering on >=4d inputs, but
        #       does not warn about this!
        raise NotImplementedError('Only 1d, 2d and 3d inputs are supported')
    output = _get_output_fourier(output, input)
    axis = internal._normalize_axis_index(axis, ndim)
    sizes = _util._fix_sequence_arg(size, ndim, 'size')

    _core.elementwise_copy(input, output)

    # compute the distance from the origin for all samples in Fourier space
    distance = 0
    for ax, (ax_ellipsoid_size, ax_size) in enumerate(
            zip(sizes, output.shape)):
        # compute the frequency grid in Hz
        if ax == axis and n > 0:
            # real-transform axis: frequencies are index / n
            arr = cupy.arange(ax_size, dtype=output.real.dtype)
            arr *= numpy.pi * ax_ellipsoid_size / n
        else:
            arr = cupy.fft.fftfreq(ax_size)
            arr *= numpy.pi * ax_ellipsoid_size
        arr = arr.astype(output.real.dtype, copy=False)
        arr *= arr
        arr = _reshape_nd(arr, ndim=ndim, axis=ax)
        distance = distance + arr
    cupy.sqrt(distance, out=distance)

    if ndim == 2:
        # 2 * J1(d) / d
        special.j1(distance, out=output)
        output *= 2
        output /= distance
    elif ndim == 3:
        # 3 * (sin(d) - d * cos(d)) / d**3
        cupy.sin(distance, out=output)
        output -= distance * cupy.cos(distance)
        output *= 3
        output /= distance**3
    output[(0, ) * ndim] = 1.0  # avoid NaN in corner at frequency=0 location
    output *= input

    return output