Python nrm2示例，cupy.cublas.nrm2 Python示例

示例#1

0

显示文件

def _lanczos_asis(a, V, u, alpha, beta, i_start, i_end):
    """Run the straightforward Lanczos update for steps ``i_start..i_end-1``.

    All array arguments are modified in place; nothing is returned.

    Args:
        a: Operand applied with ``@`` to a row of ``V``.
        V: Array whose rows hold the Lanczos vectors. Row ``i + 1`` is
            written on every step except the last one.
        u: Work vector, overwritten on every step.
        alpha: ``alpha[i]`` receives ``cublas.dotc(V[i], u)``.
        beta: ``beta[i]`` receives ``cublas.nrm2(u)`` after the projection
            onto ``V[:i + 1]`` has been subtracted from ``u``.
        i_start (int): First step index (inclusive).
        i_end (int): Last step index (exclusive).
    """
    final_step = i_end - 1
    for step in range(i_start, i_end):
        v_cur = V[step]
        basis = V[:step + 1]
        u[...] = a @ v_cur
        cublas.dotc(v_cur, u, out=alpha[step])
        # Subtract from u its components along every row generated so far.
        u -= u.T @ basis.conj().T @ basis
        cublas.nrm2(u, out=beta[step])
        # The final step only produces alpha/beta; there is no next row.
        if step < final_step:
            V[step + 1] = u / beta[step]

示例#2

0

显示文件

文件： _eigen.py 项目： neveroldmilk/cupy

 def _update_asis(self, i_start, i_end):
     """Run the straightforward Lanczos update on the instance state.

     For each step ``i`` in ``range(i_start, i_end)`` this computes
     ``u = self.A @ self.V[i]``, stores ``dotc(self.V[i], u)`` in
     ``self.alpha[i]``, subtracts from ``u`` its projection onto
     ``self.V[:i + 1]`` and stores the 2-norm of the result in
     ``self.beta[i]``. Except on the last step, ``u / self.beta[i]`` is
     written to ``self.V[i + 1]``.

     Returns:
         The work vector ``u`` of the last executed step.
     """
     last_step = i_end - 1
     for step in range(i_start, i_end):
         v_cur = self.V[step]
         seen = self.V[:step + 1]
         u = self.A @ v_cur
         cublas.dotc(v_cur, u, out=self.alpha[step])
         # Subtract the components of u along all rows generated so far.
         u -= u.T @ seen.conj().T @ seen
         cublas.nrm2(u, out=self.beta[step])
         # No next row exists after the final step.
         if step < last_step:
             self.V[step + 1] = u / self.beta[step]
     return u

示例#3

0

显示文件

文件： test_cublas.py 项目： toslunar/cupy

 def test_nrm2(self):
     """Compare ``cublas.nrm2`` with ``cupy.linalg.norm`` on one vector."""
     vec = self._make_random_vector()
     expected = cupy.linalg.norm(vec)
     # The output buffer uses the lower-cased dtype char of the test dtype.
     out_buf = self._make_out(self.dtype.char.lower())
     actual = cublas.nrm2(vec, out=out_buf)
     self._check_pointer(actual, out_buf)
     cupy.testing.assert_allclose(
         actual, expected, rtol=self.tol, atol=self.tol)

示例#4

0

显示文件

def _lanczos_asis(a, V, u, alpha, beta, i_start, i_end):
    """Run the straightforward Lanczos update with a small-``beta`` guard.

    All array arguments are modified in place; nothing is returned.

    Args:
        a: Operand applied with ``@`` to a row of ``V``; its ``dtype`` is
            passed to ``inversion_eps`` to obtain the initial threshold.
        V: Array whose rows hold the Lanczos vectors.
        u: Work vector, overwritten on every step.
        alpha: ``alpha[i]`` receives ``cublas.dotc(V[i], u)``.
        beta: ``beta[i]`` receives ``cublas.nrm2(u)`` after the projection
            onto ``V[:i + 1]`` has been subtracted from ``u``.
        i_start (int): First step index (inclusive).
        i_end (int): Last step index (exclusive).
    """
    threshold = inversion_eps(a.dtype)
    final_step = i_end - 1
    for step in range(i_start, i_end):
        v_cur = V[step]
        basis = V[:step + 1]
        u[...] = a @ v_cur
        cublas.dotc(v_cur, u, out=alpha[step])
        # Subtract from u its components along every row generated so far.
        u -= u.T @ basis.conj().T @ basis
        cublas.nrm2(u, out=beta[step])
        if step >= final_step:
            # Final step: alpha/beta are done and there is no next row.
            break

        if beta[step] < threshold:
            # beta is too small to divide by: clear the remaining rows and
            # the work vector, then stop iterating.
            V[step + 1:i_end, :] = 0
            u[...] = 0
            break
        if step == i_start:
            # After the first step the threshold is scaled by that beta.
            threshold *= beta[step]

        V[step + 1] = u / beta[step]

示例#5

0

显示文件

def eigsh(a,
          k=6,
          *,
          which='LM',
          ncv=None,
          maxiter=None,
          tol=0,
          return_eigenvectors=True):
    """Finds ``k`` eigenvalues and eigenvectors of the real symmetric matrix.

    Solves ``Ax = wx``, the standard eigenvalue problem for ``w`` eigenvalues
    with corresponding eigenvectors ``x``.

    Args:
        a (ndarray, spmatrix or LinearOperator): A symmetric square matrix with
            dimension ``(n, n)``. ``a`` must be :class:`cupy.ndarray`,
            :class:`cupyx.scipy.sparse.spmatrix` or
            :class:`cupyx.scipy.sparse.linalg.LinearOperator`.
        k (int): The number of eigenvalues and eigenvectors to compute. Must be
            ``1 <= k < n``.
        which (str): 'LM' or 'LA'. 'LM': finds ``k`` largest (in magnitude)
            eigenvalues. 'LA': finds ``k`` largest (algebraic) eigenvalues.
        ncv (int): The number of Lanczos vectors generated. Must be
            ``k + 1 < ncv < n``. If ``None``, default value is used.
        maxiter (int): Maximum number of Lanczos update iterations.
            If ``None``, default value is used.
        tol (float): Tolerance for residuals ``||Ax - wx||``. If ``0``, machine
            precision is used.
        return_eigenvectors (bool): If ``True``, returns eigenvectors in
            addition to eigenvalues.

    Returns:
        tuple:
            If ``return_eigenvectors is True``, it returns ``w`` and ``x``
            where ``w`` is eigenvalues and ``x`` is eigenvectors. Otherwise,
            it returns only ``w``.

    Raises:
        ValueError: If ``a`` is not a square 2-D matrix, if ``k`` is not in
            ``1 <= k < n``, or if ``which`` is neither 'LM' nor 'LA'.
        TypeError: If the dtype of ``a`` is not one of float32, float64,
            complex64 or complex128.

    .. seealso:: :func:`scipy.sparse.linalg.eigsh`

    .. note::
        This function uses the thick-restart Lanczos methods
        (https://sdm.lbl.gov/~kewu/ps/trlan.html).

    """
    # Validate the shape before reading ``shape[0]`` so that a 0-d input is
    # reported with the intended ValueError instead of an IndexError.
    if a.ndim != 2 or a.shape[0] != a.shape[1]:
        raise ValueError('expected square matrix (shape: {})'.format(a.shape))
    n = a.shape[0]
    if a.dtype.char not in 'fdFD':
        raise TypeError('unsupprted dtype (actual: {})'.format(a.dtype))
    if k <= 0:
        raise ValueError('k must be greater than 0 (actual: {})'.format(k))
    if k >= n:
        raise ValueError('k must be smaller than n (actual: {})'.format(k))
    if which not in ('LM', 'LA'):
        raise ValueError('which must be \'LM\' or \'LA\' (actual: {})'
                         ''.format(which))
    if ncv is None:
        ncv = min(max(2 * k, k + 32), n - 1)
    else:
        # A user-supplied value is clamped into [k + 2, n - 1].
        ncv = min(max(ncv, k + 2), n - 1)
    if maxiter is None:
        maxiter = 10 * n
    if tol == 0:
        tol = numpy.finfo(a.dtype).eps

    alpha = cupy.zeros((ncv, ), dtype=a.dtype)
    # beta holds norms, so it uses the real counterpart of a.dtype.
    beta = cupy.zeros((ncv, ), dtype=a.dtype.char.lower())
    V = cupy.empty((ncv, n), dtype=a.dtype)

    # Set initial vector
    u = cupy.random.random((n, )).astype(a.dtype)
    V[0] = u / cublas.nrm2(u)

    # Choose Lanczos implementation, unconditionally use 'fast' for now
    update_impl = 'fast'
    if update_impl == 'fast':
        lanczos = _lanczos_fast(a, n, ncv)
    else:
        lanczos = _lanczos_asis

    # Lanczos iteration
    lanczos(a, V, u, alpha, beta, 0, ncv)

    n_iter = ncv
    w, s = _eigsh_solve_ritz(alpha, beta, None, k, which)
    x = V.T @ s

    # Compute residual
    beta_k = beta[-1] * s[-1, :]
    res = cublas.nrm2(beta_k)

    while res > tol and n_iter < maxiter:
        # Setup for thick-restart
        beta[:k] = 0
        alpha[:k] = w
        V[:k] = x.T

        u -= u.T @ V[:k].conj().T @ V[:k]
        V[k] = u / cublas.nrm2(u)

        u[...] = a @ V[k]
        cublas.dotc(V[k], u, out=alpha[k])
        u -= alpha[k] * V[k]
        u -= V[:k].T @ beta_k
        cublas.nrm2(u, out=beta[k])
        V[k + 1] = u / beta[k]

        # Lanczos iteration
        lanczos(a, V, u, alpha, beta, k + 1, ncv)

        n_iter += ncv - k
        w, s = _eigsh_solve_ritz(alpha, beta, beta_k, k, which)
        x = V.T @ s

        # Compute residual
        beta_k = beta[-1] * s[-1, :]
        res = cublas.nrm2(beta_k)

    if return_eigenvectors:
        idx = cupy.argsort(w)
        return w[idx], x[:, idx]
    else:
        return cupy.sort(w)

示例#6

0

显示文件

def lsmr(A, b, x0=None, damp=0.0, atol=1e-6, btol=1e-6, conlim=1e8,
         maxiter=None):
    """Iterative solver for least-squares problems.

    lsmr solves the system of linear equations ``Ax = b``. If the system
    is inconsistent, it solves the least-squares problem ``min ||b - Ax||_2``.
    A is a rectangular matrix of dimension m-by-n, where all cases are
    allowed: m = n, m > n, or m < n. b is a vector of length m.
    The matrix A may be dense or sparse (usually sparse).

    Args:
        A (ndarray, spmatrix or LinearOperator): The real or complex
            matrix of the linear system. ``A`` must be
            :class:`cupy.ndarray`, :class:`cupyx.scipy.sparse.spmatrix` or
            :class:`cupyx.scipy.sparse.linalg.LinearOperator`.
        b (cupy.ndarray): Right hand side of the linear system with shape
            ``(m,)`` or ``(m, 1)``.
        x0 (cupy.ndarray): Starting guess for the solution. If None zeros are
            used.
        damp (float): Damping factor for regularized least-squares.
            `lsmr` solves the regularized least-squares problem
            ::

                min ||(b) - (  A   )x||
                    ||(0)   (damp*I) ||_2

            where damp is a scalar. If damp is None or 0, the system
            is solved without regularization.
        atol, btol (float):
            Stopping tolerances. `lsmr` continues iterations until a
            certain backward error estimate is smaller than some quantity
            depending on atol and btol.
        conlim (float): `lsmr` terminates if an estimate of ``cond(A)`` i.e.
            condition number of matrix exceeds `conlim`. If `conlim` is None,
            the default value is 1e+8.
        maxiter (int): Maximum number of iterations. If ``None``,
            ``5 * min(m, n)`` is used.

    Returns:
        tuple:
            - `x` (ndarray): Least-square solution returned.
            - `istop` (int): istop gives the reason for stopping::

                    0 means x=0 is a solution.

                    1 means x is an approximate solution to A*x = B,
                    according to atol and btol.

                    2 means x approximately solves the least-squares problem
                    according to atol.

                    3 means COND(A) seems to be greater than CONLIM.

                    4 is the same as 1 with atol = btol = eps (machine
                    precision)

                    5 is the same as 2 with atol = eps.

                    6 is the same as 3 with CONLIM = 1/eps.

                    7 means ITN reached maxiter before the other stopping
                    conditions were satisfied.

            - `itn` (int): Number of iterations used.
            - `normr` (float): ``norm(b-Ax)``
            - `normar` (float): ``norm(A^T (b - Ax))``
            - `norma` (float): ``norm(A)``
            - `conda` (float): Condition number of A.
            - `normx` (float): ``norm(x)``

    Raises:
        ValueError: If ``x0`` has a shape other than ``(n,)`` or ``(n, 1)``.

    .. seealso:: :func:`scipy.sparse.linalg.lsmr`

    References:
        D. C.-L. Fong and M. A. Saunders, "LSMR: An iterative algorithm for
        sparse least-squares problems", SIAM J. Sci. Comput.,
        vol. 33, pp. 2950-2971, 2011.
    """
    A = _interface.aslinearoperator(A)
    b = b.squeeze()
    matvec = A.matvec
    rmatvec = A.rmatvec
    m, n = A.shape
    minDim = min(m, n)

    if maxiter is None:
        maxiter = minDim * 5

    u = b.copy()
    normb = cublas.nrm2(b)
    beta = normb.copy()
    # Scalars that drive host-side control flow are copied to the CPU once.
    normb = normb.get().item()
    if x0 is None:
        x = cupy.zeros((n,), dtype=A.dtype)
    else:
        if not (x0.shape == (n,) or x0.shape == (n, 1)):
            raise ValueError('x0 has incompatible dimensions')
        x = x0.astype(A.dtype).ravel()
        u -= matvec(x)
        beta = cublas.nrm2(u)

    beta_cpu = beta.get().item()

    v = cupy.zeros(n)
    alpha = cupy.zeros((), dtype=beta.dtype)
    alpha_cpu = 0

    if beta_cpu > 0:
        u /= beta
        v = rmatvec(u)
        alpha = cublas.nrm2(v)
        alpha_cpu = alpha.get().item()

    if alpha_cpu > 0:
        v /= alpha

    # Initialize variables for 1st iteration.

    itn = 0
    zetabar = alpha_cpu * beta_cpu
    alphabar = alpha_cpu
    rho = 1
    rhobar = 1
    cbar = 1
    sbar = 0

    h = v.copy()
    hbar = cupy.zeros(n)
    # x = cupy.zeros(n)

    # Initialize variables for estimation of ||r||.

    betadd = beta_cpu
    betad = 0
    rhodold = 1
    tautildeold = 0
    thetatilde = 0
    zeta = 0
    d = 0

    # Initialize variables for estimation of ||A|| and cond(A)

    normA2 = alpha_cpu * alpha_cpu
    maxrbar = 0
    minrbar = 1e+100
    normA = alpha_cpu
    condA = 1
    normx = 0

    # Items for use in stopping rules.
    istop = 0
    ctol = 0
    if conlim > 0:
        ctol = 1 / conlim
    normr = beta_cpu

    # Golub-Kahan process terminates when either alpha or beta is zero.
    # Reverse the order here from the original matlab code because
    # there was an error on return when arnorm==0
    normar = alpha_cpu * beta_cpu
    if normar == 0:
        return x, istop, itn, normr, normar, normA, condA, normx

    # Main iteration loop.
    while itn < maxiter:
        itn = itn + 1

        # Perform the next step of the bidiagonalization to obtain the
        # next  beta, u, alpha, v.  These satisfy the relations
        #         beta*u  =  a*v   -  alpha*u,
        #        alpha*v  =  A'*u  -  beta*v.

        u *= -alpha
        u += matvec(v)
        beta = cublas.nrm2(u)  # norm(u)
        beta_cpu = beta.get().item()

        if beta_cpu > 0:
            u /= beta
            v *= -beta
            v += rmatvec(u)
            alpha = cublas.nrm2(v)  # norm(v)
            alpha_cpu = alpha.get().item()
            if alpha_cpu > 0:
                v /= alpha

        # At this point, beta = beta_{k+1}, alpha = alpha_{k+1}.

        # Construct rotation Qhat_{k,2k+1}.

        chat, shat, alphahat = _symOrtho(alphabar, damp)

        # Use a plane rotation (Q_i) to turn B_i to R_i

        rhoold = rho
        c, s, rho = _symOrtho(alphahat, beta_cpu)
        thetanew = s * alpha_cpu
        alphabar = c * alpha_cpu

        # Use a plane rotation (Qbar_i) to turn R_i^T to R_i^bar

        rhobarold = rhobar
        zetaold = zeta
        thetabar = sbar * rho
        rhotemp = cbar * rho
        cbar, sbar, rhobar = _symOrtho(cbar * rho, thetanew)
        zeta = cbar * zetabar
        zetabar = - sbar * zetabar

        # Update h, h_hat, x.

        # hbar = h - (thetabar * rho / (rhoold * rhobarold)) * hbar
        hbar *= -(thetabar * rho / (rhoold * rhobarold))
        hbar += h
        x += (zeta / (rho * rhobar)) * hbar
        # h = v - (thetanew / rho) * h
        h *= -(thetanew / rho)
        h += v

        # Estimate of ||r||.

        # Apply rotation Qhat_{k,2k+1}.
        betaacute = chat * betadd
        betacheck = -shat * betadd

        # Apply rotation Q_{k,k+1}.
        betahat = c * betaacute
        betadd = -s * betaacute

        # Apply rotation Qtilde_{k-1}.
        # betad = betad_{k-1} here.

        thetatildeold = thetatilde
        ctildeold, stildeold, rhotildeold = _symOrtho(rhodold, thetabar)
        thetatilde = stildeold * rhobar
        rhodold = ctildeold * rhobar
        betad = - stildeold * betad + ctildeold * betahat

        # betad   = betad_k here.
        # rhodold = rhod_k  here.

        tautildeold = (zetaold - thetatildeold * tautildeold) / rhotildeold
        taud = (zeta - thetatilde * tautildeold) / rhodold
        d = d + betacheck * betacheck
        normr = numpy.sqrt(d + (betad - taud)**2 + betadd * betadd)

        # Estimate ||A||.
        normA2 = normA2 + beta_cpu * beta_cpu
        normA = numpy.sqrt(normA2)
        normA2 = normA2 + alpha_cpu * alpha_cpu

        # Estimate cond(A).
        maxrbar = max(maxrbar, rhobarold)
        if itn > 1:
            minrbar = min(minrbar, rhobarold)
        condA = max(maxrbar, rhotemp) / min(minrbar, rhotemp)

        # Test for convergence.

        # Compute norms for convergence testing.
        normar = abs(zetabar)
        normx = cublas.nrm2(x)
        normx = normx.get().item()

        # Now use these norms to estimate certain other quantities,
        # some of which will be small near a solution.

        test1 = normr / normb
        if (normA * normr) != 0:
            test2 = normar / (normA * normr)
        else:
            # ``numpy.infty`` was removed in NumPy 2.0; ``numpy.inf`` is the
            # spelling available in every NumPy version.
            test2 = numpy.inf
        test3 = 1 / condA
        t1 = test1 / (1 + normA*normx/normb)
        rtol = btol + atol*normA*normx/normb

        # The following tests guard against extremely small values of
        # atol, btol or ctol.  (The user may have set any or all of
        # the parameters atol, btol, conlim  to 0.)
        # The effect is equivalent to the normAl tests using
        # atol = eps,  btol = eps,  conlim = 1/eps.

        # Later assignments override earlier ones, so the lowest applicable
        # stop code wins.
        if itn >= maxiter:
            istop = 7
        if 1 + test3 <= 1:
            istop = 6
        if 1 + test2 <= 1:
            istop = 5
        if 1 + t1 <= 1:
            istop = 4

        # Allow for tolerances set by the user.

        if test3 <= ctol:
            istop = 3
        if test2 <= atol:
            istop = 2
        if test1 <= rtol:
            istop = 1

        if istop > 0:
            break

    # The return type of SciPy is always float64. Therefore, x must be casted.
    x = x.astype(numpy.float64)

    return x, istop, itn, normr, normar, normA, condA, normx

示例#7

0

显示文件

文件： _iterative.py 项目： viantirreau/cupy

def gmres(A,
          b,
          x0=None,
          tol=1e-5,
          restart=None,
          maxiter=None,
          M=None,
          callback=None,
          atol=None,
          callback_type=None):
    """Uses Generalized Minimal RESidual iteration to solve ``Ax = b``.

    Args:
        A (ndarray, spmatrix or LinearOperator): The real or complex
            matrix of the linear system with shape ``(n, n)``. ``A`` must be
            :class:`cupy.ndarray`, :class:`cupyx.scipy.sparse.spmatrix` or
            :class:`cupyx.scipy.sparse.linalg.LinearOperator`.
        b (cupy.ndarray): Right hand side of the linear system with shape
            ``(n,)`` or ``(n, 1)``.
        x0 (cupy.ndarray): Starting guess for the solution.
        tol (float): Tolerance for convergence.
        restart (int): Number of iterations between restarts. Larger values
            increase iteration cost, but may be necessary for convergence.
            If ``None``, 20 is used. The value is capped at ``n``.
        maxiter (int): Maximum number of iterations. If ``None``, ``10 * n``
            is used.
        M (ndarray, spmatrix or LinearOperator): Preconditioner for ``A``.
            The preconditioner should approximate the inverse of ``A``.
            ``M`` must be :class:`cupy.ndarray`,
            :class:`cupyx.scipy.sparse.spmatrix` or
            :class:`cupyx.scipy.sparse.linalg.LinearOperator`.
        callback (function): User-specified function to call on every restart.
            It is called as ``callback(arg)``, where ``arg`` is selected by
            ``callback_type``.
        callback_type (str): 'x' or 'pr_norm'. If 'x', the current solution
            vector is used as an argument of callback function. If 'pr_norm',
            relative (preconditioned) residual norm is used as an argument.
        atol (float): Tolerance for convergence.

    Returns:
        tuple:
            It returns ``x`` (cupy.ndarray) and ``info`` (int) where ``x`` is
            the converged solution and ``info`` provides convergence
            information: ``0`` when the residual norm reached the tolerance,
            otherwise the number of iterations performed.

    Raises:
        ValueError: If ``callback_type`` is neither 'x' nor 'pr_norm'.

    Reference:
        M. Wang, H. Klie, M. Parashar and H. Sudan, "Solving Sparse Linear
        Systems on NVIDIA Tesla GPUs", ICCS 2009 (2009).

    .. seealso:: :func:`scipy.sparse.linalg.gmres`
    """
    A, M, x, b = _make_system(A, M, x0, b)
    matvec = A.matvec
    psolve = M.matvec

    n = A.shape[0]
    if n == 0:
        return cupy.empty_like(b), 0
    b_norm = cupy.linalg.norm(b)
    if b_norm == 0:
        return b, 0
    if atol is None:
        atol = tol * float(b_norm)
    else:
        atol = max(float(atol), tol * float(b_norm))
    if maxiter is None:
        maxiter = n * 10
    if restart is None:
        restart = 20
    restart = min(restart, n)
    if callback_type is None:
        callback_type = 'pr_norm'
    if callback_type not in ('x', 'pr_norm'):
        raise ValueError('Unknown callback_type: {}'.format(callback_type))
    if callback is None:
        # Without a callback neither callback branch below may fire.
        callback_type = None

    V = cupy.empty((n, restart), dtype=A.dtype, order='F')
    H = cupy.zeros((restart + 1, restart), dtype=A.dtype, order='F')
    e = numpy.zeros((restart + 1, ), dtype=A.dtype)

    compute_hu = _make_compute_hu(V)

    iters = 0
    while True:
        mx = psolve(x)
        r = b - matvec(mx)
        r_norm = cublas.nrm2(r)
        if callback_type == 'x':
            callback(mx)
        elif callback_type == 'pr_norm' and iters > 0:
            callback(r_norm / b_norm)
        if r_norm <= atol or iters >= maxiter:
            break
        v = r / r_norm
        V[:, 0] = v
        e[0] = r_norm

        # Arnoldi iteration
        for j in range(restart):
            z = psolve(v)
            u = matvec(z)
            H[:j + 1, j], u = compute_hu(u, j)
            cublas.nrm2(u, out=H[j + 1, j])
            if j + 1 < restart:
                v = u / H[j + 1, j]
                V[:, j + 1] = v

        # Note: The least-square solution to equation Hy = e is computed on CPU
        # because it is faster if the matrix size is small.
        ret = numpy.linalg.lstsq(cupy.asnumpy(H), e)
        y = cupy.array(ret[0])
        x += V @ y
        iters += restart

    # ``iters`` grows in steps of ``restart`` and can therefore overshoot
    # ``maxiter``; compare with ``>=`` so that an unconverged run is never
    # reported as success (info == 0).
    info = 0
    if iters >= maxiter and not (r_norm <= atol):
        info = iters
    return mx, info

示例#8

0

显示文件

文件： _iterative.py 项目： viantirreau/cupy

def cg(A,
       b,
       x0=None,
       tol=1e-5,
       maxiter=None,
       M=None,
       callback=None,
       atol=None):
    """Uses Conjugate Gradient iteration to solve ``Ax = b``.

    Args:
        A (ndarray, spmatrix or LinearOperator): The real or complex matrix of
            the linear system with shape ``(n, n)``. ``A`` must be a hermitian,
            positive definitive matrix with type of :class:`cupy.ndarray`,
            :class:`cupyx.scipy.sparse.spmatrix` or
            :class:`cupyx.scipy.sparse.linalg.LinearOperator`.
        b (cupy.ndarray): Right hand side of the linear system with shape
            ``(n,)`` or ``(n, 1)``.
        x0 (cupy.ndarray): Starting guess for the solution.
        tol (float): Tolerance for convergence.
        maxiter (int): Maximum number of iterations. If ``None``, ``10 * n``
            is used.
        M (ndarray, spmatrix or LinearOperator): Preconditioner for ``A``.
            The preconditioner should approximate the inverse of ``A``.
            ``M`` must be :class:`cupy.ndarray`,
            :class:`cupyx.scipy.sparse.spmatrix` or
            :class:`cupyx.scipy.sparse.linalg.LinearOperator`.
        callback (function): User-specified function to call after each
            iteration. It is called as ``callback(xk)``, where ``xk`` is the
            current solution vector.
        atol (float): Tolerance for convergence.

    Returns:
        tuple:
            It returns ``x`` (cupy.ndarray) and ``info`` (int) where ``x`` is
            the converged solution and ``info`` provides convergence
            information: ``0`` when the residual norm reached the tolerance,
            otherwise the number of iterations performed.

    .. seealso:: :func:`scipy.sparse.linalg.cg`
    """
    A, M, x, b = _make_system(A, M, x0, b)
    matvec = A.matvec
    psolve = M.matvec

    n = A.shape[0]
    if maxiter is None:
        maxiter = n * 10
    if n == 0:
        return cupy.empty_like(b), 0
    b_norm = cupy.linalg.norm(b)
    if b_norm == 0:
        return b, 0
    if atol is None:
        atol = tol * float(b_norm)
    else:
        atol = max(float(atol), tol * float(b_norm))

    r = b - matvec(x)
    iters = 0
    rho = 0
    # Tracked explicitly so that the ``info`` computation below never reads
    # ``resid`` when the loop body did not run (e.g. ``maxiter=0``).
    converged = False
    while iters < maxiter:
        z = psolve(r)
        rho1 = rho
        rho = cublas.dotc(r, z)
        if iters == 0:
            p = z
        else:
            beta = rho / rho1
            p = z + beta * p
        q = matvec(p)
        alpha = rho / cublas.dotc(p, q)
        x = x + alpha * p
        r = r - alpha * q
        iters += 1
        if callback is not None:
            callback(x)
        resid = cublas.nrm2(r)
        if resid <= atol:
            converged = True
            break

    info = 0
    if iters == maxiter and not converged:
        info = iters

    return x, info

示例#9

0

显示文件

文件： _iterative.py 项目： venkywonka/cupy

def cg(A,
       b,
       x0=None,
       tol=1e-5,
       maxiter=None,
       M=None,
       callback=None,
       atol=None):
    """Uses Conjugate Gradient iteration to solve ``Ax = b``.

    Args:
        A (cupy.ndarray or cupyx.scipy.sparse.spmatrix): The real or complex
            matrix of the linear system with shape ``(n, n)``. ``A`` must
            be a hermitian, positive definitive matrix.
        b (cupy.ndarray): Right hand side of the linear system with shape
            ``(n,)`` or ``(n, 1)``.
        x0 (cupy.ndarray): Starting guess for the solution. If ``None``,
            zeros are used.
        tol (float): Tolerance for convergence.
        maxiter (int): Maximum number of iterations. If ``None``, ``10 * n``
            is used.
        M (cupy.ndarray or cupyx.scipy.sparse.spmatrix): Preconditioner for
            ``A``. The preconditioner should approximate the inverse of ``A``.
        callback (function): User-specified function to call after each
            iteration. It is called as ``callback(xk)``, where ``xk`` is the
            current solution vector.
        atol (float): Tolerance for convergence.

    Returns:
        tuple:
            It returns ``x`` (cupy.ndarray) and ``info`` (int) where ``x`` is
            the converged solution and ``info`` provides convergence
            information: ``0`` when the residual norm reached the tolerance,
            otherwise the number of iterations performed.

    Raises:
        ValueError: If ``A`` is not a square 2-D matrix, or if ``b`` or
            ``x0`` has a shape other than ``(n,)`` or ``(n, 1)``.
        TypeError: If the dtype of ``A`` is not one of float32, float64,
            complex64 or complex128.

    .. seealso:: :func:`scipy.sparse.linalg.cg`
    """
    if A.ndim != 2 or A.shape[0] != A.shape[1]:
        raise ValueError('expected square matrix (shape: {})'.format(A.shape))
    if A.dtype.char not in 'fdFD':
        raise TypeError('unsupprted dtype (actual: {})'.format(A.dtype))
    n = A.shape[0]
    if not (b.shape == (n, ) or b.shape == (n, 1)):
        raise ValueError('b has incompatible dimensins')
    b = b.astype(A.dtype).ravel()
    if n == 0:
        return cupy.empty_like(b), 0
    b_norm = cupy.linalg.norm(b)
    if b_norm == 0:
        return b, 0
    if atol is None:
        atol = tol * float(b_norm)
    else:
        atol = max(float(atol), tol * float(b_norm))
    if x0 is None:
        x = cupy.zeros((n, ), dtype=A.dtype)
    else:
        if not (x0.shape == (n, ) or x0.shape == (n, 1)):
            raise ValueError('x0 has incompatible dimensins')
        x = x0.astype(A.dtype).ravel()
    if maxiter is None:
        maxiter = n * 10
    matvec, psolve = _make_funcs(A, M)

    r = b - matvec(x)
    iters = 0
    rho = 0
    # Tracked explicitly so that the ``info`` computation below never reads
    # ``resid`` when the loop body did not run (e.g. ``maxiter=0``).
    converged = False
    while iters < maxiter:
        z = psolve(r)
        rho1 = rho
        rho = cublas.dotc(r, z)
        if iters == 0:
            p = z
        else:
            beta = rho / rho1
            p = z + beta * p
        q = matvec(p)
        alpha = rho / cublas.dotc(p, q)
        x = x + alpha * p
        r = r - alpha * q
        iters += 1
        if callback is not None:
            callback(x)
        resid = cublas.nrm2(r)
        if resid <= atol:
            converged = True
            break

    info = 0
    if iters == maxiter and not converged:
        info = iters

    return x, info