 def test_22(self):
     """ConvMinL1InL2Ball should reproduce the ConvBPDN solution.

     A sparse coefficient array is synthesised and ConvBPDN is solved for
     the resulting signal. The norm of the ConvBPDN reconstruction residual
     is then passed as ``epsilon`` to ConvMinL1InL2Ball, and both solutions
     are required to agree (relative l2 difference and l1 norms).
     """
     img_size, n_filters, filt_size = 32, 4, 8

     # Random dictionary; each filter is divided by its own l2 norm.
     D = cp.random.randn(filt_size, filt_size, n_filters)
     D /= cp.sqrt(cp.sum(D**2, axis=(0, 1)))

     # Sparse coefficients: non-zero only where |randn| exceeds 3.
     coef_shape = (img_size, img_size, n_filters)
     X0 = cp.zeros(coef_shape)
     support = cp.abs(cp.random.randn(*coef_shape)) > 3
     X0[support] = cp.random.randn(X0[support].size)
     S = cp.sum(sl.fftconv(D, X0), axis=2)

     # Reference solution from the penalised (BPDN) formulation.
     bpdn_opt = cbpdn.ConvBPDN.Options({
         'Verbose': False,
         'MaxMainIter': 500,
         'RelStopTol': 1e-5,
         'rho': 5e-1,
         'AutoRho': {'Enabled': False},
     })
     bpdn = cbpdn.ConvBPDN(D, S, 1e-3, bpdn_opt)
     Xp = bpdn.solve()
     residual = bpdn.reconstruct(Xp).squeeze() - S
     epsilon = cp.linalg.norm(residual)

     # Constrained formulation using the BPDN residual norm as ball radius.
     ball_opt = cbpdn.ConvMinL1InL2Ball.Options({
         'Verbose': False,
         'MaxMainIter': 500,
         'RelStopTol': 1e-5,
         'rho': 2e2,
         'RelaxParam': 1.0,
         'AutoRho': {'Enabled': False},
     })
     ball = cbpdn.ConvMinL1InL2Ball(D, S, epsilon=epsilon, opt=ball_opt)
     Xc = ball.solve()

     rel_diff = cp.linalg.norm(Xp - Xc) / cp.linalg.norm(Xp)
     assert rel_diff < 1e-3
     l1_gap = cp.abs(cp.linalg.norm(Xp.ravel(), 1) -
                     cp.linalg.norm(Xc.ravel(), 1))
     assert l1_gap < 1e-3
    def get_deconv(self, variable, indices):
        """Write the selected activations into ``variable`` and return the input data.

        Args:
            variable: Layer output whose ``data`` array is edited in place.
            indices: One index per sample (unit index for a fully-connected
                layer, channel index otherwise).

        Returns:
            The ``data`` array of the layer returned by
            ``Vutil.get_data_layer(variable)``.
        """
        # 1. Set everything except the most strongly activated location to 0
        #    (per the original comment; the zeroing itself is presumably done
        #    by the Vutil helpers -- TODO confirm).
        if Vutil.has_fc_layer(variable):
            # Visualising a fully-connected layer: one value per sample/unit.
            values = Vutil.get_fc_info(variable, indices)
            for i, (j, v) in enumerate(zip(indices, values)):
                variable.data[i, j] = v
        else:
            # Visualising a convolution / pooling layer: each info entry is
            # indexed as (x, y, value) below.
            maxinfo = Vutil.get_max_info(variable, indices)
            for i, (c, info) in enumerate(zip(indices, maxinfo)):
                variable.data[i, c, info[1], info[0]] = info[2]

        # 2. Repeat the inverse operations back to the input layer
        data_layer = Vutil.get_data_layer(variable)
        data = data_layer.data
        xp = cuda.get_array_module(data)

        # Per-sample RMS over the three trailing axes, computed with whichever
        # array module owns the data so CPU and GPU follow the same formula.
        fixed_RMS = 300
        rms = xp.sqrt(xp.sum(data ** 2, axis=(1, 2, 3)) / np.prod(data.shape[1:]))
        scale = (fixed_RMS / rms).reshape(-1, 1, 1, 1)
        # NOTE: rescaling to fixed_RMS is currently disabled, so `scale` is
        # computed but not applied:
        # data_layer.data *= scale

        return data_layer.data
args = parser.parse_args()


from load import *
whole = load_whole(scale=1.0/128.0, shift=-1.0, path=args.dataset)
data = cuda.to_gpu(whole.data)

# Neighbour ranks (1-based) whose distance is recorded for every sample.
num_data = [10]

print (num_data)

dist_accum = 0
# dist_list[j] collects, for every sample, the distance to its
# num_data[j]-th nearest neighbour.
dist_list = [[] for _ in num_data]

for i in range(len(data)):
    if i % 1000 == 0:
        print (i)
    # Euclidean distance from sample i to every sample.
    dist = cp.sqrt(cp.sum((data - data[i])**2, axis = 1))
    # Replace the zero self-distance with a large sentinel so it is not
    # picked up as a neighbour.
    dist[i] = 1000
    sorted_dist = np.sort(cuda.to_cpu(dist))
    # NOTE(review): the loop body was missing from this copy. After the
    # sentinel above, sorted_dist[0] is the nearest neighbour, so the k-th
    # neighbour is taken from index k - 1 -- confirm against the original.
    for rank, dists in zip(num_data, dist_list):
        dists.append(sorted_dist[rank - 1])

for rank, dists in zip(num_data, dist_list):
    np.savetxt(args.dataset + '/' + str(rank) + 'th_neighbor.txt', np.array(dists))

 def forward(self, y, t):
     """Return twice the mean of sqrt(|y - t|).

     Stores the sign mask ``(y - t) < 0`` in ``self.mask`` and the
     element-wise ``sqrt(|y - t|)`` in ``self.error``.
     """
     residual = y - t
     self.mask = residual < 0.0
     root_abs = np.sqrt(np.abs(residual))
     self.error = root_abs
     return 2.0 * np.mean(root_abs)
 def forward(self, y, t):
     """Return twice the mean of sqrt(|(y - t) / t|).

     Keeps ``y`` and ``t`` on the instance and stores in ``self.mask`` where
     the relative error ``(y - t) / t`` is negative.
     """
     self.y, self.t = y, t
     rel_error = (y - t) / t
     self.mask = rel_error < 0.0
     root_abs = np.sqrt(np.abs(rel_error))
     return 2.0 * np.mean(root_abs)
def eegstats(signals, samples, statistic):
    """Compute one summary statistic per signal.

    Parameters
    ----------
    signals : sequence of array-like
        Signals to summarise; entry ``i`` fills slot ``i`` of the result.
    samples : int
        Length of the returned array. Slots beyond ``len(signals)`` stay 0.
    statistic : str
        One of ``'mean'``, ``'std'``, ``'skewness'``, ``'kurtosis'``,
        ``'maximum'``, ``'minimum'``, ``'n5'``, ``'n25'``, ``'n75'``,
        ``'n95'``, ``'median'``, ``'variance'`` or ``'rms'``.

    Returns
    -------
    cupy.ndarray
        1-D array of length ``samples`` holding the statistic of each signal.

    Raises
    ------
    ValueError
        If ``statistic`` is not one of the supported names.
    """
    import cupy as cp
    from scipy.stats import skew, kurtosis

    def percentile(q):
        return lambda sig: cp.percentile(sig, q)

    # Statistics evaluated with CuPy on a device copy of the signal.
    on_device = {
        'mean': cp.mean,
        'std': cp.std,
        'maximum': cp.amax,
        'minimum': cp.amin,
        'n5': percentile(5),
        'n25': percentile(25),
        'n75': percentile(75),
        'n95': percentile(95),
        'median': percentile(50),
        'variance': cp.var,
        # Root mean square: sqrt of the mean of squares (the previous
        # mean(sqrt(x**2)) was the mean absolute value).
        'rms': lambda sig: cp.sqrt(cp.mean(sig ** 2)),
    }
    # SciPy statistics run on the host, so they receive a NumPy copy.
    on_host = {
        'skewness': skew,
        'kurtosis': kurtosis,
    }

    if statistic in on_device:
        reducer, to_array = on_device[statistic], cp.asarray
    elif statistic in on_host:
        reducer, to_array = on_host[statistic], cp.asnumpy
    else:
        supported = sorted(list(on_device) + list(on_host))
        raise ValueError(
            "Unknown statistic {!r}; expected one of {}".format(
                statistic, supported))

    result = cp.zeros(samples)
    for i, signal in enumerate(signals):
        result[i] = reducer(to_array(signal))
    return result
# Draw a random phase in [0, 2*pi) for every mode of each component
# (uniformly distributed):
theta_k_1 = cp.asarray(np.random.uniform(low=0, high=1, size=(Nx, Ny)) * 2 * np.pi)
theta_k_2 = cp.asarray(np.random.uniform(low=0, high=1, size=(Nx, Ny)) * 2 * np.pi)

# Generate density array so that only certain modes are occupied. Each entry
# is (k-offset from the grid centre, fraction of atom_num placed in that mode):
n_k = cp.zeros((Nx, Ny))
_kx0, _ky0 = Nx // 2, Ny // 2
for (_dkx, _dky), _fraction in (
        ((0, 0), 0.5),        # k = (0, 0)
        ((1, 0), 0.125),      # k = (1, 0)
        ((1, 1), 0.125),      # k = (1, 1)
        ((-1, 0), 0.125),     # k = (-1, 0)
        ((-1, -1), 0.125),    # k = (-1, -1)
):
    n_k[_kx0 + _dkx, _ky0 + _dky] = atom_num * _fraction / (dx * dy)

# Both components share the same mode amplitudes and differ only in phase.
_mode_amplitude = Nx * Ny * cp.sqrt(n_k)
psi_1_k = cp.fft.fftshift(_mode_amplitude * cp.exp(1j * theta_k_1)) / cp.sqrt(2.)
psi_2_k = cp.fft.fftshift(_mode_amplitude * cp.exp(1j * theta_k_2)) / cp.sqrt(2.)

# ------------------------------------------------------------------------------------------------------------------
# Creating save file and saving initial data
# ------------------------------------------------------------------------------------------------------------------
filename = 'HQV_nonEq'    # Name of file to save data to
data_path = 'data/{}.hdf5'.format(filename)

with h5py.File(data_path, 'w') as data:
    # Spatial grids, copied to host memory before being written:
    for _dset_name, _grid in (('grid/x', x), ('grid/y', y)):
        data.create_dataset(_dset_name, _grid.shape, data=cp.asnumpy(_grid))

    # Time variables:
    data.create_dataset('time/Nt', data=Nt)
def estimate_sigma(arr, disable_background_masking=False, N=0):
    """Standard deviation estimation from local patches

    Parameters
    ----------
    arr : 3D or 4D ndarray
        The array to be estimated

    disable_background_masking : bool, default False
        If True, uses all voxels for the estimation, otherwise, only non-zeros
        voxels are used. Useful if the background is masked by the scanner.

    N : int, default 0
        Number of coils of the receiver array. Use N = 1 in case of a SENSE
        reconstruction (Philips scanners) or the number of coils for a GRAPPA
        reconstruction (Siemens and GE). Use 0 to disable the correction factor,
        as for example if the noise is Gaussian distributed. See [1] for more
        information.

    Returns
    -------
    sigma : ndarray
        standard deviation of the noise, one estimation per volume.

    Raises
    ------
    ValueError
        If ``N`` has no precomputed correction factor, or if ``arr`` is
        neither 3D nor 4D.

    Notes
    -----
    This function is the same as manually taking the standard deviation of the
    background and gives one value for the whole 3D array.
    It also includes the coil-dependent correction factor of Koay 2006
    (see [1]_, equation 18) with theta = 0.
    Since this function was introduced in [2]_ for T1 imaging,
    it is expected to perform ok on diffusion MRI data, but might oversmooth
    some regions and leave others un-denoised for spatially varying noise
    profiles. Consider using :func:`piesno` to estimate sigma instead if visual
    inaccuracies are apparent in the denoised result.

    References
    ----------
    .. [1] Koay, C. G., & Basser, P. J. (2006). Analytically exact correction
    scheme for signal extraction from noisy magnitude MR signals.
    Journal of Magnetic Resonance, 179(2), 317-22.

    .. [2] Coupe, P., Yger, P., Prima, S., Hellier, P., Kervrann, C., Barillot,
    C., 2008. An optimized blockwise nonlocal means denoising filter for 3-D
    magnetic resonance images, IEEE Trans. Med. Imaging 27, 425-41.
    """
    # 6-neighbour kernel: the face neighbours are 1, the centre and all
    # edge/corner positions are 0.
    k = np.zeros((3, 3, 3), dtype=np.int8)

    k[0, 1, 1] = 1
    k[2, 1, 1] = 1
    k[1, 0, 1] = 1
    k[1, 2, 1] = 1
    k[1, 1, 0] = 1
    k[1, 1, 2] = 1
    k = cp.asarray(k)

    # Precomputed factor from Koay 2006, this corrects the bias of magnitude
    # image
    correction_factor = {
        0: 1,  # No correction
        1: 0.42920367320510366,
        4: 0.4834941393603609,
        6: 0.4891759468548269,
        8: 0.49195420135894175,
        12: 0.4946862482541263,
        16: 0.4960339908122364,
        20: 0.4968365823718557,
        24: 0.49736907650825657,
        32: 0.49803177052530145,
        64: 0.49901964176235936,
    }

    if N in correction_factor:
        factor = correction_factor[N]
    else:
        raise ValueError(
            "N = {0} is not supported! Please choose amongst "
            "{1}".format(N, sorted(list(correction_factor.keys()))))

    if arr.ndim == 3:
        arr = arr[..., None]
    elif arr.ndim != 4:
        raise ValueError("Array shape is not supported!", arr.shape)

    if disable_background_masking:
        mask = None
    else:
        # Non-zero voxels of the first volume. The builtin `bool` is used
        # because the `np.bool` alias no longer exists in current NumPy.
        mask = arr[..., 0].astype(bool)
        # TODO: make upstream PR at dipy with this binary erosion bug fix
        # erode mask by the convolution kernel shape
        # NOTE(review): the structuring-element argument was missing from this
        # copy; a full box with the kernel's 3x3x3 shape is assumed -- confirm
        # against the upstream implementation.
        mask = ndi.binary_erosion(mask,
                                  structure=cp.ones(k.shape, dtype=bool))

    # TODO: make upstream PR at dipy that avoids an explicit loop over slices
    conv_out = cp.empty(arr.shape, dtype=np.float64)

    # The trailing singleton axis on the kernel means volumes are convolved
    # independently: conv_out holds the sum of the 6 face neighbours.
    ndi.convolve(arr, k[..., np.newaxis], output=conv_out)
    mean_block = arr - conv_out / 6
    if mask is None:
        tmp = mean_block.reshape((-1, mean_block.shape[-1]))
    else:
        tmp = mean_block[mask]
    tmp *= math.sqrt(6 / 7)
    tmp *= tmp  # square in place
    sigma = cp.sqrt(cp.mean(tmp, axis=0) / factor)
    return sigma
def _denoise_tv_chambolle_nd(image, weight=0.1, eps=2.0e-4, n_iter_max=200):
    """Perform total-variation denoising on n-dimensional images.

    image : ndarray
        n-D input data to be denoised.
    weight : float, optional
        Denoising weight. The greater `weight`, the more denoising (at
        the expense of fidelity to `input`).
    eps : float, optional
        Relative difference of the value of the cost function that determines
        the stop criterion. The algorithm stops when:

            (E_(n-1) - E_n) < eps * E_0

    n_iter_max : int, optional
        Maximal number of iterations used for the optimization.

    out : ndarray
        Denoised array of floats.

    Rudin, Osher and Fatemi algorithm.


    ndim = image.ndim
    p = cp.zeros((image.ndim, ) + image.shape, dtype=image.dtype)
    g = cp.zeros_like(p)
    d = cp.zeros_like(image)
    i = 0
    slices_g = [slice(None)] * (ndim + 1)
    slices_d = [slice(None)] * ndim
    slices_p = [slice(None)] * (ndim + 1)
    while i < n_iter_max:
        if i > 0:
            # d will be the (negative) divergence of p
            d = -p.sum(0)
            for ax in range(ndim):
                slices_d[ax] = slice(1, None)
                slices_p[ax + 1] = slice(0, -1)
                slices_p[0] = ax
                d[tuple(slices_d)] += p[tuple(slices_p)]
                slices_d[ax] = slice(None)
                slices_p[ax + 1] = slice(None)
            out = image + d
            E = (d * d).sum()
            out = image
            E = 0.0

        # g stores the gradients of out along each axis
        # e.g. g[0] is the first order finite difference along axis 0
        for ax in range(ndim):
            slices_g[ax + 1] = slice(0, -1)
            slices_g[0] = ax
            g[tuple(slices_g)] = cp.diff(out, axis=ax)
            slices_g[ax + 1] = slice(None)

        norm = (g * g).sum(axis=0, keepdims=True)
        cp.sqrt(norm, out=norm)
        E += weight * norm.sum()
        tau = 1.0 / (2.0 * ndim)
        norm *= tau / weight
        norm += 1.0
        p -= tau * g
        p /= norm
        E /= float(image.size)
        if i == 0:
            E_init = E
            E_previous = E
            if abs(E_previous - E) < eps * E_init:
                E_previous = E
        i += 1
    return out
def norm(x, ord=None, axis=None, keepdims=False):
    """Returns one of matrix norms specified by ``ord`` parameter.

    See numpy.linalg.norm for more detail.

    Args:
        x (cupy.ndarray): Array to take norm. If ``axis`` is None,
            ``x`` must be 1-D or 2-D.
        ord (non-zero int, inf, -inf, 'fro'): Norm type.
        axis (int, 2-tuple of ints, None): 1-D or 2-D norm is computed over
            ``axis``.
        keepdims (bool): If this is set ``True``, the axes which are normed
            over are left.

    Returns:
        cupy.ndarray: The requested norm.

    Raises:
        TypeError: If ``axis`` is not None, an integer or a tuple.
        ValueError: If ``ord`` is not valid for the selected vector/matrix
            case, if the two axes are invalid or identical, or if more than
            two axes are normed over.

    """
    if not issubclass(x.dtype.type, numpy.inexact):
        x = x.astype(float)

    # Immediately handle some default, simple, fast, and common cases.
    if axis is None:
        ndim = x.ndim
        if (ord is None or (ndim == 1 and ord == 2) or
                (ndim == 2 and ord in ('f', 'fro'))):
            if x.dtype.kind == 'c':
                s = abs(x.ravel())
                s *= s
                ret = cupy.sqrt(s.sum())
            else:
                ret = cupy.sqrt((x * x).sum())
            if keepdims:
                ret = ret.reshape((1,) * ndim)
            return ret

    # Normalize the `axis` argument to a tuple.
    nd = x.ndim
    if axis is None:
        axis = tuple(range(nd))
    elif not isinstance(axis, tuple):
        try:
            axis = int(axis)
        except Exception:
            raise TypeError(
                '\'axis\' must be None, an integer or a tuple of integers')
        axis = (axis,)

    if len(axis) == 1:
        # ``numpy.inf`` is used because the ``numpy.Inf`` alias no longer
        # exists in NumPy 2.
        if ord == numpy.inf:
            return abs(x).max(axis=axis, keepdims=keepdims)
        elif ord == -numpy.inf:
            return abs(x).min(axis=axis, keepdims=keepdims)
        elif ord == 0:
            # Zero norm
            # Convert to Python float in accordance with NumPy
            return (x != 0).astype(x.real.dtype).sum(
                axis=axis, keepdims=keepdims)
        elif ord == 1:
            # special case for speedup
            return abs(x).sum(axis=axis, keepdims=keepdims)
        elif ord is None or ord == 2:
            # special case for speedup
            if x.dtype.kind == 'c':
                s = abs(x)
                s *= s
            else:
                s = x * x
            return cupy.sqrt(s.sum(axis=axis, keepdims=keepdims))
        else:
            # Reject orders that are not numeric (e.g. strings) up front.
            try:
                float(ord)
            except (TypeError, ValueError):
                raise ValueError('Invalid norm order for vectors.')

            absx = abs(x)
            absx **= ord
            ret = absx.sum(axis=axis, keepdims=keepdims)
            ret **= cupy.reciprocal(ord, dtype=ret.dtype)
            return ret
    elif len(axis) == 2:
        row_axis, col_axis = axis
        if row_axis < 0:
            row_axis += nd
        if col_axis < 0:
            col_axis += nd
        if not (0 <= row_axis < nd and 0 <= col_axis < nd):
            raise ValueError('Invalid axis %r for an array with shape %r' %
                             (axis, x.shape))
        if row_axis == col_axis:
            raise ValueError('Duplicate axes given.')
        if ord == 2:
            op_max = functools.partial(cupy.take, indices=0)
            ret = _multi_svd_norm(x, row_axis, col_axis, op_max)
        elif ord == -2:
            op_min = functools.partial(cupy.take, indices=-1)
            ret = _multi_svd_norm(x, row_axis, col_axis, op_min)
        elif ord == 1:
            # After summing over row_axis the remaining axes shift down.
            if col_axis > row_axis:
                col_axis -= 1
            ret = abs(x).sum(axis=row_axis).max(axis=col_axis)
        elif ord == numpy.inf:
            if row_axis > col_axis:
                row_axis -= 1
            ret = abs(x).sum(axis=col_axis).max(axis=row_axis)
        elif ord == -1:
            if col_axis > row_axis:
                col_axis -= 1
            ret = abs(x).sum(axis=row_axis).min(axis=col_axis)
        elif ord == -numpy.inf:
            if row_axis > col_axis:
                row_axis -= 1
            ret = abs(x).sum(axis=col_axis).min(axis=row_axis)
        elif ord in [None, 'fro', 'f']:
            if x.dtype.kind == 'c':
                s = abs(x)
                s *= s
                ret = cupy.sqrt(s.sum(axis=axis))
            else:
                ret = cupy.sqrt((x * x).sum(axis=axis))
        elif ord == 'nuc':
            ret = _multi_svd_norm(x, row_axis, col_axis, cupy.sum)
        else:
            raise ValueError('Invalid norm order for matrices.')
        if keepdims:
            ret_shape = list(x.shape)
            ret_shape[axis[0]] = 1
            ret_shape[axis[1]] = 1
            ret = ret.reshape(ret_shape)
        return ret
    else:
        raise ValueError('Improper number of dimensions to norm.')
def voronoi(func,
    # NOTE(review): the parameter list appears truncated in this copy -- the
    # body reads `icp`, `frequency`, `seed`, `distance_enabled` and
    # `displacement`, none of which are bound by the header shown here.
    #
    # Purpose (from the code below): for every input point, search the
    # surrounding 5x5x5 block of integer cells for the closest jittered seed
    # point, then return `displacement * func(...)` evaluated at the floor of
    # that seed point, plus an optional distance term.

    # Shape of the point grid: icp's shape without its last axis (the last
    # axis is indexed 0..2 below, i.e. holds the three coordinates).
    val_shape = np.array(icp.shape)
    val_shape = val_shape[0:-1]
    value = None

    # Coordinates scaled by the frequency.
    icp_f = frequency * icp

    # NOTE(review): this first assignment is immediately overwritten by the
    # next line -- possibly the remains of a lost conditional; confirm.
    icp_int = icp_f
    icp_int = icp_f.astype(int)
    # Non-positive coordinates are moved one cell down after the integer cast.
    icp_int[icp_f <= 0.0] = (icp_int - 1)[icp_f <= 0.0]
    # Scratch array reused to pass candidate cell coordinates to `func`.
    temp = icp_int.copy()

    xInt = icp_int[..., 0]
    yInt = icp_int[..., 1]
    zInt = icp_int[..., 2]

    # Coordinates of the closest seed point found so far.
    xc = cp.zeros(xInt.shape)
    yc = cp.zeros(yInt.shape)
    zc = cp.zeros(zInt.shape)

    # Squared distance to the closest seed so far; starts at a large sentinel.
    minDist = cp.ones(val_shape) * 2147483647.0

    # Visit every cell within +/-2 of the point's own cell along each axis.
    for xi in range(-2, 3):
        for yi in range(-2, 3):
            for zi in range(-2, 3):
                xcur = xInt + xi
                ycur = yInt + yi
                zcur = zInt + zi

                temp[..., 0] = xcur
                temp[..., 1] = ycur
                temp[..., 2] = zcur

                # Seed point of this cell: the cell coordinate plus an offset
                # from `func`, using a different seed for each axis.
                xp = xcur + func(temp, seed=seed)
                yp = ycur + func(temp, seed=seed + 1)
                zp = zcur + func(temp, seed=seed + 2)

                xd = xp - icp_f[..., 0]
                yd = yp - icp_f[..., 1]
                zd = zp - icp_f[..., 2]

                # Squared Euclidean distance from the point to this seed.
                dist = xd * xd + yd * yd + zd * zd

                # Keep this seed wherever it is closer than the best so far;
                # minDist is updated last so the same comparison is used for
                # all three coordinates.
                xc[dist < minDist] = xp[dist < minDist]
                yc[dist < minDist] = yp[dist < minDist]
                zc[dist < minDist] = zp[dist < minDist]
                minDist[dist < minDist] = dist[dist < minDist]

    if distance_enabled:
        xd = xc - icp_f[..., 0]
        yd = yc - icp_f[..., 1]
        zd = zc - icp_f[..., 2]
        # Distance to the closest seed, scaled by 1.7320508... (numerically
        # sqrt(3)) and shifted down by 1.
        value = cp.sqrt(xd * xd + yd * yd +
                        zd * zd) * 1.7320508075688772935 - 1.0
        # NOTE(review): as written this line always overwrites the distance
        # term with 0.0 -- it looks like an `else:` branch whose header was
        # lost; confirm against the original.
        value = 0.0

    # Cell containing the closest seed point.
    temp[..., 0] = cp.floor(xc)
    temp[..., 1] = cp.floor(yc)
    temp[..., 2] = cp.floor(zc)

    return value + displacement * func(temp, seed=0)
    def WISHrun(self, y0: np.ndarray, SLM: np.ndarray, delta3: float, delta4: float, N_os: int, N_iter: int,\
                N_batch: int, plot: bool=True):
        """
        Runs the WISH algorithm using a Gerchberg Saxton loop for phase retrieval.
        :param y0: Target modulated amplitudes in the sensor plane
        :param SLM: SLM modulation patterns
        :param delta3: Apparent sampling size of the SLM as seen from the sensor plane
        :param delta4: Sampling size of the sensor plane
        :param N_os: Number of observations per image
        :param N_iter: Maximal number of Gerchberg Saxton iterations (the
            returned field is only assigned inside the loop, so at least one
            iteration is required)
        :param N_batch: Number of batches (modulations)
        :param plot: If True, plots the advance of the retrieval every 10 iterations
        :return u4_est, idx_converge: Estimated field of size (N,N) and the convergence indices to check convergence
        """
        wvl = self.wavelength
        z3 = self.z
        ## parameters
        N = y0.shape[0]
        k = 2 * np.pi / wvl
        u3_batch = cp.zeros((N, N, N_os),
                            dtype=cp.complex64)  # store all U3 gpu
        u4 = cp.zeros((N, N, N_os), dtype=cp.complex64)  # gpu
        y = cp.zeros((N, N, N_os), dtype=cp.complex64)  # store all U3 gpu

        ## initialize u3
        # Quadratic phase factor Q on the sensor grid; the grid is centred so
        # that the coordinates run from -N/2 to N/2 - 1 samples.
        coords = cp.arange(N, dtype=cp.float64) - N / 2
        grid_x, grid_y = cp.meshgrid(coords, coords)
        X = float(delta4) * grid_x
        Y = float(delta4) * grid_y
        R = cp.sqrt(X**2 + Y**2)
        Q = cp.exp(1j * (k / (2 * z3)) * R**2)
        for ii in range(N_os):
            SLM_batch = cp.asarray(SLM[:, :, ii])
            y0_batch = y0[:, :, ii]
            # Back-propagate each measurement and undo its SLM modulation.
            u3_batch[:, :, ii] = self.frt_gpu_s(
                cp.asarray(y0_batch) / Q, delta4, -z3) * cp.conj(SLM_batch)
        u3 = cp.mean(u3_batch, 2)  # average it

        ## Recon run : GS loop
        idx_converge = np.empty(N_iter)
        for jj in range(N_iter):
            sys.stdout.write(f"\rGS iteration {jj+1}")
            u3_collect = cp.zeros(u3.shape, dtype=cp.complex64)
            idx_converge0 = np.empty(N_batch)
            for idx_batch in range(N_batch):
                # put the correct batch into the GPU
                batch = slice(int(N_os * idx_batch),
                              int(N_os * (idx_batch + 1)))
                SLM_batch = cp.asarray(SLM[:, :, batch])
                y0_batch = cp.asarray(y0[:, :, batch])
                for obs in range(N_os):
                    # U4 is the field on the sensor
                    u4[:, :, obs] = self.frt_gpu_s(
                        u3 * SLM_batch[:, :, obs], delta3, z3)
                    # force the amplitude of y to be y0
                    y[:, :, obs] = y0_batch[:, :, obs] * cp.exp(
                        1j * cp.angle(u4[:, :, obs]))
                    u3_batch[:, :, obs] = self.frt_gpu_s(
                        y[:, :, obs], delta4, -z3) * cp.conj(
                            SLM_batch[:, :, obs])
                # collect(add) U3 from each batch
                u3_collect = u3_collect + cp.mean(u3_batch, 2)
                # convergence index for each batch; float() brings the GPU
                # scalar to the host before it is stored in a NumPy array
                idx_converge0[idx_batch] = float(
                    cp.linalg.norm(cp.abs(u4) - y0_batch) /
                    cp.linalg.norm(y0_batch))

            u3 = (u3_collect / N_batch)  # average over batches
            idx_converge[jj] = np.mean(idx_converge0)  # mean over batches
            sys.stdout.write(f"  (convergence index : {idx_converge[jj]})")
            u4_est = cp.asnumpy(self.frt_gpu_s(u3, delta3, z3) * Q)

            if jj % 10 == 0 and plot:
                fig = plt.figure(0)
                fig.suptitle(f'Iteration {jj}')
                ax1 = fig.add_subplot(121)
                ax2 = fig.add_subplot(122)
                im = ax1.imshow(np.abs(u4_est), cmap='viridis')
                ax2.imshow(np.angle(u4_est), cmap='viridis')

                fig1 = plt.figure(1)
                ax = fig1.gca()
                ax.plot(np.arange(0, jj, 1), idx_converge[0:jj], marker='o')
                ax.set_ylabel('Convergence estimator')
                ax.set_title('Convergence curve')

            # exit if the convergence index doesn't change much
            if jj > 1:
                rel_change = abs(idx_converge[jj] -
                                 idx_converge[jj - 1]) / idx_converge[jj]
                if rel_change < 1e-4:
                    print('\nConverged. Exit the GS loop ...')
                    idx_converge = idx_converge[0:jj]
                    # Leave the loop here: continuing would index past the
                    # truncated idx_converge on the next iteration.
                    break
        return u4_est, idx_converge
def normalize(data, mask=None, poly_fit=0):
    """ Apply normalization on GPU

    Applies normalisation (data - mean) / stdev, separately for each IF.
    ``data`` is modified in place and is also the returned object.

    Args:
        data (np/cp.array): Data to preprocess; its shape is unpacked as
                            (n_int, n_ifs, n_chan).
        mask (np.cp.array): 1D Channel mask for RFI flagging. Channels where
                            the mask is True are left out of the polynomial
                            fit and of the mean/stdev estimate. Defaults to
                            no channel masked.
        poly_fit (int): Fit polynomial of degree N, 0 = no fit.

    Returns: d_gpu (cp.array): Normalized data
    """
    _, n_ifs, n_chan = data.shape

    # Setup 1D channel mask -- used for polynomial fitting
    if mask is None:
        mask = cp.zeros(n_chan, dtype='bool')
    unmasked = ~mask

    # Channel axis; identical for every IF, so built once.
    # WAR: int64 dtype causes issues in cupy 10 (19.04.2022)
    x = cp.arange(n_chan, dtype='float64')
    xc = cp.compress(unmasked, x)

    # Do polynomial fit and compute stats (with masking)
    d_mean_ifs, d_std_ifs = cp.zeros(n_ifs), cp.zeros(n_ifs)

    for ii in range(n_ifs):
        # Time-averaged bandpass of this IF, unmasked channels only.
        dfit = cp.compress(unmasked, data[:, ii].mean(axis=0))

        if poly_fit > 0:
            p = cp.poly1d(cp.polyfit(xc, dfit, poly_fit))
            fit = p(x)
            dfit -= p(xc)
            data[:, ii] = data[:, ii] - fit

        # compute mean and stdev
        dmean = dfit.mean()
        dvar = ((data[:, ii] - dmean)**2).mean(axis=0)
        dvar = cp.compress(unmasked, dvar).mean()
        d_mean_ifs[ii] = dmean
        d_std_ifs[ii] = cp.sqrt(dvar)

    #  Apply to original data
    for ii in range(n_ifs):
        data[:, ii] = ((data[:, ii] - d_mean_ifs[ii]) / d_std_ifs[ii])

    return data
def gsw_dHdT(sa, ct, p):
    """d/dT of dynamic enthalpy, analytical derivative

    sa     : Absolute Salinity                               [g/kg]
    ct     : Conservative Temperature                        [deg C]
    p      : sea pressure                                    [dbar]

    Returns the value of the final intermediate term ``t305``. The
    coefficients ``v01`` ... ``v48`` are not defined in this function and are
    read from the enclosing module. The ``tNN`` names are intermediate terms
    of the expanded derivative; their numbering is kept unchanged.
    """
    t1 = v45 * ct
    t2 = 0.2e1 * t1
    t3 = v46 * sa
    t4 = 0.5 * v12
    t5 = v14 * ct
    t7 = ct * (v13 + t5)
    t8 = 0.5 * t7
    t11 = sa * (v15 + v16 * ct)
    t12 = 0.5 * t11
    t13 = t4 + t8 + t12
    t15 = v19 * ct
    t19 = v17 + ct * (v18 + t15) + v20 * sa
    t20 = 1.0 / t19
    t24 = v47 + v48 * ct
    t25 = 0.5 * v13
    t26 = 1.0 * t5
    t27 = sa * v16
    t28 = 0.5 * t27
    t29 = t25 + t26 + t28
    t33 = t24 * t13
    t34 = t19**2
    t35 = 1.0 / t34
    t37 = v18 + 2.0 * t15
    t38 = t35 * t37
    t48 = ct * (v44 + t1 + t3)
    t57 = v40 * ct
    t59 = ct * (v39 + t57)
    t64 = t13**2
    t68 = t20 * t29
    t71 = t24 * t64
    t74 = v04 * ct
    t76 = ct * (v03 + t74)
    t79 = v07 * ct
    t82 = cp.sqrt(sa)
    t83 = v11 * ct
    t85 = ct * (v10 + t83)
    t92 = (v01 + ct * (v02 + t76) + sa * (v05 + ct * (v06 + t79) + t82 *
                                          (v08 + ct * (v09 + t85))))
    t93 = v48 * t92
    t105 = (v02 + t76 + ct * (v03 + 2.0 * t74) + sa *
            (v06 + 2.0 * t79 + t82 * (v09 + t85 + ct * (v10 + 2.0 * t83))))
    t106 = t24 * t105
    t107 = v44 + t2 + t3
    t110 = v43 + t48
    t117 = t24 * t92
    t120 = 4.0 * t71 * t20 - t117 - 2.0 * t110 * t13
    t123 = (v38 + t59 + ct * (v39 + 2.0 * t57) + sa * v42 +
            (4.0 * v48 * t64 * t20 + 8.0 * t33 * t68 - 4.0 * t71 * t38 - t93 -
             t106 - 2.0 * t107 * t13 - 2.0 * t110 * t29) * t20 -
            t120 * t35 * t37)
    t128 = t19 * p
    t130 = p * (1.0 * v12 + 1.0 * t7 + 1.0 * t11 + t128)
    t131 = 1.0 / t92
    t133 = 1.0 + t130 * t131
    t134 = cp.log(t133)
    t143 = v37 + ct * (v38 + t59) + sa * (v41 + v42 * ct) + t120 * t20
    t152 = t37 * p
    t156 = t92**2
    t165 = v25 * ct
    t167 = ct * (v24 + t165)
    t169 = ct * (v23 + t167)
    t175 = v30 * ct
    t177 = ct * (v29 + t175)
    t179 = ct * (v28 + t177)
    t185 = v35 * ct
    t187 = ct * (v34 + t185)
    t189 = ct * (v33 + t187)
    t199 = t13 * t20
    t217 = 2.0 * t117 * t199 - t110 * t92
    t234 = (v21 + ct * (v22 + t169) + sa *
            (v26 + ct * (v27 + t179) + v36 * sa + t82 *
             (v31 + ct * (v32 + t189))) + t217 * t20)
    t241 = t64 - t92 * t19
    t242 = cp.sqrt(t241)
    t243 = 1.0 / t242
    t244 = t4 + t8 + t12 - t242
    t245 = 1.0 / t244
    t247 = t4 + t8 + t12 + t242 + t128
    t248 = 1.0 / t247
    t249 = t242 * t245 * t248
    t252 = 1.0 + 2.0 * t128 * t249
    t253 = cp.log(t252)
    t254 = t243 * t253
    t259 = t234 * t19 - t143 * t13
    t264 = t259 * t20
    t272 = 2.0 * t13 * t29 - t105 * t19 - t92 * t37
    t282 = t128 * t242
    t283 = t244**2
    t287 = t243 * t272 / 2.0
    t292 = t247**2
    t305 = (0.1e5 * p *
            (v44 + t2 + t3 - 2.0 * v48 * t13 * t20 - 2.0 * t24 * t29 * t20 +
             2.0 * t33 * t38 + 0.5 * v48 * p) * t20 - 0.1e5 * p *
            (v43 + t48 - 2.0 * t33 * t20 + 0.5 * t24 * p) * t38 +
            0.5e4 * t123 * t20 * t134 - 0.5e4 * t143 * t35 * t134 * t37 +
            0.5e4 * t143 * t20 *
            (p * (1.0 * v13 + 2.0 * t5 + 1.0 * t27 + t152) * t131 -
             t130 / t156 * t105) / t133 + 0.5e4 *
            ((v22 + t169 + ct * (v23 + t167 + ct * (v24 + 2.0 * t165)) + sa *
              (v27 + t179 + ct * (v28 + t177 + ct * (v29 + 2.0 * t175)) + t82 *
               (v32 + t189 + ct * (v33 + t187 + ct * (v34 + 2.0 * t185)))) +
              (2.0 * t93 * t199 + 2.0 * t106 * t199 + 2.0 * t117 * t68 -
               2.0 * t117 * t13 * t35 * t37 - t107 * t92 - t110 * t105) * t20 -
              t217 * t35 * t37) * t19 + t234 * t37 - t123 * t13 - t143 * t29) *
            t20 * t254 - 0.5e4 * t259 * t35 * t254 * t37 -
            0.25e4 * t264 / t242 / t241 * t253 * t272 + 0.5e4 * t264 * t243 *
            (2.0 * t152 * t249 + t128 * t243 * t245 * t248 * t272 -
             2.0 * t282 / t283 * t248 *
             (t25 + t26 + t28 - t287) - 2.0 * t282 * t245 / t292 *
             (t25 + t26 + t28 + t287 + t152)) / t252)

    return t305
def ccg_slow(st1, st2, nbins, tbin):
    """Cross-correlogram of two spike trains plus refractoriness statistics.

    Both spike trains are sorted and then traversed together, so that every
    spike of ``st2`` is only compared with the spikes of ``st1`` that lie
    within ``nbins * tbin`` of it.  The resulting histogram is then used to
    estimate how refractory the cross-correlogram is, which can be used
    during merge decisions.

    Parameters
    ----------
    st1, st2 : array
        Spike times of the two spike trains (same units as ``tbin``).
    nbins : int
        Number of time lags on each side of zero.
    tbin : float
        Width of one histogram bin.

    Returns
    -------
    K : array, shape (2 * nbins + 1,)
        Cross-correlogram counts; index ``nbins`` is the zero-lag bin.
    Qi : array, shape (10,)
        Normalised count in the central window of half-width ``i`` bins
        (``i = 1..10``), with the zero-lag bin excluded.
    Q00 : scalar
        Normalised count in the outer shoulders of the correlogram.
    Q01 : scalar
        Largest normalised count of the two inner shoulders.
    Ri : array, shape (10,)
        Gaussian approximation of the probability of observing the central
        count given the shoulder rate, for each central window.
    """
    # make sure both spike trains are sorted in increasing order
    st1 = cp.sort(st1)
    st2 = cp.sort(st2)

    dt = nbins * tbin

    N1 = max(1, len(st1))
    N2 = max(1, len(st2))
    all_spikes = cp.concatenate((st1, st2))
    T = all_spikes.max() - all_spikes.min()

    # we traverse both spike trains together, keeping track of the spikes in the first
    # spike train that are within dt of spikes in the second spike train

    ilow = 0  # lower bound index
    ihigh = 0  # higher bound index
    j = 0  # index of the considered spike

    K = cp.zeros(2 * nbins + 1)

    # (DEV_NOTES) the while loop below is far too slow as is

    while j <= N2 - 1:  # traverse all spikes in the second spike train

        while (ihigh <= N1 - 1) and (st1[ihigh] < st2[j] + dt):
            ihigh += 1  # keep increasing higher bound until it's OUTSIDE of dt range

        while (ilow <= N1 - 1) and (st1[ilow] <= st2[j] - dt):
            ilow += 1  # keep increasing lower bound until it's INSIDE of dt range

        if ilow > N1 - 1:
            break  # break if we exhausted the spikes from the first spike train

        if st1[ilow] > st2[j] + dt:
            # if the lower bound is actually outside of dt range, means we overshot (there were no
            # spikes in range)
            # simply move on to next spike from second spike train.  The `continue` matters:
            # without it the increment at the bottom of the loop runs as well and the next
            # spike of st2 is silently skipped.
            j += 1
            continue

        for k in range(ilow, ihigh):
            # for all spikes within plus/minus dt range
            ibin = cp.rint(
                (st2[j] - st1[k]) / tbin).astype(int)  # convert ISI to integer

            K[ibin + nbins] += 1

        j += 1

    # NOTE(review): the shoulder windows below are hard-coded at 11..50 bins
    # from the centre; presumably nbins is expected to be well above 50 - confirm.
    irange1 = cp.concatenate(
        (cp.arange(1, nbins // 2), cp.arange(3 * nbins // 2, 2 * nbins)))
    irange2 = cp.arange(nbins - 50, nbins - 10)
    irange3 = cp.arange(nbins + 11, nbins + 50)

    # normalize the shoulders by what's expected from the mean firing rates
    # a non-refractive poisson process should yield 1

    Q00 = cp.sum(K[irange1]) / (len(irange1) * tbin * N1 * N2 / T)
    # do the same for irange 2
    Q01 = cp.sum(K[irange2]) / (len(irange2) * tbin * N1 * N2 / T)
    # compare to the other shoulder
    Q01 = max(Q01, cp.sum(K[irange3]) / (len(irange3) * tbin * N1 * N2 / T))

    R00 = max(mean(K[irange2]), mean(K[irange3]))  # take the biggest shoulder
    R00 = max(R00, mean(K[irange1]))  # compare this to the asymptotic shoulder

    # test the probability that a central area in the autocorrelogram might be refractory
    # test increasingly larger areas of the central CCG

    # Copy the zero-lag count: integer indexing of a GPU array can return a
    # zero-dimensional view, which would be zeroed together with K[nbins]
    # and make the restore at the end a no-op.
    a = K[nbins].copy()
    K[nbins] = 0

    Qi = cp.zeros(10)
    Ri = cp.zeros(10)

    for i in range(1, 11):
        irange = cp.arange(nbins - i,
                           nbins + i + 1)  # for this central range of the CCG
        central_count = cp.sum(K[irange])
        # compute the normalised ratio as above. this should be 1 if there is no refractoriness
        Qi0 = central_count / (2 * i * tbin * N1 * N2 / T)
        Qi[i - 1] = Qi0  # save the normalised probability

        n = central_count / 2
        lam = R00 * i

        # log(p) = log(lam) * n - lam - gammaln(n+1)

        # this is tricky: we approximate the Poisson likelihood with a gaussian of equal mean and
        # variance that allows us to integrate the probability that we would see <N spikes in the
        # center of the cross-correlogram from a distribution with mean R00*i spikes

        p = 1 / 2 * (1 + erf((n - lam) / cp.sqrt(2 * lam)))

        Ri[i - 1] = p  # keep track of p for each bin size i

    K[nbins] = a  # restore the center value of the cross-correlogram

    return K, Qi, Q00, Q01, Ri
def svd_wrapper(matrix, mode, ncomp, verbose, full_output=False,
                random_state=None, to_numpy=True):
    """ Wrapper for different SVD libraries (CPU and GPU).

    Parameters
    ----------
    matrix : numpy ndarray, 2d
        2d input matrix.
    mode : {'lapack', 'arpack', 'eigen', 'randsvd', 'cupy', 'eigencupy',
        'randcupy', 'pytorch', 'eigenpytorch', 'randpytorch'}, str
        Switch for the SVD method/library to be used.

        ``lapack``: uses the LAPACK linear algebra library through Numpy
        and it is the most conventional way of computing the SVD
        (deterministic result computed on CPU).

        ``arpack``: uses the ARPACK Fortran libraries accessible through
        Scipy (computation on CPU).

        ``eigen``: computes the singular vectors through the
        eigendecomposition of the covariance M.M' (computation on CPU).

        ``randsvd``: uses the randomized_svd algorithm implemented in
        Sklearn (computation on CPU).

        ``cupy``: uses the Cupy library for GPU computation of the SVD as in
        the LAPACK version.

        ``eigencupy``: offers the same method as with the ``eigen`` option
        but on GPU (through Cupy).

        ``randcupy``: is an adaptation of the randomized_svd algorithm,
        where all the computations are done on a GPU (through Cupy).

        ``pytorch``: uses the Pytorch library for GPU computation of the SVD.

        ``eigenpytorch``: offers the same method as with the ``eigen``
        option but on GPU (through Pytorch).

        ``randpytorch``: is an adaptation of the randomized_svd algorithm,
        where all the linear algebra computations are done on a GPU
        (through Pytorch).

    ncomp : int
        Number of singular vectors to be obtained. In the cases when the full
        SVD is computed (LAPACK, ARPACK, EIGEN, CUPY), the matrix of singular
        vectors is truncated.
    verbose: bool
        If True intermediate information is printed out.
    full_output : bool optional
        If True the 3 terms of the SVD factorization are returned. If ``mode``
        is eigen then only S and V are returned.
    random_state : int, RandomState instance or None, optional
        If int, random_state is the seed used by the random number generator.
        If RandomState instance, random_state is the random number generator.
        If None, the random number generator is the RandomState instance used
        by np.random. Used for ``randsvd`` mode.
    to_numpy : bool, optional
        If True (by default) the arrays computed in GPU are transferred from
        VRAM and converted to numpy ndarrays.

    Returns
    -------
    V : numpy ndarray
        The right singular vectors of the input matrix. If ``full_output`` is
        True it returns the left and right singular vectors and the singular
        values of the input matrix. If ``mode`` is set to eigen then only S and
        V are returned.

    Raises
    ------
    TypeError
        If ``matrix`` is not 2d.
    RuntimeError
        If ``ncomp`` exceeds the smallest dimension of ``matrix``, or if a
        GPU mode is requested while the corresponding library is flagged as
        unavailable.
    ValueError
        If ``mode`` is not one of the supported values.

    Notes
    -----
    * For ``randsvd`` SVD mode see:
        Finding structure with randomness: Stochastic algorithms for constructing
        approximate matrix decompositions
        Halko, et al., 2009 http://arxiv.org/abs/arXiv:0909.4061

    """
    if matrix.ndim != 2:
        raise TypeError('Input matrix is not a 2d array')

    if ncomp > min(matrix.shape[0], matrix.shape[1]):
        msg = '{} PCs cannot be obtained from a matrix with size [{},{}].'
        msg += ' Increase the size of the patches or request less PCs'
        raise RuntimeError(msg.format(ncomp, matrix.shape[0], matrix.shape[1]))

    if mode == 'eigen':
        # building C as np.dot(matrix.T,matrix) is slower and takes more memory
        C = np.dot(matrix, matrix.T)    # covariance matrix
        e, EV = linalg.eigh(C)          # EVals and EVs
        pc = np.dot(EV.T, matrix)       # PCs using a compact trick when cov is MM'
        V = pc[::-1]                    # reverse since we need the last EVs
        S = np.sqrt(np.abs(e))          # SVals = sqrt(EVals)
        S = S[::-1]                     # reverse since EVals go in increasing order
        V /= S[:, np.newaxis]           # scaling EVs by the square root of EVals
        V = V[:ncomp]
        if verbose:
            print('Done PCA with numpy linalg eigh functions')

    elif mode == 'lapack':
        # n_frames is usually smaller than n_pixels. In this setting taking
        # the SVD of M' and keeping the left (transposed) SVs is faster than
        # taking the SVD of M (right SVs)
        U, S, V = linalg.svd(matrix.T, full_matrices=False)
        V = V[:ncomp]       # we cut projection matrix according to the # of PCs
        U = U[:, :ncomp]
        S = S[:ncomp]
        if verbose:
            print('Done SVD/PCA with numpy SVD (LAPACK)')

    elif mode == 'arpack':
        U, S, V = svds(matrix, k=ncomp)
        if verbose:
            print('Done SVD/PCA with scipy sparse SVD (ARPACK)')

    elif mode == 'randsvd':
        U, S, V = randomized_svd(matrix, n_components=ncomp, n_iter=2,
                                 transpose='auto', random_state=random_state)
        if verbose:
            print('Done SVD/PCA with randomized SVD')

    elif mode == 'cupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        a_gpu = cupy.array(matrix)
        a_gpu = cupy.asarray(a_gpu)  # move the data to the current device
        u_gpu, s_gpu, vh_gpu = cupy.linalg.svd(a_gpu, full_matrices=True,
                                               compute_uv=True)
        V = vh_gpu[:ncomp]
        if to_numpy:
            V = cupy.asnumpy(V)
        if full_output:
            S = s_gpu[:ncomp]
            if to_numpy:
                S = cupy.asnumpy(S)
            U = u_gpu[:, :ncomp]
            if to_numpy:
                U = cupy.asnumpy(U)
        if verbose:
            print('Done SVD/PCA with cupy (GPU)')

    elif mode == 'randcupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='cupy')
        if to_numpy:
            V = cupy.asnumpy(V)
            S = cupy.asnumpy(S)
            U = cupy.asnumpy(U)
        if verbose:
            print('Done randomized SVD/PCA with cupy (GPU)')

    elif mode == 'eigencupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        a_gpu = cupy.array(matrix)
        a_gpu = cupy.asarray(a_gpu)     # move the data to the current device
        C = cupy.dot(a_gpu, a_gpu.T)    # covariance matrix
        e, EV = cupy.linalg.eigh(C)     # eigenvalues and eigenvectors
        pc = cupy.dot(EV.T, a_gpu)      # using a compact trick when cov is MM'
        V = pc[::-1]                    # reverse to get last eigenvectors
        S = cupy.sqrt(e)[::-1]          # reverse since EVals go in increasing order
        V /= S[:, None]                 # scaling by the square root of eigvals
        V = V[:ncomp]
        if to_numpy:
            V = cupy.asnumpy(V)
        if verbose:
            print('Done PCA with cupy eigh function (GPU)')

    elif mode == 'pytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32').T))
        u_gpu, s_gpu, vh_gpu = torch.svd(a_gpu)
        V = vh_gpu[:ncomp]
        S = s_gpu[:ncomp]
        U = torch.transpose(u_gpu, 0, 1)[:ncomp]
        if to_numpy:
            # NOTE(review): np.array() on a CUDA tensor may need an explicit
            # .cpu() first - confirm against the supported torch versions.
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if verbose:
            print('Done SVD/PCA with pytorch (GPU)')

    elif mode == 'eigenpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32')))
        C = torch.mm(a_gpu, torch.transpose(a_gpu, 0, 1))
        e, EV = torch.eig(C, eigenvectors=True)
        V = torch.mm(torch.transpose(EV, 0, 1), a_gpu)
        S = torch.sqrt(e[:, 0])
        V /= S[:, None]                 # scaling by the square root of eigvals
        V = V[:ncomp]
        if to_numpy:
            V = np.array(V)
        if verbose:
            print('Done PCA with pytorch eig function')

    elif mode == 'randpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='pytorch')
        if to_numpy:
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if verbose:
            print('Done randomized SVD/PCA with randomized pytorch (GPU)')

    else:
        raise ValueError('The SVD `mode` is not recognized')

    if full_output:
        if mode == 'lapack':
            # the SVD was taken on matrix.T, so the roles of U and V swap
            return V.T, S, U.T
        elif mode == 'pytorch':
            if to_numpy:
                return V.T, S, U.T
            else:
                return torch.transpose(V, 0, 1), S, torch.transpose(U, 0, 1)
        elif mode in ('eigen', 'eigencupy', 'eigenpytorch'):
            return S, V
        else:
            return U, S, V
    else:
        if mode == 'lapack':
            return U.T
        elif mode == 'pytorch':
            return U
        else:
            return V
    # Generate vortex positions:
    N_vort = 48**2

    if loading_vortex_pos:
        with h5py.File('vortex_positions/vortex_pos_uniform.hdf5',
                       'r') as data:
            vort_pos = iter(data['positions'][...])
        vort_pos = include.phase_imprinting.get_positions(
            N_vort, 2 * xi, len_x, len_y)  # Generator of vortex positions

    theta = include.phase_imprinting.get_phase(N_vort, vort_pos, X,
                                               Y)  # Phase imprinting

    # Construct wavefunction and related:
    psi = cp.sqrt(n_0) * cp.exp(1j * theta)
    atom_num = dx * dy * cp.sum(cp.abs(psi)**2)
    theta_fix = cp.angle(psi)
    psi_k = cp.fft.fft2(psi)

    # ------------------------------------------------------------------------------------------------------------------
    # Imaginary time evolution
    # ------------------------------------------------------------------------------------------------------------------
    for i in range(2000):
        # Kinetic energy:
        psi_k *= cp.exp(-0.25 * dt * (Kx**2 + Ky**2))

        # Backward FFT:
        psi = cp.fft.ifft2(psi_k)

        # Interaction term:
def pca_noise_estimate(data, gtab, patch_radius=1, correct_bias=True,
                       smooth=2, allow_single=False):
    """ PCA based local noise estimation.

    Parameters
    ----------
    data: 4D array
        the input dMRI data.

    gtab: gradient table object
      gradient information for the data gives us the bvals and bvecs of
      diffusion data, which is needed here to select between the noise
      estimation methods.
    patch_radius : int
        The radius of the local patch to be taken around each voxel (in
        voxels). Default: 1 (estimate noise in blocks of 3x3x3 voxels).
    correct_bias : bool
      Whether to correct for bias due to Rician noise. This is an implementation
      of equation 8 in [1]_.

    smooth : int
      Radius of a Gaussian smoothing filter to apply to the noise estimate
      before returning. Default: 2.
    allow_single : bool
      If True, the working dtype is the promotion of the input dtype with
      float32; otherwise the computation is carried out in float64.

    Returns
    -------
    sigma_corr: 3D array
        The local noise standard deviation estimate.

    References
    ----------
    .. [1] Manjon JV, Coupe P, Concha L, Buades A, Collins DL "Diffusion
           Weighted Image Denoising Using Overcomplete Local PCA". PLoS ONE
           8(9): e73021. doi:10.1371/journal.pone.0073021.
    """
    # first identify the number of the b0 images
    K = np.count_nonzero(gtab.b0s_mask)

    if K > 1:
        # If multiple b0 values then use MUBE noise estimate
        data0 = data[..., cp.asarray(gtab.b0s_mask)]
        # sibe = False

    else:
        # if only one b0 value then SIBE noise estimate
        data0 = data[..., cp.asarray(~gtab.b0s_mask)]
        # sibe = True

    n0, n1, n2, n3 = data0.shape
    nsamples = n0 * n1 * n2

    if allow_single:
        data_dtype = cp.promote_types(data0.dtype, cp.float32)
    else:
        data_dtype = cp.float64
    data0 = data0.astype(data_dtype, copy=False)
    X = data0.reshape(nsamples, n3)
    # Demean:
    X = X - X.mean(axis=0, keepdims=True)
    # compute the covariance matrix, x
    r = cp.dot(X.T, X)
    # (symmetric) eigen decomposition
    w, v = cp.linalg.eigh(r)
    # project smallest eigenvector/value onto the data space
    I = X.dot(v[:, 0:1]).reshape(n0, n1, n2)
    del r, w, v

    s = 2 * patch_radius + 1
    sum_reg = ndi.uniform_filter(I, size=s)
    sigma_sq = I - sum_reg
    sigma_sq *= sigma_sq

    # find the SNR and make the correction for bias due to Rician noise:
    if correct_bias:
        mean = ndi.uniform_filter(data0.mean(-1), size=s, mode="reflect")
        snr = mean / cp.sqrt(sigma_sq)
        snr_sq = snr * snr
        # snr_sq = cp.asnumpy(snr_sq)  # transfer to host to use sps.iv
        # xi is practically equal to 1 above 37.4, and we overflow, raising
        # warnings and creating not-a-numbers.
        # Instead, we will replace these values with 1 below
        with np.errstate(over="ignore", invalid="ignore"):
            tmp1 = snr_sq / 4
            tmp = sps.i0(tmp1)
            tmp *= 2 + snr_sq
            tmp += snr_sq * sps.i1(tmp1)
            tmp *= tmp
            tmp *= (np.pi / 8) * cp.exp(-snr_sq / 2)
            xi = 2 + snr_sq - tmp
            xi = xi.astype(data_dtype, copy=False)
            # xi = (2 + snr_sq - (np.pi / 8) * cp.exp(-snr_sq / 2) *
            #       ((2 + snr_sq) * sps.i0(snr_sq / 4) +
            #       (snr_sq) * sps.i1(snr_sq / 4)) ** 2).astype(float)
        xi[snr > 37.4] = 1
        sigma_corr = sigma_sq / xi
        sigma_corr[cp.isnan(sigma_corr)] = 0
    else:
        sigma_corr = sigma_sq

    if smooth is not None:
        ndi.gaussian_filter(sigma_corr, smooth, output=sigma_corr)

    # the filters above operate on the variance; return a standard deviation
    cp.sqrt(sigma_corr, out=sigma_corr)
    return sigma_corr
def ts_stddev(x, window):
    """Windowed standard deviation of ``x``.

    Returns an all-NaN array with ``len(x)`` entries when ``window`` is
    larger than the series; otherwise the square root of
    ``ts_covariance(x, x, window)``.
    """
    n_obs = len(x)
    if n_obs < window:
        # not enough observations to fill a single window
        return cp.full(n_obs, cp.nan)
    return cp.sqrt(ts_covariance(x, x, window))
    def partial_fit(self, X, y=None, check_input=True) -> "IncrementalPCA":
        # NOTE(review): the text below reads like this method's docstring, but
        # its triple-quote delimiters and section headers appear to be missing
        # from this copy of the file - restore them before use.
        Incremental fit with X. All of X is processed as a single batch.


        X : array-like or sparse matrix, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples and
            n_features is the number of features.
        check_input : bool
            Run check_array on X.
        y : Ignored


        self : object
            Returns the instance itself.

        if check_input:
            # Sparse input is rejected here, for both SciPy and CuPy matrices.
            if scipy.sparse.issparse(X) or cupyx.scipy.sparse.issparse(X):
                raise TypeError(
                    "IncrementalPCA.partial_fit does not support "
                    "sparse input. Either convert data to dense "
                    "or use IncrementalPCA.fit to do so in batches.")


            X, n_samples, n_features, self.dtype = \
                input_to_cupy_array(X, order='K',
                                    check_dtype=[cp.float32, cp.float64])
            # NOTE(review): the next line re-derives the shape that was just
            # returned above; it looks like the body of a missing ``else:``
            # (the check_input=False path) - confirm.
            n_samples, n_features = X.shape

        if not hasattr(self, 'components_'):
            self.components_ = None

        if self.n_components is None:
            if self.components_ is None:
                self.n_components_ = min(n_samples, n_features)
                # NOTE(review): as written this line runs while components_
                # is None; presumably an ``else:`` is missing before it.
                self.n_components_ = self.components_.shape[0]
        elif not 1 <= self.n_components <= n_features:
            raise ValueError("n_components=%r invalid for n_features=%d, need "
                             "more rows than columns for IncrementalPCA "
                             "processing" % (self.n_components, n_features))
        elif not self.n_components <= n_samples:
            raise ValueError("n_components=%r must be less or equal to "
                             "the batch number of samples "
                             "%d." % (self.n_components, n_samples))
            # NOTE(review): unreachable after the raise above; presumably the
            # body of a missing ``else:`` branch.
            self.n_components_ = self.n_components

        # NOTE(review): the condition below is cut off after ``!=`` (right-hand
        # operand and closing parenthesis are missing).
        if (self.components_ is not None) and (self.components_.shape[0] !=
            raise ValueError("Number of input features has changed from %i "
                             "to %i between calls to partial_fit! Try "
                             "setting n_components to a fixed value." %
                             (self.components_.shape[0], self.n_components_))
        # This is the first partial_fit
        if not hasattr(self, 'n_samples_seen_'):
            self.n_samples_seen_ = 0
            self.mean_ = .0
            self.var_ = .0

        # Update stats - they are 0 if this is the first step
        # NOTE(review): the name of the function being called, and the end of
        # its argument list, are missing from the statement below.
        col_mean, col_var, n_total_samples = \
                X, last_mean=self.mean_, last_variance=self.var_,
        n_total_samples = n_total_samples[0]

        # Whitening
        if self.n_samples_seen_ == 0:
            # If it is the first step, simply whiten X
            X = X - col_mean
            # NOTE(review): the lines below use singular_values_ and
            # components_, which do not exist on the first call; presumably
            # they belong to a missing ``else:`` branch.
            col_batch_mean = cp.mean(X, axis=0)
            X = X - col_batch_mean
            # Build matrix of combined previous basis and new data
            mean_correction = \
                cp.sqrt((self.n_samples_seen_ * n_samples) /
                        n_total_samples) * (self.mean_ - col_batch_mean)
            X = cp.vstack((self.singular_values_.reshape((-1, 1)) *
                           self.components_, X, mean_correction))

        U, S, V = cp.linalg.svd(X, full_matrices=False)
        U, V = _svd_flip(U, V, u_based_decision=False)
        explained_variance = S ** 2 / (n_total_samples - 1)
        explained_variance_ratio = S ** 2 / cp.sum(col_var * n_total_samples)

        # Store the updated state, truncated to the first n_components_ entries.
        self.n_rows = n_total_samples
        self.n_samples_seen_ = n_total_samples
        self.components_ = V[:self.n_components_]
        self.singular_values_ = S[:self.n_components_]
        self.mean_ = col_mean
        self.var_ = col_var
        self.explained_variance_ = explained_variance[:self.n_components_]
        # NOTE(review): the right-hand sides of the two continued assignments
        # below, and an ``else:`` before the final one, appear to be missing.
        self.explained_variance_ratio_ = \
        if self.n_components_ < n_features:
            self.noise_variance_ = \
            self.noise_variance_ = 0.

        return self
def protected_sqrt(x1):
    """Square root made safe for negative inputs: returns sqrt(|x1|)."""
    magnitude = cp.abs(x1)
    return cp.sqrt(magnitude)
def sqrt(x):
    """Element-wise square root of ``x``, delegated to ``cp.sqrt``."""
    root = cp.sqrt(x)
    return root
    # NOTE(review): this method is incomplete in this copy of the file: the
    # parameter list after ``def grad(`` is missing, the docstring is never
    # closed, and the dictionary returned at the end has no closing brace.
    # The body reads psi, scan, prb, data, model, piter and recover_prb, which
    # presumably were parameters - confirm against the original source.
    def grad(
        """Conjugate gradients for ptychography.

        model : str gaussian or poisson
            The noise model to use for the gradient.
        piter : int
            The number of gradient steps to take.
        recover_prb : bool
            Whether to recover the probe or assume the given probe is correct.

        assert prb.ndim == 3, "prb needs 3 dimensions, not %d" % prb.ndim

        print("# congujate gradient parameters\n"
              "iteration, step size object, step size probe, function min"
              )  # csv column headers
        for i in range(piter):
            # 1) object retrieval subproblem with fixed probe
            # forward operator
            # NOTE(review): fwd_ptycho is called with four arguments here and
            # with three in the probe subproblem below - confirm which is right.
            self.fpsi = self.fwd_ptycho(self.fpsi, psi, scan, prb)
            # NOTE(review): gradpsi is read here before any visible assignment
            # on the first iteration.
            gradpsi0 = gradpsi
           # take gradient
            if model == 'gaussian':
                gradpsi = self.adj_ptycho(
                    self.fpsi - cp.sqrt(data) * cp.exp(1j * cp.angle(self.fpsi)),
                ) / (cp.max(cp.abs(prb))**2)
            elif model == 'poisson':
                gradpsi = self.adj_ptycho(
                    self.fpsi - data * self.fpsi / (cp.abs(self.fpsi)**2 + 1e-32),
                ) / (cp.max(cp.abs(prb))**2)
            # fixed step size for the object; no line search is visible here
            gammapsi = 0.25

            if (recover_prb):
                # 2) probe retrieval subproblem with fixed object
                # forward operator
                fprb = self.fwd_ptycho(psi, scan, prb)
                # take gradient
                if model == 'gaussian':
                    gradprb = self.adj_ptycho_prb(
                        fprb - cp.sqrt(data) * cp.exp(1j * cp.angle(fprb)),
                    ) / cp.max(cp.abs(psi))**2 / self.nscan
                elif model == 'poisson':
                    gradprb = self.adj_ptycho_prb(
                        fprb - data * fprb / (cp.abs(fprb)**2 + 1e-32),
                    ) / cp.max(cp.abs(psi))**2 / self.nscan
                # Dai-Yuan direction
                if (i == 0):
                    dprb = -gradprb
                    # NOTE(review): the update below reads dprb and gradprb0
                    # from a previous iteration, so it presumably belongs to a
                    # missing ``else:`` branch.
                    dprb = -gradprb + (
                        cp.linalg.norm(gradprb)**2 /
                        (cp.sum(cp.conj(dprb) * (gradprb - gradprb0))) * dprb)
                gradprb0 = gradprb
                # line search
                # NOTE(review): minf is not defined in the visible code.
                fdprb = self.fwd_ptycho(psi, scan, dprb)
                gammaprb = self.line_search(minf, fprb, fdprb)
                # update prb
                prb = prb + gammaprb * dprb

        #    # check convergence
        #    if (np.mod(i, 8) == 0):
        #        fpsi = self.fwd_ptycho(psi, scan, prb)
        #        print("%4d, %.3e, %.3e, %.7e" %
        #              (i, gammapsi, gammaprb, minf(fpsi)))

        # NOTE(review): dpsi and testpsi are returned but never assigned in the
        # visible code.
        return {
            'psi': psi,
            'prb': prb,
            'dpsi': dpsi,
            'gradpsi': gradpsi,
            'testpsi': testpsi,
            'gammapsi': gammapsi,
# If GPU compute, import CuPy instead of Numpy. The two imports must be
# mutually exclusive, otherwise NumPy always shadows CuPy under the name xp.
if args.gpu:
  import cupy as xp
else:
  import numpy as xp

if resume:
  # NOTE: pickle can execute arbitrary code while loading; only resume from
  # checkpoint files you wrote yourself.
  with open('save.p', 'rb') as checkpoint_file:
    pong      = pickle.load(checkpoint_file)
  total_hours = pong['total_time']
  episodes    = pong['total_ep']
  model       = pong['model']
  print('****RESUMING TRAINING*****')
else:
  # Fresh start: new weights and zeroed counters.
  model       = {}
  model['W1'] = xp.random.randn(H,D) / xp.sqrt(D) # "Xavier" initialization
  model['W2'] = xp.random.randn(H) / xp.sqrt(H)
  total_hours = 0
  episodes    = 0

t1 = datetime.now()
# There are many episodes before updating our parameters, these accumulate
# in the gradient buffer
grad_buffer = {k : xp.zeros_like(v) for k,v in model.items() }

# Memory for decaying gradient memory
dec_grad_mem = { k : xp.zeros_like(v) for k,v in model.items() }
# Memory for squared decaying gradient memory
sqdec_grad_mem = { k : xp.zeros_like(v) for k,v in model.items() }

def sigmoid(x):
    def feedback_alignment(self, x, target, epoch, flag):
        """Run one forward pass and update all weights and biases in place.

        The network is four tanh layers followed by a softmax output.  The
        output error is ``(output - target) / batch_size``; the error of each
        hidden layer is obtained by sending the error of the layer above
        through ``self.B2`` .. ``self.B5`` instead of the forward weights.

        The output layer (5) takes a step scaled by the square root of its
        running squared-gradient cache.  Layers 4..1 take plain gradient
        steps; their caches are still refreshed but not used for scaling.

        ``epoch`` and ``flag`` are accepted but not used.  ``batch_size``,
        ``softmax`` and ``tanh_grad`` are taken from the enclosing module.
        Nothing is returned.
        """
        step_size = 0.1
        cache_decay = 0.99
        tiny = 0.0000000001

        def _refresh_cache(attr, grad):
            # running average of the squared gradient, stored back on self
            refreshed = cache_decay * getattr(self, attr) + (
                1 - cache_decay) * grad * grad
            setattr(self, attr, refreshed)
            return refreshed

        def _subtract(attr, step):
            # same effect as ``self.<attr> -= step``
            current = getattr(self, attr)
            current -= step
            setattr(self, attr, current)

        # ---- forward pass -------------------------------------------------
        z1 = cp.dot(x, self.W_f1) + self.b1
        a1 = cp.tanh(z1)
        z2 = cp.dot(a1, self.W_f2) + self.b2
        a2 = cp.tanh(z2)
        z3 = cp.dot(a2, self.W_f3) + self.b3
        a3 = cp.tanh(z3)
        z4 = cp.dot(a3, self.W_f4) + self.b4
        a4 = cp.tanh(z4)
        z5 = cp.dot(a4, self.W_f5) + self.b5
        output = softmax(z5)

        # ---- output layer: cache-scaled step ------------------------------
        delta = (output - target) / batch_size
        grad_w = cp.dot(a4.T, delta)
        cache_w = _refresh_cache('cache_W5', grad_w)
        _subtract('W_f5', step_size * grad_w / (cp.sqrt(cache_w) + tiny))
        grad_b = cp.dot(cp.ones(batch_size), delta)
        cache_b = _refresh_cache('cache_b5', grad_b)
        _subtract('b5', step_size * grad_b / (cp.sqrt(cache_b) + tiny))

        # ---- hidden layers 4..1: plain gradient steps ---------------------
        # (layer index, pre-activation of that layer, input fed to that layer)
        hidden_layers = (
            (4, z4, a3),
            (3, z3, a2),
            (2, z2, a1),
            (1, z1, x),
        )
        for layer, pre_activation, layer_input in hidden_layers:
            feedback = getattr(self, 'B%d' % (layer + 1))
            delta = tanh_grad(pre_activation) * cp.dot(delta, feedback)

            grad_w = cp.dot(layer_input.T, delta)
            _refresh_cache('cache_W%d' % layer, grad_w)
            _subtract('W_f%d' % layer, step_size * grad_w)

            grad_b = cp.dot(cp.ones(batch_size), delta)
            _refresh_cache('cache_b%d' % layer, grad_b)
            _subtract('b%d' % layer, step_size * grad_b)
 def backward(self, dout=1.0):
     """Scale the upstream gradient ``dout`` and flip its sign where masked.

     ``dout`` is divided by ``y.size * sqrt(|(y - t) / t|) * t`` and the
     entries selected by ``self.mask`` are negated.  When ``dout`` is an
     array it is modified in place and returned.
     """
     relative_error = (self.y - self.t) / self.t
     denominator = float(self.y.size) * np.sqrt(np.abs(relative_error)) * self.t
     dout /= denominator
     dout[self.mask] *= -1.0
     return dout
def svd_wrapper(matrix, mode, ncomp, verbose, full_output=False,
                random_state=None, to_numpy=True):
    """ Wrapper for different SVD libraries (CPU and GPU).

    Parameters
    ----------
    matrix : numpy ndarray, 2d
        2d input matrix.
    mode : {'lapack', 'arpack', 'eigen', 'randsvd', 'cupy', 'eigencupy',
        'randcupy', 'pytorch', 'eigenpytorch', 'randpytorch'}, str optional
        Switch for the SVD method/library to be used.

        ``lapack``: uses the LAPACK linear algebra library through Numpy
        and it is the most conventional way of computing the SVD
        (deterministic result computed on CPU).

        ``arpack``: uses the ARPACK Fortran libraries accessible through
        Scipy (computation on CPU).

        ``eigen``: computes the singular vectors through the
        eigendecomposition of the covariance M.M' (computation on CPU).

        ``randsvd``: uses the randomized_svd algorithm implemented in
        Sklearn (computation on CPU).

        ``cupy``: uses the Cupy library for GPU computation of the SVD as in
        the LAPACK version.

        ``eigencupy``: offers the same method as with the ``eigen`` option
        but on GPU (through Cupy).

        ``randcupy``: is an adaptation of the randomized_svd algorithm,
        where all the computations are done on a GPU (through Cupy).

        ``pytorch``: uses the Pytorch library for GPU computation of the SVD.

        ``eigenpytorch``: offers the same method as with the ``eigen``
        option but on GPU (through Pytorch).

        ``randpytorch``: is an adaptation of the randomized_svd algorithm,
        where all the linear algebra computations are done on a GPU
        (through Pytorch).

    ncomp : int
        Number of singular vectors to be obtained. In the cases when the full
        SVD is computed (LAPACK, ARPACK, EIGEN, CUPY), the matrix of singular
        vectors is truncated.
    verbose: bool
        If True intermediate information is printed out.
    full_output : bool optional
        If True the 3 terms of the SVD factorization are returned. If ``mode``
        is eigen then only S and V are returned.
    random_state : int, RandomState instance or None, optional
        If int, random_state is the seed used by the random number generator.
        If RandomState instance, random_state is the random number generator.
        If None, the random number generator is the RandomState instance used
        by np.random. Used for ``randsvd`` mode.
    to_numpy : bool, optional
        If True (by default) the arrays computed in GPU are transferred from
        VRAM and converted to numpy ndarrays.

    Returns
    -------
    V : numpy ndarray
        The right singular vectors of the input matrix. If ``full_output`` is
        True it returns the left and right singular vectors and the singular
        values of the input matrix. If ``mode`` is set to eigen then only S and
        V are returned.

    Raises
    ------
    TypeError
        If ``matrix`` is not 2d.
    RuntimeError
        If ``ncomp`` exceeds the smallest dimension of ``matrix``, or if the
        GPU library required by ``mode`` is not installed.
    ValueError
        If ``mode`` is not one of the recognized options.

    References
    ----------
    * For ``randsvd`` SVD mode see:
        Finding structure with randomness: Stochastic algorithms for constructing
        approximate matrix decompositions
        Halko, et al., 2009 http://arxiv.org/abs/arXiv:0909.4061
    * For the remaining modes see the documentation of the library routine
      called in the corresponding branch below.

    """
    if matrix.ndim != 2:
        raise TypeError('Input matrix is not a 2d array')

    if ncomp > min(matrix.shape[0], matrix.shape[1]):
        msg = '{} PCs cannot be obtained from a matrix with size [{},{}].'
        msg += ' Increase the size of the patches or request less PCs'
        raise RuntimeError(msg.format(ncomp, matrix.shape[0], matrix.shape[1]))

    if mode == 'eigen':
        # building C as np.dot(matrix.T,matrix) is slower and takes more memory
        C = np.dot(matrix, matrix.T)  # covariance matrix
        e, EV = linalg.eigh(C)  # EVals and EVs
        pc = np.dot(EV.T, matrix)  # PCs using a compact trick when cov is MM'
        V = pc[::-1]  # reverse since we need the last EVs
        S = np.sqrt(np.abs(e))  # SVals = sqrt(EVals)
        S = S[::-1]  # reverse since EVals go in increasing order
        # scaling EVs by the square root of EVals: one broadcast in-place
        # division of every row by its singular value
        V /= S[:, np.newaxis]
        V = V[:ncomp]
        if verbose:
            print('Done PCA with numpy linalg eigh functions')

    elif mode == 'lapack':
        # n_frames is usually smaller than n_pixels. In this setting taking
        # the SVD of M' and keeping the left (transposed) SVs is faster than
        # taking the SVD of M (right SVs)
        U, S, V = linalg.svd(matrix.T, full_matrices=False)
        V = V[:ncomp]  # we cut projection matrix according to the # of PCs
        U = U[:, :ncomp]
        S = S[:ncomp]
        if verbose:
            print('Done SVD/PCA with numpy SVD (LAPACK)')

    elif mode == 'arpack':
        U, S, V = svds(matrix, k=ncomp)
        if verbose:
            print('Done SVD/PCA with scipy sparse SVD (ARPACK)')

    elif mode == 'randsvd':
        U, S, V = randomized_svd(matrix, n_components=ncomp, n_iter=2,
                                 transpose='auto', random_state=random_state)
        if verbose:
            print('Done SVD/PCA with randomized SVD')

    elif mode == 'cupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        a_gpu = cupy.array(matrix)
        a_gpu = cupy.asarray(a_gpu)  # move the data to the current device
        u_gpu, s_gpu, vh_gpu = cupy.linalg.svd(a_gpu, full_matrices=True,
                                               compute_uv=True)
        V = vh_gpu[:ncomp]
        if to_numpy:
            V = cupy.asnumpy(V)
        if full_output:
            S = s_gpu[:ncomp]
            if to_numpy:
                S = cupy.asnumpy(S)
            U = u_gpu[:, :ncomp]
            if to_numpy:
                U = cupy.asnumpy(U)
        if verbose:
            print('Done SVD/PCA with cupy (GPU)')

    elif mode == 'randcupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='cupy')
        if to_numpy:
            V = cupy.asnumpy(V)
            S = cupy.asnumpy(S)
            U = cupy.asnumpy(U)
        if verbose:
            print('Done randomized SVD/PCA with cupy (GPU)')

    elif mode == 'eigencupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        a_gpu = cupy.array(matrix)
        a_gpu = cupy.asarray(a_gpu)  # move the data to the current device
        C = cupy.dot(a_gpu, a_gpu.T)  # covariance matrix
        e, EV = cupy.linalg.eigh(C)  # eigenvalues and eigenvectors
        pc = cupy.dot(EV.T, a_gpu)  # using a compact trick when cov is MM'
        V = pc[::-1]  # reverse to get last eigenvectors
        S = cupy.sqrt(e)[::-1]  # reverse since EVals go in increasing order
        # scaling by the square root of eigvals (single broadcast division
        # instead of one kernel launch per column)
        V /= S[:, None]
        V = V[:ncomp]
        if to_numpy:
            V = cupy.asnumpy(V)
        if verbose:
            print('Done PCA with cupy eigh function (GPU)')

    elif mode == 'pytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32').T))
        u_gpu, s_gpu, vh_gpu = torch.svd(a_gpu)
        V = vh_gpu[:ncomp]
        S = s_gpu[:ncomp]
        U = torch.transpose(u_gpu, 0, 1)[:ncomp]
        if to_numpy:
            # NOTE(review): np.array() on a CUDA tensor may need an explicit
            # .cpu() first -- confirm with the torch version in use.
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if verbose:
            print('Done SVD/PCA with pytorch (GPU)')

    elif mode == 'eigenpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32')))
        C = torch.mm(a_gpu, torch.transpose(a_gpu, 0, 1))
        # NOTE(review): torch.eig is not available in recent torch releases
        # -- confirm the supported torch version.
        e, EV = torch.eig(C, eigenvectors=True)
        V = torch.mm(torch.transpose(EV, 0, 1), a_gpu)
        S = torch.sqrt(e[:, 0])
        V /= S[:, None]  # scale each row by its singular value
        V = V[:ncomp]
        if to_numpy:
            V = np.array(V)
        if verbose:
            print('Done PCA with pytorch eig function')

    elif mode == 'randpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='pytorch')
        if to_numpy:
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if verbose:
            print('Done randomized SVD/PCA with randomized pytorch (GPU)')

    else:
        raise ValueError('The SVD `mode` is not recognized')

    if full_output:
        if mode == 'lapack':
            # the SVD was taken on matrix.T, so the factors are swapped and
            # transposed back here
            return V.T, S, U.T
        elif mode == 'pytorch':
            if to_numpy:
                return V.T, S, U.T
            else:
                return torch.transpose(V, 0, 1), S, torch.transpose(U, 0, 1)
        elif mode in ('eigen', 'eigencupy', 'eigenpytorch'):
            return S, V
        else:
            return U, S, V
    else:
        if mode == 'lapack':
            return U.T
        elif mode == 'pytorch':
            return U
        else:
            return V
 def __init_weights(self, f, c, dim_in):
     """Draw a random ``(f, f, dim_in[-1], c)`` weight tensor.

     Samples come from a standard normal distribution and are multiplied by
     ``sqrt(2 / (dim_in[1] * dim_in[2]))``.

     Args:
         f: Size of the first two axes of the returned tensor.
         c: Size of the last axis of the returned tensor.
         dim_in: Indexable shape; entries 1, 2 and -1 are read.
             (presumably the input shape of the layer -- confirm)

     Returns:
         numpy.ndarray of shape ``(f, f, dim_in[-1], c)``.
     """
     shape = (f, f, dim_in[-1], c)
     samples = np.random.randn(*shape)
     scale = np.sqrt(2/(dim_in[1]*dim_in[2]))
     return samples * scale
def rmse(X, Y):
    """Return the root of the mean squared difference between X and Y."""
    residual = X - Y
    mean_squared = np.mean(residual**2)
    return np.sqrt(mean_squared)
def norm(x, ord=None, axis=None):
    """Norm of a cupy.scipy.spmatrix

    This function is able to return one of seven different sparse matrix norms,
    depending on the value of the ``ord`` parameter.

    Args:
        x (sparse matrix) : Input sparse matrix.
        ord (non-zero int, inf, -inf, 'fro', optional) : Order of the norm.
            inf means numpy's `inf` object.
        axis : (int, 2-tuple of ints, None, optional): If `axis` is an
            integer, it specifies the axis of `x` along which to
            compute the vector norms.  If `axis` is a 2-tuple, it specifies the
            axes that hold 2-D matrices, and the matrix norms of these matrices
            are computed.  If `axis` is None then either a vector norm
            (when `x` is 1-D) or a matrix norm (when `x` is 2-D) is returned.

    Returns:
        ndarray : 0-D or 1-D array or norm(s).

    Raises:
        TypeError: If ``x`` is not sparse or ``axis`` is not None, an integer
            or a tuple of integers.
        ValueError: If an axis is out of range or duplicated, if ``ord`` is
            not valid for the requested norm, or if ``axis`` has more than
            two entries.
        NotImplementedError: For the matrix norms ``ord == 2`` and
            ``ord == -2``.

    .. seealso:: :func:`scipy.sparse.linalg.norm`
    """
    if not cupyx.scipy.sparse.issparse(x):
        raise TypeError(("input is not sparse. use cupy.linalg.norm"))

    # Check the default case first and handle it immediately.
    if axis is None and ord in (None, 'fro', 'f'):
        return _sparse_frobenius_norm(x)

    # Some norms require functions that are not implemented for all types.
    x = x.tocsr()

    if axis is None:
        axis = (0, 1)
    elif not isinstance(axis, tuple):
        msg = "'axis' must be None, an integer or a tuple of integers"
        try:
            int_axis = int(axis)
        except TypeError:
            raise TypeError(msg)
        # reject values such as 1.5 that int() silently truncates
        if axis != int_axis:
            raise TypeError(msg)
        axis = (int_axis, )

    nd = 2
    if len(axis) == 2:
        row_axis, col_axis = axis
        if not (-nd <= row_axis < nd and -nd <= col_axis < nd):
            raise ValueError('Invalid axis %r for an array with shape %r' %
                             (axis, x.shape))
        if row_axis % nd == col_axis % nd:
            raise ValueError('Duplicate axes given.')
        if ord == 2:
            raise NotImplementedError
            # return _multi_svd_norm(x, row_axis, col_axis, amax)
        elif ord == -2:
            raise NotImplementedError
            # return _multi_svd_norm(x, row_axis, col_axis, amin)
        elif ord == 1:
            return abs(x).sum(axis=row_axis).max()
        elif ord == numpy.inf:
            return abs(x).sum(axis=col_axis).max()
        elif ord == -1:
            return abs(x).sum(axis=row_axis).min()
        elif ord == -numpy.inf:
            return abs(x).sum(axis=col_axis).min()
        elif ord in (None, 'f', 'fro'):
            # The axis order does not matter for this norm.
            return _sparse_frobenius_norm(x)
        else:
            raise ValueError("Invalid norm order for matrices.")
    elif len(axis) == 1:
        a, = axis
        if not (-nd <= a < nd):
            raise ValueError('Invalid axis %r for an array with shape %r' %
                             (axis, x.shape))
        if ord == numpy.inf:
            return abs(x).max(axis=a).A.ravel()
        elif ord == -numpy.inf:
            return abs(x).min(axis=a).A.ravel()
        elif ord == 0:
            # Zero norm
            return (x != 0).astype(numpy.float32).sum(axis=a).ravel().astype(
                numpy.int_)
        elif ord == 1:
            # special case for speedup
            return abs(x).sum(axis=a).ravel()
        elif ord in (2, None):
            return cupy.sqrt(abs(x).power(2).sum(axis=a)).ravel()
        else:
            # any remaining order must at least behave like a number
            try:
                ord + 1
            except TypeError:
                raise ValueError('Invalid norm order for vectors.')
            return cupy.power(abs(x).power(ord).sum(axis=a), 1 / ord).ravel()
    else:
        raise ValueError("Improper number of dimensions to norm.")
 def R_vertical(self, theta, epsilon_r):
     """Evaluate ``(root - eps*cos(theta)) / (root + eps*cos(theta))``.

     Here ``eps = self.epsilon(epsilon_r)`` and
     ``root = sqrt(eps - sin(theta)**2)``.
     (presumably a reflection coefficient for vertical polarisation, going
     by the method name -- confirm the sign convention against callers)

     Args:
         theta: Angle in radians, scalar or array.
         epsilon_r: Value forwarded to ``self.epsilon``.

     Returns:
         The ratio described above, with the shape of the broadcast inputs.
     """
     # Evaluate the shared sub-expressions once; each of them appears in
     # both the numerator and the denominator.
     eps = self.epsilon(epsilon_r)
     root = cp.sqrt(eps - cp.sin(theta) ** 2)
     eps_cos = eps * cp.cos(theta)
     return (root - eps_cos) / (root + eps_cos)
# File: norms.py  Project: mattya/chainer
def norm(x, ord=None, axis=None, keepdims=False):
    """Returns one of matrix norms specified by ``ord`` parameter.

    Complex valued matrices and vectors are not supported.
    See numpy.linalg.norm for more detail.

    Args:
        x (cupy.ndarray): Array to take norm. If ``axis`` is None,
            ``x`` must be 1-D or 2-D.
        ord (non-zero int, inf, -inf, 'fro'): Norm type.
        axis (int, 2-tuple of ints, None): 1-D or 2-D norm is computed over
            ``axis``.
        keepdims (bool): If this is set ``True``, the axes which are normed
            over are left.

    Returns:
        cupy.ndarray: The requested norm(s).

    Raises:
        TypeError: If ``axis`` is not None, an integer or a tuple of
            integers.
        ValueError: If an axis is out of range or duplicated, if ``ord`` is
            not valid for the requested norm, or if ``axis`` does not have
            one or two entries.

    """
    if not issubclass(x.dtype.type, (numpy.inexact, numpy.object_)):
        x = x.astype(float)

    # Immediately handle some default, simple, fast, and common cases.
    if axis is None:
        ndim = x.ndim
        if ((ord is None) or (ord in ('f', 'fro') and ndim == 2) or
                (ord == 2 and ndim == 1)):

            x = x.ravel()
            sqnorm = cupy.sum(x ** 2)
            ret = cupy.sqrt(sqnorm)
            if keepdims:
                ret = ret.reshape(ndim * [1])
            return ret

    # Normalize the `axis` argument to a tuple.
    nd = x.ndim
    if axis is None:
        axis = tuple(range(nd))
    elif not isinstance(axis, tuple):
        try:
            axis = int(axis)
        except Exception:
            raise TypeError(
                "'axis' must be None, an integer or a tuple of integers")
        axis = (axis,)

    if len(axis) == 1:
        if ord == numpy.inf:
            return abs(x).max(axis=axis, keepdims=keepdims)
        elif ord == -numpy.inf:
            return abs(x).min(axis=axis, keepdims=keepdims)
        elif ord == 0:
            # Zero norm
            return (x != 0).sum(axis=axis, keepdims=keepdims, dtype=x.dtype)
        elif ord == 1:
            # special case for speedup
            return abs(x).sum(axis=axis, keepdims=keepdims)
        elif ord is None or ord == 2:
            # special case for speedup
            s = x ** 2
            return cupy.sqrt(s.sum(axis=axis, keepdims=keepdims))
        else:
            # any remaining order must at least behave like a number
            try:
                ord + 1
            except TypeError:
                raise ValueError("Invalid norm order for vectors.")
            absx = abs(x)
            absx **= ord
            return absx.sum(axis=axis, keepdims=keepdims) ** (1.0 / ord)
    elif len(axis) == 2:
        row_axis, col_axis = axis
        if row_axis < 0:
            row_axis += nd
        if col_axis < 0:
            col_axis += nd
        if not (0 <= row_axis < nd and 0 <= col_axis < nd):
            raise ValueError('Invalid axis %r for an array with shape %r' %
                             (axis, x.shape))
        if row_axis == col_axis:
            raise ValueError('Duplicate axes given.')
        # After the first reduction the array has one axis fewer, so the
        # index of the remaining axis is shifted down when it came after
        # the reduced one.
        if ord == 1:
            if col_axis > row_axis:
                col_axis -= 1
            ret = abs(x).sum(axis=row_axis).max(axis=col_axis)
        elif ord == numpy.inf:
            if row_axis > col_axis:
                row_axis -= 1
            ret = abs(x).sum(axis=col_axis).max(axis=row_axis)
        elif ord == -1:
            if col_axis > row_axis:
                col_axis -= 1
            ret = abs(x).sum(axis=row_axis).min(axis=col_axis)
        elif ord == -numpy.inf:
            if row_axis > col_axis:
                row_axis -= 1
            ret = abs(x).sum(axis=col_axis).min(axis=row_axis)
        elif ord in [None, 'fro', 'f']:
            ret = cupy.sqrt((x ** 2).sum(axis=axis))
        else:
            raise ValueError("Invalid norm order for matrices.")
        if keepdims:
            ret_shape = list(x.shape)
            ret_shape[axis[0]] = 1
            ret_shape[axis[1]] = 1
            ret = ret.reshape(ret_shape)
        return ret
    else:
        raise ValueError("Improper number of dimensions to norm.")
 def R_horizontal(self, theta, epsilon_r):
     """Evaluate ``(cos(theta) - root) / (cos(theta) + root)``.

     Here ``root = sqrt(self.epsilon(epsilon_r) - sin(theta)**2)``.
     (presumably a reflection coefficient for horizontal polarisation,
     going by the method name -- confirm against callers)

     Args:
         theta: Angle in radians, scalar or array.
         epsilon_r: Value forwarded to ``self.epsilon``.

     Returns:
         The ratio described above, with the shape of the broadcast inputs.
     """
     # Both terms appear in the numerator and the denominator; evaluate
     # each of them once.
     root = cp.sqrt(self.epsilon(epsilon_r) - cp.sin(theta) ** 2)
     cos_theta = cp.cos(theta)
     return (cos_theta - root) / (cos_theta + root)
def svd_wrapper(matrix, mode, ncomp, debug, verbose, usv=False,
                random_state=None, to_numpy=True):
    """ Wrapper for different SVD libraries (CPU and GPU).

    Parameters
    ----------
    matrix : array_like, 2d
        2d input matrix.
    mode : {'lapack', 'arpack', 'eigen', 'randsvd', 'cupy', 'eigencupy',
            'randcupy', 'pytorch', 'eigenpytorch', 'randpytorch'}, str optional
        Switch for the SVD method/library to be used. ``lapack`` uses the LAPACK
        linear algebra library through Numpy and it is the most conventional way
        of computing the SVD (deterministic result computed on CPU). ``arpack``
        uses the ARPACK Fortran libraries accessible through Scipy (computation
        on CPU). ``eigen`` computes the singular vectors through the
        eigendecomposition of the covariance M.M' (computation on CPU).
        ``randsvd`` uses the randomized_svd algorithm implemented in Sklearn
        (computation on CPU). ``cupy`` uses the Cupy library for GPU computation
        of the SVD as in the LAPACK version. ``eigencupy`` offers the same
        method as with the ``eigen`` option but on GPU (through Cupy).
        ``randcupy`` is an adaptation of the randomized_svd algorithm, where all
        the computations are done on a GPU (through Cupy). ``pytorch`` uses the
        Pytorch library for GPU computation of the SVD. ``eigenpytorch`` offers
        the same method as with the ``eigen`` option but on GPU (through
        Pytorch). ``randpytorch`` is an adaptation of the randomized_svd
        algorithm, where all the linear algebra computations are done on a GPU
        (through Pytorch).
    ncomp : int
        Number of singular vectors to be obtained. In the cases when the full
        SVD is computed (LAPACK, ARPACK, EIGEN, CUPY), the matrix of singular
        vectors is truncated.
    debug : bool
        If True the explained variance ratio is computed and displayed.
    verbose: bool
        If True intermediate information is printed out.
    usv : bool optional
        If True the 3 terms of the SVD factorization are returned.
    random_state : int, RandomState instance or None, optional
        If int, random_state is the seed used by the random number generator.
        If RandomState instance, random_state is the random number generator.
        If None, the random number generator is the RandomState instance used
        by np.random. Used for ``randsvd`` mode.
    to_numpy : bool, optional
        If True (by default) the arrays computed in GPU are transferred from
        VRAM and converted to numpy ndarrays.

    Returns
    -------
    V : array_like
        The right singular vectors of the input matrix. If ``usv`` is True it
        returns the left and right singular vectors and the singular values of
        the input matrix.

    Raises
    ------
    TypeError
        If ``matrix`` is not 2d.
    ValueError
        If ``usv`` is requested for a mode that does not support it, or if
        ``mode`` is not one of the recognized options.
    RuntimeError
        If ``ncomp`` exceeds the smallest dimension of ``matrix``, or if the
        GPU library required by ``mode`` is not installed.

    References
    ----------
    * For ``randsvd`` SVD mode see:
        Finding structure with randomness: Stochastic algorithms for constructing
        approximate matrix decompositions
        Halko, et al., 2009 http://arxiv.org/abs/arXiv:0909.4061
    * For the remaining modes see the documentation of the library routine
      called in the corresponding branch below.

    """

    def reconstruction(ncomp, U, S, V, var=1):
        """Print reconstruction errors and plot the explained variance ratio.

        ``var`` different from 1 marks singular values given in increasing
        order (the ``arpack`` caller passes -1), in which case the ratio is
        reversed before the cumulative sum is taken.
        """
        # NOTE(review): the GPU eigen modes call this with U=V=None and fall
        # into the generic branch below, which needs U and V -- confirm
        # whether debug is meant to be supported for them.
        if mode == 'lapack':
            rec_matrix = np.dot(U[:, :ncomp],
                                np.dot(np.diag(S[:ncomp]), V[:ncomp]))
            rec_matrix = rec_matrix.T
            print('  Matrix reconstruction with {} PCs:'.format(ncomp))
            print('  Mean Absolute Error =', MAE(matrix, rec_matrix))
            print('  Mean Squared Error =', MSE(matrix, rec_matrix))

            # see https://github.com/scikit-learn/scikit-learn/blob/c3980bcbabd9d2527548820581725df2904e4a0d/sklearn/decomposition/pca.py
            exp_var = (S ** 2) / (S.shape[0] - 1)
            full_var = np.sum(exp_var)
            # % of variance explained by each PC
            explained_variance_ratio = exp_var / full_var
            ratio_cumsum = np.cumsum(explained_variance_ratio)
        elif mode == 'eigen':
            exp_var = (S ** 2) / (S.shape[0] - 1)
            full_var = np.sum(exp_var)
            # % of variance explained by each PC
            explained_variance_ratio = exp_var / full_var
            ratio_cumsum = np.cumsum(explained_variance_ratio)
        else:
            rec_matrix = np.dot(U, np.dot(np.diag(S), V))
            print('  Matrix reconstruction MAE =', MAE(matrix, rec_matrix))
            exp_var = (S ** 2) / (S.shape[0] - 1)
            full_var = np.var(matrix, axis=0).sum()
            # % of variance explained by each PC
            explained_variance_ratio = exp_var / full_var
            if var != 1:
                # singular values came in increasing order: put the largest
                # component first
                explained_variance_ratio = explained_variance_ratio[::-1]
            ratio_cumsum = np.cumsum(explained_variance_ratio)
            msg = '  This info makes sense when the matrix is mean centered '
            msg += '(temp-mean scaling)'
            print(msg)

        lw = 2
        alpha = 0.4
        plt.figure(figsize=vip_figsize)
        ax1 = plt.subplot2grid((1, 3), (0, 0), colspan=2)
        ax1.step(range(explained_variance_ratio.shape[0]),
                 explained_variance_ratio, alpha=alpha, where='mid',
                 label='Individual EVR', lw=lw)
        ax1.plot(ratio_cumsum, '.-', alpha=alpha,
                 label='Cumulative EVR', lw=lw)
        ax1.legend(loc='best', frameon=False, fontsize='medium')
        ax1.set_ylabel('Explained variance ratio (EVR)')
        ax1.set_xlabel('Principal components')
        ax1.grid(linestyle='solid', alpha=0.2)
        ax1.set_xlim(-10, explained_variance_ratio.shape[0] + 10)
        ax1.set_ylim(0, 1)

        # zoom on the first components
        trunc = 20
        ax2 = plt.subplot2grid((1, 3), (0, 2), colspan=1)
        # plt.setp(ax2.get_yticklabels(), visible=False)
        ax2.step(range(trunc), explained_variance_ratio[:trunc], alpha=alpha,
                 where='mid', lw=lw)
        ax2.plot(ratio_cumsum[:trunc], '.-', alpha=alpha, lw=lw)
        ax2.set_xlabel('Principal components')
        ax2.grid(linestyle='solid', alpha=0.2)
        ax2.set_xlim(-2, trunc + 2)
        ax2.set_ylim(0, 1)

        msg = '  Cumulative explained variance ratio for {} PCs = {:.5f}'
        # plt.savefig('figure.pdf', dpi=300, bbox_inches='tight')
        print(msg.format(ncomp, ratio_cumsum[ncomp - 1]))

    # --------------------------------------------------------------------------

    if matrix.ndim != 2:
        raise TypeError('Input matrix is not a 2d array')

    if usv:
        if mode not in ('lapack', 'arpack', 'randsvd', 'cupy', 'randcupy',
                        'pytorch', 'randpytorch'):
            msg = "Returning USV is supported with modes lapack, arpack, "
            msg += "randsvd, cupy, randcupy, pytorch or randpytorch"
            raise ValueError(msg)

    if ncomp > min(matrix.shape[0], matrix.shape[1]):
        msg = '{} PCs cannot be obtained from a matrix with size [{},{}].'
        msg += ' Increase the size of the patches or request less PCs'
        raise RuntimeError(msg.format(ncomp, matrix.shape[0], matrix.shape[1]))

    if mode == 'eigen':
        # building C as np.dot(matrix.T,matrix) is slower and takes more memory
        C = np.dot(matrix, matrix.T)        # covariance matrix
        e, EV = linalg.eigh(C)              # EVals and EVs
        pc = np.dot(EV.T, matrix)           # PCs using a compact trick when cov is MM'
        V = pc[::-1]                        # reverse since we need the last EVs
        S = np.sqrt(np.abs(e))              # SVals = sqrt(EVals)
        S = S[::-1]                         # reverse since EVals go in increasing order
        if debug:
            reconstruction(ncomp, None, S, None)
        # scaling EVs by the square root of EVals: one broadcast in-place
        # division of every row by its singular value
        V /= S[:, np.newaxis]
        V = V[:ncomp]
        if verbose:
            print('Done PCA with numpy linalg eigh functions')

    elif mode == 'lapack':
        # n_frames is usually smaller than n_pixels. In this setting taking the SVD of M'
        # and keeping the left (transposed) SVs is faster than taking the SVD of M (right SVs)
        U, S, V = linalg.svd(matrix.T, full_matrices=False)
        if debug:
            reconstruction(ncomp, U, S, V)
        V = V[:ncomp]                       # we cut projection matrix according to the # of PCs
        U = U[:, :ncomp]
        S = S[:ncomp]
        if verbose:
            print('Done SVD/PCA with numpy SVD (LAPACK)')

    elif mode == 'arpack':
        U, S, V = svds(matrix, k=ncomp)
        if debug:
            reconstruction(ncomp, U, S, V, -1)
        if verbose:
            print('Done SVD/PCA with scipy sparse SVD (ARPACK)')

    elif mode == 'randsvd':
        U, S, V = randomized_svd(matrix, n_components=ncomp, n_iter=2,
                                 transpose='auto', random_state=random_state)
        if debug:
            reconstruction(ncomp, U, S, V)
        if verbose:
            print('Done SVD/PCA with randomized SVD')

    elif mode == 'cupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        a_gpu = cupy.array(matrix)
        a_gpu = cupy.asarray(a_gpu)  # move the data to the current device
        u_gpu, s_gpu, vh_gpu = cupy.linalg.svd(a_gpu, full_matrices=True,
                                               compute_uv=True)
        V = vh_gpu[:ncomp]
        if to_numpy:
            V = cupy.asnumpy(V)
        if usv:
            S = s_gpu[:ncomp]
            if to_numpy:
                S = cupy.asnumpy(S)
            U = u_gpu[:, :ncomp]
            if to_numpy:
                U = cupy.asnumpy(U)
        if verbose:
            print('Done SVD/PCA with cupy (GPU)')

    elif mode == 'randcupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='cupy')
        if to_numpy:
            V = cupy.asnumpy(V)
            S = cupy.asnumpy(S)
            U = cupy.asnumpy(U)
        if debug:
            reconstruction(ncomp, U, S, V)
        if verbose:
            print('Done randomized SVD/PCA with cupy (GPU)')

    elif mode == 'eigencupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        a_gpu = cupy.array(matrix)
        a_gpu = cupy.asarray(a_gpu)         # move the data to the current device
        C = cupy.dot(a_gpu, a_gpu.T)        # covariance matrix
        e, EV = cupy.linalg.eigh(C)         # eigenvalues and eigenvectors
        pc = cupy.dot(EV.T, a_gpu)          # PCs using a compact trick when cov is MM'
        V = pc[::-1]                        # reverse since last eigenvectors are the ones we want
        S = cupy.sqrt(e)[::-1]              # reverse since eigenvalues are in increasing order
        if debug:
            reconstruction(ncomp, None, S, None)
        # scaling by the square root of eigenvalues (single broadcast
        # division instead of one kernel launch per column)
        V /= S[:, None]
        V = V[:ncomp]
        if to_numpy:
            V = cupy.asnumpy(V)
        if verbose:
            print('Done PCA with cupy eigh function (GPU)')

    elif mode == 'pytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32').T))
        u_gpu, s_gpu, vh_gpu = torch.svd(a_gpu)
        V = vh_gpu[:ncomp]
        S = s_gpu[:ncomp]
        U = torch.transpose(u_gpu, 0, 1)[:ncomp]
        if to_numpy:
            # NOTE(review): np.array() on a CUDA tensor may need an explicit
            # .cpu() first -- confirm with the torch version in use.
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if verbose:
            print('Done SVD/PCA with pytorch (GPU)')

    elif mode == 'eigenpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32')))
        C = torch.mm(a_gpu, torch.transpose(a_gpu, 0, 1))
        # NOTE(review): torch.eig is not available in recent torch releases
        # -- confirm the supported torch version.
        e, EV = torch.eig(C, eigenvectors=True)
        V = torch.mm(torch.transpose(EV, 0, 1), a_gpu)
        S = torch.sqrt(e[:, 0])
        if debug:
            reconstruction(ncomp, None, S, None)
        V /= S[:, None]  # scale each row by its singular value
        V = V[:ncomp]
        if to_numpy:
            V = np.array(V)
        if verbose:
            print('Done PCA with pytorch eig function')

    elif mode == 'randpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='pytorch')
        if to_numpy:
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if debug:
            reconstruction(ncomp, U, S, V)
        if verbose:
            print('Done randomized SVD/PCA with randomized pytorch (GPU)')

    else:
        raise ValueError('The SVD mode is not available')

    if usv:
        if mode == 'lapack':
            # the SVD was taken on matrix.T, so the factors are swapped and
            # transposed back here
            return V.T, S, U.T
        elif mode == 'pytorch':
            if to_numpy:
                return V.T, S, U.T
            else:
                return torch.transpose(V, 0, 1), S, torch.transpose(U, 0, 1)
        else:
            return U, S, V
    else:
        if mode == 'lapack':
            return U.T
        elif mode == 'pytorch':
            return U
        else:
            return V
 def path_length_difference(self, distance, ht, hr):
     """Return ``sqrt((ht+hr)^2 + distance^2) - sqrt((ht-hr)^2 + distance^2)``.

     Args:
         distance: Scalar or array entering both square roots squared.
         ht: First height term. (presumably the transmitter height -- confirm)
         hr: Second height term. (presumably the receiver height -- confirm)

     Returns:
         The difference of the two root terms.
     """
     dist_sq = distance**2
     longer = cp.sqrt((ht + hr)**2 + dist_sq)
     shorter = cp.sqrt((ht - hr)**2 + dist_sq)
     return longer - shorter