Exemplos de dot em Python, exemplos de cupy.dot em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: lstm_layer.py Projeto: bordingj/Neural-Layers

    def backward(self, inputs, grad_outputs):
        """Backpropagate through one LSTM step.

        ``inputs`` is ``(x, h_tm1, c_tm1)`` and ``grad_outputs`` is
        ``(gh, gc)``.  The gradients of the weights are accumulated into
        ``self.gW`` and ``self.gV`` (and ``self.gb`` unless ``self.nobias``
        is set).

        Returns:
            tuple: ``(gx, gh_tm1, gc_tm1)``.
        """
        xp = cuda.get_array_module(*inputs)
        x, h_tm1, c_tm1 = inputs
        gh, gc = grad_outputs

        # gz is the second gradient returned for the inputs (c_tm1, self.z).
        gc_tm1, gz = self.lstm_fun.backward(inputs=(c_tm1, self.z),
                                            grad_outputs=(gc, gh))

        n_rows = x.shape[0]
        float32 = np.dtype('float32')
        gx = xp.empty((n_rows, self.in_size), dtype=float32)
        gh_tm1 = xp.empty_like(h_tm1)

        if xp is not np:
            # GPU path: products are written into / accumulated onto
            # preallocated buffers.
            gh_tm1 = cp.dot(gz, self.V, out=gh_tm1)
            gx = cp.dot(gz, self.W, out=gx)
            gpu.utils.dot_add(gz, x, C=self.gW, transa=True)
            gpu.utils.dot_add(gz, h_tm1, C=self.gV, transa=True)
            if not self.nobias:
                ones_row = xp.ones((1, n_rows), dtype=float32)
                gpu.utils.dot_add(ones_row, gz, C=self.gb)
        else:
            # CPU path: same computation with plain NumPy.
            gh_tm1 = np.dot(gz, self.V, out=gh_tm1)
            gx = np.dot(gz, self.W, out=gx)
            gz_t = gz.T
            self.gW += gz_t.dot(x)
            self.gV += gz_t.dot(h_tm1)
            if not self.nobias:
                ones_row = xp.ones((1, n_rows), dtype=float32)
                self.gb += np.dot(ones_row, gz)

        return gx, gh_tm1, gc_tm1

Exemplo n.º 2

0

Exibir arquivo

Arquivo: lstm_decoder_layer.py Projeto: bordingj/Neural-Layers

    def backward(self, inputs, grad_outputs):
        """Backpropagate through one decoder-LSTM step.

        ``inputs`` is ``(x, h_tm1, c_tm1, q)`` and ``grad_outputs`` is
        ``(gh, gc)``; either gradient may be ``None``.  Weight gradients are
        accumulated into ``self.gW``, ``self.gV``, ``self.gU`` and, unless
        ``self.nobias`` is set, ``self.gb``.

        Returns:
            tuple: ``(gx, gh_tm1, gc_tm1, gq)``.
        """
        xp = cuda.get_array_module(*inputs)
        gh, gc = grad_outputs
        x, h_tm1, c_tm1, q = inputs
        
        # A missing upstream gradient is replaced by a 1x1 zero placeholder;
        # the *_is_none flags tell the backward kernel about it.
        if gh is None:
            gh = xp.array([[0]], dtype=np.float32)
            gh_is_none = 1
        else:
            gh_is_none = 0
        if gc is None:
            gc = xp.array([[0]], dtype=np.float32)
            gc_is_none = 1
        else:
            gc_is_none = 0
        
        # gc_tm1 aliases self.c (no copy).
        # NOTE(review): presumably the backward kernel overwrites self.c in
        # place with the gradient w.r.t. c_tm1 -- confirm against the kernel.
        gc_tm1 = self.c
        
        batchsize = x.shape[0]
        
        gx      = xp.empty((batchsize,self.in_size),dtype=np.dtype('float32'))
        gq      = xp.empty((batchsize,self.encode_size),dtype=np.dtype('float32'))
        
        if xp is np:
            _lstm_backward_cpu(c=self.c, z=self.z, gh=gh, 
                          gc=gc, c_tm1=c_tm1,
                          gc_is_none=gc_is_none, gh_is_none=gh_is_none)

            # gz aliases self.z.
            # NOTE(review): presumably the kernel above has replaced z with
            # its gradient in place -- confirm.
            gz = self.z
            # The self.h buffer is reused as the output for gh_tm1.
            gh_tm1 = np.dot(gz, self.V, out=self.h)
            gx = np.dot(gz, self.W, out=gx)
            gq = np.dot(gz, self.U, out=gq) 
            # accumulate gradients of the weight matrices
            self.gW += gz.T.dot(x)
            self.gV += gz.T.dot(h_tm1)
            self.gU += gz.T.dot(q)
            if not self.nobias:
                # a (1, batchsize) row of ones times gz sums gz over the batch
                gb_ones = xp.ones((1,batchsize), dtype=np.dtype('float32'))
                self.gb += np.dot(gb_ones, gz)
        else:
            _lstm_backward_gpu(c=self.c, z=self.z, gh=gh, 
                          gc=gc, c_tm1=c_tm1,
                          gc_is_none=gc_is_none, gh_is_none=gh_is_none)

            # Same aliasing as in the CPU branch: gz is self.z and the
            # self.h buffer receives gh_tm1.
            gz = self.z
            gh_tm1 = cp.dot(gz, self.V, out=self.h)
            gx = cp.dot(gz, self.W, out=gx)
            gq = cp.dot(gz, self.U, out=gq)
            # accumulate gradients of the weight matrices into the C buffers
            gpu.utils.dot_add(gz, x, C=self.gW, transa=True)
            gpu.utils.dot_add(gz, h_tm1, C=self.gV, transa=True)
            gpu.utils.dot_add(gz, q, C=self.gU, transa=True)
            if not self.nobias:
                gb_ones = xp.ones((1,batchsize), dtype=np.dtype('float32'))
                gpu.utils.dot_add(gb_ones, gz, C=self.gb)

        return gx, gh_tm1, gc_tm1, gq

Exemplo n.º 3

0

Exibir arquivo

Arquivo: lstm_decoder_layer.py Projeto: bordingj/Neural-Layers

    def forward(self, inputs):
        """Run one decoder-LSTM step.

        ``inputs`` is ``(x, h_tm1, c_tm1, q)``.  The buffers ``self.z``,
        ``self.c`` and ``self.h`` are freshly allocated on every call and
        filled by this method.

        Returns:
            tuple: ``(self.h, self.c)``.
        """
        xp = cuda.get_array_module(*inputs)
        x, h_tm1, c_tm1, q = inputs

        n_rows = x.shape[0]
        float32 = np.dtype('float32')
        hidden_shape = (n_rows, self.out_size)

        self.z = xp.empty((n_rows, self.out_size * 4), dtype=float32)
        self.c = xp.empty(hidden_shape, dtype=float32)
        self.h = xp.empty(hidden_shape, dtype=float32)

        if xp is not np:
            # GPU path: z = x W^T, then accumulate h_tm1 V^T and q U^T onto z.
            self.z = cp.dot(x, self.W.T, out=self.z)
            gpu.utils.dot_add(A=h_tm1, B=self.V, C=self.z, transb=True)
            gpu.utils.dot_add(A=q, B=self.U, C=self.z, transb=True)
            if not self.nobias:
                gpu.utils.addVec2Mat(self.z, self.b)
            _lstm_forward_gpu(z=self.z, c_tm1=c_tm1, c=self.c,
                              h=self.h, out_size=self.out_size)
        else:
            # CPU path: same sum of three products, plus the optional bias.
            self.z = np.dot(x, self.W.T, out=self.z)
            self.z += np.dot(h_tm1, self.V.T)
            self.z += np.dot(q, self.U.T)
            if not self.nobias:
                self.z += self.b
            _lstm_forward_cpu(z=self.z, c_tm1=c_tm1, c=self.c,
                              h=self.h, out_size=self.out_size)

        return self.h, self.c

Exemplo n.º 4

0

Exibir arquivo

Arquivo: lstm_layer.py Projeto: bordingj/Neural-Layers

    def forward(self, inputs):
        """Run one LSTM step.

        ``inputs`` is ``(x, h_tm1, c_tm1)``.  The pre-activation matrix is
        ``x W^T + h_tm1 V^T`` (plus ``self.b`` unless ``self.nobias``); it is
        handed to a fresh ``F.LSTM`` function together with ``c_tm1`` and
        then cached on ``self.z``.

        Returns:
            tuple: ``(h, c)``.
        """
        xp = cuda.get_array_module(*inputs)
        x, h_tm1, c_tm1 = inputs

        n_rows = x.shape[0]
        pre_act = xp.empty((n_rows, self.out_size * 4),
                           dtype=np.dtype('float32'))

        if xp is not np:
            pre_act = cp.dot(x, self.W.T, out=pre_act)
            gpu.utils.dot_add(A=h_tm1, B=self.V, C=pre_act, transb=True)
            if not self.nobias:
                gpu.utils.addVec2Mat(pre_act, self.b)
        else:
            pre_act = x.dot(self.W.T, out=pre_act)
            pre_act += h_tm1.dot(self.V.T)
            if not self.nobias:
                pre_act += self.b

        # The function object is kept on self alongside the cached z.
        self.lstm_fun = F.LSTM()
        c, h = self.lstm_fun.forward(inputs=(c_tm1, pre_act))
        self.z = pre_act
        return h, c

Exemplo n.º 5

0

Exibir arquivo

Arquivo: p200_GRU_sine.py Projeto: tf-nightly/p200504

    def backward(self, x, y, y_prev, gates, grad_y):
        """Backpropagate through one GRU step.

        Args:
            x: Input of this step.
            y: Output of this step (not used in the computation).
            y_prev: Output of the previous step.
            gates: Tuple ``(a0, a1, a2)`` of activations saved by the forward
                pass.  The derivative factors below treat ``a0`` and ``a1``
                as sigmoid outputs (``a * (1 - a)``) and ``a2`` as a tanh
                output (``1 - a2**2``).
            grad_y: Gradient with respect to this step's output.

        Side effects:
            Adds to ``self.grad_w[i]`` and ``self.grad_v[i]`` for ``i`` in
            0..2, and sets ``self.grad_x`` and ``self.grad_y_prev``.
        """
        a0, a1, a2 = gates

        delta_a2 = grad_y * a0 * (1 - a2**2)
        self.grad_w[2] += np.dot(x.T, delta_a2)
        self.grad_v[2] += np.dot((a1 * y_prev).T, delta_a2)

        delta_a0 = grad_y * (a2 - y_prev) * a0 * (1 - a0)
        self.grad_w[0] += np.dot(x.T, delta_a0)
        self.grad_v[0] += np.dot(y_prev.T, delta_a0)

        s = np.dot(delta_a2, self.v[2].T)
        delta_a1 = s * y_prev * a1 * (1 - a1)
        self.grad_w[1] += np.dot(x.T, delta_a1)
        self.grad_v[1] += np.dot(y_prev.T, delta_a1)

        # The sums must be parenthesised: a line that starts with ``+`` is a
        # separate unary-plus expression statement, so without the
        # parentheses only the first term would be assigned.
        self.grad_x = (
            np.dot(delta_a0, self.w[0].T)
            + np.dot(delta_a1, self.w[1].T)
            + np.dot(delta_a2, self.w[2].T)
        )

        self.grad_y_prev = (
            np.dot(delta_a0, self.v[0].T)
            + np.dot(delta_a1, self.v[1].T)
            + a1 * s
            + grad_y * (1 - a0)
        )

Exemplo n.º 6

0

Exibir arquivo

Arquivo: _solve.py Projeto: zhaohb/cupy

def lstsq(a, b, rcond='warn'):
    """Compute the least-squares solution of ``a x = b`` via the SVD.

    The returned ``x`` minimizes the squared Euclidean norm
    ``|| b - a x ||^2``.  The system may be under-, well- or
    over-determined.

    Args:
        a (cupy.ndarray): Coefficient matrix of shape ``(M, N)``.
        b (cupy.ndarray): Right-hand side of shape ``(M,)`` or ``(M, K)``.
        rcond (float): Relative cutoff for small singular values.  Singular
            values not larger than ``rcond`` times the largest singular
            value are treated as zero.  ``None`` selects machine epsilon
            times ``max(M, N)``; values ``<= 0`` or ``>= 1`` select machine
            epsilon.  The default ``'warn'`` emits a ``FutureWarning`` and
            behaves like ``-1``.

    Returns:
        tuple:
            ``(x, residuals, rank, s)``.  ``x`` has shape ``(N,)`` or
            ``(N, K)`` following ``b``.  ``residuals`` holds the squared
            2-norm of each column of ``b - a x``; it is an empty array when
            ``M <= N`` or when the rank of ``a`` differs from ``N``.
            ``rank`` is the number of singular values above the cutoff and
            ``s`` contains the singular values of ``a``.

    Raises:
        linalg.LinAlgError: If ``b`` has more than two dimensions or its
            first dimension differs from ``M``.

    .. warning::
        This function calls one or more cuSOLVER routine(s) which may yield
        invalid results if input conditions are not met.
        To detect these invalid results, you can set the `linalg`
        configuration to a value that is not `ignore` in
        :func:`cupyx.errstate` or :func:`cupyx.seterr`.

    .. seealso:: :func:`numpy.linalg.lstsq`
    """
    if rcond == 'warn':
        message = (
            '`rcond` parameter will change to the default of '
            'machine precision times ``max(M, N)`` where M and N '
            'are the input matrix dimensions.\n'
            'To use the future default and silence this warning '
            'we advise to pass `rcond=None`, to keep using the old, '
            'explicitly pass `rcond=-1`.'
        )
        warnings.warn(message, FutureWarning)
        rcond = -1

    # Input validation.
    _util._assert_cupy_array(a, b)
    _util._assert_rank2(a)
    if b.ndim > 2:
        raise linalg.LinAlgError('{}-dimensional array given. Array must be at'
                                 ' most two-dimensional'.format(b.ndim))
    m, n = a.shape[-2:]
    if m != b.shape[0]:
        raise linalg.LinAlgError('Incompatible dimensions')

    u, s, vh = cupy.linalg.svd(a, full_matrices=False)

    # Resolve the effective relative cutoff.
    if rcond is None:
        rcond = numpy.finfo(s.dtype).eps * max(m, n)
    elif rcond <= 0 or rcond >= 1:
        # some doc of gelss/gelsd says "rcond < 0", but it's not true!
        rcond = numpy.finfo(s.dtype).eps

    # Invert the singular values, zeroing the ones at or below the cutoff;
    # the number of survivors is the numerical rank.
    threshold = rcond * s.max()
    inv_s = 1 / s
    negligible = s <= threshold
    inv_s[negligible] = 0
    rank = s.size - negligible.sum(dtype=numpy.int32)

    # x = vh.T.conj() @ diag(inv_s) @ u.T.conj() @ b
    projected = cupy.dot(b.T, u.conj())
    z = (projected * inv_s).T
    x = cupy.dot(vh.T.conj(), z)

    # Residuals are reported only for over-determined, full-rank systems.
    if m > n and rank == n:
        e = b - a.dot(x)
        resids = cupy.atleast_1d(_nrm2_last_axis(e.T))
    else:
        resids = cupy.empty((0, ), dtype=s.dtype)
    return x, resids, rank, s

Exemplo n.º 7

0

Exibir arquivo

    for j in range(M - 1):
        x_train[i, j] = float(ss[j])
    y_train[i, 0] = float(ss[M - 1])
    x_train[i, M - 1] = 1
# Read the test-set header line: two integers, the number of test samples
# and a count that is incremented by one to give M (the extra column is
# filled with the constant 1 below).
# NOTE(review): `str` shadows the builtin of the same name from here on.
str = f.readline()
ss = str.split()
test_size = int(ss[0])
M = int(ss[1]) + 1
x_test = cp.ndarray(shape=(test_size, M), dtype=float)
y_test = cp.ndarray(shape=(test_size, 1), dtype=float)
# Each following line holds M - 1 feature values followed by the label.
for i in range(test_size):
    str = f.readline()
    ss = str.split()
    for j in range(M - 1):
        x_test[i, j] = float(ss[j])
    y_test[i, 0] = float(ss[M - 1])
    # the last column is a constant 1
    x_test[i, M - 1] = 1
print(M, data_size, test_size)

# Run gradient descent; the initial value of w is a random initialization.
w = gradient_descent(x_train, y_train, random_initialization(M, 1))
cnt = 0
# Threshold g(x . w) at 0.5 to obtain a 0/1 prediction and count matches
# with the stored label.
for i in range(test_size):
    if (g(cp.dot(x_test[i], w)) < 0.5):
        flag = 0
    else:
        flag = 1
    if (flag == y_test[i, 0]):
        cnt = cnt + 1
# Print the accuracy.
print(cnt / test_size)

Exemplo n.º 8

0

Exibir arquivo

    def gradient(self, x, target):
        """Run one forward/backward pass and apply a gradient-descent step.

        The network has three tanh hidden layers followed by a softmax
        output.  The output error ``(output - target) / batch_size`` is
        propagated back through the layers and the weights ``W_f1..W_f4``
        and biases ``b1..b4`` are updated in place with step size 0.02.
        """
        # Forward pass.
        pre1 = cp.dot(x, self.W_f1) + self.b1
        act1 = cp.tanh(pre1)
        pre2 = cp.dot(act1, self.W_f2) + self.b2
        act2 = cp.tanh(pre2)
        pre3 = cp.dot(act2, self.W_f3) + self.b3
        act3 = cp.tanh(pre3)
        pre4 = cp.dot(act3, self.W_f4) + self.b4
        probs = softmax(pre4)

        # Backward pass, output layer first.  Multiplying by a vector of
        # ones on the left sums the per-sample errors for the bias gradient.
        err4 = (probs - target) / batch_size
        ones = cp.ones(batch_size)
        grad_W4 = cp.dot(act3.T, err4)
        grad_b4 = cp.dot(ones, err4)

        err3 = tanh_grad(pre3) * cp.dot(err4, self.W_f4.T)
        grad_W3 = cp.dot(act2.T, err3)
        grad_b3 = cp.dot(ones, err3)

        err2 = tanh_grad(pre2) * cp.dot(err3, self.W_f3.T)
        grad_W2 = cp.dot(act1.T, err2)
        grad_b2 = cp.dot(ones, err2)

        err1 = tanh_grad(pre1) * cp.dot(err2, self.W_f2.T)
        grad_W1 = cp.dot(x.T, err1)
        grad_b1 = cp.dot(ones, err1)

        # Parameter update (all gradients were computed with the old
        # weights above).
        step = 0.02
        self.W_f1 -= step * grad_W1
        self.W_f2 -= step * grad_W2
        self.W_f3 -= step * grad_W3
        self.W_f4 -= step * grad_W4
        self.b1 -= step * grad_b1
        self.b2 -= step * grad_b2
        self.b3 -= step * grad_b3
        self.b4 -= step * grad_b4

Exemplo n.º 9

0

Exibir arquivo

def least_square(X, Y):
    """Return ``inv(X^T X) X^T Y``, the normal-equation least-squares fit."""
    gram = cp.dot(X.T, X)
    left_inverse = cp.dot(cp.linalg.inv(gram), X.T)
    return cp.dot(left_inverse, Y)

Exemplo n.º 10

0

Exibir arquivo

def loss(x, y, w):
    """Return the squared-error loss ``(x w - y)^T (x w - y) / (2 * data_size)``.

    ``data_size`` is read from module scope.  The result is the single
    element ``[0, 0]`` of the 1x1 product.
    """
    residual = cp.add(cp.dot(x, w), -1 * y)
    scale = 1.0 / (2 * data_size)
    scaled_sq_norm = scale * cp.dot(residual.T, residual)
    return scaled_sq_norm[0, 0]

Exemplo n.º 11

0

Exibir arquivo

        cuda.close()
        raise


# Matrix product compiled through the `jit` decorator with the options
# nopython, fastmath, nogil and parallel enabled.
@jit(nopython=True, fastmath=True, nogil=True, parallel=True)
def jit_dot(a, b):
    """Return ``np.dot(a, b)``."""
    return np.dot(a, b)


# Time each matrix-product implementation once and print the elapsed time.
# time.clock() was removed in Python 3.8; time.perf_counter() is the
# replacement recommended for measuring short durations.
start = time.perf_counter()
C1 = np.dot(A, B)
print('\nnumpy compute time used: %f' % (time.perf_counter() - start))

# Drop the previous cupy result before timing a fresh computation.
del C5
# NOTE(review): GPU work may be launched asynchronously, so this interval
# may not include the full device-side compute time -- confirm whether a
# device synchronization is wanted before reading the clock.
start = time.perf_counter()
C5 = cp.dot(A2, B2)
print('\ncupy compute time used: %f' % (time.perf_counter() - start))

start = time.perf_counter()
C4 = jit_dot(A, B)
print('\njit compute time used: %f' % (time.perf_counter() - start))

start = time.perf_counter()
C2 = cuda_dot1(A, B)
print('\ncuda jit 1 compute time used: %f' % (time.perf_counter() - start))

start = time.perf_counter()
C2 = cuda_dot2(A, B)
print('\ncuda jit 2 compute time used: %f' % (time.perf_counter() - start))

start = time.perf_counter()

Exemplo n.º 12

0

Exibir arquivo

def cgs(A, b, x0=None, tol=1e-5, maxiter=None, M=None, callback=None,
        atol=None):
    """Use Conjugate Gradient Squared iteration to solve ``Ax = b``.

    Args:
        A (ndarray, spmatrix or LinearOperator): The real or complex matrix of
            the linear system with shape ``(n, n)``.
        b (cupy.ndarray): Right hand side of the linear system with shape
            ``(n,)`` or ``(n, 1)``.
        x0 (cupy.ndarray): Starting guess for the solution.
        tol (float): Relative tolerance for convergence; it is scaled by the
            norm of ``b``.
        maxiter (int): Maximum number of iterations. Defaults to ``5 * n``.
        M (ndarray, spmatrix or LinearOperator): Preconditioner for ``A``.
            The preconditioner should approximate the inverse of ``A``.
            ``M`` must be :class:`cupy.ndarray`,
            :class:`cupyx.scipy.sparse.spmatrix` or
            :class:`cupyx.scipy.sparse.linalg.LinearOperator`.
        callback (function): User-specified function to call after each
            iteration. It is called as ``callback(xk)``, where ``xk`` is the
            current solution vector.
        atol (float): Absolute tolerance for convergence. The threshold
            actually used is ``max(atol, tol * norm(b))``; when ``atol`` is
            ``None`` it is ``tol * norm(b)``.

    Returns:
        tuple:
            It returns ``x`` (cupy.ndarray) and ``info`` (int) where ``x`` is
            the converged solution and ``info`` provides convergence
            information: ``0`` on success, or the iteration count when
            ``maxiter`` was reached without the residual norm dropping below
            the tolerance.

    .. seealso:: :func:`scipy.sparse.linalg.cgs`
    """
    # NOTE(review): _make_system is defined elsewhere; the code below relies
    # on the returned A and M exposing ``matvec`` and on x being the initial
    # iterate -- confirm against its definition.
    A, M, x, b = _make_system(A, M, x0, b)

    matvec = A.matvec
    psolve = M.matvec

    n = A.shape[0]
    # Trivial cases: empty system, or zero right-hand side (b itself is
    # returned as the solution).
    if n == 0:
        return cupy.empty_like(b), 0
    b_norm = cupy.linalg.norm(b)
    if b_norm == 0:
        return b, 0
    if atol is None:
        atol = tol * float(b_norm)
    else:
        atol = max(float(atol), tol * float(b_norm))
    if maxiter is None:
        maxiter = n * 5

    # Initial residual; r0 stays fixed and is used in every inner product.
    r0 = b - matvec(x)

    rho = cupy.dot(r0, r0)

    # initialise vectors
    r = r0.copy()
    u = r0
    p = r0.copy()

    iters = 0
    while True:
        y = psolve(p)
        v = matvec(y)
        sigma = cupy.dot(r0, v)
        alpha = rho / sigma
        q = u - alpha * v

        # Update the solution and the residual (both in place).
        z = psolve(u + q)
        x += alpha * z
        Az = matvec(z)
        r -= alpha * Az

        # Update residual norm and check convergence
        r_norm = cupy.linalg.norm(r)

        iters += 1
        if callback is not None:
            callback(x)

        if r_norm <= atol or iters >= maxiter:
            break

        rho_new = cupy.dot(r0, r)
        beta = rho_new / rho
        rho = rho_new
        u = r + beta * q
        # The four in-place steps below compute p = u + beta * (q + beta * p)
        # without allocating temporaries.
        p *= beta
        p += q
        p *= beta
        p += u

    # Report failure only if the loop stopped on the iteration limit with
    # the residual still not below the tolerance.
    info = 0
    if iters == maxiter and not (r_norm < atol):
        info = iters

    return x, info

Exemplo n.º 13

0

Exibir arquivo

Arquivo: filtering.py Projeto: h2oai/cusignal

def detrend(data, axis=-1, type="linear", bp=0, overwrite_data=False):
    """
    Remove linear trend along axis from data.

    Parameters
    ----------
    data : array_like
        The input data.
    axis : int, optional
        The axis along which to detrend the data. By default this is the
        last axis (-1).
    type : {'linear', 'constant'}, optional
        The type of detrending. If ``type == 'linear'`` (default),
        the result of a linear least-squares fit to `data` is subtracted
        from `data`.
        If ``type == 'constant'``, only the mean of `data` is subtracted.
        The abbreviations ``'l'`` and ``'c'`` are accepted as well.
    bp : array_like of ints, optional
        A sequence of break points. If given, an individual linear fit is
        performed for each part of `data` between two break points.
        Break points are specified as indices into `data`.
    overwrite_data : bool, optional
        If True, perform in place detrending and avoid a copy. Default is False

    Returns
    -------
    ret : ndarray
        The detrended input data.

    Raises
    ------
    ValueError
        If `type` is not one of the accepted values, or if a break point
        exceeds the length of `data` along `axis`.

    Examples
    --------
    >>> import cusignal
    >>> import cupy as cp
    >>> randgen = cp.random.RandomState(9)
    >>> npoints = 1000
    >>> noise = randgen.randn(npoints)
    >>> x = 3 + 2*cp.linspace(0, 1, npoints) + noise
    >>> (cusignal.detrend(x) - noise).max() < 0.01
    True
    """
    if type not in ["linear", "l", "constant", "c"]:
        raise ValueError("Trend type must be 'linear' or 'constant'.")
    data = asarray(data)
    # Work in one of the floating/complex dtypes 'd', 'f', 'D', 'F';
    # anything else falls back to double precision.
    dtype = data.dtype.char
    if dtype not in "dfDF":
        dtype = "d"
    if type in ["constant", "c"]:
        # Constant detrending: subtract the mean along `axis`.
        ret = data - expand_dims(mean(data, axis), axis)
        return ret
    else:
        dshape = data.shape
        N = dshape[axis]
        # Always include 0 and N so the segments cover the whole axis.
        bp = sort(unique(r_[0, bp, N]))
        if cp.any(bp > N):
            raise ValueError("Breakpoints must be less than length of \
                data along given axis.")
        Nreg = len(bp) - 1
        # Restructure data so that axis is along first dimension and
        #  all other dimensions are collapsed into second dimension
        rnk = len(dshape)
        if axis < 0:
            axis = axis + rnk
        newdims = np.r_[axis, 0:axis, axis + 1:rnk]
        newdata = reshape(transpose(data, tuple(newdims)),
                          (N, _prod(dshape) // N))
        if not overwrite_data:
            newdata = newdata.copy()  # make sure we have a copy
        if newdata.dtype.char not in "dfDF":
            newdata = newdata.astype(dtype)
        # Find leastsq fit and remove it for each piece
        for m in range(Nreg):
            Npts = int(bp[m + 1] - bp[m])
            # Design matrix: column 0 is a ramp 1/Npts .. 1, column 1 is ones.
            A = ones((Npts, 2), dtype)
            A[:, 0] = arange(1, Npts + 1) * 1.0 / Npts
            sl = slice(bp[m], bp[m + 1])
            coef, resids, rank, s = linalg.lstsq(A, newdata[sl])
            newdata[sl] = newdata[sl] - dot(A, coef)
        # Put data back in original shape.
        tdshape = take(asarray(dshape), asarray(newdims), 0)
        ret = reshape(newdata, tuple(cp.asnumpy(tdshape)))
        # Inverse of the `newdims` permutation applied above.
        vals = list(range(1, rnk))
        olddims = vals[:axis] + [0] + vals[axis:]
        ret = transpose(ret, tuple(cp.asnumpy(olddims)))
        return ret

Exemplo n.º 14

0

Exibir arquivo

    def training(self):
        """Train the network on ``self.x_train`` / ``self.y_train``.

        The original note states that training uses 60,000 images.
        The resulting weights are stored on ``self.weight_list`` and saved to
        ``./weights_cupy.npy``.
        """

        print("Training...")

        # 27x27 identity matrix; its columns serve as one-hot targets.
        diagonale = cp.eye(27, 27)

        # globals() returns a dictionary representing the current global
        # symbol table; it is used to look up, for every activation function,
        # the module-level function named "<name>_prime".
        self.activations_prime = [
            globals()[fonction.__name__ + '_prime']
            for fonction in self.activations
        ]

        node_dict = {}

        # List of weight matrices.
        # The node weights are initialised randomly so that they are not 0.
        # Builds one matrix per pair of consecutive layers (the original note
        # mentions 100x1600, 100x100 and 27x100).
        # The division by cp.sqrt() is, per the original note, an empirical
        # result from the Xavier Glorot and He initialisation.
        weight_list = [cp.random.randn(self.layers[k+1], self.layers[k]) / \
                       cp.sqrt(self.layers[k]) for k in range(len(self.layers)-1)]

        # vecteur_ligne = the image as a row
        # nombre_lettre = the number corresponding to the image's letter
        # i is the iteration index; the pair comes from x_train[i], y_train[i]
        for i, (vecteur_ligne,
                nombre_lettre) in enumerate(zip(self.x_train, self.y_train)):

            # the row becomes a column
            vecteur_colonne = cp.array(vecteur_ligne, ndmin=2).T

            # Without this conversion the indexing below fails with:
            # IndexError: arrays used as indices must be of integer or boolean type.
            # (actual: <class 'numpy.object_'>) in diagonale[:,[nombre_lettre]]
            nombre_lettre = int(nombre_lettre)

            # Forward propagation
            node_dict[0] = vecteur_colonne
            for k in range(len(self.layers) - 1):
                # weight_list[k] (100x1600, 100x100, 27x100 per the original
                # note) times the current column vector
                z = cp.dot(weight_list[k], vecteur_colonne)

                # self.activations is non-linear; otherwise the output would
                # be a linear function of the input.  It imitates the
                # electrical activation threshold of a neuron.
                vecteur_colonne = self.activations[k](z)

                node_dict[k + 1] = vecteur_colonne

            # Back-propagation: delta_a = gap between actual and expected output
            delta_a = vecteur_colonne - diagonale[:, [nombre_lettre]]

            # Walk the layers in reverse order to correct the weights
            # proportionally to the error relative to the desired value.
            # Stochastic gradient descent.
            for k in range(len(self.layers) - 2, -1, -1):
                delta_z = delta_a * self.activations_prime[k](node_dict[k + 1])
                delta_w = cp.dot(delta_z, node_dict[k].T)
                # delta_a for the next (lower) layer is computed before
                # weight_list[k] is updated.
                delta_a = cp.dot(weight_list[k].T, delta_z)
                # Step towards the error minimum
                weight_list[k] -= self.learningrate * delta_w

        self.weight_list = weight_list

        # Write to a file
        print("type(weight_list :)", type(weight_list), "\nlen(weight_list) =",
              len(weight_list), "\n    0", len(weight_list[0]),
              type(weight_list[0]), "\n    1", len(weight_list[1]),
              type(weight_list[1]), "\n    2", len(weight_list[2]),
              type(weight_list[2]))

        cp.save('./weights_cupy.npy', weight_list, allow_pickle=True)
        print('weights_cupy.npy enregistré')

Exemplo n.º 15

0

Exibir arquivo

Arquivo: influence_matrix_utils.py Projeto: 2397957762/slippy

    def _cuda_bccg(f: typing.Callable, b: typing.Sequence, tol: float, max_it: int, x0: typing.Sequence,
                   min_pressure: float = 0.0, max_pressure: typing.Union[float, typing.Sequence] = cp.inf,
                   k_inn=1) -> typing.Tuple[cp.ndarray, bool]:
        """
        The Bound-Constrained Conjugate Gradient Method for Non-negative Matrices
        CUDA implementation

        Parameters
        ----------
        f: Callable
            A function equivalent to multiplication by a non negative n by n matrix must work with cupy arrays.
            Typically this function will be generated by slippy.contact.plan_convolve, this will guarantee
            compatibility with different versions of this function (FFTW and CUDA).
        b: array
            1 by n array of displacements
        tol: float
            The tolerance on the result
        max_it: int
            The maximum number of iterations used
        x0: array
            An initial guess of the solution
        min_pressure: float, optional (0)
            The minimum allowable pressure at each node, defaults to 0
        max_pressure: float, optional (inf)
            The maximum allowable pressure at each node, defaults to inf, for purely elastic contacts
        k_inn: int
            Number of inner iterations between outer iterations: the solution is projected onto the bounds and
            the bound sets are re-checked once the inner iteration counter reaches k_inn, or earlier if the
            relative update falls below tol.

        Returns
        -------
        x: cp.array
            The solution to the system f(x)-b = 0 with the constraints applied.
        failed: bool
            True if the loop stopped because no free nodes were left or because max_it was exceeded.

        Notes
        -----
        This function uses the method described in the reference below, with some modification.
        Firstly, this method allows both a minimum and maximum force to be set simulating quasi plastic regimes. The
        code has also been optimised in several places and importantly this version has also been modified to run
        on a GPU through cupy.

        If you do not have a CUDA compatible GPU, slippy can be imported while falling back to the fftw version
        by first importing slippy then patching the CUDA variable to False:

        >>> import slippy
        >>> slippy.CUDA = False
        >>> import slippy.contact
        >>> ...

        Though this should happen automatically if you don't have cupy installed.

        References
        ----------
        Vollebregt, E.A.H. The Bound-Constrained Conjugate Gradient Method for Non-negative Matrices. J Optim
        Theory Appl 162, 931–953 (2014). https://doi.org/10.1007/s10957-013-0499-x

        Examples
        --------

        """
        # if you use np or most built ins in this function at all it will slow it down a lot!
        # max_pressure may be a scalar or a per-node sequence; float() raising TypeError selects the latter.
        try:
            float(max_pressure)
            max_is_float = True
        except TypeError:
            max_is_float = False
            max_pressure = cp.array(max_pressure)

        # initialize
        b = cp.asarray(b)
        x = cp.clip(cp.asarray(x0), min_pressure, max_pressure)
        g = f(x) - b
        # NOTE(review): min_pressure is only used in the clip above; the lower-bound mask here and the
        # projection further down use the literal 0 -- confirm this is intended for non-zero min_pressure.
        msk_bnd_0 = cp.logical_and(x <= 0, g >= 0)
        msk_bnd_max = cp.logical_and(x >= max_pressure, g <= 0)
        n_bound = cp.sum(msk_bnd_0) + cp.sum(msk_bnd_max)
        n = b.size
        n_free = n - n_bound
        small = 1e-14
        it = 0
        it_inn = 0
        rho_prev = cp.nan
        rho = 0.0
        r, p, r_prev = 0, 0, 0
        failed = False

        while True:
            it += 1
            it_inn += 1
            x_prev = x
            if it > 1:
                r_prev = r
                rho_prev = rho
            # residual: negative gradient, zeroed on the nodes currently held at a bound
            r = -g
            r[msk_bnd_0] = 0
            r[msk_bnd_max] = 0
            rho = cp.dot(r, r)
            if it > 1:
                # search direction update; the coefficient is clamped at zero
                beta_pr = (rho - cp.dot(r, r_prev)) / rho_prev
                p = r + max([beta_pr, 0])*p
            else:
                p = r
            p[msk_bnd_0] = 0
            p[msk_bnd_max] = 0
            # compute tildex optimisation ignoring the bounds
            q = f(p)
            if it_inn < k_inn:
                q[msk_bnd_0] = cp.nan
                q[msk_bnd_max] = cp.nan
            alpha = cp.dot(r, p) / cp.dot(p, q)
            x = x + alpha * p

            # relative size of this update, used as the convergence measure
            rms_xk = cp.linalg.norm(x) / cp.sqrt(n_free)
            rms_upd = cp.linalg.norm(x - x_prev) / cp.sqrt(n_free)
            upd = rms_upd / rms_xk

            # project onto feasible domain
            changed = False
            outer_it = it_inn >= k_inn or upd < tol

            if outer_it:
                # nodes below zero are set to 0 and added to the lower bound set
                msk_prj_0 = x < -small
                if cp.any(msk_prj_0):
                    x[msk_prj_0] = 0
                    msk_bnd_0[msk_prj_0] = True
                    changed = True
                # nodes above the maximum are set to it and added to the upper bound set
                msk_prj_max = x >= max_pressure * (1 + small)
                if cp.any(msk_prj_max):
                    if max_is_float:
                        x[msk_prj_max] = max_pressure
                    else:
                        x[msk_prj_max] = max_pressure[msk_prj_max]
                    msk_bnd_max[msk_prj_max] = True
                    changed = True

            # recompute the gradient from scratch when x was projected (or on outer iterations with
            # k_inn > 1); otherwise update it incrementally
            if changed or (outer_it and k_inn > 1):
                g = f(x) - b
            else:
                g = g + alpha * q

            check_grad = outer_it

            if check_grad:
                # release bound nodes whose gradient sign indicates they should become free again
                msk_rel = cp.logical_or(cp.logical_and(msk_bnd_0, g < -small), cp.logical_and(msk_bnd_max, g > small))
                if cp.any(msk_rel):
                    msk_bnd_0[msk_rel] = False
                    msk_bnd_max[msk_rel] = False
                    changed = True

            if changed:
                n_free = n - cp.sum(msk_bnd_0) - cp.sum(msk_bnd_max)

            if not n_free:
                print("No free nodes")
                warnings.warn("No free nodes for BCCG iterations")
                failed = True
                break

            if outer_it:
                it_inn = 0

            if it > max_it:
                print("Max iterations")
                warnings.warn("Bound constrained conjugate gradient iterations failed to converge")
                failed = True
                break

            # converged: an outer iteration with no change to the bound sets and a small enough update
            if outer_it and (not changed) and upd < tol:
                break

        return x, bool(failed)

Exemplo n.º 16

0

Exibir arquivo

Arquivo: p200_GRU_sine.py Projeto: tf-nightly/p200504

    def backward(self, t):
        """Compute the gradients of the output layer for target ``t``.

        Uses the cached input ``self.x`` and output ``self.y`` and stores
        ``self.grad_w``, ``self.grad_b`` and ``self.grad_x``.
        """
        error = self.y - t

        inputs_t = self.x.T
        weights_t = self.w.T
        self.grad_w = np.dot(inputs_t, error)
        # the bias gradient is the error summed over axis 0
        self.grad_b = np.sum(error, axis=0)
        self.grad_x = np.dot(error, weights_t)

Exemplo n.º 17

0

Exibir arquivo

Arquivo: p200_GRU_sine.py Projeto: tf-nightly/p200504

 def forward(self, x):
     """Cache ``x`` and store the affine output ``x . w + b`` on ``self.y``."""
     self.x = x
     # identity activation: the affine result is used as-is
     self.y = np.dot(x, self.w) + self.b

Exemplo n.º 18

0

Exibir arquivo

 def backward(self, dout):
     """Backpropagate ``dout`` through the matrix product ``x . W``.

     The weight gradient is written in place into ``self.grads[0]`` and the
     gradient with respect to the input is returned.
     """
     W, = self.params
     dx = np.dot(dout, W.T)
     # The input must be read from the instance: ``x`` is not a local name
     # in this method.
     dW = np.dot(self.x.T, dout)
     # ``[...]`` overwrites the existing array instead of rebinding it.
     self.grads[0][...] = dW
     return dx

Exemplo n.º 19

0

Exibir arquivo

    def gradient(self, x, target, epoch):
        """Run one forward/backward pass with L2-style regularisation and update the parameters.

        The network has four tanh hidden layers and a softmax output.  Each
        gradient gets ``0.01 * parameter`` added, is stored on the instance
        as ``delta_Wf<k>`` / ``delta_b<k>``, and is then applied with a fixed
        step size of 0.02.  ``epoch`` is accepted but not used.
        """
        decay = 0.01

        # Forward pass.
        pre1 = cp.dot(x, self.W_f1) + self.b1
        act1 = cp.tanh(pre1)
        pre2 = cp.dot(act1, self.W_f2) + self.b2
        act2 = cp.tanh(pre2)
        pre3 = cp.dot(act2, self.W_f3) + self.b3
        act3 = cp.tanh(pre3)
        pre4 = cp.dot(act3, self.W_f4) + self.b4
        act4 = cp.tanh(pre4)
        pre5 = cp.dot(act4, self.W_f5) + self.b5
        probs = softmax(pre5)

        # Backward pass, output layer first.  Multiplying by a vector of
        # ones on the left sums the per-sample errors for the bias gradient.
        err5 = (probs - target) / batch_size
        ones = cp.ones(batch_size)
        self.delta_Wf5 = cp.dot(act4.T, err5) + decay * self.W_f5
        self.delta_b5 = cp.dot(ones, err5) + decay * self.b5

        err4 = tanh_grad(pre4) * cp.dot(err5, self.W_f5.T)
        self.delta_Wf4 = cp.dot(act3.T, err4) + decay * self.W_f4
        self.delta_b4 = cp.dot(ones, err4) + decay * self.b4

        err3 = tanh_grad(pre3) * cp.dot(err4, self.W_f4.T)
        self.delta_Wf3 = cp.dot(act2.T, err3) + decay * self.W_f3
        self.delta_b3 = cp.dot(ones, err3) + decay * self.b3

        err2 = tanh_grad(pre2) * cp.dot(err3, self.W_f3.T)
        self.delta_Wf2 = cp.dot(act1.T, err2) + decay * self.W_f2
        self.delta_b2 = cp.dot(ones, err2) + decay * self.b2

        err1 = tanh_grad(pre1) * cp.dot(err2, self.W_f2.T)
        self.delta_Wf1 = cp.dot(x.T, err1) + decay * self.W_f1
        self.delta_b1 = cp.dot(ones, err1) + decay * self.b1

        # Parameter update with a fixed step size; weights first, then biases.
        step = 0.02
        self.W_f1 -= step * self.delta_Wf1
        self.W_f2 -= step * self.delta_Wf2
        self.W_f3 -= step * self.delta_Wf3
        self.W_f4 -= step * self.delta_Wf4
        self.W_f5 -= step * self.delta_Wf5
        self.b1 -= step * self.delta_b1
        self.b2 -= step * self.delta_b2
        self.b3 -= step * self.delta_b3
        self.b4 -= step * self.delta_b4
        self.b5 -= step * self.delta_b5

Exemplo n.º 20

0

Exibir arquivo

 def forward(self, x):
     """Multiply ``x`` by the layer's weight matrix.

     The weight is the sole entry of ``self.params``. The input is kept
     on ``self.x`` and the product ``np.dot(x, weight)`` is returned.
     """
     (weight,) = self.params
     product = np.dot(x, weight)
     self.x = x
     return product

Exemplo n.º 21

0

Exibir arquivo

    def feedback_alignment(self, x, target, epoch, flag):
        """Run one training step using the feedback matrices ``B2``..``B5``.

        Forward: four tanh layers followed by a softmax output layer.
        Backward: the output error ``(output - target) / batch_size`` is
        sent down through ``self.B5`` .. ``self.B2`` rather than through
        the transposed forward weights. When ``flag`` is truthy, the
        weight gradients obtained through the transposed forward weights
        are also computed for layers 4..1 and compared with the feedback
        gradients via ``self.angle``; results go to ``self.angle_W4`` ..
        ``self.angle_W1``. Finally every weight and bias is decremented in
        place by ``self.learning_rate(epoch)`` times its feedback gradient.

        ``batch_size``, ``softmax`` and ``tanh_grad`` are taken from the
        enclosing module.
        """
        # ---- forward pass: z* are pre-activations, a* are tanh outputs
        z1 = cp.dot(x, self.W_f1) + self.b1
        a1 = cp.tanh(z1)
        z2 = cp.dot(a1, self.W_f2) + self.b2
        a2 = cp.tanh(z2)
        z3 = cp.dot(a2, self.W_f3) + self.b3
        a3 = cp.tanh(z3)
        z4 = cp.dot(a3, self.W_f4) + self.b4
        a4 = cp.tanh(z4)
        z5 = cp.dot(a4, self.W_f5) + self.b5
        probs = softmax(z5)

        # ---- backward pass through the feedback matrices
        fa_err5 = (probs - target) / batch_size
        # Dotting with a ones vector sums the error over the batch axis.
        ones = cp.ones(batch_size)
        grad_W5 = cp.dot(a4.T, fa_err5)
        grad_b5 = cp.dot(ones, fa_err5)

        fa_err4 = tanh_grad(z4) * cp.dot(fa_err5, self.B5)
        grad_W4 = cp.dot(a3.T, fa_err4)
        grad_b4 = cp.dot(ones, fa_err4)

        fa_err3 = tanh_grad(z3) * cp.dot(fa_err4, self.B4)
        grad_W3 = cp.dot(a2.T, fa_err3)
        grad_b3 = cp.dot(ones, fa_err3)

        fa_err2 = tanh_grad(z2) * cp.dot(fa_err3, self.B3)
        grad_W2 = cp.dot(a1.T, fa_err2)
        grad_b2 = cp.dot(ones, fa_err2)

        fa_err1 = tanh_grad(z1) * cp.dot(fa_err2, self.B2)
        grad_W1 = cp.dot(x.T, fa_err1)
        grad_b1 = cp.dot(ones, fa_err1)

        if flag:
            # Reference gradients via the transposed forward weights, used
            # only to measure the angle to the feedback gradients.
            bp_err5 = (probs - target) / batch_size

            bp_err4 = tanh_grad(z4) * cp.dot(bp_err5, self.W_f5.T)
            bp_grad_W4 = cp.dot(a3.T, bp_err4)
            self.angle_W4 = self.angle(grad_W4, bp_grad_W4)

            bp_err3 = tanh_grad(z3) * cp.dot(bp_err4, self.W_f4.T)
            bp_grad_W3 = cp.dot(a2.T, bp_err3)
            self.angle_W3 = self.angle(grad_W3, bp_grad_W3)

            bp_err2 = tanh_grad(z2) * cp.dot(bp_err3, self.W_f3.T)
            bp_grad_W2 = cp.dot(a1.T, bp_err2)
            self.angle_W2 = self.angle(grad_W2, bp_grad_W2)

            bp_err1 = tanh_grad(z1) * cp.dot(bp_err2, self.W_f2.T)
            bp_grad_W1 = cp.dot(x.T, bp_err1)
            self.angle_W1 = self.angle(grad_W1, bp_grad_W1)

        # ---- parameter update with the feedback gradients
        rate = self.learning_rate(epoch)
        self.W_f1 -= rate * grad_W1
        self.W_f2 -= rate * grad_W2
        self.W_f3 -= rate * grad_W3
        self.W_f4 -= rate * grad_W4
        self.W_f5 -= rate * grad_W5
        self.b1 -= rate * grad_b1
        self.b2 -= rate * grad_b2
        self.b3 -= rate * grad_b3
        self.b4 -= rate * grad_b4
        self.b5 -= rate * grad_b5

Exemplo n.º 22

0

Exibir arquivo

def get_w_cp(x, t):
    """Solve linear least squares via the normal equations.

    Computes ``w = (x^T x)^-1 x^T t``.

    Args:
        x: design matrix, one sample per row.
        t: target values, one entry (or row) per sample.

    Returns:
        The weight array ``w`` minimising ``||x w - t||^2``. ``x^T x``
        must be invertible; ``cp.linalg.inv`` is applied to it directly.
    """
    xx = cp.dot(x.T, x)
    xx_inv = cp.linalg.inv(xx)
    # The right-hand side of the normal equations is x^T t. The previous
    # ``x > t`` was an element-wise comparison, not a transpose.
    xt = cp.dot(x.T, t)
    w = cp.dot(xx_inv, xt)
    return w

Exemplo n.º 23

0

Exibir arquivo

Arquivo: feedback_alignment.py Projeto: tripdancer0916/random_feedback

 def predict(self, x):
     """Return the softmax output of the two-layer network for ``x``.

     ``x`` is multiplied by ``self.W_f1``, passed through ``relu``,
     multiplied by ``self.W_f2`` and finally passed through ``softmax``.
     No bias terms are applied.
     """
     hidden = relu(cp.dot(x, self.W_f1))
     logits = cp.dot(hidden, self.W_f2)
     return softmax(logits)

Exemplo n.º 24

0

Exibir arquivo

def least_square_regular(X, Y):
    """Closed-form regularised least squares.

    Returns ``(X^T X + lambd * I)^-1 X^T Y`` where ``I`` is the ``M`` x ``M``
    identity. ``lambd`` and ``M`` are read from the enclosing module.
    """
    gram = cp.dot(X.T, X)
    penalty = lambd * cp.eye(M, k=0)
    inverse = cp.linalg.inv(cp.add(gram, penalty))
    projector = cp.dot(inverse, X.T)
    return cp.dot(projector, Y)

Exemplo n.º 25

0

Exibir arquivo

Arquivo: layer_cupy.py Projeto: l4zyf9x/convnet-numpy

    def backward(self, dy):
        """Back-propagate ``dy`` through the layer.

        Stores the bias gradient (``dy`` summed over axis 0) under
        ``self.grads['db']`` and the weight gradient (cached input
        transposed times ``dy``) under ``self.grads['dW']``, then returns
        ``dy`` times the transposed weight ``self.parameters['W']``.
        """
        grads = self.grads
        grads['db'] = np.sum(dy, axis=0)
        grads['dW'] = np.dot(self.cache['x'].T, dy)
        return np.dot(dy, self.parameters['W'].T)

Exemplo n.º 26

0

Exibir arquivo

def gradient_function(x, y, w):
    """Evaluate ``(1 / M) * (x^T x w - x^T y + 0.001 * w)``.

    ``M`` is read from the enclosing module.
    """
    scale = 1.0 / M
    quadratic = cp.dot(cp.dot(x.T, x), w)
    linear = -1 * cp.dot(x.T, y)
    data_term = cp.add(quadratic, linear)
    return scale * cp.add(data_term, 0.001 * w)

Exemplo n.º 27

0

Exibir arquivo

def lstsq(a, b, rcond=1e-15):
    """Return the least-squares solution to a linear matrix equation.

    Solves the equation `a x = b` by computing a vector `x` that
    minimizes the Euclidean 2-norm `|| b - a x ||^2`.  The equation may
    be under-, well-, or over- determined (i.e., the number of
    linearly independent rows of `a` can be less than, equal to, or
    greater than its number of linearly independent columns).  If `a`
    is square and of full rank, then `x` (but for round-off error) is
    the "exact" solution of the equation.

    The solution is built from the reduced SVD of ``a``: the reciprocals
    of the retained singular values are applied to ``u^T b`` and the
    result is mapped back through ``vt^T``.

    Args:
        a (cupy.ndarray): "Coefficient" matrix with dimension ``(M, N)``
        b (cupy.ndarray): "Dependent variable" values with dimension ``(M,)``
            or ``(M, K)``
        rcond (float): Cutoff parameter for small singular values.
            With ``s`` the largest singular value, every singular value
            smaller than or equal to ``rcond * s`` is treated as zero
            (its reciprocal is set to zero).

    Returns:
        tuple:
            A tuple of ``(x, residuals, rank, s)``. Note ``x`` is the
            least-squares solution with shape ``(N,)`` or ``(N, K)`` depending
            if ``b`` was two-dimensional. The sums of ``residuals`` is the
            squared Euclidean 2-norm for each column in b - a*x. The
            ``residuals`` is an empty array if the rank of a is < N or M <= N,
            but  iff b is 1-dimensional, this is a (1,) shape array, Otherwise
            the shape is (K,). The ``rank`` of matrix ``a`` is an integer. The
            singular values of ``a`` are ``s``.

    Raises:
        linalg.LinAlgError: If ``b`` has more than two dimensions, or if
            the first dimension of ``b`` differs from the row count of
            ``a``.

    .. warning::
        This function calls one or more cuSOLVER routine(s) which may yield
        invalid results if input conditions are not met.
        To detect these invalid results, you can set the `linalg`
        configuration to a value that is not `ignore` in
        :func:`cupyx.errstate` or :func:`cupyx.seterr`.

    .. seealso:: :func:`numpy.linalg.lstsq`
    """
    util._assert_cupy_array(a, b)
    util._assert_rank2(a)
    if b.ndim > 2:
        raise linalg.LinAlgError('{}-dimensional array given. Array must be at'
                                 ' most two-dimensional'.format(b.ndim))
    m, n = a.shape[-2:]
    m2 = b.shape[0]
    if m != m2:
        raise linalg.LinAlgError('Incompatible dimensions')

    u, s, vt = cupy.linalg.svd(a, full_matrices=False)
    # number of singular values and matrix rank
    cutoff = rcond * s.max()
    # Reciprocals are taken for every singular value first; entries at or
    # below the cutoff are then overwritten with zero.
    s1 = 1 / s
    sing_vals = s <= cutoff
    s1[sing_vals] = 0
    rank = s.size - sing_vals.sum()

    if b.ndim == 2:
        # One copy of the reciprocal column per right-hand side so that it
        # multiplies u^T b element-wise.
        s1 = cupy.repeat(s1.reshape(-1, 1), b.shape[1], axis=1)
    # Solve the least-squares solution
    z = core.dot(u.transpose(), b) * s1
    x = core.dot(vt.transpose(), z)
    # Calculate squared Euclidean 2-norm for each column in b - a*x
    if rank != n or m <= n:
        # Residuals are only reported for full-column-rank systems with
        # more rows than columns.
        resids = cupy.array([], dtype=a.dtype)
    elif b.ndim == 2:
        e = b - core.dot(a, x)
        resids = cupy.sum(cupy.square(e), axis=0)
    else:
        e = b - cupy.dot(a, x)
        resids = cupy.dot(e.T, e).reshape(-1)
    return x, resids, rank, s

Exemplo n.º 28

0

Exibir arquivo

def loss(W, X, Y):
    y_hat = g(cp.dot(W.t, X))
    return -(cp.dot(Y.T, cp.log(y_hat)) + cp.dot((1 - Y).T, cp.log(1 - y_hat)))

Exemplo n.º 29

0

Exibir arquivo

Arquivo: svd.py Projeto: VChristiaens/VIP

def svd_wrapper(matrix, mode, ncomp, debug, verbose, usv=False,
                random_state=None, to_numpy=True):
    """ Wrapper for different SVD libraries (CPU and GPU).

    Parameters
    ----------
    matrix : array_like, 2d
        2d input matrix.
    mode : {'lapack', 'arpack', 'eigen', 'randsvd', 'cupy', 'eigencupy',
            'randcupy', 'pytorch', 'eigenpytorch', 'randpytorch'}, str optional
        Switch for the SVD method/library to be used. ``lapack`` uses the LAPACK
        linear algebra library through Numpy and it is the most conventional way
        of computing the SVD (deterministic result computed on CPU). ``arpack``
        uses the ARPACK Fortran libraries accessible through Scipy (computation
        on CPU). ``eigen`` computes the singular vectors through the
        eigendecomposition of the covariance M.M' (computation on CPU).
        ``randsvd`` uses the randomized_svd algorithm implemented in Sklearn
        (computation on CPU). ``cupy`` uses the Cupy library for GPU computation
        of the SVD as in the LAPACK version. ``eigencupy`` offers the same
        method as with the ``eigen`` option but on GPU (through Cupy).
        ``randcupy`` is an adaptation of the randomized_svd algorithm, where all
        the computations are done on a GPU (through Cupy). ``pytorch`` uses the
        Pytorch library for GPU computation of the SVD. ``eigenpytorch`` offers
        the same method as with the ``eigen`` option but on GPU (through
        Pytorch). ``randpytorch`` is an adaptation of the randomized_svd
        algorithm, where all the linear algebra computations are done on a GPU
        (through Pytorch).
    ncomp : int
        Number of singular vectors to be obtained. In the cases when the full
        SVD is computed (LAPACK, ARPACK, EIGEN, CUPY), the matrix of singular
        vectors is truncated.
    debug : bool
        If True the explained variance ratio is computed and displayed.
        The ``cupy`` and ``pytorch`` branches do not consult this flag.
    verbose: bool
        If True intermediate information is printed out.
    usv : bool optional
        If True the 3 terms of the SVD factorization are returned.
    random_state : int, RandomState instance or None, optional
        If int, random_state is the seed used by the random number generator.
        If RandomState instance, random_state is the random number generator.
        If None, the random number generator is the RandomState instance used
        by np.random. Used for ``randsvd`` mode.
    to_numpy : bool, optional
        If True (by default) the arrays computed in GPU are transferred from
        VRAM and converted to numpy ndarrays.

    Returns
    -------
    V : array_like
        The right singular vectors of the input matrix. If ``usv`` is True it
        returns the left and right singular vectors and the singular values of
        the input matrix.

    Raises
    ------
    TypeError
        If ``matrix`` is not 2-dimensional.
    ValueError
        If ``usv`` is True with a mode other than lapack, arpack, randsvd,
        cupy, randcupy, pytorch or randpytorch, or if ``mode`` is not one of
        the recognised values.
    RuntimeError
        If ``ncomp`` exceeds the smaller dimension of ``matrix``, or if a
        Cupy/Pytorch mode is requested while the module-level ``no_cupy`` /
        ``no_torch`` flag is set.

    References
    ----------
    * For ``lapack`` SVD mode see:
        https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.svd.html
        http://www.netlib.org/lapack/
    * For ``eigen`` mode see:
        https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.eigh.html
    * For ``arpack`` SVD mode see:
        https://docs.scipy.org/doc/scipy-0.19.1/reference/generated/scipy.sparse.linalg.svds.html
        http://www.caam.rice.edu/software/ARPACK/
    * For ``randsvd`` SVD mode see:
        https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/extmath.py
        Finding structure with randomness: Stochastic algorithms for constructing
        approximate matrix decompositions
        Halko, et al., 2009 http://arxiv.org/abs/arXiv:0909.4061
    * For ``cupy`` SVD mode see:
        https://docs-cupy.chainer.org/en/stable/reference/generated/cupy.linalg.svd.html
    * For ``eigencupy`` mode see:
        https://docs-cupy.chainer.org/en/master/reference/generated/cupy.linalg.eigh.html
    * For ``pytorch`` SVD mode see:
        http://pytorch.org/docs/master/torch.html#torch.svd
    * For ``eigenpytorch`` mode see:
        http://pytorch.org/docs/master/torch.html#torch.eig

    """

    def reconstruction(ncomp, U, S, V, var=1):
        # Debug helper: prints reconstruction errors (where U and V are
        # used) and plots the individual and cumulative explained variance
        # ratio. It reads ``mode`` and ``matrix`` from the enclosing scope.
        if mode == 'lapack':
            rec_matrix = np.dot(U[:, :ncomp],
                                np.dot(np.diag(S[:ncomp]), V[:ncomp]))
            # The lapack branch decomposes matrix.T, so transpose back
            # before comparing with ``matrix``.
            rec_matrix = rec_matrix.T
            print('  Matrix reconstruction with {} PCs:'.format(ncomp))
            print('  Mean Absolute Error =', MAE(matrix, rec_matrix))
            print('  Mean Squared Error =', MSE(matrix, rec_matrix))

            # see https://github.com/scikit-learn/scikit-learn/blob/c3980bcbabd9d2527548820581725df2904e4a0d/sklearn/decomposition/pca.py
            exp_var = (S ** 2) / (S.shape[0] - 1)
            full_var = np.sum(exp_var)
            explained_variance_ratio = exp_var / full_var   # % of variance explained by each PC
            ratio_cumsum = np.cumsum(explained_variance_ratio)
        elif mode == 'eigen':
            # U and V are not used here; only the singular values matter.
            exp_var = (S ** 2) / (S.shape[0] - 1)
            full_var = np.sum(exp_var)
            explained_variance_ratio = exp_var / full_var   # % of variance explained by each PC
            ratio_cumsum = np.cumsum(explained_variance_ratio)
        else:
            # NOTE(review): the eigencupy and eigenpytorch branches call this
            # helper with U=None and V=None, which would reach the np.dot
            # below with None operands -- confirm whether those debug paths
            # are expected to work.
            rec_matrix = np.dot(U, np.dot(np.diag(S), V))
            print('  Matrix reconstruction MAE =', MAE(matrix, rec_matrix))
            exp_var = (S ** 2) / (S.shape[0] - 1)
            full_var = np.var(matrix, axis=0).sum()
            explained_variance_ratio = exp_var / full_var   # % of variance explained by each PC
            if var == 1:
                pass
            else:
                # Callers pass var=-1 for arpack; the ratios are reversed
                # before accumulation in that case.
                explained_variance_ratio = explained_variance_ratio[::-1]
            ratio_cumsum = np.cumsum(explained_variance_ratio)
            msg = '  This info makes sense when the matrix is mean centered '
            msg += '(temp-mean scaling)'
            print(msg)

        # Left panel: EVR over all components; right panel: first ``trunc``.
        lw = 2; alpha = 0.4
        fig = plt.figure(figsize=vip_figsize)
        fig.subplots_adjust(wspace=0.4)
        ax1 = plt.subplot2grid((1, 3), (0, 0), colspan=2)
        ax1.step(range(explained_variance_ratio.shape[0]),
                 explained_variance_ratio, alpha=alpha, where='mid',
                 label='Individual EVR', lw=lw)
        ax1.plot(ratio_cumsum, '.-', alpha=alpha,
                 label='Cumulative EVR', lw=lw)
        ax1.legend(loc='best', frameon=False, fontsize='medium')
        ax1.set_ylabel('Explained variance ratio (EVR)')
        ax1.set_xlabel('Principal components')
        ax1.grid(linestyle='solid', alpha=0.2)
        ax1.set_xlim(-10, explained_variance_ratio.shape[0] + 10)
        ax1.set_ylim(0, 1)

        trunc = 20
        ax2 = plt.subplot2grid((1, 3), (0, 2), colspan=1)
        # plt.setp(ax2.get_yticklabels(), visible=False)
        ax2.step(range(trunc), explained_variance_ratio[:trunc], alpha=alpha,
                 where='mid', lw=lw)
        ax2.plot(ratio_cumsum[:trunc], '.-', alpha=alpha, lw=lw)
        ax2.set_xlabel('Principal components')
        ax2.grid(linestyle='solid', alpha=0.2)
        ax2.set_xlim(-2, trunc + 2)
        ax2.set_ylim(0, 1)

        msg = '  Cumulative explained variance ratio for {} PCs = {:.5f}'
        # plt.savefig('figure.pdf', dpi=300, bbox_inches='tight')
        print(msg.format(ncomp, ratio_cumsum[ncomp - 1]))

    # --------------------------------------------------------------------------

    # Input validation happens before any backend is touched.
    if matrix.ndim != 2:
        raise TypeError('Input matrix is not a 2d array')

    if usv:
        # The eigen-decomposition modes only produce V, so they cannot
        # return the full factorization.
        if mode not in ('lapack', 'arpack', 'randsvd', 'cupy', 'randcupy',
                        'pytorch', 'randpytorch'):
            msg = "Returning USV is supported with modes lapack, arpack, "
            msg += "randsvd, cupy, randcupy, pytorch or randpytorch"
            raise ValueError(msg)

    if ncomp > min(matrix.shape[0], matrix.shape[1]):
        msg = '{} PCs cannot be obtained from a matrix with size [{},{}].'
        msg += ' Increase the size of the patches or request less PCs'
        raise RuntimeError(msg.format(ncomp, matrix.shape[0], matrix.shape[1]))

    if mode == 'eigen':
        # building C as np.dot(matrix.T,matrix) is slower and takes more memory
        C = np.dot(matrix, matrix.T)        # covariance matrix
        e, EV = linalg.eigh(C)              # EVals and EVs
        pc = np.dot(EV.T, matrix)           # PCs using a compact trick when cov is MM'
        V = pc[::-1]                        # reverse since we need the last EVs
        S = np.sqrt(np.abs(e))              # SVals = sqrt(EVals)
        S = S[::-1]                         # reverse since EVals go in increasing order
        if debug:
            reconstruction(ncomp, None, S, None)
        for i in range(V.shape[1]):
            V[:, i] /= S                    # scaling EVs by the square root of EVals
        V = V[:ncomp]
        if verbose:
            print('Done PCA with numpy linalg eigh functions')

    elif mode == 'lapack':
        # n_frames is usually smaller than n_pixels. In this setting taking the SVD of M'
        # and keeping the left (transposed) SVs is faster than taking the SVD of M (right SVs)
        U, S, V = linalg.svd(matrix.T, full_matrices=False)
        if debug:
            reconstruction(ncomp, U, S, V)
        V = V[:ncomp]                       # we cut projection matrix according to the # of PCs
        U = U[:, :ncomp]
        S = S[:ncomp]
        if verbose:
            print('Done SVD/PCA with numpy SVD (LAPACK)')

    elif mode == 'arpack':
        U, S, V = svds(matrix, k=ncomp)
        if debug:
            reconstruction(ncomp, U, S, V, -1)
        if verbose:
            print('Done SVD/PCA with scipy sparse SVD (ARPACK)')

    elif mode == 'randsvd':
        U, S, V = randomized_svd(matrix, n_components=ncomp, n_iter=2,
                                 transpose='auto', random_state=random_state)
        if debug:
            reconstruction(ncomp, U, S, V)
        if verbose:
            print('Done SVD/PCA with randomized SVD')

    elif mode == 'cupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        a_gpu = cupy.array(matrix)
        a_gpu = cupy.asarray(a_gpu)  # move the data to the current device
        u_gpu, s_gpu, vh_gpu = cupy.linalg.svd(a_gpu, full_matrices=True,
                                               compute_uv=True)
        V = vh_gpu[:ncomp]
        if to_numpy:
            V = cupy.asnumpy(V)
        # S and U are only materialised when the caller asked for them.
        if usv:
            S = s_gpu[:ncomp]
            if to_numpy:
                S = cupy.asnumpy(S)
            U = u_gpu[:, :ncomp]
            if to_numpy:
                U = cupy.asnumpy(U)
        if verbose:
            print('Done SVD/PCA with cupy (GPU)')

    elif mode == 'randcupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='cupy')
        if to_numpy:
            V = cupy.asnumpy(V)
            S = cupy.asnumpy(S)
            U = cupy.asnumpy(U)
        if debug:
            reconstruction(ncomp, U, S, V)
        if verbose:
            print('Done randomized SVD/PCA with cupy (GPU)')

    elif mode == 'eigencupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        a_gpu = cupy.array(matrix)
        a_gpu = cupy.asarray(a_gpu)         # move the data to the current device
        C = cupy.dot(a_gpu, a_gpu.T)        # covariance matrix
        e, EV = cupy.linalg.eigh(C)         # eigenvalues and eigenvectors
        pc = cupy.dot(EV.T, a_gpu)          # PCs using a compact trick when cov is MM'
        V = pc[::-1]                        # reverse since last eigenvectors are the ones we want
        S = cupy.sqrt(e)[::-1]              # reverse since eigenvalues are in increasing order
        if debug:
            reconstruction(ncomp, None, S, None)
        for i in range(V.shape[1]):
            V[:, i] /= S                    # scaling by the square root of eigenvalues
        V = V[:ncomp]
        if to_numpy:
            V = cupy.asnumpy(V)
        if verbose:
            print('Done PCA with cupy eigh function (GPU)')

    elif mode == 'pytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        # As in the lapack branch, the transposed matrix is decomposed.
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32').T))
        u_gpu, s_gpu, vh_gpu = torch.svd(a_gpu)
        V = vh_gpu[:ncomp]
        S = s_gpu[:ncomp]
        U = torch.transpose(u_gpu, 0, 1)[:ncomp]
        if to_numpy:
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if verbose:
            print('Done SVD/PCA with pytorch (GPU)')

    elif mode == 'eigenpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32')))
        C = torch.mm(a_gpu, torch.transpose(a_gpu, 0, 1))
        e, EV = torch.eig(C, eigenvectors=True)
        V = torch.mm(torch.transpose(EV, 0, 1), a_gpu)
        S = torch.sqrt(e[:, 0])
        if debug:
            reconstruction(ncomp, None, S, None)
        for i in range(V.shape[1]):
            V[:, i] /= S
        V = V[:ncomp]
        if to_numpy:
            V = np.array(V)
        if verbose:
            print('Done PCA with pytorch eig function')

    elif mode == 'randpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='pytorch')
        if to_numpy:
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if debug:
            reconstruction(ncomp, U, S, V)
        if verbose:
            print('Done randomized SVD/PCA with randomized pytorch (GPU)')

    else:
        raise ValueError('The SVD mode is not available')

    # lapack and pytorch decomposed the transposed matrix, so their factors
    # are swapped and transposed on the way out.
    if usv:
        if mode == 'lapack':
            return V.T, S, U.T
        elif mode == 'pytorch':
            if to_numpy:
                return V.T, S, U.T
            else:
                return torch.transpose(V, 0, 1), S, torch.transpose(U, 0, 1)
        else:
            return U, S, V
    else:
        if mode == 'lapack':
            return U.T
        elif mode == 'pytorch':
            return U
        else:
            return V

Exemplo n.º 30

0

Exibir arquivo

Arquivo: learn.py Projeto: rajatsaxena/pykilosort

def extractTemplatesfromSnippets(proc=None,
                                 probe=None,
                                 params=None,
                                 Nbatch=None,
                                 nPCs=None):
    """Extract template prototypes and PC waveforms from spike snippets.

    Walks over every ``params.nskip``-th batch of ``proc``, collects the
    snippets returned by ``get_SpikeSample`` for the peaks found by
    ``isolated_peaks_new``, and stops collecting once more than 1e5
    snippets have been gathered. The snippets are then clustered with
    10 assignment/update iterations starting from ``nPCs`` randomly
    chosen snippets, and decomposed with ``svdecon``.

    Args:
        proc: preprocessed data; only ``proc.flat`` is read.
            (presumably a memory-mapped array -- confirm with caller)
        probe: object providing ``Nchan``.
        params: object providing ``NT``, ``nskip``, ``nPCs``, ``nt0min``
            and ``scaleproc``; it is also forwarded to the helper calls.
        Nbatch: number of batches available in ``proc``.
        nPCs: number of templates/components; falls back to
            ``params.nPCs`` when falsy.

    Returns:
        tuple: ``(wTEMP, wPCA)`` -- the column-normalised cluster
        prototypes and the first ``nPCs`` columns of ``U`` from
        ``svdecon``, with the sign of the first column adjusted.
    """
    # this function is very similar to extractPCfromSnippets.
    # outputs not just the PC waveforms, but also the template "prototype",
    # basically k-means clustering of 1D waveforms.

    NT = params.NT
    # skip every this many batches
    nskip = params.nskip
    nPCs = nPCs or params.nPCs
    nt0min = params.nt0min
    Nchan = probe.Nchan
    # Start index of each batch, in units of NT
    batchstart = np.arange(0, NT * Nbatch + 1, NT).astype(np.int64)

    k = 0
    # preallocate matrix to hold 1D spike snippets
    # dd = cp.zeros((params.nt0, int(5e4)), dtype=np.float32, order='F')
    dds = []

    for ibatch in tqdm(range(0, Nbatch, nskip), desc="Extracting templates"):
        # Flat offset of this batch, then a Fortran-order reshape to
        # (-1, Nchan).
        offset = Nchan * batchstart[ibatch]
        dat = proc.flat[offset:offset + NT * Nchan].reshape((-1, Nchan),
                                                            order="F")

        # move data to GPU and scale it back to unit variance
        dataRAW = cp.asarray(dat, dtype=np.float32) / params.scaleproc

        # find isolated spikes from each batch
        row, col, mu = isolated_peaks_new(dataRAW, params)

        # for each peak, get the voltage snippet from that channel
        c = get_SpikeSample(dataRAW, row, col, params)

        # if k + c.shape[1] > dd.shape[1]:
        #     dd = cp.pad(dd, (0, dd.shape[1]), mode='constant')

        # dd[:, k:k + c.shape[1]] = c
        dds.append(c)
        k = k + c.shape[1]
        # Stop once more than 1e5 snippets have been gathered.
        if k > 1e5:
            break

    # discard empty samples
    # dd = dd[:, :k]
    dd = cp.asfortranarray(cp.concatenate(dds, axis=1).astype(np.float32))

    # initialize the template clustering with random waveforms
    uu = np.random.permutation(dd.shape[1])[:nPCs]
    wTEMP = dd[:, uu]
    wTEMP = wTEMP / cp.sum(wTEMP**2, axis=0)**0.5  # normalize them

    for i in range(10):
        # at each iteration, assign the waveform to its most correlated cluster
        cc = cp.dot(wTEMP.T, dd)
        imax = cp.argmax(cc, axis=0)
        # Value of cc at the best cluster for each snippet.
        amax = cc[imax, np.arange(cc.shape[1])]
        for j in range(nPCs):
            # weighted average to get new cluster means
            wTEMP[:, j] = cp.dot(dd[:, imax == j], amax[imax == j].T)
        wTEMP = wTEMP / cp.sum(wTEMP**2, axis=0)**0.5  # unit normalize

    # the PCs are just the left singular vectors of the waveforms
    U, Sv, V = svdecon(dd)

    # take as many as needed
    wPCA = U[:, :nPCs]

    # adjust the arbitrary sign of the first PC so its negativity is downward
    wPCA[:, 0] = -wPCA[:, 0] * cp.sign(wPCA[nt0min, 0])

    return wTEMP, wPCA

Exemplo n.º 31

0

Exibir arquivo

Arquivo: p200_GRU_sine.py Projeto: tf-nightly/p200504

 def forward(self, x, y_prev):
     """Advance the recurrent cell by one step.

     Two sigmoid gates and one tanh candidate are computed from ``x`` and
     ``y_prev`` using ``self.w[0..2]`` and ``self.v[0..2]``. The second
     gate scales ``y_prev`` inside the candidate; the first gate blends
     ``y_prev`` with the candidate. The three activations are stacked
     onto ``self.gates`` and the blended output goes to ``self.y``.
     Nothing is returned.
     """
     blend_gate = sigmoid(np.dot(x, self.w[0]) + np.dot(y_prev, self.v[0]))
     scale_gate = sigmoid(np.dot(x, self.w[1]) + np.dot(y_prev, self.v[1]))
     gated_prev = scale_gate * y_prev
     candidate = np.tanh(np.dot(x, self.w[2]) + np.dot(gated_prev, self.v[2]))
     self.gates = np.stack((blend_gate, scale_gate, candidate))
     self.y = (1 - blend_gate) * y_prev + blend_gate * candidate

Exemplo n.º 32

0

Exibir arquivo

Arquivo: svd.py Projeto: carlgogo/VIP

def svd_wrapper(matrix, mode, ncomp, verbose, full_output=False,
                random_state=None, to_numpy=True):
    """ Wrapper for different SVD libraries (CPU and GPU).

    Parameters
    ----------
    matrix : numpy ndarray, 2d
        2d input matrix.
    mode : {'lapack', 'arpack', 'eigen', 'randsvd', 'cupy', 'eigencupy',
        'randcupy', 'pytorch', 'eigenpytorch', 'randpytorch'}, str optional
        Switch for the SVD method/library to be used.

        ``lapack``: uses the LAPACK linear algebra library through Numpy
        and it is the most conventional way of computing the SVD
        (deterministic result computed on CPU).

        ``arpack``: uses the ARPACK Fortran libraries accessible through
        Scipy (computation on CPU).

        ``eigen``: computes the singular vectors through the
        eigendecomposition of the covariance M.M' (computation on CPU).

        ``randsvd``: uses the randomized_svd algorithm implemented in
        Sklearn (computation on CPU).

        ``cupy``: uses the Cupy library for GPU computation of the SVD as in
        the LAPACK version.

        ``eigencupy``: offers the same method as with the ``eigen`` option
        but on GPU (through Cupy).

        ``randcupy``: is an adaptation of the randomized_svd algorithm,
        where all the computations are done on a GPU (through Cupy).

        ``pytorch``: uses the Pytorch library for GPU computation of the SVD.

        ``eigenpytorch``: offers the same method as with the ``eigen``
        option but on GPU (through Pytorch).

        ``randpytorch``: is an adaptation of the randomized_svd algorithm,
        where all the linear algebra computations are done on a GPU
        (through Pytorch).

    ncomp : int
        Number of singular vectors to be obtained. In the cases when the full
        SVD is computed (LAPACK, ARPACK, EIGEN, CUPY), the matrix of singular
        vectors is truncated.
    verbose: bool
        If True intermediate information is printed out.
    full_output : bool optional
        If True the 3 terms of the SVD factorization are returned. For the
        ``eigen``, ``eigencupy`` and ``eigenpytorch`` modes only S and V are
        returned.
    random_state : int, RandomState instance or None, optional
        If int, random_state is the seed used by the random number generator.
        If RandomState instance, random_state is the random number generator.
        If None, the random number generator is the RandomState instance used
        by np.random. Used for ``randsvd`` mode.
    to_numpy : bool, optional
        If True (by default) the arrays computed in GPU are transferred from
        VRAM and converted to numpy ndarrays.

    Returns
    -------
    V : numpy ndarray
        The right singular vectors of the input matrix. If ``full_output`` is
        True it returns the left and right singular vectors and the singular
        values of the input matrix. For the ``eigen``, ``eigencupy`` and
        ``eigenpytorch`` modes only S and V are returned.

    Raises
    ------
    TypeError
        If ``matrix`` is not 2-dimensional.
    RuntimeError
        If ``ncomp`` exceeds the smallest dimension of ``matrix``, or if a
        GPU mode is requested while the corresponding library is flagged as
        missing (``no_cupy`` / ``no_torch``).
    ValueError
        If ``mode`` is not one of the recognized options.

    References
    ----------
    * For ``lapack`` SVD mode see:
        https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.svd.html
        http://www.netlib.org/lapack/
    * For ``eigen`` mode see:
        https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.eigh.html
    * For ``arpack`` SVD mode see:
        https://docs.scipy.org/doc/scipy-0.19.1/reference/generated/scipy.sparse.linalg.svds.html
        http://www.caam.rice.edu/software/ARPACK/
    * For ``randsvd`` SVD mode see:
        https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/extmath.py
        Finding structure with randomness: Stochastic algorithms for constructing
        approximate matrix decompositions
        Halko, et al., 2009 http://arxiv.org/abs/arXiv:0909.4061
    * For ``cupy`` SVD mode see:
        https://docs-cupy.chainer.org/en/stable/reference/generated/cupy.linalg.svd.html
    * For ``eigencupy`` mode see:
        https://docs-cupy.chainer.org/en/master/reference/generated/cupy.linalg.eigh.html
    * For ``pytorch`` SVD mode see:
        http://pytorch.org/docs/master/torch.html#torch.svd
    * For ``eigenpytorch`` mode see:
        http://pytorch.org/docs/master/torch.html#torch.eig

    """
    if matrix.ndim != 2:
        raise TypeError('Input matrix is not a 2d array')

    if ncomp > min(matrix.shape[0], matrix.shape[1]):
        msg = '{} PCs cannot be obtained from a matrix with size [{},{}].'
        msg += ' Increase the size of the patches or request less PCs'
        raise RuntimeError(msg.format(ncomp, matrix.shape[0], matrix.shape[1]))

    if mode == 'eigen':
        # building C as np.dot(matrix.T,matrix) is slower and takes more memory
        C = np.dot(matrix, matrix.T)    # covariance matrix
        e, EV = linalg.eigh(C)          # EVals and EVs
        pc = np.dot(EV.T, matrix)       # PCs using a compact trick when cov is MM'
        V = pc[::-1]                    # reverse since we need the last EVs
        S = np.sqrt(np.abs(e))          # SVals = sqrt(EVals)
        S = S[::-1]                     # reverse since EVals go in increasing order
        # scale every row of V by its singular value in one broadcast
        # division instead of looping over each column in Python
        V /= S[:, np.newaxis]
        V = V[:ncomp]
        if verbose:
            print('Done PCA with numpy linalg eigh functions')

    elif mode == 'lapack':
        # n_frames is usually smaller than n_pixels. In this setting taking
        # the SVD of M' and keeping the left (transposed) SVs is faster than
        # taking the SVD of M (right SVs)
        U, S, V = linalg.svd(matrix.T, full_matrices=False)
        V = V[:ncomp]       # we cut projection matrix according to the # of PCs
        U = U[:, :ncomp]
        S = S[:ncomp]
        if verbose:
            print('Done SVD/PCA with numpy SVD (LAPACK)')

    elif mode == 'arpack':
        U, S, V = svds(matrix, k=ncomp)
        if verbose:
            print('Done SVD/PCA with scipy sparse SVD (ARPACK)')

    elif mode == 'randsvd':
        U, S, V = randomized_svd(matrix, n_components=ncomp, n_iter=2,
                                 transpose='auto', random_state=random_state)
        if verbose:
            print('Done SVD/PCA with randomized SVD')

    elif mode == 'cupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        a_gpu = cupy.array(matrix)
        a_gpu = cupy.asarray(a_gpu)  # move the data to the current device
        u_gpu, s_gpu, vh_gpu = cupy.linalg.svd(a_gpu, full_matrices=True,
                                               compute_uv=True)
        V = vh_gpu[:ncomp]
        if to_numpy:
            V = cupy.asnumpy(V)
        if full_output:
            S = s_gpu[:ncomp]
            if to_numpy:
                S = cupy.asnumpy(S)
            U = u_gpu[:, :ncomp]
            if to_numpy:
                U = cupy.asnumpy(U)
        if verbose:
            print('Done SVD/PCA with cupy (GPU)')

    elif mode == 'randcupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='cupy')
        if to_numpy:
            V = cupy.asnumpy(V)
            S = cupy.asnumpy(S)
            U = cupy.asnumpy(U)
        if verbose:
            print('Done randomized SVD/PCA with cupy (GPU)')

    elif mode == 'eigencupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        a_gpu = cupy.array(matrix)
        a_gpu = cupy.asarray(a_gpu)     # move the data to the current device
        C = cupy.dot(a_gpu, a_gpu.T)    # covariance matrix
        e, EV = cupy.linalg.eigh(C)     # eigenvalues and eigenvectors
        pc = cupy.dot(EV.T, a_gpu)      # using a compact trick when cov is MM'
        V = pc[::-1]                    # reverse to get last eigenvectors
        # abs() mirrors the CPU ``eigen`` branch: round-off can make the
        # smallest eigenvalues slightly negative, which would yield NaNs
        S = cupy.sqrt(cupy.abs(e))[::-1]    # EVals go in increasing order
        V /= S[:, None]                 # scaling by the square root of eigvals
        V = V[:ncomp]
        if to_numpy:
            V = cupy.asnumpy(V)
        if verbose:
            print('Done PCA with cupy eigh function (GPU)')

    elif mode == 'pytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32').T))
        u_gpu, s_gpu, vh_gpu = torch.svd(a_gpu)
        V = vh_gpu[:ncomp]
        S = s_gpu[:ncomp]
        U = torch.transpose(u_gpu, 0, 1)[:ncomp]
        if to_numpy:
            # the tensors live on the CUDA device; they must be copied to
            # host memory before numpy can wrap them
            V = np.array(V.cpu())
            S = np.array(S.cpu())
            U = np.array(U.cpu())
        if verbose:
            print('Done SVD/PCA with pytorch (GPU)')

    elif mode == 'eigenpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32')))
        C = torch.mm(a_gpu, torch.transpose(a_gpu, 0, 1))
        e, EV = torch.eig(C, eigenvectors=True)
        V = torch.mm(torch.transpose(EV, 0, 1), a_gpu)
        S = torch.sqrt(e[:, 0])
        V /= S[:, None]     # scale each row by its singular value
        V = V[:ncomp]
        if to_numpy:
            # copy from the CUDA device to host memory before converting
            V = np.array(V.cpu())
        if verbose:
            print('Done PCA with pytorch eig function')

    elif mode == 'randpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='pytorch')
        if to_numpy:
            # NOTE(review): if randomized_svd_gpu returns CUDA tensors these
            # conversions also need a ``.cpu()`` first -- confirm against
            # that helper's return type.
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if verbose:
            print('Done randomized SVD/PCA with randomized pytorch (GPU)')

    else:
        raise ValueError('The SVD `mode` is not recognized')

    if full_output:
        if mode == 'lapack':
            return V.T, S, U.T
        elif mode == 'pytorch':
            if to_numpy:
                return V.T, S, U.T
            else:
                return torch.transpose(V, 0, 1), S, torch.transpose(U, 0, 1)
        elif mode in ('eigen', 'eigencupy', 'eigenpytorch'):
            return S, V
        else:
            return U, S, V
    else:
        if mode == 'lapack':
            return U.T
        elif mode == 'pytorch':
            return U
        else:
            return V

Exemplo n.º 33

0

Exibir arquivo

Arquivo: learn.py Projeto: rajatsaxena/pykilosort

def learnAndSolve8b(ctx, sanity_plots=False, plot_widgets=None, plot_pos=None):
    """This is the main optimization. Takes the longest time and uses the GPU heavily.

    Batches are visited following the schedule built in ``irounds``. During
    the first ``niter - nBatches`` iterations (the learning phase) the
    templates are updated, periodically triaged and extended with templates
    newly detected in the residual. On the remaining ``nBatches`` iterations
    (the final pass) triaging and template addition are disabled, and the
    spikes and features found in every batch are recorded.

    Parameters
    ----------
    ctx : context object
        Must expose ``params``, ``probe``, ``intermediate`` and ``path(...)``.
        ``ctx.intermediate`` is read (``Nbatch``, ``proc``, ``iorig``) and
        written to (``W``, ``dWU``, ``U``, ``mu``, ``Wraw``, ``WA``, ``UA``,
        ``muA``, ``st3``, ``simScore``, ``iNeigh``, ``iNeighPC``).
    sanity_plots : bool, optional
        If True, ``plot_diagnostics`` is called every 100 iterations;
        ``plot_widgets`` must then not be None.
    plot_widgets : optional
        Container indexed with ``plot_pos``; the selected item is passed to
        ``plot_diagnostics``.
    plot_pos : optional
        Index into ``plot_widgets``.

    Returns
    -------
    Bunch
        With fields ``wPCA``, ``wTEMP``, ``st3``, ``simScore``, ``iNeigh``,
        ``iNeighPC``, ``WA``, ``UA``, ``W``, ``U``, ``dWU``, ``mu``, ``W_a``,
        ``W_b``, ``U_a`` and ``U_b``.

    Notes
    -----
    Per-spike template and PC features are streamed to the files returned by
    ``ctx.path("fW", ext=".dat")`` and ``ctx.path("fWpc", ext=".dat")``
    through ``LargeArrayWriter``; they are not part of the returned Bunch.
    """

    Nbatch = ctx.intermediate.Nbatch
    params = ctx.params
    probe = ctx.probe
    ir = ctx.intermediate
    proc = ir.proc

    iorig = ir.iorig

    # TODO: move_to_config
    NrankPC = 6  # this one is the rank of the PCs, used to detect spikes with threshold crossings
    Nrank = 3  # this one is the rank of the templates

    wTEMP, wPCA = extractTemplatesfromSnippets(proc=proc,
                                               probe=probe,
                                               params=params,
                                               Nbatch=Nbatch,
                                               nPCs=NrankPC)

    # move these to the GPU
    wPCA = cp.asarray(wPCA[:, :Nrank], dtype=np.float32, order="F")
    wTEMP = cp.asarray(wTEMP, dtype=np.float32, order="F")
    wPCAd = cp.asarray(wPCA, dtype=np.float64,
                       order="F")  # convert to double for extra precision

    nt0 = params.nt0
    nt0min = params.nt0min
    nBatches = Nbatch
    NT = params.NT
    Nfilt = params.Nfilt
    Nchan = probe.Nchan

    # two variables for the same thing? number of nearest channels to each primary channel
    # TODO: unclear - let's fix this
    NchanNear = min(probe.Nchan, 32)
    Nnearest = min(probe.Nchan, 32)

    # decay of gaussian spatial mask centered on a channel
    sigmaMask = params.sigmaMask

    # sample index at which each batch starts (multiples of NT)
    batchstart = list(range(0, NT * nBatches + 1, NT))

    # find the closest NchanNear channels, and the masks for those channels
    iC, mask, C2C = getClosestChannels(probe, sigmaMask, NchanNear)

    # sorting order for the batches
    isortbatches = iorig
    nhalf = int(ceil(nBatches / 2)) - 1  # halfway point

    # this batch order schedule goes through half of the data forward and backward during the model
    # fitting and then goes through the data symmetrically-out from the center during the final
    # pass
    ischedule = np.concatenate(
        (np.arange(nhalf, nBatches), np.arange(nBatches - 1, nhalf - 1, -1)))
    i1 = np.arange(nhalf - 1, -1, -1)
    i2 = np.arange(nhalf, nBatches)

    irounds = np.concatenate((ischedule, i1, i2))

    niter = irounds.size
    if irounds[niter - nBatches - 1] != nhalf:
        # this check is in here in case I do something weird when I try different schedules
        raise ValueError("Mismatch between number of batches")

    # these two flags are used to keep track of what stage of model fitting we're at
    # flag_final = 0
    flag_resort = 1

    # this is the absolute temporal offset in seconds corresponding to the start of the
    # spike sorted time segment
    t0 = 0  # ceil(params.trange(1) * ops.fs)

    nInnerIter = 60  # this is for SVD for the power iteration

    # schedule of learning rates for the model fitting part
    # starts small and goes high, it corresponds approximately to the number of spikes
    # from the past that were averaged to give rise to the current template
    pmi = cp.exp(
        -1.0 /
        cp.linspace(params.momentum[0], params.momentum[1], niter - nBatches))

    Nsum = min(
        Nchan,
        7)  # how many channels to extend out the waveform in mexgetspikes
    # lots of parameters passed into the CUDA scripts
    # (indices 1, 2, 8 and 12 are overwritten later in this function)
    Params = np.array(
        [
            NT,
            Nfilt,
            params.Th[0],
            nInnerIter,
            nt0,
            Nnearest,
            Nrank,
            params.lam,
            pmi[0],
            Nchan,
            NchanNear,
            params.nt0min,
            1,
            Nsum,
            NrankPC,
            params.Th[0],
        ],
        dtype=np.float64,
    )

    # W0 has to be ordered like this
    W0 = cp.transpose(
        cp.atleast_3d(cp.asarray(wPCA, dtype=np.float64, order="F")),
        [0, 2, 1])

    # initialize the list of channels each template lives on
    iList = cp.zeros((Nnearest, Nfilt), dtype=np.int32, order="F")

    # initialize average number of spikes per batch for each template
    nsp = cp.zeros((0, 1), dtype=np.float64, order="F")

    # this flag starts 0, is set to 1 later
    Params[12] = 0

    # kernels for subsample alignment
    Ka, Kb = getKernels(params)

    p1 = 0.95  # decay of nsp estimate in each batch

    ntot = 0
    # this keeps track of dropped templates for debugging purposes
    ndrop = np.zeros(2, dtype=np.float32, order="F")

    # this is the minimum firing rate that all templates must maintain, or be dropped
    m0 = params.minFR * params.NT / params.fs

    # allocate variables when switching to extraction phase
    # this holds spike times, clusters and other info per spike
    st3 = []  # cp.zeros((int(1e7), 5), dtype=np.float32, order='F')

    # these ones store features per spike
    # Nnearest is the number of nearest templates to store features for
    fW = LargeArrayWriter(ctx.path("fW", ext=".dat"),
                          dtype=np.float32,
                          shape=(Nnearest, -1))
    # NchanNear is the number of nearest channels to take PC features from
    fWpc = LargeArrayWriter(ctx.path("fWpc", ext=".dat"),
                            dtype=np.float32,
                            shape=(NchanNear, Nrank, -1))

    for ibatch in tqdm(range(niter), desc="Optimizing templates"):
        # korder is the index of the batch at this point in the schedule
        korder = int(irounds[ibatch])
        # k is the index of the batch in absolute terms
        k = int(isortbatches[korder])
        logger.debug("Batch %d/%d, %d templates.", ibatch, niter, Nfilt)

        if ibatch > niter - nBatches - 1 and korder == nhalf:
            # this is required to revert back to the template states in the middle of the
            # batches
            W, dWU = ir.W, ir.dWU
            logger.debug("Reverted back to middle timepoint.")

        if ibatch < niter - nBatches:
            # obtained pm for this batch
            Params[8] = float(pmi[ibatch])
            pm = pmi[ibatch] * ones((Nfilt, ), dtype=np.float64, order="F")

        # loading a single batch (same as everywhere)
        offset = Nchan * batchstart[k]
        dat = proc.flat[offset:offset + NT * Nchan].reshape((-1, Nchan),
                                                            order="F")
        dataRAW = cp.asarray(dat, dtype=np.float32) / params.scaleproc

        if ibatch == 0:
            # only on the first batch, we first get a new set of spikes from the residuals,
            # which in this case is the unmodified data because we start with no templates
            # CUDA function to get spatiotemporal clips from spike detections
            dWU, cmap = mexGetSpikes2(Params, dataRAW, wTEMP, iC)

            dWU = cp.asarray(dWU, dtype=np.float64, order="F")

            # project these into the wPCA waveforms
            dWU = cp.reshape(
                cp.dot(
                    wPCAd,
                    cp.dot(wPCAd.T, dWU.reshape((dWU.shape[0], -1),
                                                order="F"))),
                dWU.shape,
                order="F",
            )

            # initialize the low-rank decomposition with standard waves
            W = W0[:, cp.ones(dWU.shape[2], dtype=np.int32), :]
            Nfilt = W.shape[1]  # update the number of filters/templates
            # initialize the number of spikes for new templates with the minimum allowed value,
            # so it doesn't get thrown back out right away
            nsp = _extend(nsp, 0, Nfilt, m0)
            Params[1] = Nfilt  # update in the CUDA parameters

        if flag_resort:
            # this is a flag to resort the order of the templates according to best peak
            # channel
            # this is important in order to have cohesive memory requests from the GPU RAM
            # max channel (either positive or negative peak)
            iW = cp.argmax(cp.abs(dWU[nt0min - 1, :, :]), axis=0)
            # iW = int32(squeeze(iW))

            isort = cp.argsort(iW)  # sort by max abs channel
            iW = iW[isort]
            W = W[:,
                  isort, :]  # use ordering to resort all the other template variables
            dWU = dWU[:, :, isort]
            nsp = nsp[isort]

        # decompose dWU by svd of time and space (via covariance matrix of 61 by 61 samples)
        # this uses a "warm start" by remembering the W from the previous iteration
        W, U, mu = mexSVDsmall2(Params, dWU, W, iC, iW, Ka, Kb)

        # UtU is the gram matrix of the spatial components of the low-rank SVDs
        # it tells us which pairs of templates are likely to "interfere" with each other
        # such as when we subtract off a template
        # this needs to change (but I don't know why!)
        UtU, maskU = getMeUtU(iW, iC, mask, Nnearest, Nchan)

        # main CUDA function in the whole codebase. does the iterative template matching
        # based on the current templates, gets features for these templates if requested
        # (featW, featPC),
        # gets scores for the template fits to each spike (vexp), outputs the average of
        # waveforms assigned to each cluster (dWU0),
        # and probably a few more things I forget about
        st0, id0, x0, featW, dWU0, drez, nsp0, featPC, vexp = mexMPnu8(
            Params, dataRAW, U, W, mu, iC, iW, UtU, iList, wPCA, params)

        logger.debug("%d spikes.", x0.size)

        # Sometimes nsp can get transposed (think this has to do with it being
        # a single element in one iteration, to which elements are added
        # nsp, nsp0, and pm must all be row vectors (Nfilt x 1), so force nsp
        # to be a row vector.
        # nsp = cp.atleast_2d(nsp)
        # nsprow, nspcol = nsp.shape
        # if nsprow < nspcol:
        #     nsp = nsp.T
        nsp = nsp.squeeze()

        # updates the templates as a running average weighted by recency
        # since some clusters have different number of spikes, we need to apply the
        # exp(pm) factor several times, and fexp is the resulting update factor
        # for each template
        # NOTE(review): np.exp (NumPy) is applied here to the result of cp.log while the
        # surrounding math uses cp throughout -- confirm this mix is intended.
        fexp = np.exp(nsp0 * cp.log(pm[:Nfilt]))
        fexp = cp.reshape(fexp, (1, 1, -1), order="F")
        dWU = dWU * fexp + (1 - fexp) * (
            dWU0 / cp.reshape(cp.maximum(1, nsp0), (1, 1, -1), order="F"))

        # nsp just gets updated according to the fixed factor p1
        nsp = nsp * p1 + (1 - p1) * nsp0

        if ibatch == niter - nBatches - 1:
            # if we reached this point, we need to disable secondary template updates
            # like dropping, and adding new templates. We need to memorize the state of the
            # templates at this timepoint, and set the processing mode to "extraction and
            # tracking"

            flag_resort = 0  # no need to resort templates by channel any more
            # flag_final = 1  # this is the "final" pass

            # final clean up, triage templates one last time
            W, U, dWU, mu, nsp, ndrop = triageTemplates2(
                params, iW, C2C, W, U, dWU, mu, nsp, ndrop)

            # final number of templates
            Nfilt = W.shape[1]
            Params[1] = Nfilt

            # final covariance matrix between all templates
            WtW, iList = getMeWtW(W, U, Nnearest)

            # iW is the final channel assigned to each template
            iW = cp.argmax(cp.abs(dWU[nt0min - 1, :, :]), axis=0)

            # extract ALL features on the last pass
            Params[
                12] = 2  # this is a flag to output features (PC and template features)

            # different threshold on last pass?
            Params[2] = params.Th[
                -1]  # usually the threshold is much lower on the last pass

            # memorize the state of the templates
            logger.debug("Memorized middle timepoint.")
            ir.W, ir.dWU, ir.U, ir.mu = W, dWU, U, mu
            ir.Wraw = cp.zeros((U.shape[0], W.shape[0], U.shape[1]),
                               dtype=np.float64,
                               order="F")
            for n in range(U.shape[1]):
                # temporarily use U rather Urot until I have a chance to test it
                ir.Wraw[:, :, n] = mu[n] * cp.dot(U[:, n, :], W[:, n, :].T)

        if ibatch < niter - nBatches - 1:
            # during the main "learning" phase of fitting a model
            if ibatch % 5 == 0:
                # this drops templates based on spike rates and/or similarities to
                # other templates
                W, U, dWU, mu, nsp, ndrop = triageTemplates2(
                    params, iW, C2C, W, U, dWU, mu, nsp, ndrop)

            Nfilt = W.shape[1]  # update the number of filters
            Params[1] = Nfilt

            # this adds new templates if they are detected in the residual
            dWU0, cmap = mexGetSpikes2(Params, drez, wTEMP, iC)

            if dWU0.shape[2] > 0:
                # new templates need to be integrated into the same format as all templates
                # apply PCA for smoothing purposes
                dWU0 = cp.reshape(
                    cp.dot(
                        wPCAd,
                        cp.dot(
                            wPCAd.T,
                            dWU0.reshape(
                                (dWU0.shape[0], dWU0.shape[1] * dWU0.shape[2]),
                                order="F",
                            ),
                        ),
                    ),
                    dWU0.shape,
                    order="F",
                )
                dWU = cp.concatenate((dWU, dWU0), axis=2)

                m = dWU0.shape[2]
                # initialize temporal components of waveforms
                W = _extend(W,
                            Nfilt,
                            Nfilt + m,
                            W0[:, cp.ones(m, dtype=np.int32), :],
                            axis=1)

                # initialize the number of spikes with the minimum allowed
                nsp = _extend(nsp, Nfilt, Nfilt + m,
                              params.minFR * NT / params.fs)
                # initialize the amplitude of this spike with a lowish number
                mu = _extend(mu, Nfilt, Nfilt + m, 10)

                # if the number of filters exceed the maximum allowed, clip it
                Nfilt = min(params.Nfilt, W.shape[1])
                Params[1] = Nfilt

                W = W[:, :
                      Nfilt, :]  # remove any new filters over the maximum allowed
                dWU = dWU[:, :, :
                          Nfilt]  # remove any new filters over the maximum allowed
                nsp = nsp[:
                          Nfilt]  # remove any new filters over the maximum allowed
                mu = mu[:
                        Nfilt]  # remove any new filters over the maximum allowed

        if ibatch > niter - nBatches - 1:
            # during the final extraction pass, this keeps track of all spikes and features

            # we memorize the spatio-temporal decomposition of the waveforms at this batch
            # this is currently only used in the GUI to provide an accurate reconstruction
            # of the raw data at this time
            ir.WA[..., k] = cp.asnumpy(W)
            ir.UA[..., k] = cp.asnumpy(U)
            ir.muA[..., k] = cp.asnumpy(mu)

            # we carefully assign the correct absolute times to spikes found in this batch
            ioffset = params.ntbuff - 1
            if k == 0:
                ioffset = 0  # the first batch is special (no pre-buffer)

            toff = nt0min + t0 - ioffset + (NT - params.ntbuff) * k
            st = toff + st0

            # one row per spike, five columns
            st30 = np.c_[
                cp.asnumpy(st),  # spike times
                cp.asnumpy(id0),  # spike clusters (0-indexing)
                cp.asnumpy(x0),  # template amplitudes
                cp.asnumpy(vexp),  # residual variance of this spike
                korder *
                np.ones(st.size),  # batch from which this spike was found
            ]
            # Check the number of spikes.
            assert st30.shape[0] == featW.shape[1] == featPC.shape[2]
            st3.append(st30)
            fW.append(featW)
            fWpc.append(featPC)

            ntot = ntot + x0.size  # keeps track of total number of spikes so far

        if ibatch == niter - nBatches - 1:
            # allocated on the last learning iteration, so they exist before the
            # final-pass branch above first writes into them
            # these next three store the low-d template decompositions
            ir.WA = np.zeros((nt0, Nfilt, Nrank, nBatches),
                             dtype=np.float32,
                             order="F")
            ir.UA = np.zeros((Nchan, Nfilt, Nrank, nBatches),
                             dtype=np.float32,
                             order="F")
            ir.muA = np.zeros((Nfilt, nBatches), dtype=np.float32, order="F")

        if ibatch % 100 == 0:
            # this is some of the relevant diagnostic information to be printed during training
            logger.info(("%d / %d batches, %d units, nspks: %2.4f, mu: %2.4f, "
                         "nst0: %d, merges: %2.4f, %2.4f"), ibatch, niter,
                        Nfilt, nsp.sum(), median(mu), st0.size, *ndrop)

            if sanity_plots:
                assert plot_widgets is not None, "if sanity_plots is set, then plot_widgets cannot be None"
                plot_diagnostics(W, U, mu, nsp, plot_widgets[plot_pos])

        free_gpu_memory()

    # Close the large array writers and save the JSON metadata files to disk.
    fW.close()
    fWpc.close()

    # just display the total number of spikes
    logger.info("Found %d spikes.", ntot)

    # Save results to the ctx.intermediate object.
    ir.st3 = np.concatenate(st3, axis=0)

    # the similarity score between templates is simply the correlation,
    # taken as the max over several consecutive time delays
    ir.simScore = cp.asnumpy(cp.max(WtW, axis=2))

    # NOTE: these are now already saved by LargeArrayWriter
    # fWa = np.concatenate(fW, axis=-1)
    # fWpca = np.concatenate(fWpc, axis=-1)

    # the template features are stored in cProj, like in Kilosort1
    # ir.cProj = fWa.T
    # the neighboring template indices are stored in iNeigh
    ir.iNeigh = cp.asnumpy(iList)

    #  permute the PC projections in the right order
    # ir.cProjPC = np.transpose(fWpca, (2, 1, 0))
    # iNeighPC keeps the indices of the channels corresponding to the PC features
    ir.iNeighPC = cp.asnumpy(iC[:, iW])

    # Number of spikes.
    assert ir.st3.shape[0] == fW.shape[-1] == fWpc.shape[-1]

    # this whole next block is just done to compress the compressed templates
    # we separately svd the time components of each template, and the spatial components
    # this also requires a careful decompression function, available somewhere in the GUI code
    nKeep = min(Nchan * 3, 20)  # how many PCs to keep
    W_a = np.zeros((nt0 * Nrank, nKeep, Nfilt), dtype=np.float32)
    W_b = np.zeros((nBatches, nKeep, Nfilt), dtype=np.float32)
    U_a = np.zeros((Nchan * Nrank, nKeep, Nfilt), dtype=np.float32)
    U_b = np.zeros((nBatches, nKeep, Nfilt), dtype=np.float32)

    for j in tqdm(range(Nfilt), desc="Compressing templates"):
        # do this for every template separately
        WA = np.reshape(ir.WA[:, j, ...], (-1, nBatches), order="F")
        # svd on the GPU was faster for this, but the Python randomized CPU version
        # might be faster still
        # WA = gpuArray(WA)
        A, B, C = svdecon_cpu(WA)
        # W_a times W_b results in a reconstruction of the time components
        W_a[:, :, j] = np.dot(A[:, :nKeep], B[:nKeep, :nKeep])
        W_b[:, :, j] = C[:, :nKeep]

        UA = np.reshape(ir.UA[:, j, ...], (-1, nBatches), order="F")
        # UA = gpuArray(UA)
        A, B, C = svdecon_cpu(UA)
        # U_a times U_b results in a reconstruction of the spatial components
        U_a[:, :, j] = np.dot(A[:, :nKeep], B[:nKeep, :nKeep])
        U_b[:, :, j] = C[:, :nKeep]

    logger.info("Finished compressing time-varying templates.")

    return Bunch(
        wPCA=wPCA[:, :Nrank],
        wTEMP=wTEMP,
        st3=ir.st3,
        simScore=ir.simScore,
        # cProj=ir.cProj,
        # cProjPC=ir.cProjPC,
        iNeigh=ir.iNeigh,
        iNeighPC=ir.iNeighPC,
        WA=ir.WA,
        UA=ir.UA,
        W=ir.W,
        U=ir.U,
        dWU=ir.dWU,
        mu=ir.mu,
        W_a=W_a,
        W_b=W_b,
        U_a=U_a,
        U_b=U_b,
    )

Exemplo n.º 34

0

Exibir arquivo

def splitAllClusters(ctx, flag):
    """Split clusters whose PC-feature projections are bimodal ("bimodal pursuit").

    For every cluster with at least 300 spikes, the function looks for a single
    1D projection of the per-spike PC features (``ir.cProjPC``) that maximizes
    a two-Gaussian mixture fit. The cluster is split along that direction only
    if all the checks below pass: the cross-correlogram between the two pieces
    shows no refractory dip, the reconstructed mean waveforms are not too
    similar, and the size/confidence criteria are met.

    Parameters
    ----------
    ctx : object
        Context exposing ``params``, ``probe`` and ``intermediate`` (``ir``).
        From ``ir`` this function reads ``wPCA``, ``st3_m``, ``cProjPC``,
        ``dWU`` and, preferring the ``*_s`` version written by a previous call
        when present, ``W``, ``simScore``, ``iNeigh`` and ``iNeighPC``.
    flag : bool
        Selects the initialisation of the pursuit direction: truthy uses the
        first left singular vector of the drift-corrected projections, falsy
        uses the (unit-normalized) mean of the raw projections.

    Returns
    -------
    Bunch
        ``st3_s`` (spike table with updated cluster ids), ``W_s``, ``U_s``,
        ``mu_s``, ``simScore_s``, ``iNeigh_s``, ``iNeighPC_s``, ``Wphy``,
        ``iList`` and ``isplit``. Note that the returned ``isplit`` is the
        boolean mask ``simScore == 1`` computed after the loop, not the
        provenance index maintained inside the loop.
    """
    params = ctx.params
    probe = ctx.probe
    ir = ctx.intermediate
    Nchan = ctx.probe.Nchan

    wPCA = cp.asarray(ir.wPCA)  # use PCA projections to reconstruct templates when we do splits
    assert wPCA.shape[1] == 3

    # Take intermediate arrays from context.
    st3 = cp.asnumpy(ir.st3_m)
    cProjPC = ir.cProjPC
    dWU = ir.dWU

    # For the following arrays that will be overwritten by this function, try to get
    # it from a previous call to this function (as it is called twice), otherwise
    # get it from before (without the _s suffix).
    W = ir.get('W_s', ir.W)
    simScore = ir.get('simScore_s', ir.simScore)
    iNeigh = ir.get('iNeigh_s', ir.iNeigh)
    iNeighPC = ir.get('iNeighPC_s', ir.iNeighPC)

    # this is the threshold for splits, and is one of the main parameters users can change
    ccsplit = params.AUCsplit

    NchanNear = min(Nchan, 32)
    Nnearest = min(Nchan, 32)
    sigmaMask = params.sigmaMask

    ik = -1
    Nfilt = W.shape[1]
    nsplits = 0

    # determine what channels each template lives on
    iC, mask, C2C = getClosestChannels(probe, sigmaMask, NchanNear)

    # the waveforms must be aligned to this sample
    nt0min = params.nt0min
    # find the peak abs channel for each template
    iW = np.argmax(np.abs((dWU[nt0min - 1, :, :])), axis=0)

    # keep track of original cluster for each cluster. starts with all clusters being their
    # own origin.
    isplit = np.arange(Nfilt)
    dt = 1. / 1000
    nccg = 0

    # NOTE: ik is incremented at the top of the body, so the last pass runs with
    # ik == Nfilt, finds no spikes and falls through the "< 300 spikes" guard.
    while ik < Nfilt:
        if ik % 100 == 0:
            # periodically write updates
            logger.info(f'Found {nsplits} splits, checked {ik}/{Nfilt} clusters, nccg {nccg}')
        ik += 1

        isp = (st3[:, 1] == ik)  # get all spikes from this cluster
        nSpikes = isp.sum()
        logger.debug(f"Splitting template {ik}/{Nfilt} with {nSpikes} spikes.")
        free_gpu_memory()

        if nSpikes < 300:
            # do not split if fewer than 300 spikes (we cannot estimate
            # cross-correlograms accurately)
            continue

        ss = st3[isp, 0] / params.fs  # convert to seconds

        clp0 = cProjPC[isp, :, :]  # get the PC projections for these spikes
        clp0 = cp.asarray(clp0, dtype=cp.float32)  # upload to the GPU
        clp0 = clp0.reshape((clp0.shape[0], -1), order='F')
        m = mean(clp0, axis=0)
        # Mean-center into a NEW array. The previous `clp = clp0; clp -= m` aliased
        # clp0, so the in-place centering and drift correction below also modified
        # clp0, which is used later as the raw (NOT drift-corrected) projections.
        clp = clp0 - m

        isp = np.nonzero(isp)[0]

        # (DEV_NOTES) an earlier note here said the flattening order differs from
        # Fortran order; the reshape above explicitly uses order='F' and the same
        # order is used when the projections are reformed later.

        # subtract a running average, because the projections are NOT drift corrected
        clpc = my_conv2(clp, 250, 0)
        clp -= clpc

        # now use two different ways to initialize the bimodal direction
        # the main script calls this function twice, and does both initializations

        if flag:
            u, s, v = svdecon(clp.T)
            u, v = -u, -v  # change sign for consistency with MATLAB
            w = u[:, 0]  # initialize with the top PC
        else:
            w = mean(clp0, axis=0)  # initialize with the mean of NOT drift-corrected trace
            w = w / cp.sum(w ** 2) ** 0.5  # unit-normalize

        # initial projections of waveform PCs onto 1D vector
        x = cp.dot(clp, w)
        s1 = var(x[x > mean(x)])  # initialize estimates of variance for the first
        s2 = var(x[x < mean(x)])  # and second gaussian in the mixture of 1D gaussians

        mu1 = mean(x[x > mean(x)])  # initialize the means as well
        mu2 = mean(x[x < mean(x)])
        # and the probability that a spike is assigned to the first Gaussian
        p = mean(x > mean(x))

        # initialize matrix of log probabilities that each spike is assigned to the first
        # or second cluster
        logp = cp.zeros((nSpikes, 2), order='F')

        # do 50 pursuit iteration

        logP = cp.zeros(50)  # used to monitor the cost function

        for k in range(50):
            # for each spike, estimate its probability to come from either Gaussian cluster
            logp[:, 0] = -1. / 2 * log(s1) - ((x - mu1) ** 2) / (2 * s1) + log(p)
            logp[:, 1] = -1. / 2 * log(s2) - ((x - mu2) ** 2) / (2 * s2) + log(1 - p)

            lMax = logp.max(axis=1)
            logp = logp - lMax[:, cp.newaxis]  # subtract the max for floating point accuracy
            rs = cp.exp(logp)  # exponentiate the probabilities

            pval = cp.log(cp.sum(rs, axis=1)) + lMax  # get the normalizer and add back the max
            logP[k] = mean(pval)  # this is the cost function: we can monitor its increase

            rs = rs / cp.sum(rs, axis=1)[:, cp.newaxis]  # normalize so that probabilities sum to 1

            p = mean(rs[:, 0])  # mean probability to be assigned to Gaussian 1
            # new estimate of mean of cluster 1 (weighted by "responsibilities")
            mu1 = cp.dot(rs[:, 0], x) / cp.sum(rs[:, 0])
            # new estimate of mean of cluster 2 (weighted by "responsibilities")
            mu2 = cp.dot(rs[:, 1], x) / cp.sum(rs[:, 1])

            s1 = cp.dot(rs[:, 0], (x - mu1) ** 2) / cp.sum(rs[:, 0])  # new estimates of variances
            s2 = cp.dot(rs[:, 1], (x - mu2) ** 2) / cp.sum(rs[:, 1])

            if (k >= 10) and (k % 2 == 0):
                # starting at iteration 10, we start re-estimating the pursuit direction
                # that is, given the Gaussian cluster assignments, and the mean and variances,
                # we re-estimate w
                # these equations follow from the model
                StS = cp.matmul(
                    clp.T, clp * (rs[:, 0] / s1 + rs[:, 1] / s2)[:, cp.newaxis]) / nSpikes
                StMu = cp.dot(clp.T, rs[:, 0] * mu1 / s1 + rs[:, 1] * mu2 / s2) / nSpikes

                # this is the new estimate of the best pursuit direction
                w = cp.linalg.solve(StS.T, StMu)
                w = w / cp.sum(w ** 2) ** 0.5  # which we unit normalize
                x = cp.dot(clp, w)

        # these spikes are assigned to cluster 1
        ilow = rs[:, 0] > rs[:, 1]
        # the mean probability of spikes assigned to cluster 1
        plow = mean(rs[:, 0][ilow])
        phigh = mean(rs[:, 1][~ilow])  # same for cluster 2
        # the smallest cluster has this proportion of all spikes
        nremove = min(mean(ilow), mean(~ilow))

        # did this split fix the autocorrelograms?
        # compute the cross-correlogram between spikes in the putative new clusters
        ilow_cpu = cp.asnumpy(ilow)
        K, Qi, Q00, Q01, rir = ccg(ss[ilow_cpu], ss[~ilow_cpu], 500, dt)
        Q12 = (Qi / max(Q00, Q01)).min()  # refractoriness metric 1
        R = rir.min()  # refractoriness metric 2

        # if the CCG has a dip, don't do the split.
        # These thresholds are consistent with the ones from merges.
        if (Q12 < 0.25) and (R < 0.05):  # if both metrics are below threshold.
            nccg += 1  # keep track of how many splits were voided by the CCG criterion
            continue

        # now decide if the split would result in waveforms that are too similar
        # the reconstructed mean waveforms for putative cluster 1
        c1 = cp.matmul(wPCA, mean(clp0[ilow, :], 0).reshape((3, -1), order='F'))
        # the reconstructed mean waveforms for putative cluster 2
        c2 = cp.matmul(wPCA, mean(clp0[~ilow, :], 0).reshape((3, -1), order='F'))

        cc = cp.corrcoef(c1.ravel(), c2.ravel())  # correlation of mean waveforms
        n1 = sqrt(cp.sum(c1 ** 2))  # the amplitude estimate 1
        n2 = sqrt(cp.sum(c2 ** 2))  # the amplitude estimate 2

        r0 = 2 * abs((n1 - n2) / (n1 + n2))

        # if the templates are correlated, and their amplitudes are similar, stop the split!!!

        if (cc[0, 1] > 0.9) and (r0 < 0.2):
            continue

        # final criteria to continue with the split: if the split piece is more than 5% of all
        # spikes, if the split piece is more than 300 spikes, and if the confidences for
        # assigning spikes to both clusters exceeds a preset criterion ccsplit
        if (nremove > 0.05) and (min(plow, phigh) > ccsplit) and (
                min(cp.sum(ilow), cp.sum(~ilow)) > 300):
            # one cluster stays, one goes
            Nfilt += 1

            # the templates for the splits have been estimated from PC coefficients

            # (DEV_NOTES) code below involves multiple CuPy arrays changing shape to accommodate
            # the extra cluster, this could potentially be done more efficiently?

            dWU = cp.concatenate((
                cp.asarray(dWU), cp.zeros((*dWU.shape[:-1], 1), order='F')), axis=2)
            dWU[:, iC[:, iW[ik]], Nfilt - 1] = c2
            dWU[:, iC[:, iW[ik]], ik] = c1

            # the temporal components are therefore just the PC waveforms
            W = cp.asarray(W)
            W = cp.concatenate((W, cp.transpose(cp.atleast_3d(wPCA), (0, 2, 1))), axis=1)
            assert W.shape[1] == Nfilt

            # copy the best channel from the original template
            iW = cp.asarray(iW)
            iW = cp.pad(iW, (0, (Nfilt - len(iW))), mode='constant')
            iW[Nfilt - 1] = iW[ik]
            assert iW.shape[0] == Nfilt

            # copy the provenance index to keep track of splits
            isplit = cp.asarray(isplit)
            isplit = cp.pad(isplit, (0, (Nfilt - len(isplit))), mode='constant')
            isplit[Nfilt - 1] = isplit[ik]
            assert isplit.shape[0] == Nfilt

            st3[isp[ilow_cpu], 1] = Nfilt - 1  # overwrite spike indices with the new index

            # copy similarity scores from the original
            simScore = cp.asarray(simScore)
            simScore = cp.pad(
                simScore, (0, (Nfilt - simScore.shape[0])), mode='constant')
            simScore[:, Nfilt - 1] = simScore[:, ik]
            simScore[Nfilt - 1, :] = simScore[ik, :]
            # copy similarity scores from the original
            simScore[ik, Nfilt - 1] = 1  # set the similarity with original to 1
            simScore[Nfilt - 1, ik] = 1  # set the similarity with original to 1
            assert simScore.shape == (Nfilt, Nfilt)

            # copy neighbor template list from the original
            iNeigh = cp.asarray(iNeigh)
            iNeigh = cp.pad(
                iNeigh, ((0, 0), (0, (Nfilt - iNeigh.shape[1]))), mode='constant')
            iNeigh[:, Nfilt - 1] = iNeigh[:, ik]
            assert iNeigh.shape[1] == Nfilt

            # copy neighbor channel list from the original
            iNeighPC = cp.asarray(iNeighPC)
            iNeighPC = cp.pad(
                iNeighPC, ((0, 0), (0, (Nfilt - iNeighPC.shape[1]))), mode='constant')
            iNeighPC[:, Nfilt - 1] = iNeighPC[:, ik]
            assert iNeighPC.shape[1] == Nfilt

            # try this cluster again
            # the cluster piece that stays at this index needs to be tested for splits again
            # before proceeding
            ik -= 1
            # the piece that became a new cluster will be tested again when we get to the end
            # of the list
            nsplits += 1  # keep track of how many splits we did

    logger.info(
        f'Finished splitting. Found {nsplits} splits, checked '
        f'{ik}/{Nfilt} clusters, nccg {nccg}')

    Nfilt = W.shape[1]  # new number of templates
    Nrank = 3
    Nchan = probe.Nchan
    Params = cp.array(
        [0, Nfilt, 0, 0, W.shape[0], Nnearest, Nrank, 0, 0, Nchan, NchanNear, nt0min, 0],
        dtype=cp.float64)  # make a new Params to pass on parameters to CUDA

    # we need to re-estimate the spatial profiles

    # we get the time upsampling kernels again
    Ka, Kb = getKernels(params)
    # we run SVD
    W, U, mu = mexSVDsmall2(Params, dWU, W, iC, iW, Ka, Kb)

    # we re-compute similarity scores between templates
    WtW, iList = getMeWtW(W.astype(cp.float32), U.astype(cp.float32), Nnearest)

    # From here on `isplit` is a boolean mask of template pairs sharing a parent
    # (it replaces the provenance index kept during the loop).
    isplit = simScore == 1  # overwrite the similarity scores of clusters with same parent
    simScore = WtW.max(axis=2)
    simScore[isplit] = 1  # 1 means they come from the same parent

    iNeigh = iList[:, :Nfilt]  # get the new neighbor templates
    iNeighPC = iC[:, iW[:Nfilt]]  # get the new neighbor channels

    # for Phy, we need to pad the spikes with zeros so the spikes are aligned to the center of
    # the window
    Wphy = cp.concatenate(
        (cp.zeros((1 + nt0min, Nfilt, Nrank), order='F'), W), axis=0)

    return Bunch(
        st3_s=st3,

        W_s=W,
        U_s=U,
        mu_s=mu,
        simScore_s=simScore,
        iNeigh_s=iNeigh,
        iNeighPC_s=iNeighPC,

        Wphy=Wphy,
        iList=iList,
        isplit=isplit,
    )

Exemplo n.º 35

0

Exibir arquivo

def rezToPhy(ctx, dat_path=None, output_dir=None):
    """Write the sorting results held in ``ctx`` to ``output_dir`` as phy GUI files.

    Existing ``*.npy`` files and any ``.phy`` sub-folder in ``output_dir`` are
    deleted first. The function then writes ``templates.npy``,
    ``template_features.npy``, ``pc_features.npy``, the spike/template/channel
    arrays saved through ``_save`` below, the three ``cluster_*.tsv`` files
    and, only if it does not exist yet, ``params.py``.

    Parameters
    ----------
    ctx : object
        Context exposing ``probe``, ``params`` and ``intermediate``; it is
        first passed through ``checkClusters``.
    dat_path : str, optional
        Path of the raw data file, written into ``params.py`` relative to the
        parent folder (``"../<dat_path>"``).
    output_dir : str or path-like
        Output folder; created if missing. Must not be ``None``.

    Raises
    ------
    TypeError
        If ``output_dir`` is ``None``.

    Notes
    -----
    ``ir.est_contam_rate`` is modified in place: NaN entries are set to 1.
    """
    if output_dir is None:
        # Path(None) used to raise an opaque TypeError a line later; keep the
        # exception type but say what is wrong.
        raise TypeError('output_dir must be the path of the output folder, not None')

    savePath = output_dir
    Path(savePath).mkdir(exist_ok=True, parents=True)

    ctx = checkClusters(ctx)  # check clusters integrity

    probe = ctx.probe
    ir = ctx.intermediate
    params = ctx.params

    # spikeTimes will be in samples, not seconds
    W = cp.asarray(ir.Wphy).astype(np.float32)
    Wrot = ir.Wrot
    est_contam_rate = ir.est_contam_rate
    good = ir.good

    st3 = cp.asarray(ir.st3_c)

    U = cp.asarray(ir.U_s).astype(np.float32)
    iNeigh = ir.iNeigh_s
    iNeighPC = ir.iNeighPC_s
    simScore = ir.simScore_s

    # Only the first four columns of the spike table are used below, so a
    # separate "spike clusters" column is never available at this point.
    if st3.shape[1] > 4:
        st3 = st3[:, :4]

    # sort spikes by time (column 0)
    isort = cp.argsort(st3[:, 0])
    st3 = st3[isort, :]

    # remove outputs left over from a previous export
    for fname in os.listdir(savePath):
        if fname.endswith('.npy'):
            os.remove(join(savePath, fname))
    if os.path.isdir(join(savePath, '.phy')):
        shutil.rmtree(join(savePath, '.phy'))

    spikeTimes = st3[:, 0].astype(cp.uint64)
    spikeTemplates = st3[:, 1].astype(cp.uint32)

    templateFeatureInds = iNeigh.astype(cp.uint32)
    pcFeatureInds = iNeighPC.astype(cp.uint32)

    whiteningMatrix = cp.asarray(Wrot) / params.scaleproc
    whiteningMatrixInv = cp.linalg.pinv(whiteningMatrix)

    amplitudes = st3[:, 2]

    Nchan = probe.Nchan

    xcoords = probe.xc
    ycoords = probe.yc
    chanMap = probe.chanMap
    chanMap0ind = chanMap  # - 1

    nt0, Nfilt = W.shape[:2]

    # Templates are built and streamed to disk one at a time; the unwhitened
    # peak-to-peak amplitude of each one is kept for the amplitude estimates.
    tempAmpsUnscaled = cp.zeros(Nfilt, dtype=np.float32)
    templates_writer = NpyWriter(join(savePath, 'templates.npy'), (Nfilt, nt0, Nchan), np.float32)
    try:
        for iNN in tqdm(range(Nfilt), desc="Computing templates"):
            t = cp.dot(U[:, iNN, :], W[:, iNN, :].T).T
            templates_writer.append(t)
            t_unw = cp.dot(t, whiteningMatrixInv)
            assert t_unw.ndim == 2
            # the amplitude on each channel is the positive peak minus the negative
            tempChanAmps = t_unw.max(axis=0) - t_unw.min(axis=0)
            # the template amplitude is the amplitude of its largest channel
            tempAmpsUnscaled[iNN] = tempChanAmps.max()
    finally:
        # close the file even if a template computation fails
        templates_writer.close()

    # we include all channels so this is trivial
    templatesInds = cp.tile(np.arange(Nfilt), (Nchan, 1))

    # assign all spikes the amplitude of their template multiplied by their
    # scaling amplitudes
    spikeAmps = tempAmpsUnscaled[spikeTemplates] * amplitudes

    # take the average of all spike amps to get actual template amps (since
    # tempScalingAmps are equal mean for all templates)
    ta = clusterAverage(spikeTemplates, spikeAmps)
    tids = cp.unique(spikeTemplates).astype(np.int64)
    tempAmps = cp.zeros_like(tempAmpsUnscaled, order='F')
    tempAmps[tids] = ta  # because ta only has entries for templates that had at least one spike
    tempAmps = params.gain * tempAmps  # for consistency, make first dimension template number

    # PCs
    ix = ir.spikes_to_remove  # length: number of spikes BEFORE -1 cluster removed

    cProj_shape = ir.cProj.shape
    cProj_shape = (st3.shape[0],) + cProj_shape[1:]

    cProjPC_shape = ir.cProjPC.shape
    cProjPC_shape = (st3.shape[0],) + cProjPC_shape[1:]

    isort = cp.asnumpy(isort)
    N = len(ix)  # number of spikes including those assigned to -1
    assert ir.cProj.shape[0] == N
    assert ir.cProjPC.shape[0] == N

    spikes_to_keep = np.nonzero(~ix)[0]  # indices of the spikes to keep in the cProj index space

    k = int(ceil(float(N) / 100))  # 100 chunks
    assert k >= 1

    # The writers are opened only after the consistency checks above, and are
    # always closed, so a failure does not leave open file handles behind.
    tfw = NpyWriter(join(savePath, 'template_features.npy'), cProj_shape, np.float32)
    pcw = NpyWriter(join(savePath, 'pc_features.npy'), cProjPC_shape, np.float32)
    try:
        for i in tqdm(range(0, N, k), desc="Saving template and PC features"):
            # NOTE: cProj and cProjPC still have the spikes assigned to -1 that have yet to
            # be removed

            # spike indices in cProj that need to be kept in this chunk
            ind = spikes_to_keep[isort[i:i + k]]

            cProj = ir.cProj[ind]
            cProjPC = ir.cProjPC[ind]

            tfw.append(cProj)
            pcw.append(cProjPC)
    finally:
        tfw.close()
        pcw.close()

    def _save(name, arr, dtype=None):
        # Save `arr` as <savePath>/<name>.npy, optionally cast to `dtype`.
        cp.save(join(savePath, name + '.npy'), arr.astype(dtype or arr.dtype))

    _save('spike_times', spikeTimes)
    _save('spike_templates', spikeTemplates, cp.uint32)
    # st3 has at most four columns here, so the template ids double as cluster ids
    _save('spike_clusters', spikeTemplates, cp.uint32)
    _save('amplitudes', amplitudes)
    _save('templates_ind', templatesInds)

    chanMap0ind = chanMap0ind.astype(cp.int32)

    _save('channel_map', chanMap0ind)
    _save('channel_positions', np.c_[xcoords, ycoords])

    _save('template_feature_ind', templateFeatureInds.T)
    _save('pc_feature_ind', pcFeatureInds.T)

    _save('whitening_mat', whiteningMatrix)
    _save('whitening_mat_inv', whiteningMatrixInv)

    if 'simScore' in ir:
        similarTemplates = simScore
        _save('similar_templates', similarTemplates)

    est_contam_rate[np.isnan(est_contam_rate)] = 1
    with open(join(savePath, 'cluster_group.tsv'), 'w') as f:
        f.write('cluster_id\tgroup\n')
        for j in range(len(good)):
            if good[j]:
                f.write('%d\tgood\n' % j)

    with open(join(savePath, 'cluster_ContamPct.tsv'), 'w') as f:
        f.write('cluster_id\tContamPct\n')
        for j in range(len(good)):
            f.write('%d\t%.1f\n' % (j, 100 * est_contam_rate[j]))

    with open(join(savePath, 'cluster_Amplitude.tsv'), 'w') as f:
        f.write('cluster_id\tAmplitude\n')
        for j in range(len(good)):
            f.write('%d\t%.1f\n' % (j, tempAmps[j]))

    # make params file (an existing one is left untouched)
    if not os.path.exists(join(savePath, 'params.py')):
        with open(join(savePath, 'params.py'), 'w') as f:
            f.write('dat_path = "../%s"\n' % dat_path)
            f.write('n_channels_dat = %d\n' % probe.NchanTOT)
            f.write('dtype = "int16"\n')
            f.write('offset = 0\n')
            f.write('hp_filtered = False\n')
            f.write('sample_rate = %i\n' % params.fs)
            f.write('template_scaling = %.1f\n' % params.get('templateScaling', 1.0))

Exemplo n.º 36

0

Exibir arquivo

Arquivo: svd.py Projeto: VChristiaens/VIP

def randomized_svd_gpu(M, n_components, n_oversamples=10, n_iter='auto',
                       transpose='auto', random_state=0, lib='cupy'):
    """Computes a truncated randomized SVD on GPU. Adapted from Sklearn.

    Parameters
    ----------
    M : ndarray
        Matrix to decompose.
    n_components : int
        Number of singular values and vectors to extract.
    n_oversamples : int (default is 10)
        Additional number of random vectors to sample the range of M so as
        to ensure proper conditioning. The total number of random vectors
        used to find the range of M is n_components + n_oversamples. Smaller
        number can improve speed but can negatively impact the quality of
        approximation of singular vectors and singular values.
    n_iter : int or 'auto' (default is 'auto')
        Number of power iterations. It can be used to deal with very noisy
        problems. When 'auto', it is set to 4, unless `n_components` is small
        (< .1 * min(M.shape)), in which case it is set to 7.
        This improves precision with few components.
    transpose : True, False or 'auto' (default)
        Whether the algorithm should be applied to M.T instead of M. The
        result should approximately be the same. The 'auto' mode will
        trigger the transposition if M.shape[1] > M.shape[0] since this
        implementation of randomized SVD tend to be a little faster in that
        case.
    random_state : int, RandomState instance or None, optional (default=0)
        Seed or generator passed to ``check_random_state``; the resulting
        generator draws the initial random vectors for ``lib='cupy'``. The
        'pytorch' branch draws them with torch's own generator and does not
        use this value.
    lib : {'cupy', 'pytorch'}, str optional
        Chooses the GPU library to be used.

    Returns
    -------
    U, s, Vh
        Left singular vectors (one per column), singular values and right
        singular vectors (one per row), truncated to ``n_components``. They
        are arrays/tensors of the selected GPU library.

    Raises
    ------
    ValueError
        If ``lib`` is neither 'cupy' nor 'pytorch'.

    Notes
    -----
    This algorithm finds a (usually very good) approximate truncated
    singular value decomposition using randomization to speed up the
    computations. It is particularly fast on large matrices on which
    you wish to extract only a small number of components. In order to
    obtain further speed up, `n_iter` can be set <=2 (at the cost of
    loss of precision).

    References
    ----------
    * Finding structure with randomness: Stochastic algorithms for constructing
      approximate matrix decompositions
      Halko, et al., 2009 http://arxiv.org/abs/arXiv:0909.4061
    * A randomized algorithm for the decomposition of matrices
      Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert
    * An implementation of a randomized algorithm for principal component
      analysis
      A. Szlam et al. 2014
    """
    # Fail fast: an unknown backend used to fall through and return None.
    if lib not in ('cupy', 'pytorch'):
        raise ValueError("lib must be 'cupy' or 'pytorch', got {!r}".format(lib))

    random_state = check_random_state(random_state)
    n_random = n_components + n_oversamples
    n_samples, n_features = M.shape

    if n_iter == 'auto':
        # Checks if the number of iterations is explicitly specified
        n_iter = 7 if n_components < .1 * min(M.shape) else 4

    if transpose == 'auto':
        transpose = n_samples < n_features
    if transpose:
        M = M.T  # this implementation is a bit faster with smaller shape[1]

    if lib == 'cupy':
        # A single asarray is enough; M is never modified in place.
        M = cupy.asarray(M)

        # Generating normal random vectors with shape: (M.shape[1], n_random)
        Q = cupy.asarray(random_state.normal(size=(M.shape[1], n_random)))

        # Perform power iterations with Q to further 'imprint' the top
        # singular vectors of M in Q
        for _ in range(n_iter):
            Q = cupy.dot(M, Q)
            Q = cupy.dot(M.T, Q)

        # Sample the range of M by linear projection of Q. Extract an orthonormal basis
        Q, _ = cupy.linalg.qr(cupy.dot(M, Q), mode='reduced')

        # project M to the (k + p) dimensional space using the basis vectors
        B = cupy.dot(Q.T, M)

        # compute the SVD on the thin matrix: (k + p) wide
        # (cupy follows numpy here: the third output is already V^H)
        Uhat, s, Vh = cupy.linalg.svd(B, full_matrices=False, compute_uv=True)
        del B
        U = cupy.dot(Q, Uhat)

        if transpose:
            # transpose back the results according to the input convention
            return (Vh[:n_components, :].T, s[:n_components],
                    U[:, :n_components].T)
        return U[:, :n_components], s[:n_components], Vh[:n_components, :]

    # lib == 'pytorch'
    M_gpu = torch.Tensor.cuda(torch.from_numpy(M.astype('float32')))

    # Generating normal random vectors with shape: (M.shape[1], n_random)
    Q = torch.cuda.FloatTensor(M_gpu.shape[1], n_random).normal_()

    # Perform power iterations with Q to further 'imprint' the top
    # singular vectors of M in Q
    for _ in range(n_iter):
        Q = torch.mm(M_gpu, Q)
        Q = torch.mm(torch.transpose(M_gpu, 0, 1), Q)

    # Sample the range of M by linear projection of Q. Extract an orthonormal basis
    Q, _ = torch.qr(torch.mm(M_gpu, Q))

    # project M to the (k + p) dimensional space using the basis vectors
    B = torch.mm(torch.transpose(Q, 0, 1), M_gpu)

    # compute the SVD on the thin matrix: (k + p) wide.
    # Unlike numpy/cupy, torch.svd returns V (B = U diag(s) V^T), NOT V^T, so the
    # right singular vectors are the COLUMNS of V. Slicing rows of V, as was done
    # before, returned the wrong values and the wrong shape.
    Uhat, s, V = torch.svd(B)
    del B
    U = torch.mm(Q, Uhat)

    if transpose:
        # transpose back the results according to the input convention:
        # (V^T[:k, :])^T == V[:, :k]
        return (V[:, :n_components],
                s[:n_components],
                torch.transpose(U[:, :n_components], 0, 1))
    return (U[:, :n_components],
            s[:n_components],
            torch.transpose(V[:, :n_components], 0, 1))

Exemplo n.º 37

0

Exibir arquivo

    def _derivative(self, x):
        """Compute the derivative of P(x)
        Parameters
        ----------
        x : numpy array, shape (n_features,)
            One configuration
        Returns
        -------
        derivative : numpy array, shape (m_parameters,)
        """
        w2 = np.reshape(self.w,
                        (self.n_features, self.d, self.D, self.D, self.mu))
        derivative = np.zeros(
            (self.n_features, self.d, self.D, self.D, self.mu),
            dtype=np.complex128)

        #Store intermediate tensor contractions for the derivatives:
        #left to right and right to left
        #tmp stores the contraction of the first i+1 tensors from the left
        #in tmp[i,:,:], tmp2 the remaining tensors on the right
        #the mps contracted is the remaining contraction tmp[i-1]w[i]tmp2[i+1]
        tmp = np.zeros((self.n_features, self.D * self.D), dtype=np.complex128)
        tmp2 = np.zeros((self.n_features, self.D * self.D),
                        dtype=np.complex128)
        tmp[0, :] = np.einsum('ij,kj->ik', w2[0, x[0], 0, :, :],
                              np.conj(w2[0, x[0],
                                         0, :, :])).reshape(self.D * self.D)
        for i in xrange(1, self.n_features - 1):
            newtmp = np.einsum('imj,klj->ikml', w2[i, x[i], :, :, :],
                               np.conj(w2[i, x[i], :, :, :])).reshape(
                                   (self.D * self.D, self.D * self.D))
            tmp[i, :] = np.dot(tmp[i - 1, :], newtmp)
        newtmp = np.einsum(
            'ij,kj->ik', w2[self.n_features - 1, x[self.n_features - 1], :,
                            0, :],
            np.conj(w2[self.n_features - 1, x[self.n_features - 1], :,
                       0, :])).reshape(self.D * self.D)
        mpscontracted = np.inner(tmp[self.n_features - 2, :], newtmp)
        tmp[self.n_features - 1, :] = mpscontracted

        tmp2[self.n_features - 1, :] = newtmp
        for i in xrange(self.n_features - 2, -1, -1):
            newtmp = np.einsum('imj,klj->ikml', w2[i, x[i], :, :, :],
                               np.conj(w2[i, x[i], :, :, :])).reshape(
                                   (self.D * self.D, self.D * self.D))
            tmp2[i, :] = np.dot(newtmp, tmp2[i + 1, :])
        newtmp = np.einsum('ij,kj->ik', w2[0, x[0], 0, :, :],
                           np.conj(w2[0, x[0],
                                      0, :, :])).reshape(self.D * self.D)
        tmp2[0, :] = np.inner(newtmp, tmp2[1, :])

        #Now for each tensor, the derivative is the contraction of the rest of the tensors

        derivative[0, x[0],
                   0, :, :] = 2 * np.einsum('ij,il->lj', w2[0, x[0], 0, :, :],
                                            tmp2[1, :].reshape(self.D, self.D))
        derivative[self.n_features-1,x[self.n_features-1],:,0,:]=\
            2*np.einsum('ij,il->lj',w2[self.n_features-1,x[self.n_features-1],:,0,:],
                        tmp[self.n_features-2,:].reshape(self.D,self.D))
        for i in xrange(1, self.n_features - 1):
            temp1 = tmp[i - 1, :].reshape(self.D, self.D)
            temp2 = tmp2[i + 1, :].reshape(self.D, self.D)
            derivative[i, x[i], :, :, :] = 2 * np.einsum(
                'ikm,ij,kl->jlm', w2[i, x[i], :, :, :], temp1, temp2)

        return derivative.reshape(self.m_parameters)