def backward(self, inputs, grad_outputs):
    """Back-propagate one step through the LSTM layer.

    Args:
        inputs: Tuple ``(x, h_tm1, c_tm1)``, the same arrays that were
            handed to ``forward``.
        grad_outputs: Tuple ``(gh, gc)`` with the gradients of the two
            outputs returned by ``forward``.

    Returns:
        tuple: ``(gx, gh_tm1, gc_tm1)``, the gradients with respect to
        the three inputs.

    Side effects:
        Accumulates into ``self.gW`` and ``self.gV``, and into
        ``self.gb`` unless ``self.nobias`` is set.
    """
    xp = cuda.get_array_module(*inputs)
    x, h_tm1, c_tm1 = inputs
    gh, gc = grad_outputs

    # The element-wise LSTM function yields the gradient of the previous
    # cell state and of the pre-activation matrix cached by ``forward``.
    gc_tm1, gz = self.lstm_fun.backward(inputs=(c_tm1, self.z),
                                        grad_outputs=(gc, gh))

    n_batch = x.shape[0]
    float32 = np.dtype('float32')
    gh_tm1 = xp.empty_like(h_tm1)
    gx = xp.empty((n_batch, self.in_size), dtype=float32)

    on_cpu = xp is np
    dot = np.dot if on_cpu else cp.dot

    # Gradients with respect to the recurrent input and the layer input.
    gh_tm1 = dot(gz, self.V, out=gh_tm1)
    gx = dot(gz, self.W, out=gx)

    # Gradients of the weight matrices (accumulated, not overwritten).
    if on_cpu:
        self.gW += gz.T.dot(x)
        self.gV += gz.T.dot(h_tm1)
        if not self.nobias:
            # A row of ones times gz sums gz over the batch axis.
            batch_ones = xp.ones((1, n_batch), dtype=float32)
            self.gb += np.dot(batch_ones, gz)
    else:
        gpu.utils.dot_add(gz, x, C=self.gW, transa=True)
        gpu.utils.dot_add(gz, h_tm1, C=self.gV, transa=True)
        if not self.nobias:
            batch_ones = xp.ones((1, n_batch), dtype=float32)
            gpu.utils.dot_add(batch_ones, gz, C=self.gb)

    return gx, gh_tm1, gc_tm1
def backward(self, inputs, grad_outputs):
    """Back-propagate one step through the LSTM layer with an extra input.

    Args:
        inputs: Tuple ``(x, h_tm1, c_tm1, q)`` as given to ``forward``.
        grad_outputs: Tuple ``(gh, gc)``. Either entry may be ``None``; it
            is then replaced by a ``[[0]]`` placeholder and flagged to the
            kernel through ``gh_is_none`` / ``gc_is_none``.

    Returns:
        tuple: ``(gx, gh_tm1, gc_tm1, gq)``.

    Side effects:
        Accumulates into ``self.gW``, ``self.gV``, ``self.gU`` (and
        ``self.gb`` unless ``self.nobias``). ``self.h`` is reused as the
        output buffer for ``gh_tm1``.
    """
    xp = cuda.get_array_module(*inputs)
    x, h_tm1, c_tm1, q = inputs
    gh, gc = grad_outputs

    # Missing upstream gradients become dummy arrays plus an integer flag.
    gh_is_none = int(gh is None)
    if gh_is_none:
        gh = xp.array([[0]], dtype=np.float32)
    gc_is_none = int(gc is None)
    if gc_is_none:
        gc = xp.array([[0]], dtype=np.float32)

    # NOTE(review): self.c is returned as gc_tm1 and self.z is read as gz
    # right after the kernel call, so the kernel presumably writes both in
    # place -- confirm against _lstm_backward_cpu/_lstm_backward_gpu.
    gc_tm1 = self.c

    n_batch = x.shape[0]
    float32 = np.dtype('float32')
    gx = xp.empty((n_batch, self.in_size), dtype=float32)
    gq = xp.empty((n_batch, self.encode_size), dtype=float32)

    kernel_args = dict(c=self.c, z=self.z, gh=gh, gc=gc, c_tm1=c_tm1,
                       gc_is_none=gc_is_none, gh_is_none=gh_is_none)

    if xp is np:
        _lstm_backward_cpu(**kernel_args)
        gz = self.z

        # Gradients with respect to the three matrix-multiplied inputs.
        gh_tm1 = np.dot(gz, self.V, out=self.h)
        gx = np.dot(gz, self.W, out=gx)
        gq = np.dot(gz, self.U, out=gq)

        # Gradients of the weight matrices.
        self.gW += gz.T.dot(x)
        self.gV += gz.T.dot(h_tm1)
        self.gU += gz.T.dot(q)
        if not self.nobias:
            batch_ones = xp.ones((1, n_batch), dtype=float32)
            self.gb += np.dot(batch_ones, gz)
    else:
        _lstm_backward_gpu(**kernel_args)
        gz = self.z

        gh_tm1 = cp.dot(gz, self.V, out=self.h)
        gx = cp.dot(gz, self.W, out=gx)
        gq = cp.dot(gz, self.U, out=gq)

        gpu.utils.dot_add(gz, x, C=self.gW, transa=True)
        gpu.utils.dot_add(gz, h_tm1, C=self.gV, transa=True)
        gpu.utils.dot_add(gz, q, C=self.gU, transa=True)
        if not self.nobias:
            batch_ones = xp.ones((1, n_batch), dtype=float32)
            gpu.utils.dot_add(batch_ones, gz, C=self.gb)

    return gx, gh_tm1, gc_tm1, gq
def forward(self, inputs):
    """Run one forward step of the LSTM layer with an extra input ``q``.

    Args:
        inputs: Tuple ``(x, h_tm1, c_tm1, q)``.

    Returns:
        tuple: ``(self.h, self.c)``.

    Side effects:
        Allocates fresh float32 buffers ``self.z`` (four times
        ``out_size`` columns), ``self.c`` and ``self.h``; they stay on the
        instance, and ``backward`` reads them.
    """
    xp = cuda.get_array_module(*inputs)
    x, h_tm1, c_tm1, q = inputs

    n_batch = x.shape[0]
    float32 = np.dtype('float32')
    state_shape = (n_batch, self.out_size)
    self.z = xp.empty((n_batch, self.out_size * 4), dtype=float32)
    self.c = xp.empty(state_shape, dtype=float32)
    self.h = xp.empty(state_shape, dtype=float32)

    # z = x W^T + h_tm1 V^T + q U^T (+ b), then the element-wise kernel.
    # The kernel receives c and h as buffers and presumably fills them.
    if xp is np:
        self.z = np.dot(x, self.W.T, out=self.z)
        self.z += np.dot(h_tm1, self.V.T)
        self.z += np.dot(q, self.U.T)
        if not self.nobias:
            self.z += self.b
        _lstm_forward_cpu(z=self.z, c_tm1=c_tm1, c=self.c, h=self.h,
                          out_size=self.out_size)
    else:
        self.z = cp.dot(x, self.W.T, out=self.z)
        gpu.utils.dot_add(A=h_tm1, B=self.V, C=self.z, transb=True)
        gpu.utils.dot_add(A=q, B=self.U, C=self.z, transb=True)
        if not self.nobias:
            gpu.utils.addVec2Mat(self.z, self.b)
        _lstm_forward_gpu(z=self.z, c_tm1=c_tm1, c=self.c, h=self.h,
                          out_size=self.out_size)

    return self.h, self.c
def forward(self, inputs):
    """Run one forward step of the LSTM layer.

    Args:
        inputs: Tuple ``(x, h_tm1, c_tm1)``.

    Returns:
        tuple: ``(h, c)`` as produced by ``F.LSTM().forward``.

    Side effects:
        Stores the pre-activation matrix on ``self.z`` and the LSTM
        function object on ``self.lstm_fun``; ``backward`` uses both.
    """
    xp = cuda.get_array_module(*inputs)
    x, h_tm1, c_tm1 = inputs

    n_batch = x.shape[0]
    pre_act = xp.empty((n_batch, self.out_size * 4),
                       dtype=np.dtype('float32'))
    use_bias = not self.nobias

    # pre_act = x W^T + h_tm1 V^T (+ b)
    if xp is np:
        pre_act = x.dot(self.W.T, out=pre_act)
        pre_act += h_tm1.dot(self.V.T)
        if use_bias:
            pre_act += self.b
    else:
        pre_act = cp.dot(x, self.W.T, out=pre_act)
        gpu.utils.dot_add(A=h_tm1, B=self.V, C=pre_act, transb=True)
        if use_bias:
            gpu.utils.addVec2Mat(pre_act, self.b)

    # A fresh function object per step; it is kept for the backward pass.
    self.lstm_fun = F.LSTM()
    c, h = self.lstm_fun.forward(inputs=(c_tm1, pre_act))
    self.z = pre_act
    return h, c
def backward(self, x, y, y_prev, gates, grad_y):
    """Back-propagate through one step of a three-gate recurrent cell.

    Args:
        x: Input of this step.
        y: Output of this step (not used by the computation).
        y_prev: Output of the previous step.
        gates: Triple ``(a0, a1, a2)`` of gate activations.
        grad_y: Gradient with respect to this step's output.

    Side effects:
        Accumulates into ``self.grad_w[i]`` and ``self.grad_v[i]`` for
        ``i`` in 0..2 and sets ``self.grad_x`` and ``self.grad_y_prev``.
    """
    a0, a1, a2 = gates

    # Pre-activation gradients; ``1 - a2**2`` and ``a * (1 - a)`` are the
    # tanh and sigmoid derivatives expressed through the activations.
    delta_a2 = grad_y * a0 * (1 - a2**2)
    delta_a0 = grad_y * (a2 - y_prev) * a0 * (1 - a0)
    s = np.dot(delta_a2, self.v[2].T)
    delta_a1 = s * y_prev * a1 * (1 - a1)

    deltas = (delta_a0, delta_a1, delta_a2)
    # Gate 2 sees the previous output scaled by a1; the others see it raw.
    recurrent_inputs = (y_prev, y_prev, a1 * y_prev)

    for idx, (delta, rec_in) in enumerate(zip(deltas, recurrent_inputs)):
        self.grad_w[idx] += np.dot(x.T, delta)
        self.grad_v[idx] += np.dot(rec_in.T, delta)

    grad_x = np.dot(delta_a0, self.w[0].T) + np.dot(delta_a1, self.w[1].T)
    self.grad_x = grad_x + np.dot(delta_a2, self.w[2].T)

    grad_prev = (np.dot(delta_a0, self.v[0].T)
                 + np.dot(delta_a1, self.v[1].T))
    self.grad_y_prev = grad_prev + a1 * s + grad_y * (1 - a0)
def lstsq(a, b, rcond='warn'):
    """Return the least-squares solution of ``a x = b``.

    The solution minimises ``|| b - a x ||^2`` and is obtained from the
    reduced singular value decomposition of ``a``.

    Args:
        a (cupy.ndarray): Coefficient matrix of shape ``(M, N)``.
        b (cupy.ndarray): Right-hand side of shape ``(M,)`` or ``(M, K)``.
        rcond (float or None): Relative cutoff. Singular values not larger
            than ``rcond * s.max()`` are treated as zero. ``None`` selects
            machine epsilon times ``max(M, N)``; a value ``<= 0`` or
            ``>= 1`` selects machine epsilon. The default ``'warn'``
            emits a :class:`FutureWarning` and then behaves like ``-1``.

    Returns:
        tuple: ``(x, residuals, rank, s)``. ``x`` has shape ``(N,)`` or
        ``(N, K)``. ``residuals`` is an empty array when ``M <= N`` or
        ``rank != N``; otherwise it is the result of ``_nrm2_last_axis``
        on the transposed residual, made at least 1-D. ``rank`` is the
        number of singular values above the cutoff and ``s`` holds the
        singular values of ``a``.

    Raises:
        LinAlgError: If ``b`` has more than two dimensions or its first
            dimension differs from ``M``.

    .. warning::
        This function calls one or more cuSOLVER routine(s) which may yield
        invalid results if input conditions are not met.
        To detect these invalid results, you can set the `linalg`
        configuration to a value that is not `ignore` in
        :func:`cupyx.errstate` or :func:`cupyx.seterr`.

    .. seealso:: :func:`numpy.linalg.lstsq`
    """
    if rcond == 'warn':
        warnings.warn(
            '`rcond` parameter will change to the default of '
            'machine precision times ``max(M, N)`` where M and N '
            'are the input matrix dimensions.\n'
            'To use the future default and silence this warning '
            'we advise to pass `rcond=None`, to keep using the old, '
            'explicitly pass `rcond=-1`.',
            FutureWarning)
        rcond = -1

    _util._assert_cupy_array(a, b)
    _util._assert_rank2(a)
    if b.ndim > 2:
        raise linalg.LinAlgError('{}-dimensional array given. Array must be at'
                                 ' most two-dimensional'.format(b.ndim))
    m, n = a.shape[-2:]
    if m != b.shape[0]:
        raise linalg.LinAlgError('Incompatible dimensions')

    u, s, vh = cupy.linalg.svd(a, full_matrices=False)

    if rcond is None:
        rcond = numpy.finfo(s.dtype).eps * max(m, n)
    elif rcond <= 0 or rcond >= 1:
        # some doc of gelss/gelsd says "rcond < 0", but it's not true!
        rcond = numpy.finfo(s.dtype).eps

    # Invert the singular values, zeroing those at or below the cutoff;
    # the number that survive is the numerical rank.
    threshold = rcond * s.max()
    inv_s = 1 / s
    negligible = s <= threshold
    inv_s[negligible] = 0
    rank = s.size - negligible.sum(dtype=numpy.int32)

    # x = vh.T.conj() @ diag(inv_s) @ u.T.conj() @ b, with the product
    # against u taken on b.T so that inv_s broadcasts along the last axis.
    projected = (cupy.dot(b.T, u.conj()) * inv_s).T
    x = cupy.dot(vh.T.conj(), projected)

    # Residuals are only reported for over-determined, full-rank systems.
    if m > n and rank == n:
        err = b - a.dot(x)
        resids = cupy.atleast_1d(_nrm2_last_axis(err.T))
    else:
        resids = cupy.empty((0, ), dtype=s.dtype)
    return x, resids, rank, s
for j in range(M - 1): x_train[i, j] = float(ss[j]) y_train[i, 0] = float(ss[M - 1]) x_train[i, M - 1] = 1 str = f.readline() ss = str.split() test_size = int(ss[0]) M = int(ss[1]) + 1 x_test = cp.ndarray(shape=(test_size, M), dtype=float) y_test = cp.ndarray(shape=(test_size, 1), dtype=float) for i in range(test_size): str = f.readline() ss = str.split() for j in range(M - 1): x_test[i, j] = float(ss[j]) y_test[i, 0] = float(ss[M - 1]) x_test[i, M - 1] = 1 print(M, data_size, test_size) #进行梯度下降 其中w初始值为随机初始化的结果 w = gradient_descent(x_train, y_train, random_initialization(M, 1)) cnt = 0 for i in range(test_size): if (g(cp.dot(x_test[i], w)) < 0.5): flag = 0 else: flag = 1 if (flag == y_test[i, 0]): cnt = cnt + 1 #输出准确率 print(cnt / test_size)
def gradient(self, x, target):
    """Apply one SGD step to the four-layer tanh/softmax network.

    Runs a forward pass, back-propagates the ``softmax(h4) - target``
    error and updates ``W_f1``..``W_f4`` and ``b1``..``b4`` in place with
    a fixed step size of 0.02.

    Relies on the module-level names ``batch_size``, ``softmax`` and
    ``tanh_grad``.

    Args:
        x: Input mini-batch.
        target: Target array, subtracted element-wise from the softmax
            output.
    """
    # Forward pass: keep pre-activations (z*) and activations (a*).
    z1 = cp.dot(x, self.W_f1) + self.b1
    a1 = cp.tanh(z1)
    z2 = cp.dot(a1, self.W_f2) + self.b2
    a2 = cp.tanh(z2)
    z3 = cp.dot(a2, self.W_f3) + self.b3
    a3 = cp.tanh(z3)
    z4 = cp.dot(a3, self.W_f4) + self.b4
    probs = softmax(z4)

    # Backward pass: error signals are computed with the weights as they
    # were during the forward pass; the update happens afterwards.
    err4 = (probs - target) / batch_size
    err3 = tanh_grad(z3) * cp.dot(err4, self.W_f4.T)
    err2 = tanh_grad(z2) * cp.dot(err3, self.W_f3.T)
    err1 = tanh_grad(z1) * cp.dot(err2, self.W_f2.T)

    # A ones vector dotted with an error matrix sums it over the batch.
    batch_ones = cp.ones(batch_size)
    grad_W4 = cp.dot(a3.T, err4)
    grad_b4 = cp.dot(batch_ones, err4)
    grad_W3 = cp.dot(a2.T, err3)
    grad_b3 = cp.dot(batch_ones, err3)
    grad_W2 = cp.dot(a1.T, err2)
    grad_b2 = cp.dot(batch_ones, err2)
    grad_W1 = cp.dot(x.T, err1)
    grad_b1 = cp.dot(batch_ones, err1)

    step = 0.02
    self.W_f1 -= step * grad_W1
    self.W_f2 -= step * grad_W2
    self.W_f3 -= step * grad_W3
    self.W_f4 -= step * grad_W4
    self.b1 -= step * grad_b1
    self.b2 -= step * grad_b2
    self.b3 -= step * grad_b3
    self.b4 -= step * grad_b4
def least_square(X, Y):
    """Solve ordinary least squares through the normal equations.

    Args:
        X: Design matrix.
        Y: Targets.

    Returns:
        ``inv(X.T @ X) @ X.T @ Y``, evaluated in that grouping.
    """
    gram = cp.dot(X.T, X)
    pseudo_inverse = cp.dot(cp.linalg.inv(gram), X.T)
    return cp.dot(pseudo_inverse, Y)
def loss(x, y, w):
    """Return half the mean squared error of the linear model ``x @ w``.

    The sum of squared residuals is divided by ``2 * data_size``, where
    ``data_size`` is a module-level name.

    Args:
        x: Design matrix.
        y: Targets; the result is indexed with ``[0, 0]``, so the residual
            must be two-dimensional.
        w: Weights.

    Returns:
        The scalar element ``[0, 0]`` of the scaled residual product.
    """
    residual = cp.add(cp.dot(x, w), -1 * y)
    scaled = 1.0 / (2 * data_size) * cp.dot(residual.T, residual)
    return scaled[0, 0]
cuda.close() raise @jit(nopython=True, fastmath=True, nogil=True, parallel=True) def jit_dot(a, b): return np.dot(a, b) start = time.clock() C1 = np.dot(A, B) print('\nnumpy compute time used: %f' % (time.clock() - start)) del C5 start = time.clock() C5 = cp.dot(A2, B2) print('\ncupy compute time used: %f' % (time.clock() - start)) start = time.clock() C4 = jit_dot(A, B) print('\njit compute time used: %f' % (time.clock() - start)) start = time.clock() C2 = cuda_dot1(A, B) print('\ncuda jit 1 compute time used: %f' % (time.clock() - start)) start = time.clock() C2 = cuda_dot2(A, B) print('\ncuda jit 2 compute time used: %f' % (time.clock() - start)) start = time.clock()
def cgs(A, b, x0=None, tol=1e-5, maxiter=None, M=None, callback=None,
        atol=None):
    """Solve ``Ax = b`` with the Conjugate Gradient Squared iteration.

    Args:
        A (ndarray, spmatrix or LinearOperator): The real or complex
            matrix of the linear system with shape ``(n, n)``.
        b (cupy.ndarray): Right hand side of the linear system with shape
            ``(n,)`` or ``(n, 1)``.
        x0 (cupy.ndarray): Starting guess for the solution.
        tol (float): Relative tolerance; the effective threshold is at
            least ``tol * norm(b)``.
        maxiter (int): Maximum number of iterations. Defaults to ``5 * n``.
        M (ndarray, spmatrix or LinearOperator): Preconditioner for ``A``.
            The preconditioner should approximate the inverse of ``A``.
            ``M`` must be :class:`cupy.ndarray`,
            :class:`cupyx.scipy.sparse.spmatrix` or
            :class:`cupyx.scipy.sparse.linalg.LinearOperator`.
        callback (function): User-specified function to call after each
            iteration. It is called as ``callback(xk)``, where ``xk`` is
            the current solution vector.
        atol (float): Absolute tolerance. When given, the threshold is
            ``max(atol, tol * norm(b))``.

    Returns:
        tuple:
            It returns ``x`` (cupy.ndarray) and ``info`` (int) where ``x``
            is the solution estimate and ``info`` is ``0`` on convergence
            or the iteration count when ``maxiter`` was reached without
            the residual norm dropping below the threshold.

    .. seealso:: :func:`scipy.sparse.linalg.cgs`
    """
    A, M, x, b = _make_system(A, M, x0, b)
    apply_A = A.matvec
    apply_M = M.matvec

    n = A.shape[0]
    if n == 0:
        return cupy.empty_like(b), 0
    b_norm = cupy.linalg.norm(b)
    if b_norm == 0:
        return b, 0

    relative_threshold = tol * float(b_norm)
    if atol is None:
        atol = relative_threshold
    else:
        atol = max(float(atol), relative_threshold)
    if maxiter is None:
        maxiter = n * 5

    # r0 is the fixed shadow residual; r is updated in place, so it needs
    # its own copy. u and p start out equal to r0.
    r0 = b - apply_A(x)
    rho = cupy.dot(r0, r0)
    r = r0.copy()
    u = r0
    p = r0.copy()

    iters = 0
    while True:
        v = apply_A(apply_M(p))
        alpha = rho / cupy.dot(r0, v)
        q = u - alpha * v
        z = apply_M(u + q)
        x += alpha * z
        r -= alpha * apply_A(z)

        # Update residual norm and check convergence
        r_norm = cupy.linalg.norm(r)
        iters += 1
        if callback is not None:
            callback(x)
        if r_norm <= atol or iters >= maxiter:
            break

        rho_next = cupy.dot(r0, r)
        beta = rho_next / rho
        rho = rho_next
        u = r + beta * q
        # In-place form of p = u + beta * (q + beta * p).
        p *= beta
        p += q
        p *= beta
        p += u

    hit_limit = iters == maxiter and not (r_norm < atol)
    info = iters if hit_limit else 0
    return x, info
def detrend(data, axis=-1, type="linear", bp=0, overwrite_data=False):
    """
    Remove a linear or constant trend along an axis of `data`.

    Parameters
    ----------
    data : array_like
        The input data.
    axis : int, optional
        The axis along which to detrend the data. By default this is the
        last axis (-1).
    type : {'linear', 'constant'}, optional
        The type of detrending. If ``type == 'linear'`` (default),
        the result of a linear least-squares fit to `data` is subtracted
        from `data`.
        If ``type == 'constant'``, only the mean of `data` is subtracted.
        The abbreviations ``'l'`` and ``'c'`` are accepted as well.
    bp : array_like of ints, optional
        A sequence of break points. If given, an individual linear fit is
        performed for each part of `data` between two break points.
        Break points are specified as indices into `data`. Only used for
        linear detrending.
    overwrite_data : bool, optional
        If True, skip the defensive copy of the restructured data before
        the fit is subtracted. Default is False. Only consulted for linear
        detrending.

    Returns
    -------
    ret : ndarray
        The detrended input data.

    Raises
    ------
    ValueError
        If `type` is not one of the accepted names, or if any break point
        is greater than the length of `data` along `axis`.

    Examples
    --------
    >>> import cusignal
    >>> import cupy as cp
    >>> randgen = cp.random.RandomState(9)
    >>> npoints = 1000
    >>> noise = randgen.randn(npoints)
    >>> x = 3 + 2*cp.linspace(0, 1, npoints) + noise
    >>> (cusignal.detrend(x) - noise).max() < 0.01
    True

    """
    if type not in ["linear", "l", "constant", "c"]:
        raise ValueError("Trend type must be 'linear' or 'constant'.")
    data = asarray(data)
    # Type code used for the design matrix and for upcasting the working
    # copy; anything that is not a float/complex code falls back to 'd'.
    dtype = data.dtype.char
    if dtype not in "dfDF":
        dtype = "d"
    if type in ["constant", "c"]:
        # Constant detrend: subtract the mean taken along `axis`.
        ret = data - expand_dims(mean(data, axis), axis)
        return ret
    else:
        dshape = data.shape
        N = dshape[axis]
        # Always include both ends (0 and N) among the break points.
        bp = sort(unique(r_[0, bp, N]))
        if cp.any(bp > N):
            # NOTE(review): the backslash continuation sits inside the
            # string literal, so the indentation of the next line is part
            # of the message -- confirm whether that is intended.
            raise ValueError("Breakpoints must be less than length of \
                data along given axis.")
        Nreg = len(bp) - 1
        # Restructure data so that axis is along first dimension and
        #  all other dimensions are collapsed into second dimension
        rnk = len(dshape)
        if axis < 0:
            axis = axis + rnk
        newdims = np.r_[axis, 0:axis, axis + 1:rnk]
        newdata = reshape(transpose(data, tuple(newdims)),
                          (N, _prod(dshape) // N))
        if not overwrite_data:
            newdata = newdata.copy()  # make sure we have a copy
        if newdata.dtype.char not in "dfDF":
            newdata = newdata.astype(dtype)
        # Find leastsq fit and remove it for each piece
        for m in range(Nreg):
            Npts = int(bp[m + 1] - bp[m])
            # Design matrix: column 0 is a ramp (1/Npts .. 1), column 1
            # stays at one and acts as the intercept term.
            A = ones((Npts, 2), dtype)
            A[:, 0] = arange(1, Npts + 1) * 1.0 / Npts
            sl = slice(bp[m], bp[m + 1])
            coef, resids, rank, s = linalg.lstsq(A, newdata[sl])
            newdata[sl] = newdata[sl] - dot(A, coef)
        # Put data back in original shape.
        tdshape = take(asarray(dshape), asarray(newdims), 0)
        # cp.asnumpy converts the index arrays before they are turned into
        # plain tuples for reshape/transpose.
        ret = reshape(newdata, tuple(cp.asnumpy(tdshape)))
        vals = list(range(1, rnk))
        # Inverse of the `newdims` permutation: move axis 0 back to `axis`.
        olddims = vals[:axis] + [0] + vals[axis:]
        ret = transpose(ret, tuple(cp.asnumpy(olddims)))
        return ret
def training(self):
    """Train the network with per-sample stochastic gradient descent.

    Makes a single pass over ``self.x_train`` / ``self.y_train`` (the
    original author's note mentions 60 000 images), stores the resulting
    weights on ``self.weight_list`` and saves them to
    ``./weights_cupy.npy``.

    Side effects:
        Sets ``self.activations_prime`` and ``self.weight_list``, prints
        progress information and writes the weight file.
    """
    print("Training...")

    # Identity matrix; column ``label`` is the one-hot target for a label.
    one_hot = cp.eye(27, 27)

    # Look up the derivative of every activation in the module globals by
    # the naming convention ``<activation name>_prime``.
    self.activations_prime = [
        globals()[activation.__name__ + '_prime']
        for activation in self.activations
    ]

    n_weights = len(self.layers) - 1
    layer_outputs = {}

    # Random initial weights scaled by 1 / sqrt(fan_in) so that they do
    # not start at zero (the original note credits the Xavier Glorot / He
    # initialisation results and gives the shapes 100x1600, 100x100 and
    # 27x100).
    weight_list = [
        cp.random.randn(self.layers[k + 1], self.layers[k])
        / cp.sqrt(self.layers[k])
        for k in range(n_weights)
    ]

    for image_row, label in zip(self.x_train, self.y_train):
        # Turn the row into a column vector.
        column = cp.array(image_row, ndmin=2).T

        # The label must be a plain int: indexing with the raw value
        # raised "IndexError: arrays used as indices must be of integer or
        # boolean type (actual: <class 'numpy.object_'>)".
        label = int(label)

        # Forward propagation, remembering the output of every layer.
        layer_outputs[0] = column
        for k in range(n_weights):
            pre_activation = cp.dot(weight_list[k], column)
            # The non-linearity keeps the network from collapsing into a
            # linear function of its input.
            column = self.activations[k](pre_activation)
            layer_outputs[k + 1] = column

        # Back-propagation: start from the gap between the actual and the
        # expected output, then walk the layers in reverse, correcting
        # each weight matrix in proportion to its share of the error.
        delta_a = column - one_hot[:, [label]]
        for k in reversed(range(n_weights)):
            delta_z = delta_a * self.activations_prime[k](layer_outputs[k + 1])
            delta_w = cp.dot(delta_z, layer_outputs[k].T)
            # Propagate with the weights before they are updated.
            delta_a = cp.dot(weight_list[k].T, delta_z)
            weight_list[k] -= self.learningrate * delta_w

    self.weight_list = weight_list

    # Persist the weights to a file.
    print("type(weight_list :)", type(weight_list),
          "\nlen(weight_list) =", len(weight_list),
          "\n 0", len(weight_list[0]), type(weight_list[0]),
          "\n 1", len(weight_list[1]), type(weight_list[1]),
          "\n 2", len(weight_list[2]), type(weight_list[2]))
    cp.save('./weights_cupy.npy', weight_list, allow_pickle=True)
    print('weights_cupy.npy enregistré')
def _cuda_bccg(f: typing.Callable, b: typing.Sequence, tol: float, max_it: int, x0: typing.Sequence,
               min_pressure: float = 0.0, max_pressure: typing.Union[float, typing.Sequence] = cp.inf,
               k_inn=1) -> typing.Tuple[cp.ndarray, bool]:
    """
    The Bound-Constrained Conjugate Gradient Method for Non-negative Matrices
    CUDA implementation

    Parameters
    ----------
    f: Callable
        A function equivalent to multiplication by a non negative n by n matrix must work with cupy arrays.
        Typically this function will be generated by slippy.contact.plan_convolve, this will guarantee compatibility
        with different versions of this function (FFTW and CUDA).
    b: array
        1 by n array of displacements
    tol: float
        The tolerance on the result; the iteration stops when the relative RMS update of x falls below it on an
        outer iteration in which no bound status changed
    max_it: int
        The maximum number of iterations used
    x0: array
        An initial guess of the solution, it is clipped to [min_pressure, max_pressure] before use
    min_pressure: float, optional (0)
        The minimum allowable pressure at each node, defaults to 0
    max_pressure: float, optional (inf)
        The maximum allowable pressure at each node, defaults to inf, for purely elastic contacts. May also be a
        sequence giving one limit per node
    k_inn: int
        The number of inner iterations between outer iterations; the projection onto the bounds and the gradient
        check only run on outer iterations (also triggered early when the update falls below tol)

    Returns
    -------
    x: cp.array
        The solution to the system f(x)-b = 0 with the constraints applied.
    failed: bool
        True if the loop stopped because no free nodes remained or max_it was exceeded

    Notes
    -----
    This function uses the method described in the reference below, with some modification.
    Firstly, this method allows both a minimum and maximum force to be set simulating quasi plastic regimes. The
    code has also been optimised in several places and importantly this version has also been modified to run
    on a GPU through cupy.

    If you do not have a CUDA compatible GPU, slippy can be imported while falling back to the fftw version
    by first importing slippy then patching the CUDA variable to False:

    >>> import slippy
    >>> slippy.CUDA = False
    >>> import slippy.contact
    >>> ...

    Though this should happen automatically if you don't have cupy installed.

    References
    ----------
    Vollebregt, E.A.H. The Bound-Constrained Conjugate Gradient Method for Non-negative Matrices. J Optim
    Theory Appl 162, 931–953 (2014). https://doi.org/10.1007/s10957-013-0499-x

    Examples
    --------

    """
    # if you use np or most built ins in this function at all it will slow it down a lot!
    # A scalar upper limit converts with float(); anything else is treated
    # as a per-node array of limits.
    try:
        float(max_pressure)
        max_is_float = True
    except TypeError:
        max_is_float = False
        max_pressure = cp.array(max_pressure)

    # initialize
    b = cp.asarray(b)
    x = cp.clip(cp.asarray(x0), min_pressure, max_pressure)
    g = f(x) - b
    # Nodes sitting on a bound with the gradient pushing further outwards
    # start in the bound sets.
    # NOTE(review): the lower bound is tested against 0 here and in the
    # projection below, while min_pressure is only used by the clip above
    # -- confirm this is intended for non-zero min_pressure.
    msk_bnd_0 = cp.logical_and(x <= 0, g >= 0)
    msk_bnd_max = cp.logical_and(x >= max_pressure, g <= 0)
    n_bound = cp.sum(msk_bnd_0) + cp.sum(msk_bnd_max)
    n = b.size
    n_free = n - n_bound
    small = 1e-14
    it = 0
    it_inn = 0
    rho_prev = cp.nan
    rho = 0.0
    r, p, r_prev = 0, 0, 0
    failed = False

    while True:
        it += 1
        it_inn += 1
        x_prev = x
        if it > 1:
            r_prev = r
            rho_prev = rho
        # Residual is the negative gradient, zeroed on the bound nodes.
        r = -g
        r[msk_bnd_0] = 0
        r[msk_bnd_max] = 0
        rho = cp.dot(r, r)
        if it > 1:
            # Search-direction update, with beta clamped at zero.
            beta_pr = (rho - cp.dot(r, r_prev)) / rho_prev
            p = r + max([beta_pr, 0])*p
        else:
            # p is bound to the same array as r on the first iteration.
            p = r
        p[msk_bnd_0] = 0
        p[msk_bnd_max] = 0
        # compute tildex optimisation ignoring the bounds
        q = f(p)
        if it_inn < k_inn:
            # NOTE(review): bound entries of q are set to NaN on inner
            # iterations and q then feeds cp.dot(p, q) -- confirm intent.
            q[msk_bnd_0] = cp.nan
            q[msk_bnd_max] = cp.nan
        alpha = cp.dot(r, p) / cp.dot(p, q)
        x = x + alpha * p

        # Relative RMS size of this update, used for the stopping test.
        rms_xk = cp.linalg.norm(x) / cp.sqrt(n_free)
        rms_upd = cp.linalg.norm(x - x_prev) / cp.sqrt(n_free)
        upd = rms_upd / rms_xk

        # project onto feasible domain
        changed = False
        outer_it = it_inn >= k_inn or upd < tol

        if outer_it:
            msk_prj_0 = x < -small
            if cp.any(msk_prj_0):
                x[msk_prj_0] = 0
                msk_bnd_0[msk_prj_0] = True
                changed = True
            msk_prj_max = x >= max_pressure * (1 + small)
            if cp.any(msk_prj_max):
                if max_is_float:
                    x[msk_prj_max] = max_pressure
                else:
                    x[msk_prj_max] = max_pressure[msk_prj_max]
                msk_bnd_max[msk_prj_max] = True
                changed = True

        # Recompute the gradient from scratch after a projection (or on
        # outer iterations when k_inn > 1); otherwise update it cheaply.
        if changed or (outer_it and k_inn > 1):
            g = f(x) - b
        else:
            g = g + alpha * q

        check_grad = outer_it

        if check_grad:
            # Release bound nodes whose gradient now points back inside.
            msk_rel = cp.logical_or(cp.logical_and(msk_bnd_0, g < -small), cp.logical_and(msk_bnd_max, g > small))
            if cp.any(msk_rel):
                msk_bnd_0[msk_rel] = False
                msk_bnd_max[msk_rel] = False
                changed = True

        if changed:
            n_free = n - cp.sum(msk_bnd_0) - cp.sum(msk_bnd_max)

        if not n_free:
            print("No free nodes")
            warnings.warn("No free nodes for BCCG iterations")
            failed = True
            break

        if outer_it:
            it_inn = 0

        if it > max_it:
            print("Max iterations")
            warnings.warn("Bound constrained conjugate gradient iterations failed to converge")
            failed = True
            break

        # Converged: an outer iteration with no bound changes and a small
        # enough relative update.
        if outer_it and (not changed) and upd < tol:
            break

    return x, bool(failed)
def backward(self, t):
    """Compute the gradients of the output layer for targets ``t``.

    Uses ``self.x`` and ``self.y`` as stored by ``forward`` and writes
    ``self.grad_w``, ``self.grad_b`` and ``self.grad_x``.

    Args:
        t: Target values, subtracted element-wise from ``self.y``.
    """
    error = self.y - t
    layer_input_t = self.x.T
    weights_t = self.w.T

    self.grad_w = np.dot(layer_input_t, error)
    # The bias gradient sums the error over axis 0.
    self.grad_b = np.sum(error, axis=0)
    self.grad_x = np.dot(error, weights_t)
def forward(self, x):
    """Compute the layer output ``x @ w + b`` and cache input and output.

    The activation is the identity function, so the affine result is
    stored unchanged.

    Args:
        x: Input batch; kept on ``self.x``. The result goes to ``self.y``.
    """
    self.x = x
    # Identity activation: the output is the affine transform itself.
    self.y = np.dot(x, self.w) + self.b
def backward(self, dout):
    """Back-propagate through the matrix product ``out = x @ W``.

    Args:
        dout: Gradient with respect to the output.

    Returns:
        Gradient with respect to the input, ``dout @ W.T``.

    Side effects:
        Writes the weight gradient ``x.T @ dout`` into the existing
        ``self.grads[0]`` buffer.
    """
    W, = self.params
    dx = np.dot(dout, W.T)
    # Fix: use the input stored on the instance; the bare name ``x`` is
    # not defined in this scope.
    dW = np.dot(self.x.T, dout)
    # ``[...]`` copies into the existing array, so other holders of the
    # gradient buffer see the new values.
    self.grads[0][...] = dW
    return dx
def gradient(self, x, target, epoch):
    """Apply one regularised SGD step to the five-layer network.

    Runs a forward pass through four tanh layers and a softmax output,
    back-propagates, stores every gradient on the instance as
    ``delta_Wf*`` / ``delta_b*`` and updates all weights and biases with a
    fixed step size of 0.02. A ``0.01 * parameter`` term is added to every
    gradient, biases included.

    Relies on the module-level names ``batch_size``, ``softmax`` and
    ``tanh_grad``.

    Args:
        x: Input mini-batch.
        target: Target array, subtracted element-wise from the softmax
            output.
        epoch: Not used by the current fixed-step update.
    """
    reg = 0.01

    # Forward pass: keep pre-activations (z*) and activations (a*).
    z1 = cp.dot(x, self.W_f1) + self.b1
    a1 = cp.tanh(z1)
    z2 = cp.dot(a1, self.W_f2) + self.b2
    a2 = cp.tanh(z2)
    z3 = cp.dot(a2, self.W_f3) + self.b3
    a3 = cp.tanh(z3)
    z4 = cp.dot(a3, self.W_f4) + self.b4
    a4 = cp.tanh(z4)
    z5 = cp.dot(a4, self.W_f5) + self.b5
    probs = softmax(z5)

    # A ones vector dotted with an error matrix sums it over the batch.
    batch_ones = cp.ones(batch_size)

    # Backward pass, output layer first.
    err5 = (probs - target) / batch_size
    self.delta_Wf5 = cp.dot(a4.T, err5) + reg * self.W_f5
    self.delta_b5 = cp.dot(batch_ones, err5) + reg * self.b5

    err4 = tanh_grad(z4) * cp.dot(err5, self.W_f5.T)
    self.delta_Wf4 = cp.dot(a3.T, err4) + reg * self.W_f4
    self.delta_b4 = cp.dot(batch_ones, err4) + reg * self.b4

    err3 = tanh_grad(z3) * cp.dot(err4, self.W_f4.T)
    self.delta_Wf3 = cp.dot(a2.T, err3) + reg * self.W_f3
    self.delta_b3 = cp.dot(batch_ones, err3) + reg * self.b3

    err2 = tanh_grad(z2) * cp.dot(err3, self.W_f3.T)
    self.delta_Wf2 = cp.dot(a1.T, err2) + reg * self.W_f2
    self.delta_b2 = cp.dot(batch_ones, err2) + reg * self.b2

    err1 = tanh_grad(z1) * cp.dot(err2, self.W_f2.T)
    self.delta_Wf1 = cp.dot(x.T, err1) + reg * self.W_f1
    self.delta_b1 = cp.dot(batch_ones, err1) + reg * self.b1

    # Parameter update; all gradients above used the pre-update weights.
    step = 0.02
    self.W_f1 -= step * self.delta_Wf1
    self.W_f2 -= step * self.delta_Wf2
    self.W_f3 -= step * self.delta_Wf3
    self.W_f4 -= step * self.delta_Wf4
    self.W_f5 -= step * self.delta_Wf5
    self.b1 -= step * self.delta_b1
    self.b2 -= step * self.delta_b2
    self.b3 -= step * self.delta_b3
    self.b4 -= step * self.delta_b4
    self.b5 -= step * self.delta_b5
def forward(self, x):
    """Return ``x @ W`` and remember ``x`` for the backward pass.

    Args:
        x: Input array.

    Returns:
        The product of ``x`` with the single weight held in
        ``self.params``.
    """
    (weight,) = self.params
    product = np.dot(x, weight)
    self.x = x
    return product
def feedback_alignment(self, x, target, epoch, flag):
    """Apply one feedback-alignment update to the five-layer network.

    The error is propagated backwards through the matrices ``self.B5`` ..
    ``self.B2`` instead of the transposed forward weights. When ``flag``
    is true, the ordinary back-propagation weight gradients of layers 4
    to 1 are computed as well and ``self.angle`` is called on each pair,
    with the results stored on ``self.angle_W4`` .. ``self.angle_W1``.

    Relies on the module-level names ``batch_size``, ``softmax`` and
    ``tanh_grad``.

    Args:
        x: Input mini-batch.
        target: Target array, subtracted element-wise from the softmax
            output.
        epoch: Passed to ``self.learning_rate`` to obtain the step size.
        flag: Whether to compute the comparison with back-propagation.
    """
    # Forward pass: keep pre-activations (z*) and activations (a*).
    z1 = cp.dot(x, self.W_f1) + self.b1
    a1 = cp.tanh(z1)
    z2 = cp.dot(a1, self.W_f2) + self.b2
    a2 = cp.tanh(z2)
    z3 = cp.dot(a2, self.W_f3) + self.b3
    a3 = cp.tanh(z3)
    z4 = cp.dot(a3, self.W_f4) + self.b4
    a4 = cp.tanh(z4)
    z5 = cp.dot(a4, self.W_f5) + self.b5
    probs = softmax(z5)

    # A ones vector dotted with an error matrix sums it over the batch.
    batch_ones = cp.ones(batch_size)

    # Feedback-alignment backward pass through the B matrices.
    err5 = (probs - target) / batch_size
    grad_W5 = cp.dot(a4.T, err5)
    grad_b5 = cp.dot(batch_ones, err5)

    err4 = tanh_grad(z4) * cp.dot(err5, self.B5)
    grad_W4 = cp.dot(a3.T, err4)
    grad_b4 = cp.dot(batch_ones, err4)

    err3 = tanh_grad(z3) * cp.dot(err4, self.B4)
    grad_W3 = cp.dot(a2.T, err3)
    grad_b3 = cp.dot(batch_ones, err3)

    err2 = tanh_grad(z2) * cp.dot(err3, self.B3)
    grad_W2 = cp.dot(a1.T, err2)
    grad_b2 = cp.dot(batch_ones, err2)

    err1 = tanh_grad(z1) * cp.dot(err2, self.B2)
    grad_W1 = cp.dot(x.T, err1)
    grad_b1 = cp.dot(batch_ones, err1)

    if flag:
        # Reference gradients from true back-propagation (transposed
        # forward weights), compared layer by layer via self.angle.
        bp_err = (probs - target) / batch_size

        bp_err = tanh_grad(z4) * cp.dot(bp_err, self.W_f5.T)
        self.angle_W4 = self.angle(grad_W4, cp.dot(a3.T, bp_err))

        bp_err = tanh_grad(z3) * cp.dot(bp_err, self.W_f4.T)
        self.angle_W3 = self.angle(grad_W3, cp.dot(a2.T, bp_err))

        bp_err = tanh_grad(z2) * cp.dot(bp_err, self.W_f3.T)
        self.angle_W2 = self.angle(grad_W2, cp.dot(a1.T, bp_err))

        bp_err = tanh_grad(z1) * cp.dot(bp_err, self.W_f2.T)
        self.angle_W1 = self.angle(grad_W1, cp.dot(x.T, bp_err))

    step = self.learning_rate(epoch)
    self.W_f1 -= step * grad_W1
    self.W_f2 -= step * grad_W2
    self.W_f3 -= step * grad_W3
    self.W_f4 -= step * grad_W4
    self.W_f5 -= step * grad_W5
    self.b1 -= step * grad_b1
    self.b2 -= step * grad_b2
    self.b3 -= step * grad_b3
    self.b4 -= step * grad_b4
    self.b5 -= step * grad_b5
def get_w_cp(x, t):
    """Fit linear-regression weights with the normal equations.

    Computes ``w = inv(x.T @ x) @ x.T @ t`` using the ``cp`` array module.

    Args:
        x: Design matrix with one sample per row.
        t: Targets, one entry (or row) per sample.

    Returns:
        The weight vector (or matrix) ``w``.
    """
    xx = cp.dot(x.T, x)
    xx_inv = cp.linalg.inv(xx)
    # Fix: the right-hand side of the normal equations is x^T t. The
    # previous ``x > t`` was an element-wise comparison, not a transpose.
    xt = cp.dot(x.T, t)
    w = cp.dot(xx_inv, xt)
    return w
def predict(self, x):
    """Return the softmax output of the two-layer network for ``x``.

    The hidden layer is ``relu(x @ W_f1)`` and the output layer is
    ``softmax(hidden @ W_f2)``; no bias terms are applied. ``relu`` and
    ``softmax`` are module-level helpers.

    Args:
        x: Input batch.

    Returns:
        The result of ``softmax`` on the output-layer pre-activations.
    """
    hidden = relu(cp.dot(x, self.W_f1))
    logits = cp.dot(hidden, self.W_f2)
    return softmax(logits)
def least_square_regular(X, Y):
    """Solve regularised least squares through the normal equations.

    Evaluates ``inv(X.T @ X + lambd * I) @ X.T @ Y`` where ``lambd`` and
    the identity size ``M`` are module-level names.

    Args:
        X: Design matrix.
        Y: Targets.

    Returns:
        The weights of the regularised fit.
    """
    penalty = lambd * cp.eye(M, k=0)
    regularised_gram = cp.add(cp.dot(X.T, X), penalty)
    projector = cp.dot(cp.linalg.inv(regularised_gram), X.T)
    return cp.dot(projector, Y)
def backward(self, dy):
    """Back-propagate through the dense layer.

    Stores the bias gradient under ``self.grads['db']`` and the weight
    gradient under ``self.grads['dW']``, using the input cached in
    ``self.cache['x']``.

    Args:
        dy: Gradient with respect to the layer output.

    Returns:
        Gradient with respect to the layer input, ``dy @ W.T``.
    """
    cached_input = self.cache['x']
    weight = self.parameters['W']

    # The bias gradient sums the upstream gradient over axis 0.
    self.grads['db'] = np.sum(dy, axis=0)
    self.grads['dW'] = np.dot(cached_input.T, dy)
    return np.dot(dy, weight.T)
def gradient_function(x, y, w):
    """Return the gradient of the regularised squared-error objective.

    Evaluates ``(x.T @ x @ w - x.T @ y + 0.001 * w) / M`` where ``M`` is a
    module-level name.

    Args:
        x: Design matrix.
        y: Targets.
        w: Current weights.

    Returns:
        The gradient with respect to ``w``.
    """
    scale = 1.0 / M
    gram_term = cp.dot(cp.dot(x.T, x), w)
    target_term = -1 * cp.dot(x.T, y)
    data_gradient = cp.add(gram_term, target_term)
    return scale * (cp.add(data_gradient, 0.001 * w))
def lstsq(a, b, rcond=1e-15):
    """Return the least-squares solution to a linear matrix equation.

    Solves the equation `a x = b` by computing a vector `x` that
    minimizes the Euclidean 2-norm `|| b - a x ||^2`.  The equation may
    be under-, well-, or over- determined (i.e., the number of
    linearly independent rows of `a` can be less than, equal to, or
    greater than its number of linearly independent columns).  If `a`
    is square and of full rank, then `x` (but for round-off error) is
    the "exact" solution of the equation.

    Args:
        a (cupy.ndarray): "Coefficient" matrix with dimension ``(M, N)``
        b (cupy.ndarray): "Dependent variable" values with dimension ``(M,)``
            or ``(M, K)``
        rcond (float): Relative cutoff for small singular values. Singular
            values not larger than ``rcond`` times the largest singular
            value of ``a`` are treated as zero.

    Returns:
        tuple:
            A tuple of ``(x, residuals, rank, s)``. Note ``x`` is the
            least-squares solution with shape ``(N,)`` or ``(N, K)`` depending
            if ``b`` was two-dimensional. ``residuals`` holds the squared
            Euclidean 2-norm of each column of ``b - a x``; it is an empty
            array if the rank of ``a`` is < N or M <= N, has shape ``(1,)``
            if ``b`` is 1-dimensional, and shape ``(K,)`` otherwise. It has
            the dtype of the singular values. The ``rank`` of matrix ``a``
            is the number of singular values above the cutoff. The
            singular values of ``a`` are ``s``.

    Raises:
        LinAlgError: If ``b`` has more than two dimensions or its first
            dimension differs from ``M``.

    .. warning::
        This function calls one or more cuSOLVER routine(s) which may yield
        invalid results if input conditions are not met.
        To detect these invalid results, you can set the `linalg`
        configuration to a value that is not `ignore` in
        :func:`cupyx.errstate` or :func:`cupyx.seterr`.

    .. seealso:: :func:`numpy.linalg.lstsq`
    """
    util._assert_cupy_array(a, b)
    util._assert_rank2(a)
    if b.ndim > 2:
        raise linalg.LinAlgError('{}-dimensional array given. Array must be at'
                                 ' most two-dimensional'.format(b.ndim))
    m, n = a.shape[-2:]
    m2 = b.shape[0]
    if m != m2:
        raise linalg.LinAlgError('Incompatible dimensions')

    u, s, vt = cupy.linalg.svd(a, full_matrices=False)
    # number of singular values and matrix rank
    cutoff = rcond * s.max()
    s1 = 1 / s
    sing_vals = s <= cutoff
    s1[sing_vals] = 0
    rank = s.size - sing_vals.sum()

    if b.ndim == 2:
        s1 = cupy.repeat(s1.reshape(-1, 1), b.shape[1], axis=1)
    # Solve the least-squares solution:
    #   x = V @ diag(s1) @ U^H @ b   with   a = U @ diag(s) @ V^H.
    # Fix: the transposes must be conjugated, otherwise complex inputs
    # give a wrong solution. For real inputs conj() changes nothing.
    z = core.dot(u.transpose().conj(), b) * s1
    x = core.dot(vt.transpose().conj(), z)
    # Calculate squared Euclidean 2-norm for each column in b - a*x
    if rank != n or m <= n:
        # s.dtype keeps the empty and non-empty results on the same real
        # dtype (identical to a.dtype for real floating-point input).
        resids = cupy.array([], dtype=s.dtype)
    else:
        e = b - core.dot(a, x)
        # Fix: |e|^2 rather than e^2, so that complex residuals yield a
        # real squared norm. reshape(-1) turns the 0-d sum of the 1-D case
        # into a (1,) array and leaves the (K,) result untouched.
        resids = cupy.sum(cupy.square(cupy.abs(e)), axis=0).reshape(-1)
    return x, resids, rank, s
def loss(W, X, Y):
    """Return the cross-entropy loss of the model ``g(W.T @ X)``.

    ``g`` is a module-level function applied to the linear scores
    (presumably the logistic sigmoid -- confirm against its definition).

    Args:
        W: Weights.
        X: Inputs, multiplied from the left by ``W.T``.
        Y: Binary targets.

    Returns:
        ``-(Y.T @ log(y_hat) + (1 - Y).T @ log(1 - y_hat))``.
    """
    # Fix: arrays expose the transpose as ``.T``; ``.t`` does not exist on
    # an ndarray and raised AttributeError.
    y_hat = g(cp.dot(W.T, X))
    positive_term = cp.dot(Y.T, cp.log(y_hat))
    negative_term = cp.dot((1 - Y).T, cp.log(1 - y_hat))
    return -(positive_term + negative_term)
def svd_wrapper(matrix, mode, ncomp, debug, verbose, usv=False,
                random_state=None, to_numpy=True):
    """ Wrapper for different SVD libraries (CPU and GPU).

    Parameters
    ----------
    matrix : array_like, 2d
        2d input matrix.
    mode : {'lapack', 'arpack', 'eigen', 'randsvd', 'cupy', 'eigencupy',
            'randcupy', 'pytorch', 'eigenpytorch', 'randpytorch'}, str optional
        Switch for the SVD method/library to be used.

        ``lapack`` uses the LAPACK linear algebra library through Numpy
        and it is the most conventional way of computing the SVD
        (deterministic result computed on CPU).

        ``arpack`` uses the ARPACK Fortran libraries accessible through
        Scipy (computation on CPU).

        ``eigen`` computes the singular vectors through the
        eigendecomposition of the covariance M.M' (computation on CPU).

        ``randsvd`` uses the randomized_svd algorithm implemented in
        Sklearn (computation on CPU).

        ``cupy`` uses the Cupy library for GPU computation of the SVD as in
        the LAPACK version.

        ``eigencupy`` offers the same method as with the ``eigen`` option
        but on GPU (through Cupy).

        ``randcupy`` is an adaptation of the randomized_svd algorithm,
        where all the computations are done on a GPU (through Cupy).

        ``pytorch`` uses the Pytorch library for GPU computation of the SVD.

        ``eigenpytorch`` offers the same method as with the ``eigen``
        option but on GPU (through Pytorch).

        ``randpytorch`` is an adaptation of the randomized_svd algorithm,
        where all the linear algebra computations are done on a GPU
        (through Pytorch).
    ncomp : int
        Number of singular vectors to be obtained. In the cases when the full
        SVD is computed (LAPACK, ARPACK, EIGEN, CUPY), the matrix of singular
        vectors is truncated.
    debug : bool
        If True the explained variance ratio is computed and displayed.
    verbose: bool
        If True intermediate information is printed out.
    usv : bool optional
        If True the 3 terms of the SVD factorization are returned.
    random_state : int, RandomState instance or None, optional
        If int, random_state is the seed used by the random number generator.
        If RandomState instance, random_state is the random number generator.
        If None, the random number generator is the RandomState instance used
        by np.random. Used for ``randsvd`` mode.
    to_numpy : bool, optional
        If True (by default) the arrays computed in GPU are transferred from
        VRAM and converted to numpy ndarrays.

    Returns
    -------
    V : array_like
        The right singular vectors of the input matrix. If ``usv`` is True it
        returns the left and right singular vectors and the singular values of
        the input matrix.

    Raises
    ------
    TypeError
        If ``matrix`` is not 2d.
    ValueError
        If ``usv`` is requested for a mode that does not support it, or if
        ``mode`` is not one of the available modes.
    RuntimeError
        If ``ncomp`` exceeds the smallest dimension of ``matrix``, or if the
        GPU library required by ``mode`` is flagged as not installed.

    References
    ----------
    * For ``lapack`` SVD mode see:
    https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.svd.html
    http://www.netlib.org/lapack/
    * For ``eigen`` mode see:
    https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.eigh.html
    * For ``arpack`` SVD mode see:
    https://docs.scipy.org/doc/scipy-0.19.1/reference/generated/scipy.sparse.linalg.svds.html
    http://www.caam.rice.edu/software/ARPACK/
    * For ``randsvd`` SVD mode see:
    https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/extmath.py
    Finding structure with randomness: Stochastic algorithms for constructing
    approximate matrix decompositions
    Halko, et al., 2009 http://arxiv.org/abs/arXiv:0909.4061
    * For ``cupy`` SVD mode see:
    https://docs-cupy.chainer.org/en/stable/reference/generated/cupy.linalg.svd.html
    * For ``eigencupy`` mode see:
    https://docs-cupy.chainer.org/en/master/reference/generated/cupy.linalg.eigh.html
    * For ``pytorch`` SVD mode see:
    http://pytorch.org/docs/master/torch.html#torch.svd
    * For ``eigenpytorch`` mode see:
    http://pytorch.org/docs/master/torch.html#torch.eig

    """
    def reconstruction(ncomp, U, S, V, var=1):
        """Print and plot explained-variance diagnostics (``debug`` only).

        ``S`` must live on the host (numpy). ``U`` and ``V`` are ignored for
        the eigen-decomposition modes. ``var != 1`` reverses the per-component
        ratios before they are accumulated.
        """
        if mode == 'lapack':
            rec_matrix = np.dot(U[:, :ncomp],
                                np.dot(np.diag(S[:ncomp]), V[:ncomp]))
            rec_matrix = rec_matrix.T
            print(' Matrix reconstruction with {} PCs:'.format(ncomp))
            print(' Mean Absolute Error =', MAE(matrix, rec_matrix))
            print(' Mean Squared Error =', MSE(matrix, rec_matrix))

            # see https://github.com/scikit-learn/scikit-learn/blob/c3980bcbabd9d2527548820581725df2904e4a0d/sklearn/decomposition/pca.py
            exp_var = (S ** 2) / (S.shape[0] - 1)
            full_var = np.sum(exp_var)
            # % of variance explained by each PC
            explained_variance_ratio = exp_var / full_var
            ratio_cumsum = np.cumsum(explained_variance_ratio)
        elif mode in ('eigen', 'eigencupy', 'eigenpytorch'):
            # Only the singular values are available in these modes; the GPU
            # variants used to fall through to the generic branch below and
            # crash on ``np.dot(None, ...)``.
            exp_var = (S ** 2) / (S.shape[0] - 1)
            full_var = np.sum(exp_var)
            # % of variance explained by each PC
            explained_variance_ratio = exp_var / full_var
            ratio_cumsum = np.cumsum(explained_variance_ratio)
        else:
            rec_matrix = np.dot(U, np.dot(np.diag(S), V))
            print(' Matrix reconstruction MAE =', MAE(matrix, rec_matrix))
            exp_var = (S ** 2) / (S.shape[0] - 1)
            full_var = np.var(matrix, axis=0).sum()
            # % of variance explained by each PC
            explained_variance_ratio = exp_var / full_var
            if var != 1:
                explained_variance_ratio = explained_variance_ratio[::-1]
            ratio_cumsum = np.cumsum(explained_variance_ratio)
            msg = ' This info makes sense when the matrix is mean centered '
            msg += '(temp-mean scaling)'
            print(msg)

        lw = 2
        alpha = 0.4
        fig = plt.figure(figsize=vip_figsize)
        fig.subplots_adjust(wspace=0.4)
        ax1 = plt.subplot2grid((1, 3), (0, 0), colspan=2)
        ax1.step(range(explained_variance_ratio.shape[0]),
                 explained_variance_ratio, alpha=alpha, where='mid',
                 label='Individual EVR', lw=lw)
        ax1.plot(ratio_cumsum, '.-', alpha=alpha,
                 label='Cumulative EVR', lw=lw)
        ax1.legend(loc='best', frameon=False, fontsize='medium')
        ax1.set_ylabel('Explained variance ratio (EVR)')
        ax1.set_xlabel('Principal components')
        ax1.grid(linestyle='solid', alpha=0.2)
        ax1.set_xlim(-10, explained_variance_ratio.shape[0] + 10)
        ax1.set_ylim(0, 1)

        # Zoom on the first components. Clamp to what is available so that
        # x and y passed to ``step`` always have the same length.
        trunc = min(20, explained_variance_ratio.shape[0])
        ax2 = plt.subplot2grid((1, 3), (0, 2), colspan=1)
        ax2.step(range(trunc), explained_variance_ratio[:trunc], alpha=alpha,
                 where='mid', lw=lw)
        ax2.plot(ratio_cumsum[:trunc], '.-', alpha=alpha, lw=lw)
        ax2.set_xlabel('Principal components')
        ax2.grid(linestyle='solid', alpha=0.2)
        ax2.set_xlim(-2, trunc + 2)
        ax2.set_ylim(0, 1)

        msg = ' Cumulative explained variance ratio for {} PCs = {:.5f}'
        print(msg.format(ncomp, ratio_cumsum[ncomp - 1]))

    # --------------------------------------------------------------------------

    if matrix.ndim != 2:
        raise TypeError('Input matrix is not a 2d array')

    if usv:
        if mode not in ('lapack', 'arpack', 'randsvd', 'cupy', 'randcupy',
                        'pytorch', 'randpytorch'):
            msg = "Returning USV is supported with modes lapack, arpack, "
            msg += "randsvd, cupy, randcupy, pytorch or randpytorch"
            raise ValueError(msg)

    if ncomp > min(matrix.shape[0], matrix.shape[1]):
        msg = '{} PCs cannot be obtained from a matrix with size [{},{}].'
        msg += ' Increase the size of the patches or request less PCs'
        raise RuntimeError(msg.format(ncomp, matrix.shape[0], matrix.shape[1]))

    if mode == 'eigen':
        # building C as np.dot(matrix.T,matrix) is slower and takes more memory
        C = np.dot(matrix, matrix.T)        # covariance matrix
        e, EV = linalg.eigh(C)              # EVals and EVs
        pc = np.dot(EV.T, matrix)           # PCs using a compact trick when cov is MM'
        S = np.sqrt(np.abs(e))              # SVals = sqrt(EVals)
        S = S[::-1]                         # reverse since EVals go in increasing order
        if debug:
            reconstruction(ncomp, None, S, None)
        # reverse since we need the last EVs, keep the requested ones and
        # scale each row by its singular value in one broadcast division
        V = pc[::-1][:ncomp]
        V /= S[:ncomp, np.newaxis]
        if verbose:
            print('Done PCA with numpy linalg eigh functions')

    elif mode == 'lapack':
        # n_frames is usually smaller than n_pixels. In this setting taking
        # the SVD of M' and keeping the left (transposed) SVs is faster than
        # taking the SVD of M (right SVs)
        U, S, V = linalg.svd(matrix.T, full_matrices=False)
        if debug:
            reconstruction(ncomp, U, S, V)
        # we cut projection matrix according to the # of PCs
        V = V[:ncomp]
        U = U[:, :ncomp]
        S = S[:ncomp]
        if verbose:
            print('Done SVD/PCA with numpy SVD (LAPACK)')

    elif mode == 'arpack':
        U, S, V = svds(matrix, k=ncomp)
        if debug:
            reconstruction(ncomp, U, S, V, -1)
        if verbose:
            print('Done SVD/PCA with scipy sparse SVD (ARPACK)')

    elif mode == 'randsvd':
        U, S, V = randomized_svd(matrix, n_components=ncomp, n_iter=2,
                                 transpose='auto', random_state=random_state)
        if debug:
            reconstruction(ncomp, U, S, V)
        if verbose:
            print('Done SVD/PCA with randomized SVD')

    elif mode == 'cupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        a_gpu = cupy.array(matrix)          # copy the data to the current device
        # NOTE(review): only the first ``ncomp`` vectors are used, so
        # ``full_matrices=False`` would presumably suffice -- confirm on GPU.
        u_gpu, s_gpu, vh_gpu = cupy.linalg.svd(a_gpu, full_matrices=True,
                                               compute_uv=True)
        V = vh_gpu[:ncomp]
        if to_numpy:
            V = cupy.asnumpy(V)
        if usv:
            S = s_gpu[:ncomp]
            if to_numpy:
                S = cupy.asnumpy(S)
            U = u_gpu[:, :ncomp]
            if to_numpy:
                U = cupy.asnumpy(U)
        if verbose:
            print('Done SVD/PCA with cupy (GPU)')

    elif mode == 'randcupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='cupy')
        if to_numpy:
            V = cupy.asnumpy(V)
            S = cupy.asnumpy(S)
            U = cupy.asnumpy(U)
        if debug:
            reconstruction(ncomp, U, S, V)
        if verbose:
            print('Done randomized SVD/PCA with cupy (GPU)')

    elif mode == 'eigencupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        a_gpu = cupy.array(matrix)          # copy the data to the current device
        C = cupy.dot(a_gpu, a_gpu.T)        # covariance matrix
        e, EV = cupy.linalg.eigh(C)         # eigenvalues and eigenvectors
        pc = cupy.dot(EV.T, a_gpu)          # PCs using a compact trick when cov is MM'
        # abs() as in the ``eigen`` mode: round-off can make the smallest
        # eigenvalues slightly negative, which would turn into NaN.
        S = cupy.sqrt(cupy.abs(e))[::-1]    # reverse since eigenvalues are in increasing order
        if debug:
            # the diagnostics run on the host
            reconstruction(ncomp, None, cupy.asnumpy(S), None)
        # reverse since last eigenvectors are the ones we want, then scale
        # every kept row by its singular value with a single kernel
        V = pc[::-1][:ncomp]
        V /= S[:ncomp, None]
        if to_numpy:
            V = cupy.asnumpy(V)
        if verbose:
            print('Done PCA with cupy eigh function (GPU)')

    elif mode == 'pytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32').T))
        u_gpu, s_gpu, vh_gpu = torch.svd(a_gpu)
        V = vh_gpu[:ncomp]
        S = s_gpu[:ncomp]
        U = torch.transpose(u_gpu, 0, 1)[:ncomp]
        if to_numpy:
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if verbose:
            print('Done SVD/PCA with pytorch (GPU)')

    elif mode == 'eigenpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32')))
        C = torch.mm(a_gpu, torch.transpose(a_gpu, 0, 1))
        e, EV = torch.eig(C, eigenvectors=True)
        V = torch.mm(torch.transpose(EV, 0, 1), a_gpu)
        S = torch.sqrt(e[:, 0])
        if debug:
            # the diagnostics run on the host
            reconstruction(ncomp, None, S.cpu().numpy(), None)
        for i in range(V.shape[1]):
            V[:, i] /= S
        V = V[:ncomp]
        if to_numpy:
            V = np.array(V)
        if verbose:
            print('Done PCA with pytorch eig function')

    elif mode == 'randpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='pytorch')
        if to_numpy:
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if debug:
            reconstruction(ncomp, U, S, V)
        if verbose:
            print('Done randomized SVD/PCA with randomized pytorch (GPU)')

    else:
        raise ValueError('The SVD mode is not available')

    # ``lapack`` and ``pytorch`` decompose the transposed matrix, so their
    # factors are swapped/transposed back before being returned.
    if usv:
        if mode == 'lapack':
            return V.T, S, U.T
        elif mode == 'pytorch':
            if to_numpy:
                return V.T, S, U.T
            else:
                return torch.transpose(V, 0, 1), S, torch.transpose(U, 0, 1)
        else:
            return U, S, V
    else:
        if mode == 'lapack':
            return U.T
        elif mode == 'pytorch':
            return U
        else:
            return V
def extractTemplatesfromSnippets(proc=None, probe=None, params=None, Nbatch=None, nPCs=None):
    """Derive prototype waveforms and principal components from spike snippets.

    Very similar to extractPCfromSnippets, but besides the PC waveforms it
    also outputs template "prototypes", obtained by a k-means-like clustering
    of the 1D snippets.

    Args:
        proc: Preprocessed data; ``proc.flat`` is sliced one batch at a time.
        probe: Provides ``Nchan``.
        params: Provides ``NT``, ``nskip``, ``nPCs``, ``nt0min``, ``scaleproc``
            and is forwarded to the spike detection helpers.
        Nbatch: Number of batches in ``proc``.
        nPCs: Number of templates / PCs; falls back to ``params.nPCs`` when
            falsy.

    Returns:
        tuple: ``(wTEMP, wPCA)`` -- the unit-normalised cluster prototypes and
        the first ``nPCs`` left singular vectors of the collected snippets.
    """
    batch_len = params.NT
    batch_stride = params.nskip  # only every ``nskip``-th batch is visited
    nPCs = nPCs or params.nPCs
    nt0min = params.nt0min
    n_chan = probe.Nchan

    batch_starts = np.arange(0, batch_len * Nbatch + 1, batch_len).astype(np.int64)

    # gather the 1D spike snippets batch by batch
    n_collected = 0
    chunks = []
    for ibatch in tqdm(range(0, Nbatch, batch_stride), desc="Extracting templates"):
        first = n_chan * batch_starts[ibatch]
        raw = proc.flat[first:first + batch_len * n_chan].reshape((-1, n_chan), order="F")

        # move data to GPU and scale it back to unit variance
        dataRAW = cp.asarray(raw, dtype=np.float32) / params.scaleproc

        # find isolated spikes, then cut the voltage snippet on each peak's channel
        row, col, _ = isolated_peaks_new(dataRAW, params)
        snippets = get_SpikeSample(dataRAW, row, col, params)

        chunks.append(snippets)
        n_collected += snippets.shape[1]
        if n_collected > 1e5:
            # enough snippets gathered
            break

    dd = cp.asfortranarray(cp.concatenate(chunks, axis=1).astype(np.float32))

    # seed the clustering with randomly chosen waveforms, unit-normalised
    seeds = np.random.permutation(dd.shape[1])[:nPCs]
    wTEMP = dd[:, seeds]
    wTEMP = wTEMP / cp.sum(wTEMP**2, axis=0)**0.5

    for _ in range(10):
        # assign every waveform to its most correlated cluster
        cc = cp.dot(wTEMP.T, dd)
        imax = cp.argmax(cc, axis=0)
        amax = cc[imax, np.arange(cc.shape[1])]
        for j in range(nPCs):
            members = imax == j
            # correlation-weighted sum of the members gives the new prototype
            wTEMP[:, j] = cp.dot(dd[:, members], amax[members].T)
        wTEMP = wTEMP / cp.sum(wTEMP**2, axis=0)**0.5  # unit normalize

    # the PCs are just the left singular vectors of the waveforms
    U, Sv, V = svdecon(dd)
    wPCA = U[:, :nPCs]  # take as many as needed

    # adjust the arbitrary sign of the first PC so its negativity is downward
    wPCA[:, 0] = -wPCA[:, 0] * cp.sign(wPCA[nt0min, 0])

    return wTEMP, wPCA
def forward(self, x, y_prev):
    """Advance the recurrent cell by one step.

    Three activations are computed from the input ``x`` and the previous
    output ``y_prev`` using the input weights ``self.w[0..2]`` and the
    recurrent weights ``self.v[0..2]``. They are stored stacked in
    ``self.gates``; the new output is stored in ``self.y``. Nothing is
    returned.
    """
    w_mix, w_reset, w_cand = self.w[0], self.w[1], self.w[2]
    v_mix, v_reset, v_cand = self.v[0], self.v[1], self.v[2]

    # two sigmoid gates driven by the input and the previous output
    mix = sigmoid(np.dot(x, w_mix) + np.dot(y_prev, v_mix))
    reset = sigmoid(np.dot(x, w_reset) + np.dot(y_prev, v_reset))

    # the candidate only sees the previous output through the second gate
    gated_prev = reset * y_prev
    candidate = np.tanh(np.dot(x, w_cand) + np.dot(gated_prev, v_cand))

    self.gates = np.stack((mix, reset, candidate))
    # the first gate interpolates between the previous output and the candidate
    self.y = (1 - mix) * y_prev + mix * candidate
def svd_wrapper(matrix, mode, ncomp, verbose, full_output=False,
                random_state=None, to_numpy=True):
    """ Wrapper for different SVD libraries (CPU and GPU).

    Parameters
    ----------
    matrix : numpy ndarray, 2d
        2d input matrix.
    mode : {'lapack', 'arpack', 'eigen', 'randsvd', 'cupy', 'eigencupy',
            'randcupy', 'pytorch', 'eigenpytorch', 'randpytorch'}, str optional
        Switch for the SVD method/library to be used.

        ``lapack``: uses the LAPACK linear algebra library through Numpy
        and it is the most conventional way of computing the SVD
        (deterministic result computed on CPU).

        ``arpack``: uses the ARPACK Fortran libraries accessible through
        Scipy (computation on CPU).

        ``eigen``: computes the singular vectors through the
        eigendecomposition of the covariance M.M' (computation on CPU).

        ``randsvd``: uses the randomized_svd algorithm implemented in
        Sklearn (computation on CPU).

        ``cupy``: uses the Cupy library for GPU computation of the SVD as in
        the LAPACK version.

        ``eigencupy``: offers the same method as with the ``eigen`` option
        but on GPU (through Cupy).

        ``randcupy``: is an adaptation of the randomized_svd algorithm,
        where all the computations are done on a GPU (through Cupy).

        ``pytorch``: uses the Pytorch library for GPU computation of the SVD.

        ``eigenpytorch``: offers the same method as with the ``eigen``
        option but on GPU (through Pytorch).

        ``randpytorch``: is an adaptation of the randomized_svd algorithm,
        where all the linear algebra computations are done on a GPU
        (through Pytorch).
    ncomp : int
        Number of singular vectors to be obtained. In the cases when the full
        SVD is computed (LAPACK, ARPACK, EIGEN, CUPY), the matrix of singular
        vectors is truncated.
    verbose: bool
        If True intermediate information is printed out.
    full_output : bool optional
        If True the 3 terms of the SVD factorization are returned. If ``mode``
        is eigen then only S and V are returned.
    random_state : int, RandomState instance or None, optional
        If int, random_state is the seed used by the random number generator.
        If RandomState instance, random_state is the random number generator.
        If None, the random number generator is the RandomState instance used
        by np.random. Used for ``randsvd`` mode.
    to_numpy : bool, optional
        If True (by default) the arrays computed in GPU are transferred from
        VRAM and converted to numpy ndarrays.

    Returns
    -------
    V : numpy ndarray
        The right singular vectors of the input matrix. If ``full_output`` is
        True it returns the left and right singular vectors and the singular
        values of the input matrix. If ``mode`` is set to eigen then only S and
        V are returned (S holds all the singular values, V the first ``ncomp``
        vectors).

    Raises
    ------
    TypeError
        If ``matrix`` is not 2d.
    RuntimeError
        If ``ncomp`` exceeds the smallest dimension of ``matrix``, or if the
        GPU library required by ``mode`` is flagged as not installed.
    ValueError
        If ``mode`` is not recognized.

    References
    ----------
    * For ``lapack`` SVD mode see:
    https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.svd.html
    http://www.netlib.org/lapack/
    * For ``eigen`` mode see:
    https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.eigh.html
    * For ``arpack`` SVD mode see:
    https://docs.scipy.org/doc/scipy-0.19.1/reference/generated/scipy.sparse.linalg.svds.html
    http://www.caam.rice.edu/software/ARPACK/
    * For ``randsvd`` SVD mode see:
    https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/extmath.py
    Finding structure with randomness: Stochastic algorithms for constructing
    approximate matrix decompositions
    Halko, et al., 2009 http://arxiv.org/abs/arXiv:0909.4061
    * For ``cupy`` SVD mode see:
    https://docs-cupy.chainer.org/en/stable/reference/generated/cupy.linalg.svd.html
    * For ``eigencupy`` mode see:
    https://docs-cupy.chainer.org/en/master/reference/generated/cupy.linalg.eigh.html
    * For ``pytorch`` SVD mode see:
    http://pytorch.org/docs/master/torch.html#torch.svd
    * For ``eigenpytorch`` mode see:
    http://pytorch.org/docs/master/torch.html#torch.eig

    """
    if matrix.ndim != 2:
        raise TypeError('Input matrix is not a 2d array')

    if ncomp > min(matrix.shape[0], matrix.shape[1]):
        msg = '{} PCs cannot be obtained from a matrix with size [{},{}].'
        msg += ' Increase the size of the patches or request less PCs'
        raise RuntimeError(msg.format(ncomp, matrix.shape[0], matrix.shape[1]))

    if mode == 'eigen':
        # building C as np.dot(matrix.T,matrix) is slower and takes more memory
        C = np.dot(matrix, matrix.T)        # covariance matrix
        e, EV = linalg.eigh(C)              # EVals and EVs
        pc = np.dot(EV.T, matrix)           # PCs using a compact trick when cov is MM'
        S = np.sqrt(np.abs(e))              # SVals = sqrt(EVals)
        S = S[::-1]                         # reverse since EVals go in increasing order
        # reverse since we need the last EVs, keep the requested ones and
        # scale each row by its singular value in one broadcast division
        V = pc[::-1][:ncomp]
        V /= S[:ncomp, np.newaxis]
        if verbose:
            print('Done PCA with numpy linalg eigh functions')

    elif mode == 'lapack':
        # n_frames is usually smaller than n_pixels. In this setting taking
        # the SVD of M' and keeping the left (transposed) SVs is faster than
        # taking the SVD of M (right SVs)
        U, S, V = linalg.svd(matrix.T, full_matrices=False)
        # we cut projection matrix according to the # of PCs
        V = V[:ncomp]
        U = U[:, :ncomp]
        S = S[:ncomp]
        if verbose:
            print('Done SVD/PCA with numpy SVD (LAPACK)')

    elif mode == 'arpack':
        U, S, V = svds(matrix, k=ncomp)
        if verbose:
            print('Done SVD/PCA with scipy sparse SVD (ARPACK)')

    elif mode == 'randsvd':
        U, S, V = randomized_svd(matrix, n_components=ncomp, n_iter=2,
                                 transpose='auto', random_state=random_state)
        if verbose:
            print('Done SVD/PCA with randomized SVD')

    elif mode == 'cupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        a_gpu = cupy.array(matrix)          # copy the data to the current device
        # NOTE(review): only the first ``ncomp`` vectors are used, so
        # ``full_matrices=False`` would presumably suffice -- confirm on GPU.
        u_gpu, s_gpu, vh_gpu = cupy.linalg.svd(a_gpu, full_matrices=True,
                                               compute_uv=True)
        V = vh_gpu[:ncomp]
        if to_numpy:
            V = cupy.asnumpy(V)
        if full_output:
            S = s_gpu[:ncomp]
            if to_numpy:
                S = cupy.asnumpy(S)
            U = u_gpu[:, :ncomp]
            if to_numpy:
                U = cupy.asnumpy(U)
        if verbose:
            print('Done SVD/PCA with cupy (GPU)')

    elif mode == 'randcupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='cupy')
        if to_numpy:
            V = cupy.asnumpy(V)
            S = cupy.asnumpy(S)
            U = cupy.asnumpy(U)
        if verbose:
            print('Done randomized SVD/PCA with cupy (GPU)')

    elif mode == 'eigencupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        a_gpu = cupy.array(matrix)          # copy the data to the current device
        C = cupy.dot(a_gpu, a_gpu.T)        # covariance matrix
        e, EV = cupy.linalg.eigh(C)         # eigenvalues and eigenvectors
        pc = cupy.dot(EV.T, a_gpu)          # using a compact trick when cov is MM'
        # abs() as in the ``eigen`` mode: round-off can make the smallest
        # eigenvalues slightly negative, which would turn into NaN.
        S = cupy.sqrt(cupy.abs(e))[::-1]    # reverse since EVals go in increasing order
        # reverse to get last eigenvectors, then scale every kept row by its
        # singular value with a single kernel instead of one per column
        V = pc[::-1][:ncomp]
        V /= S[:ncomp, None]
        if to_numpy:
            V = cupy.asnumpy(V)
            # S is returned when ``full_output`` is set; honour ``to_numpy``
            S = cupy.asnumpy(S)
        if verbose:
            print('Done PCA with cupy eigh function (GPU)')

    elif mode == 'pytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32').T))
        u_gpu, s_gpu, vh_gpu = torch.svd(a_gpu)
        V = vh_gpu[:ncomp]
        S = s_gpu[:ncomp]
        U = torch.transpose(u_gpu, 0, 1)[:ncomp]
        if to_numpy:
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if verbose:
            print('Done SVD/PCA with pytorch (GPU)')

    elif mode == 'eigenpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32')))
        C = torch.mm(a_gpu, torch.transpose(a_gpu, 0, 1))
        e, EV = torch.eig(C, eigenvectors=True)
        V = torch.mm(torch.transpose(EV, 0, 1), a_gpu)
        S = torch.sqrt(e[:, 0])
        for i in range(V.shape[1]):
            V[:, i] /= S
        V = V[:ncomp]
        if to_numpy:
            V = np.array(V)
            # S is returned when ``full_output`` is set; honour ``to_numpy``
            S = np.array(S)
        if verbose:
            print('Done PCA with pytorch eig function')

    elif mode == 'randpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='pytorch')
        if to_numpy:
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if verbose:
            print('Done randomized SVD/PCA with randomized pytorch (GPU)')

    else:
        raise ValueError('The SVD `mode` is not recognized')

    # ``lapack`` and ``pytorch`` decompose the transposed matrix, so their
    # factors are swapped/transposed back before being returned.
    if full_output:
        if mode == 'lapack':
            return V.T, S, U.T
        elif mode == 'pytorch':
            if to_numpy:
                return V.T, S, U.T
            else:
                return torch.transpose(V, 0, 1), S, torch.transpose(U, 0, 1)
        elif mode in ('eigen', 'eigencupy', 'eigenpytorch'):
            return S, V
        else:
            return U, S, V
    else:
        if mode == 'lapack':
            return U.T
        elif mode == 'pytorch':
            return U
        else:
            return V
def learnAndSolve8b(ctx, sanity_plots=False, plot_widgets=None, plot_pos=None): """This is the main optimization. Takes the longest time and uses the GPU heavily.""" Nbatch = ctx.intermediate.Nbatch params = ctx.params probe = ctx.probe ir = ctx.intermediate proc = ir.proc iorig = ir.iorig # TODO: move_to_config NrankPC = 6 # this one is the rank of the PCs, used to detect spikes with threshold crossings Nrank = 3 # this one is the rank of the templates wTEMP, wPCA = extractTemplatesfromSnippets(proc=proc, probe=probe, params=params, Nbatch=Nbatch, nPCs=NrankPC) # move these to the GPU wPCA = cp.asarray(wPCA[:, :Nrank], dtype=np.float32, order="F") wTEMP = cp.asarray(wTEMP, dtype=np.float32, order="F") wPCAd = cp.asarray(wPCA, dtype=np.float64, order="F") # convert to double for extra precision nt0 = params.nt0 nt0min = params.nt0min nBatches = Nbatch NT = params.NT Nfilt = params.Nfilt Nchan = probe.Nchan # two variables for the same thing? number of nearest channels to each primary channel # TODO: unclear - let's fix this NchanNear = min(probe.Nchan, 32) Nnearest = min(probe.Nchan, 32) # decay of gaussian spatial mask centered on a channel sigmaMask = params.sigmaMask batchstart = list(range(0, NT * nBatches + 1, NT)) # find the closest NchanNear channels, and the masks for those channels iC, mask, C2C = getClosestChannels(probe, sigmaMask, NchanNear) # sorting order for the batches isortbatches = iorig nhalf = int(ceil(nBatches / 2)) - 1 # halfway point # this batch order schedule goes through half of the data forward and backward during the model # fitting and then goes through the data symmetrically-out from the center during the final # pass ischedule = np.concatenate( (np.arange(nhalf, nBatches), np.arange(nBatches - 1, nhalf - 1, -1))) i1 = np.arange(nhalf - 1, -1, -1) i2 = np.arange(nhalf, nBatches) irounds = np.concatenate((ischedule, i1, i2)) niter = irounds.size if irounds[niter - nBatches - 1] != nhalf: # this check is in here in case I do somehting 
weird when I try different schedules raise ValueError("Mismatch between number of batches") # these two flags are used to keep track of what stage of model fitting we're at # flag_final = 0 flag_resort = 1 # this is the absolute temporal offset in seconds corresponding to the start of the # spike sorted time segment t0 = 0 # ceil(params.trange(1) * ops.fs) nInnerIter = 60 # this is for SVD for the power iteration # schedule of learning rates for the model fitting part # starts small and goes high, it corresponds approximately to the number of spikes # from the past that were averaged to give rise to the current template pmi = cp.exp( -1.0 / cp.linspace(params.momentum[0], params.momentum[1], niter - nBatches)) Nsum = min( Nchan, 7) # how many channels to extend out the waveform in mexgetspikes # lots of parameters passed into the CUDA scripts Params = np.array( [ NT, Nfilt, params.Th[0], nInnerIter, nt0, Nnearest, Nrank, params.lam, pmi[0], Nchan, NchanNear, params.nt0min, 1, Nsum, NrankPC, params.Th[0], ], dtype=np.float64, ) # W0 has to be ordered like this W0 = cp.transpose( cp.atleast_3d(cp.asarray(wPCA, dtype=np.float64, order="F")), [0, 2, 1]) # initialize the list of channels each template lives on iList = cp.zeros((Nnearest, Nfilt), dtype=np.int32, order="F") # initialize average number of spikes per batch for each template nsp = cp.zeros((0, 1), dtype=np.float64, order="F") # this flag starts 0, is set to 1 later Params[12] = 0 # kernels for subsample alignment Ka, Kb = getKernels(params) p1 = 0.95 # decay of nsp estimate in each batch ntot = 0 # this keeps track of dropped templates for debugging purposes ndrop = np.zeros(2, dtype=np.float32, order="F") # this is the minimum firing rate that all templates must maintain, or be dropped m0 = params.minFR * params.NT / params.fs # allocate variables when switching to extraction phase # this holds spike times, clusters and other info per spike st3 = [] # cp.zeros((int(1e7), 5), dtype=np.float32, order='F') # 
these ones store features per spike # Nnearest is the number of nearest templates to store features for fW = LargeArrayWriter(ctx.path("fW", ext=".dat"), dtype=np.float32, shape=(Nnearest, -1)) # NchanNear is the number of nearest channels to take PC features from fWpc = LargeArrayWriter(ctx.path("fWpc", ext=".dat"), dtype=np.float32, shape=(NchanNear, Nrank, -1)) for ibatch in tqdm(range(niter), desc="Optimizing templates"): # korder is the index of the batch at this point in the schedule korder = int(irounds[ibatch]) # k is the index of the batch in absolute terms k = int(isortbatches[korder]) logger.debug("Batch %d/%d, %d templates.", ibatch, niter, Nfilt) if ibatch > niter - nBatches - 1 and korder == nhalf: # this is required to revert back to the template states in the middle of the # batches W, dWU = ir.W, ir.dWU logger.debug("Reverted back to middle timepoint.") if ibatch < niter - nBatches: # obtained pm for this batch Params[8] = float(pmi[ibatch]) pm = pmi[ibatch] * ones((Nfilt, ), dtype=np.float64, order="F") # loading a single batch (same as everywhere) offset = Nchan * batchstart[k] dat = proc.flat[offset:offset + NT * Nchan].reshape((-1, Nchan), order="F") dataRAW = cp.asarray(dat, dtype=np.float32) / params.scaleproc if ibatch == 0: # only on the first batch, we first get a new set of spikes from the residuals, # which in this case is the unmodified data because we start with no templates # CUDA function to get spatiotemporal clips from spike detections dWU, cmap = mexGetSpikes2(Params, dataRAW, wTEMP, iC) dWU = cp.asarray(dWU, dtype=np.float64, order="F") # project these into the wPCA waveforms dWU = cp.reshape( cp.dot( wPCAd, cp.dot(wPCAd.T, dWU.reshape((dWU.shape[0], -1), order="F"))), dWU.shape, order="F", ) # initialize the low-rank decomposition with standard waves W = W0[:, cp.ones(dWU.shape[2], dtype=np.int32), :] Nfilt = W.shape[1] # update the number of filters/templates # initialize the number of spikes for new templates with the minimum 
allowed value, # so it doesn't get thrown back out right away nsp = _extend(nsp, 0, Nfilt, m0) Params[1] = Nfilt # update in the CUDA parameters if flag_resort: # this is a flag to resort the order of the templates according to best peak # channel # this is important in order to have cohesive memory requests from the GPU RAM # max channel (either positive or negative peak) iW = cp.argmax(cp.abs(dWU[nt0min - 1, :, :]), axis=0) # iW = int32(squeeze(iW)) isort = cp.argsort(iW) # sort by max abs channel iW = iW[isort] W = W[:, isort, :] # user ordering to resort all the other template variables dWU = dWU[:, :, isort] nsp = nsp[isort] # decompose dWU by svd of time and space (via covariance matrix of 61 by 61 samples) # this uses a "warm start" by remembering the W from the previous iteration W, U, mu = mexSVDsmall2(Params, dWU, W, iC, iW, Ka, Kb) # UtU is the gram matrix of the spatial components of the low-rank SVDs # it tells us which pairs of templates are likely to "interfere" with each other # such as when we subtract off a template # this needs to change (but I don't know why!) UtU, maskU = getMeUtU(iW, iC, mask, Nnearest, Nchan) # main CUDA function in the whole codebase. does the iterative template matching # based on the current templates, gets features for these templates if requested # (featW, featPC), # gets scores for the template fits to each spike (vexp), outputs the average of # waveforms assigned to each cluster (dWU0), # and probably a few more things I forget about st0, id0, x0, featW, dWU0, drez, nsp0, featPC, vexp = mexMPnu8( Params, dataRAW, U, W, mu, iC, iW, UtU, iList, wPCA, params) logger.debug("%d spikes.", x0.size) # Sometimes nsp can get transposed (think this has to do with it being # a single element in one iteration, to which elements are added # nsp, nsp0, and pm must all be row vectors (Nfilt x 1), so force nsp # to be a row vector. 
# nsp = cp.atleast_2d(nsp) # nsprow, nspcol = nsp.shape # if nsprow < nspcol: # nsp = nsp.T nsp = nsp.squeeze() # updates the templates as a running average weighted by recency # since some clusters have different number of spikes, we need to apply the # exp(pm) factor several times, and fexp is the resulting update factor # for each template fexp = np.exp(nsp0 * cp.log(pm[:Nfilt])) fexp = cp.reshape(fexp, (1, 1, -1), order="F") dWU = dWU * fexp + (1 - fexp) * ( dWU0 / cp.reshape(cp.maximum(1, nsp0), (1, 1, -1), order="F")) # nsp just gets updated according to the fixed factor p1 nsp = nsp * p1 + (1 - p1) * nsp0 if ibatch == niter - nBatches - 1: # if we reached this point, we need to disable secondary template updates # like dropping, and adding new templates. We need to memorize the state of the # templates at this timepoint, and set the processing mode to "extraction and # tracking" flag_resort = 0 # no need to resort templates by channel any more # flag_final = 1 # this is the "final" pass # final clean up, triage templates one last time W, U, dWU, mu, nsp, ndrop = triageTemplates2( params, iW, C2C, W, U, dWU, mu, nsp, ndrop) # final number of templates Nfilt = W.shape[1] Params[1] = Nfilt # final covariance matrix between all templates WtW, iList = getMeWtW(W, U, Nnearest) # iW is the final channel assigned to each template iW = cp.argmax(cp.abs(dWU[nt0min - 1, :, :]), axis=0) # extract ALL features on the last pass Params[ 12] = 2 # this is a flag to output features (PC and template features) # different threshold on last pass? 
Params[2] = params.Th[ -1] # usually the threshold is much lower on the last pass # memorize the state of the templates logger.debug("Memorized middle timepoint.") ir.W, ir.dWU, ir.U, ir.mu = W, dWU, U, mu ir.Wraw = cp.zeros((U.shape[0], W.shape[0], U.shape[1]), dtype=np.float64, order="F") for n in range(U.shape[1]): # temporarily use U rather Urot until I have a chance to test it ir.Wraw[:, :, n] = mu[n] * cp.dot(U[:, n, :], W[:, n, :].T) if ibatch < niter - nBatches - 1: # during the main "learning" phase of fitting a model if ibatch % 5 == 0: # this drops templates based on spike rates and/or similarities to # other templates W, U, dWU, mu, nsp, ndrop = triageTemplates2( params, iW, C2C, W, U, dWU, mu, nsp, ndrop) Nfilt = W.shape[1] # update the number of filters Params[1] = Nfilt # this adds new templates if they are detected in the residual dWU0, cmap = mexGetSpikes2(Params, drez, wTEMP, iC) if dWU0.shape[2] > 0: # new templates need to be integrated into the same format as all templates # apply PCA for smoothing purposes dWU0 = cp.reshape( cp.dot( wPCAd, cp.dot( wPCAd.T, dWU0.reshape( (dWU0.shape[0], dWU0.shape[1] * dWU0.shape[2]), order="F", ), ), ), dWU0.shape, order="F", ) dWU = cp.concatenate((dWU, dWU0), axis=2) m = dWU0.shape[2] # initialize temporal components of waveforms W = _extend(W, Nfilt, Nfilt + m, W0[:, cp.ones(m, dtype=np.int32), :], axis=1) # initialize the number of spikes with the minimum allowed nsp = _extend(nsp, Nfilt, Nfilt + m, params.minFR * NT / params.fs) # initialize the amplitude of this spike with a lowish number mu = _extend(mu, Nfilt, Nfilt + m, 10) # if the number of filters exceed the maximum allowed, clip it Nfilt = min(params.Nfilt, W.shape[1]) Params[1] = Nfilt W = W[:, : Nfilt, :] # remove any new filters over the maximum allowed dWU = dWU[:, :, : Nfilt] # remove any new filters over the maximum allowed nsp = nsp[: Nfilt] # remove any new filters over the maximum allowed mu = mu[: Nfilt] # remove any new filters over the 
maximum allowed if ibatch > niter - nBatches - 1: # during the final extraction pass, this keeps track of all spikes and features # we memorize the spatio-temporal decomposition of the waveforms at this batch # this is currently only used in the GUI to provide an accurate reconstruction # of the raw data at this time ir.WA[..., k] = cp.asnumpy(W) ir.UA[..., k] = cp.asnumpy(U) ir.muA[..., k] = cp.asnumpy(mu) # we carefully assign the correct absolute times to spikes found in this batch ioffset = params.ntbuff - 1 if k == 0: ioffset = 0 # the first batch is special (no pre-buffer) toff = nt0min + t0 - ioffset + (NT - params.ntbuff) * k st = toff + st0 st30 = np.c_[ cp.asnumpy(st), # spike times cp.asnumpy(id0), # spike clusters (0-indexing) cp.asnumpy(x0), # template amplitudes cp.asnumpy(vexp), # residual variance of this spike korder * np.ones(st.size), # batch from which this spike was found ] # Check the number of spikes. assert st30.shape[0] == featW.shape[1] == featPC.shape[2] st3.append(st30) fW.append(featW) fWpc.append(featPC) ntot = ntot + x0.size # keeps track of total number of spikes so far if ibatch == niter - nBatches - 1: # these next three store the low-d template decompositions ir.WA = np.zeros((nt0, Nfilt, Nrank, nBatches), dtype=np.float32, order="F") ir.UA = np.zeros((Nchan, Nfilt, Nrank, nBatches), dtype=np.float32, order="F") ir.muA = np.zeros((Nfilt, nBatches), dtype=np.float32, order="F") if ibatch % 100 == 0: # this is some of the relevant diagnostic information to be printed during training logger.info(("%d / %d batches, %d units, nspks: %2.4f, mu: %2.4f, " "nst0: %d, merges: %2.4f, %2.4f"), ibatch, niter, Nfilt, nsp.sum(), median(mu), st0.size, *ndrop) if sanity_plots: assert plot_widgets is not None, "if sanity_plots is set, then plot_widgets cannot be None" plot_diagnostics(W, U, mu, nsp, plot_widgets[plot_pos]) free_gpu_memory() # Close the large array writers and save the JSON metadata files to disk. 
fW.close() fWpc.close() # just display the total number of spikes logger.info("Found %d spikes.", ntot) # Save results to the ctx.intermediate object. ir.st3 = np.concatenate(st3, axis=0) # the similarity score between templates is simply the correlation, # taken as the max over several consecutive time delays ir.simScore = cp.asnumpy(cp.max(WtW, axis=2)) # NOTE: these are now already saved by LargeArrayWriter # fWa = np.concatenate(fW, axis=-1) # fWpca = np.concatenate(fWpc, axis=-1) # the template features are stored in cProj, like in Kilosort1 # ir.cProj = fWa.T # the neihboring templates idnices are stored in iNeigh ir.iNeigh = cp.asnumpy(iList) # permute the PC projections in the right order # ir.cProjPC = np.transpose(fWpca, (2, 1, 0)) # iNeighPC keeps the indices of the channels corresponding to the PC features ir.iNeighPC = cp.asnumpy(iC[:, iW]) # Number of spikes. assert ir.st3.shape[0] == fW.shape[-1] == fWpc.shape[-1] # this whole next block is just done to compress the compressed templates # we separately svd the time components of each template, and the spatial components # this also requires a careful decompression function, available somewhere in the GUI code nKeep = min(Nchan * 3, 20) # how many PCs to keep W_a = np.zeros((nt0 * Nrank, nKeep, Nfilt), dtype=np.float32) W_b = np.zeros((nBatches, nKeep, Nfilt), dtype=np.float32) U_a = np.zeros((Nchan * Nrank, nKeep, Nfilt), dtype=np.float32) U_b = np.zeros((nBatches, nKeep, Nfilt), dtype=np.float32) for j in tqdm(range(Nfilt), desc="Compressing templates"): # do this for every template separately WA = np.reshape(ir.WA[:, j, ...], (-1, nBatches), order="F") # svd on the GPU was faster for this, but the Python randomized CPU version # might be faster still # WA = gpuArray(WA) A, B, C = svdecon_cpu(WA) # W_a times W_b results in a reconstruction of the time components W_a[:, :, j] = np.dot(A[:, :nKeep], B[:nKeep, :nKeep]) W_b[:, :, j] = C[:, :nKeep] UA = np.reshape(ir.UA[:, j, ...], (-1, nBatches), 
order="F") # UA = gpuArray(UA) A, B, C = svdecon_cpu(UA) # U_a times U_b results in a reconstruction of the time components U_a[:, :, j] = np.dot(A[:, :nKeep], B[:nKeep, :nKeep]) U_b[:, :, j] = C[:, :nKeep] logger.info("Finished compressing time-varying templates.") return Bunch( wPCA=wPCA[:, :Nrank], wTEMP=wTEMP, st3=ir.st3, simScore=ir.simScore, # cProj=ir.cProj, # cProjPC=ir.cProjPC, iNeigh=ir.iNeigh, iNeighPC=ir.iNeighPC, WA=ir.WA, UA=ir.UA, W=ir.W, U=ir.U, dWU=ir.dWU, mu=ir.mu, W_a=W_a, W_b=W_b, U_a=U_a, U_b=U_b, )
def splitAllClusters(ctx, flag):
    """Split clusters whose PC projections are bimodal ("bimodal pursuit").

    The strategy is to maximize a bimodality score and find a single vector
    projection that maximizes it. If the distribution along that maximal
    projection crosses a bimodality threshold, then the cluster is split along
    that direction. Only the PC features of each spike (``ir.cProjPC``) are used.

    Parameters
    ----------
    ctx : context object
        Must expose ``params``, ``probe`` and ``intermediate``. The intermediate
        results are read for ``wPCA``, ``st3_m``, ``cProjPC``, ``dWU`` and for
        ``W``, ``simScore``, ``iNeigh``, ``iNeighPC`` (the ``*_s`` variants from a
        previous call to this function take precedence when present).
    flag : bool
        Selects the initial pursuit direction: truthy uses the top principal
        component of the drift-corrected projections; falsy uses the mean of
        the NOT drift-corrected projections.

    Returns
    -------
    Bunch
        With keys ``st3_s``, ``W_s``, ``U_s``, ``mu_s``, ``simScore_s``,
        ``iNeigh_s``, ``iNeighPC_s``, ``Wphy``, ``iList`` and ``isplit``.
    """
    params = ctx.params
    probe = ctx.probe
    ir = ctx.intermediate
    Nchan = ctx.probe.Nchan

    # use PCA projections to reconstruct templates when we do splits
    wPCA = cp.asarray(ir.wPCA)
    assert wPCA.shape[1] == 3

    # Take intermediate arrays from context.
    st3 = cp.asnumpy(ir.st3_m)
    cProjPC = ir.cProjPC
    dWU = ir.dWU

    # For the following arrays that will be overwritten by this function, try to get
    # it from a previous call to this function (as it is called twice), otherwise
    # get it from before (without the _s suffix).
    W = ir.get('W_s', ir.W)
    simScore = ir.get('simScore_s', ir.simScore)
    iNeigh = ir.get('iNeigh_s', ir.iNeigh)
    iNeighPC = ir.get('iNeighPC_s', ir.iNeighPC)

    # this is the threshold for splits, and is one of the main parameters users can change
    ccsplit = params.AUCsplit

    NchanNear = min(Nchan, 32)
    Nnearest = min(Nchan, 32)
    sigmaMask = params.sigmaMask

    ik = -1
    Nfilt = W.shape[1]
    nsplits = 0

    # determine what channels each template lives on
    iC, mask, C2C = getClosestChannels(probe, sigmaMask, NchanNear)

    # the waveforms must be aligned to this sample
    nt0min = params.nt0min
    # find the peak abs channel for each template
    iW = np.argmax(np.abs((dWU[nt0min - 1, :, :])), axis=0)

    # keep track of original cluster for each cluster. starts with all clusters being their
    # own origin.
    isplit = np.arange(Nfilt)
    dt = 1. / 1000
    nccg = 0

    while ik < Nfilt:
        if ik % 100 == 0:
            # periodically write updates
            logger.info(f'Found {nsplits} splits, checked {ik}/{Nfilt} clusters, nccg {nccg}')
        ik += 1

        isp = (st3[:, 1] == ik)  # get all spikes from this cluster
        nSpikes = isp.sum()
        logger.debug(f"Splitting template {ik}/{Nfilt} with {nSpikes} spikes.")
        free_gpu_memory()

        if nSpikes < 300:
            # do not split if fewer than 300 spikes (we cannot estimate
            # cross-correlograms accurately)
            continue

        ss = st3[isp, 0] / params.fs  # convert to seconds

        clp0 = cProjPC[isp, :, :]  # get the PC projections for these spikes
        clp0 = cp.asarray(clp0, dtype=cp.float32)  # upload to the GPU
        clp0 = clp0.reshape((clp0.shape[0], -1), order='F')
        m = mean(clp0, axis=0)
        # Mean-center into a NEW array. `clp = clp0; clp -= m` would alias clp0 and
        # silently mean-center / drift-correct it as well, whereas clp0 is used below
        # as the raw (NOT drift-corrected) projections.
        clp = clp0 - m

        isp = np.nonzero(isp)[0]

        # (DEV_NOTES) Python flattens clp0 in C order rather than Fortran order so the
        # flattened PC projections will be slightly different, however this is fixed when
        # the projections are reformed later

        # subtract a running average, because the projections are NOT drift corrected
        clpc = my_conv2(clp, 250, 0)
        clp -= clpc  # in place on clp only; clp0 is left untouched

        # now use two different ways to initialize the bimodal direction
        # the main script calls this function twice, and does both initializations
        if flag:
            u, s, v = svdecon(clp.T)
            u, v = -u, -v  # change sign for consistency with MATLAB
            w = u[:, 0]  # initialize with the top PC
        else:
            w = mean(clp0, axis=0)  # initialize with the mean of NOT drift-corrected trace
            w = w / cp.sum(w ** 2) ** 0.5  # unit-normalize

        # initial projections of waveform PCs onto 1D vector
        x = cp.dot(clp, w)
        s1 = var(x[x > mean(x)])  # initialize estimates of variance for the first
        s2 = var(x[x < mean(x)])  # and second gaussian in the mixture of 1D gaussians

        mu1 = mean(x[x > mean(x)])  # initialize the means as well
        mu2 = mean(x[x < mean(x)])
        # and the probability that a spike is assigned to the first Gaussian
        p = mean(x > mean(x))

        # initialize matrix of log probabilities that each spike is assigned to the first
        # or second cluster
        logp = cp.zeros((nSpikes, 2), order='F')

        # do 50 pursuit iteration
        logP = cp.zeros(50)  # used to monitor the cost function

        for k in range(50):
            # for each spike, estimate its probability to come from either Gaussian cluster
            logp[:, 0] = -1. / 2 * log(s1) - ((x - mu1) ** 2) / (2 * s1) + log(p)
            logp[:, 1] = -1. / 2 * log(s2) - ((x - mu2) ** 2) / (2 * s2) + log(1 - p)

            lMax = logp.max(axis=1)
            logp = logp - lMax[:, cp.newaxis]  # subtract the max for floating point accuracy
            rs = cp.exp(logp)  # exponentiate the probabilities

            pval = cp.log(cp.sum(rs, axis=1)) + lMax  # get the normalizer and add back the max
            logP[k] = mean(pval)  # this is the cost function: we can monitor its increase

            rs = rs / cp.sum(rs, axis=1)[:, cp.newaxis]  # normalize so that probabilities sum to 1

            p = mean(rs[:, 0])  # mean probability to be assigned to Gaussian 1
            # new estimate of mean of cluster 1 (weighted by "responsibilities")
            mu1 = cp.dot(rs[:, 0], x) / cp.sum(rs[:, 0])
            # new estimate of mean of cluster 2 (weighted by "responsibilities")
            mu2 = cp.dot(rs[:, 1], x) / cp.sum(rs[:, 1])

            s1 = cp.dot(rs[:, 0], (x - mu1) ** 2) / cp.sum(rs[:, 0])  # new estimates of variances
            s2 = cp.dot(rs[:, 1], (x - mu2) ** 2) / cp.sum(rs[:, 1])

            if (k >= 10) and (k % 2 == 0):
                # starting at iteration 10, we start re-estimating the pursuit direction
                # that is, given the Gaussian cluster assignments, and the mean and variances,
                # we re-estimate w
                # these equations follow from the model
                StS = cp.matmul(
                    clp.T, clp * (rs[:, 0] / s1 + rs[:, 1] / s2)[:, cp.newaxis]) / nSpikes
                StMu = cp.dot(clp.T, rs[:, 0] * mu1 / s1 + rs[:, 1] * mu2 / s2) / nSpikes

                # this is the new estimate of the best pursuit direction
                w = cp.linalg.solve(StS.T, StMu)
                w = w / cp.sum(w ** 2) ** 0.5  # which we unit normalize
                x = cp.dot(clp, w)

        # these spikes are assigned to cluster 1
        ilow = rs[:, 0] > rs[:, 1]
        # the mean probability of spikes assigned to cluster 1
        plow = mean(rs[:, 0][ilow])
        phigh = mean(rs[:, 1][~ilow])  # same for cluster 2
        # the smallest cluster has this proportion of all spikes
        nremove = min(mean(ilow), mean(~ilow))

        # did this split fix the autocorrelograms?
        # compute the cross-correlogram between spikes in the putative new clusters
        ilow_cpu = cp.asnumpy(ilow)
        K, Qi, Q00, Q01, rir = ccg(ss[ilow_cpu], ss[~ilow_cpu], 500, dt)
        Q12 = (Qi / max(Q00, Q01)).min()  # refractoriness metric 1
        R = rir.min()  # refractoriness metric 2

        # if the CCG has a dip, don't do the split.
        # These thresholds are consistent with the ones from merges.
        if (Q12 < 0.25) and (R < 0.05):  # if both metrics are below threshold.
            nccg += 1  # keep track of how many splits were voided by the CCG criterion
            continue

        # now decide if the split would result in waveforms that are too similar
        # the reconstructed mean waveforms for putative cluster 1
        c1 = cp.matmul(wPCA, mean(clp0[ilow, :], 0).reshape((3, -1), order='F'))
        # the reconstructed mean waveforms for putative cluster 2
        c2 = cp.matmul(wPCA, mean(clp0[~ilow, :], 0).reshape((3, -1), order='F'))

        cc = cp.corrcoef(c1.ravel(), c2.ravel())  # correlation of mean waveforms
        n1 = sqrt(cp.sum(c1 ** 2))  # the amplitude estimate 1
        n2 = sqrt(cp.sum(c2 ** 2))  # the amplitude estimate 2

        r0 = 2 * abs((n1 - n2) / (n1 + n2))

        # if the templates are correlated, and their amplitudes are similar, stop the split!!!
        if (cc[0, 1] > 0.9) and (r0 < 0.2):
            continue

        # final criteria to continue with the split: if the split piece is more than 5% of all
        # spikes, if the split piece is more than 300 spikes, and if the confidences for
        # assigning spikes to both clusters exceeds a preset criterion ccsplit
        if (nremove > 0.05) and (min(plow, phigh) > ccsplit) and (
                min(cp.sum(ilow), cp.sum(~ilow)) > 300):
            # one cluster stays, one goes
            Nfilt += 1

            # the templates for the splits have been estimated from PC coefficients

            # (DEV_NOTES) code below involves multiple CuPy arrays changing shape to accommodate
            # the extra cluster, this could potentially be done more efficiently?
            dWU = cp.concatenate((
                cp.asarray(dWU), cp.zeros((*dWU.shape[:-1], 1), order='F')), axis=2)
            dWU[:, iC[:, iW[ik]], Nfilt - 1] = c2
            dWU[:, iC[:, iW[ik]], ik] = c1

            # the temporal components are therefore just the PC waveforms
            W = cp.asarray(W)
            W = cp.concatenate((W, cp.transpose(cp.atleast_3d(wPCA), (0, 2, 1))), axis=1)
            assert W.shape[1] == Nfilt

            # copy the best channel from the original template
            iW = cp.asarray(iW)
            iW = cp.pad(iW, (0, (Nfilt - len(iW))), mode='constant')
            iW[Nfilt - 1] = iW[ik]
            assert iW.shape[0] == Nfilt

            # copy the provenance index to keep track of splits
            isplit = cp.asarray(isplit)
            isplit = cp.pad(isplit, (0, (Nfilt - len(isplit))), mode='constant')
            isplit[Nfilt - 1] = isplit[ik]
            assert isplit.shape[0] == Nfilt

            st3[isp[ilow_cpu], 1] = Nfilt - 1  # overwrite spike indices with the new index

            # copy similarity scores from the original
            simScore = cp.asarray(simScore)
            simScore = cp.pad(
                simScore, (0, (Nfilt - simScore.shape[0])), mode='constant')
            simScore[:, Nfilt - 1] = simScore[:, ik]
            simScore[Nfilt - 1, :] = simScore[ik, :]
            simScore[ik, Nfilt - 1] = 1  # set the similarity with original to 1
            simScore[Nfilt - 1, ik] = 1  # set the similarity with original to 1
            assert simScore.shape == (Nfilt, Nfilt)

            # copy neighbor template list from the original
            iNeigh = cp.asarray(iNeigh)
            iNeigh = cp.pad(
                iNeigh, ((0, 0), (0, (Nfilt - iNeigh.shape[1]))), mode='constant')
            iNeigh[:, Nfilt - 1] = iNeigh[:, ik]
            assert iNeigh.shape[1] == Nfilt

            # copy neighbor channel list from the original
            iNeighPC = cp.asarray(iNeighPC)
            iNeighPC = cp.pad(
                iNeighPC, ((0, 0), (0, (Nfilt - iNeighPC.shape[1]))), mode='constant')
            iNeighPC[:, Nfilt - 1] = iNeighPC[:, ik]
            assert iNeighPC.shape[1] == Nfilt

            # try this cluster again
            # the cluster piece that stays at this index needs to be tested for splits again
            # before proceeding
            ik -= 1
            # the piece that became a new cluster will be tested again when we get to the end
            # of the list
            nsplits += 1  # keep track of how many splits we did

    logger.info(
        f'Finished splitting. Found {nsplits} splits, checked '
        f'{ik}/{Nfilt} clusters, nccg {nccg}')

    Nfilt = W.shape[1]  # new number of templates
    Nrank = 3
    Nchan = probe.Nchan
    # make a new Params to pass on parameters to CUDA
    Params = cp.array(
        [0, Nfilt, 0, 0, W.shape[0], Nnearest, Nrank, 0, 0, Nchan, NchanNear, nt0min, 0],
        dtype=cp.float64)

    # we need to re-estimate the spatial profiles

    # we get the time upsampling kernels again
    Ka, Kb = getKernels(params)
    # we run SVD
    W, U, mu = mexSVDsmall2(Params, dWU, W, iC, iW, Ka, Kb)

    # we re-compute similarity scores between templates
    WtW, iList = getMeWtW(W.astype(cp.float32), U.astype(cp.float32), Nnearest)

    # NOTE: from here on `isplit` is a boolean mask over simScore (pairs sharing a
    # parent), no longer the per-cluster provenance index built in the loop above.
    isplit = simScore == 1  # overwrite the similarity scores of clusters with same parent
    simScore = WtW.max(axis=2)
    simScore[isplit] = 1  # 1 means they come from the same parent

    iNeigh = iList[:, :Nfilt]  # get the new neighbor templates
    iNeighPC = iC[:, iW[:Nfilt]]  # get the new neighbor channels

    # for Phy, we need to pad the spikes with zeros so the spikes are aligned to the center of
    # the window
    Wphy = cp.concatenate(
        (cp.zeros((1 + nt0min, Nfilt, Nrank), order='F'), W), axis=0)

    return Bunch(
        st3_s=st3,
        W_s=W,
        U_s=U,
        mu_s=mu,
        simScore_s=simScore,
        iNeigh_s=iNeigh,
        iNeighPC_s=iNeighPC,
        Wphy=Wphy,
        iList=iList,
        isplit=isplit,
    )
def rezToPhy(ctx, dat_path=None, output_dir=None):
    """Export the sorting results held in ``ctx`` as files for the phy GUI.

    Parameters
    ----------
    ctx : context object
        Passed through ``checkClusters`` first; its ``probe``, ``intermediate``
        and ``params`` attributes are then read.
    dat_path : str, optional
        Written into ``params.py`` as ``dat_path = "../<dat_path>"``.
    output_dir : path-like
        Output folder. It is created if missing; every existing ``*.npy`` file in
        it and its ``.phy`` sub-folder are deleted before writing.
        NOTE(review): the default is ``None`` but ``Path(savePath)`` is called
        unconditionally, so a real folder appears to be required -- confirm.

    Returns
    -------
    None
        Everything is written to disk: ``templates.npy``,
        ``template_features.npy`` and ``pc_features.npy`` through ``NpyWriter``;
        the remaining ``*.npy`` arrays through ``cp.save``; three
        ``cluster_*.tsv`` tables; and ``params.py`` when it does not exist yet.
    """
    # pull out results from kilosort's rez to either return to workspace or to
    # save in the appropriate format for the phy GUI to run on. If you provide
    # a savePath it should be a folder

    savePath = output_dir
    Path(savePath).mkdir(exist_ok=True, parents=True)

    ctx = checkClusters(ctx)  # check clusters integrity

    probe = ctx.probe
    ir = ctx.intermediate
    params = ctx.params
    nt0 = params.nt0  # overwritten below from W.shape

    # spikeTimes will be in samples, not seconds
    W = cp.asarray(ir.Wphy).astype(np.float32)
    Wrot = ir.Wrot
    est_contam_rate = ir.est_contam_rate
    good = ir.good

    st3 = cp.asarray(ir.st3_c)

    U = cp.asarray(ir.U_s).astype(np.float32)
    iNeigh = ir.iNeigh_s
    iNeighPC = ir.iNeighPC_s
    simScore = ir.simScore_s

    # keep at most the first four columns of st3
    if st3.shape[1] > 4:
        st3 = st3[:, :4]

    # order the spikes by column 0 (used as spike times below)
    isort = cp.argsort(st3[:, 0])
    st3 = st3[isort, :]

    # remove the output of a previous export from the folder
    fs = os.listdir(savePath)
    for file in fs:
        if file.endswith('.npy'):
            os.remove(join(savePath, file))
    if os.path.isdir(join(savePath, '.phy')):
        shutil.rmtree(join(savePath, '.phy'))

    spikeTimes = st3[:, 0].astype(cp.uint64)
    spikeTemplates = st3[:, 1].astype(cp.uint32)

    # (DEV_NOTES) if statement below seems useless due to above if statement
    if st3.shape[1] > 4:
        spikeClusters = (1 + st3[:, 4]).astype(cp.uint32)

    templateFeatureInds = iNeigh.astype(cp.uint32)
    pcFeatureInds = iNeighPC.astype(cp.uint32)

    whiteningMatrix = cp.asarray(Wrot) / params.scaleproc
    whiteningMatrixInv = cp.linalg.pinv(whiteningMatrix)

    amplitudes = st3[:, 2]

    Nchan = probe.Nchan

    xcoords = probe.xc
    ycoords = probe.yc
    chanMap = probe.chanMap
    chanMap0ind = chanMap  # - 1

    nt0, Nfilt = W.shape[:2]

    # Templates are streamed to disk one at a time instead of being held in a
    # single (Nfilt, nt0, Nchan) array; the unwhitened peak-to-peak amplitude of
    # each template is recorded on the way.
    tempAmpsUnscaled = cp.zeros(Nfilt, dtype=np.float32)
    templates_writer = NpyWriter(join(savePath, 'templates.npy'), (Nfilt, nt0, Nchan), np.float32)
    for iNN in tqdm(range(Nfilt), desc="Computing templates"):
        t = cp.dot(U[:, iNN, :], W[:, iNN, :].T).T
        templates_writer.append(t)
        # unwhiten the template
        t_unw = cp.dot(t, whiteningMatrixInv)
        assert t_unw.ndim == 2
        # The amplitude on each channel is the positive peak minus the negative
        tempChanAmps = t_unw.max(axis=0) - t_unw.min(axis=0)
        # The template amplitude is the amplitude of its largest channel
        tempAmpsUnscaled[iNN] = tempChanAmps.max()

    templates_writer.close()

    # we include all channels so this is trivial
    templatesInds = cp.tile(np.arange(Nfilt), (Nchan, 1))

    # assign all spikes the amplitude of their template multiplied by their
    # scaling amplitudes
    spikeAmps = tempAmpsUnscaled[spikeTemplates] * amplitudes

    # take the average of all spike amps to get actual template amps (since
    # tempScalingAmps are equal mean for all templates)
    ta = clusterAverage(spikeTemplates, spikeAmps)
    tids = cp.unique(spikeTemplates).astype(np.int64)
    tempAmps = cp.zeros_like(tempAmpsUnscaled, order='F')
    tempAmps[tids] = ta  # because ta only has entries for templates that had at least one spike
    tempAmps = params.gain * tempAmps  # for consistency, make first dimension template number

    # PCs
    ix = ir.spikes_to_remove  # length: number of spikes BEFORE -1 cluster removed
    # on-disk shapes: one row per spike in st3, trailing dims as in ir.cProj / ir.cProjPC
    cProj_shape = ir.cProj.shape
    cProj_shape = (st3.shape[0],) + cProj_shape[1:]
    cProjPC_shape = ir.cProjPC.shape
    cProjPC_shape = (st3.shape[0],) + cProjPC_shape[1:]

    tfw = NpyWriter(join(savePath, 'template_features.npy'), cProj_shape, np.float32)
    pcw = NpyWriter(join(savePath, 'pc_features.npy'), cProjPC_shape, np.float32)

    isort = cp.asnumpy(isort)
    N = len(ix)  # number of spikes including those assigned to -1
    assert ir.cProj.shape[0] == N
    assert ir.cProjPC.shape[0] == N

    spikes_to_keep = np.nonzero(~ix)[0]  # indices of the spikes to keep in the cProj index space

    k = int(ceil(float(N) / 100))  # 100 chunks
    assert k >= 1
    for i in tqdm(range(0, N, k), desc="Saving template and PC features"):
        # NOTE: cProj and cProjPC still have the spikes assigned to -1 that have yet to be removed

        # spike indices in cProj that need to be kept in this chunk
        # (isort[i:i + k] is simply empty once i runs past len(isort))
        ind = spikes_to_keep[isort[i:i + k]]

        cProj = ir.cProj[ind]
        cProjPC = ir.cProjPC[ind]

        tfw.append(cProj)
        pcw.append(cProjPC)
    tfw.close()
    pcw.close()

    def _save(name, arr, dtype=None):
        # write `arr` to <savePath>/<name>.npy, cast to `dtype` when one is given
        cp.save(join(savePath, name + '.npy'), arr.astype(dtype or arr.dtype))

    if savePath is not None:
        _save('spike_times', spikeTimes)
        _save('spike_templates', spikeTemplates, cp.uint32)
        if st3.shape[1] > 4:
            _save('spike_clusters', spikeClusters, cp.uint32)
        else:
            _save('spike_clusters', spikeTemplates, cp.uint32)
        _save('amplitudes', amplitudes)
        _save('templates_ind', templatesInds)

        chanMap0ind = chanMap0ind.astype(cp.int32)

        _save('channel_map', chanMap0ind)
        _save('channel_positions', np.c_[xcoords, ycoords])

        _save('template_feature_ind', templateFeatureInds.T)
        _save('pc_feature_ind', pcFeatureInds.T)

        _save('whitening_mat', whiteningMatrix)
        _save('whitening_mat_inv', whiteningMatrixInv)

        # NOTE(review): the guard tests the key 'simScore' while the array saved
        # is ir.simScore_s -- confirm this is intended.
        if 'simScore' in ir:
            similarTemplates = simScore
            _save('similar_templates', similarTemplates)

        # NaN contamination estimates are replaced by 1 (modifies the array in place)
        est_contam_rate[np.isnan(est_contam_rate)] = 1
        with open(join(savePath, 'cluster_group.tsv'), 'w') as f:
            f.write('cluster_id\tgroup\n')
            # only clusters flagged as good get a row
            for j in range(len(good)):
                if good[j]:
                    f.write('%d\tgood\n' % j)

        with open(join(savePath, 'cluster_ContamPct.tsv'), 'w') as f:
            f.write('cluster_id\tContamPct\n')
            for j in range(len(good)):
                f.write('%d\t%.1f\n' % (j, 100 * est_contam_rate[j]))

        with open(join(savePath, 'cluster_Amplitude.tsv'), 'w') as f:
            f.write('cluster_id\tAmplitude\n')
            for j in range(len(good)):
                f.write('%d\t%.1f\n' % (j, tempAmps[j]))

        # make params file (an existing params.py is left untouched)
        if not os.path.exists(join(savePath, 'params.py')):
            with open(join(savePath, 'params.py'), 'w') as f:
                f.write('dat_path = "../%s"\n' % dat_path)
                f.write('n_channels_dat = %d\n' % probe.NchanTOT)
                f.write('dtype = "int16"\n')
                f.write('offset = 0\n')
                f.write('hp_filtered = False\n')
                f.write('sample_rate = %i\n' % params.fs)
                f.write('template_scaling = %.1f\n' % params.get('templateScaling', 1.0))
def randomized_svd_gpu(M, n_components, n_oversamples=10, n_iter='auto',
                       transpose='auto', random_state=0, lib='cupy'):
    """Computes a truncated randomized SVD on GPU. Adapted from Sklearn.

    Parameters
    ----------
    M : ndarray or sparse matrix
        Matrix to decompose
    n_components : int
        Number of singular values and vectors to extract.
    n_oversamples : int (default is 10)
        Additional number of random vectors to sample the range of M so as
        to ensure proper conditioning. The total number of random vectors
        used to find the range of M is n_components + n_oversamples. Smaller
        number can improve speed but can negatively impact the quality of
        approximation of singular vectors and singular values.
    n_iter : int or 'auto' (default is 'auto')
        Number of power iterations. It can be used to deal with very noisy
        problems. When 'auto', it is set to 4, unless `n_components` is small
        (< .1 * min(X.shape)) in which case it is set to 7. This improves
        precision with few components.
    transpose : True, False or 'auto' (default)
        Whether the algorithm should be applied to M.T instead of M. The
        result should approximately be the same. The 'auto' mode will
        trigger the transposition if M.shape[1] > M.shape[0] since this
        implementation of randomized SVD tend to be a little faster in that
        case.
    random_state : int, RandomState instance or None, optional (default=0)
        The seed of the pseudo random number generator to use when drawing
        the random projection vectors. If int, random_state is the seed used
        by the random number generator; if RandomState instance, random_state
        is the random number generator; if None, the random number generator
        is the RandomState instance used by `np.random`. Only the 'cupy'
        branch draws from it; the 'pytorch' branch samples on the device with
        ``normal_()``.
    lib : {'cupy', 'pytorch'}, str optional
        Chooses the GPU library to be used.

    Returns
    -------
    U, s, V : arrays of the selected library
        Truncated left singular vectors, singular values and right singular
        vectors (as rows of ``V``), each cut to ``n_components``.

    Raises
    ------
    ValueError
        If ``lib`` is neither 'cupy' nor 'pytorch'.

    Notes
    -----
    This algorithm finds a (usually very good) approximate truncated
    singular value decomposition using randomization to speed up the
    computations. It is particularly fast on large matrices on which
    you wish to extract only a small number of components. In order to
    obtain further speed up, `n_iter` can be set <=2 (at the cost of
    loss of precision).

    References
    ----------
    * Finding structure with randomness: Stochastic algorithms for constructing
      approximate matrix decompositions
      Halko, et al., 2009 http://arxiv.org/abs/arXiv:0909.4061
    * A randomized algorithm for the decomposition of matrices
      Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert
    * An implementation of a randomized algorithm for principal component
      analysis
      A. Szlam et al. 2014
    """
    # Fail early and loudly instead of silently returning None.
    if lib not in ('cupy', 'pytorch'):
        raise ValueError(
            "lib must be 'cupy' or 'pytorch', got {!r}".format(lib))

    random_state = check_random_state(random_state)
    n_random = n_components + n_oversamples
    n_samples, n_features = M.shape

    if n_iter == 'auto':
        # Checks if the number of iterations is explicitly specified
        n_iter = 7 if n_components < .1 * min(M.shape) else 4

    if transpose == 'auto':
        transpose = n_samples < n_features
    if transpose:
        M = M.T  # this implementation is a bit faster with smaller shape[1]

    if lib == 'cupy':
        M = cupy.asarray(M)

        # Generating normal random vectors with shape: (M.shape[1], n_random)
        Q = cupy.asarray(random_state.normal(size=(M.shape[1], n_random)))

        # Perform power iterations with Q to further 'imprint' the top
        # singular vectors of M in Q
        for _ in range(n_iter):
            Q = cupy.dot(M, Q)
            Q = cupy.dot(M.T, Q)

        # Sample the range of M using by linear projection of Q.
        # Extract an orthonormal basis
        Q, _ = cupy.linalg.qr(cupy.dot(M, Q), mode='reduced')

        # project M to the (k + p) dimensional space using the basis vectors
        B = cupy.dot(Q.T, M)

        # compute the SVD on the thin matrix: (k + p) wide
        # (cupy follows numpy here: the third output is already V^T)
        Uhat, s, V = cupy.linalg.svd(B, full_matrices=False, compute_uv=True)
        del B
        U = cupy.dot(Q, Uhat)

        if transpose:
            # transpose back the results according to the input convention
            return (V[:n_components, :].T, s[:n_components],
                    U[:, :n_components].T)
        return U[:, :n_components], s[:n_components], V[:n_components, :]

    # lib == 'pytorch'
    M_gpu = torch.Tensor.cuda(torch.from_numpy(M.astype('float32')))

    # Generating normal random vectors with shape: (M.shape[1], n_random)
    Q = torch.cuda.FloatTensor(M_gpu.shape[1], n_random).normal_()

    # Perform power iterations with Q to further 'imprint' the top
    # singular vectors of M in Q
    for _ in range(n_iter):
        Q = torch.mm(M_gpu, Q)
        Q = torch.mm(torch.transpose(M_gpu, 0, 1), Q)

    # Sample the range of M using by linear projection of Q.
    # Extract an orthonormal basis
    Q, _ = torch.qr(torch.mm(M_gpu, Q))

    # project M to the (k + p) dimensional space using the basis vectors
    B = torch.mm(torch.transpose(Q, 0, 1), M_gpu)

    # compute the SVD on the thin matrix: (k + p) wide
    Uhat, s, V = torch.svd(B)
    del B
    U = torch.mm(Q, Uhat)
    # torch.svd returns V with B = U diag(s) V^T (unlike numpy/cupy, which
    # return V^T), so transpose it before slicing rows.
    Vt = torch.transpose(V, 0, 1)

    if transpose:
        # transpose back the results according to the input convention
        return (torch.transpose(Vt[:n_components, :], 0, 1),
                s[:n_components],
                torch.transpose(U[:, :n_components], 0, 1))
    return U[:, :n_components], s[:n_components], Vt[:n_components, :]
def _derivative(self, x):
    """Compute the derivative of P(x)

    The parameters ``self.w`` are viewed as one tensor per feature with shape
    ``(n_features, d, D, D, mu)``. Partial contractions coming from the left
    and from the right are cached so that the derivative with respect to each
    tensor is the contraction of all the other tensors.

    Requires ``n_features >= 2``.

    Parameters
    ----------
    x : numpy array, shape (n_features,)
        One configuration

    Returns
    -------
    derivative : numpy array, shape (m_parameters,)
        Complex array; only the entries of the tensors selected by ``x`` are
        non-zero.
    """
    n = self.n_features
    D = self.D
    DD = D * D
    w2 = np.reshape(self.w, (n, self.d, D, D, self.mu))
    derivative = np.zeros((n, self.d, D, D, self.mu), dtype=np.complex128)

    # Store intermediate tensor contractions for the derivatives:
    # left to right and right to left.
    # tmp[i, :] stores the contraction of the first i+1 tensors from the left,
    # tmp2[i, :] the remaining tensors on the right; the full contraction is
    # tmp[i-1] w[i] tmp2[i+1].
    tmp = np.zeros((n, DD), dtype=np.complex128)
    tmp2 = np.zeros((n, DD), dtype=np.complex128)

    # Boundary tensors selected by x and their contraction with their conjugate.
    first = w2[0, x[0], 0, :, :]
    last = w2[n - 1, x[n - 1], :, 0, :]
    first_env = np.einsum('ij,kj->ik', first, np.conj(first)).reshape(DD)
    last_env = np.einsum('ij,kj->ik', last, np.conj(last)).reshape(DD)

    # Left-to-right sweep.
    tmp[0, :] = first_env
    for i in range(1, n - 1):
        site = w2[i, x[i], :, :, :]
        transfer = np.einsum('imj,klj->ikml', site,
                             np.conj(site)).reshape((DD, DD))
        tmp[i, :] = np.dot(tmp[i - 1, :], transfer)
    mpscontracted = np.inner(tmp[n - 2, :], last_env)
    tmp[n - 1, :] = mpscontracted

    # Right-to-left sweep. Site 0 is a boundary tensor and is handled after
    # the loop, so the loop stops at site 1.
    tmp2[n - 1, :] = last_env
    for i in range(n - 2, 0, -1):
        site = w2[i, x[i], :, :, :]
        transfer = np.einsum('imj,klj->ikml', site,
                             np.conj(site)).reshape((DD, DD))
        tmp2[i, :] = np.dot(transfer, tmp2[i + 1, :])
    tmp2[0, :] = np.inner(first_env, tmp2[1, :])

    # Now for each tensor, the derivative is the contraction of the rest of
    # the tensors.
    derivative[0, x[0], 0, :, :] = 2 * np.einsum(
        'ij,il->lj', first, tmp2[1, :].reshape(D, D))
    derivative[n - 1, x[n - 1], :, 0, :] = 2 * np.einsum(
        'ij,il->lj', last, tmp[n - 2, :].reshape(D, D))
    for i in range(1, n - 1):
        left_env = tmp[i - 1, :].reshape(D, D)
        right_env = tmp2[i + 1, :].reshape(D, D)
        derivative[i, x[i], :, :, :] = 2 * np.einsum(
            'ikm,ij,kl->jlm', w2[i, x[i], :, :, :], left_env, right_env)

    return derivative.reshape(self.m_parameters)