def moments(data, n_neighbors=30, n_pcs=None, mode='connectivities', method='umap', use_rep=None, copy=False):
    """Computes moments for velocity estimation.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    n_neighbors: `int` (default: 30)
        Number of neighbors to use.
    n_pcs: `int` (default: None)
        Number of principal components to use. If not specified, the full
        space of a pre-computed PCA is used, or 30 components are used when
        PCA is computed internally.
    mode: `'connectivities'` or `'distances'` (default: `'connectivities'`)
        Distance metric to use for moment computation.
    method: {'umap', 'gauss', 'hnsw', 'sklearn', `None`} (default: `'umap'`)
        Use 'umap' [McInnes18]_ or 'gauss' (Gauss kernel following [Coifman05]_
        with adaptive width [Haghverdi16]_) for computing connectivities.
    use_rep: `None`, `'X'` or any key for `.obsm` (default: None)
        Use the indicated representation. If `None`, the representation is
        chosen automatically: for `.n_vars` < 50, `.X` is used, otherwise
        'X_pca' is used.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to adata.

    Returns
    -------
    Returns or updates `adata` with the attributes
    Ms: `.layers`
        dense matrix with first order moments of spliced counts.
    Mu: `.layers`
        dense matrix with first order moments of unspliced counts.
    """
    adata = data.copy() if copy else data

    if 'spliced' not in adata.layers.keys() or 'unspliced' not in adata.layers.keys():
        raise ValueError('Could not find spliced / unspliced counts.')

    if any([not_yet_normalized(adata.layers[layer]) for layer in {'spliced', 'unspliced'}]):
        normalize_per_cell(adata)

    if neighbors_to_be_recomputed(adata, n_neighbors=n_neighbors):
        if use_rep is None:
            use_rep = 'X_pca'
        neighbors(adata, n_neighbors=n_neighbors, use_rep=use_rep, n_pcs=n_pcs, method=method)

    if mode not in adata.uns['neighbors']:
        raise ValueError('mode can only be \'connectivities\' or \'distances\'')

    logg.info('computing moments based on ' + str(mode), r=True)

    connectivities = get_connectivities(adata, mode, n_neighbors=n_neighbors, recurse_neighbors=False)

    adata.layers['Ms'] = csr_matrix.dot(connectivities, csr_matrix(adata.layers['spliced'])).astype(np.float32).A
    adata.layers['Mu'] = csr_matrix.dot(connectivities, csr_matrix(adata.layers['unspliced'])).astype(np.float32).A
    # if renormalize: normalize_per_cell(adata, layers={'Ms', 'Mu'}, enforce=True)

    logg.info('    finished', time=True, end=' ' if settings.verbosity > 2 else '\n')
    logg.hint(
        'added \n'
        '    \'Ms\' and \'Mu\', moments of spliced/unspliced abundances (adata.layers)')
    return adata if copy else None
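# --- Usage sketch (not from the source): this `moments` flavor depends on
# module-internal helpers (normalize_per_cell, get_connectivities, logg, ...),
# so it is shown here through scvelo's public API rather than as a standalone
# call; `adata` is an AnnData with 'spliced'/'unspliced' layers.
# import scvelo as scv
# scv.pp.moments(adata, n_neighbors=30)   # adds adata.layers['Ms'] and ['Mu']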
def get_moments(adata, layer=None, second_order=None, centered=True, mode="connectivities"):
    """Computes moments for a specified layer.

    First and second order moments. If centered, that corresponds to means
    and variances across nearest neighbors.

    Arguments
    ---------
    adata: `AnnData`
        Annotated data matrix.
    layer: `str` (default: `None`)
        Key of layer with abundances to consider for moment computation.
    second_order: `bool` (default: `None`)
        Whether to compute second order moments from abundances.
    centered: `bool` (default: `True`)
        Whether to compute centered (=variance) or uncentered second order moments.
    mode: `'connectivities'` or `'distances'` (default: `'connectivities'`)
        Distance metric to use for moment computation.

    Returns
    -------
    Mx: first or second order moments
    """
    if "neighbors" not in adata.uns:
        raise ValueError(
            "You need to run `pp.neighbors` first to compute a neighborhood graph."
        )
    connectivities = get_connectivities(adata, mode=mode)

    X = (adata.X if layer is None
         else adata.layers[layer] if isinstance(layer, str)
         else layer)
    X = (csr_matrix(X) if isinstance(layer, str) and layer in {"spliced", "unspliced"}
         else np.array(X) if not issparse(X)
         else X)

    if not issparse(X):
        X = X[:, ~np.isnan(X.sum(0))]
    if second_order:
        X2 = X.multiply(X) if issparse(X) else X**2
        Mx = (csr_matrix.dot(connectivities, X2) if second_order
              else csr_matrix.dot(connectivities, X))
        if centered:
            mu = csr_matrix.dot(connectivities, X)
            mu2 = mu.multiply(mu) if issparse(mu) else mu**2
            Mx = Mx - mu2
    else:
        Mx = csr_matrix.dot(connectivities, X)
    if issparse(X):
        Mx = Mx.astype(np.float32).A
    return Mx
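# --- Usage sketch (not from the source): with a neighbor graph already in
# `adata.uns['neighbors']`, per-gene neighbor means and variances follow
# directly from the function above.
# means = get_moments(adata, layer='spliced')                          # first order
# variances = get_moments(adata, layer='spliced', second_order=True)   # centered second order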
def moments(adata, n_neighbors=30, n_pcs=30, mode='connectivities', renormalize=False, copy=False):
    """Computes first order moments for velocity estimation.

    Arguments
    ---------
    adata: :class:`~anndata.AnnData`
        Annotated data matrix.
    n_neighbors: `int` (default: 30)
        Number of neighbors to use.
    n_pcs: `int` (default: 30)
        Number of principal components to use.
    mode: `'connectivities'` or `'distances'` (default: `'connectivities'`)
        Distance metric to use for moment computation.
    renormalize: `bool` (default: `False`)
        Renormalize the moments by total counts per cell to its median.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to adata.

    Returns
    -------
    Returns or updates `adata` with the attributes
    Ms: `.layers`
        dense matrix with first order moments of spliced counts.
    Mu: `.layers`
        dense matrix with first order moments of unspliced counts.
    """
    if 'neighbors' not in adata.uns.keys() or n_neighbors > adata.uns['neighbors']['params']['n_neighbors']:
        from scanpy.api.pp import neighbors, pca
        if 'X_pca' not in adata.obsm.keys() or n_pcs > adata.obsm['X_pca'].shape[1]:
            pca(adata, n_comps=n_pcs, svd_solver='arpack')
        neighbors(adata, n_neighbors=n_neighbors, use_rep='X_pca')

    if mode not in adata.uns['neighbors']:
        raise ValueError('mode can only be \'connectivities\' or \'distances\'')

    logg.info('computing moments', r=True)

    normalize_layers(adata)

    connectivities = get_connectivities(adata, mode)
    # connectivities += connectivities.dot(connectivities * .5)

    adata.layers['Ms'] = csr_matrix.dot(connectivities, csr_matrix(adata.layers['spliced'])).A
    adata.layers['Mu'] = csr_matrix.dot(connectivities, csr_matrix(adata.layers['unspliced'])).A

    if renormalize:
        normalize_layers(adata, layers={'Ms', 'Mu'})

    logg.info('    finished', time=True, end=' ' if settings.verbosity > 2 else '\n')
    logg.hint(
        'added to `.layers`\n'
        '    \'Ms\', moments of spliced abundances\n'
        '    \'Mu\', moments of unspliced abundances')
    return adata if copy else None
def terminal_states(data, vkey='velocity', self_transitions=False, basis=None,
                    weight_diffusion=0, scale_diffusion=1, eps=1e-3, copy=False):
    """Computes terminal states (root and end points) via eigenvalue decomposition.
    """
    adata = data.copy() if copy else data
    connectivities = get_connectivities(adata, 'distances')

    logg.info('computing root cells', r=True, end=' ')
    T = transition_matrix(adata, vkey=vkey, basis=basis, weight_diffusion=weight_diffusion,
                          scale_diffusion=scale_diffusion, self_transitions=self_transitions, backward=True)
    eigvecs = eigs(T, eps=eps, perc=[2, 98])[1]
    eigvec = csr_matrix.dot(connectivities, eigvecs).sum(1)
    eigvec = np.clip(eigvec, 0, np.percentile(eigvec, 98))
    adata.obs['root'] = scale(eigvec)
    logg.info('using ' + str(eigvecs.shape[1]) + ' eigenvectors with eigenvalue 1.')

    logg.info('computing end points', end=' ')
    T = transition_matrix(adata, vkey=vkey, basis=basis, weight_diffusion=weight_diffusion,
                          scale_diffusion=scale_diffusion, self_transitions=self_transitions, backward=False)
    eigvecs = eigs(T, eps=eps, perc=[2, 98])[1]
    eigvec = csr_matrix.dot(connectivities, eigvecs).sum(1)
    eigvec = np.clip(eigvec, 0, np.percentile(eigvec, 98))
    adata.obs['end'] = scale(eigvec)
    logg.info('using ' + str(eigvecs.shape[1]) + ' eigenvectors with eigenvalue 1.')

    logg.info('    finished', time=True, end=' ' if settings.verbosity > 2 else '\n')
    logg.hint(
        'added\n'
        '    \'root\', root cells of Markov diffusion process (adata.obs)\n'
        '    \'end\', end points of Markov diffusion process (adata.obs)')
    return adata if copy else None
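# --- Usage sketch (not from the source): `terminal_states` assumes a velocity
# field and velocity graph were computed upstream, so `transition_matrix` can
# be built; it then writes per-cell scores into adata.obs.
# terminal_states(adata, vkey='velocity')
# adata.obs[['root', 'end']]   # root / end-point scores per cell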
def moments(data, n_neighbors=30, n_pcs=30, mode='connectivities', use_rep=None,
            recurse_neighbors=False, renormalize=False, copy=False):
    """Computes moments for velocity estimation.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    n_neighbors: `int` (default: 30)
        Number of neighbors to use.
    n_pcs: `int` (default: 30)
        Number of principal components to use.
    mode: `'connectivities'` or `'distances'` (default: `'connectivities'`)
        Distance metric to use for moment computation.
    use_rep: `str` (default: `None`)
        Representation to compute neighbors on; 'X_pca' if not specified.
    recurse_neighbors: `bool` (default: `False`)
        Whether to recurse the neighbor graph (passed to `get_connectivities`).
    renormalize: `bool` (default: `False`)
        Renormalize the moments by total counts per cell to its median.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to adata.

    Returns
    -------
    Returns or updates `adata` with the attributes
    Ms: `.layers`
        dense matrix with first order moments of spliced counts.
    Mu: `.layers`
        dense matrix with first order moments of unspliced counts.
    """
    adata = data.copy() if copy else data

    if 'spliced' not in adata.layers.keys() or 'unspliced' not in adata.layers.keys():
        raise ValueError('Could not find spliced / unspliced counts.')

    if 'neighbors' not in adata.uns.keys() or n_neighbors > adata.uns['neighbors']['params']['n_neighbors']:
        neighbors(adata, n_neighbors=n_neighbors, use_rep=('X_pca' if use_rep is None else use_rep), n_pcs=n_pcs)

    if mode not in adata.uns['neighbors']:
        raise ValueError('mode can only be \'connectivities\' or \'distances\'')

    logg.info('computing moments based on ' + str(mode), r=True)

    normalize_layers(adata)

    connectivities = get_connectivities(adata, mode, n_neighbors=n_neighbors, recurse_neighbors=recurse_neighbors)

    adata.layers['Ms'] = csr_matrix.dot(connectivities, csr_matrix(adata.layers['spliced'])).astype(np.float32).A
    adata.layers['Mu'] = csr_matrix.dot(connectivities, csr_matrix(adata.layers['unspliced'])).astype(np.float32).A
    if renormalize:
        normalize_layers(adata, layers={'Ms', 'Mu'}, enforce=True)

    logg.info('    finished', time=True, end=' ' if settings.verbosity > 2 else '\n')
    logg.hint(
        'added \n'
        '    \'Ms\' and \'Mu\', moments of spliced/unspliced abundances (adata.layers)')
    return adata if copy else None
def collaborative_recommend_method(rated_dict):
    rating_sparse = load_sparse_csr(data_path + "user_rating_matrix_sparse.npz")
    with open(data_path + "movie_id_index", "r") as fin:
        movie_id_index = yaml.safe_load(fin)
    with open(data_path + "movie_index_id", "r") as fin:
        movie_index_id = yaml.safe_load(fin)

    col = []
    data = []
    for key, value in rated_dict.items():
        try:
            key = key.strip()
            movie_index = movie_id_index[key]
            col.append(movie_index)
            data.append(float(value))
        except KeyError:
            continue
    # all entries belong to the single user row 0; sizing by len(col) keeps
    # row/col/data aligned even when some movie ids are not found above
    row = np.zeros(len(col))

    user_rate = coo_matrix((data, (row, col)), shape=(1, 10197)).tocsr()
    similarities = csr_matrix(cosine_similarity(user_rate, rating_sparse))
    predict_rating = csr_matrix.dot(similarities, rating_sparse)
    predict_rating_sorted = predict_rating.getrow(0).toarray().ravel()
    # sort predicted rating result and fetch top 20 movies
    predict_rating_top = heapq.nlargest(20, range(len(predict_rating_sorted)),
                                        predict_rating_sorted.__getitem__)
    predict_rating_top_mapped = list(map(lambda x: movie_index_id[str(x)], predict_rating_top))
    # drop movies the user has already rated, then keep the top 10
    rated_ids = [k.strip() for k in rated_dict.keys()]
    predict_rating_top_selected = list(filter(lambda x: x not in rated_ids, predict_rating_top_mapped))
    predict_rating_top_final = predict_rating_top_selected[:10]
    return predict_rating_top_final
def backward(ctx, grad_output):
    # This is a pattern that is very convenient - at the top of backward
    # unpack saved_tensors and initialize all gradients w.r.t. inputs to
    # None. Thanks to the fact that additional trailing Nones are
    # ignored, the return statement is simple even when the function has
    # optional inputs.
    sparse_input, weight, bias = ctx.saved_tensors
    grad_input = grad_weight = grad_bias = None

    # These needs_input_grad checks are optional and there only to
    # improve efficiency. If you want to make your code simpler, you can
    # skip them. Returning gradients for inputs that don't require it is
    # not an error.
    # if ctx.needs_input_grad[0]:
    #     grad_input = grad_output.mm(weight)
    grad_output_np = grad_output.data.numpy()
    # print grad_output_np
    sparse_input_np = csr_matrix(sparse_input.data.numpy().T)
    # print sparse_input_np
    if ctx.needs_input_grad[1]:
        grad_weight = torch.autograd.Variable(
            torch.from_numpy((csr_matrix.dot(sparse_input_np, grad_output_np)).T).float())
    # if bias is not None and ctx.needs_input_grad[2]:
    #     grad_bias = grad_output.sum(0).squeeze(0)

    return grad_input, grad_weight, grad_bias
def stream(B, a, k, PSI, Y, M, ord1, i, u, is_sparse, beg):
    Q = []
    Q1 = []
    w = []
    if is_sparse == 1:
        PSI = PSI + SM.dot(a.T, a)
    else:
        PSI = PSI + np.dot(a.T, a)
    Z = Alg1(PSI, k)  # Z and PSI are matrices
    for y in Y:
        BB = B[y]
        C = np.dot(np.array(BB), Z)
        s = np.sum(np.power(C, 2), 1)  # line 14
        ord1[y] = s.tolist()
        My = M[y].copy()
        ord2 = ord1[y].copy()
        for mm in range(len(M[y])):
            if u[np.mod(mm, 10000), y] > (s[mm] / (s[mm] + k)):
                My.remove(M[y][mm])
                ord2.remove(ord1[y][mm])
        M[y] = My
        ord1[y] = ord2
    for y in Y:
        if len(M[y]) == 1:
            Q.append(M[y])
            Q1.append(ord1[y])
    for q in Q1:
        w.append(k / (len(Q1) * q[0]))
    return PSI, Q, w, s, M, time.time() - beg
def canonicalCorrelations(self, datasets=None):
    """
    Take :py:attr:`k` datasets and return the :py:attr:`k` canonical correlations.

    :param datasets: A list of numpy.arrays.
    """
    if datasets is not None:
        XZ = [
            csr_matrix.dot(X, Z.transpose())
            for X, Z in zip(datasets, self.ZZ)
        ]
        k = self.getK()
        corrs = np.zeros((k, len(XZ), len(XZ)))
        for i in range(len(XZ)):
            corrs[:, i, i] = np.ones(k)
            for j in range(i + 1, len(XZ)):
                for cc in range(k):
                    corrs[cc, j, i] = np.corrcoef(XZ[i][:, cc], XZ[j][:, cc], rowvar=False)[0, 1]
        for cc in range(k):
            # fill in the upper triangles with the transpose of the lower
            corrs[cc, :, :] = corrs[cc, :, :] + np.tril(corrs[cc, :, :], -1).T
    else:
        corrs = None
        # TODO: save canonical correlations during CCA.fit
    return np.nan_to_num(corrs)
def run_rwr(P, alpha=0.9, eps=1e-4, max_iters=10, verbose=False):
    """
    Run Random Walk with Restarts on a graph.

    *P*: normalized csr scipy sparse matrix
    *alpha*: restart parameter
    *max_iters*: maximum number of iterations
    *eps*: maximum difference of node scores from one iteration to the next
    """
    # initialize with a 1 along each diagonal
    P0 = eye(P.shape[0])
    # matrix of node score vectors
    X = csr_matrix(P.shape)
    prev_X = csr_matrix(P.shape)
    for iters in trange(1, max_iters + 1):
        X = alpha * csr_matrix.dot(P, prev_X) + ((1 - alpha) * P0)

        max_d = (X - prev_X).max()
        if verbose:
            print("\t\titer %d max score change: %0.6f" % (iters, max_d))
        if max_d < eps:
            # converged!
            break
        prev_X = X.copy()

    if iters == max_iters:
        print("Reached max iters %d" % (max_iters))
    else:
        print("RWR converged after %d iters" % (iters))

    return X
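# --- Toy example (not from the source): run the RWR above on a tiny
# column-normalized star graph. The function assumes `eye`/`csr_matrix` from
# scipy.sparse and `trange` from tqdm are in scope, imported here explicitly.
import numpy as np
from scipy.sparse import csr_matrix, eye
from tqdm import trange

_A = np.array([[0., 1., 1.],
               [1., 0., 0.],
               [1., 0., 0.]])
_P = csr_matrix(_A / _A.sum(axis=0))   # column-normalize the adjacency matrix
_X = run_rwr(_P, alpha=0.9, eps=1e-6, max_iters=50)
print(_X.toarray())                    # column j: RWR scores restarting from node j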
def evaluate(self, t, algo=''):
    assert self.seeds is not None
    assert self.graph_type == 1
    self.graph = self.graph.tocsr()
    colors = self.seeds
    # for i in range(self.total_nodes):
    #     if colors[i] == 0:
    #         colors[i] = random.choice([1, -1])
    for i in range(0, t):
        colors = csr_matrix.dot(self.graph, colors)
    if save:  # `save` is a module-level flag
        if self.only_scc:
            np.save(
                'saved/epinions_{}_{}_{}_{}_scc.npy'.format(
                    algo, t, sum(abs(self.seeds)), sum(self.targets)), colors)
        else:
            np.save(
                'saved/epinions_{}_{}_{}_{}.npy'.format(
                    algo, t, sum(abs(self.seeds)), sum(self.targets)), colors)
    res1 = Network.eval_stats_purple(self.targets, self.partitions, colors)
    res2 = Network.eval_stats(self.targets, self.partitions, colors)
    self.seeds = None
    result = (res1['P+ C+'] + res1['P- C-'], res2['P+ C+'] + res2['P- C-'])
    return result  # - result['p+ c-'] - result['p- c+']
def forward(ctx, sparse_input, weight, bias=None):
    sparse_input_np = csr_matrix(sparse_input.data.numpy())
    ctx.save_for_backward(sparse_input, weight, bias)
    weight_np = weight.data.numpy().T
    output = csr_matrix.dot(sparse_input_np, weight_np)
    output = torch.autograd.Variable(torch.from_numpy(output).float())
    if bias is not None:
        output += bias.unsqueeze(0).expand_as(output)
    return output
def _calc_cutX4(laplace_spmat, part_vec):
    '''
    The method calculates and returns 4 times the size of the cut
    from a partition vector.
    '''
    # x'Lx = 4w(E(P1, P2))
    Lx = csr_matrix.dot(laplace_spmat, part_vec)
    cut_x4 = np.dot(part_vec, Lx)
    return cut_x4
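# --- Quick check (not from the source) of the identity x'Lx = 4 * cut:
# the path graph 0-1-2 split as {0} vs {1, 2} has one crossing edge, so the
# function should return 4.
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import laplacian

_A = csr_matrix(np.array([[0., 1., 0.],
                          [1., 0., 1.],
                          [0., 1., 0.]]))
_L = csr_matrix(laplacian(_A))
print(_calc_cutX4(_L, np.array([1., -1., -1.])))   # 4.0 (+1/-1 encode the sides)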
def kmeans_plspls1(A, w, eps, V, clus_num, we, alfa_app, is_sparse, is_jl):
    """
    This function runs the kmeans++ initialization algorithm;
    each point is chosen under the sine probability.
    Input:
        A: data matrix, n points, each on a sphere of dimension d.
        clus_num (k): number of required centroids to find.
    Output:
        Cents: clus_num initial centroids, each of dimension d.
    """
    if is_sparse == 1:
        A = SM(A)
    if is_jl == 1:
        dex = int(clus_num * np.log(A.shape[0]))
        ran = np.random.randn(A.shape[1], dex)
        A = SM.dot(A, ran)
        is_sparse = 0
    # A = np.multiply(w1, A)
    num_of_samples = A.shape[0]
    if any(np.isnan(np.ravel(w))) + any(np.isinf(np.ravel(w))):
        # choosing an arbitrary point as the first centroid
        Cents = A[np.random.choice(num_of_samples, size=1), :]
    else:
        w[w < 0] = 0
        # choosing a weight-proportional point as the first centroid
        Cents = A[np.random.choice(num_of_samples, size=1, p=np.ravel(w) / np.sum(np.ravel(w))), :]
    if is_sparse == 1:
        PA = make_P(A)
    else:
        PA = make_P_dense(A)
    fcost = alfa_app * 1.1
    h1 = 1
    inds = []
    while Cents.shape[0] < clus_num + 1:
        Cents2 = Cents[h1 - 1:h1, :]
        if is_sparse == 1:
            Pmina, tags, _ = squaredis(PA, Cents2)
        else:
            Pmina, tags, _ = squaredis_dense(PA, Cents2)
        if h1 == 1:
            Pmin = Pmina
        else:
            Pmin = np.minimum(Pmin, Pmina)
        Pmin[np.asarray(inds)] = 0
        Pmin[Pmin < 0] = 0
        Pmin00 = np.multiply(w, Pmin)
        Pmin0 = Pmin00 / np.sum(Pmin00)
        if any(np.isnan(np.ravel(Pmin0))) + any(np.isinf(np.ravel(Pmin0))):
            ind = np.random.choice(Pmin.shape[0], 1)
        else:
            Pmin0[Pmin0 < 0] = 0
            ind = np.random.choice(Pmin.shape[0], 1, p=Pmin0)
        if is_sparse == 1:
            Cents = vstack((Cents, A[ind, :]))
        else:
            Cents = np.concatenate((Cents, A[ind, :]), 0)
        inds.append(ind)
        h1 = h1 + 1
    return Cents, inds
def dot(A, B):
    if type(A) is spm and type(B) is spm:
        return spm.dot(A, B)
    elif is_dense(A) and is_dense(B):
        return np.dot(A, B)
    elif type(A) is DST and type(B) is DST:
        return A.matmul(B)
    else:
        raise NotImplementedError()
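# --- Usage sketch (not from the source): the dispatch above relies on
# module-level `spm`, `is_dense` and `DST`; the two definitions below are
# plausible stand-ins for the first two (DST, a dense-tensor type, is only
# touched when both operands are of that type).
import numpy as np
from scipy.sparse import csr_matrix as spm

is_dense = lambda x: isinstance(x, np.ndarray)   # hypothetical stand-in

print(dot(spm(np.eye(2)), spm(np.ones((2, 2)))).toarray())  # sparse @ sparse
print(dot(np.eye(2), np.ones((2, 2))))                      # dense @ dense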
def calculate_F(w, Xtr, Ytr):
    """
    Calculate the value of the primal objective.
    """
    w = csr_matrix(w)
    wx = csr_matrix.dot(w, Xtr.T)
    ywx = wx.multiply(Ytr)
    # sum of hinge-loss slack terms over margin-violating points
    z = (ywx < 1).toarray()
    constraint = (1 - ywx.toarray()[z]).sum(axis=0)
    f = 0.5 * (np.linalg.norm(w.toarray())) ** 2 + constraint
    return f
def calculate_F(w, Xtr, Ytr):
    """
    calculate value of primal objective
    """
    w = csr_matrix(w)
    wx = csr_matrix.dot(w, Xtr.T)
    ywx = wx.multiply(Ytr)
    # calculate sum of slack variables
    slackSum = (1 - ywx.toarray()[(ywx < 1).toarray()]).sum(axis=0)
    f = 0.5 * (np.linalg.norm(w.toarray())) ** 2 + slackSum
    return f
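# --- Toy evaluation (not from the source): the primal SVM objective on two
# points, with Xtr an (n, d) sparse matrix and Ytr a (1, n) sparse label row,
# as the code above assumes.
import numpy as np
from scipy.sparse import csr_matrix

_Xtr = csr_matrix(np.array([[1., 0.], [0., 1.]]))
_Ytr = csr_matrix(np.array([[1., -1.]]))
print(calculate_F(np.array([0.5, -0.5]), _Xtr, _Ytr))  # 0.5*||w||^2 + slack = 1.25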
def alaa_coreset(wiki0, j, eps, w, is_pca, spar):
    """
    our algorithm, equivalent to Algorithm 1 in the paper.
    input:
        wiki0: data matrix
        j: dimension of the approximated subspace
        eps: determines coreset size
        w: initial weights
        is_pca: 1 coreset for PCA, 0 coreset for SVD
        spar: is data in sparse format
    output:
        weighted coreset
    """
    coreset_size = j / eps
    dex = int(j * np.log(wiki0.shape[0]))
    d = wiki0.shape[1]
    if is_pca == 1:
        j = j + 1
        wiki0 = PCA_to_SVD(wiki0, eps, spar)
    if is_jl == 1:  # is_jl is assumed to be a module-level flag
        ran = np.random.randn(wiki0.shape[1], dex)
        if spar == 1:
            wiki = SM.dot(wiki0, ran)
        else:
            wiki = np.dot(wiki0, ran)
    else:
        wiki = wiki0
    w = w / wiki.shape[0]
    sensetivities = []
    jd = j
    w1 = np.reshape(w, (len(w), 1))
    wiki1 = np.multiply(np.sqrt(w1), wiki)
    k = 0
    for i, p in enumerate(wiki1):
        k = k + 1
        sensetivities.append(calc_sens(wiki1, p, jd, eps))
    p0 = np.asarray(sensetivities)
    if is_pca == 1:
        p0 = p0 + 81 * eps
    # sampling according to the sensitivity
    indec = np.random.choice(np.arange(wiki.shape[0]), int(coreset_size), p=p0 / np.sum(p0))
    p = p0 / np.sum(p0)  # normalizing sensitivities
    w = np.ones(wiki.shape[0])
    u = np.divide(np.sqrt(w), p) / coreset_size  # calculating new weights
    u1 = u[indec]  # picking weights of sampled points
    u1 = np.reshape(u1, (len(u1), 1))
    squ = np.sqrt(u1)
    if spar == 1:
        C = SM(wiki0)[indec, :d].multiply(squ)  # weighted coreset
    else:
        C = np.multiply(squ, wiki0[indec, :d])
    return C
def get_ek(self, psi):
    assert isinstance(psi, Wavefunction)
    v = self.v
    dv = self.grid.dv
    nelect = self.system.nelect
    psi = psi.get_psi()
    # T = psi' * Lap3 * psi
    T1 = csr_matrix.dot(v, psi)
    T2 = csc_matrix.dot(csc_matrix(psi), T1)
    T = T2 * nelect * dv
    T = T[0]
    return T
def second_order_moments(adata):
    """Computes second order moments for stochastic velocity estimation.

    Arguments
    ---------
    adata: `AnnData`
        Annotated data matrix.

    Returns
    -------
    Mss: Second order moments for spliced abundances
    Mus: Second order moments for spliced with unspliced abundances
    """
    if 'neighbors' not in adata.uns:
        raise ValueError('You need to run `pp.neighbors` first to compute a neighborhood graph.')

    connectivities = get_connectivities(adata, 'connectivities')
    s, u = csr_matrix(adata.layers['spliced']), csr_matrix(adata.layers['unspliced'])
    Mss = csr_matrix.dot(connectivities, s.multiply(s)).A
    Mus = csr_matrix.dot(connectivities, s.multiply(u)).A
    return Mss, Mus
def SCNW_classic(A2, k, coreset_size, is_jl):
    """
    This function operates the CNW algorithm, exactly as elaborated in Feldman & Ras
    inputs:
        A: data matrix, n points, each of dimension d.
        k: an algorithm parameter which determines the normalization needed
           and the error given the coreset size.
        coreset_size: the maximal coreset size (number of lines unequal to zero)
           demanded for input.
    output:
        error: The error between the original data to the CNW coreset.
        duration: the duration this CNW operation lasted
    """
    coreset_size = int(coreset_size)
    if is_jl == 1:
        dex = int(k * np.log(A2.shape[0]))
        ran = np.random.randn(A2.shape[1], dex)
        A1 = SM.dot(A2, ran)
    else:
        A1 = np.copy(A2)
    print('A1.shape', A1.shape)
    epsi = np.sqrt(k / coreset_size)
    A, A3 = initializing_data(A1, k)
    print('A.shape', A.shape)
    At = np.transpose(A)
    AtA = np.dot(At, A)
    num_of_channels = A.shape[1]
    ww = np.zeros((int(coreset_size)))
    Z = np.zeros((num_of_channels, num_of_channels))
    X_u = k * np.diag(np.ones(num_of_channels))
    X_l = -k * np.diag(np.ones(num_of_channels))
    delta_u = epsi + 2 * np.power(epsi, 2)
    delta_l = epsi - 2 * np.power(epsi, 2)
    ind = np.zeros(int(coreset_size), dtype=np.int)
    for j in range(coreset_size):
        if j % 50 == 1:
            print('j=', j)
        X_u = X_u + delta_u * AtA
        X_l = X_l + delta_l * AtA
        Z, jj, t = single_CNW_iteration_classic(A, At, delta_u, delta_l, X_u, X_l, Z)
        ww[j] = t
        ind[j] = jj
    sqrt_ww = np.sqrt(epsi * ww / k)
    sqrt_ww = np.reshape(sqrt_ww, (len(sqrt_ww), 1))
    if is_jl == 1:
        SA0 = SM(A2)[ind, :].multiply(sqrt_ww)
    else:
        SA0 = np.multiply(A2[ind, :], sqrt_ww)
    return SA0, ind
def Nonuniform(AA0, k, is_pca, eps, spar):
    """
    non-uniform sampling baseline compared against our algorithm, from
    Varadarajan, Kasturi, and Xin Xiao. "On the sensitivity of shape fitting
    problems." arXiv preprint arXiv:1209.4893 (2012).
    input:
        AA0: data matrix
        k: dimension of the approximated subspace
        is_pca: if 1 will provide a coreset for PCA, 0 will provide a coreset for SVD
        eps: determines coreset size
        spar: is data in sparse format
    output:
        weighted coreset
    """
    d = AA0.shape[1]
    if is_pca == 1:
        k = k + 1
        AA0 = PCA_to_SVD(AA0, eps, spar)
    if is_jl == 1:  # is_jl is assumed to be a module-level flag
        dex = int(k * np.log(AA0.shape[0]))
        ran = np.random.randn(AA0.shape[1], dex)
        if spar == 1:
            AA = SM.dot(AA0, ran)
        else:
            AA = np.dot(AA0, ran)
    else:
        AA = AA0
    size_of_coreset = int(k + k / eps - 1)
    U, D, VT = ssp.linalg.svds(AA, k)
    V = np.transpose(VT)
    AAV = np.dot(AA, V)
    del V
    del VT
    x = np.sum(np.power(AA, 2), 1)
    y = np.sum(np.power(AAV, 2), 1)
    P = np.abs(x - y)
    AAV = np.concatenate((AAV, np.zeros((AAV.shape[0], 1))), 1)
    Ua, _, _ = ssp.linalg.svds(AAV, k)
    U = np.sum(np.power(Ua, 2), 1)
    pro = 2 * P / np.sum(P) + 8 * U
    if is_pca == 1:
        pro = pro + 81 * eps
    pro0 = pro / sum(pro)
    w = np.ones(AA.shape[0])
    u = np.divide(w, pro0) / size_of_coreset
    DMM_ind = np.random.choice(AA.shape[0], size_of_coreset, p=pro0)
    u1 = np.reshape(u[DMM_ind], (len(DMM_ind), 1))
    if spar == 1:
        SA0 = SM(AA0)[DMM_ind, :d].multiply(np.sqrt(u1))
    else:
        SA0 = np.multiply(np.sqrt(u1), AA0[DMM_ind, :d])
    return SA0
def _transform(self, datasets, outcome_index=None):
    # TODO: use 'outcome_index' to allow user to use d - 1 datasets to predict
    # the remaining one
    ZZ = self.ZZ
    d = len(ZZ)
    assert len(datasets) == d,\
        "number of datasets should be len(self.ZZ)"
    standardization = self.getStandardization()
    if standardization:
        datasets = [X - np.mean(X, axis=0) for X in datasets]
    if outcome_index is not None:
        assert outcome_index >= 0 and outcome_index < d,\
            "outcome_index is not a valid index for datasets"
        XZ = [
            csr_matrix.dot(datasets[j], ZZ[j].transpose())
            for j in [x for x in range(d) if x != outcome_index]
        ]
        XZ_sums = np.sum(XZ, axis=0)
        prediction = np.dot(XZ_sums, pinv(ZZ[outcome_index].todense()).transpose())
    else:
        XZ = [
            csr_matrix.dot(X, Z.transpose())
            for X, Z in zip(datasets, ZZ)
        ]
        XZ_sums = []
        for i in range(d):
            for j in range(d):
                if j != i:
                    if len(XZ_sums) == i:
                        XZ_sums.append(XZ[j])
                    else:
                        XZ_sums[i] += XZ[j]
        prediction = [
            np.dot(XZ_sums[i], pinv(ZZ[i].todense()).transpose())
            for i in range(d)
        ]
    return prediction
def get_moments(adata, layer=None, second_order=None, centered=True):
    """Computes moments for a specified layer.

    First and second order moments. If centered, that corresponds to means
    and variances across nearest neighbors.

    Arguments
    ---------
    adata: `AnnData`
        Annotated data matrix.
    layer: `str` (default: `None`)
        Key of layer with abundances to consider for moment computation.
    second_order: `bool` (default: `None`)
        Whether to compute second order (instead of first order) moments from abundances.
    centered: `bool` (default: `True`)
        Whether to compute centered or uncentered second order moments (centered = variance).

    Returns
    -------
    Mx: first or second order moments
    """
    if 'neighbors' not in adata.uns:
        raise ValueError('You need to run `pp.neighbors` first to compute a neighborhood graph.')
    connectivities = get_connectivities(adata)

    X = adata.X if layer is None else adata.layers[layer]
    X = csr_matrix(X) if layer in {'spliced', 'unspliced'} else np.array(X) if not issparse(X) else X

    if not issparse(X):
        X = X[:, ~np.isnan(X.sum(0))]
    if second_order:
        X2 = X.multiply(X) if issparse(X) else X ** 2
        Mx = csr_matrix.dot(connectivities, X2) if second_order else csr_matrix.dot(connectivities, X)
        if centered:
            mu = csr_matrix.dot(connectivities, X)
            mu2 = mu.multiply(mu) if issparse(mu) else mu ** 2
            Mx = Mx - mu2
    else:
        Mx = csr_matrix.dot(connectivities, X)
    if issparse(X):
        Mx = Mx.astype(np.float32).A
    return Mx
def compress_graph_from_hard_partition_ts(G, nodes, features, p, partition, node_subset):
    """
    Obtain a sparse tall-skinny matrix and new probabilities from a hard partition of a graph.
    For each point, we only find the distance to its anchor, not to all other anchors.
    -----------
    Parameters:
    G : NetworkX graph
    nodes : sorted list of graph nodes
    p : probability vector of sorted nodes
    partition : list of sets containing node labels
    node_subset : sorted list of anchor node labels
    -------
    Returns:
    dists : |nodes|x|node_subset| matrix of distances from each block of partition to anchor in that block
    membership : |nodes|x|node_subset| membership matrix
    p_compressed : vector of aggregated probabilities on anchors
    """
    # Distances between anchors
    dists_subset = np.zeros((len(node_subset), len(node_subset)))
    for i in range(len(node_subset)):
        for j in range(i + 1, len(node_subset)):
            dists_subset[i, j] = shortest_path_length(G, node_subset[i], node_subset[j])
    dists_subset = dists_subset + dists_subset.T

    # Sparse tall-skinny matrix of distances and feature-vector distances from
    # points to their own anchors.
    # Also, tall-skinny membership matrix and mass-compression matrix.
    row_idx, col_idx, dist_data, mass_data, fdist_data = [], [], [], [], []
    for (aidx, anchor) in enumerate(node_subset):
        bidx = [anchor in v for v in partition].index(True)  # block containing current anchor point
        block = partition[bidx]
        for b in block:
            idx = nodes.index(b)
            d = shortest_path_length(G, nodes[idx], anchor)
            fd = pairwise_distances(features[nodes.index(anchor), :].reshape(1, -1),
                                    features[idx, :].reshape(1, -1))[0][0]
            row_idx.append(idx)
            col_idx.append(aidx)
            dist_data.append(d)
            mass_data.append(p[idx])
            fdist_data.append(fd)

    dists = coo_matrix((dist_data, (row_idx, col_idx)), shape=(len(nodes), len(node_subset)))
    fdists = coo_matrix((fdist_data, (row_idx, col_idx)), shape=(len(nodes), len(node_subset)))
    membership = coo_matrix(([1 for v in row_idx], (row_idx, col_idx)), shape=(len(nodes), len(node_subset)))
    # coup = coo_matrix((mass_data, (row_idx, col_idx)), shape=(len(nodes), len(node_subset)))
    p_subset = csr_matrix.dot(p, membership)

    return dists.tocsr(), fdists.tocsr(), membership.tocsr(), p_subset, dists_subset
def cut_size(A_spmat, partitions):
    nvert = A_spmat.get_shape()[0]
    cut = 0.0
    for part1 in partitions:
        p1 = np.zeros(nvert)
        for vert_id in part1:
            p1[vert_id] = 1
        not_p1 = np.ones(nvert) - p1
        Ay = csr_matrix.dot(A_spmat, not_p1)
        cut += np.dot(p1, Ay)
    return cut
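# --- Toy check (not from the source): on a triangle split as {0} vs {1, 2},
# each crossing edge is counted once from each side of the partition, so the
# result is twice the number of crossing edges (2 * 2 = 4 here).
import numpy as np
from scipy.sparse import csr_matrix

_A = csr_matrix(np.array([[0., 1., 1.],
                          [1., 0., 1.],
                          [1., 1., 0.]]))
print(cut_size(_A, [[0], [1, 2]]))   # 4.0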
def second_order_moments(adata, adjusted=False):
    """Computes second order moments for stochastic velocity estimation.

    Arguments
    ---------
    adata: `AnnData`
        Annotated data matrix.
    adjusted: `bool` (default: `False`)
        Whether to adjust the second moments using the stored first order
        moments ('Ms' / 'Mu').

    Returns
    -------
    Mss: Second order moments for spliced abundances
    Mus: Second order moments for spliced with unspliced abundances
    """
    if 'neighbors' not in adata.uns:
        raise ValueError('You need to run `pp.neighbors` first to compute a neighborhood graph.')

    connectivities = get_connectivities(adata)
    s, u = csr_matrix(adata.layers['spliced']), csr_matrix(adata.layers['unspliced'])
    Mss = csr_matrix.dot(connectivities, s.multiply(s)).astype(np.float32).A
    Mus = csr_matrix.dot(connectivities, s.multiply(u)).astype(np.float32).A
    if adjusted:
        Mss = 2 * Mss - adata.layers['Ms'].reshape(Mss.shape)
        Mus = 2 * Mus - adata.layers['Mu'].reshape(Mus.shape)
    return Mss, Mus
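# --- Usage sketch (not from the source): with 'Ms' already in adata.layers
# (see `moments` above), neighbor-wise variances of spliced counts follow
# from the uncentered second moment.
# Mss, Mus = second_order_moments(adata)
# var_s = Mss - adata.layers['Ms'] ** 2   # Var = E[s^2] - E[s]^2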
def squaredis(P, Cent):
    d = Cent.shape[1]
    C = SM((Cent.shape[0], d + 2))
    C[:, 1] = 1  # C is defined just as in the algorithm you sent me.
    C[:, 0] = SM.sum(SM.power(Cent, 2), 1)
    C[:, 2:d + 2] = Cent
    D = SM.dot(P, C.T)
    D = D.toarray()
    Tags = D.argmin(1)  # finding the closest centroid for each point
    if min(D.shape) > 1:
        dists = D.min(1)
    else:
        dists = np.ravel(D)
    y = D.argmin(0)
    return dists, Tags, y
def df_to_matrix(df, factors):
    #
    # fill sparse matrix with ratings from a dataframe.
    # matr[userid][movieid] = rating.
    #
    ratings = df['rating'].tolist()
    # subtract 1 to map id to index
    users = [id for id in df['userid'].tolist()]
    movies = [id for id in df['movieidx'].tolist()]
    review_matrix_csr = csr_matrix((ratings, (users, movies)),
                                   shape=(num_reviewers, num_movies + 1))
    u, s, vt = svds(review_matrix_csr, k=factors)
    # svds returns the singular values as a 1-D array; scale the left singular
    # vectors by diag(s) so user_factors stays an (n_users, k) matrix (a plain
    # dot of u with the 1-D s would collapse it to a vector)
    user_factors = np.dot(u, np.diag(s))
    item_factors = vt
    return user_factors, item_factors, review_matrix_csr
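# --- Toy usage (not from the source): assumes the module-level
# `num_reviewers` / `num_movies` globals referenced above are set, and that
# `csr_matrix` / `svds` are imported at module level.
import pandas as pd

num_reviewers, num_movies = 3, 4
_df = pd.DataFrame({'userid':   [0, 0, 1, 2],
                    'movieidx': [0, 2, 1, 3],
                    'rating':   [5.0, 3.0, 4.0, 2.0]})
_uf, _if, _R = df_to_matrix(_df, factors=2)
print(_uf.shape, _if.shape)   # (3, 2) and (2, 5)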
def init_totals(self, t, only_target):
    assert self.graph_type == 1
    totals = np.zeros(t)
    colors = np.copy(self.partitions)
    if only_target:
        colors *= self.targets
    self.graph = self.graph.tocsr()
    totals[0] = sum(
        Network.eval_stats(self.targets, self.partitions, colors).values())
    for i in range(1, t):
        colors = csr_matrix.dot(self.graph, colors)
        totals[i] = sum(
            Network.eval_stats(self.targets, self.partitions, colors).values())
    # print(totals)
    return totals
def cosine(a, b):
    x = csr_matrix.dot(a, b.T)[0, 0] / (norm2(a) * norm2(b))
    return x
def smoothed_cosine(a, b):
    # calculate set intersection by converting to binary and taking the dot product
    overlap = csr_matrix.dot(binarize(a), binarize(b).T)[0, 0]
    # smooth cosine by discounting by set intersection
    return (overlap / (SMOOTHING + overlap)) * cosine(a, b)
def cosine_dist(x, y):
    x_n = csr_matrix.sqrt(csr_matrix.dot(x, x.T))
    y_n = csr_matrix.sqrt(csr_matrix.dot(y, y.T))
    return 1 - csr_matrix.dot(x, y.T) / (x_n * y_n)
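# --- Small check (not from the source): x and y are 1 x d sparse row vectors,
# as `cosine_dist` assumes; orthogonal rows give distance ~1, identical
# rows ~0.
from scipy.sparse import csr_matrix

_x = csr_matrix([[1., 0., 0.]])
_y = csr_matrix([[0., 1., 0.]])
print(cosine_dist(_x, _y)[0, 0])   # ~1.0
print(cosine_dist(_x, _x)[0, 0])   # ~0.0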