import numpy as np
from scipy import sparse
from scipy.sparse import linalg as la
# laplacian_matrix is assumed to be supplied by the surrounding module
# (e.g., networkx.normalized_laplacian_matrix, given the docstring)


def normalized_laplacian_embed(G, d, scaled=False):
    """Generate an n-by-d matrix using an SVD of the normalized Laplacian.

    Each row of the output corresponds to a node (ordered according to
    G.node), so that each node is assigned a vector in d-dimensional
    Euclidean space.

    Parameters
    ----------
    G : networkx graph
    d : embedding dimension
    scaled : whether to scale the embedding by the square root of the
        singular values (default=False)

    Returns
    -------
    n-by-d matrix, where n = G.number_of_nodes()
    """
    L = laplacian_matrix(G)
    u, s, _ = la.svds(sparse.csr_matrix(L), d)
    if scaled:
        return np.dot(u, np.diag(np.sqrt(s)))
    return u
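# Usage sketch (not from the original source): embed a small graph in 2-D.
# Assumes laplacian_matrix above resolves to a networkx Laplacian, e.g.:
#   from networkx import normalized_laplacian_matrix as laplacian_matrix
import networkx as nx

G = nx.karate_club_graph()
coords = normalized_laplacian_embed(G, d=2, scaled=True)
print(coords.shape)  # (34, 2): one row per node, in G.node order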
import numpy as np
from numpy.matlib import repmat
from scipy.sparse.linalg import svds


def fastsvds(M, r):
    """
    "Fast" but less accurate SVD, computed via the SVD of M M^T or M^T M
    ***IF*** one of the dimensions of M is much smaller than the other.

    Note: this is numerically less stable, but useful for large
    hyperspectral images.
    """
    m, n = M.shape
    rationmn = 10  # parameter, should be >= 1
    if m < rationmn * n:
        # m << n: work with the small m-by-m Gram matrix M M^T
        MMt = np.dot(M, M.T)
        u, s, _ = svds(MMt, r)
        s = np.diag(s)
        v = np.dot(M.T, u)
        v = np.multiply(v, repmat((sum(v ** 2) + 1e-16) ** (-0.5), n, 1))
        s = np.sqrt(s)
    elif n < rationmn * m:
        # n << m: work with the small n-by-n Gram matrix M^T M.
        # svds returns Vt (rows are right singular vectors), so transpose
        # before projecting; the original np.dot(M, v) had mismatched shapes.
        MtM = np.dot(M.T, M)
        _, s, vt = svds(MtM, r)
        s = np.diag(s)
        v = vt.T
        u = np.dot(M, v)
        u = np.multiply(u, repmat((sum(u ** 2) + 1e-16) ** (-0.5), m, 1))
        s = np.sqrt(s)
    else:
        u, s, v = svds(M, r)
        s = np.diag(s)
    return (u, s, v)
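# Quick sanity check (a sketch, not part of the original code): on a tall
# matrix the Gram-matrix shortcut should recover the leading singular values
# of a direct dense SVD up to small numerical error.
import numpy as np

rng = np.random.default_rng(0)
M = rng.standard_normal((2000, 30))  # m >> n, so the M^T M branch fires
_, s_fast, _ = fastsvds(M, r=5)      # s_fast is a 5x5 diagonal matrix
s_exact = np.linalg.svd(M, compute_uv=False)[:5]
print(np.allclose(np.sort(np.diag(s_fast)), np.sort(s_exact), rtol=1e-6))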
def sci_pseudoinverse(Mat, precision):
    """Pseudoinverse of a sparse matrix via truncated SVD (scipy).

    Takes a sparse matrix and a precision score (the percentage of the
    smaller dimension to keep as singular triplets) as input.
    """
    matrix = Mat.tocsc()
    if matrix.shape[0] <= matrix.shape[1]:
        val = int((precision * matrix.shape[0]) / 100)
        u, s, vt = ssl.svds(matrix.tocsc(), k=val)
        UT = ss.csr_matrix(np.nan_to_num(u.transpose()))
        SI = ss.csr_matrix(np.nan_to_num(np.diag(1 / s)))
        VT = ss.csr_matrix(np.nan_to_num(vt))
        temp_matrix = spmatrixmul(VT.transpose(), SI)
        pinv_matrix = spmatrixmul(temp_matrix, UT)
        del u, s, vt, UT, SI, VT, temp_matrix
    else:
        val = int((precision * matrix.transpose().shape[0]) / 100)
        u, s, vt = ssl.svds(matrix.transpose().tocsc(), k=val)
        UT = ss.csr_matrix(np.nan_to_num(u.transpose()))
        SI = ss.csr_matrix(np.nan_to_num(np.diag(1 / s)))
        VT = ss.csr_matrix(np.nan_to_num(vt))
        temp_matrix = spmatrixmul(UT.transpose(), SI)
        pinv_matrix = spmatrixmul(temp_matrix, VT)
        del u, s, vt, UT, SI, VT, temp_matrix
    return pinv_matrix.tocsr()
def rank_constrained_least_squares(X, Y, rank, alpha1, alpha2=None, U0=None, V0=None, max_bfgs_iter=500, m=10, gradient_tolerance=1e-5, callback=None, verbose=3): """ Minimizes .5 * ||XUV.T - Y|| ** 2 + .5 * alpha * (||U|| ** 2 + ||V|| ** 2) """ if alpha2 is None: alpha2 = alpha1 energy_function = get_vec_and_grad_func(X, Y, alpha1, alpha2, rank, X.shape[1], callback=callback) #energy_gradient = get_grad_func(X, Y, alpha1, alpha2, rank, len(X.T)) # if not already done, initialize U and V if V0 is None: if U0 is not None: # if only V0 is None initialize U with a least squares U = U0.copy() V = np.linalg.pinv(X.dot(U)).dot(Y).T else: # decompose a ridge solution _, largest_singular_value_of_X, _ = svds(X, k=1) ridge_penalty = largest_singular_value_of_X * .1 ridge = Ridge(alpha=ridge_penalty) ridge_coef = ridge.fit(X, Y).coef_.T U, s, VT = svds(ridge_coef, k=rank) V = VT.T * np.sqrt(s) U *= np.sqrt(s)[np.newaxis, :] else: V = V0.copy() if U0 is None: raise Exception U = U0.copy() initial_UV_vec = np.vstack([U, V]).ravel() result = fmin_l_bfgs_b(energy_function, x0=initial_UV_vec, #fprime=energy_gradient, #maxiter=max_bfgs_iter, maxfun=max_bfgs_iter, # gtol=gradient_tolerance, m=m, #callback=callback, iprint=verbose) concat_matrix = result[0].reshape(-1, rank) n_features = X.shape[1] U_res = concat_matrix[:n_features] V_res = concat_matrix[n_features:] return U_res, V_res, result[1:]
def pca_analysis_dense(X_train, X_test, n_components):
    u, s, v = linalg.svds(X_train, n_components)
    print(s.shape)
    screePlot('original.pdf', s[::-1])
    normalized = normalize(X_train, norm='l1', axis=0)
    u, s, v = linalg.svds(normalized, n_components)
    print(s.shape)
    screePlot('normalized.pdf', s[::-1])
def get_singularvalues_v_cycle(self, nu0=0, nu1=1, all_svdvals=False, k_max=5, k_min=5):
    T, P_inv = self.get_v_cycle_it_matrix(nu0, nu1)
    if all_svdvals:
        return sp.linalg.svdvals(T.todense())
    else:
        # svds takes `return_singular_vectors`, not eigs' `return_eigenvectors`,
        # and it returns an ndarray, which has no append method.
        svdval_list = list(sprsla.svds(T, k=k_max, which='LM',
                                       return_singular_vectors=False))
        svdval_list.extend(sprsla.svds(T, k=k_min, which='SM',
                                       return_singular_vectors=False))
        return svdval_list
def scipy_svds(a, k=6, ncv=None, return_vecs=True, **kwargs):
    """Compute a number of singular value pairs."""
    settings = {
        'k': k,
        'ncv': choose_ncv(k, a.shape[0]) if ncv is None else ncv,
        'return_singular_vectors': return_vecs}
    if return_vecs:
        uk, sk, vtk = spla.svds(a, **settings, **kwargs)
        so = np.argsort(-sk)  # svds returns ascending order; sort descending
        return np.asmatrix(uk[:, so]), sk[so], np.asmatrix(vtk[so, :])
    else:
        sk = spla.svds(a, **settings, **kwargs)
        return sk[np.argsort(-sk)]
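# Why the argsort(-sk) above: scipy.sparse.linalg.svds returns singular
# values in ascending order, unlike scipy.linalg.svd. A minimal sketch
# (not from the original source) illustrating the convention difference:
import numpy as np
from scipy.linalg import svd
from scipy.sparse.linalg import svds

a = np.random.default_rng(0).standard_normal((8, 6))
s_dense = svd(a, compute_uv=False)                       # descending
s_sparse = svds(a, k=3, return_singular_vectors=False)   # ascending
print(np.allclose(np.sort(s_dense)[-3:], np.sort(s_sparse)))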
def returnsvd(filepath, k):
    # Read data
    print("Creation of data with ratings")
    datam = pd.read_csv(filepath, engine="python", iterator=True, sep="::",
                        chunksize=10000, usecols=[1, 2])
    data = pd.concat([chunk for chunk in datam], ignore_index=True)
    data.columns = ["item_id", "tag"]
    # remove upper/lower-case distinctions
    data.tag = data.tag.astype(str)
    data.tag = data.tag.apply(str.lower)
    count = data.groupby(["item_id", "tag"]).size()
    data = count.reset_index()
    data.columns = ["item_id", "tag_id", "count"]
    # sort by items and keep track of the original indices
    inditem = np.sort(data["item_id"].unique())
    reinditem = pd.Series({inditem[i]: i for i in np.arange(len(inditem))})
    data["item_id"] = reinditem[data["item_id"].values].values
    # compute the occurrence of tags
    indtag = np.sort(data["tag_id"].unique())
    reindtag = pd.Series({indtag[i]: i for i in np.arange(len(indtag))})
    data["tag_id"] = reindtag[data["tag_id"].values].values
    data_sparse = coo_matrix(
        (data["count"].values.astype(float),
         (data["item_id"].values, data["tag_id"].values))).tolil()
    print("..........sparse matrix built")
    # compute the actual svd
    p, d, q = splin.svds(data_sparse.tocsc(), k)
    return p, d, q, reinditem
def _CFSVD(self, ratingsMat):
    from scipy.sparse.linalg import svds

    user_ratings_mean = np.mean(ratingsMat, axis=1)  # mean over user ratings
    R_demeaned = ratingsMat - user_ratings_mean.reshape(-1, 1)
    U, sigma, Vt = svds(R_demeaned, k=10)
    sigma = np.diag(sigma)
    self.all_user_predicted_ratings = (
        np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1))
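# What _CFSVD computes, as a standalone sketch (names and shapes are
# illustrative assumptions, not from the original class): mean-center the
# ratings, take a rank-k SVD, and add the row means back for predictions.
import numpy as np
from scipy.sparse.linalg import svds

R = np.random.default_rng(0).uniform(0, 5, size=(20, 15))
mu = R.mean(axis=1)
U, sigma, Vt = svds(R - mu.reshape(-1, 1), k=10)
pred = U @ np.diag(sigma) @ Vt + mu.reshape(-1, 1)
print(np.linalg.norm(pred - R) / np.linalg.norm(R))  # relative residual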
def get(self):
    B = dok_matrix((self.rows, self.d), dtype=float32)
    for ((row, col, val), p) in self.sampler.get(with_probabilities=True):
        B[row, col] += val / (p * self.nnz)
    covariance = dot(B.transpose(), B)
    (_, s, Vt) = svds(covariance, k=self.ell, maxiter=50,
                      return_singular_vectors=True)
    return dot(diag(sqrt(s[:self.ell])), Vt[:self.ell, :])
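# Note on the get() method above (an observation, not original code): the
# singular values of B.T @ B are the squares of B's singular values, so
# sqrt(s) of the covariance's spectrum recovers B's spectrum. A minimal
# check under that assumption:
import numpy as np
from scipy.sparse.linalg import svds

B = np.random.default_rng(0).standard_normal((100, 12))
s_cov = svds(B.T @ B, k=5, return_singular_vectors=False)
s_direct = svds(B, k=5, return_singular_vectors=False)
print(np.allclose(np.sort(np.sqrt(s_cov)), np.sort(s_direct)))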
def _fit(self, X): X = as_float_array(X, copy=False) random_state = check_random_state(self.random_state) if self.algorithm == "arpack": U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol) # svds doesn't abide by scipy.linalg.svd/randomized_svd # conventions, so reverse its outputs. Sigma = Sigma[::-1] U, VT = svd_flip(U[:, ::-1], VT[::-1]) elif self.algorithm == "randomized": k = self.n_components n_features = X.shape[1] if k >= n_features: raise ValueError("n_components must be < n_features;" " got %d >= %d" % (k, n_features)) U, Sigma, VT = randomized_svd(X, self.n_components, n_iter=self.n_iter, random_state=random_state) else: raise ValueError("unknown algorithm %r" % self.algorithm) self.components_ = VT return U, Sigma, VT
def svd_wifis(wf_lists, hash_num, nk):
    data_size = wf_lists.shape[0]
    ij = np.zeros((2, data_size * 10))
    data = np.zeros((data_size * 10))
    row = data_i = 0
    macs = set()
    for wf_list in wf_lists:
        wf = common.str_to_wf(wf_list)
        for (k, v) in wf.items():
            ki = int(k, base=16)
            mask = 0xffffffffffff
            if ki == 0 or ki & mask == mask:
                continue
            # k = get_hash(k)
            k = get_mac_idx(ki)
            (ij[0, data_i], ij[1, data_i], data[data_i]) = (row, k, v)
            data_i += 1
            macs.add(k)
        row += 1
    m = sp.csr_matrix((data, ij))
    (u, s, vt) = la.svds(m, k=min(nk, min(m.shape) // 2))
    print(m.todense())
    return u, s, vt
def get_svd(data):
    input = np.genfromtxt(data, dtype=dt, delimiter='\t')
    row = 0
    data_i = 0
    ij = np.zeros((2, input.shape[0] * 10), dtype=np.uint64)
    data = np.zeros((input.shape[0] * 10), dtype=float)
    macs = set()
    for wf_list in input['wf_list']:
        wf = common.str_to_wf(wf_list)
        for (k, v) in wf.items():
            ki = int(k, base=16)
            mask = 0xffffffffffff
            if ki == 0 or ki & mask == mask:
                continue
            # k = get_hash(k)
            k = get_mac_idx(ki)
            (ij[0, data_i], ij[1, data_i], data[data_i]) = (row, k, v)
            data_i += 1
            macs.add(k)
        row += 1
    # print('%x,%x' % (ij[1, ...].min(), ij[1, ...].max()))
    m = sp.csr_matrix((data, ij))
    # print(len(macs))
    (u, s, vt) = la.svds(m, k=10)
    print('\n'.join(['\t'.join(p) for p in filter_small(u)]))
def svd_factorize_matrix(y_mat, rank, return_embeddings=False):
    """Exact approximation of a matrix using square loss and fully observed
    entries.

    Args:
        y_mat: input matrix to approximate
        rank: rank of the approximation
        return_embeddings: boolean. If True, return the embeddings instead
            of the approximate matrix

    Returns:
        approximate matrix of the specified rank

    Example:
        >>> np.random.seed(1)
        >>> mat = toy_factorization_problem(5, 4)
        >>> svd_factorize_matrix(mat, 2)
        array([[ 3.492,  0.148,  1.681,  1.545],
               [ 2.356, -0.032,  1.273,  0.648],
               [ 6.038,  0.099,  3.074,  2.198],
               [ 3.338, -0.508,  2.295, -0.472],
               [ 0.09 ,  0.148, -0.11 ,  0.473]])
    """
    from scipy.sparse.linalg import svds
    u1_mat, d1_vec, v1_matt = svds(y_mat, rank)
    d1_diag_matrix = np.zeros((rank, rank))
    for i in range(rank):
        d1_diag_matrix[i, i] = np.sqrt(d1_vec[i])
    u = np.dot(u1_mat, d1_diag_matrix)
    v = np.dot(v1_matt.T, d1_diag_matrix)
    if return_embeddings:
        return u, v
    return np.dot(u, v.T)
def rank_trunc(gram_mat, k, fast=True):
    """
    k-th order approximation of the Gram matrix G.

    Parameters
    ----------
    gram_mat : array, shape (n_samples, n_samples)
        The Gram matrix.
    k : int
        The order of the approximation.
    fast : bool
        Use svds (if True) or svd (if False).

    Return
    ------
    gram_mat_k : array, shape (n_samples, n_samples)
        The rank-k Gram matrix.
    """
    if fast:
        u, s, v = svds(gram_mat, k)
    else:
        U, S, V = svd(gram_mat)  # full SVD: U and V are n x n
        # keep the k leading singular triplets: all rows of U, first k
        # columns (the original U[:k, :k] / V[:k, :k] blocks produced a
        # k x k result instead of the documented n x n matrix)
        s = S[:k]
        u = U[:, :k]
        v = V[:k, :]
    gram_mat_k = (u.dot(np.diag(s))).dot(v)
    return gram_mat_k
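# Sanity check (a sketch, not from the original source; assumes the svd/svds
# imports used by rank_trunc above): by Eckart-Young, the spectral-norm error
# of the best rank-k approximation equals the (k+1)-th singular value.
import numpy as np

rng = np.random.default_rng(0)
B = rng.standard_normal((50, 50))
G = B @ B.T                     # a dense PSD Gram matrix
k = 5
Gk = rank_trunc(G, k, fast=False)
sigma = np.linalg.svd(G, compute_uv=False)
print(np.isclose(np.linalg.norm(G - Gk, 2), sigma[k]))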
def computePCsPython(out_dir, k, bfile, ffile):
    """ reading in """
    RV = plink_reader.readBED(bfile, useMAFencoding=True)
    X = RV['snps']

    """ normalizing markers """
    print('Normalizing SNPs...')
    p_ref = X.mean(axis=0) / 2.
    X -= 2 * p_ref
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        X /= SP.sqrt(2 * p_ref * (1 - p_ref))
    hasNan = SP.any(SP.isnan(X), axis=0)
    print('%d SNPs have a nan entry. Excluding them from the covariance matrix.' % hasNan.sum())
    X = X[:, ~hasNan]

    """ computing principal components """
    U, S, Vt = SSL.svds(X, k=k)
    U -= U.mean(0)
    U /= U.std(0)
    U = U[:, ::-1]

    """ saving to output """
    NP.savetxt(ffile, U, delimiter='\t', fmt='%.6f')
def _fit_truncated(self, X, n_components, svd_solver): """Fit the model by computing truncated SVD (by ARPACK or randomized) on X """ n_samples, n_features = X.shape if isinstance(n_components, six.string_types): raise ValueError("n_components=%r cannot be a string " "with svd_solver='%s'" % (n_components, svd_solver)) elif not 1 <= n_components <= n_features: raise ValueError("n_components=%r must be between 1 and " "n_features=%r with svd_solver='%s'" % (n_components, n_features, svd_solver)) elif svd_solver == 'arpack' and n_components == n_features: raise ValueError("n_components=%r must be stricly less than " "n_features=%r with svd_solver='%s'" % (n_components, n_features, svd_solver)) random_state = check_random_state(self.random_state) # Center data self.mean_ = np.mean(X, axis=0) X -= self.mean_ if svd_solver == 'arpack': # random init solution, as ARPACK does it internally v0 = random_state.uniform(-1, 1, size=min(X.shape)) U, S, V = svds(X, k=n_components, tol=self.tol, v0=v0) # svds doesn't abide by scipy.linalg.svd/randomized_svd # conventions, so reverse its outputs. S = S[::-1] # flip eigenvectors' sign to enforce deterministic output U, V = svd_flip(U[:, ::-1], V[::-1]) elif svd_solver == 'randomized': # sign flipping is done inside U, S, V = randomized_svd(X, n_components=n_components, n_iter=self.iterated_power, flip_sign=True, random_state=random_state) self.n_samples_, self.n_features_ = n_samples, n_features self.components_ = V self.n_components_ = n_components # Get variance explained by singular values self.explained_variance_ = (S ** 2) / (n_samples - 1) total_var = np.var(X, ddof=1, axis=0) self.explained_variance_ratio_ = \ self.explained_variance_ / total_var.sum() self.singular_values_ = S.copy() # Store the singular values. if self.n_components_ < min(n_features, n_samples): self.noise_variance_ = (total_var.sum() - self.explained_variance_.sum()) self.noise_variance_ /= min(n_features, n_samples) - n_components else: self.noise_variance_ = 0. return U, S, V
def fit(self, X, Y): # copy since this will contains the centered data check_consistent_length(X, Y) X = check_array(X, dtype=np.float64, copy=self.copy) Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False) if Y.ndim == 1: Y = Y.reshape(-1, 1) if self.n_components > max(Y.shape[1], X.shape[1]): raise ValueError("Invalid number of components n_components=%d" " with X of shape %s and Y of shape %s." % (self.n_components, str(X.shape), str(Y.shape))) # Scale (in place) X, Y, self.x_mean_, self.y_mean_, self.x_std_, self.y_std_ = ( _center_scale_xy(X, Y, self.scale)) # svd(X'Y) C = np.dot(X.T, Y) # The arpack svds solver only works if the number of extracted # components is smaller than rank(X) - 1. Hence, if we want to extract # all the components (C.shape[1]), we have to use another one. Else, # let's use arpacks to compute only the interesting components. if self.n_components >= np.min(C.shape): U, s, V = svd(C, full_matrices=False) else: U, s, V = svds(C, k=self.n_components) # Deterministic output U, V = svd_flip(U, V) V = V.T self.x_scores_ = np.dot(X, U) self.y_scores_ = np.dot(Y, V) self.x_weights_ = U self.y_weights_ = V return self
def cv(self, factor, split_val, shadow_func=None, shadow_to_val=None,
       del_freq=None):
    """Cross-validate prediction of factor 'factor'."""
    self._prepare(factor, split_val, shadow_func=shadow_func,
                  shadow_to_val=shadow_to_val, del_freq=del_freq)
    fac_ind = self.col_names.index(factor)
    self.clf = KNNC(40, algorithm='brute', metric='cosine')
    z = self._get_features_only(self.non_null_set).astype(float)
    target = np.ravel(self.non_null_set.getcol(fac_ind).todense())
    u, s, v = linalg.svds(z, k=51)
    T = u.dot(np.diag(s))
    kf = cross_validation.KFold(len(target), 5)
    for train_idx, test_idx in kf:
        # print(len(train_idx), len(test_idx))
        self.clf.fit(T[train_idx], target[train_idx])
        r = self.clf.predict(T[test_idx])
        print('Average error:',
              np.mean(np.abs(r - target[test_idx])),
              "+/-",
              np.std(np.abs(r - target[test_idx])))
def rank_trunc(gram_mat, k, fast=True):
    """
    k-th order approximation of the Gram matrix G.

    Parameters
    ----------
    gram_mat : array, shape (n_samples, n_samples)
        The Gram matrix.
    k : int
        The order of the approximation.
    fast : bool
        Use svds (if True) or svd (if False).

    Return
    ------
    gram_mat_k : array, shape (n_samples, n_samples)
        The rank-k Gram matrix.
    """
    if fast:
        u, s, v = svds(gram_mat, k)
    else:
        U, S, V = svd(gram_mat)  # full SVD by default: U and V are n x n
        # keep the k leading singular triplets (all rows, not a k x k block)
        s = S[:k]
        u = U[:, :k]
        v = V[:k, :]
    gram_mat_k = (u.dot(np.diag(s))).dot(v)
    return gram_mat_k, u, s
def mySVD(train, test, k):
    user_ratings_mean = np.mean(train, axis=1)
    R_demeaned = train - user_ratings_mean.reshape(-1, 1)
    U, sigma, Vt = svds(R_demeaned, k=k)
    sigma = np.diag(sigma)
    all_user_predicted_ratings = (np.dot(np.dot(U, sigma), Vt)
                                  + user_ratings_mean.reshape(-1, 1))
    return (get_mae(all_user_predicted_ratings, train),
            get_mae(all_user_predicted_ratings, test))
def reducedim_svd(self, factors):
    print("Number of factors is " + str(factors))
    ut, s, vt = svds(self.fullmatrix, factors)
    if numpy.isnan(numpy.min(s)):
        print("Warning: diagonal matrix contains NaNs")
        # s = FixNaNs(s)
        s[numpy.isnan(s)] = 0
    if numpy.isnan(numpy.min(s)):
        print("Error: diagonal matrix still contains NaNs, exiting")
        exit(1)
    print("Completed svd routine")
    self.reducedmatrix = numpy.dot(ut, numpy.diag(s))
    print("Computed reduced vector space")
    # remove negative numbers - make equal to zero
    self.reducedmatrix[self.reducedmatrix < 0] = 0
    for vector in self.vectordict.values():
        vector.array = sparse.csc_matrix(self.reducedmatrix[vector.rowindex])
    print("Stored individual vectors")
def n_of_modules_markov(W):
    K = Kmatrix(W)
    v = W.sum()
    n, m = K.shape
    r = min(n, m)
    exp_value = n * m / float(v)
    u, svs, vt = linalg.svds(K, r - 2)
    svs = svs[::-1]
    eigs = np.power(svs, 2)
    eigs = eigs[1:]  # remove the unitary singular value
    p = np.zeros((r - 1,))
    for j in range(r - 1):
        sigma = eigs.sum()
        p[j] = min(exp_value / sigma, 1.)
        eigs = eigs[1:]  # remove the largest remaining singular value
    pdiff = p[1:] - p[:-1]
    try:
        delta = pdiff.argmax()
    except ValueError:
        delta = -1
    # the first addend reflects that there is at least one module,
    # and Python indexing starts at 0
    q = 2 + delta
    return q, pdiff, p, svs
def reduce_dims(A, k, return_svs=False):
    '''
    Reduces original vectors to 'k' dimensions by using Singular Value
    Decomposition to reduce the modified A matrix.

    Parameters
    -----
    A : scipy lil_matrix
        modified counts matrix
    k : int
        number of reduced dimensions
    return_svs : bool, default is False
        when True, returns the array of singular values for each reduced
        dimension

    Returns
    -----
    Aprime : matrix
        A matrix reduced to k dimensions
    s : array
        singular values for each reduced dimension
        only returned if return_svs=True
    '''
    u, s, vt = linalg.svds(A, k=k, return_singular_vectors='vh')
    Aprime = np.dot(np.diag(s), vt).T
    if return_svs:
        return Aprime, s
    return Aprime
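# Usage sketch for reduce_dims (an assumption, not from the original module:
# build_counts/tfidf are replaced by a random sparse stand-in, and np/linalg
# are assumed imported as above). Note that Aprime = (diag(s) @ vt).T has one
# row per *column* of A, so documents are expected in the columns.
import numpy as np
from scipy.sparse import random as sparse_random

A = sparse_random(200, 40, density=0.05, format='lil', random_state=0)  # terms x docs
Aprime, s = reduce_dims(A, k=10, return_svs=True)
print(Aprime.shape, s.shape)  # (40, 10) and (10,)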
def get_coords(axes='gene', rows=None, time_val=None, spatial_idxs=None,
               ids=None):
    bdnet = nio.getBDTNP()
    gene_matrix = array([v['vals'][:, time_val] for v in bdnet.values()
                         if str(time_val + 1) in v['steps']])
    # iterate over items so the filter uses each key's own value
    # (the original reused a leaked loop variable `v` here)
    gene_matrix_keys = [k for k, v in bdnet.items()
                        if str(time_val + 1) in v['steps']]
    if axes == 'gene':
        import scipy.sparse as ssp
        import scipy.sparse.linalg as las
        import scipy.sparse.lil as ll
        adj = ssp.csr_matrix(gene_matrix.T)
        n_c = 3
        U, s, Vh = las.svds(adj, n_c)
        filtered_genes = (ll.lil_matrix(U) * ll.lil_matrix(diag(s))
                          * ll.lil_matrix(Vh))
        xs_gene = U[ids, 0]
        ys_gene = U[ids, 1]
        zs_gene = U[ids, 2]
    elif axes == 'space':
        space_space = array([[[r[idxs] for idxs in sidxs]
                              for sidxs in spatial_idxs] for r in rows])
        space_space = space_space[:, :, time_val]
        xs_gene = space_space[ids, 0]
        ys_gene = space_space[ids, 1]
        zs_gene = space_space[ids, 2]
    return xs_gene, ys_gene, zs_gene
def write_svd(norm_matrix, rank, prefix):
    u, s, vt = svds(norm_matrix, k=rank)
    # svds returns singular values in ascending order; flip to descending
    u = u[:, ::-1][:, :rank]
    s = s[::-1][:rank]
    v = vt.T[:, ::-1][:, :rank]
    for mat, name in ((u, 'u'), (s, 's'), (v, 'v')):
        np.save("%s_%s.npy" % (prefix, name), mat)
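# Round-trip sketch (not from the original source; the /tmp path is
# illustrative, and the file names follow the prefix convention above):
# after write_svd, the three saved arrays reconstruct the best rank-`rank`
# approximation, so the spectral-norm error equals the next singular value.
import numpy as np

M = np.random.default_rng(0).standard_normal((30, 20))
write_svd(M, rank=4, prefix='/tmp/demo')
u = np.load('/tmp/demo_u.npy')
s = np.load('/tmp/demo_s.npy')
v = np.load('/tmp/demo_v.npy')
print(np.linalg.norm(M - u @ np.diag(s) @ v.T, 2),
      np.linalg.svd(M, compute_uv=False)[4])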
def spectral_partition(W, q, method='complete', metric='cosine'):
    n, m = W.shape
    K = Kmatrix(W)
    if n == m:
        try:
            e, v = linalg.eigen(K, q)
        except TypeError:
            e, v = linalg.eigs(K, q)
    else:
        try:
            u, e, v = linalg.svds(K, q)
        except AttributeError:
            u, e, v = linalg.svd(K, q)
        v = np.concatenate((u, v.T), 0)
    max_index = e.argmax()
    v = np.delete(v, max_index, 1)
    Obs = np.real(v)
    D = distance.pdist(Obs, metric=metric)
    D = np.multiply(D >= 0, D)
    Z = linkage(D, method=method, metric=metric)
    cluster = fcluster(Z, q, criterion='maxclust')
    cluster += -1
    cluster = {'spectral': cluster}
    return cluster
def get_svs(documents, k=50):
    '''
    Returns the k singular values of the modified counts matrix. These
    values can be plotted to determine the optimal k value for LSI.

    Parameters
    -----
    documents : array of vectors
        each vector is given as the non-zero indices of the unreduced vector
        > ex: The vector [0 0 2 0 1] should be input as [2, 2, 4]
    k : int, default is 50
        number of singular values

    Returns
    -----
    s : array
        array of k singular values
    '''
    # build counts matrix
    A = build_counts(documents)
    # modify using TF-IDF
    A2 = tfidf(A)
    # reduce using SVD
    s = linalg.svds(A2, k=k, return_singular_vectors=False)
    # return singular values in descending order
    return s[::-1]
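# Choosing k from the returned spectrum (a sketch, not from the original
# module; assumes `documents` is defined as described in the docstring):
# plot the singular values and look for the elbow.
import matplotlib.pyplot as plt

s = get_svs(documents, k=50)
plt.plot(range(1, len(s) + 1), s, marker='.')
plt.xlabel('component')
plt.ylabel('singular value')
plt.show()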
def fit(self, t1, t2):
    assert self.Ais is not None, "!!!! First, distribute rows of A using disRand or disBAM"
    d = self.d                 # number of distributed matrices
    m = np.shape(self.A)[1]    # dimension of the row space
    Bis = [None] * d           # outputs of local PCA
    Atis = [None] * d          # rank-t1 approximation of each Ai
    # local PCA
    for i in range(d):
        ni = self.Ais[i].shape[0]
        if t1 < ni:
            # target rank t1 is less than the number of rows
            U, S, Vt = svds(self.Ais[i], k=t1)
            Bis[i] = np.diag(S).dot(Vt)
            Atis[i] = U.dot(Bis[i])
        else:
            # number of rows is at most t1: fall back to a full SVD
            U, S, Vt = svd(self.Ais[i])
            Bis[i] = np.diag(S[:ni]).dot(Vt[:ni, :])
            Atis[i] = self.Ais[i]
    # global PCA
    K = np.zeros((m, m))
    for i in range(d):
        K += Bis[i].T.dot(Bis[i])
    L, Q = eigs(K, k=t2)
    self.C = Q.real
    self.Bis = Bis
    self.Atis = Atis
def cluster_fps(self):
    clkg = hcluster.linkage(self.dm, method='average')
    coarse_r = hcluster.fcluster(clkg, 0.3, criterion='distance')
    self.coarse_r = coarse_r
    bcount = np.bincount(coarse_r)
    knum = len(np.nonzero(bcount > 1)[0])
    s = self.density_matrix.shape
    # the SVD refinement below is currently disabled by the leading `False and`
    if False and len(s) > 1 and s[0] > 10 and s[1] > 10 and knum < min(s) / 2:
        (u, s, vt) = la.svds(self.sps_matrixs, k=knum)
        self.u = u
        print('============')
    else:
        self.result = self.coarse_r
        return (clkg, clkg)
    a = np.matrix(np.diag(s)) * np.matrix(vt)
    pd = dist.pdist(np.array(a.T), 'cosine')
    pd[np.abs(pd) < 1e-11] = 0
    lkg = hcluster.linkage(pd, method='average')
    self.lkg = lkg
    self.result = hcluster.fcluster(lkg, self.svd_cluster_thr,
                                    criterion='distance')
    return (lkg, clkg)
def cli(): parser = argparse.ArgumentParser() parser.add_argument( '--nfo', action='store_true', help='''Compute or plot the singular-value decomposition of the near-field operator (NFO).''') parser.add_argument( '--lso', action='store_true', help='''Compute or plot the singular-value decomposition of the Lippmann-Schwinger operator (LSO).''') parser.add_argument( '--numVals', '-k', type=int, help='''Specify the number of singular values/vectors to compute. Must a positive integer between 1 and the order of the square input matrix.''') parser.add_argument( '--domain', '-d', type=str, choices=['time', 'freq'], help='''Specify whether to compute the singular-value decomposition in the time domain or frequency domain. Default is set to frequency domain for faster, more accurate performance.''') parser.add_argument( '--plot', '-p', action='store_true', help='''Plot the computed singular values and vectors.''') parser.add_argument( '--format', '-f', type=str, default='pdf', choices=['png', 'pdf', 'ps', 'eps', 'svg'], help= '''Specify the image format of the saved file. Accepted formats are png, pdf, ps, eps, and svg. Default format is set to pdf.''') parser.add_argument( '--mode', type=str, choices=['light', 'dark'], required=False, help='''Specify whether to view plots in light mode for daytime viewing or dark mode for nighttime viewing. Mode must be either \'light\' or \'dark\'.''') args = parser.parse_args() if args.nfo and not args.lso: operatorType = 'near-field operator' inputType = 'data' try: SVD = np.load('NFO_SVD.npz') s = SVD['s'] Uh = SVD['Uh'] V = SVD['V'] domain = SVD['domain'] except FileNotFoundError: s, Uh, V, domain = None, None, None, 'freq' elif not args.nfo and args.lso: operatorType = 'Lippmann-Schwinger operator' inputType = 'test functions' try: SVD = np.load('LSO_SVD.npz') s = SVD['s'] Uh = SVD['Uh'] V = SVD['V'] domain = SVD['domain'] except FileNotFoundError: s, Uh, V, domain = None, None, None, 'freq' elif args.nfo and args.lso: sys.exit( textwrap.dedent(''' UsageError: Please specify only one of the arguments \'--nfo\' or \'--lso\'. ''')) else: sys.exit( textwrap.dedent(''' For which operator would you like to compute or plot a singular-value decomposition? Enter: vzsvd --nfo for the near-field operator or vzsvd --lso for the Lippmann-Schwinger operator. ''')) #============================================================================== # if an SVD already exists... if any(v is not None for v in [s, Uh, V]) and args.numVals is not None and args.plot is True: if args.numVals >= 1 and args.numVals == len(s): userResponded = False print( textwrap.dedent(''' A singular-value decomposition of the {s} for {n} values/vectors already exists. What would you like to do? Enter '1' to specify a new number of values/vectors to compute. (Default) Enter '2' to recompute a singular-value decomposition for {n} values/vectors. Enter 'q/quit' to exit. '''.format(s=operatorType, n=args.numVals))) while userResponded == False: answer = input('Action: ') if answer == '' or answer == '1': k = int( input( 'Please specify the number of singular values/vectors to compute: ' )) if isValid(k): print('Proceeding with numVals = %s...' % (k)) userResponded = True computeSVD = True break else: break elif answer == '2': k = args.numVals print( 'Recomputing SVD of the %s for %s singular values/vectors...' % (operatorType, k)) userResponded = True computeSVD = True elif answer == 'q' or answer == 'quit': sys.exit('Exiting program.\n') else: print( 'Invalid response. 
Please enter \'1\', \'2\', or \'q/quit\'.' ) elif args.numVals >= 1 and args.numVals != len(s): k = args.numVals computeSVD = True elif args.numVals < 1: userResponded = False print( textwrap.dedent(''' ValueError: Argument '-k/--numVals' must be a positive integer between 1 and the order of the square input matrix. The parameter will be set to the default value of 6. What would you like to do? Enter '1' to specify a value of the parameter. (Default) Enter '2' to proceed with the default value. Enter 'q/quit' exit the program. ''')) while userResponded == False: answer = input('Action: ') if answer == '' or answer == '1': k = int( input( 'Please specify the number of singular values/vectors to compute: ' )) if isValid(k): print('Proceeding with numVals = %s...' % (k)) userResponded = True computeSVD = True break else: break elif answer == '2': k = 6 print('Proceeding with the default value numVals = %s...' % (k)) computeSVD = True userResponded = True break elif answer == 'q' or answer == 'quit': sys.exit('Exiting program.\n') else: print( 'Invalid response. Please enter \'1\', \'2\', or \'q/quit\'.' ) elif all(v is not None for v in [s, Uh, V]) and args.numVals is None and args.plot is True: computeSVD = False elif all(v is not None for v in [s, Uh, V]) and args.numVals is not None and args.plot is False: if args.numVals >= 1 and args.numVals == len(s): userResponded = False print( textwrap.dedent(''' A singular-value decomposition of the {s} for {n} values/vectors already exists. What would you like to do? Enter '1' to specify a new number of values/vectors to compute. (Default) Enter '2' to recompute a singular-value decomposition for {n} values/vectors. Enter 'q/quit' to exit. '''.format(s=operatorType, n=args.numVals))) while userResponded == False: answer = input('Action: ') if answer == '' or answer == '1': k = int( input( 'Please specify the number of singular values/vectors to compute: ' )) if isValid(k): print('Proceeding with numVals = %s...' % (k)) userResponded = True computeSVD = True break else: break elif answer == '2': k = args.numVals print( 'Recomputing SVD of the %s for %s singular values/vectors...' % (operatorType, k)) userResponded = True computeSVD = True elif answer == 'q' or answer == 'quit': sys.exit('Exiting program.\n') else: print( 'Invalid response. Please enter \'1\', \'2\', or \'q/quit\'.' ) elif args.numVals >= 1 and args.numVals != len(s): k = args.numVals computeSVD = True elif args.numVals < 1: userResponded = False print( textwrap.dedent(''' ValueError: Argument '-k/--numVals' must be a positive integer between 1 and the order of the square input matrix. The parameter will be set to the default value of 6. What would you like to do? Enter '1' to specify a value of the parameter. (Default) Enter '2' to proceed with the default value. Enter 'q/quit' exit the program. ''')) while userResponded == False: answer = input('Action: ') if answer == '' or answer == '1': k = int( input( 'Please specify the number of singular values/vectors to compute: ' )) if isValid(k): print('Proceeding with numVals = %s...' % (k)) userResponded = True computeSVD = True break else: break elif answer == '2': k = 6 print('Proceeding with the default value numVals = %s...' % (k)) computeSVD = True userResponded = True break elif answer == 'q' or answer == 'quit': sys.exit('Exiting program.\n') else: print( 'Invalid response. Please enter \'1\', \'2\', or \'q/quit\'.' 
) elif all(v is not None for v in [s, Uh, V]) and args.numVals is None and args.plot is False: sys.exit( textwrap.dedent(''' No action specified. A singular-value decomposition of the %s for %s values/vectors already exists. Please specify at least one of '-k/--numVals' or '-p/--plot' arguments with 'vzsvd' command. ''' % (operatorType, len(s)))) #============================================================================== # if an SVD does not already exist... elif any(v is None for v in [s, Uh, V]) and args.numVals is not None and args.plot is True: if args.numVals >= 1: computeSVD = True k = args.numVals elif args.numVals < 1: userResponded = False print( textwrap.dedent(''' ValueError: Argument '-k/--numVals' must be a positive integer between 1 and the order of the square input matrix. The parameter will be set to the default value of 6. What would you like to do? Enter '1' to specify a value of the parameter. (Default) Enter '2' to proceed with the default value. Enter 'q/quit' exit the program. ''')) while userResponded == False: answer = input('Action: ') if answer == '' or answer == '1': k = int( input( 'Please specify the number of singular values/vectors to compute: ' )) if isValid(k): print('Proceeding with numVals = %s...' % (k)) userResponded = True computeSVD = True break else: break elif answer == '2': k = 6 print('Proceeding with the default value numVals = %s...' % (k)) computeSVD = True userResponded = True break elif answer == 'q' or answer == 'quit': sys.exit('Exiting program.\n') else: print( 'Invalid response. Please enter \'1\', \'2\', or \'q/quit\'.' ) elif any(v is None for v in [s, Uh, V]) and args.numVals is None and args.plot is True: userResponded = False print( textwrap.dedent(''' PlotError: A singular-value decomposition of the {s} does not exist. A plot will be generated after a singular-value decomposition has been computed. Enter '1' to specify a number of singular values/vectors to compute. (Default) Enter 'q/quit' to exit. '''.format(s=operatorType))) while userResponded == False: answer = input('Action: ') if answer == '' or answer == '1': k = int( input( 'Please specify the number of singular values/vectors to compute: ' )) if isValid(k): print('Proceeding with numVals = %s...' % (k)) userResponded = True computeSVD = True break else: break elif answer == 'q' or answer == 'quit': sys.exit('Exiting program.\n') else: print('Invalid response. Please enter \'1\', or \'q/quit\'.') elif any(v is None for v in [s, Uh, V]) and args.numVals is not None and args.plot is False: if args.numVals >= 1: k = args.numVals computeSVD = True elif args.numVals < 1: userResponded = False print( textwrap.dedent(''' ValueError: Argument '-k/--numVals' must be a positive integer between 1 and the order of the square input matrix. The parameter will be set to the default value of 6. What would you like to do? Enter '1' to specify a value of the parameter. (Default) Enter '2' to proceed with the default value. Enter 'q/quit' exit the program. ''')) while userResponded == False: answer = input('Action: ') if answer == '' or answer == '1': k = int( input( 'Please specify the number of singular values/vectors to compute: ' )) if isValid(k): print('Proceeding with numVals = %s...' % (k)) userResponded = True computeSVD = True break else: break elif answer == '2': k = 6 print('Proceeding with the default value numVals = %s...' % (k)) computeSVD = True userResponded = True break elif answer == 'q' or answer == 'quit': sys.exit('Exiting program.\n') else: print( 'Invalid response. 
Please enter \'1\', \'2\', or \'q/quit\'.' ) elif any(v is None for v in [s, Uh, V]) and args.numVals is None and args.plot is False: sys.exit( textwrap.dedent(''' Nothing to be done. A singular-value decomposition of the {s} does not exist. Please specify at least one of '-k/--numVals' or '-p/--plot' arguments with 'vzsvd' command. '''.format(s=operatorType))) #============================================================================== # Read in data files datadir = np.load('datadir.npz') receiverPoints = np.load(str(datadir['receivers'])) recordingTimes = np.load(str(datadir['recordingTimes'])) dt = recordingTimes[1] - recordingTimes[0] if Path('window.npz').exists(): windowDict = np.load('window.npz') # Apply the receiver window rstart = windowDict['rstart'] rstop = windowDict['rstop'] rstep = windowDict['rstep'] # Apply the time window tstart = windowDict['tstart'] tstop = windowDict['tstop'] tstep = windowDict['tstep'] # Convert time window parameters to corresponding array indices Tstart = int(round(tstart / dt)) Tstop = int(round(tstop / dt)) else: rstart = 0 rstop = receiverPoints.shape[0] rstep = 1 tstart = recordingTimes[0] tstop = recordingTimes[-1] Tstart = 0 Tstop = len(recordingTimes) tstep = 1 # Apply the receiver window rinterval = np.arange(rstart, rstop, rstep) receiverPoints = receiverPoints[rinterval, :] # Apply the time window tinterval = np.arange(Tstart, Tstop, tstep) recordingTimes = recordingTimes[tinterval] # Used for getting time and frequency units if Path('plotParams.pkl').exists(): plotParams = pickle.load(open('plotParams.pkl', 'rb')) else: plotParams = default_params() if computeSVD: # get time units for printing time windows or time shifts tu = plotParams['tu'] if args.nfo: if Path('noisyData.npz').exists(): userResponded = False print( textwrap.dedent(''' Detected that band-limited noise has been added to the data array. Would you like to compute an SVD of the noisy data? ([y]/n) Enter 'q/quit' exit the program. ''')) while userResponded == False: answer = input('Action: ') if answer == '' or answer == 'y' or answer == 'yes': print( 'Proceeding with singular-value decomposition using noisy data...' ) # read in the noisy data array X = np.load('noisyData.npz')['noisyData'] userResponded = True elif answer == 'n' or answer == 'no': print( 'Proceeding with singular-value decomposition using noise-free data...' ) # read in the recorded data array X = np.load(str(datadir['recordedData'])) userResponded = True elif answer == 'q' or answer == 'quit': sys.exit('Exiting program.\n') else: print( 'Invalid response. Please enter \'y/yes\', \'n\no\', or \'q/quit\'.' ) else: # read in the recorded data array X = np.load(str(datadir['recordedData'])) if Path('window.npz').exists(): print('Detected user-specified window:\n') # For display/printing purposes, count receivers with one-based # indexing. 
This amounts to incrementing the rstart parameter by 1 print('window @ receivers : start =', rstart + 1) print('window @ receivers : stop =', rstop) print('window @ receivers : step =', rstep, '\n') if tu != '': print('window @ time : start = %0.2f %s' % (tstart, tu)) print('window @ time : stop = %0.2f %s' % (tstop, tu)) else: print('window @ time : start =', tstart) print('window @ time : stop =', tstop) print('window @ time : step =', tstep, '\n') # Apply the source window slabel = windowDict['slabel'] sstart = windowDict['sstart'] sstop = windowDict['sstop'] sstep = windowDict['sstep'] sinterval = np.arange(sstart, sstop, sstep) # For display/printing purposes, count recordings/sources with one-based # indexing. This amounts to incrementing the sstart parameter by 1 print('window @ %s : start = %s' % (slabel, sstart + 1)) print('window @ %s : stop = %s' % (slabel, sstop)) print('window @ %s : step = %s\n' % (slabel, sstep)) print('Applying window to data volume...') X = X[rinterval, :, :] X = X[:, tinterval, :] X = X[:, :, sinterval] Nr, Nt, Ns = X.shape # Apply tapered cosine (Tukey) window to time signals. # This ensures the fast fourier transform (FFT) used in # the definition of the matrix-vector product below is # acting on a function that is continuous at its edges. peakFreq = pulseFun.peakFreq # Np : Number of samples in the dominant period T = 1 / peakFreq Np = int(round(1 / (tstep * dt * peakFreq))) # alpha is set to taper over 6 of the dominant period of the # pulse function (3 periods from each end of the signal) alpha = 6 * Np / Nt print('Tapering time signals with Tukey window: %d' % (int(round(alpha * 100))) + '%') TukeyWindow = tukey(Nt, alpha) X *= TukeyWindow[None, :, None] else: Nr, Nt, Ns = X.shape elif args.lso: if Path('samplingGrid.npz').exists(): samplingGrid = np.load('samplingGrid.npz') x = samplingGrid['x'] y = samplingGrid['y'] tau = samplingGrid['tau'] if 'z' in samplingGrid: z = samplingGrid['z'] else: z = None else: sys.exit( textwrap.dedent(''' A sampling grid needs to be set up before computing a singular-value decomposition of the %s. Enter: vzgrid --help from the command-line for more information on how to set up a sampling grid. ''' % (operatorType))) pulse = lambda t: pulseFun.pulse(t) velocity = pulseFun.velocity peakFreq = pulseFun.peakFreq peakTime = pulseFun.peakTime if Path('VZTestFuncs.npz').exists(): print( '\nDetected that free-space test functions have already been computed...' ) print( 'Checking consistency with current space-time sampling grid...' ) TFDict = np.load('VZTestFuncs.npz') if samplingIsCurrent(TFDict, receiverPoints, recordingTimes, velocity, tau, x, y, z, peakFreq, peakTime): X = TFDict['TFarray'] sourcePoints = TFDict['samplingPoints'] print('Moving forward to SVD...') else: print('Recomputing test functions...') # set up the convolution times based on length of recording time interval T = recordingTimes[-1] - recordingTimes[0] convolutionTimes = np.linspace(-T, T, 2 * len(recordingTimes) - 1) if tau[0] != 0: if tu != '': print( 'Recomputing test functions for focusing time %0.2f %s...' % (tau[0], tu)) else: print( 'Recomputing test functions for focusing time %0.2f...' 
% (tau[0])) X, sourcePoints = sampleSpace( receiverPoints, convolutionTimes - tau[0], velocity, x, y, z, pulse) else: X, sourcePoints = sampleSpace(receiverPoints, convolutionTimes, velocity, x, y, z, pulse) if z is None: np.savez('VZTestFuncs.npz', TFarray=X, time=recordingTimes, receivers=receiverPoints, peakFreq=peakFreq, peakTime=peakTime, velocity=velocity, x=x, y=y, tau=tau, samplingPoints=sourcePoints) else: np.savez('VZTestFuncs.npz', TFarray=X, time=recordingTimes, receivers=receiverPoints, peakFreq=peakFreq, peakTime=peakTime, velocity=velocity, x=x, y=y, z=z, tau=tau, samplingPoints=sourcePoints) else: print( '\nComputing free-space test functions for the current space-time sampling grid...' ) if tau[0] != 0: if tu != '': print( 'Computing test functions for focusing time %0.2f %s...' % (tau[0], tu)) else: print( 'Computing test functions for focusing time %0.2f...' % (tau[0])) X, sourcePoints = sampleSpace(receiverPoints, recordingTimes - tau[0], velocity, x, y, z, pulse) else: X, sourcePoints = sampleSpace(receiverPoints, recordingTimes, velocity, x, y, z, pulse) if z is None: np.savez('VZTestFuncs.npz', TFarray=X, time=recordingTimes, receivers=receiverPoints, peakFreq=peakFreq, peakTime=peakTime, velocity=velocity, x=x, y=y, tau=tau, samplingPoints=sourcePoints) else: np.savez('VZTestFuncs.npz', TFarray=X, time=recordingTimes, receivers=receiverPoints, peakFreq=peakFreq, peakTime=peakTime, velocity=velocity, x=x, y=y, z=z, tau=tau, samplingPoints=sourcePoints) Nr, Nt, Ns = X.shape #============================================================================== if args.domain is not None: domain = args.domain if domain == 'freq': # Transform convolutional operator into frequency domain and bandpass for efficient SVD print('Transforming %s to the frequency domain...' % (inputType)) N = nextPow2(2 * Nt) X = np.fft.rfft(X, n=N, axis=1) if plotParams['fmax'] is None: freqs = np.fft.rfftfreq(N, tstep * dt) plotParams['fmax'] = np.max(freqs) # Apply the frequency window fmin = plotParams['fmin'] fmax = plotParams['fmax'] fu = plotParams['fu'] # frequency units (e.g., Hz) if fu != '': print('Applying bandpass filter: [%0.2f %s, %0.2f %s]' % (fmin, fu, fmax, fu)) else: print('Applying bandpass filter: [%0.2f, %0.2f]' % (fmin, fmax)) df = 1.0 / (N * tstep * dt) startIndex = int(round(fmin / df)) stopIndex = int(round(fmax / df)) finterval = np.arange(startIndex, stopIndex, 1) X = X[:, finterval, :] #============================================================================== # Compute the k largest singular values (which='LM') of the operator A # Singular values are elements of the vector 's' # Left singular vectors are columns of 'U' # Right singular vectors are columns of 'V' A = asConvolutionalOperator(X) if k == 1: print('Computing SVD of the %s for 1 singular value/vector...' % (operatorType)) else: print('Computing SVD of the %s for %s singular values/vectors...' 
% (operatorType, k)) startTime = time.time() U, s, Vh = svds(A, k, which='LM') endTime = time.time() print('Elapsed time:', humanReadable(endTime - startTime), '\n') # sort the singular values and corresponding vectors in descending order # (i.e., largest to smallest) index = s.argsort()[::-1] s = s[index] Uh = U[:, index].conj().T V = Vh[index, :].conj().T # Write binary output with numpy if args.nfo: np.savez('NFO_SVD.npz', s=s, Uh=Uh, V=V, domain=domain) elif args.lso: np.savez('LSO_SVD.npz', s=s, Uh=Uh, V=V, domain=domain) #============================================================================== if args.plot and all(v is not None for v in [s, Uh, V]): Nr = receiverPoints.shape[0] Nt = len(recordingTimes) try: k except NameError: k = len(s) if args.domain is not None and domain != args.domain: if domain == 'freq': s1 = 'time' s2 = 'frequency' else: s1 = 'frequency' s2 = 'time' sys.exit( textwrap.dedent(''' Error: Attempted to plot the singular-value decomposition in the %s domain, but the decomposition was computed in the %s domain. ''' % (s1, s2))) if domain == 'freq': # plot singular vectors in frequency domain N = nextPow2(2 * Nt) freqs = np.fft.rfftfreq(N, tstep * dt) if plotParams['fmax'] is None: plotParams['fmax'] = np.max(freqs) # Apply the frequency window fmin = plotParams['fmin'] fmax = plotParams['fmax'] df = 1.0 / (N * tstep * dt) startIndex = int(round(fmin / df)) stopIndex = int(round(fmax / df)) finterval = np.arange(startIndex, stopIndex, 1) freqs = freqs[finterval] fmax = freqs[-1] M = len(freqs) Ns = int(V.shape[0] / M) U = np.reshape(Uh.conj().T, (Nr, M, k)) V = np.reshape(V, (Ns, M, k)) else: # domain == 'time' M = 2 * Nt - 1 Ns = int(V.shape[0] / M) U = np.reshape(Uh.T, (Nr, M, k)) V = np.reshape(V, (Ns, M, k)) T = recordingTimes[-1] - recordingTimes[0] times = np.linspace(-T, T, M) if args.nfo: # Near-field operator try: sinterval except NameError: if Path('window.npz').exists(): sstart = windowDict['sstart'] sstop = windowDict['sstop'] sstep = windowDict['sstep'] else: sstart = 0 sstop = Ns sstep = 1 sinterval = np.arange(sstart, sstop, sstep) if 'sources' in datadir: sourcePoints = np.load(str(datadir['sources'])) sourcePoints = sourcePoints[sinterval, :] else: sourcePoints = None else: # if args.lso (Lippmann-Schwinger operator) # in the case of the Lippmann-Schwinger operator, 'sourcePoints' # correspond to sampling points, which should always exist. try: sourcePoints except NameError: if Path('VZTestFuncs.npz').exists(): TFDict = np.load('VZTestFuncs.npz') sourcePoints = TFDict['samplingPoints'] else: sys.exit( textwrap.dedent(''' Error: A sampling grid must exist and test functions computed before a singular-value decomposition of the Lippmann-Schwinger operator can be computed or plotted. 
''')) sstart = 0 sstop = sourcePoints.shape[0] sstep = 1 sinterval = np.arange(sstart, sstop, sstep) # increment source/recording interval and receiver interval to be consistent # with one-based indexing (i.e., count from one instead of zero) sinterval += 1 rinterval += 1 rstart += 1 sstart += 1 if args.mode is not None: plotParams['view_mode'] = args.mode pickle.dump(plotParams, open('plotParams.pkl', 'wb'), pickle.HIGHEST_PROTOCOL) remove_keymap_conflicts({'left', 'right', 'up', 'down', 'save'}) if domain == 'freq': # plot the left singular vectors fig_lvec, ax_lvec_r, ax_lvec_i = setFigure( num_axes=2, mode=plotParams['view_mode']) ax_lvec_r.volume = U.real ax_lvec_i.volume = U.imag ax_lvec_r.index = 0 ax_lvec_i.index = 0 fig_lvec.suptitle('Left-Singular Vector', color=ax_lvec_r.titlecolor, fontsize=16) fig_lvec.subplots_adjust(bottom=0.27, top=0.86) leftTitle_r = vector_title('left', ax_lvec_r.index + 1, 'real') leftTitle_i = vector_title('left', ax_lvec_i.index + 1, 'imag') for ax, title in zip([ax_lvec_r, ax_lvec_i], [leftTitle_r, leftTitle_i]): left_im = plotFreqVectors(ax, ax.volume[:, :, ax.index], freqs, fmin, fmax, rstart, rinterval, receiverPoints, title, 'left', plotParams) lp0 = ax_lvec_r.get_position().get_points().flatten() lp1 = ax_lvec_i.get_position().get_points().flatten() left_cax = fig_lvec.add_axes([lp0[0], 0.12, lp1[2] - lp0[0], 0.03]) lcbar = fig_lvec.colorbar(left_im, left_cax, orientation='horizontal') lcbar.outline.set_edgecolor(ax_lvec_r.cbaredgecolor) lcbar.ax.tick_params(axis='x', colors=ax_lvec_r.labelcolor) lcbar.ax.yaxis.set_major_formatter(FormatStrFormatter('%.1f')) lcbar.set_label('Amplitude', labelpad=5, rotation=0, fontsize=12, color=ax_lvec_r.labelcolor) fig_lvec.canvas.mpl_connect( 'key_press_event', lambda event: process_key_vectors( event, freqs, fmin, fmax, rstart, sstart, rinterval, sinterval, receiverPoints, sourcePoints, plotParams, 'cmplx_left')) # plot the right singular vectors fig_rvec, ax_rvec_r, ax_rvec_i = setFigure( num_axes=2, mode=plotParams['view_mode']) ax_rvec_r.volume = V.real ax_rvec_i.volume = V.imag ax_rvec_r.index = 0 ax_rvec_i.index = 0 fig_rvec.suptitle('Right-Singular Vector', color=ax_rvec_r.titlecolor, fontsize=16) fig_rvec.subplots_adjust(bottom=0.27, top=0.86) rightTitle_r = vector_title('right', ax_rvec_r.index + 1, 'real') rightTitle_i = vector_title('right', ax_rvec_i.index + 1, 'imag') for ax, title in zip([ax_rvec_r, ax_rvec_i], [rightTitle_r, rightTitle_i]): right_im = plotFreqVectors(ax, ax.volume[:, :, ax.index], freqs, fmin, fmax, sstart, sinterval, sourcePoints, title, 'right', plotParams) rp0 = ax_rvec_r.get_position().get_points().flatten() rp1 = ax_rvec_i.get_position().get_points().flatten() right_cax = fig_rvec.add_axes( [rp0[0], 0.12, rp1[2] - rp0[0], 0.03]) rcbar = fig_rvec.colorbar(right_im, right_cax, orientation='horizontal') rcbar.outline.set_edgecolor(ax_rvec_r.cbaredgecolor) rcbar.ax.tick_params(axis='x', colors=ax_rvec_r.labelcolor) rcbar.ax.yaxis.set_major_formatter(FormatStrFormatter('%.1f')) rcbar.set_label('Amplitude', labelpad=5, rotation=0, fontsize=12, color=ax_lvec_r.labelcolor) fig_rvec.canvas.mpl_connect( 'key_press_event', lambda event: process_key_vectors( event, freqs, fmin, fmax, rstart, sstart, rinterval, sinterval, receiverPoints, sourcePoints, plotParams, 'cmplx_right')) else: # domain == 'time' fig_vec, ax_lvec, ax_rvec = setFigure(num_axes=2, mode=plotParams['view_mode']) ax_lvec.volume = U ax_lvec.index = 0 leftTitle = vector_title('left', ax_lvec.index + 1) 
plotWiggles(ax_lvec, ax_lvec.volume[:, :, ax_lvec.index], times, -T, T, rstart, rinterval, receiverPoints, leftTitle, 'left', plotParams) ax_rvec.volume = V ax_rvec.index = 0 rightTitle = vector_title('right', ax_rvec.index + 1) plotWiggles(ax_rvec, ax_rvec.volume[:, :, ax_rvec.index], times, -T, T, sstart, sinterval, sourcePoints, rightTitle, 'right', plotParams) fig_vec.tight_layout() fig_vec.canvas.mpl_connect( 'key_press_event', lambda event: process_key_vectors( event, times, -T, T, rstart, sstart, rinterval, sinterval, receiverPoints, sourcePoints, plotParams)) #============================================================================== # plot the singular values # figure and axis for singular values fig_vals, ax_vals = setFigure(num_axes=1, mode=plotParams['view_mode']) n = np.arange(1, k + 1, 1) kappa = s[0] / s[-1] # condition number = max(s) / min(s) ax_vals.plot(n, s, '.', clip_on=False, markersize=9, label=r'Condition Number: %0.1e' % (kappa), color=ax_vals.pointcolor) ax_vals.set_xlabel('n', color=ax_vals.labelcolor) ax_vals.set_ylabel('$\sigma_n$', color=ax_vals.labelcolor) legend = ax_vals.legend(title='Singular Values', loc='upper center', bbox_to_anchor=(0.5, 1.25), markerscale=0, handlelength=0, handletextpad=0, fancybox=True, shadow=True, fontsize='large') legend.get_title().set_fontsize('large') ax_vals.set_xlim([1, k]) ax_vals.set_ylim(bottom=0) ax_vals.locator_params(axis='y', nticks=6) ax_vals.ticklabel_format(style='sci', axis='y', scilimits=(0, 0)) fig_vals.tight_layout() fig_vals.savefig('singularValues.' + args.format, format=args.format, bbox_inches='tight', facecolor=fig_vals.get_facecolor()) plt.show()
def hysime(y, n, Rn, verbose=True):
    """HySime: estimate the hyperspectral signal subspace dimension.

    Parameters (assumed np.matrix, as the .getH() calls below require):
    y  : L x N data-set matrix (L bands, N pixels)
    n  : L x N noise matrix
    Rn : L x L noise correlation matrix
    """
    L, N = y.shape
    if not np.prod(y.shape):
        raise ValueError("the data set is empty")
    Ln, Nn = n.shape
    d1, d2 = Rn.shape
    if Ln != L or Nn != N:
        raise ValueError("empty noise matrix or its size does "
                         "not agree with size of y")
    if d1 != d2 or d1 != L:
        print("Bad noise correlation matrix")
        Rn = n * n.getH() / N
    x = y - n

    if verbose:
        print("Computing the correlation matrices")
    Ry = y * y.getH() / N  # sample correlation matrix
    Rx = x * x.getH() / N  # signal correlation matrix estimate
    if verbose:
        print("Computing the eigen vectors of the signal correlation matrix")
    # eigenvectors of Rx in decreasing order of eigenvalue, equation (15);
    # a full dense SVD replaces the original truncated svds call, which
    # cannot return the complete spectrum
    E, dx, _ = np.linalg.svd(Rx)
    E = np.matrix(E)

    if verbose:
        print("Estimating the number of endmembers")
    Rn = Rn + np.sum(np.diag(Rx)) / L / 10 ** 10 * np.identity(L)
    Py = np.asarray(np.diag(E.getH() * Ry * E)).ravel()  # equation (23)
    Pn = np.asarray(np.diag(E.getH() * Rn * E)).ravel()  # equation (24)
    cost_F = -Py + 2 * Pn  # equation (22)
    kf = int(np.sum(cost_F < 0))
    ind_asc = np.argsort(cost_F)
    Ek = E[:, ind_asc[:kf]]
    if verbose:
        print("The signal subspace dimension is: k = %d" % kf)

    # only for plot purposes, equation (19)
    Py_sort = float(np.trace(Ry)) - np.cumsum(Py[ind_asc])
    Pn_sort = 2 * np.cumsum(Pn[ind_asc])
    cost_F_sort = Py_sort + Pn_sort
    indice = np.arange(1, L + 1)
    plt.figure()
    plt.semilogy(indice, cost_F_sort, indice, Py_sort, indice, Pn_sort)
    plt.xlabel("k")
    plt.ylabel("mse(k)")
    plt.title("HySime")
    plt.legend(["Mean Squared Error", "Projection Error", "Noise Power"])
    plt.show()
    return kf, Ek
def make_assoc_dict(deps, minct=100, svd=False, outpath=None): #def make_tc_dict(deps, minct=2, laplace=1.0, positive=True, outpath=None): #def make_tc_dict(deps, mostcommon=0.8, laplace=1.0, positive=True, outpath=None): """PMI. Laplace smooothing not currently implemented. """ ctr = defaultdict(int) # for keeping joint counts wordctr = defaultdict(int) ctxctr = defaultdict(int) print('Getting counts') for triple in tqdm(deps): ctr[tuple([triple[0] + '-' + triple[1], triple[2]])] += 1 ctxctr[triple[0] + '-' + triple[1]] += 1 wordctr[triple[1]] += 1 wordctr[triple[2]] += 1 # Enforcing min. ct. ctr = { k: v for k, v in ctr.items() if all(x > minct for x in [wordctr[k[1]], ctxctr[k[0]]]) } wordctr = {k: v for k, v in wordctr.items() if wordctr[k] > minct} ctxctr = {k: v for k, v in ctxctr.items() if ctxctr[k] > minct} total = sum(v for v in ctr.values()) print('\n# total triples: {}'.format(total)) print('# words: {}'.format(len(wordctr))) print('# rel-gov pairs: {}\n'.format(len(ctxctr))) print('Converting to PMI') for k in tqdm(ctr.keys()): ctr[k] /= wordctr[k[1]] ctr[k] /= ctxctr[k[0]] ctr[k] *= total ctr[k] = np.log2(ctr[k]) print('Converting to pandas') ctrpd = pd.Series(ctr).reset_index() ctrpd.columns = ['Rel-Gov', 'Dep', 'PMI'] ctrpd['PMI'] = pd.arrays.SparseArray(ctrpd['PMI']) # Converting to sparse if outpath and not svd: print('Writing to compressed .csv') ctrpd.to_csv(os.path.join(outpath, 'specPMI_' + str(minct) + 'cutoff.csv.gz'), compression='gzip', na_rep=np.nan) elif outpath and svd: print('Performing SVD w/ {} dimensions'.format(svd)) print('NOTE: Converting to (sparse) positive PMI for consistency!') ctrpd['PMI'].clip(lower=0.0, inplace=True) # Making it PPMI in place #ctrpd = ctrpd.astype(pd.SparseDtype(np.float32, fill_value=0.0)) ctrpd = ctrpd.pivot( index='Dep', columns='Rel-Gov', values='PMI', ).fillna(0.0) print('Density: {}'.format(ctrpd.sparse.density)) wvecs, singvals, cvecs = svds(ctrpd.sparse.to_coo(), k=svd) print('Making word and context symmetric (Levy et al. 2015)') wsym = wvecs * np.sqrt(singvals) # Transposing so context vectors are in the rows csym = cvecs.T * np.sqrt(singvals) wsym = pd.DataFrame(wsym, index=ctrpd.index) csym = pd.DataFrame(csym, index=ctrpd.columns) print('Writing to compressed .csv') wsym.to_csv(os.path.join( outpath, 'WordVecs' + str(minct) + 'SVD' + str(svd) + '.csv.gz'), compression='gzip', na_rep=np.nan) csym.to_csv(os.path.join( outpath, 'ContextVecs' + str(minct) + 'SVD' + str(svd) + '.csv.gz'), compression='gzip', na_rep=np.nan) return ctrpd
def test_svds_input_validation_A(self, args):
    A, error_type, message = args
    with pytest.raises(error_type, match=message):
        svds(A, k=1, solver=self.solver)
def test_svds_input_validation_tol_2(self, tol):
    # I think the stack trace is reasonable here
    message = "'<' not supported between instances"
    with pytest.raises(TypeError, match=message):
        svds(np.eye(10), tol=tol, solver=self.solver)
def test_svd_linop(self): solver = self.solver nmks = [(6, 7, 3), (9, 5, 4), (10, 8, 5)] def reorder(args): U, s, VH = args j = np.argsort(s) return U[:, j], s[j], VH[j, :] for n, m, k in nmks: # Test svds on a LinearOperator. A = np.random.RandomState(52).randn(n, m) L = CheckingLinearOperator(A) if solver == 'propack': v0 = np.ones(n) else: v0 = np.ones(min(A.shape)) U1, s1, VH1 = reorder(svds(A, k, v0=v0, solver=solver)) U2, s2, VH2 = reorder(svds(L, k, v0=v0, solver=solver)) assert_allclose(np.abs(U1), np.abs(U2)) assert_allclose(s1, s2) assert_allclose(np.abs(VH1), np.abs(VH2)) assert_allclose(np.dot(U1, np.dot(np.diag(s1), VH1)), np.dot(U2, np.dot(np.diag(s2), VH2))) # Try again with which="SM". A = np.random.RandomState(1909).randn(n, m) L = CheckingLinearOperator(A) # TODO: arpack crashes when v0=v0, which="SM" kwargs = {'v0': v0} if solver not in {None, 'arpack'} else {} U1, s1, VH1 = reorder( svds(A, k, which="SM", solver=solver, **kwargs)) U2, s2, VH2 = reorder( svds(L, k, which="SM", solver=solver, **kwargs)) assert_allclose(np.abs(U1), np.abs(U2)) assert_allclose(s1, s2) assert_allclose(np.abs(VH1), np.abs(VH2)) assert_allclose(np.dot(U1, np.dot(np.diag(s1), VH1)), np.dot(U2, np.dot(np.diag(s2), VH2))) if k < min(n, m) - 1: # Complex input and explicit which="LM". for (dt, eps) in [(complex, 1e-7), (np.complex64, 1e-3)]: rng = np.random.RandomState(1648) A = (rng.randn(n, m) + 1j * rng.randn(n, m)).astype(dt) L = CheckingLinearOperator(A) U1, s1, VH1 = reorder(svds(A, k, which="LM", solver=solver)) U2, s2, VH2 = reorder(svds(L, k, which="LM", solver=solver)) assert_allclose(np.abs(U1), np.abs(U2), rtol=eps) assert_allclose(s1, s2, rtol=eps) assert_allclose(np.abs(VH1), np.abs(VH2), rtol=eps) assert_allclose(np.dot(U1, np.dot(np.diag(s1), VH1)), np.dot(U2, np.dot(np.diag(s2), VH2)), rtol=eps)
def test_svds_input_validation_return_singular_vectors(self, rsv):
    message = "`return_singular_vectors` must be in"
    with pytest.raises(ValueError, match=message):
        svds(np.eye(10), return_singular_vectors=rsv, solver=self.solver)
def test_svds_input_validation_maxiter_1(self, maxiter):
    message = "`maxiter` must be a positive integer."
    with pytest.raises(ValueError, match=message):
        svds(np.eye(10), maxiter=maxiter, solver=self.solver)
def test_svds_input_validation_v0_3(self, v0):
    A = np.ones((10, 10))
    message = "`v0` must be of floating or complex floating data type."
    with pytest.raises(ValueError, match=message):
        svds(A, k=1, v0=v0, solver=self.solver)
def test_svds_input_validation_v0_2(self):
    A = np.ones((10, 10))
    v0 = np.ones((1, 10))
    message = "`v0` must have shape"
    with pytest.raises(ValueError, match=message):
        svds(A, k=1, v0=v0, solver=self.solver)
def test_svds_input_validation_solver(self, solver):
    message = "solver must be one of"
    with pytest.raises(ValueError, match=message):
        svds(np.ones((3, 4)), k=2, solver=solver)
def test_svds_input_validation_tol_1(self, tol):
    message = "`tol` must be a non-negative floating point value."
    with pytest.raises(ValueError, match=message):
        svds(np.eye(10), tol=tol, solver=self.solver)
# Tail of an SGD matrix-factorization routine: accumulate the squared error
# over the observed (nonzero) entries plus the L2 penalty, and stop early
# once the total error is small.
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - np.dot(P[i, :], Q[:, j]), 2)
                    for k in range(K):
                        e = e + (beta / 2) * (pow(P[i][k], 2) + pow(Q[k][j], 2))
        if e < 0.001:
            break
    return P, Q.T

nets_train, nets_test = train_test_split(nets, 0.3)  # custom split helper, not sklearn's signature
print(nets_train.shape)
print(nets_test.shape)

nets_mean = np.mean(nets_train, axis=1)
nets_demeaned = nets_train - nets_mean.reshape(-1, 1)
# print(R_demeaned)

from scipy.sparse.linalg import svds
U, sigma, Vt = svds(nets_demeaned, k=25)
sigma = np.diag(sigma)
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + nets_mean.reshape(-1, 1)
print(all_user_predicted_ratings.shape)
print(nets.shape)

from math import sqrt
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    # Compare only the observed (nonzero) entries.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

# RMSE on the held-out entries (assumes the split masks entries, so the
# train and test matrices share the same shape).
print(sqrt(get_mse(all_user_predicted_ratings, nets_test)))
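# Why the demeaning step matters: a truncated SVD of the raw ratings matrix
# shrinks every prediction toward zero, while subtracting each user's mean
# first (and adding it back after the reconstruction) lets the k factors
# model deviations from the user's baseline. A minimal sketch on made-up
# ratings:
import numpy as np
from scipy.sparse.linalg import svds

R = np.array([[5., 4., 0., 1.],
              [4., 0., 0., 1.],
              [1., 1., 0., 5.],
              [1., 0., 4., 4.],
              [0., 1., 5., 4.]])
user_mean = R.mean(axis=1, keepdims=True)
U, s, Vt = svds(R - user_mean, k=2)
R_hat = U @ np.diag(s) @ Vt + user_mean
print(np.round(R_hat, 2))  # dense predictions, including the zero cells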
# Populate our empty matrix with the cumulative scores between teams
for index, row in df_scores.iterrows():
    # get id
    home_id = row['Home_Team_id']
    away_id = row['Away_Team_id']
    # convert id to a number in range 0-29
    home_num = df_team_ids.loc[df_team_ids['TEAM_ID'] == home_id]['SV_TEAM_ID']
    away_num = df_team_ids.loc[df_team_ids['TEAM_ID'] == away_id]['SV_TEAM_ID']
    home_num = home_num.iloc[0]
    away_num = away_num.iloc[0]
    # add scores to matrix
    df_scores_mat[home_num][away_num] += row['Home_PTS']
    df_scores_mat[away_num][home_num] += row['Visitor_PTS']

scores_mat = df_scores_mat.to_numpy()  # DataFrame.as_matrix() was removed in pandas 1.0

# apply SVD
from scipy.sparse.linalg import svds
U, s, V = svds(scores_mat, k=2)

# plot latent vectors
plt.scatter(U[:, 0], U[:, 1], c=np.arange(0, 30))
plt.show()

# scale by the singular values and plot again
U2 = np.dot(U, np.sqrt(np.diag(s)))
plt.scatter(U2[:, 0], U2[:, 1], c=np.arange(0, 30))
plt.show()
# Creating a sparse pivot table with users in rows and items in columns
users_items_pivot_matrix_df = interactions_train_df.pivot(
    index='personId', columns='contentId', values='eventStrength').fillna(0)
users_items_pivot_matrix_df.head(10)

users_items_pivot_matrix = users_items_pivot_matrix_df.to_numpy()  # as_matrix() was removed in pandas 1.0
users_items_pivot_matrix[:10]

users_ids = list(users_items_pivot_matrix_df.index)
users_ids[:10]

users_items_pivot_sparse_matrix = csr_matrix(users_items_pivot_matrix)
users_items_pivot_sparse_matrix

# The number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 15

# Performs matrix factorization of the original user-item matrix
#U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)
U, sigma, Vt = svds(users_items_pivot_sparse_matrix, k=NUMBER_OF_FACTORS_MF)
U.shape
Vt.shape
sigma = np.diag(sigma)
sigma.shape

all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt)
all_user_predicted_ratings
all_user_predicted_ratings_norm = (
    all_user_predicted_ratings - all_user_predicted_ratings.min()) / (
    all_user_predicted_ratings.max() - all_user_predicted_ratings.min())

# Converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(all_user_predicted_ratings_norm,
                           columns=users_items_pivot_matrix_df.columns,
                           index=users_ids).transpose()
cf_preds_df.head(10)
len(cf_preds_df.columns)
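# Passing a csr_matrix rather than the dense pivot table matters mostly for
# memory: svds only needs matrix-vector products, so the sparse input is
# never densified. A quick shape check on random sparse data:
import numpy as np
from scipy.sparse import random as sparse_random
from scipy.sparse.linalg import svds

X = sparse_random(1000, 500, density=0.01, format='csr', random_state=0)
U, sigma, Vt = svds(X, k=15)
print(U.shape, sigma.shape, Vt.shape)  # (1000, 15) (15,) (15, 500)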
def low_rank_svd(matrix, singular_count=2):
    u, s, vt = svds(matrix, k=singular_count)
    return u, s, vt
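# One caveat worth remembering when wrapping svds like this: with the default
# ARPACK solver the singular values come back in ascending order, the reverse
# of scipy.linalg.svd. Flipping the outputs restores the usual convention:
import numpy as np

A = np.random.RandomState(0).randn(20, 10)
u, s, vt = low_rank_svd(A, singular_count=3)
u, s, vt = u[:, ::-1], s[::-1], vt[::-1]
print(s)  # largest singular value first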
    return pred

item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

sparsity = round(1.0 - len(df) / float(n_users * n_items), 3)
print('The sparsity level of MovieLens100K is ' + str(sparsity * 100) + '%')

import scipy.sparse as sp
from scipy.sparse.linalg import svds

# get SVD components from train matrix. Choose k.
u, s, vt = svds(train_data_matrix, k=10)
s_diag_matrix = np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print('SVD-based CF RMSE: ' + str(rmse(X_pred, test_data_matrix)))
def training():
    db = firestore.client()
    doc_ref = db.collection(u'ratings').where(u'book', u'==', True).stream()
    ref = db.collection_group(u'review')\
        .where(u'reviewed', u'==', True).order_by('user_id')
    docs = ref.stream()
    for doc in docs:
        data1.append(doc.to_dict())
    # print(type(data), data)
    df = pd.DataFrame(data1)
    df1 = df[['user_id', 'book_id', 'rating']]
    # print(df1)
    ratingss = pd.read_csv('dataset/ratings.csv',
                           usecols=['user_id', 'book_id', 'rating'])
    # print(ratings.head(5))
    ratings = pd.concat([ratingss, df1])
    # print(new)
    ratings['user_id'] = ratings['user_id'].apply(str)
    n_users = ratings.user_id.unique().shape[0]
    n_books = ratings.book_id.unique().shape[0]
    print('Number of users = ' + str(n_users) +
          ' | Number of books = ' + str(n_books))
    Ratings = ratings.pivot(index='user_id', columns='book_id',
                            values='rating').fillna(0)
    # Ratings.head()
    R = Ratings.to_numpy()
    user_ratings_mean = np.mean(R, axis=1)
    Ratings_demeaned = R - user_ratings_mean.reshape(-1, 1)
    sparsity = round(1.0 - len(ratings) / float(n_users * n_books), 3)
    # print('The sparsity level of the ratings dataset is ' + str(sparsity * 100) + '%')
    U, sigma, Vt = svds(Ratings_demeaned, k=50)
    sigma = np.diag(sigma)
    # sigma
    all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) \
        + user_ratings_mean.reshape(-1, 1)
    preds = pd.DataFrame(all_user_predicted_ratings, columns=Ratings.columns)
    reader = Reader()
    # Load ratings dataset with Dataset library
    data = Dataset.load_from_df(ratings[['user_id', 'book_id', 'rating']],
                                reader)
    # Split the dataset for 5-fold evaluation
    kf = KFold(n_splits=5)
    svd = SVD()
    for trainset, testset in kf.split(data):
        # train and test algorithm.
        svd.fit(trainset)
        predictions = svd.test(testset)
        # Compute and print Root Mean Squared Error
        accuracy.rmse(predictions, verbose=True)
    trainset = data.build_full_trainset()
    svd.fit(trainset)
    print('Data Trained Successfully')
    with open('model_pickle', 'wb') as f:
        pickle.dump(svd, f)
for i in range(0, rating_train_matrix.shape[0]):
    for j in range(0, rating_train_matrix.shape[1]):
        if rating_train_matrix[i][j] == 0:
            mean_u_ratings[i][j] = 0
print(mean_u_ratings)

# In[11]:
### reference: https://github.com/khanhnamle1994/movielens/blob/master/SVD_Model.ipynb
## use the scipy function to do the singular value decomposition
from scipy.sparse.linalg import svds

U, sigma, Vt = svds(mean_u_ratings, k=200)
sigma = np.diag(sigma)
print(sigma)

# In[12]:
### Reference: https://simplyml.com/generating-recommendations/
## predict the ratings by multiplying the three matrices U, sigma and Vt
predict_rating = np.dot(np.dot(U, sigma), Vt) + mean_user_ratings.reshape(-1, 1)
print(predict_rating)
print(predict_rating.shape)
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

print('Similarity-based CF RMSE: ' + str(rmse(user_prediction, R)))

# Normalize the values and adjust the columns and indices
R = R_df.to_numpy()  # DataFrame.as_matrix() was removed in pandas 1.0
user_ratings_mean = np.mean(R, axis=1)
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

from scipy.sparse.linalg import svds
# Factor the demeaned matrix (not R itself), then add the user means back.
U, sigma, Vt = svds(R_demeaned, k=50)
sigma = np.diag(sigma)
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
preds_df = pd.DataFrame(all_user_predicted_ratings, columns=R_df.columns)

# Min-max scale over the whole frame (the built-in min()/max() would iterate
# over column labels rather than values).
normalized = (preds_df - preds_df.min().min()) / \
    (preds_df.max().max() - preds_df.min().min())

from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 5))
csvt = pd.read_csv('go_track_trackspoints.csv')
tracks = csvt[['track_id', 'latitude', 'longitude']]

# Load the csv generated by update.py
R_df = pd.read_csv('matrix.csv', header=0, index_col=0)

# Convert the pandas dataframe into a numpy matrix for the computations and
# the normalization
R = R_df.to_numpy().astype(np.int64)
user_ratings_mean = np.mean(R, axis=1)
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

from scipy.sparse.linalg import svds
U, sigma, Vt = svds(R_demeaned, k=25)
sigma = np.diag(sigma)
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
preds_df = pd.DataFrame(all_user_predicted_ratings,
                        columns=R_df.columns.astype(np.int64))

def recommend_tracks(predictions_df, userID, tracks, original_ratings_df,
                     num_recommendations=5):
def report():
    import numpy as np  # linear algebra
    import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
    from subprocess import check_output
    import os
    print(os.listdir('../input'))
    diff = pd.read_csv('../input/diffsydiw.csv')
    sym = pd.read_csv('../input/sym_t.csv')
    dia = pd.read_csv('../input/dia_t.csv')
    # print(sym.head())
    # dia['idnr'] = dia['_id'].convert_objects(convert_numeric=True)
    # print(dia.head())
    sd_diff = diff.merge(sym, left_on='syd', right_on='syd')
    # print(sd_diff.head())
    sd_diff = sd_diff.merge(dia, left_on='did', right_on='did')
    # print(sd_diff.head())

    from sklearn.preprocessing import LabelEncoder
    from tqdm import tqdm
    from scipy.sparse import coo_matrix, csr_matrix

    def read_data(filename):
        """ Reads in the dataset and returns a tuple of a pandas dataframe
        and a sparse matrix of song/user/playcount """
        # read in triples of user/song/playcount from the input dataset
        data = pd.read_csv(
            filename,
            usecols=[0, 1, 2],  # [36, 11, 10] vrk_pat_primkey,prd_atc_primkey,vdp_aantal
            names=['user', 'song', 'plays'],
            skiprows=1)  # [:1000000]  here user = patient or prescription nr, song = atc
        data = data.dropna(axis=0, how='any')  # drop nan
        data['plays'] = data['plays'] + 1
        # print(data.head())
        # map each song and user to a unique numeric value
        data['user'] = data['user'].astype("category")
        data['song'] = data['song'].astype("category")
        # create a sparse matrix of all the users/plays
        plays = coo_matrix(
            (data['plays'].astype(float),
             (data['song'].cat.codes.copy(), data['user'].cat.codes.copy())))
        return data, plays, data.groupby(['song']).plays.sum(), \
            data['user'].cat.codes.copy()

    data, matrix, songsd, user = read_data('../input/diffsydiw.csv')
    data.head(10)
    # in this dataset user = symptom and song = diagnosis

    from sklearn.preprocessing import normalize

    def cosine(plays):
        normalized = normalize(plays)
        return normalized.dot(normalized.T)

    def bhattacharya(plays):
        plays.data = np.sqrt(plays.data)
        return cosine(plays)

    def ochiai(plays):
        plays = csr_matrix(plays)
        plays.data = np.ones(len(plays.data))
        return cosine(plays)

    def bm25_weight(data, K1=1.2, B=0.8):
        """ Weighs each row of the matrix data by BM25 weighting """
        # calculate idf per term (user)
        N = float(data.shape[0])
        idf = np.log(N / (1 + np.bincount(data.col)))
        # calculate length_norm per document (artist)
        row_sums = np.squeeze(np.asarray(data.sum(1)))
        average_length = row_sums.sum() / N
        length_norm = (1.0 - B) + B * row_sums / average_length
        # weight matrix rows by bm25
        ret = coo_matrix(data)
        ret.data = ret.data * (K1 + 1.0) / (K1 * length_norm[ret.row] + ret.data) * idf[ret.col]
        return ret

    def bm25(plays):
        plays = bm25_weight(plays)
        return plays.dot(plays.T)

    def get_largest(row, N=10):
        if N >= row.nnz:
            best = zip(row.data, row.indices)
        else:
            ind = np.argpartition(row.data, -N)[-N:]
            best = zip(row.data[ind], row.indices[ind])
        return sorted(best, reverse=True)

    def calculate_similar_artists(similarity, artists, artistid):
        neighbours = similarity[artistid]
        top = get_largest(neighbours)
        return [(artists[other], score, i)
                for i, (score, other) in enumerate(top)]

    # songsd = dict(enumerate(data['song'].cat.categories))
    user_count = data.groupby('user').size()
    # to_generate = sorted(list(songsd), key=lambda x: -user_count[x])
    similarity = bm25_weight(matrix)
    # print(sym)
    sym[sym['syd'].isin(list(songsd.index))]

    from scipy.sparse.linalg import svds
    Ur, Si, VTr = svds(bm25_weight(coo_matrix(matrix)), k=100)
    # print(Ur.shape, Si.shape, VTr.shape, user.shape, matrix.shape, data.shape, songsd.shape, user_count.shape)
    VTr = pd.DataFrame(VTr)
    from sklearn.metrics.pairwise import cosine_similarity
    Sddf = pd.DataFrame(cosine_similarity(Ur, VTr.T),
                        columns=user_count.index,
                        index=list(songsd.index))
    Sddf.to_csv('Sddf.csv')
    Sydi = pd.DataFrame(cosine_similarity(Ur, VTr.T))

    ### changes
    # booknr = 13  # symptom 4
    # b = 'Headache'
    # print('Symptom', sym[sym['symptom'] == b])
    # a = input("Enter your symptom:")
    file = open("Symptom.txt", "r")
    x = file.readlines()
    file.close()
    print(x)
    r1 = []
    for j in range(len(x)):
        a = x[j]
        # print("symptom")
        # print(x[0])
        # print(a)
        data = pd.read_csv("sym_t.csv")
        c = 0
        for i in data['symptom']:
            print(c)
            c += 1
            print(type(i))
            # print("i is" + i)
            if a == (i + "\n"):  # lines read from the file keep their newline
                break
        print("Value of c")
        print(c)
        booknr = c  # symptom 4
        print('Symptom', sym[sym['syd'] == booknr])
        print('top 7 related disease probability')  # , Sddf[booknr].sort_values(ascending=False))
        print()
        # print(sym.loc["symptom"] == "Headache")
        data = pd.read_csv("sym_t.csv", index_col="symptom")
        # print(data.loc["Headache"], ["syd"])
        # print("hi")
        lijst = Sddf[booknr].sort_values(ascending=False).index
        for xi in lijst[:4]:
            r1.append(dia[dia['did'] == xi].diagnose.values)
        # print(type(r1[0][0]))
    file = open("Disease.txt", "a")
    for j in range(len(r1)):
        file.write(r1[j][0])
        file.write("\n")
    file.close()
    # print(r1[0][0])
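# A small sketch of what bm25_weight does to a co-occurrence matrix, with toy
# counts (this assumes the bm25_weight helper defined above is in scope; it
# expects COO input since it reads .row and .col):
import numpy as np
from scipy.sparse import coo_matrix

toy = coo_matrix(np.array([[10., 0., 1.],
                           [0., 2., 0.],
                           [3., 0., 5.],
                           [0., 4., 0.]]))
weighted = bm25_weight(toy)
print(weighted.toarray().round(2))  # long rows are length-normalized, common columns downweighted by idf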
def Demo():
    #__SUPPLY_CHAIN_NETWORK__##################################################

    N = 10  # number of possible maps
    T = 1000  # number of time steps
    eta = .01  # learning rate

    # Define Domains and Compute Equilibria
    Domains = []
    X_Stars = []
    CurlBounds = []
    for n in range(N):
        # Create Domain
        Network = CreateRandomNetwork(I=3, Nm=2, Nd=2, Nr=1, seed=n)
        Domain = SupplyChain(Network=Network, alpha=2)
        # Record Domain
        Domains += [Domain]
        # Set Method
        Method = HeunEuler(Domain=Domain, P=BoxProjection(lo=0), Delta0=1e-3)
        # Initialize Starting Point
        Start = np.zeros(Domain.Dim)
        # Calculate Initial Gap
        gap_0 = Domain.gap_rplus(Start)
        # Calculate Curl Bound
        J = approx_jacobian(Domain.F, Start)
        _J = approx_jacobian(Domain.F, Start + 0.5)
        assert np.allclose(J, _J, atol=1e-5)
        CurlBounds += [
            np.sqrt(18) *
            svds(J, k=1, which='LM', return_singular_vectors=False).item()
        ]
        # Set Options
        Init = Initialization(Step=-1e-10)
        Term = Termination(MaxIter=25000,
                           Tols=[(Domain.gap_rplus, 1e-3 * gap_0)])
        Repo = Reporting(Requests=[Domain.gap_rplus])
        Misc = Miscellaneous()
        Options = DescentOptions(Init, Term, Repo, Misc)
        # Print Stats
        PrintSimStats(Domain, Method, Options)
        # Start Solver
        tic = time.time()
        SupplyChain_Results = Solve(Start, Method, Domain, Options)
        toc = time.time() - tic
        # Print Results
        PrintSimResults(Options, SupplyChain_Results, Method, toc)
        # Record X_Star
        X_Star = SupplyChain_Results.TempStorage['Data'][-1]
        X_Stars += [X_Star]
    X_Stars = np.asarray(X_Stars)

    # Compute Equilibrium of Average Domain
    Domain = AverageDomains(Domains)
    # Set Method
    Method = HeunEuler(Domain=Domain, P=BoxProjection(lo=0), Delta0=1e-3)
    # Initialize Starting Point
    Start = np.zeros(Domain.Dim)
    # Calculate Initial Gap
    gap_0 = Domain.gap_rplus(Start)
    # Set Options
    Init = Initialization(Step=-1e-10)
    Term = Termination(MaxIter=25000, Tols=[(Domain.gap_rplus, 1e-3 * gap_0)])
    Repo = Reporting(Requests=[Domain.gap_rplus])
    Misc = Miscellaneous()
    Options = DescentOptions(Init, Term, Repo, Misc)
    # Print Stats
    PrintSimStats(Domain, Method, Options)
    # Start Solver
    tic = time.time()
    SupplyChain_Results = Solve(Start, Method, Domain, Options)
    toc = time.time() - tic
    # Print Results
    PrintSimResults(Options, SupplyChain_Results, Method, toc)
    # Record X_Opt
    # X_Opt = SupplyChain_Results.TempStorage['Data'][-1]
    X_Opt = np.mean(X_Stars, axis=0)

    print('Starting Online Learning')

    # Set First Prediction
    X = np.zeros(X_Stars.shape[1])
    # Select First Domain
    # idx = np.argmax(np.linalg.norm(X_Stars - X, axis=1))
    idx = 0

    distances = []
    loss_infs = []
    regret_standards = []
    regret_news = []
    stokes = []
    ts = range(T)
    for t in ts:
        print('t = ' + str(t))
        # retrieve domain
        Domain = Domains[idx]
        # retrieve equilibrium / reference vector
        equi = X_Stars[idx]
        # calculate distance
        distances += [np.linalg.norm(equi - X)]
        # calculate infinity loss
        loss_infs += [infinity_loss(Domain, X)]
        # calculate standard regret
        ci_predict = ContourIntegral(Domain, LineContour(equi, X))
        predict_loss = integral(ci_predict)
        ci_opt = ContourIntegral(Domain, LineContour(equi, X_Opt))
        predict_opt = integral(ci_opt)
        regret_standards += [predict_loss - predict_opt]
        # calculate new regret
        ci_new = ContourIntegral(Domain, LineContour(X_Opt, X))
        regret_news += [integral(ci_new)]
        # calculate bound
        # area = 0.5*np.prod(np.sort([np.linalg.norm(X_Opt-equi),np.linalg.norm(X-X_Opt),np.linalg.norm(equi-X)])[:2])  # area upper bound
        area = herons(X_Opt, X, equi)  # exact area
        stokes += [CurlBounds[idx] * area]
        # update prediction
        X = BoxProjection(lo=0).P(X, -eta, Domain.F(X))
        # update domain
        # idx = np.argmax(np.linalg.norm(X_Stars - X, axis=1))
        idx = (idx + 1) % X_Stars.shape[0]

    ts_p1 = range(1, T + 1)
    distances_avg = np.divide(distances, ts_p1)
    loss_infs_avg = np.divide(loss_infs, ts_p1)
    regret_standards_avg = np.divide(regret_standards, ts_p1)
    regret_news_avg = np.divide(regret_news, ts_p1)
    stokes = np.asarray(stokes)

    np.savez_compressed('NoRegret_SCN.npz',
                        d_avg=distances_avg,
                        linf_avg=loss_infs_avg,
                        rs_avg=regret_standards_avg,
                        rn_avg=regret_news_avg,
                        stokes=stokes)

    # plt.subplot(2, 1, 1)
    # plt.plot(ts, distances_avg, 'k', label='Average Distance')
    # plt.title('Demonstration of No-Regret on MLN')
    # plt.ylabel('Euclidean Distance')
    # plt.legend()

    ax = plt.subplot(1, 1, 1)  # keep a handle for fill_between below
    plt.plot(ts, loss_infs_avg, 'k--', label=r'loss$_{\infty}$')
    plt.plot(ts,
             regret_standards_avg,
             'r--o',
             markevery=T // 20,
             label=r'regret$_{s}$')
    plt.plot(ts, regret_news_avg, 'b-', label=r'regret$_{n}$')
    ax.fill_between(ts,
                    regret_news_avg - stokes,
                    regret_news_avg + stokes,
                    facecolor='c',
                    alpha=0.2,
                    zorder=0,
                    label='Stokes Bound')
    plt.plot(ts, np.zeros_like(ts), 'w-', lw=1)
    plt.xlabel('Time Step')
    plt.ylabel('Aggregate System-Wide Loss')
    plt.xlim([0, T])
    plt.ylim([-250, 1000])
    plt.legend()
    plt.title('Demonstration of No-Regret on Supply Chain Network')
    plt.savefig('NoRegret_SCN')
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize
from flask import Flask, request

with open("./app/helper_functions/songData.json", "r") as f:
    song_transcripts = json.load(f)
print("loading this page")

SONGS = [song_transcripts[index] for index in song_transcripts]
songlist = [song["lyrics"] for song in SONGS]
vectorizer = TfidfVectorizer(stop_words="english", max_df=.8)
# svds returns (u, s, vt); index [2] keeps the document factors
docs_compressed = svds(vectorizer.fit_transform(songlist).transpose(),
                       k=40)[2].transpose()

tokenize_transcript(SONGS)
inv_idx = build_inverted_index(SONGS)
idf = compute_idf(inv_idx, len(SONGS))
doc_norms = computer_doc_norms(inv_idx, idf, len(SONGS))

@irsystem.route('/search', methods=["POST"])  # Flask routes need a leading slash
def getQuery():
    #docs_compressed = docs_compressed.transpose()
    my_json = request.get_json()
    inputquery = my_json.get('search').lower()
    returnquery = runQuery(inputquery.lower(), inv_idx, idf, SONGS, doc_norms,
                           docs_compressed)
    data = []
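# A hedged sketch of how a query can be compared against documents in the
# latent space produced above. runQuery is an external helper here, so this
# is an assumption about the approach, not its actual implementation:
import numpy as np
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

docs = ["the cat sat", "dogs and cats", "stock market news", "market crash"]
vec = TfidfVectorizer()
tfidf = vec.fit_transform(docs)         # (n_docs, n_terms)
words, s, docs_c = svds(tfidf.T, k=2)   # words: (n_terms, 2); docs_c: (2, n_docs)
docs_c = normalize(docs_c.T)            # one unit row per document

query = vec.transform(["cat"]).toarray().ravel()
q_latent = normalize((query @ words).reshape(1, -1))  # project query into latent space
print(docs_c @ q_latent.ravel())        # cosine similarity to each document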
def rankOneMatrixPursuit_econ(Y, projMat, solRank, C, projMatC, biasMat,
                              verbose=False):
    M = list()
    # matrix reconstructed using basis matrices
    rec_X = np.zeros((projMat.shape[0], projMat.shape[1]))
    projMat_c = sp.coo_matrix(projMat)
    row_coo, col_coo, dummy = sp.find(projMat_c != 0)
    # materialize the index list: it is reused on every iteration, and a bare
    # zip() would be exhausted after its first pass in Python 3
    proj_coo = list(zip(row_coo, col_coo))
    # projection
    Y_proj = np.multiply(Y.copy(), projMat)
    # get observed entries as a list for solving linear equations to
    # estimate coefficients
    Yp = np.transpose(np.matrix([Y_proj[el] for el in proj_coo]))
    u_set = list()
    v_set = list()
    rmse_list = list()
    count = -1
    for maxIter in range(solRank):
        count = count + 1
        # residual
        Res_k = np.matrix(Y_proj - rec_X)
        # left and right highest singular vectors of the residual matrix
        Res_k_sp = sp.bsr_matrix(Res_k)
        [u, s, vt] = spl.svds(Res_k_sp, k=1, which='LM')
        # rank-1 basis matrix
        Mk = u * vt  # u*np.transpose(v)
        proj_basis_set = list()
        # rec_X is 0 when count == 0
        if count > 0:
            proj_basis_set.append(np.multiply(rec_X, projMat))
        proj_basis_set.append(np.multiply(Mk, projMat))
        # coordinate-wise content of rec_X
        rec_Xp = np.matrix([rec_X[el] for el in proj_coo])
        rec_Xp = np.transpose(rec_Xp)
        Mp = np.matrix([Mk[el] for el in proj_coo])
        Mp = np.transpose(Mp)
        if count > 0:
            M = rec_Xp
            M = np.hstack([M, Mp])
        else:
            M = Mp
        if count == 0:
            u_set = np.matrix(np.transpose(u))
            v_set = np.matrix(vt)
        else:
            u_set = np.vstack([u_set, np.transpose(u)])
            v_set = np.vstack([v_set, vt])
        # solving for coefficients; len(alpha_k) == 2
        alpha_k = solve(M, Yp)
        # update coefficients
        if count == 0:
            theta_k = alpha_k
        elif count == 1:
            theta_k[0] = theta_k[0] * alpha_k[0]
            theta_k.extend([alpha_k[1]])
        else:
            for c in range(maxIter - 1):
                theta_k[c] = theta_k[c] * alpha_k[0]
            theta_k.extend([alpha_k[1]])
        # reconstruction of the projected matrix
        rec_X = reconstruct_proj(proj_basis_set, alpha_k)
        if verbose:
            # print('res norm: ' + str(np.linalg.norm(rec_X, 'fro')))
            print("iter:" + str(maxIter) + " err: " +
                  str(np.linalg.norm(rec_X - Y_proj, 'fro')))
            # print("coeffs: " + str(theta_k))
        Z_inter = reconstruct_full(u_set, v_set, theta_k, biasMat)
        if verbose:
            err = np.linalg.norm(np.multiply(projMatC - projMat, Z_inter - C),
                                 'fro') / np.sqrt(np.sum(projMatC - projMat))
            print("rmse full: " + str(err))
            rmse_list.append(err)
    # full reconstruction
    Z_final = reconstruct_full(u_set, v_set, theta_k, biasMat)
    return [Z_final, rmse_list]
    row_indices = []
    col_indices = []
    data_rating = []
    lines = netflix_file.collect()
    for line in lines:
        line_array = line.split(",")
        row_indices.append(int(line_array[0]) - 1)
        col_indices.append(int(line_array[1]) - 1)
        data_rating.append(float(line_array[2]))
    return csr_matrix((data_rating, (row_indices, col_indices)))

if __name__ == "__main__":
    # create the Spark context
    sc = SparkContext(appName="SVD Solver for Netflix Data")
    # input file
    netflix_file = sc.textFile("nf_subsample.csv")
    sparse_data = CSV_to_sparse(netflix_file)

    # k = 20 principal components
    U, s, Vt = svds(sparse_data, 20)
    # expand s from shape (20,) to a 20 x 20 diagonal matrix for the reconstruction
    final_s = np.diag(s)
    matrix_after_svd = U.dot(final_s.dot(Vt))

    nz_index = sparse_data.nonzero()
    # original minus the reconstructed one
    difference = np.asarray(sparse_data[nz_index] - matrix_after_svd[nz_index])
    # reconstruction error
    loss_l2 = np.sum(difference**2)
    print(loss_l2)
def rankOneMatrixPursuit(Y, projMat, solRank, C, projMatC, biasMat,
                         verbose=False):
    M = list()
    rec_X = 0  # matrix reconstructed using basis matrices
    projMat_c = sp.coo_matrix(projMat)
    row_coo, col_coo, dummy = sp.find(projMat_c != 0)
    # materialize the index list: it is reused on every iteration, and a bare
    # zip() would be exhausted after its first pass in Python 3
    proj_coo = list(zip(row_coo, col_coo))
    proj_basis_set = list()
    # projection
    Y_proj = np.multiply(Y, projMat)
    # get observed entries as a list for solving linear equations to
    # estimate coefficients
    Yp = np.transpose(np.matrix([Y_proj[el] for el in proj_coo]))
    # for rank-1 matrices of the form M = u*v', storing u and v is sufficient
    u_set = list()
    v_set = list()
    rmse_list = list()
    for maxIter in range(solRank):
        # residual
        Res_k = np.matrix(Y_proj - rec_X)
        # get left and right highest singular vectors of the residual
        Res_k = sp.bsr_matrix(Res_k)
        [u, s, vt] = spl.svds(Res_k, k=1, which='LM')
        # get the rank-1 basis matrix
        Mk = u * vt
        # projection on observed entries
        proj_basis_set.append(np.multiply(Mk, projMat))
        # get the entries for solving the linear equations
        Mp = np.matrix([Mk[el] for el in proj_coo])
        Mp = np.transpose(Mp)
        if len(M) == 0:
            M = Mp
            u_set = np.matrix(np.transpose(u))
            v_set = np.matrix(vt)
        else:
            M = np.hstack([M, Mp])
            u_set = np.vstack([u_set, np.transpose(u)])
            v_set = np.vstack([v_set, vt])
        # solving for coefficients of the basis matrices
        theta_k = solve(M, Yp)
        # performing reconstruction on the projected set
        rec_X = reconstruct_proj(proj_basis_set, theta_k)
        if verbose:
            # print('res norm: ' + str(np.linalg.norm(rec_X, 'fro')))
            print("iter:" + str(maxIter) + " err: " +
                  str(np.linalg.norm(rec_X - Y_proj, 'fro')))
            # print("coeffs: " + str(theta_k))
        if verbose:
            Z_inter = reconstruct_full(u_set, v_set, theta_k, biasMat)
            err = np.linalg.norm(np.multiply(projMatC - projMat, Z_inter - C),
                                 'fro') / np.sqrt(np.sum(projMatC - projMat))
            print("rmse full: " + str(err))
            rmse_list.append(err)
            print(theta_k)
    Z_final = reconstruct_full(u_set, v_set, theta_k, biasMat)
    return [Z_final, rmse_list]
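# The core step of rank-one matrix pursuit, in isolation: at each iteration
# the top singular pair of the masked residual supplies a rank-1 basis
# matrix, and the coefficients of all bases are refit by least squares on
# the observed entries. A self-contained sketch on a random low-rank target
# (this stands in for the helpers above, which are defined elsewhere):
import numpy as np
from scipy.sparse.linalg import svds

rng = np.random.RandomState(0)
Y = rng.randn(30, 4) @ rng.randn(4, 20)   # rank-4 target
mask = rng.rand(30, 20) < 0.5             # observed entries
Y_proj = Y * mask

rec, bases = np.zeros_like(Y), []
for it in range(6):
    # rank-1 basis from the top singular pair of the masked residual
    u, s, vt = svds(Y_proj - rec * mask, k=1, which='LM')
    bases.append((u @ vt) * mask)
    # refit all coefficients by least squares on the observed entries
    M = np.column_stack([b[mask] for b in bases])
    theta, *_ = np.linalg.lstsq(M, Y_proj[mask], rcond=None)
    rec = sum(t * b for t, b in zip(theta, bases))
    print(it, np.linalg.norm(rec - Y_proj))  # residual norm shrinks each pass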
def svd_wrapper(matrix, mode, ncomp, debug, verbose, usv=False):
    """ Wrapper for different SVD libraries with the option of showing the
    cumulative explained variance ratio.

    Note:
    ----
    Sklearn.PCA deprecated as it uses linalg.svd(X, full_matrices=False)
    under the hood, which is already included.
    Sklearn.RandomizedPCA deprecated as it uses sklearn.randomized_svd
    which is already included.
    """
    if matrix.ndim != 2:
        raise TypeError('Input matrix is not a 2d array')

    def reconstruction(ncomp, U, S, V, var=1):
        if mode == 'lapack':
            rec_matrix = np.dot(U[:, :ncomp],
                                np.dot(np.diag(S[:ncomp]), V[:ncomp]))
            rec_matrix = rec_matrix.T
            print('  Matrix reconstruction with {:} PCs:'.format(ncomp))
            print('  Mean Absolute Error =',
                  mean_absolute_error(matrix, rec_matrix))
            print('  Mean Squared Error =',
                  mean_squared_error(matrix, rec_matrix))

            exp_var = S**2
            full_var = np.sum(S**2)
            explained_variance_ratio = exp_var / full_var  # % of variance explained by each PC
            ratio_cumsum = np.cumsum(explained_variance_ratio)
        elif mode == 'eigen':
            exp_var = S**2  # squared because we previously took the sqrt of the EVals
            full_var = np.sum(S**2)
            explained_variance_ratio = exp_var / full_var  # % of variance explained by each PC
            ratio_cumsum = np.cumsum(explained_variance_ratio)
        else:
            rec_matrix = np.dot(U, np.dot(np.diag(S), V))
            print('  Matrix reconstruction MAE =',
                  mean_absolute_error(matrix, rec_matrix))
            exp_var = (S**2) / matrix.shape[0]
            full_var = np.var(matrix, axis=0).sum()
            explained_variance_ratio = exp_var / full_var  # % of variance explained by each PC
            if var == 1:
                pass
            else:
                explained_variance_ratio = explained_variance_ratio[::-1]
            ratio_cumsum = np.cumsum(explained_variance_ratio)
            msg = ' This info makes sense when the matrix is mean centered '
            msg += '(temp-mean scaling)'
            print(msg)

        lw = 2
        alpha = 0.4
        fig = plt.figure(figsize=(6, 3))
        fig.subplots_adjust(wspace=0.4)
        ax1 = plt.subplot2grid((1, 3), (0, 0), colspan=2)
        ax1.step(list(range(explained_variance_ratio.shape[0])),
                 explained_variance_ratio, alpha=alpha, where='mid',
                 label='Individual EVR', lw=lw)
        ax1.plot(ratio_cumsum, '.-', alpha=alpha,
                 label='Cumulative EVR', lw=lw)
        ax1.legend(loc='best', frameon=False, fontsize='medium')
        ax1.set_ylabel('Explained variance ratio (EVR)')
        ax1.set_xlabel('Principal components')
        ax1.grid(linestyle='solid', alpha=0.2)
        ax1.set_xlim(-10, explained_variance_ratio.shape[0] + 10)
        ax1.set_ylim(0, 1)

        trunc = 20
        ax2 = plt.subplot2grid((1, 3), (0, 2), colspan=1)
        # plt.setp(ax2.get_yticklabels(), visible=False)
        ax2.step(list(range(trunc)), explained_variance_ratio[:trunc],
                 alpha=alpha, where='mid', lw=lw)
        ax2.plot(ratio_cumsum[:trunc], '.-', alpha=alpha, lw=lw)
        ax2.set_xlabel('Principal components')
        ax2.grid(linestyle='solid', alpha=0.2)
        ax2.set_xlim(-2, trunc + 2)
        ax2.set_ylim(0, 1)

        msg = ' Cumulative explained variance ratio for {:} PCs = {:.5f}'
        # plt.savefig('figure.pdf', dpi=300, bbox_inches='tight')
        print(msg.format(ncomp, ratio_cumsum[ncomp - 1]))

    if ncomp > min(matrix.shape[0], matrix.shape[1]):
        msg = '{:} PCs can be obtained from a matrix with size [{:},{:}].'
        msg += ' Increase the size of the patches or decrease the number of'
        msg += ' principal components.'
        raise RuntimeError(msg.format(ncomp, matrix.shape[0], matrix.shape[1]))

    if mode == 'eigen':
        # In our data n_frames is always smaller than n_pixels. In this
        # setting, taking the covariance as np.dot(matrix.T, matrix) would
        # give all (n_pixels) eigenvectors, but it is much slower and takes
        # more memory.
        M = np.dot(matrix, matrix.T)  # covariance matrix
        e, EV = linalg.eigh(M)  # eigenvalues and eigenvectors
        pc = np.dot(EV.T, matrix)  # PCs using a compact trick when cov is MM'
        V = pc[::-1]  # reverse since last eigenvectors are the ones we want
        S = np.sqrt(e)[::-1]  # reverse since eigenvalues are in increasing order
        if debug:
            reconstruction(ncomp, None, S, None)
        for i in range(V.shape[1]):
            V[:, i] /= S  # scaling by the square root of eigenvalues
        V = V[:ncomp]
        if verbose:
            print('Done PCA with numpy linalg eigh functions')

    elif mode == 'lapack':
        # In our data n_frames is always smaller than n_pixels. In this
        # setting, taking the SVD of M' and keeping the left (transposed)
        # SVs is faster than taking the SVD of M and taking the right ones.
        U, S, V = linalg.svd(matrix.T, full_matrices=False)
        if debug:
            reconstruction(ncomp, U, S, V)
        V = V[:ncomp]  # we cut the projection matrix according to the # of PCs
        U = U[:, :ncomp]
        S = S[:ncomp]
        if verbose:
            print('Done SVD/PCA with numpy SVD (LAPACK)')

    elif mode == 'arpack':
        U, S, V = svds(matrix, k=ncomp)
        if debug:
            reconstruction(ncomp, U, S, V, -1)
        if verbose:
            print('Done SVD/PCA with scipy sparse SVD (ARPACK)')

    elif mode == 'randsvd':
        U, S, V = randomized_svd(matrix, n_components=ncomp, n_iter=2,
                                 transpose='auto', random_state=None)
        if debug:
            reconstruction(ncomp, U, S, V)
        if verbose:
            print('Done SVD/PCA with randomized SVD')

    else:
        raise TypeError('The SVD mode is not available')

    if usv:
        if mode == 'lapack':
            return U.T, S, V.T
        else:
            return U, S, V
    else:
        if mode == 'lapack':
            return U.T
        else:
            return V
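# A hedged usage sketch for svd_wrapper, assuming a frames-by-pixels stack
# that has already been mean-centered (the toy cube here is random data):
import numpy as np

cube = np.random.RandomState(0).randn(50, 400)  # 50 frames, 400 pixels
cube -= cube.mean(axis=0)                       # temporal mean subtraction
pcs = svd_wrapper(cube, mode='arpack', ncomp=5, debug=False, verbose=True)
print(pcs.shape)  # (5, 400): one principal component per row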
def err(tol):
    _, s2, _ = svds(A, k=k, v0=np.ones(n), solver=self.solver, tol=tol)
    return np.linalg.norm((s2 - s[k - 1::-1]) / s[k - 1::-1])
def _fit_truncated(self, X, n_components, svd_solver):
    """Fit the model by computing truncated SVD (by ARPACK or randomized)
    on X
    """
    n_samples, n_features = X.shape

    if isinstance(n_components, six.string_types):
        raise ValueError("n_components=%r cannot be a string "
                         "with svd_solver='%s'"
                         % (n_components, svd_solver))
    elif not 1 <= n_components <= min(n_samples, n_features):
        raise ValueError("n_components=%r must be between 1 and "
                         "min(n_samples, n_features)=%r with "
                         "svd_solver='%s'"
                         % (n_components, min(n_samples, n_features),
                            svd_solver))
    elif not isinstance(n_components, (numbers.Integral, np.integer)):
        raise ValueError("n_components=%r must be of type int "
                         "when greater than or equal to 1, was of type=%r"
                         % (n_components, type(n_components)))
    elif svd_solver == 'arpack' and n_components == min(n_samples,
                                                        n_features):
        raise ValueError("n_components=%r must be strictly less than "
                         "min(n_samples, n_features)=%r with "
                         "svd_solver='%s'"
                         % (n_components, min(n_samples, n_features),
                            svd_solver))

    random_state = check_random_state(self.random_state)

    # Center data
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_

    if svd_solver == 'arpack':
        # random init solution, as ARPACK does it internally
        v0 = random_state.uniform(-1, 1, size=min(X.shape))
        U, S, V = svds(X, k=n_components, tol=self.tol, v0=v0)
        # svds doesn't abide by scipy.linalg.svd/randomized_svd
        # conventions, so reverse its outputs.
        S = S[::-1]
        # flip eigenvectors' sign to enforce deterministic output
        U, V = svd_flip(U[:, ::-1], V[::-1])

    elif svd_solver == 'randomized':
        # sign flipping is done inside
        U, S, V = randomized_svd(X, n_components=n_components,
                                 n_iter=self.iterated_power,
                                 flip_sign=True,
                                 random_state=random_state)

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = V
    self.n_components_ = n_components

    # Get variance explained by singular values
    self.explained_variance_ = (S ** 2) / (n_samples - 1)
    total_var = np.var(X, ddof=1, axis=0)
    self.explained_variance_ratio_ = \
        self.explained_variance_ / total_var.sum()
    self.singular_values_ = S.copy()  # Store the singular values.

    if self.n_components_ < min(n_features, n_samples):
        self.noise_variance_ = (total_var.sum() -
                                self.explained_variance_.sum())
        self.noise_variance_ /= min(n_features, n_samples) - n_components
    else:
        self.noise_variance_ = 0.

    return U, S, V
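# The reversal in the arpack branch above exists because ARPACK's svds
# returns singular values in ascending order, whereas scipy.linalg.svd and
# randomized_svd return them in descending order. A quick check:
import numpy as np
from scipy.linalg import svd
from scipy.sparse.linalg import svds

X = np.random.RandomState(0).randn(12, 7)
_, S_full, _ = svd(X, full_matrices=False)
_, S_arp, _ = svds(X, k=4)
print(S_full[:4])   # descending
print(S_arp[::-1])  # the same four values once reversed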