def get_bond_order(bofile, job_info, num_sv=4): metal_ind = job_info['metal_ind'] natoms = job_info['natoms'] dict_bondorder = OrderedDict() catoms = [metal_ind] + job_info['catoms'] dict_patterns = {} for catom in catoms: dict_patterns[catom] = [metal_ind, catom] botext = list() with open(bofile, 'r') as fo: for line in fo: if "bond order list" in line: botext = list() else: botext.append(line) bo_mat = np.zeros(shape=(natoms, natoms)) for line in botext: ll = line.split() row_idx, col_idx = int(ll[0]), int(ll[1]) bo_mat[row_idx, col_idx] = float(ll[2]) bo_mat[col_idx, row_idx] = float(ll[2]) U, Sigma, VT = randomized_svd(bo_mat, n_components=num_sv, n_iter=20) sigma = Sigma.tolist() for sv in range(num_sv): dict_bondorder.update({'bo_sv%d' % sv: sigma[sv]}) bo_mat_off_diag = bo_mat.copy() np.fill_diagonal(bo_mat_off_diag, 0) _U, _Sigma, _VT = randomized_svd(bo_mat_off_diag, n_components=num_sv, n_iter=20) _sigma = _Sigma.tolist() for sv in range(num_sv): dict_bondorder.update({'bo_offsv%d' % sv: _sigma[sv]}) for catom, vals in dict_patterns.items(): dict_bondorder.update({'bo_%d' % catom: bo_mat[vals[0], vals[1]]}) dict_bondorder = symmetricalize_dict(job_info, feature_dict=dict_bondorder) return dict_bondorder
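A minimal sketch of the idea behind the snippet above: the leading singular values of a symmetric bond-order matrix serve as permutation-invariant descriptors. The 5-atom matrix below is hypothetical, and the feature names simply mirror the 'bo_sv%d' convention used above.

import numpy as np
from sklearn.utils.extmath import randomized_svd

# hypothetical symmetric bond-order matrix for a 5-atom fragment
bo_mat = np.array([[0.0, 1.9, 0.1, 0.1, 0.1],
                   [1.9, 0.0, 1.0, 0.0, 0.0],
                   [0.1, 1.0, 0.0, 1.0, 0.0],
                   [0.1, 0.0, 1.0, 0.0, 1.0],
                   [0.1, 0.0, 0.0, 1.0, 0.0]])

# leading singular values as compact descriptors, as in get_bond_order
_, sigma, _ = randomized_svd(bo_mat, n_components=3, n_iter=20, random_state=0)
features = {'bo_sv%d' % sv: val for sv, val in enumerate(sigma)}
print(features)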
def test_randomized_svd_low_rank(): """Check that extmath.randomized_svd is consistent with linalg.svd""" n_samples = 100 n_features = 500 rank = 5 k = 10 # generate a matrix X of approximate effective rank `rank` and no noise # component (very structured signal): X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=rank, tail_strength=0.0, random_state=0) assert_equal(X.shape, (n_samples, n_features)) # compute the singular values of X using the slow exact method U, s, V = linalg.svd(X, full_matrices=False) # compute the singular values of X using the fast approximate method Ua, sa, Va = randomized_svd(X, k) assert_equal(Ua.shape, (n_samples, k)) assert_equal(sa.shape, (k,)) assert_equal(Va.shape, (k, n_features)) # ensure that the singular values of both methods are equal up to the real # rank of the matrix assert_almost_equal(s[:k], sa) # check the singular vectors too (while not checking the sign) assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va)) # check the sparse matrix representation X = sparse.csr_matrix(X) # compute the singular values of X using the fast approximate method Ua, sa, Va = randomized_svd(X, k) assert_almost_equal(s[:rank], sa[:rank])
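For readers skimming the test above, here is a self-contained version of the core comparison it makes (exact LAPACK singular values versus randomized estimates on a noise-free low-rank matrix); the tolerance loosely mirrors the test's 7-decimal check.

import numpy as np
from scipy import linalg
from sklearn.datasets import make_low_rank_matrix
from sklearn.utils.extmath import randomized_svd

X = make_low_rank_matrix(n_samples=100, n_features=500,
                         effective_rank=5, tail_strength=0.0, random_state=0)

_, s_exact, _ = linalg.svd(X, full_matrices=False)       # slow, exact
_, s_approx, _ = randomized_svd(X, 10, random_state=0)   # fast, approximate

print(np.allclose(s_exact[:10], s_approx, atol=1e-6))    # expected: True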
def test_randomized_svd_sign_flip_with_transpose(): # Check if the randomized_svd sign flipping is always done based on u # irrespective of transpose. # See https://github.com/scikit-learn/scikit-learn/issues/5608 # for more details. def max_loading_is_positive(u, v): """ returns bool tuple indicating if the values maximising np.abs are positive across all rows for u and across all columns for v. """ u_based = (np.abs(u).max(axis=0) == u.max(axis=0)).all() v_based = (np.abs(v).max(axis=1) == v.max(axis=1)).all() return u_based, v_based mat = np.arange(10 * 8).reshape(10, -1) # Without transpose u_flipped, _, v_flipped = randomized_svd(mat, 3, flip_sign=True) u_based, v_based = max_loading_is_positive(u_flipped, v_flipped) assert_true(u_based) assert_false(v_based) # With transpose u_flipped_with_transpose, _, v_flipped_with_transpose = randomized_svd( mat, 3, flip_sign=True, transpose=True) u_based, v_based = max_loading_is_positive( u_flipped_with_transpose, v_flipped_with_transpose) assert_true(u_based) assert_false(v_based)
def test_randomized_svd_power_iteration_normalizer(): # randomized_svd with power_iteration_normalized='none' diverges for # large number of power iterations on this dataset rng = np.random.RandomState(42) X = make_low_rank_matrix(100, 500, effective_rank=50, random_state=rng) X += 3 * rng.randint(0, 2, size=X.shape) n_components = 50 # Check that it diverges with many (non-normalized) power iterations U, s, V = randomized_svd(X, n_components, n_iter=2, power_iteration_normalizer='none') A = X - U.dot(np.diag(s).dot(V)) error_2 = linalg.norm(A, ord='fro') U, s, V = randomized_svd(X, n_components, n_iter=20, power_iteration_normalizer='none') A = X - U.dot(np.diag(s).dot(V)) error_20 = linalg.norm(A, ord='fro') assert_greater(np.abs(error_2 - error_20), 100) for normalizer in ['LU', 'QR', 'auto']: U, s, V = randomized_svd(X, n_components, n_iter=2, power_iteration_normalizer=normalizer, random_state=0) A = X - U.dot(np.diag(s).dot(V)) error_2 = linalg.norm(A, ord='fro') for i in [5, 10, 50]: U, s, V = randomized_svd(X, n_components, n_iter=i, power_iteration_normalizer=normalizer, random_state=0) A = X - U.dot(np.diag(s).dot(V)) error = linalg.norm(A, ord='fro') assert_greater(15, np.abs(error_2 - error))
def test_randomized_svd_infinite_rank(): """Check that extmath.randomized_svd can handle noisy matrices""" n_samples = 100 n_features = 500 rank = 5 k = 10 # let us try again without 'low_rank component': just regularly but slowly # decreasing singular values: the rank of the data matrix is infinite X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=rank, tail_strength=1.0, random_state=0) assert_equal(X.shape, (n_samples, n_features)) # compute the singular values of X using the slow exact method _, s, _ = linalg.svd(X, full_matrices=False) # compute the singular values of X using the fast approximate method # without the iterated power method _, sa, _ = randomized_svd(X, k, n_iter=0) # the approximation does not tolerate the noise: assert_greater(np.abs(s[:k] - sa).max(), 0.1) # compute the singular values of X using the fast approximate method with # iterated power method _, sap, _ = randomized_svd(X, k, n_iter=5) # the iterated power method is still managing to get most of the structure # at the requested rank assert_almost_equal(s[:k], sap, decimal=3)
def test_randomized_svd_low_rank_with_noise(): """Check that extmath.randomized_svd can handle noisy matrices""" n_samples = 100 n_features = 500 rank = 5 k = 10 # generate a matrix X with a structured approximate rank `rank` and an # important noisy component X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=rank, tail_strength=0.5, random_state=0) assert_equal(X.shape, (n_samples, n_features)) # compute the singular values of X using the slow exact method _, s, _ = linalg.svd(X, full_matrices=False) # compute the singular values of X using the fast approximate method # without the iterated power method _, sa, _ = randomized_svd(X, k, n_iter=0) # the approximation does not tolerate the noise: assert_greater(np.abs(s[:k] - sa).max(), 0.05) # compute the singular values of X using the fast approximate method with # iterated power method _, sap, _ = randomized_svd(X, k, n_iter=5) # the iterated power method helps to get rid of the noise: assert_almost_equal(s[:k], sap, decimal=3)
def test_randomized_svd_transpose_consistency(): """Check that transposing the design matrix has limited impact""" n_samples = 100 n_features = 500 rank = 4 k = 10 X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=rank, tail_strength=0.5, random_state=0) assert_equal(X.shape, (n_samples, n_features)) U1, s1, V1 = randomized_svd(X, k, n_iter=3, transpose=False, random_state=0) U2, s2, V2 = randomized_svd(X, k, n_iter=3, transpose=True, random_state=0) U3, s3, V3 = randomized_svd(X, k, n_iter=3, transpose='auto', random_state=0) U4, s4, V4 = linalg.svd(X, full_matrices=False) assert_almost_equal(s1, s4[:k], decimal=3) assert_almost_equal(s2, s4[:k], decimal=3) assert_almost_equal(s3, s4[:k], decimal=3) assert_almost_equal(np.dot(U1, V1), np.dot(U4[:, :k], V4[:k, :]), decimal=2) assert_almost_equal(np.dot(U2, V2), np.dot(U4[:, :k], V4[:k, :]), decimal=2) # in this case 'auto' is equivalent to transpose=True assert_almost_equal(s2, s3)
def get_svd_learn_clusters(accu_path, data=None, sing_threshold=2.0, assign_clstr=0.1, vis=False): """First runs the decomposition for maximum number of singular values. Then reruns on a subset > than some value""" (N, f) = data.shape all_components = min(N,f) U, Sigma, VT = randomized_svd(data, n_components=all_components, n_iter=5, random_state=None) # print "Sigma:", Sigma best_components = sum(Sigma > sing_threshold) U, Sigma, VT = randomized_svd(data, n_components=best_components, n_iter=5, random_state=None) pred_labels = [np.argmax(doc) if np.max(doc) > assign_clstr else 100 for doc in U] # print "predicted classes:", pred_labels utils.screeplot(accu_path, Sigma, all_components, vis) """Plot a graph for each right singular vector (VT)""" max_, min_ = 0, 100 min_=100 for i in VT: if max(i)>max_: max_ = max(i) if min(i)<min_: min_ = min(i) if vis: with open(accu_path + "/graphlets.p", 'r') as f: graphlets = pickle.load(f) for i, vocabulary in enumerate(VT): title = 'Latent Concept %s' % i utils.genome(accu_path, vocabulary, [min_, max_], title) if vis: for c, v in enumerate(vocabulary): if v > 0.1: print "\n",c, graphlets[c] return U, Sigma, VT
def _randomized_dpca(self,X,mXs,pinvX=None): """ Solves the dPCA minimization problem analytically by using a randomized SVD solver from sklearn. Returns ------- P : dict mapping strings to array-like, Holds encoding matrices for each term in variance decompositions (used to transform data to low-dimensional space). D : dict mapping strings to array-like, Holds decoding matrices for each term in variance decompositions (used in inverse_transform to map from low-dimensional representation back to original data space). """ n_features = X.shape[0] rX = X.reshape((n_features,-1)) pinvX = pinv(rX) if pinvX is None else pinvX P, D = {}, {} for key in list(mXs.keys()): mX = mXs[key].reshape((n_features,-1)) # called X_phi in paper C = np.dot(mX,pinvX) if isinstance(self.n_components,dict): U,s,V = randomized_svd(np.dot(C,rX),n_components=self.n_components[key],n_iter=self.n_iter,random_state=np.random.randint(10e5)) else: U,s,V = randomized_svd(np.dot(C,rX),n_components=self.n_components,n_iter=self.n_iter,random_state=np.random.randint(10e5)) P[key] = U D[key] = np.dot(U.T,C).T return P, D
def test_randomized_svd_sign_flip(): a = np.array([[2.0, 0.0], [0.0, 1.0]]) u1, s1, v1 = randomized_svd(a, 2, flip_sign=True, random_state=41) for seed in range(10): u2, s2, v2 = randomized_svd(a, 2, flip_sign=True, random_state=seed) assert_almost_equal(u1, u2) assert_almost_equal(v1, v2) assert_almost_equal(np.dot(u2 * s2, v2), a) assert_almost_equal(np.dot(u2.T, u2), np.eye(2)) assert_almost_equal(np.dot(v2.T, v2), np.eye(2))
def _svd(self, array, n_components, n_discard): """Returns first `n_components` left and right singular vectors u and v, discarding the first `n_discard`. """ if self.svd_method == "randomized": kwargs = {} if self.n_svd_vecs is not None: kwargs["n_oversamples"] = self.n_svd_vecs u, _, vt = randomized_svd(array, n_components, random_state=self.random_state, **kwargs) elif self.svd_method == "arpack": u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs) if np.any(np.isnan(vt)): # some eigenvalues of A * A.T are negative, causing # sqrt() to be np.nan. This causes some vectors in vt # to be np.nan. _, v = eigsh(safe_sparse_dot(array.T, array), ncv=self.n_svd_vecs) vt = v.T if np.any(np.isnan(u)): _, u = eigsh(safe_sparse_dot(array, array.T), ncv=self.n_svd_vecs) assert_all_finite(u) assert_all_finite(vt) u = u[:, n_discard:] vt = vt[n_discard:] return u, vt.T
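The kwargs plumbing above maps n_svd_vecs onto randomized_svd's n_oversamples parameter; here is a minimal direct call with that parameter on synthetic data (the matrix size and values are purely illustrative).

import numpy as np
from sklearn.utils.extmath import randomized_svd

A = np.random.RandomState(0).rand(300, 80)
# extra oversampling buys a better range approximation at modest extra cost
u, s, vt = randomized_svd(A, n_components=6, n_oversamples=20, random_state=0)
print(u.shape, s.shape, vt.shape)   # (300, 6) (6,) (6, 80)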
def fit(self, X, y): self.work_titles = {} for work in Work.objects.values('id', 'title'): self.work_titles[work['id']] = work['title'] work_ids = list(Rating.objects.values_list('work_id', flat=True).distinct()) nb_works = len(work_ids) self.inv_work = {work_ids[i]: i for i in range(nb_works)} user_ids = list(User.objects.values_list('id', flat=True)) nb_users = len(user_ids) self.inv_user = {user_ids[i]: i for i in range(nb_users)} self.chrono.save('get_work_ids') # print("Computing M: (%i × %i)" % (nb_users, nb_works)) self.M = lil_matrix((nb_users, nb_works)) """ratings_of = {} for (user_id, work_id), rating in zip(X, y): ratings_of.setdefault(user_id, []).append(rating)""" for (user_id, work_id), rating in zip(X, y): self.M[self.inv_user[user_id], self.inv_work[work_id]] = rating #- np.mean(ratings_of[user_id]) # np.save('backupM', self.M) self.chrono.save('fill matrix') # Ranking computation self.U, self.sigma, self.VT = randomized_svd(self.M, NB_COMPONENTS, n_iter=3, random_state=42) # print('Formes', self.U.shape, self.sigma.shape, self.VT.shape) self.save('backup.pickle') self.chrono.save('factor matrix')
def _fit(self, gn): from sklearn.utils.validation import check_random_state from sklearn.utils.extmath import randomized_svd # apply scaling gn = self.scaler_.fit(gn).transform(gn) # transpose for svd # TODO eliminate need for transposition x = gn.T n_samples, n_features = x.shape # intermediates random_state = check_random_state(self.random_state) n_components = self.n_components n_samples, n_features = x.shape # singular value decomposition u, s, v = randomized_svd(x, n_components, n_iter=self.iterated_power, random_state=random_state) # calculate explained variance self.explained_variance_ = exp_var = (s ** 2) / n_samples full_var = np.var(x, axis=0).sum() self.explained_variance_ratio_ = exp_var / full_var # store components self.components_ = v return u, s, v
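A standalone sketch of the explained-variance bookkeeping in the snippet above, on synthetic centered data. The division by n_samples follows the snippet; note that scikit-learn's PCA divides by n_samples - 1 instead.

import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
x = rng.randn(500, 50) * np.linspace(10.0, 1.0, 50)   # features with decaying scale
x = x - x.mean(axis=0)                                # center, as a scaler would

u, s, v = randomized_svd(x, 5, n_iter=3, random_state=0)
explained_variance = (s ** 2) / x.shape[0]
explained_variance_ratio = explained_variance / np.var(x, axis=0).sum()
print(explained_variance_ratio)     # fraction of total variance per component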
def gsvd(X, M, A, n_comps = 10): """ Generalized SVD :param X: :param M: :param A: :return: """ print("GSVD") print("GSVD: Weights... ", end='') Xw = np.dot(np.sqrt(M), np.dot(X, np.sqrt(A))) print("Done!") print("GSVD: SVD... ", end='') [P_, D, Q_] = randomized_svd(Xw, n_comps) #P_ = P_[:,0:n_comps] #D = D[0:n_comps] #Q_ = Q_[0:n_comps,:] print('Done!') print("GSVD: Factor scores and eigenvalues... ", end='') Mp = np.power(np.diag(M), -0.5) Ap = np.power(np.diag(A), -0.5) P = np.dot(np.diag(Mp), P_) Q = np.dot(np.diag(Ap), Q_.T) ev = np.power(D, 2) print('Done!') return P, D, Q, ev
def _max_singular_value(self, X_filled): # quick decomposition of X_filled into rank-1 SVD _, s, _ = randomized_svd( X_filled, 1, n_iter=5) return s[0]
def _svd_step(self, X, shrinkage_value, max_rank=None): """ Returns reconstructed X from low-rank thresholded SVD and the rank achieved. """ if max_rank: # if we have a max rank then perform the faster randomized SVD (U, s, V) = randomized_svd( X, max_rank, n_iter=self.n_power_iterations) else: # perform a full-rank SVD using LAPACK (np.linalg.svd) (U, s, V) = np.linalg.svd( X, full_matrices=False, compute_uv=True) s_thresh = np.maximum(s - shrinkage_value, 0) rank = (s_thresh > 0).sum() s_thresh = s_thresh[:rank] U_thresh = U[:, :rank] V_thresh = V[:rank, :] S_thresh = np.diag(s_thresh) X_reconstruction = np.dot(U_thresh, np.dot(S_thresh, V_thresh)) return X_reconstruction, rank
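A toy, self-contained version of the soft-thresholded SVD step above; the shrinkage value and rank cap are arbitrary illustrations, not values used by the original class.

import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
X = rng.randn(200, 30).dot(rng.randn(30, 100))        # rank-30 matrix

U, s, V = randomized_svd(X, 20, n_iter=7, random_state=0)   # capped at rank 20
s_thresh = np.maximum(s - 25.0, 0)                    # soft-threshold the spectrum
rank = int((s_thresh > 0).sum())
X_rec = U[:, :rank].dot(np.diag(s_thresh[:rank])).dot(V[:rank, :])
print(rank, round(np.linalg.norm(X - X_rec), 2))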
def select_factorization_algorithm(factorization_algo, corpus=None, doc_count=0, num_features=2): U, S, V = None, None, None if factorization_algo == FactorizationAlgorithm.linear_svd: A = [[]] for doc_id, word_id, value in corpus: if len(A) < doc_id + 1: A.append([]) A[doc_id].append(value) U, S, V = linalg.svd(A) elif factorization_algo == FactorizationAlgorithm.randomized_svd: A = [[]] for doc_id, word_id, value in corpus: if len(A) < doc_id + 1: A.append([]) A[doc_id].append(value) U, S, V = randomized_svd(numpy.array(A), n_components=num_features) elif factorization_algo == FactorizationAlgorithm.gradient_descent: N = doc_count M = len(corpus.dictionary.items()) K = num_features P = numpy.random.uniform(low=-0.01, high=0.01, size=(N, K)) Q = numpy.random.uniform(low=-0.01, high=0.01, size=(M, K)) # P = numpy.full((N, K), 0.1) # Q = numpy.full((M, K), 0.1) U, V = svd_factorization(corpus, P, Q, K) elif factorization_algo == FactorizationAlgorithm.gradient_descent_engine: svd_engine = SVDEngine(num_docs=doc_count, num_words=len(corpus.dictionary.items()), num_features=2) svd_engine.feature_training(corpus) U, V = svd_engine.document_profiles, svd_engine.word_profiles return U, S, V
def run(): start = datetime.now() KING_ID = User.objects.get(username='******').id anime_titles = {} anime_ids = set() rs = list(Rating.objects.all().select_related('work')) print(rs[0]) cp0 = datetime.now() print(cp0 - start) for i, rating in enumerate(rs, start=1): if i % 1000 == 0: print(i) if rating.work.id not in anime_ids: anime_ids.add(rating.work.id) anime_titles[rating.work.id] = rating.work.title cp1 = datetime.now() print(cp1 - cp0) seen_titles = set() for rating in Rating.objects.filter(user__id=KING_ID).select_related('work'): if rating.choice != 'willsee': seen_titles.add(rating.work.title) cp2 = datetime.now() print(cp2 - cp1) nb_users = max(user.id for user in User.objects.all()) nb_anime = len(anime_ids) anime_ids = list(anime_ids) inversed = {anime_ids[i]: i for i in range(nb_anime)} print("Computing X: (%i×%i)" % (nb_users, nb_anime)) cp3 = datetime.now() print(cp3 - cp2) print(nb_users, '×', nb_anime) values = {'like': 2, 'dislike': -2, 'neutral': 0.1, 'willsee': 0.5, 'wontsee': -0.5} X = lil_matrix((nb_users + 1, nb_anime + 1)) for rating in Rating.objects.select_related('work', 'user'): if rating.work.id < nb_anime: X[rating.user.id, inversed[rating.work.id]] = values[rating.choice] # Ranking computation cp4 = datetime.now() print(cp4 - cp3) U, sigma, VT = randomized_svd(X, NB_COMPONENTS, n_iter=3, random_state=42) XD = np.dot(np.dot(U, np.diag(sigma)), VT) ranking = sorted((XD[KING_ID, j], anime_titles[anime_ids[j]]) for j in range(1, nb_anime + 1) if j in anime_titles)[::-1] # Summarize the results of the ranking for KING_ID: # “=> rank, title, score” c = 0 for i, (rating, title) in enumerate(ranking, start=1): if title not in seen_titles: print('=>', i, title, rating) c += 1 elif i < 10: print(i, title, rating) if c >= 10: break print(len(connection.queries)) for line in connection.queries: print(line) end = datetime.now() print(end - start)
def ksvd(Y, D, X, n_cycles=1, verbose=True): n_atoms = D.shape[1] n_features, n_samples = Y.shape unused_atoms = [] R = Y - fast_dot(D, X) for c in range(n_cycles): for k in range(n_atoms): if verbose: sys.stdout.write("\r" + "k-svd..." + ":%3.2f%%" % ((k / float(n_atoms)) * 100)) sys.stdout.flush() # find all the datapoints that use the kth atom omega_k = X[k, :] != 0 if not np.any(omega_k): unused_atoms.append(k) continue # the residual due to all the other atoms but k Rk = R[:, omega_k] + np.outer(D[:, k], X[k, omega_k]) U, S, V = randomized_svd(Rk, n_components=1, n_iter=10, flip_sign=False) D[:, k] = U[:, 0] X[k, omega_k] = V[0, :] * S[0] # update the residual R[:, omega_k] = Rk - np.outer(D[:, k], X[k, omega_k]) print "" return D, X, unused_atoms
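The core of each K-SVD atom update above is a rank-one SVD of the residual restricted to the signals that use that atom; below is a standalone sketch of that single step on a synthetic residual (the shapes are hypothetical).

import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
Rk = rng.randn(64, 30)        # residual over the 30 signals that use atom k

U, S, V = randomized_svd(Rk, n_components=1, n_iter=10, flip_sign=False,
                         random_state=0)
d_k = U[:, 0]                 # updated dictionary atom (unit norm)
x_k = V[0, :] * S[0]          # updated sparse coefficients for those signals
print(round(np.linalg.norm(d_k), 6), x_k.shape)    # 1.0 (30,)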
def _sv_thresh(X, threshold, num_svalue): """ Perform singular value thresholding. Parameters ---------- X : array of shape [n_samples, n_features] The input array. threshold : float The threshold for the singular values. num_svalue : int The number of singular values to compute. Returns ------- X_thresh : array of shape [n_samples, n_features] The output after performing singular value thresholding. greater_sv : int The number of singular values of `X` which were greater than `threshold` (U, s, V): tuple The singular value decomposition """ m, n = X.shape U, s, V = randomized_svd(X, num_svalue) greater_sv = np.count_nonzero(s > threshold) s = _soft_thresh(s, threshold) S = np.diag(s) X_thresh = np.dot(U, np.dot(S, V)) return X_thresh, greater_sv, (U, s, V)
def apply_uv_decomposition(self): U, Sigma, VT = randomized_svd(self.behaviour_matrix, n_components=15, n_iter=10, random_state=None) print(U.shape) print(VT.shape) self.X_hat = np.dot(U, VT) # U * np.diag(Sigma)
def do_fit(X): n_samples = X.shape[0] n_components = X.shape[1] U, S, V = extmath.randomized_svd(X, n_components, n_iter=3) return U, S, V
def fast_svd(X, n_components, random_state=None): """ Automatically switch between randomized and LAPACK SVD (heuristic of scikit-learn). Parameters ========== X: array, shape (n_samples, n_features) The data to decompose n_components: integer The order of the dimensionality of the truncated SVD random_state: int or RandomState Pseudo-random number generator state used for random sampling. Returns ======== U: array, shape (n_samples, n_components) The first matrix of the truncated SVD S: array, shape (n_components) The second matrix of the truncated SVD V: array, shape (n_components, n_features) The last matrix of the truncated SVD """ random_state = check_random_state(random_state) # Small problem, just call full SVD if max(X.shape) <= 500: svd_solver = 'full' elif n_components >= 1 and n_components < .8 * min(X.shape): svd_solver = 'randomized' # This is also the case of n_components in (0,1) else: svd_solver = 'full' # Call different fits for either full or truncated SVD if svd_solver == 'full': U, S, V = linalg.svd(X, full_matrices=False) # flip eigenvectors' sign to enforce deterministic output U, V = svd_flip(U, V) # The "copy" calls are there to free the reference on the non-reduced # data, and hence clear memory early U = U[:, :n_components].copy() S = S[:n_components] V = V[:n_components].copy() else: if LooseVersion(sklearn.__version__) >= LooseVersion('0.17'): n_iter = 'auto' else: n_iter = 3 U, S, V = randomized_svd(X, n_components=n_components, n_iter=n_iter, flip_sign=True, random_state=random_state) return U, S, V
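A compact usage sketch of the same solver-selection heuristic, written standalone so it does not depend on the helper above; the 500-element and 80% thresholds mirror the snippet.

import numpy as np
from scipy import linalg
from sklearn.utils.extmath import randomized_svd

def choose_svd(X, n_components, random_state=0):
    # small problems: exact LAPACK SVD; large matrices with modest rank: randomized
    if max(X.shape) <= 500 or n_components >= 0.8 * min(X.shape):
        U, S, V = linalg.svd(X, full_matrices=False)
        return U[:, :n_components], S[:n_components], V[:n_components]
    return randomized_svd(X, n_components=n_components, random_state=random_state)

U, S, V = choose_svd(np.random.RandomState(0).randn(1000, 60), 10)
print(U.shape, S.shape, V.shape)    # (1000, 10) (10,) (10, 60)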
def run(in_file, out_path, dim=300, keep_words=None): base_embed = Explicit.load(in_file, normalize=False) if keep_words != None: base_embed = base_embed.get_subembed(keep_words) u, s, v = randomized_svd(base_embed.m, n_components=dim, n_iter=5) np.save(out_path + "-u.npy", u) np.save(out_path + "-v.npy", v) np.save(out_path + "-s.npy", s) util.write_pickle(base_embed.iw, out_path + "-vocab.pkl")
def test(self, override=False): """ Applies randomised VD to the dataset. This function does NOT write results to the hdf5 file. Call compute() to write to the file. Handles complex, compound datasets such that the V matrix is of the same data-type as the input matrix. Parameters ---------- override : bool, optional. default = False Set to true to recompute results if prior results are available. Else, returns existing results Returns ------- U : :class:`numpy.ndarray` Abundance matrix S : :class:`numpy.ndarray` variance vector V : :class:`numpy.ndarray` eigenvector matrix """ ''' Check if a number of compnents has been set and ensure that the number is less than the minimum axis length of the data. If both conditions are met, use fsvd. If not use the regular svd. C.Smith -- We might need to put a lower limit on num_comps in the future. I don't know enough about svd to be sure. ''' if not override: if isinstance(self.duplicate_h5_groups, list) and len(self.duplicate_h5_groups) > 0: self.h5_results_grp = self.duplicate_h5_groups[-1] print('Returning previously computed results from: {}'.format(self.h5_results_grp.name)) print('set the "override" flag to True to recompute results') return reshape_to_n_dims(self.h5_results_grp['U'])[0], self.h5_results_grp['S'][()], \ reshape_to_n_dims(self.h5_results_grp['V'])[0] self.h5_results_grp = None t1 = time.time() self.__u, self.__s, self.__v = randomized_svd(self.data_transform_func(self.h5_main), self.num_components, n_iter=3) self.__v = stack_real_to_target_dtype(self.__v, self.h5_main.dtype) print('Took {} to compute randomized SVD'.format(format_time(time.time() - t1))) u_mat, success = reshape_to_n_dims(self.__u, h5_pos=self.h5_main.h5_pos_inds, h5_spec=np.expand_dims(np.arange(self.__u.shape[1]), axis=0)) if not success: raise ValueError('Could not reshape U to N-Dimensional dataset! Error:' + success) v_mat, success = reshape_to_n_dims(self.__v, h5_pos=np.expand_dims(np.arange(self.__u.shape[1]), axis=1), h5_spec=self.h5_main.h5_spec_inds) if not success: raise ValueError('Could not reshape V to N-Dimensional dataset! Error:' + success) return u_mat, self.__s, v_mat
def algin_coor_sys(cs_static, cs): # A,B have shape (landmark-by-dimension) A = cs.lmrk_coors.as_matrix().transpose() B = cs_static.lmrk_coors.as_matrix().transpose() U, Sigma, V = randomized_svd(np.dot(A.transpose(), B), n_components=A.shape[0]) Q = np.dot(U, V.transpose()) W = Q A = np.dot(A, W) cs.lmrk_coors = pd.DataFrame(A.transpose(), columns=cs.lmrk_coors.columns) return W
def sv_thresh(X, t, k): m, n = X.shape U, s, V = randomized_svd(X, k) #pca(X, raw=True, k=25) # Number of singular values greater than `t` greater_sv = np.sum(s > t) s = soft_thresh(s, t) S = np.diag(s) ret = np.dot(U, np.dot(S, V)) assert ret.shape == X.shape return ret, greater_sv
def get_gradient(gradfile, job_info, num_sv=3): metal_ind = job_info['metal_ind'] natoms = job_info['natoms'] num_lines = natoms + 2 dict_gradient = OrderedDict() catoms = [metal_ind] + job_info['catoms'] with open(gradfile, 'r') as fo: gradtext = fo.readlines()[-num_lines:] grad_mat = np.zeros(shape=(natoms, 3)) for idx, line in enumerate(gradtext): ll = line.split() if ll[0] == 'terachem': dict_gradient.update({'grad_rms': float(ll[7][:-1])}) if idx > 1: grad_mat[idx - 2, :] = [float(x) for x in ll[1:]] U, Sigma, VT = randomized_svd(grad_mat, n_components=num_sv, n_iter=20) sigma = Sigma.tolist() for sv in range(num_sv): dict_gradient.update({'grad_sv%d' % sv: sigma[sv]}) for catom in catoms: dict_gradient.update({'grad_%d' % catom: np.linalg.norm(grad_mat[catom, :])}) max_norm = 0 for ii in range(natoms): _norm = np.linalg.norm(grad_mat[ii, :]) if _norm > max_norm: max_norm = _norm dict_gradient.update({'grad_maxnorm': max_norm}) grad_mat_internal = grad_mat.copy() grad_mat_internal = grad_mat_internal - grad_mat_internal[metal_ind, :] _U, _Sigma, _VT = randomized_svd(grad_mat_internal, n_components=num_sv, n_iter=20) _sigma = _Sigma.tolist() for sv in range(num_sv): dict_gradient.update({'grad_intsv%d' % sv: _sigma[sv]}) _max_norm = 0 for ii in range(natoms): _norm = np.linalg.norm(grad_mat_internal[ii, :]) if _norm > _max_norm: _max_norm = _norm dict_gradient.update({'grad_intmaxnorm': _max_norm}) dict_gradient = symmetricalize_dict(job_info, feature_dict=dict_gradient) return dict_gradient
def __set_landmarks__(self, lmrks, Dim): vertices = None if self.vertex_filter: vertices = [v for v in self.g.nodes() if self.vertex_filter(v)] L2all = self.proximity_to(lmrks, dests=vertices) L2L = L2all.loc[lmrks] U, S, _ = randomized_svd(L2L.as_matrix(), n_components=Dim) lmrk_coors = np.dot(U, np.sqrt(np.diag(S))) self.lmrk_coors = pd.DataFrame(lmrk_coors.transpose(), columns=lmrks) self.L2all = L2all
def _randomized_dpca(self,X,mXs,pinvX=None): """ Solves the dPCA minimization problem analytically by using a randomized SVD solver from sklearn. """ n_features = X.shape[0] rX = X.reshape((n_features,-1)) pinvX = pinv(rX) if pinvX is None else pinvX P, D = {}, {} for key in mXs.keys(): mX = mXs[key].reshape((n_features,-1)) C = np.dot(mX,pinvX) if isinstance(self.n_components,dict): U,s,V = randomized_svd(np.dot(C,rX),n_components=self.n_components[key],n_iter=self.n_iter,random_state=np.random.randint(10e5)) else: U,s,V = randomized_svd(np.dot(C,rX),n_components=self.n_components,n_iter=self.n_iter,random_state=np.random.randint(10e5)) P[key] = U D[key] = np.dot(U.T,C).T return P, D
def svd_dcmp(self, precision=0.01, n_terms_range=(1, np.inf)): """ Does decomposition of the covariance matrix defined by the set of points :param precision: Desired accuracy of the KL approximation, smaller eigen values are dropped. :param n_terms_range: (min, max) number of terms in KL expansion to use. The number of terms estimated from given precision is snapped to the given interval. truncated SVD: cov_mat = U*diag(ev) * V, _cov_l_factor = U[:,0:m]*sqrt(ev[0:m]) Note on number of terms: According to: C. Schwab and R. A. Todor: KL Approximation of Random Fields by Generalized Fast Multipole Method the eigen values should decay as (Proposition 2.18): lambda_m ~ sigma^2 * ( 1/gamma ) **( m**(1/d) + alpha ) / Gamma(0.5 * m**(1/d) ) where gamma = correlation length / domain diameter and alpha is the correlation exponent. Gamma is the gamma function. ... should be checked experimentally and generalized for sigma(X) :return: """ if self.cov_mat is None: self.cov_matrix() if n_terms_range[0] >= self.n_points: U, ev, VT = np.linalg.svd(self.cov_mat) m = self.n_points else: range = list(n_terms_range) range[0] = max(1, range[0]) range[1] = min(self.n_points, range[1]) prec_range = (self._eigen_value_estimate(range[0]), self._eigen_value_estimate(range[1])) if precision < prec_range[0]: m = range[0] elif precision > prec_range[1]: m = range[1] else: f = lambda m: self._eigen_value_estimate(m) - precision m = sp.optimize.bisect( f, range[0], range[1], xtol=0.5, ) m = max(m, range[0]) threshold = 2 * precision # TODO: Test if we should cut eigen values by relative (like now) or absolute value while threshold >= precision and m <= range[1]: #print("treshold: {} m: {} precision: {} max_m: {}".format(threshold, m, precision, range[1])) U, ev, VT = randomized_svd(self.cov_mat, n_components=m, n_iter=3, random_state=None) threshold = ev[-1] / ev[0] m = int(np.ceil(1.5 * m)) m = len(ev) m = min(m, range[1]) #print("KL approximation: {} for {} points.".format(m, self.n_points)) self.n_approx_terms = m self._sqrt_ev = np.sqrt(ev[0:m]) self._cov_l_factor = U[:, 0:m].dot(np.diag(self._sqrt_ev)) self.cov_mat = None return self._cov_l_factor, ev[0:m]
def foreground(m, rank=1): U, S, Vh = randomized_svd(m, rank) L = U.dot(np.diag(S)).dot(Vh) S = m - L return S
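A toy illustration of the background/foreground split above: a rank-one static background plus a few bright "moving" pixels. The low-rank reconstruction absorbs the background, and the residual highlights the sparse foreground; all data here is synthetic.

import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
background = np.outer(rng.rand(100) + 0.5, np.ones(40))          # static, rank one
frames = background.copy()
frames[rng.randint(0, 100, 20), rng.randint(0, 40, 20)] += 5.0   # sparse spikes

U, S, Vh = randomized_svd(frames, 1, random_state=0)
L = U.dot(np.diag(S)).dot(Vh)               # estimated background
residual = frames - L
print(residual.max())                       # large values flag foreground pixels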
def plot(self): colors = ['b', 'g', 'r', 'c', 'm'] self.plot_axes = 2 if self.load_from is None and self.store_to is not None: #print('Plot to CSV') self.plot_u, self.plot_sigma, self.plot_vt = randomized_svd( self.tdmatrix, n_components=self.plot_axes) csv_filename = self.path + "models/" + self.store_to csv_u = open(csv_filename + '_plot_u.csv', 'w+') csv_sigma = open(csv_filename + '_plot_sigma.csv', 'w+') csv_vt = open(csv_filename + '_plot_vt.csv', 'w+') writer_u = csv.writer(csv_u, delimiter=',') writer_sigma = csv.writer(csv_sigma, delimiter=',') writer_vt = csv.writer(csv_vt, delimiter=',') for row in self.plot_u: writer_u.writerow(row) writer_sigma.writerow(self.plot_sigma) for row in self.plot_vt: writer_vt.writerow(row) csv_u.close() csv_sigma.close() csv_vt.close() elif self.load_from is not None and self.store_to is None: #print('Plot from CSV') csv_filename = self.path + "models/" + self.load_from csv_u = open(csv_filename + '_plot_u.csv', 'r') csv_sigma = open(csv_filename + '_plot_sigma.csv', 'r') csv_vt = open(csv_filename + '_plot_vt.csv', 'r') reader_u = csv.reader(csv_u) reader_sigma = csv.reader(csv_sigma) reader_vt = csv.reader(csv_vt) self.plot_u = list() for irow, row in enumerate(reader_u): self.plot_u.append(list()) for icol, val in enumerate(row): self.plot_u[irow].append(float(val)) self.plot_sigma = list() for irow, row in enumerate(reader_sigma): for icol, val in enumerate(row): self.plot_sigma.append(float(val)) self.plot_vt = list() for irow, row in enumerate(reader_vt): self.plot_vt.append(list()) for icol, val in enumerate(row): self.plot_vt[irow].append(float(val)) csv_u.close() csv_sigma.close() csv_sigma.close() csv_vt.close() #print("Plot SVD Done") #print("U : " + str(len(self.plot_u)) + " x " + str(len(self.plot_u[0]))) #print("Sigma : " + str(len(self.plot_sigma))) #print("Vt : " + str(len(self.plot_vt)) + " x " + str(len(self.plot_vt[0]))) vectors = list() for idir, dirname in enumerate(self.directories): vectors.append(list()) for file in listdir(dirname): index = self.ldocs.index(dirname + file) vectors[idir].append( self.get_doc_vector(doc_index=index, n_dim=self.plot_axes)) X = list() Y = list() for i in range(len(vectors)): X.append(list()) Y.append(list()) for vec in vectors[i]: X[i].append(vec[0]) Y[i].append(vec[1]) for i in range(len(vectors)): plt.scatter(X[i], Y[i], c=colors[i]) plt.show()
cor_mat2 = np.corrcoef(X.T) eig_vals, eig_vecs = np.linalg.eig(cor_mat2) print('Eigenvectors \n%s' % eig_vecs) print('\nEigenvalues \n%s' % eig_vals) # SVD # TODO - SVD on X_std # What are the three matrices again? u, s, v = np.linalg.svd(X_std.T, full_matrices=True) print(s * s / 150) print(s) from sklearn.utils.extmath import randomized_svd U, Sigma, VT = randomized_svd(X_std, n_components=1) Sigma # sorting eigenpairs for ev in eig_vecs: np.testing.assert_array_almost_equal(1.0, np.linalg.norm(ev)) print('Everything ok!') # Make a list of (eigenvalue, eigenvector) tuples eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))] # Sort the (eigenvalue, eigenvector) tuples from high to low eig_pairs.sort() eig_pairs.reverse()
def my_svd(X): _, s, V = randomized_svd(X, n_components, random_state=random_state, n_iter=self.iterated_power) return s, V, squared_norm(X) - squared_norm(s)
def main(): """ Perform dimensionality reduction on a (normally PPMI) matrix by applying truncated SVD as described in Omer Levy, Yoav Goldberg, and Ido Dagan. 2015. Improving distributional similarity with lessons learned from word embeddings. Trans. ACL, 3. """ # Get the arguments args = docopt( '''Perform dimensionality reduction on a (normally PPMI) matrix by applying truncated SVD and save it in pickle format. Usage: svd.py [-l] <dsm_prefix> <dim> <gamma> <outPath> <dsm_prefix> = the prefix for the input files (.sm for the matrix, .rows and .cols) and output files (.svd) <dim> = dimensionality of low-dimensional output vectors <gamma> = eigenvalue weighting parameter <outPath> = output path for space Options: -l, --len normalize final vectors to unit length ''') is_len = args['--len'] dsm_prefix = args['<dsm_prefix>'] dim = int(args['<dim>']) gamma = float(args['<gamma>']) outPath = args['<outPath>'] logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info(__file__.upper()) start_time = time.time() # Get space with sparse matrix dsm = load_pkl_files(dsm_prefix) id2row = dsm.get_id2row() # Get matrix from space matrix_ = dsm.get_cooccurrence_matrix() # Apply SVD u, s, v = randomized_svd(matrix_.get_mat(), n_components=dim, n_iter=5, transpose=False) # Weight matrix if gamma == 0.0: matrix_ = u elif gamma == 1.0: #matrix_ = np.dot(u, np.diag(s)) # This is equivalent to the below formula (because s is a flattened diagonal matrix) matrix_ = s * u else: #matrix_ = np.dot(u, np.power(np.diag(s), gamma)) # This is equivalent to the below formula matrix_ = np.power(s, gamma) * u if is_len: # L2-normalize vectors l2norm1 = np.linalg.norm(matrix_, axis=1, ord=2) l2norm1[l2norm1 == 0.0] = 1.0 # Convert 0 values to 1 matrix_ /= l2norm1.reshape(len(l2norm1), 1) dsm = Space(DenseMatrix(matrix_), id2row, []) # Save the Space object in pickle format save_pkl_files(dsm, outPath + ".svd.dm", save_in_one_file=True, save_as_w2v=True) logging.info("--- %s seconds ---" % (time.time() - start_time))
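A small check of the eigenvalue-weighting shortcut used in the script above: multiplying u element-wise by the (gamma-powered) singular values is the same as the explicit product with the diagonal matrix. The matrix and gamma below are arbitrary.

import numpy as np
from sklearn.utils.extmath import randomized_svd

X = np.random.RandomState(0).rand(60, 40)
u, s, v = randomized_svd(X, n_components=5, n_iter=5, random_state=0)

gamma = 0.5
weighted = np.power(s, gamma) * u                 # broadcasting shortcut from the script
explicit = u.dot(np.diag(np.power(s, gamma)))     # explicit diagonal product
print(np.allclose(weighted, explicit))            # True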
from scipy.linalg import hilbert np.set_printoptions(linewidth=120) Observations = 10 Features = 4000 N = max(Observations, Features) k = 7 # Create a known ill-conditionned matrix for testing H = hilbert(N)[:Observations, :Features] print(f'Matrix of shape: [{Observations}, {Features}]') print(f'Target SVD: [{Observations}, {k}]') (U, S, Vh) = randomized_svd(H, n_components=k, n_oversamples=5, n_iter=2) print("\n#################################\n") print("U - left singular vectors") print(U) print("\n#################################\n") print("S - Singular values diagonal") print(S) print("\n#################################\n") print("Vh - transposed right singular vectors") print(Vh) # ---------------------------------------------------------------------------------------- # Matrix of shape: [10, 4000] # Target SVD: [10, 7]
def _initialize_nmf(X, n_components, init=None, eps=1e-6, random_state=None): check_non_negative(X, "NMF initialization") n_samples, n_features = X.shape if (init is not None and init != 'random' and n_components > min(n_samples, n_features)): raise ValueError( "init = '{}' can only be used when " "n_components <= min(n_samples, n_features)".format(init)) if init is None: if n_components <= min(n_samples, n_features): init = 'nndsvd' else: init = 'random' # Random initialization if init == 'random': avg = np.sqrt(X.mean() / n_components) rng = check_random_state(random_state) H = avg * rng.randn(n_components, n_features) W = avg * rng.randn(n_samples, n_components) # we do not write np.abs(H, out=H) to stay compatible with # numpy 1.5 and earlier where the 'out' keyword is not # supported as a kwarg on ufuncs np.abs(H, H) np.abs(W, W) return W, H # NNDSVD initialization U, S, V = randomized_svd(X, n_components, random_state=random_state) W, H = np.zeros(U.shape), np.zeros(V.shape) # The leading singular triplet is non-negative # so it can be used as is for initialization. W[:, 0] = np.sqrt(S[0]) * np.abs(U[:, 0]) H[0, :] = np.sqrt(S[0]) * np.abs(V[0, :]) for j in range(1, n_components): x, y = U[:, j], V[j, :] # extract positive and negative parts of column vectors x_p, y_p = np.maximum(x, 0), np.maximum(y, 0) x_n, y_n = np.abs(np.minimum(x, 0)), np.abs(np.minimum(y, 0)) # and their norms x_p_nrm, y_p_nrm = norm(x_p), norm(y_p) x_n_nrm, y_n_nrm = norm(x_n), norm(y_n) m_p, m_n = x_p_nrm * y_p_nrm, x_n_nrm * y_n_nrm # choose update if m_p > m_n: u = x_p / x_p_nrm v = y_p / y_p_nrm sigma = m_p else: u = x_n / x_n_nrm v = y_n / y_n_nrm sigma = m_n lbd = np.sqrt(S[j] * sigma) W[:, j] = lbd * u H[j, :] = lbd * v W[W < eps] = 0 H[H < eps] = 0 if init == "nndsvd": pass elif init == "nndsvda": avg = X.mean() W[W == 0] = avg H[H == 0] = avg elif init == "nndsvdar": rng = check_random_state(random_state) avg = X.mean() W[W == 0] = abs(avg * rng.randn(len(W[W == 0])) / 100) H[H == 0] = abs(avg * rng.randn(len(H[H == 0])) / 100) else: raise ValueError( 'Invalid init parameter: got %r instead of one of %r' % (init, (None, 'random', 'nndsvd', 'nndsvda', 'nndsvdar'))) return W, H
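If the goal is simply to use this initialization rather than re-implement it, scikit-learn exposes the same NNDSVD scheme through the public NMF estimator; a brief usage sketch with illustrative parameters follows.

import numpy as np
from sklearn.decomposition import NMF

X = np.abs(np.random.RandomState(0).rand(100, 40))
model = NMF(n_components=5, init='nndsvd', max_iter=500, random_state=0)
W = model.fit_transform(X)      # W, H are seeded by the NNDSVD scheme above
H = model.components_
print(round(np.linalg.norm(X - W.dot(H)), 3))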
def randSVD(X, n_components=None): if n_components is None: n_components = min(X.shape) U, S, VT = randomized_svd(X, n_components) return U, S, VT.T
assert len(sys.argv) >= 4, 'bad arguments' fmatrix = sys.argv[1] fwords = sys.argv[2] fdata = sys.argv[3] #fkeys = sys.argv[4] # load the raw matrix X = load_matrix(fmatrix) print(X.shape, X.nnz, file=sys.stderr) # compute its PPMI X = ppmi(X) print(X.shape, X.nnz, file=sys.stderr) if USE_SVD: X, _, _ = randomized_svd(X, n_components=500, n_iter=5, random_state=None) print(X.shape, file=sys.stderr) # load words words = load_words(fwords) # make word to index dict inv = {w: i for i, w in enumerate(words)} assert len(words) == X.shape[0] # load test set wpairs = load_data(fdata) results = evaluate(wpairs, inv, X) for val in results: print(val)
def _initialize_mf(M, n_components, init=None, eps=1e-6, random_state=None, non_negative=False): """Algorithms for MF initialization. Computes an initial guess for the non-negative rank k matrix approximation for M: M = AB^T Parameters ---------- M : array-like, shape (n_samples, n_features) The data matrix to be decomposed. n_components : integer The number of components desired in the approximation. init : None | 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' | 'svd' Method used to initialize the procedure. Default: 'svd' if n_components < n_features, otherwise 'random'. Valid options: - 'random': non-negative random matrices, scaled with: sqrt(X.mean() / n_components) - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD) initialization (better for sparseness) - 'nndsvda': NNDSVD with zeros filled with the average of X (better when sparsity is not desired) - 'nndsvdar': NNDSVD with zeros filled with small random values (generally faster, less accurate alternative to NNDSVDa for when sparsity is not desired) non_negative: bool Whether to decompose into non-negative matrices. eps : float If non-negative, truncate all values less then this in output to zero. random_state : int, RandomState instance or None, optional, default: None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. Used when ``random`` == 'nndsvdar' or 'random'. Returns ------- A : array-like, shape (n_samples, n_components) Initial guesses for solving M ~= AB^T B : array-like, shape (n_features, n_components) Initial guesses for solving M ~= AB^T References ---------- C. Boutsidis, E. Gallopoulos: SVD based initialization: A head start for nonnegative matrix factorization - Pattern Recognition, 2008 http://tinyurl.com/nndsvd """ if non_negative: check_non_negative(M, "MF initialization") n_samples, n_features = M.shape if init is None: if n_components < n_features: init = 'nndsvdar' if non_negative else 'svd' else: init = 'random' if init == 'random': avg = np.sqrt(np.abs(M.mean()) / n_components) rng = check_random_state(random_state) A = avg * rng.randn(n_samples, n_components) B = avg * rng.randn(n_components, n_features) if non_negative: np.abs(A, A) np.abs(B, B) elif init == 'svd': if non_negative: raise ValueError( 'SVD initialization incompatible with NMF (use nndsvd instead)' ) if min(n_samples, n_features) < n_components: warnings.warn( 'The number of components is smaller than the rank in svd initialization.' + 'The input will be padded with zeros to compensate for the lack of singular values.' 
) # simple SVD based approximation U, S, V = randomized_svd(M, n_components, random_state=random_state) # randomize_svd only returns min(n_components, n_features, n_samples) singular values and vectors # therefore, to retain the desired shape, we need to pad and reshape the inputs if n_components > n_features: U_padded = np.zeros((U.shape[0], n_components)) U_padded[:, :U.shape[1]] = U U = U_padded V_padded = np.zeros((n_components, V.shape[1])) V_padded[:V.shape[0], :] = V V = V_padded S_padded = np.zeros(n_components) S_padded[:S.shape[0]] = S S = S_padded S = np.diag(np.sqrt(S)) A = np.dot(U, S) B = np.dot(S, V) elif init in ['nndsvd', 'nndsvda', 'nndsvdar']: if not non_negative: warnings.warn( '%s results in non-negative constrained factors,' % init + 'so SVD initialization should provide better initial estimate') # NNDSVD initialization U, S, V = randomized_svd(M, n_components, random_state=random_state) A, B = np.zeros(U.shape), np.zeros(V.shape) # The leading singular triplet is non-negative # so it can be used as is for initialization. A[:, 0] = np.sqrt(S[0]) * np.abs(U[:, 0]) B[0, :] = np.sqrt(S[0]) * np.abs(V[0, :]) for j in range(1, n_components): x, y = U[:, j], V[j, :] # extract positive and negative parts of column vectors x_p, y_p = np.maximum(x, 0), np.maximum(y, 0) x_n, y_n = np.abs(np.minimum(x, 0)), np.abs(np.minimum(y, 0)) # and their norms x_p_nrm, y_p_nrm = norm(x_p), norm(y_p) x_n_nrm, y_n_nrm = norm(x_n), norm(y_n) m_p, m_n = x_p_nrm * y_p_nrm, x_n_nrm * y_n_nrm # choose update if m_p > m_n: u = x_p / x_p_nrm v = y_p / y_p_nrm sigma = m_p else: u = x_n / x_n_nrm v = y_n / y_n_nrm sigma = m_n lbd = np.sqrt(S[j] * sigma) A[:, j] = lbd * u B[j, :] = lbd * v A[A < eps] = 0 B[B < eps] = 0 if init == "nndsvd": pass elif init == "nndsvda": avg = M.mean() A[A == 0] = avg B[B == 0] = avg elif init == "nndsvdar": rng = check_random_state(random_state) avg = M.mean() A[A == 0] = abs(avg * rng.randn(len(A[A == 0])) / 100) B[B == 0] = abs(avg * rng.randn(len(B[B == 0])) / 100) else: raise ValueError("Invalid init argument") return A, B.T
sys.path.append('..') import numpy as np from common.util import most_similar, create_co_matrix, ppmi from dataset import ptb window_size = 2 wordvec_size = 100 corpus, word_to_id, id_to_word = ptb.load_data('train') vocab_size = len(word_to_id) print('counting co-occurence ...') C = create_co_matrix(corpus, vocab_size, window_size) print('calculating PPMI ...') W = ppmi(C, verbose=True) print('calculating SVD ...') try: from sklearn.utils.extmath import randomized_svd U, S, V = randomized_svd(W, n_components=wordvec_size, n_iter=5, random_state=None) except ImportError: U, S, V = np.linalg.svd(W) word_vecs = U[:, :wordvec_size] querys = ['you', 'year', 'car', 'toyota'] for query in querys: most_similar(query, word_to_id, id_to_word, word_vecs, top=5)
def PCA_rand(X, n_components, n_iter): X = X - mean(X, axis=1, keepdims=True) V, d, _ = randomized_svd(X, n_components=n_components, n_iter=n_iter) V = V[:, argsort(d)[::-1]] d = d[argsort(d)[::-1]] return d, V
def omwrpca_cp(M, burnin, win_size, track_cp_burnin, n_check_cp, alpha, proportion, n_positive, min_test_size, tolerance_num=0, lambda1=np.nan, lambda2=np.nan, factor=1): """ The loss function is min_{L,S} { 1/2||M-L-S||_F^2 + lambda1||L||_* + lambda2*||S(:)||_1} based on moving window. Parameters ---------- M : array-like, shape (n_features, n_samples), which will be decomposed into a sparse matrix S and a low-rank matrix L. burnin : burnin sample size. We require burnin >= win_size. win_size : length of moving window. We require win_size <= burnin. track_cp_burnin: the first track_cp_burnin samples generated from omwrpca algorithm will exclude for track change point. Because the result may be unstable. n_check_cp: buffer size to track changepoint. alpha: threshold value used in the hypothesis test. Hypothesis test is applied to track subspace changing. We suggest use the value 0.01. tolerance_num: offset of numbers used in hypothesis test to track change point. A larger tolerance_num gives a more robust result. We restrict tolerance_num to be a non-negative integer. The default value of tolerance_num is 0. lambda1, lambda2:tuning parameters factor: parameter factor for PCP. Returns ---------- Lhat : array-like, low-rank matrix. Shat : array-like, sparse matrix. rank : rank of low-rank matrix. References ---------- Rule of thumb for tuning paramters: lambda1 = 1.0/np.sqrt(m); lambda2 = 1.0/np.sqrt(m); """ m, n = M.shape # parameter setting assert burnin >= win_size, "Parameter burin should be larger than or equal to parameter win_size." if n < burnin: print "Parameter burin should be less than or equal to the number of columns of input matrix. Program stops." return np.empty((m, 0)), np.empty((m, 0)), [], [], [] if np.isnan(lambda1): lambda1 = 1.0 / np.sqrt(m) if np.isnan(lambda2): lambda2 = 1.0 / np.sqrt(m) # calculate pcp on burnin samples and find rank r Lhat, Shat, niter, r = pcp(M[:, :burnin], factor=factor) # initialization for omwrpca Uhat, sigmas_hat, Vhat = randomized_svd(Lhat, n_components=r, n_iter=5, random_state=0) U = Uhat.dot(np.sqrt(np.diag(sigmas_hat))) Vhat_win = Vhat[:, -win_size:] A = np.zeros((r, r)) B = np.zeros((m, r)) for i in range(Vhat_win.shape[1]): A = A + np.outer(Vhat_win[:, i], Vhat_win[:, i]) B = B + np.outer( M[:, burnin - win_size + i] - Shat[:, burnin - win_size + i], Vhat_win[:, i]) # initialization for change points tracking # dist_num_sparses: distribution of the number of nonzero elements of columns of sparse matrix # used for tracking change point dist_num_sparses = np.zeros(m + 1) # buffer_num: number of nonzero elements of columns of sparse matrix in the buffer used for # tracking change point (buffer size = n_check_cp, queue structure) buffer_num = deque([]) # buffer_flag: flags of columns of sparse matrix in the buffer used for tracking change point # (buffer size = n_check_cp, queue structure); flag=1 - potential change point; flag=0 - normal point. 
buffer_flag = deque([]) # num_sparses, cp, rvec are returned by the function # initialize num_sparses to track the number of nonzero elements of columns of sparse matrix num_sparses = list((Shat != 0).sum(axis=0)) # initialize change points to an empty list cp = [] # initialize list of rank to [r] rvec = [r] # main loop i = burnin while i < n: mi = M[:, i] vi, si = solve_proj2(mi, U, lambda1, lambda2) Shat = np.hstack((Shat, si.reshape(m, 1))) vi_delete = Vhat_win[:, 0] Vhat_win = np.hstack((Vhat_win[:, 1:], vi.reshape(r, 1))) A = A + np.outer(vi, vi) - np.outer(vi_delete, vi_delete) B = B + np.outer(mi - si, vi) - np.outer( M[:, i - win_size] - Shat[:, i - win_size], vi_delete) U = update_col(U, A, B, lambda1) Lhat = np.hstack((Lhat, U.dot(vi).reshape(m, 1))) num_sparses.append((si.reshape(m, 1) != 0).sum()) if i >= burnin + track_cp_burnin and i < burnin + track_cp_burnin + min_test_size: num = (si != 0).sum() dist_num_sparses[num] += 1 elif i >= burnin + track_cp_burnin + min_test_size: # do hypothesis testing to find chang point num = (si != 0).sum() buffer_num.append(num) pvalue = dist_num_sparses[max(num - tolerance_num, 0):].sum( ) / dist_num_sparses.sum() if pvalue <= alpha: buffer_flag.append(1) else: buffer_flag.append(0) if len(buffer_flag) >= n_check_cp: # check change point if len(buffer_flag) == n_check_cp + 1: dist_num_sparses[buffer_num[0]] += 1 buffer_num.popleft() buffer_flag.popleft() nabnormal = sum(buffer_flag) # potential change identified if nabnormal >= n_check_cp * float(proportion): for k in range(n_check_cp - n_positive + 1): # use the earliest change point if change point exists if sum(itertools.islice(buffer_flag, k, k + n_positive)) == n_positive: changepoint = i - n_check_cp + 1 + k cp.append(changepoint) Lhat = Lhat[:, :changepoint] Shat = Shat[:, :changepoint] M_update = M[:, changepoint:] num_sparses = num_sparses[:changepoint] # recursively call omwrpca_cp Lhat_update, Shat_update, rvec_update, cp_update, num_sparses_update = \ omwrpca_cp(M_update, burnin, win_size, track_cp_burnin, n_check_cp, alpha, proportion, n_positive, min_test_size, tolerance_num, lambda1, lambda2, factor) # update Lhat, Shat, rvec, num_sparses, cp Lhat = np.hstack((Lhat, Lhat_update)) Shat = np.hstack((Shat, Shat_update)) rvec.extend(rvec_update) num_sparses.extend(num_sparses_update) cp.extend([changepoint + j for j in cp_update]) return Lhat, Shat, rvec, cp, num_sparses i += 1 return Lhat, Shat, rvec, cp, num_sparses
def test(self, override=False): """ Applies randomised VD to the dataset. This function does NOT write results to the hdf5 file. Call compute() to write to the file. Handles complex, compound datasets such that the V matrix is of the same data-type as the input matrix. :param override: Set to true to recompute results if prior results are available. Else, returns existing results :type override: bool, optional. default = False :returns: tuple (u_mat, self.__s, v_mat) WHERE numpy.ndarray u_mat is abundance matrix numpy.ndarray self.__s is variance vector numpy.ndarray v_mat is eigenvector matrix """ ''' Check if a number of compnents has been set and ensure that the number is less than the minimum axis length of the data. If both conditions are met, use fsvd. If not use the regular svd. C.Smith -- We might need to put a lower limit on num_comps in the future. I don't know enough about svd to be sure. ''' if not override: if isinstance(self.duplicate_h5_groups, list) and len(self.duplicate_h5_groups) > 0: self.h5_results_grp = self.duplicate_h5_groups[-1] print('Returning previously computed results from: {}'.format( self.h5_results_grp.name)) print('set the "override" flag to True to recompute results') return reshape_to_n_dims(self.h5_results_grp['U'])[0], self.h5_results_grp['S'][()], \ reshape_to_n_dims(self.h5_results_grp['V'])[0] self.h5_results_grp = None t1 = time.time() self.__u, self.__s, self.__v = randomized_svd(self.data_transform_func( self.h5_main), self.num_components, n_iter=3) self.__v = stack_real_to_target_dtype(self.__v, self.h5_main.dtype) print('Took {} to compute randomized SVD'.format( format_time(time.time() - t1))) u_mat, success = reshape_to_n_dims(self.__u, h5_pos=self.h5_main.h5_pos_inds, h5_spec=np.expand_dims(np.arange( self.__u.shape[1]), axis=0)) if not success: raise ValueError( 'Could not reshape U to N-Dimensional dataset! Error:' + success) # When the source dataset has a singular valued spectroscopic dimension # stack_real_to_target causes V to lose all its dimensions if self.__v.ndim == 0: # However, we want V to be 2D: self.__v = np.atleast_2d(self.__v) v_mat, success = reshape_to_n_dims(self.__v, h5_pos=np.expand_dims(np.arange( self.__u.shape[1]), axis=1), h5_spec=self.h5_main.h5_spec_inds) if not success: raise ValueError( 'Could not reshape V to N-Dimensional dataset! Error:' + success) return u_mat, self.__s, v_mat
def CanonicalBip(self, GroupNames, y, std=True): if isinstance(GroupNames, (list)): self.GroupNames = GroupNames else: raise ValueError('not numeric') if isinstance(y, (np.ndarray)): self.target = y else: raise ValueError('not numeric') if std == True: self.standardize() data = self.data_st else: data = self.data g = len(GroupNames) n = data.shape[0] m = data.shape[1] r = np.min(np.array([g - 1, m])) def Factor2Binary(y, Name=None): if Name == None: Name = "C" ncat = len(list(set(y))) n = len(y) Z = pd.DataFrame(0, index=np.arange(len(y)), columns=list(set(y))) for col in Z.columns: for i in range(0, n): if y[i] == col: Z[col].iloc[i] = 1 return Z def matrixsqrt(M, dim, tol=np.finfo(float).eps, inv=True): U, Sigma, VT = randomized_svd(M, n_components=self.dim, n_iter=5, random_state=None) nz = Sigma > tol if inv == True: S12 = U.dot(np.diag(1 / np.sqrt(Sigma[nz]))).dot(VT[nz, :]) else: S12 = U.dot(np.diag(np.sqrt(Sigma[nz]))).dot(VT[nz, :]) return S12 #Groups to Binary Z = Factor2Binary(y) ng = Z.sum(axis=0) S11 = (Z.T).dot(Z) Xb = np.linalg.inv(S11).dot(Z.T).dot(data) B = (Xb.T).dot(S11).dot(Xb) S = (data.T).dot(data) - B Y = np.power(S11, 0.5).dot(Xb).dot(matrixsqrt(S, self.dim, inv=True)) U, Sigma, VT = randomized_svd(Y, n_components=self.dim, n_iter=5, random_state=None) #Variable_Coord H = matrixsqrt(S, self.dim, inv=False).dot(np.transpose(VT[0:r, :])) self.Var_Coord = H #Canonical_Weights B = matrixsqrt(S, self.dim, inv=True).dot(np.transpose(VT[0:r, :])) self.Can_Weights = B #Group_Coord J = Xb.dot(B) self.Group_Coord = J #Individual_Coord V = data.dot(B) self.Ind_Coord = V
def SVD(M, dimen, niter=5, state=0): U, Sigma, VT = randomized_svd(M, n_components=dimen, n_iter=niter, random_state=state) return U, Sigma, VT
def factor_rank_one(mat): u, s, v = randomized_svd(mat, n_components=1) u, v = np.abs(np.sqrt(s) * u[:, 0]), np.abs(np.sqrt(s) * v[0, :]) return u, v
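A quick sanity check of the rank-one factorization above: for an exactly rank-one non-negative matrix, the absolute values of the scaled leading singular vectors reconstruct the original outer product (the data below is synthetic).

import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
a, b = rng.rand(40) + 0.1, rng.rand(25) + 0.1
mat = np.outer(a, b)                               # non-negative, exactly rank one

u, s, v = randomized_svd(mat, n_components=1, random_state=0)
u1 = np.abs(np.sqrt(s[0]) * u[:, 0])
v1 = np.abs(np.sqrt(s[0]) * v[0, :])
print(np.allclose(np.outer(u1, v1), mat))          # True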
def svd(svd_matrix): u, s, vt = randomized_svd(svd_matrix, n_components=256, n_iter=3, random_state=None) return {'u': u, 's': s, 'vt': vt}
def load_w2v_features(inds_all, author_IMat, ents, prop, path_to_w2v, path_to_sents, alpha): """Engineering node features by means of word2vec embeddings vectors. The entities and property nodes will be assigned their corresponding embedding vector, whereas the authors will be given the average of the sentence embeddings from the abstract of all their papers. The sentence embedding is computed through a smoothened weighted average, which is also adjusted by subtracting the first principal component of the sentences. Args: * inds_all: array-like Index of all the nodes that were selected in our dataset. The indices that correspond to authors should be located in the first chunk of this array. Their size is equal to the the number of columns in `author_IMat`, i.e. `inds_all = [A_ids, E_inds, P_ind]` where A: auhors, E: entities, P: property * author_IMat: 2D sparse array Incidence matrix corresponding to all the author nodes (no matter if they are among the selected nodes or not). Number of papers (hyperedges) in this matrix should be equal to the number of abstracts (saved as sentences) whose path is given by `path_to_sents`. In case we would like to see papers only in a specific time-window, the rows outside this window should be zero-ed out before feeding it to this function. * ents: array-like List of all entities (no matter if they are among the selected nodes or not) * prop: str or list of str Property keyword(s) * path_to_w2v: str Path to the Word2Vec model * path_to_sents: str Path to the sentences using which the Word2Vec model were trained * alpha: float scalar The smoothing parameter """ # total number of authors nA = author_IMat.shape[1] # number of selected authors nA_selected = np.sum(inds_all < nA) # load the w2v model model = Word2Vec.load(path_to_w2v) # load sentences, and compute their embeddings sents = np.array(open(path_to_sents, 'r').read().splitlines()) sents_embeds = unadjusted_words2sents(sents, model, alpha) x_all = np.zeros((len(inds_all), model.vector_size)) # the easier task first (entities and property) for i, ind in enumerate(inds_all[nA_selected:]): # the last index is for the property if ind != inds_all[-1]: ent = ents[ind - nA] idx = model.wv.vocab[ent].index v = model.trainables.syn1neg[idx, :] x_all[nA_selected + i, :] = v / np.sqrt((v**2).sum()) else: x_all[-1, :] = model.wv[prop] / np.sqrt((model.wv[prop]**2).sum()) # now, the more demanding task (authors) author_IMat = author_IMat.tocsc() pbar = tqdm(range(nA_selected), position=0, leave=True) pbar.set_description('Words2Sents for Authors') for i, ind in enumerate(inds_all[:nA_selected]): pids = author_IMat[:, ind].indices # so far the matrices were row-wise, make it # column-wise to be more consistent with the formula V = np.concatenate([sents_embeds[j] for j in pids], axis=0).T u, _, _ = randomized_svd(V, n_components=1) # adjusted average vector avg = np.sum(V, axis=1) avg = avg - np.dot(np.dot(u, u.T), avg) x_all[i, :] = avg.squeeze() / np.sqrt((avg**2).sum()) pbar.update(1) pbar.close() return x_all
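The common-component removal near the end of the loop above follows the smooth-inverse-frequency recipe: estimate the dominant left singular vector of the stacked sentence vectors and project it out of the average. A standalone sketch with synthetic embeddings (the dimensions are hypothetical):

import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
V = rng.randn(300, 15)                    # columns: 15 sentence vectors, dimension 300

u, _, _ = randomized_svd(V, n_components=1, random_state=0)   # dominant direction
avg = V.sum(axis=1)
avg = avg - np.dot(np.dot(u, u.T), avg)   # subtract the first principal component
avg = avg / np.sqrt((avg ** 2).sum())     # unit-normalize, as in the snippet
print(avg.shape, round(np.linalg.norm(avg), 6))    # (300,) 1.0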
print('vocab size: {0}'.format(len(word_to_id)))
print('corpus size: {0}'.format(len(corpus)))

# co-occurrence matrix
print('counting co_occurence..')
c = create_co_matrix(corpus, vocab_size=len(word_to_id), window_size=window_size)

# PPMI
print('calculating ppmi (t) ..')
m_t = ppmi_text(c, verbose=True)
print('calculating ppmi (self) ..')
m = ppmi(c)

# dimensionality reduction via SVD
print('calculating svd..')
U, S, V = randomized_svd(m, n_components=vec_size)
U_t, S_t, V_t = randomized_svd(m_t, n_components=vec_size)

# evaluation
querys = ['you', 'year', 'car', 'toyota']
for q in querys:
    print('SVD (self ppmi)')
    most_similar(q, word_to_id, id_to_word, U)
    print('SVD (t ppmi)')
    most_similar(q, word_to_id, id_to_word, U_t)
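# `most_similar` is assumed to be defined elsewhere in this project; a minimal
# cosine-similarity version compatible with the call sites above might look
# like this sketch (argument names chosen to match the calls).
import numpy as np

def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    if query not in word_to_id:
        print('%s is not found' % query)
        return
    query_vec = word_matrix[word_to_id[query]]
    # cosine similarity of the query against every row of (vocab, vec_size)
    norms = np.linalg.norm(word_matrix, axis=1) * np.linalg.norm(query_vec) + 1e-8
    similarity = word_matrix @ query_vec / norms
    for i in (-similarity).argsort():
        if id_to_word[i] == query:
            continue
        print(' %s: %s' % (id_to_word[i], similarity[i]))
        top -= 1
        if top <= 0:
            return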
def _max_singular_value(self, X_filled):
    # quick estimate of the largest singular value via a rank-1 randomized SVD
    _, s, _ = randomized_svd(X_filled, 1, n_iter=5)
    return s[0]
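# Hedged sketch of what the helper above computes: the top singular value from
# a rank-1 randomized SVD approximates the spectral norm of the matrix.
import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
X = rng.randn(200, 50)
_, s, _ = randomized_svd(X, 1, n_iter=5, random_state=0)
print(s[0], np.linalg.norm(X, 2))    # the two values should be close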
uv = []
for r in range(len(review_rating)):
    uv.append(review_rating[r] - user_avg[review_user[r]]
              - business_avg[review_business[r]] + mu)

row = np.array(review_business)
col = np.array(review_user)
val = np.array(uv)
ori = csr_matrix((val, (row, col)),
                 shape=(len(business_avg), len(user_avg))).toarray()

n_comp = 30  # k
n_iter = 15
U, S, VT = randomized_svd(ori, n_components=n_comp, n_iter=n_iter,
                          random_state=None)
# U[2686*k], S[k], VT[k*4929]
rc = np.arange(n_comp)
S = csr_matrix((S, (rc, rc)), shape=(n_comp, n_comp)).toarray()
now = np.dot(np.dot(U, S), VT)
# note: mean_squared_error returns the MSE; take np.sqrt for an actual RMSE
rmse = mean_squared_error(ori, now)
del f0
del f1
gc.collect()

print("stacked.shape " + str(stacked.shape) + ", stacked.dtype " + str(stacked.dtype))
assert len(stacked.shape) == 2

if ncomponents < 0:
    ncomponents = min(stacked.shape[0], stacked.shape[1])

from sklearn.utils.extmath import randomized_svd

orig_stack = stacked.copy()
stackemean = np.mean(stacked, axis=0)
stacked -= stackemean

print("starting SVD")
U, s, VT = randomized_svd(stacked, n_components=ncomponents)  # , n_iter=6)
print("SVD done! VT.shape " + str(VT.shape) + ", VT.dtype " + str(VT.dtype))

h5file = tables.open_file(pickle_name + '_PCA_transfparams.hdf5', mode='w')
h5file.create_array(h5file.root, 'mean', stackemean)
h5file.create_array(h5file.root, 'VT', VT)
h5file.create_array(h5file.root, 's', s)
h5file.close()
print("SAVED TRANSFORM PARAMS")
print("VT.shape " + str(VT.shape))

#h5file = tables.open_file(pickle_name+'_PCA.hdf5',mode='w')
#for ii in range(2):
#    h5file.create_array(h5file.root, 'f'+str(ii), stacktransf[(ii*ndataset):((ii+1)*ndataset),...])
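# The block above stores `stackemean`, `VT` and `s` as PCA transform
# parameters. A minimal sketch of how such saved parameters can be applied to
# new data (all names here are illustrative, not taken from the script above):
import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
train = rng.randn(500, 64)

mean = train.mean(axis=0)
U, s, VT = randomized_svd(train - mean, n_components=16, random_state=0)

new_data = rng.randn(10, 64)
scores = (new_data - mean) @ VT.T     # project onto the 16 principal axes
print(scores.shape)                   # (10, 16)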
def _fit_transform(self, graph: Graph, return_dataframe: bool = True,
                   verbose: bool = True) -> EmbeddingResult:
    """Return node embedding."""
    matrix = None
    if self._metric == "Jaccard":
        edges, weights = graph.get_jaccard_coo_matrix()
    elif self._metric == "Laplacian":
        edges, weights = graph.get_laplacian_coo_matrix()
    elif self._metric == "Modularity":
        matrix = graph.get_dense_modularity_matrix()
    elif self._metric == "Left Normalized Laplacian":
        edges, weights = graph.get_left_normalized_laplacian_coo_matrix()
    elif self._metric == "Right Normalized Laplacian":
        edges, weights = graph.get_right_normalized_laplacian_coo_matrix()
    elif self._metric == "Symmetric Normalized Laplacian":
        edges, weights = graph.get_symmetric_normalized_laplacian_coo_matrix()
    elif self._metric == "Neighbours Intersection size":
        edges, weights = graph.get_neighbours_intersection_size_coo_matrix()
    elif self._metric == "Ancestors Jaccard":
        matrix = graph.get_shared_ancestors_jaccard_adjacency_matrix(
            graph.get_breadth_first_search_from_node_names(
                src_node_name=self._root_node_name,
                compute_predecessors=True),
            verbose=verbose)
    elif self._metric == "Ancestors size":
        matrix = graph.get_shared_ancestors_size_adjacency_matrix(
            graph.get_breadth_first_search_from_node_names(
                src_node_name=self._root_node_name,
                compute_predecessors=True),
            verbose=verbose)
    elif self._metric == "Adamic-Adar":
        edges, weights = graph.get_adamic_adar_coo_matrix()
    elif self._metric == "Adjacency":
        edges, weights = graph.get_directed_edge_node_ids(), np.ones(
            graph.get_number_of_directed_edges())
    else:
        raise NotImplementedError(f"The provided metric {self._metric} "
                                  "is not currently supported.")

    if matrix is None:
        matrix = coo_matrix((weights, (edges[:, 0], edges[:, 1])),
                            shape=(graph.get_number_of_nodes(),
                                   graph.get_number_of_nodes()),
                            dtype=np.float32)
        U, sigmas, Vt = sparse_svds(matrix, k=int(self._embedding_size / 2))
    else:
        U, sigmas, Vt = randomized_svd(matrix,
                                       n_components=int(self._embedding_size / 2))

    sigmas = np.diagflat(np.sqrt(sigmas))
    left_embedding = np.dot(U, sigmas)
    right_embedding = np.dot(Vt.T, sigmas)

    if return_dataframe:
        node_names = graph.get_node_names()
        left_embedding = pd.DataFrame(left_embedding, index=node_names)
        right_embedding = pd.DataFrame(right_embedding, index=node_names)

    return EmbeddingResult(
        embedding_method_name=self.model_name(),
        node_embeddings=[left_embedding, right_embedding])
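# Sketch of the final step above in isolation: split the singular values
# between the two factors so that left_embedding @ right_embedding.T
# approximates the original similarity matrix. The matrix here is synthetic.
import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
similarity = rng.rand(100, 100).astype(np.float32)

U, sigmas, Vt = randomized_svd(similarity, n_components=32, random_state=0)
half = np.diagflat(np.sqrt(sigmas))
left_embedding = U @ half             # (100, 32)
right_embedding = Vt.T @ half         # (100, 32)
approx = left_embedding @ right_embedding.T
print(np.linalg.norm(similarity - approx) / np.linalg.norm(similarity))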
def dict_learning_online(X, n_components=2, alpha=1, n_iter=100,
                         return_code=True, dict_init=None, callback=None,
                         batch_size=3, verbose=False, shuffle=True, n_jobs=1,
                         method='lars', iter_offset=0, random_state=None,
                         return_inner_stats=False, inner_stats=None,
                         return_n_iter=False):
    """Solves a dictionary learning matrix factorization problem online.

    Finds the best dictionary and the corresponding sparse code for
    approximating the data matrix X by solving::

        (U^*, V^*) = argmin 0.5 || X - U V ||_2^2 + alpha * || U ||_1
                     (U,V)
                     with || V_k ||_2 = 1 for all  0 <= k < n_components

    where V is the dictionary and U is the sparse code. This is accomplished
    by repeatedly iterating over mini-batches by slicing the input data.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        Data matrix.

    n_components : int,
        Number of dictionary atoms to extract.

    alpha : float,
        Sparsity controlling parameter.

    n_iter : int,
        Number of iterations to perform.

    return_code : boolean,
        Whether to also return the code U or just the dictionary V.

    dict_init : array of shape (n_components, n_features),
        Initial value for the dictionary for warm restart scenarios.

    callback :
        Callable that gets invoked every five iterations.

    batch_size : int,
        The number of samples to take in each batch.

    verbose :
        Degree of output the procedure will print.

    shuffle : boolean,
        Whether to shuffle the data before splitting it in batches.

    n_jobs : int,
        Number of parallel jobs to run, or -1 to autodetect.

    method : {'lars', 'cd'}
        lars: uses the least angle regression method to solve the lasso
        problem (linear_model.lars_path)
        cd: uses the coordinate descent method to compute the Lasso solution
        (linear_model.Lasso). Lars will be faster if the estimated components
        are sparse.

    iter_offset : int, default 0
        Number of previous iterations completed on the dictionary used for
        initialization.

    random_state : int or RandomState
        Pseudo number generator state used for random sampling.

    return_inner_stats : boolean, optional
        Return the inner statistics A (dictionary covariance) and B (data
        approximation). Useful to restart the algorithm in an online setting.
        If return_inner_stats is True, return_code is ignored.

    inner_stats : tuple of (A, B) ndarrays
        Inner sufficient statistics that are kept by the algorithm. Passing
        them at initialization is useful in online settings, to avoid losing
        the history of the evolution.
        A (n_components, n_components) is the dictionary covariance matrix.
        B (n_features, n_components) is the data approximation matrix.

    return_n_iter : bool
        Whether or not to return the number of iterations.

    Returns
    -------
    code : array of shape (n_samples, n_components),
        The sparse code (only returned if `return_code=True`).

    dictionary : array of shape (n_components, n_features),
        The solutions to the dictionary learning problem.

    n_iter : int
        Number of iterations run. Returned only if `return_n_iter` is set to
        `True`.
    See also
    --------
    dict_learning
    DictionaryLearning
    MiniBatchDictionaryLearning
    SparsePCA
    MiniBatchSparsePCA
    """
    if method not in ('lars', 'cd', 'admm'):
        raise ValueError('Coding method not supported as a fit algorithm.')
    method = 'lasso_' + method

    t0 = time.time()
    n_samples, n_features = X.shape
    # Avoid integer division problems
    alpha = float(alpha)
    random_state = check_random_state(random_state)

    if n_jobs == -1:
        n_jobs = cpu_count()

    # Init V with SVD of X
    if dict_init is not None:
        dictionary = dict_init
    else:
        _, S, dictionary = randomized_svd(X, n_components)
        dictionary = S[:, np.newaxis] * dictionary
    r = len(dictionary)
    if n_components <= r:
        dictionary = dictionary[:n_components, :]
    else:
        dictionary = np.r_[dictionary,
                           np.zeros((n_components - r, dictionary.shape[1]))]
    dictionary = np.ascontiguousarray(dictionary.T)

    if verbose == 1:
        print('[dict_learning]', end=' ')

    n_batches = floor(float(len(X)) / batch_size)
    if shuffle:
        X_train = X.copy()
        random_state.shuffle(X_train)
    else:
        X_train = X
    batches = np.array_split(X_train, n_batches)
    batches = itertools.cycle(batches)

    # The covariance of the dictionary
    if inner_stats is None:
        A = np.zeros((n_components, n_components))
        # The data approximation
        B = np.zeros((n_features, n_components))
    else:
        A = inner_stats[0].copy()
        B = inner_stats[1].copy()

    # If n_iter is zero, we need to return zero.
    ii = iter_offset - 1

    for ii, this_X in zip(range(iter_offset, iter_offset + n_iter), batches):
        dt = (time.time() - t0)
        if verbose == 1:
            sys.stdout.write(".")
            sys.stdout.flush()
        elif verbose:
            if verbose > 10 or ii % ceil(100. / verbose) == 0:
                print("Iteration % 3i (elapsed time: % 3is, % 4.1fmn)"
                      % (ii, dt, dt / 60))

        this_code = sparse_encode(this_X, dictionary.T, algorithm=method,
                                  alpha=alpha, n_jobs=n_jobs).T

        # Update the auxiliary variables
        if ii < batch_size - 1:
            theta = float((ii + 1) * batch_size)
        else:
            theta = float(batch_size ** 2 + ii + 1 - batch_size)
        beta = (theta + 1 - batch_size) / (theta + 1)

        A *= beta
        A += np.dot(this_code, this_code.T)
        B *= beta
        B += np.dot(this_X.T, this_code.T)

        # Update dictionary
        dictionary = _update_dict(dictionary, B, A, verbose=verbose,
                                  random_state=random_state)
        # XXX: Can the residuals be of any use?

        # Maybe we need a stopping criteria based on the amount of
        # modification in the dictionary
        if callback is not None:
            callback(locals())

    if return_inner_stats:
        if return_n_iter:
            return dictionary.T, (A, B), ii - iter_offset + 1
        else:
            return dictionary.T, (A, B)
    if return_code:
        if verbose > 1:
            print('Learning code...', end=' ')
        elif verbose == 1:
            print('|', end=' ')
        code = sparse_encode(X, dictionary.T, algorithm=method, alpha=alpha,
                             n_jobs=n_jobs)
        if verbose > 1:
            dt = (time.time() - t0)
            print('done (total time: % 3is, % 4.1fmn)' % (dt, dt / 60))
        if return_n_iter:
            return code, dictionary.T, ii - iter_offset + 1
        else:
            return code, dictionary.T

    if return_n_iter:
        return dictionary.T, ii - iter_offset + 1
    else:
        return dictionary.T
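# Hedged usage sketch: scikit-learn ships a function with this name and a
# compatible basic signature in sklearn.decomposition (keyword arguments have
# changed between releases), so a minimal call looks roughly like this.
import numpy as np
from sklearn.decomposition import dict_learning_online

rng = np.random.RandomState(0)
X = rng.randn(200, 30)

code, dictionary = dict_learning_online(X, n_components=10, alpha=1.0,
                                        random_state=0)
print(code.shape, dictionary.shape)   # (200, 10) and (10, 30)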
def laplacian_embedding(G, max_dim=2, elb=1, get_lcc=True,
                        weightcol='weight', svd_seed=None):
    """
    Inputs
        G - A networkx graph
    Outputs
        eig_vectors - The scaled (or unscaled) eigenvectors
    """
    # if get_lcc==True:
    #     #print("extracting largest_connected_component")
    #     G_lcc = lcc_BNU.extract_lcc(G)
    # else:
    #     G_lcc = G.copy()
    # weightcolumn = weightcol
    # print("pass_to_ranks")
    # G_ptr = ptr.pass_to_ranks(G_lcc, weightcol=weightcolumn)
    # print("diagonal augmentation")
    # G_aug_ptr = cvec.diag_aug(G_ptr, weightcol=weightcolumn)

    sorted_vertex = sorted(G.nodes())
    A = nx.to_scipy_sparse_matrix(G, nodelist=sorted_vertex)
    row, col = A.shape
    n = min(row, col)

    if not nx.is_directed(G):
        # undirected: symmetrically normalized adjacency D^(-1/2) A D^(-1/2)
        deg = (A.sum(axis=1).T).astype(float)
        deg_array = np.squeeze(np.asarray(deg))
        D = np.diag(deg_array**(-0.5))
        LSE_Matrix = D @ A @ D
    else:
        # directed: random-walk style Laplacian I - D^(-1) A
        deg = (A.sum(axis=1).T + A.sum(axis=0)).astype(float)
        deg_array = np.squeeze(np.asarray(deg))
        D = np.diag(deg_array**(-1))
        LSE_Matrix = np.identity(n) - D @ A

    # print("spectral embedding into %d dimensions" % max_dim)
    U, Sigma, VT = randomized_svd(LSE_Matrix,
                                  n_components=min(max_dim, n - 1),
                                  n_iter=50,
                                  random_state=svd_seed)

    # print("dimension reduction (elbow selection)")
    rank_graph = getElbows_BNU.getElbows(Sigma, n_elbows=elb)
    reduced_dim = rank_graph[(elb - 1)]
    # print("elbow is %d" % reduced_dim)

    s_sqrt = np.sqrt(Sigma)  # [np.newaxis] Zeinab commented this out
    s_sqrt_dim_reduced = s_sqrt[:reduced_dim]
    U_dim_reduced = U[:, :reduced_dim]
    VT_dim_reduced = VT[:reduced_dim, :]

    Xhat1 = np.multiply(s_sqrt_dim_reduced, U_dim_reduced)
    if not nx.is_directed(G):
        Xhat2 = np.array([]).reshape(Xhat1.shape[0], 0)
    else:
        Xhat2 = np.multiply(np.transpose(VT_dim_reduced), s_sqrt_dim_reduced)
    Xhat = np.concatenate((Xhat1, Xhat2), axis=1)

    embedded = collections.namedtuple('embedded', 'X vertex_labels')
    result = embedded(X=Xhat, vertex_labels=sorted_vertex)
    return result
def session_pca(imgs, mask_img, parameters, n_components=20, confounds=None,
                memory_level=0, memory=Memory(cachedir=None), verbose=0,
                copy=True):
    """Filter, mask and compute PCA on Niimg-like objects.

    This is a helper function that first calls `base_masker.filter_and_mask`
    and then applies a PCA to reduce the number of time series.

    Parameters
    ----------
    imgs: list of Niimg-like objects
        See http://nilearn.github.io/building_blocks/manipulating_mr_images.html#niimg.
        List of subject data

    mask_img: Niimg-like object
        See http://nilearn.github.io/building_blocks/manipulating_mr_images.html#niimg.
        Mask to apply on the data

    parameters: dictionary
        Dictionary of parameters passed to `filter_and_mask`. Please see the
        documentation of the `NiftiMasker` for more information.

    confounds: CSV file path or 2D matrix
        This parameter is passed to signal.clean. Please see the
        corresponding documentation for details.

    n_components: integer, optional
        Number of components to be extracted by the PCA

    memory_level: integer, optional
        Integer indicating the level of memorization. The higher, the more
        function calls are cached.

    memory: joblib.Memory
        Used to cache the function calls.

    verbose: integer, optional
        Indicate the level of verbosity (0 means no messages).

    copy: boolean, optional
        Whether or not data should be copied
    """
    data, affine = cache(
        filter_and_mask, memory,
        memory_level=memory_level,
        func_memory_level=2,
        ignore=['verbose', 'memory', 'memory_level', 'copy'])(
            imgs, mask_img, parameters,
            memory_level=memory_level,
            memory=memory,
            verbose=verbose,
            confounds=confounds,
            copy=copy)
    # use the fast randomized SVD only when few components are requested
    # relative to the number of samples
    if n_components <= data.shape[0] // 4:
        U, S, _ = randomized_svd(data.T, n_components)
    else:
        U, S, _ = linalg.svd(data.T, full_matrices=False)
    U = U.T[:n_components].copy()
    S = S[:n_components]
    return U, S
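# Sketch of the SVD selection heuristic used above in isolation: use the
# cheaper randomized solver when the requested number of components is small
# compared to the number of samples, otherwise fall back to the exact LAPACK
# SVD. The wrapper name `truncated_svd` is hypothetical.
import numpy as np
from scipy import linalg
from sklearn.utils.extmath import randomized_svd

def truncated_svd(data, n_components):
    if n_components <= data.shape[0] // 4:
        U, S, _ = randomized_svd(data.T, n_components)
    else:
        U, S, _ = linalg.svd(data.T, full_matrices=False)
    return U.T[:n_components].copy(), S[:n_components]

rng = np.random.RandomState(0)
U, S = truncated_svd(rng.randn(40, 300), n_components=5)   # randomized path
print(U.shape, S.shape)   # (5, 300) and (5,)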
def check_randomized_svd_low_rank(dtype):
    # Check that extmath.randomized_svd is consistent with linalg.svd
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10
    decimal = 5 if dtype == np.float32 else 7
    dtype = np.dtype(dtype)

    # generate a matrix X of approximate effective rank `rank` and no noise
    # component (very structured signal):
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.0,
                             random_state=0).astype(dtype, copy=False)
    assert X.shape == (n_samples, n_features)

    # compute the singular values of X using the slow exact method
    U, s, V = linalg.svd(X, full_matrices=False)

    # Convert the singular values to the specific dtype
    U = U.astype(dtype, copy=False)
    s = s.astype(dtype, copy=False)
    V = V.astype(dtype, copy=False)

    for normalizer in ['auto', 'LU', 'QR']:  # 'none' would not be stable
        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = randomized_svd(X, k,
                                    power_iteration_normalizer=normalizer,
                                    random_state=0)

        # If the input dtype is float, then the output dtype is float of the
        # same bit size (f32 is not upcast to f64)
        # But if the input dtype is int, the output dtype is float64
        if dtype.kind == 'f':
            assert Ua.dtype == dtype
            assert sa.dtype == dtype
            assert Va.dtype == dtype
        else:
            assert Ua.dtype == np.float64
            assert sa.dtype == np.float64
            assert Va.dtype == np.float64

        assert Ua.shape == (n_samples, k)
        assert sa.shape == (k, )
        assert Va.shape == (k, n_features)

        # ensure that the singular values of both methods are equal up to the
        # real rank of the matrix
        assert_almost_equal(s[:k], sa, decimal=decimal)

        # check the singular vectors too (while not checking the sign)
        assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va),
                            decimal=decimal)

        # check the sparse matrix representation
        X = sparse.csr_matrix(X)

        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = \
            randomized_svd(X, k, power_iteration_normalizer=normalizer,
                           random_state=0)

        if dtype.kind == 'f':
            assert Ua.dtype == dtype
            assert sa.dtype == dtype
            assert Va.dtype == dtype
        else:
            assert Ua.dtype.kind == 'f'
            assert sa.dtype.kind == 'f'
            assert Va.dtype.kind == 'f'

        assert_almost_equal(s[:rank], sa[:rank], decimal=decimal)
def load_data(data_path):
    timer = utils.timer(name='main').tic()
    split_folder = os.path.join(data_path, 'cold')

    u_file = os.path.join(
        data_path, 'trained/cold/WRMF_cold_rank200_reg1_alpha10_iter10.U.txt')
    v_file = os.path.join(
        data_path, 'trained/cold/WRMF_cold_rank200_reg1_alpha10_iter10.V.txt')
    item_content_file = os.path.join(data_path, 'item_features_0based.txt')
    train_file = os.path.join(split_folder, 'train.csv')
    test_cold_file = os.path.join(split_folder, 'test.csv')
    test_cold_iid_file = os.path.join(split_folder, 'test_item_ids.csv')

    dat = {}
    # load preference data
    timer.tic()
    # u_pref = np.fromfile(u_file, dtype='>f4').reshape(n_users, 200)
    # v_pref = np.fromfile(v_file, dtype='>f4').reshape(n_items, 200)
    u_pref = np.loadtxt(u_file).reshape(n_users, 200)
    v_pref = np.loadtxt(v_file).reshape(n_items, 200)
    dat['u_pref'] = u_pref
    dat['v_pref'] = v_pref
    timer.toc('loaded U:%s,V:%s' % (str(u_pref.shape), str(v_pref.shape))).tic()

    # pre-process
    _, dat['u_pref_scaled'] = utils.prep_standardize(u_pref)
    _, dat['v_pref_scaled'] = utils.prep_standardize(v_pref)
    timer.toc('standardized U,V').tic()

    # load content data
    timer.tic()
    item_content, _ = datasets.load_svmlight_file(item_content_file,
                                                  zero_based=True,
                                                  dtype=np.float32)
    item_content = tfidf(item_content)

    from sklearn.utils.extmath import randomized_svd
    u, s, _ = randomized_svd(item_content, n_components=300, n_iter=5)
    item_content = u * s
    _, item_content = utils.prep_standardize(item_content)

    if sp.issparse(item_content):
        dat['item_content'] = item_content.tolil(copy=False)
    else:
        dat['item_content'] = item_content
    timer.toc('loaded item feature sparse matrix: %s' %
              (str(item_content.shape))).tic()

    # load split
    timer.tic()
    # header=None: the CSV has no header row (header=-1 is no longer accepted
    # by recent pandas versions)
    train = pd.read_csv(train_file, delimiter=",", header=None,
                        dtype=np.int32).values.ravel().view(
        dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32)])
    dat['user_indices'] = np.unique(train['uid'])
    timer.toc('read train triplets %s' % train.shape).tic()

    dat['eval_cold'] = data.load_eval_data(test_cold_file,
                                           test_cold_iid_file,
                                           name='eval_cold',
                                           cold=True,
                                           train_data=train,
                                           citeu=True)
    return dat
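# Hedged sketch of the content-reduction step above (TF-IDF weighting followed
# by a truncated SVD, keeping u * s as dense item features), using sklearn's
# TfidfTransformer in place of the project-specific `tfidf` helper and a
# random sparse matrix in place of the real item features.
import numpy as np
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
counts = sp.random(1000, 5000, density=0.01, format='csr', random_state=0)

item_content = TfidfTransformer().fit_transform(counts)
u, s, _ = randomized_svd(item_content, n_components=300, n_iter=5,
                         random_state=0)
item_features = u * s                 # (1000, 300) dense item representation
print(item_features.shape)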