def fit(self, X, Y):
    """Fit the model from the SVD of the cross-product matrix ``X'Y``.

    Parameters
    ----------
    X : array-like, 2-D
        Training vectors; validated and converted to float64.
    Y : array-like, 1-D or 2-D
        Target vectors; a 1-D ``Y`` is reshaped to a single column.

    Returns
    -------
    self
        With ``x_mean_``, ``y_mean_``, ``x_std_``, ``y_std_``,
        ``x_weights_``, ``y_weights_``, ``x_scores_`` and ``y_scores_`` set.

    Raises
    ------
    ValueError
        If ``n_components`` is larger than the column count of both
        ``X`` and ``Y``.
    """
    check_consistent_length(X, Y)
    # A copy is requested via ``self.copy`` because the data is centered
    # (and optionally scaled) below.
    X = check_array(X, dtype=np.float64, copy=self.copy)
    Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False)
    if Y.ndim == 1:
        Y = Y.reshape(-1, 1)

    if self.n_components > max(Y.shape[1], X.shape[1]):
        raise ValueError("Invalid number of components n_components=%d"
                         " with X of shape %s and Y of shape %s."
                         % (self.n_components, str(X.shape), str(Y.shape)))

    # Center / scale
    X, Y, self.x_mean_, self.y_mean_, self.x_std_, self.y_std_ = (
        _center_scale_xy(X, Y, self.scale))

    # svd(X'Y)
    C = np.dot(X.T, Y)

    # The ARPACK solver cannot extract every component, so the dense
    # solver is used when all of them are requested; otherwise only the
    # interesting components are computed with ARPACK.
    if self.n_components >= np.min(C.shape):
        U, s, V = linalg.svd(C, full_matrices=False)
    else:
        U, s, V = arpack.svds(C, k=self.n_components)
        # svds does not follow the dense-SVD convention of descending
        # singular values; reorder so that both branches return the
        # components from most to least important.
        order = np.argsort(s)[::-1]
        U, s, V = U[:, order], s[order], V[order]

    # Deterministic output
    U, V = svd_flip(U, V)
    V = V.T
    self.x_scores_ = np.dot(X, U)
    self.y_scores_ = np.dot(Y, V)
    self.x_weights_ = U
    self.y_weights_ = V
    return self
def fast_svd(X, n_components, random_state=None):
    """Truncated SVD that picks between a randomized and a LAPACK solver.

    The choice follows the scikit-learn heuristic: small problems and
    requests for a large share of the spectrum use the full solver.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The data to decompose.
    n_components : integer
        The order of the dimensionality of the truncated SVD.
    random_state : int or RandomState
        Pseudo number generator state used for random sampling.

    Returns
    -------
    U : array, shape (n_samples, n_components)
        The first matrix of the truncated SVD.
    S : array, shape (n_components)
        The second matrix of the truncated SVD.
    V : array, shape (n_components, n_features)
        The last matrix of the truncated SVD.
    """
    random_state = check_random_state(random_state)

    # The randomized solver is only worthwhile on a large problem where a
    # modest number (>= 1, < 80% of the smaller dimension) is requested.
    go_randomized = (max(X.shape) > 500
                     and 1 <= n_components < .8 * min(X.shape))

    if not go_randomized:
        left, sing, right = linalg.svd(X, full_matrices=False)
        # flip eigenvectors' sign to enforce deterministic output
        left, right = svd_flip(left, right)
        # Copying the slices drops the reference to the non-reduced
        # factors so that their memory can be released early.
        return (left[:, :n_components].copy(),
                sing[:n_components],
                right[:n_components].copy())

    recent_sklearn = (LooseVersion(sklearn.__version__)
                      >= LooseVersion('0.17'))
    power_iterations = 'auto' if recent_sklearn else 3
    left, sing, right = randomized_svd(X, n_components=n_components,
                                       n_iter=power_iterations,
                                       flip_sign=True,
                                       random_state=random_state)
    return left, sing, right
def pca_with_sparse(X, npcs, solver='arpack', mu=None, random_state=None):
    """PCA of ``X`` without materializing the centered matrix.

    The centered data ``X - mu`` is exposed to ``svds`` through a
    ``LinearOperator`` so a sparse ``X`` never has to be densified.

    Parameters
    ----------
    X : sparse (csr/csc) matrix or array, shape (n_samples, n_features)
        Data to decompose.
    npcs : int
        Number of singular triplets requested from ``svds``.
    solver : str
        Forwarded to ``svds`` as ``solver``.
    mu : array, optional
        Column means used for centering, as a row vector (1, n_features).
        Computed from ``X`` when omitted.
    random_state : None, int or RandomState
        Source of the random start vector handed to ``svds``.

    Returns
    -------
    dict
        Keys ``'X_pca'``, ``'variance'``, ``'variance_ratio'`` and
        ``'components'``, ordered by decreasing singular value.
    """
    random_state = check_random_state(random_state)
    # Draw the start vector from the caller's generator directly instead
    # of overwriting the process-wide NumPy RNG state; the drawn values
    # are the same, but unrelated code using ``np.random`` is unaffected.
    random_init = random_state.rand(np.min(X.shape))
    X = check_array(X, accept_sparse=['csr', 'csc'])

    if mu is None:
        # np.asarray handles both np.matrix and ndarray results of mean()
        mu = np.asarray(X.mean(0)).flatten()[None, :]
    mu_t = mu.T
    X_h = X.T.conj()
    # Row of ones: ``ones_row.dot(x)`` sums ``x`` over the sample axis.
    ones_row = np.ones(X.shape[0])[None, :]

    def matvec(x):
        return X.dot(x) - mu.dot(x)

    def matmat(x):
        return X.dot(x) - mu.dot(x)

    def rmatvec(x):
        return X_h.dot(x) - mu_t.dot(ones_row.dot(x))

    def rmatmat(x):
        return X_h.dot(x) - mu_t.dot(ones_row.dot(x))

    XL = sparse.linalg.LinearOperator(
        matvec=matvec,
        dtype=X.dtype,
        matmat=matmat,
        shape=X.shape,
        rmatvec=rmatvec,
        rmatmat=rmatmat,
    )

    u, s, v = sparse.linalg.svds(XL, solver=solver, k=npcs, v0=random_init)
    u, v = svd_flip(u, v)
    # Order the components from largest to smallest singular value.
    idx = np.argsort(-s)
    v = v[idx, :]

    X_pca = (u * s)[:, idx]
    ev = s[idx]**2 / (X.shape[0] - 1)

    total_var = _get_mean_var(X)[1].sum()
    ev_ratio = ev / total_var

    output = {
        'X_pca': X_pca,
        'variance': ev,
        'variance_ratio': ev_ratio,
        'components': v,
    }
    return output
def fast_svd(X, n_components, random_state=None):
    """Truncated SVD that picks between a randomized and a LAPACK solver.

    The choice follows the scikit-learn heuristic: small problems and
    requests for a large share of the spectrum use the full solver.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The data to decompose.
    n_components : integer
        The order of the dimensionality of the truncated SVD.
    random_state : int or RandomState, optional
        Pseudo number generator state used for random sampling.

    Returns
    -------
    U : array, shape (n_samples, n_components)
        The first matrix of the truncated SVD.
    S : array, shape (n_components)
        The second matrix of the truncated SVD.
    V : array, shape (n_components, n_features)
        The last matrix of the truncated SVD.
    """
    random_state = check_random_state(random_state)

    # The randomized solver is only worthwhile on a large problem where a
    # modest number (>= 1, < 80% of the smaller dimension) is requested.
    go_randomized = (max(X.shape) > 500
                     and 1 <= n_components < .8 * min(X.shape))

    if go_randomized:
        left, sing, right = randomized_svd(X, n_components=n_components,
                                           n_iter='auto',
                                           flip_sign=True,
                                           random_state=random_state)
        return left, sing, right

    left, sing, right = linalg.svd(X, full_matrices=False)
    # flip eigenvectors' sign to enforce deterministic output
    left, right = svd_flip(left, right)
    # Copying the slices drops the reference to the non-reduced factors
    # so that their memory can be released early.
    return (left[:, :n_components].copy(),
            sing[:n_components],
            right[:n_components].copy())
def _fit_full(self, X, n_components):
    """Fit the model by computing a full SVD of (centered) ``X``.

    ``X`` is centered in place. ``n_components`` may be ``'mle'``, a
    fraction in (0, 1) interpreted as a share of explained variance, or a
    plain count. Returns the untruncated ``U, S, V`` factors.
    """
    n_samples, n_features = X.shape
    use_mle = n_components == 'mle'

    if use_mle:
        if n_samples < n_features:
            raise ValueError("n_components='mle' is only supported "
                             "if n_samples >= n_features")
    elif not 0 <= n_components <= n_features:
        raise ValueError("n_components=%r must be between 0 and "
                         "n_features=%r with svd_solver='full'"
                         % (n_components, n_features))

    # Center data (in place)
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_

    U, S, V = linalg.svd(X, full_matrices=False)
    # flip eigenvectors' sign to enforce deterministic output
    U, V = svd_flip(U, V)

    # Variance explained by each singular value
    var_all = (S ** 2) / (n_samples - 1)
    ratio_all = var_all / var_all.sum()
    sing_all = S.copy()  # stored separately from the returned S

    # Resolve the number of components actually kept
    if use_mle:
        n_components = _infer_dimension_(var_all, n_samples, n_features)
    elif 0 < n_components < 1.0:
        # smallest count whose cumulated explained variance ratio
        # exceeds the requested threshold
        cumulated = stable_cumsum(ratio_all)
        n_components = np.searchsorted(cumulated, n_components) + 1

    # Noise covariance of the Probabilistic PCA model:
    # the sigma2 maximum likelihood (cf. eq. 12.46)
    if n_components < min(n_features, n_samples):
        self.noise_variance_ = var_all[n_components:].mean()
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = V[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = var_all[:n_components]
    self.explained_variance_ratio_ = ratio_all[:n_components]
    self.singular_values_ = sing_all[:n_components]

    return U, S, V
def svd_wrapper(X, rank=None):
    """
    Computes the (possibly partial) SVD of a matrix. Handles the case where
    X is either dense or sparse.

    Parameters
    ----------
    X: array-like, shape (N, D)

    rank: int, None
        Rank of the desired SVD. If None, all min(X.shape) singular
        values/vectors are computed.

    Output
    ------
    U, D, V

    U: array-like, shape (N, rank)
        Orthonormal matrix of left singular vectors.

    D: list, shape (rank, )
        Singular values in non-increasing order (e.g. D[0] is the largest).

    V: array-like, shape (D, rank)
        Orthonormal matrix of right singular vectors.
    """
    # TODO: give user option to compute randomized SVD
    full_rank = min(X.shape)
    rank = int(full_rank if rank is None else rank)
    assert 1 <= rank and rank <= full_rank

    if rank < full_rank:
        # partial decomposition through scipy's svds
        U, D, V = fix_scipy_svds(svds(X, rank))
    else:
        # every singular triplet is wanted: dense solver only
        assert not issparse(X)
        U, D, Vt = full_svd(X, full_matrices=False)
        V = Vt.T
        if rank:
            U, D, V = U[:, :rank], D[:rank], V[:, :rank]

    # enforce deterministic output
    U, Vt = svd_flip(U, V.T)
    return U, D, Vt.T
def randomized_pca(A, n_components, n_oversamples=10, n_iter="auto",
                   flip_sign=True, random_state=0):
    """Compute the randomized PCA decomposition of a given matrix.

    This method differs from the scikit-learn implementation in that it
    supports and handles sparse matrices well: the column means are
    subtracted implicitly inside every product instead of centering ``A``.

    Parameters
    ----------
    A : array or sparse matrix, shape (n_samples, n_features)
    n_components : int
        Number of components returned.
    n_oversamples : int
        Extra random vectors used for the range sketch.
    n_iter : int or "auto"
        Number of normalized power iterations. ``"auto"`` picks 7 when
        ``n_components < 0.1 * min(A.shape)`` and 4 otherwise.
    flip_sign : bool
        Whether to pass the factors through ``svd_flip``.
    random_state : None, int or generator object
        Seed, or an object exposing ``normal(size=...)``.

    Returns
    -------
    U, s, V
        Truncated to the first ``n_components`` components.
    """
    if n_iter == "auto":
        # Checks if the number of iterations is explicitly specified
        # Adjust n_iter. 7 was found a good compromise for PCA. See sklearn #5299
        n_iter = 7 if n_components < .1 * min(A.shape) else 4

    # The default (and any int/None seed) is not a generator object; build
    # one so that ``.normal`` below is available.
    if not hasattr(random_state, "normal"):
        random_state = np.random.RandomState(random_state)

    n_samples, n_features = A.shape

    c = np.atleast_2d(A.mean(axis=0))

    if n_samples >= n_features:
        Q = random_state.normal(size=(n_features, n_components + n_oversamples))
        Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)

        # Normalized power iterations
        for _ in range(n_iter):
            Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
            Q, _ = lu(Q, permute_l=True)
            Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
            Q, _ = lu(Q, permute_l=True)

        Q, _ = qr(Q, mode="economic")

        QA = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
        R, s, V = svd(QA.T, full_matrices=False)
        U = Q.dot(R)

    else:  # n_features > n_samples
        Q = random_state.normal(size=(n_samples, n_components + n_oversamples))
        Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])

        # Normalized power iterations
        for _ in range(n_iter):
            Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
            Q, _ = lu(Q, permute_l=True)
            Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
            Q, _ = lu(Q, permute_l=True)

        Q, _ = qr(Q, mode="economic")

        QA = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
        U, s, R = svd(QA, full_matrices=False)
        V = R.dot(Q.T)

    if flip_sign:
        U, V = svd_flip(U, V)

    return U[:, :n_components], s[:n_components], V[:n_components, :]
def D_tau_old(M, tau=None, l=5):
    """Apply soft-thresholding by ``tau`` to the singular values of ``M``.

    Singular triplets are computed in growing batches (``l`` more at each
    round) until the smallest computed singular value drops below
    ``tau`` or no more triplets can be requested from ``svds``.

    Parameters
    ----------
    M : array or sparse matrix
    tau : float, optional
        Threshold; a falsy value selects ``5 * sum(M.shape) / 2``.
    l : int
        Number of additional singular values requested per round.

    Returns
    -------
    ndarray
        ``U @ diag(max(S - tau, 0)) @ VT`` built from the computed triplets.
    """
    if not tau:
        tau = 5 * np.sum(M.shape) / 2

    agl = 'arpack'
    # agl = 'lobpcg'
    k_max = min(M.shape) - 1  # largest k requested from svds
    sk = 1

    while True:
        k = min(sk, k_max)
        (U, S, VT) = svds(M, k=k, solver=agl)
        # reverse the svds output before flipping signs
        S = S[::-1]
        U, VT = svd_flip(U[:, ::-1], VT[::-1])
        # Stop once a singular value below tau was found, or when k is
        # already capped: asking again would return the same values and
        # the loop would never terminate.
        if np.min(S) < tau or k >= k_max:
            break
        sk = sk + l

    print("min S:")
    print(np.min(S))
    print("sk:")
    print(sk)
    print("tau in D_tau:")
    print(tau)

    shrink_S = np.maximum(S - tau, 0)
    diag_shrink_S = np.diag(shrink_S)
    res = np.linalg.multi_dot([U, diag_shrink_S, VT])
    return res
def fit_transform(self, X, y=None):
    """Fit LSI model to X and perform dimensionality reduction on X.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.
    y : ignored

    Returns
    -------
    X_new : array, shape (n_samples, n_components)
        Reduced version of X. This will always be a dense array.

    Raises
    ------
    ValueError
        For an unknown ``algorithm``, or when the randomized solver is
        asked for ``n_components >= n_features``.
    """
    X = as_float_array(X, copy=False)
    rng = check_random_state(self.random_state)

    # If sparse and not csr or csc, convert to csr
    if sp.issparse(X) and X.getformat() not in ["csr", "csc"]:
        X = X.tocsr()

    if self.algorithm == "arpack":
        left, sing, right_t = svds(X, k=self.n_components, tol=self.tol)
        # svds doesn't abide by scipy.linalg.svd/randomized_svd
        # conventions, so reverse its outputs.
        sing = sing[::-1]
        left, right_t = svd_flip(left[:, ::-1], right_t[::-1])
    elif self.algorithm == "randomized":
        k = self.n_components
        n_features = X.shape[1]
        if k >= n_features:
            raise ValueError("n_components must be < n_features;"
                             " got %d >= %d" % (k, n_features))
        left, sing, right_t = randomized_svd(X, self.n_components,
                                             n_iter=self.n_iter,
                                             random_state=rng)
    else:
        raise ValueError("unknown algorithm %r" % self.algorithm)

    self.components_ = right_t
    self.Sigma = sing[:self.n_components]

    # Calculate explained variance & explained variance ratio
    X_transformed = np.dot(left, np.diag(sing))
    component_var = np.var(X_transformed, axis=0)
    self.explained_variance_ = component_var

    if sp.issparse(X):
        _, feature_var = mean_variance_axis(X, axis=0)
        total_var = feature_var.sum()
    else:
        total_var = np.var(X, axis=0).sum()
    self.explained_variance_ratio_ = component_var / total_var

    return X_transformed
def _my_svd(M, k, algorithm):
    """Truncated SVD of ``M`` with at most ``k`` components.

    ``algorithm`` is ``'randomized'`` or ``'arpack'``; any other value
    raises ``ValueError``. Returns the tuple ``(U, S, V)``.
    """
    if algorithm == 'randomized':
        n_comp = min(k, M.shape[1] - 1)
        U, S, V = randomized_svd(M, n_components=n_comp, n_oversamples=20)
        return (U, S, V)

    if algorithm == 'arpack':
        n_comp = min(k, min(M.shape) - 1)
        U, S, V = svds(M, k=n_comp)
        # reverse the svds output, then fix the signs
        S = S[::-1]
        U, V = svd_flip(U[:, ::-1], V[::-1])
        return (U, S, V)

    raise ValueError("unknown algorithm")
def eigh_wrapper(A, B=None, rank=None, eval_descending=True):
    """
    Symmetric eigenvector or generalized eigenvector problem.

    A v = lambda v

    or

    A v = lambda B v

    where A (and B) are symmetric (Hermitian).

    Parameters
    ----------
    A: array-like, shape (n x n)

    B: None, array-like, shape (n x n)

    rank: None, int
        Number of eigenpairs to compute. If None, all are computed.

    eval_descending: bool
        Whether to compute the largest or the smallest eigenvalues.
        If True, the largest ``rank`` eigenvalues are computed and
        returned in descending order. Otherwise the smallest ones are
        computed and left in the order ``eigh`` returns them.

    Output
    ------
    evals, evecs
    """
    if rank is not None:
        n_max_evals = A.shape[0]

        if eval_descending:
            eigvals_idxs = (n_max_evals - rank, n_max_evals - 1)
        else:
            eigvals_idxs = (0, rank - 1)
    else:
        eigvals_idxs = None

    evals, evecs = eigh(a=A, b=B, eigvals=eigvals_idxs)

    if eval_descending:
        ev_reordering = np.argsort(-evals)
        evals = evals[ev_reordering]
        evecs = evecs[:, ev_reordering]

    # Enforce a deterministic sign. svd_flip modifies both of its
    # arguments in place, so the second one must not be a view of
    # ``evecs`` (such as ``evecs.T``): the flip applied through the view
    # would undo the flip of the columns. A scratch array is passed.
    evecs = svd_flip(evecs, np.empty_like(evecs).T)[0]

    return evals, evecs
def test_svd_flip_1d():
    # The in-place 1-D helper must agree with svd_flip applied to the
    # same vectors seen as a column and a row.
    left = np.array([1, -4, 2])
    right = np.array([1, 2, 3])

    left_ref, right_ref = svd_flip(left.reshape(-1, 1),
                                   right.reshape(1, -1))
    _svd_flip_1d(left, right)  # inplace

    assert_allclose(left, left_ref.ravel())
    assert_allclose(left, [-1, 4, -2])

    assert_allclose(right, right_ref.ravel())
    assert_allclose(right, [-1, -2, -3])
def _decompose_full(self, mat):
    """Full SVD of ``mat``, truncated to the resolved ``n_components``.

    ``self.n_components`` may be ``"mle"``, a fraction in (0, 1) read as a
    share of explained variance, or an integer count; it is overwritten
    with the resolved count. Components whose singular value is below
    ``self.tol`` are zeroed.

    Returns
    -------
    U, S, Vt
        Truncated to ``self.n_components`` components.

    Raises
    ------
    ValueError
        If ``n_components`` is outside ``[0, n_samples_]`` or is a
        non-integer value >= 1.
    """
    if self.n_components != "mle":
        if not 0 <= self.n_components <= self.n_samples_:
            # the accepted range starts at 0, and the message says so
            raise ValueError("n_components=%r must be between 0 and "
                             "n_samples=%r with "
                             "svd_solver='%s'" % (
                                 self.n_components,
                                 self.n_samples_,
                                 self.svd_solver,
                             ))
        elif self.n_components >= 1:
            if not isinstance(self.n_components, numbers.Integral):
                raise ValueError(
                    "n_components=%r must be of type int "
                    "when greater than or equal to 1, "
                    "was of type=%r" %
                    (self.n_components, type(self.n_components)))

    U, S, Vt = linalg.svd(mat, full_matrices=False)
    # Zero out the components below the tolerance. The mask is taken
    # before S itself is modified.
    below_tol = S < self.tol
    U[:, below_tol] = 0.0
    Vt[below_tol] = 0.0
    S[below_tol] = 0.0

    # flip eigenvectors' sign to enforce deterministic output
    U, Vt = svd_flip(U, Vt)

    # Get variance explained by singular values
    explained_variance_ = (S**2) / (self.n_samples_ - 1)
    total_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / total_var

    # Postprocess the number of components required
    if self.n_components == "mle":
        self.n_components = _infer_dimension(explained_variance_,
                                             self.n_samples_)
    elif 0 < self.n_components < 1.0:
        # Number of components for which the cumulated explained variance
        # ratio is strictly above the requested threshold; side="right"
        # keeps it strictly above when the cumulated ratio hits the
        # threshold exactly. More discussion in issue: #15669
        ratio_cumsum = stable_cumsum(explained_variance_ratio_)
        self.n_components = (np.searchsorted(
            ratio_cumsum, self.n_components, side="right") + 1)

    return (
        U[:, :self.n_components],
        S[:self.n_components],
        Vt[:self.n_components],
    )
def _decompose_truncated(self, mat):
    """Truncated SVD of ``mat`` with the ARPACK or the randomized solver.

    ``self.n_components`` must be an integer in ``[1, n_samples_]`` (and
    strictly below ``n_samples_`` when ``svd_solver`` is ``"arpack"``).
    Components whose singular value is below ``self.tol`` are zeroed.
    Returns ``U, S, Vt``.
    """
    k = self.n_components

    # --- validation: every failing check raises ValueError -------------
    if not 1 <= k <= self.n_samples_:
        raise ValueError("n_components=%r must be between 1 and "
                         "n_samples=%r with "
                         "svd_solver='%s'" % (
                             self.n_components,
                             self.n_samples_,
                             self.svd_solver,
                         ))
    if not isinstance(k, numbers.Integral):
        raise ValueError(
            "n_components=%r must be of type int "
            "when greater than or equal to 1, was of type=%r" %
            (self.n_components, type(self.n_components)))
    if self.svd_solver == "arpack" and k == self.n_samples_:
        raise ValueError("n_components=%r must be strictly less than "
                         "n_samples=%r with "
                         "svd_solver='%s'" % (
                             self.n_components,
                             self.n_samples_,
                             self.svd_solver,
                         ))

    rng = check_random_state(self.random_state)

    # NOTE(review): validation reads ``svd_solver`` while the dispatch
    # below reads ``_fit_svd_solver`` - confirm both always agree.
    if self._fit_svd_solver == "arpack":
        start = _init_arpack_v0(min(mat.shape), rng)
        U, S, Vt = svds(mat, k=self.n_components, tol=self.tol, v0=start)
        # svds doesn't abide by scipy.linalg.svd/randomized_svd
        # conventions, so reverse its outputs.
        S = S[::-1]
        # flip eigenvectors' sign to enforce deterministic output
        U, Vt = svd_flip(U[:, ::-1], Vt[::-1])
    else:
        # Every other solver was eliminated before, so this must be
        # "randomized"; sign flipping is done inside.
        U, S, Vt = randomized_svd(
            mat,
            n_components=self.n_components,
            n_iter=self.iterated_power,
            flip_sign=True,
            random_state=rng,
        )

    # Zero out what lies below the tolerance (mask taken before S changes)
    negligible = S < self.tol
    U[:, negligible] = 0.0
    Vt[negligible] = 0.0
    S[negligible] = 0.0

    return U, S, Vt
def test_svd_flip():
    """svd_flip keeps the factorization valid for both decision modes."""
    rng = np.random.RandomState(1999)
    n_samples, n_features = 20, 10
    X = rng.randn(n_samples, n_features)

    def check_rebuilds(left, sing, right, target):
        # the flipped factors must still multiply back to the input
        assert_almost_equal(np.dot(left * sing, right), target, decimal=6)

    # Check matrix reconstruction
    U, S, V = linalg.svd(X, full_matrices=False)
    U1, V1 = svd_flip(U, V, u_based_decision=False)
    check_rebuilds(U1, S, V1, X)

    # Check transposed matrix reconstruction
    XT = X.T
    U, S, V = linalg.svd(XT, full_matrices=False)
    U2, V2 = svd_flip(U, V, u_based_decision=True)
    check_rebuilds(U2, S, V2, XT)

    # Check that different flip methods are equivalent under reconstruction
    U_flip1, V_flip1 = svd_flip(U, V, u_based_decision=True)
    check_rebuilds(U_flip1, S, V_flip1, XT)
    U_flip2, V_flip2 = svd_flip(U, V, u_based_decision=False)
    check_rebuilds(U_flip2, S, V_flip2, XT)
def pca_fit_full_daal(X, n_components):
    """Full PCA fit through the DAAL helpers, plus SVD-like factors.

    The DAAL fit is run with ``min(X.shape)`` components and the
    ``'svdDense'`` method; the eigenvalues/eigenvectors returned are
    copies truncated to ``n_components``.

    Returns
    -------
    fit_result, eigenvalues, eigenvectors, U, S, V
    """
    full_rank = min(X.shape)
    fit_result, all_values, all_vectors, S = pca_fit_daal(
        X, full_rank, 'svdDense')

    left = pca_transform_daal(fit_result, X, full_rank, X.shape[0],
                              all_values, all_vectors,
                              whiten=True, scale_eigenvalues=True)
    right = fit_result.eigenvectors
    # sign normalization of the two factors
    left, right = svd_flip(left, right)

    # Truncated copies are taken only after the flip above.
    kept_values = fit_result.eigenvalues[:n_components].copy()
    kept_vectors = fit_result.eigenvectors[:n_components].copy()

    return fit_result, kept_values, kept_vectors, left, S, right
def test_svd_flip():
    # svd_flip must keep the factorization valid for both decision modes.
    rng = np.random.RandomState(1999)
    n_samples, n_features = 20, 10
    X = rng.randn(n_samples, n_features)

    def check_rebuilds(left, sing, right, target):
        # the flipped factors must still multiply back to the input
        assert_almost_equal(np.dot(left * sing, right), target, decimal=6)

    # Check matrix reconstruction
    U, S, V = linalg.svd(X, full_matrices=False)
    U1, V1 = svd_flip(U, V, u_based_decision=False)
    check_rebuilds(U1, S, V1, X)

    # Check transposed matrix reconstruction
    XT = X.T
    U, S, V = linalg.svd(XT, full_matrices=False)
    U2, V2 = svd_flip(U, V, u_based_decision=True)
    check_rebuilds(U2, S, V2, XT)

    # Check that different flip methods are equivalent under reconstruction
    U_flip1, V_flip1 = svd_flip(U, V, u_based_decision=True)
    check_rebuilds(U_flip1, S, V_flip1, XT)
    U_flip2, V_flip2 = svd_flip(U, V, u_based_decision=False)
    check_rebuilds(U_flip2, S, V_flip2, XT)
def pod(X, rank=6):
    """Rank-``rank`` truncated SVD of ``X``.

    Returns
    -------
    U : left singular vectors, ordered by decreasing singular value
    a : ``diag(sigma) @ Vh``, the low-dimensional state vector
    sigma**2 : squared singular values
    """
    # --> Compute the rank-k truncated SVD of X.
    left, sigma, right_h = svds(X, k=rank)

    # --> ARPACK does not abide by SVD convention: sort descending.
    order = np.argsort(-sigma)
    sigma = sigma[order]

    # --> Sign correction to ensure deterministic output from SVD.
    left, right_h = svd_flip(left[:, order], right_h[order])

    # --> Low-dimensional PCA state vector.
    state = np.diag(sigma) @ right_h

    return left, state, sigma**2
def customDomainSVD_accel(D, f, e, U, W, Bi, n, m, svd_power, t):
    """Truncated SVD computed through a range finder and a thin SVD.

    A basis ``Q`` comes from ``range_finder_domain_accel``; the thin
    matrix ``B`` (stored transposed) is filled by the ``halko`` routine
    and decomposed with a dense SVD. Returns the first ``e`` components
    as ``(U, s, V)``.
    """
    basis = range_finder_domain_accel(D, f, e, U, W, Bi, n, m, svd_power, t)

    # B.T = dot(E.T, Q), written into this buffer by the routine below
    proj_t = np.zeros((n, basis.shape[1]), dtype=np.float32)
    halko.matMulTrans_SVD_domain_accel(D, f, U, W, basis, proj_t, Bi, n, m, t)

    # SVD on thin matrix
    small_left, sing, right = linalg.svd(proj_t.T, full_matrices=False)
    del proj_t  # release the buffer early

    left = np.dot(basis, small_left)
    # Correct sign
    left, right = svd_flip(left, right)
    return left[:, :e], sing[:e], right[:e, :]
def _sparpack_wrapper(self, k=None, ncv=None, tol=0, v0=None,
                      maxiter=None):
    """Wrapper for scipy.sparse.linalg.svds

    Apply a Singular Value Decomposition to the embedding matrix returned
    by ``self._embedseries()`` using the sparse ARPACK-based solver.

    Parameters
    ----------
    k : int, optional
        Number of singular values requested; ``min(shape) - 1`` when
        omitted.
    ncv, tol, v0, maxiter
        Forwarded unchanged to the solver.

    Returns
    -------
    list
        ``[u, s, v]`` with ``u`` and ``v`` as ``np.matrix``; also stored
        in ``self.svd``.

    See Also
    --------
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.linalg.svds.html
    """
    # Matrix to be decomposed
    embedded = csc_matrix(self._embedseries())

    # Default k is the largest value requested from the sparse solver
    if k is None:
        k = min(embedded.shape) - 1

    left, sing, right = sparpack(embedded, k=k, ncv=ncv, tol=tol,
                                 which='LM', v0=v0, maxiter=maxiter,
                                 return_singular_vectors=True)

    # With this implementation the vectors need to be reversed to match
    # the lapack ordering, and the sign ambiguity is resolved with
    # svd_flip to force a deterministic output.
    left, right = svd_flip(left[:, ::-1], right[::-1, :])

    self.svd = [np.matrix(left), sing[::-1], np.matrix(right)]
    return self.svd
def D_svt(M, sk):
    """Rank-limited reconstruction of ``M`` from a truncated SVD.

    At most ``min(sk, min(M.shape) - 1)`` singular triplets are computed
    with ARPACK and multiplied back together; the shape of the diagonal
    factor is printed.
    """
    solver_name = 'arpack'
    # solver_name = 'lobpcg'
    n_triplets = min(sk, min(M.shape) - 1)
    left, sing, right_t = svds(M, k=n_triplets, solver=solver_name)

    # reverse the svds output, then normalize the signs
    sing = sing[::-1]
    left, right_t = svd_flip(left[:, ::-1], right_t[::-1])

    sing_diag = np.diag(sing)
    rebuilt = np.linalg.multi_dot([left, sing_diag, right_t])

    print('D_svt S shape')
    print(sing_diag.shape)
    return rebuilt
def svd_wrapper(Y, k, method='svds'):
    """Rank-``k`` SVD of ``Y`` with a selectable backend.

    Parameters
    ----------
    Y : array (or sparse matrix for ``'svds'``)
    k : int
        Number of components kept.
    method : str
        ``'svds'`` (scipy sparse solver), ``'random'`` (randomized SVD);
        any other value uses ``np.linalg.svd``.

    Returns
    -------
    Ut : array, shape (n_rows, k)
    St : array, shape (k, k)
        Diagonal matrix of the singular values.
    Vt : array, shape (k, n_cols)
    """
    # Compare strings by value: identity (`is`) only works by accident of
    # string interning and fails for equal strings built at run time.
    if method == 'svds':
        Ut, St, Vt = svds(Y, k)
        # sort the triplets by decreasing singular value
        idx = np.argsort(St)[::-1]
        St = St[idx]  # have issue with sorting zero singular values
        Ut, Vt = svd_flip(Ut[:, idx], Vt[idx])
    elif method == 'random':
        Ut, St, Vt = randomized_svd(Y, k)
    else:
        Ut, St, Vt = np.linalg.svd(Y, full_matrices=False)

    # now truncate it to k
    Ut = Ut[:, :k]
    St = np.diag(St[:k])
    Vt = Vt[:k, :]

    return Ut, St, Vt
def fit(self):
    """Fit the model by computing full SVD on m.

    SVD factors the matrix m as u * np.diag(s) * v, where u and v are
    unitary and s is a 1-d array of m's singular values. Note that the SVD
    is commonly written as a = U S V.H, and the v returned by
    np.linalg.svd is V.H (the Hermitian transpose). Therefore, we denote
    V.H as vt, and transpose it back into the actual v, denoted just v.

    The decomposition uses np.linalg.svd with full_matrices=False, so for
    m with shape (M, N), the shape of:
     - u is (M, K)
     - vt is (K, N)
    where K = min(M, N)

    Inertia is the percentage of explained variance.

    Returns
    -------
    self, to enable method chaining
    """
    self.n_samples, self.n_features = self.ms.shape
    self.u, self.s, self.vt = np.linalg.svd(self.ms, full_matrices=False)

    # Remove the sign ambiguity of the singular vectors so that the
    # output is deterministic. ``self.u_based_decision`` selects whether
    # the sign is decided from u or from vt; per the original note,
    # False is used (instead of svd_flip's default True) so that the
    # resulting components and loadings have high positive coefficients.
    # svd_flip expects the (K, N) factor, i.e. vt - not its transpose.
    self.u, self.vt = svd_flip(
        self.u, self.vt, u_based_decision=self.u_based_decision
    )
    self.v = self.vt.T

    # Keep only the eigenvalues strictly above the threshold.
    # *keep* is the number of components retained.
    self.eigenvalues = self.s ** 2 / self.n_samples
    self.keep = np.count_nonzero(self.eigenvalues > self.threshold)

    self.inertia = (self.eigenvalues / self.eigenvalues.sum())[: self.keep]
    self.cumulative_inertia = self.inertia.cumsum()[: self.keep]
    self.eigenvalues = self.eigenvalues[: self.keep]

    return self
def fit(self, X, Y):
    """Fit model to data.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples and
        n_features is the number of predictors.

    Y : array-like, shape = [n_samples, n_targets]
        Target vectors, where n_samples is the number of samples and
        n_targets is the number of response variables.

    Returns
    -------
    self
        With ``x_mean_``, ``y_mean_``, ``x_std_``, ``y_std_``,
        ``x_weights_``, ``y_weights_``, ``x_scores_`` and ``y_scores_`` set.

    Raises
    ------
    ValueError
        If ``n_components`` is larger than the column count of both
        ``X`` and ``Y``.
    """
    check_consistent_length(X, Y)
    # A copy is requested via ``self.copy`` because the data is centered
    # (and optionally scaled) below.
    X = check_array(X, dtype=np.float64, copy=self.copy)
    Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False)
    if Y.ndim == 1:
        Y = Y.reshape(-1, 1)

    if self.n_components > max(Y.shape[1], X.shape[1]):
        raise ValueError("Invalid number of components n_components=%d"
                         " with X of shape %s and Y of shape %s."
                         % (self.n_components, str(X.shape), str(Y.shape)))

    # Center / scale
    X, Y, self.x_mean_, self.y_mean_, self.x_std_, self.y_std_ = (
        _center_scale_xy(X, Y, self.scale))

    # svd(X'Y)
    C = np.dot(X.T, Y)

    # The ARPACK solver cannot extract every component, so the dense
    # solver is used when all of them are requested; otherwise only the
    # interesting components are computed with ARPACK.
    if self.n_components >= np.min(C.shape):
        U, s, V = svd(C, full_matrices=False)
    else:
        U, s, V = svds(C, k=self.n_components)
        # svds does not follow the dense-SVD convention of descending
        # singular values; reorder so that both branches return the
        # components from most to least important.
        order = np.argsort(s)[::-1]
        U, s, V = U[:, order], s[order], V[order]

    # Deterministic output
    U, V = svd_flip(U, V)
    V = V.T
    self.x_scores_ = np.dot(X, U)
    self.y_scores_ = np.dot(Y, V)
    self.x_weights_ = U
    self.y_weights_ = V
    return self
def partial_fit_transform(self, X):
    """Update the incremental PCA statistics with ``X`` and project it.

    4-D and 5-D inputs are flattened to 2-D for processing and the result
    is reshaped back to the input shape. Only rows that contain at least
    one value different from ``self.pad_value`` are used and modified.
    While ``self.n_samples_seen < self.n_samples_train`` the running
    mean/variance, components, singular values and explained variance are
    updated under ``self.lock``.

    NOTE(review): ``X[whr]`` is assigned in place; whether this also
    modifies the caller's array depends on ``np.reshape`` returning a
    view - confirm against callers.
    NOTE(review): the nesting of the final projection (applied after both
    the fitting and the non-fitting branch) was reconstructed from a
    whitespace-collapsed source - confirm against the original file.
    """
    shp = X.shape
    # Flatten image-like batches to (rows, features)
    if X.ndim == 4:
        X = np.reshape(X, (shp[0], -1))
    elif X.ndim == 5:
        X = np.reshape(X, (shp[0] * shp[1], -1))
    # Indices of the rows that are not entirely padding
    whr = np.where(np.any(X != self.pad_value, axis=1))[0]
    if len(whr) > 0:
        if self.n_samples_seen < self.n_samples_train:
            self.lock.acquire()
            try:
                # Update stats - they are 0 if this is the first step
                col_mean, col_var, n_total_samples = \
                    _incremental_mean_and_var(
                        X[whr], last_mean=self.mean,
                        last_variance=self.var,
                        last_sample_count=np.repeat(self.n_samples_seen,
                                                    X[whr].shape[1]))
                # the count comes back per column; keep a single value
                n_total_samples = n_total_samples[0]
                if self.n_samples_seen == 0:
                    X[whr] = X[whr] - col_mean
                    _X = X[whr]
                else:
                    col_batch_mean = np.mean(X[whr], axis=0)
                    X[whr] = X[whr] - col_batch_mean
                    # Build matrix of combined previous basis and new data
                    mean_correction = np.sqrt(
                        (self.n_samples_seen * X[whr].shape[0]) /
                        n_total_samples) * (self.mean - col_batch_mean)
                    _X = np.vstack((self.singular_values.reshape(
                        (-1, 1)) * self.components, X[whr],
                        mean_correction))
                U, S, V = np.linalg.svd(_X, full_matrices=False)
                U, V = svd_flip(U, V, u_based_decision=False)
                explained_variance = S**2 / (n_total_samples - 1)
                # Store the updated state, truncated to max_components
                self.n_samples_seen = n_total_samples
                self.components = V[:self.max_components]
                self.singular_values = S[:self.max_components]
                self.mean = col_mean
                self.var = col_var
                self.explained_variance = explained_variance[:self.
                                                             max_components]
            finally:
                # always release, even if the update above raised
                self.lock.release()
        else:
            X[whr] = X[whr] - self.mean
        # Project on the components, rescale by
        # sqrt(explained_variance + epsilon), and map back to feature space
        X[whr] = np.dot((np.dot(X[whr], self.components.T) /
                         np.sqrt(self.explained_variance + self.epsilon)),
                        self.components)
    return np.reshape(X, shp)
def sym_eigen_to_mat_singular(V, s, method='mult'):
    """Convert symmetric-eigenproblem output to singular vectors/values.

    Parameters
    ----------
    V : array
        Eigenvectors, one per column.
    s : array
        Eigenvalues.
    method : {'mult', 'block'}
        ``'mult'`` keeps ``V`` and takes the square root of ``s``.
        ``'block'`` picks one eigenvector out of every consecutive pair,
        using the rows of ``V`` from the module-level ``A_m`` onwards,
        and keeps every second eigenvalue.

    Returns
    -------
    V, sigmas

    Raises
    ------
    Exception
        If ``method`` is not recognized.
    """
    if method == 'mult':
        V, s = V, np.sqrt(s)
    elif method == 'block':
        rank = np.argwhere(s > 1e-15).size
        assert rank % 2 == 0  # rank should be even, else we have a
        # non-convergence issue
        ind_vo = 0
        # Integer division: array shapes and range() need ints.
        Vo = np.empty((A_n, rank // 2))
        for i in range(0, s.size - 1, 2):
            j = i
            if i == 0:
                # prefer the candidate whose selected part is closer to
                # unit norm
                if np.abs(np.inner(V[A_m:, 1], V[A_m:, 1]) -
                          1) < np.abs(np.inner(V[A_m:, 0], V[A_m:, 0]) - 1):
                    j = 1
            else:
                viol_ni = np.abs(np.inner(V[A_m:, i], V[A_m:, i]) - 1)
                viol_nj = np.abs(
                    np.inner(V[A_m:, i + 1], V[A_m:, i + 1]) - 1)
                if viol_ni < viol_nj / 10:
                    pass
                elif viol_ni > viol_nj * 10:
                    j = i + 1
                else:
                    # Norm violations are comparable: fall back to the
                    # overlap with already selected vectors.
                    ipi = 0
                    ip1 = 0
                    for k in range(i // 2 - 1):
                        ipi += np.abs(np.inner(V[A_m:, i], Vo[:, k]))
                        ip1 += np.abs(np.inner(V[A_m:, i + 1], Vo[:, k]))
                    if ip1 < ipi:
                        j = i + 1
            if s[j] < 1e-15:  # no more eigenvalues
                continue
            Vo[:, ind_vo] = V[A_m:, j]
            ind_vo += 1
        s = s[0:rank:2]
        V, s = Vo, s
    else:
        raise Exception('method {} not recognized'.format(method))

    # Sign normalization decided on V only; the identity is a placeholder
    # for the other factor.
    k = V.shape[1]
    _, V = svd_flip(np.eye(k), V.T, u_based_decision=False)
    V = V.T
    return V, s
def Solve(self, K):
    """Eigendecompose the kernel matrix ``K`` and store the result.

    Sets ``self.lambdas_`` (eigenvalues, descending) and
    ``self.vectors_`` (matching eigenvectors as columns). ``K`` itself is
    returned.
    """
    # Select the method used to compute the eigenvalues
    solver = self.eigen_solver
    if solver == 'auto':
        large_and_few = K.shape[0] > 200 and self.n_components < 10
        solver = 'arpack' if large_and_few else 'dense'

    # Eigenvalues / eigenvectors of the kernel
    if solver == 'dense':
        first_idx = K.shape[0] - self.n_components
        last_idx = K.shape[0] - 1
        self.lambdas_, self.vectors_ = linalg.eigh(
            K, eigvals=(first_idx, last_idx))
    elif solver == 'arpack':
        rng = check_random_state(self.random_state)
        # initialize with [-1,1] as in ARPACK
        start = rng.uniform(-1, 1, K.shape[0])
        self.lambdas_, self.vectors_ = eigsh(K, self.n_components,
                                             which="LA",
                                             tol=self.tol,
                                             maxiter=self.max_iter,
                                             v0=start)

    # make sure that the eigenvalues are ok and fix numerical issues
    self.lambdas_ = _check_psd_eigenvalues(self.lambdas_,
                                           enable_warnings=False)

    # flip eigenvectors' sign to enforce deterministic output; the second
    # argument is only a scratch array of the matching shape
    scratch = np.empty_like(self.vectors_).T
    self.vectors_, _ = svd_flip(self.vectors_, scratch)

    # sort eigenvectors in descending order
    order = self.lambdas_.argsort()[::-1]
    self.lambdas_ = self.lambdas_[order]
    self.vectors_ = self.vectors_[:, order]

    # remove eigenvectors with a zero eigenvalue (null space) if required
    if self.remove_zero_eig:
        positive = self.lambdas_ > 0
        self.vectors_ = self.vectors_[:, positive]
        self.lambdas_ = self.lambdas_[positive]

    return K
def customSVD(D, f, e, F, p, Bi, n, m, svd_power, t):
    """Truncated SVD computed through a range finder and a thin SVD.

    A basis ``Q`` comes from ``range_finder``; the thin matrix ``B``
    (stored transposed) is filled by the frequency-based ``halko``
    routine when ``F`` is None and by the guided one otherwise, then
    decomposed with a dense SVD. Returns the first ``e`` components as
    ``(U, s, V)``.
    """
    basis = range_finder(D, f, e, F, p, Bi, n, m, svd_power, t)

    # B.T = dot(E.T, Q), written into this buffer by the routines below
    proj_t = np.zeros((n, basis.shape[1]), dtype=np.float32)
    if F is not None:
        halko.matMulTrans_Guide(D, f, F, p, basis, proj_t, Bi, n, m, t)
    else:
        halko.matMulTrans_Freq(D, f, basis, proj_t, Bi, n, m, t)

    # SVD on thin matrix
    small_left, sing, right = linalg.svd(proj_t.T, full_matrices=False)
    del proj_t  # release the buffer early

    left = np.dot(basis, small_left)
    # Correct sign
    left, right = svd_flip(left, right)
    return left[:, :e], sing[:e], right[:e, :]
def _fit_transform(self, K):
    """Fit using the kernel matrix ``K``.

    ``K`` is centered, its top ``n_components`` eigenpairs are stored in
    ``self.lambdas_`` / ``self.alphas_`` (descending, strictly positive
    eigenvalues only), and the centered kernel is returned.
    """
    # center kernel
    K = self._centerer.fit_transform(K)

    n = K.shape[0]
    self.lambdas_, self.alphas_ = linalg.eigh(
        K, eigvals=(n - self.n_components, n - 1))

    # make sure that the eigenvalues are ok and fix numerical issues
    self.lambdas_ = _check_psd_eigenvalues(self.lambdas_,
                                           enable_warnings=False)

    # flip eigenvectors' sign to enforce deterministic output; the second
    # argument is only a scratch array of the matching shape
    scratch = np.empty_like(self.alphas_).T
    self.alphas_, _ = svd_flip(self.alphas_, scratch)

    # sort eigenvectors in descending order
    order = self.lambdas_.argsort()[::-1]
    self.lambdas_ = self.lambdas_[order]
    self.alphas_ = self.alphas_[:, order]

    # remove eigenvectors with a zero eigenvalue (null space)
    positive = self.lambdas_ > 0
    self.alphas_ = self.alphas_[:, positive]
    self.lambdas_ = self.lambdas_[positive]

    # Maintenance note on Eigenvectors normalization
    # ----------------------------------------------
    # there is a link between
    # the eigenvectors of K=Phi(X)'Phi(X) and the ones of Phi(X)Phi(X)'
    # if v is an eigenvector of K
    #     then Phi(X)v  is an eigenvector of Phi(X)Phi(X)'
    # if u is an eigenvector of Phi(X)Phi(X)'
    #     then Phi(X)'u is an eigenvector of Phi(X)Phi(X)'
    #
    # At this stage our self.alphas_ (the v) have norm 1, we need to scale
    # them so that eigenvectors in kernel feature space (the u) have norm=1
    # instead
    #
    # We COULD scale them here:
    #       self.alphas_ = self.alphas_ / np.sqrt(self.lambdas_)
    #
    # But choose to perform that LATER when needed, in `fit()` and in
    # `transform()`.

    return K
def _svd_train(self, data_carray):
    """Linear PCA training routine, used also by KernelPCA.

    Decomposes ``data_carray`` with a dense SVD and stores the leading
    ``n_components`` singular values, left vectors, components and
    explained variance on ``self``. A fractional ``n_components`` in
    (0, 1) is replaced by the matching component count. Returns ``self``.
    """
    from numpy import linalg
    from sklearn.utils.extmath import svd_flip

    # SVD of the data as a 2-D ndarray
    dense = data_carray.atleast_2d().tondarray()
    left, sing, right = linalg.svd(dense, full_matrices=False)
    # flip eigenvectors' sign to enforce deterministic output
    left, right = svd_flip(left, right)

    vecs = CArray(left)
    vals = CArray(sing)
    comps = CArray(right)

    # Sort eigenvalues/eigenvectors by decreasing eigenvalue
    order = (-vals).argsort(axis=None)
    vals = CArray(vals[order])
    vecs = CArray(vecs[:, order]).atleast_2d()
    comps = CArray(comps[order, :]).atleast_2d()

    # percentage of variance explained by each component
    var_all = (vals**2) / (data_carray.shape[0] - 1)
    ratio_all = var_all / var_all.sum()

    if 0 < self.n_components < 1.0:
        # number of components for which the cumulated explained variance
        # percentage is superior to the desired threshold
        cumulated = ratio_all.cumsum()
        self.n_components = CArray(
            cumulated < self.n_components).sum() + 1

    # Consider only n_components
    keep = self.n_components
    self._eigenval = CArray(vals[:keep])
    self._eigenvec = CArray(vecs[:, :keep])
    self._components = CArray(comps[:keep, :])

    # storing explained variance of n_components only
    self._explained_variance = var_all[:keep]
    self._explained_variance_ratio = ratio_all[:keep]

    return self
def PCA_values(data, centered=True):
    """Compute principal components of ``data`` through a reduced SVD.

    Parameters
    ----------
    data : 2-D array
        Input matrix; the first axis is treated as samples (its length is
        what is passed on to ``varianceExplained``).
    centered : bool, default True
        When true, the column means (``mean(data, axis=0)``) are subtracted
        before the decomposition. When false, ``data`` is decomposed as-is.

    Returns
    -------
    components_ : array
        The right singular vectors (rows of ``V``) after sign
        normalisation with ``svd_flip``.
    explained_variance_ratio_
        Whatever ``varianceExplained(S, n_samples)`` returns for the
        singular values ``S``.
    """
    # Unpacking also enforces that ``data`` is two-dimensional.
    n_samples, _ = data.shape

    # By default, the data are centered
    data_centered = data - mean(data, axis=0) if centered else data

    # apply the Singular Value Decomposition
    U, S, V = linalg.svd(data_centered, full_matrices=False)
    # flip eigenvectors' sign to enforce deterministic output
    U, V = svd_flip(U, V)

    # components
    components_ = V
    # variance explained by PCs
    explained_variance_ratio_ = varianceExplained(S, n_samples)
    return components_, explained_variance_ratio_
def _pca(self, Y, k):
    """PCA using a truncated SVD (``svds``) of the Gram matrix ``Y.T @ Y``.

    Arguments
    ---------
    Y : np.array
        p x n normalized genotype matrix
    k : int
        rank used for truncated svd

    Returns
    -------
    A tuple ``(L, F, Sigma)`` with elements

    L : np.array
        loadings matrix, computed as ``(F.T @ Y).T``
    F : np.array
        factor matrix, computed as ``Y @ V @ inv(Sigma)``
    Sigma : np.array
        k x k diagonal matrix holding the square roots of the values
        returned by ``svds`` for ``Y.T @ Y``
    """
    # compute truncated svd of the Gram matrix
    gram = Y.T @ Y
    V, lamb, VT = svds(gram, k)

    # The outputs are reversed here and below; this presumably assumes
    # svds reports its values in ascending order -- TODO confirm for the
    # SciPy version in use.
    sigma = np.sqrt(lamb[::-1])
    Sigma = np.diag(sigma)
    Sigma_inv = np.diag(1. / sigma)

    # flip signs of right eigenvectors
    V, VT = svd_flip(V[:, ::-1], VT[::-1])

    # factors, then project the data onto them to get the loadings
    F = Y @ V @ Sigma_inv
    L = (F.T @ Y).T

    return (L, F, Sigma)
def fit(self, X):
    """Compute the leading principal axes of ``X`` and store them in ``self.w``.

    Parameters
    ----------
    X : 2-d numpy array
        Data matrix; rows are samples, columns are features. It is centred
        column-wise before the decomposition.

    Notes
    -----
    ``self.use_svd`` selects the method:

    * ``'eig'`` -- eigen-decomposition of the scatter matrix
      ``X_n.T @ X_n``;
    * ``'svd'`` -- reduced SVD of the centred data, with ``svd_flip`` for a
      deterministic sign.

    Both methods store ``self.w`` with one principal axis per row, ordered
    by decreasing variance, ``self.n_components`` rows in total. The sign
    of each axis is normalised only in the ``'svd'`` branch. Any other
    value of ``self.use_svd`` leaves ``self.w`` untouched. Nothing is
    returned.
    """
    # Actually, the 'eig' method and the svd method
    # are the same thing
    mean_X = np.mean(X, axis=0)
    X_n = X - mean_X
    if self.use_svd == 'eig':
        sigma = np.dot(X_n.T, X_n)
        # sigma is symmetric, so eigh applies: it returns real eigenvalues
        # and stores the eigenvectors as COLUMNS.
        eigvalue, eigvector = np.linalg.eigh(sigma)
        # Indices of the n_components largest eigenvalues, largest first.
        order = np.argsort(eigvalue)[::-1][:self.n_components]
        # Select columns, then transpose so that rows are components,
        # matching the layout produced by the 'svd' branch.
        self.w = eigvector[:, order].T
    elif self.use_svd == 'svd':
        # Don't forget the 'full_matrices'=False
        # as it's for reduced SVD
        U, S, V = np.linalg.svd(X_n, full_matrices=False)
        # flip eigenvectors' sign
        # to enforce deterministic output (from: sklearn/pca.py)
        U, V = svd_flip(U, V)
        self.w = V[:self.n_components]
def test_normalized_gives_correct_result(self, prepare_table):
    """Check that the widget's normalisation option yields the expected PCA.

    Some entries of ``self.iris.X`` are zeroed (in place) using a fixed
    seed, the table is pushed through the widget with normalisation
    enabled, and the output is compared with a two-component embedding
    computed directly from a standardised copy of the data.
    """
    # Randomly set some values to zero
    rng = check_random_state(42)
    zero_mask = rng.beta(1, 2, size=self.iris.X.shape) > 0.5
    self.iris.X[zero_mask] = 0

    data = prepare_table(self.iris)

    # Enable normalization and run data through widget
    self.widget.controls.normalize.setChecked(True)
    self.send_signal(self.widget.Inputs.data, data)
    self.wait_until_stop_blocking()
    widget_result = self.get_output(self.widget.Outputs.transformed_data)

    # Compute the correct embedding: standardise, decompose, keep the
    # first two components, then fix the sign
    raw = self.iris.X
    x = (raw - raw.mean(0)) / raw.std(0)
    U, S, Va = np.linalg.svd(x)
    U = U[:, :2]
    S = S[:2]
    Va = Va[:2]
    U, Va = svd_flip(U, Va)
    pca_embedding = U * S

    np.testing.assert_almost_equal(widget_result.X, pca_embedding)
def partial_fit(self, X, y=None, check_input=True):
    """Incremental fit with X. All of X is processed as a single batch.

    Batches are cached in ``self._cache_batches`` whenever the incoming
    batch has fewer rows than columns or the cache is already in use. The
    model is only updated once the cached batches hold at least
    ``n_features`` samples in total; they are then concatenated into one
    batch and the cache is cleared.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples and
        n_features is the number of features.
    y: ignored
        Not used by this method.
    check_input: bool, default True
        When true, ``X`` is validated and converted with ``check_array``
        (honouring ``self.copy``). When false, ``X`` is used as given and
        may be centred in place.

    Returns
    -------
    self: object
        Returns the instance itself, including when the batch was only
        cached and when the accumulated variance is zero.

    Raises
    ------
    ValueError
        If ``self.n_components`` is not in ``[1, n_features]``, or if the
        number of stored components differs from ``self.n_components_``.
    """
    # ====== check the samples and caches ====== #
    if isinstance(X, Data):
        X = X[:]
    if check_input:
        X = check_array(X, copy=self.copy, dtype=[np.float64, np.float32])
    n_samples, n_features = X.shape

    # check number of components
    if self.n_components is None:
        self.n_components_ = n_features
    elif not 1 <= self.n_components <= n_features:
        raise ValueError("n_components=%r invalid for n_features=%d, need "
                         "more rows than columns for IncrementalPCA "
                         "processing" % (self.n_components, n_features))
    else:
        self.n_components_ = self.n_components

    # check the cache
    if n_samples < n_features or self._nb_cached_samples > 0:
        self._cache_batches.append(X)
        self._nb_cached_samples += n_samples
        # not enough samples yet: keep caching, but still honour the
        # documented contract of returning the instance
        if self._nb_cached_samples < n_features:
            return self
        # group mini batches into one big batch
        X = np.concatenate(self._cache_batches, axis=0)
        self._cache_batches = []
        self._nb_cached_samples = 0
        n_samples = X.shape[0]

    # ====== fit the model ====== #
    if (self.components_ is not None) and (self.components_.shape[0] !=
                                           self.n_components_):
        raise ValueError("Number of input features has changed from %i "
                         "to %i between calls to partial_fit! Try "
                         "setting n_components to a fixed value."
                         % (self.components_.shape[0], self.n_components_))

    # Update stats - they are 0 if this is the first step
    col_mean, col_var, n_total_samples = \
        _incremental_mean_and_var(X, last_mean=self.mean_,
                                  last_variance=self.var_,
                                  last_sample_count=self.n_samples_seen_)
    total_var = np.sum(col_var * n_total_samples)
    if total_var == 0:  # if variance == 0, make no sense to continue
        return self

    # Centering
    if self.n_samples_seen_ == 0:
        # If it is the first step, simply centre X on the column means
        X -= col_mean
    else:
        col_batch_mean = np.mean(X, axis=0)
        X -= col_batch_mean
        # Build matrix of combined previous basis and new data
        mean_correction = \
            np.sqrt((self.n_samples_seen_ * n_samples) /
                    n_total_samples) * (self.mean_ - col_batch_mean)
        X = np.vstack((self.singular_values_.reshape((-1, 1)) *
                       self.components_, X, mean_correction))

    U, S, V = linalg.svd(X, full_matrices=False)
    U, V = svd_flip(U, V, u_based_decision=False)
    explained_variance = S ** 2 / n_total_samples
    explained_variance_ratio = S ** 2 / total_var

    self.n_samples_seen_ = n_total_samples
    self.components_ = V[:self.n_components_]
    self.singular_values_ = S[:self.n_components_]
    self.mean_ = col_mean
    self.var_ = col_var
    self.explained_variance_ = explained_variance[:self.n_components_]
    self.explained_variance_ratio_ = \
        explained_variance_ratio[:self.n_components_]
    if self.n_components_ < n_features:
        self.noise_variance_ = \
            explained_variance[self.n_components_:].mean()
    else:
        self.noise_variance_ = 0.
    return self
def _fit_truncated(self, X, n_components, svd_solver):
    """Fit the model by computing a truncated SVD of X.

    The decomposition is done with ARPACK (``svd_solver == "arpack"``) or
    with ``randomized_pca`` (``svd_solver == "randomized"``).

    Parameters
    ----------
    X : 2-D array
        Data to decompose. It is centred IN PLACE in the ARPACK branch.
    n_components : int
        Number of components to extract; must lie between 1 and
        ``min(n_samples, n_features)`` (strictly below it for ARPACK).
    svd_solver : str
        ``"arpack"`` or ``"randomized"``.

    Returns
    -------
    U, S, V
        The truncated factors, ordered by decreasing singular value.

    Raises
    ------
    ValueError
        If ``n_components`` is a string, out of range, not an integer, or
        equal to ``min(n_samples, n_features)`` with the ARPACK solver.
    """
    n_samples, n_features = X.shape
    max_rank = min(n_samples, n_features)

    # Validation, in the same order as the checks it replaces: the string
    # test must come first so the range comparison never sees a string.
    if isinstance(n_components, six.string_types):
        raise ValueError(
            "n_components=%r cannot be a string with svd_solver='%s'"
            % (n_components, svd_solver)
        )
    if not 1 <= n_components <= max_rank:
        raise ValueError(
            "n_components=%r must be between 1 and min(n_samples, "
            "n_features)=%r with svd_solver='%s'"
            % (n_components, max_rank, svd_solver)
        )
    if not isinstance(n_components, (numbers.Integral, np.integer)):
        raise ValueError(
            "n_components=%r must be of type int when greater than or "
            "equal to 1, was of type=%r"
            % (n_components, type(n_components))
        )
    if svd_solver == "arpack" and n_components == max_rank:
        raise ValueError(
            "n_components=%r must be strictly less than min(n_samples, "
            "n_features)=%r with svd_solver='%s'"
            % (n_components, max_rank, svd_solver)
        )

    random_state = check_random_state(self.random_state)

    # Both statistics are taken before any centering of X
    self.mean_ = X.mean(axis=0)
    total_var = ut.var(X, axis=0, ddof=1)

    if svd_solver == "arpack":
        # Center data
        X -= self.mean_
        # random init solution, as ARPACK does it internally
        v0 = random_state.uniform(-1, 1, size=min(X.shape))
        U, S, V = sp.linalg.svds(X, k=n_components, tol=self.tol, v0=v0)
        # svds doesn't abide by scipy.linalg.svd/randomized_svd
        # conventions, so reverse its outputs.
        S = S[::-1]
        # flip eigenvectors' sign to enforce deterministic output
        U, V = svd_flip(U[:, ::-1], V[::-1])
    elif svd_solver == "randomized":
        # sign flipping is done inside
        U, S, V = randomized_pca(
            X,
            n_components=n_components,
            n_iter=self.iterated_power,
            flip_sign=True,
            random_state=random_state,
        )

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = V
    self.n_components_ = n_components

    # Get variance explained by singular values
    total_var_sum = total_var.sum()
    self.explained_variance_ = (S ** 2) / (n_samples - 1)
    self.explained_variance_ratio_ = self.explained_variance_ / total_var_sum
    self.singular_values_ = S.copy()  # Store the singular values.

    if self.n_components_ < max_rank:
        # average of the variance left over by the discarded components
        residual_var = total_var_sum - self.explained_variance_.sum()
        self.noise_variance_ = residual_var / (max_rank - n_components)
    else:
        self.noise_variance_ = 0

    return U, S, V
def partial_fit(self, X, y=None):
    """Incremental fit with X. All of X is processed as a single batch.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples and
        n_features is the number of features. It is converted with
        ``np.asarray`` (no ``check_array`` validation) and is never
        modified: centering works on a new array, so integer input is
        accepted as well.
    y: ignored
        Not used by this method.

    Returns
    -------
    self: object
        Returns the instance itself.

    Raises
    ------
    ValueError
        If ``self.n_components`` is not in ``[1, n_features]``, or if the
        number of stored components differs from ``self.n_components_``.
    """
    # check_array is deliberately bypassed here (--- ADJUSTED)
    X = np.asarray(X)
    n_samples, n_features = X.shape
    if not hasattr(self, 'components_'):
        self.components_ = None

    if self.n_components is None:
        self.n_components_ = n_features
    elif not 1 <= self.n_components <= n_features:
        raise ValueError("n_components=%r invalid for n_features=%d, need "
                         "more rows than columns for IncrementalPCA "
                         "processing" % (self.n_components, n_features))
    else:
        self.n_components_ = self.n_components

    if (self.components_ is not None) and (self.components_.shape[0] !=
                                           self.n_components_):
        raise ValueError("Number of input features has changed from %i "
                         "to %i between calls to partial_fit! Try "
                         "setting n_components to a fixed value." % (
                             self.components_.shape[0], self.n_components_))

    if self.components_ is None:
        # This is the first pass through partial_fit
        self.n_samples_seen_ = 0
        col_var = X.var(axis=0)
        col_mean = X.mean(axis=0)
        # Out-of-place on purpose: np.asarray may return the caller's own
        # array, and an in-place subtraction would both overwrite it and
        # fail for integer dtypes.
        X = X - col_mean
        U, S, V = linalg.svd(X, full_matrices=False)
        U, V = svd_flip(U, V, u_based_decision=False)
        explained_variance = S ** 2 / n_samples
        explained_variance_ratio = S ** 2 / np.sum(col_var * n_samples)
    else:
        col_batch_mean = X.mean(axis=0)
        col_mean, col_var, n_total_samples = _batch_mean_variance_update(
            X, self.mean_, self.var_, self.n_samples_seen_)
        # Out-of-place for the same reason as in the first pass
        X = X - col_batch_mean
        # Build matrix of combined previous basis and new data
        mean_correction = np.sqrt((self.n_samples_seen_ * n_samples) /
                                  n_total_samples) * (self.mean_ -
                                                      col_batch_mean)
        X_combined = np.vstack((self.singular_values_.reshape((-1, 1)) *
                                self.components_, X, mean_correction))
        U, S, V = linalg.svd(X_combined, full_matrices=False)
        U, V = svd_flip(U, V, u_based_decision=False)
        explained_variance = S ** 2 / n_total_samples
        explained_variance_ratio = S ** 2 / np.sum(col_var * n_total_samples)

    self.n_samples_seen_ += n_samples
    self.components_ = V[:self.n_components_]
    self.singular_values_ = S[:self.n_components_]
    self.mean_ = col_mean
    self.var_ = col_var
    self.explained_variance_ = explained_variance[:self.n_components_]
    self.explained_variance_ratio_ = \
        explained_variance_ratio[:self.n_components_]
    # NOTE: ``noise_variance_`` is intentionally not computed in this
    # adjusted version (the original computation was disabled).
    return self
def fit(self, X, Y):
    """Fit model to data.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples in the number of samples and
        n_features is the number of predictors.

    Y : array-like of response, shape = [n_samples, n_targets]
        Target vectors, where n_samples in the number of samples and
        n_targets is the number of response variables. A 1-D ``Y`` is
        reshaped to a single column.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``n_components`` is outside ``[1, n_features]``, the algorithm
        is neither ``"svd"`` nor ``"nipals"``, mode ``"B"`` is combined
        with the svd algorithm, or the deflation mode is unknown.
    """
    # copy since this will contains the residuals (deflated) matrices
    check_consistent_length(X, Y)
    X = check_array(X, dtype=np.float64, copy=self.copy)
    Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False)
    if Y.ndim == 1:
        Y = Y.reshape(-1, 1)

    n, p = X.shape[0], X.shape[1]
    q = Y.shape[1]

    if self.n_components < 1 or self.n_components > p:
        raise ValueError('Invalid number of components: %d' %
                         self.n_components)
    if self.algorithm not in ("svd", "nipals"):
        raise ValueError("Got algorithm %s when only 'svd' "
                         "and 'nipals' are known" % self.algorithm)
    if self.algorithm == "svd" and self.mode == "B":
        raise ValueError('Incompatible configuration: mode B is not '
                         'implemented with svd algorithm')
    if self.deflation_mode not in ["canonical", "regression"]:
        raise ValueError('The deflation mode is unknown')

    # Scale (in place)
    X, Y, self.x_mean_, self.y_mean_, self.x_std_, self.y_std_ = (
        _center_scale_xy(X, Y, self.scale))

    # Residuals (deflated) matrices
    Xk = X
    Yk = Y

    # Results matrices
    n_comp = self.n_components
    self.x_scores_ = np.zeros((n, n_comp))
    self.y_scores_ = np.zeros((n, n_comp))
    self.x_weights_ = np.zeros((p, n_comp))
    self.y_weights_ = np.zeros((q, n_comp))
    self.x_loadings_ = np.zeros((p, n_comp))
    self.y_loadings_ = np.zeros((q, n_comp))
    self.n_iter_ = []

    eps = np.finfo(np.double).eps

    # NIPALS algo: outer loop, over components
    for k in range(n_comp):
        if np.all(np.dot(Yk.T, Yk) < eps):
            # Yk constant
            warnings.warn('Y residual constant at iteration %s' % k)
            break

        # 1) weights estimation (inner loop)
        # -----------------------------------
        if self.algorithm == "nipals":
            x_weights, y_weights, n_iter_ = _nipals_twoblocks_inner_loop(
                X=Xk, Y=Yk, mode=self.mode, max_iter=self.max_iter,
                tol=self.tol, norm_y_weights=self.norm_y_weights)
            self.n_iter_.append(n_iter_)
        elif self.algorithm == "svd":
            x_weights, y_weights = _svd_cross_product(X=Xk, Y=Yk)

        # Forces sign stability of x_weights and y_weights
        # Sign undeterminacy issue from svd if algorithm == "svd"
        # and from platform dependent computation if algorithm == 'nipals'
        x_weights, y_weights = svd_flip(x_weights, y_weights.T)
        y_weights = y_weights.T

        # compute scores
        x_scores = np.dot(Xk, x_weights)
        if self.norm_y_weights:
            y_ss = 1
        else:
            y_ss = np.dot(y_weights.T, y_weights)
        y_scores = np.dot(Yk, y_weights) / y_ss

        # Sum of squares of the x scores; reused by the null-variance
        # test and by both regressions below.
        x_ss = np.dot(x_scores.T, x_scores)

        # test for null variance
        if x_ss < eps:
            warnings.warn('X scores are null at iteration %s' % k)
            break

        # 2) Deflation (in place)
        # ----------------------
        # Possible memory footprint reduction may done here: in order to
        # avoid the allocation of a data chunk for the rank-one
        # approximations matrix which is then subtracted to Xk, we suggest
        # to perform a column-wise deflation.
        #
        # - regress Xk's on x_score
        x_loadings = np.dot(Xk.T, x_scores) / x_ss
        # - subtract rank-one approximations to obtain remainder matrix
        Xk -= np.dot(x_scores, x_loadings.T)

        if self.deflation_mode == "canonical":
            # - regress Yk's on y_score, then subtract rank-one approx.
            y_loadings = (np.dot(Yk.T, y_scores)
                          / np.dot(y_scores.T, y_scores))
            Yk -= np.dot(y_scores, y_loadings.T)
        elif self.deflation_mode == "regression":
            # - regress Yk's on x_score, then subtract rank-one approx.
            y_loadings = np.dot(Yk.T, x_scores) / x_ss
            Yk -= np.dot(x_scores, y_loadings.T)

        # 3) Store weights, scores and loadings
        # Notation:
        self.x_scores_[:, k] = x_scores.ravel()  # T
        self.y_scores_[:, k] = y_scores.ravel()  # U
        self.x_weights_[:, k] = x_weights.ravel()  # W
        self.y_weights_[:, k] = y_weights.ravel()  # C
        self.x_loadings_[:, k] = x_loadings.ravel()  # P
        self.y_loadings_[:, k] = y_loadings.ravel()  # Q
    # Such that: X = TP' + Err and Y = UQ' + Err

    # 4) rotations from input space to transformed space (scores)
    # T = X W(P'W)^-1 = XW* (W* : p x k matrix)
    # U = Y C(Q'C)^-1 = YC* (W* : q x k matrix)
    x_inner = np.dot(self.x_loadings_.T, self.x_weights_)
    self.x_rotations_ = np.dot(
        self.x_weights_, linalg.pinv2(x_inner, **pinv2_args))
    if Y.shape[1] > 1:
        y_inner = np.dot(self.y_loadings_.T, self.y_weights_)
        self.y_rotations_ = np.dot(
            self.y_weights_, linalg.pinv2(y_inner, **pinv2_args))
    else:
        self.y_rotations_ = np.ones(1)

    # The regression coefficients are estimated for every deflation mode
    # (the former guard on this section was always true).
    # Regress Y on T
    # Y = TQ' + Err,
    # Then express in function of X
    # Y = X W(P'W)^-1Q' + Err = XB + Err
    # => B = W*Q' (p x q)
    self.coef_ = np.dot(self.x_rotations_, self.y_loadings_.T)
    self.coef_ = (1. / self.x_std_.reshape((p, 1)) * self.coef_ *
                  self.y_std_)
    return self