def test_svd_compressed_deterministic():
    m, n = 30, 25
    x = da.random.RandomState(1234).random_sample(size=(m, n), chunks=(5, 5))
    u, s, vt = svd_compressed(x, 3, seed=1234)
    u2, s2, vt2 = svd_compressed(x, 3, seed=1234)

    assert all(da.compute((u == u2).all(), (s == s2).all(), (vt == vt2).all()))

def test_svd_compressed():
    m, n = 2000, 250
    r = 10
    np.random.seed(4321)
    mat1 = np.random.randn(m, r)
    mat2 = np.random.randn(r, n)
    mat = mat1.dot(mat2)

    data = da.from_array(mat, chunks=(500, 50))

    u, s, vt = svd_compressed(data, r, seed=4321, n_power_iter=2)

    usvt = da.dot(u, da.dot(da.diag(s), vt))

    tol = 0.2
    assert_eq(
        da.linalg.norm(usvt), np.linalg.norm(mat), rtol=tol, atol=tol
    )  # average accuracy check

    u = u[:, :r]
    s = s[:r]
    vt = vt[:r, :]
    s_exact = np.linalg.svd(mat)[1]
    s_exact = s_exact[:r]

    assert_eq(np.eye(r, r), da.dot(u.T, u))  # u must be orthonormal
    assert_eq(np.eye(r, r), da.dot(vt, vt.T))  # v must be orthonormal
    assert_eq(s, s_exact)  # s must contain the singular values

def test_svd_compressed(iterator):
    m, n = 100, 50
    r = 5
    a = da.random.random((m, n), chunks=(m, n))

    # calculate approximation and true singular values
    u, s, vt = svd_compressed(
        a, 2 * r, iterator=iterator[0], n_power_iter=iterator[1], seed=4321
    )  # worst case
    s_true = scipy.linalg.svd(a.compute(), compute_uv=False)

    # compute the difference with original matrix
    norm = scipy.linalg.norm((a - (u[:, :r] * s[:r]) @ vt[:r, :]).compute(), 2)

    # ||a-a_hat||_2 <= (1+tol)s_{k+1}: based on eq. 1.10/1.11:
    # Halko, Nathan, Per-Gunnar Martinsson, and Joel A. Tropp.
    # "Finding structure with randomness: Probabilistic algorithms for constructing
    # approximate matrix decompositions." SIAM review 53.2 (2011): 217-288.
    frac = norm / s_true[r + 1] - 1
    # Tolerance determined via simulation to be slightly above the max norm of the
    # difference matrix in 10k samples.
    # See https://github.com/dask/dask/pull/6799#issuecomment-726631175 for more details.
    tol = 0.4
    assert frac < tol

    assert_eq(np.eye(r, r), da.dot(u[:, :r].T, u[:, :r]))  # u must be orthonormal
    assert_eq(np.eye(r, r), da.dot(vt[:r, :], vt[:r, :].T))  # v must be orthonormal

def test_svd_compressed_shapes(m, n, k, chunks):
    x = da.random.random(size=(m, n), chunks=chunks)
    u, s, v = svd_compressed(x, k=k, n_power_iter=1, compute=True, seed=1)
    u, s, v = da.compute(u, s, v)

    r = min(m, n, k)
    assert u.shape == (m, r)
    assert s.shape == (r,)
    assert v.shape == (r, n)

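# A minimal end-to-end sketch of the call pattern the tests above exercise (illustration
# only; it assumes numpy, dask.array, and dask.array.linalg.svd_compressed are available):
# build a low-rank matrix, take a compressed SVD, and check the reconstruction error.
import numpy as np
import dask.array as da
from dask.array.linalg import svd_compressed

rng = np.random.RandomState(0)
mat = rng.randn(500, 40) @ rng.randn(40, 200)  # exact rank 40
data = da.from_array(mat, chunks=(100, 50))

u, s, vt = svd_compressed(data, 40, n_power_iter=2, seed=0)
approx = da.dot(u, da.dot(da.diag(s), vt))

rel_err = (da.linalg.norm(data - approx) / da.linalg.norm(data)).compute()
print(rel_err)  # small, since the matrix has exact rank 40 and k == 40
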
def fit(self, A):
    if not hasattr(A, 'dask'):
        A = da.from_array(A, A.shape)

    n_comps = self.svd_kwargs.pop('n_components')
    _, _, vt = svd_compressed(A, n_comps, **self.svd_kwargs)

    self.components_ = vt
    return self

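# A minimal usage sketch for the fit() wrapper above (illustration only). The class name
# `CompressedSVD` and its constructor are hypothetical; the wrapper itself only needs a
# `svd_kwargs` dict containing 'n_components' plus extra svd_compressed keyword arguments,
# and it stores the right singular vectors in `components_`.
import numpy as np

class CompressedSVD:
    def __init__(self, **svd_kwargs):
        self.svd_kwargs = svd_kwargs

CompressedSVD.fit = fit  # reuse the fit() defined above

est = CompressedSVD(n_components=10, seed=0, n_power_iter=2)
est.fit(np.random.randn(1000, 200))
print(est.components_.shape)  # (10, 200), held as a lazy dask array
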
def test_svd_compressed():
    m, n = 300, 250
    r = 10
    np.random.seed(1234)
    mat1 = np.random.randn(m, r)
    mat2 = np.random.randn(r, n)
    mat = mat1.dot(mat2)

    data = from_array(mat, chunks=(50, 50))

    n_iter = 6
    for i in range(n_iter):
        u, s, vt = svd_compressed(data, r)
        u = np.array(u)
        s = np.array(s)
        vt = np.array(vt)
        if i == 0:
            usvt = np.dot(u, np.dot(np.diag(s), vt))
        else:
            usvt += np.dot(u, np.dot(np.diag(s), vt))
    usvt /= n_iter

    tol = 2e-1
    assert np.allclose(
        np.linalg.norm(usvt), np.linalg.norm(mat), rtol=tol, atol=tol
    )  # average accuracy check

    u, s, vt = svd_compressed(data, r)
    u = np.array(u)[:, :r]
    s = np.array(s)[:r]
    vt = np.array(vt)[:r, :]
    s_exact = np.linalg.svd(mat)[1]
    s_exact = s_exact[:r]

    assert np.allclose(np.eye(r, r), np.dot(u.T, u))  # u must be orthonormal
    assert np.allclose(np.eye(r, r), np.dot(vt, vt.T))  # v must be orthonormal
    assert np.allclose(s, s_exact)  # s must contain the singular values

def test_svd_compressed_dtype_preservation(input_dtype, output_dtype):
    x = da.random.random((50, 50), chunks=(50, 50)).astype(input_dtype)
    u, s, vt = svd_compressed(x, 1, seed=4321)

    assert u.dtype == s.dtype == vt.dtype == output_dtype

def partial_fit(self, X, y=None, check_input=True):
    """Incremental fit with X. All of X is processed as a single batch.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples and
        n_features is the number of features.
    check_input : bool
        Run check_array on X.
    y : Ignored

    Returns
    -------
    self : object
        Returns the instance itself.
    """
    if check_input:
        if sparse.issparse(X):
            raise TypeError(
                "IncrementalPCA.partial_fit does not support "
                "sparse input. Either convert data to dense "
                "or use IncrementalPCA.fit to do so in batches.")
        X = check_array(
            X,
            copy=self.copy,
            dtype=[np.float64, np.float32],
            accept_multiple_blocks=True,
        )
    n_samples, n_features = X.shape
    if not hasattr(self, "components_"):
        self.components_ = None

    if self.n_components is None:
        if self.components_ is None:
            self.n_components_ = min(n_samples, n_features)
        else:
            self.n_components_ = self.components_.shape[0]
    elif not 1 <= self.n_components <= n_features:
        raise ValueError("n_components=%r invalid for n_features=%d, need "
                         "more rows than columns for IncrementalPCA "
                         "processing" % (self.n_components, n_features))
    elif not self.n_components <= n_samples:
        raise ValueError("n_components=%r must be less or equal to "
                         "the batch number of samples "
                         "%d." % (self.n_components, n_samples))
    else:
        self.n_components_ = self.n_components

    if (self.components_ is not None) and (
            self.components_.shape[0] != self.n_components_):
        raise ValueError("Number of input features has changed from %i "
                         "to %i between calls to partial_fit! Try "
                         "setting n_components to a fixed value." %
                         (self.components_.shape[0], self.n_components_))

    # This is the first partial_fit
    if not hasattr(self, "n_samples_seen_"):
        self.n_samples_seen_ = 0
        self.mean_ = 0.0
        self.var_ = 0.0

    # Update stats - they are 0 if this is the first step
    # The next line is equivalent with np.repeat(self.n_samples_seen_, X.shape[1]),
    # which dask-array does not support
    last_sample_count = np.tile(np.expand_dims(self.n_samples_seen_, 0),
                                X.shape[1])
    col_mean, col_var, n_total_samples = _incremental_mean_and_var(
        X,
        last_mean=self.mean_,
        last_variance=self.var_,
        last_sample_count=last_sample_count,
    )
    n_total_samples = da.compute(n_total_samples[0])[0]

    # Whitening
    if self.n_samples_seen_ == 0:
        # If it is the first step, simply whiten X
        X -= col_mean
    else:
        col_batch_mean = np.mean(X, axis=0)
        X -= col_batch_mean
        # Build matrix of combined previous basis and new data
        mean_correction = np.sqrt(
            (self.n_samples_seen_ * n_samples) / n_total_samples
        ) * (self.mean_ - col_batch_mean)
        X = np.vstack((
            self.singular_values_.reshape((-1, 1)) * self.components_,
            X,
            mean_correction,
        ))

    # The following part is modified so that it can fit to large dask-array
    solver = self._get_solver(X, self.n_components_)
    if solver in {"full", "tsqr"}:
        U, S, V = linalg.svd(X)
        # manually implement full_matrix=False
        if V.shape[0] > len(S):
            V = V[:len(S)]
        if U.shape[1] > len(S):
            U = U[:, :len(S)]
    else:
        # randomized
        random_state = check_random_state(self.random_state)
        seed = draw_seed(random_state, np.iinfo("int32").max)
        n_power_iter = self.iterated_power
        U, S, V = linalg.svd_compressed(
            X, self.n_components_, n_power_iter=n_power_iter, seed=seed)
    U, V = svd_flip(U, V)
    explained_variance = S ** 2 / (n_total_samples - 1)
    components, singular_values = V, S

    # The following part is also updated for randomized solver,
    # which computes only a limited number of the singular values
    total_var = np.sum(col_var)
    explained_variance_ratio = (explained_variance / total_var *
                                ((n_total_samples - 1) / n_total_samples))

    actual_rank = min(n_features, n_total_samples)
    if self.n_components_ < actual_rank:
        if solver == "randomized":
            noise_variance = (total_var - explained_variance.sum()) / (
                actual_rank - self.n_components_)
        else:
            noise_variance = da.mean(
                explained_variance[self.n_components_:])
    else:
        noise_variance = 0.0
    self.n_samples_seen_ = n_total_samples

    try:
        (
            self.n_samples_,
            self.mean_,
            self.var_,
            self.n_features_,
            self.components_,
            self.explained_variance_,
            self.explained_variance_ratio_,
            self.singular_values_,
            self.noise_variance_,
        ) = compute(
            n_samples,
            col_mean,
            col_var,
            n_features,
            components[:self.n_components_],
            explained_variance[:self.n_components_],
            explained_variance_ratio[:self.n_components_],
            singular_values[:self.n_components_],
            noise_variance,
        )
    except ValueError as e:
        if np.isnan([n_samples, n_features]).any():
            msg = (
                "Computation of the SVD raised an error. It is possible "
                "n_components is too large. i.e., "
                "`n_components > np.nanmin(X.shape) = "
                "np.nanmin({})`\n\n"
                "A possible resolution to this error is to ensure that "
                "n_components <= min(n_samples, n_features)")
            raise ValueError(msg.format(X.shape)) from e
        raise e

    if len(self.singular_values_) < self.n_components_:
        self.n_components_ = len(self.singular_values_)
        msg = (
            "n_components={n} is larger than the number of singular values"
            " ({s}) (note: PCA has attributes as if n_components == {s})")
        raise ValueError(
            msg.format(n=self.n_components_, s=len(self.singular_values_)))

    return self

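# A minimal batch-wise usage sketch for the partial_fit() above (illustration only). It
# assumes an IncrementalPCA-style estimator exposing this method; the import path
# `dask_ml.decomposition.IncrementalPCA` and the constructor defaults are assumptions,
# not taken from the snippet itself.
import dask.array as da
from dask_ml.decomposition import IncrementalPCA  # assumed import path

X = da.random.random((10_000, 20), chunks=(1_000, 20))
ipca = IncrementalPCA(n_components=5)

# Feed the data one row batch at a time; each call updates the running mean/variance and
# re-solves a small SVD that stacks the previous basis, the new batch, and a mean correction.
batch_rows = 1_000
for start in range(0, X.shape[0], batch_rows):
    ipca.partial_fit(X[start:start + batch_rows])

print(ipca.components_.shape)  # (5, 20)
print(ipca.explained_variance_ratio_)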