def test_incremental_variance_ddof(): """Test that degrees of freedom parameter for calculations are correct.""" rng = np.random.RandomState(1999) X = rng.randn(50, 10) n_samples, n_features = X.shape for batch_size in [11, 20, 37]: steps = np.arange(0, X.shape[0], batch_size) if steps[-1] != X.shape[0]: steps = np.hstack([steps, n_samples]) for i, j in zip(steps[:-1], steps[1:]): batch = X[i:j, :] if i == 0: incremental_means = batch.mean(axis=0) incremental_variances = batch.var(axis=0) # Assign this twice so that the test logic is consistent incremental_count = batch.shape[0] sample_count = batch.shape[0] else: result = _batch_mean_variance_update(batch, incremental_means, incremental_variances, sample_count) (incremental_means, incremental_variances, incremental_count) = result sample_count += batch.shape[0] calculated_means = np.mean(X[:j], axis=0) calculated_variances = np.var(X[:j], axis=0) assert_almost_equal(incremental_means, calculated_means, 6) assert_almost_equal(incremental_variances, calculated_variances, 6) assert_equal(incremental_count, sample_count)
def test_incremental_variance_ddof(): # Test that degrees of freedom parameter for calculations are correct. rng = np.random.RandomState(1999) X = rng.randn(50, 10) n_samples, n_features = X.shape for batch_size in [11, 20, 37]: steps = np.arange(0, X.shape[0], batch_size) if steps[-1] != X.shape[0]: steps = np.hstack([steps, n_samples]) for i, j in zip(steps[:-1], steps[1:]): batch = X[i:j, :] if i == 0: incremental_means = batch.mean(axis=0) incremental_variances = batch.var(axis=0) # Assign this twice so that the test logic is consistent incremental_count = batch.shape[0] sample_count = batch.shape[0] else: result = _batch_mean_variance_update( batch, incremental_means, incremental_variances, sample_count) (incremental_means, incremental_variances, incremental_count) = result sample_count += batch.shape[0] calculated_means = np.mean(X[:j], axis=0) calculated_variances = np.var(X[:j], axis=0) assert_almost_equal(incremental_means, calculated_means, 6) assert_almost_equal(incremental_variances, calculated_variances, 6) assert_equal(incremental_count, sample_count)
def test_incremental_variance_update_formulas(): """Test Youngs and Cramer incremental variance formulas.""" # Doggie data from http://www.mathsisfun.com/data/standard-deviation.html A = np.array([[600, 470, 170, 430, 300], [600, 470, 170, 430, 300], [600, 470, 170, 430, 300], [600, 470, 170, 430, 300]]).T idx = 2 X1 = A[:idx, :] X2 = A[idx:, :] old_means = X1.mean(axis=0) old_variances = X1.var(axis=0) old_sample_count = X1.shape[0] final_means, final_variances, final_count = _batch_mean_variance_update( X2, old_means, old_variances, old_sample_count) assert_almost_equal(final_means, A.mean(axis=0), 6) assert_almost_equal(final_variances, A.var(axis=0), 6) assert_almost_equal(final_count, A.shape[0])
def test_incremental_variance_update_formulas(): # Test Youngs and Cramer incremental variance formulas. # Doggie data from http://www.mathsisfun.com/data/standard-deviation.html A = np.array([[600, 470, 170, 430, 300], [600, 470, 170, 430, 300], [600, 470, 170, 430, 300], [600, 470, 170, 430, 300]]).T idx = 2 X1 = A[:idx, :] X2 = A[idx:, :] old_means = X1.mean(axis=0) old_variances = X1.var(axis=0) old_sample_count = X1.shape[0] final_means, final_variances, final_count = _batch_mean_variance_update( X2, old_means, old_variances, old_sample_count) assert_almost_equal(final_means, A.mean(axis=0), 6) assert_almost_equal(final_variances, A.var(axis=0), 6) assert_almost_equal(final_count, A.shape[0])
def partial_fit(self, X, y=None): """Incremental fit with X. All of X is processed as a single batch. Parameters ---------- X: array-like, shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. Returns ------- self: object Returns the instance itself. """ #X = check_array(X, copy=self.copy, dtype=np.float) # --- ADJUSTED X = np.asarray(X) # --- ADJUSTED n_samples, n_features = X.shape if not hasattr(self, 'components_'): self.components_ = None if self.n_components is None: self.n_components_ = n_features elif not 1 <= self.n_components <= n_features: raise ValueError("n_components=%r invalid for n_features=%d, need " "more rows than columns for IncrementalPCA " "processing" % (self.n_components, n_features)) else: self.n_components_ = self.n_components if (self.components_ is not None) and (self.components_.shape[0] != self.n_components_): raise ValueError("Number of input features has changed from %i " "to %i between calls to partial_fit! Try " "setting n_components to a fixed value." % ( self.components_.shape[0], self.n_components_)) if self.components_ is None: # This is the first pass through partial_fit self.n_samples_seen_ = 0 col_var = X.var(axis=0) col_mean = X.mean(axis=0) X -= col_mean U, S, V = linalg.svd(X, full_matrices=False) U, V = svd_flip(U, V, u_based_decision=False) explained_variance = S ** 2 / n_samples explained_variance_ratio = S ** 2 / np.sum(col_var * n_samples) else: col_batch_mean = X.mean(axis=0) col_mean, col_var, n_total_samples = _batch_mean_variance_update( X, self.mean_, self.var_, self.n_samples_seen_) X -= col_batch_mean # Build matrix of combined previous basis and new data mean_correction = np.sqrt((self.n_samples_seen_ * n_samples) / n_total_samples) * (self.mean_ - col_batch_mean) X_combined = np.vstack((self.singular_values_.reshape((-1, 1)) * self.components_, X, mean_correction)) U, S, V = linalg.svd(X_combined, full_matrices=False) U, V = svd_flip(U, V, u_based_decision=False) explained_variance = S ** 2 / n_total_samples explained_variance_ratio = S ** 2 / np.sum(col_var * n_total_samples) self.n_samples_seen_ += n_samples self.components_ = V[:self.n_components_] self.singular_values_ = S[:self.n_components_] self.mean_ = col_mean self.var_ = col_var self.explained_variance_ = explained_variance[:self.n_components_] self.explained_variance_ratio_ = \ explained_variance_ratio[:self.n_components_] # if self.n_components_ < n_features: # --- ADJUSTED # self.noise_variance_ = \ # explained_variance[self.n_components_:].mean() # else: # self.noise_variance_ = 0. return self
def partial_fit(self, X, y=None): """Incremental fit with X. All of X is processed as a single batch. Parameters ---------- X: array-like, shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. Returns ------- self: object Returns the instance itself. """ #X = check_array(X, copy=self.copy, dtype=np.float) # --- ADJUSTED X = np.asarray(X) # --- ADJUSTED n_samples, n_features = X.shape if not hasattr(self, 'components_'): self.components_ = None if self.n_components is None: self.n_components_ = n_features elif not 1 <= self.n_components <= n_features: raise ValueError("n_components=%r invalid for n_features=%d, need " "more rows than columns for IncrementalPCA " "processing" % (self.n_components, n_features)) else: self.n_components_ = self.n_components if (self.components_ is not None) and (self.components_.shape[0] != self.n_components_): raise ValueError("Number of input features has changed from %i " "to %i between calls to partial_fit! Try " "setting n_components to a fixed value." % (self.components_.shape[0], self.n_components_)) if self.components_ is None: # This is the first pass through partial_fit self.n_samples_seen_ = 0 col_var = X.var(axis=0) col_mean = X.mean(axis=0) X -= col_mean U, S, V = linalg.svd(X, full_matrices=False) U, V = svd_flip(U, V, u_based_decision=False) explained_variance = S**2 / n_samples explained_variance_ratio = S**2 / np.sum(col_var * n_samples) else: col_batch_mean = X.mean(axis=0) col_mean, col_var, n_total_samples = _batch_mean_variance_update( X, self.mean_, self.var_, self.n_samples_seen_) X -= col_batch_mean # Build matrix of combined previous basis and new data mean_correction = np.sqrt( (self.n_samples_seen_ * n_samples) / n_total_samples) * (self.mean_ - col_batch_mean) X_combined = np.vstack((self.singular_values_.reshape( (-1, 1)) * self.components_, X, mean_correction)) U, S, V = linalg.svd(X_combined, full_matrices=False) U, V = svd_flip(U, V, u_based_decision=False) explained_variance = S**2 / n_total_samples explained_variance_ratio = S**2 / np.sum(col_var * n_total_samples) self.n_samples_seen_ += n_samples self.components_ = V[:self.n_components_] self.singular_values_ = S[:self.n_components_] self.mean_ = col_mean self.var_ = col_var self.explained_variance_ = explained_variance[:self.n_components_] self.explained_variance_ratio_ = \ explained_variance_ratio[:self.n_components_] # if self.n_components_ < n_features: # --- ADJUSTED # self.noise_variance_ = \ # explained_variance[self.n_components_:].mean() # else: # self.noise_variance_ = 0. return self