def test_incremental_mean_and_variance_ignore_nan():
    old_means = np.array([535.0, 535.0, 535.0, 535.0])
    old_variances = np.array([4225.0, 4225.0, 4225.0, 4225.0])
    old_sample_count = np.array([2, 2, 2, 2], dtype=np.int32)

    X = np.array([[170, 170, 170, 170],
                  [430, 430, 430, 430],
                  [300, 300, 300, 300]])

    X_nan = np.array(
        [
            [170, np.nan, 170, 170],
            [np.nan, 170, 430, 430],
            [430, 430, np.nan, 300],
            [300, 300, 300, np.nan],
        ]
    )

    X_means, X_variances, X_count = _incremental_mean_and_var(
        X, old_means, old_variances, old_sample_count
    )
    X_nan_means, X_nan_variances, X_nan_count = _incremental_mean_and_var(
        X_nan, old_means, old_variances, old_sample_count
    )

    assert_allclose(X_nan_means, X_means)
    assert_allclose(X_nan_variances, X_variances)
    assert_allclose(X_nan_count, X_count)
def test_incremental_weighted_mean_and_variance_ignore_nan(dtype):
    old_means = np.array([535.0, 535.0, 535.0, 535.0])
    old_variances = np.array([4225.0, 4225.0, 4225.0, 4225.0])
    old_weight_sum = np.array([2, 2, 2, 2], dtype=np.int32)
    sample_weights_X = np.ones(3)
    sample_weights_X_nan = np.ones(4)

    X = np.array(
        [[170, 170, 170, 170], [430, 430, 430, 430], [300, 300, 300, 300]]
    ).astype(dtype)

    X_nan = np.array(
        [
            [170, np.nan, 170, 170],
            [np.nan, 170, 430, 430],
            [430, 430, np.nan, 300],
            [300, 300, 300, np.nan],
        ]
    ).astype(dtype)

    X_means, X_variances, X_count = _incremental_mean_and_var(
        X, old_means, old_variances, old_weight_sum,
        sample_weight=sample_weights_X
    )
    X_nan_means, X_nan_variances, X_nan_count = _incremental_mean_and_var(
        X_nan,
        old_means,
        old_variances,
        old_weight_sum,
        sample_weight=sample_weights_X_nan,
    )

    assert_allclose(X_nan_means, X_means)
    assert_allclose(X_nan_variances, X_variances)
    assert_allclose(X_nan_count, X_count)
def test_incremental_variance_ddof():
    # Test that degrees of freedom parameter for calculations are correct.
    rng = np.random.RandomState(1999)
    X = rng.randn(50, 10)
    n_samples, n_features = X.shape
    for batch_size in [11, 20, 37]:
        steps = np.arange(0, X.shape[0], batch_size)
        if steps[-1] != X.shape[0]:
            steps = np.hstack([steps, n_samples])

        for i, j in zip(steps[:-1], steps[1:]):
            batch = X[i:j, :]

            if i == 0:
                incremental_means = batch.mean(axis=0)
                incremental_variances = batch.var(axis=0)
                # Assign this twice so that the test logic is consistent
                incremental_count = batch.shape[0]
                sample_count = batch.shape[0]
            else:
                result = _incremental_mean_and_var(
                    batch, incremental_means, incremental_variances,
                    sample_count)
                (incremental_means, incremental_variances,
                 incremental_count) = result
                sample_count += batch.shape[0]

            calculated_means = np.mean(X[:j], axis=0)
            calculated_variances = np.var(X[:j], axis=0)
            assert_almost_equal(incremental_means, calculated_means, 6)
            assert_almost_equal(incremental_variances,
                                calculated_variances, 6)
            assert_equal(incremental_count, sample_count)
def test_incremental_variance_ddof():
    # Test that degrees of freedom parameter for calculations are correct.
    rng = np.random.RandomState(1999)
    X = rng.randn(50, 10)
    n_samples, n_features = X.shape
    for batch_size in [11, 20, 37]:
        steps = np.arange(0, X.shape[0], batch_size)
        if steps[-1] != X.shape[0]:
            steps = np.hstack([steps, n_samples])

        for i, j in zip(steps[:-1], steps[1:]):
            batch = X[i:j, :]

            if i == 0:
                incremental_means = batch.mean(axis=0)
                incremental_variances = batch.var(axis=0)
                # Assign this twice so that the test logic is consistent
                incremental_count = batch.shape[0]
                sample_count = np.full(batch.shape[1], batch.shape[0],
                                       dtype=np.int32)
            else:
                result = _incremental_mean_and_var(
                    batch, incremental_means, incremental_variances,
                    sample_count)
                (incremental_means, incremental_variances,
                 incremental_count) = result
                sample_count += batch.shape[0]

            calculated_means = np.mean(X[:j], axis=0)
            calculated_variances = np.var(X[:j], axis=0)
            assert_almost_equal(incremental_means, calculated_means, 6)
            assert_almost_equal(incremental_variances,
                                calculated_variances, 6)
            assert_array_equal(incremental_count, sample_count)
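The two ddof tests above hinge on how `_incremental_mean_and_var` merges per-batch statistics so the running population variance (ddof=0) keeps matching `np.var` over all rows seen so far. A minimal sketch of that merge step, assuming the standard Chan et al. (1983) pairwise formulas rather than sklearn's actual implementation:

import numpy as np

def merge_mean_var(mean_a, var_a, n_a, mean_b, var_b, n_b):
    # Combine (mean, population variance, count) of two disjoint batches.
    n = n_a + n_b
    delta = mean_b - mean_a
    mean = mean_a + delta * n_b / n
    # M2 (= n * var) values add, plus a cross term for the mean shift.
    m2 = var_a * n_a + var_b * n_b + delta ** 2 * n_a * n_b / n
    return mean, m2 / n, n

rng = np.random.RandomState(0)
X = rng.randn(50, 10)
mean, var, n = X[:20].mean(0), X[:20].var(0), 20
mean, var, n = merge_mean_var(mean, var, n, X[20:].mean(0), X[20:].var(0), 30)
assert np.allclose(mean, X.mean(0)) and np.allclose(var, X.var(0))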
def get_var(dataset):
    mean_, var_ = 0., 0.
    count = 0
    dtype = dataset[0][1].dtype
    for idx, (_, x, _, length) in enumerate(dataset):
        x = x[:length]
        mean_, var_, _ = _incremental_mean_and_var(x, mean_, var_, count)
        count += len(x)
    return var_.astype(dtype)
def _assert(X, sample_weight, expected_mean, expected_var):
    n = X.shape[0]
    for chunk_size in [1, n // 10 + 1, n // 4 + 1, n // 2 + 1, n]:
        last_mean, last_weight_sum, last_var = 0, 0, 0
        for batch in gen_batches(n, chunk_size):
            last_mean, last_var, last_weight_sum = \
                _incremental_mean_and_var(
                    X[batch], last_mean, last_var, last_weight_sum,
                    sample_weight=sample_weight[batch])
        assert_allclose(last_mean, expected_mean)
        assert_allclose(last_var, expected_var, atol=1e-6)
def test_incremental_weighted_mean_and_variance_simple(rng, dtype):
    mult = 10
    X = rng.rand(1000, 20).astype(dtype) * mult
    sample_weight = rng.rand(X.shape[0]) * mult
    mean, var, _ = _incremental_mean_and_var(X, 0, 0, 0,
                                             sample_weight=sample_weight)

    expected_mean = np.average(X, weights=sample_weight, axis=0)
    expected_var = (
        np.average(X ** 2, weights=sample_weight, axis=0) - expected_mean ** 2
    )
    assert_almost_equal(mean, expected_mean)
    assert_almost_equal(var, expected_var)
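Side note on the `expected_var` line above: it is the weighted population variance identity Var_w(X) = E_w[X^2] - (E_w[X])^2, which equals the weighted average of squared deviations. A quick self-contained check (not from the source):

import numpy as np

rng = np.random.RandomState(0)
x, w = rng.rand(100), rng.rand(100)
m = np.average(x, weights=w)
assert np.isclose(np.average(x ** 2, weights=w) - m ** 2,
                  np.average((x - m) ** 2, weights=w))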
def meanvar(dataset, lengths=None, mean_=0., var_=0., last_sample_count=0,
            return_last_sample_count=False):
    """Mean/variance computation given an iterable dataset

    Dataset can have variable-length samples. In that case, you need to
    explicitly specify lengths for all the samples.

    Args:
        dataset (nnmnkwii.datasets.Dataset): Dataset
        lengths (list): Frame lengths for each dataset sample.
        mean\_ (array or scalar): Initial value for the mean vector.
        var\_ (array or scalar): Initial value for the variance vector.
        last_sample_count (int): Last sample count. Default is 0. If you
          set non-default ``mean_`` and ``var_``, you need to set
          ``last_sample_count`` as well. Typically this will be the number
          of time frames ever seen.
        return_last_sample_count (bool): Return ``last_sample_count``
          if True.

    Returns:
        tuple: Mean and variance for each dimension. If
        ``return_last_sample_count`` is True, returns ``last_sample_count``
        as well.

    See also:
        :func:`nnmnkwii.preprocessing.meanstd`,
        :func:`nnmnkwii.preprocessing.scale`

    Examples:
        >>> from nnmnkwii.preprocessing import meanvar
        >>> from nnmnkwii.util import example_file_data_sources_for_acoustic_model
        >>> from nnmnkwii.datasets import FileSourceDataset
        >>> X, Y = example_file_data_sources_for_acoustic_model()
        >>> X, Y = FileSourceDataset(X), FileSourceDataset(Y)
        >>> lengths = [len(y) for y in Y]
        >>> data_mean, data_var = meanvar(Y, lengths)
    """
    dtype = dataset[0].dtype

    for idx, x in enumerate(dataset):
        if lengths is not None:
            x = x[:lengths[idx]]
        mean_, var_, _ = _incremental_mean_and_var(
            x, mean_, var_, last_sample_count)
        last_sample_count += len(x)
    mean_, var_ = mean_.astype(dtype), var_.astype(dtype)

    if return_last_sample_count:
        return mean_, var_, last_sample_count
    else:
        return mean_, var_
def meanvar(dataset):
    last_sample_count = 0
    mean_ = 0.
    var_ = 0.
    dtype = dataset[0].dtype
    for idx, x in enumerate(dataset):
        mean_, var_, _ = _incremental_mean_and_var(
            x, mean_, var_, last_sample_count)
        last_sample_count += len(x)
    mean_, var_ = mean_.astype(dtype), var_.astype(dtype)
    return mean_, var_
def fit(self, X, y=None):
    """Compute the mean and std to be used for later scaling.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, n_features]
        The data used to compute the mean and standard deviation
        used for later scaling along the features axis.

    y : Passthrough for ``Pipeline`` compatibility.
    """
    self.mean_, self.var_, self.n_samples_seen_ = \
        _incremental_mean_and_var(X, self.mean_, self.var_,
                                  self.n_samples_seen_)
    return self
def partial_fit_transform(self, X):
    shp = X.shape
    if X.ndim == 4:
        X = np.reshape(X, (shp[0], -1))
    elif X.ndim == 5:
        X = np.reshape(X, (shp[0] * shp[1], -1))
    whr = np.where(np.any(X != self.pad_value, axis=1))[0]
    if len(whr) > 0:
        if self.n_samples_seen < self.n_samples_train:
            self.lock.acquire()
            try:
                # Update stats - they are 0 if this is the first step
                col_mean, col_var, n_total_samples = \
                    _incremental_mean_and_var(
                        X[whr], last_mean=self.mean,
                        last_variance=self.var,
                        last_sample_count=np.repeat(self.n_samples_seen,
                                                    X[whr].shape[1]))
                n_total_samples = n_total_samples[0]

                if self.n_samples_seen == 0:
                    X[whr] = X[whr] - col_mean
                    _X = X[whr]
                else:
                    col_batch_mean = np.mean(X[whr], axis=0)
                    X[whr] = X[whr] - col_batch_mean
                    # Build matrix of combined previous basis and new data
                    mean_correction = np.sqrt(
                        (self.n_samples_seen * X[whr].shape[0])
                        / n_total_samples) * (self.mean - col_batch_mean)
                    _X = np.vstack((self.singular_values.reshape(
                        (-1, 1)) * self.components, X[whr], mean_correction))

                U, S, V = np.linalg.svd(_X, full_matrices=False)
                U, V = svd_flip(U, V, u_based_decision=False)
                explained_variance = S**2 / (n_total_samples - 1)

                self.n_samples_seen = n_total_samples
                self.components = V[:self.max_components]
                self.singular_values = S[:self.max_components]
                self.mean = col_mean
                self.var = col_var
                self.explained_variance = \
                    explained_variance[:self.max_components]
            finally:
                self.lock.release()
        else:
            X[whr] = X[whr] - self.mean
            X[whr] = np.dot(
                (np.dot(X[whr], self.components.T)
                 / np.sqrt(self.explained_variance + self.epsilon)),
                self.components)
    return np.reshape(X, shp)
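The `mean_correction` row that `partial_fit_transform` stacks into the SVD input encodes the two-group scatter decomposition: the scatter of the pooled data about the pooled mean equals each block's scatter about its own mean plus one extra rank-one term built from sqrt(n*m/(n+m)) * (mu_old - mu_batch). A numeric sketch of that identity using raw data blocks (the class above summarizes the old block as singular_values * components instead, so this is an illustration, not its exact update):

import numpy as np

rng = np.random.RandomState(0)
A, B = rng.randn(40, 3), rng.randn(25, 3) + 2.0   # old block, new batch
n, m = len(A), len(B)
pooled = np.vstack([A, B]) - np.vstack([A, B]).mean(0)
corr = np.sqrt(n * m / (n + m)) * (A.mean(0) - B.mean(0))
stacked = np.vstack([A - A.mean(0), B - B.mean(0), corr])
# Both stacks have the same scatter matrix, hence the same singular spectrum.
assert np.allclose(pooled.T @ pooled, stacked.T @ stacked)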
def test_incremental_variance_update_formulas():
    # Test Youngs and Cramer incremental variance formulas.
    # Doggie data from https://www.mathsisfun.com/data/standard-deviation.html
    A = np.array([[600, 470, 170, 430, 300],
                  [600, 470, 170, 430, 300],
                  [600, 470, 170, 430, 300],
                  [600, 470, 170, 430, 300]]).T
    idx = 2
    X1 = A[:idx, :]
    X2 = A[idx:, :]

    old_means = X1.mean(axis=0)
    old_variances = X1.var(axis=0)
    old_sample_count = np.full(X1.shape[1], X1.shape[0], dtype=np.int32)
    final_means, final_variances, final_count = \
        _incremental_mean_and_var(X2, old_means, old_variances,
                                  old_sample_count)
    assert_almost_equal(final_means, A.mean(axis=0), 6)
    assert_almost_equal(final_variances, A.var(axis=0), 6)
    assert_almost_equal(final_count, A.shape[0])
def test_incremental_variance_update_formulas():
    # Test Youngs and Cramer incremental variance formulas.
    # Doggie data from http://www.mathsisfun.com/data/standard-deviation.html
    A = np.array([[600, 470, 170, 430, 300],
                  [600, 470, 170, 430, 300],
                  [600, 470, 170, 430, 300],
                  [600, 470, 170, 430, 300]]).T
    idx = 2
    X1 = A[:idx, :]
    X2 = A[idx:, :]

    old_means = X1.mean(axis=0)
    old_variances = X1.var(axis=0)
    old_sample_count = X1.shape[0]
    final_means, final_variances, final_count = \
        _incremental_mean_and_var(X2, old_means, old_variances,
                                  old_sample_count)
    assert_almost_equal(final_means, A.mean(axis=0), 6)
    assert_almost_equal(final_variances, A.var(axis=0), 6)
    assert_almost_equal(final_count, A.shape[0])
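For reference, the "doggie data" column [600, 470, 170, 430, 300] used by both versions of this test has mean 394 and population variance 21704, the values the incremental update must reproduce:

import numpy as np

col = np.array([600, 470, 170, 430, 300], dtype=float)
assert col.mean() == 394.0
assert col.var() == 21704.0  # ddof=0, matching A.var(axis=0) in the tests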
def partial_fit(self, X, y=None, check_input=True, svd_solver='auto'):
    '''
    incremental_PCA
    if svd_solver in ['arpack', 'randomized'] -> _fit_truncated
    elif svd_solver == 'full' -> _fit_full
    else: raise ValueError
    # standard_scaler: True
    # whiten: Bool, False
    # copy: Bool, True
    '''
    if check_input:
        check_array(X, copy=self.copy, accept_sparse=['csc', 'csr'])
    n_samples, n_features = X.shape

    if self.standard_scaler:
        if not hasattr(self, 'scaler_'):
            self.scaler_ = StandardScaler()
        X = self.scaler_.fit_transform(X)
    if not hasattr(self, 'components_'):
        self.components_ = None
    # determine n_components
    if self.n_components is None:
        if self.components_ is None:
            if self.svd_solver != 'arpack':
                n_components = min(n_samples, n_features)
            else:
                n_components = min(n_samples, n_features) - 1
        else:
            n_components = self.components_.shape[0]
    else:
        n_components = self.n_components
    # _partial_fit_full/_partial_fit_truncated handle the cases where
    # n_components is in (0, 1), 'mle', or an int.
    self.n_components = n_components

    _fit_svd_solver = self.svd_solver
    if _fit_svd_solver == 'auto':
        if max(n_samples, n_features) <= 500 or n_components == 'mle':
            _fit_svd_solver = 'full'
        elif 1 <= n_components < 0.8 * min(n_samples, n_features):
            _fit_svd_solver = 'randomized'
        else:
            _fit_svd_solver = 'full'

    if self.components_ is not None and \
            n_components != self.components_.shape[0]:
        raise ValueError(
            'n_components should be identical with components_')

    # Beginning step
    if not hasattr(self, 'n_sample_seen'):
        self.n_sample_seen = 0
        self.mean_ = 0.
        self.var_ = 0.

    if issparse(X):
        total_mean, total_variance, n_sample_total = \
            incr_mean_variance_axis0(X, self.mean_, self.var_,
                                     self.n_sample_seen)
    else:
        total_mean, total_variance, n_sample_total = \
            _incremental_mean_and_var(X, self.mean_, self.var_,
                                      self.n_sample_seen)
        n_sample_total = n_sample_total[0]

    if self.n_sample_seen == 0:
        X = X - total_mean
    else:
        _local_mean = np.mean(X, axis=0)
        X = X - _local_mean
        # mean_correction =
        #   sqrt(local_samples * merge_before_samples / merge_after_samples)
        #   * (merge_before_mean - local_mean)
        mean_correction = np.sqrt(
            (n_samples * self.n_sample_seen) / n_sample_total) * \
            (self.mean_ - _local_mean)
        # vstack: singular_values (K,) * components_ (K, N),
        #         X (M, N), mean_correction (N,)
        X = np.vstack((self.singular_values[:, None] * self.components_,
                       X, mean_correction))
    self.mean_ = total_mean
    self.var_ = total_variance
    self.n_sample_seen = n_sample_total

    if _fit_svd_solver == 'full':
        return self._partial_fit_full(X)
    elif _fit_svd_solver in ['arpack', 'randomized']:
        return self._partial_fit_truncated(X, _fit_svd_solver)
    else:
        raise ValueError(
            "svd_solver:{0} does not exist in {{'full', 'randomized', "
            "'arpack', 'auto'}}".format(_fit_svd_solver))
def partial_fit(self, X, y=None, check_input=True):
    """Incremental fit with X. All of X is processed as a single batch.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples and
        n_features is the number of features.

    Returns
    -------
    self : object
        Returns the instance itself.
    """
    # ====== check the samples and caches ====== #
    if isinstance(X, Data):
        X = X[:]
    if check_input:
        X = check_array(X, copy=self.copy, dtype=[np.float64, np.float32])
    n_samples, n_features = X.shape
    # check number of components
    if self.n_components is None:
        self.n_components_ = n_features
    elif not 1 <= self.n_components <= n_features:
        raise ValueError("n_components=%r invalid for n_features=%d, need "
                         "more rows than columns for IncrementalPCA "
                         "processing" % (self.n_components, n_features))
    else:
        self.n_components_ = self.n_components
    # check the cache
    if n_samples < n_features or self._nb_cached_samples > 0:
        self._cache_batches.append(X)
        self._nb_cached_samples += n_samples
        # not enough samples yet
        if self._nb_cached_samples < n_features:
            return
        else:  # group mini batches into one big batch
            X = np.concatenate(self._cache_batches, axis=0)
            self._cache_batches = []
            self._nb_cached_samples = 0
        n_samples = X.shape[0]
    # ====== fit the model ====== #
    if (self.components_ is not None) and (self.components_.shape[0] !=
                                           self.n_components_):
        raise ValueError("Number of input features has changed from %i "
                         "to %i between calls to partial_fit! Try "
                         "setting n_components to a fixed value." %
                         (self.components_.shape[0], self.n_components_))
    # Update stats - they are 0 if this is the first step
    col_mean, col_var, n_total_samples = \
        _incremental_mean_and_var(X, last_mean=self.mean_,
                                  last_variance=self.var_,
                                  last_sample_count=self.n_samples_seen_)
    total_var = np.sum(col_var * n_total_samples)
    if total_var == 0:  # if variance == 0, it makes no sense to continue
        return self
    # Whitening
    if self.n_samples_seen_ == 0:
        # If it is the first step, simply whiten X
        X -= col_mean
    else:
        col_batch_mean = np.mean(X, axis=0)
        X -= col_batch_mean
        # Build matrix of combined previous basis and new data
        mean_correction = \
            np.sqrt((self.n_samples_seen_ * n_samples) /
                    n_total_samples) * (self.mean_ - col_batch_mean)
        X = np.vstack((self.singular_values_.reshape((-1, 1)) *
                       self.components_, X, mean_correction))

    U, S, V = linalg.svd(X, full_matrices=False)
    U, V = svd_flip(U, V, u_based_decision=False)
    explained_variance = S ** 2 / n_total_samples
    explained_variance_ratio = S ** 2 / total_var

    self.n_samples_seen_ = n_total_samples
    self.components_ = V[:self.n_components_]
    self.singular_values_ = S[:self.n_components_]
    self.mean_ = col_mean
    self.var_ = col_var
    self.explained_variance_ = explained_variance[:self.n_components_]
    self.explained_variance_ratio_ = \
        explained_variance_ratio[:self.n_components_]
    if self.n_components_ < n_features:
        self.noise_variance_ = \
            explained_variance[self.n_components_:].mean()
    else:
        self.noise_variance_ = 0.
    return self
def test_incremental_variance_numerical_stability():
    # Test Youngs and Cramer incremental variance formulas.

    def np_var(A):
        return A.var(axis=0)

    # Naive one-pass variance computation - not numerically stable
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
    def one_pass_var(X):
        n = X.shape[0]
        exp_x2 = (X ** 2).sum(axis=0) / n
        expx_2 = (X.sum(axis=0) / n) ** 2
        return exp_x2 - expx_2

    # Two-pass algorithm, stable.
    # We use it as a benchmark. It is not an online algorithm
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm
    def two_pass_var(X):
        mean = X.mean(axis=0)
        Y = X.copy()
        return np.mean((Y - mean) ** 2, axis=0)

    # Naive online implementation
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm
    # This works only for chunks of size 1
    def naive_mean_variance_update(x, last_mean, last_variance,
                                   last_sample_count):
        updated_sample_count = (last_sample_count + 1)
        samples_ratio = last_sample_count / float(updated_sample_count)
        updated_mean = x / updated_sample_count + last_mean * samples_ratio
        updated_variance = last_variance * samples_ratio + \
            (x - last_mean) * (x - updated_mean) / updated_sample_count
        return updated_mean, updated_variance, updated_sample_count

    # We want to show a case when one_pass_var has error > 1e-3 while
    # _batch_mean_variance_update has less.
    tol = 200
    n_features = 2
    n_samples = 10000
    x1 = np.array(1e8, dtype=np.float64)
    x2 = np.log(1e-5, dtype=np.float64)
    A0 = x1 * np.ones((n_samples // 2, n_features), dtype=np.float64)
    A1 = x2 * np.ones((n_samples // 2, n_features), dtype=np.float64)
    A = np.vstack((A0, A1))

    # Older versions of numpy have different precision
    # In some old version, np.var is not stable
    if np.abs(np_var(A) - two_pass_var(A)).max() < 1e-6:
        stable_var = np_var
    else:
        stable_var = two_pass_var

    # Naive one pass var: >tol (=1063)
    assert_greater(np.abs(stable_var(A) - one_pass_var(A)).max(), tol)

    # Starting point for online algorithms: after A0
    # Naive implementation: >tol (436)
    mean, var, n = A0[0, :], np.zeros(n_features), n_samples // 2
    for i in range(A1.shape[0]):
        mean, var, n = \
            naive_mean_variance_update(A1[i, :], mean, var, n)
    assert_equal(n, A.shape[0])
    # the mean is also slightly unstable
    assert_greater(np.abs(A.mean(axis=0) - mean).max(), 1e-6)
    assert_greater(np.abs(stable_var(A) - var).max(), tol)

    # Robust implementation: <tol (177)
    mean, var, n = A0[0, :], np.zeros(n_features), n_samples // 2
    for i in range(A1.shape[0]):
        mean, var, n = \
            _incremental_mean_and_var(A1[i, :].reshape((1, A1.shape[1])),
                                      mean, var, n)
    assert_equal(n, A.shape[0])
    assert_array_almost_equal(A.mean(axis=0), mean)
    assert_greater(tol, np.abs(stable_var(A) - var).max())
def partial_fit(self, X, y=None, check_input=True):
    # print(self.components_)
    # print(self.singular_values_)
    # print(self.mean_)
    n_samples, n_features = X.shape
    if not hasattr(self, 'components_'):
        self.components_ = None
    if self.n_components is None:
        if self.components_ is None:
            self.n_components_ = min(n_samples, n_features)
        else:
            self.n_components_ = self.components_.shape[0]
    elif not 1 <= self.n_components <= n_features:
        raise ValueError("n_components=%r invalid for n_features=%d, need "
                         "more rows than columns for IncrementalPCA "
                         "processing" % (self.n_components, n_features))
    elif not self.n_components <= n_samples:
        raise ValueError("n_components=%r must be less or equal to "
                         "the batch number of samples "
                         "%d." % (self.n_components, n_samples))
    else:
        self.n_components_ = self.n_components

    if (self.components_ is not None) and (self.components_.shape[0] !=
                                           self.n_components_):
        raise ValueError("Number of input features has changed from %i "
                         "to %i between calls to partial_fit! Try "
                         "setting n_components to a fixed value." %
                         (self.components_.shape[0], self.n_components_))

    # This is the first partial_fit
    if not hasattr(self, 'n_samples_seen_'):
        self.n_samples_seen_ = 0
        self.mean_ = .0
        self.var_ = .0

    # Update stats - they are 0 if this is the first step
    col_mean, col_var, n_total_samples = \
        _incremental_mean_and_var(X, last_mean=self.mean_,
                                  last_variance=self.var_,
                                  last_sample_count=self.n_samples_seen_)

    # Whitening
    if self.n_samples_seen_ == 0:
        # If it is the first step, simply whiten X
        X -= col_mean
    else:
        col_batch_mean = np.mean(X, axis=0)
        X -= col_batch_mean
        # Build matrix of combined previous basis and new data
        mean_correction = \
            np.sqrt((self.n_samples_seen_ * n_samples) /
                    n_total_samples) * (self.mean_ - col_batch_mean)
        X = np.vstack((self.singular_values_.reshape((-1, 1)) *
                       self.components_, X, mean_correction))
        print(self.singular_values_.reshape((-1, 1)) * self.components_)
        # print(mean_correction)
        # print(X[:2])

    U, S, V = linalg.svd(X, full_matrices=False)
    # U, V = svd_flip(U, V, u_based_decision=False)
    explained_variance = S**2 / (n_total_samples - 1)
    explained_variance_ratio = S**2 / np.sum(col_var * n_total_samples)

    self.n_samples_seen_ = n_total_samples
    self.components_ = V[:self.n_components_]
    self.singular_values_ = S[:self.n_components_]
    self.mean_ = col_mean
    self.var_ = col_var
    self.explained_variance_ = explained_variance[:self.n_components_]
    self.explained_variance_ratio_ = \
        explained_variance_ratio[:self.n_components_]
    if self.n_components_ < n_features:
        self.noise_variance_ = \
            explained_variance[self.n_components_:].mean()
    else:
        self.noise_variance_ = 0.
    return self
def test_incremental_variance_numerical_stability():
    # Test Youngs and Cramer incremental variance formulas.

    def np_var(A):
        return A.var(axis=0)

    # Naive one-pass variance computation - not numerically stable
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
    def one_pass_var(X):
        n = X.shape[0]
        exp_x2 = (X**2).sum(axis=0) / n
        expx_2 = (X.sum(axis=0) / n)**2
        return exp_x2 - expx_2

    # Two-pass algorithm, stable.
    # We use it as a benchmark. It is not an online algorithm
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm
    def two_pass_var(X):
        mean = X.mean(axis=0)
        Y = X.copy()
        return np.mean((Y - mean)**2, axis=0)

    # Naive online implementation
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm
    # This works only for chunks of size 1
    def naive_mean_variance_update(x, last_mean, last_variance,
                                   last_sample_count):
        updated_sample_count = (last_sample_count + 1)
        samples_ratio = last_sample_count / float(updated_sample_count)
        updated_mean = x / updated_sample_count + last_mean * samples_ratio
        updated_variance = last_variance * samples_ratio + \
            (x - last_mean) * (x - updated_mean) / updated_sample_count
        return updated_mean, updated_variance, updated_sample_count

    # We want to show a case when one_pass_var has error > 1e-3 while
    # _batch_mean_variance_update has less.
    tol = 200
    n_features = 2
    n_samples = 10000
    x1 = np.array(1e8, dtype=np.float64)
    x2 = np.log(1e-5, dtype=np.float64)
    A0 = np.full((n_samples // 2, n_features), x1, dtype=np.float64)
    A1 = np.full((n_samples // 2, n_features), x2, dtype=np.float64)
    A = np.vstack((A0, A1))

    # Older versions of numpy have different precision
    # In some old version, np.var is not stable
    if np.abs(np_var(A) - two_pass_var(A)).max() < 1e-6:
        stable_var = np_var
    else:
        stable_var = two_pass_var

    # Naive one pass var: >tol (=1063)
    assert_greater(np.abs(stable_var(A) - one_pass_var(A)).max(), tol)

    # Starting point for online algorithms: after A0
    # Naive implementation: >tol (436)
    mean, var, n = A0[0, :], np.zeros(n_features), n_samples // 2
    for i in range(A1.shape[0]):
        mean, var, n = \
            naive_mean_variance_update(A1[i, :], mean, var, n)
    assert_equal(n, A.shape[0])
    # the mean is also slightly unstable
    assert_greater(np.abs(A.mean(axis=0) - mean).max(), 1e-6)
    assert_greater(np.abs(stable_var(A) - var).max(), tol)

    # Robust implementation: <tol (177)
    mean, var = A0[0, :], np.zeros(n_features)
    n = np.full(n_features, n_samples // 2, dtype=np.int32)
    for i in range(A1.shape[0]):
        mean, var, n = \
            _incremental_mean_and_var(A1[i, :].reshape((1, A1.shape[1])),
                                      mean, var, n)
    assert_array_equal(n, A.shape[0])
    assert_array_almost_equal(A.mean(axis=0), mean)
    assert_greater(tol, np.abs(stable_var(A) - var).max())
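To see why `one_pass_var` fails in these tests: with a large common offset, E[x^2] and (E[x])^2 are both huge and nearly equal, so their difference is dominated by float64 rounding error (roughly 2 in absolute terms near 1e16). A tiny illustration, independent of the test data above:

import numpy as np

x = 1e8 + np.array([0.0, 0.5, 1.0])
naive = (x ** 2).mean() - x.mean() ** 2     # catastrophic cancellation
shifted = ((x - x.mean()) ** 2).mean()      # stable two-pass variance
# true variance is 1/6 ~= 0.1667; naive typically lands far away
# (possibly 0 or even negative), while shifted is accurate
print(naive, shifted)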
def _batch_fit(self, X, y, check_input=False):
    print('Batch fit')
    if check_input:
        X, y = check_X_y(X, y, ensure_min_samples=2, estimator=self)
    current_n_samples, n_features = X.shape

    # Update stats - they are 0 if this is the first step
    updated_mean, updated_var, updated_n_samples_seen_ = \
        _incremental_mean_and_var(
            X, last_mean=self.mean_, last_variance=self.var_,
            last_sample_count=self.n_samples_seen_)

    # Whitening
    if self.n_samples_seen_ == 0:
        # If it is the first step, simply whiten X
        X = np.subtract(X, updated_mean)
    else:
        col_batch_mean = np.mean(X, axis=0)
        X = np.subtract(X, col_batch_mean)

    # Updating algorithm
    # Updating mean and class means
    updated_class_mean = self.class_mean_
    updated_class_n_samples_seen_ = self.class_n_samples_seen_
    for i, current_class in enumerate(self.classes_):
        current_class_samples = X[y == current_class, :]
        n_current_class_samples = current_class_samples.shape[0]
        previous_n_class_samples = updated_class_n_samples_seen_[i]
        if n_current_class_samples > 0 and previous_n_class_samples > 0:
            previous_class_sum_current_class = \
                updated_class_mean[i, :] * updated_class_n_samples_seen_[i]
            current_class_sum_current_class = \
                np.sum(current_class_samples, axis=0)
            updated_class_n_samples_seen_[i] += n_current_class_samples
            updated_class_mean[i, :] = \
                (previous_class_sum_current_class +
                 current_class_sum_current_class) / \
                updated_class_n_samples_seen_[i]
        elif n_current_class_samples > 0:
            updated_class_mean[i, :] = np.mean(current_class_samples,
                                               axis=0)
            updated_class_n_samples_seen_[i] = n_current_class_samples

    # Updating per-class within scatter
    updated_class_within_scatter = self.class_within_scatter
    for i, current_class_mean in enumerate(updated_class_mean):
        current_class_samples = X[y == self.classes_[i], :]
        n_current_class_samples = current_class_samples.shape[0]
        l_c = current_class_samples.shape[0]
        n_c = self.class_n_samples_seen_[i]
        mean_y_c = np.reshape(np.mean(current_class_samples, axis=0),
                              (n_features, 1))
        if n_current_class_samples > 0 and n_c > 0:
            mean_x_c = np.reshape(self.class_mean_[i, :], (n_features, 1))
            D_c = (mean_y_c - mean_x_c).dot((mean_y_c - mean_x_c).T)
            E_c = np.zeros(D_c.shape)
            # enumerate yields (index, value): unpack as (j, sample) and
            # reshape each sample to a column vector for the outer product
            for j, current_sample in enumerate(current_class_samples):
                current_sample = current_sample.reshape(n_features, 1)
                E_c += (current_sample - mean_x_c).dot(
                    (current_sample - mean_x_c).T)
            F_c = np.zeros(D_c.shape)
            for j, current_sample in enumerate(current_class_samples):
                current_sample = current_sample.reshape(n_features, 1)
                F_c += (current_sample - mean_y_c).dot(
                    (current_sample - mean_y_c).T)
            updated_class_within_scatter[:, :, i] += \
                ((n_c * l_c * l_c) * D_c / np.square(n_c + l_c)) + \
                ((np.square(n_c) * E_c) / np.square(n_c + l_c)) + \
                ((l_c * (l_c + (2 * n_c)) * F_c) / np.square(n_c + l_c))
        elif n_current_class_samples > 0:
            centered = current_class_samples - mean_y_c.T
            updated_class_within_scatter[:, :, i] = centered.T.dot(centered)
    updated_within_scatter = np.sum(updated_class_within_scatter, axis=2)

    # Updating between-class scatter
    updated_between_scatter = self.between_scatter
    for i, i_class_mean in enumerate(updated_class_mean[:-1, :]):
        for j_class_mean in updated_class_mean[i + 1:, :]:
            current_mean_difference = np.reshape(
                i_class_mean - j_class_mean, (1, n_features))
            d_ij = current_mean_difference.dot(
                np.linalg.pinv(updated_within_scatter)).dot(
                    current_mean_difference.T)
            d_ij = np.sqrt(d_ij)
            if d_ij > 0:
                w_d_ij = erf(d_ij / (2 * np.sqrt(2))) / \
                    (2 * np.square(d_ij))
                updated_between_scatter += \
                    w_d_ij * current_mean_difference.T.dot(
                        current_mean_difference)

    # Final values after computation
    self.n_samples_seen_ = updated_n_samples_seen_
    self.class_n_samples_seen_ = updated_class_n_samples_seen_
    self.mean_ = updated_mean
    self.class_mean_ = updated_class_mean
    self.var_ = updated_var
    self.between_scatter = updated_between_scatter
    self.within_scatter = updated_within_scatter
    self.class_within_scatter = updated_class_within_scatter
def partial_fit(self, X, y=None):
    """
    Performs online computation of mean and standard deviation on X for
    later scaling. All of X is processed as a single batch.
    This is intended for cases when `fit` is not feasible due to a very
    large number of `n_samples` or because X is read from a continuous
    stream.
    The algorithm for incremental mean and std is given in Equation 1.5a,b
    in Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. "Algorithms
    for computing the sample variance: Analysis and recommendations."
    The American Statistician 37.3 (1983): 242-247.

    :param X: Data matrix to scale.
    :type X: numpy.ndarray, shape [n_samples, n_features]
    :param y: Passthrough for Scikit-learn ``Pipeline`` compatibility.
    :type y: None
    :return: Fitted object.
    :rtype: pyChemometrics.ChemometricsScaler
    """
    X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
                    estimator=self, dtype=FLOAT_DTYPES)

    # Even in the case of `with_mean=False`, we update the mean anyway.
    # This is needed for the incremental computation of the var
    # See incr_mean_variance_axis and _incremental_mean_variance_axis
    if sparse.issparse(X):
        if self.with_mean:
            raise ValueError(
                "Cannot center sparse matrices: pass `with_mean=False` "
                "instead. See docstring for motivation and alternatives.")
        if self.with_std:
            # First pass
            if not hasattr(self, 'n_samples_seen_'):
                self.mean_, self.var_ = mean_variance_axis(X, axis=0)
                self.n_samples_seen_ = X.shape[0]
            # Next passes
            else:
                self.mean_, self.var_, self.n_samples_seen_ = \
                    incr_mean_variance_axis(X, axis=0,
                                            last_mean=self.mean_,
                                            last_var=self.var_,
                                            last_n=self.n_samples_seen_)
        else:
            self.mean_ = None
            self.var_ = None
    else:
        # First pass
        if not hasattr(self, 'n_samples_seen_'):
            self.mean_ = .0
            self.n_samples_seen_ = 0
            if self.with_std:
                self.var_ = .0
            else:
                self.var_ = None

        self.mean_, self.var_, self.n_samples_seen_ = \
            _incremental_mean_and_var(X, self.mean_, self.var_,
                                      self.n_samples_seen_)

    if self.with_std:
        self.scale_ = _handle_zeros_in_scale(
            numpy.sqrt(self.var_)) ** self.scale_power
    else:
        self.scale_ = None
    return self
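The docstring above cites Eq. 1.5a,b of Chan, Golub & LeVeque (1983). A hedged sketch in the paper's running-sum notation (T = sum of values, S = sum of squared deviations about the mean), written in the algebraically equivalent symmetric form of 1.5b:

import numpy as np

def chan_merge(T_a, S_a, n_a, T_b, S_b, n_b):
    T = T_a + T_b                                        # Eq. 1.5a
    S = S_a + S_b + (n_a * n_b / (n_a + n_b)) * \
        (T_a / n_a - T_b / n_b) ** 2                     # Eq. 1.5b
    return T, S, n_a + n_b

a, b = np.array([0.0, 2.0]), np.array([4.0])
T, S, n = chan_merge(a.sum(), ((a - a.mean()) ** 2).sum(), len(a),
                     b.sum(), ((b - b.mean()) ** 2).sum(), len(b))
assert np.isclose(S / n, np.concatenate([a, b]).var())  # population variance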
def update(self, values):
    self.mean, self.var, self.count = \
        _incremental_mean_and_var(values, self.mean, self.var, self.count)
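A hypothetical driver for an `update` method like the one above (the `RunningStats` wrapper and its `n_features` argument are illustrative, not from the source; `_incremental_mean_and_var` is a private sklearn helper):

import numpy as np
from sklearn.utils.extmath import _incremental_mean_and_var

class RunningStats:
    def __init__(self, n_features):
        self.mean = np.zeros(n_features)
        self.var = np.zeros(n_features)
        self.count = 0  # a scalar is fine for the first call

    def update(self, values):
        self.mean, self.var, self.count = _incremental_mean_and_var(
            values, self.mean, self.var, self.count)

stats = RunningStats(3)
stats.update(np.random.rand(10, 3))
stats.update(np.random.rand(5, 3))  # statistics now cover both batches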