def test_incr_mean_variance_axis_ignore_nan(axis, sparse_constructor):
    """NaN entries must not change the incrementally updated statistics."""
    prev_mean = np.array([535., 535., 535., 535.])
    prev_var = np.array([4225., 4225., 4225., 4225.])
    prev_n = np.array([2, 2, 2, 2], dtype=np.int64)

    dense = np.array([[170, 170, 170, 170],
                      [430, 430, 430, 430],
                      [300, 300, 300, 300]])
    dense_nan = np.array([[170, np.nan, 170, 170],
                          [np.nan, 170, 430, 430],
                          [430, 430, np.nan, 300],
                          [300, 300, 300, np.nan]])
    X = sparse_constructor(dense)
    X_nan = sparse_constructor(dense_nan)

    # Transposing covers axis=1; no dedicated fixtures are needed.
    if axis:
        X, X_nan = X.T, X_nan.T

    # The old statistics are modified in place, hence the copies.
    ref_stats = incr_mean_variance_axis(
        X, axis, prev_mean.copy(), prev_var.copy(), prev_n.copy())
    nan_stats = incr_mean_variance_axis(
        X_nan, axis, prev_mean.copy(), prev_var.copy(), prev_n.copy())

    # Compare means, variances and sample counts pairwise.
    for with_nan, without_nan in zip(nan_stats, ref_stats):
        assert_allclose(with_nan, without_nan)
def test_incr_mean_variance_axis_dim_mismatch(sparse_constructor):
    """Check that we raise proper error when axis=1 and the dimension mismatch.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/pull/18655
    """
    n_samples, n_features = 60, 4
    rng = np.random.RandomState(42)
    X = sparse_constructor(rng.rand(n_samples, n_features))

    last_mean = np.zeros(n_features)
    last_var = np.zeros_like(last_mean)
    last_n = np.zeros(last_mean.shape, dtype=np.int64)

    stats = dict(last_mean=last_mean, last_var=last_var, last_n=last_n)
    mean0, var0, _ = incr_mean_variance_axis(X, axis=0, **stats)
    dense = X.toarray()
    assert_allclose(np.mean(dense, axis=0), mean0)
    assert_allclose(np.var(dense, axis=0), var0)

    # axis=1 combined with feature-sized accumulators must be rejected.
    with pytest.raises(ValueError):
        incr_mean_variance_axis(X, axis=1, **stats)

    # Inconsistent shapes among last_mean/last_var/last_n must be rejected.
    truncated = dict(last_mean=last_mean[:-1], last_var=last_var,
                     last_n=last_n)
    with pytest.raises(ValueError):
        incr_mean_variance_axis(X, axis=0, **truncated)
def test_incr_mean_variance_axis_ignore_nan(axis, sparse_constructor):
    """incr_mean_variance_axis must skip NaN cells when updating stats."""
    old_means = np.full(4, 535.)
    old_variances = np.full(4, 4225.)
    old_sample_count = np.full(4, 2, dtype=np.int64)

    X = sparse_constructor(np.array([[170, 170, 170, 170],
                                     [430, 430, 430, 430],
                                     [300, 300, 300, 300]]))
    X_nan = sparse_constructor(np.array([[170, np.nan, 170, 170],
                                         [np.nan, 170, 430, 430],
                                         [430, 430, np.nan, 300],
                                         [300, 300, 300, np.nan]]))

    # Translating the data via a transpose is enough to cover axis=1.
    if axis:
        X = X.T
        X_nan = X_nan.T

    def run(data):
        # Pass copies: the old statistics are modified in place.
        return incr_mean_variance_axis(data, axis, old_means.copy(),
                                       old_variances.copy(),
                                       old_sample_count.copy())

    X_means, X_vars, X_sample_count = run(X)
    X_nan_means, X_nan_vars, X_nan_sample_count = run(X_nan)

    assert_allclose(X_nan_means, X_means)
    assert_allclose(X_nan_vars, X_vars)
    assert_allclose(X_nan_sample_count, X_sample_count)
def test_incr_mean_variance_axis_weighted_axis1(Xw, X, weights,
                                                sparse_constructor, dtype):
    """Weighted and unweighted incremental stats must agree along axis=1."""
    axis = 1
    Xw_sparse = sparse_constructor(Xw).astype(dtype)
    X_sparse = sparse_constructor(X).astype(dtype)

    n_rows = np.shape(Xw)[0]
    last_mean = np.zeros(n_rows, dtype=dtype)
    last_var = np.zeros_like(last_mean, dtype=dtype)
    last_n = np.zeros_like(last_mean, dtype=np.int64)

    # First incremental round, without and with weights.
    means0, vars0, n_incr0 = incr_mean_variance_axis(
        X=X_sparse, axis=axis, last_mean=last_mean, last_var=last_var,
        last_n=last_n, weights=None)
    means_w0, vars_w0, n_incr_w0 = incr_mean_variance_axis(
        X=Xw_sparse, axis=axis, last_mean=last_mean, last_var=last_var,
        last_n=last_n, weights=weights)

    # Results must come back in the requested floating dtype.
    for result in (means_w0, vars_w0, n_incr_w0):
        assert result.dtype == dtype

    means_simple, vars_simple = mean_variance_axis(X=X_sparse, axis=axis)

    assert_array_almost_equal(means0, means_w0)
    assert_array_almost_equal(means0, means_simple)
    assert_array_almost_equal(vars0, vars_w0)
    assert_array_almost_equal(vars0, vars_simple)
    assert_array_almost_equal(n_incr0, n_incr_w0)

    # Second round: feed the previous statistics back in to check the
    # incremental path.
    means1, vars1, n_incr1 = incr_mean_variance_axis(
        X=X_sparse, axis=axis, last_mean=means0, last_var=vars0,
        last_n=n_incr0, weights=None)
    means_w1, vars_w1, n_incr_w1 = incr_mean_variance_axis(
        X=Xw_sparse, axis=axis, last_mean=means_w0, last_var=vars_w0,
        last_n=n_incr_w0, weights=weights)

    assert_array_almost_equal(means1, means_w1)
    assert_array_almost_equal(vars1, vars_w1)
    assert_array_almost_equal(n_incr1, n_incr_w1)
    for result in (means_w1, vars_w1, n_incr_w1):
        assert result.dtype == dtype
def test_incr_mean_variance_no_new_n():
    """Updating with an empty matrix must leave the statistics untouched."""
    axis = 0
    X1 = sp.random(5, 1, density=0.8, random_state=0).tocsr()
    X2 = sp.random(0, 1, density=0.8, random_state=0).tocsr()

    n_features = X1.shape[1]
    last_mean = np.zeros(n_features)
    last_var = np.zeros(n_features)
    last_n = np.zeros(n_features, dtype=np.int64)
    last_mean, last_var, last_n = incr_mean_variance_axis(
        X1, axis, last_mean, last_var, last_n)

    # A zero-row update contributes nothing and should be ignored.
    updated = incr_mean_variance_axis(X2, axis, last_mean, last_var, last_n)
    for new_stat, old_stat in zip(updated, (last_mean, last_var, last_n)):
        assert_allclose(new_stat, old_stat)
def test_incr_mean_variance_axis_equivalence_mean_variance(X1, X2):
    """Two incremental updates must equal the stats of the stacked data.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/16448
    """
    axis = 0
    last_mean, last_var = np.zeros(X1.shape[1]), np.zeros(X1.shape[1])
    last_n = np.zeros(X1.shape[1], dtype=np.int64)
    updated_mean, updated_var, updated_n = incr_mean_variance_axis(
        X1, axis, last_mean, last_var, last_n)
    updated_mean, updated_var, updated_n = incr_mean_variance_axis(
        X2, axis, updated_mean, updated_var, updated_n)
    X = sp.vstack([X1, X2])
    # Use `.toarray()` instead of the `.A` shorthand: `.A` was deprecated
    # for sparse matrices and removed in recent SciPy releases.
    X_dense = X.toarray()
    # nan-aware reference stats: NaNs are skipped by the incremental code.
    assert_allclose(updated_mean, np.nanmean(X_dense, axis=axis))
    assert_allclose(updated_var, np.nanvar(X_dense, axis=axis))
    assert_allclose(updated_n,
                    np.count_nonzero(~np.isnan(X_dense), axis=0))
def test_incr_mean_variance_n_float():
    """`last_n` may also be passed as a plain scalar, not only an array."""
    X = sp.random(5, 2, density=0.8, random_state=0).tocsr()
    mean_start = np.zeros(X.shape[1])
    var_start = np.zeros(X.shape[1])
    # Scalar last_n=0: the updated count must broadcast to every feature.
    _, _, new_n = incr_mean_variance_axis(
        X, axis=0, last_mean=mean_start, last_var=var_start, last_n=0
    )
    assert_allclose(new_n, np.full(X.shape[1], X.shape[0]))
def partial_fit(self, X, y=None):
    """Online computation of mean and std on X for later scaling.

    All of X is processed as a single batch. This is intended for cases
    when `fit` is not feasible due to very large number of `n_samples`
    or because X is read from a continuous stream.

    The algorithm for incremental mean and std is given in Equation 1.5a,b
    in Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. "Algorithms
    for computing the sample variance: Analysis and recommendations."
    The American Statistician 37.3 (1983): 242-247:

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, n_features]
        The data used to compute the mean and standard deviation
        used for later scaling along the features axis.

    y: Passthrough for ``Pipeline`` compatibility.
    """
    # NOTE(review): `warn_on_dtype` was deprecated and later removed from
    # scikit-learn's check_array -- confirm the pinned sklearn version.
    X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
                    ensure_2d=False, warn_on_dtype=True, estimator=self,
                    dtype=FLOAT_DTYPES)
    # Even in the case of `with_mean=False`, we update the mean anyway
    # This is needed for the incremental computation of the var
    # See incr_mean_variance_axis and _incremental_mean_variance_axis
    if not sparse.issparse(X):
        # Dense input: delegate to the parent implementation.
        # NOTE(review): `y` is not forwarded here -- confirm the parent
        # partial_fit signature tolerates the omission.
        return super(SparseScaler, self).partial_fit(X)
    if self.with_std:
        # First pass
        if not hasattr(self, 'n_samples_seen_'):
            self.mean_, self.var_ = mean_variance_axis(X, axis=0)
            n = X.shape[0]
            self.n_samples_seen_ = n
        # Next passes
        else:
            self.mean_, self.var_, self.n_samples_seen_ = \
                incr_mean_variance_axis(X, axis=0,
                                        last_mean=self.mean_,
                                        last_var=self.var_,
                                        last_n=self.n_samples_seen_)
    # NOTE(review): when `with_std` is False, `mean_`, `var_` and
    # `n_samples_seen_` are never initialised on the sparse path --
    # confirm that this is intended.
    if self.with_std:
        self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
    else:
        self.scale_ = None
    return self
def test_mean_variance_illegal_axis():
    """Both mean/variance helpers must reject out-of-range axis values."""
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_csr = sp.csr_matrix(X)

    for bad_axis in (-3, 2, -1):
        with pytest.raises(ValueError):
            mean_variance_axis(X_csr, axis=bad_axis)
        with pytest.raises(ValueError):
            incr_mean_variance_axis(X_csr, axis=bad_axis, last_mean=None,
                                    last_var=None, last_n=None)
def test_incr_mean_variance_axis():
    """Incremental mean/variance must match the one-shot computation.

    Fix: the original repeated the first TypeError `assert_raises` call
    twice verbatim; the duplicate has been removed.
    """
    for axis in [0, 1]:
        rng = np.random.RandomState(0)
        n_features = 50
        n_samples = 10
        data_chunks = [rng.randint(0, 2, size=n_features)
                       for i in range(n_samples)]

        # default params for incr_mean_variance
        last_mean = np.zeros(n_features)
        last_var = np.zeros_like(last_mean)
        last_n = 0

        # Test errors
        X = np.array(data_chunks[0])
        X = np.atleast_2d(X)
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)
        # A non-sparse first argument (here: the integer `axis`) must
        # raise TypeError.
        assert_raises(TypeError, incr_mean_variance_axis, axis,
                      last_mean, last_var, last_n)
        # LIL matrices are not an accepted sparse format.
        assert_raises(TypeError, incr_mean_variance_axis, X_lil, axis,
                      last_mean, last_var, last_n)

        # Test _incr_mean_and_var with a 1 row input
        X_means, X_vars = mean_variance_axis(X_csr, axis)
        X_means_incr, X_vars_incr, n_incr = \
            incr_mean_variance_axis(X_csr, axis, last_mean, last_var,
                                    last_n)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)  # X.shape[axis] picks # samples

        X_csc = sp.csc_matrix(X_lil)
        X_means, X_vars = mean_variance_axis(X_csc, axis)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)

        # Test _incremental_mean_and_var with whole data
        X = np.vstack(data_chunks)
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)
        X_csc = sp.csc_matrix(X_lil)
        # Integer inputs are promoted to float64; floats keep their dtype.
        expected_dtypes = [(np.float32, np.float32),
                           (np.float64, np.float64),
                           (np.int32, np.float64),
                           (np.int64, np.float64)]
        for input_dtype, output_dtype in expected_dtypes:
            for X_sparse in (X_csr, X_csc):
                X_sparse = X_sparse.astype(input_dtype)
                X_means, X_vars = mean_variance_axis(X_sparse, axis)
                X_means_incr, X_vars_incr, n_incr = \
                    incr_mean_variance_axis(X_sparse, axis, last_mean,
                                            last_var, last_n)
                assert_equal(X_means_incr.dtype, output_dtype)
                assert_equal(X_vars_incr.dtype, output_dtype)
                assert_array_almost_equal(X_means, X_means_incr)
                assert_array_almost_equal(X_vars, X_vars_incr)
                assert_equal(X.shape[axis], n_incr)
def test_incr_mean_variance_axis():
    """Incremental mean/variance must match the one-shot computation.

    Fixes over the original:
    - `rng.random_integers(0, 1, ...)` was deprecated and removed from
      NumPy; `randint(0, 2, ...)` draws the identical {0, 1} stream.
    - A verbatim-duplicated `assert_raises` call was removed.
    - The float32 "CSC" matrix was built with `X_csr.astype(...)`, which
      yields a CSR matrix, so the CSC branch was never exercised.
    """
    for axis in [0, 1]:
        rng = np.random.RandomState(0)
        n_features = 50
        n_samples = 10
        data_chunks = [rng.randint(0, 2, size=n_features)
                       for i in range(n_samples)]

        # default params for incr_mean_variance
        last_mean = np.zeros(n_features)
        last_var = np.zeros_like(last_mean)
        last_n = 0

        # Test errors
        X = np.array(data_chunks[0])
        X = np.atleast_2d(X)
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)
        # A non-sparse first argument (here: the integer `axis`) must
        # raise TypeError.
        assert_raises(TypeError, incr_mean_variance_axis, axis,
                      last_mean, last_var, last_n)
        # LIL matrices are not an accepted sparse format.
        assert_raises(TypeError, incr_mean_variance_axis, X_lil, axis,
                      last_mean, last_var, last_n)

        # Test _incr_mean_and_var with a 1 row input
        X_means, X_vars = mean_variance_axis(X_csr, axis)
        X_means_incr, X_vars_incr, n_incr = \
            incr_mean_variance_axis(X_csr, axis, last_mean, last_var,
                                    last_n)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)  # X.shape[axis] picks # samples

        X_csc = sp.csc_matrix(X_lil)
        X_means, X_vars = mean_variance_axis(X_csc, axis)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)

        # Test _incremental_mean_and_var with whole data
        X = np.vstack(data_chunks)
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)
        X_means, X_vars = mean_variance_axis(X_csr, axis)
        X_means_incr, X_vars_incr, n_incr = \
            incr_mean_variance_axis(X_csr, axis, last_mean, last_var,
                                    last_n)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)

        X_csc = sp.csc_matrix(X_lil)
        X_means, X_vars = mean_variance_axis(X_csc, axis)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)

        # All data but as float
        X = X.astype(np.float32)
        X_csr = X_csr.astype(np.float32)
        X_means, X_vars = mean_variance_axis(X_csr, axis)
        X_means_incr, X_vars_incr, n_incr = \
            incr_mean_variance_axis(X_csr, axis, last_mean, last_var,
                                    last_n)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)

        # Build an actual CSC matrix (the original used X_csr.astype,
        # which stays CSR).
        X_csc = sp.csc_matrix(X_lil).astype(np.float32)
        X_means, X_vars = mean_variance_axis(X_csc, axis)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)
def test_incr_mean_variance_axis():
    """Incremental mean/variance must match the one-shot computation.

    Fix: the original repeated the first TypeError `assert_raises` call
    twice verbatim; the duplicate has been removed.
    """
    for axis in [0, 1]:
        rng = np.random.RandomState(0)
        n_features = 50
        n_samples = 10
        data_chunks = [rng.randint(0, 2, size=n_features)
                       for i in range(n_samples)]

        # default params for incr_mean_variance
        last_mean = np.zeros(n_features)
        last_var = np.zeros_like(last_mean)
        last_n = np.zeros_like(last_mean, dtype=np.int64)

        # Test errors
        X = np.array(data_chunks[0])
        X = np.atleast_2d(X)
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)
        # A non-sparse first argument (here: the integer `axis`) must
        # raise TypeError.
        assert_raises(TypeError, incr_mean_variance_axis, axis,
                      last_mean, last_var, last_n)
        # LIL matrices are not an accepted sparse format.
        assert_raises(TypeError, incr_mean_variance_axis, X_lil, axis,
                      last_mean, last_var, last_n)

        # Test _incr_mean_and_var with a 1 row input
        X_means, X_vars = mean_variance_axis(X_csr, axis)
        X_means_incr, X_vars_incr, n_incr = \
            incr_mean_variance_axis(X_csr, axis, last_mean, last_var,
                                    last_n)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)  # X.shape[axis] picks # samples

        X_csc = sp.csc_matrix(X_lil)
        X_means, X_vars = mean_variance_axis(X_csc, axis)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)

        # Test _incremental_mean_and_var with whole data
        X = np.vstack(data_chunks)
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)
        X_csc = sp.csc_matrix(X_lil)
        # Integer inputs are promoted to float64; floats keep their dtype.
        expected_dtypes = [(np.float32, np.float32),
                           (np.float64, np.float64),
                           (np.int32, np.float64),
                           (np.int64, np.float64)]
        for input_dtype, output_dtype in expected_dtypes:
            for X_sparse in (X_csr, X_csc):
                X_sparse = X_sparse.astype(input_dtype)
                # Accumulators must carry the expected output dtype.
                last_mean = last_mean.astype(output_dtype)
                last_var = last_var.astype(output_dtype)
                X_means, X_vars = mean_variance_axis(X_sparse, axis)
                X_means_incr, X_vars_incr, n_incr = \
                    incr_mean_variance_axis(X_sparse, axis, last_mean,
                                            last_var, last_n)
                assert_equal(X_means_incr.dtype, output_dtype)
                assert_equal(X_vars_incr.dtype, output_dtype)
                assert_array_almost_equal(X_means, X_means_incr)
                assert_array_almost_equal(X_vars, X_vars_incr)
                assert_equal(X.shape[axis], n_incr)
def test_incr_mean_variance_axis():
    """Incremental mean/variance must match the one-shot computation.

    Fixes over the original:
    - `rng.random_integers(0, 1, ...)` was deprecated and removed from
      NumPy; `randint(0, 2, ...)` draws the identical {0, 1} stream.
    - A verbatim-duplicated `assert_raises` call was removed.
    - The float32 "CSC" matrix was built with `X_csr.astype(...)`, which
      yields a CSR matrix, so the CSC branch was never exercised.
    """
    for axis in [0, 1]:
        rng = np.random.RandomState(0)
        n_features = 50
        n_samples = 10
        data_chunks = [rng.randint(0, 2, size=n_features)
                       for i in range(n_samples)]

        # default params for incr_mean_variance
        last_mean = np.zeros(n_features)
        last_var = np.zeros_like(last_mean)
        last_n = 0

        # Test errors
        X = np.array(data_chunks[0])
        X = np.atleast_2d(X)
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)
        # A non-sparse first argument (here: the integer `axis`) must
        # raise TypeError.
        assert_raises(TypeError, incr_mean_variance_axis, axis,
                      last_mean, last_var, last_n)
        # LIL matrices are not an accepted sparse format.
        assert_raises(TypeError, incr_mean_variance_axis, X_lil, axis,
                      last_mean, last_var, last_n)

        # Test _incr_mean_and_var with a 1 row input
        X_means, X_vars = mean_variance_axis(X_csr, axis)
        X_means_incr, X_vars_incr, n_incr = \
            incr_mean_variance_axis(X_csr, axis, last_mean, last_var,
                                    last_n)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)  # X.shape[axis] picks # samples

        X_csc = sp.csc_matrix(X_lil)
        X_means, X_vars = mean_variance_axis(X_csc, axis)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)

        # Test _incremental_mean_and_var with whole data
        X = np.vstack(data_chunks)
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)
        X_means, X_vars = mean_variance_axis(X_csr, axis)
        X_means_incr, X_vars_incr, n_incr = \
            incr_mean_variance_axis(X_csr, axis, last_mean, last_var,
                                    last_n)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)

        X_csc = sp.csc_matrix(X_lil)
        X_means, X_vars = mean_variance_axis(X_csc, axis)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)

        # All data but as float
        X = X.astype(np.float32)
        X_csr = X_csr.astype(np.float32)
        X_means, X_vars = mean_variance_axis(X_csr, axis)
        X_means_incr, X_vars_incr, n_incr = \
            incr_mean_variance_axis(X_csr, axis, last_mean, last_var,
                                    last_n)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)

        # Build an actual CSC matrix (the original used X_csr.astype,
        # which stays CSR).
        X_csc = sp.csc_matrix(X_lil).astype(np.float32)
        X_means, X_vars = mean_variance_axis(X_csc, axis)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)
def partial_fit(self, X, y=None):
    """
    Performs online computation of mean and standard deviation on X for
    later scaling. All of X is processed as a single batch. This is
    intended for cases when `fit` is not feasible due to very large
    number of `n_samples` or because X is read from a continuous stream.
    The algorithm for incremental mean and std is given in Equation
    1.5a,b in Chan, Tony F., Gene H. Golub, and Randall J. LeVeque.
    "Algorithms for computing the sample variance: Analysis and
    recommendations." The American Statistician 37.3 (1983): 242-247

    :param X: Data matrix to scale.
    :type X: numpy.ndarray, shape [n_samples, n_features]
    :param y: Passthrough for Scikit-learn ``Pipeline`` compatibility.
    :type y: None
    :return: Fitted object.
    :rtype: pyChemometrics.ChemometricsScaler
    """
    X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
                    estimator=self, dtype=FLOAT_DTYPES)

    # Even in the case of `with_mean=False`, we update the mean anyway
    # This is needed for the incremental computation of the var
    # See incr_mean_variance_axis and _incremental_mean_variance_axis
    if sparse.issparse(X):
        if self.with_mean:
            # Centering a sparse matrix would densify it; refuse up front.
            raise ValueError(
                "Cannot center sparse matrices: pass `with_mean=False` "
                "instead. See docstring for motivation and alternatives.")
        if self.with_std:
            # First pass
            if not hasattr(self, 'n_samples_seen_'):
                self.mean_, self.var_ = mean_variance_axis(X, axis=0)
                self.n_samples_seen_ = X.shape[0]
            # Next passes
            else:
                self.mean_, self.var_, self.n_samples_seen_ = \
                    incr_mean_variance_axis(X, axis=0,
                                            last_mean=self.mean_,
                                            last_var=self.var_,
                                            last_n=self.n_samples_seen_)
        else:
            # No std requested: statistics are left unset for sparse data.
            self.mean_ = None
            self.var_ = None
    else:
        # Dense path: initialise running statistics on the first call.
        # First pass
        if not hasattr(self, 'n_samples_seen_'):
            self.mean_ = .0
            self.n_samples_seen_ = 0
            if self.with_std:
                self.var_ = .0
            else:
                self.var_ = None
        self.mean_, self.var_, self.n_samples_seen_ = \
            _incremental_mean_and_var(X, self.mean_, self.var_,
                                      self.n_samples_seen_)

    if self.with_std:
        # NOTE(review): `scale_power` appears to generalise StandardScaler
        # (e.g. 0.5 would give Pareto scaling) -- confirm intended
        # semantics against the class definition.
        self.scale_ = _handle_zeros_in_scale(numpy.sqrt(
            self.var_))**self.scale_power
    else:
        self.scale_ = None
    return self