Example #1
def test_incremental_mean_and_variance_ignore_nan():
    old_means = np.array([535.0, 535.0, 535.0, 535.0])
    old_variances = np.array([4225.0, 4225.0, 4225.0, 4225.0])
    old_sample_count = np.array([2, 2, 2, 2], dtype=np.int32)

    X = np.array([[170, 170, 170, 170], [430, 430, 430, 430], [300, 300, 300, 300]])

    X_nan = np.array(
        [
            [170, np.nan, 170, 170],
            [np.nan, 170, 430, 430],
            [430, 430, np.nan, 300],
            [300, 300, 300, np.nan],
        ]
    )

    X_means, X_variances, X_count = _incremental_mean_and_var(
        X, old_means, old_variances, old_sample_count
    )
    X_nan_means, X_nan_variances, X_nan_count = _incremental_mean_and_var(
        X_nan, old_means, old_variances, old_sample_count
    )

    assert_allclose(X_nan_means, X_means)
    assert_allclose(X_nan_variances, X_variances)
    assert_allclose(X_nan_count, X_count)
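
The test above relies on NaN entries being excluded from the per-feature counts, sums and sums of squares. Below is a minimal cross-check of the expected result for a single column, assuming the standard pooling of an old (count, mean, population variance) triple with the new non-NaN values; the variable names are illustrative and not part of scikit-learn.

import numpy as np

old_n, old_mean, old_var = 2, 535.0, 4225.0      # previous statistics
col = np.array([170.0, np.nan, 430.0, 300.0])    # one column of X_nan
new = col[~np.isnan(col)]                        # NaNs are simply dropped

n = old_n + new.size
mean = (old_n * old_mean + new.sum()) / n
# pooled sum of squared deviations about the combined mean
ss = old_n * old_var + old_n * (old_mean - mean) ** 2 + ((new - mean) ** 2).sum()
var = ss / n                                     # population variance (ddof=0)
print(n, mean, var)                              # 5 394.0 21704.0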
Example #2
def test_incremental_weighted_mean_and_variance_ignore_nan(dtype):
    old_means = np.array([535.0, 535.0, 535.0, 535.0])
    old_variances = np.array([4225.0, 4225.0, 4225.0, 4225.0])
    old_weight_sum = np.array([2, 2, 2, 2], dtype=np.int32)
    sample_weights_X = np.ones(3)
    sample_weights_X_nan = np.ones(4)

    X = np.array(
        [[170, 170, 170, 170], [430, 430, 430, 430], [300, 300, 300, 300]]
    ).astype(dtype)

    X_nan = np.array(
        [
            [170, np.nan, 170, 170],
            [np.nan, 170, 430, 430],
            [430, 430, np.nan, 300],
            [300, 300, 300, np.nan],
        ]
    ).astype(dtype)

    X_means, X_variances, X_count = _incremental_mean_and_var(
        X, old_means, old_variances, old_weight_sum, sample_weight=sample_weights_X
    )
    X_nan_means, X_nan_variances, X_nan_count = _incremental_mean_and_var(
        X_nan,
        old_means,
        old_variances,
        old_weight_sum,
        sample_weight=sample_weights_X_nan,
    )

    assert_allclose(X_nan_means, X_means)
    assert_allclose(X_nan_variances, X_variances)
    assert_allclose(X_nan_count, X_count)
Example #3
def test_incremental_variance_ddof():
    # Test that degrees of freedom parameter for calculations are correct.
    rng = np.random.RandomState(1999)
    X = rng.randn(50, 10)
    n_samples, n_features = X.shape
    for batch_size in [11, 20, 37]:
        steps = np.arange(0, X.shape[0], batch_size)
        if steps[-1] != X.shape[0]:
            steps = np.hstack([steps, n_samples])

        for i, j in zip(steps[:-1], steps[1:]):
            batch = X[i:j, :]
            if i == 0:
                incremental_means = batch.mean(axis=0)
                incremental_variances = batch.var(axis=0)
                # Assign this twice so that the test logic is consistent
                incremental_count = batch.shape[0]
                sample_count = batch.shape[0]
            else:
                result = _incremental_mean_and_var(
                    batch, incremental_means, incremental_variances,
                    sample_count)
                (incremental_means, incremental_variances,
                 incremental_count) = result
                sample_count += batch.shape[0]

            calculated_means = np.mean(X[:j], axis=0)
            calculated_variances = np.var(X[:j], axis=0)
            assert_almost_equal(incremental_means, calculated_means, 6)
            assert_almost_equal(incremental_variances,
                                calculated_variances, 6)
            assert_equal(incremental_count, sample_count)
Example #4
def test_incremental_variance_ddof():
    # Test that degrees of freedom parameter for calculations are correct.
    rng = np.random.RandomState(1999)
    X = rng.randn(50, 10)
    n_samples, n_features = X.shape
    for batch_size in [11, 20, 37]:
        steps = np.arange(0, X.shape[0], batch_size)
        if steps[-1] != X.shape[0]:
            steps = np.hstack([steps, n_samples])

        for i, j in zip(steps[:-1], steps[1:]):
            batch = X[i:j, :]
            if i == 0:
                incremental_means = batch.mean(axis=0)
                incremental_variances = batch.var(axis=0)
                # Assign this twice so that the test logic is consistent
                incremental_count = batch.shape[0]
                sample_count = np.full(batch.shape[1],
                                       batch.shape[0],
                                       dtype=np.int32)
            else:
                result = _incremental_mean_and_var(batch, incremental_means,
                                                   incremental_variances,
                                                   sample_count)
                (incremental_means, incremental_variances,
                 incremental_count) = result
                sample_count += batch.shape[0]

            calculated_means = np.mean(X[:j], axis=0)
            calculated_variances = np.var(X[:j], axis=0)
            assert_almost_equal(incremental_means, calculated_means, 6)
            assert_almost_equal(incremental_variances, calculated_variances, 6)
            assert_array_equal(incremental_count, sample_count)
Example #5
def get_var(dataset):
    mean_, var_ = 0., 0.
    count = 0
    dtype = dataset[0][1].dtype

    for idx, (_, x, _, length) in enumerate(dataset):
        # each item is a tuple whose second element is the feature array
        # and whose fourth element is its valid (unpadded) length
        x = x[:length]
        mean_, var_, _ = _incremental_mean_and_var(x, mean_, var_, count)
        count += len(x)

    return var_.astype(dtype)
Example #6
 def _assert(X, sample_weight, expected_mean, expected_var):
     n = X.shape[0]
     for chunk_size in [1, n // 10 + 1, n // 4 + 1, n // 2 + 1, n]:
         last_mean, last_weight_sum, last_var = 0, 0, 0
         for batch in gen_batches(n, chunk_size):
             last_mean, last_var, last_weight_sum = \
                 _incremental_mean_and_var(
                     X[batch], last_mean, last_var, last_weight_sum,
                     sample_weight=sample_weight[batch])
         assert_allclose(last_mean, expected_mean)
         assert_allclose(last_var, expected_var, atol=1e-6)
Example #7
def test_incremental_weighted_mean_and_variance_simple(rng, dtype):
    mult = 10
    X = rng.rand(1000, 20).astype(dtype) * mult
    sample_weight = rng.rand(X.shape[0]) * mult
    mean, var, _ = _incremental_mean_and_var(X, 0, 0, 0, sample_weight=sample_weight)

    expected_mean = np.average(X, weights=sample_weight, axis=0)
    expected_var = (
        np.average(X ** 2, weights=sample_weight, axis=0) - expected_mean ** 2
    )
    assert_almost_equal(mean, expected_mean)
    assert_almost_equal(var, expected_var)
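
Here ``expected_var`` is the weighted population variance, E_w[x^2] - (E_w[x])^2. As a sanity check (not part of the test), the same quantity can be read off the diagonal of NumPy's weighted covariance with ddof=0:

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(1000, 20) * 10
sample_weight = rng.rand(X.shape[0]) * 10

expected_mean = np.average(X, weights=sample_weight, axis=0)
expected_var = np.average(X ** 2, weights=sample_weight, axis=0) - expected_mean ** 2
cov_var = np.diag(np.cov(X.T, aweights=sample_weight, ddof=0))
assert np.allclose(expected_var, cov_var)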
Example #8
def meanvar(dataset,
            lengths=None,
            mean_=0.,
            var_=0.,
            last_sample_count=0,
            return_last_sample_count=False):
    """Mean/variance computation given a iterable dataset

    Dataset can have variable length samples. In that cases, you need to
    explicitly specify lengths for all the samples.

    Args:
        dataset (nnmnkwii.datasets.Dataset): Dataset
        lengths (list): Frame lengths for each dataset sample.
        mean\_ (array or scalar): Initial value for mean vector.
        var\_ (array or scalar): Initial value for variance vector.
        last_sample_count (int): Last sample count. Default is 0. If you set
          non-default ``mean_`` and ``var_``, you also need to set
          ``last_sample_count``. Typically this will be the number of
          time frames seen so far.
        return_last_sample_count (bool): Return ``last_sample_count`` if True.

    Returns:
        tuple: Mean and variance for each dimension. If
          ``return_last_sample_count`` is True, returns ``last_sample_count``
          as well.

    See also:
        :func:`nnmnkwii.preprocessing.meanstd`, :func:`nnmnkwii.preprocessing.scale`

    Examples:
        >>> from nnmnkwii.preprocessing import meanvar
        >>> from nnmnkwii.util import example_file_data_sources_for_acoustic_model
        >>> from nnmnkwii.datasets import FileSourceDataset
        >>> X, Y = example_file_data_sources_for_acoustic_model()
        >>> X, Y = FileSourceDataset(X), FileSourceDataset(Y)
        >>> lengths = [len(y) for y in Y]
        >>> data_mean, data_var = meanvar(Y, lengths)
    """
    dtype = dataset[0].dtype

    for idx, x in enumerate(dataset):
        if lengths is not None:
            x = x[:lengths[idx]]
        mean_, var_, _ = _incremental_mean_and_var(x, mean_, var_,
                                                   last_sample_count)
        last_sample_count += len(x)
    mean_, var_ = mean_.astype(dtype), var_.astype(dtype)

    if return_last_sample_count:
        return mean_, var_, last_sample_count
    else:
        return mean_, var_
Example #9
def test_incremental_mean_and_variance_ignore_nan():
    old_means = np.array([535., 535., 535., 535.])
    old_variances = np.array([4225., 4225., 4225., 4225.])
    old_sample_count = np.array([2, 2, 2, 2], dtype=np.int32)

    X = np.array([[170, 170, 170, 170],
                  [430, 430, 430, 430],
                  [300, 300, 300, 300]])

    X_nan = np.array([[170, np.nan, 170, 170],
                      [np.nan, 170, 430, 430],
                      [430, 430, np.nan, 300],
                      [300, 300, 300, np.nan]])

    X_means, X_variances, X_count = _incremental_mean_and_var(
        X, old_means, old_variances, old_sample_count)
    X_nan_means, X_nan_variances, X_nan_count = _incremental_mean_and_var(
        X_nan, old_means, old_variances, old_sample_count)

    assert_allclose(X_nan_means, X_means)
    assert_allclose(X_nan_variances, X_variances)
    assert_allclose(X_nan_count, X_count)
Example #10
def meanvar(dataset):
    last_sample_count = 0
    mean_ = 0.
    var_ = 0.
    dtype = dataset[0].dtype

    for idx, x in enumerate(dataset):
        mean_, var_, _ = _incremental_mean_and_var(x, mean_, var_,
                                                   last_sample_count)
        last_sample_count += len(x)
    mean_, var_ = mean_.astype(dtype), var_.astype(dtype)

    return mean_, var_
Example #11
    def fit(self, X, y=None):
        """Compute the mean and std to be used for later scaling.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, n_features]
            The data used to compute the mean and standard deviation
            used for later scaling along the features axis.

        y: Passthrough for ``Pipeline`` compatibility.
        """
        self.mean_, self.var_, self.n_samples_seen_ = \
            _incremental_mean_and_var(X, self.mean_, self.var_,
                                      self.n_samples_seen_)
        return self
Example #12
 def partial_fit_transform(self, X):
     shp = X.shape
     if X.ndim == 4:
         X = np.reshape(X, (shp[0], -1))
     elif X.ndim == 5:
         X = np.reshape(X, (shp[0] * shp[1], -1))
     whr = np.where(np.any(X != self.pad_value, axis=1))[0]
     if len(whr) > 0:
         if self.n_samples_seen < self.n_samples_train:
             self.lock.acquire()
             try:
                 # Update stats - they are 0 if this is the first step
                 col_mean, col_var, n_total_samples = \
                     _incremental_mean_and_var(
                         X[whr], last_mean=self.mean, last_variance=self.var,
                         last_sample_count=np.repeat(self.n_samples_seen,
                                                     X[whr].shape[1]))
                 n_total_samples = n_total_samples[0]
                 if self.n_samples_seen == 0:
                     X[whr] = X[whr] - col_mean
                     _X = X[whr]
                 else:
                     col_batch_mean = np.mean(X[whr], axis=0)
                     X[whr] = X[whr] - col_batch_mean
                     # Build matrix of combined previous basis and new data
                     mean_correction = np.sqrt(
                         (self.n_samples_seen * X[whr].shape[0]) /
                         n_total_samples) * (self.mean - col_batch_mean)
                     _X = np.vstack((self.singular_values.reshape(
                         (-1, 1)) * self.components, X[whr],
                                     mean_correction))
                 U, S, V = np.linalg.svd(_X, full_matrices=False)
                 U, V = svd_flip(U, V, u_based_decision=False)
                 explained_variance = S**2 / (n_total_samples - 1)
                 self.n_samples_seen = n_total_samples
                 self.components = V[:self.max_components]
                 self.singular_values = S[:self.max_components]
                 self.mean = col_mean
                 self.var = col_var
                 self.explained_variance = explained_variance[:self.
                                                              max_components]
             finally:
                 self.lock.release()
         else:
             X[whr] = X[whr] - self.mean
         X[whr] = np.dot((np.dot(X[whr], self.components.T) /
                          np.sqrt(self.explained_variance + self.epsilon)),
                         self.components)
     return np.reshape(X, shp)
Example #13
def test_incremental_variance_update_formulas():
    # Test Youngs and Cramer incremental variance formulas.
    # Doggie data from https://www.mathsisfun.com/data/standard-deviation.html
    A = np.array([[600, 470, 170, 430, 300], [600, 470, 170, 430, 300],
                  [600, 470, 170, 430, 300], [600, 470, 170, 430, 300]]).T
    idx = 2
    X1 = A[:idx, :]
    X2 = A[idx:, :]

    old_means = X1.mean(axis=0)
    old_variances = X1.var(axis=0)
    old_sample_count = np.full(X1.shape[1], X1.shape[0], dtype=np.int32)
    final_means, final_variances, final_count = \
        _incremental_mean_and_var(X2, old_means, old_variances,
                                  old_sample_count)
    assert_almost_equal(final_means, A.mean(axis=0), 6)
    assert_almost_equal(final_variances, A.var(axis=0), 6)
    assert_almost_equal(final_count, A.shape[0])
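
The merge exercised by this test can be written compactly in terms of counts, means and sums of squared deviations (m2 = count * population variance). A small self-contained sketch of that pairwise update on the same data, for illustration only:

import numpy as np

def merge_stats(n_a, mean_a, m2_a, n_b, mean_b, m2_b):
    # Chan/Golub/LeVeque pairwise update of running statistics
    n = n_a + n_b
    delta = mean_b - mean_a
    mean = mean_a + delta * n_b / n
    m2 = m2_a + m2_b + delta ** 2 * n_a * n_b / n
    return n, mean, m2

x = np.array([600., 470., 170., 430., 300.])
a, b = x[:2], x[2:]
n, mean, m2 = merge_stats(a.size, a.mean(), a.size * a.var(),
                          b.size, b.mean(), b.size * b.var())
assert np.isclose(mean, x.mean()) and np.isclose(m2 / n, x.var())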
Example #14
def test_incremental_variance_update_formulas():
    # Test Youngs and Cramer incremental variance formulas.
    # Doggie data from http://www.mathsisfun.com/data/standard-deviation.html
    A = np.array([[600, 470, 170, 430, 300],
                  [600, 470, 170, 430, 300],
                  [600, 470, 170, 430, 300],
                  [600, 470, 170, 430, 300]]).T
    idx = 2
    X1 = A[:idx, :]
    X2 = A[idx:, :]

    old_means = X1.mean(axis=0)
    old_variances = X1.var(axis=0)
    old_sample_count = X1.shape[0]
    final_means, final_variances, final_count = \
        _incremental_mean_and_var(X2, old_means, old_variances,
                                  old_sample_count)
    assert_almost_equal(final_means, A.mean(axis=0), 6)
    assert_almost_equal(final_variances, A.var(axis=0), 6)
    assert_almost_equal(final_count, A.shape[0])
Example #15
    def partial_fit(self, X, y=None, check_input=True, svd_solver='auto'):
        '''Incremental PCA partial fit.

        If ``svd_solver`` is in ['arpack', 'randomized'], the update is done by
        ``_partial_fit_truncated``; if it is 'full', by ``_partial_fit_full``;
        otherwise a ValueError is raised.

        Relevant instance attributes: ``standard_scaler`` (default True),
        ``whiten`` (bool, default False), ``copy`` (bool, default True).
        '''
        if check_input:
            check_array(X, copy=self.copy, accept_sparse=['csc', 'csr'])

        n_samples, n_features = X.shape
        if self.standard_scaler:
            if not hasattr(self, 'scaler_'):
                self.scaler_ = StandardScaler()
                X = self.scaler_.fit_transform(X)

        if not hasattr(self, 'components_'):
            self.components_ = None

        # determine n_components
        if self.n_components is None:
            if self.components_ is None:
                if self.svd_solver != 'arpack':
                    n_components = min(n_samples, n_features)
                else:
                    n_components = min(n_samples, n_features) - 1
            else:
                n_components = self.components_.shape[0]
        else:
            # _partial_fit_full/_partial_fit_truncated handle n_components
            # given as a float in (0, 1], as 'mle', or as an int.
            n_components = self.n_components

        self.n_components = n_components

        _fit_svd_solver = self.svd_solver
        if _fit_svd_solver == 'auto':
            if max(n_samples, n_features) <= 500 or n_components == 'mle':
                _fit_svd_solver = 'full'
            elif 1 <= n_components < 0.8 * min(n_samples, n_features):
                _fit_svd_solver = 'randomized'
            else:
                _fit_svd_solver = 'full'

        if (self.components_ is not None
                and n_components != self.components_.shape[0]):
            raise ValueError(
                'n_components must match the first dimension of components_')

        # First call: initialize the running statistics
        if not hasattr(self, 'n_sample_seen'):
            self.n_sample_seen = 0
            self.mean_ = 0.
            self.var_ = 0.
        if issparse(X):
            total_mean, total_variance, n_sample_total = incr_mean_variance_axis0(
                X, self.mean_, self.var_, self.n_sample_seen)

        else:
            total_mean, total_variance, n_sample_total = _incremental_mean_and_var(
                X, self.mean_, self.var_, self.n_sample_seen)

        n_sample_total = n_sample_total[0]
        if self.n_sample_seen == 0:
            X = X - total_mean
        else:
            _local_mean = np.mean(X, axis=0)
            X = X - _local_mean
            # mean_correction = sqrt(local_samples * merge_before_samples /
            #     merge_after_samples) * (merge_before_mean - local_mean)
            mean_correction = np.sqrt(
                (n_samples * self.n_sample_seen) /
                n_sample_total) * (self.mean_ - _local_mean)
            #vstack : singular_values(K,)*components_(K,N),X(M,N),mean_correction(N,)
            X = np.vstack((self.singular_values[:, None] * self.components_, X,
                           mean_correction))

        self.mean_ = total_mean
        self.var_ = total_variance
        self.n_sample_seen = n_sample_total

        if _fit_svd_solver == 'full':
            return self._partial_fit_full(X)
        elif _fit_svd_solver in ['arpack', 'randomized']:
            return self._partial_fit_truncated(X, _fit_svd_solver)
        else:
            raise ValueError(
                "svd_solver={0!r} is not one of "
                "'full', 'randomized', 'arpack', 'auto'".format(_fit_svd_solver))
Example #16
    def partial_fit(self, X, y=None, check_input=True):
        """Incremental fit with X. All of X is processed as a single batch.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples and
        n_features is the number of features.

    Returns
    -------
    self: object
        Returns the instance itself.
    """
        # ====== check the samples and caches ====== #
        if isinstance(X, Data):
            X = X[:]
        if check_input:
            X = check_array(X, copy=self.copy, dtype=[np.float64, np.float32])
        n_samples, n_features = X.shape
        # check number of components
        if self.n_components is None:
            self.n_components_ = n_features
        elif not 1 <= self.n_components <= n_features:
            raise ValueError("n_components=%r invalid for n_features=%d, need "
                             "more rows than columns for IncrementalPCA "
                             "processing" % (self.n_components, n_features))
        else:
            self.n_components_ = self.n_components
        # check the cache
        if n_samples < n_features or self._nb_cached_samples > 0:
            self._cache_batches.append(X)
            self._nb_cached_samples += n_samples
            # not enough samples yet
            if self._nb_cached_samples < n_features:
                return
            else:  # group mini batch into big batch
                X = np.concatenate(self._cache_batches, axis=0)
                self._cache_batches = []
                self._nb_cached_samples = 0
        n_samples = X.shape[0]
        # ====== fit the model ====== #
        if (self.components_ is not None) and (self.components_.shape[0] !=
                                               self.n_components_):
            raise ValueError("Number of input features has changed from %i "
                             "to %i between calls to partial_fit! Try "
                             "setting n_components to a fixed value." %
                             (self.components_.shape[0], self.n_components_))
        # Update stats - they are 0 if this is the first step
        col_mean, col_var, n_total_samples = \
            _incremental_mean_and_var(X, last_mean=self.mean_,
                                      last_variance=self.var_,
                                      last_sample_count=self.n_samples_seen_)
        total_var = np.sum(col_var * n_total_samples)
        if total_var == 0:  # if variance == 0, make no sense to continue
            return self
        # Whitening
        if self.n_samples_seen_ == 0:
            # If it is the first step, simply whiten X
            X -= col_mean
        else:
            col_batch_mean = np.mean(X, axis=0)
            X -= col_batch_mean
            # Build matrix of combined previous basis and new data
            mean_correction = \
                np.sqrt((self.n_samples_seen_ * n_samples) /
                        n_total_samples) * (self.mean_ - col_batch_mean)
            X = np.vstack((self.singular_values_.reshape(
                (-1, 1)) * self.components_, X, mean_correction))

        U, S, V = linalg.svd(X, full_matrices=False)
        U, V = svd_flip(U, V, u_based_decision=False)
        explained_variance = S**2 / n_total_samples
        explained_variance_ratio = S**2 / total_var

        self.n_samples_seen_ = n_total_samples
        self.components_ = V[:self.n_components_]
        self.singular_values_ = S[:self.n_components_]
        self.mean_ = col_mean
        self.var_ = col_var
        self.explained_variance_ = explained_variance[:self.n_components_]
        self.explained_variance_ratio_ = \
            explained_variance_ratio[:self.n_components_]
        if self.n_components_ < n_features:
            self.noise_variance_ = \
                explained_variance[self.n_components_:].mean()
        else:
            self.noise_variance_ = 0.
        return self
Example #17
def test_incremental_variance_numerical_stability():
    # Test Youngs and Cramer incremental variance formulas.

    def np_var(A):
        return A.var(axis=0)

    # Naive one pass variance computation - not numerically stable
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
    def one_pass_var(X):
        n = X.shape[0]
        exp_x2 = (X ** 2).sum(axis=0) / n
        expx_2 = (X.sum(axis=0) / n) ** 2
        return exp_x2 - expx_2

    # Two-pass algorithm, stable.
    # We use it as a benchmark. It is not an online algorithm
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm
    def two_pass_var(X):
        mean = X.mean(axis=0)
        Y = X.copy()
        return np.mean((Y - mean)**2, axis=0)

    # Naive online implementation
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm
    # This works only for chunks of size 1
    def naive_mean_variance_update(x, last_mean, last_variance,
                                   last_sample_count):
        updated_sample_count = (last_sample_count + 1)
        samples_ratio = last_sample_count / float(updated_sample_count)
        updated_mean = x / updated_sample_count + last_mean * samples_ratio
        updated_variance = last_variance * samples_ratio + \
            (x - last_mean) * (x - updated_mean) / updated_sample_count
        return updated_mean, updated_variance, updated_sample_count

    # We want to show a case when one_pass_var has error > 1e-3 while
    # _batch_mean_variance_update has less.
    tol = 200
    n_features = 2
    n_samples = 10000
    x1 = np.array(1e8, dtype=np.float64)
    x2 = np.log(1e-5, dtype=np.float64)
    A0 = x1 * np.ones((n_samples // 2, n_features), dtype=np.float64)
    A1 = x2 * np.ones((n_samples // 2, n_features), dtype=np.float64)
    A = np.vstack((A0, A1))

    # Older versions of numpy have different precision
    # In some old version, np.var is not stable
    if np.abs(np_var(A) - two_pass_var(A)).max() < 1e-6:
        stable_var = np_var
    else:
        stable_var = two_pass_var

    # Naive one pass var: >tol (=1063)
    assert_greater(np.abs(stable_var(A) - one_pass_var(A)).max(), tol)

    # Starting point for online algorithms: after A0

    # Naive implementation: >tol (436)
    mean, var, n = A0[0, :], np.zeros(n_features), n_samples // 2
    for i in range(A1.shape[0]):
        mean, var, n = \
            naive_mean_variance_update(A1[i, :], mean, var, n)
    assert_equal(n, A.shape[0])
    # the mean is also slightly unstable
    assert_greater(np.abs(A.mean(axis=0) - mean).max(), 1e-6)
    assert_greater(np.abs(stable_var(A) - var).max(), tol)

    # Robust implementation: <tol (177)
    mean, var, n = A0[0, :], np.zeros(n_features), n_samples // 2
    for i in range(A1.shape[0]):
        mean, var, n = \
            _incremental_mean_and_var(A1[i, :].reshape((1, A1.shape[1])),
                                      mean, var, n)
    assert_equal(n, A.shape[0])
    assert_array_almost_equal(A.mean(axis=0), mean)
    assert_greater(tol, np.abs(stable_var(A) - var).max())
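
For contrast with ``naive_mean_variance_update`` above, here is a minimal sketch of Welford's single-sample online update, the numerically stable variant described on the same Wikipedia page (not part of the test):

def welford_update(x, mean, m2, n):
    # m2 accumulates the sum of squared deviations; variance = m2 / n
    n = n + 1
    delta = x - mean
    mean = mean + delta / n
    m2 = m2 + delta * (x - mean)
    return mean, m2, n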
Example #18
    def partial_fit(self, X, y=None, check_input=True):
        #print(self.components_)
        #print(self.singular_values_ )
        #print(self.mean_)
        n_samples, n_features = X.shape
        if not hasattr(self, 'components_'):
            self.components_ = None

        if self.n_components is None:
            if self.components_ is None:
                self.n_components_ = min(n_samples, n_features)
            else:
                self.n_components_ = self.components_.shape[0]
        elif not 1 <= self.n_components <= n_features:
            raise ValueError("n_components=%r invalid for n_features=%d, need "
                             "more rows than columns for IncrementalPCA "
                             "processing" % (self.n_components, n_features))
        elif not self.n_components <= n_samples:
            raise ValueError("n_components=%r must be less or equal to "
                             "the batch number of samples "
                             "%d." % (self.n_components, n_samples))
        else:
            self.n_components_ = self.n_components

        if (self.components_ is not None) and (self.components_.shape[0] !=
                                               self.n_components_):
            raise ValueError("Number of input features has changed from %i "
                             "to %i between calls to partial_fit! Try "
                             "setting n_components to a fixed value." %
                             (self.components_.shape[0], self.n_components_))

        # This is the first partial_fit
        if not hasattr(self, 'n_samples_seen_'):
            self.n_samples_seen_ = 0
            self.mean_ = .0
            self.var_ = .0

        # Update stats - they are 0 if this is the first step
        col_mean, col_var, n_total_samples = \
            _incremental_mean_and_var(X, last_mean=self.mean_,
                                      last_variance=self.var_,
                                      last_sample_count=self.n_samples_seen_)
        # Whitening
        #print(col_mean) ---------------Totally Correct
        if self.n_samples_seen_ == 0:
            # If it is the first step, simply whiten X
            X -= col_mean
        else:
            col_batch_mean = np.mean(X, axis=0)
            #print(col_batch_mean)  #-----------Totally Correct
            X -= col_batch_mean
            # Build matrix of combined previous basis and new data
            mean_correction = \
                np.sqrt((self.n_samples_seen_ * n_samples) /
                        n_total_samples) * (self.mean_ - col_batch_mean)
            X = np.vstack((self.singular_values_.reshape(
                (-1, 1)) * self.components_, X, mean_correction))
            print(self.singular_values_.reshape((-1, 1)) * self.components_)
            #print(mean_correction) #-------------Totally Correct
        #print(X[:2])
        #print("\n\n\n");

        U, S, V = linalg.svd(X, full_matrices=False)
        #U, V = svd_flip(U, V, u_based_decision=False)
        explained_variance = S**2 / (n_total_samples - 1)
        explained_variance_ratio = S**2 / np.sum(col_var * n_total_samples)

        self.n_samples_seen_ = n_total_samples
        self.components_ = V[:self.n_components_]
        #print(self.components_);
        self.singular_values_ = S[:self.n_components_]
        self.mean_ = col_mean
        #print(self.mean_)
        self.var_ = col_var
        self.explained_variance_ = explained_variance[:self.n_components_]
        self.explained_variance_ratio_ = \
            explained_variance_ratio[:self.n_components_]
        if self.n_components_ < n_features:
            self.noise_variance_ = \
                explained_variance[self.n_components_:].mean()
        else:
            self.noise_variance_ = 0.
        return self
Example #19
def test_incremental_variance_numerical_stability():
    # Test Youngs and Cramer incremental variance formulas.

    def np_var(A):
        return A.var(axis=0)

    # Naive one pass variance computation - not numerically stable
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
    def one_pass_var(X):
        n = X.shape[0]
        exp_x2 = (X**2).sum(axis=0) / n
        expx_2 = (X.sum(axis=0) / n)**2
        return exp_x2 - expx_2

    # Two-pass algorithm, stable.
    # We use it as a benchmark. It is not an online algorithm
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm
    def two_pass_var(X):
        mean = X.mean(axis=0)
        Y = X.copy()
        return np.mean((Y - mean)**2, axis=0)

    # Naive online implementation
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm
    # This works only for chunks of size 1
    def naive_mean_variance_update(x, last_mean, last_variance,
                                   last_sample_count):
        updated_sample_count = (last_sample_count + 1)
        samples_ratio = last_sample_count / float(updated_sample_count)
        updated_mean = x / updated_sample_count + last_mean * samples_ratio
        updated_variance = last_variance * samples_ratio + \
            (x - last_mean) * (x - updated_mean) / updated_sample_count
        return updated_mean, updated_variance, updated_sample_count

    # We want to show a case when one_pass_var has error > 1e-3 while
    # _batch_mean_variance_update has less.
    tol = 200
    n_features = 2
    n_samples = 10000
    x1 = np.array(1e8, dtype=np.float64)
    x2 = np.log(1e-5, dtype=np.float64)
    A0 = np.full((n_samples // 2, n_features), x1, dtype=np.float64)
    A1 = np.full((n_samples // 2, n_features), x2, dtype=np.float64)
    A = np.vstack((A0, A1))

    # Older versions of numpy have different precision
    # In some old version, np.var is not stable
    if np.abs(np_var(A) - two_pass_var(A)).max() < 1e-6:
        stable_var = np_var
    else:
        stable_var = two_pass_var

    # Naive one pass var: >tol (=1063)
    assert_greater(np.abs(stable_var(A) - one_pass_var(A)).max(), tol)

    # Starting point for online algorithms: after A0

    # Naive implementation: >tol (436)
    mean, var, n = A0[0, :], np.zeros(n_features), n_samples // 2
    for i in range(A1.shape[0]):
        mean, var, n = \
            naive_mean_variance_update(A1[i, :], mean, var, n)
    assert_equal(n, A.shape[0])
    # the mean is also slightly unstable
    assert_greater(np.abs(A.mean(axis=0) - mean).max(), 1e-6)
    assert_greater(np.abs(stable_var(A) - var).max(), tol)

    # Robust implementation: <tol (177)
    mean, var = A0[0, :], np.zeros(n_features)
    n = np.full(n_features, n_samples // 2, dtype=np.int32)
    for i in range(A1.shape[0]):
        mean, var, n = \
            _incremental_mean_and_var(A1[i, :].reshape((1, A1.shape[1])),
                                      mean, var, n)
    assert_array_equal(n, A.shape[0])
    assert_array_almost_equal(A.mean(axis=0), mean)
    assert_greater(tol, np.abs(stable_var(A) - var).max())
Example #20
  def partial_fit(self, X, y=None, check_input=True):
    """Incremental fit with X. All of X is processed as a single batch.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples and
        n_features is the number of features.

    Returns
    -------
    self: object
        Returns the instance itself.
    """
    # ====== check the samples and caches ====== #
    if isinstance(X, Data):
      X = X[:]
    if check_input:
      X = check_array(X, copy=self.copy, dtype=[np.float64, np.float32])
    n_samples, n_features = X.shape
    # check number of components
    if self.n_components is None:
      self.n_components_ = n_features
    elif not 1 <= self.n_components <= n_features:
      raise ValueError("n_components=%r invalid for n_features=%d, need "
                       "more rows than columns for IncrementalPCA "
                       "processing" % (self.n_components, n_features))
    else:
      self.n_components_ = self.n_components
    # check the cache
    if n_samples < n_features or self._nb_cached_samples > 0:
      self._cache_batches.append(X)
      self._nb_cached_samples += n_samples
      # not enough samples yet
      if self._nb_cached_samples < n_features:
        return
      else: # group mini batch into big batch
        X = np.concatenate(self._cache_batches, axis=0)
        self._cache_batches = []
        self._nb_cached_samples = 0
    n_samples = X.shape[0]
    # ====== fit the model ====== #
    if (self.components_ is not None) and (self.components_.shape[0] !=
                                           self.n_components_):
      raise ValueError("Number of input features has changed from %i "
                       "to %i between calls to partial_fit! Try "
                       "setting n_components to a fixed value." %
                       (self.components_.shape[0], self.n_components_))
    # Update stats - they are 0 if this is the first step
    col_mean, col_var, n_total_samples = \
        _incremental_mean_and_var(X, last_mean=self.mean_,
                                  last_variance=self.var_,
                                  last_sample_count=self.n_samples_seen_)
    total_var = np.sum(col_var * n_total_samples)
    if total_var == 0: # if variance == 0, make no sense to continue
      return self
    # Whitening
    if self.n_samples_seen_ == 0:
      # If it is the first step, simply whiten X
      X -= col_mean
    else:
      col_batch_mean = np.mean(X, axis=0)
      X -= col_batch_mean
      # Build matrix of combined previous basis and new data
      mean_correction = \
          np.sqrt((self.n_samples_seen_ * n_samples) /
                  n_total_samples) * (self.mean_ - col_batch_mean)
      X = np.vstack((self.singular_values_.reshape((-1, 1)) *
                    self.components_, X, mean_correction))

    U, S, V = linalg.svd(X, full_matrices=False)
    U, V = svd_flip(U, V, u_based_decision=False)
    explained_variance = S ** 2 / n_total_samples
    explained_variance_ratio = S ** 2 / total_var

    self.n_samples_seen_ = n_total_samples
    self.components_ = V[:self.n_components_]
    self.singular_values_ = S[:self.n_components_]
    self.mean_ = col_mean
    self.var_ = col_var
    self.explained_variance_ = explained_variance[:self.n_components_]
    self.explained_variance_ratio_ = \
        explained_variance_ratio[:self.n_components_]
    if self.n_components_ < n_features:
      self.noise_variance_ = \
          explained_variance[self.n_components_:].mean()
    else:
      self.noise_variance_ = 0.
    return self
Example #21
    def _batch_fit(self, X, y, check_input=False):
        print('Batch fit')
        if check_input:
            X, y = check_X_y(X, y, ensure_min_samples=2, estimator=self)

        current_n_samples, n_features = X.shape
        # Update stats - they are 0 if this is the first step
        updated_mean, updated_var, updated_n_samples_seen_ = _incremental_mean_and_var(
            X,
            last_mean=self.mean_,
            last_variance=self.var_,
            last_sample_count=self.n_samples_seen_)
        # Whitening
        if self.n_samples_seen_ == 0:
            # If it is the first step, simply whiten X
            X = np.subtract(X, updated_mean)
        else:
            col_batch_mean = np.mean(X, axis=0)
            X = np.subtract(X, col_batch_mean)

        # Updating algorithm
        # Updating Mean and Class Means
        updated_class_mean = self.class_mean_
        updated_class_n_samples_seen_ = self.class_n_samples_seen_
        # print('updated_class_n_samples_seen_', updated_class_n_samples_seen_)
        # print('updated_class_mean', updated_class_mean)
        for i, current_class in enumerate(self.classes_):
            current_class_samples = X[y == current_class, :]
            n_current_class_samples = current_class_samples.shape[0]
            previous_n_class_samples = updated_class_n_samples_seen_[i]
            if n_current_class_samples > 0 and previous_n_class_samples > 0:
                previous_class_sum_current_class = updated_class_mean[
                    i, :] * updated_class_n_samples_seen_[i]
                current_class_sum_current_class = np.sum(current_class_samples,
                                                         axis=0)

                # print('previous_class_sum_current_class.shape', previous_class_sum_current_class.shape)
                # print('current_class_sum_current_class.shape', current_class_sum_current_class.shape)
                # print('updated_class_mean.shape', updated_class_mean.shape)
                # print('updated_class_n_samples_seen_.shape', updated_class_n_samples_seen_[i])

                updated_class_n_samples_seen_[i] += n_current_class_samples
                updated_class_mean[i, :] = (previous_class_sum_current_class + current_class_sum_current_class) / \
                                           updated_class_n_samples_seen_[i]
            elif n_current_class_samples > 0:
                updated_class_mean[i, :] = np.mean(current_class_samples,
                                                   axis=0)
                updated_class_n_samples_seen_[i] = n_current_class_samples

        updated_class_within_scatter = self.class_within_scatter
        for i, current_class_mean in enumerate(updated_class_mean):
            current_class_samples = X[y == self.classes_[i], :]
            n_current_class_samples = current_class_samples.shape[0]
            l_c = current_class_samples.shape[0]
            n_c = self.class_n_samples_seen_[i]
            mean_y_c = np.reshape(np.mean(current_class_samples, axis=0),
                                  (n_features, 1))

            if n_current_class_samples > 0 and n_c > 0:
                # print('current_class_samples.shape', current_class_samples.shape)
                mean_x_c = np.reshape(self.class_mean_[i, :], (n_features, 1))

                D_c = (mean_y_c - mean_x_c).dot((mean_y_c - mean_x_c).T)

                E_c = np.zeros(D_c.shape)
                for j, current_samples in enumerate(current_class_samples):
                    E_c += (current_samples - mean_x_c).dot(
                        (current_samples - mean_x_c).T)

                F_c = np.zeros(D_c.shape)
                for j, current_samples in enumerate(current_class_samples):
                    F_c += (current_samples - mean_y_c).dot(
                        (current_samples - mean_y_c).T)

                updated_class_within_scatter[:, :, i] += ((n_c * l_c * l_c) * D_c / np.square(n_c + l_c)) + \
                                                         ((np.square(n_c) * E_c) / np.square(n_c + l_c)) + \
                                                         ((l_c * (l_c + (2 * n_c)) * F_c) / np.square(n_c + l_c))
            elif n_current_class_samples > 0:
                updated_class_within_scatter[:, :,
                                             i] = (current_class_samples -
                                                   mean_y_c).dot(
                                                       (current_class_samples -
                                                        mean_y_c).T)
        updated_within_scatter = np.sum(updated_class_within_scatter, axis=2)

        # Updating between class scatter
        updated_between_scatter = self.between_scatter
        for i, i_class_mean in enumerate(updated_class_mean[:-1, :]):
            for j_class_mean in updated_class_mean[i + 1:, :]:
                print('Computing mean difference of means:::', i_class_mean,
                      j_class_mean)
                current_mean_difference = np.reshape(
                    i_class_mean - j_class_mean, (1, n_features))
                d_ij = current_mean_difference.dot(
                    np.linalg.pinv(updated_within_scatter)).dot(
                        current_mean_difference.T)
                d_ij = np.sqrt(d_ij)
                if d_ij > 0:
                    w_d_ij = erf(d_ij /
                                 (2 * np.sqrt(2))) / (2 * np.square(d_ij))
                    # print('current_mean_difference.shape', current_mean_difference.shape)
                    updated_between_scatter += w_d_ij * current_mean_difference.T.dot(
                        current_mean_difference)
            # n = X[y == self.classes_[i], :].shape[0]
            # current_class_mean = current_class_mean.reshape(1, n_features)
            # updated_mean = updated_mean.reshape(1, n_features)
            # if n > 0:
            #     updated_between_scatter += n * (current_class_mean - updated_mean).T.dot(
            #         current_class_mean - updated_mean)

        # if np.any(np.isnan(updated_between_scatter)):
        #     print('Reached nan:::: ', n)
        #     print('Updatec class mean:::', updated_class_mean)
        #     print('updated mean::::', updated_mean)

        # Final values after computation
        self.n_samples_seen_ = updated_n_samples_seen_
        self.class_n_samples_seen_ = updated_class_n_samples_seen_
        self.mean_ = updated_mean
        self.class_mean_ = updated_class_mean
        self.var_ = updated_var
        self.between_scatter = updated_between_scatter
        self.within_scatter = updated_within_scatter
        self.class_within_scatter = updated_class_within_scatter
Example #22
    def partial_fit(self, X, y=None):
        """
        Performs online computation of mean and standard deviation on X for later scaling.
        All of X is processed as a single batch.
        This is intended for cases when `fit` is not feasible due to a very
        large number of `n_samples`, or because X is read from a continuous
        stream.

        The algorithm for incremental mean
        and std is given in Equation 1.5a,b in Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. "Algorithms
        for computing the sample variance: Analysis and recommendations."
        The American Statistician 37.3 (1983): 242-247

        :param X: Data matrix to scale.
        :type X: numpy.ndarray, shape [n_samples, n_features]
        :param y: Passthrough for Scikit-learn ``Pipeline`` compatibility.
        :type y: None
        :return: Fitted object.
        :rtype: pyChemometrics.ChemometricsScaler

        """

        X = check_array(X,
                        accept_sparse=('csr', 'csc'),
                        copy=self.copy,
                        estimator=self,
                        dtype=FLOAT_DTYPES)

        # Even in the case of `with_mean=False`, we update the mean anyway
        # This is needed for the incremental computation of the var
        # See incr_mean_variance_axis and _incremental_mean_variance_axis

        if sparse.issparse(X):
            if self.with_mean:
                raise ValueError(
                    "Cannot center sparse matrices: pass `with_mean=False` "
                    "instead. See docstring for motivation and alternatives.")
            if self.with_std:
                # First pass
                if not hasattr(self, 'n_samples_seen_'):
                    self.mean_, self.var_ = mean_variance_axis(X, axis=0)
                    self.n_samples_seen_ = X.shape[0]
                # Next passes
                else:
                    self.mean_, self.var_, self.n_samples_seen_ = \
                        incr_mean_variance_axis(X, axis=0,
                                                last_mean=self.mean_,
                                                last_var=self.var_,
                                                last_n=self.n_samples_seen_)
            else:
                self.mean_ = None
                self.var_ = None
        else:
            # First pass
            if not hasattr(self, 'n_samples_seen_'):
                self.mean_ = .0
                self.n_samples_seen_ = 0
                if self.with_std:
                    self.var_ = .0
                else:
                    self.var_ = None

            self.mean_, self.var_, self.n_samples_seen_ = \
                _incremental_mean_and_var(X, self.mean_, self.var_,
                                          self.n_samples_seen_)

        if self.with_std:
            self.scale_ = _handle_zeros_in_scale(numpy.sqrt(
                self.var_))**self.scale_power
        else:
            self.scale_ = None

        return self
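
The dense branch above does the same bookkeeping as scikit-learn's own StandardScaler.partial_fit, which uses the same _incremental_mean_and_var helper for dense input. A small self-contained cross-check using the public estimator, showing that two incremental passes match one full fit:

import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.random.RandomState(0).rand(200, 5)

full = StandardScaler().fit(X)
inc = StandardScaler()
inc.partial_fit(X[:120])
inc.partial_fit(X[120:])

assert np.allclose(full.mean_, inc.mean_)
assert np.allclose(full.var_, inc.var_)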
Example #23
 def update(self, values):
     self.mean, self.var, self.count = \
         _incremental_mean_and_var(values, self.mean, self.var, self.count)
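
Example #23 is only a fragment. A self-contained sketch of the kind of running-statistics tracker such an ``update`` method could belong to is shown below; the RunningStats class and its attribute layout are hypothetical and not taken from the original code.

import numpy as np
from sklearn.utils.extmath import _incremental_mean_and_var

class RunningStats:
    def __init__(self, n_features):
        self.mean = np.zeros(n_features)
        self.var = np.zeros(n_features)
        # per-feature sample counts, as in newer scikit-learn versions
        self.count = np.zeros(n_features)

    def update(self, values):
        # values: array of shape (n_samples, n_features)
        self.mean, self.var, self.count = _incremental_mean_and_var(
            values, self.mean, self.var, self.count)

stats = RunningStats(3)
stats.update(np.random.rand(10, 3))
stats.update(np.random.rand(5, 3))
print(stats.count, stats.mean, stats.var)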