Example #1
import daal4py as d4p
import numpy as np
import pandas

def read_csv(f, c=None, t=np.float64):
    # pandas-based reader for the headerless example CSV files
    return pandas.read_csv(f, usecols=c, delimiter=',', header=None, dtype=t)

def main(readcsv=read_csv, method='svdDense'):
    infile = "./data/batch/pca_normalized.csv"

    # 'normalization' is an optional parameter to PCA; here we use z-score normalization, which could be configured differently
    zscore = d4p.normalization_zscore()
    # configure a PCA object
    algo = d4p.pca(resultsToCompute="mean|variance|eigenvalue",
                   isDeterministic=True,
                   normalization=zscore)

    # let's provide a file directly, not a table/array
    result1 = algo.compute(infile)

    # We can also load the data ourselves and provide the numpy array
    data = readcsv(infile)
    result2 = algo.compute(data)

    # PCA result objects provide eigenvalues, eigenvectors, means and variances
    assert np.allclose(result1.eigenvalues, result2.eigenvalues)
    assert np.allclose(result1.eigenvectors, result2.eigenvectors)
    assert np.allclose(result1.means, result2.means)
    assert np.allclose(result1.variances, result2.variances)
    assert result1.eigenvalues.shape == (1, data.shape[1])
    assert result1.eigenvectors.shape == (data.shape[1], data.shape[1])
    assert result1.means.shape == (1, data.shape[1])
    assert result1.variances.shape == (1, data.shape[1])

    return result1
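A minimal driver for this example, sketched under the assumption that the read_csv helper above and the bundled pca_normalized.csv file are available:

if __name__ == "__main__":
    # run the PCA example and print a few of the computed results
    result = main()
    print("Eigenvalues:\n", result.eigenvalues)
    print("Means:\n", result.means)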
Example #2
import numpy as np
from daal4py import normalization_zscore, pca
# getFPType maps the array dtype to the 'float'/'double' string daal4py expects
from daal4py.sklearn._utils import getFPType

def pca_fit_daal(X, n_components, method):

    if n_components < 1:
        n_components = min(X.shape)

    fptype = getFPType(X)

    centering_algo = normalization_zscore(
        fptype=fptype,
        doScale=False
    )

    pca_algorithm = pca(
        fptype=fptype,
        method=method,
        normalization=centering_algo,
        resultsToCompute='mean|variance|eigenvalue',
        isDeterministic=True,
        nComponents=n_components
    )

    pca_result = pca_algorithm.compute(X)
    eigenvectors = pca_result.eigenvectors
    eigenvalues = pca_result.eigenvalues.ravel()
    singular_values = np.sqrt((X.shape[0] - 1) * eigenvalues)

    return pca_result, eigenvalues, eigenvectors, singular_values
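A minimal usage sketch for pca_fit_daal on synthetic data; the two-component call below is an illustration, not part of the original snippet:

import numpy as np

X = np.random.RandomState(42).rand(200, 6)
_, eigenvalues, eigenvectors, singular_values = pca_fit_daal(
    X, n_components=2, method='svdDense')
print("eigenvalues:", eigenvalues)
print("singular values:", singular_values)
print("eigenvectors shape:", eigenvectors.shape)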
Example #3
    # method of a scikit-learn-compatible PCA class; _validate_n_components,
    # _infer_dimension and _n_components_from_fraction are helpers from the
    # surrounding module
    def _fit_daal4py(self, X, n_components):
        n_samples, n_features = X.shape
        n_sf_min = min(n_samples, n_features)

        _validate_n_components(n_components, n_samples, n_features)

        if n_components == 'mle':
            daal_n_components = n_features
        elif n_components < 1:
            daal_n_components = n_sf_min
        else:
            daal_n_components = n_components

        fpType = getFPType(X)
        centering_algo = daal4py.normalization_zscore(fptype=fpType,
                                                      doScale=False)
        pca_alg = daal4py.pca(fptype=fpType,
                              method='svdDense',
                              normalization=centering_algo,
                              resultsToCompute='mean|variance|eigenvalue',
                              isDeterministic=True,
                              nComponents=daal_n_components)
        pca_res = pca_alg.compute(X)

        self.mean_ = pca_res.means.ravel()
        variances_ = pca_res.variances.ravel()
        components_ = pca_res.eigenvectors
        explained_variance_ = pca_res.eigenvalues.ravel()
        tot_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / tot_var

        if n_components == 'mle':
            n_components = \
                _infer_dimension(explained_variance_, n_samples)
        elif 0 < n_components < 1.0:
            n_components = _n_components_from_fraction(
                explained_variance_ratio_, n_components)

        # Compute noise covariance using the Probabilistic PCA model:
        # the sigma2 maximum likelihood estimate (cf. Bishop, PRML, eq. 12.46)
        if n_components < n_sf_min:
            if explained_variance_.shape[0] == n_sf_min:
                self.noise_variance_ = explained_variance_[n_components:].mean()
            else:
                resid_var_ = variances_.sum()
                resid_var_ -= explained_variance_[:n_components].sum()
                self.noise_variance_ = resid_var_ / (n_sf_min - n_components)
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
            explained_variance_ratio_[:n_components]
        self.singular_values_ = np.sqrt(
            (n_samples - 1) * self.explained_variance_)
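The singular-value recovery at the end, sqrt((n_samples - 1) * explained_variance_), follows from the relation between eigenvalues of the sample covariance and singular values of the centered data. A daal4py-free numpy check of that identity:

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(50, 4)
Xc = X - X.mean(axis=0)
s = np.linalg.svd(Xc, compute_uv=False)
# eigenvalues of the sample covariance are squared singular values / (n - 1)
eigvals = s ** 2 / (X.shape[0] - 1)
assert np.allclose(np.sqrt((X.shape[0] - 1) * eigvals), s)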
Example #4
import daal4py as d4p
import numpy as np
import pandas

def read_csv(f, c=None, t=np.float64):
    # pandas-based reader for the headerless example CSV files
    return pandas.read_csv(f, usecols=c, delimiter=',', header=None, dtype=t)

def compute(data):
    # 'normalization' is an optional parameter to PCA; here we use z-score normalization, which could be configured differently
    zscore = d4p.normalization_zscore()
    # configure a PCA object
    algo = d4p.pca(resultsToCompute="mean|variance|eigenvalue",
                   isDeterministic=True,
                   normalization=zscore)
    return algo.compute(data)

def main(readcsv=read_csv, method='defaultDense'):
    infile = "./data/batch/normalization.csv"

    # configure a z-score normalization object; older daal4py versions
    # may not accept the doScale flag, so fall back if the call rejects it
    try:
        algo = d4p.normalization_zscore(doScale=True)
    except TypeError:
        algo = d4p.normalization_zscore()

    # let's provide a file directly, not a table/array
    result1 = algo.compute(infile)

    # We can also load the data ourselves and provide the numpy array
    data = readcsv(infile)
    result2 = algo.compute(data)

    # normalization result objects provide the z-score normalized data
    assert np.allclose(result1.normalizedData, result2.normalizedData)

    return result1
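A minimal driver for this example, assuming the read_csv helper above and the bundled normalization.csv file:

if __name__ == "__main__":
    res = main()
    # normalizedData holds the z-score normalized copy of the input
    print("Normalized data (first 5 rows):\n", res.normalizedData[:5])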