def main():
    """Run batch PCA on the sample CSV via a file path and via an array.

    One configured ``d4p.pca`` object is fed the CSV path directly and then
    the data loaded with ``read_csv``. The two results are asserted to match
    and to have the expected shapes.

    Returns:
        The PCA result computed from the file path.
    """
    csv_path = "./data/batch/pca_normalized.csv"

    # SVD-based PCA which also reports means, variances and eigenvalues.
    pca_algo = d4p.pca(
        method='svdDense',
        resultsToCompute="mean|variance|eigenvalue",
        isDeterministic=True,
    )

    # The algorithm accepts a file name directly ...
    from_file = pca_algo.compute(csv_path)

    # ... or data we loaded ourselves.
    # NOTE(review): range(10) is presumably the column selection - confirm
    # against read_csv.
    table = read_csv(csv_path, range(10))
    from_array = pca_algo.compute(table)

    # Both input routes must yield the same numbers.
    for field in ("eigenvalues", "eigenvectors", "means", "variances"):
        assert np.allclose(getattr(from_file, field),
                           getattr(from_array, field))

    # Row-vector results have one entry per input column; the eigenvector
    # table is square in the number of columns.
    n_cols = table.shape[1]
    assert from_file.eigenvalues.shape == (1, n_cols)
    assert from_file.eigenvectors.shape == (n_cols, n_cols)
    assert from_file.means.shape == (1, n_cols)
    assert from_file.variances.shape == (1, n_cols)

    return from_file
def compute(data, nComponents):
    """Fit PCA on ``data`` and return its projection onto ``nComponents``.

    Args:
        data: Input passed unchanged to both the PCA and the transform step.
        nComponents: Number of components forwarded to ``d4p.pca_transform``.

    Returns:
        The result object of ``d4p.pca_transform(...).compute``.
    """
    # Fit a deterministic PCA that also produces means, variances and
    # eigenvalues.
    fitted = d4p.pca(
        isDeterministic=True,
        resultsToCompute="mean|variance|eigenvalue",
    ).compute(data)

    # The transform applies whitening because means and eigenvalues are
    # handed over via dataForTransform.
    projector = d4p.pca_transform(nComponents=nComponents)
    return projector.compute(data, fitted.eigenvectors,
                             fitted.dataForTransform)
def main(readcsv=read_csv, method='svdDense'):
    """Run z-score-normalized batch PCA from a file path and from an array.

    Args:
        readcsv: Callable used to load the CSV into an array.
        method: Accepted for interface compatibility; this function does not
            forward it to ``d4p.pca``.

    Returns:
        The PCA result computed from the file path.
    """
    csv_path = "./data/batch/pca_normalized.csv"

    # 'normalization' is an optional PCA parameter; z-score is used here with
    # its default configuration.
    normalizer = d4p.normalization_zscore()
    pca_algo = d4p.pca(
        resultsToCompute="mean|variance|eigenvalue",
        isDeterministic=True,
        normalization=normalizer,
    )

    # Feed the file name directly, then the data we loaded ourselves.
    from_file = pca_algo.compute(csv_path)
    table = readcsv(csv_path)
    from_array = pca_algo.compute(table)

    # Both input routes must yield the same numbers.
    for field in ("eigenvalues", "eigenvectors", "means", "variances"):
        assert np.allclose(getattr(from_file, field),
                           getattr(from_array, field))

    # Row-vector results have one entry per input column; the eigenvector
    # table is square in the number of columns.
    n_cols = table.shape[1]
    assert from_file.eigenvalues.shape == (1, n_cols)
    assert from_file.eigenvectors.shape == (n_cols, n_cols)
    assert from_file.means.shape == (1, n_cols)
    assert from_file.variances.shape == (1, n_cols)

    return from_file
def pca(self, Data_Path, target, n):
    '''
    daal4py PCA SPMD Mode

    Each process loads its own CSV shard, named
    ``Data_Path + str(process id + 1) + ".csv"``, drops the ``target``
    column and takes part in a distributed SVD-based PCA.

    Data_Path: path prefix of the per-process CSV shards.
    target: column label(s) removed from the data before PCA.
    n: value passed to ``d4p.daalinit`` as ``nthreads``.

    Timings are written to ``self.latency``; nothing is returned.
    '''
    # Initialize SPMD mode
    d4p.daalinit(nthreads=n)
    try:
        # Train setup: every process reads the shard matching its id.
        file_path = Data_Path + str(d4p.my_procid() + 1) + ".csv"
        data = pd.read_csv(file_path)
        data = data.drop(target, axis=1)

        # configure a PCA object
        algo = d4p.pca(method='svdDense', distributed=True)
        self.logger.info('Training the PCA in pydaal SPMD Mode')
        start = time.time()
        result = algo.compute(data)
        self.latency['Parallel_PCA_SPMD_Time'] = time.time() - start

        # result is available on all processes - but we print only on root
        if d4p.my_procid() == 0:
            print("PCA completed", result)
            # NOTE(review): kept inside the root-only branch, which is how
            # the original layout reads - confirm this is intended.
            self.latency["Overall Parallel PCA SPMD Time"] = \
                time.time() - start
    finally:
        # Always finalize the distributed engine, even when loading the
        # shard or the computation fails.
        d4p.daalfini()
    self.logger.info('Completed PCA in pydaal SPMD Mode')
    return
def pca_fit_daal(X, n_components, method):
    """Fit a mean-centered PCA and derive singular values.

    Args:
        X: 2-D data; ``X.shape`` supplies the sample count and the fallback
            component count.
        n_components: Number of components; values below 1 are replaced by
            ``min(X.shape)``.
        method: PCA method name forwarded to ``pca``.

    Returns:
        Tuple ``(pca_result, eigenvalues, eigenvectors, singular_values)``
        where ``eigenvalues`` is flattened to 1-D.
    """
    if n_components < 1:
        n_components = min(X.shape)

    fp = getFPType(X)

    # Center only: z-score normalization with scaling switched off.
    centering = normalization_zscore(fptype=fp, doScale=False)
    solver = pca(
        fptype=fp,
        method=method,
        normalization=centering,
        resultsToCompute='mean|variance|eigenvalue',
        isDeterministic=True,
        nComponents=n_components,
    )
    pca_result = solver.compute(X)

    eigenvectors = pca_result.eigenvectors
    eigenvalues = pca_result.eigenvalues.ravel()

    # singular value = sqrt((n_samples - 1) * eigenvalue)
    dof = X.shape[0] - 1
    singular_values = np.sqrt(dof * eigenvalues)

    return pca_result, eigenvalues, eigenvectors, singular_values
def _fit_daal4py(self, X, n_components):
    """Fit PCA with daal4py's SVD method and store the fitted attributes.

    Args:
        X: 2-D data of shape ``(n_samples, n_features)``.
        n_components: ``'mle'``, a fraction in ``(0, 1)`` or a component
            count; checked by ``_validate_n_components``.

    Sets ``mean_``, ``noise_variance_``, ``n_samples_``, ``n_features_``,
    ``components_``, ``n_components_``, ``explained_variance_``,
    ``explained_variance_ratio_`` and ``singular_values_`` on ``self``.
    """
    n_samples, n_features = X.shape
    n_sf_min = min(n_samples, n_features)

    _validate_n_components(n_components, n_samples, n_features)

    # Number of components to request from daal4py; 'mle' and fractional
    # requests are resolved after the decomposition.
    if n_components == 'mle':
        requested = n_features
    elif n_components < 1:
        requested = n_sf_min
    else:
        requested = n_components

    fp = getFPType(X)

    # Center only: z-score normalization with scaling switched off.
    centering = daal4py.normalization_zscore(fptype=fp, doScale=False)
    solver = daal4py.pca(
        fptype=fp,
        method='svdDense',
        normalization=centering,
        resultsToCompute='mean|variance|eigenvalue',
        isDeterministic=True,
        nComponents=requested,
    )
    fitted = solver.compute(X)

    self.mean_ = fitted.means.ravel()
    feature_var = fitted.variances.ravel()
    all_components = fitted.eigenvectors
    all_expl_var = fitted.eigenvalues.ravel()
    all_expl_ratio = all_expl_var / all_expl_var.sum()

    # Resolve symbolic / fractional requests into an integer count.
    if n_components == 'mle':
        n_components = _infer_dimension(all_expl_var, n_samples)
    elif 0 < n_components < 1.0:
        n_components = _n_components_from_fraction(
            all_expl_ratio, n_components)

    # Compute noise covariance using Probabilistic PCA model
    # The sigma2 maximum likelihood (cf. eq. 12.46)
    noise = 0.
    if n_components < n_sf_min:
        if all_expl_var.shape[0] == n_sf_min:
            # Full spectrum available: average the discarded eigenvalues.
            noise = all_expl_var[n_components:].mean()
        else:
            # Truncated spectrum: total variance minus the retained part,
            # spread over the discarded directions.
            leftover = feature_var.sum()
            leftover -= all_expl_var[:n_components].sum()
            noise = leftover / (n_sf_min - n_components)
    self.noise_variance_ = noise

    self.n_samples_ = n_samples
    self.n_features_ = n_features
    self.components_ = all_components[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = all_expl_var[:n_components]
    self.explained_variance_ratio_ = all_expl_ratio[:n_components]
    self.singular_values_ = np.sqrt(
        (n_samples - 1) * self.explained_variance_)
def compute(data):
    """Run deterministic PCA with z-score normalization on ``data``.

    Args:
        data: Input forwarded unchanged to the PCA algorithm's ``compute``.

    Returns:
        The PCA result object, computed with means, variances and
        eigenvalues requested.
    """
    # 'normalization' is an optional PCA parameter; z-score is used here with
    # its default configuration.
    normalizer = d4p.normalization_zscore()
    return d4p.pca(
        resultsToCompute="mean|variance|eigenvalue",
        isDeterministic=True,
        normalization=normalizer,
    ).compute(data)
def main(readcsv=read_csv, method='svdDense'):
    """Fit PCA on the sample CSV and project it onto two components.

    Args:
        readcsv: Callable used to load the CSV into an array.
        method: Accepted for interface compatibility; this function does not
            forward it to ``d4p.pca``.

    Returns:
        Tuple ``(pca_result, pca_transform_result)``.
    """
    csv_path = "data/batch/pca_transform.csv"
    keep = 2

    # NOTE(review): range(3) is presumably the column selection - confirm
    # against the reader's signature.
    table = readcsv(csv_path, range(3))

    # Fit a deterministic PCA that also produces means, variances and
    # eigenvalues.
    fitted = d4p.pca(
        isDeterministic=True,
        resultsToCompute="mean|variance|eigenvalue",
    ).compute(table)

    # The transform applies whitening because means and eigenvalues are
    # handed over via dataForTransform.
    projector = d4p.pca_transform(nComponents=keep)
    projected = projector.compute(table, fitted.eigenvectors,
                                  fitted.dataForTransform)

    # The transform result exposes transformedData.
    return (fitted, projected)
def run_pca_daal4py_corr(X, Y):
    """Run correlation-method PCA on ``X`` and collect labelled outputs.

    Args:
        X: 2-D data; half of its column count (floor division) is used as
            the number of transform components.
        Y: Not used by this function.

    Returns:
        Tuple ``(res, name)``: ``res`` lists the transformed data,
        eigenvalues, eigenvectors, means and variances; ``name`` holds the
        matching labels in the same order.
    """
    fitted = d4p.pca(
        resultsToCompute="mean|variance|eigenvalue",
        isDeterministic=True,
        method="correlationDense",
    ).compute(X)

    # Project onto half of the input dimensions.
    projector = d4p.pca_transform(nComponents=X.shape[1] // 2)
    projected = projector.compute(
        X, fitted.eigenvectors, fitted.dataForTransform
    ).transformedData

    labelled = (
        ("transform", projected),
        ("result1.eigenvalues", fitted.eigenvalues),
        ("result1.eigenvectors", fitted.eigenvectors),
        ("result1.means", fitted.means),
        ("result1.variances", fitted.variances),
    )
    res = [value for _, value in labelled]
    name = [label for label, _ in labelled]
    return res, name
def pca(self, data, target):
    '''
    Method for PCA

    Drops the ``target`` column from ``data``, runs SVD-based PCA in
    batch mode and records the compute time in
    ``self.latency["Serial_PCA_Batch_Time"]``.

    Returns the PCA result object.
    '''
    features = data.drop(target, axis=1)

    self.logger.info('Training the serial PCA in pydaal')
    # configure a PCA object
    svd_pca = d4p.pca(method='svdDense')
    self.logger.info('Training the PCA in pydaal Batch Mode')

    # Only the compute call is timed.
    began = time.time()
    outcome = svd_pca.compute(features)
    self.latency["Serial_PCA_Batch_Time"] = time.time() - began

    self.logger.info('Completed PCA in pydaal Batch/Serial Mode')
    return outcome
# mpirun -genv DIST_CNC=MPI -n 4 python ./pca_spmd.py

import daal4py as d4p
from numpy import loadtxt, allclose

if __name__ == "__main__":
    # Initialize SPMD mode
    d4p.daalinit(spmd=True)

    # Each process gets its own data
    infile = "./data/distributed/pca_normalized_" + str(d4p.my_procid() + 1) + ".csv"

    # configure a PCA object to use svd instead of default correlation
    algo = d4p.pca(method='svdDense', distributed=True)

    # let's provide a file directly, not a table/array
    result1 = algo.compute(infile)

    # We can also load the data ourselves and provide the numpy array
    data = loadtxt(infile, delimiter=',')
    result2 = algo.compute(data)

    # PCA result objects provide eigenvalues, eigenvectors, means and variances
    assert allclose(result1.eigenvalues, result2.eigenvalues)
    assert allclose(result1.eigenvectors, result2.eigenvectors)

    # means/variances may be absent (None). Test with `is None`: `== None` on
    # a NumPy array compares elementwise, and the resulting array cannot be
    # used in a boolean `and` without raising ValueError.
    assert (result1.means is None and result2.means is None) or allclose(
        result1.means, result2.means)
    assert (result1.variances is None and result2.variances is None) or allclose(
        result1.variances, result2.variances)
def _fit_full_daal4py(self, X, n_components):
    """Fit PCA from a precomputed covariance matrix using daal4py.

    The covariance matrix and the mean are computed first with
    ``daal4py.covariance``. The covariance is then passed as the second
    argument to the correlation-method PCA's ``compute``.

    Args:
        X: 2-D data of shape ``(n_samples, n_features)``.
        n_components: ``'mle'``, a fraction in ``(0, 1)`` or a component
            count.

    Sets ``mean_``, ``noise_variance_``, ``n_samples_``, ``n_features_``,
    ``components_``, ``n_components_``, ``explained_variance_``,
    ``explained_variance_ratio_`` and ``singular_values_`` on ``self``.
    """
    n_samples, n_features = X.shape
    n_sf_min = min(n_samples, n_features)

    # Number of components to request from daal4py; 'mle' and fractional
    # requests are resolved after the decomposition.
    if n_components == 'mle':
        requested = n_features
    elif n_components < 1:
        requested = n_sf_min
    else:
        requested = n_components

    fp = getFPType(X)

    cov_solver = daal4py.covariance(
        fptype=fp, outputMatrixType='covarianceMatrix')
    cov_fit = cov_solver.compute(X)

    self.mean_ = cov_fit.mean.ravel()
    covariance = cov_fit.covariance
    # Per-feature variances are the diagonal of the covariance matrix.
    feature_var = np.array([covariance[j, j] for j in range(n_features)])

    solver = daal4py.pca(
        fptype=fp,
        method='correlationDense',
        resultsToCompute='eigenvalue',
        isDeterministic=True,
        nComponents=requested,
    )
    fitted = solver.compute(X, covariance)

    all_components = fitted.eigenvectors
    # Clamp negative eigenvalues to zero.
    all_expl_var = np.maximum(fitted.eigenvalues.ravel(), 0)
    all_expl_ratio = all_expl_var / all_expl_var.sum()

    # Resolve symbolic / fractional requests into an integer count.
    if n_components == 'mle':
        if sklearn_check_version('0.23'):
            n_components = _infer_dimension(all_expl_var, n_samples)
        else:
            n_components = _infer_dimension_(
                all_expl_var, n_samples, n_features)
    elif 0 < n_components < 1.0:
        cumulative = stable_cumsum(all_expl_ratio)
        n_components = np.searchsorted(
            cumulative, n_components, side='right') + 1

    noise = 0.
    if n_components < n_sf_min:
        if all_expl_var.shape[0] == n_sf_min:
            # Full spectrum available: average the discarded eigenvalues.
            noise = all_expl_var[n_components:].mean()
        else:
            # Truncated spectrum: total variance minus the retained part,
            # spread over the discarded directions.
            leftover = feature_var.sum()
            leftover -= all_expl_var[:n_components].sum()
            noise = leftover / (n_sf_min - n_components)
    self.noise_variance_ = noise

    self.n_samples_ = n_samples
    self.n_features_ = n_features
    self.components_ = all_components[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = all_expl_var[:n_components]
    self.explained_variance_ratio_ = all_expl_ratio[:n_components]
    self.singular_values_ = np.sqrt(
        (n_samples - 1) * self.explained_variance_)