def set_daal_num_threads(num_threads):
    """Ask daal4py to run with ``num_threads`` threads.

    daal4py is imported lazily. If the import fails, a notice is printed and
    the request is ignored. If ``num_threads`` is falsy (``None`` or ``0``),
    daal4py is imported but ``daalinit`` is not called.

    Args:
        num_threads: Thread count forwarded as ``daalinit(nthreads=...)``.
    """
    try:
        import daal4py as daal_module

        # Nothing to configure when no thread count was requested.
        if not num_threads:
            return
        daal_module.daalinit(nthreads=num_threads)
    except ImportError:
        print("@ Package 'daal4py' was not found. Number of threads is being ignored")
def pca(self, Data_Path, target, n):
    """daal4py PCA SPMD Mode.

    Reads this process's CSV shard (``Data_Path + <procid + 1> + ".csv"``),
    drops the ``target`` column and runs a distributed PCA over the rest.
    Timings are stored in ``self.latency``; nothing is returned.

    Args:
        Data_Path: Path prefix of the per-process CSV shards.
        target: Column label removed before the decomposition.
        n: Thread count passed to ``d4p.daalinit``.
    """
    # Bring up SPMD mode
    d4p.daalinit(nthreads=n)

    # Each process loads its own shard; shard numbering is 1-based
    shard_csv = Data_Path + str(d4p.my_procid() + 1) + ".csv"
    features = pd.read_csv(shard_csv).drop(target, axis=1)

    # Distributed PCA via the dense SVD method
    pca_algo = d4p.pca(method='svdDense', distributed=True)
    self.logger.info('Training the PCA in pydaal SPMD Mode')

    t0 = time.time()
    pca_result = pca_algo.compute(features)
    self.latency['Parallel_PCA_SPMD_Time'] = time.time() - t0

    # Every process holds the result; only the root reports it.
    # NOTE(review): the overall-time entry is taken to belong to the
    # root-only branch -- confirm against the original layout.
    if d4p.my_procid() == 0:
        print("PCA completed", pca_result)
        self.latency["Overall Parallel PCA SPMD Time"] = time.time() - t0

    d4p.daalfini()
    self.logger.info('Completed PCA in pydaal SPMD Mode')
def svd(self, Data_Path, target, n):
    """daal4py SVD SPMD Mode.

    Reads this process's CSV shard (``Data_Path + <procid + 1> + ".csv"``),
    drops the ``target`` column and runs a distributed SVD over the rest.
    The compute time is stored in ``self.latency``; nothing is returned.

    Args:
        Data_Path: Path prefix of the per-process CSV shards.
        target: Column label removed before the decomposition.
        n: Thread count passed to ``d4p.daalinit``.
    """
    # Bring up SPMD mode
    d4p.daalinit(nthreads=n)

    # Each process loads its own shard; shard numbering is 1-based
    shard_csv = Data_Path + str(d4p.my_procid() + 1) + ".csv"
    features = pd.read_csv(shard_csv).drop(target, axis=1)

    svd_algo = d4p.svd(distributed=True)
    self.logger.info('Training the SVD in pydaal SPMD Mode')

    # Time only the decomposition itself
    t0 = time.time()
    svd_result = svd_algo.compute(features)
    self.latency["Parallel_SVD_SPMD_Time"] = time.time() - t0

    # Every process holds the result; only the root prints it
    if d4p.my_procid() == 0:
        print("SVD completed", svd_result)

    self.logger.info('Completed SVD in pydaal SPMD Mode')
    d4p.daalfini()
def set_daal_num_threads(num_threads):
    """Ask daal4py to run with ``num_threads`` threads.

    daal4py is imported lazily. If the import fails, an info-level message is
    logged and the request is ignored. If ``num_threads`` is falsy (``None``
    or ``0``), daal4py is imported but ``daalinit`` is not called.

    Args:
        num_threads: Thread count forwarded as ``daalinit(nthreads=...)``.
    """
    try:
        import daal4py as daal_module

        # Nothing to configure when no thread count was requested.
        if not num_threads:
            return
        daal_module.daalinit(nthreads=num_threads)
    except ImportError:
        logging.info('@ Package "daal4py" was not found. Number of threads '
                     'is being ignored')
def linearRegression(self, Data_Path, test_data_path, target, n):
    """daal4py Linear Regression SPMD Mode.

    Trains a distributed linear regression on this process's CSV shard
    (``Data_Path + <procid + 1> + ".csv"``). Process 0 then predicts on the
    test file and records MSE and R2 in ``self.metrics``. Timings go to
    ``self.latency``; nothing is returned.

    Only process 0 runs the prediction, so only process 0 records metrics.
    Previously every other process failed with ``NameError`` because the
    metrics code read ``predict_result``, which was never bound there.

    Args:
        Data_Path: Path prefix of the per-process training CSV shards.
        test_data_path: Path of the CSV file used for prediction.
        target: Label of the dependent-variable column.
        n: Thread count passed to ``d4p.daalinit``.
    """
    # Initialize SPMD mode
    d4p.daalinit(nthreads=n)

    # training setup: each process reads its own shard (1-based numbering)
    shard_path = Data_Path + str(d4p.my_procid() + 1) + ".csv"
    data = pd.read_csv(shard_path)
    X = data.drop(columns=target)
    y = data[target]

    train_algo = d4p.linear_regression_training(method='qrDense',
                                                distributed=True)
    self.logger.info('Training the Linear Regression in pydaal SPMD Mode')
    start = time.time()
    train_result = train_algo.compute(X, y)
    self.latency['Parallel_LinearRegression_Pydaal_Time'] = time.time() - start

    # test file setup
    test = pd.read_csv(test_data_path)
    y_test = test[target]
    X_test = test.drop(target, axis=1)

    # Stays None on every process that does not run the prediction.
    predict_result = None
    if d4p.my_procid() == 0:
        predict_algo = d4p.linear_regression_prediction()
        # now predict using the model from the training above
        predict_result = predict_algo.compute(X_test, train_result.model)
        self.latency[
            "Overall Parallel Linear Regression Prediction SPMD Time"
        ] = time.time() - start

    d4p.daalfini()
    self.logger.info('Completed Linear Regression in pydaal SPMD Mode')

    # Metrics need a prediction, which exists only on the root process.
    if predict_result is None:
        return

    # Compute metrics
    mse = mean_squared_error(y_test, predict_result.prediction)
    r2score = r2_score(y_test, predict_result.prediction)

    # Store the model metrics
    self.metrics['MSE_Parallel_LinearRegression_Pydaal'] = mse
    self.metrics['r2score_Parallel_LinearRegression_Pydaal'] = r2score
def naiveBayes(self, Data_Path, test_data_path, target, n):
    """daal4py Naive Bayes SPMD Mode.

    Trains a distributed multinomial naive Bayes model on this process's CSV
    shard (``Data_Path + <procid + 1> + ".csv"``). Process 0 then runs a
    prediction on the test file. Timings go to ``self.latency``; nothing is
    returned and the prediction itself is not kept.

    Args:
        Data_Path: Path prefix of the per-process training CSV shards.
        test_data_path: Path of the CSV file used for prediction.
        target: Label of the class column.
        n: Thread count passed to ``d4p.daalinit``.
    """
    # Bring up SPMD mode
    d4p.daalinit(nthreads=n)

    # Training data: each process reads its own shard (1-based numbering)
    shard_csv = Data_Path + str(d4p.my_procid() + 1) + ".csv"
    train_frame = pd.read_csv(shard_csv)
    X = train_frame.drop(columns=target)
    y = train_frame[target]

    # Test data
    test_frame = pd.read_csv(test_data_path)
    # Not used below; the lookup is kept so a missing target column in the
    # test file still raises at this point.
    y_test = test_frame[target]
    X_test = test_frame.drop(target, axis=1)

    # Number of distinct class labels seen in this process's shard
    category_count = len(y.unique())

    # Set up the distributed trainer
    trainer = d4p.multinomial_naive_bayes_training(category_count,
                                                   method='defaultDense',
                                                   distributed=True)
    self.logger.info('Training the Naive Bayes in pydaal SPMD Mode')

    t0 = time.time()
    train_result = trainer.compute(X, y)
    self.latency['Parallel_NaiveBayes_Pydaal_Time'] = time.time() - t0

    # Prediction runs on a single process only
    if d4p.my_procid() == 0:
        predictor = d4p.multinomial_naive_bayes_prediction(category_count)
        # Predict with the model trained above; the result is discarded
        predictor.compute(X_test, train_result.model)
        self.latency[
            "Overall Parallel Naive Bayes Prediction SPMD Time"
        ] = time.time() - t0

    d4p.daalfini()
    self.logger.info('Completed Naive Bayes in pydaal SPMD Mode')
def ridgeRegression(self, Data_Path, test_data_path, target, n):
    """daal4py Ridge Regression SPMD Mode.

    Trains a distributed ridge regression on this process's CSV shard
    (``Data_Path + <procid + 1> + ".csv"``). Process 0 then predicts on the
    test file and records MSE and R2 in ``self.metrics``. The training time
    goes to ``self.latency``; nothing is returned.

    Only process 0 runs the prediction, so only process 0 records metrics.
    Previously every other process failed with ``NameError`` because the
    metrics code read ``predict_result``, which was never bound there.

    Args:
        Data_Path: Path prefix of the per-process training CSV shards.
        test_data_path: Path of the CSV file used for prediction.
        target: Label of the dependent-variable column.
        n: Thread count passed to ``d4p.daalinit``.
    """
    # Initialize SPMD mode
    d4p.daalinit(nthreads=n)

    # training data: each process reads its own shard (1-based numbering)
    shard_path = Data_Path + str(d4p.my_procid() + 1) + ".csv"
    data = pd.read_csv(shard_path)
    X = data.drop(columns=target)
    y = data[target]

    # test file setup
    test = pd.read_csv(test_data_path)
    y_test = test[target]
    X_test = test.drop(target, axis=1)

    # Configure a Ridge regression training object
    train_algo = d4p.ridge_regression_training(distributed=True,
                                               interceptFlag=True)
    self.logger.info('Training the Ridge Regression in pydaal SPMD Mode')
    start_time = time.time()
    train_result = train_algo.compute(X, y)
    self.latency["Parallel Ridge Regression SPMD Time"] = (
        time.time() - start_time)

    # Stays None on every process that does not run the prediction.
    predict_result = None
    # Only process #0 predicts and reports results
    if d4p.my_procid() == 0:
        predict_algo = d4p.ridge_regression_prediction()
        # now predict using the model from the training above
        predict_result = predict_algo.compute(X_test, train_result.model)

    self.logger.info('Completed Ridge Regression in pydaal SPMD Mode')
    d4p.daalfini()

    # Metrics need a prediction, which exists only on the root process.
    if predict_result is None:
        return

    # Compute metrics
    mse = mean_squared_error(y_test, predict_result.prediction)
    r2score = r2_score(y_test, predict_result.prediction)

    # Store the model metrics
    self.metrics["MSE For Parallel Ridge regression SPMD"] = mse
    self.metrics["R2 Score For Parallel Ridge regression SPMD"] = r2score
def kMeans(self, Data_Path, n):
    """daal4py KMeans Clustering SPMD Mode.

    Reads this process's CSV shard (``Data_Path + <procid + 1> + ".csv"``),
    computes initial centroids with the ``plusPlusDense`` method and runs a
    distributed K-Means with 4 clusters and at most 25 iterations. The
    clustering time goes to ``self.latency``; nothing is returned.

    The centroid initialization is computed once. It used to run twice, with
    the first result never used.

    Args:
        Data_Path: Path prefix of the per-process CSV shards.
        n: Thread count passed to ``d4p.daalinit``.
    """
    nClusters = 4
    maxIter = 25  # fixed maximum number of iterations

    # Initialize SPMD mode
    d4p.daalinit(nthreads=n)

    # training setup: each process reads its own shard (1-based numbering)
    file_path = Data_Path + str(d4p.my_procid() + 1) + ".csv"
    data = pd.read_csv(file_path)
    init_algo = d4p.kmeans_init(nClusters=nClusters,
                                distributed=True,
                                method="plusPlusDense")

    self.logger.info('Training the KMeans in pydaal SPMD Mode')

    # compute initial centroids
    init_result = init_algo.compute(data)

    # configure kmeans main object
    algo = d4p.kmeans(nClusters, maxIter, distributed=True)
    kmeans_start_time = time.time()

    # compute the clusters/centroids
    result = algo.compute(data, init_result.centroids)
    self.latency["Parallel_KMeans_SPMD_Time"] = (
        time.time() - kmeans_start_time)

    # result is available on all processes - but we print only on root
    if d4p.my_procid() == 0:
        print("KMeans completed", result)

    self.logger.info('Completed KMeans in pydaal SPMD Mode')
    d4p.daalfini()
    return
# method="plusPlusDense", # distributed=True # ).compute(data).centroids # ) # Kmeans result objects provide centroids, goalFunction, # nIterations and objectiveFunction assert result.centroids.shape[0] == nClusters assert result.nIterations <= maxIter # we need an extra call to kmeans to get the assignments # (not directly supported through parameter assignFlag yet in SPMD mode) algo = d4p.kmeans(nClusters, 0, assignFlag=True) # maxIt=0; not distributed, we compute on local data only! assignments = algo.compute(data, result.centroids).assignments return (assignments, result) if __name__ == "__main__": # Initialize SPMD mode d4p.daalinit() (assignments, result) = main() # result is available on all processes - but we print only on root if d4p.my_procid() == 0: print("\nFirst 10 cluster assignments:\n", assignments[0:10]) print("\nFirst 10 dimensions of centroids:\n", result.centroids[:, 0:10]) print("\nObjective function value:\n", result.objectiveFunction) print('All looks good!') d4p.daalfini()
# In[2]: ##### daal4py K-Means Clustering example for Distributed Memory Systems [SPMD Mode] ##### import daal4py as d4p import pickle import pandas as pd import numpy as np # Now let's **load** in the dataset and **organize** it as necessary to work with our model. For distributed, every file has a unique ID. # # We will also **initialize the distribution engine**. # In[3]: d4p.daalinit() #initializes the distribution engine # organizing variables used in the model for prediction # each process gets its own data infile = "./data/distributed_data/daal4py_Distributed_Kmeans_" + str( d4p.my_procid() + 1) + ".csv" # read data X = pd.read_csv(infile) # ## Computing and Saving Initial Centroids # Time to **initialize our centroids!** # In[4]:
# # See the License for the specific language governing permissions and # limitations under the License. #******************************************************************************* # daal4py Ridge Regression example for distributed memory systems; SPMD mode # run like this: # mpirun -genv DIST_CNC=MPI -n 4 python ./ridge_regression_spmd.py import daal4py as d4p from numpy import loadtxt, allclose if __name__ == "__main__": # Initialize SPMD mode d4p.daalinit(spmd=True) # Each process gets its own data infile = "./data/distributed/linear_regression_train_" + str( d4p.my_procid() + 1) + ".csv" # Configure a Ridge regression training object train_algo = d4p.ridge_regression_training(distributed=True) # Read data. Let's have 10 independent, and 2 dependent variables (for each observation) indep_data = loadtxt(infile, delimiter=',', usecols=range(10)) dep_data = loadtxt(infile, delimiter=',', usecols=range(10, 12)) # Now train/compute, the result provides the model for prediction train_result = train_algo.compute(indep_data, dep_data) # Now let's do some prediction
def setUpClass(cls):
    """Initialize daal4py by calling ``d4p.daalinit()`` with its defaults."""
    d4p.daalinit()
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso from sklearn.cluster import KMeans, DBSCAN from sklearn.decomposition import PCA from sklearn.svm import SVC, NuSVC, SVR, NuSVR from sklearn.manifold import TSNE from sklearn.model_selection import train_test_split from sklearn.datasets import (make_classification, load_breast_cancer, load_diabetes, load_iris, load_boston) from sklearn.metrics import pairwise_distances, roc_auc_score from scipy import sparse from daal4py.sklearn._utils import daal_check_version # to reproduce errors even in CI d4p.daalinit(nthreads=100) def get_class_name(x): return x.__class__.__name__ def method_processing(X, clf, methods): res = [] name = [] for i in methods: if i == 'predict': res.append(clf.predict(X)) name.append(get_class_name(clf) + '.predict(X)') elif i == 'predict_proba': res.append(clf.predict_proba(X))
import daal4py
import hpat
import numpy as np

# Start daal4py in SPMD mode before any jitted function runs.
daal4py.daalinit(spmd=True)


# Predict on N // 2 random observations of D features with a trained model.
@hpat.jit
def lr_predict(N, D, model):
    # Array dimensions must be integers. N / 2 is a float under true
    # division, so floor division is used for the row count.
    data = np.random.ranf((N // 2, D))
    return daal4py.linear_regression_prediction().compute(data, model)


# Train a linear regression on N random observations of D features with
# two dependent variables per observation.
@hpat.jit
def lr_train(N, D):
    data = np.random.ranf((N, D))
    gt = np.random.ranf((N, 2))
    return daal4py.linear_regression_training(interceptFlag=True,
                                              method='qrDense').compute(
                                                  data, gt)


t_res = lr_train(1000, 10)
p_res = lr_predict(1000, 10, t_res.model)
print(p_res.prediction[0], t_res.model.NumberOfBetas)

# Print HPAT's report on how the arrays above were distributed.
hpat.distribution_report()

daal4py.daalfini()