def svd(self, Data_Path, target, n):
    '''
    daal4py SVD SPMD Mode
    '''

    # Initialize SPMD mode
    d4p.daalinit(nthreads=n)

    # Train setup
    file_path = Data_Path + str(d4p.my_procid() + 1) + ".csv"
    data = pd.read_csv(file_path)
    data = data.drop(target, axis=1)

    algo = d4p.svd(distributed=True)
    self.logger.info('Training the SVD in pydaal SPMD Mode')

    # SVD result
    svd_start_time = time.time()
    result = algo.compute(data)
    self.latency["Parallel_SVD_SPMD_Time"] = time.time() - svd_start_time

    # result is available on all processes - but we print only on root
    if d4p.my_procid() == 0:
        print("SVD completed", result)

    self.logger.info('Completed SVD in pydaal SPMD Mode')
    d4p.daalfini()
    return
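# Usage sketch (not from the original sources): a minimal, standalone way the
# SPMD SVD above could be exercised. Assumptions: per-rank input chunks named
# ./data/chunk_1.csv, ./data/chunk_2.csv, ... and an MPI launch such as
# "mpirun -n 4 python svd_spmd_driver.py"; the file names and script name are
# hypothetical.
import daal4py as d4p
import pandas as pd

if __name__ == "__main__":
    d4p.daalinit()  # start the distribution engine (one MPI process per chunk)
    chunk = pd.read_csv("./data/chunk_" + str(d4p.my_procid() + 1) + ".csv")
    result = d4p.svd(distributed=True).compute(chunk)  # each rank contributes its chunk
    if d4p.my_procid() == 0:
        # the factorization is available on every rank; print once on root
        print(result.singularValues)
    d4p.daalfini()  # stop the distribution engine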
def pca(self, Data_Path, target, n):
    '''
    daal4py PCA SPMD Mode
    '''

    # Initialize SPMD mode
    d4p.daalinit(nthreads=n)

    # Train setup
    file_path = Data_Path + str(d4p.my_procid() + 1) + ".csv"
    data = pd.read_csv(file_path)
    data = data.drop(target, axis=1)

    # configure a PCA object
    algo = d4p.pca(method='svdDense', distributed=True)
    self.logger.info('Training the PCA in pydaal SPMD Mode')

    start = time.time()
    result = algo.compute(data)
    self.latency['Parallel_PCA_SPMD_Time'] = time.time() - start

    # result is available on all processes - but we print only on root
    if d4p.my_procid() == 0:
        print("PCA completed", result)
        self.latency["Overall Parallel PCA SPMD Time"] = time.time() - start

    d4p.daalfini()
    self.logger.info('Completed PCA in pydaal SPMD Mode')
    return
def linearRegression(self, Data_Path, test_data_path, target, n):
    '''
    daal4py Linear Regression SPMD Mode
    '''

    # Initialize SPMD mode
    d4p.daalinit(nthreads=n)

    # training setup
    file = Data_Path + str(d4p.my_procid() + 1) + ".csv"
    data = pd.read_csv(file)
    X = data.drop(columns=target)
    y = data[target]

    train_algo = d4p.linear_regression_training(method='qrDense',
                                                distributed=True)

    self.logger.info('Training the Linear Regression in pydaal SPMD Mode')

    start = time.time()
    train_result = train_algo.compute(X, y)
    self.latency['Parallel_LinearRegression_Pydaal_Time'] = time.time() - start

    # test file setup
    test = pd.read_csv(test_data_path)
    y_test = test[target]
    X_test = test.drop(target, axis=1)

    # prediction and metrics run only on the root process, where the
    # prediction result exists
    if d4p.my_procid() == 0:
        predict_algo = d4p.linear_regression_prediction()

        # now predict using the model from the training above
        predict_result = predict_algo.compute(X_test, train_result.model)
        self.latency["Overall Parallel Linear Regression Prediction SPMD Time"] = \
            time.time() - start

        # The prediction result provides prediction
        # assert predict_result.prediction.shape == (X_test.shape[0], y.shape[1])

        # Compute metrics
        mse = mean_squared_error(y_test, predict_result.prediction)
        r2score = r2_score(y_test, predict_result.prediction)

        # Store the time taken and model metrics
        self.metrics['MSE_Parallel_LinearRegression_Pydaal'] = mse
        self.metrics['r2score_Parallel_LinearRegression_Pydaal'] = r2score

    d4p.daalfini()
    self.logger.info('Completed Linear Regression in pydaal SPMD Mode')
    return
def naiveBayes(self, Data_Path, test_data_path, target, n):
    '''
    daal4py Naive Bayes SPMD Mode
    '''

    # Initialize SPMD mode
    d4p.daalinit(nthreads=n)

    # training setup
    file = Data_Path + str(d4p.my_procid() + 1) + ".csv"
    data = pd.read_csv(file)
    X = data.drop(columns=target)
    y = data[target]

    # test file setup
    test = pd.read_csv(test_data_path)
    y_test = test[target]
    X_test = test.drop(target, axis=1)

    # store the number of unique target values
    category_count = len(y.unique())
    # print(category_count)

    # Configure a training object
    train_algo = d4p.multinomial_naive_bayes_training(category_count,
                                                      method='defaultDense',
                                                      distributed=True)

    self.logger.info('Training the Naive Bayes in pydaal SPMD Mode')

    start = time.time()
    train_result = train_algo.compute(X, y)
    self.latency['Parallel_NaiveBayes_Pydaal_Time'] = time.time() - start

    # Now let's do some prediction
    # It runs only on a single node
    if d4p.my_procid() == 0:
        predict_algo = d4p.multinomial_naive_bayes_prediction(category_count)

        # now predict using the model from the training above
        presult = predict_algo.compute(X_test, train_result.model)
        self.latency["Overall Parallel Naive Bayes Prediction SPMD Time"] = \
            time.time() - start

    d4p.daalfini()
    self.logger.info('Completed Naive Bayes in pydaal SPMD Mode')
    return
def ridgeRegression(self, Data_Path, test_data_path, target, n):
    '''
    daal4py Ridge Regression SPMD Mode
    '''

    # Initialize SPMD mode
    d4p.daalinit(nthreads=n)

    file = Data_Path + str(d4p.my_procid() + 1) + ".csv"

    # training setup
    data = pd.read_csv(file)
    X = data.drop(columns=target)
    y = data[target]

    # test file setup
    test = pd.read_csv(test_data_path)
    y_test = test[target]
    X_test = test.drop(target, axis=1)

    # Configure a Ridge regression training object
    train_algo = d4p.ridge_regression_training(distributed=True,
                                               interceptFlag=True)

    self.logger.info('Training the Ridge Regression in pydaal SPMD Mode')

    start_time = time.time()
    train_result = train_algo.compute(X, y)
    self.latency["Parallel Ridge Regression SPMD Time"] = time.time() - start_time

    # Only process #0 reports results; the prediction and the metrics derived
    # from it are computed there
    if d4p.my_procid() == 0:
        predict_algo = d4p.ridge_regression_prediction()

        # now predict using the model from the training above
        predict_result = predict_algo.compute(X_test, train_result.model)

        # Compute metrics
        mse = mean_squared_error(y_test, predict_result.prediction)
        r2score = r2_score(y_test, predict_result.prediction)

        # Store the time taken and model metrics
        self.metrics["MSE For Parallel Ridge regression SPMD"] = mse
        self.metrics["R2 Score For Parallel Ridge regression SPMD"] = r2score

    self.logger.info('Completed Ridge Regression in pydaal SPMD Mode')
    d4p.daalfini()
    return
def kMeans(self, Data_Path, n):
    '''
    daal4py KMeans Clustering SPMD Mode
    '''

    nClusters = 4
    maxIter = 25  # fixed maximum number of iterations

    # Initialize SPMD mode
    d4p.daalinit(nthreads=n)

    # training setup
    file_path = Data_Path + str(d4p.my_procid() + 1) + ".csv"
    data = pd.read_csv(file_path)

    init_algo = d4p.kmeans_init(nClusters=nClusters,
                                distributed=True,
                                method="plusPlusDense")

    self.logger.info('Training the KMeans in pydaal SPMD Mode')

    # compute initial centroids (a single compute call is sufficient)
    init_result = init_algo.compute(data)

    # configure kmeans main object
    algo = d4p.kmeans(nClusters, maxIter, distributed=True)

    kmeans_start_time = time.time()

    # compute the clusters/centroids
    result = algo.compute(data, init_result.centroids)
    self.latency["Parallel_KMeans_SPMD_Time"] = time.time() - kmeans_start_time

    # result is available on all processes - but we print only on root
    if d4p.my_procid() == 0:
        print("KMeans completed", result)

    self.logger.info('Completed KMeans in pydaal SPMD Mode')
    d4p.daalfini()
    return
# method="plusPlusDense", # distributed=True # ).compute(data).centroids # ) # Kmeans result objects provide centroids, goalFunction, # nIterations and objectiveFunction assert result.centroids.shape[0] == nClusters assert result.nIterations <= maxIter # we need an extra call to kmeans to get the assignments # (not directly supported through parameter assignFlag yet in SPMD mode) algo = d4p.kmeans(nClusters, 0, assignFlag=True) # maxIt=0; not distributed, we compute on local data only! assignments = algo.compute(data, result.centroids).assignments return (assignments, result) if __name__ == "__main__": # Initialize SPMD mode d4p.daalinit() (assignments, result) = main() # result is available on all processes - but we print only on root if d4p.my_procid() == 0: print("\nFirst 10 cluster assignments:\n", assignments[0:10]) print("\nFirst 10 dimensions of centroids:\n", result.centroids[:, 0:10]) print("\nObjective function value:\n", result.objectiveFunction) print('All looks good!') d4p.daalfini()
# # Assign The Data to Clusters and Save The Results

# Let's **assign the data** to clusters.

# In[7]:


# compute the clusters/centroids
kmeans_result = d4p.kmeans(nClusters=3, maxIterations=5,
                           assignFlag=True).compute(X, init_result.centroids)


# To **get Kmeans result objects** (assignments, centroids,
# goalFunction [deprecated], nIterations, and objectiveFunction):

# In[8]:


# retrieving and printing cluster assignments
assignments = kmeans_result.assignments
print("Here are the cluster assignments for the first 5 data points: \n\n",
      assignments[:5])


# Now let's **export the cluster assignments** to a **CSV file**. We will also
# **stop the distribution engine.**

# In[9]:


# now export the results to a CSV file
results_filename = "./results/daal4py_Distributed_Kmeans_results_" + str(
    d4p.my_procid() + 1) + ".csv"
np.savetxt(results_filename, assignments, delimiter=",")

d4p.daalfini()  # stops the distribution engine

print('[CODE_SAMPLE_COMPLETED_SUCCESFULLY]')
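# Post-processing sketch (not from the original sources): once every rank has
# written its per-process results file above, the assignments can be stitched
# back together. Assumption: the run used 4 MPI processes; set n_ranks to the
# actual process count.
import numpy as np

n_ranks = 4  # assumed number of MPI processes used for the run above
parts = [np.loadtxt("./results/daal4py_Distributed_Kmeans_results_" + str(rank + 1) + ".csv",
                    delimiter=",")
         for rank in range(n_ranks)]
all_assignments = np.concatenate(parts)  # one cluster label per row, across all chunks
print("total labelled rows:", all_assignments.shape[0])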
def tearDownClass(cls):
    d4p.daalfini()
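# Fixture sketch (not from the original sources): one way the teardown above can
# be paired with a matching setup in a unittest class, so SPMD mode is started
# and stopped exactly once per test class. The class name is hypothetical.
import unittest
import daal4py as d4p


class SPMDTestCase(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        d4p.daalinit()  # start the distribution engine before any test runs

    @classmethod
    def tearDownClass(cls):
        d4p.daalfini()  # matching shutdown, as in the snippet above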
import daal4py
import hpat
import numpy as np

daal4py.daalinit(spmd=True)


@hpat.jit
def lr_predict(N, D, model):
    # integer division so the shape tuple stays integral
    data = np.random.ranf((N // 2, D))
    return daal4py.linear_regression_prediction().compute(data, model)


@hpat.jit
def lr_train(N, D):
    data = np.random.ranf((N, D))
    gt = np.random.ranf((N, 2))
    return daal4py.linear_regression_training(interceptFlag=True,
                                              method='qrDense').compute(data, gt)


t_res = lr_train(1000, 10)
p_res = lr_predict(1000, 10, t_res.model)

print(p_res.prediction[0], t_res.model.NumberOfBetas)

hpat.distribution_report()
daal4py.daalfini()