#******************************************************************************* # daal4py Linear Regression example for distributed memory systems; SPMD mode # run like this: # mpirun -n 4 python ./linreg_spmd.py import daal4py as d4p from numpy import loadtxt, allclose if __name__ == "__main__": # Initialize SPMD mode d4p.daalinit() # Each process gets its own data infile = "./data/distributed/linear_regression_train_" + str( d4p.my_procid() + 1) + ".csv" # Configure a Linear regression training object train_algo = d4p.linear_regression_training(distributed=True) # Read data. Let's have 10 independent, and 2 dependent variables (for each observation) indep_data = loadtxt(infile, delimiter=',', usecols=range(10)) dep_data = loadtxt(infile, delimiter=',', usecols=range(10, 12)) # Now train/compute, the result provides the model for prediction train_result = train_algo.compute(indep_data, dep_data) # Now let's do some prediction # It run only on a single node if d4p.my_procid() == 0: predict_algo = d4p.linear_regression_prediction() # read test data (with same #features)
train_result = train_alg.compute(train_data, train_labels) # Now let's do some prediction # It operates on the same data on each process # read testing data from file with 20 features per observation testfile = "./data/batch/binary_cls_test.csv" predict_data = readcsv(testfile, range(nFeatures)) predict_labels = readcsv(testfile, range(nFeatures, nFeatures + 1)) # set parameters and compute predictions predict_alg = d4p.logistic_regression_prediction(nClasses=nClasses, method=method) predict_result = predict_alg.compute(predict_data, train_result.model) # the prediction result provides prediction assert predict_result.prediction.shape == (predict_data.shape[0], train_labels.shape[1]) return (train_result, predict_result, predict_labels) if __name__ == "__main__": # Initialize SPMD mode d4p.daalinit() (train_result, predict_result, predict_labels) = main() if d4p.my_procid() == 0: print("\nLogistic Regression coefficients:\n", train_result.model.Beta) print("\nLogistic regression prediction results (first 10 rows):\n", predict_result.prediction[0:10]) print("\nGround truth (first 10 rows):\n", predict_labels[0:10]) print('All looks good!') d4p.daalfini()
import pandas as pd
import numpy as np

# Now let's **load** in the dataset and **organize** it as necessary to work
# with our model. For distributed, every file has a unique ID.
#
# We will also **initialize the distribution engine**.

# In[3]:

# Start the distribution engine before any process-specific calls are made.
d4p.daalinit()

# Each process reads its own input file; the file names are 1-based while
# d4p.my_procid() is used here as a 0-based offset, hence the "+ 1".
file_index = d4p.my_procid() + 1
infile = "./data/distributed_data/daal4py_Distributed_Kmeans_" + str(file_index) + ".csv"

# Load this process's share of the data.
X = pd.read_csv(infile)

# ## Computing and Saving Initial Centroids
# Time to **initialize our centroids!**

# In[4]:

# Configure the centroid initialization (3 clusters, "plusPlusDense" method),
# then run it on the local data to obtain the initial centroids.
centroid_init = d4p.kmeans_init(nClusters=3, method="plusPlusDense")
init_result = centroid_init.compute(X)
import pandas as pd
import numpy as np
import pickle

# Now let's **load** in the dataset and **organize** it as necessary to work
# with our model. For distributed, every file has a unique ID.
#
# We will also **initialize the distribution engine**.

# In[3]:

d4p.daalinit()  # initializes the distribution engine

# organizing variables used in the model for prediction
# each process gets its own data (file names are numbered from 1)
infile = "./data/distributed_data/linear_regression_train_" + str(
    d4p.my_procid() + 1) + ".csv"

# Read the file a single time and derive both the features and the target
# from the same frame, instead of parsing the CSV twice.
train_frame = pd.read_csv(infile)
indep_data = train_frame.drop(["target"], axis=1)  # house characteristics
dep_data = train_frame["target"]  # house price

# ## Training and Saving the Model
# Time to **train our model** and look at the model's features!

# In[4]:

# training the model for prediction; every process contributes its local data
train_result = d4p.linear_regression_training(distributed=True).compute(
    indep_data, dep_data)
# limitations under the License. #******************************************************************************* # daal4py SVD example for distributed memory systems; SPMD mode # run like this: # mpirun -n 4 python ./svd_spmd.py import daal4py as d4p from numpy import loadtxt, allclose if __name__ == "__main__": # Initialize SPMD mode d4p.daalinit() # Each process gets its own data infile = "./data/distributed/svd_{}.csv".format(d4p.my_procid() + 1) # configure a SVD object algo = d4p.svd(distributed=True) # let's provide a file directly, not a table/array result1 = algo.compute(infile) # We can also load the data ourselfs and provide the numpy array data = loadtxt(infile, delimiter=',') result2 = algo.compute(data) # SVD result objects provide leftSingularMatrix, rightSingularMatrix and singularValues # leftSingularMatrix not yet supported in dist mode assert result1.leftSingularMatrix == None and result2.leftSingularMatrix == None assert allclose(result1.rightSingularMatrix,
# limitations under the License. #=============================================================================== # daal4py PCA example for distributed memory systems; SPMD mode # run like this: # mpirun -n 4 python ./pca_spmd.py import daal4py as d4p from numpy import loadtxt, allclose if __name__ == "__main__": # Initialize SPMD mode d4p.daalinit() # Each process gets its own data infile = "./data/distributed/pca_normalized_" + str(d4p.my_procid() + 1) + ".csv" # configure a PCA object to use svd instead of default correlation algo = d4p.pca(method='svdDense', distributed=True) # let's provide a file directly, not a table/array result1 = algo.compute(infile) # We can also load the data ourselfs and provide the numpy array data = loadtxt(infile, delimiter=',') result2 = algo.compute(data) # PCA result objects provide eigenvalues, eigenvectors, means and variances assert allclose(result1.eigenvalues, result2.eigenvalues) assert allclose(result1.eigenvectors, result2.eigenvectors) assert result1.means is None and \
#******************************************************************************* # daal4py Linear Regression example for distributed memory systems; SPMD mode # run like this: # mpirun -genv DIST_CNC=MPI -n 4 python ./linreg_spmd.py import daal4py as d4p from numpy import loadtxt, allclose if __name__ == "__main__": # Initialize SPMD mode d4p.daalinit(spmd=True) # Each process gets its own data infile = "./data/distributed/linear_regression_train_" + str(d4p.my_procid()+1) + ".csv" # Configure a Linear regression training object train_algo = d4p.linear_regression_training(distributed=True) # Read data. Let's have 10 independent, and 2 dependent variables (for each observation) indep_data = loadtxt(infile, delimiter=',', usecols=range(10)) dep_data = loadtxt(infile, delimiter=',', usecols=range(10,12)) # Now train/compute, the result provides the model for prediction train_result = train_algo.compute(indep_data, dep_data) # Now let's do some prediction # It run only on a single node if d4p.my_procid() == 0: predict_algo = d4p.linear_regression_prediction(distributed=True) # read test data (with same #features)
def main(method='defaultDense'):
    """Run distributed DBSCAN on this process's share of the input data.

    The whole CSV file is loaded and then cut into contiguous row ranges,
    one per process, so that every observation is assigned to exactly one
    process (the ranges differ by at most one row when the row count is not
    a multiple of the process count).

    Parameters
    ----------
    method : str
        Accepted for interface compatibility; this function does not use it.

    Returns
    -------
    The result object returned by ``d4p.dbscan(...).compute(data)``.
    """
    infile = "./data/batch/dbscan_dense.csv"
    epsilon = 0.02
    minObservations = 180

    # Load the data
    data = np.loadtxt(infile, delimiter=',')

    # Balanced contiguous partition of the rows. Using cumulative integer
    # bounds (rather than a fixed rows-per-process count) keeps the trailing
    # `rows % num_procs` observations instead of silently dropping them, and
    # yields the same slices as before whenever the division is exact.
    n_rows = data.shape[0]
    n_procs = d4p.num_procs()
    rank = d4p.my_procid()
    first = rank * n_rows // n_procs
    last = (rank + 1) * n_rows // n_procs
    data = data[first:last, :]

    # configure dbscan main object
    algo = d4p.dbscan(minObservations=minObservations,
                      epsilon=epsilon,
                      distributed=True)
    # and compute
    result = algo.compute(data)

    return result


if __name__ == "__main__":
    # Initialize SPMD mode
    d4p.daalinit()
    result = main()
    print("\nResults on node with id = ", d4p.my_procid(), " :\n",
          "\nFirst 10 cluster assignments:\n", result.assignments[0:10],
          "\nNumber of clusters:\n", result.nClusters)
    d4p.daalfini()
#******************************************************************************* # daal4py SVD example for distributed memory systems; SPMD mode # run like this: # mpirun -genv DIST_CNC=MPI -n 4 python ./svd_spmd.py import daal4py as d4p from numpy import loadtxt, allclose if __name__ == "__main__": # Initialize SPMD mode d4p.daalinit(spmd=True) # Each process gets its own data infile = "./data/distributed/svd_" + str(d4p.my_procid() + 1) + ".csv" # configure a SVD object algo = d4p.svd(distributed=True) # let's provide a file directly, not a table/array result1 = algo.compute(infile) # We can also load the data ourselfs and provide the numpy array data = loadtxt(infile, delimiter=',') result2 = algo.compute(data) # SVD result objects provide leftSingularMatrix, rightSingularMatrix and singularValues # leftSingularMatrix not yet supported in dist mode assert result1.leftSingularMatrix == None and result2.leftSingularMatrix == None assert allclose(result1.rightSingularMatrix,
# Initialize SPMD mode
d4p.daalinit()

infile = "./data/distributed/kmeans_dense.csv"
nClusters = 10
maxIter = 25

# configure a kmeans-init
init_algo = d4p.kmeans_init(nClusters, method="plusPlusDense", distributed=True)

# Load the data
data = loadtxt(infile, delimiter=',')

# Now slice the data; it would have been better to read only what we need,
# of course...
# Each process takes a contiguous row range. Cumulative integer bounds are
# used so that, when the row count is not a multiple of the process count,
# the trailing rows are still assigned to a process instead of being dropped;
# when the division is exact the slices are the same as equal-sized chunks.
n_rows = data.shape[0]
n_procs = d4p.num_procs()
rank = d4p.my_procid()
data = data[rank * n_rows // n_procs:(rank + 1) * n_rows // n_procs, :]

# compute initial centroids
init_result = init_algo.compute(data)
# The result provides the initial centroids
assert init_result.centroids.shape[0] == nClusters

# configure kmeans main object
algo = d4p.kmeans(nClusters, maxIter, distributed=True)
# compute the clusters/centroids
result = algo.compute(data, init_result.centroids)

# Note: we could have done this in just one line:
# d4p.kmeans(nClusters, maxIter, assignFlag=True, distributed=True).compute(data, d4p.kmeans_init(nClusters, method="plusPlusDense", distributed=True).compute(data).centroids)

# Kmeans result objects provide centroids, goalFunction, nIterations and objectiveFunction