Пример #1
0
def set_daal_num_threads(num_threads):
    """Apply a thread count to daal4py when the package is importable.

    A falsy ``num_threads`` (``None`` or ``0``) leaves daal4py untouched.
    When ``daal4py`` cannot be imported, a notice is printed and the
    request is ignored. Always returns ``None``.
    """
    try:
        import daal4py as d4p

        # Nothing to configure unless a thread count was actually requested.
        if not num_threads:
            return
        d4p.daalinit(nthreads=num_threads)
    except ImportError:
        print("@ Package 'daal4py' was not found. Number of threads is being ignored")
Пример #2
0
    def pca(self, Data_Path, target, n):
        '''
        Run daal4py PCA in SPMD mode and record the elapsed times.

        Each process reads ``Data_Path + str(process id + 1) + ".csv"``,
        drops the ``target`` column and calls ``compute`` on a distributed
        ``svdDense`` PCA object. ``n`` is passed to ``daalinit`` as
        ``nthreads``. Timings are written to ``self.latency``; the method
        returns ``None``.
        '''
        # The engine has to be initialized before the process id is queried.
        d4p.daalinit(nthreads=n)

        # Pick this process's input file and strip the target column.
        shard_file = Data_Path + str(d4p.my_procid() + 1) + ".csv"
        features = pd.read_csv(shard_file).drop(target, axis=1)

        pca_algo = d4p.pca(method='svdDense', distributed=True)

        self.logger.info('Training the PCA in  pydaal SPMD Mode')

        t0 = time.time()
        pca_result = pca_algo.compute(features)
        self.latency['Parallel_PCA_SPMD_Time'] = time.time() - t0

        # Every process holds the result; only the root process reports it.
        if d4p.my_procid() == 0:
            print("PCA completed", pca_result)
            self.latency["Overall Parallel PCA SPMD Time"] = time.time() - t0

        d4p.daalfini()

        self.logger.info('Completed PCA in pydaal SPMD Mode')
Пример #3
0
    def svd(self, Data_Path, target, n):
        '''
        Run daal4py SVD in SPMD mode and record the elapsed time.

        Each process reads ``Data_Path + str(process id + 1) + ".csv"``,
        drops the ``target`` column and calls ``compute`` on a distributed
        SVD object. ``n`` is passed to ``daalinit`` as ``nthreads``. The
        compute time is written to ``self.latency``; the method returns
        ``None``.
        '''
        # The engine has to be initialized before the process id is queried.
        d4p.daalinit(nthreads=n)

        # Pick this process's input file and strip the target column.
        shard_file = Data_Path + str(d4p.my_procid() + 1) + ".csv"
        features = pd.read_csv(shard_file).drop(target, axis=1)

        svd_algo = d4p.svd(distributed=True)
        self.logger.info('Training the SVD in pydaal SPMD Mode')

        t0 = time.time()
        svd_result = svd_algo.compute(features)
        self.latency["Parallel_SVD_SPMD_Time"] = time.time() - t0

        # Every process holds the result; only the root process reports it.
        if d4p.my_procid() == 0:
            print("SVD completed", svd_result)

        self.logger.info('Completed SVD in pydaal SPMD Mode')
        d4p.daalfini()
Пример #4
0
def set_daal_num_threads(num_threads):
    """Apply a thread count to daal4py when the package is importable.

    A falsy ``num_threads`` (``None`` or ``0``) leaves daal4py untouched.
    When ``daal4py`` cannot be imported, the situation is logged at INFO
    level and the request is ignored. Always returns ``None``.
    """
    try:
        import daal4py as d4p

        # Nothing to configure unless a thread count was actually requested.
        if not num_threads:
            return
        d4p.daalinit(nthreads=num_threads)
    except ImportError:
        logging.info('@ Package "daal4py" was not found. Number of threads '
                     'is being ignored')
Пример #5
0
    def linearRegression(self, Data_Path, test_data_path, target, n):
        '''
        daal4py Linear Regression SPMD Mode

        Each process reads ``Data_Path + str(process id + 1) + ".csv"`` and
        takes part in a distributed ``qrDense`` training run. Prediction on
        the file at ``test_data_path`` is performed on process 0 only, so
        the MSE / R2 entries in ``self.metrics`` are written on that process
        only. ``n`` is passed to ``daalinit`` as ``nthreads``. Timings go to
        ``self.latency``; the method returns ``None``.
        '''

        # Initialize SPMD mode
        d4p.daalinit(nthreads=n)

        # training setup
        shard_file = Data_Path + str(d4p.my_procid() + 1) + ".csv"
        data = pd.read_csv(shard_file)
        X = data.drop(columns=target)
        y = data[target]

        train_algo = d4p.linear_regression_training(method='qrDense',
                                                    distributed=True)

        self.logger.info('Training the Linear Regression in pydaal SPMD Mode')

        start = time.time()

        train_result = train_algo.compute(X, y)

        self.latency['Parallel_LinearRegression_Pydaal_Time'] = time.time() - \
            start

        # test file setup
        test = pd.read_csv(test_data_path)

        y_test = test[target]
        X_test = test.drop(target, axis=1)

        # Stays None on every process except the root, which is the only one
        # that runs prediction.
        predict_result = None
        if d4p.my_procid() == 0:
            predict_algo = d4p.linear_regression_prediction()

            # now predict using the model from the training above
            predict_result = predict_algo.compute(X_test, train_result.model)
            self.latency[
                "Overall Parallel Linear Regression Prediction SPMD Time"] = time.time(
                ) - start

        d4p.daalfini()

        self.logger.info('Completed Linear Regression in pydaal SPMD Mode')

        # Metrics need the prediction, which exists only on the root process.
        # Computing them unconditionally raised UnboundLocalError elsewhere.
        if predict_result is not None:
            mse = mean_squared_error(y_test, predict_result.prediction)
            r2score = r2_score(y_test, predict_result.prediction)

            # Store the model metrics
            self.metrics['MSE_Parallel_LinearRegression_Pydaal'] = mse
            self.metrics['r2score_Parallel_LinearRegression_Pydaal'] = r2score

        return
Пример #6
0
    def naiveBayes(self, Data_Path, test_data_path, target, n):
        '''
        Run daal4py multinomial Naive Bayes in SPMD mode and record timings.

        Each process reads ``Data_Path + str(process id + 1) + ".csv"`` and
        takes part in a distributed ``defaultDense`` training run. The
        number of classes is the count of unique ``target`` values in the
        file this process read. Prediction on the file at ``test_data_path``
        runs on process 0 only. ``n`` is passed to ``daalinit`` as
        ``nthreads``. Timings go to ``self.latency``; the method returns
        ``None``.
        '''
        # The engine has to be initialized before the process id is queried.
        d4p.daalinit(nthreads=n)

        # Training data for this process.
        shard_file = Data_Path + str(d4p.my_procid() + 1) + ".csv"
        train_frame = pd.read_csv(shard_file)
        train_features = train_frame.drop(columns=target)
        train_labels = train_frame[target]

        # Test data, loaded on every process.
        test_frame = pd.read_csv(test_data_path)
        # Not used afterwards; the lookup is kept so that a missing target
        # column still fails at this point.
        test_labels = test_frame[target]
        test_features = test_frame.drop(target, axis=1)

        # Number of distinct target values seen in this process's file.
        n_classes = len(train_labels.unique())

        trainer = d4p.multinomial_naive_bayes_training(
            n_classes, method='defaultDense', distributed=True)
        self.logger.info('Training the Naive Bayes in pydaal SPMD Mode')

        t0 = time.time()
        fitted = trainer.compute(train_features, train_labels)
        self.latency['Parallel_NaiveBayes_Pydaal_Time'] = time.time() - t0

        # Prediction is done by the root process only.
        if d4p.my_procid() == 0:
            predictor = d4p.multinomial_naive_bayes_prediction(n_classes)
            # The prediction result itself is not used; only its duration is.
            predictor.compute(test_features, fitted.model)
            overall_key = "Overall Parallel Naive Bayes Prediction SPMD Time"
            self.latency[overall_key] = time.time() - t0

        d4p.daalfini()

        self.logger.info('Completed Naive Bayes in pydaal SPMD Mode')
Пример #7
0
    def ridgeRegression(self, Data_Path, test_data_path, target, n):
        '''
        daal4py Ridge Regression SPMD Mode

        Each process reads ``Data_Path + str(process id + 1) + ".csv"`` and
        takes part in a distributed training run with ``interceptFlag=True``.
        Prediction on the file at ``test_data_path`` is performed on process
        0 only, so the MSE / R2 entries in ``self.metrics`` are written on
        that process only. ``n`` is passed to ``daalinit`` as ``nthreads``.
        The training time goes to ``self.latency``; the method returns
        ``None``.
        '''

        # Initialize SPMD mode
        d4p.daalinit(nthreads=n)

        shard_file = Data_Path + str(d4p.my_procid() + 1) + ".csv"

        # training
        data = pd.read_csv(shard_file)
        X = data.drop(columns=target)
        y = data[target]

        # test file setup
        test = pd.read_csv(test_data_path)
        y_test = test[target]
        X_test = test.drop(target, axis=1)

        # Configure a Ridge regression training object
        train_algo = d4p.ridge_regression_training(distributed=True,
                                                   interceptFlag=True)
        self.logger.info('Training the Ridge Regression in pydaal SPMD Mode')

        start_time = time.time()

        train_result = train_algo.compute(X, y)

        self.latency["Parallel Ridge Regression SPMD Time"] = time.time() - \
            start_time

        # Stays None on every process except the root, which is the only one
        # that runs prediction.
        predict_result = None
        if d4p.my_procid() == 0:
            predict_algo = d4p.ridge_regression_prediction()
            # now predict using the model from the training above
            predict_result = predict_algo.compute(X_test, train_result.model)

        self.logger.info('Completed Ridge Regression in pydaal SPMD Mode')
        d4p.daalfini()

        # Metrics need the prediction, which exists only on the root process.
        # Computing them unconditionally raised UnboundLocalError elsewhere.
        if predict_result is not None:
            mse = mean_squared_error(y_test, predict_result.prediction)
            r2score = r2_score(y_test, predict_result.prediction)

            # Store the model metrics
            self.metrics["MSE For Parallel Ridge regression SPMD"] = mse
            self.metrics["R2 Score For Parallel Ridge regression SPMD"] = r2score

        return
Пример #8
0
    def kMeans(self, Data_Path, n, nClusters=4, maxIter=25):
        '''
        daal4py KMeans Clustering SPMD Mode

        Each process reads ``Data_Path + str(process id + 1) + ".csv"``,
        computes initial centroids with a distributed ``plusPlusDense``
        initializer and then runs distributed K-Means.

        Data_Path -- prefix of the per-process CSV files
        n         -- passed to ``daalinit`` as ``nthreads``
        nClusters -- number of clusters (default 4)
        maxIter   -- maximum number of iterations (default 25)

        The K-Means compute time is written to ``self.latency``; the method
        returns ``None``.
        '''

        # Initialize SPMD mode
        d4p.daalinit(nthreads=n)

        # training setup
        file_path = Data_Path + str(d4p.my_procid() + 1) + ".csv"
        data = pd.read_csv(file_path)
        init_algo = d4p.kmeans_init(nClusters=nClusters,
                                    distributed=True,
                                    method="plusPlusDense")

        self.logger.info('Training the KMeans in pydaal SPMD Mode')

        # Compute the initial centroids once. An earlier version ran this
        # distributed computation twice and discarded the first result.
        init_result = init_algo.compute(data)

        # configure kmeans main object
        algo = d4p.kmeans(nClusters, maxIter, distributed=True)
        kmeans_start_time = time.time()
        # compute the clusters/centroids
        result = algo.compute(data, init_result.centroids)
        self.latency["Parallel_KMeans_SPMD_Time"] = time.time() - \
            kmeans_start_time

        # result is available on all processes - but we print only on root
        if d4p.my_procid() == 0:
            print("KMeans completed", result)

        self.logger.info('Completed KMeans in pydaal SPMD Mode')

        d4p.daalfini()

        return
Пример #9
0
    #         method="plusPlusDense",
    #         distributed=True
    #     ).compute(data).centroids
    # )

    # Kmeans result objects provide centroids, goalFunction,
    # nIterations and objectiveFunction
    assert result.centroids.shape[0] == nClusters
    assert result.nIterations <= maxIter
    # we need an extra call to kmeans to get the assignments
    # (not directly supported through parameter assignFlag yet in SPMD mode)
    algo = d4p.kmeans(nClusters, 0, assignFlag=True)
    # maxIt=0; not distributed, we compute on local data only!
    assignments = algo.compute(data, result.centroids).assignments

    return (assignments, result)


# Script entry point: initialize daal4py, run main(), report on the root
# process, then finalize daal4py.
if __name__ == "__main__":
    # Initialize SPMD mode
    d4p.daalinit()
    # main() returns the (assignments, result) pair unpacked here.
    (assignments, result) = main()
    # result is available on all processes - but we print only on root
    if d4p.my_procid() == 0:
        print("\nFirst 10 cluster assignments:\n", assignments[0:10])
        # Only the first 10 columns of the centroids are printed.
        print("\nFirst 10 dimensions of centroids:\n", result.centroids[:,
                                                                        0:10])
        print("\nObjective function value:\n", result.objectiveFunction)
        print('All looks good!')
    # Called on every process, not just the root.
    d4p.daalfini()
Пример #10
0
# In[2]:

##### daal4py K-Means Clustering example for Distributed Memory Systems [SPMD Mode] #####
import daal4py as d4p
import pickle
import pandas as pd
import numpy as np

# Now let's **load** in the dataset and **organize** it as necessary to work with our model. For distributed, every file has a unique ID.
#
# We will also **initialize the distribution engine**.

# In[3]:

d4p.daalinit()  # initialize the distribution engine

# Each process gets its own data: the input file is selected by this
# process's id plus one.
infile = "./data/distributed_data/daal4py_Distributed_Kmeans_" + str(
    d4p.my_procid() + 1) + ".csv"

# Read this process's CSV file with pandas.
X = pd.read_csv(infile)

# ## Computing and Saving Initial Centroids

# Time to **initialize our centroids!**

# In[4]:
Пример #11
0
#
# See the License for the specific language governing permissions and
# limitations under the License.
#*******************************************************************************

# daal4py Ridge Regression example for distributed memory systems; SPMD mode
# run like this:
#    mpirun -genv DIST_CNC=MPI -n 4 python ./ridge_regression_spmd.py

import daal4py as d4p
from numpy import loadtxt, allclose

if __name__ == "__main__":

    # Initialize SPMD mode
    d4p.daalinit(spmd=True)

    # Each process gets its own data
    infile = "./data/distributed/linear_regression_train_" + str(
        d4p.my_procid() + 1) + ".csv"

    # Configure a Ridge regression training object
    train_algo = d4p.ridge_regression_training(distributed=True)

    # Read data. Let's have 10 independent, and 2 dependent variables (for each observation)
    indep_data = loadtxt(infile, delimiter=',', usecols=range(10))
    dep_data = loadtxt(infile, delimiter=',', usecols=range(10, 12))
    # Now train/compute, the result provides the model for prediction
    train_result = train_algo.compute(indep_data, dep_data)

    # Now let's do some prediction
Пример #12
0
 def setUpClass(cls):
     """Initialize daal4py by calling ``daalinit()`` with default arguments."""
     # NOTE(review): presumably a unittest class-level fixture; any
     # @classmethod decorator would sit above this view - confirm.
     d4p.daalinit()
Пример #13
0
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.svm import SVC, NuSVC, SVR, NuSVR
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split

from sklearn.datasets import (make_classification, load_breast_cancer,
                              load_diabetes, load_iris, load_boston)
from sklearn.metrics import pairwise_distances, roc_auc_score
from scipy import sparse
from daal4py.sklearn._utils import daal_check_version

# Request 100 threads at import time so that errors reproduce even in CI.
# NOTE(review): presumably targets threading-related failures that a low
# thread count would hide - confirm.
d4p.daalinit(nthreads=100)


def get_class_name(x):
    """Return the name of the class that ``x`` reports via ``__class__``."""
    klass = x.__class__
    return klass.__name__


def method_processing(X, clf, methods):
    res = []
    name = []
    for i in methods:
        if i == 'predict':
            res.append(clf.predict(X))
            name.append(get_class_name(clf) + '.predict(X)')
        elif i == 'predict_proba':
            res.append(clf.predict_proba(X))
Пример #14
0
import daal4py
import hpat
import numpy as np

# Initialize daal4py at import time with spmd=True, before any jitted
# function below is called.
daal4py.daalinit(spmd=True)


@hpat.jit
def lr_predict(N, D, model):
    """Predict with a trained linear-regression model on random data.

    Draws an ``(N // 2, D)`` matrix with ``np.random.ranf`` and returns the
    result of ``daal4py.linear_regression_prediction().compute`` for that
    matrix and ``model``.
    """
    # Floor division keeps the row count an integer; ``N / 2`` is a float on
    # Python 3 and is not a valid array dimension.
    data = np.random.ranf((N // 2, D))
    return daal4py.linear_regression_prediction().compute(data, model)


@hpat.jit
def lr_train(N, D):
    """Train a daal4py linear regression on random data.

    Draws an ``(N, D)`` feature matrix and an ``(N, 2)`` target matrix with
    ``np.random.ranf`` and returns the result of the ``qrDense`` training
    algorithm configured with ``interceptFlag=True``.
    """
    features = np.random.ranf((N, D))
    targets = np.random.ranf((N, 2))
    trainer = daal4py.linear_regression_training(interceptFlag=True,
                                                 method='qrDense')
    return trainer.compute(features, targets)


# Train with N=1000, D=10, then predict using the trained model.
t_res = lr_train(1000, 10)
p_res = lr_predict(1000, 10, t_res.model)

# Show the first prediction row and the model's NumberOfBetas attribute.
print(p_res.prediction[0], t_res.model.NumberOfBetas)

# Print HPAT's distribution report.
hpat.distribution_report()

# Finalize daal4py (counterpart of the daalinit() call at the top).
daal4py.daalfini()