def gatherPartialResultsFromNodes(partialResult, partialResults,
                                  partialResultArchLength,
                                  partialResultLocalBuffer,
                                  partialResultMasterBuffer):

    dataArch = InputDataArchive()
    partialResult.serialize(dataArch)
    if partialResultArchLength == 0:
        partialResultArchLength = dataArch.getSizeOfArchive()

    # Serialized data is of equal size on each node
    if rankId == MPI_ROOT and len(partialResultMasterBuffer) == 0:
        partialResultMasterBuffer = np.zeros(partialResultArchLength * nNodes,
                                             dtype=np.uint8)

    if len(partialResultLocalBuffer) == 0:
        partialResultLocalBuffer = np.zeros(partialResultArchLength,
                                            dtype=np.uint8)

    dataArch.copyArchiveToArray(partialResultLocalBuffer)

    # Transfer partial results to step 2 on the root node; comm.gather
    # returns the list of per-node buffers on the root and None elsewhere
    partialResultMasterBuffer = comm.gather(partialResultLocalBuffer)

    if rankId == MPI_ROOT:
        for node in range(nNodes):
            # Deserialize partial results from step 1
            dataArch = OutputDataArchive(partialResultMasterBuffer[node])

            partialResults[node] = training.PartialResult()
            partialResults[node].deserialize(dataArch)
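
Note that mpi4py's lowercase comm.gather pickles the local buffer and returns the gathered list only on the root, so the preallocated partialResultMasterBuffer above is never actually filled in place. A minimal sketch of a buffer-based variant that would use it, assuming mpi4py's uppercase Gather and the same comm, rankId, and MPI_ROOT globals:

def gatherPartialResultsFixedSize(partialResultLocalBuffer,
                                  partialResultMasterBuffer):
    # Uppercase Gather writes directly into the preallocated receive buffer;
    # the receive buffer may be None on non-root ranks
    comm.Gather(partialResultLocalBuffer,
                partialResultMasterBuffer if rankId == MPI_ROOT else None,
                root=MPI_ROOT)
    return partialResultMasterBuffer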
Example #2
def finalizeMergeOnMasterNode(partsRDD):

    # Create an algorithm to compute PCA decomposition using the correlation method on the master node
    pcaMaster = pca.Distributed(step2Master, method=pca.correlationDense)

    covarianceSparse = covariance.Distributed(step2Master,
                                              method=covariance.fastCSR)
    pcaMaster.parameter.covariance = covarianceSparse

    parts_list = partsRDD.collect()

    # Add partial results computed on local nodes to the algorithm on the master node
    for key, pres in parts_list:
        dataArch = OutputDataArchive(pres)
        deserialized_pres = pca.PartialResult(pca.correlationDense)
        deserialized_pres.deserialize(dataArch)
        pcaMaster.input.add(pca.partialResults, deserialized_pres)

    # Compute PCA decomposition on the master node
    pcaMaster.compute()

    # Finalize computations and retrieve the results
    res = pcaMaster.finalizeCompute()

    return {
        'eigenvectors': res.get(pca.eigenvectors),
        'eigenvalues': res.get(pca.eigenvalues)
    }
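
For context, the serialized partial results consumed above are produced by the step-1 PCA algorithm on the worker nodes. A minimal sketch of that side, assuming the same pydaal API (pca.Distributed with step1Local and input.setDataset) and that the caller pairs each returned buffer with a key:

def computestep1Local(dataTable):
    # Create an algorithm to compute PCA decomposition using the correlation method on local nodes
    pcaLocal = pca.Distributed(step1Local, method=pca.correlationDense)
    pcaLocal.input.setDataset(pca.data, dataTable)

    # Compute partial PCA results and serialize them for the master node
    pres = pcaLocal.compute()
    dataArch = InputDataArchive()
    pres.serialize(dataArch)
    return dataArch.getArchiveAsArray()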
    def deserialize(self,
                    serialObjectDict=None,
                    fileName=None,
                    useCompression=False):
        import daal
        if fileName is not None and serialObjectDict is None:
            bufferArray = np.load(fileName)
            buffArrObjName = open(fileName.rsplit(".", 1)[0] + ".txt",
                                  "r").read()
        elif fileName is None and serialObjectDict:
            bufferArray = serialObjectDict["Array Object"]
            buffArrObjName = serialObjectDict["Object Information"]
        else:
            warnings.warn(
                'Expecting either a "serialObjectDict" or a "fileName" argument, NOT both')
            raise SystemExit
        if useCompression:
            bufferArray = MultiSVM.decompress(self, bufferArray)
        dataArch = OutputDataArchive(bufferArray)
        try:
            deSerialObj = eval(buffArrObjName)
        except AttributeError:
            deSerialObj = HomogenNumericTable()
        deSerialObj.deserialize(dataArch)
        return deSerialObj
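
The file layout read by deserialize (a .npy byte array next to a .txt file holding an evaluable type string) implies a matching serialize method. A minimal sketch under those assumptions; MultiSVM.compress is the hypothetical counterpart of the decompress call used above:

    def serialize(self, serialObject, fileName=None, useCompression=False):
        # Serialize the DAAL object into a byte array
        dataArch = InputDataArchive()
        serialObject.serialize(dataArch)
        bufferArray = dataArch.getArchiveAsArray()
        if useCompression:
            bufferArray = MultiSVM.compress(self, bufferArray)
        # Store an evaluable constructor string, e.g. "HomogenNumericTable()"
        objectName = serialObject.__class__.__name__ + "()"
        if fileName is not None:
            # Persist the byte array and the type string side by side
            np.save(fileName, bufferArray)
            with open(fileName.rsplit(".", 1)[0] + ".txt", "w") as f:
                f.write(objectName)
            return None
        return {"Array Object": bufferArray, "Object Information": objectName}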
Example #4
def finalizeMergeOnMasterNode(partsRDD):

    # Create an algorithm to compute a dense variance-covariance matrix on the master node
    covarianceMaster = covariance.Distributed(step=step2Master,
                                              method=covariance.defaultDense)

    parts_list = partsRDD.collect()

    # Add partial results computed on local nodes to the algorithm on the master node
    for _, val in parts_list:
        dataArch = OutputDataArchive(val)
        deserialized_val = covariance.PartialResult()
        deserialized_val.deserialize(dataArch)
        covarianceMaster.input.add(covariance.partialResults, deserialized_val)

    # Compute a dense variance-covariance matrix on the master node
    covarianceMaster.compute()

    # Finalize computations and retrieve the results
    res = covarianceMaster.finalizeCompute()

    result = {}
    result['covariance'] = res.get(covariance.covariance)
    result['mean'] = res.get(covariance.mean)

    return result
def deserializeDAALObject(buffer, object):
    # Create a data archive to deserialize the object
    dataArch = OutputDataArchive(buffer)

    # Deserialize the object from the data archive
    object.deserialize(dataArch)

    return object
def deserializePartialResult(buffer, module, partial=True):
    dataArch = OutputDataArchive(buffer)
    if partial:
        deserialized_pres = module.PartialResult()
    else:
        deserialized_pres = module.Result()
    deserialized_pres.deserialize(dataArch)
    return deserialized_pres
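
Both helpers operate on raw byte buffers, typically the elements of an MPI-gathered list. A short usage sketch with hypothetical buffers buf1 and buf2:

# Rebuild a homogen numeric table from its serialized bytes
table = deserializeDAALObject(buf1, HomogenNumericTable())

# Rebuild a covariance partial result from a step-1 buffer
pres = deserializePartialResult(buf2, covariance)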
Example #7
    def deserializeTrainingResult(self, buffer):
        #  Create a data archive to deserialize the numeric table
        dataArch = OutputDataArchive(buffer)
        #  Create a training result object
        self.trainingResult = training.Result()
        #  Deserialize the numeric table from the data archive
        self.trainingResult.deserialize(dataArch)

        return self.trainingResult
Example #8
def trainModel():
    global trainingResult

    # Retrieve the input data from a .csv file
    trainDataTable = createSparseTable(trainDatasetFileNames[rankId])

    # Initialize FileDataSource to retrieve the input data from a .csv file
    trainLabelsSource = FileDataSource(trainGroundTruthFileNames[rankId],
                                       DataSourceIface.doAllocateNumericTable,
                                       DataSourceIface.doDictionaryFromContext)

    # Retrieve the data from input files
    trainLabelsSource.loadDataBlock()

    # Create an algorithm object to train the Naive Bayes model based on the local-node data
    localAlgorithm = training.Distributed(step1Local,
                                          nClasses,
                                          method=training.fastCSR)

    # Pass a training data set and dependent values to the algorithm
    localAlgorithm.input.set(classifier.training.data, trainDataTable)
    localAlgorithm.input.set(classifier.training.labels,
                             trainLabelsSource.getNumericTable())

    # Train the Naive Bayes model on local nodes
    pres = localAlgorithm.compute()

    # Serialize partial results required by step 2
    dataArch = InputDataArchive()
    pres.serialize(dataArch)

    nodeResults = dataArch.getArchiveAsArray()

    # Transfer partial results to step 2 on the root node
    serializedData = comm.gather(nodeResults)

    if rankId == MPI_ROOT:
        # Create an algorithm object to build the final Naive Bayes model on the master node
        masterAlgorithm = training.Distributed(step2Master,
                                               nClasses,
                                               method=training.fastCSR)

        for i in range(nBlocks):
            # Deserialize partial results from step 1
            dataArch = OutputDataArchive(serializedData[i])

            dataForStep2FromStep1 = training.PartialResult()
            dataForStep2FromStep1.deserialize(dataArch)

            # Set the local Naive Bayes model as input for the master-node algorithm
            masterAlgorithm.input.add(training.partialModels,
                                      dataForStep2FromStep1)

        # Merge and finalizeCompute the Naive Bayes model on the master node
        masterAlgorithm.compute()
        trainingResult = masterAlgorithm.finalizeCompute()
def deserializeCSRNumericTable(buffer):

    dataArch = OutputDataArchive(buffer)

    dataTable = CSRNumericTable(np.zeros(1, dtype=np.float64),
                                np.zeros(1, dtype=np.uint64),
                                np.zeros(1, dtype=np.uint64), 0, 0)

    dataTable.deserialize(dataArch)

    return dataTable
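
The zero-filled placeholder arrays only satisfy the CSRNumericTable constructor; deserialize() then rebuilds the real values, column indices, and row offsets from the archive. A round-trip sketch, assuming csrTable is an existing CSRNumericTable:

# Serialize a CSR table on one node and rebuild it on another
dataArch = InputDataArchive()
csrTable.serialize(dataArch)
restored = deserializeCSRNumericTable(dataArch.getArchiveAsArray())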
Example #10
def deserializeDataCollection(buffer):

    #  Create a data archive to deserialize the numeric table
    dataArch = OutputDataArchive(buffer)

    #  Create a data collection object
    collection = DataCollection()

    #  Deserialize the numeric table from the data archive
    collection.deserialize(dataArch)

    return collection
Example #11
def deserializeNumericTable(buffer):

    #  Create a data archive to deserialize the numeric table
    dataArch = OutputDataArchive(buffer)

    #  Create a numeric table object
    dataTable = HomogenNumericTable()

    #  Deserialize the numeric table from the data archive
    dataTable.deserialize(dataArch)

    return dataTable
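
The serializeNumericTable helper used elsewhere in these examples (e.g. in computeStep2Master in Example #15 below) is the inverse of this function; a minimal sketch, assuming it returns the archive as a byte array:

def serializeNumericTable(dataTable):

    #  Create a data archive to serialize the numeric table
    dataArch = InputDataArchive()

    #  Serialize the numeric table into the data archive
    dataTable.serialize(dataArch)

    #  Return the serialized data as a byte array
    return dataArch.getArchiveAsArray()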
Example #12
def computeOnMasterNode():
    global R, serializedData

    # Create an algorithm to compute QR decomposition on the master node
    algorithm = qr.Distributed(step2Master)

    for i in range(nBlocks):
        # Deserialize partial results from step 1
        dataArch = OutputDataArchive(serializedData[i])

        dataForStep2FromStep1 = DataCollection()
        dataForStep2FromStep1.deserialize(dataArch)

        algorithm.input.add(qr.inputOfStep2FromStep1, i, dataForStep2FromStep1)

    # Compute QR decomposition
    pres = algorithm.compute()

    inputForStep3FromStep2 = pres.getCollection(qr.outputOfStep2ForStep3)

    for i in range(nBlocks):
        # Serialize partial results to transfer to local nodes for step 3
        dataArch = InputDataArchive()
        inputForStep3FromStep2[i].serialize(dataArch)
        length = dataArch.getSizeOfArchive()

        serializedData[i] = np.empty(length, dtype=np.uint8)
        dataArch.copyArchiveToArray(serializedData[i])

    # Retrieve the final result (Result class from qr)
    res = algorithm.getResult()

    R = res.get(qr.matrixR)
Example #13
def trainModel():
    global trainingResult
    masterAlgorithm = training.Distributed_Step2MasterFloat64NormEqDense()

    for filenameIndex in range(rankId, len(trainDatasetFileNames), comm_size):
        trainDataSource = FileDataSource(
            trainDatasetFileNames[filenameIndex],
            DataSourceIface.notAllocateNumericTable,
            DataSourceIface.doDictionaryFromContext)
        trainData = HomogenNumericTable(nFeatures, 0,
                                        NumericTableIface.notAllocate)
        trainDependentVariables = HomogenNumericTable(
            nDependentVariables, 0, NumericTableIface.notAllocate)
        mergedData = MergedNumericTable(trainData, trainDependentVariables)
        trainDataSource.loadDataBlock(mergedData)

        localAlgorithm = training.Distributed_Step1LocalFloat64NormEqDense()
        localAlgorithm.input.set(training.data, trainData)
        localAlgorithm.input.set(training.dependentVariables,
                                 trainDependentVariables)
        pres = localAlgorithm.compute()
        masterAlgorithm.input.add(training.partialModels, pres)

        mergedData.freeDataMemory()
        trainData.freeDataMemory()
        trainDependentVariables.freeDataMemory()

    pres = masterAlgorithm.compute()
    dataArch = InputDataArchive()
    pres.serialize(dataArch)
    nodeResults = dataArch.getArchiveAsArray()
    serializedData = comm.gather(nodeResults)

    if rankId == MPI_ROOT:
        print("Number of processes is %d." % (len(serializedData)))
        masterAlgorithm = training.Distributed_Step2MasterFloat64NormEqDense()

        for i in range(comm_size):
            dataArch = OutputDataArchive(serializedData[i])
            dataForStep2FromStep1 = training.PartialResult()
            dataForStep2FromStep1.deserialize(dataArch)
            masterAlgorithm.input.add(training.partialModels,
                                      dataForStep2FromStep1)
        masterAlgorithm.compute()
        trainingResult = masterAlgorithm.finalizeCompute()
Example #14
def finalizeComputestep1Local():
    global Ui, serializedData

    # Transfer partial results from the root node
    nodeResults = comm.scatter(serializedData)

    # Deserialize partial results from step 2
    dataArch = OutputDataArchive(nodeResults)

    dataFromStep2ForStep3 = DataCollection()
    dataFromStep2ForStep3.deserialize(dataArch)

    # Create an algorithm to compute SVD on local nodes
    algorithm = svd.Distributed(step3Local)
    algorithm.input.set(svd.inputOfStep3FromStep1, dataFromStep1ForStep3)
    algorithm.input.set(svd.inputOfStep3FromStep2, dataFromStep2ForStep3)

    # Compute SVD
    algorithm.compute()
    res = algorithm.finalizeCompute()

    Ui = res.get(svd.leftSingularMatrix)
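
dataFromStep1ForStep3 is a global produced during step 1 on each local node. A minimal sketch of that stage, assuming the pydaal SVD API used above:

def computestep1Local(dataTable):
    global dataFromStep1ForStep3

    # Create an algorithm to compute SVD on local nodes
    algorithm = svd.Distributed(step1Local)
    algorithm.input.set(svd.data, dataTable)

    # Compute SVD and keep the part of the partial result needed in step 3
    pres = algorithm.compute()
    dataFromStep1ForStep3 = pres.get(svd.outputOfStep1ForStep3)

    # Serialize the part destined for step 2 on the master node
    dataArch = InputDataArchive()
    pres.get(svd.outputOfStep1ForStep2).serialize(dataArch)
    return dataArch.getArchiveAsArray()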
Example #15
def computeStep2Master(dataFromStep1ForStep2_RDD):

    nBlocks = int(dataFromStep1ForStep2_RDD.count())

    dataFromStep1ForStep2_list = dataFromStep1ForStep2_RDD.collect()

    # Create an algorithm to compute SVD on the master node
    svdStep2Master = svd.Distributed(step2Master, method=svd.defaultDense)

    for key, data_collection in dataFromStep1ForStep2_list:
        dataArch = OutputDataArchive(data_collection)
        deserialized_data_collection = DataCollection()
        deserialized_data_collection.deserialize(dataArch)
        svdStep2Master.input.add(svd.inputOfStep2FromStep1, key,
                                 deserialized_data_collection)

    # Compute SVD in step 2
    pres = svdStep2Master.compute()

    inputForStep3FromStep2 = pres.getCollection(svd.outputOfStep2ForStep3)

    data_list = []
    for key, _ in dataFromStep1ForStep2_list:
        dc = inputForStep3FromStep2[key]
        serialized_dc = serializeNumericTable(dc)
        data_list.append((key, serialized_dc))

    # Make PairRDD from the list
    dataFromStep2ForStep3_RDD = sc.parallelize(data_list, nBlocks)

    res = svdStep2Master.finalizeCompute()

    result = {
        'Sigma': res.get(svd.singularValues),
        'V': res.get(svd.rightSingularMatrix),
        'from2_for3': dataFromStep2ForStep3_RDD
    }

    return result
Example #16
def finalizeComputestep1Local():
    global Qi, serializedData

    # Transfer partial results from the root node
    nodeResults = comm.scatter(serializedData)

    # Deserialize partial results from step 2
    dataArch = OutputDataArchive(nodeResults)

    dataFromStep2ForStep3 = DataCollection()
    dataFromStep2ForStep3.deserialize(dataArch)

    # Create an algorithm to compute QR decomposition on local nodes
    algorithm = qr.Distributed(step3Local)

    algorithm.input.set(qr.inputOfStep3FromStep1, dataFromStep1ForStep3)
    algorithm.input.set(qr.inputOfStep3FromStep2, dataFromStep2ForStep3)

    # Compute QR decomposition
    algorithm.compute()
    res = algorithm.finalizeCompute()

    Qi = res.get(qr.matrixQ)
def broadcastWeightsAndBiasesToNodes(wb):

    wbBuffer = None
    # Serialize weights and biases on the root node
    if rankId == MPI_ROOT:
        if not wb:
            # The weights and biases table must be valid and non-empty on the master
            return HomogenNumericTable()

        wbDataArch = InputDataArchive()
        wb.serialize(wbDataArch)
        wbBuffer = np.zeros(wbDataArch.getSizeOfArchive(), dtype=np.uint8)
        wbDataArch.copyArchiveToArray(wbBuffer)

    # Broadcast the serialized weights and biases
    wbBuffer = comm.bcast(wbBuffer)

    # Deserialize weights and biases
    wbDataArchLocal = OutputDataArchive(wbBuffer)

    wbLocal = HomogenNumericTable(ntype=np.float32)
    wbLocal.deserialize(wbDataArchLocal)

    return wbLocal
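
A usage note: the broadcast is collective, so every rank must call the function, and only the root passes a real table. Note also that the early return on an empty table skips comm.bcast on the root while the workers still wait in it, so callers should ensure wb is valid on the root. A sketch with hypothetical names:

# wb holds the weights and biases table on the root; workers pass None
wbLocal = broadcastWeightsAndBiasesToNodes(wb if rankId == MPI_ROOT else None)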
Example #18
def computeOnMasterNode():
    global serializedData, Sigma, V

    # Create an algorithm to compute SVD on the master node
    algorithm = svd.Distributed(step2Master)

    for i in range(nBlocks):
        # Deserialize partial results from step 1
        dataArch = OutputDataArchive(serializedData[i])

        dataForStep2FromStep1 = DataCollection()
        dataForStep2FromStep1.deserialize(dataArch)

        algorithm.input.add(svd.inputOfStep2FromStep1, i,
                            dataForStep2FromStep1)

    # Compute SVD
    # DistributedPartialResult class from svd
    pres = algorithm.compute()

    inputForStep3FromStep2 = pres.getCollection(svd.outputOfStep2ForStep3)

    for i in range(nBlocks):
        # Serialize partial results to transfer to local nodes for step 3
        dataArch = InputDataArchive()
        inputForStep3FromStep2[i].serialize(dataArch)
        length = dataArch.getSizeOfArchive()

        serializedData[i] = np.empty(length, dtype=np.uint8)
        dataArch.copyArchiveToArray(serializedData[i])

    # Retrieve the final result (Result class from svd)
    res = algorithm.getResult()

    Sigma = res.get(svd.singularValues)
    V = res.get(svd.rightSingularMatrix)
Example #19

    dataArch = InputDataArchive()
    pres.serialize(dataArch)

    nodeResults = dataArch.getArchiveAsArray()

    # Transfer partial results to step 2 on the root node
    serializedData = comm.gather(nodeResults)

    if rankId == MPI_ROOT:
        # Create an algorithm to compute low order moments on the master node
        masterAlgorithm = low_order_moments.Distributed(
            step2Master, method=low_order_moments.fastCSR)

        for i in range(nBlocks):
            # Deserialize partial results from step 1
            dataArch = OutputDataArchive(serializedData[i])

            dataForStep2FromStep1 = low_order_moments.PartialResult()

            dataForStep2FromStep1.deserialize(dataArch)

            # Set local partial results as input for the master-node algorithm
            masterAlgorithm.input.add(low_order_moments.partialResults,
                                      dataForStep2FromStep1)

        # Merge and finalizeCompute low order moments on the master node
        masterAlgorithm.compute()
        res = masterAlgorithm.finalizeCompute()

        # Print the results
        printNumericTable(res.get(low_order_moments.minimum), "Minimum:")
Example #20
    perNodeArchLength = dataArch.getSizeOfArchive()

    nodeResults = dataArch.getArchiveAsArray()

    # Transfer partial results to step 2 on the root node
    data = comm.gather(nodeResults, root=MPI_ROOT)

    if rankId == MPI_ROOT:

        # Create an algorithm to compute a variance-covariance matrix on the master node
        masterAlgorithm = covariance.Distributed(step2Master)

        for i in range(nBlocks):

            # Deserialize partial results from step 1
            dataArch = OutputDataArchive(data[i])

            dataForStep2FromStep1 = covariance.PartialResult()

            dataForStep2FromStep1.deserialize(dataArch)

            # Set local partial results as input for the master-node algorithm
            masterAlgorithm.input.add(covariance.partialResults, dataForStep2FromStep1)

        # Merge and finalizeCompute a dense variance-covariance matrix on the master node
        masterAlgorithm.compute()
        result = masterAlgorithm.finalizeCompute()

        # Print the results
        printNumericTable(result.get(covariance.covariance), "Covariance matrix:")
        printNumericTable(result.get(covariance.mean),       "Mean vector:")
Example #21
def trainModel():
    global trainingResult
    nodeResults = []
    # Create an algorithm object to build the final Naive Bayes model on the master node
    masterAlgorithm = training.Distributed_Step2MasterFloat64DefaultDense(nClasses)
    for filenameIndex in range(rankId, len(trainDatasetFileNames), comm_size):
        # Initialize FileDataSource to retrieve the input data from a .csv file
        #print("The worker with rank %d will read %s." % (rankId, trainDatasetFileNames[filenameIndex]))
        trainDataSource = FileDataSource(trainDatasetFileNames[filenameIndex],
                                         DataSourceIface.notAllocateNumericTable,
                                         DataSourceIface.doDictionaryFromContext)

        # Create Numeric Tables for training data and labels
        trainData = HomogenNumericTable(nFeatures, 0, NumericTableIface.notAllocate)
        trainDependentVariables = HomogenNumericTable(1, 0, NumericTableIface.notAllocate)
        mergedData = MergedNumericTable(trainData, trainDependentVariables)

        # Retrieve the data from the input file
        trainDataSource.loadDataBlock(mergedData)

        # Create an algorithm object to train the Naive Bayes model based on the local-node data
        localAlgorithm = training.Distributed_Step1LocalFloat64DefaultDense(nClasses)

        # Pass a training data set and dependent values to the algorithm
        localAlgorithm.input.set(classifier.training.data, trainData)
        localAlgorithm.input.set(classifier.training.labels, trainDependentVariables)

        # Train the Naive Bayes model on local nodes
        pres = localAlgorithm.compute()
        # Serialize partial results (only needed by the commented-out
        # per-block gather path below; the partial model is added directly)
        dataArch = InputDataArchive()
        pres.serialize(dataArch)

        masterAlgorithm.input.add(classifier.training.partialModels, pres)
        """
        nodeResults.append(dataArch.getArchiveAsArray().copy())
        localAlgorithm.clean()
        """
        mergedData.freeDataMemory()
        trainData.freeDataMemory()
        trainDependentVariables.freeDataMemory()
    # Compute the per-rank partial model, serialize it, and transfer
    # the partial results to step 2 on the root node
    pres = masterAlgorithm.compute()
    dataArch = InputDataArchive()
    pres.serialize(dataArch)
    nodeResults.append(dataArch.getArchiveAsArray().copy())
    serializedData = comm.gather(nodeResults)

    if rankId == MPI_ROOT:
        # Create an algorithm object to build the final Naive Bayes model on the master node
        masterAlgorithm = training.Distributed_Step2MasterFloat64DefaultDense(nClasses)

        for currentRank in range(len(serializedData)):
            for currentBlock in range(0, len(serializedData[currentRank])):
                # Deserialize partial results from step 1
                dataArch = OutputDataArchive(serializedData[currentRank][currentBlock])

                dataForStep2FromStep1 = classifier.training.PartialResult()
                dataForStep2FromStep1.deserialize(dataArch)

                # Set the local Naive Bayes model as input for the master-node algorithm
                masterAlgorithm.input.add(classifier.training.partialModels, dataForStep2FromStep1)

        # Merge and finalizeCompute the Naive Bayes model on the master node
        masterAlgorithm.compute()
        trainingResult = masterAlgorithm.finalizeCompute()
def trainModel(comm, rankId):

    trainingResult = None

    # Initialize FileDataSource to retrieve the input data from a .csv file
    trainDataSource = FileDataSource(trainDatasetFileNames[rankId],
                                     DataSourceIface.notAllocateNumericTable,
                                     DataSourceIface.doDictionaryFromContext)

    # Create Numeric Tables for training data and labels
    trainData = HomogenNumericTable(NUM_FEATURES, 0,
                                    NumericTableIface.doNotAllocate)
    trainDependentVariables = HomogenNumericTable(
        NUM_DEPENDENT_VARS, 0, NumericTableIface.doNotAllocate)
    mergedData = MergedNumericTable(trainData, trainDependentVariables)

    # Retrieve the data from the input file
    trainDataSource.loadDataBlock(mergedData)

    # Create an algorithm object to train the ridge regression model based on the local-node data
    localAlgorithm = training.Distributed(step1Local)

    # Pass a training data set and dependent values to the algorithm
    localAlgorithm.input.set(training.data, trainData)
    localAlgorithm.input.set(training.dependentVariables,
                             trainDependentVariables)

    # Train the ridge regression model on local nodes
    pres = localAlgorithm.compute()

    # Serialize partial results required by step 2
    dataArch = InputDataArchive()
    pres.serialize(dataArch)

    # Transfer partial results to step 2 on the root node
    nodeResults = dataArch.getArchiveAsArray()

    serializedData = comm.gather(nodeResults)

    if rankId == MPI_ROOT:

        # Create an algorithm object to build the final ridge regression model on the master node
        masterAlgorithm = training.Distributed(step2Master)

        for i in range(NUM_BLOCKS):

            # Deserialize partial results from step 1
            dataArch = OutputDataArchive(serializedData[i])
            dataForStep2FromStep1 = training.PartialResult()
            dataForStep2FromStep1.deserialize(dataArch)

            # Set the local ridge regression model as input for the master-node algorithm
            masterAlgorithm.input.add(training.partialModels,
                                      dataForStep2FromStep1)

        # Merge and finalizeCompute the ridge regression model on the master node
        masterAlgorithm.compute()
        trainingResult = masterAlgorithm.finalizeCompute()

        # Retrieve the algorithm results
        printNumericTable(
            trainingResult.get(training.model).getBeta(),
            "Ridge Regression coefficients:")

    return trainingResult
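
Once trainingResult is available on the root, the trained model can be scored; a minimal sketch, assuming pydaal's ridge_regression.prediction batch API and an existing testData table:

def testModel(trainingResult, testData):
    # Create an algorithm object for ridge regression prediction
    algorithm = prediction.Batch()

    # Pass the test data and the trained model to the algorithm
    algorithm.input.setTable(prediction.data, testData)
    algorithm.input.setModel(prediction.model,
                             trainingResult.get(training.model))

    # Compute and return the predictions
    return algorithm.compute().get(prediction.prediction)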