def serializeTrainingResult(self):
    """Serialize ``self.trainingResult`` and return the bytes as a NumPy array.

    Returns:
        A one-dimensional ``np.ubyte`` array whose length is the archive size
        reported by the data archive.
    """
    archive = InputDataArchive()
    self.trainingResult.serialize(archive)

    # Size the destination from the archive, then copy the archive into it
    serialized = np.zeros(archive.getSizeOfArchive(), dtype=np.ubyte)
    archive.copyArchiveToArray(serialized)
    return serialized
def computeOnMasterNode():
    """Run step 2 of the distributed QR decomposition on the master node.

    Reads one serialized step-1 partial result per block from the global
    ``serializedData``, overwrites each entry with the serialized step-3 input
    for that block, and stores the R matrix in the global ``R``.
    """
    global R, serializedData

    masterAlgorithm = qr.Distributed(step2Master)

    # Feed every block's deserialized step-1 output into the master algorithm
    for block in range(nBlocks):
        inArchive = OutputDataArchive(serializedData[block])
        step1Output = DataCollection()
        step1Output.deserialize(inArchive)
        masterAlgorithm.input.add(qr.inputOfStep2FromStep1, block, step1Output)

    partialResult = masterAlgorithm.compute()
    step3Inputs = partialResult.getCollection(qr.outputOfStep2ForStep3)

    # Replace each block's entry with the bytes the local nodes need for step 3
    for block in range(nBlocks):
        outArchive = InputDataArchive()
        step3Inputs[block].serialize(outArchive)
        serializedData[block] = np.empty(outArchive.getSizeOfArchive(), dtype=np.uint8)
        outArchive.copyArchiveToArray(serializedData[block])

    result = masterAlgorithm.getResult()
    R = result.get(qr.matrixR)
def trainModel():
    """Train the model in two merge stages and store it on the root rank.

    Each rank runs the step-1 algorithm on the dataset files assigned to it,
    merges those partial models with its own step-2 algorithm, and gathers the
    serialized merge result. The root rank then merges the gathered results
    and assigns the finalized result to the global ``trainingResult``.
    """
    global trainingResult

    # Per-rank accumulator for the partial models of this rank's files
    rankMaster = training.Distributed_Step2MasterFloat64NormEqDense()

    # File indices are strided by comm_size, starting at this rank's id
    for fileIndex in range(rankId, len(trainDatasetFileNames), comm_size):
        source = FileDataSource(
            trainDatasetFileNames[fileIndex],
            DataSourceIface.notAllocateNumericTable,
            DataSourceIface.doDictionaryFromContext,
        )

        features = HomogenNumericTable(nFeatures, 0, NumericTableIface.notAllocate)
        targets = HomogenNumericTable(
            nDependentVariables, 0, NumericTableIface.notAllocate)
        merged = MergedNumericTable(features, targets)
        source.loadDataBlock(merged)

        stepOne = training.Distributed_Step1LocalFloat64NormEqDense()
        stepOne.input.set(training.data, features)
        stepOne.input.set(training.dependentVariables, targets)
        filePartial = stepOne.compute()
        rankMaster.input.add(training.partialModels, filePartial)

        # Release the loaded block before moving on to the next file
        merged.freeDataMemory()
        features.freeDataMemory()
        targets.freeDataMemory()

    # Merge this rank's partial models and ship the serialized result
    rankPartial = rankMaster.compute()
    rankArchive = InputDataArchive()
    rankPartial.serialize(rankArchive)
    nodeResults = rankArchive.getArchiveAsArray()

    serializedData = comm.gather(nodeResults)

    if rankId != MPI_ROOT:
        return

    print("Number of processes is %d." % (len(serializedData)))

    finalMaster = training.Distributed_Step2MasterFloat64NormEqDense()
    for node in range(comm_size):
        nodeArchive = OutputDataArchive(serializedData[node])
        nodePartial = training.PartialResult()
        nodePartial.deserialize(nodeArchive)
        finalMaster.input.add(training.partialModels, nodePartial)

    finalMaster.compute()
    trainingResult = finalMaster.finalizeCompute()
def serialize(
    self,
    data,
    fileName=None,
    useCompression=False,
):
    """Serialize *data* to bytes and either return them or save them to disk.

    Args:
        data: Object exposing ``serialize(archive)``.
        fileName: Destination path. Any final extension is dropped before the
            array is handed to ``np.save`` and before ``'.txt'`` is appended
            for the companion info file. When ``None`` nothing is written and
            a dictionary is returned instead.
        useCompression: When truthy, the byte array is passed through
            ``Kmeans.compress`` before being saved or returned.

    Returns:
        ``None`` when *fileName* is given; otherwise a dict with the keys
        ``'Array Object'`` (the byte array, compressed if requested) and
        ``'Object Information'`` (the type name followed by ``()``).
    """
    import os  # local import: the file's import block is not visible here

    # "<class 'pkg.mod.Name'>" -> "pkg.mod.Name()"
    buffArrObjName = (str(type(data)).split()[1].split('>')[0] + '()').replace("'", '')

    dataArch = InputDataArchive()
    data.serialize(dataArch)
    bufferArray = np.zeros(dataArch.getSizeOfArchive(), dtype=np.ubyte)
    dataArch.copyArchiveToArray(bufferArray)

    payload = Kmeans.compress(self, bufferArray) if useCompression else bufferArray

    if fileName is None:
        return {
            'Array Object': payload,
            'Object Information': buffArrObjName,
        }

    # splitext only strips an extension from the last path component, so a dot
    # in a directory name ("./out/model") or a leading dot (".model") no
    # longer truncates the path the way rsplit('.', 1) did.
    fileName = os.path.splitext(fileName)[0]
    np.save(fileName, payload)

    # Context manager guarantees the info file is closed even if write() fails
    with open(fileName + '.txt', 'w') as infoFile:
        infoFile.write(buffArrObjName)
def gatherPartialResultsFromNodes(partialResult, partialResults, partialResultArchLength,
                                  partialResultLocalBuffer, partialResultMasterBuffer):
    """Gather every node's serialized partial result and rebuild them on the root.

    Args:
        partialResult: Local partial result; serialized on every rank.
        partialResults: Indexable container; on the root rank entry ``node``
            is overwritten with the deserialized result of that node.
        partialResultArchLength: Archive size to use; ``0`` means "take it
            from the freshly serialized archive".
        partialResultLocalBuffer: Buffer receiving the local archive; a new
            ``np.uint8`` buffer is allocated when it has length 0.
        partialResultMasterBuffer: Only its length is inspected, and only on
            the root rank.

    The rebinding of the three buffer/length parameters is local to this
    function; callers see changes only through ``partialResults`` and through
    in-place writes into a non-empty ``partialResultLocalBuffer``.
    """
    localArchive = InputDataArchive()
    partialResult.serialize(localArchive)

    isRoot = rankId == MPI_ROOT

    if partialResultArchLength == 0:
        partialResultArchLength = localArchive.getSizeOfArchive()

    # Serialized data is of equal size on each node
    if isRoot and len(partialResultMasterBuffer) == 0:
        # NOTE(review): this allocation is never read afterwards - the gather
        # result below is what gets deserialized. Kept to preserve behavior.
        partialResultMasterBuffer = np.zeros(partialResultArchLength * nNodes, dtype=np.uint8)

    if len(partialResultLocalBuffer) == 0:
        partialResultLocalBuffer = np.zeros(partialResultArchLength, dtype=np.uint8)

    localArchive.copyArchiveToArray(partialResultLocalBuffer)

    # Transfer partial results to step 2 on the root node
    gathered = comm.gather(partialResultLocalBuffer)

    if not isRoot:
        return

    for node in range(nNodes):
        # Rebuild the step-1 partial result that node sent
        nodeArchive = OutputDataArchive(gathered[node])
        partialResults[node] = training.PartialResult()
        partialResults[node].deserialize(nodeArchive)
def serialize(self, data, fileName=None, useCompression=False):
    """Serialize *data* to bytes and either return them or save them to disk.

    Args:
        data: Object exposing ``serialize(archive)``.
        fileName: Destination path. Any final extension is dropped before the
            array is handed to ``np.save`` and before ``".txt"`` is appended
            for the companion info file. When ``None`` nothing is written and
            a dictionary is returned instead.
        useCompression: When truthy, the byte array is passed through
            ``self.compress`` before being saved or returned.

    Returns:
        ``None`` when *fileName* is given (a confirmation line is printed);
        otherwise a dict with the keys ``"Array Object"`` (the byte array,
        compressed if requested) and ``"Object Information"`` (the type name
        followed by ``()``).
    """
    import os  # local import: the file's import block is not visible here

    # "<class 'pkg.mod.Name'>" -> "pkg.mod.Name()"
    buffArrObjName = (str(type(data)).split()[1].split('>')[0] + "()").replace("'", '')

    dataArch = InputDataArchive()
    data.serialize(dataArch)
    bufferArray = np.zeros(dataArch.getSizeOfArchive(), dtype=np.ubyte)
    dataArch.copyArchiveToArray(bufferArray)

    payload = self.compress(bufferArray) if useCompression else bufferArray

    if fileName is None:
        return {
            "Array Object": payload,
            "Object Information": buffArrObjName,
        }

    # splitext only strips an extension from the last path component, so a dot
    # in a directory name ("./out/model") or a leading dot (".model") no
    # longer truncates the path the way rsplit(".", 1) did.
    fileName = os.path.splitext(fileName)[0]
    np.save(fileName, payload)

    infoPath = fileName + ".txt"
    # Context manager guarantees the info file is closed even if write() fails
    with open(infoPath, "w") as infoFile:
        infoFile.write(buffArrObjName)

    print("Data successfully serialized and saved as {} and {}".format(
        fileName, infoPath))
def broadcastWeightsAndBiasesToNodes(wb):
    """Broadcast the weights-and-biases table and return the received copy.

    Args:
        wb: Table to send; only serialized on the root rank.

    Returns:
        A ``HomogenNumericTable`` deserialized from the broadcast bytes, or an
        empty ``HomogenNumericTable()`` on the root rank when *wb* is falsy.
    """
    serialized = None

    if rankId == MPI_ROOT:
        if not wb:
            # Weights and biases table should be valid and not NULL on master.
            # NOTE(review): this return happens before comm.bcast - confirm
            # the other ranks are not left waiting in the broadcast.
            return HomogenNumericTable()

        rootArchive = InputDataArchive()
        wb.serialize(rootArchive)
        serialized = np.zeros(rootArchive.getSizeOfArchive(), dtype=np.uint8)
        rootArchive.copyArchiveToArray(serialized)

    # Every rank takes part in the broadcast and receives the serialized bytes
    serialized = comm.bcast(serialized)

    localArchive = OutputDataArchive(serialized)
    localTable = HomogenNumericTable(ntype=np.float32)
    localTable.deserialize(localArchive)
    return localTable
def computeOnMasterNode():
    """Run step 2 of the distributed SVD on the master node.

    Reads one serialized step-1 partial result per block from the global
    ``serializedData``, overwrites each entry with the serialized step-3 input
    for that block, and stores the singular values and the right singular
    matrix in the globals ``Sigma`` and ``V``.
    """
    global serializedData, Sigma, V

    masterAlgorithm = svd.Distributed(step2Master)

    # Feed every block's deserialized step-1 output into the master algorithm
    for block in range(nBlocks):
        inArchive = OutputDataArchive(serializedData[block])
        step1Output = DataCollection()
        step1Output.deserialize(inArchive)
        masterAlgorithm.input.add(svd.inputOfStep2FromStep1, block, step1Output)

    partialResult = masterAlgorithm.compute()
    step3Inputs = partialResult.getCollection(svd.outputOfStep2ForStep3)

    # Replace each block's entry with the bytes the local nodes need for step 3
    for block in range(nBlocks):
        outArchive = InputDataArchive()
        step3Inputs[block].serialize(outArchive)
        serializedData[block] = np.empty(outArchive.getSizeOfArchive(), dtype=np.uint8)
        outArchive.copyArchiveToArray(serializedData[block])

    result = masterAlgorithm.getResult()
    Sigma = result.get(svd.singularValues)
    V = result.get(svd.rightSingularMatrix)
def trainModel():
    """Train the Naive Bayes model in two merge stages and store it on the root.

    Each rank runs the step-1 algorithm on the dataset files assigned to it,
    merges those partial models with its own step-2 algorithm, and gathers a
    one-element list holding the serialized merge result. The root rank then
    merges everything gathered and assigns the finalized result to the global
    ``trainingResult``.
    """
    global trainingResult

    # Per-rank accumulator for the partial models of this rank's files
    rankMaster = training.Distributed_Step2MasterFloat64DefaultDense(nClasses)

    # File indices are strided by comm_size, starting at this rank's id
    for fileIndex in range(rankId, len(trainDatasetFileNames), comm_size):
        source = FileDataSource(trainDatasetFileNames[fileIndex],
                                DataSourceIface.notAllocateNumericTable,
                                DataSourceIface.doDictionaryFromContext)

        # Tables for the feature columns and the single label column
        features = HomogenNumericTable(nFeatures, 0, NumericTableIface.notAllocate)
        labels = HomogenNumericTable(1, 0, NumericTableIface.notAllocate)
        merged = MergedNumericTable(features, labels)
        source.loadDataBlock(merged)

        stepOne = training.Distributed_Step1LocalFloat64DefaultDense(nClasses)
        stepOne.input.set(classifier.training.data, features)
        stepOne.input.set(classifier.training.labels, labels)
        filePartial = stepOne.compute()

        # The per-file archive is filled but not forwarded: the code that
        # appended it to nodeResults (and called localAlgorithm.clean()) is
        # disabled in the original, so only the rank-level merge is gathered.
        fileArchive = InputDataArchive()
        filePartial.serialize(fileArchive)

        rankMaster.input.add(classifier.training.partialModels, filePartial)

        # Release the loaded block before moving on to the next file
        merged.freeDataMemory()
        features.freeDataMemory()
        labels.freeDataMemory()

    # Merge this rank's partial models and ship the serialized result
    rankPartial = rankMaster.compute()
    rankArchive = InputDataArchive()
    rankPartial.serialize(rankArchive)
    nodeResults = [rankArchive.getArchiveAsArray().copy()]

    serializedData = comm.gather(nodeResults)

    if rankId != MPI_ROOT:
        return

    # Build the final model from every block of every rank
    finalMaster = training.Distributed_Step2MasterFloat64DefaultDense(nClasses)
    for rankBlocks in serializedData:
        for blockBytes in rankBlocks:
            blockArchive = OutputDataArchive(blockBytes)
            blockPartial = classifier.training.PartialResult()
            blockPartial.deserialize(blockArchive)
            finalMaster.input.add(classifier.training.partialModels, blockPartial)

    finalMaster.compute()
    trainingResult = finalMaster.finalizeCompute()
def trainModel(comm, rankId):
    """Train the ridge regression model across ranks.

    Args:
        comm: Communicator whose ``gather`` collects the serialized partial
            results.
        rankId: Rank of the calling process; also selects the dataset file.

    Returns:
        The finalized training result on the root rank, ``None`` elsewhere.
    """
    # Load this rank's file into separate feature / dependent-variable tables
    source = FileDataSource(trainDatasetFileNames[rankId],
                            DataSourceIface.notAllocateNumericTable,
                            DataSourceIface.doDictionaryFromContext)
    features = HomogenNumericTable(NUM_FEATURES, 0, NumericTableIface.doNotAllocate)
    targets = HomogenNumericTable(
        NUM_DEPENDENT_VARS, 0, NumericTableIface.doNotAllocate)
    merged = MergedNumericTable(features, targets)
    source.loadDataBlock(merged)

    # Step 1: train on the local data
    stepOne = training.Distributed(step1Local)
    stepOne.input.set(training.data, features)
    stepOne.input.set(training.dependentVariables, targets)
    localPartial = stepOne.compute()

    # Serialize the local partial result and gather all of them
    localArchive = InputDataArchive()
    localPartial.serialize(localArchive)
    nodeResults = localArchive.getArchiveAsArray()
    serializedData = comm.gather(nodeResults)

    if rankId != MPI_ROOT:
        return None

    # Step 2: merge every block's partial model on the master node
    master = training.Distributed(step2Master)
    for block in range(NUM_BLOCKS):
        blockArchive = OutputDataArchive(serializedData[block])
        blockPartial = training.PartialResult()
        blockPartial.deserialize(blockArchive)
        master.input.add(training.partialModels, blockPartial)

    master.compute()
    trainingResult = master.finalizeCompute()

    # Retrieve the algorithm results
    printNumericTable(
        trainingResult.get(training.model).getBeta(),
        "Ridge Regression coefficients:")

    return trainingResult
) # Retrieve the input data dataSource.loadDataBlock() # Create an algorithm to compute a variance-covariance matrix on local nodes localAlgorithm = covariance.Distributed(step1Local) # Set the input data set to the algorithm localAlgorithm.input.set(covariance.data, dataSource.getNumericTable()) # Compute a variance-covariance matrix pres = localAlgorithm.compute() # Serialize partial results required by step 2 dataArch = InputDataArchive() pres.serialize(dataArch) perNodeArchLength = dataArch.getSizeOfArchive() nodeResults = dataArch.getArchiveAsArray() # Transfer partial results to step 2 on the root node data = comm_size.gather(nodeResults, MPI_ROOT) if rankId == MPI_ROOT: # Create an algorithm to compute a variance-covariance matrix on the master node masterAlgorithm = covariance.Distributed(step2Master) for i in range(nBlocks):
# Retrieve the input data from a file dataTable = createSparseTable(datasetFileNames[rankId]) # Create an algorithm to compute low order moments on local nodes localAlgorithm = low_order_moments.Distributed( step1Local, method=low_order_moments.fastCSR) # Set the input data set to the algorithm localAlgorithm.input.set(low_order_moments.data, dataTable) # Compute low order moments pres = localAlgorithm.compute() # Serialize partial results required by step 2 dataArch = InputDataArchive() pres.serialize(dataArch) nodeResults = dataArch.getArchiveAsArray() # Transfer partial results to step 2 on the root node serializedData = comm.gather(nodeResults) if rankId == MPI_ROOT: # Create an algorithm to compute low order moments on the master node masterAlgorithm = low_order_moments.Distributed( step2Master, method=low_order_moments.fastCSR) for i in range(nBlocks): # Deserialize partial results from step 1 dataArch = OutputDataArchive(serializedData[i])