def computestep1Local():
    """Run step 1 of the distributed SVD on this rank's local data block.

    Side effects (globals):
      - ``dataFromStep1ForStep3`` receives this rank's step-3 partial result.
      - ``serializedData`` receives the MPI-gathered, serialized step-2
        partial results (populated on the root rank by ``comm.gather``).
    """
    global serializedData, dataFromStep1ForStep3

    # Load this rank's .csv block into a numeric table.
    source = FileDataSource(datasetFileNames[rankId],
                            DataSourceIface.doAllocateNumericTable,
                            DataSourceIface.doDictionaryFromContext)
    source.loadDataBlock()

    # Local SVD (step 1) over the node's data.
    localSvd = svd.Distributed(step1Local)
    localSvd.input.set(svd.data, source.getNumericTable())

    # compute() yields the partial (online) result holding both outputs.
    partial = localSvd.compute()
    forStep2 = partial.get(svd.outputOfStep1ForStep2)
    dataFromStep1ForStep3 = partial.get(svd.outputOfStep1ForStep3)

    # Serialize the step-2 input and gather all blocks on the root rank.
    archive = InputDataArchive()
    forStep2.serialize(archive)
    serializedData = comm.gather(archive.getArchiveAsArray())
def trainModel():
    """Train a Naive Bayes model (fastCSR) across MPI ranks.

    Each rank trains a step-1 partial model on its own sparse data block;
    the serialized partials are gathered on MPI_ROOT, merged by the step-2
    master algorithm, and the final model is stored in the global
    ``trainingResult``.
    """
    global trainingResult

    # Sparse feature table for this rank's block.
    featuresTable = createSparseTable(trainDatasetFileNames[rankId])

    # Ground-truth labels come from a separate .csv file.
    labelsSource = FileDataSource(trainGroundTruthFileNames[rankId],
                                  DataSourceIface.doAllocateNumericTable,
                                  DataSourceIface.doDictionaryFromContext)
    labelsSource.loadDataBlock()

    # Step 1: train the local partial model on this rank's data.
    step1 = training.Distributed(step1Local, nClasses, method=training.fastCSR)
    step1.input.set(classifier.training.data, featuresTable)
    step1.input.set(classifier.training.labels, labelsSource.getNumericTable())
    partial = step1.compute()

    # Serialize the partial result and gather all of them on the root rank.
    archive = InputDataArchive()
    partial.serialize(archive)
    gathered = comm.gather(archive.getArchiveAsArray())

    if rankId == MPI_ROOT:
        # Step 2: merge every rank's partial model on the master node.
        step2 = training.Distributed(step2Master, nClasses, method=training.fastCSR)
        for blockIndex in range(nBlocks):
            # Deserialize one rank's partial result and feed it to the merger.
            blockArchive = OutputDataArchive(gathered[blockIndex])
            partialModel = training.PartialResult()
            partialModel.deserialize(blockArchive)
            step2.input.add(training.partialModels, partialModel)
        step2.compute()
        trainingResult = step2.finalizeCompute()
def trainModel():
    """Train a linear regression model (normal equations) in two stages.

    Stage 1: this rank trains a step-1 partial model per input file it owns
    (files are assigned round-robin by rank) and merges them locally.
    Stage 2: per-rank merged partials are gathered on MPI_ROOT and combined
    into the final model, stored in the global ``trainingResult``.
    """
    global trainingResult

    # Per-rank merger of the partial models from every file this rank reads.
    rankMerger = training.Distributed_Step2MasterFloat64NormEqDense()

    for fileIndex in range(rankId, len(trainDatasetFileNames), comm_size):
        source = FileDataSource(
            trainDatasetFileNames[fileIndex],
            DataSourceIface.notAllocateNumericTable,
            DataSourceIface.doDictionaryFromContext)

        # Tables share one merged layout: features followed by responses.
        features = HomogenNumericTable(nFeatures, 0, NumericTableIface.notAllocate)
        responses = HomogenNumericTable(
            nDependentVariables, 0, NumericTableIface.notAllocate)
        merged = MergedNumericTable(features, responses)
        source.loadDataBlock(merged)

        # Step 1: partial model for this single file.
        step1 = training.Distributed_Step1LocalFloat64NormEqDense()
        step1.input.set(training.data, features)
        step1.input.set(training.dependentVariables, responses)
        rankMerger.input.add(training.partialModels, step1.compute())

        # Release table memory before loading the next file.
        merged.freeDataMemory()
        features.freeDataMemory()
        responses.freeDataMemory()

    # Merge this rank's partials, then serialize and gather on the root.
    rankPartial = rankMerger.compute()
    archive = InputDataArchive()
    rankPartial.serialize(archive)
    gathered = comm.gather(archive.getArchiveAsArray())

    if rankId == MPI_ROOT:
        print("Number of processes is %d." % (len(gathered)))
        # Stage 2: merge the per-rank partials into the final model.
        finalMerger = training.Distributed_Step2MasterFloat64NormEqDense()
        for rank in range(comm_size):
            rankArchive = OutputDataArchive(gathered[rank])
            partialModel = training.PartialResult()
            partialModel.deserialize(rankArchive)
            finalMerger.input.add(training.partialModels, partialModel)
        finalMerger.compute()
        trainingResult = finalMerger.finalizeCompute()
def trainModel():
    """Train the Naive Bayes model (DefaultDense) in two reduction stages.

    Stage 1: this rank trains a step-1 partial model for every input file it
    owns (round-robin by rank) and merges them into one per-rank partial.
    Stage 2: per-rank partials are gathered on MPI_ROOT and merged into the
    final model, stored in the global ``trainingResult``.
    """
    global trainingResult
    nodeResults = []

    # Per-rank merger of the partial models from every file this rank reads.
    masterAlgorithm = training.Distributed_Step2MasterFloat64DefaultDense(nClasses)

    for filenameIndex in range(rankId, len(trainDatasetFileNames), comm_size):
        # Initialize FileDataSource to retrieve the input data from a .csv file
        trainDataSource = FileDataSource(trainDatasetFileNames[filenameIndex],
                                         DataSourceIface.notAllocateNumericTable,
                                         DataSourceIface.doDictionaryFromContext)

        # Create Numeric Tables for training data and labels
        trainData = HomogenNumericTable(nFeatures, 0, NumericTableIface.notAllocate)
        trainDependentVariables = HomogenNumericTable(1, 0, NumericTableIface.notAllocate)
        mergedData = MergedNumericTable(trainData, trainDependentVariables)

        # Retrieve the data from the input file
        trainDataSource.loadDataBlock(mergedData)

        # Train a step-1 partial model on this file's block.
        localAlgorithm = training.Distributed_Step1LocalFloat64DefaultDense(nClasses)
        localAlgorithm.input.set(classifier.training.data, trainData)
        localAlgorithm.input.set(classifier.training.labels, trainDependentVariables)
        pres = localAlgorithm.compute()

        # Feed the partial model straight into the per-rank merger.
        # (A leftover per-file serialization and a commented-out
        # append/clean path were removed here: the archive was never used.)
        masterAlgorithm.input.add(classifier.training.partialModels, pres)

        # Release table memory before loading the next file.
        mergedData.freeDataMemory()
        trainData.freeDataMemory()
        trainDependentVariables.freeDataMemory()

    # Merge this rank's partials and serialize the combined partial result.
    pres = masterAlgorithm.compute()
    dataArch = InputDataArchive()
    pres.serialize(dataArch)
    nodeResults.append(dataArch.getArchiveAsArray().copy())

    # Transfer partial results to step 2 on the root node
    serializedData = comm.gather(nodeResults)

    if rankId == MPI_ROOT:
        # Create an algorithm object to build the final Naive Bayes model on the master node
        masterAlgorithm = training.Distributed_Step2MasterFloat64DefaultDense(nClasses)
        for currentRank in range(len(serializedData)):
            for currentBlock in range(len(serializedData[currentRank])):
                # Deserialize partial results from step 1
                dataArch = OutputDataArchive(serializedData[currentRank][currentBlock])
                dataForStep2FromStep1 = classifier.training.PartialResult()
                dataForStep2FromStep1.deserialize(dataArch)
                # Set the local Naive Bayes model as input for the master-node algorithm
                masterAlgorithm.input.add(classifier.training.partialModels,
                                          dataForStep2FromStep1)
        # Merge and finalizeCompute the Naive Bayes model on the master node
        masterAlgorithm.compute()
        trainingResult = masterAlgorithm.finalizeCompute()
def trainModel(comm, rankId):
    """Train a distributed ridge regression model.

    Each rank trains a step-1 partial model on its own .csv block; the root
    rank merges all partial models (step 2) and prints the coefficients.

    Args:
        comm: MPI communicator used to gather serialized partial results.
        rankId: this process's MPI rank.

    Returns:
        The training result on MPI_ROOT; ``None`` on all other ranks.
    """
    trainingResult = None

    # Initialize FileDataSource to retrieve the input data from a .csv file
    trainDataSource = FileDataSource(trainDatasetFileNames[rankId],
                                     DataSourceIface.notAllocateNumericTable,
                                     DataSourceIface.doDictionaryFromContext)

    # Create Numeric Tables for training data and labels
    trainData = HomogenNumericTable(NUM_FEATURES, 0, NumericTableIface.doNotAllocate)
    trainDependentVariables = HomogenNumericTable(
        NUM_DEPENDENT_VARS, 0, NumericTableIface.doNotAllocate)
    mergedData = MergedNumericTable(trainData, trainDependentVariables)

    # Retrieve the data from the input file
    trainDataSource.loadDataBlock(mergedData)

    # Train the ridge regression model on the local node's data (step 1).
    localAlgorithm = training.Distributed(step1Local)
    localAlgorithm.input.set(training.data, trainData)
    localAlgorithm.input.set(training.dependentVariables, trainDependentVariables)
    pres = localAlgorithm.compute()

    # Serialize partial results required by step 2
    dataArch = InputDataArchive()
    pres.serialize(dataArch)
    nodeResults = dataArch.getArchiveAsArray()

    # Transfer partial results to step 2 on the root node
    serializedData = comm.gather(nodeResults)

    if rankId == MPI_ROOT:
        # Merge partial models from every rank into the final model (step 2).
        masterAlgorithm = training.Distributed(step2Master)
        for i in range(NUM_BLOCKS):
            # Deserialize partial results from step 1
            dataArch = OutputDataArchive(serializedData[i])
            dataForStep2FromStep1 = training.PartialResult()
            dataForStep2FromStep1.deserialize(dataArch)
            masterAlgorithm.input.add(training.partialModels, dataForStep2FromStep1)
        masterAlgorithm.compute()
        trainingResult = masterAlgorithm.finalizeCompute()

        # Print only on the root: on worker ranks trainingResult is still
        # None, so an unconditional print would raise AttributeError.
        printNumericTable(trainingResult.get(training.model).getBeta(),
                          "Ridge Regression coefficients:")

    return trainingResult
# Create an algorithm to compute a variance-covariance matrix on local nodes localAlgorithm = covariance.Distributed(step1Local) # Set the input data set to the algorithm localAlgorithm.input.set(covariance.data, dataSource.getNumericTable()) # Compute a variance-covariance matrix pres = localAlgorithm.compute() # Serialize partial results required by step 2 dataArch = InputDataArchive() pres.serialize(dataArch) perNodeArchLength = dataArch.getSizeOfArchive() nodeResults = dataArch.getArchiveAsArray() # Transfer partial results to step 2 on the root node data = comm_size.gather(nodeResults, MPI_ROOT) if rankId == MPI_ROOT: # Create an algorithm to compute a variance-covariance matrix on the master node masterAlgorithm = covariance.Distributed(step2Master) for i in range(nBlocks): # Deserialize partial results from step 1 dataArch = OutputDataArchive(data[i]) dataForStep2FromStep1 = covariance.PartialResult()