def testModel(trainingResult):
    """Predict with the trained ridge regression model and print the outcome.

    Reads ``testDatasetFileName`` into a merged table made of a feature table
    and a ground-truth table, runs batch prediction with the model stored in
    ``trainingResult`` and prints the first 10 rows of both the predictions
    and the ground truth.
    """
    # CSV reader for the test set
    csvSource = FileDataSource(
        testDatasetFileName,
        DataSourceIface.doAllocateNumericTable,
        DataSourceIface.doDictionaryFromContext,
    )

    # Both tables are declared with zero rows and no allocation; they are
    # populated by loading into the merged table that wraps them.
    features = HomogenNumericTable(NUM_FEATURES, 0, NumericTableIface.doNotAllocate)
    expected = HomogenNumericTable(NUM_DEPENDENT_VARS, 0, NumericTableIface.doNotAllocate)
    csvSource.loadDataBlock(MergedNumericTable(features, expected))

    # Batch prediction: test features + trained model in, predictions out
    predictor = prediction.Batch()
    predictor.input.setTable(prediction.data, features)
    predictor.input.setModel(prediction.model, trainingResult.get(training.model))
    outcome = predictor.compute()

    printNumericTable(
        outcome.get(prediction.prediction),
        "Ridge Regression prediction results: (first 10 rows):",
        10,
    )
    printNumericTable(expected, "Ground truth (first 10 rows):", 10)
def readTensorFromCSV(datasetFileName):
    """Load a CSV file and return its contents as a float32 ``HomogenTensor``.

    The tensor is one-dimensional (``[rows]``) when the table has a single
    column and two-dimensional (``[rows, columns]``) otherwise.

    :param datasetFileName: path of the CSV file to read.
    :return: ``HomogenTensor`` built from a float32 copy of the table data.
    """
    dataSource = FileDataSource(
        datasetFileName,
        DataSourceIface.doAllocateNumericTable,
        DataSourceIface.doDictionaryFromContext,
    )
    dataSource.loadDataBlock()

    nt = dataSource.getNumericTable()
    nRows = nt.getNumberOfRows()
    nColumns = nt.getNumberOfColumns()

    block = BlockDescriptor()
    nt.getBlockOfRows(0, nRows, readOnly, block)
    try:
        # Copy the values out while the block is still held, so the result
        # never refers to the block after it has been released.
        tensorData = np.array(block.getArray().flatten(), copy=True, dtype=np.float32)
    finally:
        # Release even if the copy above fails.
        nt.releaseBlockOfRows(block)

    dims = [nRows]
    if nColumns > 1:
        dims.append(nColumns)
    tensorData.shape = dims

    return HomogenTensor(tensorData, ntype=np.float32)
def computestep1Local():
    """Run the first (local) step of distributed SVD on this rank's file.

    Stores the step-3 partial output in the global ``dataFromStep1ForStep3``
    and stores the result of ``comm.gather`` on the serialized step-2 partial
    output in the global ``serializedData``.
    """
    global serializedData, dataFromStep1ForStep3

    # Each rank reads the CSV file selected by its own rank id
    localSource = FileDataSource(
        datasetFileNames[rankId],
        DataSourceIface.doAllocateNumericTable,
        DataSourceIface.doDictionaryFromContext,
    )
    localSource.loadDataBlock()

    # Local SVD step over the loaded table
    localStep = svd.Distributed(step1Local)
    localStep.input.set(svd.data, localSource.getNumericTable())
    partial = localStep.compute()

    # Kept on this rank for step 3
    dataFromStep1ForStep3 = partial.get(svd.outputOfStep1ForStep3)

    # Serialized and gathered for step 2
    archive = InputDataArchive()
    partial.get(svd.outputOfStep1ForStep2).serialize(archive)
    serializedData = comm.gather(archive.getArchiveAsArray())
def trainModel():
    """Train a Naive Bayes model on sparse data across MPI ranks.

    Every rank trains a local partial model (``fastCSR`` method) from its own
    data/label files and the serialized partial results are gathered. The
    root rank then deserializes ``nBlocks`` of them, merges them and stores
    the final result in the global ``trainingResult``.
    """
    global trainingResult

    # Sparse training features for this rank
    sparseFeatures = createSparseTable(trainDatasetFileNames[rankId])

    # Labels for this rank come from a CSV file
    labelSource = FileDataSource(
        trainGroundTruthFileNames[rankId],
        DataSourceIface.doAllocateNumericTable,
        DataSourceIface.doDictionaryFromContext,
    )
    labelSource.loadDataBlock()

    # Step 1: local training
    localStep = training.Distributed(step1Local, nClasses, method=training.fastCSR)
    localStep.input.set(classifier.training.data, sparseFeatures)
    localStep.input.set(classifier.training.labels, labelSource.getNumericTable())
    localPartial = localStep.compute()

    # Serialize the local partial result and gather it
    outArchive = InputDataArchive()
    localPartial.serialize(outArchive)
    gathered = comm.gather(outArchive.getArchiveAsArray())

    # Only the root rank performs the merge
    if rankId != MPI_ROOT:
        return

    # Step 2: merge the partial models
    masterStep = training.Distributed(step2Master, nClasses, method=training.fastCSR)
    for blockIndex in range(nBlocks):
        inArchive = OutputDataArchive(gathered[blockIndex])
        partialModel = training.PartialResult()
        partialModel.deserialize(inArchive)
        masterStep.input.add(training.partialModels, partialModel)

    masterStep.compute()
    trainingResult = masterStep.finalizeCompute()
def getNumericTable(self, **kwargs):
    """Return the wrapped input as a DAAL numeric table.

    The conversion depends on ``self.informat``:

    * ``'numpy'``  - ``self.indata`` is wrapped in a ``HomogenNumericTable``.
    * ``'pandas'`` - the frame's underlying array is wrapped the same way.
    * ``'csv'``    - ``self.indata`` is used as a file path and loaded through
      a ``FileDataSource``.

    :param kwargs: accepted for interface compatibility; not used here.
    :return: the numeric table built from ``self.indata``.
    :raises ValueError: if ``self.informat`` is none of the values above.
    """
    if self.informat == 'numpy':
        return HomogenNumericTable(self.indata)

    if self.informat == 'pandas':
        # ``DataFrame.as_matrix()`` no longer exists in current pandas;
        # ``.values`` yields the same ndarray and works on old versions too.
        return HomogenNumericTable(self.indata.values)

    if self.informat == 'csv':
        # Fixed: the flag was misspelled ``doDictionaryFormContext``, which
        # raised AttributeError on every csv request.
        dataSource = FileDataSource(
            self.indata,
            DataSource.doAllocateNumericTable,
            DataSource.doDictionaryFromContext,
        )
        dataSource.loadDataBlock()
        return dataSource.getNumericTable()

    raise ValueError("Cannot identify input type.")
def printResults():
    """Print ground truth next to the Naive Bayes classification results.

    Loads the labels from ``testGroundTruthFileName`` and prints them side by
    side with the predictions stored in the global ``predictionResult``.
    """
    truthSource = FileDataSource(
        testGroundTruthFileName,
        DataSourceIface.doAllocateNumericTable,
        DataSourceIface.doDictionaryFromContext,
    )
    truthSource.loadDataBlock()

    expected = truthSource.getNumericTable()
    predicted = predictionResult.get(classifier.prediction.prediction)

    printNumericTables(
        expected,
        predicted,
        "Ground truth",
        "Classification results",
        "NaiveBayes classification results (first 20 observations):",
        20,
        interval=15,
        flt64=False,
    )
def trainModel():
    """Train a regression model (normal equations) over files spread on ranks.

    Each rank processes every ``comm_size``-th training file starting at its
    own rank id, feeds the per-file partial results into a rank-local master
    algorithm and serializes that rank-level partial result. The serialized
    results are gathered; the root rank merges them and stores the final
    result in the global ``trainingResult``.
    """
    global trainingResult

    # Rank-local merge of the per-file partial results
    rankMaster = training.Distributed_Step2MasterFloat64NormEqDense()

    for fileIndex in range(rankId, len(trainDatasetFileNames), comm_size):
        csvSource = FileDataSource(
            trainDatasetFileNames[fileIndex],
            DataSourceIface.notAllocateNumericTable,
            DataSourceIface.doDictionaryFromContext,
        )

        # Feature / dependent-variable tables filled through a merged table
        features = HomogenNumericTable(nFeatures, 0, NumericTableIface.notAllocate)
        targets = HomogenNumericTable(nDependentVariables, 0, NumericTableIface.notAllocate)
        merged = MergedNumericTable(features, targets)
        csvSource.loadDataBlock(merged)

        # Step 1 on this file
        fileStep = training.Distributed_Step1LocalFloat64NormEqDense()
        fileStep.input.set(training.data, features)
        fileStep.input.set(training.dependentVariables, targets)
        rankMaster.input.add(training.partialModels, fileStep.compute())

        # Free the table memory before moving on to the next file
        merged.freeDataMemory()
        features.freeDataMemory()
        targets.freeDataMemory()

    # Serialize the rank-level partial result and gather it
    rankPartial = rankMaster.compute()
    outArchive = InputDataArchive()
    rankPartial.serialize(outArchive)
    gathered = comm.gather(outArchive.getArchiveAsArray())

    if rankId != MPI_ROOT:
        return

    print("Number of processes is %d." % (len(gathered)))

    # Final merge over one partial result per rank
    rootMaster = training.Distributed_Step2MasterFloat64NormEqDense()
    for rank in range(comm_size):
        inArchive = OutputDataArchive(gathered[rank])
        partialModel = training.PartialResult()
        partialModel.deserialize(inArchive)
        rootMaster.input.add(training.partialModels, partialModel)

    rootMaster.compute()
    trainingResult = rootMaster.finalizeCompute()
def testModel():
    """Classify the test set with the trained Naive Bayes model.

    Reads ``testDatasetFileName``, runs batch prediction with the model held
    by the global ``trainingResult`` and stores the outcome in the global
    ``predictionResult``.
    """
    global predictionResult

    # Load the test observations from CSV
    csvSource = FileDataSource(
        testDatasetFileName,
        DataSourceIface.doAllocateNumericTable,
        DataSourceIface.doDictionaryFromContext,
    )
    csvSource.loadDataBlock()

    # Batch predictor fed with the test table and the trained model
    predictor = prediction.Batch(nClasses)
    predictor.input.setTable(classifier.prediction.data, csvSource.getNumericTable())
    trainedModel = trainingResult.get(classifier.training.model)
    predictor.input.setModel(classifier.prediction.model, trainedModel)

    predictionResult = predictor.compute()
def getNumericTable(self, **kwargs):
    """Return the wrapped heterogeneous input as an ``AOSNumericTable``.

    The conversion depends on ``self.informat``:

    * ``'numpy'``  - ``self.indata`` is wrapped directly.
    * ``'pandas'`` - the frame is first converted by
      ``self._getStructureArray``.
    * ``'csv'``    - an empty structured array of ``nRows`` records with
      ``dtype`` is allocated and filled from the file ``self.indata``.

    :param kwargs: for ``'csv'`` input both ``nRows`` (number of records) and
        ``dtype`` (structured dtype of a record) are required.
    :return: see the note on the csv branch; ``None`` for an unknown format.
    :raises ValueError: if ``nRows`` or ``dtype`` is missing for csv input.
    """
    if self.informat == 'numpy':
        return AOSNumericTable(self.indata)

    if self.informat == 'pandas':
        array = self._getStructureArray(self.indata, dtypes=self.indata.dtypes)
        return AOSNumericTable(array)

    if self.informat == 'csv':
        # Fixed: the check used ``and``, so supplying only one of the two
        # arguments slipped through and failed later with a bare KeyError.
        # Validation also runs before the data source is created now.
        if 'nRows' not in kwargs or 'dtype' not in kwargs:
            raise ValueError(
                "HeterogenousDaalData, for csv file, "
                "'nRows' and 'dtype' must be specified.")
        nRows = kwargs['nRows']
        dtype = kwargs['dtype']

        dataSource = FileDataSource(
            self.indata,
            DataSource.notAllocateNumericTable,
            DataSource.doDictionaryFromContext,
        )
        array = np.empty([nRows], dtype=dtype)
        nT = AOSNumericTable(array)
        # NOTE(review): this returns whatever loadDataBlock() returns, not
        # ``nT``. If loadDataBlock() reports a row count, callers receive a
        # number rather than the table - confirm and return ``nT`` if so.
        return dataSource.loadDataBlock(nRows, nT)

    return None
def getNumericTableFromCSV(csvFileName, Rows='All'):
    """Load a CSV file into a ``HomogenNumericTable``.

    :param csvFileName: path of the CSV file to read.
    :param Rows: ``'All'`` (default) to load the whole file, or a number of
        rows that is passed to ``loadDataBlock`` as the row limit.
    :return: the populated ``HomogenNumericTable``.
    :raises SystemError: if ``Rows`` is a string other than ``'All'``; a
        warning with the same text is issued first.
    """
    loadAll = isinstance(Rows, str)

    # Reject a bad string argument before any DAAL object is created.
    if loadAll and Rows != 'All':
        message = 'Type error in "Rows" arguments, Can be only int'
        warnings.warn(message)
        # Same exception type as before, now carrying the explanation.
        raise SystemError(message)

    dataSource = FileDataSource(
        csvFileName,
        DataSourceIface.doAllocateNumericTable,
        DataSourceIface.doDictionaryFromContext,
    )
    nT = HomogenNumericTable()
    if loadAll:
        dataSource.loadDataBlock(nT)
    else:
        dataSource.loadDataBlock(Rows, nT)
    return nT
def trainModel():
    """Train a dense Naive Bayes model over files distributed across ranks.

    Each rank processes every ``comm_size``-th training file starting at its
    own rank id, merges the per-file partial results in a rank-local master
    algorithm, then serializes that rank-level result. The serialized results
    are gathered; the root rank merges all of them and stores the final
    result in the global ``trainingResult``.
    """
    global trainingResult
    nodeResults = []

    # Rank-local merge of the per-file partial models.
    # NOTE(review): a rank that receives no file (rankId >= number of files)
    # still calls compute() on this algorithm with no inputs - confirm that
    # this is supported, or launch with no more ranks than files.
    masterAlgorithm = training.Distributed_Step2MasterFloat64DefaultDense(nClasses)

    for filenameIndex in range(rankId, len(trainDatasetFileNames), comm_size):
        # Initialize FileDataSource to retrieve the input data from a .csv file
        trainDataSource = FileDataSource(
            trainDatasetFileNames[filenameIndex],
            DataSourceIface.notAllocateNumericTable,
            DataSourceIface.doDictionaryFromContext,
        )

        # Create Numeric Tables for training data and labels
        trainData = HomogenNumericTable(nFeatures, 0, NumericTableIface.notAllocate)
        trainDependentVariables = HomogenNumericTable(1, 0, NumericTableIface.notAllocate)
        mergedData = MergedNumericTable(trainData, trainDependentVariables)

        # Retrieve the data from the input file
        trainDataSource.loadDataBlock(mergedData)

        # Train the Naive Bayes model on this file
        localAlgorithm = training.Distributed_Step1LocalFloat64DefaultDense(nClasses)
        localAlgorithm.input.set(classifier.training.data, trainData)
        localAlgorithm.input.set(classifier.training.labels, trainDependentVariables)
        pres = localAlgorithm.compute()

        # The per-file result goes straight into the rank-local merge. The
        # previous version also serialized it into an archive that was never
        # used; that wasted work has been removed.
        masterAlgorithm.input.add(classifier.training.partialModels, pres)

        mergedData.freeDataMemory()
        trainData.freeDataMemory()
        trainDependentVariables.freeDataMemory()

    # Serialize the rank-level partial result and send it to the root node
    pres = masterAlgorithm.compute()
    dataArch = InputDataArchive()
    pres.serialize(dataArch)
    nodeResults.append(dataArch.getArchiveAsArray().copy())
    serializedData = comm.gather(nodeResults)

    if rankId == MPI_ROOT:
        # Create an algorithm object to build the final Naive Bayes model on the master node
        masterAlgorithm = training.Distributed_Step2MasterFloat64DefaultDense(nClasses)

        # serializedData holds one list of archives per rank
        for rankResults in serializedData:
            for serializedBlock in rankResults:
                # Deserialize partial results from step 1
                dataArch = OutputDataArchive(serializedBlock)
                dataForStep2FromStep1 = classifier.training.PartialResult()
                dataForStep2FromStep1.deserialize(dataArch)

                # Set the local Naive Bayes model as input for the master-node algorithm
                masterAlgorithm.input.add(classifier.training.partialModels, dataForStep2FromStep1)

        # Merge and finalizeCompute the Naive Bayes model on the master node
        masterAlgorithm.compute()
        trainingResult = masterAlgorithm.finalizeCompute()
def getArrayFromNT(table, nrows=0):
    """Return the first ``nrows`` rows of a numeric table as a numpy array.

    :param table: numeric table to read from.
    :param nrows: number of rows to read; ``0`` (default) means all rows.
    :return: array with the requested rows as float64 values. It is a copy,
        so it stays valid after the block has been released.
    """
    bd = BlockDescriptor_Float64()
    if nrows == 0:
        nrows = table.getNumberOfRows()
    table.getBlockOfRows(0, nrows, readOnly, bd)
    try:
        # NOTE(review): getArray() presumably exposes the block's buffer
        # rather than a private copy - copying before the release keeps the
        # returned data independent of the block's lifetime.
        npa = bd.getArray().copy()
    finally:
        # Release the block even if reading it failed.
        table.releaseBlockOfRows(bd)
    return npa


if __name__ == "__main__":
    trainDatasetFileNames = getDatasetFileNames('news_train_dense_dist_data_*.csv')

    comm = MPI.COMM_WORLD
    comm_size = comm.Get_size()
    rankId = comm.Get_rank()
    print("I am a worker with rank %d on %s." % (rankId, MPI.Get_processor_name()))

    # Only the training phase is timed
    start = MPI.Wtime()
    trainModel()

    if rankId == MPI_ROOT:
        end = MPI.Wtime()
        testModel()

        # Compare the predictions with the ground-truth labels
        testGroundTruth = FileDataSource(
            testGroundTruthFileName,
            DataSourceIface.doAllocateNumericTable,
            DataSourceIface.doDictionaryFromContext,
        )
        testGroundTruth.loadDataBlock()
        a = getArrayFromNT(predictionResult.get(classifier.prediction.prediction))
        b = getArrayFromNT(testGroundTruth.getNumericTable())
        acc = metrics.accuracy_score(a, b, normalize=True)
        print('Accuracy: {:.4f}'.format(acc))
        print('Computational time: {:.2f}'.format(end - start))
def main():
    """Assign users to clusters with DAAL K-Means and write the cluster ids.

    Command-line arguments select the directory holding ``user_weights.csv``,
    the number of clusters and the output location
    ``<output_dir_base>/<sample_size>/<num_iters>``. Initial centroids are
    derived from ``<num_clusters>_centroids.csv`` in that output directory,
    and one cluster id per input row is written to
    ``<num_clusters>_user_cluster_ids`` there. Timings of each phase are
    printed to stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights_dir', required=True)
    parser.add_argument('--num_clusters', required=True, type=int)
    parser.add_argument('--num_iters', type=int, required=True)
    # NOTE(review): range(1, 100) accepts 1..99, while the metavar advertises
    # [1,100] - confirm which bound is intended.
    parser.add_argument(
        '--sample_size',
        type=int,
        choices=range(1, 100),
        metavar='INT[1,100]',
        required=True)
    parser.add_argument('--num_threads', type=int, default=1)
    parser.add_argument('--output_dir_base', required=True)
    args = parser.parse_args()

    weights_dir = args.weights_dir
    num_clusters = args.num_clusters
    num_iters = args.num_iters
    sample_size = args.sample_size
    set_num_threads(args.num_threads)

    clusters_dir = os.path.join(args.output_dir_base, str(sample_size), str(num_iters))
    if not os.path.exists(clusters_dir):
        os.makedirs(clusters_dir)

    datasetFileName = os.path.join(weights_dir, 'user_weights.csv')
    centroidsFileName = os.path.join(clusters_dir, '%d_centroids.csv' % num_clusters)

    centroidSource = FileDataSource(
        centroidsFileName,
        DataSourceIface.doAllocateNumericTable,
        DataSourceIface.doDictionaryFromContext)
    centroidSource.loadDataBlock()

    print(weights_dir)

    # "init time" covers loading the data set and setting up the initializer
    t0 = time.time()
    dataSource = FileDataSource(
        datasetFileName,
        DataSourceIface.doAllocateNumericTable,
        DataSourceIface.doDictionaryFromContext)
    dataSource.loadDataBlock()
    initAlg = init.Batch_Float32DeterministicDense(num_clusters)
    initAlg.input.set(init.data, centroidSource.getNumericTable())
    t1 = time.time()
    init_time = t1 - t0
    print('init time: %f' % init_time)

    t0 = time.time()
    res = initAlg.compute()
    t1 = time.time()
    centroid_time = t1 - t0
    print('centroid time: %f' % centroid_time)

    centroidsResult = res.get(init.centroids)
    algorithm = kmeans.Batch_Float32LloydDense(num_clusters, 0)
    algorithm.input.set(kmeans.data, dataSource.getNumericTable())
    algorithm.input.set(kmeans.inputCentroids, centroidsResult)

    t0 = time.time()
    res2 = algorithm.compute()
    t1 = time.time()
    cluster_time = t1 - t0
    print('cluster time: %f' % cluster_time)

    printNumericTable(
        res2.get(kmeans.centroids), 'First 10 dimensions of centroids:', 20, 10)

    # The previous version also acquired a block over the centroids table
    # that was never read or released; it has been dropped.
    assignments_table = res2.get(kmeans.assignments)
    assignment_num_rows = assignments_table.getNumberOfRows()
    assignments_block = BlockDescriptor()
    assignments_table.getBlockOfRows(0, assignment_num_rows, readOnly,
                                     assignments_block)
    try:
        # assignments numpy array; the first column of each row is written out
        assignments_array = assignments_block.getArray()

        t0 = time.time()
        user_to_clusters_fname = os.path.join(
            clusters_dir, '%d_user_cluster_ids' % num_clusters)
        with open(user_to_clusters_fname, 'w') as f:
            for assignment_row in assignments_array:
                print('%d' % int(assignment_row[0]), file=f)
        t1 = time.time()
    finally:
        # Fixed: the block used to be left acquired for the rest of the run.
        assignments_table.releaseBlockOfRows(assignments_block)

    output_time = t1 - t0
    print('output time: %f' % output_time)
# One input file per MPI rank: covcormoments_dense_1.csv .. covcormoments_dense_4.csv
datasetFileNames = [
    jp(DAAL_PREFIX, 'covcormoments_dense_%d.csv' % _fileNo) for _fileNo in range(1, 5)
]

if __name__ == '__main__':
    # Despite its name, comm_size holds the communicator object itself
    comm_size = MPI.COMM_WORLD
    rankId = comm_size.Get_rank()

    # Each rank reads the CSV file selected by its rank id
    dataSource = FileDataSource(
        datasetFileNames[rankId],
        DataSourceIface.doAllocateNumericTable,
        DataSourceIface.doDictionaryFromContext,
    )
    dataSource.loadDataBlock()

    # Local step of the distributed variance-covariance computation
    localAlgorithm = covariance.Distributed(step1Local)
    localAlgorithm.input.set(covariance.data, dataSource.getNumericTable())

    # Partial result of this rank
    pres = localAlgorithm.compute()
def testModel():
    """Evaluate the regression model as a binary classifier over thresholds.

    For every test file the regression predictions are turned into class
    vectors for 101 thresholds between -25 and 25 and compared with the
    ground-truth classes (ground truth thresholded at 0). Prints the best
    threshold and its accuracy, class counts of the test set and the
    accuracy at threshold 0. Uses the global ``trainingResult``.
    """
    thresholdValues = np.linspace(-25.0, 25.0, num=101)
    numberOfCorrectlyClassifiedObjects = np.zeros(len(thresholdValues))
    numberOfObjectsInTestFiles = 0
    numberOfNonzeroObjectsInTestFiles = 0

    for testFileName in testDatasetFileNames:
        testDataSource = FileDataSource(
            testFileName,
            DataSourceIface.doAllocateNumericTable,
            DataSourceIface.doDictionaryFromContext)

        # Feature / ground-truth tables filled through a merged table
        testData = HomogenNumericTable(nFeatures, 0, NumericTableIface.notAllocate)
        testGroundTruth = HomogenNumericTable(nDependentVariables, 0,
                                              NumericTableIface.notAllocate)
        mergedData = MergedNumericTable(testData, testGroundTruth)
        testDataSource.loadDataBlock(mergedData)

        algorithm = prediction.Batch_Float64DefaultDense()
        algorithm.input.setNumericTableInput(prediction.data, testData)
        algorithm.input.setModelInput(prediction.model,
                                      trainingResult.get(training.model))
        predictionResult = algorithm.compute()
        predictedTable = predictionResult.get(prediction.prediction)

        nRows = testGroundTruth.getNumberOfRows()
        truthBlock = BlockDescriptor()
        predictedBlock = BlockDescriptor()
        testGroundTruth.getBlockOfRows(0, nRows, readOnly, truthBlock)
        predictedTable.getBlockOfRows(0, nRows, readOnly, predictedBlock)
        try:
            y_true = getClassVector(truthBlock.getArray(), 0.0)
            predictionRegression = predictedBlock.getArray()

            # Count correct classifications for every candidate threshold
            for thresholdIndex, threshold in enumerate(thresholdValues):
                y_pred = getClassVector(predictionRegression, threshold)
                numberOfCorrectlyClassifiedObjects[thresholdIndex] += accuracy_score(
                    y_true, y_pred, normalize=False)

            numberOfObjectsInTestFiles += len(y_true)
            numberOfNonzeroObjectsInTestFiles += np.count_nonzero(y_true)
        finally:
            # Fixed: both blocks used to stay acquired when the tables'
            # memory was freed below.
            predictedTable.releaseBlockOfRows(predictedBlock)
            testGroundTruth.releaseBlockOfRows(truthBlock)

        mergedData.freeDataMemory()
        testData.freeDataMemory()
        testGroundTruth.freeDataMemory()

    # Accuracy per threshold; argmax picks the first maximum, matching a
    # left-to-right scan with a strict ">" comparison.
    classificationAccuracyResult = (
        numberOfCorrectlyClassifiedObjects / numberOfObjectsInTestFiles)
    bestIndex = int(np.argmax(classificationAccuracyResult))
    best_threshold = thresholdValues[bestIndex]
    best_accuracy = classificationAccuracyResult[bestIndex]

    print('Best threshold:{:.4f}. Best accuracy:{:.4f}'.format(
        best_threshold, best_accuracy))
    print(
        'Test set. Number of objects of 0 class:{:.4f}.Number of objects of 1 class:{:.4f}. '
        'Frequency of 1 class:{:.4f}'.format(
            numberOfObjectsInTestFiles - numberOfNonzeroObjectsInTestFiles,
            numberOfNonzeroObjectsInTestFiles,
            numberOfNonzeroObjectsInTestFiles / numberOfObjectsInTestFiles))

    indexOfZeroThreshold = np.where(thresholdValues == 0.0)[0][0]
    print('Threshold=0. Classification accuracy:{:.4f}'.format(
        classificationAccuracyResult[indexOfZeroThreshold]))
# Names of data files for every process: low1.csv .. low4.csv
datasetfilenames = [jp(DATA_PREFIX, 'low%d.csv' % _fileNo) for _fileNo in range(1, 5)]

if __name__ == "__main__":
    timed_total = time.process_time()

    comm = MPI.COMM_WORLD
    rankId = comm.Get_rank()

    # Every process reads its own file
    dataSource = FileDataSource(
        datasetfilenames[rankId],
        DataSourceIface.doAllocateNumericTable,
        DataSourceIface.doDictionaryFromContext,
    )
    dataSource.loadDataBlock()

    timed = time.process_time()

    # Local step of the distributed low order moments computation
    localAlgorithm = low_order_moments.Distributed(step=step1Local)
    localAlgorithm.input.set(low_order_moments.data, dataSource.getNumericTable())
    pres = localAlgorithm.compute()

    # Serialize the partial result and gather it for the master node
    dataArch = InputDataArchive()
    pres.serialize(dataArch)
    nodeResults = dataArch.getArchiveAsArray()
    serializedData = comm.gather(nodeResults)

    if rankId == MPI_ROOT:
        masterAlgorithm = low_order_moments.Distributed(step=step2Master)
import daal.algorithms.adaboost as adaB from daal.algorithms.adaboost import prediction, training from GridSearch import GridSearch from daal.data_management import (FileDataSource, DataSourceIface, HomogenNumericTable, MergedNumericTable, NumericTableIface) DATA_PREFIX = os.path.join(os.path.dirname(sys.executable), 'share', 'pydaal_examples', 'examples', 'data', 'batch') trainDatasetFileName = os.path.join(DATA_PREFIX, 'adaboost_train.csv') nFeatures = 20 # Initialize FileDataSource<CSVFeatureManager> to retrieve the input data from a .csv file trainDataSource = FileDataSource(trainDatasetFileName, DataSourceIface.notAllocateNumericTable, DataSourceIface.doDictionaryFromContext) # Create Numeric Tables for training data and labels trainData = HomogenNumericTable(nFeatures, 0, NumericTableIface.doNotAllocate) trainGroundTruth = HomogenNumericTable(1, 0, NumericTableIface.doNotAllocate) mergedData = MergedNumericTable(trainData, trainGroundTruth) # Retrieve the data from the input file trainDataSource.loadDataBlock(mergedData) #default keyword arguments ''' GridSearch(<args>, tuned_parameters = None, score=None, best_score_criteria='high', create_best_training_model = False,
#############################################################################
def getArrayFromNT(table, nrows=0):
    """Return the first ``nrows`` rows of a numeric table as a numpy array.

    :param table: numeric table to read from.
    :param nrows: number of rows to read; ``0`` (default) means all rows.
    :return: array with the requested rows as float64 values. It is a copy,
        so it stays valid after the block has been released.
    """
    bd = BlockDescriptor_Float64()
    if nrows == 0:
        nrows = table.getNumberOfRows()
    table.getBlockOfRows(0, nrows, readOnly, bd)
    try:
        # NOTE(review): getArray() presumably exposes the block's buffer
        # rather than a private copy - copying before the release keeps the
        # returned data independent of the block's lifetime.
        npa = bd.getArray().copy()
    finally:
        # Release the block even if reading it failed.
        table.releaseBlockOfRows(bd)
    return npa


# Initialize FileDataSource<CSVFeatureManager> to retrieve the test data from a .csv file
testDataSource = FileDataSource(
    testDatasetFileName,
    DataSourceIface.notAllocateNumericTable,
    DataSourceIface.doDictionaryFromContext,
)

# Create Numeric Tables for testing data and labels
testData = HomogenNumericTable(nFeatures, 0, NumericTableIface.notAllocate)
testGroundTruth = HomogenNumericTable(1, 0, NumericTableIface.notAllocate)
mergedData = MergedNumericTable(testData, testGroundTruth)

# Retrieve the data from input file
testDataSource.loadDataBlock(mergedData)

# Create an algorithm object to predict Naive Bayes values
algorithm_test = prediction.Batch(nClasses)

# Pass a testing data set and the trained model to the algorithm
algorithm_test.input.setTable(classifier.prediction.data, testData)
def trainModel(comm, rankId):
    """Train a ridge regression model across MPI ranks.

    Every rank trains a local partial model from its own CSV file and the
    serialized partial results are gathered through ``comm``. The root rank
    merges ``NUM_BLOCKS`` of them, prints the coefficients and returns the
    final training result; every other rank returns ``None``.
    """
    # CSV reader for this rank's training file
    csvSource = FileDataSource(
        trainDatasetFileNames[rankId],
        DataSourceIface.notAllocateNumericTable,
        DataSourceIface.doDictionaryFromContext,
    )

    # Feature / dependent-variable tables filled through a merged table
    features = HomogenNumericTable(NUM_FEATURES, 0, NumericTableIface.doNotAllocate)
    targets = HomogenNumericTable(NUM_DEPENDENT_VARS, 0, NumericTableIface.doNotAllocate)
    csvSource.loadDataBlock(MergedNumericTable(features, targets))

    # Step 1: local training
    localStep = training.Distributed(step1Local)
    localStep.input.set(training.data, features)
    localStep.input.set(training.dependentVariables, targets)
    localPartial = localStep.compute()

    # Serialize the local partial result and gather it
    outArchive = InputDataArchive()
    localPartial.serialize(outArchive)
    gathered = comm.gather(outArchive.getArchiveAsArray())

    # Only the root rank builds the final model
    if rankId != MPI_ROOT:
        return None

    # Step 2: merge the partial models
    masterStep = training.Distributed(step2Master)
    for blockIndex in range(NUM_BLOCKS):
        inArchive = OutputDataArchive(gathered[blockIndex])
        partialModel = training.PartialResult()
        partialModel.deserialize(inArchive)
        masterStep.input.add(training.partialModels, partialModel)

    masterStep.compute()
    finalResult = masterStep.finalizeCompute()

    printNumericTable(
        finalResult.get(training.model).getBeta(),
        "Ridge Regression coefficients:")
    return finalResult
f.write('{:.0f}}};'.format( right_items_array[i][len(right_items_array[i]) - 1])) f.write('{0:.3g}\n'.format(confidence_array[i])) f.close() start = time.time() for num in range(1): datasetFileName = 'daal_retail.csv' minSupport = 0.0015 minConfidence = 0.8 dataSource = FileDataSource(datasetFileName, DataSourceIface.doAllocateNumericTable, DataSourceIface.doDictionaryFromContext) dataSource.loadDataBlock() alg = Batch_Float64Apriori() alg.input.set(data, dataSource.getNumericTable()) alg.parameter.minSupport = minSupport alg.parameter.minConfidence = minConfidence # alg.parameter.itemsetsOrder = itemsetsSortedBySupport # alg.parameter.rulesOrder = rulesSortedByConfidence res = alg.compute() end = time.time() print('Performance comparison. Time: %s seconds' % (end - start)) nt1 = res.get(largeItemsets)
# One input file per MPI rank: covcormoments_dense_1.csv .. covcormoments_dense_4.csv
datasetFileNames = [
    jp(DATA_PREFIX, 'covcormoments_dense_%d.csv' % _fileNo) for _fileNo in range(1, 5)
]

if __name__ == "__main__":
    comm = MPI.COMM_WORLD
    rankId = comm.Get_rank()

    # Each rank reads the CSV file selected by its rank id
    dataSource = FileDataSource(
        datasetFileNames[rankId],
        DataSourceIface.doAllocateNumericTable,
        DataSourceIface.doDictionaryFromContext,
    )
    dataSource.loadDataBlock()

    # Local step of the distributed low order moments computation
    localAlgorithm = low_order_moments.Distributed(step=step1Local)
    localAlgorithm.input.set(low_order_moments.data, dataSource.getNumericTable())

    # Partial result of this rank
    pres = localAlgorithm.compute()