예제 #1
0
def trainModel():
    """Train on this rank's share of the training files and merge on the root.

    Each rank handles the files at indices ``rankId, rankId + comm_size, ...``
    of ``trainDatasetFileNames``.  For every file a step-1 local algorithm is
    run and its partial result is added to a rank-level step-2 master
    algorithm.  The rank-level partial result is then serialized and gathered
    through ``comm``.

    On the rank equal to ``MPI_ROOT`` every gathered archive is deserialized
    and fed to a fresh step-2 master algorithm, whose finalized result is
    stored in the global ``trainingResult``.  Other ranks return without
    assigning ``trainingResult``.
    """
    global trainingResult
    # Collects the partial results of all files processed by this rank.
    rankMaster = training.Distributed_Step2MasterFloat64NormEqDense()

    for fileIndex in range(rankId, len(trainDatasetFileNames), comm_size):
        fileName = trainDatasetFileNames[fileIndex]
        source = FileDataSource(fileName,
                                DataSourceIface.notAllocateNumericTable,
                                DataSourceIface.doDictionaryFromContext)

        # The file is read through a merged table built from the two tables
        # below; the algorithm then receives those two tables separately.
        features = HomogenNumericTable(
            nFeatures, 0, NumericTableIface.notAllocate)
        targets = HomogenNumericTable(
            nDependentVariables, 0, NumericTableIface.notAllocate)
        merged = MergedNumericTable(features, targets)
        source.loadDataBlock(merged)

        stepOne = training.Distributed_Step1LocalFloat64NormEqDense()
        stepOne.input.set(training.data, features)
        stepOne.input.set(training.dependentVariables, targets)
        filePartial = stepOne.compute()
        rankMaster.input.add(training.partialModels, filePartial)

        # Drop the loaded data before moving on to the next file.
        for table in (merged, features, targets):
            table.freeDataMemory()

    # Serialize this rank's combined partial result so it can be gathered.
    rankPartial = rankMaster.compute()
    outgoing = InputDataArchive()
    rankPartial.serialize(outgoing)
    serializedData = comm.gather(outgoing.getArchiveAsArray())

    if rankId != MPI_ROOT:
        return

    print("Number of processes is %d." % (len(serializedData)))
    rootMaster = training.Distributed_Step2MasterFloat64NormEqDense()

    for sourceRank in range(comm_size):
        incoming = OutputDataArchive(serializedData[sourceRank])
        receivedPartial = training.PartialResult()
        receivedPartial.deserialize(incoming)
        rootMaster.input.add(training.partialModels, receivedPartial)

    rootMaster.compute()
    trainingResult = rootMaster.finalizeCompute()
예제 #2
0
def trainModel():
    """Train the Naive Bayes model across MPI ranks and finalize it on the root.

    Each rank handles the files at indices ``rankId, rankId + comm_size, ...``
    of ``trainDatasetFileNames``.  For every file a step-1 local algorithm is
    trained and its partial result is added to a rank-level step-2 master
    algorithm.  The rank-level partial result is serialized, wrapped in a
    one-element list and gathered through ``comm``.

    On the rank equal to ``MPI_ROOT`` every gathered block is deserialized and
    merged by a fresh step-2 master algorithm; the finalized result is stored
    in the global ``trainingResult``.  Other ranks return without assigning
    ``trainingResult``.
    """
    global trainingResult
    # Rank-level algorithm that accumulates the partial models of every file
    # processed by this rank.
    rankMaster = training.Distributed_Step2MasterFloat64DefaultDense(nClasses)

    for filenameIndex in range(rankId, len(trainDatasetFileNames), comm_size):
        # Initialize FileDataSource to retrieve the input data from a .csv file
        trainDataSource = FileDataSource(trainDatasetFileNames[filenameIndex],
                                         DataSourceIface.notAllocateNumericTable,
                                         DataSourceIface.doDictionaryFromContext)

        # Create Numeric Tables for training data and labels
        trainData = HomogenNumericTable(nFeatures, 0, NumericTableIface.notAllocate)
        trainDependentVariables = HomogenNumericTable(1, 0, NumericTableIface.notAllocate)
        mergedData = MergedNumericTable(trainData, trainDependentVariables)

        # Retrieve the data from the input file
        trainDataSource.loadDataBlock(mergedData)

        # Train the Naive Bayes model on this file's data
        localAlgorithm = training.Distributed_Step1LocalFloat64DefaultDense(nClasses)
        localAlgorithm.input.set(classifier.training.data, trainData)
        localAlgorithm.input.set(classifier.training.labels, trainDependentVariables)
        partialModel = localAlgorithm.compute()

        # The per-file partial result is handed to the rank-level algorithm
        # directly, so it is not serialized here; only the combined rank-level
        # result below is.
        rankMaster.input.add(classifier.training.partialModels, partialModel)

        # Drop the loaded data before moving on to the next file.
        for table in (mergedData, trainData, trainDependentVariables):
            table.freeDataMemory()

    # NOTE(review): a rank that is assigned no files reaches compute() with no
    # partial models added; confirm the library tolerates that.
    rankPartial = rankMaster.compute()
    dataArch = InputDataArchive()
    rankPartial.serialize(dataArch)

    # Each rank contributes a list holding its single serialized block, so the
    # root receives a list of lists.
    nodeResults = [dataArch.getArchiveAsArray().copy()]
    serializedData = comm.gather(nodeResults)

    if rankId != MPI_ROOT:
        return

    # Create an algorithm object to build the final Naive Bayes model on the master node
    rootMaster = training.Distributed_Step2MasterFloat64DefaultDense(nClasses)

    for rankBlocks in serializedData:
        for serializedBlock in rankBlocks:
            # Deserialize a rank-level partial result
            blockArch = OutputDataArchive(serializedBlock)
            receivedPartial = classifier.training.PartialResult()
            receivedPartial.deserialize(blockArch)

            # Set the partial model as input for the master-node algorithm
            rootMaster.input.add(classifier.training.partialModels, receivedPartial)

    # Merge and finalize the Naive Bayes model on the master node
    rootMaster.compute()
    trainingResult = rootMaster.finalizeCompute()
예제 #3
0
def testModel():
    """Score the trained model on the test files over a sweep of thresholds.

    For every file in ``testDatasetFileNames`` the model stored in the global
    ``trainingResult`` produces predictions.  The ground truth is turned into
    a class vector with ``getClassVector`` at threshold 0, and the predictions
    are turned into class vectors at each of 101 thresholds evenly spaced in
    [-25, 25].  The ``accuracy_score(..., normalize=False)`` values are
    accumulated per threshold across files.

    Prints the threshold with the highest accuracy, the class balance of the
    test set, and the accuracy at threshold 0.  Returns nothing.
    """
    thresholdValues = np.linspace(-25.0, 25.0, num=101)
    numberOfCorrectlyClassifiedObjects = np.zeros(len(thresholdValues))
    numberOfObjectsInTestFiles = 0
    numberOfNonzeroObjectsInTestFiles = 0

    for testFileName in testDatasetFileNames:
        testDataSource = FileDataSource(
            testFileName,
            DataSourceIface.doAllocateNumericTable,
            DataSourceIface.doDictionaryFromContext)
        testData = HomogenNumericTable(nFeatures, 0,
                                       NumericTableIface.notAllocate)
        testGroundTruth = HomogenNumericTable(nDependentVariables, 0,
                                              NumericTableIface.notAllocate)
        mergedData = MergedNumericTable(testData, testGroundTruth)
        testDataSource.loadDataBlock(mergedData)

        algorithm = prediction.Batch_Float64DefaultDense()
        algorithm.input.setNumericTableInput(prediction.data, testData)
        algorithm.input.setModelInput(prediction.model,
                                      trainingResult.get(training.model))
        predictionResult = algorithm.compute()
        # Keep a handle on the prediction table so the block acquired from it
        # can be released on the same object.
        predictedTable = predictionResult.get(prediction.prediction)

        nRows = testGroundTruth.getNumberOfRows()
        truthBlock = BlockDescriptor()
        predictedBlock = BlockDescriptor()
        testGroundTruth.getBlockOfRows(0, nRows, readOnly, truthBlock)
        predictedTable.getBlockOfRows(0, nRows, readOnly, predictedBlock)
        try:
            y_true = getClassVector(truthBlock.getArray(), 0.000000000000)
            predictionRegression = predictedBlock.getArray()
            for thresholdIndex, threshold in enumerate(thresholdValues):
                y_pred = getClassVector(predictionRegression, threshold)
                numberOfCorrectlyClassifiedObjects[
                    thresholdIndex] += accuracy_score(y_true,
                                                      y_pred,
                                                      normalize=False)
            numberOfObjectsInTestFiles += len(y_true)
            numberOfNonzeroObjectsInTestFiles += np.count_nonzero(y_true)
        finally:
            # Every getBlockOfRows is paired with a release; this happens only
            # after the last use of the arrays read from the blocks and before
            # the tables' memory is freed.
            testGroundTruth.releaseBlockOfRows(truthBlock)
            predictedTable.releaseBlockOfRows(predictedBlock)

        for table in (mergedData, testData, testGroundTruth):
            table.freeDataMemory()

    # Per-threshold accuracy: accumulated counts divided by the total number
    # of test objects.
    classificationAccuracyResult = (numberOfCorrectlyClassifiedObjects /
                                    numberOfObjectsInTestFiles)

    # Strict '>' keeps the first threshold among ties.
    best_threshold = None
    best_accuracy = -1
    for threshold, accuracy in zip(thresholdValues,
                                   classificationAccuracyResult):
        if accuracy > best_accuracy:
            best_threshold = threshold
            best_accuracy = accuracy

    print('Best threshold:{:.4f}. Best accuracy:{:.4f}'.format(
        best_threshold, best_accuracy))
    print(
        'Test set. Number of objects of 0 class:{:.4f}.Number of objects of 1 class:{:.4f}. '
        'Frequency of 1 class:{:.4f}'.format(
            numberOfObjectsInTestFiles - numberOfNonzeroObjectsInTestFiles,
            numberOfNonzeroObjectsInTestFiles,
            numberOfNonzeroObjectsInTestFiles / numberOfObjectsInTestFiles))
    indexOfZeroThreshold = np.where(thresholdValues == 0.0)[0][0]
    print('Threshold=0. Classification accuracy:{:.4f}'.format(
        classificationAccuracyResult[indexOfZeroThreshold]))