Пример #1
0
def getBlockOfCols(nT,*args):
    """Collect a contiguous range of columns of ``nT`` into a merged table.

    Args:
        nT: Source numeric table. It must provide ``getNumberOfRows``,
            ``getBlockOfColumnValues`` and ``releaseBlockOfColumnValues``.
        *args: ``args[0]`` is the first column index (inclusive) and
            ``args[1]`` is the end column index (exclusive).

    Returns:
        A ``MergedNumericTable`` to which one table per requested column has
        been added, in column order.

    Raises:
        IndexError: If fewer than two positional bounds are supplied.
    """
    # Imported locally so the function does not depend on which names the
    # enclosing module happens to import.
    from daal.data_management import HomogenNumericTable_Float64

    mnT = MergedNumericTable()
    nRows = nT.getNumberOfRows()  # loop-invariant, so read it once
    for idx in range(args[0], args[1]):
        doubleBlock = BlockDescriptor_Float64()
        nT.getBlockOfColumnValues(idx, 0, nRows, readOnly, doubleBlock)
        # The fetched column has to be added to the result; previously the
        # block was only released and the returned table stayed empty.
        mnT.addNumericTable(
            HomogenNumericTable_Float64(doubleBlock.getArray()))
        # A block must be released on the table it was obtained from (nT),
        # not on the merged result table.
        nT.releaseBlockOfColumnValues(doubleBlock)
    return mnT
def testModel(trainingResult):
    """Predict on the test data set and print predictions and ground truth.

    Args:
        trainingResult: Training result object; its ``training.model`` entry
            is passed to the prediction algorithm as the model.

    The function returns nothing; it prints the first 10 rows of the
    prediction table and of the ground-truth table via ``printNumericTable``.
    """
    # Data source reading the test .csv file
    source = FileDataSource(testDatasetFileName,
                            DataSourceIface.doAllocateNumericTable,
                            DataSourceIface.doDictionaryFromContext)

    # Two initially unallocated tables: one for the features, one for the
    # ground truth. They are merged so a single load fills both.
    features = HomogenNumericTable(
        NUM_FEATURES, 0, NumericTableIface.doNotAllocate)
    testGroundTruth = HomogenNumericTable(
        NUM_DEPENDENT_VARS, 0, NumericTableIface.doNotAllocate)
    merged = MergedNumericTable(features, testGroundTruth)
    source.loadDataBlock(merged)

    # Batch prediction algorithm fed with the features and the trained model
    predictor = prediction.Batch()
    predictor.input.setTable(prediction.data, features)
    trainedModel = trainingResult.get(training.model)
    predictor.input.setModel(prediction.model, trainedModel)

    predicted = predictor.compute().get(prediction.prediction)

    # Report predictions next to the expected values
    printNumericTable(
        predicted,
        "Ridge Regression prediction results: (first 10 rows):",
        10)
    printNumericTable(testGroundTruth, "Ground truth (first 10 rows):", 10)
Пример #3
0
def trainModel():
    """Train a model across MPI ranks and store it in ``trainingResult``.

    Each rank handles the files at indices ``rankId``, ``rankId + comm_size``,
    ... of ``trainDatasetFileNames``: it runs the step-1 local algorithm on
    every file and feeds the partial result to a rank-local step-2 master.
    The rank-local master result is serialized and gathered; the root rank
    deserializes every gathered result into a fresh step-2 master and
    finalizes it. Only the root rank assigns the global ``trainingResult``.
    """
    global trainingResult

    rankMaster = training.Distributed_Step2MasterFloat64NormEqDense()

    for fileIdx in range(rankId, len(trainDatasetFileNames), comm_size):
        source = FileDataSource(trainDatasetFileNames[fileIdx],
                                DataSourceIface.notAllocateNumericTable,
                                DataSourceIface.doDictionaryFromContext)

        # Unallocated feature / response tables, filled through the merged view
        features = HomogenNumericTable(
            nFeatures, 0, NumericTableIface.notAllocate)
        responses = HomogenNumericTable(
            nDependentVariables, 0, NumericTableIface.notAllocate)
        merged = MergedNumericTable(features, responses)
        source.loadDataBlock(merged)

        # Step 1: partial result for this file
        step1 = training.Distributed_Step1LocalFloat64NormEqDense()
        step1.input.set(training.data, features)
        step1.input.set(training.dependentVariables, responses)
        partial = step1.compute()
        rankMaster.input.add(training.partialModels, partial)

        # The input data is no longer needed once the partial result exists
        merged.freeDataMemory()
        features.freeDataMemory()
        responses.freeDataMemory()

    # Combine this rank's partial results and ship them to the root
    rankPartial = rankMaster.compute()
    archive = InputDataArchive()
    rankPartial.serialize(archive)
    gathered = comm.gather(archive.getArchiveAsArray())

    if rankId != MPI_ROOT:
        return

    print("Number of processes is %d." % (len(gathered)))
    rootMaster = training.Distributed_Step2MasterFloat64NormEqDense()

    for rank in range(comm_size):
        # Rebuild the partial result sent by this rank
        received = training.PartialResult()
        received.deserialize(OutputDataArchive(gathered[rank]))
        rootMaster.input.add(training.partialModels, received)

    rootMaster.compute()
    trainingResult = rootMaster.finalizeCompute()
Пример #4
0
def _resolveIndexRange(spec, upperBound, argName):
    """Translate a ``Rows``/``Columns`` argument into ``(first, last)``.

    Args:
        spec: ``'All'``, a list ``[first]`` or ``[first, last]``, or a single
            value used as the end index.
        upperBound: Size of the table along the axis being indexed.
        argName: Argument name quoted in the warning (``'Rows'`` or
            ``'Columns'``).

    Returns:
        ``(first, last)`` with ``last`` exclusive. ``last`` is clamped to
        ``upperBound`` only for the two-element list form; a bare end index
        is used as given.

    Raises:
        SystemExit: If ``spec`` is a string other than ``'All'``.
        IndexError: If ``spec`` is an empty list.
    """
    import warnings

    if isinstance(spec, str):
        if spec == 'All':
            return 0, upperBound
        warnings.warn(
            'Type error in "%s" arguments, Can be only int/list type'
            % argName)
        raise SystemExit
    if isinstance(spec, list):
        last = min(spec[1], upperBound) if len(spec) == 2 else upperBound
        return spec[0], last
    # Any other value is taken as the (exclusive) end index.
    return 0, spec


def getBlockOfNumericTable(nT, Rows='All', Columns='All'):
    """Extract a rectangular sub-block of ``nT`` as a new numeric table.

    Args:
        nT: Source numeric table.
        Rows: ``'All'``, an end index, or a list ``[first]`` /
            ``[first, last]`` selecting the rows (``last`` exclusive).
        Columns: Same forms as ``Rows``, selecting the columns.

    Returns:
        A ``HomogenNumericTable`` built from the selected rows and columns.

    Raises:
        SystemExit: If ``Rows`` or ``Columns`` is a string other than
            ``'All'`` (a warning is issued first).
    """
    # HomogenNumericTable is imported here as well, so the final conversion
    # does not rely on a module-level import being present.
    from daal.data_management import HomogenNumericTable, \
        HomogenNumericTable_Float64, MergedNumericTable, readOnly, \
        BlockDescriptor

    firstRow, lastRow = _resolveIndexRange(
        Rows, nT.getNumberOfRows(), 'Rows')
    nStartDim, nEndDim = _resolveIndexRange(
        Columns, nT.getNumberOfColumns(), 'Columns')
    nRows = lastRow - firstRow

    # Fetch each requested column restricted to the row range and collect
    # the columns in a merged table.
    mnT = MergedNumericTable()
    for idx in range(nStartDim, nEndDim):
        block = BlockDescriptor()
        nT.getBlockOfColumnValues(idx, firstRow, nRows, readOnly, block)
        mnT.addNumericTable(HomogenNumericTable_Float64(block.getArray()))
        nT.releaseBlockOfColumnValues(block)

    # Read the merged table back as a single block of rows and wrap it in
    # one homogeneous table.
    # NOTE(review): this block is not released, as in the original; whether
    # the returned table still references the block's memory is not visible
    # here -- confirm before adding a release call.
    block = BlockDescriptor()
    mnT.getBlockOfRows(0, mnT.getNumberOfRows(), readOnly, block)
    mnT = HomogenNumericTable(block.getArray())
    return mnT
# Directory with the batch example data sets, built relative to the directory
# that contains the running Python executable.
DATA_PREFIX = os.path.join(os.path.dirname(sys.executable), 'share',
                           'pydaal_examples', 'examples', 'data', 'batch')
trainDatasetFileName = os.path.join(DATA_PREFIX, 'adaboost_train.csv')

# Number of columns of the training-data table created below
nFeatures = 20

# Initialize FileDataSource<CSVFeatureManager> to retrieve the input data from a .csv file
trainDataSource = FileDataSource(trainDatasetFileName,
                                 DataSourceIface.notAllocateNumericTable,
                                 DataSourceIface.doDictionaryFromContext)

# Create Numeric Tables for training data and labels.
# Both start unallocated with 0 rows; the merged table combines them so that
# one load call below goes through a single table.
trainData = HomogenNumericTable(nFeatures, 0, NumericTableIface.doNotAllocate)
trainGroundTruth = HomogenNumericTable(1, 0, NumericTableIface.doNotAllocate)
mergedData = MergedNumericTable(trainData, trainGroundTruth)

# Retrieve the data from the input file
trainDataSource.loadDataBlock(mergedData)

#default keyword arguments
'''
GridSearch(<args>, tuned_parameters = None, score=None,
			best_score_criteria='high',
			create_best_training_model = False,
			save_model=False,nClasses=None )			
'''
# Hyperparameter grid: a list holding one dictionary that maps each
# hyperparameter name to the list of values to try.
adaB_params = [{'accuracyThreshold': [0.99, 0.1], 'maxIterations': [1, 5]}]
#Create GridSearch object
clf = GridSearch(adaB,
Пример #6
0
def trainModel():
    """Train the Naive Bayes model across MPI ranks.

    Each rank handles the files at indices ``rankId``, ``rankId + comm_size``,
    ... of ``trainDatasetFileNames``: it runs the step-1 local algorithm on
    every file and adds the partial result to a rank-local step-2 master.
    The rank-local master result is serialized and gathered; the root rank
    deserializes every gathered block into a fresh step-2 master and
    finalizes it. Only the root rank assigns the global ``trainingResult``.
    """
    global trainingResult
    nodeResults = []
    # Rank-local step-2 algorithm that combines this rank's partial results
    masterAlgorithm = training.Distributed_Step2MasterFloat64DefaultDense(nClasses)
    for filenameIndex in range(rankId, len(trainDatasetFileNames), comm_size):
        # Initialize FileDataSource to retrieve the input data from a .csv file
        trainDataSource = FileDataSource(trainDatasetFileNames[filenameIndex],
                                         DataSourceIface.notAllocateNumericTable,
                                         DataSourceIface.doDictionaryFromContext)

        # Create Numeric Tables for training data and labels
        trainData = HomogenNumericTable(nFeatures, 0, NumericTableIface.notAllocate)
        trainDependentVariables = HomogenNumericTable(1, 0, NumericTableIface.notAllocate)
        mergedData = MergedNumericTable(trainData, trainDependentVariables)

        # Retrieve the data from the input file
        trainDataSource.loadDataBlock(mergedData)

        # Create an algorithm object to train the Naive Bayes model based on the local-node data
        localAlgorithm = training.Distributed_Step1LocalFloat64DefaultDense(nClasses)

        # Pass a training data set and dependent values to the algorithm
        localAlgorithm.input.set(classifier.training.data, trainData)
        localAlgorithm.input.set(classifier.training.labels, trainDependentVariables)

        # Train the Naive Bayes model on this file. The partial result is
        # handed to the rank-local master directly; it used to be serialized
        # here as well, but that archive was never used.
        pres = localAlgorithm.compute()
        masterAlgorithm.input.add(classifier.training.partialModels, pres)

        # The input data is no longer needed once the partial result exists
        mergedData.freeDataMemory()
        trainData.freeDataMemory()
        trainDependentVariables.freeDataMemory()

    # Combine this rank's partial results, serialize them and transfer them
    # to the root node
    pres = masterAlgorithm.compute()
    dataArch = InputDataArchive()
    pres.serialize(dataArch)
    nodeResults.append(dataArch.getArchiveAsArray().copy())
    serializedData = comm.gather(nodeResults)

    if rankId == MPI_ROOT:
        # Create an algorithm object to build the final Naive Bayes model on the master node
        masterAlgorithm = training.Distributed_Step2MasterFloat64DefaultDense(nClasses)

        # serializedData holds one list of serialized blocks per rank
        for rankBlocks in serializedData:
            for serializedBlock in rankBlocks:
                # Deserialize the partial result sent by a rank
                dataArch = OutputDataArchive(serializedBlock)

                dataForStep2FromStep1 = classifier.training.PartialResult()
                dataForStep2FromStep1.deserialize(dataArch)

                # Set the partial Naive Bayes model as input for the master-node algorithm
                masterAlgorithm.input.add(classifier.training.partialModels, dataForStep2FromStep1)

        # Merge and finalizeCompute the Naive Bayes model on the master node
        masterAlgorithm.compute()
        trainingResult = masterAlgorithm.finalizeCompute()
def trainModel(comm, rankId):
    """Train the ridge regression model in distributed mode.

    Args:
        comm: Communicator whose ``gather`` collects the serialized partial
            results.
        rankId: Rank of the calling process; it also selects the entry of
            ``trainDatasetFileNames`` this process reads.

    Returns:
        The finalized training result on the rank equal to ``MPI_ROOT``,
        ``None`` on every other rank.
    """
    # Stays None everywhere except on the root rank
    trainingResult = None

    # Data source for this rank's .csv file
    source = FileDataSource(trainDatasetFileNames[rankId],
                            DataSourceIface.notAllocateNumericTable,
                            DataSourceIface.doDictionaryFromContext)

    # Unallocated feature / response tables, filled through the merged view
    features = HomogenNumericTable(
        NUM_FEATURES, 0, NumericTableIface.doNotAllocate)
    responses = HomogenNumericTable(
        NUM_DEPENDENT_VARS, 0, NumericTableIface.doNotAllocate)
    merged = MergedNumericTable(features, responses)
    source.loadDataBlock(merged)

    # Step 1: train on the local data
    step1 = training.Distributed(step1Local)
    step1.input.set(training.data, features)
    step1.input.set(training.dependentVariables, responses)
    partial = step1.compute()

    # Serialize the partial result and collect all of them
    archive = InputDataArchive()
    partial.serialize(archive)
    payload = archive.getArchiveAsArray()
    gathered = comm.gather(payload)

    if rankId != MPI_ROOT:
        return trainingResult

    # Step 2: build the final model from every block's partial result
    step2 = training.Distributed(step2Master)
    for blockIdx in range(NUM_BLOCKS):
        received = training.PartialResult()
        received.deserialize(OutputDataArchive(gathered[blockIdx]))
        step2.input.add(training.partialModels, received)

    step2.compute()
    trainingResult = step2.finalizeCompute()

    # Show the coefficients of the trained model
    coefficients = trainingResult.get(training.model).getBeta()
    printNumericTable(coefficients, "Ridge Regression coefficients:")

    return trainingResult
# Input data set parameters: training and pruning .csv files under DATA_PREFIX
trainDatasetFileName = os.path.join(DATA_PREFIX, 'decision_tree_train.csv')
pruneDatasetFileName = os.path.join(DATA_PREFIX, 'decision_tree_prune.csv')

# nFeatures is the column count of the data tables created below.
# NOTE(review): nClasses is not used in the visible part of this script;
# presumably it is passed to the algorithm further down -- confirm.
nFeatures = 5
nClasses = 5

# Initialize FileDataSource<CSVFeatureManager> to retrieve the training data from a .csv file
trainDataSource = FileDataSource(trainDatasetFileName,
                                 DataSourceIface.notAllocateNumericTable,
                                 DataSourceIface.doDictionaryFromContext)

# Create Numeric Tables for training data and labels.
# Both start unallocated with 0 rows and are combined into one merged table.
trainData = HomogenNumericTable(nFeatures, 0, NumericTableIface.notAllocate)
trainGroundTruth = HomogenNumericTable(1, 0, NumericTableIface.notAllocate)
mergedData = MergedNumericTable(trainData, trainGroundTruth)

# Retrieve the training data from the input file
trainDataSource.loadDataBlock(mergedData)

# Initialize FileDataSource<CSVFeatureManager> to retrieve the pruning data from a .csv file
pruneDataSource = FileDataSource(pruneDatasetFileName,
                                 DataSourceIface.notAllocateNumericTable,
                                 DataSourceIface.doDictionaryFromContext)

# Create Numeric Tables for pruning data and labels, set up the same way as
# the training tables above
pruneData = HomogenNumericTable(nFeatures, 0, NumericTableIface.notAllocate)
pruneGroundTruth = HomogenNumericTable(1, 0, NumericTableIface.notAllocate)
pruneMergedData = MergedNumericTable(pruneData, pruneGroundTruth)

# Retrieve the data from the input file
Пример #9
0
def testModel():
    """Evaluate the trained model as a thresholded classifier on the test files.

    For every file in ``testDatasetFileNames`` the model stored in the global
    ``trainingResult`` predicts a value per row. The ground truth is turned
    into classes with ``getClassVector`` at threshold 0, the predictions at
    each of 101 evenly spaced thresholds in [-25, 25], and the number of
    matching rows is accumulated per threshold. The function prints the best
    threshold and its accuracy, the class counts of the test set, and the
    accuracy at threshold 0. It returns nothing.
    """
    thresholdValues = np.linspace(-25.0, 25.0, num=101)
    numberOfCorrectlyClassifiedObjects = np.zeros(len(thresholdValues))
    numberOfObjectsInTestFiles = 0
    numberOfNonzeroObjectsInTestFiles = 0
    for testFileName in testDatasetFileNames:
        testDataSource = FileDataSource(
            testFileName,
            DataSourceIface.doAllocateNumericTable,
            DataSourceIface.doDictionaryFromContext)
        testData = HomogenNumericTable(nFeatures, 0,
                                       NumericTableIface.notAllocate)
        testGroundTruth = HomogenNumericTable(nDependentVariables, 0,
                                              NumericTableIface.notAllocate)
        mergedData = MergedNumericTable(testData, testGroundTruth)
        testDataSource.loadDataBlock(mergedData)

        algorithm = prediction.Batch_Float64DefaultDense()
        algorithm.input.setNumericTableInput(prediction.data, testData)
        algorithm.input.setModelInput(prediction.model,
                                      trainingResult.get(training.model))
        predictionResult = algorithm.compute()
        predictionTable = predictionResult.get(prediction.prediction)

        nRows = testGroundTruth.getNumberOfRows()
        block1 = BlockDescriptor()
        block2 = BlockDescriptor()
        testGroundTruth.getBlockOfRows(0, nRows, readOnly, block1)
        predictionTable.getBlockOfRows(0, nRows, readOnly, block2)
        y_true = getClassVector(block1.getArray(), 0.000000000000)
        predictionRegression = block2.getArray()
        for thresholdIndex, threshold in enumerate(thresholdValues):
            y_pred = getClassVector(predictionRegression, threshold)
            # normalize=False yields the count of matches, not a fraction
            numberOfCorrectlyClassifiedObjects[
                thresholdIndex] += accuracy_score(y_true,
                                                  y_pred,
                                                  normalize=False)
        numberOfObjectsInTestFiles += len(y_true)
        numberOfNonzeroObjectsInTestFiles += np.count_nonzero(y_true)

        # Release the row blocks on the tables they came from, after the
        # last use of their arrays and before the table memory is freed.
        testGroundTruth.releaseBlockOfRows(block1)
        predictionTable.releaseBlockOfRows(block2)
        mergedData.freeDataMemory()
        testData.freeDataMemory()
        testGroundTruth.freeDataMemory()

    # Without any test object every ratio below is undefined; report that
    # instead of failing on a division by zero or on formatting None.
    if numberOfObjectsInTestFiles == 0:
        print('Test set is empty. Classification accuracy is undefined.')
        return

    classificationAccuracyResult = (numberOfCorrectlyClassifiedObjects /
                                    numberOfObjectsInTestFiles)
    # argmax returns the first maximum, i.e. the lowest threshold among ties
    bestIndex = int(np.argmax(classificationAccuracyResult))
    best_threshold = thresholdValues[bestIndex]
    best_accuracy = classificationAccuracyResult[bestIndex]
    print('Best threshold:{:.4f}. Best accuracy:{:.4f}'.format(
        best_threshold, best_accuracy))
    print(
        'Test set. Number of objects of 0 class:{:.4f}.Number of objects of 1 class:{:.4f}. '
        'Frequency of 1 class:{:.4f}'.format(
            numberOfObjectsInTestFiles - numberOfNonzeroObjectsInTestFiles,
            numberOfNonzeroObjectsInTestFiles,
            numberOfNonzeroObjectsInTestFiles / numberOfObjectsInTestFiles))
    indexOfZeroThreshold = np.where(thresholdValues == 0.0)[0][0]
    print('Threshold=0. Classification accuracy:{:.4f}'.format(
        classificationAccuracyResult[indexOfZeroThreshold]))