def testModel(trainingResult):

    # Initialize FileDataSource to retrieve the input data from a .csv file
    testDataSource = FileDataSource(testDatasetFileName,
                                    DataSourceIface.doAllocateNumericTable,
                                    DataSourceIface.doDictionaryFromContext)

    # Create Numeric Tables for testing data and ground truth values
    testData = HomogenNumericTable(NUM_FEATURES, 0,
                                   NumericTableIface.doNotAllocate)
    testGroundTruth = HomogenNumericTable(NUM_DEPENDENT_VARS, 0,
                                          NumericTableIface.doNotAllocate)
    mergedData = MergedNumericTable(testData, testGroundTruth)

    # Retrieve the data from an input file
    testDataSource.loadDataBlock(mergedData)

    # Create an algorithm object to predict values with the ridge regression model
    algorithm = prediction.Batch()

    # Pass a testing data set and the trained model to the algorithm
    algorithm.input.setTable(prediction.data, testData)
    algorithm.input.setModel(prediction.model,
                             trainingResult.get(training.model))

    # Predict values of ridge regression
    res = algorithm.compute()

    # Retrieve the algorithm results
    printNumericTable(res.get(prediction.prediction),
                      "Ridge Regression prediction results: (first 10 rows):",
                      10)
    printNumericTable(testGroundTruth, "Ground truth (first 10 rows):", 10)
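The snippet above assumes module-level imports and constants that the page does not show. A minimal, hypothetical preamble for PyDAAL's ridge regression batch API (file name and constants are placeholders; printNumericTable is the helper shipped with the PyDAAL examples' utils module):

from daal.algorithms.ridge_regression import prediction, training
from daal.data_management import (FileDataSource, DataSourceIface,
                                  HomogenNumericTable, MergedNumericTable,
                                  NumericTableIface)
from utils import printNumericTable

NUM_FEATURES = 10         # placeholder: feature columns in the test file
NUM_DEPENDENT_VARS = 2    # placeholder: dependent-variable columns
testDatasetFileName = 'ridge_regression_test.csv'  # placeholder path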
Example #2
File: __init__.py Project: iburyl/Labs
def readTensorFromCSV(datasetFileName):
    dataSource = FileDataSource(datasetFileName,
                                DataSourceIface.doAllocateNumericTable,
                                DataSourceIface.doDictionaryFromContext)
    dataSource.loadDataBlock()

    nt = dataSource.getNumericTable()
    size = nt.getNumberOfRows()
    block = BlockDescriptor()
    nt.getBlockOfRows(0, size, readOnly, block)
    blockData = block.getArray().flatten()

    dims = [size]
    if nt.getNumberOfColumns() > 1:
        dims.append(nt.getNumberOfColumns())

    # Copy the flattened block into a float32 array; reshaped to dims below
    tensorData = np.array(blockData, copy=True, dtype=np.float32)

    nt.releaseBlockOfRows(block)

    tensorData.shape = dims
    tensor = HomogenTensor(tensorData, ntype=np.float32)

    return tensor
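A brief usage sketch, assuming the imports the function depends on (numpy plus BlockDescriptor, HomogenTensor, and readOnly from daal.data_management); the file name is a placeholder:

import numpy as np
from daal.data_management import BlockDescriptor, HomogenTensor, readOnly

labelsTensor = readTensorFromCSV('train_labels.csv')  # placeholder path
print(labelsTensor.getNumberOfDimensions())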
Example #3
def computestep1Local():
    global serializedData, dataFromStep1ForStep3

    # Initialize FileDataSource to retrieve the input data from a .csv file
    dataSource = FileDataSource(datasetFileNames[rankId],
                                DataSourceIface.doAllocateNumericTable,
                                DataSourceIface.doDictionaryFromContext)

    # Retrieve the input data
    dataSource.loadDataBlock()

    # Create an algorithm to compute SVD on local nodes
    algorithm = svd.Distributed(step1Local)

    algorithm.input.set(svd.data, dataSource.getNumericTable())

    # Compute SVD; pres is an OnlinePartialResult from the svd module
    pres = algorithm.compute()

    dataFromStep1ForStep2 = pres.get(svd.outputOfStep1ForStep2)
    dataFromStep1ForStep3 = pres.get(svd.outputOfStep1ForStep3)

    # Serialize partial results required by step 2
    dataArch = InputDataArchive()
    dataFromStep1ForStep2.serialize(dataArch)

    nodeResults = dataArch.getArchiveAsArray()

    # Transfer partial results to step 2 on the root node
    serializedData = comm.gather(nodeResults)
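The root-node counterpart (step 2) is not shown in this excerpt; a sketch following the same pattern as the other distributed examples on this page, assuming the same MPI globals (comm, rankId, MPI_ROOT, nBlocks):

from daal.data_management import DataCollection, OutputDataArchive

if rankId == MPI_ROOT:
    masterAlgorithm = svd.Distributed(step2Master)
    for i in range(nBlocks):
        # Deserialize the partial result produced on node i in step 1
        dataArch = OutputDataArchive(serializedData[i])
        dataForStep2FromStep1 = DataCollection()
        dataForStep2FromStep1.deserialize(dataArch)
        masterAlgorithm.input.add(svd.inputOfStep2FromStep1, i,
                                  dataForStep2FromStep1)
    # pres2 holds the step-2 partial results (e.g. inputs for step 3)
    pres2 = masterAlgorithm.compute()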
Example #4
def trainModel():
    global trainingResult

    # Retrieve the input data from a .csv file
    trainDataTable = createSparseTable(trainDatasetFileNames[rankId])

    # Initialize FileDataSource to retrieve the input data from a .csv file
    trainLabelsSource = FileDataSource(trainGroundTruthFileNames[rankId],
                                       DataSourceIface.doAllocateNumericTable,
                                       DataSourceIface.doDictionaryFromContext)

    # Retrieve the data from input files
    trainLabelsSource.loadDataBlock()

    # Create an algorithm object to train the Naive Bayes model based on the local-node data
    localAlgorithm = training.Distributed(step1Local,
                                          nClasses,
                                          method=training.fastCSR)

    # Pass a training data set and dependent values to the algorithm
    localAlgorithm.input.set(classifier.training.data, trainDataTable)
    localAlgorithm.input.set(classifier.training.labels,
                             trainLabelsSource.getNumericTable())

    # Train the Naive Bayes model on local nodes
    pres = localAlgorithm.compute()

    # Serialize partial results required by step 2
    dataArch = InputDataArchive()
    pres.serialize(dataArch)

    nodeResults = dataArch.getArchiveAsArray()

    # Transfer partial results to step 2 on the root node
    serializedData = comm.gather(nodeResults)

    if rankId == MPI_ROOT:
        # Create an algorithm object to build the final Naive Bayes model on the master node
        masterAlgorithm = training.Distributed(step2Master,
                                               nClasses,
                                               method=training.fastCSR)

        for i in range(nBlocks):
            # Deserialize partial results from step 1
            dataArch = OutputDataArchive(serializedData[i])

            dataForStep2FromStep1 = training.PartialResult()
            dataForStep2FromStep1.deserialize(dataArch)

            # Set the local Naive Bayes model as input for the master-node algorithm
            masterAlgorithm.input.add(training.partialModels,
                                      dataForStep2FromStep1)

        # Merge partial results and finalize the Naive Bayes model on the master node
        masterAlgorithm.compute()
        trainingResult = masterAlgorithm.finalizeCompute()
Example #5
File: IInput.py Project: yxoos/h2o4gpu
    def getNumericTable(self, **kwargs):
        if self.informat == 'numpy':
            return HomogenNumericTable(self.indata)
        if self.informat == 'pandas':
            # as_matrix() is deprecated in newer pandas; to_numpy() is the modern equivalent
            array = self.indata.as_matrix()
            return HomogenNumericTable(array)
        if self.informat == 'csv':
            dataSource = \
                FileDataSource(self.indata,
                               DataSource.doAllocateNumericTable,
                               DataSource.doDictionaryFromContext)
            dataSource.loadDataBlock()
            return dataSource.getNumericTable()
        raise ValueError("Cannot identify input type.")
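A hedged usage sketch; the enclosing class is not shown in the excerpt, so the construction below is illustrative only:

import numpy as np

loader = HomogenDaalData()  # placeholder name for the enclosing class
loader.informat = 'numpy'
loader.indata = np.random.rand(100, 4)
nt = loader.getNumericTable()
print(nt.getNumberOfRows(), nt.getNumberOfColumns())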
Example #6
def printResults():

    testGroundTruth = FileDataSource(testGroundTruthFileName,
                                     DataSourceIface.doAllocateNumericTable,
                                     DataSourceIface.doDictionaryFromContext)
    testGroundTruth.loadDataBlock()

    printNumericTables(testGroundTruth.getNumericTable(),
                       predictionResult.get(classifier.prediction.prediction),
                       "Ground truth",
                       "Classification results",
                       "NaiveBayes classification results (first 20 observations):",
                       20,
                       interval=15,
                       flt64=False)
Example #7
def trainModel():
    global trainingResult
    masterAlgorithm = training.Distributed_Step2MasterFloat64NormEqDense()

    for filenameIndex in range(rankId, len(trainDatasetFileNames), comm_size):
        trainDataSource = FileDataSource(
            trainDatasetFileNames[filenameIndex],
            DataSourceIface.notAllocateNumericTable,
            DataSourceIface.doDictionaryFromContext)
        trainData = HomogenNumericTable(nFeatures, 0,
                                        NumericTableIface.notAllocate)
        trainDependentVariables = HomogenNumericTable(
            nDependentVariables, 0, NumericTableIface.notAllocate)
        mergedData = MergedNumericTable(trainData, trainDependentVariables)
        trainDataSource.loadDataBlock(mergedData)

        localAlgorithm = training.Distributed_Step1LocalFloat64NormEqDense()
        localAlgorithm.input.set(training.data, trainData)
        localAlgorithm.input.set(training.dependentVariables,
                                 trainDependentVariables)
        pres = localAlgorithm.compute()
        masterAlgorithm.input.add(training.partialModels, pres)

        mergedData.freeDataMemory()
        trainData.freeDataMemory()
        trainDependentVariables.freeDataMemory()

    pres = masterAlgorithm.compute()
    dataArch = InputDataArchive()
    pres.serialize(dataArch)
    nodeResults = dataArch.getArchiveAsArray()
    serializedData = comm.gather(nodeResults)

    if rankId == MPI_ROOT:
        print("Number of processes is %d." % (len(serializedData)))
        masterAlgorithm = training.Distributed_Step2MasterFloat64NormEqDense()

        for i in range(comm_size):
            dataArch = OutputDataArchive(serializedData[i])
            dataForStep2FromStep1 = training.PartialResult()
            dataForStep2FromStep1.deserialize(dataArch)
            masterAlgorithm.input.add(training.partialModels,
                                      dataForStep2FromStep1)
        masterAlgorithm.compute()
        trainingResult = masterAlgorithm.finalizeCompute()
Example #8
def testModel():
    global predictionResult

    # Initialize FileDataSource to retrieve the input data from a .csv file
    testDataSource = FileDataSource(testDatasetFileName,
                                    DataSourceIface.doAllocateNumericTable,
                                    DataSourceIface.doDictionaryFromContext)

    # Retrieve the data from an input file
    testDataSource.loadDataBlock()

    # Create an algorithm object to predict values of the Naive Bayes model
    algorithm = prediction.Batch(nClasses)

    # Pass a testing data set and the trained model to the algorithm
    algorithm.input.setTable(classifier.prediction.data,  testDataSource.getNumericTable())
    algorithm.input.setModel(classifier.prediction.model, trainingResult.get(classifier.training.model))

    # Predict values of the Naive Bayes model
    # Result class from classifier.prediction
    predictionResult = algorithm.compute()
Example #9
File: IInput.py Project: yxoos/h2o4gpu
    def getNumericTable(self, **kwargs):
        if self.informat == 'numpy':
            return AOSNumericTable(self.indata)
        if self.informat == 'pandas':
            array = self._getStructureArray(
                self.indata,
                dtypes=self.indata.dtypes)
            return AOSNumericTable(array)
        if self.informat == 'csv':
            dataSource = FileDataSource(
                self.indata,
                DataSource.notAllocateNumericTable,
                DataSource.doDictionaryFromContext)

            if 'nRows' not in kwargs or 'dtype' not in kwargs:
                raise ValueError("HeterogenousDaalData: for a csv file, "
                                 "'nRows' and 'dtype' must be specified.")
            nRows = kwargs['nRows']
            dtype = kwargs['dtype']
            array = np.empty([nRows, ], dtype=dtype)
            nT = AOSNumericTable(array)
            dataSource.loadDataBlock(nRows, nT)
            return nT
        return None
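For the 'csv' branch both nRows and dtype are required; a sketch with a hypothetical structured dtype matching a two-column file ('loader' is the same kind of placeholder as above):

import numpy as np

rowType = np.dtype([('x', np.float64), ('label', np.intc)])  # hypothetical layout
nt = loader.getNumericTable(nRows=100, dtype=rowType)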
Example #10
def getNumericTableFromCSV(csvFileName, Rows='All'):
    dataSource = FileDataSource(csvFileName,
                                DataSourceIface.doAllocateNumericTable,
                                DataSourceIface.doDictionaryFromContext)
    nT = HomogenNumericTable()
    if isinstance(Rows, int):
        dataSource.loadDataBlock(Rows, nT)
    elif Rows == 'All':
        dataSource.loadDataBlock(nT)
    else:
        raise TypeError("'Rows' must be an int or the string 'All'")
    return nT
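Usage sketch (the path is a placeholder): load every row, or only the first 100.

ntAll = getNumericTableFromCSV('data.csv')             # all rows
ntHead = getNumericTableFromCSV('data.csv', Rows=100)  # first 100 rows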
Example #11
def trainModel():
    global trainingResult
    nodeResults = []
    # Create an algorithm object to build the final Naive Bayes model on the master node
    masterAlgorithm = training.Distributed_Step2MasterFloat64DefaultDense(nClasses)
    for filenameIndex in range(rankId, len(trainDatasetFileNames), comm_size):
        # Initialize FileDataSource to retrieve the input data from a .csv file
        #print("The worker with rank %d will read %s." % (rankId, trainDatasetFileNames[filenameIndex]))
        trainDataSource = FileDataSource(trainDatasetFileNames[filenameIndex],
                                         DataSourceIface.notAllocateNumericTable,
                                         DataSourceIface.doDictionaryFromContext)

        # Create Numeric Tables for training data and labels
        trainData = HomogenNumericTable(nFeatures, 0, NumericTableIface.notAllocate)
        trainDependentVariables = HomogenNumericTable(1, 0, NumericTableIface.notAllocate)
        mergedData = MergedNumericTable(trainData, trainDependentVariables)

        # Retrieve the data from the input file
        trainDataSource.loadDataBlock(mergedData)

        # Create an algorithm object to train the Naive Bayes model based on the local-node data
        localAlgorithm = training.Distributed_Step1LocalFloat64DefaultDense(nClasses)

        # Pass a training data set and dependent values to the algorithm
        localAlgorithm.input.set(classifier.training.data, trainData)
        localAlgorithm.input.set(classifier.training.labels, trainDependentVariables)

        # Train the Naive Bayes model on local nodes
        pres = localAlgorithm.compute()
        # Serialize partial results required by step 2
        dataArch = InputDataArchive()
        pres.serialize(dataArch)

        masterAlgorithm.input.add(classifier.training.partialModels, pres)
        """
        nodeResults.append(dataArch.getArchiveAsArray().copy())
        localAlgorithm.clean()
        """
        mergedData.freeDataMemory()
        trainData.freeDataMemory()
        trainDependentVariables.freeDataMemory()
    # Aggregate the local partial results on this node before gathering them on the root
    pres = masterAlgorithm.compute()
    dataArch = InputDataArchive()
    pres.serialize(dataArch)
    nodeResults.append(dataArch.getArchiveAsArray().copy())
    serializedData = comm.gather(nodeResults)

    if rankId == MPI_ROOT:
        # Create an algorithm object to build the final Naive Bayes model on the master node
        masterAlgorithm = training.Distributed_Step2MasterFloat64DefaultDense(nClasses)

        for currentRank in range(len(serializedData)):
            for currentBlock in range(0, len(serializedData[currentRank])):
                # Deserialize partial results from step 1
                dataArch = OutputDataArchive(serializedData[currentRank][currentBlock])

                dataForStep2FromStep1 = classifier.training.PartialResult()
                dataForStep2FromStep1.deserialize(dataArch)

                # Set the local Naive Bayes model as input for the master-node algorithm
                masterAlgorithm.input.add(classifier.training.partialModels, dataForStep2FromStep1)

        # Merge partial results and finalize the Naive Bayes model on the master node
        masterAlgorithm.compute()
        trainingResult = masterAlgorithm.finalizeCompute()
Example #12
def getArrayFromNT(table, nrows=0):
    bd = BlockDescriptor_Float64()
    if nrows == 0:
        nrows = table.getNumberOfRows()
    table.getBlockOfRows(0, nrows, readOnly, bd)
    npa = bd.getArray()
    table.releaseBlockOfRows(bd)
    return npa

if __name__ == "__main__":
    trainDatasetFileNames = getDatasetFileNames('news_train_dense_dist_data_*.csv')
    comm = MPI.COMM_WORLD
    comm_size = comm.Get_size()
    rankId = comm.Get_rank()
    print("I am a worker with rank %d on %s." % (rankId, MPI.Get_processor_name()))
    start = MPI.Wtime()
    trainModel()
    if rankId == MPI_ROOT:
        end = MPI.Wtime()
        testModel()
        testGroundTruth = FileDataSource(testGroundTruthFileName,
                                         DataSourceIface.doAllocateNumericTable,
                                         DataSourceIface.doDictionaryFromContext)
        testGroundTruth.loadDataBlock()
        a = getArrayFromNT(predictionResult.get(classifier.prediction.prediction))
        b = getArrayFromNT(testGroundTruth.getNumericTable())
        acc = metrics.accuracy_score(a, b, normalize=True)
        print('Accuracy: {:.4f}'.format(acc))
        print('Computational time: {:.2f}'.format(end - start))

Example #13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights_dir', required=True)
    parser.add_argument('--num_clusters', required=True, type=int)
    parser.add_argument('--num_iters', type=int, required=True)
    parser.add_argument(
        '--sample_size',
        type=int,
        choices=range(1, 100),
        metavar='INT[1,100]',
        required=True)
    parser.add_argument('--num_threads', type=int, default=1)
    parser.add_argument('--output_dir_base', required=True)
    args = parser.parse_args()

    weights_dir = args.weights_dir
    num_clusters = args.num_clusters
    num_iters = args.num_iters
    sample_size = args.sample_size
    set_num_threads(args.num_threads)
    clusters_dir = os.path.join(args.output_dir_base,
                                str(sample_size), str(num_iters))
    os.makedirs(clusters_dir, exist_ok=True)

    datasetFileName = os.path.join(weights_dir, 'user_weights.csv')

    centroidsFileName = os.path.join(clusters_dir,
                                     '%d_centroids.csv' % num_clusters)

    centroidSource = FileDataSource(centroidsFileName,
                                    DataSourceIface.doAllocateNumericTable,
                                    DataSourceIface.doDictionaryFromContext)
    centroidSource.loadDataBlock()

    print(weights_dir)

    t0 = time.time()
    dataSource = FileDataSource(datasetFileName,
                                DataSourceIface.doAllocateNumericTable,
                                DataSourceIface.doDictionaryFromContext)

    dataSource.loadDataBlock()

    initAlg = init.Batch_Float32DeterministicDense(num_clusters)
    initAlg.input.set(init.data, centroidSource.getNumericTable())

    t1 = time.time()
    init_time = t1 - t0
    print('init time: %f' % init_time)

    t0 = time.time()
    res = initAlg.compute()
    t1 = time.time()
    centroid_time = t1 - t0
    print('centroid time: %f' % centroid_time)

    centroidsResult = res.get(init.centroids)

    # nIterations=0: the batch run assigns points to the supplied centroids without refining them
    algorithm = kmeans.Batch_Float32LloydDense(num_clusters, 0)
    algorithm.input.set(kmeans.data, dataSource.getNumericTable())
    algorithm.input.set(kmeans.inputCentroids, centroidsResult)

    t0 = time.time()
    res2 = algorithm.compute()
    t1 = time.time()
    cluster_time = t1 - t0
    print('cluster time: %f' % cluster_time)

    printNumericTable(
        res2.get(kmeans.centroids), 'First 10 dimensions of centroids:', 20,
        10)

    assignments_table = res2.get(kmeans.assignments)
    assignment_num_rows = assignments_table.getNumberOfRows()

    assignments_block = BlockDescriptor()
    assignments_table.getBlockOfRows(0, assignment_num_rows, readOnly,
                                     assignments_block)
    # assignments numpy array
    assignments_array = assignments_block.getArray()

    centroids_table = res2.get(kmeans.centroids)
    centroids_num_rows = centroids_table.getNumberOfRows()

    centroids_block = BlockDescriptor()
    centroids_table.getBlockOfRows(0, centroids_num_rows, readOnly,
                                   centroids_block)

    t0 = time.time()

    user_to_clusters_fname = os.path.join(clusters_dir,
                                          '%d_user_cluster_ids' % num_clusters)
    with open(user_to_clusters_fname, 'w') as f:
        for i in range(assignments_array.shape[0]):
            print('%d' % int(assignments_array[i][0]), file=f)

    t1 = time.time()
    output_time = t1 - t0
    print('output time: %f' % output_time)
Example #14
datasetFileNames = [
    jp(DAAL_PREFIX, 'covcormoments_dense_1.csv'),
    jp(DAAL_PREFIX, 'covcormoments_dense_2.csv'),
    jp(DAAL_PREFIX, 'covcormoments_dense_3.csv'),
    jp(DAAL_PREFIX, 'covcormoments_dense_4.csv')
]

if __name__ == '__main__':

    comm = MPI.COMM_WORLD
    rankId = comm.Get_rank()

    # Initialize FileDataSource<CSVFeatureManager> to retrieve the input data from a .csv file
    dataSource = FileDataSource(
        datasetFileNames[rankId],
        DataSourceIface.doAllocateNumericTable,
        DataSourceIface.doDictionaryFromContext
    )

    # Retrieve the input data
    dataSource.loadDataBlock()

    # Create an algorithm to compute a variance-covariance matrix on local nodes
    localAlgorithm = covariance.Distributed(step1Local)

    # Set the input data set to the algorithm
    localAlgorithm.input.set(covariance.data, dataSource.getNumericTable())

    # Compute a variance-covariance matrix
    pres = localAlgorithm.compute()
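The example stops at the local computation; a sketch of the remaining gather-and-finalize step, following the pattern of the other distributed examples on this page (archive classes and MPI_ROOT assumed to be imported as elsewhere; the code continues the __main__ block):

    dataArch = InputDataArchive()
    pres.serialize(dataArch)
    nodeResults = dataArch.getArchiveAsArray()
    serializedData = comm.gather(nodeResults)

    if rankId == MPI_ROOT:
        masterAlgorithm = covariance.Distributed(step2Master)
        for i in range(len(serializedData)):
            dataArch = OutputDataArchive(serializedData[i])
            dataForStep2FromStep1 = covariance.PartialResult()
            dataForStep2FromStep1.deserialize(dataArch)
            masterAlgorithm.input.add(covariance.partialResults,
                                      dataForStep2FromStep1)
        masterAlgorithm.compute()
        res = masterAlgorithm.finalizeCompute()
        # res.get(covariance.covariance) and res.get(covariance.mean) hold the output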
Example #15
def testModel():
    thresholdValues = np.linspace(-25.0, 25.0, num=101)
    numberOfCorrectlyClassifiedObjects = np.zeros(len(thresholdValues))
    numberOfObjectsInTestFiles = 0
    numberOfNonzeroObjectsInTestFiles = 0
    for filenameIndex in range(0, len(testDatasetFileNames)):
        testDataSource = FileDataSource(
            testDatasetFileNames[filenameIndex],
            DataSourceIface.doAllocateNumericTable,
            DataSourceIface.doDictionaryFromContext)
        testData = HomogenNumericTable(nFeatures, 0,
                                       NumericTableIface.notAllocate)
        testGroundTruth = HomogenNumericTable(nDependentVariables, 0,
                                              NumericTableIface.notAllocate)
        mergedData = MergedNumericTable(testData, testGroundTruth)
        testDataSource.loadDataBlock(mergedData)

        algorithm = prediction.Batch_Float64DefaultDense()
        algorithm.input.setNumericTableInput(prediction.data, testData)
        algorithm.input.setModelInput(prediction.model,
                                      trainingResult.get(training.model))
        predictionResult = algorithm.compute()

        block1 = BlockDescriptor()
        block2 = BlockDescriptor()
        testGroundTruth.getBlockOfRows(0, testGroundTruth.getNumberOfRows(),
                                       readOnly, block1)
        predictionResult.get(prediction.prediction).getBlockOfRows(
            0, testGroundTruth.getNumberOfRows(), readOnly, block2)
        y_true = getClassVector(block1.getArray(), 0.0)
        predictionRegression = block2.getArray()
        for thresholdIndex in range(0, len(thresholdValues)):
            y_pred = getClassVector(predictionRegression,
                                    thresholdValues[thresholdIndex])
            numberOfCorrectlyClassifiedObjects[
                thresholdIndex] += accuracy_score(y_true,
                                                  y_pred,
                                                  normalize=False)
        numberOfObjectsInTestFiles += len(y_true)
        numberOfNonzeroObjectsInTestFiles += np.count_nonzero(y_true)
        mergedData.freeDataMemory()
        testData.freeDataMemory()
        testGroundTruth.freeDataMemory()

    classificationAccuracyResult = np.zeros(len(thresholdValues))
    best_threshold = None
    best_accuracy = -1
    for thresholdIndex in range(0, len(thresholdValues)):
        classificationAccuracyResult[
            thresholdIndex] = numberOfCorrectlyClassifiedObjects[
                thresholdIndex] / numberOfObjectsInTestFiles
        if (classificationAccuracyResult[thresholdIndex] > best_accuracy):
            best_threshold = thresholdValues[thresholdIndex]
            best_accuracy = classificationAccuracyResult[thresholdIndex]
    print('Best threshold: {:.4f}. Best accuracy: {:.4f}'.format(
        best_threshold, best_accuracy))
    print(
        'Test set. Number of class-0 objects: {}. Number of class-1 objects: {}. '
        'Frequency of class 1: {:.4f}'.format(
            numberOfObjectsInTestFiles - numberOfNonzeroObjectsInTestFiles,
            numberOfNonzeroObjectsInTestFiles,
            numberOfNonzeroObjectsInTestFiles / numberOfObjectsInTestFiles))
    indexOfZeroThreshold = np.where(thresholdValues == 0.0)[0][0]
    print('Threshold = 0. Classification accuracy: {:.4f}'.format(
        classificationAccuracyResult[indexOfZeroThreshold]))
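getClassVector is not shown in the excerpt; a plausible sketch (an assumption, not the original helper) would threshold each value into a 0/1 class label:

import numpy as np

def getClassVector(values, threshold):
    # Assumed behavior: class 1 where the value exceeds the threshold, else 0.
    return (np.asarray(values).flatten() > threshold).astype(np.int64)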
Example #16
datasetfilenames = [
    jp(DATA_PREFIX, 'low1.csv'),
    jp(DATA_PREFIX, 'low2.csv'),
    jp(DATA_PREFIX, 'low3.csv'),
    jp(DATA_PREFIX, 'low4.csv')
]  # dataset file names, one per process

if __name__ == "__main__":

    timed_total = time.process_time()
    comm = MPI.COMM_WORLD
    rankId = comm.Get_rank()


    # Each process reads its own input file
    dataSource = FileDataSource(datasetfilenames[rankId],
                                DataSourceIface.doAllocateNumericTable,
                                DataSourceIface.doDictionaryFromContext)

    dataSource.loadDataBlock()
    timed = time.process_time()
    localAlgorithm = low_order_moments.Distributed(step=step1Local)
    localAlgorithm.input.set(low_order_moments.data, dataSource.getNumericTable())
    pres = localAlgorithm.compute()
    # Serialize the local results for sending to the master node
    dataArch = InputDataArchive()
    pres.serialize(dataArch)

    nodeResults = dataArch.getArchiveAsArray()
    serializedData = comm.gather(nodeResults)
    if rankId == MPI_ROOT:
        masterAlgorithm = low_order_moments.Distributed(step=step2Master)
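        # The excerpt breaks off here; the master step would presumably continue
        # as in the other distributed examples on this page:
        for i in range(len(serializedData)):
            dataArch = OutputDataArchive(serializedData[i])
            dataForStep2FromStep1 = low_order_moments.PartialResult()
            dataForStep2FromStep1.deserialize(dataArch)
            masterAlgorithm.input.add(low_order_moments.partialResults,
                                      dataForStep2FromStep1)
        masterAlgorithm.compute()
        res = masterAlgorithm.finalizeCompute()
        # e.g. res.get(low_order_moments.mean), res.get(low_order_moments.variance)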
Example #17
import os
import sys

import daal.algorithms.adaboost as adaB
from daal.algorithms.adaboost import prediction, training
from GridSearch import GridSearch
from daal.data_management import (FileDataSource, DataSourceIface,
                                  HomogenNumericTable, MergedNumericTable,
                                  NumericTableIface)

DATA_PREFIX = os.path.join(os.path.dirname(sys.executable), 'share',
                           'pydaal_examples', 'examples', 'data', 'batch')
trainDatasetFileName = os.path.join(DATA_PREFIX, 'adaboost_train.csv')

nFeatures = 20

# Initialize FileDataSource<CSVFeatureManager> to retrieve the input data from a .csv file
trainDataSource = FileDataSource(trainDatasetFileName,
                                 DataSourceIface.notAllocateNumericTable,
                                 DataSourceIface.doDictionaryFromContext)

# Create Numeric Tables for training data and labels
trainData = HomogenNumericTable(nFeatures, 0, NumericTableIface.doNotAllocate)
trainGroundTruth = HomogenNumericTable(1, 0, NumericTableIface.doNotAllocate)
mergedData = MergedNumericTable(trainData, trainGroundTruth)

# Retrieve the data from the input file
trainDataSource.loadDataBlock(mergedData)

# Default keyword arguments
'''
GridSearch(<args>, tuned_parameters = None, score=None,
			best_score_criteria='high',
			create_best_training_model = False,
Example #18
#############################################################################


def getArrayFromNT(table, nrows=0):
    bd = BlockDescriptor_Float64()
    if nrows == 0:
        nrows = table.getNumberOfRows()
    table.getBlockOfRows(0, nrows, readOnly, bd)
    npa = bd.getArray()
    table.releaseBlockOfRows(bd)
    return npa


# Initialize FileDataSource<CSVFeatureManager> to retrieve the test data from a .csv file
testDataSource = FileDataSource(testDatasetFileName,
                                DataSourceIface.notAllocateNumericTable,
                                DataSourceIface.doDictionaryFromContext)

# Create Numeric Tables for testing data and labels
testData = HomogenNumericTable(nFeatures, 0, NumericTableIface.notAllocate)
testGroundTruth = HomogenNumericTable(1, 0, NumericTableIface.notAllocate)
mergedData = MergedNumericTable(testData, testGroundTruth)

# Retrieve the data from input file
testDataSource.loadDataBlock(mergedData)

# Create an algorithm object to predict Naive Bayes values
algorithm_test = prediction.Batch(nClasses)

# Pass a testing data set and the trained model to the algorithm
algorithm_test.input.setTable(classifier.prediction.data, testData)
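# The excerpt ends here; the remaining steps would presumably mirror the other
# Naive Bayes prediction examples on this page:
algorithm_test.input.setModel(classifier.prediction.model,
                              trainingResult.get(classifier.training.model))
predictionResult = algorithm_test.compute()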
Example #19
def trainModel(comm, rankId):

    trainingResult = None

    # Initialize FileDataSource to retrieve the input data from a .csv file
    trainDataSource = FileDataSource(trainDatasetFileNames[rankId],
                                     DataSourceIface.notAllocateNumericTable,
                                     DataSourceIface.doDictionaryFromContext)

    # Create Numeric Tables for training data and labels
    trainData = HomogenNumericTable(NUM_FEATURES, 0,
                                    NumericTableIface.doNotAllocate)
    trainDependentVariables = HomogenNumericTable(
        NUM_DEPENDENT_VARS, 0, NumericTableIface.doNotAllocate)
    mergedData = MergedNumericTable(trainData, trainDependentVariables)

    # Retrieve the data from the input file
    trainDataSource.loadDataBlock(mergedData)

    # Create an algorithm object to train the ridge regression model based on the local-node data
    localAlgorithm = training.Distributed(step1Local)

    # Pass a training data set and dependent values to the algorithm
    localAlgorithm.input.set(training.data, trainData)
    localAlgorithm.input.set(training.dependentVariables,
                             trainDependentVariables)

    # Train the ridge regression model on local nodes
    pres = localAlgorithm.compute()

    # Serialize partial results required by step 2
    dataArch = InputDataArchive()
    pres.serialize(dataArch)

    # Transfer partial results to step 2 on the root node
    nodeResults = dataArch.getArchiveAsArray()

    serializedData = comm.gather(nodeResults)

    if rankId == MPI_ROOT:

        # Create an algorithm object to build the final ridge regression model on the master node
        masterAlgorithm = training.Distributed(step2Master)

        for i in range(NUM_BLOCKS):

            # Deserialize partial results from step 1
            dataArch = OutputDataArchive(serializedData[i])
            dataForStep2FromStep1 = training.PartialResult()
            dataForStep2FromStep1.deserialize(dataArch)

            # Set the local ridge regression model as input for the master-node algorithm
            masterAlgorithm.input.add(training.partialModels,
                                      dataForStep2FromStep1)

        # Merge partial results and finalize the ridge regression model on the master node
        masterAlgorithm.compute()
        trainingResult = masterAlgorithm.finalizeCompute()

        # Retrieve the algorithm results
        printNumericTable(
            trainingResult.get(training.model).getBeta(),
            "Ridge Regression coefficients:")

    return trainingResult
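A hypothetical driver for this function, mirroring the __main__ blocks used elsewhere on this page:

if __name__ == "__main__":
    comm = MPI.COMM_WORLD
    rankId = comm.Get_rank()
    trainingResult = trainModel(comm, rankId)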
Example #20
        f.write('{:.0f}}};'.format(
            right_items_array[i][len(right_items_array[i]) - 1]))

        f.write('{0:.3g}\n'.format(confidence_array[i]))
    f.close()


start = time.time()
for num in range(1):
    datasetFileName = 'daal_retail.csv'

    minSupport = 0.0015
    minConfidence = 0.8

    dataSource = FileDataSource(datasetFileName,
                                DataSourceIface.doAllocateNumericTable,
                                DataSourceIface.doDictionaryFromContext)
    dataSource.loadDataBlock()

    alg = Batch_Float64Apriori()
    alg.input.set(data, dataSource.getNumericTable())
    alg.parameter.minSupport = minSupport
    alg.parameter.minConfidence = minConfidence
    # alg.parameter.itemsetsOrder = itemsetsSortedBySupport
    # alg.parameter.rulesOrder = rulesSortedByConfidence

    res = alg.compute()
end = time.time()
print('Performance comparison. Time: %s seconds' % (end - start))

nt1 = res.get(largeItemsets)
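# Further Apriori outputs can presumably be read the same way, assuming the
# corresponding result ids are imported from daal.algorithms.association_rules:
nt2 = res.get(largeItemsetsSupport)
nt3 = res.get(antecedentItemsets)
nt4 = res.get(consequentItemsets)
nt5 = res.get(confidence)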
Example #21
datasetFileNames = [
    jp(DATA_PREFIX, 'covcormoments_dense_1.csv'),
    jp(DATA_PREFIX, 'covcormoments_dense_2.csv'),
    jp(DATA_PREFIX, 'covcormoments_dense_3.csv'),
    jp(DATA_PREFIX, 'covcormoments_dense_4.csv')
]

if __name__ == "__main__":

    comm = MPI.COMM_WORLD
    rankId = comm.Get_rank()

    # Initialize FileDataSource<CSVFeatureManager> to retrieve the input data from a .csv file
    dataSource = FileDataSource(datasetFileNames[rankId],
                                DataSourceIface.doAllocateNumericTable,
                                DataSourceIface.doDictionaryFromContext)

    # Retrieve the input data
    dataSource.loadDataBlock()

    # Create an algorithm to compute low order moments on local nodes
    localAlgorithm = low_order_moments.Distributed(step=step1Local)

    # Set the input data set to the algorithm
    localAlgorithm.input.set(low_order_moments.data,
                             dataSource.getNumericTable())

    # Compute low order moments
    pres = localAlgorithm.compute()