def testModel(trainingResult):

    # Initialize FileDataSource to retrieve the input data from a .csv file
    testDataSource = FileDataSource(testDatasetFileName,
                                    DataSourceIface.doAllocateNumericTable,
                                    DataSourceIface.doDictionaryFromContext)

    # Create Numeric Tables for testing data and ground truth values
    testData = HomogenNumericTable(NUM_FEATURES, 0,
                                   NumericTableIface.doNotAllocate)
    testGroundTruth = HomogenNumericTable(NUM_DEPENDENT_VARS, 0,
                                          NumericTableIface.doNotAllocate)
    mergedData = MergedNumericTable(testData, testGroundTruth)

    # Retrieve the data from an input file
    testDataSource.loadDataBlock(mergedData)

    # Create an algorithm object to predict values of ridge regression
    algorithm = prediction.Batch()

    # Pass a testing data set and the trained model to the algorithm
    algorithm.input.setTable(prediction.data, testData)
    algorithm.input.setModel(prediction.model,
                             trainingResult.get(training.model))

    # Predict values of ridge regression
    res = algorithm.compute()

    # Retrieve the algorithm results
    printNumericTable(res.get(prediction.prediction),
                      "Ridge Regression prediction results: (first 10 rows):",
                      10)
    printNumericTable(testGroundTruth, "Ground truth (first 10 rows):", 10)
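A note on context: testModel() relies on several module-level names. The following is a minimal setup sketch, not part of the original example; the file path, the column counts, and the utils helper import are assumptions.

from daal.data_management import (
    FileDataSource, DataSourceIface, HomogenNumericTable,
    MergedNumericTable, NumericTableIface
)
from daal.algorithms.ridge_regression import prediction, training
from utils import printNumericTable  # helper shipped with the DAAL examples (assumed)

testDatasetFileName = './data/ridge_regression_test.csv'  # hypothetical path
NUM_FEATURES = 10          # assumed number of feature columns
NUM_DEPENDENT_VARS = 2     # assumed number of dependent variables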
Example #2
    def predictReducedModelResults(self, trainingResult, trainData,
                                   nDependentVariables, reducedBeta):
        model = trainingResult.get(training.model)
        betas = model.getBeta()
        nBetas = model.getNumberOfBetas()
        savedBeta = np.zeros((nDependentVariables, nBetas))

        # Zero out the selected beta coefficients, saving their values
        block = BlockDescriptor()
        betas.getBlockOfRows(0, nDependentVariables, readWrite, block)
        pBeta = block.getArray()
        if isinstance(reducedBeta, int):
            reducedBeta = [reducedBeta]
        for beta in reducedBeta:
            for i in range(nDependentVariables):
                savedBeta[i][beta] = pBeta[i][beta]
                pBeta[i][beta] = 0
        betas.releaseBlockOfRows(block)
        printNumericTable(betas)

        # Predict with the reduced model
        predictedResults = LinearRegression.predict(self, trainingResult, trainData)

        # Restore the saved coefficients
        block = BlockDescriptor()
        betas.getBlockOfRows(0, nDependentVariables, readWrite, block)
        pBeta = block.getArray()
        for beta in reducedBeta:
            for i in range(nDependentVariables):
                pBeta[i][beta] = savedBeta[i][beta]
        betas.releaseBlockOfRows(block)
        printNumericTable(betas)
        return predictedResults
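A hypothetical call for the method above, assuming lr is the LinearRegression wrapper instance that owns it and the model was trained with two dependent variables; passing an int zeroes a single coefficient before predicting and restores it afterwards:

reducedPred = lr.predictReducedModelResults(trainingResult, trainData,
                                            nDependentVariables=2,
                                            reducedBeta=2)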
Example #3
    def printAllQualityMetrics(self, qualityMetricSetResult):

        # Print the quality metrics
        printNumericTable(qualityMetricSetResult.get('confusionMatrix'), "Confusion matrix:")

        print("Accuracy:      {0:.3f}".format(qualityMetricSetResult.get('accuracy')))
        print("Precision:     {0:.3f}".format(qualityMetricSetResult.get('precision')))
        print("Recall:        {0:.3f}".format(qualityMetricSetResult.get('recall')))
        print("F1-score:      {0:.3f}".format(qualityMetricSetResult.get('fscore')))
        print("Specificity:   {0:.3f}".format(qualityMetricSetResult.get('specificity')))
        print("AUC:           {0:.3f}".format(qualityMetricSetResult.get('auc')))
Example #4
    def printAllQualityMetrics(self, qualityMetricSetResult):
        # Print the quality metrics
        printNumericTable(qualityMetricSetResult.get('confusionMatrix'), "Confusion matrix:")

        print("Average accuracy: {0:.3f}".format(qualityMetricSetResult.get('averageAccuracy')))
        print("Error rate:       {0:.3f}".format(qualityMetricSetResult.get('errorRate')))
        print("Micro precision:  {0:.3f}".format(qualityMetricSetResult.get('microPrecision')))
        print("Micro recall:     {0:.3f}".format(qualityMetricSetResult.get('microRecall')))
        print("Micro F-score:    {0:.3f}".format(qualityMetricSetResult.get('microFscore')))
        print("Macro precision:  {0:.3f}".format(qualityMetricSetResult.get('macroPrecision')))
        print("Macro recall:     {0:.3f}".format(qualityMetricSetResult.get('macroRecall')))
        print("Macro F-score:    {0:.3f}".format(qualityMetricSetResult.get('macroFscore')))
Example #5
    def printAllQualityMetrics(self, qualityMetricSetResult):

        # Print the quality metrics
        qualityMetricResult = qualityMetricSetResult.getResult(quality_metric_set.confusionMatrix)
        printNumericTable(qualityMetricResult.get(binary_confusion_matrix.confusionMatrix), "Confusion matrix:")

        block = BlockDescriptor()
        qualityMetricsTable = qualityMetricResult.get(binary_confusion_matrix.binaryMetrics)
        qualityMetricsTable.getBlockOfRows(0, 1, readOnly, block)
        qualityMetricsData = block.getArray().flatten()
        print("Accuracy:      {0:.3f}".format(qualityMetricsData[binary_confusion_matrix.accuracy]))
        print("Precision:     {0:.3f}".format(qualityMetricsData[binary_confusion_matrix.precision]))
        print("Recall:        {0:.3f}".format(qualityMetricsData[binary_confusion_matrix.recall]))
        print("F1-score:      {0:.3f}".format(qualityMetricsData[binary_confusion_matrix.fscore]))
        print("Specificity:   {0:.3f}".format(qualityMetricsData[binary_confusion_matrix.specificity]))
        print("AUC:           {0:.3f}".format(qualityMetricsData[binary_confusion_matrix.AUC]))
        qualityMetricsTable.releaseBlockOfRows(block)
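The BlockDescriptor pattern above (get a block of rows, read the array, release the block) works for any numeric table. A generic sketch, assuming only the daal.data_management names already used in this example:

from daal.data_management import BlockDescriptor, readOnly

def tableToArray(table):
    # Borrow all rows as a NumPy view, copy them, then release the block
    block = BlockDescriptor()
    table.getBlockOfRows(0, table.getNumberOfRows(), readOnly, block)
    array = block.getArray().copy()
    table.releaseBlockOfRows(block)
    return array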
Example #6
    def printAllQualityMetrics(self, qualityMetricSetResult):

        # Print the quality metrics
        qualityMetricResult = qualityMetricSetResult.getResult(
            multi_class_classifier.quality_metric_set.confusionMatrix)
        printNumericTable(qualityMetricResult.get(multiclass_confusion_matrix.confusionMatrix),
                          "Confusion matrix:")

        block = BlockDescriptor()
        qualityMetricsTable = qualityMetricResult.get(multiclass_confusion_matrix.multiClassMetrics)
        qualityMetricsTable.getBlockOfRows(0, 1, readOnly, block)
        qualityMetricsData = block.getArray().flatten()
        print("Average accuracy: {0:.3f}".format(qualityMetricsData[multiclass_confusion_matrix.averageAccuracy]))
        print("Error rate:       {0:.3f}".format(qualityMetricsData[multiclass_confusion_matrix.errorRate]))
        print("Micro precision:  {0:.3f}".format(qualityMetricsData[multiclass_confusion_matrix.microPrecision]))
        print("Micro recall:     {0:.3f}".format(qualityMetricsData[multiclass_confusion_matrix.microRecall]))
        print("Micro F-score:    {0:.3f}".format(qualityMetricsData[multiclass_confusion_matrix.microFscore]))
        print("Macro precision:  {0:.3f}".format(qualityMetricsData[multiclass_confusion_matrix.macroPrecision]))
        print("Macro recall:     {0:.3f}".format(qualityMetricsData[multiclass_confusion_matrix.macroRecall]))
        print("Macro F-score:    {0:.3f}".format(qualityMetricsData[multiclass_confusion_matrix.macroFscore]))
        qualityMetricsTable.releaseBlockOfRows(block)
Example #7
def printResultsM():

    # Print the classification results
    printNumericTables(groundTruthLabels,
                       predictedLabels,
                       "Ground truth",
                       "Classification results",
                       "SVM classification results (first 20 observations):",
                       20,
                       interval=15,
                       flt64=False)
    # Print the quality metrics
    qualityMetricResult = qualityMetricSetResult.getResult(
        multi_class_classifier.quality_metric_set.confusionMatrix)
    printNumericTable(
        qualityMetricResult.get(multiclass_confusion_matrix.confusionMatrix),
        "Confusion matrix:")

    block = BlockDescriptor()
    qualityMetricsTable = qualityMetricResult.get(
        multiclass_confusion_matrix.multiClassMetrics)
    qualityMetricsTable.getBlockOfRows(0, 1, readOnly, block)
    qualityMetricsData = block.getArray().flatten()
    print("Average accuracy: {0:.3f}".format(
        qualityMetricsData[multiclass_confusion_matrix.averageAccuracy]))
    print("Error rate:       {0:.3f}".format(
        qualityMetricsData[multiclass_confusion_matrix.errorRate]))
    print("Micro precision:  {0:.3f}".format(
        qualityMetricsData[multiclass_confusion_matrix.microPrecision]))
    print("Micro recall:     {0:.3f}".format(
        qualityMetricsData[multiclass_confusion_matrix.microRecall]))
    print("Micro F-score:    {0:.3f}".format(
        qualityMetricsData[multiclass_confusion_matrix.microFscore]))
    print("Macro precision:  {0:.3f}".format(
        qualityMetricsData[multiclass_confusion_matrix.macroPrecision]))
    print("Macro recall:     {0:.3f}".format(
        qualityMetricsData[multiclass_confusion_matrix.macroRecall]))
    print("Macro F-score:    {0:.3f}".format(
        qualityMetricsData[multiclass_confusion_matrix.macroFscore]))
    qualityMetricsTable.releaseBlockOfRows(block)
Example #8
def printResultsB():

    # Print the classification results
    printNumericTables(groundTruthLabels,
                       predictedLabels,
                       "Ground truth",
                       "Classification results",
                       "SVM classification results (first 20 observations):",
                       20,
                       interval=15,
                       flt64=False)

    # Print the quality metrics
    qualityMetricResult = qualityMetricSetResult.getResult(
        svm.quality_metric_set.confusionMatrix)
    printNumericTable(
        qualityMetricResult.get(binary_confusion_matrix.confusionMatrix),
        "Confusion matrix:")

    block = BlockDescriptor()
    qualityMetricsTable = qualityMetricResult.get(
        binary_confusion_matrix.binaryMetrics)
    qualityMetricsTable.getBlockOfRows(0, 1, readOnly, block)
    qualityMetricsData = block.getArray().flatten()
    print("Accuracy:      {0:.3f}".format(
        qualityMetricsData[binary_confusion_matrix.accuracy]))
    print("Precision:     {0:.3f}".format(
        qualityMetricsData[binary_confusion_matrix.precision]))
    print("Recall:        {0:.3f}".format(
        qualityMetricsData[binary_confusion_matrix.recall]))
    print("F-score:       {0:.3f}".format(
        qualityMetricsData[binary_confusion_matrix.fscore]))
    print("Specificity:   {0:.3f}".format(
        qualityMetricsData[binary_confusion_matrix.specificity]))
    print("AUC:           {0:.3f}".format(
        qualityMetricsData[binary_confusion_matrix.AUC]))
    qualityMetricsTable.releaseBlockOfRows(block)
Example #9
    def printAllQualityMetrics(self, resultSingleBeta, resultGroupBeta):
        # Print quality metrics for a single beta
        print("Quality metrics for a single beta")
        printNumericTable(resultSingleBeta.getResult(single_beta.rms),
                          "Root mean square errors for each response (dependent variable):")
        printNumericTable(resultSingleBeta.getResult(single_beta.variance),
                          "Variance for each response (dependent variable):")
        printNumericTable(resultSingleBeta.getResult(single_beta.zScore), "Z-score statistics:")
        printNumericTable(resultSingleBeta.getResult(single_beta.confidenceIntervals),
                          "Confidence intervals for each beta coefficient:")
        printNumericTable(resultSingleBeta.getResult(single_beta.inverseOfXtX), "Inverse(Xt * X) matrix:")

        coll = resultSingleBeta.getResultDataCollection(single_beta.betaCovariances)
        for i in range(coll.size()):
            message = "Variance-covariance matrix for betas of " + str(i) + "-th response"
            betaCov = resultSingleBeta.get(single_beta.betaCovariances, i)
            printNumericTable(betaCov, message)

        # Print quality metrics for a group of betas
        print("Quality metrics for a group of betas")
        printNumericTable(resultGroupBeta.get(group_of_betas.expectedMeans),
                          "Means of expected responses for each dependent variable:", 0, 0, 20)
        printNumericTable(resultGroupBeta.get(group_of_betas.expectedVariance),
                          "Variance of expected responses for each dependent variable:", 0, 0, 20)
        printNumericTable(resultGroupBeta.get(group_of_betas.regSS),
                          "Regression sum of squares of expected responses:", 0, 0, 20)
        printNumericTable(resultGroupBeta.get(group_of_betas.resSS),
                          "Sum of squares of residuals for each dependent variable:", 0, 0, 20)
        printNumericTable(resultGroupBeta.get(group_of_betas.tSS),
                          "Total sum of squares for each dependent variable:", 0, 0, 20)
        printNumericTable(resultGroupBeta.get(group_of_betas.determinationCoeff),
                          "Determination coefficient for each dependent variable:", 0, 0, 20)
        printNumericTable(resultGroupBeta.get(group_of_betas.fStatistics),
                          "F-statistics for each dependent variable:", 0, 0, 20)
Example #10
if __name__ == "__main__":

    # Create SparkContext that loads defaults from the system properties and the classpath and sets the name
    sc = SparkContext(
        conf=SparkConf().setAppName("Spark QR").setMaster('local[4]'))

    # Read from the distributed HDFS data set at a specified path
    dd = DistributedHDFSDataSet("/Spark/QR/data/")
    dataRDD = dd.getAsPairRDD(sc)

    # Compute QR decomposition for dataRDD
    result = runQR(dataRDD, sc)

    # Redirect stdout to a file for correctness verification
    stdout = sys.stdout
    sys.stdout = open('QR.out', 'w')

    # Print the results
    ntRPList = result['Q'].collect()
    for key, table in ntRPList:
        deserialized_table = deserializeNumericTable(table)
        printNumericTable(deserialized_table,
                          "Q (2 first vectors from node #{}):".format(key), 2)

    printNumericTable(result['R'], "R:")

    # Restore stdout
    sys.stdout = stdout

    sc.stop()
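The Spark examples move numeric tables between nodes as byte arrays; deserializeNumericTable() used above is one half of that round trip. A sketch of the pair, reconstructed from the archive API these examples use elsewhere (the empty HomogenNumericTable constructor call is an assumption):

from daal.data_management import (
    HomogenNumericTable, InputDataArchive, OutputDataArchive
)

def serializeNumericTable(dataTable):
    # Pack the table into a DAAL archive and return it as a byte array
    dataArch = InputDataArchive()
    dataTable.serialize(dataArch)
    return dataArch.getArchiveAsArray()

def deserializeNumericTable(buffer):
    # Rebuild a table from the byte array produced above
    dataTable = HomogenNumericTable()
    dataArch = OutputDataArchive(buffer)
    dataTable.deserialize(dataArch)
    return dataTable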
Example #11

    # Read the training data and labels from a specified path
    trainDataAndLabelsRDD = getMergedDataAndLabelsRDD(
        trainDataFilesPath, trainDataLabelsFilesPath, sc)

    # Read the test data and labels from a specified path
    testDataAndLabelsRDD = getMergedDataAndLabelsRDD(testDataFilesPath,
                                                     testDataLabelsFilesPath,
                                                     sc)

    # Compute linear regression for dataRDD
    res = runLinearRegression(trainDataAndLabelsRDD, testDataAndLabelsRDD)

    # Redirect stdout to a file for correctness verification
    stdout = sys.stdout
    sys.stdout = open('LinearRegressionNormEq.out', 'w')

    # Print the results
    parts_list = testDataAndLabelsRDD.collect()
    for key, (_, h_table2) in parts_list:
        expected = h_table2
        deserialized_expected = deserializeNumericTable(expected)

    printNumericTable(res['beta'], "Coefficients:")
    printNumericTable(res['predicted'],
                      "First 10 rows of results (obtained): ", 10)
    printNumericTable(deserialized_expected,
                      "First 10 rows of results (expected): ", 10)

    # Restore stdout
    sys.stdout = stdout

    sc.stop()
Example #12

y_test = y_test_temp[:, np.newaxis]

trainData = HomogenNumericTable(x_train)
trainDependentVariables = HomogenNumericTable(y_train)
testData = HomogenNumericTable(x_test)
testGroundTruth = HomogenNumericTable(y_test)

# Instantiate Linear Regression object
lr = LinearRegression()
# Training
trainingResult = lr.training(trainData, trainDependentVariables)
# Prediction
prediction_nT = lr.predict(trainingResult, testData)
# Evaluation
qualityMet = lr.qualityMetrics(trainingResult, prediction_nT, testGroundTruth)
printNumericTable(qualityMet.get('rms'), "Root mean square")
# To print all the metrics
lr.printAllQualityMetrics(qualityMet)
# To predict and evaluate. Note that this method performs predictions on both
# the unrestricted and the restricted (reduced) model
predRes, predResRed, qualityMet = lr.predictWithQualityMetrics(
    trainingResult, testData, testGroundTruth, [1, 2])
# Serialize
lr.serialize(trainingResult, fileName='LR.npy')
# Deserialize
de_trainingResult = lr.deserialize(fileName="LR.npy")
# Print predicted responses and actual responses
printNumericTable(predRes,
                  "Linear Regression prediction results: (first 10 rows):", 10)
printNumericTable(testGroundTruth, "Ground truth (first 10 rows):", 10)
Example #13

'''
Instantiate Decision Forest object:
Classification(nClasses, nTrees=100, observationsPerTreeFraction=1,
               featuresPerNode=0, maxTreeDepth=0, minObservationsInLeafNodes=1,
               impurityThreshold=0, varImportance='MDI')
'''
daal_DF = Classification(len(np.unique(y)), observationsPerTreeFraction=0.7,
                         varImportance='MDI', resultsToCompute=3)
# Train
trainingResult = daal_DF.training(trainData, trainDependentVariables)
# Predict
predictResults = daal_DF.predict(trainingResult, testData)
# Evaluate your model
qualityMet = daal_DF.qualityMetrics(predictResults, testGroundTruth)
# Print accuracy
print("Accuracy: {0:.3f}".format(qualityMet.get('accuracy')))
# Print confusion matrix
printNumericTable(qualityMet.get('confusionMatrix'), "Confusion Matrix")
# Print all metrics
print("All available metrics")
daal_DF.printAllQualityMetrics(qualityMet)
# Serialize and save
daal_DF.serialize(trainingResult, fileName='DF', useCompression=True)
# Deserialize
dese_trainingRes = daal_DF.deserialize(fileName='DF.npy', useCompression=True)

# Print predicted responses and actual responses
printNumericTables(
    testGroundTruth, predictResults,
    "Ground truth", "Classification results",
    "Decision Forest classification results (first 20 observations):", 20,
    flt64=False)
Example #14
    maximum = res.get(low_order_moments.maximum)
    sum = res.get(low_order_moments.sum)
    sumSquares = res.get(low_order_moments.sumSquares)
    sumSquaresCentered = res.get(low_order_moments.sumSquaresCentered)
    mean = res.get(low_order_moments.mean)
    secondOrderRawMoment = res.get(low_order_moments.secondOrderRawMoment)
    variance = res.get(low_order_moments.variance)
    standardDeviation = res.get(low_order_moments.standardDeviation)
    variation = res.get(low_order_moments.variation)

    # Redirect stdout to a file for correctness verification
    stdout = sys.stdout
    sys.stdout = open('LowOrderMomentsDense.out', 'w')

    print("Low order moments:")
    printNumericTable(minimum, "Min:")
    printNumericTable(maximum, "Max:")
    printNumericTable(sum, "Sum:")
    printNumericTable(sumSquares, "SumSquares:")
    printNumericTable(sumSquaresCentered, "SumSquaredDiffFromMean:")
    printNumericTable(mean, "Mean:")
    printNumericTable(secondOrderRawMoment, "SecondOrderRawMoment:")
    printNumericTable(variance, "Variance:")
    printNumericTable(standardDeviation, "StandardDeviation:")
    printNumericTable(variation, "Variation:")

    # Restore stdout
    sys.stdout = stdout

    sc.stop()
Example #15
    def printAllQualityMetrics(self, qualityMet):
        # Print quality metrics for a single beta
        print("Quality metrics for a single beta")
        printNumericTable(
            qualityMet.get('rms'),
            "Root mean square errors for each response (dependent variable):")
        printNumericTable(qualityMet.get('variance'),
                          "Variance for each response (dependent variable):")
        printNumericTable(qualityMet.get('zScore'), "Z-score statistics:")
        printNumericTable(qualityMet.get('confidenceIntervals'),
                          "Confidence intervals for each beta coefficient:")
        printNumericTable(qualityMet.get('inverseOfXtX'),
                          "Inverse(Xt * X) matrix:")
        betaCov = qualityMet.get('betaCovariances')
        for i in range(len(betaCov)):
            message = "Variance-covariance matrix for betas of " + str(
                i + 1) + "-th response"
            printNumericTable(betaCov[i], message)

        # Print quality metrics for a group of betas
        print("Quality metrics for a group of betas")
        printNumericTable(
            qualityMet.get('expectedMeans'),
            "Means of expected responses for each dependent variable:")
        printNumericTable(
            qualityMet.get('expectedVariance'),
            "Variance of expected responses for each dependent variable:")
        printNumericTable(qualityMet.get('regSS'),
                          "Regression sum of squares of expected responses:")
        printNumericTable(
            qualityMet.get('resSS'),
            "Sum of squares of residuals for each dependent variable:")
        printNumericTable(qualityMet.get('tSS'),
                          "Total sum of squares for each dependent variable:")
        printNumericTable(
            qualityMet.get('determinationCoeff'),
            "Determination coefficient for each dependent variable:")
        printNumericTable(qualityMet.get('fStatistics'),
                          "F-statistics for each dependent variable:")
Example #16
    kmeansMaster.compute()

    # Finalize computations and retrieve the results
    res = kmeansMaster.finalizeCompute()

    return res.get(kmeans.centroids)

if __name__ == "__main__":

    # Create SparkContext that loads defaults from the system properties and the classpath and sets the name
    sc = SparkContext(conf=SparkConf().setAppName("Spark Kmeans").setMaster('local[4]'))

    # Read from the distributed HDFS data set at a specified path
    dd = DistributedHDFSDataSet("/Spark/KmeansDense/data/")
    dataRDD = dd.getAsPairRDD(sc)

    # Compute k-means for dataRDD
    result = runKmeans(dataRDD)

    # Redirect stdout to a file for correctness verification
    stdout = sys.stdout
    sys.stdout = open('KmeansDense.out', 'w')

    # Print the results
    printNumericTable(result, "First 10 dimensions of centroids:", 20, 10)

    # Restore stdout
    sys.stdout = stdout

    sc.stop()
Example #17

def trainModel(comm, rankId):

    trainingResult = None

    # Initialize FileDataSource to retrieve the input data from a .csv file
    trainDataSource = FileDataSource(trainDatasetFileNames[rankId],
                                     DataSourceIface.notAllocateNumericTable,
                                     DataSourceIface.doDictionaryFromContext)

    # Create Numeric Tables for training data and labels
    trainData = HomogenNumericTable(NUM_FEATURES, 0,
                                    NumericTableIface.doNotAllocate)
    trainDependentVariables = HomogenNumericTable(
        NUM_DEPENDENT_VARS, 0, NumericTableIface.doNotAllocate)
    mergedData = MergedNumericTable(trainData, trainDependentVariables)

    # Retrieve the data from the input file
    trainDataSource.loadDataBlock(mergedData)

    # Create an algorithm object to train the ridge regression model based on the local-node data
    localAlgorithm = training.Distributed(step1Local)

    # Pass a training data set and dependent values to the algorithm
    localAlgorithm.input.set(training.data, trainData)
    localAlgorithm.input.set(training.dependentVariables,
                             trainDependentVariables)

    # Train the ridge regression model on local nodes
    pres = localAlgorithm.compute()

    # Serialize partial results required by step 2
    dataArch = InputDataArchive()
    pres.serialize(dataArch)

    # Transfer partial results to step 2 on the root node
    nodeResults = dataArch.getArchiveAsArray()

    serializedData = comm.gather(nodeResults)

    if rankId == MPI_ROOT:

        # Create an algorithm object to build the final ridge regression model on the master node
        masterAlgorithm = training.Distributed(step2Master)

        for i in range(NUM_BLOCKS):

            # Deserialize partial results from step 1
            dataArch = OutputDataArchive(serializedData[i])
            dataForStep2FromStep1 = training.PartialResult()
            dataForStep2FromStep1.deserialize(dataArch)

            # Set the local ridge regression model as input for the master-node algorithm
            masterAlgorithm.input.add(training.partialModels,
                                      dataForStep2FromStep1)

        # Merge and finalizeCompute the ridge regression model on the master node
        masterAlgorithm.compute()
        trainingResult = masterAlgorithm.finalizeCompute()

        # Retrieve the algorithm results
        printNumericTable(
            trainingResult.get(training.model).getBeta(),
            "Ridge Regression coefficients:")

    return trainingResult
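A hypothetical driver pairing this trainModel() with the testModel() from the first example; MPI_ROOT and the dataset file-name lists are assumed to be defined in the same module:

from mpi4py import MPI

if __name__ == "__main__":
    comm = MPI.COMM_WORLD
    rankId = comm.Get_rank()

    trainingResult = trainModel(comm, rankId)

    # Only the root node holds the merged model and runs prediction
    if rankId == MPI_ROOT:
        testModel(trainingResult)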
Example #18

    # Create a data archive to deserialize the numeric table
    dataArch = OutputDataArchive(buffer)

    # Deserialize the numeric table from the data archive
    object.deserialize(dataArch)

    return object


if __name__ == "__main__":

    comm = MPI.COMM_WORLD
    rankId = comm.Get_rank()

    transposedDataTable = createSparseTable(
        transposedTrainDatasetFileNames[rankId])

    step4LocalInput = KeyValueDataCollection()
    itemsPartialResultPrediction = KeyValueDataCollection()

    dataTable = initializeModel()
    trainModel(dataTable, transposedDataTable)
    testModel()

    if rankId == MPI_ROOT:
        for i in range(nBlocks):
            for j in range(nBlocks):
                print("prediction {}, {}".format(i, j))
                printNumericTable(predictedRatingsMaster[i][j].get(
                    ratings.prediction))
Example #19
    return result


if __name__ == "__main__":

    # Create JavaSparkContext that loads defaults from the system properties and the classpath and sets the name
    sc = SparkContext(conf=SparkConf().setAppName(
        "Spark covariance(CSR)").setMaster("local[4]"))

    # Read from the distributed HDFS data set at a specified path
    dd = DistributedHDFSDataSet("/Spark/CovarianceCSR/data/")
    dataRDD = dd.getCSRAsPairRDD(sc)

    # Compute a sparse variance-covariance matrix for dataRDD
    final_result = runCovariance(dataRDD)

    # Redirect stdout to a file for correctness verification
    stdout = sys.stdout
    sys.stdout = open('CovarianceCSR.out', 'w')

    # Print the results
    printNumericTable(final_result['covariance'],
                      "Covariance matrix (upper left square 10*10) :", 10, 10,
                      9)
    printNumericTable(final_result['mean'], "Mean vector:", 1, 10, 9)

    # Restore stdout
    sys.stdout = stdout

    sc.stop()
Example #20
    # Transfer partial results to step 2 on the root node
    data = comm.gather(nodeResults, MPI_ROOT)

    if rankId == MPI_ROOT:
        # Create an algorithm to compute a sparse variance-covariance matrix on the master node
        masterAlgorithm = covariance.Distributed(step2Master,
                                                 method=covariance.fastCSR)

        for i in range(nBlocks):
            # Deserialize partial results from step 1
            dataArch = OutputDataArchive(data[i])

            dataForStep2FromStep1 = covariance.PartialResult()

            dataForStep2FromStep1.deserialize(dataArch)

            # Set local partial results as input for the master-node algorithm
            masterAlgorithm.input.add(covariance.partialResults,
                                      dataForStep2FromStep1)

        # Merge and finalizeCompute a sparse variance-covariance matrix on the master node
        masterAlgorithm.compute()
        # Retrieve the algorithm results
        result = masterAlgorithm.finalizeCompute()

        # Print the results
        printNumericTable(result.get(covariance.covariance),
                          "Covariance matrix (upper left square 10*10) :", 10,
                          10)
        printNumericTable(result.get(covariance.mean), "Mean vector:", 1, 10)
Example #21
    result['mean'] = res.get(covariance.mean)

    return result


if __name__ == "__main__":

    # Create SparkContext that loads defaults from the system properties and the classpath and sets the name
    sc = SparkContext(conf=SparkConf().setAppName(
        "Spark covariance(dense)").setMaster("local[4]"))

    # Read from the distributed HDFS data set at a specified path
    dd = DistributedHDFSDataSet("/Spark/CovarianceDense/data/")
    dataRDD = dd.getAsPairRDD(sc)

    # Redirect stdout to a file for correctness verification
    stdout = sys.stdout
    sys.stdout = open('CovarianceDense.out', 'w')

    # Compute a dense variance-covariance matrix for dataRDD
    final_result = runCovariance(dataRDD)

    # Print the results
    printNumericTable(final_result['covariance'], "Covariance:", interval=9)
    printNumericTable(final_result['mean'], "Mean:", interval=9)

    # Restore stdout
    sys.stdout = stdout

    sc.stop()
Example #22
    # Serialize partial results required by step 2
    dataArch = InputDataArchive()
    pres.serialize(dataArch)

    nodeResults = dataArch.getArchiveAsArray()

    # Transfer partial results to step 2 on the root node
    serializedData = comm.gather(nodeResults)

    if rankId == MPI_ROOT:
        # Create an algorithm for principal component analysis using the SVD method on the master node
        masterAlgorithm = pca.Distributed(step2Master, method=pca.svdDense)

        for i in range(nBlocks):
            # Deserialize partial results from step 1
            dataArch = OutputDataArchive(serializedData[i])

            dataForStep2FromStep1 = pca.PartialResult(pca.svdDense)
            dataForStep2FromStep1.deserialize(dataArch)

            # Set local partial results as input for the master-node algorithm
            masterAlgorithm.input.add(pca.partialResults, dataForStep2FromStep1)

        # Merge and finalizeCompute PCA decomposition on the master node
        masterAlgorithm.compute()
        res = masterAlgorithm.finalizeCompute()

        # Print the results
        printNumericTable(res.get(pca.eigenvalues), "Eigenvalues:")
        printNumericTable(res.get(pca.eigenvectors), "Eigenvectors:")
Example #23
        'eigenvectors': res.get(pca.eigenvectors),
        'eigenvalues': res.get(pca.eigenvalues)
    }


if __name__ == "__main__":

    # Create SparkContext that loads defaults from the system properties and the classpath and sets the name
    sc = SparkContext(
        conf=SparkConf().setAppName("Spark PCA(COR)").setMaster('local[4]'))

    # Read from the distributed HDFS data set at a specified path
    dd = DistributedHDFSDataSet("/Spark/PcaCorCSR/data/")
    dataRDD = dd.getCSRAsPairRDD(sc)

    # Compute PCA decomposition for dataRDD using the correlation method
    result = runPCA(dataRDD)

    # Redirect stdout to a file for correctness verification
    stdout = sys.stdout
    sys.stdout = open('PcaCorCSR.out', 'w')

    # Print the results
    printNumericTable(result['eigenvalues'], "Eigenvalues:")
    printNumericTable(result['eigenvectors'], "Eigenvectors:")

    # Restore stdout
    sys.stdout = stdout

    sc.stop()
Example #24
    # Create SparkContext that loads defaults from the system properties and the classpath and sets the name
    sc = SparkContext(
        conf=SparkConf().setAppName('Spark SVD').setMaster('local[4]'))

    # Read from the distributed HDFS data set at a specified path
    dd = DistributedHDFSDataSet("/Spark/Svd/data/")
    dataRDD = dd.getAsPairRDD(sc)

    # Compute SVD decomposition for dataRDD
    res = runSVD(dataRDD)

    # Redirect stdout to a file for correctness verification
    stdout = sys.stdout
    sys.stdout = open('Svd.out', 'w')

    # Print the results
    ntRPList = res['U'].collect()

    for num, table in ntRPList:
        deserialized_table = deserializeNumericTable(table)
        printNumericTable(deserialized_table,
                          "U (2 first vectors from node #{}):".format(num), 2)

    printNumericTable(res['Sigma'], "Sigma:")
    printNumericTable(res['V'], "V:")

    # Restore stdout
    sys.stdout = stdout

    sc.stop()
Example #25
    Ui = res.get(svd.leftSingularMatrix)


if __name__ == "__main__":

    comm = MPI.COMM_WORLD
    comm_size = comm.Get_size()
    rankId = comm.Get_rank()

    if nBlocks != comm_size:
        if rankId == MPI_ROOT:
            frmt = "{} MPI ranks != {} datasets available, so please start exactly {} ranks."
            print(frmt.format(comm_size, nBlocks, nBlocks))
        sys.exit(0)

    computestep1Local()

    if rankId == MPI_ROOT:
        computeOnMasterNode()

    finalizeComputestep1Local()

    # Print the results
    if rankId == MPI_ROOT:
        printNumericTable(Sigma, "Singular values:")
        printNumericTable(V, "Right orthogonal matrix V:")
        printNumericTable(Ui,
                          "Part of left orthogonal matrix U from root node:",
                          10)
Example #26

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

data = load_boston()
X = data.data
Y = data.target
x_train, x_test, y_train_temp, y_test_temp = train_test_split(X,
                                                              Y,
                                                              test_size=0.40,
                                                              random_state=42)
y_train = y_train_temp[:, np.newaxis]
y_test = y_test_temp[:, np.newaxis]

trainData = HomogenNumericTable(x_train)
trainDependentVariables = HomogenNumericTable(y_train)
testData = HomogenNumericTable(x_test)
testGroundTruth = HomogenNumericTable(y_test)
# Instantiate Ridge Regression object
ridge = RidgeRegression(ridgeParameters=0.0005)
# Training
trainingResult = ridge.training(trainData, trainDependentVariables)
# Prediction
pred_nT = ridge.predict(trainingResult, testData)
# Serialize
ridge.serialize(trainingResult, fileName='RR.npy')
# Deserialize
de_trainingResult = ridge.deserialize(fileName="RR.npy")
# Print predicted responses and actual responses
printNumericTable(pred_nT,
                  "Ridge Regression prediction results: (first 10 rows):", 10)
printNumericTable(testGroundTruth, "Ground truth (first 10 rows):", 10)
Example #27

            # Deserialize partial results from step 1
            dataArch = OutputDataArchive(serializedData[i])

            dataForStep2FromStep1 = low_order_moments.PartialResult()

            dataForStep2FromStep1.deserialize(dataArch)

            # Set local partial results as input for the master-node algorithm
            masterAlgorithm.input.add(low_order_moments.partialResults,
                                      dataForStep2FromStep1)

        # Merge and finalizeCompute low order moments on the master node
        masterAlgorithm.compute()
        res = masterAlgorithm.finalizeCompute()

        # Print the results
        printNumericTable(res.get(low_order_moments.minimum), "Minimum:")
        printNumericTable(res.get(low_order_moments.maximum), "Maximum:")
        printNumericTable(res.get(low_order_moments.sum), "Sum:")
        printNumericTable(res.get(low_order_moments.sumSquares),
                          "Sum of squares:")
        printNumericTable(res.get(low_order_moments.sumSquaresCentered),
                          "Sum of squared difference from the means:")
        printNumericTable(res.get(low_order_moments.mean), "Mean:")
        printNumericTable(res.get(low_order_moments.secondOrderRawMoment),
                          "Second order raw moment:")
        printNumericTable(res.get(low_order_moments.variance), "Variance:")
        printNumericTable(res.get(low_order_moments.standardDeviation),
                          "Standard deviation:")
        printNumericTable(res.get(low_order_moments.variation), "Variation:")
Example #28
    nodeResults = dataArch.getArchiveAsArray()

    # Transfer partial results to step 2 on the root node
    data = comm.gather(nodeResults, MPI_ROOT)

    if rankId == MPI_ROOT:

        # Create an algorithm to compute a variance-covariance matrix on the master node
        masterAlgorithm = covariance.Distributed(step2Master)

        for i in range(nBlocks):

            # Deserialize partial results from step 1
            dataArch = OutputDataArchive(data[i])

            dataForStep2FromStep1 = covariance.PartialResult()

            dataForStep2FromStep1.deserialize(dataArch)

            # Set local partial results as input for the master-node algorithm
            masterAlgorithm.input.add(covariance.partialResults, dataForStep2FromStep1)

        # Merge and finalizeCompute a dense variance-covariance matrix on the master node
        masterAlgorithm.compute()
        result = masterAlgorithm.finalizeCompute()

        # Print the results
        printNumericTable(result.get(covariance.covariance), "Covariance matrix:")
        printNumericTable(result.get(covariance.mean),       "Mean vector:")
Example #29

trainData = HomogenNumericTable(seeded.rand(200, nFeatures))
trainDependentVariables = HomogenNumericTable(
    seeded.rand(200, nDependentVariables))
testData = HomogenNumericTable(seeded.rand(50, nFeatures))
testGroundTruth = HomogenNumericTable(seeded.rand(50, nDependentVariables))

# Instantiate Linear Regression object
lr = LinearRegression()
# Training
trainingResult = lr.training(trainData, trainDependentVariables)
# Prediction
pred_array = lr.predict(trainingResult, trainData)
# Serialize
lr.serialize(trainingResult, fileName='trainingResult.npy')
# Deserialize
de_trainingResult = lr.deserialize(fileName="trainingResult.npy")
# Predict with metrics
predRes, predResRed, singleBeta, groupBeta = lr.predictWithQualityMetrics(
    trainingResult,
    trainData,
    trainDependentVariables,
    reducedBetaIndex=[2, 10])
# Print metrics results
lr.printAllQualityMetrics(singleBeta, groupBeta)
# Print predicted responses and actual responses
printNumericTable(predRes,
                  "Linear Regression prediction results: (first 10 rows):", 10)
printNumericTable(predResRed,
                  "Linear Regression prediction results (reduced model): (first 10 rows):", 10)
printNumericTable(trainDependentVariables, "Ground truth (first 10 rows):", 10)
Example #30
    algorithm.compute()
    res = algorithm.finalizeCompute()

    Qi = res.get(qr.matrixQ)


if __name__ == "__main__":

    comm = MPI.COMM_WORLD
    comm_size = comm.Get_size()
    rankId = comm.Get_rank()

    if nBlocks != comm_size:
        if rankId == MPI_ROOT:
            print(
                "{} MPI ranks != {} datasets available, so please start exactly {} ranks."
                .format(comm_size, nBlocks, nBlocks))
        sys.exit(0)

    computestep1Local()

    if rankId == MPI_ROOT:
        computeOnMasterNode()

    finalizeComputestep1Local()

    # Print the results
    if rankId == MPI_ROOT:
        printNumericTable(Qi, "Part of orthogonal matrix Q from 1st node:", 10)
        printNumericTable(R, "Triangular matrix R:")