Exemplo n.º 1
0
        # compute
        result = pca_alg.compute()
        self.eigenvalues_ = result.get(pca.eigenvalues)
        self.eigenvectors_ = result.get(pca.eigenvectors)


# Demo script: run the PCA wrapper on "cs-data.csv" twice -- once on the data
# table itself and once on the covariance matrix of the same data.

# Load columns 1..10 of the CSV as doubles, skipping the single header line.
data = np.genfromtxt("cs-data.csv",
                     delimiter=',',
                     dtype=np.double,
                     skip_header=1,
                     usecols=list(range(1, 11)))
# Drop every row that contains at least one NaN.
data = data[~np.isnan(data).any(axis=1)]
# `scale` comes from outside this excerpt -- presumably column-wise
# standardization (e.g. sklearn.preprocessing.scale); TODO confirm.
data = scale(data)

# Wrap the array in a numeric table and print its row and column counts.
data_nt = HomogenNumericTable(data)
print(data_nt.getNumberOfRows(), data_nt.getNumberOfColumns())

# PCA via SVD: the wrapper is run directly on the data table.
pr = PCA(method='svd')
pr.compute(data_nt)
loadings = getArrayFromNT(pr.eigenvectors_)
ev = getArrayFromNT(pr.eigenvalues_)
# Print each eigenvalue as a fraction of the sum of all eigenvalues.
print(ev / np.sum(ev))

# PCA via covariances: np.cov treats rows as variables, so the data is
# transposed to make every data column a variable.
# NOTE(review): a covariance matrix is passed together with
# method='correlation'; how PCA interprets its input is defined outside this
# excerpt -- confirm that it expects a precomputed matrix here.
cov_data = np.cov(data.transpose())
cov_nt = HomogenNumericTable(cov_data)
pr = PCA(method='correlation')
pr.compute(cov_nt)
# These assignments overwrite the SVD results stored above; nothing is printed
# for this second run within the visible excerpt.
loadings = getArrayFromNT(pr.eigenvectors_)
ev = getArrayFromNT(pr.eigenvalues_)
Exemplo n.º 2
0
def testModel():
    thresholdValues = np.linspace(-25.0, 25.0, num=101)
    numberOfCorrectlyClassifiedObjects = np.zeros(len(thresholdValues))
    numberOfObjectsInTestFiles = 0
    numberOfNonzeroObjectsInTestFiles = 0
    for filenameIndex in range(0, len(testDatasetFileNames)):
        testDataSource = FileDataSource(
            testDatasetFileNames[filenameIndex],
            DataSourceIface.doAllocateNumericTable,
            DataSourceIface.doDictionaryFromContext)
        testData = HomogenNumericTable(nFeatures, 0,
                                       NumericTableIface.notAllocate)
        testGroundTruth = HomogenNumericTable(nDependentVariables, 0,
                                              NumericTableIface.notAllocate)
        mergedData = MergedNumericTable(testData, testGroundTruth)
        testDataSource.loadDataBlock(mergedData)

        algorithm = prediction.Batch_Float64DefaultDense()
        algorithm.input.setNumericTableInput(prediction.data, testData)
        algorithm.input.setModelInput(prediction.model,
                                      trainingResult.get(training.model))
        predictionResult = algorithm.compute()

        block1 = BlockDescriptor()
        block2 = BlockDescriptor()
        testGroundTruth.getBlockOfRows(0, testGroundTruth.getNumberOfRows(),
                                       readOnly, block1)
        predictionResult.get(prediction.prediction).getBlockOfRows(
            0, testGroundTruth.getNumberOfRows(), readOnly, block2)
        y_true = getClassVector(block1.getArray(), 0.000000000000)
        predictionRegression = block2.getArray()
        for thresholdIndex in range(0, len(thresholdValues)):
            y_pred = getClassVector(predictionRegression,
                                    thresholdValues[thresholdIndex])
            numberOfCorrectlyClassifiedObjects[
                thresholdIndex] += accuracy_score(y_true,
                                                  y_pred,
                                                  normalize=False)
        numberOfObjectsInTestFiles += len(y_true)
        numberOfNonzeroObjectsInTestFiles += np.count_nonzero(y_true)
        mergedData.freeDataMemory()
        testData.freeDataMemory()
        testGroundTruth.freeDataMemory()

    classificationAccuracyResult = np.zeros(len(thresholdValues))
    best_threshold = None
    best_accuracy = -1
    for thresholdIndex in range(0, len(thresholdValues)):
        classificationAccuracyResult[
            thresholdIndex] = numberOfCorrectlyClassifiedObjects[
                thresholdIndex] / numberOfObjectsInTestFiles
        if (classificationAccuracyResult[thresholdIndex] > best_accuracy):
            best_threshold = thresholdValues[thresholdIndex]
            best_accuracy = classificationAccuracyResult[thresholdIndex]
    print('Best threshold:{:.4f}. Best accuracy:{:.4f}'.format(
        best_threshold, best_accuracy))
    print(
        'Test set. Number of objects of 0 class:{:.4f}.Number of objects of 1 class:{:.4f}. '
        'Frequency of 1 class:{:.4f}'.format(
            numberOfObjectsInTestFiles - numberOfNonzeroObjectsInTestFiles,
            numberOfNonzeroObjectsInTestFiles,
            numberOfNonzeroObjectsInTestFiles / numberOfObjectsInTestFiles))
    indexOfZeroThreshold = np.where(thresholdValues == 0.0)[0][0]
    print('Threshold=0. Classification accuracy:{:.4f}'.format(
        classificationAccuracyResult[indexOfZeroThreshold]))