def __init__(self, numFolds, data, labels):
    """Cut ``data`` and ``labels`` into ``numFolds`` folds and reset state.

    Both inputs are split with ``array_split`` using the same fold count.
    The fold cursor starts at 0 and no test fold is selected yet, so the
    test-side attributes are initialised to ``None``.

    Args:
        numFolds: number of folds passed to ``array_split``.
        data: samples to partition.
        labels: labels to partition the same way as ``data``.
    """
    self._numFolds, self._curFold = numFolds, 0

    # Every fold starts out on the "train" side.
    self._trainData, self._testData = array_split(data, self._numFolds), None
    self._trainLabels, self._testLabels = array_split(labels, self._numFolds), None
def test_integer_split_2D_rows(self):
    """Split a two-row array along axis 0 into three pieces.

    Only two rows are available, so one of the three pieces is expected to
    be empty with shape ``(0, 10)``. The check is done for an integer
    section count and again for an explicit list of split indices; in both
    cases the last returned piece must have the same dtype type as the input.
    """
    row = np.arange(10)
    a = np.array([row, row])
    single_row = np.array([row])
    no_rows = np.zeros((0, 10))

    # Integer section count: the empty piece is expected last.
    res = array_split(a, 3, axis=0)
    compare_results(res, [single_row, single_row, no_rows])
    assert_(a.dtype.type is res[-1].dtype.type)

    # Same thing for manual splits: here the empty piece is expected first.
    res = array_split(a, [0, 1, 2], axis=0)
    compare_results(res, [no_rows, single_row, single_row])
    assert_(a.dtype.type is res[-1].dtype.type)
def crossValidation(numFolds, data, labels, algorithm, accuracyList,
                    learningCurveList, numLearningCurveIterations,
                    learningCurveIndexMod):
    """Evaluate ``algorithm`` once per fold, holding that fold out for testing.

    ``data`` and ``labels`` are each cut into ``numFolds`` pieces with
    ``array_split``. For every fold index, that piece becomes the test set
    and the remaining pieces are stacked (``vstack`` for data, ``hstack``
    for labels) into the training set.

    Args:
        numFolds: number of folds. With a single fold nothing is left for
            training, so ``vstack`` receives an empty list.
        data: samples to split into folds.
        labels: labels to split the same way as ``data``.
        algorithm: callable invoked as
            ``algorithm(trainData, trainLabels, testData, testLabels)``;
            its return value is appended to ``accuracyList``.
        accuracyList: list extended in place with one entry per fold.
        learningCurveList: forwarded unchanged to ``learningCurve``.
        numLearningCurveIterations: forwarded unchanged to ``learningCurve``.
        learningCurveIndexMod: forwarded unchanged to ``learningCurve``.

    Returns:
        None. Results are delivered through ``accuracyList`` and through
        whatever ``learningCurve`` does with ``learningCurveList``.
    """
    import sys

    dataFolds = array_split(data, numFolds)
    labelFolds = array_split(labels, numFolds)

    for testIndex in range(numFolds):
        # Progress marker. Written through sys.stdout so the function also
        # parses on Python 3; it emits the fold index followed by a space,
        # as the former ``print testIndex,`` statement did.
        sys.stdout.write("%s " % (testIndex,))

        testData = dataFolds[testIndex]
        testLabels = labelFolds[testIndex]

        # Build the training set from every fold except the held-out one,
        # without mutating the fold lists.
        trainData = vstack(dataFolds[:testIndex] + dataFolds[testIndex + 1:])
        trainLabels = hstack(labelFolds[:testIndex] + labelFolds[testIndex + 1:])

        accuracyList.append(
            algorithm(trainData, trainLabels, testData, testLabels))
        learningCurve(algorithm, learningCurveList, trainData, trainLabels,
                      testData, testLabels, numLearningCurveIterations,
                      learningCurveIndexMod)

    # Terminate the progress line.
    sys.stdout.write("\n")
def test_integer_split_2D_rows_greater_max_int32(self): a = np.broadcast_to([0], (1 << 32, 2)) res = array_split(a, 4) chunk = np.broadcast_to([0], (1 << 30, 2)) tgt = [chunk] * 4 for i in range(len(tgt)): assert_equal(res[i].shape, tgt[i].shape)
def test_two_dimensional_two_integer_remainder_split(self):
    """Split a 4x4 matrix with ``[3, 3]`` and expect nine sub-blocks.

    The expected pieces tile the matrix as a 3-by-3 grid of blocks listed
    row by row; the first block row and first block column are two
    elements thick, the others one.

    NOTE(review): the four-positional-argument call differs from NumPy's
    ``array_split(ary, indices_or_sections, axis)`` signature, so this
    presumably targets a project-specific ``array_split`` - confirm.
    """
    matrix = np.arange(16).reshape((4, 4))
    desired = [
        [[0, 1], [4, 5]], [[2], [6]], [[3], [7]],
        [[8, 9]], [[10]], [[11]],
        [[12, 13]], [[14]], [[15]],
    ]
    res = array_split(matrix, [3, 3], 0, True)
    compare_results(res, desired)
def test_integer_split_2D_cols(self):
    """Split a 2x10 array into three pieces along the last axis.

    The expected pieces cover columns 0-3, 4-6 and 7-9, each keeping both
    rows.
    """
    row = np.arange(10)
    a = np.array([row, row])

    # (start, stop) column ranges of the expected pieces.
    column_ranges = [(0, 4), (4, 7), (7, 10)]
    desired = [np.array([np.arange(lo, hi), np.arange(lo, hi)])
               for lo, hi in column_ranges]

    res = array_split(a, 3, axis=-1)
    compare_results(res, desired)
def test_index_split_simple(self):
    """Split ``arange(10)`` at explicit indices ``[1, 5, 7]``.

    The expected result is the four consecutive ranges delimited by the
    start of the array, the given indices, and the end of the array.
    """
    a = np.arange(10)
    indices = [1, 5, 7]

    # Piece boundaries: array start, the split indices, array end.
    edges = [0] + indices + [10]
    desired = [np.arange(lo, hi) for lo, hi in zip(edges[:-1], edges[1:])]

    res = array_split(a, indices, axis=-1)
    compare_results(res, desired)
def test_index_split_high_bound(self):
    """Split ``arange(10)`` at indices that touch and exceed its bounds.

    Index 0 is expected to yield an empty leading piece, and the indices
    10 and 12 (at and past the end) are expected to yield two empty
    trailing pieces.
    """
    a = np.arange(10)
    indices = [0, 5, 7, 10, 12]
    res = array_split(a, indices, axis=-1)

    desired = [np.array([])]
    desired.extend(np.arange(lo, hi) for lo, hi in ((0, 5), (5, 7), (7, 10)))
    desired.extend([np.array([]), np.array([])])
    compare_results(res, desired)
def test_integer_split_2D_default(self):
    """Split a 2x10 array into three pieces without naming an axis.

    This will fail if we change default axis: the expected pieces are two
    single-row arrays followed by an empty ``(0, 10)`` array, and the last
    piece must keep the input's dtype type.
    """
    row = np.arange(10)
    a = np.array([row, row])
    res = array_split(a, 3)

    single_row = np.array([row])
    tgt = [single_row, single_row, np.zeros((0, 10))]
    compare_results(res, tgt)
    assert_(a.dtype.type is res[-1].dtype.type)
# Driver: for every dataset, hold out the first third as an overall test set,
# then compare the algorithms on generic kernels and on an optimized kernel.
datasets = getDataSets(dataDir, ['ionosphere', 'iris', 'wine'])
#datasets = getDataSets(dataDir, ['by_hand'])
#datasets = getDataSets(dataDir, ['ionosphere'])

# Pool handed to both comparison routines below.
tp = ThreadPool(4)

# .items() (rather than the Python 2-only .iteritems()) and the function-call
# form of print keep this loop valid on both Python 2 and Python 3.
for name, (data, labels) in datasets.items():
    datasetOutDir = getDatasetOutDir(outDir, name)
    print("Computing on %s" % (name,))

    #do a split into overall test and overall train
    overallTestTrainRatio = 1.0 / 3.0
    overallTestTrainSplitIndex = array([int(overallTestTrainRatio * len(data))])
    # Data and labels must be cut at the same point, so both calls receive
    # the same 1-D index array. The data split used to wrap this array in a
    # second array(), which turned the split point into a 1-element array
    # instead of a scalar and was inconsistent with the labels split.
    overallTestData, overallTrainData = array_split(data, overallTestTrainSplitIndex)
    overallTestLabels, overallTrainLabels = array_split(labels, overallTestTrainSplitIndex)

    #test a whole bunch of generic kernels on the overall split data
    fileIdentifier = 'overall'
    #print "train:", overallTrainData
    #print "test:", overallTestData
    compareAlgorithmsOnSameKernels(tp, overallTrainData, overallTrainLabels,
                                   overallTestData, overallTestLabels,
                                   name, fileIdentifier)

    #now, try to find an optimal kernel for either svm or kfd
    #do it for each kernel type
    numOptimizationFolds = 3
    fileIdentifier = 'optimized'
    compareAlgorithmsOnOptimizedKernel(tp, overallTrainData, overallTrainLabels,
                                       overallTestData, overallTestLabels,
                                       numOptimizationFolds, datasetOutDir,
                                       name, fileIdentifier)
#remove ignored columns and class column, in reverse sorted order #do it in reverse sorted order so the indexes stay correct for removeCol in sorted(ignoreColList + [classCol], reverse=True): if removeCol == classCol: label = features.pop(removeCol) else: features.pop(removeCol) datasetDict[label].append(features) origDataFile.close() #make it into a 2 class problem by lumping classes together #don't have rhyme or reason - don't want to favor one class or another, or make our data artificially clean numOrigClasses = len(datasetDict.keys()) #split into 2, possibly unequal, groups of class labels newClassMap = array_split(datasetDict.keys(), 2) #reorganize the data dataWithNewLabelMap = defaultdict(list) for newClassLabel, oldClassLabelList in enumerate(newClassMap): for oldClassLabel in oldClassLabelList: for featureRow in datasetDict[oldClassLabel]: dataWithNewLabelMap[newClassLabel].append(featureRow) #make the two datasets the same size dataWithNewLabelTupleList = [] minClassSize = min([len(x) for x in dataWithNewLabelMap.values()]) for newClassLabel, featureRowList in dataWithNewLabelMap.iteritems(): for featureRow in featureRowList[:minClassSize]: dataWithNewLabelTupleList.append((featureRow, newClassLabel))
def test_integer_split(self):
    """Split ``arange(10)`` into 1 through 11 sections and check each result.

    For section counts up to 10 the expected pieces are consecutive ranges,
    with the larger pieces coming first when 10 is not evenly divisible.
    Asking for 11 sections is expected to give ten single-element pieces
    followed by one empty array.
    """
    a = np.arange(10)

    # Section count -> boundaries of the expected consecutive pieces.
    expected_edges = [
        (1, [0, 10]),
        (2, [0, 5, 10]),
        (3, [0, 4, 7, 10]),
        (4, [0, 3, 6, 8, 10]),
        (5, [0, 2, 4, 6, 8, 10]),
        (6, [0, 2, 4, 6, 8, 9, 10]),
        (7, [0, 2, 4, 6, 7, 8, 9, 10]),
        (8, [0, 2, 4, 5, 6, 7, 8, 9, 10]),
        (9, [0, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
        (10, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
    ]
    for sections, edges in expected_edges:
        res = array_split(a, sections)
        desired = [np.arange(lo, hi) for lo, hi in zip(edges[:-1], edges[1:])]
        compare_results(res, desired)

    # More sections than elements: the surplus section comes back empty.
    res = array_split(a, 11)
    desired = [np.arange(start, start + 1) for start in range(10)]
    desired.append(np.array([]))
    compare_results(res, desired)