class SampleTrainValTestSet(unittest.TestCase):
    """Exercise ``Sampler.custom_distribution`` on a skewed CSV fixture.

    Each test gets a fresh ``Sampler`` built from the remainder of a
    ``BlockDesigner`` that was constructed from ``create_skewed_CSV()``.
    """

    def setUp(self):
        f, self.true_proportions = create_skewed_CSV()
        # K is the number of entries in true_proportions; it is handed to
        # BlockDesigner and reused below as the expected class count.
        self.K = len(self.true_proportions)
        self.bd = BlockDesigner(f, self.K)
        self.samp = Sampler(self.bd.remainder())

    def _group_by_class(self, X, y):
        """Return a dict mapping every label in *y* to its samples of *X*.

        ``X[i]`` is filed under ``y[i]``; each label found in *y* gets a
        list, created before any sample is appended.
        """
        collect = {}
        for k in set(y):
            collect[k] = []
        for i, klass in enumerate(y):
            collect[klass].append(X[i])
        return collect

    def _assert_proportions_close(self, collect, expected, n_classes):
        """Assert that *n_classes* proportions of *collect* match *expected*.

        A proportion matches when its absolute difference from the expected
        value is below ``PROPORTION_ERROR_MARGIN``.
        """
        within_margin = (
            abs(get_proportions(collect) - expected) < PROPORTION_ERROR_MARGIN)
        # Count the matching entries instead of using all(): the original
        # check required exactly n_classes entries to be within the margin.
        self.assertEqual(sum(within_margin), n_classes)

    def test_all_classes(self):
        # For every value in range(bd.K) passed as first argument, samples
        # and labels must pair up and fill a whole number of 128-item units.
        for test_klass in range(self.bd.K):
            X, y = self.samp.custom_distribution(test_klass, 128)
            self.assertEqual(len(y), len(X))
            self.assertEqual(len(X) % 128, 0)

    def test_cycles_through_all_data(self):
        # Two consecutive draws together must contain as many distinct
        # samples as the sum of ACTUAL_TRAIN_DR_PROPORTIONS.
        X, _ = self.samp.custom_distribution(0, 128)
        X2, _ = self.samp.custom_distribution(0, 128)
        self.assertEqual(len(set(X + X2)), sum(ACTUAL_TRAIN_DR_PROPORTIONS))

    def test_custom_distribution(self):
        # Draw with an explicit per-class distribution and compare the
        # resulting proportions with the fixture's true proportions.
        X, y = self.samp.custom_distribution(0, 128, [94, 9, 19, 3, 3])
        collect = self._group_by_class(X, y)
        self._assert_proportions_close(
            collect, self.true_proportions, self.K)

    def test_skipping_classes(self):
        # Zero entries in the distribution: only two classes are expected
        # in the result, each with a proportion close to 0.5.
        X, y = self.samp.custom_distribution(0, 128, [64, 64, 0, 0, 0])
        collect = self._group_by_class(X, y)
        self._assert_proportions_close(collect, numpy.array([0.5, 0.5]), 2)
class SampleTrainValTestSet(unittest.TestCase):
    """Exercise ``Sampler.custom_distribution`` on a skewed CSV fixture.

    Each test gets a fresh ``Sampler`` built from the remainder of a
    ``BlockDesigner`` that was constructed from ``create_skewed_CSV()``.
    """

    def setUp(self):
        f, self.true_proportions = create_skewed_CSV()
        # K is the number of entries in true_proportions; it is handed to
        # BlockDesigner and reused below as the expected class count.
        self.K = len(self.true_proportions)
        self.bd = BlockDesigner(f, self.K)
        self.samp = Sampler(self.bd.remainder())

    def _group_by_class(self, X, y):
        """Return a dict mapping every label in *y* to its samples of *X*.

        ``X[i]`` is filed under ``y[i]``; each label found in *y* gets a
        list, created before any sample is appended.
        """
        collect = {}
        for k in set(y):
            collect[k] = []
        for i, klass in enumerate(y):
            collect[klass].append(X[i])
        return collect

    def _assert_proportions_close(self, collect, expected, n_classes):
        """Assert that *n_classes* proportions of *collect* match *expected*.

        A proportion matches when its absolute difference from the expected
        value is below ``PROPORTION_ERROR_MARGIN``.
        """
        within_margin = (
            abs(get_proportions(collect) - expected) < PROPORTION_ERROR_MARGIN)
        # Count the matching entries instead of using all(): the original
        # check required exactly n_classes entries to be within the margin.
        self.assertEqual(sum(within_margin), n_classes)

    def test_all_classes(self):
        # For every value in range(bd.K) passed as first argument, samples
        # and labels must pair up and fill a whole number of 128-item units.
        for test_klass in range(self.bd.K):
            X, y = self.samp.custom_distribution(test_klass, 128)
            self.assertEqual(len(y), len(X))
            self.assertEqual(len(X) % 128, 0)

    def test_cycles_through_all_data(self):
        # Two consecutive draws together must contain as many distinct
        # samples as the sum of ACTUAL_TRAIN_DR_PROPORTIONS.
        X, _ = self.samp.custom_distribution(0, 128)
        X2, _ = self.samp.custom_distribution(0, 128)
        self.assertEqual(len(set(X + X2)), sum(ACTUAL_TRAIN_DR_PROPORTIONS))

    def test_custom_distribution(self):
        # Draw with an explicit per-class distribution and compare the
        # resulting proportions with the fixture's true proportions.
        X, y = self.samp.custom_distribution(0, 128, [94, 9, 19, 3, 3])
        collect = self._group_by_class(X, y)
        self._assert_proportions_close(
            collect, self.true_proportions, self.K)

    def test_skipping_classes(self):
        # Zero entries in the distribution: only two classes are expected
        # in the result, each with a proportion close to 0.5.
        X, y = self.samp.custom_distribution(0, 128, [64, 64, 0, 0, 0])
        collect = self._group_by_class(X, y)
        self._assert_proportions_close(collect, numpy.array([0.5, 0.5]), 2)
class CreateTrainValTestSet(unittest.TestCase):
    """Check how ``BlockDesigner`` splits a skewed CSV fixture into blocks.

    Each test starts from a fresh ``BlockDesigner`` built from
    ``create_skewed_CSV()`` and verifies block sizes and class proportions.
    """

    def setUp(self):
        f, self.true_proportions = create_skewed_CSV()
        # K is the number of entries in true_proportions; it is handed to
        # BlockDesigner and used as the class count throughout.
        self.K = len(self.true_proportions)
        self.bd = BlockDesigner(f, self.K)

    def get_counts(self, dataset):
        """Return the per-class sample counts of *dataset* as a numpy array.

        *dataset* is indexed by the integer class ids ``0 .. K - 1``; the
        counts are listed from class ``K - 1`` down to class ``0``.
        """
        # range() rather than the Python-2-only xrange(): same result on
        # Python 2, and no NameError on Python 3.
        return numpy.array(
            [len(dataset[klass]) for klass in reversed(range(self.K))])

    def _ideal_batch_counts(self):
        """Return the expected per-class counts of one 128-sample batch."""
        # int() truncates, so these counts may add up to less than 128.
        return numpy.array([int(128 * p) for p in self.true_proportions])

    def _assert_batch_balanced(self, batch, ideal_counts):
        """Assert *batch* has 128 samples distributed close to the ideal."""
        counts = self.get_counts(batch)
        self.assertEqual(sum(counts), 128)
        self.assertLess(
            sum(abs(counts - ideal_counts)), SAMPLE_COUNT_ERROR_MARGIN)

    def _assert_proportions_match(self, dataset):
        """Assert all K class proportions of *dataset* match the fixture."""
        deviation = abs(get_proportions(dataset) - self.true_proportions)
        # Exactly K entries have to be within the margin.
        self.assertEqual(sum(deviation < PROPORTION_ERROR_MARGIN), self.K)

    def test_instantiating_and_splitting_multiple_times(self):
        # Break the validation block off first; its return value is not
        # needed here, only the remainder left in self.bd.
        self.bd.break_off_block(4864)
        train_dataset = self.bd.remainder()
        train_batches_to_take = self.bd.size() // 128
        # Two independent designers over the same training data.
        bd2 = BlockDesigner(train_dataset)
        batches2 = bd2.break_off_multiple_blocks(train_batches_to_take, 128)
        bd3 = BlockDesigner(train_dataset)
        batches3 = bd3.break_off_multiple_blocks(train_batches_to_take, 128)
        ideal_counts = self._ideal_batch_counts()
        # Index-based access is kept on purpose: the container type returned
        # by break_off_multiple_blocks is not visible from this file.
        for i in range(len(batches2)):
            self._assert_batch_balanced(batches2[i], ideal_counts)
            self._assert_batch_balanced(batches3[i], ideal_counts)

    def test_small_blocks_for_consistency(self):
        valid_dataset = self.bd.break_off_block(4864)
        bd2 = BlockDesigner(valid_dataset)
        batches = bd2.break_off_multiple_blocks(4864 // 128, 128)
        ideal_counts = self._ideal_batch_counts()
        # Every sample of the validation block must have been handed out.
        self.assertEqual(bd2.size(), 0)
        for i in range(len(batches)):
            self._assert_batch_balanced(batches[i], ideal_counts)

    def test_no_test_set(self):
        valid_dataset = self.bd.break_off_block(4864)
        train_dataset = self.bd.remainder()
        # The two parts together must account for the initial size.
        total = sum(
            self.get_counts(valid_dataset) + self.get_counts(train_dataset))
        self.assertEqual(total, self.bd.init_size)
        self._assert_proportions_match(valid_dataset)
        self._assert_proportions_match(train_dataset)

    def test_all_sets(self):
        test_dataset = self.bd.break_off_block(1024)
        valid_dataset = self.bd.break_off_block(4864)
        train_dataset = self.bd.remainder()
        # The three parts together must account for the initial size.
        total = sum(
            self.get_counts(test_dataset)
            + self.get_counts(valid_dataset)
            + self.get_counts(train_dataset))
        self.assertEqual(total, self.bd.init_size)
        self._assert_proportions_match(test_dataset)
        self._assert_proportions_match(valid_dataset)
        self._assert_proportions_match(train_dataset)
class CreateTrainValTestSet(unittest.TestCase):
    """Check how ``BlockDesigner`` splits a skewed CSV fixture into blocks.

    Each test starts from a fresh ``BlockDesigner`` built from
    ``create_skewed_CSV()`` and verifies block sizes and class proportions.
    """

    def setUp(self):
        f, self.true_proportions = create_skewed_CSV()
        # K is the number of entries in true_proportions; it is handed to
        # BlockDesigner and used as the class count throughout.
        self.K = len(self.true_proportions)
        self.bd = BlockDesigner(f, self.K)

    def get_counts(self, dataset):
        """Return the per-class sample counts of *dataset* as a numpy array.

        *dataset* is indexed by the integer class ids ``0 .. K - 1``; the
        counts are listed from class ``K - 1`` down to class ``0``.
        """
        # range() rather than the Python-2-only xrange(): same result on
        # Python 2, and no NameError on Python 3.
        return numpy.array(
            [len(dataset[klass]) for klass in reversed(range(self.K))])

    def _ideal_batch_counts(self):
        """Return the expected per-class counts of one 128-sample batch."""
        # int() truncates, so these counts may add up to less than 128.
        return numpy.array([int(128 * p) for p in self.true_proportions])

    def _assert_batch_balanced(self, batch, ideal_counts):
        """Assert *batch* has 128 samples distributed close to the ideal."""
        counts = self.get_counts(batch)
        self.assertEqual(sum(counts), 128)
        self.assertLess(
            sum(abs(counts - ideal_counts)), SAMPLE_COUNT_ERROR_MARGIN)

    def _assert_proportions_match(self, dataset):
        """Assert all K class proportions of *dataset* match the fixture."""
        deviation = abs(get_proportions(dataset) - self.true_proportions)
        # Exactly K entries have to be within the margin.
        self.assertEqual(sum(deviation < PROPORTION_ERROR_MARGIN), self.K)

    def test_instantiating_and_splitting_multiple_times(self):
        # Break the validation block off first; its return value is not
        # needed here, only the remainder left in self.bd.
        self.bd.break_off_block(4864)
        train_dataset = self.bd.remainder()
        train_batches_to_take = self.bd.size() // 128
        # Two independent designers over the same training data.
        bd2 = BlockDesigner(train_dataset)
        batches2 = bd2.break_off_multiple_blocks(train_batches_to_take, 128)
        bd3 = BlockDesigner(train_dataset)
        batches3 = bd3.break_off_multiple_blocks(train_batches_to_take, 128)
        ideal_counts = self._ideal_batch_counts()
        # Index-based access is kept on purpose: the container type returned
        # by break_off_multiple_blocks is not visible from this file.
        for i in range(len(batches2)):
            self._assert_batch_balanced(batches2[i], ideal_counts)
            self._assert_batch_balanced(batches3[i], ideal_counts)

    def test_small_blocks_for_consistency(self):
        valid_dataset = self.bd.break_off_block(4864)
        bd2 = BlockDesigner(valid_dataset)
        batches = bd2.break_off_multiple_blocks(4864 // 128, 128)
        ideal_counts = self._ideal_batch_counts()
        # Every sample of the validation block must have been handed out.
        self.assertEqual(bd2.size(), 0)
        for i in range(len(batches)):
            self._assert_batch_balanced(batches[i], ideal_counts)

    def test_no_test_set(self):
        valid_dataset = self.bd.break_off_block(4864)
        train_dataset = self.bd.remainder()
        # The two parts together must account for the initial size.
        total = sum(
            self.get_counts(valid_dataset) + self.get_counts(train_dataset))
        self.assertEqual(total, self.bd.init_size)
        self._assert_proportions_match(valid_dataset)
        self._assert_proportions_match(train_dataset)

    def test_all_sets(self):
        test_dataset = self.bd.break_off_block(1024)
        valid_dataset = self.bd.break_off_block(4864)
        train_dataset = self.bd.remainder()
        # The three parts together must account for the initial size.
        total = sum(
            self.get_counts(test_dataset)
            + self.get_counts(valid_dataset)
            + self.get_counts(train_dataset))
        self.assertEqual(total, self.bd.init_size)
        self._assert_proportions_match(test_dataset)
        self._assert_proportions_match(valid_dataset)
        self._assert_proportions_match(train_dataset)