def load_dataset_into_batches(file_dir_path: str, subset: Subset, subset_size: int, shuffle: bool = False):
    """Wrap one subset of the pets dataset in a batch generator.

    Each sample is run through ``ops.vectorize()`` followed by a cast to
    ``np.float32`` before it is handed out in a batch.

    Args:
        file_dir_path: Directory passed to ``PetsDataset``.
        subset: The dataset subset to load.
        subset_size: Passed to ``BatchGenerator`` as its second positional
            argument (used as the batch size elsewhere in this code base).
        shuffle: Passed to ``BatchGenerator`` as its third positional argument.

    Returns:
        The ``BatchGenerator`` built over the requested subset.
    """
    to_float_vector = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    return BatchGenerator(
        PetsDataset(file_dir_path, subset),
        subset_size,
        shuffle,
        to_float_vector,
    )
def main():
    """Sanity-check ``BatchGenerator`` on the training subset and print samples.

    Verifies the number of batches for two batch sizes, the shapes of all
    but the 16th batch, and the data/label dtypes of every batch. Prints the
    first sample of the unshuffled generator once and the first sample of a
    shuffled generator five times.
    """
    training_set = PetsDataset("/home/helmuth/dlvc/cifar-10-batches-py", Subset.TRAINING)

    # Flatten each sample, cast to float32, then shift and scale by 127.5.
    preprocess = ops.chain([
        ops.vectorize(),
        ops.type_cast(np.float32),
        ops.add(-127.5),
        ops.mul(1/127.5),
    ])

    # Generator #1: the whole subset as a single batch, no preprocessing.
    single_batch_gen = BatchGenerator(training_set, len(training_set), False)
    assert len(single_batch_gen) == 1

    # Generator #2: batches of 500, in order, with preprocessing.
    ordered_gen = BatchGenerator(training_set, 500, False, preprocess)
    assert len(ordered_gen) == 16

    for position, batch in enumerate(ordered_gen, start=1):
        # The 16th (last) batch is exempt from the shape checks.
        if position < 16:
            assert batch.data.shape == (500, 3072)
            assert batch.labels.shape == (500,)
        assert batch.data.dtype == np.float32
        assert np.issubdtype(batch.labels.dtype, np.integer)
        if position == 1:
            print("First batch, first sample, not shuffled")
            print(batch.data[0])

    # Generator #3: same as #2 but shuffled.
    shuffled_gen = BatchGenerator(training_set, 500, True, preprocess)

    # Start a fresh iteration five times and show the first sample each time.
    for _ in range(5):
        fresh_pass = iter(shuffled_gen)
        print("First batch, first sample, shuffled")
        print(next(fresh_pass).data[0])
def test_data_transformation(self):
    """Check that vectorize + float32 cast produce flat float32 samples.

    With a batch size of 100 the training subset must yield 80 batches, and
    the first sample of the first batch must have shape ``(3072,)``.
    """
    to_float_vector = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    data_path = os.path.join(os.getcwd(), self._data_dir)
    training_set = PetsDataset(data_path, Subset.TRAINING)
    generator = BatchGenerator(training_set, 100, False, to_float_vector)

    self.assertEqual(len(generator), 80)

    first_batch = next(iter(generator))
    self.assertEqual(first_batch.data[0].shape, (3072, ))
    self.assertTrue(np.issubdtype(first_batch.data.dtype, np.float32))
def test_train_with_wrong_type_of_labels(self):
    """``KnnClassifier.train`` must raise ``TypeError`` for a plain-list label argument."""
    to_float_vector = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    data_path = os.path.join(os.getcwd(), self._data_dir)
    training_set = PetsDataset(data_path, Subset.TRAINING)

    # A batch size of 7959 is asserted elsewhere in this suite to give one batch.
    generator = BatchGenerator(training_set, 7959, False, to_float_vector)
    first_batch = next(iter(generator))

    classifier = KnnClassifier(10, 3072, 2)
    samples = first_batch.data
    with self.assertRaises(TypeError):
        classifier.train(samples, [0, 1, 0])
def test_train_with_proper_data(self):
    """Training on a well-formed batch must complete without raising.

    There is no explicit assertion: the test passes as long as
    ``KnnClassifier.train`` accepts the batch data and labels.
    """
    to_float_vector = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    data_path = os.path.join(os.getcwd(), self._data_dir)
    training_set = PetsDataset(data_path, Subset.TRAINING)

    generator = BatchGenerator(training_set, 7959, False, to_float_vector)
    first_batch = next(iter(generator))

    knn = KnnClassifier(10, 3072, 2)
    knn.train(first_batch.data, first_batch.label)
def test_train_wrong_vector_size_in_data(self):
    """``KnnClassifier.train`` must raise ``RuntimeError`` when a feature column is missing."""
    to_float_vector = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    data_path = os.path.join(os.getcwd(), self._data_dir)
    training_set = PetsDataset(data_path, Subset.TRAINING)

    generator = BatchGenerator(training_set, 7959, False, to_float_vector)
    first_batch = next(iter(generator))

    knn = KnnClassifier(10, 3072, 2)

    # Drop column 100 so each sample is one element shorter than the
    # 3072 passed to the classifier constructor.
    truncated = np.delete(first_batch.data, 100, axis=1)
    labels = first_batch.label
    with self.assertRaises(RuntimeError):
        knn.train(truncated, labels)
def test_correctness_of_data_for_train(self):
    """Check batch counts and the leading values of the first training sample.

    A batch size of 7959 must give exactly one batch and a batch size of
    500 must give 16. The first five values of the first sample in the
    first 500-sample batch are compared against known reference values.
    """
    op = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    dataset = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)

    one_batch_gen = BatchGenerator(dataset, 7959, False, op)
    self.assertEqual(len(one_batch_gen), 1)

    many_batch_gen = BatchGenerator(dataset, 500, False, op)
    self.assertEqual(len(many_batch_gen), 16)

    reference = [116., 125., 125., 91., 101.]
    first_batch = next(iter(many_batch_gen))
    leading_values = first_batch.data[0][:len(reference)]

    # Guard against a too-short sample: without this check a truncated
    # vector would be compared against only part of the reference and
    # the test would pass silently.
    self.assertEqual(len(leading_values), len(reference))

    # Explicit loop instead of a comprehension used only for side effects.
    for actual, expected in zip(leading_values, reference):
        self.assertEqual(actual, expected)
def test_predict_with_proper_data(self):
    """Train on one training batch, predict on one validation batch.

    The prediction must contain one entry per validation sample (204), and
    the values of every entry must sum to 1 within floating-point tolerance.
    """
    op = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    data_path = os.path.join(os.getcwd(), self._data_dir)
    dataset_training = PetsDataset(data_path, Subset.TRAINING)
    dataset_valid = PetsDataset(data_path, Subset.VALIDATION)

    batch_gen_t = BatchGenerator(dataset_training, 795, False, op)
    batch_gen_v = BatchGenerator(dataset_valid, 204, False, op)

    # Only the first batch of each generator is used.
    train_batch = next(iter(batch_gen_t))
    valid_batch = next(iter(batch_gen_v))

    classifier = KnnClassifier(10, 3072, 2)
    classifier.train(train_batch.data, train_batch.label)
    results = classifier.predict(valid_batch.data)

    self.assertEqual(len(results), 204)
    for result in results:
        # Compare with a tolerance: an exact == on a floating-point sum
        # can fail because of rounding even when the scores are correct.
        self.assertAlmostEqual(float(np.sum(result)), 1.0, places=5)
expected = 1 assert num_of_batches == expected, "Number of batches is " + str( num_of_batches) + ", expected: " + str(expected) # The number of training batches is 16 if the batch size is set to 500 batch_generator = BatchGenerator(dataset_training, 500, False) num_of_batches = len(batch_generator) expected = 16 assert num_of_batches == expected, "Number of batches is " + str( num_of_batches) + ", expected: " + str(expected) # The data and label shapes are (500, 3072) and (500,), respectively, unless for the last batch batch_generator = BatchGenerator(dataset_training, 500, shuffle=False, op=vectorize()) last_batch_idx = len(batch_generator) - 1 batch_idx = 0 for batch in batch_generator: # skip last batch if batch_idx == last_batch_idx: continue assert batch.data.shape == (500, 3072), "Batch data shape: " + str( batch.data.shape) + ", expected: (500, 3072)." assert batch.labels.shape == (500, ), "Batch labels shape: " + str( batch.labels.shape) + ", expected: (500,)." batch_idx += 1 # The data type is always np.float32 and the label type is integral (one of the np.int and np.uint variants) # Implemented: for label type np.uint8 since there is less than 256 labels batch_generator = BatchGenerator(
import numpy as np dir = '/Users/mmatak/dev/college/DLVC/cifar-10/cifar-10-batches-py/' IMAGE_HEIGHT = 32 IMAGE_WIDTH = 32 NUM_CHANNELS = 3 NUM_CLASSES = 2 pets_training = PetsDataset(dir, Subset.TRAINING) pets_validation = PetsDataset(dir, Subset.VALIDATION) pets_test = PetsDataset(dir, Subset.TEST) batchGenerator_training = BatchGenerator(pets_training, len(pets_training), False, op=chain([type_cast(dtype=np.float32), vectorize()])) batchGenerator_validation = BatchGenerator(pets_validation, len(pets_validation), False, op=chain([type_cast(dtype=np.float32), vectorize()])) batchGenerator_test = BatchGenerator(pets_test, len(pets_test), False, op=chain([type_cast(dtype=np.float32), vectorize()])) best_accuracy = Accuracy() best_k = -1 results = {} knn = None for k in range(1, 100, 40): # grid search example knn = KnnClassifier(k, IMAGE_HEIGHT*IMAGE_WIDTH*NUM_CHANNELS, NUM_CLASSES) accuracy = Accuracy() # train and compute validation accuracy ...
TrainedModel = namedtuple('TrainedModel', ['model', 'accuracy']) # initialize RNG for reproducability random.seed(42) np.random.seed(42) torch.manual_seed(42) # Step 1: load the data sets (TRAIN, VALIDATION & TEST) train_data = PetsDataset("../cifar-10-batches-py", Subset.TRAINING) val_data = PetsDataset("../cifar-10-batches-py", Subset.VALIDATION) test_data = PetsDataset("../cifar-10-batches-py", Subset.TEST) # Operations to standardize op = ops.chain([ ops.vectorize(), ops.type_cast(np.float32), ops.add(-127.5), ops.mul(1/127.5), ]) # Step 2: Create batch generator for each BATCH_SIZE = 512 train_batches = BatchGenerator(train_data, BATCH_SIZE, True, op) val_batches = BatchGenerator(val_data, BATCH_SIZE, True, op) test_batches = BatchGenerator(test_data, BATCH_SIZE, True, op) def train_model(lr: float, momentum: float) -> TrainedModel: ''' Trains a linear classifier with a given learning rate (lr) and momentum. Computes the accuracy on the validation set. Returns both the trained classifier and accuracy.
def load_dataset(subset: Subset) -> batches.BatchGenerator:
    """Build a batch generator over one subset of the pets dataset.

    The dataset length is passed as the second ``BatchGenerator`` argument
    and ``True`` as the third (batch size and shuffle flag elsewhere in this
    code base). Samples are vectorized and cast to ``np.float32``.

    Args:
        subset: The dataset subset to load.

    Returns:
        The ``batches.BatchGenerator`` for the requested subset.
    """
    pets = PetsDataset('../data/cifar-10-batches-py', subset)
    to_float_vector = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    whole_set_size = len(pets)
    return batches.BatchGenerator(pets, whole_set_size, True, to_float_vector)
# kNN must have accuracy 100% start = time.time() pets = PetsDataset( '/Users/mmatak/dev/college/DLVC/cifar-10/cifar-10-batches-py/', Subset.TEST) num_classes = 2 k = 1 knn = KnnClassifier(k, 32 * 32 * 3, num_classes) batchGenerator = BatchGenerator(pets, 512, False, op=chain( [type_cast(dtype=np.float32), vectorize()])) groundTruthLabels = None for batch in batchGenerator: knn.train(batch.data, batch.label) groundTruthLabels = batch.label predictedLabels = None def measure_accuracy(predictedLabels: np.ndarray, groundTruthLabels: np.ndarray): correct = 0 for index, trueLabel in enumerate(groundTruthLabels): predictedLabel = np.argmax(predictedLabels[index]) if predictedLabel == trueLabel:
assert num_of_batches == expected, "Number of batches is " + str(num_of_batches) + ", expected: " + str(expected) # The number of training batches is 16 if the batch size is set to 500 batch_generator = BatchGenerator(dataset_training, 500, False) num_of_batches = len(batch_generator) expected = 16 assert num_of_batches == expected, "Number of batches is " + str(num_of_batches) + ", expected: " + str(expected) # and the last batch has size 459 batch_idx = 0 for batch in batch_generator: batch_idx += 1 if batch_idx == 16: assert len(batch.label) == 459, "Num of samples in the last batch is: " + str(len(batch.label)) + ", expected: 459" # The data and label shapes are (500, 3072) and (500,), respectively, unless for the last batch batch_generator = BatchGenerator(dataset_training, 500, shuffle=False, op=vectorize()) last_batch_idx = len(batch_generator) - 1 batch_idx = 0 for batch in batch_generator: # skip last batch if batch_idx == last_batch_idx: continue assert batch.data.shape == (500, 3072), "Batch data shape: " + str(batch.data.shape) + ", expected: (500, 3072)." assert batch.label.shape == (500,), "Batch labels shape: " + str(batch.label.shape) + ", expected: (500,)." batch_idx += 1 # The data type is always np.float32 and the label type is integral (one of the np.int and np.uint variants) # Implemented: for label type np.uint8 since there is less than 256 labels batch_generator = BatchGenerator(dataset_training, 500, False, op=chain([vectorize(), type_cast(dtype=np.float32)])) for batch in batch_generator: assert batch.data.dtype == np.float32, "Batch data type: " + str(batch.data.dtype) + ", expected: np.float32."