import numpy as np

from dlvc.batches import BatchGenerator
from dlvc.dataset import Subset
from dlvc.datasets.pets import PetsDataset
import dlvc.ops as ops

def load_dataset_into_batches(file_dir_path: str, subset: Subset, subset_size: int, shuffle: bool = False):
    # subset_size is passed through to BatchGenerator as the batch size
    op = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    dataset = PetsDataset(file_dir_path, subset)
    return BatchGenerator(dataset, subset_size, shuffle, op)
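A quick usage sketch (the CIFAR-10 directory is a placeholder path; the batch size of 500 mirrors the tests below):

train_batches = load_dataset_into_batches('../data/cifar-10-batches-py', Subset.TRAINING, 500)
first = next(iter(train_batches))
print(first.data.shape, first.data.dtype)  # (500, 3072) float32 for a full batch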
Example #2
File: test2.py  Project: helmuthb/dlvc2020
def main():
    data = PetsDataset("/home/helmuth/dlvc/cifar-10-batches-py", Subset.TRAINING)
    # ops chain
    op = ops.chain([
        ops.vectorize(),
        ops.type_cast(np.float32),
        ops.add(-127.5),
        ops.mul(1/127.5),
    ])
    # batch generator #1
    bg1 = BatchGenerator(data, len(data), False)
    assert(len(bg1) == 1)
    # batch generator #2
    bg2 = BatchGenerator(data, 500, False, op)
    assert(len(bg2) == 16)
    # first batch
    cnt = 0
    for batch in bg2:
        cnt += 1
        if cnt < 16:
            assert(batch.data.shape == (500, 3072))
            assert(batch.labels.shape == (500,))
        assert(batch.data.dtype == np.float32)
        assert(np.issubdtype(batch.labels.dtype, np.integer))
        if cnt == 1:
            print("First batch, first sample, not shuffled")
            print(batch.data[0])
    # batch generator #3
    bg3 = BatchGenerator(data, 500, True, op)
    # run 5 times through first sample of shuffled batch generator
    for i in range(5):
        it = iter(bg3)
        print("First batch, first sample, shuffled")
        print(next(it).data[0])
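The len(bg2) == 16 assertion is just ceiling division: the training subset holds 7959 samples (the full-batch generators in the tests below use exactly this size), so 500-sample batches give 16 batches with a partial final one. A quick check:

import math

n_samples, batch_size = 7959, 500
print(math.ceil(n_samples / batch_size))  # 16 batches
print(n_samples - 15 * batch_size)        # 459 samples left for the last batch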
Example #3
def test_data_transformation(self):
    op = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    dataset = PetsDataset(os.path.join(os.getcwd(), self._data_dir),
                          Subset.TRAINING)
    batch_gen = BatchGenerator(dataset, 100, False, op)
    self.assertEqual(len(batch_gen), 80)
    iter_gen = iter(batch_gen)
    iter_result = next(iter_gen)
    self.assertEqual(iter_result.data[0].shape, (3072, ))
    self.assertTrue(np.issubdtype(iter_result.data.dtype, np.float32))

def test_train_with_wrong_type_of_labels(self):
    op = ops.chain([
        ops.vectorize(),
        ops.type_cast(np.float32)
    ])
    dataset = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)
    batch_gen = BatchGenerator(dataset, 7959, False, op)
    batch_iter = iter(batch_gen)
    iter_result = next(batch_iter)
    classifier = KnnClassifier(10, 3072, 2)
    # labels passed as a plain Python list instead of a numpy array
    self.assertRaises(TypeError, classifier.train, iter_result.data, [0, 1, 0])
Example #5
def load_dataset(subset: Subset) -> batches.BatchGenerator:
    dataset = PetsDataset('../data/cifar-10-batches-py', subset)

    op = ops.chain([
        ops.hwc2chw(),
        ops.add(-127.5),
        ops.mul(1 / 127.5),
        ops.type_cast(np.float32)
    ])

    return batches.BatchGenerator(dataset, 128, True, op)
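The add/mul pair is the usual linear rescaling of 8-bit pixel values to [-1, 1]; the same arithmetic in plain numpy:

import numpy as np

pixels = np.array([0, 127.5, 255], dtype=np.float32)
print((pixels - 127.5) * (1 / 127.5))  # [-1.  0.  1.]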
def test_train_with_proper_data(self):
    op = ops.chain([
        ops.vectorize(),
        ops.type_cast(np.float32)
    ])
    dataset = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)
    batch_gen = BatchGenerator(dataset, 7959, False, op)
    batch_iter = iter(batch_gen)
    iter_result = next(batch_iter)
    classifier = KnnClassifier(10, 3072, 2)
    classifier.train(iter_result.data, iter_result.label)

def test_train_wrong_vector_size_in_data(self):
    op = ops.chain([
        ops.vectorize(),
        ops.type_cast(np.float32)
    ])
    dataset = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)
    batch_gen = BatchGenerator(dataset, 7959, False, op)
    batch_iter = iter(batch_gen)
    iter_result = next(batch_iter)
    classifier = KnnClassifier(10, 3072, 2)
    # drop one feature column so each sample has 3071 values instead of 3072
    changed_data = np.delete(iter_result.data, 100, 1)
    self.assertRaises(RuntimeError, classifier.train, changed_data, iter_result.label)

def test_correctness_of_data_for_train(self):
    op = ops.chain([
        ops.vectorize(),
        ops.type_cast(np.float32)
    ])
    dataset = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)
    one_batch_gen = BatchGenerator(dataset, 7959, False, op)
    self.assertEqual(len(one_batch_gen), 1)
    many_batch_gen = BatchGenerator(dataset, 500, False, op)
    self.assertEqual(len(many_batch_gen), 16)
    reference = [116., 125., 125., 91., 101.]
    batch = next(iter(many_batch_gen))
    for i, item in enumerate(batch.data[0][:5]):
        self.assertEqual(item, reference[i])
Example #9
def load_dataset(subset: Subset, augment=False) -> batches.BatchGenerator:
    dataset = PetsDataset('../data/cifar-10-batches-py', subset)

    ops_list = []

    if augment:
        ops_list += [ops.hflip(), ops.rcrop(32, 12, 'constant')]

    ops_list += [
        ops.mul(1 / 255),
        ops.type_cast(np.float32),
        # Imagenet:
        # ops.normalize(  mean=np.array([0.485, 0.456, 0.406]),
        #                 std=np.array([0.229, 0.224, 0.225])),
        # Cifar-10:
        ops.normalize(mean=np.array([0.41477802, 0.45935813, 0.49693552]),
                      std=np.array([0.25241926, 0.24699265, 0.25279155])),
        ops.hwc2chw()
    ]

    op = ops.chain(ops_list)

    return batches.BatchGenerator(dataset, 128, True, op)
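Because mul(1 / 255) runs first, normalize sees values in [0, 1], which is why the mean/std constants above are fractions rather than raw pixel values. Assuming ops.normalize computes the standard (x - mean) / std per channel, the chain amounts to:

import numpy as np

mean = np.array([0.41477802, 0.45935813, 0.49693552])
std = np.array([0.25241926, 0.24699265, 0.25279155])
img = np.random.randint(0, 256, (32, 32, 3))         # dummy HWC image
out = ((img / 255) - mean) / std                     # per-channel standardization
print(out.mean(axis=(0, 1)), out.std(axis=(0, 1)))   # roughly 0 and 1 on real CIFAR-10 data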
def test_predict_with_proper_data(self):
    op = ops.chain([
        ops.vectorize(),
        ops.type_cast(np.float32)
    ])
    dataset_training = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)
    dataset_valid = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.VALIDATION)

    batch_gen_t = BatchGenerator(dataset_training, 795, False, op)
    batch_gen_v = BatchGenerator(dataset_valid, 204, False, op)

    batch_iter_t = iter(batch_gen_t)
    iter_result_t = next(batch_iter_t)

    batch_iter_v = iter(batch_gen_v)
    iter_result_v = next(batch_iter_v)

    classifier = KnnClassifier(10, 3072, 2)
    classifier.train(iter_result_t.data, iter_result_t.label)
    results = classifier.predict(iter_result_v.data)
    self.assertEqual(len(results), 204)
    for result in results:
        # class scores form a probability distribution; compare with a
        # tolerance since exact float equality is fragile
        self.assertAlmostEqual(np.sum(result), 1.0, places=5)
Example #11
batch_generator = BatchGenerator(dataset_training, 500, shuffle=False, op=vectorize())
last_batch_idx = len(batch_generator) - 1
batch_idx = 0
for batch in batch_generator:
    # skip the last batch, whose shapes differ
    if batch_idx == last_batch_idx:
        continue
    assert batch.data.shape == (500, 3072), "Batch data shape: " + str(
        batch.data.shape) + ", expected: (500, 3072)."
    assert batch.labels.shape == (500, ), "Batch labels shape: " + str(
        batch.labels.shape) + ", expected: (500,)."
    batch_idx += 1

# The data type is always np.float32 and the label type is integral (one of the np.int*/np.uint* variants)
# Implemented for label type np.uint8, since there are fewer than 256 labels
batch_generator = BatchGenerator(
    dataset_training,
    500,
    False,
    op=chain([vectorize(), type_cast(dtype=np.float32)]))
for batch in batch_generator:
    assert batch.data.dtype == np.float32, "Batch data type: " + str(
        batch.data.dtype) + ", expected: np.float32."
    assert batch.labels.dtype == np.uint8, "Batch labels type: " + str(
        batch.labels.dtype) + ", expected: np.uint8."

# The first sample of the first training batch returned without shuffling
# has label 0 ...
batch_generator = BatchGenerator(
    dataset_training,
    500,
    False,
    op=chain([type_cast(dtype=np.float32),
              vectorize()]))
first_sample_label_unshuffled = None
Example #12
import numpy as np
import torch
import torch.nn as nn

from dlvc.batches import BatchGenerator
from dlvc.test import Accuracy
from dlvc.datasets.pets import PetsDataset
from dlvc.dataset import Subset
import dlvc.ops as ops

np.random.seed(0)
torch.manual_seed(0)

DATA_PATH = "../cifar-10-batches-py/"
MODEL_PATH = "best_model.pt"
train_data = PetsDataset(DATA_PATH, Subset.TRAINING)
val_data = PetsDataset(DATA_PATH, Subset.VALIDATION)

op = ops.chain([
    ops.type_cast(np.float32),
    ops.add(-127.5),
    ops.mul(1 / 127.5),
    ops.hflip(),
    ops.rcrop(32, 4, 'constant'),
    ops.add_noise(),
    ops.hwc2chw()
])

train_batches = BatchGenerator(train_data, 128, False, op)
val_batches = BatchGenerator(val_data, 128, False, op)
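# Note that the chain above is also applied to the validation batches, so the
# validation data gets random augmentation too; a deterministic variant for
# evaluation could look like:
#   val_op = ops.chain([ops.type_cast(np.float32), ops.add(-127.5),
#                       ops.mul(1 / 127.5), ops.hwc2chw()])
#   val_batches = BatchGenerator(val_data, 128, False, val_op)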


class Net(nn.Module):
    def __init__(self, img_size, num_classes):
        super(Net, self).__init__()
Example #13
NUM_CHANNELS = 3

BATCH_SIZE = 128
NUM_CLASSES = 2
EPOCHS = 500
lr = 0.001
# weight decay 0 in this configuration, in part 3 this is changed
wd = 0.0

pets_training = PetsDataset(dir, Subset.TRAINING)
pets_validation = PetsDataset(dir, Subset.VALIDATION)
pets_test = PetsDataset(dir, Subset.TEST)


op = chain([type_cast(dtype=np.float32),
            add(-127.5),
            mul(1 / 127.5),
            hwc2chw()])

batchGenerator_training = BatchGenerator(pets_training, BATCH_SIZE, shuffle=True, op=op)
batchGenerator_validation = BatchGenerator(pets_validation, BATCH_SIZE, shuffle=False, op=op)
batchGenerator_test = BatchGenerator(pets_test, BATCH_SIZE, shuffle=False, op=op)

import numpy as np

from dlvc.dataset import Subset
from dlvc.datasets.pets import PetsDataset
from dlvc import ops, batches

dataset = PetsDataset('../data/cifar-10-batches-py', Subset.TRAINING)

op = ops.chain([ops.mul(1 / 255), ops.type_cast(np.float32)])

batch_generator = batches.BatchGenerator(dataset, 7959, True, op)

training_images = []

for batch in batch_generator:
    training_images.append(batch.data)

# a single full-dataset batch: (1, 7959, 32, 32, 3) -> (7959, 32, 32, 3)
training_images = np.array(training_images, dtype=np.float32)
training_images = training_images.reshape(training_images.shape[1:])

# per-channel mean and std over all samples and pixel positions
train_mean = np.mean(training_images, axis=(0, 1, 2))
train_std = np.std(training_images, axis=(0, 1, 2))

print(train_mean, train_std)
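The op chain divides by 255 first, so the printed statistics land in [0, 1], the same kind of per-channel constants that example #9 hard-codes into its normalize op. A sketch of wiring the computed values in (again assuming a per-channel (x - mean) / std normalize op):

norm_op = ops.chain([
    ops.mul(1 / 255),
    ops.type_cast(np.float32),
    ops.normalize(mean=train_mean, std=train_std),
    ops.hwc2chw()
])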
Example #15
np.random.seed(42)
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Batch size to be used
BATCH_SIZE = 128

# Step 1: load the data sets (TRAIN, VALIDATION)
train_data = PetsDataset("../cifar-10-batches-py", Subset.TRAINING)
val_data = PetsDataset("../cifar-10-batches-py", Subset.VALIDATION)

# Operations to standardize
# First experiment: scale to [-1,1]
op1 = ops.chain([
    ops.type_cast(np.float32),
    ops.add(-127.5),
    ops.mul(1 / 127.5),
    ops.hwc2chw()
])
# Second experiment: scale to sample mean=0, sd=1
# calculate average training sample mean & sd
op_calc = ops.chain([ops.type_cast(np.float32), ops.mean_sd()])
# using batch generator (could do it directly but I'm lazy)
train_full_batch_gen = BatchGenerator(train_data, len(train_data), False,
                                      op_calc)
train_full_batch = next(iter(train_full_batch_gen))
train_mean_sd = np.mean(train_full_batch.data, axis=0)
# create operation to scale
op2 = ops.chain([
    ops.type_cast(np.float32),
Example #16
def load_dataset(subset: Subset) -> batches.BatchGenerator:
    dataset = PetsDataset('../data/cifar-10-batches-py', subset)

    op = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])

    return batches.BatchGenerator(dataset, len(dataset), True, op)
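With the batch size set to len(dataset), the generator yields a single batch covering the whole subset, matching the len(bg1) == 1 assertion in example #2:

gen = load_dataset(Subset.TRAINING)
assert len(gen) == 1
full_batch = next(iter(gen))  # full_batch.data has shape (len(dataset), 3072) after vectorize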
Example #17
import numpy as np
import torch.nn as nn

from dlvc.datasets.pets import PetsDataset
from dlvc.models.pytorch import CnnClassifier

from dlvc.batches import BatchGenerator
from dlvc.test import Accuracy
from dlvc.dataset import Subset
import dlvc.ops as ops

np.random.seed(0)

pets_train = PetsDataset("../cifar-10-batches-py/", Subset.TRAINING)

op = ops.chain([
    ops.type_cast(np.float32),
    ops.add(-127.5),
    ops.mul(1 / 127.5),
    ops.hflip(),
    ops.rcrop(32, 4, 'constant'),
    ops.add_noise(),
    ops.hwc2chw()
])

reverse_op = ops.chain([
    ops.chw2hwc(),
    ops.mul(127.5),
    ops.add(127.5),
    ops.type_cast(np.uint8),
])
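reverse_op inverts only the deterministic scaling of op (not the random flips, crops, or noise), which is useful for turning a network-ready CHW tensor back into a viewable uint8 image. The round trip in plain numpy:

import numpy as np

img = np.random.randint(0, 256, (32, 32, 3), dtype=np.uint8)            # original HWC image
x = np.transpose((img.astype(np.float32) - 127.5) / 127.5, (2, 0, 1))   # scale, hwc2chw
back = np.transpose(x, (1, 2, 0)) * 127.5 + 127.5                       # chw2hwc, mul, add
assert np.array_equal(np.rint(back).astype(np.uint8), img)              # round before casting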
Example #18

def set_parameter(model, freeze_parameters):
    # freezing: parameters with requires_grad = False receive no gradient updates
    if freeze_parameters:
        for param in model.parameters():
            param.requires_grad = False


if USE_TRANSFER_LEARNING:
    # there are two networks to use in transfer learning "resnet" and "alexnet"
    net = initialize_transfer_learning_model("resnet", NUM_CLASSES,
                                             FREEZE_CNN_PARAMETERS)
    net, input_size = net
    pad_mode_for_resizing = 'constant'
    op_chain = chain([
        type_cast(dtype=np.float32),
        add(-127.5),
        mul(1 / 127.5),
        rcrop(25, 2, 'median'),
        resize(input_size, pad_mode_for_resizing),
        hwc2chw()
    ])
else:
    net = CatDogNet()
    op_chain = chain([
        type_cast(dtype=np.float32),
        add(-127.5),
        mul(1 / 127.5),
        rcrop(25, 2, 'median'),
        hwc2chw()
    ])
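initialize_transfer_learning_model is not shown in this excerpt; a hedged torchvision-based sketch of the same idea (the body below is an assumption, not the project's actual code) would load a pretrained backbone, freeze it, and swap the classifier head:

import torch.nn as nn
import torchvision.models as models

def initialize_transfer_learning_model(name, num_classes, freeze):
    # hypothetical reconstruction: pretrained backbone + fresh classification head
    if name == "resnet":
        model = models.resnet18(pretrained=True)
        set_parameter(model, freeze)  # freeze before adding the new head
        model.fc = nn.Linear(model.fc.in_features, num_classes)
    else:  # "alexnet"
        model = models.alexnet(pretrained=True)
        set_parameter(model, freeze)
        model.classifier[6] = nn.Linear(model.classifier[6].in_features, num_classes)
    return model, 224  # both backbones expect 224x224 inputs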
Example #19
# The data and label shapes are (500, 3072) and (500,), respectively, except for the last batch
batch_generator = BatchGenerator(dataset_training, 500, shuffle=False, op=vectorize())
last_batch_idx = len(batch_generator) - 1
batch_idx = 0
for batch in batch_generator:
    # skip last batch
    if batch_idx == last_batch_idx:
        continue
    assert batch.data.shape == (500, 3072), "Batch data shape: " + str(batch.data.shape) + ", expected: (500, 3072)."
    assert batch.label.shape == (500,), "Batch labels shape: " + str(batch.label.shape) + ", expected: (500,)."
    batch_idx += 1

# The data type is always np.float32 and the label type is integral (one of the np.int*/np.uint* variants)
# Implemented for label type np.uint8, since there are fewer than 256 labels
batch_generator = BatchGenerator(dataset_training, 500, False, op=chain([vectorize(), type_cast(dtype=np.float32)]))
for batch in batch_generator:
    assert batch.data.dtype == np.float32, "Batch data type: " + str(batch.data.dtype) + ", expected: np.float32."
    assert batch.label.dtype == np.uint8, "Batch labels type: " + str(batch.label.dtype) + ", expected: np.uint8."

# The first sample of the first training batch returned without shuffling
# has label 0 ...
batch_generator = BatchGenerator(dataset_training, 500, False, op=chain([type_cast(dtype=np.float32), vectorize()]))
first_sample_label_unshuffled = None
first_sample_data_unshuffled = None
expected_label = 0
for batch in batch_generator:
    for label in batch.label:
        first_sample_label_unshuffled = label
        break
    for data in batch.data: