def test_concat_14():
    """
    Test concat: create datasets from different dataset folders, apply different operations, then concat
    """
    logger.info("test_concat_14")
    DATA_DIR = "../data/dataset/testPK/data"
    DATA_DIR2 = "../data/dataset/testImageNetData/train/"

    data1 = ds.ImageFolderDatasetV2(DATA_DIR, num_samples=3)
    data2 = ds.ImageFolderDatasetV2(DATA_DIR2, num_samples=2)

    transforms1 = F.ComposeOp([F.Decode(),
                               F.Resize((224, 224)),
                               F.ToTensor()])

    data1 = data1.map(input_columns=["image"], operations=transforms1())
    data2 = data2.map(input_columns=["image"], operations=transforms1())
    data3 = data1 + data2

    expected, output = [], []
    for d in data1:
        expected.append(d[0])
    for d in data2:
        expected.append(d[0])
    for d in data3:
        output.append(d[0])

    assert len(expected) == len(output)
    assert np.array_equal(np.array(output), np.array(expected))

    assert sum([1 for _ in data3]) == 5
    assert data3.get_dataset_size() == 5

def test_imagefolder():
    data = ds.ImageFolderDatasetV2("../data/dataset/testPK/data/")
    assert data.get_dataset_size() == 44
    assert data.num_classes() == 4
    data = data.shuffle(100)
    assert data.num_classes() == 4

    data = ds.ImageFolderDatasetV2("../data/dataset/testPK/data/", num_samples=10)
    assert data.get_dataset_size() == 10
    assert data.num_classes() == 4

def test_cache_map_basic3():
    """
    Test a repeat under mappable cache

        Cache
          |
      Map(decode)
          |
        Repeat
          |
      ImageFolder
    """
    logger.info("Test cache basic 3")

    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # This DATA_DIR only has 2 images in it
    ds1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR)
    decode_op = c_vision.Decode()
    ds1 = ds1.repeat(4)
    ds1 = ds1.map(input_columns=["image"], operations=decode_op, cache=some_cache)
    logger.info("ds1.dataset_size is {}".format(ds1.get_dataset_size()))

    num_iter = 0
    for _ in ds1.create_dict_iterator():
        logger.info("get data from dataset")
        num_iter += 1

    logger.info("Number of data in ds1: {} ".format(num_iter))
    assert num_iter == 8
    logger.info("test_cache_map_basic3 Ended.\n")

def test_cache_map_failure1():
    """
    Test nested cache (failure)

        Repeat
          |
        Cache
          |
      Map(decode)
          |
        Cache
          |
      ImageFolder
    """
    logger.info("Test cache failure 1")

    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # This DATA_DIR only has 2 images in it
    ds1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR, cache=some_cache)
    decode_op = c_vision.Decode()
    ds1 = ds1.map(input_columns=["image"], operations=decode_op, cache=some_cache)
    ds1 = ds1.repeat(4)

    num_iter = 0
    try:
        for _ in ds1.create_dict_iterator():
            num_iter += 1
    except RuntimeError as e:
        logger.info("Got an exception in DE: {}".format(str(e)))
        assert "Nested cache operations is not supported!" in str(e)

    assert num_iter == 0
    logger.info("test_cache_map_failure1 Ended.\n")

def test_imagefolder_negative_classindex():
    logger.info("Test Case negative classIndex")
    # define parameters
    repeat_count = 1

    # apply dataset operations
    class_index = {"class3": -333, "class1": 111}
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, class_indexing=class_index, shuffle=False)
    data1 = data1.repeat(repeat_count)

    golden = [111, 111, 111, 111, 111, 111, 111, 111, 111, 111, 111,
              -333, -333, -333, -333, -333, -333, -333, -333, -333, -333, -333]

    num_iter = 0
    for item in data1.create_dict_iterator():
        # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        assert item["label"] == golden[num_iter]
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 22

def test_cache_map_basic2():
    """
    Test mappable leaf with the cache op later in the tree above the map(decode)

        Repeat
          |
        Cache
          |
      Map(decode)
          |
      ImageFolder
    """
    logger.info("Test cache map basic 2")

    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # This DATA_DIR only has 2 images in it
    ds1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR)
    decode_op = c_vision.Decode()
    ds1 = ds1.map(input_columns=["image"], operations=decode_op, cache=some_cache)
    ds1 = ds1.repeat(4)

    filename = "cache_map_02_result.npz"
    save_and_check_md5(ds1, filename, generate_golden=GENERATE_GOLDEN)

    logger.info("test_cache_map_basic2 Ended.\n")

def test_imagefolder_rename():
    logger.info("Test Case rename")
    # define parameters
    repeat_count = 1

    # apply dataset operations
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, num_samples=10)
    data1 = data1.repeat(repeat_count)

    num_iter = 0
    for item in data1.create_dict_iterator():
        # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 10

    data1 = data1.rename(input_columns=["image"], output_columns="image2")

    num_iter = 0
    for item in data1.create_dict_iterator():
        # each data is a dictionary
        # after the rename, each dictionary has keys "image2" and "label"
        logger.info("image is {}".format(item["image2"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 10

def test_sequential_sampler():
    logger.info("Test Case SequentialSampler")

    golden = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
              1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
              2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
              3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

    # define parameters
    repeat_count = 1

    # apply dataset operations
    sampler = ds.SequentialSampler()
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, sampler=sampler)
    data1 = data1.repeat(repeat_count)

    result = []
    num_iter = 0
    for item in data1.create_dict_iterator():
        # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        result.append(item["label"])
        num_iter += 1

    logger.info("Result: {}".format(result))
    assert result == golden

def test_one_hot_op():
    """
    Test one hot encoding op
    """
    logger.info("Test one hot encoding op")

    # define map operations
    # ds = de.ImageFolderDataset(DATA_DIR, schema=SCHEMA_DIR)
    dataset = ds.ImageFolderDatasetV2(DATA_DIR)
    num_classes = 2
    epsilon_para = 0.1

    transforms = [f.OneHotOp(num_classes=num_classes, smoothing_rate=epsilon_para)]
    transform_label = py_vision.ComposeOp(transforms)
    dataset = dataset.map(input_columns=["label"], operations=transform_label())

    golden_label = np.ones(num_classes) * epsilon_para / num_classes
    golden_label[1] = 1 - epsilon_para / num_classes

    for data in dataset.create_dict_iterator():
        label = data["label"]
        logger.info("label is {}".format(label))
        logger.info("golden_label is {}".format(golden_label))
        assert label.all() == golden_label.all()

    logger.info("====test one hot op ok====")

def test_imagefolder_padded_with_decode_and_get_dataset_size():
    num_shards = 5
    count = 0
    for shard_id in range(num_shards):
        DATA_DIR = "../data/dataset/testPK/data"
        data = ds.ImageFolderDatasetV2(DATA_DIR)

        white_io = BytesIO()
        Image.new('RGB', (224, 224), (255, 255, 255)).save(white_io, 'JPEG')
        padded_sample = {}
        padded_sample['image'] = np.array(bytearray(white_io.getvalue()), dtype='uint8')
        padded_sample['label'] = np.array(-1, np.int32)

        white_samples = [padded_sample, padded_sample, padded_sample, padded_sample]
        data2 = ds.PaddedDataset(white_samples)
        data3 = data + data2

        testsampler = ds.DistributedSampler(num_shards=num_shards, shard_id=shard_id,
                                            shuffle=False, num_samples=None)
        data3.use_sampler(testsampler)
        shard_dataset_size = data3.get_dataset_size()
        data3 = data3.map(input_columns="image", operations=V_C.Decode())

        shard_sample_count = 0
        for ele in data3.create_dict_iterator():
            print("label: {}".format(ele['label']))
            count += 1
            shard_sample_count += 1
        assert shard_sample_count in (9, 10)
        assert shard_dataset_size == shard_sample_count
    assert count == 48

def test_cutmix_batch_success3(plot=False):
    """
    Test CutMixBatch op with default values for alpha and prob on a batch of HWC images on ImageFolderDatasetV2
    """
    logger.info("test_cutmix_batch_success3")

    ds_original = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR2, shuffle=False)
    decode_op = vision.Decode()
    ds_original = ds_original.map(input_columns=["image"], operations=[decode_op])
    ds_original = ds_original.batch(4, pad_info={}, drop_remainder=True)

    images_original = None
    for idx, (image, _) in enumerate(ds_original):
        if idx == 0:
            images_original = image
        else:
            images_original = np.append(images_original, image, axis=0)

    # CutMix Images
    data1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR2, shuffle=False)

    decode_op = vision.Decode()
    data1 = data1.map(input_columns=["image"], operations=[decode_op])

    one_hot_op = data_trans.OneHot(num_classes=10)
    data1 = data1.map(input_columns=["label"], operations=one_hot_op)

    cutmix_batch_op = vision.CutMixBatch(mode.ImageBatchFormat.NHWC)
    data1 = data1.batch(4, pad_info={}, drop_remainder=True)
    data1 = data1.map(input_columns=["image", "label"], operations=cutmix_batch_op)

    images_cutmix = None
    for idx, (image, _) in enumerate(data1):
        if idx == 0:
            images_cutmix = image
        else:
            images_cutmix = np.append(images_cutmix, image, axis=0)
    if plot:
        visualize_list(images_original, images_cutmix)

    num_samples = images_original.shape[0]
    mse = np.zeros(num_samples)
    for i in range(num_samples):
        mse[i] = diff_mse(images_cutmix[i], images_original[i])
    logger.info("MSE= {}".format(str(np.mean(mse))))

def test_mix_up_multi():
    """
    Test multi batch mix up op
    """
    logger.info("Test several batch mix up op")

    resize_height = 224
    resize_width = 224

    # Create dataset and define map operations
    ds1 = ds.ImageFolderDatasetV2(DATA_DIR_2)

    num_classes = 3
    decode_op = c_vision.Decode()
    resize_op = c_vision.Resize((resize_height, resize_width), c_vision.Inter.LINEAR)
    one_hot_encode = c.OneHot(num_classes)  # num_classes is input argument

    ds1 = ds1.map(input_columns=["image"], operations=decode_op)
    ds1 = ds1.map(input_columns=["image"], operations=resize_op)
    ds1 = ds1.map(input_columns=["label"], operations=one_hot_encode)

    # apply batch operations
    batch_size = 3
    ds1 = ds1.batch(batch_size, drop_remainder=True)

    ds2 = ds1
    alpha = 0.2
    transforms = [py_vision.MixUp(batch_size=batch_size, alpha=alpha, is_single=False)]
    ds1 = ds1.map(input_columns=["image", "label"], operations=transforms)

    num_iter = 0
    batch1_image1 = 0
    for data1, data2 in zip(ds1.create_dict_iterator(), ds2.create_dict_iterator()):
        image1 = data1["image"]
        label1 = data1["label"]
        logger.info("label: {}".format(label1))

        image2 = data2["image"]
        label2 = data2["label"]
        logger.info("label2: {}".format(label2))

        if num_iter == 0:
            batch1_image1 = image1

        if num_iter == 1:
            lam = np.abs(label2 - label1)
            logger.info("lam value in multi: {}".format(lam))
            for index in range(batch_size):
                if np.square(lam[index]).mean() != 0:
                    lam_value = 1 - np.sum(lam[index]) / 2
                    img_golden = lam_value * image2[index] + (1 - lam_value) * batch1_image1[index]
                    assert image1[index].all() == img_golden.all()
            logger.info("====test several batch mixup ok====")
            break

        num_iter = num_iter + 1

def test_mixup_batch_success2(plot=False):
    """
    Test MixUpBatch op with specified alpha parameter on ImageFolderDatasetV2
    """
    logger.info("test_mixup_batch_success2")

    # Original Images
    ds_original = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR2, shuffle=False)
    decode_op = vision.Decode()
    ds_original = ds_original.map(input_columns=["image"], operations=[decode_op])
    ds_original = ds_original.batch(4, pad_info={}, drop_remainder=True)

    images_original = None
    for idx, (image, _) in enumerate(ds_original):
        if idx == 0:
            images_original = image
        else:
            images_original = np.append(images_original, image, axis=0)

    # MixUp Images
    data1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR2, shuffle=False)

    decode_op = vision.Decode()
    data1 = data1.map(input_columns=["image"], operations=[decode_op])

    one_hot_op = data_trans.OneHot(num_classes=10)
    data1 = data1.map(input_columns=["label"], operations=one_hot_op)

    mixup_batch_op = vision.MixUpBatch(2.0)
    data1 = data1.batch(4, pad_info={}, drop_remainder=True)
    data1 = data1.map(input_columns=["image", "label"], operations=mixup_batch_op)

    images_mixup = None
    for idx, (image, _) in enumerate(data1):
        if idx == 0:
            images_mixup = image
        else:
            images_mixup = np.append(images_mixup, image, axis=0)
    if plot:
        visualize_list(images_original, images_mixup)

    num_samples = images_original.shape[0]
    mse = np.zeros(num_samples)
    for i in range(num_samples):
        mse[i] = diff_mse(images_mixup[i], images_original[i])
    logger.info("MSE= {}".format(str(np.mean(mse))))

def test_imagenet_rawdata_dataset_size():
    ds_total = ds.ImageFolderDatasetV2(IMAGENET_RAWDATA_DIR)
    assert ds_total.get_dataset_size() == 6

    ds_shard_1_0 = ds.ImageFolderDatasetV2(IMAGENET_RAWDATA_DIR, num_shards=1, shard_id=0)
    assert ds_shard_1_0.get_dataset_size() == 6

    ds_shard_2_0 = ds.ImageFolderDatasetV2(IMAGENET_RAWDATA_DIR, num_shards=2, shard_id=0)
    assert ds_shard_2_0.get_dataset_size() == 3

    ds_shard_3_0 = ds.ImageFolderDatasetV2(IMAGENET_RAWDATA_DIR, num_shards=3, shard_id=0)
    assert ds_shard_3_0.get_dataset_size() == 2

def test_imagefolder_numsamples():
    logger.info("Test Case numSamples")
    # define parameters
    repeat_count = 1

    # apply dataset operations
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, num_samples=10, num_parallel_workers=2)
    data1 = data1.repeat(repeat_count)

    num_iter = 0
    for item in data1.create_dict_iterator():
        # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 10

    random_sampler = ds.RandomSampler(num_samples=3, replacement=True)
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, num_samples=10, num_parallel_workers=2, sampler=random_sampler)

    num_iter = 0
    for item in data1.create_dict_iterator():
        num_iter += 1

    assert num_iter == 3

    random_sampler = ds.RandomSampler(num_samples=3, replacement=False)
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, num_samples=10, num_parallel_workers=2, sampler=random_sampler)

    num_iter = 0
    for item in data1.create_dict_iterator():
        num_iter += 1

    assert num_iter == 3

def test_apply_imagefolder_case():
    # apply dataset map operations
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, num_shards=4, shard_id=3)
    data2 = ds.ImageFolderDatasetV2(DATA_DIR, num_shards=4, shard_id=3)

    decode_op = vision.Decode()
    normalize_op = vision.Normalize([121.0, 115.0, 100.0], [70.0, 68.0, 71.0])

    def dataset_fn(ds):
        ds = ds.map(operations=decode_op)
        ds = ds.map(operations=normalize_op)
        ds = ds.repeat(2)
        return ds

    data1 = data1.apply(dataset_fn)

    data2 = data2.map(operations=decode_op)
    data2 = data2.map(operations=normalize_op)
    data2 = data2.repeat(2)

    for item1, item2 in zip(data1.create_dict_iterator(), data2.create_dict_iterator()):
        assert np.array_equal(item1["image"], item2["image"])

def test_imagefolder_zip():
    logger.info("Test Case zip")
    # define parameters
    repeat_count = 2

    # apply dataset operations
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, num_samples=10)
    data2 = ds.ImageFolderDatasetV2(DATA_DIR, num_samples=10)

    data1 = data1.repeat(repeat_count)
    # rename dataset2 for no conflict
    data2 = data2.rename(input_columns=["image", "label"], output_columns=["image1", "label1"])
    data3 = ds.zip((data1, data2))

    num_iter = 0
    for item in data3.create_dict_iterator():
        # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 10

def create_imagenet_dataset(imagenet_dir):
    ds = de.ImageFolderDatasetV2(imagenet_dir)

    transform = F.ComposeOp([F.Decode(),
                             F.RandomHorizontalFlip(0.5),
                             F.ToTensor(),
                             F.Normalize((0.491, 0.482, 0.447), (0.247, 0.243, 0.262)),
                             F.RandomErasing()])

    ds = ds.map(input_columns="image", operations=transform())
    ds = ds.shuffle(buffer_size=5)
    ds = ds.repeat(3)
    return ds

def test_var_batch_var_resize():
    # fake-resize each image according to its batch number; for the 5th batch,
    # resize to (5^2, 5^2) = (25, 25)
    def np_pseudo_resize(col, batchInfo):
        s = (batchInfo.get_batch_num() + 1) ** 2
        return ([np.copy(c[0:s, 0:s, :]) for c in col],)

    def add_one(batchInfo):
        return batchInfo.get_batch_num() + 1

    data1 = ds.ImageFolderDatasetV2("../data/dataset/testPK/data/", num_parallel_workers=4, decode=True)
    data1 = data1.batch(batch_size=add_one, drop_remainder=True,
                        input_columns=["image"], per_batch_map=np_pseudo_resize)

    # the i-th batch has shape [i, i^2, i^2, 3]
    i = 1
    for item in data1.create_dict_iterator():
        assert item["image"].shape == (i, i ** 2, i ** 2, 3), "\ntest_var_batch_var_resize FAILED\n"
        i += 1

def test_concat_15():
    """
    Test concat: create datasets from different dataset file formats, and then concat
    """
    logger.info("test_concat_15")
    DATA_DIR = "../data/dataset/testPK/data"
    DATA_DIR2 = ["../data/dataset/test_tf_file_3_images/train-0000-of-0001.data"]

    data1 = ds.ImageFolderDatasetV2(DATA_DIR)
    data2 = ds.TFRecordDataset(DATA_DIR2, columns_list=["image"])

    data1 = data1.project(["image"])
    data3 = data1 + data2

    assert sum([1 for _ in data3]) == 47

def test_imagefolder_shardid():
    logger.info("Test Case withShardID")
    # define parameters
    repeat_count = 1

    # apply dataset operations
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, num_shards=4, shard_id=1)
    data1 = data1.repeat(repeat_count)

    num_iter = 0
    for item in data1.create_dict_iterator():
        # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 11

def test_imagefolder_noshuffle():
    logger.info("Test Case noShuffle")
    # define parameters
    repeat_count = 1

    # apply dataset operations
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, shuffle=False)
    data1 = data1.repeat(repeat_count)

    num_iter = 0
    for item in data1.create_dict_iterator():
        # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 44

def test_pk_sampler():
    logger.info("Test Case PKSampler")
    # define parameters
    repeat_count = 1

    # apply dataset operations
    sampler = ds.PKSampler(3)
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, sampler=sampler)
    data1 = data1.repeat(repeat_count)

    num_iter = 0
    for item in data1.create_dict_iterator():
        # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 12

def test_imagefolder_decode():
    logger.info("Test Case decode")
    # define parameters
    repeat_count = 1

    # apply dataset operations
    ext = [".jpg", ".JPEG"]
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, extensions=ext, decode=True)
    data1 = data1.repeat(repeat_count)

    num_iter = 0
    for item in data1.create_dict_iterator():
        # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 44

def test_weighted_random_sampler():
    logger.info("Test Case WeightedRandomSampler")
    # define parameters
    repeat_count = 1

    # apply dataset operations
    weights = [1.0, 0.1, 0.02, 0.3, 0.4, 0.05, 1.2, 0.13, 0.14, 0.015, 0.16, 1.1]
    sampler = ds.WeightedRandomSampler(weights, 11)
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, sampler=sampler)
    data1 = data1.repeat(repeat_count)

    num_iter = 0
    for item in data1.create_dict_iterator():
        # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 11

def test_subset_random_sampler():
    logger.info("Test Case SubsetRandomSampler")
    # define parameters
    repeat_count = 1

    # apply dataset operations
    indices = [0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 11]
    sampler = ds.SubsetRandomSampler(indices)
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, sampler=sampler)
    data1 = data1.repeat(repeat_count)

    num_iter = 0
    for item in data1.create_dict_iterator():
        # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 12

def sharding_config(num_shards, shard_id, num_samples, shuffle, class_index, repeat_cnt=1):
    data1 = ds.ImageFolderDatasetV2(image_folder_dir, num_samples=num_samples, num_shards=num_shards,
                                    shard_id=shard_id, shuffle=shuffle, class_indexing=class_index, decode=True)
    data1 = data1.repeat(repeat_cnt)
    res = []
    for item in data1.create_dict_iterator():  # each data is a dictionary
        res.append(item["label"].item())
    if print_res:
        logger.info("labels of dataset: {}".format(res))
    return res

def test_imagefolder_padded():
    DATA_DIR = "../data/dataset/testPK/data"
    data = ds.ImageFolderDatasetV2(DATA_DIR)

    data1 = [{'image': np.zeros(1, np.uint8), 'label': np.array(0, np.int32)},
             {'image': np.zeros(2, np.uint8), 'label': np.array(1, np.int32)},
             {'image': np.zeros(3, np.uint8), 'label': np.array(0, np.int32)},
             {'image': np.zeros(4, np.uint8), 'label': np.array(1, np.int32)},
             {'image': np.zeros(5, np.uint8), 'label': np.array(0, np.int32)},
             {'image': np.zeros(6, np.uint8), 'label': np.array(1, np.int32)}]

    data2 = ds.PaddedDataset(data1)
    data3 = data + data2
    testsampler = ds.DistributedSampler(num_shards=5, shard_id=4, shuffle=False, num_samples=None)
    data3.use_sampler(testsampler)
    assert sum([1 for _ in data3]) == 10

    verify_list = []
    for ele in data3.create_dict_iterator():
        verify_list.append(len(ele['image']))
    assert verify_list[8] == 1
    assert verify_list[9] == 6

def test_cache_map_basic4():
    """
    Test that rows of different sizes under the cache do not result in a core dump
    """
    logger.info("Test cache basic 4")

    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # This DATA_DIR only has 2 images in it
    ds1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR, cache=some_cache)
    decode_op = c_vision.Decode()
    ds1 = ds1.repeat(4)
    ds1 = ds1.map(input_columns=["image"], operations=decode_op)
    logger.info("ds1.dataset_size is {}".format(ds1.get_dataset_size()))
    shape = ds1.output_shapes()
    logger.info(shape)

    num_iter = 0
    for _ in ds1.create_dict_iterator():
        logger.info("get data from dataset")
        num_iter += 1

    logger.info("Number of data in ds1: {} ".format(num_iter))
    assert num_iter == 8
    logger.info("test_cache_map_basic4 Ended.\n")

def classification_dataset(data_dir, image_size, per_batch_size, rank=0, group_size=1,
                           mode='train',
                           input_mode='folder',
                           root='',
                           num_parallel_workers=None,
                           shuffle=None,
                           sampler=None,
                           repeat_num=1,
                           class_indexing=None,
                           drop_remainder=True,
                           transform=None,
                           target_transform=None):
    """
    A function that returns a dataset for classification. The mode of the input dataset can be
    "folder" or "txt". If it is "folder", all images within one folder have the same label.
    If it is "txt", all image paths are written into a text file.

    Args:
        data_dir (str): Path to the root directory that contains the dataset when input_mode="folder",
            or path of the text file that contains every image path of the dataset.
        image_size (Union[int, sequence]): Size of the input images.
        per_batch_size (int): The batch size of every step during training.
        rank (int): The shard ID within group_size. Default: 0.
        group_size (int): Number of shards that the dataset should be divided into. Default: 1.
        mode (str): "train" or others. Default: "train".
        input_mode (str): The form of the input dataset, "folder" or "txt". Default: "folder".
        root (str): The images path when input_mode="txt". Default: "".
        num_parallel_workers (int): Number of workers to read the data. Default: None.
        shuffle (bool): Whether or not to perform shuffle on the dataset
            (default=None, performs shuffle).
        sampler (Sampler): Object used to choose samples from the dataset. Default: None.
        repeat_num (int): The number of times the dataset is repeated. Default: 1.
        class_indexing (dict): A str-to-int mapping from folder name to index (default=None,
            the folder names will be sorted alphabetically and each class will be given a
            unique index starting from 0).
        drop_remainder (bool): Whether to drop the last incomplete batch. Default: True.
        transform (list): Image transforms to apply; if None, a default set of transforms is used. Default: None.
        target_transform (list): Label transforms to apply; if None, labels are cast to int32. Default: None.

    Examples:
        >>> from mindvision.common.datasets.classification import classification_dataset
        >>> # path to imagefolder directory. This directory needs to contain sub-directories which contain the images
        >>> dataset_dir = "/path/to/imagefolder_directory"
        >>> de_dataset = classification_dataset(dataset_dir, image_size=[224, 244],
        >>>                                     per_batch_size=64, rank=0, group_size=4)
        >>> # Path of the text file that contains every image path of the dataset.
        >>> dataset_dir = "/path/to/dataset/images/train.txt"
        >>> images_dir = "/path/to/dataset/images"
        >>> de_dataset = classification_dataset(dataset_dir, image_size=[224, 244],
        >>>                                     per_batch_size=64, rank=0, group_size=4,
        >>>                                     input_mode="txt", root=images_dir)
    """
    mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
    std = [0.229 * 255, 0.224 * 255, 0.225 * 255]

    if transform is None:
        if mode == 'train':
            transform_img = [
                vision.RandomCropDecodeResize(image_size, scale=(0.08, 1.0)),
                vision.RandomHorizontalFlip(prob=0.5),
                vision.Normalize(mean=mean, std=std),
                vision.HWC2CHW()
            ]
        else:
            transform_img = [
                vision.Decode(),
                vision.Resize((256, 256)),
                vision.CenterCrop(image_size),
                vision.Normalize(mean=mean, std=std),
                vision.HWC2CHW()
            ]
    else:
        transform_img = transform

    if target_transform is None:
        transform_label = [C.TypeCast(mstype.int32)]
    else:
        transform_label = target_transform

    if input_mode == 'folder':
        de_dataset = de.ImageFolderDatasetV2(data_dir, num_parallel_workers=num_parallel_workers,
                                             shuffle=shuffle, sampler=sampler, class_indexing=class_indexing,
                                             num_shards=group_size, shard_id=rank)
    else:
        dataset = TxtDataset(root, data_dir)
        sampler = DistributedSampler(dataset, rank, group_size, shuffle=shuffle)
        de_dataset = de.GeneratorDataset(dataset, ["image", "label"], sampler=sampler)
        de_dataset.set_dataset_size(len(sampler))

    de_dataset = de_dataset.map(input_columns="image", num_parallel_workers=8, operations=transform_img)
    de_dataset = de_dataset.map(input_columns="label", num_parallel_workers=8, operations=transform_label)

    columns_to_project = ["image", "label"]
    de_dataset = de_dataset.project(columns=columns_to_project)

    de_dataset = de_dataset.batch(per_batch_size, drop_remainder=drop_remainder)
    de_dataset = de_dataset.repeat(repeat_num)

    return de_dataset