def test_mixup_batch_fail5():
    """
    Test MixUpBatch Fail 5
    We expect this to fail because labels are not OneHot encoded
    """
    logger.info("test_mixup_batch_fail5")

    # Original Images
    ds_original = ds.Cifar10Dataset(DATA_DIR, num_samples=10, shuffle=False)
    ds_original = ds_original.batch(5)

    images_original = np.array([])
    for idx, (image, _) in enumerate(ds_original):
        if idx == 0:
            images_original = image
        else:
            images_original = np.append(images_original, image, axis=0)

    # MixUp Images
    data1 = ds.Cifar10Dataset(DATA_DIR, num_samples=10, shuffle=False)
    mixup_batch_op = vision.MixUpBatch()
    data1 = data1.batch(5, drop_remainder=True)
    data1 = data1.map(input_columns=["image", "label"], operations=mixup_batch_op)

    with pytest.raises(RuntimeError) as error:
        images_mixup = np.array([])
        for idx, (image, _) in enumerate(data1):
            if idx == 0:
                images_mixup = image
            else:
                images_mixup = np.append(images_mixup, image, axis=0)
    error_message = "MixUpBatch: Wrong labels shape. The second column (labels) must have a shape of NC or NLC"
    assert error_message in str(error.value)
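# For contrast with the MixUpBatch failure tests in this file: a minimal sketch
# of a pipeline on which MixUpBatch is expected to succeed, inferred from the
# requirements those tests exercise (one-hot labels, batched images, both
# columns mapped together). Illustrative only; not part of the original suite.
def mixup_batch_success_sketch():
    data = ds.Cifar10Dataset(DATA_DIR, num_samples=10, shuffle=False)
    # Labels must be one-hot encoded before MixUpBatch (see fail5/fail7-style cases).
    one_hot_op = data_trans.OneHot(num_classes=10)
    data = data.map(input_columns=["label"], operations=one_hot_op)
    # Images must be batched (HWC or CHW per sample) before MixUpBatch (see fail1).
    data = data.batch(5, drop_remainder=True)
    # MixUpBatch consumes the image and label columns together (see fail3).
    data = data.map(input_columns=["image", "label"], operations=vision.MixUpBatch())
    return data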
def test_mixup_batch_fail1():
    """
    Test MixUpBatch Fail 1
    We expect this to fail because the images and labels are not batched
    """
    logger.info("test_mixup_batch_fail1")

    # Original Images
    ds_original = ds.Cifar10Dataset(DATA_DIR, num_samples=10, shuffle=False)
    ds_original = ds_original.batch(5)

    images_original = np.array([])
    for idx, (image, _) in enumerate(ds_original):
        if idx == 0:
            images_original = image
        else:
            images_original = np.append(images_original, image, axis=0)

    # MixUp Images
    data1 = ds.Cifar10Dataset(DATA_DIR, num_samples=10, shuffle=False)
    one_hot_op = data_trans.OneHot(num_classes=10)
    data1 = data1.map(input_columns=["label"], operations=one_hot_op)
    mixup_batch_op = vision.MixUpBatch(0.1)
    with pytest.raises(RuntimeError) as error:
        data1 = data1.map(input_columns=["image", "label"], operations=mixup_batch_op)
        for idx, (image, _) in enumerate(data1):
            if idx == 0:
                images_mixup = image
            else:
                images_mixup = np.append(images_mixup, image, axis=0)
    error_message = "You must make sure images are HWC or CHW and batch"
    assert error_message in str(error.value)
def create_dataset_cifar10(data_path, batch_size=32, repeat_size=1, status="train", target="Ascend"):
    """
    create dataset for train or test
    """
    if target == "Ascend":
        device_num, rank_id = _get_rank_info()

    if target != "Ascend" or device_num == 1:
        cifar_ds = ds.Cifar10Dataset(data_path)
    else:
        cifar_ds = ds.Cifar10Dataset(data_path, num_parallel_workers=8, shuffle=True,
                                     num_shards=device_num, shard_id=rank_id)

    rescale = 1.0 / 255.0
    shift = 0.0
    cfg = alexnet_cifar10_cfg

    resize_op = CV.Resize((cfg.image_height, cfg.image_width))
    rescale_op = CV.Rescale(rescale, shift)
    normalize_op = CV.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    if status == "train":
        random_crop_op = CV.RandomCrop([32, 32], [4, 4, 4, 4])
        random_horizontal_op = CV.RandomHorizontalFlip()
    channel_swap_op = CV.HWC2CHW()
    typecast_op = C.TypeCast(mstype.int32)
    cifar_ds = cifar_ds.map(input_columns="label", operations=typecast_op, num_parallel_workers=8)
    if status == "train":
        cifar_ds = cifar_ds.map(input_columns="image", operations=random_crop_op, num_parallel_workers=8)
        cifar_ds = cifar_ds.map(input_columns="image", operations=random_horizontal_op, num_parallel_workers=8)
    cifar_ds = cifar_ds.map(input_columns="image", operations=resize_op, num_parallel_workers=8)
    cifar_ds = cifar_ds.map(input_columns="image", operations=rescale_op, num_parallel_workers=8)
    cifar_ds = cifar_ds.map(input_columns="image", operations=normalize_op, num_parallel_workers=8)
    cifar_ds = cifar_ds.map(input_columns="image", operations=channel_swap_op, num_parallel_workers=8)

    cifar_ds = cifar_ds.shuffle(buffer_size=cfg.buffer_size)
    cifar_ds = cifar_ds.batch(batch_size, drop_remainder=True)
    cifar_ds = cifar_ds.repeat(repeat_size)
    return cifar_ds
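# A minimal usage sketch for the create_dataset_cifar10 defined above, assuming
# alexnet_cifar10_cfg and _get_rank_info are importable in this context. The
# path is a placeholder; only the call signature comes from the function itself.
def _example_build_alexnet_train_ds():
    train_ds = create_dataset_cifar10("/path/to/cifar-10-batches-bin",
                                      batch_size=32, repeat_size=1,
                                      status="train", target="Ascend")
    print("batches per epoch:", train_ds.get_dataset_size())
    return train_ds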
def test_mixup_batch_fail4():
    """
    Test MixUpBatch Fail 4
    We expect this to fail because alpha is zero
    """
    logger.info("test_mixup_batch_fail4")

    # Original Images
    ds_original = ds.Cifar10Dataset(DATA_DIR, num_samples=10, shuffle=False)
    ds_original = ds_original.batch(5)

    images_original = np.array([])
    for idx, (image, _) in enumerate(ds_original):
        if idx == 0:
            images_original = image
        else:
            images_original = np.append(images_original, image, axis=0)

    # MixUp Images
    data1 = ds.Cifar10Dataset(DATA_DIR, num_samples=10, shuffle=False)
    one_hot_op = data_trans.OneHot(num_classes=10)
    data1 = data1.map(input_columns=["label"], operations=one_hot_op)
    with pytest.raises(ValueError) as error:
        vision.MixUpBatch(0.0)
    error_message = "Input is not within the required interval"
    assert error_message in str(error.value)
def test_mixup_batch_fail3():
    """
    Test MixUpBatch op
    We expect this to fail because the label column is not passed to mixup_batch
    """
    logger.info("test_mixup_batch_fail3")

    # Original Images
    ds_original = ds.Cifar10Dataset(DATA_DIR, num_samples=10, shuffle=False)
    ds_original = ds_original.batch(5, drop_remainder=True)

    images_original = None
    for idx, (image, _) in enumerate(ds_original):
        if idx == 0:
            images_original = image
        else:
            images_original = np.append(images_original, image, axis=0)

    # MixUp Images
    data1 = ds.Cifar10Dataset(DATA_DIR, num_samples=10, shuffle=False)
    one_hot_op = data_trans.OneHot(num_classes=10)
    data1 = data1.map(input_columns=["label"], operations=one_hot_op)
    mixup_batch_op = vision.MixUpBatch()
    data1 = data1.batch(5, drop_remainder=True)
    data1 = data1.map(input_columns=["image"], operations=mixup_batch_op)

    with pytest.raises(RuntimeError) as error:
        images_mixup = np.array([])
        for idx, (image, _) in enumerate(data1):
            if idx == 0:
                images_mixup = image
            else:
                images_mixup = np.append(images_mixup, image, axis=0)
    error_message = "Both images and labels columns are required"
    assert error_message in str(error.value)
def create_dataset(data_home, repeat_num=1, training=True):
    """Data operations."""
    ds.config.set_seed(1)
    rank_size = int(os.environ.get("RANK_SIZE")) if os.environ.get("RANK_SIZE") else None
    device_num = int(os.getenv("RANK_SIZE"))
    rank_id = int(os.getenv("DEVICE_ID"))

    if device_num == 1:
        data_set = ds.Cifar10Dataset(data_home, num_parallel_workers=8, shuffle=True)
    else:
        data_set = ds.Cifar10Dataset(data_home, num_parallel_workers=8, shuffle=True,
                                     num_shards=device_num, shard_id=rank_id)

    resize_height = cfg.image_height
    resize_width = cfg.image_width
    rescale = 1.0 / 255.0
    shift = 0.0

    # define map operations
    random_crop_op = vision.RandomCrop((32, 32), (4, 4, 4, 4))  # padding_mode default CONSTANT
    random_horizontal_op = vision.RandomHorizontalFlip()
    resize_op = vision.Resize((resize_height, resize_width))  # interpolation default BILINEAR
    rescale_op = vision.Rescale(rescale, shift)
    normalize_op = vision.Normalize((0.4465, 0.4822, 0.4914), (0.2010, 0.1994, 0.2023))
    changeswap_op = vision.HWC2CHW()
    type_cast_op = C.TypeCast(mstype.int32)

    c_trans = []
    if training:
        c_trans = [random_crop_op, random_horizontal_op]
    c_trans += [resize_op, rescale_op, normalize_op, changeswap_op]

    # apply map operations on images
    data_set = data_set.map(input_columns="label", operations=type_cast_op)
    data_set = data_set.map(input_columns="image", operations=c_trans)

    # apply repeat operations
    data_set = data_set.repeat(repeat_num)

    # apply shuffle operations
    data_set = data_set.shuffle(buffer_size=10)

    # apply batch operations
    data_set = data_set.batch(batch_size=cfg.batch_size, drop_remainder=True)

    return data_set
def create_dataset(repeat_num=1, training=True, batch_size=32, rank_id=0, rank_size=1,
                   enable_hccl=False):
    data_dir = data_home + "/cifar-10-batches-bin"
    if not training:
        data_dir = data_home + "/cifar-10-verify-bin"
    data_set = ds.Cifar10Dataset(data_dir)

    if enable_hccl:
        data_set = ds.Cifar10Dataset(data_dir, num_shards=rank_size, shard_id=rank_id)

    resize_height = 224
    resize_width = 224
    rescale = 1.0 / 255.0
    shift = 0.0

    # define map operations
    random_crop_op = vision.RandomCrop((32, 32), (4, 4, 4, 4))  # padding_mode default CONSTANT
    random_horizontal_op = vision.RandomHorizontalFlip()
    resize_op = vision.Resize((resize_height, resize_width))  # interpolation default BILINEAR
    rescale_op = vision.Rescale(rescale, shift)
    normalize_op = vision.Normalize((0.4465, 0.4822, 0.4914), (0.2010, 0.1994, 0.2023))
    changeswap_op = vision.HWC2CHW()
    type_cast_op = C.TypeCast(mstype.int32)

    c_trans = []
    if training:
        c_trans = [random_crop_op, random_horizontal_op]
    c_trans += [resize_op, rescale_op, normalize_op, changeswap_op]

    # apply map operations on images
    data_set = data_set.map(input_columns="label", operations=type_cast_op)
    data_set = data_set.map(input_columns="image", operations=c_trans)

    # apply shuffle operations
    data_set = data_set.shuffle(buffer_size=1000)

    # apply batch operations
    data_set = data_set.batch(batch_size=batch_size, drop_remainder=True)

    # apply repeat operations
    data_set = data_set.repeat(repeat_num)

    return data_set
def create_dataset_cifar10(data_home, repeat_num=1, training=True, cifar_cfg=None):
    """Data operations."""
    data_dir = os.path.join(data_home, "cifar-10-batches-bin")
    if not training:
        data_dir = os.path.join(data_home, "cifar-10-verify-bin")

    rank_size, rank_id = _get_rank_info()
    if training:
        data_set = ds.Cifar10Dataset(data_dir, num_shards=rank_size, shard_id=rank_id, shuffle=True)
    else:
        data_set = ds.Cifar10Dataset(data_dir, num_shards=rank_size, shard_id=rank_id, shuffle=False)

    resize_height = cifar_cfg.image_height
    resize_width = cifar_cfg.image_width

    # define map operations
    random_crop_op = vision.RandomCrop((32, 32), (4, 4, 4, 4))  # padding_mode default CONSTANT
    random_horizontal_op = vision.RandomHorizontalFlip()
    resize_op = vision.Resize((resize_height, resize_width))  # interpolation default BILINEAR
    rescale_op = vision.Rescale(1.0 / 255.0, 0.0)
    # normalize_op = vision.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    normalize_op = vision.Normalize((0.4914, 0.4822, 0.4465), (0.24703233, 0.24348505, 0.26158768))
    changeswap_op = vision.HWC2CHW()
    type_cast_op = c_transforms.TypeCast(mstype.int32)

    c_trans = []
    if training:
        c_trans = [random_crop_op, random_horizontal_op]
    c_trans += [resize_op, rescale_op, normalize_op, changeswap_op]

    # apply map operations on images
    data_set = data_set.map(operations=type_cast_op, input_columns="label")
    data_set = data_set.map(operations=c_trans, input_columns="image")

    # apply batch operations
    data_set = data_set.batch(batch_size=cifar_cfg.batch_size, drop_remainder=True)

    # apply repeat operations
    data_set = data_set.repeat(repeat_num)

    return data_set
def test_cifar_exception_file_path():
    def exception_func(item):
        raise Exception("Error occur!")

    # The same PyFunc failure is expected for every dataset/column combination.
    for dataset_cls, data_dir, column in [
            (ds.Cifar10Dataset, DATA_DIR_10, "image"),
            (ds.Cifar10Dataset, DATA_DIR_10, "label"),
            (ds.Cifar100Dataset, DATA_DIR_100, "image"),
            (ds.Cifar100Dataset, DATA_DIR_100, "coarse_label"),
            (ds.Cifar100Dataset, DATA_DIR_100, "fine_label")]:
        try:
            data = dataset_cls(data_dir)
            data = data.map(operations=exception_func, input_columns=[column],
                            num_parallel_workers=1)
            num_rows = 0
            for _ in data.create_dict_iterator():
                num_rows += 1
            assert False
        except RuntimeError as e:
            assert "map operation: [PyFunc] failed. The corresponding data files" in str(e)
def create_dataset_cifar10(dataset_path, do_train, cfg, repeat_num=1):
    """
    create a train or eval dataset

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        cfg (dict): the config for creating dataset.
        repeat_num(int): the repeat times of dataset. Default: 1.

    Returns:
        dataset
    """
    dataset_path = os.path.join(dataset_path,
                                "cifar-10-batches-bin" if do_train else "cifar-10-verify-bin")
    if cfg.group_size == 1:
        data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True)
    else:
        data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True,
                                     num_shards=cfg.group_size, shard_id=cfg.rank)

    # define map operations
    trans = []
    if do_train:
        trans.append(C.RandomCrop((32, 32), (4, 4, 4, 4)))
        trans.append(C.RandomHorizontalFlip(prob=0.5))

    trans.append(C.Resize((299, 299)))
    trans.append(C.Rescale(1.0 / 255.0, 0.0))
    trans.append(C.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]))
    trans.append(C.HWC2CHW())

    type_cast_op = C2.TypeCast(mstype.int32)

    data_set = data_set.map(operations=trans, input_columns="image",
                            num_parallel_workers=cfg.work_nums)
    data_set = data_set.map(operations=type_cast_op, input_columns="label",
                            num_parallel_workers=cfg.work_nums)

    # apply batch operations
    data_set = data_set.batch(cfg.batch_size, drop_remainder=do_train)

    # apply dataset repeat operation
    data_set = data_set.repeat(repeat_num)

    return data_set
def test_cifar_usage():
    """
    test usage of cifar
    """
    logger.info("Test Cifar10Dataset and Cifar100Dataset usage flag")

    # flag, if True, test cifar10 else test cifar100
    def test_config(usage, flag=True, cifar_path=None):
        if cifar_path is None:
            cifar_path = DATA_DIR_10 if flag else DATA_DIR_100
        try:
            data = ds.Cifar10Dataset(cifar_path, usage=usage) if flag \
                else ds.Cifar100Dataset(cifar_path, usage=usage)
            num_rows = 0
            for _ in data.create_dict_iterator(num_epochs=1, output_numpy=True):
                num_rows += 1
        except (ValueError, TypeError, RuntimeError) as e:
            return str(e)
        return num_rows

    # test the usage of CIFAR10
    assert test_config("train") == 10000
    assert test_config("all") == 10000
    assert "usage is not within the valid set of ['train', 'test', 'all']" in test_config("invalid")
    assert "Argument usage with value ['list'] is not of type (<class 'str'>,)" in test_config(["list"])
    assert "no valid data matching the dataset API Cifar10Dataset" in test_config("test")

    # test the usage of CIFAR100
    assert test_config("test", False) == 10000
    assert test_config("all", False) == 10000
    assert "no valid data matching the dataset API Cifar100Dataset" in test_config("train", False)
    assert "usage is not within the valid set of ['train', 'test', 'all']" in test_config("invalid", False)

    # change this directory to the folder that contains all cifar10 files
    all_cifar10 = None
    if all_cifar10 is not None:
        assert test_config("train", True, all_cifar10) == 50000
        assert test_config("test", True, all_cifar10) == 10000
        assert test_config("all", True, all_cifar10) == 60000
        assert ds.Cifar10Dataset(all_cifar10, usage="train").get_dataset_size() == 50000
        assert ds.Cifar10Dataset(all_cifar10, usage="test").get_dataset_size() == 10000
        assert ds.Cifar10Dataset(all_cifar10, usage="all").get_dataset_size() == 60000

    # change this directory to the folder that contains all cifar100 files
    all_cifar100 = None
    if all_cifar100 is not None:
        assert test_config("train", False, all_cifar100) == 50000
        assert test_config("test", False, all_cifar100) == 10000
        assert test_config("all", False, all_cifar100) == 60000
        assert ds.Cifar100Dataset(all_cifar100, usage="train").get_dataset_size() == 50000
        assert ds.Cifar100Dataset(all_cifar100, usage="test").get_dataset_size() == 10000
        assert ds.Cifar100Dataset(all_cifar100, usage="all").get_dataset_size() == 60000
def test_cifar10_exception():
    """
    Test error cases for Cifar10Dataset
    """
    logger.info("Test error cases for Cifar10Dataset")
    error_msg_1 = "sampler and shuffle cannot be specified at the same time"
    with pytest.raises(RuntimeError, match=error_msg_1):
        ds.Cifar10Dataset(DATA_DIR_10, shuffle=False, sampler=ds.PKSampler(3))

    error_msg_2 = "sampler and sharding cannot be specified at the same time"
    with pytest.raises(RuntimeError, match=error_msg_2):
        ds.Cifar10Dataset(DATA_DIR_10, sampler=ds.PKSampler(3), num_shards=2, shard_id=0)

    error_msg_3 = "num_shards is specified and currently requires shard_id as well"
    with pytest.raises(RuntimeError, match=error_msg_3):
        ds.Cifar10Dataset(DATA_DIR_10, num_shards=10)

    error_msg_4 = "shard_id is specified but num_shards is not"
    with pytest.raises(RuntimeError, match=error_msg_4):
        ds.Cifar10Dataset(DATA_DIR_10, shard_id=0)

    error_msg_5 = "Input shard_id is not within the required interval"
    with pytest.raises(ValueError, match=error_msg_5):
        ds.Cifar10Dataset(DATA_DIR_10, num_shards=2, shard_id=-1)
    with pytest.raises(ValueError, match=error_msg_5):
        ds.Cifar10Dataset(DATA_DIR_10, num_shards=2, shard_id=5)

    error_msg_6 = "num_parallel_workers exceeds"
    with pytest.raises(ValueError, match=error_msg_6):
        ds.Cifar10Dataset(DATA_DIR_10, shuffle=False, num_parallel_workers=0)
    with pytest.raises(ValueError, match=error_msg_6):
        ds.Cifar10Dataset(DATA_DIR_10, shuffle=False, num_parallel_workers=88)
def test_cifar10_sequential_sampler():
    """
    Test Cifar10Dataset with SequentialSampler
    """
    logger.info("Test Cifar10Dataset Op with SequentialSampler")
    num_samples = 30
    sampler = ds.SequentialSampler(num_samples=num_samples)
    data1 = ds.Cifar10Dataset(DATA_DIR_10, sampler=sampler)
    data2 = ds.Cifar10Dataset(DATA_DIR_10, shuffle=False, num_samples=num_samples)
    num_iter = 0
    for item1, item2 in zip(data1.create_dict_iterator(), data2.create_dict_iterator()):
        np.testing.assert_equal(item1["label"], item2["label"])
        num_iter += 1
    assert num_iter == num_samples
def create_dataset(repeat_num=1, training=True):
    """
    create data for next use such as training or inferring
    """
    cifar_ds = ds.Cifar10Dataset(data_home)

    if args_opt.run_distribute:
        rank_id = int(os.getenv('RANK_ID'))
        rank_size = int(os.getenv('RANK_SIZE'))
        cifar_ds = ds.Cifar10Dataset(data_home, num_shards=rank_size, shard_id=rank_id)

    resize_height = 224
    resize_width = 224
    rescale = 1.0 / 255.0
    shift = 0.0

    # define map operations
    random_crop_op = C.RandomCrop((32, 32), (4, 4, 4, 4))  # padding_mode default CONSTANT
    random_horizontal_op = C.RandomHorizontalFlip()
    resize_op = C.Resize((resize_height, resize_width))  # interpolation default BILINEAR
    rescale_op = C.Rescale(rescale, shift)
    normalize_op = C.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    changeswap_op = C.HWC2CHW()
    type_cast_op = C2.TypeCast(mstype.int32)

    c_trans = []
    if training:
        c_trans = [random_crop_op, random_horizontal_op]
    c_trans += [resize_op, rescale_op, normalize_op, changeswap_op]

    # apply map operations on images
    cifar_ds = cifar_ds.map(operations=type_cast_op, input_columns="label")
    cifar_ds = cifar_ds.map(operations=c_trans, input_columns="image")

    # apply shuffle operations
    cifar_ds = cifar_ds.shuffle(buffer_size=10)

    # apply batch operations
    cifar_ds = cifar_ds.batch(batch_size=args_opt.batch_size, drop_remainder=True)

    # apply repeat operations
    cifar_ds = cifar_ds.repeat(repeat_num)

    return cifar_ds
def test_cutmix_batch_fail5():
    """
    Test CutMixBatch op
    We expect this to fail because the label column is not passed to cutmix_batch
    """
    logger.info("test_cutmix_batch_fail5")

    # CutMixBatch Images
    data1 = ds.Cifar10Dataset(DATA_DIR, num_samples=10, shuffle=False)

    one_hot_op = data_trans.OneHot(num_classes=10)
    data1 = data1.map(operations=one_hot_op, input_columns=["label"])
    cutmix_batch_op = vision.CutMixBatch(mode.ImageBatchFormat.NHWC)
    data1 = data1.batch(5, drop_remainder=True)
    data1 = data1.map(operations=cutmix_batch_op, input_columns=["image"])

    with pytest.raises(RuntimeError) as error:
        images_cutmix = np.array([])
        for idx, (image, _) in enumerate(data1):
            if idx == 0:
                images_cutmix = image.asnumpy()
            else:
                images_cutmix = np.append(images_cutmix, image.asnumpy(), axis=0)
    error_message = "both image and label columns are required"
    assert error_message in str(error.value)
def test_cutmix_batch_fail1():
    """
    Test CutMixBatch Fail 1
    We expect this to fail because the images and labels are not batched
    """
    logger.info("test_cutmix_batch_fail1")

    # CutMixBatch Images
    data1 = ds.Cifar10Dataset(DATA_DIR, num_samples=10, shuffle=False)

    one_hot_op = data_trans.OneHot(num_classes=10)
    data1 = data1.map(operations=one_hot_op, input_columns=["label"])
    cutmix_batch_op = vision.CutMixBatch(mode.ImageBatchFormat.NHWC)
    with pytest.raises(RuntimeError) as error:
        data1 = data1.map(operations=cutmix_batch_op, input_columns=["image", "label"])
        for idx, (image, _) in enumerate(data1):
            if idx == 0:
                images_cutmix = image.asnumpy()
            else:
                images_cutmix = np.append(images_cutmix, image.asnumpy(), axis=0)
    error_message = "You must make sure images are HWC or CHW and batch "
    assert error_message in str(error.value)
def get_dataset(self):
    print("get data from dir:", self.data_dir)
    if self.device_num == 1:
        ds_ = ds.Cifar10Dataset(self.data_dir,
                                num_parallel_workers=self.num_parallel_workers)
    else:
        ds_ = ds.Cifar10Dataset(self.data_dir,
                                num_parallel_workers=self.num_parallel_workers,
                                num_shards=self.device_num,
                                shard_id=self.device_id)
    ds_ = ds_.map(input_columns=["image"], operations=self.trsfm)
    typecast_op = C.TypeCast(mstype.int32)
    ds_ = ds_.map(input_columns=["label"], operations=typecast_op)
    return ds_
def test_cpp_uniform_augment_random_crop_badinput(num_ops=1):
    """
    Test UniformAugment with greater crop size
    """
    logger.info("Test CPP UniformAugment with random_crop bad input")
    batch_size = 2
    cifar10_dir = "../data/dataset/testCifar10Data"
    ds1 = ds.Cifar10Dataset(cifar10_dir, shuffle=False)  # shape = [32,32,3]

    transforms_ua = [
        # Note: crop size [224, 224] > image size [32, 32]
        C.RandomCrop(size=[224, 224]),
        C.RandomHorizontalFlip()
    ]
    uni_aug = C.UniformAugment(transforms=transforms_ua, num_ops=num_ops)
    ds1 = ds1.map(operations=uni_aug, input_columns="image")

    # apply DatasetOps
    ds1 = ds1.batch(batch_size, drop_remainder=True, num_parallel_workers=1)
    num_batches = 0
    try:
        for _ in ds1.create_dict_iterator(num_epochs=1, output_numpy=True):
            num_batches += 1
    except Exception as e:
        assert "Crop size" in str(e)
def test_cutmix_batch_fail6():
    """
    Test CutMixBatch op
    We expect this to fail because the image_batch_format passed to CutMixBatch
    doesn't match the format of the images
    """
    logger.info("test_cutmix_batch_fail6")

    # CutMixBatch Images
    data1 = ds.Cifar10Dataset(DATA_DIR, num_samples=10, shuffle=False)

    one_hot_op = data_trans.OneHot(num_classes=10)
    data1 = data1.map(input_columns=["label"], operations=one_hot_op)
    cutmix_batch_op = vision.CutMixBatch(mode.ImageBatchFormat.NCHW)
    data1 = data1.batch(5, drop_remainder=True)
    data1 = data1.map(input_columns=["image", "label"], operations=cutmix_batch_op)

    with pytest.raises(RuntimeError) as error:
        images_cutmix = np.array([])
        for idx, (image, _) in enumerate(data1):
            if idx == 0:
                images_cutmix = image
            else:
                images_cutmix = np.append(images_cutmix, image, axis=0)
    error_message = "CutMixBatch: Image doesn't match the given image format."
    assert error_message in str(error.value)
def test_cutmix_batch_nchw_md5():
    """
    Test CutMixBatch on a batch of CHW images with MD5
    """
    logger.info("test_cutmix_batch_nchw_md5")
    original_seed = config_get_set_seed(0)
    original_num_parallel_workers = config_get_set_num_parallel_workers(1)

    # CutMixBatch Images
    data = ds.Cifar10Dataset(DATA_DIR, num_samples=10, shuffle=False)
    hwc2chw_op = vision.HWC2CHW()
    data = data.map(input_columns=["image"], operations=hwc2chw_op)
    one_hot_op = data_trans.OneHot(num_classes=10)
    data = data.map(input_columns=["label"], operations=one_hot_op)
    cutmix_batch_op = vision.CutMixBatch(mode.ImageBatchFormat.NCHW)
    data = data.batch(5, drop_remainder=True)
    data = data.map(input_columns=["image", "label"], operations=cutmix_batch_op)

    filename = "cutmix_batch_c_nchw_result.npz"
    save_and_check_md5(data, filename, generate_golden=GENERATE_GOLDEN)

    # Restore config setting
    ds.config.set_seed(original_seed)
    ds.config.set_num_parallel_workers(original_num_parallel_workers)
def test_cutmix_batch_fail7():
    """
    Test CutMixBatch op
    We expect this to fail because labels are not in one-hot format
    """
    logger.info("test_cutmix_batch_fail7")

    # CutMixBatch Images
    data1 = ds.Cifar10Dataset(DATA_DIR, num_samples=10, shuffle=False)

    cutmix_batch_op = vision.CutMixBatch(mode.ImageBatchFormat.NHWC)
    data1 = data1.batch(5, drop_remainder=True)
    data1 = data1.map(operations=cutmix_batch_op, input_columns=["image", "label"])

    with pytest.raises(RuntimeError) as error:
        images_cutmix = np.array([])
        for idx, (image, _) in enumerate(data1):
            if idx == 0:
                images_cutmix = image.asnumpy()
            else:
                images_cutmix = np.append(images_cutmix, image.asnumpy(), axis=0)
    error_message = "CutMixBatch: Wrong labels shape. The second column (labels) must have a shape of NC or NLC"
    assert error_message in str(error.value)
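# Counterpart to the CutMixBatch failure cases: a sketch of the NHWC pipeline
# these tests imply should succeed (one-hot labels, batched NHWC images, both
# columns mapped together). It mirrors test_cutmix_batch_nchw_md5 above minus
# the HWC2CHW step; illustrative only, not part of the original suite.
def cutmix_batch_success_sketch():
    data = ds.Cifar10Dataset(DATA_DIR, num_samples=10, shuffle=False)
    # One-hot encode labels first (fail7 shows raw labels are rejected).
    one_hot_op = data_trans.OneHot(num_classes=10)
    data = data.map(operations=one_hot_op, input_columns=["label"])
    # Batch before CutMixBatch (fail1), keeping images in NHWC to match the format flag (fail6).
    data = data.batch(5, drop_remainder=True)
    cutmix_batch_op = vision.CutMixBatch(mode.ImageBatchFormat.NHWC)
    # Pass both columns (fail5 shows the label column is required).
    data = data.map(operations=cutmix_batch_op, input_columns=["image", "label"])
    return data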
def test_filter_case_dataset_cifar10():
    DATA_DIR_10 = "../data/dataset/testCifar10Data"
    dataset_c = ds.Cifar10Dataset(dataset_dir=DATA_DIR_10, num_samples=100000, shuffle=False)
    dataset_f1 = dataset_c.filter(input_columns=["image", "label"], predicate=filter_func_cifar,
                                  num_parallel_workers=1)
    for item in dataset_f1.create_dict_iterator(num_epochs=1, output_numpy=True):
        # in this example, each dictionary has keys "image" and "label"
        assert item["label"] % 3 == 0
def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="CPU"):
    data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True)

    # define map operations
    trans = []
    if do_train:
        trans += [
            C.RandomCrop((32, 32), (4, 4, 4, 4)),
            C.RandomHorizontalFlip(prob=0.5)
        ]

    trans += [
        C.Resize((48, 48)),
        C.Rescale(1.0 / 255.0, 0.0),
        C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
        C.HWC2CHW()
    ]

    type_cast_op = C2.TypeCast(mstype.int32)

    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)

    # apply shuffle and batch operations
    data_set = data_set.shuffle(buffer_size=10)
    data_set = data_set.batch(batch_size, drop_remainder=False)

    # apply dataset repeat operation
    data_set = data_set.repeat(repeat_num)

    return data_set
def test_cifar_sampler_chain():
    """
    Test Cifar sampler chain
    """
    logger.info("test_cifar_sampler_chain")

    sampler = ds.DistributedSampler(num_shards=2, shard_id=0, shuffle=False, num_samples=5)
    child_sampler = ds.RandomSampler(replacement=True, num_samples=4)
    child_sampler2 = ds.SequentialSampler(start_index=0, num_samples=2)
    child_sampler.add_child(child_sampler2)
    sampler.add_child(child_sampler)
    data1 = ds.Cifar10Dataset(CIFAR10_DATA_DIR, sampler=sampler)

    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 1

    # Verify number of rows
    assert sum([1 for _ in data1]) == 1

    # Verify dataset contents
    res = []
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        logger.info("item: {}".format(item))
        res.append(item)
    logger.info("dataset: {}".format(res))
def create_dataset(data_home, repeat_num=1, batch_size=32, do_train=True, device_target="CPU"):
    """
    create data for next use such as training or inferring
    """
    cifar_ds = ds.Cifar10Dataset(data_home, num_parallel_workers=8, shuffle=True)

    c_trans = []
    if do_train:
        c_trans += [
            C.RandomCrop((32, 32), (4, 4, 4, 4)),
            C.RandomHorizontalFlip(prob=0.5)
        ]

    c_trans += [
        C.Resize((224, 224)),
        C.Rescale(1.0 / 255.0, 0.0),
        C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
        C.HWC2CHW()
    ]

    type_cast_op = C2.TypeCast(mstype.int32)

    cifar_ds = cifar_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
    cifar_ds = cifar_ds.map(operations=c_trans, input_columns="image", num_parallel_workers=8)

    cifar_ds = cifar_ds.batch(batch_size, drop_remainder=True)
    cifar_ds = cifar_ds.repeat(repeat_num)
    return cifar_ds
def test_get_column_name_zip():
    data1 = ds.Cifar10Dataset(CIFAR10_DIR)
    assert data1.get_col_names() == ["image", "label"]
    data2 = ds.CSVDataset(CSV_DIR)
    assert data2.get_col_names() == ["1", "2", "3", "4"]
    data = ds.zip((data1, data2))
    assert data.get_col_names() == ["image", "label", "1", "2", "3", "4"]
def test_cifar10():
    """
    dataset parameter
    """
    logger.info("Test dataset parameter")
    data_dir_10 = "../data/dataset/testCifar10Data"
    num_repeat = 2
    batch_size = 32
    limit_dataset = 100
    # apply dataset operations
    data1 = ds.Cifar10Dataset(data_dir_10, num_samples=limit_dataset)
    data1 = data1.repeat(num_repeat)
    data1 = data1.batch(batch_size, True)
    num_epoch = 5
    # iter1 will always assume there is a next epoch and never shut down.
    iter1 = data1.create_tuple_iterator()
    epoch_count = 0
    sample_count = 0
    for _ in range(num_epoch):
        row_count = 0
        for _ in iter1:
            # in this example, each row is a tuple of (image, label)
            row_count += 1
        assert row_count == int(limit_dataset * num_repeat / batch_size)
        logger.debug("row_count: ", row_count)
        epoch_count += 1
        sample_count += row_count
    assert epoch_count == num_epoch
    logger.debug("total epochs: ", epoch_count)
    assert sample_count == int(limit_dataset * num_repeat / batch_size) * num_epoch
    logger.debug("total sample: ", sample_count)
def create_dataset(data_path, batch_size=32, repeat_size=1, mode="train"):
    """
    create dataset for train or test
    """
    cifar_ds = ds.Cifar10Dataset(data_path)
    rescale = 1.0 / 255.0
    shift = 0.0

    resize_op = CV.Resize((cfg.image_height, cfg.image_width))
    rescale_op = CV.Rescale(rescale, shift)
    normalize_op = CV.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    if mode == "train":
        random_crop_op = CV.RandomCrop([32, 32], [4, 4, 4, 4])
        random_horizontal_op = CV.RandomHorizontalFlip()
    channel_swap_op = CV.HWC2CHW()
    typecast_op = C.TypeCast(mstype.int32)
    cifar_ds = cifar_ds.map(input_columns="label", operations=typecast_op)
    if mode == "train":
        cifar_ds = cifar_ds.map(input_columns="image", operations=random_crop_op)
        cifar_ds = cifar_ds.map(input_columns="image", operations=random_horizontal_op)
    cifar_ds = cifar_ds.map(input_columns="image", operations=resize_op)
    cifar_ds = cifar_ds.map(input_columns="image", operations=rescale_op)
    cifar_ds = cifar_ds.map(input_columns="image", operations=normalize_op)
    cifar_ds = cifar_ds.map(input_columns="image", operations=channel_swap_op)

    cifar_ds = cifar_ds.shuffle(buffer_size=cfg.buffer_size)
    cifar_ds = cifar_ds.batch(batch_size, drop_remainder=True)
    cifar_ds = cifar_ds.repeat(repeat_size)
    return cifar_ds
def pad_batch_config():
    data2 = ds.Cifar10Dataset(cifar10_dir, shuffle=False, num_samples=1000)  # shape = [32,32,3]
    data2 = data2.map(operations=(lambda x: x.reshape(-1)), input_columns="image")  # reshape to 1d
    data2 = data2.batch(batch_size=25, drop_remainder=True, pad_info={"image": ([3888], 0)})
    res = []
    for data in data2.create_dict_iterator(num_epochs=1, output_numpy=True):
        res.append(data["image"])
    return res
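# Sanity-check sketch for pad_batch_config above: each flattened CIFAR-10 image
# has 32*32*3 = 3072 elements, and pad_info zero-pads the "image" column out to
# 3888. The helper name and assertions are illustrative, not part of the original.
def check_pad_batch_config():
    res = pad_batch_config()
    assert len(res) == 1000 // 25       # 40 full batches (drop_remainder=True)
    assert res[0].shape == (25, 3888)   # each row padded from 3072 to 3888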
def test_cifar10_dataset_size():
    ds_total = ds.Cifar10Dataset(CIFAR10_DATA_DIR)
    assert ds_total.get_dataset_size() == 10000

    # test get_dataset_size with usage flag
    train_size = ds.Cifar100Dataset(CIFAR100_DATA_DIR, usage="train").get_dataset_size()
    assert train_size == 0
    train_size = ds.Cifar10Dataset(CIFAR10_DATA_DIR, usage="train").get_dataset_size()
    assert train_size == 10000

    all_size = ds.Cifar10Dataset(CIFAR10_DATA_DIR, usage="all").get_dataset_size()
    assert all_size == 10000

    ds_shard_1_0 = ds.Cifar10Dataset(CIFAR10_DATA_DIR, num_shards=1, shard_id=0)
    assert ds_shard_1_0.get_dataset_size() == 10000

    ds_shard_2_0 = ds.Cifar10Dataset(CIFAR10_DATA_DIR, num_shards=2, shard_id=0)
    assert ds_shard_2_0.get_dataset_size() == 5000

    # per-shard size is ceil(total / num_shards): ceil(10000/3) = 3334, ceil(10000/7) = 1429
    ds_shard_3_0 = ds.Cifar10Dataset(CIFAR10_DATA_DIR, num_shards=3, shard_id=0)
    assert ds_shard_3_0.get_dataset_size() == 3334

    ds_shard_7_0 = ds.Cifar10Dataset(CIFAR10_DATA_DIR, num_shards=7, shard_id=0)
    assert ds_shard_7_0.get_dataset_size() == 1429