def test_cache_map_basic3():
    """
    Test a repeat under mappable cache

        Cache
          |
      Map(decode)
          |
        Repeat
          |
      ImageFolder
    """

    logger.info("Test cache basic 3")

    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # This DATA_DIR only has 2 images in it
    ds1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR)
    decode_op = c_vision.Decode()
    # The repeat sits below the cached map, so the cache is built over the repeated rows.
    ds1 = ds1.repeat(4)
    ds1 = ds1.map(input_columns=["image"], operations=decode_op, cache=some_cache)

    # Format the size into the message explicitly. Passing it as a bare extra
    # positional argument (with no placeholder in the message) means the value
    # is never rendered, and a stdlib-style logger reports a formatting error.
    logger.info("ds1.dataset_size is {}".format(ds1.get_dataset_size()))

    num_iter = 0
    for _ in ds1.create_dict_iterator():
        logger.info("get data from dataset")
        num_iter += 1

    logger.info("Number of data in ds1: {} ".format(num_iter))
    # 2 images repeated 4 times
    assert num_iter == 8
    logger.info('test_cache_basic3 Ended.\n')
def test_cache_nomap_basic3():
    """
    Cache attached directly to a TFRecord leaf (a non mappable dataset),
    followed by a decode map and a repeat.

       Repeat
         |
     Map(decode)
         |
       Cache
         |
      TFReader
    """

    logger.info("Test cache nomap basic 3")

    leaf_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)
    pipeline = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False, cache=leaf_cache)
    pipeline = pipeline.map(input_columns=["image"], operations=c_vision.Decode())
    pipeline = pipeline.repeat(4)

    # Drain the pipeline once and count the rows it yields.
    row_count = sum(1 for _ in pipeline.create_dict_iterator())

    logger.info("Number of data in ds1: {} ".format(row_count))
    assert row_count == 12
    logger.info("test_cache_nomap_basic3 Ended.\n")
def test_cache_nomap_basic5():
    """
    Cache attached directly to a TFRecord leaf (a non mappable dataset).

    Same tree as basic 3, except that no shuffle argument is given. Without it
    the TF reader defaults to global shuffle, which attempts to inject a
    shuffle operator. Because a cache is present the global shuffle is not
    needed and the shuffle is not built, so the final tree matches basic 3
    while being reached through a different code path (without a cache the
    shuffle IS built).

       Repeat
         |
     Map(decode)
         |
       Cache
         |
      TFReader
    """

    logger.info("Test cache nomap basic 5")

    # This dataset has 3 records in it only
    leaf_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)
    pipeline = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], cache=leaf_cache)
    pipeline = pipeline.map(input_columns=["image"], operations=c_vision.Decode())
    pipeline = pipeline.repeat(4)

    # Drain the pipeline once and count the rows it yields.
    row_count = sum(1 for _ in pipeline.create_dict_iterator())

    logger.info("Number of data in ds1: {} ".format(row_count))
    # 3 records repeated 4 times
    assert row_count == 12
    logger.info("test_cache_nomap_basic5 Ended.\n")
def test_cache_nomap_basic1():
    """
    Random dataset (a non mappable dataset) with a cache placed right above
    the leaf, then repeated 4 times.
    """

    logger.info("Test cache nomap basic 1")

    row_schema = ds.Schema()
    # 640 * 480 * 3 = 921600 bytes (a bit less than 1 MB per image)
    row_schema.add_column('image', de_type=mstype.uint8, shape=[640, 480, 3])
    row_schema.add_column('label', de_type=mstype.uint8, shape=[1])

    # The session_id is arbitrary for now
    leaf_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # User-created sampler here
    pipeline = ds.RandomDataset(schema=row_schema, total_rows=10, num_parallel_workers=4, cache=leaf_cache)
    pipeline = pipeline.repeat(4)

    # enumerate(start=1) leaves the running row count in num_iter; the
    # explicit 0 covers the case where the iterator yields nothing.
    num_iter = 0
    for num_iter, row in enumerate(pipeline.create_dict_iterator(), start=1):
        logger.info("printing the label: {}".format(row["label"]))

    logger.info("Number of data in ds1: {} ".format(num_iter))
    # 10 rows repeated 4 times
    assert num_iter == 40
    logger.info("test_cache_nomap_basic1 Ended.\n")
def test_cache_nomap_allowed_share1():
    """
    Sharing one cache between the following two trees is allowed:

       Repeat     Shuffle
         |           |
       Cache       Cache
         |           |
      TFReader    TFReader
    """

    logger.info("Test cache nomap allowed share 1")

    ds.config.set_seed(1)
    # This dataset has 3 records in it only
    shared_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # First tree: cached reader followed by a repeat.
    repeated = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False, cache=shared_cache)
    repeated = repeated.repeat(4)

    # Second tree: identical cached reader followed by a shuffle.
    shuffled = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False, cache=shared_cache)
    shuffled = shuffled.shuffle(buffer_size=2)

    repeated_rows = sum(1 for _ in repeated.create_dict_iterator())
    # 3 records repeated 4 times
    assert repeated_rows == 12
    logger.info("Number of data in ds1: {} ".format(repeated_rows))

    shuffled_rows = sum(1 for _ in shuffled.create_dict_iterator())
    assert shuffled_rows == 3
    logger.info("test_cache_nomap_allowed_share1 Ended.\n")
def test_cache_map_basic2():
    """
    Mappable leaf with the cache op placed later in the tree, above the
    map(decode).

       Repeat
         |
       Cache
         |
     Map(decode)
         |
     ImageFolder
    """

    logger.info("Test cache map basic 2")

    map_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # This DATA_DIR only has 2 images in it
    pipeline = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR)
    pipeline = pipeline.map(input_columns=["image"], operations=c_vision.Decode(), cache=map_cache)
    pipeline = pipeline.repeat(4)

    # Compare the pipeline output against the stored golden file.
    golden_file = "cache_map_02_result.npz"
    save_and_check_md5(pipeline, golden_file, generate_golden=GENERATE_GOLDEN)

    logger.info("test_cache_map_basic2 Ended.\n")
def test_cache_map_failure1():
    """
    Nested cache (expected to fail).

       Repeat
         |
       Cache
         |
     Map(decode)
         |
       Cache
         |
     ImageFolder
    """

    logger.info("Test cache failure 1")

    shared_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # This DATA_DIR only has 2 images in it
    # The same cache object is attached both to the leaf and to the map above it.
    pipeline = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR, cache=shared_cache)
    pipeline = pipeline.map(input_columns=["image"], operations=c_vision.Decode(), cache=shared_cache)
    pipeline = pipeline.repeat(4)

    num_iter = 0
    try:
        for _ in pipeline.create_dict_iterator():
            num_iter += 1
    except RuntimeError as err:
        err_text = str(err)
        logger.info("Got an exception in DE: {}".format(err_text))
        assert "Nested cache operations is not supported!" in err_text

    # No row may have been produced; this also fails the test when the
    # pipeline runs to completion without raising.
    assert num_iter == 0
    logger.info('test_cache_failure1 Ended.\n')
def test_cache_nomap_allowed_share3():
    """
    Sharing one cache between the following two trees (different shard ids)
    is allowed:

              Repeat                     Repeat
                |                          |
              Cache                      Cache
                |                          |
      TFReader(shard_id = 0)     TFReader(shard_id = 1)
    """

    logger.info("Test cache nomap allowed share 3")

    shared_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    tf_files = ["../data/dataset/tf_file_dataset/test1.data", "../data/dataset/tf_file_dataset/test2.data"]

    # Two readers over the same files, one per shard, both using the same cache.
    shard0 = ds.TFRecordDataset(tf_files, num_shards=2, shard_id=0, num_samples=3, shuffle=False, cache=shared_cache)
    shard0 = shard0.repeat(4)
    shard1 = ds.TFRecordDataset(tf_files, num_shards=2, shard_id=1, num_samples=3, shuffle=False, cache=shared_cache)
    shard1 = shard1.repeat(4)

    shard0_rows = sum(1 for _ in shard0.create_dict_iterator())
    logger.info("Number of data in ds1: {} ".format(shard0_rows))
    # num_samples=3 repeated 4 times
    assert shard0_rows == 12

    shard1_rows = sum(1 for _ in shard1.create_dict_iterator())
    assert shard1_rows == 12
    logger.info("test_cache_nomap_allowed_share3 Ended.\n")
def test_cache_nomap_basic2():
    """
    Random dataset (a non mappable dataset) with a cache placed right above
    the leaf; the sampler is chosen implicitly through num_samples.
    """

    logger.info("Test cache nomap basic 2")

    row_schema = ds.Schema()
    # 640 * 480 * 3 = 921600 bytes (a bit less than 1 MB per image)
    row_schema.add_column('image', de_type=mstype.uint8, shape=[640, 480, 3])
    row_schema.add_column('label', de_type=mstype.uint8, shape=[1])

    # The session_id is arbitrary for now
    leaf_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # No sampler argument is passed directly. Any of num_samples, shuffle,
    # num_shards or shard_id auto-generates an appropriate sampler; here the
    # presence of num_samples is what selects one.
    pipeline = ds.RandomDataset(schema=row_schema, total_rows=20, num_samples=20, num_parallel_workers=4,
                                cache=leaf_cache)
    pipeline = pipeline.repeat(2)

    # enumerate(start=1) leaves the running row count in num_iter; the
    # explicit 0 covers the case where the iterator yields nothing.
    num_iter = 0
    for num_iter, row in enumerate(pipeline.create_dict_iterator(), start=1):
        logger.info("printing the label: {}".format(row["label"]))

    logger.info("Number of data in ds1: {} ".format(num_iter))
    # 20 samples repeated twice
    assert num_iter == 40
    logger.info("test_cache_nomap_basic2 Ended.\n")
def test_cache_nomap_basic7():
    """
    TFRecord reader (a non mappable dataset) that requests global shuffle and
    is cached, followed by a map.

    With global shuffle the tf dataset might want to inject a shuffle op on
    top of the tf reader, but since a cache is given it chooses not to.

       Repeat
         |
     Map(decode)
         |
       cache
         |
      TFReader
    """

    logger.info("Test cache nomap basic 7")

    # This dataset has 3 records in it only
    leaf_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    pipeline = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=ds.Shuffle.GLOBAL,
                                  cache=leaf_cache)
    pipeline = pipeline.map(input_columns=["image"], operations=c_vision.Decode())
    pipeline = pipeline.repeat(4)

    # Drain the pipeline once and count the rows it yields.
    row_count = sum(1 for _ in pipeline.create_dict_iterator())

    logger.info("Number of data in ds1: {} ".format(row_count))
    # 3 records repeated 4 times
    assert row_count == 12
    logger.info("test_cache_nomap_basic7 Ended.\n")
def test_cache_nomap_allowed_share2():
    """
    Sharing one cache between the following two trees (with map decode) is
    allowed:

       Repeat       Shuffle
         |             |
       Cache         Cache
         |             |
     Map(decode)   Map(decode)
         |             |
      TFReader      TFReader
    """

    logger.info("Test cache nomap allowed share 2")

    ds.config.set_seed(1)
    # This dataset has 3 records in it only
    shared_cache = ds.DatasetCache(session_id=2, size=0, spilling=True)
    # One decode op instance is reused by both trees.
    decode_op = c_vision.Decode()

    # First tree: cached decode followed by a repeat.
    repeated = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
    repeated = repeated.map(operations=decode_op, input_columns=["image"], cache=shared_cache)
    repeated = repeated.repeat(4)

    # Second tree: identical cached decode followed by a shuffle.
    shuffled = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
    shuffled = shuffled.map(operations=decode_op, input_columns=["image"], cache=shared_cache)
    shuffled = shuffled.shuffle(buffer_size=2)

    repeated_rows = sum(1 for _ in repeated.create_dict_iterator(num_epochs=1))
    logger.info("Number of data in ds1: {} ".format(repeated_rows))
    # 3 records repeated 4 times
    assert repeated_rows == 12

    shuffled_rows = sum(1 for _ in shuffled.create_dict_iterator(num_epochs=1))
    assert shuffled_rows == 3
    logger.info("test_cache_nomap_allowed_share2 Ended.\n")
def test_cache_nomap_allowed_share4():
    """
    Sharing one cache between the following two trees is allowed; they differ
    only in the number of parallel workers of the map:

                   Cache                                  Cache
                     |                                      |
      Map(decode, num_parallel_workers=1)    Map(decode, num_parallel_workers=2)
                     |                                      |
                  TFReader                              TFReader
    """

    logger.info("Test cache nomap allowed share 4")

    # This dataset has 3 records in it only
    shared_cache = ds.DatasetCache(session_id=2, size=0, spilling=True)
    # One decode op instance is reused by both trees.
    decode_op = c_vision.Decode()

    single_worker = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
    single_worker = single_worker.map(input_columns=["image"], operations=decode_op, cache=shared_cache,
                                      num_parallel_workers=1)

    two_workers = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
    two_workers = two_workers.map(input_columns=["image"], operations=decode_op, cache=shared_cache,
                                  num_parallel_workers=2)

    first_rows = sum(1 for _ in single_worker.create_dict_iterator())
    logger.info("Number of data in ds1: {} ".format(first_rows))
    assert first_rows == 3

    second_rows = sum(1 for _ in two_workers.create_dict_iterator())
    logger.info("Number of data in ds2: {} ".format(second_rows))
    assert second_rows == 3

    logger.info("test_cache_nomap_allowed_share4 Ended.\n")
def test_cache_nomap_disallowed_share1():
    """
    It is not allowed to share the cache between the following two trees:

       Cache        Cache
         |            |
     Map(decode)  Map(rescale)
         |            |
      TFReader     TFReader

    The first tree is expected to run normally. Iterating the second tree,
    which re-uses the same cache with a different map operation, is expected
    to raise a RuntimeError; the test fails if it does not.
    """

    logger.info("Test cache nomap disallowed share1")

    # This dataset has 3 records in it only
    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)
    decode_op = c_vision.Decode()
    rescale_op = c_vision.Rescale(1.0 / 255.0, -1.0)

    ds1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
    ds1 = ds1.map(input_columns=["image"], operations=decode_op, cache=some_cache)

    ds2 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
    ds2 = ds2.map(input_columns=["image"], operations=rescale_op, cache=some_cache)

    num_iter = 0
    for _ in ds1.create_dict_iterator():
        num_iter += 1
    logger.info("Number of data in ds1: {} ".format(num_iter))
    assert num_iter == 3

    try:
        # Drain ds2; only the raised error matters, not the row count.
        sum(1 for _ in ds2)
    except RuntimeError as e:
        logger.info("Got an exception in DE: {}".format(str(e)))
        assert "Attempt to re-use a cache for a different tree!" in str(e)
    else:
        # Without this branch the test would pass silently whenever the
        # disallowed sharing is (wrongly) accepted.
        raise AssertionError("Expected a RuntimeError when re-using a cache for a different tree")

    logger.info("test_cache_nomap_disallowed_share1 Ended.\n")
def test_cache_nomap_basic3():
    """
    Cache attached directly to a TFRecord leaf (a non mappable dataset),
    followed by a decode map and a repeat. After the run the cache statistics
    are fetched and logged.

       Repeat
         |
     Map(decode)
         |
       Cache
         |
      TFReader
    """

    logger.info("Test cache nomap basic 3")

    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)
    pipeline = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False, cache=some_cache)
    pipeline = pipeline.map(operations=c_vision.Decode(), input_columns=["image"])
    pipeline = pipeline.repeat(4)

    # Drain the pipeline once and count the rows it yields.
    row_count = sum(1 for _ in pipeline.create_dict_iterator(num_epochs=1))

    logger.info("Number of data in ds1: {} ".format(row_count))
    assert row_count == 12

    # Contact the server to get the statistics
    cache_stat = some_cache.GetStat()
    avg_row_size = cache_stat.avg_cache_sz
    rows_in_memory = cache_stat.num_mem_cached
    rows_on_disk = cache_stat.num_disk_cached

    logger.info("Number of rows cached in memory: {}".format(rows_in_memory))
    logger.info("Number of rows spilled to disk: {}".format(rows_on_disk))
    logger.info("Average row cache size: {}".format(avg_row_size))

    logger.info("test_cache_nomap_basic3 Ended.\n")
def test_cache_nomap_basic4():
    """
    TFRecord reader (a non mappable dataset) with a map decode and the cache
    placed after it.

    Since a global shuffle is used for the tf reader, it will inject a shuffle
    op over the tf. But if there is a cache later, that shuffle becomes
    invalid and should be removed.

       Repeat
         |
       Cache
         |
     Map(decode)
         |
      TFReader
    """

    logger.info("Test cache nomap basic 4")

    # This dataset has 3 records in it only
    map_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # When shuffle is not set, TF defaults to a "global" shuffle if no cache is
    # involved, which injects a shuffle op over the TF reader. For clarity this
    # test passes the global option explicitly even though it is the python
    # default.
    # When caching is added higher up in the tree above TF, global shuffling is
    # done through the sampler over the cache rather than by the shuffle op. In
    # that case tree prepare removes the shuffle op that the initial tree
    # creation injected.
    pipeline = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=ds.Shuffle.GLOBAL)
    pipeline = pipeline.map(input_columns=["image"], operations=c_vision.Decode(), cache=map_cache)
    pipeline = pipeline.repeat(4)

    # Drain the pipeline once and count the rows it yields.
    row_count = sum(1 for _ in pipeline.create_dict_iterator())

    logger.info("Number of data in ds1: {} ".format(row_count))
    # 3 records repeated 4 times
    assert row_count == 12
    logger.info("test_cache_nomap_basic4 Ended.\n")
def test_cache_nomap_basic6():
    """
    Cache attached directly to a sharded TFRecord leaf (a non mappable
    dataset).

    The tf dataset is given a sharding configuration, but because a cache is
    used the tree prepare should undo that configuration and choose a
    distributed sampler with the same shard config instead.

       Repeat
         |
     Map(decode)
         |
       Cache
         |
      TFReader
    """

    logger.info("Test cache nomap basic 6")

    # This dataset has 3 records in it only
    leaf_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # 3 records sharded 3 ways: this shard is expected to return 1 record.
    # The sharding is performed by the sampler rather than by the tf record
    # leaf node, i.e. it is row-based sharding instead of the file-based
    # sharding that would be used without a cache.
    pipeline = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], num_shards=3, shard_id=1,
                                  cache=leaf_cache)
    pipeline = pipeline.map(input_columns=["image"], operations=c_vision.Decode())
    pipeline = pipeline.repeat(4)

    # Drain the pipeline once and count the rows it yields.
    row_count = sum(1 for _ in pipeline.create_dict_iterator())

    logger.info("Number of data in ds1: {} ".format(row_count))
    # 1 record for this shard, repeated 4 times
    assert row_count == 4
    logger.info("test_cache_nomap_basic6 Ended.\n")
def test_cache_map_basic4():
    """
    Test different rows result in core dump

    Builds ImageFolder (cached) -> Repeat -> Map(decode), queries the dataset
    size and output shapes, then iterates the pipeline and checks the row
    count.
    """
    logger.info("Test cache basic 4")
    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # This DATA_DIR only has 2 images in it
    ds1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR, cache=some_cache)
    decode_op = c_vision.Decode()
    ds1 = ds1.repeat(4)
    ds1 = ds1.map(input_columns=["image"], operations=decode_op)

    # Format the size into the message explicitly. Passing it as a bare extra
    # positional argument (with no placeholder in the message) means the value
    # is never rendered, and a stdlib-style logger reports a formatting error.
    logger.info("ds1.dataset_size is {}".format(ds1.get_dataset_size()))
    shape = ds1.output_shapes()
    logger.info(shape)

    num_iter = 0
    for _ in ds1.create_dict_iterator():
        logger.info("get data from dataset")
        num_iter += 1

    logger.info("Number of data in ds1: {} ".format(num_iter))
    # 2 images repeated 4 times
    assert num_iter == 8
    # The closing message previously named test_cache_basic3 (copy-paste slip).
    logger.info('test_cache_map_basic4 Ended.\n')
def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False,
                    enable_cache=False, cache_session_id=None):
    """
    Build a train or evaluate cifar10 dataset pipeline for resnet50.

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32
        target(str): the device target. Default: Ascend
        distribute(bool): data for distribute or not. Default: False
        enable_cache(bool): whether tensor caching service is used for eval. Default: False
        cache_session_id(int): If enable_cache, cache session_id need to be provided. Default: None

    Returns:
        dataset

    Raises:
        ValueError: if caching is requested for eval but cache_session_id is
            falsy (e.g. None).
    """
    # Work out how many devices take part and which one this process is.
    # On the non-Ascend, non-distributed path only device_num is set; rank_id
    # is not needed there because it is only read when device_num != 1.
    if target == "Ascend":
        device_num, rank_id = _get_rank_info()
    elif distribute:
        init()
        rank_id = get_rank()
        device_num = get_group_size()
    else:
        device_num = 1

    # Sharding arguments are only passed when more than one device is in use.
    reader_kwargs = {"num_parallel_workers": 8, "shuffle": True}
    if device_num != 1:
        reader_kwargs["num_shards"] = device_num
        reader_kwargs["shard_id"] = rank_id
    data_set = ds.Cifar10Dataset(dataset_path, **reader_kwargs)

    # define map operations: augmentation only for training, followed by the
    # common resize / rescale / normalize / layout ops
    augment_ops = [
        C.RandomCrop((32, 32), (4, 4, 4, 4)),
        C.RandomHorizontalFlip(prob=0.5)
    ] if do_train else []
    trans = augment_ops + [
        C.Resize((224, 224)),
        C.Rescale(1.0 / 255.0, 0.0),
        C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
        C.HWC2CHW()
    ]

    type_cast_op = C2.TypeCast(mstype.int32)
    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)

    # only enable cache for eval
    use_cache = enable_cache and not do_train
    image_map_kwargs = {"operations": trans, "input_columns": "image", "num_parallel_workers": 8}
    if use_cache:
        if not cache_session_id:
            raise ValueError("A cache session_id must be provided to use cache.")
        image_map_kwargs["cache"] = ds.DatasetCache(session_id=int(cache_session_id), size=0)
    data_set = data_set.map(**image_map_kwargs)

    # apply batch operations
    data_set = data_set.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    data_set = data_set.repeat(repeat_num)

    return data_set
"""
Training script
"""
import argparse

import mindspore.dataset as ds

# Command-line options: sharding layout (num_devices / device), the cache
# session to attach to, and the dataset location.
parser = argparse.ArgumentParser(description='Cache Example')
parser.add_argument('--num_devices', type=int, default=1, help='Device num.')
parser.add_argument('--device', type=int, default=0, help='Device id.')
parser.add_argument('--session_id', type=int, default=1, help='Session id.')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
args_opt = parser.parse_args()

# apply cache to dataset
test_cache = ds.DatasetCache(session_id=args_opt.session_id, size=0, spilling=False)
# Each device reads its own shard (shard_id = device id) through the cache.
dataset = ds.Cifar10Dataset(dataset_dir=args_opt.dataset_path, num_samples=4, shuffle=False, num_parallel_workers=1,
                            num_shards=args_opt.num_devices, shard_id=args_opt.device, cache=test_cache)

# Count the rows this device receives.
num_iter = 0
for _ in dataset.create_dict_iterator():
    num_iter += 1
print("Got {} samples on device {}".format(num_iter, args_opt.device))
def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False,
                    enable_cache=False, cache_session_id=None):
    """
    Build a train or eval imagenet2012 dataset pipeline for se-resnet50.

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32
        target(str): the device target. Default: Ascend
        distribute(bool): data for distribute or not. Default: False
        enable_cache(bool): whether tensor caching service is used for eval. Default: False
        cache_session_id(int): If enable_cache, cache session_id need to be provided. Default: None

    Returns:
        dataset

    Raises:
        ValueError: if caching is requested for eval but cache_session_id is
            falsy (e.g. None).
    """
    # Work out how many devices take part and which one this process is.
    # On the non-Ascend, non-distributed path only device_num is set; rank_id
    # is not needed there because it is only read when device_num != 1.
    if target == "Ascend":
        device_num, rank_id = _get_rank_info()
    elif distribute:
        init()
        rank_id = get_rank()
        device_num = get_group_size()
    else:
        device_num = 1

    # Sharding arguments are only passed when more than one device is in use.
    reader_kwargs = {"num_parallel_workers": 12, "shuffle": True}
    if device_num != 1:
        reader_kwargs["num_shards"] = device_num
        reader_kwargs["shard_id"] = rank_id
    data_set = ds.ImageFolderDataset(dataset_path, **reader_kwargs)

    image_size = 224
    mean = [123.68, 116.78, 103.94]
    std = [1.0, 1.0, 1.0]

    # define map operations: the leading ops differ between train and eval,
    # both end with the same normalize + HWC->CHW steps
    if do_train:
        leading_ops = [
            C.RandomCropDecodeResize(image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)),
            C.RandomHorizontalFlip(prob=0.5)
        ]
    else:
        leading_ops = [
            C.Decode(),
            C.Resize(292),
            C.CenterCrop(256)
        ]
    trans = leading_ops + [
        C.Normalize(mean=mean, std=std),
        C.HWC2CHW()
    ]

    type_cast_op = C2.TypeCast(mstype.int32)
    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=12)

    # only enable cache for eval; the cache is attached to the label map,
    # which is the last map in the pipeline
    use_cache = enable_cache and not do_train
    label_map_kwargs = {"operations": type_cast_op, "input_columns": "label", "num_parallel_workers": 12}
    if use_cache:
        if not cache_session_id:
            raise ValueError("A cache session_id must be provided to use cache.")
        label_map_kwargs["cache"] = ds.DatasetCache(session_id=int(cache_session_id), size=0)
    data_set = data_set.map(**label_map_kwargs)

    # apply batch operations
    data_set = data_set.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    data_set = data_set.repeat(repeat_num)

    return data_set