def test_numpyslices_sampler_chain_batch():
    """ Test NumpySlicesDataset sampler chaining, with batch """
    logger.info("test_numpyslices_sampler_chain_batch")

    # Build a SequentialSampler chain over raw list data, then batch.
    raw_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    chained_sampler = ds.SequentialSampler(start_index=1, num_samples=3)
    chained_sampler = chained_sampler.add_child(
        ds.SequentialSampler(start_index=1, num_samples=2))
    data1 = ds.NumpySlicesDataset(raw_values, sampler=chained_sampler)
    data1 = data1.batch(batch_size=3, drop_remainder=False)

    # The reported dataset size must reflect the chained sampler output.
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 4

    # Iterating must yield the same number of rows as reported.
    row_count = 0
    for _ in data1:
        row_count += 1
    assert row_count == 4

    # Collect and log the actual batched contents.
    res = []
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        logger.info("item: {}".format(item))
        res.append(item)
    logger.info("dataset: {}".format(res))
def test_numpyslices_sampler_chain2():
    """ Test NumpySlicesDataset sampler chain """
    logger.info("test_numpyslices_sampler_chain2")

    # Attach the child sampler with a separate add_child() statement
    # instead of chaining the call.
    values = [1, 2, 3, 4]
    parent_sampler = ds.SequentialSampler(start_index=1, num_samples=1)
    nested_sampler = ds.SequentialSampler(start_index=1, num_samples=2)
    parent_sampler.add_child(nested_sampler)
    data1 = ds.NumpySlicesDataset(values, sampler=parent_sampler)

    # The chained sampler should restrict the pipeline to a single row.
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 1

    # Row count from iteration must agree with get_dataset_size().
    total_rows = 0
    for _ in data1:
        total_rows += 1
    assert total_rows == 1

    # Collect and log the dataset contents.
    res = []
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        logger.info("item: {}".format(item))
        res.append(item)
    logger.info("dataset: {}".format(res))
def test_cv_minddataset_sequential_sampler_exceed_size(add_and_remove_cv_file):
    """Read MindDataset with a SequentialSampler that requests more rows than
    the file holds; reads wrap around from start_index (see the modulo check)."""
    data = get_data(CV_DIR_NAME, True)
    columns_list = ["data", "file_name", "label"]
    num_readers = 4
    seq_sampler = ds.SequentialSampler(2, 10)
    data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers,
                              sampler=seq_sampler)
    dataset_size = data_set.get_dataset_size()
    assert dataset_size == 10

    num_iter = 0
    for item in data_set.create_dict_iterator():
        logger.info(
            "-------------- cv reader basic: {} ------------------------".
            format(num_iter))
        logger.info(
            "-------------- item[data]: {} -----------------------------".
            format(item["data"]))
        logger.info(
            "-------------- item[file_name]: {} ------------------------".
            format(item["file_name"]))
        logger.info(
            "-------------- item[label]: {} ----------------------------".
            format(item["label"]))
        # Rows are expected in sequential order starting at index 2,
        # wrapping around once the end of the dataset is reached.
        expected_name = np.array(
            data[(num_iter + 2) % dataset_size]['file_name'], dtype='S')
        assert item['file_name'] == expected_name
        num_iter += 1
    assert num_iter == 10
def test_add_sampler_invalid_input():
    """Invalid sampler arguments raise the documented errors."""
    manifest_file = "../data/dataset/testManifestData/test5trainimgs.json"
    _ = {(172876, 0): 0, (54214, 0): 1, (54214, 1): 2, (173673, 0): 3,
         (64631, 1): 4}
    data1 = ds.ManifestDataset(manifest_file)

    # Objects that are not samplers are rejected by use_sampler().
    for bad_input in (1, "sampler"):
        with pytest.raises(TypeError) as info:
            data1.use_sampler(bad_input)
        assert "not an instance of a sampler" in str(info.value)

    # Supplying both a sampler and num_samples is a conflict.
    sampler = ds.SequentialSampler()
    with pytest.raises(RuntimeError) as info:
        data2 = ds.ManifestDataset(manifest_file, sampler=sampler,
                                   num_samples=20)
    assert "sampler and num_samples cannot be specified at the same time" in str(
        info.value)
def test_chained_sampler_03():
    """Chained Random->Sequential sampler, with repeat then batch."""
    logger.info("Test Case Chained Sampler - Random and Sequential, with repeat then batch")

    # Chain a SequentialSampler beneath a RandomSampler.
    parent_sampler = ds.RandomSampler()
    parent_sampler.add_child(ds.SequentialSampler())

    # Build the pipeline: sample -> repeat x2 -> batch(5).
    data1 = ds.ImageFolderDataset(DATA_DIR, sampler=parent_sampler)
    data1 = data1.repeat(count=2)
    data1 = data1.batch(batch_size=5, drop_remainder=False)

    # Dataset size must account for both the repeat and the batching.
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 18

    # Iterate and confirm the number of batches matches the size.
    num_iter = 0
    for item in data1.create_dict_iterator(num_epochs=1):
        # each data is a dictionary with keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1
    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 18
def test_generator_num_samples():
    """num_samples limits GeneratorDataset output for sampler, list-sampler,
    and plain-generator construction alike."""
    source = [(np.array([x]),) for x in range(64)]
    num_samples = 32

    # Three equivalent ways of capping the row count at num_samples.
    ds1 = ds.GeneratorDataset(
        source, ["data"],
        sampler=ds.SequentialSampler(num_samples=num_samples))
    ds2 = ds.GeneratorDataset(source, ["data"],
                              sampler=list(range(32)),
                              num_samples=num_samples)
    ds3 = ds.GeneratorDataset(generator_1d, ["data"],
                              num_samples=num_samples)

    # Each pipeline must yield exactly num_samples rows.
    for dataset in (ds1, ds2, ds3):
        count = 0
        for _ in dataset.create_dict_iterator():
            count += 1
        assert count == num_samples
def test_raise_error():
    """Verify use_sampler() on a concatenated (+) dataset rejects unsupported
    samplers with the expected exception types.

    Bug fix: the original asserted ``excinfo.type == 'TypeError'`` — but
    ``excinfo.type`` is the exception *class*, never equal to a string, so the
    checks could not pass as written. Compare against the class itself.
    """
    data1 = [{'image': np.zeros(1, np.uint8)},
             {'image': np.zeros(2, np.uint8)},
             {'image': np.zeros(3, np.uint8)},
             {'image': np.zeros(4, np.uint8)},
             {'image': np.zeros(5, np.uint8)}]
    data2 = [{'image': np.zeros(6, np.uint8)},
             {'image': np.zeros(7, np.uint8)},
             {'image': np.zeros(8, np.uint8)}]

    ds1 = ds.PaddedDataset(data1)
    ds4 = ds1.batch(2)
    ds2 = ds.PaddedDataset(data2)
    ds3 = ds4 + ds2  # concatenated dataset under test

    # A DistributedSampler on a batched concat dataset is a TypeError.
    with pytest.raises(TypeError) as excinfo:
        testsampler = ds.DistributedSampler(num_shards=2, shard_id=0,
                                            shuffle=False, num_samples=None)
        ds3.use_sampler(testsampler)
    assert excinfo.type is TypeError

    # A non-distributed sampler is also a TypeError.
    with pytest.raises(TypeError) as excinfo:
        other_sampler = ds.SequentialSampler()
        ds3.use_sampler(other_sampler)
    assert excinfo.type is TypeError

    # shuffle=True is not supported here: ValueError.
    with pytest.raises(ValueError) as excinfo:
        testsampler = ds.DistributedSampler(num_shards=2, shard_id=0,
                                            shuffle=True, num_samples=None)
        ds3.use_sampler(testsampler)
    assert excinfo.type is ValueError

    # num_samples is not supported here: ValueError.
    with pytest.raises(ValueError) as excinfo:
        testsampler = ds.DistributedSampler(num_shards=2, shard_id=0,
                                            shuffle=False, num_samples=5)
        ds3.use_sampler(testsampler)
    assert excinfo.type is ValueError
def test_add_sampler_invalid_input():
    """Invalid sampler arguments raise the documented errors."""
    manifest_file = "../data/dataset/testManifestData/test5trainimgs.json"
    _ = {(172876, 0): 0, (54214, 0): 1, (54214, 1): 2, (173673, 0): 3,
         (64631, 1): 4}
    data1 = ds.ManifestDataset(manifest_file)

    # Objects that are not samplers are rejected by use_sampler().
    for bad_input in (1, "sampler"):
        with pytest.raises(TypeError) as info:
            data1.use_sampler(bad_input)
        assert "not an instance of a sampler" in str(info.value)

    # Supplying both a sampler and num_samples is a conflict.
    sampler = ds.SequentialSampler()
    with pytest.raises(ValueError) as info:
        data2 = ds.ManifestDataset(manifest_file, sampler=sampler,
                                   num_samples=20)
    assert "Conflicting arguments during sampler assignments" in str(
        info.value)
def test_sequential_sampler():
    """SequentialSampler reads the image folder in label order."""
    logger.info("Test Case SequentialSampler")

    # Expected labels: 11 images per class, classes 0..3, in order.
    golden = [0] * 11 + [1] * 11 + [2] * 11 + [3] * 11

    # define parameters
    repeat_count = 1

    # apply dataset operations
    seq_sampler = ds.SequentialSampler()
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, sampler=seq_sampler)
    data1 = data1.repeat(repeat_count)

    result = []
    num_iter = 0
    for item in data1.create_dict_iterator():
        # each data is a dictionary with keys "image" and "label"
        result.append(item["label"])
        num_iter += 1

    logger.info("Result: {}".format(result))
    assert result == golden
def test_manifest_sampler_chain_batch_repeat():
    """ Test ManifestDataset sampler chain DistributedSampler->SequentialSampler, with batch then repeat """
    logger.info("test_manifest_sampler_chain_batch_repeat")
    manifest_file = "../data/dataset/testManifestData/test5trainimgs.json"

    # Chain: DistributedSampler(1 shard) -> SequentialSampler.
    dist_sampler = ds.DistributedSampler(num_shards=1, shard_id=0,
                                         shuffle=False, num_samples=5)
    dist_sampler.add_child(ds.SequentialSampler())

    # Pipeline: decode -> one-hot labels -> batch(5) -> repeat x2.
    data1 = ds.ManifestDataset(manifest_file, decode=True,
                               sampler=dist_sampler)
    one_hot_encode = c_transforms.OneHot(3)
    data1 = data1.map(operations=one_hot_encode, input_columns=["label"])
    data1 = data1.batch(batch_size=5, drop_remainder=False)
    data1 = data1.repeat(count=2)

    # Size: 5 rows -> 1 batch, repeated twice -> 2.
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 2
def test_voc_sampler_chain():
    """ Test VOC sampler chain """
    logger.info("test_voc_sampler_chain")

    # DistributedSampler(2 shards, shard 0) with a SequentialSampler child.
    dist_sampler = ds.DistributedSampler(num_shards=2, shard_id=0,
                                         shuffle=False, num_samples=5)
    dist_sampler.add_child(ds.SequentialSampler(start_index=0))
    data1 = ds.VOCDataset(VOC_DATA_DIR, task="Segmentation",
                          sampler=dist_sampler)

    # Reported size must match the sampler's num_samples.
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 5

    # Iteration row count must agree with the reported size.
    seen = 0
    for _ in data1:
        seen += 1
    assert seen == 5

    # Collect and log the dataset contents.
    res = []
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        logger.info("item: {}".format(item))
        res.append(item)
    logger.info("dataset: {}".format(res))
def test_manifest_sampler_chain_repeat():
    """ Test ManifestDataset sampler chain DistributedSampler->SequentialSampler, with repeat """
    # Bug fix: the log message previously said "test_manifest_sampler_chain_batch",
    # which is a different test; log this function's own name.
    logger.info("test_manifest_sampler_chain_repeat")
    manifest_file = "../data/dataset/testManifestData/test5trainimgs.json"

    # Create sampler chain DistributedSampler->SequentialSampler
    sampler = ds.DistributedSampler(num_shards=1, shard_id=0, shuffle=False,
                                    num_samples=5)
    child_sampler = ds.SequentialSampler()
    sampler.add_child(child_sampler)

    # Create ManifestDataset with sampler chain, repeated twice.
    data1 = ds.ManifestDataset(manifest_file, sampler=sampler)
    data1 = data1.repeat(count=2)

    # Verify dataset size: 5 samples x 2 repeats.
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 10

    # Verify number of rows
    assert sum([1 for _ in data1]) == 10

    # Verify dataset contents against the golden md5.
    filename = "sampler_chain_manifest_repeat_result.npz"
    save_and_check_md5(data1, filename, generate_golden=GENERATE_GOLDEN)
def test_cifar_sampler_chain():
    """ Test Cifar sampler chain """
    logger.info("test_cifar_sampler_chain")

    # Three-level chain: Distributed -> Random(replacement) -> Sequential.
    root_sampler = ds.DistributedSampler(num_shards=2, shard_id=0,
                                         shuffle=False, num_samples=5)
    mid_sampler = ds.RandomSampler(replacement=True, num_samples=4)
    leaf_sampler = ds.SequentialSampler(start_index=0, num_samples=2)
    mid_sampler.add_child(leaf_sampler)
    root_sampler.add_child(mid_sampler)
    data1 = ds.Cifar10Dataset(CIFAR10_DATA_DIR, sampler=root_sampler)

    # The chain should narrow the pipeline to a single row.
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 1

    # Iteration row count must agree with the reported size.
    seen = 0
    for _ in data1:
        seen += 1
    assert seen == 1

    # Collect and log the dataset contents.
    res = []
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        logger.info("item: {}".format(item))
        res.append(item)
    logger.info("dataset: {}".format(res))
def test_generator_sequential_sampler():
    """GeneratorDataset with SequentialSampler yields rows in source order."""
    source = [(np.array([x]),) for x in range(64)]
    ds1 = ds.GeneratorDataset(source, ["data"], sampler=ds.SequentialSampler())
    for index, row in enumerate(
            ds1.create_dict_iterator(num_epochs=1, output_numpy=True)):
        # each row is a dictionary; "data" must equal its position index
        np.testing.assert_array_equal(row["data"], np.array([index]))
def test_sequential_sampler():
    """GeneratorDataset with SequentialSampler yields rows in source order."""
    source = [(np.array([x]),) for x in range(64)]
    ds1 = ds.GeneratorDataset(source, ["data"], sampler=ds.SequentialSampler())
    for index, row in enumerate(ds1.create_dict_iterator()):
        # each row is a dictionary; "data" must equal its position index
        assert np.array_equal(row["data"], np.array([index]))
def test_numpy_slices_sequential_sampler():
    """NumpySlicesDataset + SequentialSampler + repeat preserves row order."""
    logger.info("Test numpy_slices_dataset with SequentialSampler and repeat.")

    np_data = [[1, 2], [3, 4], [5, 6], [7, 8],
               [9, 10], [11, 12], [13, 14], [15, 16]]
    # Note: local renamed from `ds` to avoid shadowing the usual dataset alias.
    dataset = de.NumpySlicesDataset(np_data,
                                    sampler=de.SequentialSampler()).repeat(2)

    # Two sequential passes: row i must equal source row i mod 8.
    for i, data in enumerate(dataset):
        assert np.equal(data[0].asnumpy(), np_data[i % 8]).all()
def test_config(start_index, num_samples):
    # Helper: run a SequentialSampler(start_index, num_samples) over the
    # manifest and return the map_ key for each (image height, label) pair.
    # `manifest_file` and `map_` come from the enclosing scope.
    seq_sampler = ds.SequentialSampler(start_index, num_samples)
    dataset = ds.ManifestDataset(manifest_file, sampler=seq_sampler)
    res = []
    for row in dataset.create_dict_iterator():
        res.append(map_[(row["image"].shape[0], row["label"].item())])
    return res
def test_cifar10_with_chained_sampler_get_dataset_size():
    """ Test Cifar10Dataset with PKSampler chained with a SequentialSampler and get_dataset_size """
    # Sequential parent limited to 5 samples, PK(4) child.
    parent = ds.SequentialSampler(start_index=0, num_samples=5)
    parent.add_child(ds.PKSampler(4))
    data = ds.Cifar10Dataset(DATA_DIR_10, sampler=parent)

    # get_dataset_size() and actual iteration must both report 5 rows.
    reported_size = data.get_dataset_size()
    actual_rows = 0
    for _ in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        actual_rows += 1
    assert reported_size == actual_rows == 5
def test_config(num_samples, num_repeats=None):
    # Helper: read the manifest with a plain SequentialSampler, optionally
    # repeating, and return the map_ key for each (image height, label) pair.
    # `manifest_file`, `map_` and `print_res` come from the enclosing scope.
    seq_sampler = ds.SequentialSampler()
    data1 = ds.ManifestDataset(manifest_file, num_samples=num_samples,
                               sampler=seq_sampler)
    if num_repeats is not None:
        data1 = data1.repeat(num_repeats)
    res = []
    for item in data1.create_dict_iterator():
        logger.info("item[image].shape[0]: {}, item[label].item(): {}"
                    .format(item["image"].shape[0], item["label"].item()))
        res.append(map_[(item["image"].shape[0], item["label"].item())])
    if print_res:
        logger.info("image.shapes and labels: {}".format(res))
    return res
def test_config(num_shards, shard_id):
    # Helper: DistributedSampler(num_shards, shard_id) with a Sequential
    # child over the manifest; returns map_ keys for the rows seen.
    # `manifest_file` and `map_` come from the enclosing scope.
    dist_sampler = ds.DistributedSampler(num_shards, shard_id, False)
    dist_sampler.add_child(ds.SequentialSampler())
    data1 = ds.ManifestDataset(manifest_file, num_samples=5,
                               sampler=dist_sampler)
    res = []
    for item in data1.create_dict_iterator():
        logger.info("item[image].shape[0]: {}, item[label].item(): {}"
                    .format(item["image"].shape[0], item["label"].item()))
        res.append(map_[(item["image"].shape[0], item["label"].item())])
    return res
def test_cifar10_sequential_sampler():
    """ Test Cifar10Dataset with SequentialSampler """
    logger.info("Test Cifar10Dataset Op with SequentialSampler")
    num_samples = 30

    # Two pipelines that should be equivalent: an explicit SequentialSampler
    # versus shuffle=False with the same num_samples cap.
    data1 = ds.Cifar10Dataset(
        DATA_DIR_10, sampler=ds.SequentialSampler(num_samples=num_samples))
    data2 = ds.Cifar10Dataset(DATA_DIR_10, shuffle=False,
                              num_samples=num_samples)

    # Labels must match row-for-row, and the row count must equal the cap.
    num_iter = 0
    for item1, item2 in zip(data1.create_dict_iterator(),
                            data2.create_dict_iterator()):
        np.testing.assert_equal(item1["label"], item2["label"])
        num_iter += 1
    assert num_iter == num_samples
def test_case_14():
    """
    Test 1D Generator MP + CPP sampler
    """
    logger.info("Test 1D Generator MP : 0 - 63")

    source = [(np.array([x]),) for x in range(256)]
    ds1 = ds.GeneratorDataset(source, ["data"],
                              sampler=ds.SequentialSampler(),
                              num_parallel_workers=4).repeat(2)

    # Two sequential passes: values restart from 0 after index 255.
    expected = 0
    for data in ds1.create_dict_iterator():
        # each data is a dictionary
        assert np.array_equal(data["data"], np.array([expected]))
        expected += 1
        if expected == 256:
            expected = 0
def test_sampler_chain_errors():
    """
    Test error cases for sampler chains
    """
    logger.info("test_sampler_chain_errors")

    error_msg_1 = "'NoneType' object has no attribute 'add_child'"
    # Test add child sampler within child sampler.
    # NOTE: this pattern rebinds `sampler` to the return value of add_child
    # (None, per the expected AttributeError below), so the follow-up call
    # is made on None and must raise.
    sampler = ds.SequentialSampler(start_index=1, num_samples=2)
    sampler = sampler.add_child(
        ds.SequentialSampler(start_index=1, num_samples=2))
    with pytest.raises(AttributeError, match=error_msg_1):
        sampler.add_child(ds.SequentialSampler(start_index=1, num_samples=2))

    # error_msg_2 = "'NoneType' object has no attribute 'add_child'"
    # Test add second and nested child sampler — currently no error is
    # raised when a second child is attached (see FIXME below).
    sampler = ds.SequentialSampler(start_index=1, num_samples=2)
    child_sampler = ds.SequentialSampler(start_index=1, num_samples=2)
    sampler.add_child(child_sampler)
    child_sampler2 = ds.SequentialSampler(start_index=1, num_samples=2)
    sampler.add_child(child_sampler2)
    # FIXME - no error is raised; uncomment after code issue is resolved
    # with pytest.raises(AttributeError, match=error_msg_2):
    #     sampler.add_child(child_sampler2)
    #     np_data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    #     data1 = ds.NumpySlicesDataset(np_data, sampler=sampler)

    error_msg_3 = "Conflicting arguments during sampler assignments."
    # Test conflicting arguments (sampler and shuffle=False) for sampler (no chain)
    np_data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    sampler = ds.SequentialSampler(start_index=1, num_samples=3)
    with pytest.raises(ValueError, match=error_msg_3):
        ds.NumpySlicesDataset(np_data, shuffle=False, sampler=sampler)

    # error_msg_4 = "Conflicting arguments during sampler assignments."
    # Test conflicting arguments (sampler and shuffle=False) for sampler chaining
    np_data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    sampler = ds.SequentialSampler(start_index=1, num_samples=3)
    sampler = sampler.add_child(
        ds.SequentialSampler(start_index=1, num_samples=2))
def test_case_16():
    """
    Test multi column generator Mp + CPP sampler
    """
    logger.info("Test multi column generator")

    source = [(np.array([x]), np.array([x + 1])) for x in range(256)]

    # apply dataset operations
    data1 = ds.GeneratorDataset(source, ["col0", "col1"],
                                sampler=ds.SequentialSampler())

    # Sequential order: col0 holds the index, col1 holds index + 1.
    for idx, item in enumerate(data1.create_dict_iterator()):
        # each data is a dictionary
        assert np.array_equal(item["col0"], np.array([idx]))
        assert np.array_equal(item["col1"], np.array([idx + 1]))
def test_mnist_sequential_sampler():
    """ Test MnistDataset with SequentialSampler """
    logger.info("Test MnistDataset Op with SequentialSampler")
    num_samples = 50

    # Equivalent pipelines: explicit SequentialSampler vs shuffle=False.
    data1 = ds.MnistDataset(
        DATA_DIR, sampler=ds.SequentialSampler(num_samples=num_samples))
    data2 = ds.MnistDataset(DATA_DIR, shuffle=False, num_samples=num_samples)

    # Collect labels from both pipelines row-for-row.
    label_list1, label_list2 = [], []
    num_iter = 0
    for item1, item2 in zip(data1.create_dict_iterator(num_epochs=1),
                            data2.create_dict_iterator(num_epochs=1)):
        label_list1.append(item1["label"].asnumpy())
        label_list2.append(item2["label"].asnumpy())
        num_iter += 1

    # Labels must match exactly and the row count must equal num_samples.
    np.testing.assert_array_equal(label_list1, label_list2)
    assert num_iter == num_samples
def test_sampler_py_api():
    # Exercise the legacy create()/set_num_rows/set_num_samples/initialize/
    # get_indices flow for each built-in sampler type, in the same order.
    for py_sampler in (ds.SequentialSampler(),
                       ds.RandomSampler(),
                       ds.DistributedSampler(8, 4)):
        runtime_sampler = py_sampler.create()
        runtime_sampler.set_num_rows(128)
        runtime_sampler.set_num_samples(64)
        runtime_sampler.initialize()
        runtime_sampler.get_indices()
def test_imagefolder_sampler_chain():
    """ Test ImageFolderDataset sampler chain """
    logger.info("test_imagefolder_sampler_chain")

    # SequentialSampler(start=1, n=3) with a PKSampler(2) child.
    parent = ds.SequentialSampler(start_index=1, num_samples=3)
    parent.add_child(ds.PKSampler(2))
    data1 = ds.ImageFolderDataset(IMAGENET_RAWDATA_DIR, sampler=parent)

    # Reported size must match the parent sampler's num_samples.
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 3

    # Iteration row count must agree with the reported size.
    seen = 0
    for _ in data1:
        seen += 1
    assert seen == 3

    # Collect and log the dataset contents.
    res = []
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        logger.info("item: {}".format(item))
        res.append(item)
    logger.info("dataset: {}".format(res))
def test_sampler_py_api():
    # parse() both samplers to their IR form, then attach one as a child.
    seq_ir = ds.SequentialSampler().parse()
    rand_ir = ds.RandomSampler().parse()
    rand_ir.add_child(seq_ir)
def test_serdes_imagefolder_dataset(remove_json_files=True):
    """
    Test simulating resnet50 dataset pipeline.

    Builds a WeightedRandomSampler->SequentialSampler ImageFolder pipeline,
    serializes it to JSON (file and dict), deserializes it three ways, and
    checks that all four pipelines produce identical rows.

    Args:
        remove_json_files: delete the generated JSON files afterwards.
    """
    data_dir = "../data/dataset/testPK/data"
    ds.config.set_seed(1)  # fixed seed so the random sampler is reproducible

    # define data augmentation parameters
    rescale = 1.0 / 255.0
    shift = 0.0
    resize_height, resize_width = 224, 224
    weights = [
        1.0, 0.1, 0.02, 0.3, 0.4, 0.05, 1.2, 0.13, 0.14, 0.015, 0.16, 1.1
    ]

    # Constructing DE pipeline: weighted-random sampler (11 draws) with a
    # sequential child, then decode -> rescale -> resize -> batch(2).
    sampler = ds.WeightedRandomSampler(weights, 11)
    child_sampler = ds.SequentialSampler()
    sampler.add_child(child_sampler)
    data1 = ds.ImageFolderDataset(data_dir, sampler=sampler)
    data1 = data1.repeat(1)
    data1 = data1.map(operations=[vision.Decode(True)],
                      input_columns=["image"])
    rescale_op = vision.Rescale(rescale, shift)
    resize_op = vision.Resize((resize_height, resize_width), Inter.LINEAR)
    data1 = data1.map(operations=[rescale_op, resize_op],
                      input_columns=["image"])
    data1 = data1.batch(2)

    # Serialize the dataset pre-processing pipeline (to file and to a dict).
    # data1 should still work after saving.
    ds.serialize(data1, "imagenet_dataset_pipeline.json")
    ds1_dict = ds.serialize(data1)
    assert validate_jsonfile("imagenet_dataset_pipeline.json") is True

    # Print the serialized pipeline to stdout
    ds.show(data1)

    # Deserialize the serialized json file
    data2 = ds.deserialize(json_filepath="imagenet_dataset_pipeline.json")

    # Serialize the pipeline we just deserialized.
    # The content of the json file should be the same to the previous serialize.
    ds.serialize(data2, "imagenet_dataset_pipeline_1.json")
    assert validate_jsonfile("imagenet_dataset_pipeline_1.json") is True
    # Round-tripping must not change the serialized form.
    assert filecmp.cmp('imagenet_dataset_pipeline.json',
                       'imagenet_dataset_pipeline_1.json')

    # Deserialize the latest json file again
    data3 = ds.deserialize(json_filepath="imagenet_dataset_pipeline_1.json")
    data4 = ds.deserialize(input_dict=ds1_dict)
    num_samples = 0
    # Iterate and compare the data in the original pipeline (data1) against
    # the deserialized pipelines (data2, data3, data4).
    for item1, item2, item3, item4 in zip(
            data1.create_dict_iterator(num_epochs=1, output_numpy=True),
            data2.create_dict_iterator(num_epochs=1, output_numpy=True),
            data3.create_dict_iterator(num_epochs=1, output_numpy=True),
            data4.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(item1['image'], item2['image'])
        np.testing.assert_array_equal(item1['image'], item3['image'])
        np.testing.assert_array_equal(item1['label'], item2['label'])
        np.testing.assert_array_equal(item1['label'], item3['label'])
        np.testing.assert_array_equal(item3['image'], item4['image'])
        np.testing.assert_array_equal(item3['label'], item4['label'])
        num_samples += 1

    logger.info("Number of data in data1: {}".format(num_samples))
    # 11 sampled rows batched by 2 -> 6 batches.
    assert num_samples == 6

    # Remove the generated json file
    if remove_json_files:
        delete_json_files()