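# ---------------------------------------------------------------------------
# Module-level setup these tests rely on. This is a minimal sketch: the
# imports follow standard MindSpore test conventions, but the exact data-file
# constants, the (image byte size, label) -> row-id maps, and the shared
# `util` helpers are assumptions inferred from how the tests below use them.
# ---------------------------------------------------------------------------
import numpy as np
import pytest

import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as c_transforms
import mindspore.dataset.transforms.c_transforms as data_trans
import mindspore.dataset.vision.c_transforms as vision
from mindspore import log as logger
from util import save_and_check_md5  # shared golden-file helper, assumed available

GENERATE_GOLDEN = False

# Assumed locations of the test manifests used throughout this file.
DATA_FILE = "../data/dataset/testManifestData/test.manifest"
MANIFEST_DATA_FILE = "../data/dataset/testManifestData/test.manifest"
manifest_file = "../data/dataset/testManifestData/test5trainimgs.json"

# Maps (image byte size, label) to the expected row index; the pairs are the
# ones that appear verbatim inside the tests below.
manifest_map = {(172876, 0): 0, (54214, 0): 1, (54214, 1): 2,
                (173673, 0): 3, (64631, 1): 4}
map_ = manifest_map
print_res = False
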
def test_add_sampler_invalid_input():
    manifest_file = "../data/dataset/testManifestData/test5trainimgs.json"
    _ = {(172876, 0): 0, (54214, 0): 1, (54214, 1): 2, (173673, 0): 3, (64631, 1): 4}
    data1 = ds.ManifestDataset(manifest_file)

    with pytest.raises(TypeError) as info:
        data1.use_sampler(1)
    assert "not an instance of a sampler" in str(info.value)

    with pytest.raises(TypeError) as info:
        data1.use_sampler("sampler")
    assert "not an instance of a sampler" in str(info.value)

    sampler = ds.SequentialSampler()
    with pytest.raises(RuntimeError) as info:
        data2 = ds.ManifestDataset(manifest_file, sampler=sampler, num_samples=20)
    assert "sampler and num_samples cannot be specified at the same time" in str(info.value)

def test_manifest_dataset_exception():
    def exception_func(item):
        raise Exception("Error occurred!")

    try:
        data = ds.ManifestDataset(DATA_FILE)
        data = data.map(operations=exception_func, input_columns=["image"], num_parallel_workers=1)
        for _ in data.__iter__():
            pass
        assert False
    except RuntimeError as e:
        assert "map operation: [PyFunc] failed. The corresponding data files" in str(e)

    try:
        data = ds.ManifestDataset(DATA_FILE)
        data = data.map(operations=vision.Decode(), input_columns=["image"], num_parallel_workers=1)
        data = data.map(operations=exception_func, input_columns=["image"], num_parallel_workers=1)
        for _ in data.__iter__():
            pass
        assert False
    except RuntimeError as e:
        assert "map operation: [PyFunc] failed. The corresponding data files" in str(e)

    try:
        data = ds.ManifestDataset(DATA_FILE)
        data = data.map(operations=exception_func, input_columns=["label"], num_parallel_workers=1)
        for _ in data.__iter__():
            pass
        assert False
    except RuntimeError as e:
        assert "map operation: [PyFunc] failed. The corresponding data files" in str(e)

    NO_SOURCE_DATA_FILE = "../data/dataset/testManifestData/invalidNoSource.manifest"
    try:
        data = ds.ManifestDataset(NO_SOURCE_DATA_FILE)
        for _ in data.__iter__():
            pass
        assert False
    except RuntimeError as e:
        assert "Invalid data, source is not found in Manifest file" in str(e)

    NO_USAGE_DATA_FILE = "../data/dataset/testManifestData/invalidNoUsage.manifest"
    try:
        data = ds.ManifestDataset(NO_USAGE_DATA_FILE)
        for _ in data.__iter__():
            pass
        assert False
    except RuntimeError as e:
        assert "Invalid data, usage is not found in Manifest file" in str(e)

def test_mappable_invalid_input():
    d = ds.ManifestDataset(manifest_file)
    split_with_invalid_inputs(d)

    d = ds.ManifestDataset(manifest_file, num_shards=2, shard_id=0)
    with pytest.raises(RuntimeError) as info:
        _, _ = d.split([4, 1])
    assert "Dataset should not be sharded before split" in str(info.value)

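# `split_with_invalid_inputs` is defined elsewhere in the original suite. A
# minimal sketch of the kind of checks it plausibly performs; the specific
# bad inputs and the exception types are assumptions, not the suite's own.
def split_with_invalid_inputs(d):
    with pytest.raises((ValueError, RuntimeError)):
        _, _ = d.split([])            # empty sizes list
    with pytest.raises((ValueError, RuntimeError)):
        _, _ = d.split([-1, 6])       # negative absolute size
    with pytest.raises((ValueError, RuntimeError)):
        _, _ = d.split([0.5, 0.6])    # percentages that sum past 1
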
def test_manifest_dataset_size():
    ds_total = ds.ManifestDataset(MANIFEST_DATA_FILE)
    assert ds_total.get_dataset_size() == 4

    ds_shard_1_0 = ds.ManifestDataset(MANIFEST_DATA_FILE, num_shards=1, shard_id=0)
    assert ds_shard_1_0.get_dataset_size() == 4

    ds_shard_2_0 = ds.ManifestDataset(MANIFEST_DATA_FILE, num_shards=2, shard_id=0)
    assert ds_shard_2_0.get_dataset_size() == 2

    # 4 rows over 3 shards: each shard reports ceil(4 / 3) == 2 rows
    ds_shard_3_0 = ds.ManifestDataset(MANIFEST_DATA_FILE, num_shards=3, shard_id=0)
    assert ds_shard_3_0.get_dataset_size() == 2

def test_manifest():
    data = ds.ManifestDataset("../data/dataset/testManifestData/test.manifest")
    assert data.get_dataset_size() == 4
    assert data.num_classes() == 3

    data = data.shuffle(100)
    assert data.num_classes() == 3

def test_rounding():
    d = ds.ManifestDataset(manifest_file, shuffle=False)

    def split_output(s):
        return [manifest_map[(item["image"].shape[0], item["label"].item())]
                for item in s.create_dict_iterator()]

    # under rounding
    s1, s2 = d.split([0.5, 0.5], randomize=False)
    assert split_output(s1) == [0, 1, 2]
    assert split_output(s2) == [3, 4]

    # over rounding
    s1, s2, s3 = d.split([0.15, 0.55, 0.3], randomize=False)
    assert split_output(s1) == [0]
    assert split_output(s2) == [1, 2]
    assert split_output(s3) == [3, 4]

def test_mappable_get_dataset_size():
    d = ds.ManifestDataset(manifest_file, shuffle=False)
    s1, s2 = d.split([4, 1])

    assert d.get_dataset_size() == 5
    assert s1.get_dataset_size() == 4
    assert s2.get_dataset_size() == 1

def test_manifest_sampler_chain():
    """
    Test Manifest sampler chain
    """
    logger.info("test_manifest_sampler_chain")

    sampler = ds.RandomSampler(replacement=True, num_samples=2)
    child_sampler = ds.DistributedSampler(num_shards=1, shard_id=0, shuffle=False,
                                          num_samples=3, offset=1)
    sampler.add_child(child_sampler)
    data1 = ds.ManifestDataset(MANIFEST_DATA_FILE, sampler=sampler)

    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 2

    # Verify number of rows
    assert sum([1 for _ in data1]) == 2

    # Verify dataset contents
    res = []
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        logger.info("item: {}".format(item))
        res.append(item)
    logger.info("dataset: {}".format(res))

def test_manifest_sampler_chain_repeat():
    """
    Test ManifestDataset sampler chain DistributedSampler->SequentialSampler, with repeat
    """
    logger.info("test_manifest_sampler_chain_repeat")
    manifest_file = "../data/dataset/testManifestData/test5trainimgs.json"

    # Create sampler chain DistributedSampler->SequentialSampler
    sampler = ds.DistributedSampler(num_shards=1, shard_id=0, shuffle=False, num_samples=5)
    child_sampler = ds.SequentialSampler()
    sampler.add_child(child_sampler)

    # Create ManifestDataset with sampler chain
    data1 = ds.ManifestDataset(manifest_file, sampler=sampler)
    data1 = data1.repeat(count=2)

    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 10

    # Verify number of rows
    assert sum([1 for _ in data1]) == 10

    # Verify dataset contents
    filename = "sampler_chain_manifest_repeat_result.npz"
    save_and_check_md5(data1, filename, generate_golden=GENERATE_GOLDEN)

def test_manifest_sampler_chain_batch_repeat():
    """
    Test ManifestDataset sampler chain DistributedSampler->SequentialSampler, with batch then repeat
    """
    logger.info("test_manifest_sampler_chain_batch_repeat")
    manifest_file = "../data/dataset/testManifestData/test5trainimgs.json"

    # Create sampler chain DistributedSampler->SequentialSampler
    sampler = ds.DistributedSampler(num_shards=1, shard_id=0, shuffle=False, num_samples=5)
    child_sampler = ds.SequentialSampler()
    sampler.add_child(child_sampler)

    # Create ManifestDataset with sampler chain
    data1 = ds.ManifestDataset(manifest_file, decode=True, sampler=sampler)
    one_hot_encode = c_transforms.OneHot(3)
    data1 = data1.map(operations=one_hot_encode, input_columns=["label"])
    data1 = data1.batch(batch_size=5, drop_remainder=False)
    data1 = data1.repeat(count=2)

    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 2

def sequential_sampler_config(start_index, num_samples):
    # Run a SequentialSampler over the manifest and return the mapped row ids.
    sampler = ds.SequentialSampler(start_index, num_samples)
    d = ds.ManifestDataset(manifest_file, sampler=sampler)

    res = []
    for item in d.create_dict_iterator(num_epochs=1, output_numpy=True):
        res.append(map_[(item["image"].shape[0], item["label"].item())])
    return res

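# Hypothetical driver in the style the suite presumably uses; the expected id
# lists assume the manifest's natural row order matches manifest_map above.
def test_sequential_sampler_basics():
    assert sequential_sampler_config(0, 5) == [0, 1, 2, 3, 4]
    assert sequential_sampler_config(2, 3) == [2, 3, 4]
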
def subset_sampler_config(num_samples, start_index, subset_size):
    # Run a SubsetSampler over the manifest and return the mapped row ids.
    sampler = ds.SubsetSampler(start_index, subset_size)
    d = ds.ManifestDataset(manifest_file, sampler=sampler)

    res = []
    for item in d.create_dict_iterator():
        res.append(map_[(item["image"].shape[0], item["label"].item())])
    return res

def test_manifest_dataset_multi_label():
    data = ds.ManifestDataset(DATA_FILE, decode=True, shuffle=False)
    count = 0
    expect_label = [1, 0, 0, [0, 2]]
    for item in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        assert item["label"].tolist() == expect_label[count]
        logger.info("item[image] is {}".format(item["image"]))
        count = count + 1
    assert count == 4

def test_manifest_dataset_eval():
    data = ds.ManifestDataset(DATA_FILE, "eval", decode=True)
    count = 0
    for item in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        logger.info("item[image] is {}".format(item["image"]))
        count = count + 1
        assert item["label"] in (0, 1)
    assert count == 2

def random_sampler_config(replacement, num_samples, num_repeats):
    # Run a RandomSampler over the manifest, repeat it, and return the mapped row ids.
    sampler = ds.RandomSampler(replacement=replacement, num_samples=num_samples)
    data1 = ds.ManifestDataset(manifest_file, sampler=sampler)
    data1 = data1.repeat(num_repeats)

    res = []
    for item in data1.create_dict_iterator():
        res.append(map_[(item["image"].shape[0], item["label"].item())])
    if print_res:
        logger.info("image.shapes and labels: {}".format(res))
    return res

def num_samples_config(num_samples, num_repeats, sampler):
    # Build the manifest dataset with an explicit num_samples plus an optional
    # sampler, and return the mapped row ids.
    data1 = ds.ManifestDataset(manifest_file, num_samples=num_samples, sampler=sampler)
    if num_repeats is not None:
        data1 = data1.repeat(num_repeats)

    res = []
    for item in data1.create_dict_iterator():
        logger.info("item[image].shape[0]: {}, item[label].item(): {}"
                    .format(item["image"].shape[0], item["label"].item()))
        res.append(map_[(item["image"].shape[0], item["label"].item())])
    return res

def test_num_samples():
    manifest_file = "../data/dataset/testManifestData/test5trainimgs.json"
    num_samples = 1
    data1 = ds.ManifestDataset(manifest_file, num_samples=num_samples,
                               num_shards=3, shard_id=1)

    row_count = 0
    for _ in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
        row_count += 1
    assert row_count == 1

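# `multi_label_hot` is defined elsewhere in the original suite; the next test
# maps it over one-hot labels. A minimal sketch consistent with that test's
# expected outputs (the original implementation may differ):
def multi_label_hot(x):
    # Collapse a (k, num_classes) stack of one-hot rows into one multi-hot
    # vector, e.g. labels [0, 2] -> [1, 0, 1]; single-label one-hot vectors
    # pass through unchanged.
    if x.ndim > 1:
        x = np.minimum(x.sum(axis=0), 1).astype(x.dtype)
    return x
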
def test_manifest_dataset_multi_label_onehot():
    data = ds.ManifestDataset(DATA_FILE, decode=True, shuffle=False)
    expect_label = [[[0, 1, 0], [1, 0, 0]], [[1, 0, 0], [1, 0, 1]]]
    one_hot_encode = data_trans.OneHot(3)
    data = data.map(operations=one_hot_encode, input_columns=["label"])
    data = data.map(operations=multi_label_hot, input_columns=["label"])
    data = data.batch(2)

    count = 0
    for item in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        assert item["label"].tolist() == expect_label[count]
        logger.info("item[image] is {}".format(item["image"]))
        count = count + 1

def test_manifest_dataset_get_class_index():
    data = ds.ManifestDataset(DATA_FILE, decode=True)
    class_indexing = data.get_class_indexing()
    assert class_indexing == {'cat': 0, 'dog': 1, 'flower': 2}

    data = data.shuffle(4)
    class_indexing = data.get_class_indexing()
    assert class_indexing == {'cat': 0, 'dog': 1, 'flower': 2}

    count = 0
    for item in data.create_dict_iterator(num_epochs=1):
        logger.info("item[image] is {}".format(item["image"]))
        count = count + 1
    assert count == 4

def sequential_num_samples_config(num_samples, num_repeats=None):
    # Run a SequentialSampler capped at num_samples and return the mapped row ids.
    sampler = ds.SequentialSampler(num_samples=num_samples)
    data1 = ds.ManifestDataset(manifest_file, sampler=sampler)
    if num_repeats is not None:
        data1 = data1.repeat(num_repeats)

    res = []
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
        logger.info("item[image].shape[0]: {}, item[label].item(): {}"
                    .format(item["image"].shape[0], item["label"].item()))
        res.append(map_[(item["image"].shape[0], item["label"].item())])
    if print_res:
        logger.info("image.shapes and labels: {}".format(res))
    return res

def random_sampler_multi_epoch_config(replacement, num_samples, num_repeats, validate):
    # Draw up to num_repeats epochs from a RandomSampler; stop as soon as one
    # epoch's sorted contents differ from `validate`, and assert that this
    # happens before the retry budget runs out.
    sampler = ds.RandomSampler(replacement=replacement, num_samples=num_samples)
    data1 = ds.ManifestDataset(manifest_file, sampler=sampler)

    while num_repeats > 0:
        res = []
        for item in data1.create_dict_iterator():
            res.append(map_[(item["image"].shape[0], item["label"].item())])
        if print_res:
            logger.info("image.shapes and labels: {}".format(res))
        if validate != sorted(res):
            break
        num_repeats -= 1
    assert num_repeats > 0

def distributed_sampler_chain_config(num_shards, shard_id):
    # Chain DistributedSampler->SequentialSampler and return the mapped row ids.
    sampler = ds.DistributedSampler(num_shards, shard_id, False)
    child_sampler = ds.SequentialSampler()
    sampler.add_child(child_sampler)
    data1 = ds.ManifestDataset(manifest_file, num_samples=5, sampler=sampler)

    res = []
    for item in data1.create_dict_iterator():
        logger.info("item[image].shape[0]: {}, item[label].item(): {}"
                    .format(item["image"].shape[0], item["label"].item()))
        res.append(map_[(item["image"].shape[0], item["label"].item())])
    return res

def test_mappable_split_general():
    d = ds.ManifestDataset(manifest_file, shuffle=False)
    d = d.take(5)

    def split_output(s):
        return [manifest_map[(item["image"].shape[0], item["label"].item())]
                for item in s.create_dict_iterator()]

    # absolute rows
    s1, s2 = d.split([4, 1], randomize=False)
    assert split_output(s1) == [0, 1, 2, 3]
    assert split_output(s2) == [4]

    # exact percentages
    s1, s2 = d.split([0.8, 0.2], randomize=False)
    assert split_output(s1) == [0, 1, 2, 3]
    assert split_output(s2) == [4]

    # fuzzy percentages
    s1, s2 = d.split([0.33, 0.67], randomize=False)
    assert split_output(s1) == [0, 1]
    assert split_output(s2) == [2, 3, 4]

def test_manifest_dataset_class_index():
    class_indexing = {"dog": 11}
    data = ds.ManifestDataset(DATA_FILE, decode=True, class_indexing=class_indexing)
    out_class_indexing = data.get_class_indexing()
    assert out_class_indexing == {"dog": 11}

    count = 0
    for item in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        logger.info("item[image] is {}".format(item["image"]))
        count = count + 1
        assert item["label"] == 11
    assert count == 1

def test_manifest_dataset_train():
    data = ds.ManifestDataset(DATA_FILE, decode=True)
    count = 0
    cat_count = 0
    dog_count = 0
    for item in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        logger.info("item[image] is {}".format(item["image"]))
        count = count + 1
        if item["label"].size == 1 and item["label"] == 0:
            cat_count = cat_count + 1
        elif item["label"].size == 1 and item["label"] == 1:
            dog_count = dog_count + 1
    assert cat_count == 2
    assert dog_count == 1
    assert count == 4

def test_manifest_dataset_get_num_class():
    data = ds.ManifestDataset(DATA_FILE, decode=True, shuffle=False)
    assert data.num_classes() == 3

    padded_samples = [{'image': np.zeros(1, np.uint8), 'label': np.array(1, np.int32)}]
    padded_ds = ds.PaddedDataset(padded_samples)
    data = data.repeat(2)
    padded_ds = padded_ds.repeat(2)
    data1 = data + padded_ds
    assert data1.num_classes() == 3

def sharding_config(num_shards, shard_id, num_samples, shuffle, repeat_cnt=1):
    data1 = ds.ManifestDataset(manifest_file, num_samples=num_samples, num_shards=num_shards,
                               shard_id=shard_id, shuffle=shuffle, decode=True)
    data1 = data1.repeat(repeat_cnt)

    res = []
    for item in data1.create_dict_iterator():  # each data is a dictionary
        res.append(item["label"].item())
    if print_res:
        logger.info("labels of dataset: {}".format(res))
    return res

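# Hypothetical driver for sharding_config; 5 manifest rows split across
# 2 shards should give each shard ceil(5 / 2) == 3 rows.
def test_sharding_basics():
    assert len(sharding_config(2, 0, None, False)) == 3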