Example No. 1
def test_raise_error():
    data1 = [{'image': np.zeros(1, np.uint8)}, {'image': np.zeros(2, np.uint8)},
             {'image': np.zeros(3, np.uint8)}, {'image': np.zeros(4, np.uint8)},
             {'image': np.zeros(5, np.uint8)}]
    data2 = [{'image': np.zeros(6, np.uint8)}, {'image': np.zeros(7, np.uint8)},
             {'image': np.zeros(8, np.uint8)}]

    ds1 = ds.PaddedDataset(data1)
    ds4 = ds1.batch(2)
    ds2 = ds.PaddedDataset(data2)
    ds3 = ds4 + ds2

    with pytest.raises(TypeError) as excinfo:
        testsampler = ds.DistributedSampler(num_shards=2, shard_id=0, shuffle=False, num_samples=None)
        ds3.use_sampler(testsampler)
    assert excinfo.type is TypeError

    with pytest.raises(TypeError) as excinfo:
        other_sampler = ds.SequentialSampler()
        ds3.use_sampler(other_sampler)
    assert excinfo.type is TypeError

    with pytest.raises(ValueError) as excinfo:
        testsampler = ds.DistributedSampler(num_shards=2, shard_id=0, shuffle=True, num_samples=None)
        ds3.use_sampler(testsampler)
    assert excinfo.type is ValueError

    with pytest.raises(ValueError) as excinfo:
        testsampler = ds.DistributedSampler(num_shards=2, shard_id=0, shuffle=False, num_samples=5)
        ds3.use_sampler(testsampler)
    assert excinfo.type is ValueError
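For contrast with the failure cases above, a minimal sketch of the one configuration use_sampler does accept on a concatenated dataset (cf. Example No. 14): a DistributedSampler with shuffle=False and num_samples=None, attached before any batch operation. This reuses the data1/data2 lists from this example:

# Sketch only: the accepted configuration implied by the error cases above.
ok_ds = ds.PaddedDataset(data1) + ds.PaddedDataset(data2)
ok_sampler = ds.DistributedSampler(num_shards=2, shard_id=0,
                                   shuffle=False, num_samples=None)
ok_ds.use_sampler(ok_sampler)
ok_ds = ok_ds.batch(2)  # batch after use_sampler, not before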
Example No. 2
def test_chained_sampler_04():
    logger.info("Test Case Chained Sampler - Distributed and Random, with batch then repeat")

    # Create chained sampler, distributed and random
    sampler = ds.DistributedSampler(num_shards=4, shard_id=3)
    child_sampler = ds.RandomSampler()
    sampler.add_child(child_sampler)
    # Create ImageFolderDataset with sampler
    data1 = ds.ImageFolderDataset(DATA_DIR, sampler=sampler)

    data1 = data1.batch(batch_size=5, drop_remainder=True)
    data1 = data1.repeat(count=3)

    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 6

    # Verify number of iterations
    num_iter = 0
    for item in data1.create_dict_iterator(num_epochs=1):  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    # Note: Each of the 4 shards has 44 / 4 = 11 samples
    # Note: Number of iterations is (11 // 5 = 2 batches, with drop_remainder=True) * 3 repeats = 6
    assert num_iter == 6
Example No. 3
def test_concat_17():
    """
    Test concat: test get_dataset_size on nested concats (with sampler)
    """
    logger.info("test_concat_17")

    data1 = ds.GeneratorDataset(generator, ["col1"])
    data2 = ds.GeneratorDataset(generator_10, ["col1"])

    data3 = ds.GeneratorDataset(generator_20, ["col1"])
    data4 = ds.GeneratorDataset(generator_29, ["col1"])

    data5 = data1 + data2
    data6 = data3 + data4
    data7 = data5 + data6

    ds.config.set_seed(1)
    shard_num = 10
    counter = 0

    for i in range(shard_num):
        distributed_sampler = ds.DistributedSampler(num_shards=shard_num,
                                                    shard_id=i,
                                                    shuffle=False,
                                                    num_samples=None)
        data7.use_sampler(distributed_sampler)
        iter_counter = 0
        for _ in data7.create_dict_iterator(num_epochs=1, output_numpy=True):
            counter += 1
            iter_counter += 1
        assert data7.get_dataset_size() == iter_counter

    # 29 is the total size of all 4 leaf datasets
    assert counter == 29
Example No. 4
def create_cell_nuclei_dataset(data_dir,
                               img_size,
                               repeat,
                               batch_size,
                               is_train=False,
                               augment=False,
                               eval_resize=False,
                               split=0.8,
                               rank=0,
                               group_size=1,
                               python_multiprocessing=True,
                               num_parallel_workers=8):
    """
    Get generator dataset for cell nuclei dataset.
    """
    cell_dataset = CellNucleiDataset(data_dir, repeat, is_train, split)
    sampler = ds.DistributedSampler(group_size, rank, shuffle=is_train)
    dataset = ds.GeneratorDataset(cell_dataset,
                                  cell_dataset.column_names,
                                  sampler=sampler)
    compose_map_func = (lambda image, mask: preprocess_img_mask(
        image, mask, tuple(img_size), augment and is_train, eval_resize))
    dataset = dataset.map(operations=compose_map_func,
                          input_columns=cell_dataset.column_names,
                          output_columns=cell_dataset.column_names,
                          column_order=cell_dataset.column_names,
                          python_multiprocessing=python_multiprocessing,
                          num_parallel_workers=num_parallel_workers)
    dataset = dataset.batch(batch_size, drop_remainder=is_train)
    dataset = dataset.repeat(1)
    return dataset
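A hypothetical call to the factory above, for illustration only; the path, image size, and shard values are placeholders, not taken from the source:

# Hypothetical usage sketch of create_cell_nuclei_dataset (placeholder values).
train_dataset = create_cell_nuclei_dataset("./cell_nuclei",    # placeholder path
                                           img_size=(96, 96),  # resize target
                                           repeat=1,
                                           batch_size=16,
                                           is_train=True,
                                           augment=True,
                                           rank=0,             # this process's shard
                                           group_size=1)       # total number of shards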
Example No. 5
def test_manifest_sampler_chain_batch_repeat():
    """
    Test ManifestDataset sampler chain DistributedSampler->SequentialSampler, with batch then repeat
    """
    logger.info("test_manifest_sampler_chain_batch_repeat")
    manifest_file = "../data/dataset/testManifestData/test5trainimgs.json"

    # Create sampler chain DistributedSampler->SequentialSampler
    sampler = ds.DistributedSampler(num_shards=1,
                                    shard_id=0,
                                    shuffle=False,
                                    num_samples=5)
    child_sampler = ds.SequentialSampler()
    sampler.add_child(child_sampler)

    # Create ManifestDataset with sampler chain
    data1 = ds.ManifestDataset(manifest_file, decode=True, sampler=sampler)
    one_hot_encode = c_transforms.OneHot(3)
    data1 = data1.map(operations=one_hot_encode, input_columns=["label"])
    data1 = data1.batch(batch_size=5, drop_remainder=False)
    data1 = data1.repeat(count=2)

    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 2
Example No. 6
def test_imagefolder_error():
    DATA_DIR = "../data/dataset/testPK/data"
    data = ds.ImageFolderDataset(DATA_DIR, num_samples=14)

    data1 = [{
        'image': np.zeros(1, np.uint8),
        'label': np.array(0, np.int32)
    }, {
        'image': np.zeros(2, np.uint8),
        'label': np.array(1, np.int32)
    }, {
        'image': np.zeros(3, np.uint8),
        'label': np.array(0, np.int32)
    }, {
        'image': np.zeros(4, np.uint8),
        'label': np.array(1, np.int32)
    }, {
        'image': np.zeros(5, np.uint8),
        'label': np.array(0, np.int32)
    }, {
        'image': np.zeros(6, np.uint8),
        'label': np.array(1, np.int32)
    }]

    data2 = ds.PaddedDataset(data1)
    data3 = data + data2
    with pytest.raises(ValueError) as excinfo:
        testsampler = ds.DistributedSampler(num_shards=5,
                                            shard_id=4,
                                            shuffle=False,
                                            num_samples=None)
        data3.use_sampler(testsampler)
    assert excinfo.type is ValueError
Example No. 7
def skip_test_chained_sampler_08():
    logger.info("Test Case Chained Sampler - SubsetRandom and Distributed, 4 shards")

    # Create chained sampler, subset random and distributed
    indices = [0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 11]
    sampler = ds.SubsetRandomSampler(indices, num_samples=12)
    child_sampler = ds.DistributedSampler(num_shards=4, shard_id=1)
    sampler.add_child(child_sampler)
    # Create ImageFolderDataset with sampler
    data1 = ds.ImageFolderDataset(DATA_DIR, sampler=sampler)

    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 3

    # Verify number of iterations
    num_iter = 0
    for item in data1.create_dict_iterator(num_epochs=1):  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    # Note: SubsetRandomSampler returns 12 samples
    # Note: Each of 4 shards has 3 samples
    assert num_iter == 3
Example No. 8
def test_manifest_sampler_chain_repeat():
    """
    Test ManifestDataset sampler chain DistributedSampler->SequentialSampler, with repeat
    """
    logger.info("test_manifest_sampler_chain_batch")
    manifest_file = "../data/dataset/testManifestData/test5trainimgs.json"

    # Create sampler chain DistributedSampler->SequentialSampler
    sampler = ds.DistributedSampler(num_shards=1,
                                    shard_id=0,
                                    shuffle=False,
                                    num_samples=5)
    child_sampler = ds.SequentialSampler()
    sampler.add_child(child_sampler)

    # Create ManifestDataset with sampler chain
    data1 = ds.ManifestDataset(manifest_file, sampler=sampler)
    data1 = data1.repeat(count=2)

    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 10

    # Verify number of rows
    assert sum([1 for _ in data1]) == 10

    # Verify dataset contents
    filename = "sampler_chain_manifest_repeat_result.npz"
    save_and_check_md5(data1, filename, generate_golden=GENERATE_GOLDEN)
Example No. 9
def test_manifest_sampler_chain():
    """
    Test Manifest sampler chain
    """
    logger.info("test_manifest_sampler_chain")

    sampler = ds.RandomSampler(replacement=True, num_samples=2)
    child_sampler = ds.DistributedSampler(num_shards=1,
                                          shard_id=0,
                                          shuffle=False,
                                          num_samples=3,
                                          offset=1)
    sampler.add_child(child_sampler)
    data1 = ds.ManifestDataset(MANIFEST_DATA_FILE, sampler=sampler)

    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 2
    # Verify number of rows
    assert sum([1 for _ in data1]) == 2

    # Verify dataset contents
    res = []
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        logger.info("item: {}".format(item))
        res.append(item)
    logger.info("dataset: {}".format(res))
Example No. 10
def test_cifar_sampler_chain():
    """
    Test Cifar sampler chain
    """
    logger.info("test_cifar_sampler_chain")

    sampler = ds.DistributedSampler(num_shards=2,
                                    shard_id=0,
                                    shuffle=False,
                                    num_samples=5)
    child_sampler = ds.RandomSampler(replacement=True, num_samples=4)
    child_sampler2 = ds.SequentialSampler(start_index=0, num_samples=2)
    child_sampler.add_child(child_sampler2)
    sampler.add_child(child_sampler)
    data1 = ds.Cifar10Dataset(CIFAR10_DATA_DIR, sampler=sampler)
    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 1

    # Verify number of rows
    assert sum([1 for _ in data1]) == 1

    # Verify dataset contents
    res = []
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        logger.info("item: {}".format(item))
        res.append(item)
    logger.info("dataset: {}".format(res))
Example No. 11
def test_GeneratorDataSet_Padded():
    result_list = []
    for i in range(10):
        tem_list = []
        tem_list.append(i)
        tem_list.append(10 + i)
        result_list.append(tem_list)

    verify_list = []
    data1 = ds.GeneratorDataset(generator_20, ["col1"])
    data2 = ds.GeneratorDataset(generator_10, ["col1"])
    data3 = data2 + data1
    shard_num = 10
    for i in range(shard_num):
        distributed_sampler = ds.DistributedSampler(num_shards=shard_num,
                                                    shard_id=i,
                                                    shuffle=False,
                                                    num_samples=None)
        data3.use_sampler(distributed_sampler)
        tem_list = []
        for ele in data3.create_dict_iterator():
            tem_list.append(ele['col1'][0])
        verify_list.append(tem_list)

    assert verify_list == result_list
Example No. 12
def test_imagefolder_padded_with_decode_and_get_dataset_size():
    num_shards = 5
    count = 0
    for shard_id in range(num_shards):
        DATA_DIR = "../data/dataset/testPK/data"
        data = ds.ImageFolderDatasetV2(DATA_DIR)

        white_io = BytesIO()
        Image.new('RGB', (224, 224), (255, 255, 255)).save(white_io, 'JPEG')
        padded_sample = {}
        padded_sample['image'] = np.array(bytearray(white_io.getvalue()),
                                          dtype='uint8')
        padded_sample['label'] = np.array(-1, np.int32)

        white_samples = [
            padded_sample, padded_sample, padded_sample, padded_sample
        ]
        data2 = ds.PaddedDataset(white_samples)
        data3 = data + data2

        testsampler = ds.DistributedSampler(num_shards=num_shards,
                                            shard_id=shard_id,
                                            shuffle=False,
                                            num_samples=None)
        data3.use_sampler(testsampler)
        shard_dataset_size = data3.get_dataset_size()
        data3 = data3.map(input_columns="image", operations=V_C.Decode())
        shard_sample_count = 0
        for ele in data3.create_dict_iterator():
            print("label: {}".format(ele['label']))
            count += 1
            shard_sample_count += 1
        assert shard_sample_count in (9, 10)
        assert shard_dataset_size == shard_sample_count
    assert count == 48
Example No. 13
def test_Mindrecord_Padded(remove_mindrecord_file):
    result_list = []
    verify_list = [[1, 2], [3, 4], [5, 11], [6, 12], [7, 13], [8, 14], [9],
                   [10]]
    num_readers = 4
    data_set = ds.MindDataset(CV_FILE_NAME + "0", ['file_name'],
                              num_readers,
                              shuffle=False)
    data1 = [{
        'file_name': np.array(b'image_00011.jpg', dtype='|S15')
    }, {
        'file_name': np.array(b'image_00012.jpg', dtype='|S15')
    }, {
        'file_name': np.array(b'image_00013.jpg', dtype='|S15')
    }, {
        'file_name': np.array(b'image_00014.jpg', dtype='|S15')
    }]
    ds1 = ds.PaddedDataset(data1)
    ds2 = data_set + ds1
    shard_num = 8
    for i in range(shard_num):
        testsampler = ds.DistributedSampler(num_shards=shard_num,
                                            shard_id=i,
                                            shuffle=False,
                                            num_samples=None)
        ds2.use_sampler(testsampler)
        tem_list = []
        for ele in ds2.create_dict_iterator():
            tem_list.append(
                int(ele['file_name'].tobytes().decode().lstrip(
                    'image_').rstrip('.jpg')))
        result_list.append(tem_list)
    assert result_list == verify_list
Example No. 14
def test_batch_afterPadded():
    data1 = [{
        'image': np.zeros(1, np.uint8)
    }, {
        'image': np.zeros(1, np.uint8)
    }, {
        'image': np.zeros(1, np.uint8)
    }, {
        'image': np.zeros(1, np.uint8)
    }, {
        'image': np.zeros(1, np.uint8)
    }]
    data2 = [{
        'image': np.zeros(1, np.uint8)
    }, {
        'image': np.zeros(1, np.uint8)
    }, {
        'image': np.zeros(1, np.uint8)
    }]

    ds1 = ds.PaddedDataset(data1)
    ds2 = ds.PaddedDataset(data2)
    ds3 = ds1 + ds2

    testsampler = ds.DistributedSampler(num_shards=2,
                                        shard_id=0,
                                        shuffle=False,
                                        num_samples=None)
    ds3.use_sampler(testsampler)

    ds4 = ds3.batch(2)
    assert sum([1 for _ in ds4]) == 2
Example No. 15
def test_three_datasets_connected():
    result_list = []
    for i in range(10):
        tem_list = []
        tem_list.append(i)
        tem_list.append(10 + i)
        tem_list.append(20 + i)
        result_list.append(tem_list)

    verify_list = []
    data1 = ds.GeneratorDataset(generator_10, ["col1"])
    data2 = ds.GeneratorDataset(generator_20, ["col1"])
    data3 = ds.GeneratorDataset(generator_30, ["col1"])
    data4 = data1 + data2 + data3
    shard_num = 10
    for i in range(shard_num):
        distributed_sampler = ds.DistributedSampler(num_shards=shard_num,
                                                    shard_id=i,
                                                    shuffle=False,
                                                    num_samples=None)
        data4.use_sampler(distributed_sampler)
        tem_list = []
        for ele in data4.create_dict_iterator(num_epochs=1, output_numpy=True):
            tem_list.append(ele['col1'][0])
        verify_list.append(tem_list)

    assert verify_list == result_list
Example No. 16
def test_voc_sampler_chain():
    """
    Test VOC sampler chain
    """
    logger.info("test_voc_sampler_chain")

    sampler = ds.DistributedSampler(num_shards=2,
                                    shard_id=0,
                                    shuffle=False,
                                    num_samples=5)
    child_sampler = ds.SequentialSampler(start_index=0)
    sampler.add_child(child_sampler)
    data1 = ds.VOCDataset(VOC_DATA_DIR, task="Segmentation", sampler=sampler)

    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 5

    # Verify number of rows
    assert sum([1 for _ in data1]) == 5

    # Verify dataset contents
    res = []
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        logger.info("item: {}".format(item))
        res.append(item)
    logger.info("dataset: {}".format(res))
Example No. 17
def test_coco_sampler_chain():
    """
    Test Coco sampler chain
    """
    logger.info("test_coco_sampler_chain")

    sampler = ds.DistributedSampler(num_shards=2,
                                    shard_id=0,
                                    shuffle=False,
                                    num_samples=5)
    child_sampler = ds.RandomSampler(replacement=True, num_samples=2)
    sampler.add_child(child_sampler)
    data1 = ds.CocoDataset(COCO_DATA_DIR,
                           annotation_file=ANNOTATION_FILE,
                           task="Detection",
                           decode=True,
                           sampler=sampler)

    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 1

    # Verify number of rows
    assert sum([1 for _ in data1]) == 1

    # Verify dataset contents
    res = []
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        logger.info("item: {}".format(item))
        res.append(item)
    logger.info("dataset: {}".format(res))
Example No. 18
def test_distributed_sampler_invalid_offset():
    with pytest.raises(ValueError) as info:
        sampler = ds.DistributedSampler(num_shards=4,
                                        shard_id=0,
                                        shuffle=False,
                                        num_samples=None,
                                        offset=5)
    assert "offset should be no more than num_shards" in str(info.value)
Example No. 19
def test_more_shard_padded():
    result_list = []
    for i in range(8):
        result_list.append(1)
    result_list.append(0)

    data1 = ds.GeneratorDataset(generator_5, ["col1"])
    data2 = ds.GeneratorDataset(generator_8, ["col1"])
    data3 = data1 + data2
    verify_list = []
    num_shards = 9
    for i in range(num_shards):
        tem_list = []
        testsampler = ds.DistributedSampler(num_shards=num_shards, shard_id=i, shuffle=False, num_samples=None)
        data3.use_sampler(testsampler)
        for item in data3.create_dict_iterator():
            tem_list.append(item['col1'])
        verify_list.append(tem_list)

    assert [len(ele) for ele in verify_list] == result_list

    verify_list1 = []
    result_list1 = []
    for i in range(8):
        result_list1.append([i+1])
    result_list1.append([])

    data1 = [{'image': np.zeros(1, np.uint8)}, {'image': np.zeros(2, np.uint8)},
             {'image': np.zeros(3, np.uint8)}, {'image': np.zeros(4, np.uint8)},
             {'image': np.zeros(5, np.uint8)}]
    data2 = [{'image': np.zeros(6, np.uint8)}, {'image': np.zeros(7, np.uint8)},
             {'image': np.zeros(8, np.uint8)}]

    ds1 = ds.PaddedDataset(data1)
    ds2 = ds.PaddedDataset(data2)
    ds3 = ds1 + ds2

    for i in range(num_shards):
        tem_list = []
        testsampler = ds.DistributedSampler(num_shards=num_shards, shard_id=i, shuffle=False, num_samples=None)
        ds3.use_sampler(testsampler)
        for item in ds3.create_dict_iterator():
            tem_list.append(len(item['image']))
        verify_list1.append(tem_list)

    assert verify_list1 == result_list1
Example No. 20
def test_distributed_sampler_invalid_offset():
    with pytest.raises(RuntimeError) as info:
        sampler = ds.DistributedSampler(num_shards=4,
                                        shard_id=0,
                                        shuffle=False,
                                        num_samples=None,
                                        offset=5).parse()
    assert "DistributedSampler: offset must be no more than num_shards(4)" in str(
        info.value)
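Examples No. 18 and No. 20 exercise the same check in two API versions: offset may not exceed num_shards, and the newer API raises at parse time rather than at construction. A minimal sketch of an offset that passes validation, assuming only the constraint stated in those error messages:

# Sketch only: offset == num_shards still satisfies "no more than num_shards".
ok_sampler = ds.DistributedSampler(num_shards=4,
                                   shard_id=0,
                                   shuffle=False,
                                   num_samples=None,
                                   offset=4)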
Example No. 21
def test_Unevenly_distributed():
    result_list = [[1, 4, 7], [2, 5, 8], [3, 6]]
    verify_list = []

    data1 = [{
        'image': np.zeros(1, np.uint8)
    }, {
        'image': np.zeros(2, np.uint8)
    }, {
        'image': np.zeros(3, np.uint8)
    }, {
        'image': np.zeros(4, np.uint8)
    }, {
        'image': np.zeros(5, np.uint8)
    }]
    data2 = [{
        'image': np.zeros(6, np.uint8)
    }, {
        'image': np.zeros(7, np.uint8)
    }, {
        'image': np.zeros(8, np.uint8)
    }]

    ds1 = ds.PaddedDataset(data1)
    ds2 = ds.PaddedDataset(data2)
    ds3 = ds1 + ds2
    numShard = 3
    for i in range(numShard):
        tem_list = []
        testsampler = ds.DistributedSampler(num_shards=numShard,
                                            shard_id=i,
                                            shuffle=False,
                                            num_samples=None)
        ds3.use_sampler(testsampler)
        for item in ds3.create_dict_iterator():
            tem_list.append(len(item['image']))
        verify_list.append(tem_list)
    assert verify_list == result_list
Example No. 22
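    # Nested helper excerpted from a larger test: manifest_file and map_ come from the enclosing scope.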
    def test_config(num_shards, shard_id):
        sampler = ds.DistributedSampler(num_shards, shard_id, False)
        child_sampler = ds.SequentialSampler()
        sampler.add_child(child_sampler)

        data1 = ds.ManifestDataset(manifest_file, num_samples=5, sampler=sampler)

        res = []
        for item in data1.create_dict_iterator():
            logger.info("item[image].shape[0]: {}, item[label].item(): {}"
                        .format(item["image"].shape[0], item["label"].item()))
            res.append(map_[(item["image"].shape[0], item["label"].item())])
        return res
Example No. 23
def test_celeba_padded():
    data = ds.CelebADataset("../data/dataset/testCelebAData/")

    padded_samples = [{'image': np.zeros(1, np.uint8), 'attr': np.zeros(1, np.uint32)}]
    padded_ds = ds.PaddedDataset(padded_samples)
    data = data + padded_ds
    dis_sampler = ds.DistributedSampler(num_shards=2, shard_id=1, shuffle=False, num_samples=None)
    data.use_sampler(dis_sampler)
    data = data.repeat(2)

    count = 0
    for _ in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        count = count + 1
    assert count == 4
Example No. 24
def test_clue_padded_and_skip_with_0_samples():
    """
    Test CLUE dataset concatenated with a PaddedDataset, sharded, then skipped down to 0 samples
    """
    TRAIN_FILE = '../data/dataset/testCLUE/afqmc/train.json'

    data = ds.CLUEDataset(TRAIN_FILE, task='AFQMC', usage='train')
    count = 0
    for _ in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        count += 1
    assert count == 3

    data_copy1 = copy.deepcopy(data)

    sample = {
        "label": np.array(1, np.string_),
        "sentence1": np.array(1, np.string_),
        "sentence2": np.array(1, np.string_)
    }
    samples = [sample]
    padded_ds = ds.PaddedDataset(samples)
    dataset = data + padded_ds
    testsampler = ds.DistributedSampler(num_shards=2,
                                        shard_id=1,
                                        shuffle=False,
                                        num_samples=None)
    dataset.use_sampler(testsampler)
    assert dataset.get_dataset_size() == 2
    count = 0
    for _ in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        count += 1
    assert count == 2

    dataset = dataset.skip(count=2)  # after the skip, the dataset has no samples left
    count = 0
    for _ in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        count += 1
    assert count == 0

    with pytest.raises(ValueError, match="There is no samples in the "):
        dataset = dataset.concat(data_copy1)
        count = 0
        for _ in dataset.create_dict_iterator(num_epochs=1,
                                              output_numpy=True):
            count += 1
        assert count == 2
Example No. 25
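# Helper excerpted from a larger test: voc_dir and print_res come from the enclosing scope.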
def sharding_config(num_shards,
                    shard_id,
                    num_samples,
                    shuffle,
                    repeat_cnt=1):
    sampler = ds.DistributedSampler(num_shards,
                                    shard_id,
                                    shuffle=shuffle,
                                    num_samples=num_samples)
    data1 = ds.VOCDataset(voc_dir, decode=True, sampler=sampler)
    data1 = data1.repeat(repeat_cnt)
    res = []
    for item in data1.create_dict_iterator():  # each data is a dictionary
        res.append(item["image"].shape[0])
    if print_res:
        logger.info("labels of dataset: {}".format(res))
    return res
Example No. 26
def test_sampler_py_api():
    sampler = ds.SequentialSampler().create()
    sampler.set_num_rows(128)
    sampler.set_num_samples(64)
    sampler.initialize()
    sampler.get_indices()

    sampler = ds.RandomSampler().create()
    sampler.set_num_rows(128)
    sampler.set_num_samples(64)
    sampler.initialize()
    sampler.get_indices()

    sampler = ds.DistributedSampler(8, 4).create()
    sampler.set_num_rows(128)
    sampler.set_num_samples(64)
    sampler.initialize()
    sampler.get_indices()
Example No. 27
def test_distributed_sampler():
    logger.info("Test Case DistributedSampler")
    # define parameters
    repeat_count = 1

    # apply dataset operations
    sampler = ds.DistributedSampler(10, 1)
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, sampler=sampler)
    data1 = data1.repeat(repeat_count)

    num_iter = 0
    for item in data1.create_dict_iterator():  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 5
Example No. 28
def test_TFRecord_Padded():
    DATA_DIR = [
        "../data/dataset/test_tf_file_3_images/train-0000-of-0001.data"
    ]
    SCHEMA_DIR = "../data/dataset/test_tf_file_3_images/datasetSchema.json"
    result_list = [[159109, 2], [192607, 3], [179251, 4], [1, 5]]
    verify_list = []
    shard_num = 4
    for i in range(shard_num):
        data = ds.TFRecordDataset(DATA_DIR,
                                  SCHEMA_DIR,
                                  columns_list=["image"],
                                  shuffle=False,
                                  shard_equal_rows=True)

        padded_samples = [{
            'image': np.zeros(1, np.uint8)
        }, {
            'image': np.zeros(2, np.uint8)
        }, {
            'image': np.zeros(3, np.uint8)
        }, {
            'image': np.zeros(4, np.uint8)
        }, {
            'image': np.zeros(5, np.uint8)
        }]

        padded_ds = ds.PaddedDataset(padded_samples)
        concat_ds = data + padded_ds
        testsampler = ds.DistributedSampler(num_shards=shard_num,
                                            shard_id=i,
                                            shuffle=False,
                                            num_samples=None)
        concat_ds.use_sampler(testsampler)
        shard_list = []
        for item in concat_ds.create_dict_iterator(num_epochs=1,
                                                   output_numpy=True):
            shard_list.append(len(item['image']))
        verify_list.append(shard_list)
    assert verify_list == result_list
Example No. 29
def test_imagefolder_padded():
    DATA_DIR = "../data/dataset/testPK/data"
    data = ds.ImageFolderDatasetV2(DATA_DIR)

    data1 = [{'image': np.zeros(1, np.uint8), 'label': np.array(0, np.int32)},
             {'image': np.zeros(2, np.uint8), 'label': np.array(1, np.int32)},
             {'image': np.zeros(3, np.uint8), 'label': np.array(0, np.int32)},
             {'image': np.zeros(4, np.uint8), 'label': np.array(1, np.int32)},
             {'image': np.zeros(5, np.uint8), 'label': np.array(0, np.int32)},
             {'image': np.zeros(6, np.uint8), 'label': np.array(1, np.int32)}]

    data2 = ds.PaddedDataset(data1)
    data3 = data + data2
    testsampler = ds.DistributedSampler(num_shards=5, shard_id=4, shuffle=False, num_samples=None)
    data3.use_sampler(testsampler)
    assert sum([1 for _ in data3]) == 10
    verify_list = []

    for ele in data3.create_dict_iterator():
        verify_list.append(len(ele['image']))
    assert verify_list[8] == 1
    assert verify_list[9] == 6
Example No. 30
def test_Repeat_afterPadded():
    result_list = [1, 3, 5, 7]
    verify_list = []

    data1 = [{'image': np.zeros(1, np.uint8)}, {'image': np.zeros(2, np.uint8)},
             {'image': np.zeros(3, np.uint8)}, {'image': np.zeros(4, np.uint8)},
             {'image': np.zeros(5, np.uint8)}]
    data2 = [{'image': np.zeros(6, np.uint8)}, {'image': np.zeros(7, np.uint8)},
             {'image': np.zeros(8, np.uint8)}]

    ds1 = ds.PaddedDataset(data1)
    ds2 = ds.PaddedDataset(data2)
    ds3 = ds1 + ds2

    testsampler = ds.DistributedSampler(num_shards=2, shard_id=0, shuffle=False, num_samples=None)
    ds3.use_sampler(testsampler)
    repeat_num = 2
    ds3 = ds3.repeat(repeat_num)
    for item in ds3.create_dict_iterator():
        verify_list.append(len(item['image']))

    assert verify_list == result_list * repeat_num