Example #1
def test_concat_14():
    """
    Test concat: create datasets from different dataset folders, apply the same map operations to each, then concat them
    """
    logger.info("test_concat_14")
    DATA_DIR = "../data/dataset/testPK/data"
    DATA_DIR2 = "../data/dataset/testImageNetData/train/"

    data1 = ds.ImageFolderDatasetV2(DATA_DIR, num_samples=3)
    data2 = ds.ImageFolderDatasetV2(DATA_DIR2, num_samples=2)

    transforms1 = F.ComposeOp([F.Decode(), F.Resize((224, 224)), F.ToTensor()])

    data1 = data1.map(input_columns=["image"], operations=transforms1())
    data2 = data2.map(input_columns=["image"], operations=transforms1())
    data3 = data1 + data2

    expected, output = [], []
    for d in data1:
        expected.append(d[0])
    for d in data2:
        expected.append(d[0])
    for d in data3:
        output.append(d[0])

    assert len(expected) == len(output)
    assert np.array_equal(np.array(output), np.array(expected))

    assert sum([1 for _ in data3]) == 5
    assert data3.get_dataset_size() == 5
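The snippets in this collection omit their shared setup. Below is a minimal sketch of the imports they appear to assume, using the pre-1.0 MindSpore dataset API; the exact aliases are assumptions, not part of the originals (individual snippets also use aliases such as vision, py_vision, de and c).

# Assumed common setup for the snippets in this collection (pre-1.0 MindSpore aliases).
import numpy as np
import mindspore.dataset as ds
import mindspore.dataset.transforms.vision.c_transforms as c_vision
import mindspore.dataset.transforms.vision.py_transforms as F
from mindspore import log as logger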
Example #2
def test_imagefolder():
    data = ds.ImageFolderDatasetV2("../data/dataset/testPK/data/")
    assert data.get_dataset_size() == 44
    assert data.num_classes() == 4
    data = data.shuffle(100)
    assert data.num_classes() == 4

    data = ds.ImageFolderDatasetV2("../data/dataset/testPK/data/", num_samples=10)
    assert data.get_dataset_size() == 10
    assert data.num_classes() == 4
Example #3
def test_cache_map_basic3():
    """
    Test a repeat under mappable cache

        Cache
          |
      Map(decode)
          |
        Repeat
          |
      ImageFolder
    """

    logger.info("Test cache basic 3")

    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # This DATA_DIR only has 2 images in it
    ds1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR)
    decode_op = c_vision.Decode()
    ds1 = ds1.repeat(4)
    ds1 = ds1.map(input_columns=["image"],
                  operations=decode_op,
                  cache=some_cache)
    logger.info("ds1.dataset_size is ", ds1.get_dataset_size())

    num_iter = 0
    for _ in ds1.create_dict_iterator():
        logger.info("get data from dataset")
        num_iter += 1

    logger.info("Number of data in ds1: {} ".format(num_iter))
    assert num_iter == 8
    logger.info('test_cache_map_basic3 Ended.\n')
Example #4
def test_cache_map_failure1():
    """
    Test nested cache (failure)

        Repeat
          |
        Cache
          |
      Map(decode)
          |
        Cache
          |
      ImageFolder

    """
    logger.info("Test cache failure 1")

    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # This DATA_DIR only has 2 images in it
    ds1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR, cache=some_cache)
    decode_op = c_vision.Decode()
    ds1 = ds1.map(input_columns=["image"], operations=decode_op, cache=some_cache)
    ds1 = ds1.repeat(4)

    try:
        num_iter = 0
        for _ in ds1.create_dict_iterator():
            num_iter += 1
    except RuntimeError as e:
        logger.info("Got an exception in DE: {}".format(str(e)))
        assert "Nested cache operations is not supported!" in str(e)

    assert num_iter == 0
    logger.info('test_cache_map_failure1 Ended.\n')
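The same failure can be asserted more directly with pytest.raises; a sketch over the ds1 pipeline built above, assuming the error text stays as quoted:

    # Equivalent check with pytest.raises (sketch); pytest is assumed to be available.
    import pytest
    with pytest.raises(RuntimeError) as error_info:
        for _ in ds1.create_dict_iterator():
            pass
    assert "Nested cache operations is not supported!" in str(error_info.value)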
def test_imagefolder_negative_classindex():
    logger.info("Test Case negative classIndex")
    # define parameters
    repeat_count = 1

    # apply dataset operations
    class_index = {"class3": -333, "class1": 111}
    data1 = ds.ImageFolderDatasetV2(DATA_DIR,
                                    class_indexing=class_index,
                                    shuffle=False)
    data1 = data1.repeat(repeat_count)

    golden = [
        111, 111, 111, 111, 111, 111, 111, 111, 111, 111, 111, -333, -333,
        -333, -333, -333, -333, -333, -333, -333, -333, -333
    ]

    num_iter = 0
    for item in data1.create_dict_iterator():  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        assert item["label"] == golden[num_iter]
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 22
Example #6
def test_cache_map_basic2():
    """
    Test mappable leaf with the cache op later in the tree above the map(decode)

       Repeat
         |
       Cache
         |
     Map(decode)
         |
     ImageFolder
    """

    logger.info("Test cache map basic 2")

    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # This DATA_DIR only has 2 images in it
    ds1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR)
    decode_op = c_vision.Decode()
    ds1 = ds1.map(input_columns=["image"],
                  operations=decode_op,
                  cache=some_cache)
    ds1 = ds1.repeat(4)

    filename = "cache_map_02_result.npz"
    save_and_check_md5(ds1, filename, generate_golden=GENERATE_GOLDEN)

    logger.info("test_cache_map_basic2 Ended.\n")
def test_imagefolder_rename():
    logger.info("Test Case rename")
    # define parameters
    repeat_count = 1

    # apply dataset operations
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, num_samples=10)
    data1 = data1.repeat(repeat_count)

    num_iter = 0
    for item in data1.create_dict_iterator():  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 10

    data1 = data1.rename(input_columns=["image"], output_columns="image2")

    num_iter = 0
    for item in data1.create_dict_iterator():  # each data is a dictionary
        # after the rename, each dictionary has keys "image2" and "label"
        logger.info("image is {}".format(item["image2"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 10
def test_sequential_sampler():
    logger.info("Test Case SequentialSampler")

    golden = [
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
    ]

    # define parameters
    repeat_count = 1

    # apply dataset operations
    sampler = ds.SequentialSampler()
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, sampler=sampler)
    data1 = data1.repeat(repeat_count)

    result = []
    num_iter = 0
    for item in data1.create_dict_iterator():  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        result.append(item["label"])
        num_iter += 1

    logger.info("Result: {}".format(result))
    assert result == golden
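A compact reading of the golden list: SequentialSampler walks the testPK folder in order, and that folder holds 4 classes of 11 images each (44 in total, matching Example #2).

# Sanity arithmetic for the golden list above (values taken from the snippet itself).
golden = [0] * 11 + [1] * 11 + [2] * 11 + [3] * 11
assert len(golden) == 44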
def test_one_hot_op():
    """
    Test one hot encoding op
    """
    logger.info("Test one hot encoding op")

    # define map operations
    # ds = de.ImageFolderDataset(DATA_DIR, schema=SCHEMA_DIR)
    dataset = ds.ImageFolderDatasetV2(DATA_DIR)
    num_classes = 2
    epsilon_para = 0.1

    transforms = [
        f.OneHotOp(num_classes=num_classes, smoothing_rate=epsilon_para),
    ]
    transform_label = py_vision.ComposeOp(transforms)
    dataset = dataset.map(input_columns=["label"],
                          operations=transform_label())

    golden_label = np.ones(num_classes) * epsilon_para / num_classes
    golden_label[1] = 1 - epsilon_para / num_classes

    for data in dataset.create_dict_iterator():
        label = data["label"]
        logger.info("label is {}".format(label))
        logger.info("golden_label is {}".format(golden_label))
        assert (label.all() == golden_label.all())
        logger.info("====test one hot op ok====")
Example #10
def test_imagefolder_padded_with_decode_and_get_dataset_size():
    num_shards = 5
    count = 0
    for shard_id in range(num_shards):
        DATA_DIR = "../data/dataset/testPK/data"
        data = ds.ImageFolderDatasetV2(DATA_DIR)

        white_io = BytesIO()
        Image.new('RGB', (224, 224), (255, 255, 255)).save(white_io, 'JPEG')
        padded_sample = {}
        padded_sample['image'] = np.array(bytearray(white_io.getvalue()),
                                          dtype='uint8')
        padded_sample['label'] = np.array(-1, np.int32)

        white_samples = [
            padded_sample, padded_sample, padded_sample, padded_sample
        ]
        data2 = ds.PaddedDataset(white_samples)
        data3 = data + data2

        testsampler = ds.DistributedSampler(num_shards=num_shards,
                                            shard_id=shard_id,
                                            shuffle=False,
                                            num_samples=None)
        data3.use_sampler(testsampler)
        shard_dataset_size = data3.get_dataset_size()
        data3 = data3.map(input_columns="image", operations=V_C.Decode())
        shard_sample_count = 0
        for ele in data3.create_dict_iterator():
            print("label: {}".format(ele['label']))
            count += 1
            shard_sample_count += 1
        assert shard_sample_count in (9, 10)
        assert shard_dataset_size == shard_sample_count
    assert count == 48
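The asserts above follow from simple arithmetic: 44 images in testPK/data plus 4 padded white samples give 48 rows, and splitting 48 rows into 5 shards whose sizes are all 9 or 10 forces three shards of 10 and two of 9.

# Worked arithmetic behind count == 48 and shard_sample_count in (9, 10).
total = 44 + 4
assert total == 48
assert 3 * 10 + 2 * 9 == total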
Example #11
def test_cutmix_batch_success3(plot=False):
    """
    Test CutMixBatch op with default values for alpha and prob on a batch of HWC images on ImageFolderDatasetV2
    """
    logger.info("test_cutmix_batch_success3")

    ds_original = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR2, shuffle=False)
    decode_op = vision.Decode()
    ds_original = ds_original.map(input_columns=["image"],
                                  operations=[decode_op])
    ds_original = ds_original.batch(4, pad_info={}, drop_remainder=True)

    images_original = None
    for idx, (image, _) in enumerate(ds_original):
        if idx == 0:
            images_original = image
        else:
            images_original = np.append(images_original, image, axis=0)

    # CutMix Images
    data1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR2, shuffle=False)

    decode_op = vision.Decode()
    data1 = data1.map(input_columns=["image"], operations=[decode_op])

    one_hot_op = data_trans.OneHot(num_classes=10)
    data1 = data1.map(input_columns=["label"], operations=one_hot_op)

    cutmix_batch_op = vision.CutMixBatch(mode.ImageBatchFormat.NHWC)
    data1 = data1.batch(4, pad_info={}, drop_remainder=True)
    data1 = data1.map(input_columns=["image", "label"],
                      operations=cutmix_batch_op)

    images_cutmix = None
    for idx, (image, _) in enumerate(data1):
        if idx == 0:
            images_cutmix = image
        else:
            images_cutmix = np.append(images_cutmix, image, axis=0)
    if plot:
        visualize_list(images_original, images_cutmix)

    num_samples = images_original.shape[0]
    mse = np.zeros(num_samples)
    for i in range(num_samples):
        mse[i] = diff_mse(images_cutmix[i], images_original[i])
    logger.info("MSE= {}".format(str(np.mean(mse))))
def test_mix_up_multi():
    """
    Test multi batch mix up op
    """
    logger.info("Test several batch mix up op")

    resize_height = 224
    resize_width = 224

    # Create dataset and define map operations
    ds1 = ds.ImageFolderDatasetV2(DATA_DIR_2)

    num_classes = 3
    decode_op = c_vision.Decode()
    resize_op = c_vision.Resize((resize_height, resize_width),
                                c_vision.Inter.LINEAR)
    one_hot_encode = c.OneHot(num_classes)  # num_classes is input argument

    ds1 = ds1.map(input_columns=["image"], operations=decode_op)
    ds1 = ds1.map(input_columns=["image"], operations=resize_op)
    ds1 = ds1.map(input_columns=["label"], operations=one_hot_encode)

    # apply batch operations
    batch_size = 3
    ds1 = ds1.batch(batch_size, drop_remainder=True)

    ds2 = ds1
    alpha = 0.2
    transforms = [
        py_vision.MixUp(batch_size=batch_size, alpha=alpha, is_single=False)
    ]
    ds1 = ds1.map(input_columns=["image", "label"], operations=transforms)
    num_iter = 0
    batch1_image1 = 0
    for data1, data2 in zip(ds1.create_dict_iterator(),
                            ds2.create_dict_iterator()):
        image1 = data1["image"]
        label1 = data1["label"]
        logger.info("label: {}".format(label1))

        image2 = data2["image"]
        label2 = data2["label"]
        logger.info("label2: {}".format(label2))

        if num_iter == 0:
            batch1_image1 = image1

        if num_iter == 1:
            lam = np.abs(label2 - label1)
            logger.info("lam value in multi: {}".format(lam))
            for index in range(batch_size):
                if np.square(lam[index]).mean() != 0:
                    lam_value = 1 - np.sum(lam[index]) / 2
                    img_golden = lam_value * image2[index] + (
                        1 - lam_value) * batch1_image1[index]
                    assert image1[index].all() == img_golden.all()
                    logger.info("====test several batch mixup ok====")
            break
        num_iter = num_iter + 1
Example #13
def test_mixup_batch_success2(plot=False):
    """
    Test MixUpBatch op with specified alpha parameter on ImageFolderDatasetV2
    """
    logger.info("test_mixup_batch_success2")

    # Original Images
    ds_original = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR2, shuffle=False)
    decode_op = vision.Decode()
    ds_original = ds_original.map(input_columns=["image"], operations=[decode_op])
    ds_original = ds_original.batch(4, pad_info={}, drop_remainder=True)

    images_original = None
    for idx, (image, _) in enumerate(ds_original):
        if idx == 0:
            images_original = image
        else:
            images_original = np.append(images_original, image, axis=0)

    # MixUp Images
    data1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR2, shuffle=False)

    decode_op = vision.Decode()
    data1 = data1.map(input_columns=["image"], operations=[decode_op])

    one_hot_op = data_trans.OneHot(num_classes=10)
    data1 = data1.map(input_columns=["label"], operations=one_hot_op)

    mixup_batch_op = vision.MixUpBatch(2.0)
    data1 = data1.batch(4, pad_info={}, drop_remainder=True)
    data1 = data1.map(input_columns=["image", "label"], operations=mixup_batch_op)

    images_mixup = None
    for idx, (image, _) in enumerate(data1):
        if idx == 0:
            images_mixup = image
        else:
            images_mixup = np.append(images_mixup, image, axis=0)
    if plot:
        visualize_list(images_original, images_mixup)

    num_samples = images_original.shape[0]
    mse = np.zeros(num_samples)
    for i in range(num_samples):
        mse[i] = diff_mse(images_mixup[i], images_original[i])
    logger.info("MSE= {}".format(str(np.mean(mse))))
def test_imagenet_rawdata_dataset_size():
    ds_total = ds.ImageFolderDatasetV2(IMAGENET_RAWDATA_DIR)
    assert ds_total.get_dataset_size() == 6

    ds_shard_1_0 = ds.ImageFolderDatasetV2(IMAGENET_RAWDATA_DIR,
                                           num_shards=1,
                                           shard_id=0)
    assert ds_shard_1_0.get_dataset_size() == 6

    ds_shard_2_0 = ds.ImageFolderDatasetV2(IMAGENET_RAWDATA_DIR,
                                           num_shards=2,
                                           shard_id=0)
    assert ds_shard_2_0.get_dataset_size() == 3

    ds_shard_3_0 = ds.ImageFolderDatasetV2(IMAGENET_RAWDATA_DIR,
                                           num_shards=3,
                                           shard_id=0)
    assert ds_shard_3_0.get_dataset_size() == 2
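get_dataset_size here reports the per-shard size, i.e. ceil(6 / num_shards) for this 6-image folder; a quick check of the three asserted values:

import math

# ceil(6/1) = 6, ceil(6/2) = 3, ceil(6/3) = 2 -- the values asserted above.
assert [math.ceil(6 / n) for n in (1, 2, 3)] == [6, 3, 2]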
Example #15
def test_imagefolder_numsamples():
    logger.info("Test Case numSamples")
    # define parameters
    repeat_count = 1

    # apply dataset operations
    data1 = ds.ImageFolderDatasetV2(DATA_DIR,
                                    num_samples=10,
                                    num_parallel_workers=2)
    data1 = data1.repeat(repeat_count)

    num_iter = 0
    for item in data1.create_dict_iterator():  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 10

    # As the asserts below show, the sampler's num_samples=3 determines the output count,
    # not the dataset-level num_samples=10.
    random_sampler = ds.RandomSampler(num_samples=3, replacement=True)
    data1 = ds.ImageFolderDatasetV2(DATA_DIR,
                                    num_samples=10,
                                    num_parallel_workers=2,
                                    sampler=random_sampler)

    num_iter = 0
    for item in data1.create_dict_iterator():
        num_iter += 1

    assert num_iter == 3

    random_sampler = ds.RandomSampler(num_samples=3, replacement=False)
    data1 = ds.ImageFolderDatasetV2(DATA_DIR,
                                    num_samples=10,
                                    num_parallel_workers=2,
                                    sampler=random_sampler)

    num_iter = 0
    for item in data1.create_dict_iterator():
        num_iter += 1

    assert num_iter == 3
Example #16
def test_apply_imagefolder_case():
    # apply dataset map operations
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, num_shards=4, shard_id=3)
    data2 = ds.ImageFolderDatasetV2(DATA_DIR, num_shards=4, shard_id=3)

    decode_op = vision.Decode()
    normalize_op = vision.Normalize([121.0, 115.0, 100.0], [70.0, 68.0, 71.0])

    def dataset_fn(ds):
        ds = ds.map(operations=decode_op)
        ds = ds.map(operations=normalize_op)
        ds = ds.repeat(2)
        return ds

    data1 = data1.apply(dataset_fn)
    data2 = data2.map(operations=decode_op)
    data2 = data2.map(operations=normalize_op)
    data2 = data2.repeat(2)

    for item1, item2 in zip(data1.create_dict_iterator(), data2.create_dict_iterator()):
        assert np.array_equal(item1["image"], item2["image"])
def test_imagefolder_zip():
    logger.info("Test Case zip")
    # define parameters
    repeat_count = 2

    # apply dataset operations
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, num_samples=10)
    data2 = ds.ImageFolderDatasetV2(DATA_DIR, num_samples=10)

    data1 = data1.repeat(repeat_count)
    # rename dataset2 for no conflict
    data2 = data2.rename(input_columns=["image", "label"], output_columns=["image1", "label1"])
    data3 = ds.zip((data1, data2))

    num_iter = 0
    for item in data3.create_dict_iterator():  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert (num_iter == 10)
def create_imagenet_dataset(imagenet_dir):
    ds = de.ImageFolderDatasetV2(imagenet_dir)

    transform = F.ComposeOp([
        F.Decode(),
        F.RandomHorizontalFlip(0.5),
        F.ToTensor(),
        F.Normalize((0.491, 0.482, 0.447), (0.247, 0.243, 0.262)),
        F.RandomErasing()
    ])
    ds = ds.map(input_columns="image", operations=transform())
    ds = ds.shuffle(buffer_size=5)
    ds = ds.repeat(3)
    return ds
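The factory above is definition-only; a hypothetical call follows (the directory path is a placeholder, not from the original):

# Hypothetical usage of create_imagenet_dataset; the path below is a placeholder.
train_ds = create_imagenet_dataset("/path/to/imagenet/train")
for item in train_ds.create_dict_iterator():
    print(item["image"].shape)   # CHW float image after ToTensor/Normalize
    break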
Example #19
def test_var_batch_var_resize():
    # pseudo-resize (crop) each image according to its batch number; e.g. the 5th batch becomes (5^2, 5^2) = (25, 25)
    def np_pseudo_resize(col, batchInfo):
        s = (batchInfo.get_batch_num() + 1) ** 2
        return ([np.copy(c[0:s, 0:s, :]) for c in col],)

    def add_one(batchInfo):
        return batchInfo.get_batch_num() + 1

    data1 = ds.ImageFolderDatasetV2("../data/dataset/testPK/data/", num_parallel_workers=4, decode=True)
    data1 = data1.batch(batch_size=add_one, drop_remainder=True, input_columns=["image"], per_batch_map=np_pseudo_resize)
    # i-th batch has shape [i, i^2, i^2, 3]
    i = 1
    for item in data1.create_dict_iterator():
        assert item["image"].shape == (i, i ** 2, i ** 2, 3), "\ntest_var_batch_var_resize FAILED\n"
        i += 1
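Why the loop above terminates: assuming the folder holds the 44 images reported in Example #2, full batches of size 1, 2, ..., 8 consume 36 images, the next batch of 9 cannot be filled, and drop_remainder=True discards it, so the loop sees 8 batches.

# Worked arithmetic for the variable batch sizes (44 images assumed).
assert sum(range(1, 9)) == 36   # batches of size 1..8 fit
assert 36 + 9 > 44              # a 9th full batch does not, so it is dropped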
Example #20
def test_concat_15():
    """
    Test concat: create datasets from different dataset file formats, then concat them
    """
    logger.info("test_concat_15")
    DATA_DIR = "../data/dataset/testPK/data"
    DATA_DIR2 = [
        "../data/dataset/test_tf_file_3_images/train-0000-of-0001.data"
    ]

    data1 = ds.ImageFolderDatasetV2(DATA_DIR)
    data2 = ds.TFRecordDataset(DATA_DIR2, columns_list=["image"])

    data1 = data1.project(["image"])
    data3 = data1 + data2

    assert sum([1 for _ in data3]) == 47
def test_imagefolder_shardid():
    logger.info("Test Case withShardID")
    # define parameters
    repeat_count = 1

    # apply dataset operations
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, num_shards=4, shard_id=1)
    data1 = data1.repeat(repeat_count)

    num_iter = 0
    for item in data1.create_dict_iterator():  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 11
def test_imagefolder_noshuffle():
    logger.info("Test Case noShuffle")
    # define parameters
    repeat_count = 1

    # apply dataset operations
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, shuffle=False)
    data1 = data1.repeat(repeat_count)

    num_iter = 0
    for item in data1.create_dict_iterator():  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert (num_iter == 44)
def test_pk_sampler():
    logger.info("Test Case PKSampler")
    # define parameters
    repeat_count = 1

    # apply dataset operations
    sampler = ds.PKSampler(3)
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, sampler=sampler)
    data1 = data1.repeat(repeat_count)

    num_iter = 0
    for item in data1.create_dict_iterator():  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 12
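PKSampler(3) draws 3 samples per class, and the testPK folder has 4 classes (Example #2), which is where the 12 in the assert comes from.

# Expected row count for PKSampler(3) over the 4-class testPK folder.
samples_per_class, num_classes = 3, 4
assert samples_per_class * num_classes == 12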
def test_imagefolder_decode():
    logger.info("Test Case decode")
    # define parameters
    repeat_count = 1

    # apply dataset operations
    ext = [".jpg", ".JPEG"]
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, extensions=ext, decode=True)
    data1 = data1.repeat(repeat_count)

    num_iter = 0
    for item in data1.create_dict_iterator():  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 44
def test_weighted_random_sampler():
    logger.info("Test Case WeightedRandomSampler")
    # define parameters
    repeat_count = 1

    # apply dataset operations
    weights = [1.0, 0.1, 0.02, 0.3, 0.4, 0.05, 1.2, 0.13, 0.14, 0.015, 0.16, 1.1]
    sampler = ds.WeightedRandomSampler(weights, 11)
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, sampler=sampler)
    data1 = data1.repeat(repeat_count)

    num_iter = 0
    for item in data1.create_dict_iterator():  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert (num_iter == 11)
def test_subset_random_sampler():
    logger.info("Test Case SubsetRandomSampler")
    # define parameters
    repeat_count = 1

    # apply dataset operations
    indices = [0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 11]
    sampler = ds.SubsetRandomSampler(indices)
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, sampler=sampler)
    data1 = data1.repeat(repeat_count)

    num_iter = 0
    for item in data1.create_dict_iterator():  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 12
Example #27
# Helper from a sharding test; image_folder_dir and print_res come from the enclosing scope.
def sharding_config(num_shards,
                    shard_id,
                    num_samples,
                    shuffle,
                    class_index,
                    repeat_cnt=1):
    data1 = ds.ImageFolderDatasetV2(image_folder_dir,
                                    num_samples=num_samples,
                                    num_shards=num_shards,
                                    shard_id=shard_id,
                                    shuffle=shuffle,
                                    class_indexing=class_index,
                                    decode=True)
    data1 = data1.repeat(repeat_cnt)
    res = []
    for item in data1.create_dict_iterator():  # each data is a dictionary
        res.append(item["label"].item())
    if print_res:
        logger.info("labels of dataset: {}".format(res))
    return res
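The helper above is only a definition; a hypothetical call follows, assuming image_folder_dir points at the 44-image testPK folder used elsewhere and print_res is enabled by the enclosing test:

# Hypothetical usage of sharding_config; both module-level names are assumptions here.
image_folder_dir = "../data/dataset/testPK/data"
print_res = True

# Read shard 0 of 4 without shuffling; labels come back as plain ints.
labels = sharding_config(num_shards=4, shard_id=0, num_samples=None,
                         shuffle=False, class_index=None)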
Example #28
def test_imagefolder_padded():
    DATA_DIR = "../data/dataset/testPK/data"
    data = ds.ImageFolderDatasetV2(DATA_DIR)

    data1 = [{'image': np.zeros(1, np.uint8), 'label': np.array(0, np.int32)},
             {'image': np.zeros(2, np.uint8), 'label': np.array(1, np.int32)},
             {'image': np.zeros(3, np.uint8), 'label': np.array(0, np.int32)},
             {'image': np.zeros(4, np.uint8), 'label': np.array(1, np.int32)},
             {'image': np.zeros(5, np.uint8), 'label': np.array(0, np.int32)},
             {'image': np.zeros(6, np.uint8), 'label': np.array(1, np.int32)}]

    data2 = ds.PaddedDataset(data1)
    data3 = data + data2
    testsampler = ds.DistributedSampler(num_shards=5, shard_id=4, shuffle=False, num_samples=None)
    data3.use_sampler(testsampler)
    assert sum([1 for _ in data3]) == 10
    verify_list = []

    for ele in data3.create_dict_iterator():
        verify_list.append(len(ele['image']))
    assert verify_list[8] == 1
    assert verify_list[9] == 6
Example #29
def test_cache_map_basic4():
    """
    Test that differing row counts do not result in a core dump
    """
    logger.info("Test cache basic 4")
    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # This DATA_DIR only has 2 images in it
    ds1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR, cache=some_cache)
    decode_op = c_vision.Decode()
    ds1 = ds1.repeat(4)
    ds1 = ds1.map(input_columns=["image"], operations=decode_op)
    logger.info("ds1.dataset_size is ", ds1.get_dataset_size())
    shape = ds1.output_shapes()
    logger.info(shape)
    num_iter = 0
    for _ in ds1.create_dict_iterator():
        logger.info("get data from dataset")
        num_iter += 1

    logger.info("Number of data in ds1: {} ".format(num_iter))
    assert num_iter == 8
    logger.info('test_cache_map_basic4 Ended.\n')
Example #30
def classification_dataset(data_dir,
                           image_size,
                           per_batch_size,
                           rank=0,
                           group_size=1,
                           mode='train',
                           input_mode='folder',
                           root='',
                           num_parallel_workers=None,
                           shuffle=None,
                           sampler=None,
                           repeat_num=1,
                           class_indexing=None,
                           drop_remainder=True,
                           transform=None,
                           target_transform=None):
    """
    A function that returns a dataset for classification. The input mode can be "folder" or "txt".
    If it is "folder", all images within one folder share the same label. If it is "txt", the paths of all
    images are listed in a text file.

    Args:
        data_dir (str): Path to the root directory that contains the dataset for "input_mode="folder"".
            Or path of the textfile that contains every image's path of the dataset.
        image_size (Union[int, sequence]): Size of the input images.
        per_batch_size (int): The batch size of every step during training.
        rank (int): The shard ID within group_size. Default: 0.
        group_size (int): Number of shards that the dataset should be divided
            into. Default: 1.
        mode (str): "train" or others. Default: "train".
        input_mode (str): The form of the input dataset, "folder" or "txt". Default: "folder".
        root (str): The images path for input_mode="txt". Default: "".
        num_parallel_workers (int): Number of workers to read the data. Default: None.
        shuffle (bool): Whether or not to perform shuffle on the dataset
            (default=None, performs shuffle).
        sampler (Sampler): Object used to choose samples from the dataset. Default: None.
        repeat_num (int): Number of times to repeat the dataset. Default: 1.
        class_indexing (dict): A str-to-int mapping from folder name to index
            (default=None, the folder names will be sorted
            alphabetically and each class will be given a
            unique index starting from 0).
        drop_remainder (bool): Whether to drop the last incomplete batch. Default: True.
        transform (list): Image operations to use instead of the defaults built here. Default: None.
        target_transform (list): Label operations to use instead of the default TypeCast. Default: None.

    Examples:
        >>> from mindvision.common.datasets.classification import classification_dataset
        >>> # path to imagefolder directory. This directory needs to contain sub-directories which contain the images
        >>> dataset_dir = "/path/to/imagefolder_directory"
        >>> de_dataset = classification_dataset(dataset_dir, image_size=[224, 244],
        >>>                                     per_batch_size=64, rank=0, group_size=4)
        >>> # Path of the textfile that contains every image's path of the dataset.
        >>> dataset_dir = "/path/to/dataset/images/train.txt"
        >>> images_dir = "/path/to/dataset/images"
        >>> de_dataset = classification_dataset(dataset_dir, image_size=[224, 244],
        >>>                                     per_batch_size=64, rank=0, group_size=4,
        >>>                                     input_mode="txt", root=images_dir)
    """

    mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
    std = [0.229 * 255, 0.224 * 255, 0.225 * 255]

    if transform is None:
        if mode == 'train':
            transform_img = [
                vision.RandomCropDecodeResize(image_size, scale=(0.08, 1.0)),
                vision.RandomHorizontalFlip(prob=0.5),
                vision.Normalize(mean=mean, std=std),
                vision.HWC2CHW()
            ]
        else:
            transform_img = [
                vision.Decode(),
                vision.Resize((256, 256)),
                vision.CenterCrop(image_size),
                vision.Normalize(mean=mean, std=std),
                vision.HWC2CHW()
            ]
    else:
        transform_img = transform

    if target_transform is None:
        transform_label = [C.TypeCast(mstype.int32)]
    else:
        transform_label = target_transform

    if input_mode == 'folder':
        de_dataset = de.ImageFolderDatasetV2(
            data_dir,
            num_parallel_workers=num_parallel_workers,
            shuffle=shuffle,
            sampler=sampler,
            class_indexing=class_indexing,
            num_shards=group_size,
            shard_id=rank)
    else:
        dataset = TxtDataset(root, data_dir)
        sampler = DistributedSampler(dataset,
                                     rank,
                                     group_size,
                                     shuffle=shuffle)
        de_dataset = de.GeneratorDataset(dataset, ["image", "label"],
                                         sampler=sampler)
        de_dataset.set_dataset_size(len(sampler))

    de_dataset = de_dataset.map(input_columns="image",
                                num_parallel_workers=8,
                                operations=transform_img)
    de_dataset = de_dataset.map(input_columns="label",
                                num_parallel_workers=8,
                                operations=transform_label)

    columns_to_project = ["image", "label"]
    de_dataset = de_dataset.project(columns=columns_to_project)

    de_dataset = de_dataset.batch(per_batch_size,
                                  drop_remainder=drop_remainder)
    de_dataset = de_dataset.repeat(repeat_num)

    return de_dataset