Example #1
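These snippets assume the usual tensorpack imports. A minimal preamble that would make most of them runnable might look like this (module paths as of tensorpack ~0.9; later versions renamed PrefetchDataZMQ to MultiProcessRunnerZMQ, so adjust to your version; individual examples additionally use os, time, zipfile, torch, and extra dataflow symbols such as MapData, PrefetchData, and LMDBSerializer):

import multiprocessing

import cv2
import numpy as np
from tensorpack.dataflow import (
    AugmentImageComponent, BatchData, DataFromList,
    MultiThreadMapData, PrefetchDataZMQ, dataset, imgaug)
from tensorpack.utils import logger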
def get_imagenet_dataflow(datadir, name, batch_size, augmentors):
    """
    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    cpu = min(30, multiprocessing.cpu_count())
    if isTrain:
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        ds = PrefetchDataZMQ(ds, cpu)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            im = aug.augment(im)
            return im, cls

        ds = ThreadedMapData(ds, cpu, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchDataZMQ(ds, 1)
    return ds
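A typical call site, for reference (fbresnet_augmentor is the augmentor factory used in tensorpack's ImageNet examples; the path is illustrative):

df = get_imagenet_dataflow('/path/to/ILSVRC12', 'train', 64,
                           fbresnet_augmentor(True))
df.reset_state()  # must be called once before iteration
for images, labels in df.get_data():
    pass  # feed each batch to the trainer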
Example #2
def get_imagenet_dataflow(
        datadir, name, batch_size,
        augmentors, parallel=None):
    """
    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    meta_dir = os.path.join(datadir, "meta")
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count())
    if isTrain:
        ds = Imagenet5k(datadir, name, meta_dir=meta_dir, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warn("DataFlow may become the bottleneck when too few processes are used.")
        ds = PrefetchDataZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = Imagenet5kFiles(datadir, name, meta_dir=meta_dir, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            im = aug.augment(im)
            return im, cls
        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchDataZMQ(ds, 1)
    return ds
Example #3
def get_val_dataflow(
        datadir, batch_size,
        augmentors, parallel=None,
        num_splits=None, split_index=None):
    assert datadir is not None
    assert isinstance(augmentors, list)
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count())

    if num_splits is None:
        ds = dataset.ILSVRC12Files(datadir, 'val', shuffle=False)
    else:
        assert split_index < num_splits
        files = dataset.ILSVRC12Files(datadir, 'val', shuffle=False)
        files.reset_state()
        files = list(files.get_data())
        logger.info("#ValidationData = {}".format(len(files)))
        split_size = len(files) // num_splits
        start, end = split_size * split_index, split_size * (split_index + 1)
        end = min(end, len(files))
        logger.info("#ValidationSplit = {} - {}".format(start, end))
        files = files[start: end]
        ds = DataFromList(files, shuffle=False)
    aug = imgaug.AugmentorList(augmentors)

    def mapf(dp):
        fname, cls = dp
        im = cv2.imread(fname, cv2.IMREAD_COLOR)
        im = aug.augment(im)
        return im, cls
    ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
    ds = BatchData(ds, batch_size, remainder=True)
    # ds = PrefetchDataZMQ(ds, 1)
    # do not fork() under MPI
    return ds
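The commented-out PrefetchDataZMQ and the "do not fork() under MPI" note suggest this dataflow is meant for MPI-style multi-worker validation, with num_splits/split_index sharding the file list. A hypothetical call site under Horovod (assuming hvd.init() has already run, and fbresnet_augmentor as the augmentor factory):

import horovod.tensorflow as hvd
ds = get_val_dataflow('/path/to/ILSVRC12', 25, fbresnet_augmentor(False),
                      num_splits=hvd.size(), split_index=hvd.rank())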
Example #4
def data_augmentation(im, augmentors):
    """
    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert isinstance(augmentors, list)
    aug = imgaug.AugmentorList(augmentors)
    im = aug.augment(im)
    return im
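data_augmentation applies a one-off augmentor list to a single image; a small sketch (the augmentors and file name are illustrative):

im = cv2.imread('example.jpg', cv2.IMREAD_COLOR)
im = data_augmentation(im, [imgaug.ResizeShortestEdge(256),
                            imgaug.CenterCrop((224, 224))])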
Example #5
def get_imagenet_dataflow(datadir,
                          name,
                          batch_size,
                          augmentors,
                          parallel=None):
    """
    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count())
    if isTrain:
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = PrefetchDataZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp

            # the images are stored in per-class zip archives:
            # <dirname>.zip contains <dirname>/<filename>
            jpeg_filename = os.path.basename(fname)
            jpeg_dirname = os.path.basename(os.path.dirname(fname))
            zip_filepath = os.path.dirname(fname) + '.zip'

            # np.frombuffer avoids the deprecated np.fromstring;
            # the context manager closes the archive after each read
            with zipfile.ZipFile(zip_filepath, 'r') as f:
                compressed_jpeg = np.frombuffer(
                    f.read(os.path.join(jpeg_dirname, jpeg_filename)),
                    dtype=np.uint8)

            im = cv2.imdecode(compressed_jpeg, cv2.IMREAD_COLOR)
            # im = cv2.imread(fname, cv2.IMREAD_COLOR)

            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds,
                                parallel,
                                mapf,
                                buffer_size=2000,
                                strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchDataZMQ(ds, 1)
    return ds
Example #6
def get_imagenet_dataflow(datadir,
                          name,
                          batch_size,
                          augmentors,
                          parallel=None):  # build the ImageNet dataflow
    """
    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    if parallel is None:  # no worker count given
        # assuming hyperthreading: use half of the logical CPU count
        parallel = min(40, multiprocessing.cpu_count() // 2)

    if isTrain:
        # ILSVRC12 yields (image array, label) datapoints from the raw dataset
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        # apply the augmentors, with shared parameters, to the image component
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:  # fewer than 16 worker processes
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = PrefetchDataZMQ(ds, parallel)  # parallelize the pipeline over ZMQ
        ds = BatchData(ds, batch_size, remainder=False)  # group datapoints into batches
    else:
        # at test time, read filenames and decode/augment them in threads;
        # ILSVRC12Files is the same as ILSVRC12 but yields the image filenames
        # instead of numpy arrays
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            # cv2.IMREAD_COLOR (the default flag): load a color image, ignoring alpha
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            im = aug.augment(im)  # augment the image
            return im, cls

        ds = MultiThreadMapData(ds,
                                parallel,
                                mapf,
                                buffer_size=2000,
                                strict=True)  # map in parallel threads
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchDataZMQ(ds, 1)
    return ds
Example #7
def get_imagenet_dataflow(datadir,
                          name,
                          batch_size,
                          augmentors,
                          parallel=None):
    """
    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count() // 6)
    if isTrain:
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warning(
                "DataFlow may become the bottleneck when too few processes are used."
            )

        ds = PrefetchData(ds, 1000, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            # placeholder that is returned (un-augmented) if every read attempt fails
            im = np.zeros((256, 256, 3), dtype=np.uint8)
            for _ in range(30):
                try:
                    im = cv2.imread(fname, cv2.IMREAD_COLOR)
                    im = aug.augment(im)
                    break
                except Exception as e:
                    # logger takes a single message string, not print-style arguments
                    logger.warning("{} file={}".format(e, fname))
                    time.sleep(1)
            return im, cls

        ds = MultiThreadMapData(ds,
                                parallel,
                                mapf,
                                buffer_size=2000,
                                strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchData(ds, 100, 1)
    return ds
Example #8
def get_val_dataflow(datadir,
                     batch_size,
                     augmentors=None,
                     parallel=None,
                     num_splits=None,
                     split_index=None,
                     dataname="val"):
    if augmentors is None:
        augmentors = fbresnet_augmentor(False)
    assert datadir is not None
    assert isinstance(augmentors, list)
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count())

    if num_splits is None:
        ds = dataset.ILSVRC12Files(datadir, dataname, shuffle=True)
    else:
        # shard validation data (disabled: the assert below makes this branch unreachable)
        assert False
        assert split_index < num_splits
        files = dataset.ILSVRC12Files(datadir, dataname, shuffle=True)
        files.reset_state()
        files = list(files.get_data())
        logger.info("Number of validation data = {}".format(len(files)))
        split_size = len(files) // num_splits
        start, end = split_size * split_index, split_size * (split_index + 1)
        end = min(end, len(files))
        logger.info("Local validation split = {} - {}".format(start, end))
        files = files[start:end]
        ds = DataFromList(files, shuffle=True)

    aug = imgaug.AugmentorList(augmentors)

    def mapf(dp):
        fname, cls = dp
        im = cv2.imread(fname, cv2.IMREAD_COLOR)
        #from BGR to RGB
        im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
        im = aug.augment(im)
        return im, cls

    ds = MultiThreadMapData(ds,
                            parallel,
                            mapf,
                            buffer_size=min(2000, ds.size()),
                            strict=True)
    ds = BatchData(ds, batch_size, remainder=False)
    ds = RepeatedData(ds, num=-1)
    # do not fork() under MPI
    return ds
Example #9
def imgaug_wrapper(imgaug_list):
    imgaug_augmentor = imgaug.AugmentorList(imgaug_list)
    # don't normalize the image yet. will normalize within data_prefetcher
    # don't use transforms.ToTensor(): it scales pixels to [0, 1] by dividing by 255
    ToTensor = torch.from_numpy

    # img_obj is a PIL Image object. Returns a CPU uint8 tensor in CHW layout.
    def real_augmentor(img_obj):
        img_np = np.asarray(img_obj)
        transforms = imgaug_augmentor.get_transform(img_np)
        img_np2 = transforms.apply_image(img_np)
        img_np2 = np.rollaxis(img_np2, 2)  # HWC -> CHW
        img_ts = ToTensor(np.ascontiguousarray(img_np2))
        return img_ts

    return real_augmentor
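A sketch of how the wrapper could sit in a PyTorch input pipeline (the augmentor list and file name are illustrative; torch and PIL are assumed available):

from PIL import Image
augment = imgaug_wrapper([imgaug.ResizeShortestEdge(256),
                          imgaug.CenterCrop((224, 224))])
img_ts = augment(Image.open('example.jpg'))  # uint8 CHW tensor, not yet normalized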
Example #10
def get_sequential_loader(ds, isTrain, batch_size, augmentors, parallel=None):
    """ Load a Single-File LMDB (Sequential Read)
    Args:
        augmentors (list[imgaug.Augmentor]): Defaults to `fbresnet_augmentor(isTrain)`

    Returns: A DataFlow which produces BGR images and labels.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/tutorial/efficient-dataflow.html
    """
    assert isinstance(augmentors, list)
    aug = imgaug.AugmentorList(augmentors)

    if parallel is None:
        parallel = min(40,
                       multiprocessing.cpu_count() //
                       2)  # assuming hyperthreading

    if isTrain:
        ds = LocallyShuffleData(ds, 50000)
        ds = MapDataComponent(ds, lambda x: cv2.imdecode(x, cv2.IMREAD_COLOR), 0)
        ds = AugmentImageComponent(ds, aug, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = BatchData(ds, batch_size, remainder=False, use_list=True)
        ds = MultiProcessRunnerZMQ(ds, parallel)
    else:

        def mapper(data):
            im, label = data
            im = cv2.imdecode(im, cv2.IMREAD_COLOR)
            im = aug.augment(im)
            return im, label

        ds = MultiProcessMapDataZMQ(ds,
                                    parallel,
                                    mapper,
                                    buffer_size=2000,
                                    strict=True)
        ds = BatchData(ds, batch_size, remainder=True, use_list=True)
    return ds
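A plausible call site, assuming the LMDB file was written with tensorpack's LMDBSerializer (path illustrative):

from tensorpack.dataflow import LMDBSerializer
ds = LMDBSerializer.load('/path/to/ILSVRC12-train.lmdb', shuffle=False)
ds = get_sequential_loader(ds, True, 64, fbresnet_augmentor(True))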
Example #11
def get_random_loader(ds, isTrain, batch_size, augmentors, parallel=None):
    """ DataFlow data (Random Read)
    Args:
        augmentors (list[imgaug.Augmentor]): Defaults to `fbresnet_augmentor(isTrain)`

    Returns: A DataFlow which produces BGR images and labels.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/tutorial/efficient-dataflow.html
    """
    assert isinstance(augmentors, list)
    aug = imgaug.AugmentorList(augmentors)

    if parallel is None:
        parallel = min(40,
                       multiprocessing.cpu_count() //
                       2)  # assuming hyperthreading

    if isTrain:
        ds = AugmentImageComponent(ds, aug, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = MultiProcessRunnerZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:

        def mapf(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds,
                                parallel,
                                mapf,
                                buffer_size=2000,
                                strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = MultiProcessRunnerZMQ(ds, 1)
    return ds
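The random-read variant takes the decoded dataset directly for training (and a file-name dataflow for validation); an illustrative training call:

ds = dataset.ILSVRC12('/path/to/ILSVRC12', 'train', shuffle=True)
ds = get_random_loader(ds, True, 64, fbresnet_augmentor(True))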
Example #12
def get_data_tmp(name, base_dir, meta_dir, gpu_nums):

    isTrain = 'train' in name

    m = np.array([104, 116, 122])  # per-channel image mean
    const_arr = np.resize(m, (1, 1, 3))  # 1x1x3, broadcastable over HWC images
    const_arr = np.zeros((args.crop_size[0], args.crop_size[1], 3)) + const_arr  # broadcast to crop size

    def imgread(ds):
        img, label = ds
        img = cv2.imread(img, cv2.IMREAD_COLOR)
        label = cv2.imread(label, cv2.IMREAD_GRAYSCALE)
        return img, label

    if isTrain:
        #ds = LMDBData('/data2/dataset/cityscapes/cityscapes_train.lmdb', shuffle=True)
        #ds = FakeData([[batch_size, CROP_HEIGHT, CROP_HEIGHT, 3], [batch_size, CROP_HEIGHT, CROP_HEIGHT, 1]], 5000, random=False, dtype='uint8')
        ds = PascalVOC12Files(base_dir, meta_dir, name, shuffle=True)
        parallel = min(6, multiprocessing.cpu_count())
        augmentors = [
            RandomCropWithPadding(args.crop_size),
            Flip(horiz=True),
        ]
        aug = imgaug.AugmentorList(augmentors)
        def mapf(ds):
            img, label = ds
            img = cv2.imread(img, cv2.IMREAD_COLOR)
            label = cv2.imread(label, cv2.IMREAD_GRAYSCALE)
            img, params = aug.augment_return_params(img)
            label = aug._augment(label, params)  # reuse the same spatial params on the label map
            img = img - const_arr  # mean subtraction (very time-consuming)
            return img, label

        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=500, strict=True)
        ds = BatchData(ds, args.batch_size * gpu_nums)
        ds = PrefetchDataZMQ(ds, 1)
    else:
        ds = PascalVOC12(base_dir, meta_dir, name, shuffle=False)
        ds = MapData(ds, imgread)
        ds = BatchData(ds, 1)

    return ds
Example #13
def get_AVA2012_dataflow(datadir,
                         name,
                         batch_size,
                         augmentors,
                         CAM_dir_pkl=None,
                         CAMCropR=False,
                         repeat_times=1,
                         strict_order=False,
                         remainder_TR=False):
    """
    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val']
    assert datadir is not None
    assert isinstance(augmentors, list)

    isTrain = name == 'train'
    cpu = min(2, multiprocessing.cpu_count())
    if isTrain:
        # change @ 20171122
        # change @ 20171125: for CAMCropR
        ds = dataset.AVA2012(datadir,
                             name,
                             CAM_dir_pkl=CAM_dir_pkl,
                             CAMCropR=CAMCropR,
                             shuffle=not strict_order)
        print('--> information about the original dataFlow from AVA2012 [TRAIN]:')
        print(ds)

        ds = AugmentImageComponent(ds, augmentors, copy=False)
        # change @ 20171122
        # NOTE: When ``nr_proc=1``, the dataflow produces the same data as ``ds`` in the same order
        ds = PrefetchDataZMQ(ds, 1 if strict_order else cpu)
        ds = BatchData(ds, batch_size, remainder=remainder_TR)

    else:
        ds = dataset.AVA2012(datadir, name, shuffle=False)
        print('--> information about the original dataFlow from AVA2012 [VALIDATION]:')
        print(ds)

        # add @ 20171121
        ds = RepeatedDataPoint(ds, repeat_times)

        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            im, cls = dp
            im = aug.augment(im)
            return im, cls

        # change @ 20171122
        # BUG @ 20171128
        # NOTE: The order of data points out of MultiThreadMapData.get_data()
        #       is not the same as ds.get_data() !
        # NOTE: buffer_size is the minimum number of images loaded from a given folder
        ds = MultiThreadMapData(ds, 1 if strict_order else cpu,
                                mapf, buffer_size=500, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchDataZMQ(ds, 1)

    return ds