Example #1
def get_imagenet_dataflow(datadir,
                          name,
                          batch_size,
                          augmentors=None,
                          parallel=None):
    """
    Args:
        augmentors (list[imgaug.Augmentor]): Defaults to `fbresnet_augmentor(isTrain)`

    Returns: A DataFlow which produces BGR images and labels.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    isTrain = name == 'train'
    assert datadir is not None
    if augmentors is None:
        augmentors = fbresnet_augmentor(isTrain)
    assert isinstance(augmentors, list)
    if parallel is None:
        parallel = min(40,
                       multiprocessing.cpu_count() //
                       2)  # assuming hyperthreading

    if isTrain:
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = MultiProcessRunnerZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds,
                                parallel,
                                mapf,
                                buffer_size=2000,
                                strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = MultiProcessRunnerZMQ(ds, 1)
    return ds
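
A minimal usage sketch, assuming tensorpack's fbresnet_augmentor (which crops training images to 224x224) and a placeholder dataset path:

# Hypothetical usage of get_imagenet_dataflow; '/data/imagenet' is a placeholder.
df = get_imagenet_dataflow('/data/imagenet', 'train', batch_size=64)
df.reset_state()              # a tensorpack DataFlow must be reset before iteration
for images, labels in df:     # images: (64, 224, 224, 3) BGR; labels: (64,)
    print(images.shape, labels.shape)
    break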
Example #2
def __init__(self, file_location, batch_size, train=True, shuffle=True, full=False, batch_from_disk=150):
   self.batch_size = batch_size
   self.train = train
   if train:
     self.ds = MyLMDBSerializer.load(file_location, shuffle=shuffle, batch_from_disk=batch_from_disk)
     self.ds = MyLocallyShuffleData(self.ds, buffer_size=10000, shuffle_interval=500)
     self.ds = MultiProcessRunnerZMQ(self.ds, num_proc=1, hwm=10000)
     self.len_ = 1281167
   else:
     self.ds = LMDBSerializer.load(file_location, shuffle=False)
     self.ds = MultiProcessRunnerZMQ(self.ds, num_proc=1, hwm=10000)
     self.len_ = 50000
   self.ds.reset_state()
   self.batches_in_epoch = int(math.ceil(self.len_ / self.batch_size))
Example #3
def get_imagenet_dataflow(datadir, name, batch_size, parallel=None):
    """
    Get a standard imagenet training/evaluation dataflow, for linear classifier tuning.
    """
    assert name in ['train', 'val']
    isTrain = name == 'train'
    assert datadir is not None
    augmentors = get_basic_augmentor(isTrain)
    augmentors = imgaug.AugmentorList(augmentors)
    if parallel is None:
        parallel = min(50, mp.cpu_count())

    def mapper(dp):
        fname, label = dp
        img = cv2.imread(fname)
        img = augmentors.augment(img)
        return img, label

    if isTrain:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=True)
        ds = MultiProcessMapAndBatchDataZMQ(ds,
                                            parallel,
                                            mapper,
                                            batch_size,
                                            buffer_size=7000)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        ds = MultiThreadMapData(ds,
                                parallel,
                                mapper,
                                buffer_size=2000,
                                strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = MultiProcessRunnerZMQ(ds, 1)
    return ds
Example #4
    def get_data(self, name, num_gpu):
        gpu_batch = self.batch_size // num_gpu

        assert name in ['train', 'val', 'test']
        isTrain = name == 'train'

        augmentors = fbresnet_augmentor(isTrain)
        assert isinstance(augmentors, list)

        parallel = min(40,
                       multiprocessing.cpu_count() //
                       2)  # assuming hyperthreading

        if isTrain:
            ds = dataset.ILSVRC12(self.datadir,
                                  name,
                                  shuffle=True,
                                  dir_structure='train')
            ds = AugmentImageComponent(ds, augmentors, copy=False)
            ds = MultiProcessRunnerZMQ(ds, parallel)
            ds = BatchData(ds, gpu_batch, remainder=False)
            #ds = QueueInput(ds)
        else:
            ds = dataset.ILSVRC12Files(self.datadir,
                                       name,
                                       shuffle=False,
                                       dir_structure='train')
            aug = imgaug.AugmentorList(augmentors)

            def mapf(dp):
                fname, cls = dp
                im = cv2.imread(fname, cv2.IMREAD_COLOR)
                im = aug.augment(im)
                return im, cls

            ds = MultiThreadMapData(ds,
                                    parallel,
                                    mapf,
                                    buffer_size=2000,
                                    strict=True)
            ds = BatchData(ds, gpu_batch, remainder=True)
            ds = MultiProcessRunnerZMQ(ds, 1)

            if num_gpu == 1:
                ds = QueueInput(ds)
        return ds
def get_random_loader(ds, isTrain, batch_size, augmentors, parallel=None):
    """ DataFlow data (Random Read)
    Args:
        augmentors (list[imgaug.Augmentor]): Defaults to `fbresnet_augmentor(isTrain)`

    Returns: A DataFlow which produces BGR images and labels.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/tutorial/efficient-dataflow.html
    """
    assert isinstance(augmentors, list)
    aug = imgaug.AugmentorList(augmentors)

    if parallel is None:
        parallel = min(40,
                       multiprocessing.cpu_count() //
                       2)  # assuming hyperthreading

    if isTrain:
        ds = AugmentImageComponent(ds, aug, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = MultiProcessRunnerZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:

        def mapf(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds,
                                parallel,
                                mapf,
                                buffer_size=2000,
                                strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = MultiProcessRunnerZMQ(ds, 1)
    return ds
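
Note the asymmetry in what ds must already yield: the training branch augments decoded images in place, while the validation branch maps (filename, label) pairs through cv2.imread. A hedged sketch of matching call sites, with a placeholder datadir:

# Hypothetical call sites for get_random_loader; datadir is a placeholder.
datadir = '/data/imagenet'
train_ds = dataset.ILSVRC12(datadir, 'train', shuffle=True)     # yields decoded images
train_df = get_random_loader(train_ds, True, 64, fbresnet_augmentor(True))
val_ds = dataset.ILSVRC12Files(datadir, 'val', shuffle=False)   # yields (fname, label)
val_df = get_random_loader(val_ds, False, 64, fbresnet_augmentor(False))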
Example #6
File: r.py Project: voidiak/MTRE
def getdata(path, isTrain):
    ds = LMDBSerializer.load(path, shuffle=isTrain)

    # Graph benchmark: swap the LMDB input for synthetic data of the same shapes.
    # ds = FakeData(
    #     [[10, 10], [10, 10], [10, 10], [10, 10], [10], [10], [10, 10], [1], [1], [1]],
    #     1000, random=False, dtype=['int32'] * 10,
    #     domain=[(0, 100), (0, 120), (0, 120), (0, 1), (0, 100), (0, 100),
    #             (0, 100), (0, 52), (0, 115), (0, 115)])

    ds = getbatch(ds, 32, isTrain)
    if isTrain:
        ds = MultiProcessRunnerZMQ(ds, 4)
    return ds
Example #7
class ExternalInputIterator(object):
  def __init__(self, file_location, batch_size, train=True, shuffle=True, full=False, batch_from_disk=150):
    self.batch_size = batch_size
    self.train = train
    if train:
      self.ds = MyLMDBSerializer.load(file_location, shuffle=shuffle, batch_from_disk=batch_from_disk)
      self.ds = MyLocallyShuffleData(self.ds, buffer_size=10000, shuffle_interval=500)
      self.ds = MultiProcessRunnerZMQ(self.ds, num_proc=1, hwm=10000)
      self.len_ = 1281167
    else:
      self.ds = LMDBSerializer.load(file_location, shuffle=False)
      self.ds = MultiProcessRunnerZMQ(self.ds, num_proc=1, hwm=10000)
      self.len_ = 50000
    self.ds.reset_state()
    self.batches_in_epoch = int(math.ceil(self.len_ / self.batch_size))

  def __iter__(self):
    if not self.train:
      self.iterator = self.ds.__iter__()
    else:
      if not hasattr(self, 'iterator'):
        self.iterator = self.ds.__iter__()
    self.i = 0
    return self

  def __next__(self):
    batch = []
    labels = []

    if self.i >= self.batches_in_epoch:
      raise StopIteration

    for _ in range(self.batch_size):
      jpeg_bytes, label = next(self.iterator)
      batch.append(jpeg_bytes)
      labels.append(np.array([label], dtype=np.int32))  # int32: ImageNet labels run 0-999 and would overflow uint8
    self.i += 1
    return batch, labels

  next = __next__
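
The iterator hands out batches of still-encoded JPEG bytes plus per-sample label arrays, the format typically consumed by an external-source pipeline such as NVIDIA DALI's; the class itself does not depend on DALI. A short sketch that drains one batch, with a placeholder LMDB path:

# Hypothetical usage; the LMDB path is a placeholder.
it = ExternalInputIterator('/data/imagenet_train.lmdb', batch_size=32, train=True)
jpeg_batch, label_batch = next(iter(it))
# jpeg_batch: list of 32 arrays holding encoded JPEG bytes
# label_batch: list of 32 numpy arrays of shape (1,)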
Example #8
def get_train_dataflow(datadir, batch, augmentors=None):
    """
    Sec 3, Remark 4:
    Use a single random shuffling of the training data (per epoch)
    that is divided amongst all k workers.

    NOTE: This implementation does not follow the paper exactly;
    here, each machine shuffles independently.
    """
    if augmentors is None:
        augmentors = fbresnet_augmentor(True)
    ds = dataset.ILSVRC12(datadir, 'train', shuffle=True)
    ds = AugmentImageComponent(ds, augmentors, copy=False)
    ds = BatchData(ds, batch, remainder=False)
    ds = MultiProcessRunnerZMQ(ds, min(50, mp.cpu_count()))
    return ds
def get_sequential_loader(ds, isTrain, batch_size, augmentors, parallel=None):
    """ Load a Single-File LMDB (Sequential Read)
    Args:
        augmentors (list[imgaug.Augmentor]): Defaults to `fbresnet_augmentor(isTrain)`

    Returns: A DataFlow which produces BGR images and labels.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/tutorial/efficient-dataflow.html
    """
    assert isinstance(augmentors, list)
    aug = imgaug.AugmentorList(augmentors)

    if parallel is None:
        parallel = min(40,
                       multiprocessing.cpu_count() //
                       2)  # assuming hyperthreading

    if isTrain:
        ds = LocallyShuffleData(ds, 50000)
        ds = MapDataComponent(ds, lambda x: cv2.imdecode(x, cv2.IMREAD_COLOR),
                              0)
        ds = AugmentImageComponent(ds, aug, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = BatchData(ds, batch_size, remainder=False, use_list=True)
        ds = MultiProcessRunnerZMQ(ds, parallel)
    else:

        def mapper(data):
            im, label = data
            im = cv2.imdecode(im, cv2.IMREAD_COLOR)
            im = aug.augment(im)
            return im, label

        ds = MultiProcessMapDataZMQ(ds,
                                    parallel,
                                    mapper,
                                    buffer_size=2000,
                                    strict=True)
        ds = BatchData(ds, batch_size, remainder=True, use_list=True)
    return ds
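
A hedged usage sketch: get_sequential_loader expects a DataFlow that already yields (encoded image bytes, label) pairs, e.g. one loaded from a single-file LMDB (placeholder path below). Since both branches batch with use_list=True, each datapoint is a pair of Python lists rather than stacked arrays:

# Hypothetical usage; the LMDB path is a placeholder.
ds = LMDBSerializer.load('/data/imagenet_train.lmdb', shuffle=False)
df = get_sequential_loader(ds, True, 64, fbresnet_augmentor(True))
df.reset_state()
images, labels = next(iter(df))   # two lists of length 64 (use_list=True)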
def get_imagenet_dataflow(datadir,
                          name,
                          batch_size,
                          augmentors=None,
                          parallel=None):
    """
    Args:
        augmentors (list[imgaug.Augmentor]): Defaults to `fbresnet_augmentor(isTrain)`

    Returns: A DataFlow which produces BGR images and labels.
    """
    assert name in ['train', 'val', 'test']
    isTrain = name == 'train'
    assert datadir is not None
    if augmentors is None:
        augmentors = fbresnet_augmentor(isTrain)
    assert isinstance(augmentors, list)
    augmentors = AugmentorList(augmentors)
    if parallel is None:
        parallel = min(40,
                       multiprocessing.cpu_count() //
                       2)  # assuming hyperthreading

    def mapf(dp):
        fname, label = dp
        img = cv2.imread(fname)
        img = augmentors.augment(img)
        return img, label

    if isTrain:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=True)
        ds = MultiProcessMapDataZMQ(ds, parallel, mapf, buffer_size=2000)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        ds = MultiThreadMapData(ds,
                                parallel,
                                mapf,
                                buffer_size=2000,
                                strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = MultiProcessRunnerZMQ(ds, 1)
    return ds
Example #11
def dump_imdb(ds, output_path, parallel=None):
    """
    Create a Single-File LMDB from raw images.
    """
    if parallel is None:
        parallel = min(40,
                       multiprocessing.cpu_count() //
                       2)  # assuming hyperthreading

    def mapf(dp):
        fname, label = dp
        with open(fname, 'rb') as f:
            raw = f.read()  # read the encoded file; avoid shadowing the builtin `bytes`
        raw = np.asarray(bytearray(raw), dtype='uint8')
        return raw, label

    ds = MultiThreadMapData(ds, 1, mapf, buffer_size=2000, strict=True)
    ds = MultiProcessRunnerZMQ(ds, num_proc=parallel)

    LMDBSerializer.save(ds, output_path)
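
A hedged usage sketch with placeholder paths. One caveat: with num_proc=parallel, every worker process runs the whole upstream pipeline, so the saved LMDB can contain duplicated datapoints; passing parallel=1 sidesteps that, which is what tensorpack's own dump examples do:

# Hypothetical usage; paths are placeholders.
ds = dataset.ILSVRC12Files('/data/imagenet', 'train', shuffle=False)
dump_imdb(ds, '/data/imagenet_train.lmdb', parallel=1)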
Example #12
def getdata(path, batchsize, isTrain):
    ds = LMDBSerializer.load(path, shuffle=isTrain)
    ds = getbatch(ds, batchsize, isTrain)
    if isTrain:
        ds = MultiProcessRunnerZMQ(ds, 2)
    return ds
Example #13
    # --------------------
    parser.add_argument('--imagenet_folder')
    parser.add_argument('--val', action='store_true')
    parser.add_argument('--train', action='store_true')
    parser.add_argument('--lmdb_file', type=str)

    args = parser.parse_args()

    if args.val and args.train:
        print(
            "Train and Validation options are mutually exclusive! Choose only one."
        )
        raise SystemExit(1)  # abort here; otherwise the --val branch below would still run

    if args.val:
        print(
            "We are generating the lmdb file containing validation images of imagenet."
        )
        print(f"The file will be saved at {args.lmdb_file}.lmdb")

        ds0 = BinaryILSVRC12(os.path.expanduser(args.imagenet_folder), 'val')
        ds1 = MultiProcessRunnerZMQ(ds0, num_proc=1)
        LMDBSerializer.save(ds1, f"{os.path.expanduser(args.lmdb_file)}.lmdb")
    elif args.train:
        print(
            "We are generating the lmdb file containing training images of imagenet."
        )
        print(f"The file will be saved at {args.lmdb_file}.lmdb")

        ds0 = BinaryILSVRC12(os.path.expanduser(args.imagenet_folder), 'train')
        ds1 = MultiProcessRunnerZMQ(ds0, num_proc=1)
        LMDBSerializer.save(ds1, f"{os.path.expanduser(args.lmdb_file)}.lmdb")
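
A hedged invocation sketch; the script name is hypothetical and the paths are placeholders:

python make_imagenet_lmdb.py --train --imagenet_folder ~/imagenet --lmdb_file ~/data/imagenet_train

This would write the serialized training set to ~/data/imagenet_train.lmdb.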