def get_imagenet_dataflow(datadir, name, batch_size, augmentors=None, parallel=None):
    """
    Build a batched ImageNet (ILSVRC12) DataFlow for training or evaluation.

    Args:
        datadir (str): dataset root passed to `dataset.ILSVRC12` /
            `dataset.ILSVRC12Files`. Must not be None.
        name (str): one of 'train', 'val' or 'test'. Only 'train' selects the
            training pipeline.
        batch_size (int): batch size passed to `BatchData`.
        augmentors (list[imgaug.Augmentor]): Defaults to `fbresnet_augmentor(isTrain)`
        parallel (int): degree of parallelism, passed to `MultiProcessRunnerZMQ`
            for training and to `MultiThreadMapData` otherwise.
            Defaults to half of the CPU count, capped at 40 and at least 1.

    Returns:
        A DataFlow which produces BGR images and labels.

    Raises:
        IOError: from the evaluation mapper, if an image file cannot be read.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    isTrain = name == 'train'
    assert datadir is not None
    if augmentors is None:
        augmentors = fbresnet_augmentor(isTrain)
    assert isinstance(augmentors, list)
    if parallel is None:
        # assuming hyperthreading; never go below 1 (cpu_count() // 2 is 0 on a single core)
        parallel = max(1, min(40, multiprocessing.cpu_count() // 2))

    if isTrain:
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warning("DataFlow may become the bottleneck when too few processes are used.")
        ds = MultiProcessRunnerZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            # cv2.imread reports failure by returning None instead of raising;
            # fail here so the error names the offending file.
            if im is None:
                raise IOError("Failed to read image file: {}".format(fname))
            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = MultiProcessRunnerZMQ(ds, 1)
    return ds
def __init__(self, file_location, batch_size, train=True, shuffle=True, full=False, batch_from_disk=150):
    """
    Set up the LMDB-backed dataflow and the per-epoch batch count.

    Args:
        file_location: LMDB location passed to the serializer's `load`.
        batch_size (int): number of datapoints per batch.
        train (bool): selects the training pipeline (custom serializer plus
            local shuffling) or the plain evaluation pipeline.
        shuffle (bool): forwarded to `MyLMDBSerializer.load`; only used when
            `train` is True.
        full: not used by this method.
        batch_from_disk (int): forwarded to `MyLMDBSerializer.load`; only used
            when `train` is True.
    """
    self.batch_size = batch_size
    self.train = train

    if not train:
        # Evaluation: read the LMDB in its stored order.
        self.len_ = 50000
        pipeline = LMDBSerializer.load(file_location, shuffle=False)
    else:
        # Training: custom loader followed by a local shuffle buffer.
        self.len_ = 1281167
        pipeline = MyLMDBSerializer.load(
            file_location, shuffle=shuffle, batch_from_disk=batch_from_disk)
        pipeline = MyLocallyShuffleData(pipeline, buffer_size=10000, shuffle_interval=500)

    # Both modes run the dataflow in a single background process.
    self.ds = MultiProcessRunnerZMQ(pipeline, num_proc=1, hwm=10000)
    self.ds.reset_state()

    # A trailing partial batch still counts as one batch.
    n_batches = math.ceil(self.len_ / self.batch_size)
    self.batches_in_epoch = int(n_batches)
def get_imagenet_dataflow(datadir, name, batch_size, parallel=None):
    """
    Get a standard imagenet training/evaluation dataflow, for linear classifier tuning.

    Args:
        datadir (str): dataset root passed to `dataset.ILSVRC12Files`. Must not be None.
        name (str): 'train' or 'val'.
        batch_size (int): number of datapoints per batch.
        parallel (int): degree of parallelism, passed to
            `MultiProcessMapAndBatchDataZMQ` for training and to
            `MultiThreadMapData` for validation. Defaults to min(50, cpu_count).

    Returns:
        A DataFlow of batched (image, label) datapoints, with images augmented
        by `get_basic_augmentor(isTrain)`.

    Raises:
        IOError: from the mapper, if an image file cannot be read.
    """
    assert name in ['train', 'val']
    isTrain = name == 'train'
    assert datadir is not None
    augmentors = get_basic_augmentor(isTrain)
    augmentors = imgaug.AugmentorList(augmentors)
    if parallel is None:
        parallel = min(50, mp.cpu_count())

    def mapper(dp):
        fname, label = dp
        img = cv2.imread(fname)
        # cv2.imread reports failure by returning None instead of raising;
        # fail here so the error names the offending file.
        if img is None:
            raise IOError("Failed to read image file: {}".format(fname))
        img = augmentors.augment(img)
        return img, label

    if isTrain:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=True)
        ds = MultiProcessMapAndBatchDataZMQ(ds, parallel, mapper, batch_size, buffer_size=7000)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        ds = MultiThreadMapData(ds, parallel, mapper, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = MultiProcessRunnerZMQ(ds, 1)
    return ds
def get_data(self, name, num_gpu):
    """
    Build the ImageNet dataflow for one split, batched per GPU.

    Args:
        name (str): one of 'train', 'val' or 'test'. Only 'train' selects the
            training pipeline.
        num_gpu (int): number of GPUs; each batch holds
            `self.batch_size // num_gpu` datapoints.

    Returns:
        The batched DataFlow, wrapped in `QueueInput` when `num_gpu == 1`.

    Raises:
        IOError: from the evaluation mapper, if an image file cannot be read.
    """
    gpu_batch = self.batch_size // num_gpu

    assert name in ['train', 'val', 'test']
    isTrain = name == 'train'
    augmentors = fbresnet_augmentor(isTrain)
    assert isinstance(augmentors, list)

    # assuming hyperthreading; never go below 1 (cpu_count() // 2 is 0 on a single core)
    parallel = max(1, min(40, multiprocessing.cpu_count() // 2))

    if isTrain:
        ds = dataset.ILSVRC12(self.datadir, name, shuffle=True, dir_structure='train')
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        ds = MultiProcessRunnerZMQ(ds, parallel)
        ds = BatchData(ds, gpu_batch, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(self.datadir, name, shuffle=False, dir_structure='train')
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            # cv2.imread reports failure by returning None instead of raising;
            # fail here so the error names the offending file.
            if im is None:
                raise IOError("Failed to read image file: {}".format(fname))
            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, gpu_batch, remainder=True)
        ds = MultiProcessRunnerZMQ(ds, 1)

    if num_gpu == 1:
        ds = QueueInput(ds)
    return ds
def get_random_loader(ds, isTrain, batch_size, augmentors, parallel=None):
    """
    DataFlow data (Random Read)

    Args:
        ds: the source DataFlow. For evaluation it must yield
            (filename, label) pairs, since files are read with `cv2.imread`.
            For training the augmentors are applied directly to its
            datapoints (presumably already-decoded images; confirm with the caller).
        isTrain (bool): selects the training or the evaluation pipeline.
        batch_size (int): batch size passed to `BatchData`.
        augmentors (list[imgaug.Augmentor]): required; must be a list.
        parallel (int): degree of parallelism, passed to `MultiProcessRunnerZMQ`
            for training and to `MultiThreadMapData` otherwise.
            Defaults to half of the CPU count, capped at 40 and at least 1.

    Returns:
        A DataFlow which produces BGR images and labels.

    Raises:
        IOError: from the evaluation mapper, if an image file cannot be read.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/tutorial/efficient-dataflow.html
    """
    assert isinstance(augmentors, list)
    aug = imgaug.AugmentorList(augmentors)

    if parallel is None:
        # assuming hyperthreading; never go below 1 (cpu_count() // 2 is 0 on a single core)
        parallel = max(1, min(40, multiprocessing.cpu_count() // 2))

    if isTrain:
        ds = AugmentImageComponent(ds, aug, copy=False)
        if parallel < 16:
            logger.warning("DataFlow may become the bottleneck when too few processes are used.")
        ds = MultiProcessRunnerZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        def mapf(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            # cv2.imread reports failure by returning None instead of raising;
            # fail here so the error names the offending file.
            if im is None:
                raise IOError("Failed to read image file: {}".format(fname))
            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = MultiProcessRunnerZMQ(ds, 1)
    return ds
def getdata(path, isTrain, batchsize=32, num_proc=4):
    """
    Load an LMDB-serialized dataflow from `path` and batch it.

    Args:
        path: location passed to `LMDBSerializer.load`.
        isTrain (bool): enables shuffling on load, is forwarded to `getbatch`,
            and when True wraps the result in `MultiProcessRunnerZMQ`.
        batchsize (int): batch size forwarded to `getbatch`. Defaults to 32,
            the value that used to be hard-coded.
        num_proc (int): process count for `MultiProcessRunnerZMQ` in training
            mode. Defaults to 4, the value that used to be hard-coded.

    Returns:
        The batched dataflow.
    """
    ds = LMDBSerializer.load(path, shuffle=isTrain)

    # Graph Benchmark
    # ds = FakeData([[10, 10], [10, 10], [10, 10], [10, 10], [10], [10], [10, 10], [1], [1], [1]],
    #               1000, random=False,
    #               dtype=['int32', 'int32', 'int32', 'int32', 'int32',
    #                      'int32', 'int32', 'int32', 'int32', 'int32'],
    #               domain=[(0, 100), (0, 120), (0, 120), (0, 1), (0, 100),
    #                       (0, 100), (0, 100), (0, 52), (0, 115), (0, 115)])
    ds = getbatch(ds, batchsize, isTrain)
    if isTrain:
        ds = MultiProcessRunnerZMQ(ds, num_proc)
    return ds
class ExternalInputIterator(object):
    """
    Iterator that groups (jpeg_bytes, label) datapoints from an LMDB-backed
    dataflow into batches of `batch_size`.

    Each iteration step returns `(batch, labels)`: a list of the encoded
    images and a list of one-element integer arrays holding the labels.
    One pass stops after `batches_in_epoch` batches.
    """

    def __init__(self, file_location, batch_size, train=True, shuffle=True, full=False, batch_from_disk=150):
        """
        Args:
            file_location: LMDB location passed to the serializer's `load`.
            batch_size (int): number of datapoints per batch.
            train (bool): selects the training pipeline (custom serializer
                plus local shuffling) or the plain evaluation pipeline.
            shuffle (bool): forwarded to `MyLMDBSerializer.load`; only used
                when `train` is True.
            full: not used by this class.
            batch_from_disk (int): forwarded to `MyLMDBSerializer.load`; only
                used when `train` is True.
        """
        self.batch_size = batch_size
        self.train = train
        if train:
            self.ds = MyLMDBSerializer.load(file_location, shuffle=shuffle, batch_from_disk=batch_from_disk)
            self.ds = MyLocallyShuffleData(self.ds, buffer_size=10000, shuffle_interval=500)
            self.ds = MultiProcessRunnerZMQ(self.ds, num_proc=1, hwm=10000)
            # Hard-coded dataset size (matches the ILSVRC12 training split).
            self.len_ = 1281167
        else:
            self.ds = LMDBSerializer.load(file_location, shuffle=False)
            self.ds = MultiProcessRunnerZMQ(self.ds, num_proc=1, hwm=10000)
            # Hard-coded dataset size (matches the ILSVRC12 validation split).
            self.len_ = 50000
        self.ds.reset_state()
        # A trailing partial batch still counts as one batch.
        self.batches_in_epoch = int(math.ceil(self.len_ / self.batch_size))

    def __iter__(self):
        """
        Start a new pass and return self.

        Evaluation restarts the underlying dataflow on every pass; training
        creates the underlying iterator once and keeps consuming it across
        passes.
        """
        if not self.train or not hasattr(self, 'iterator'):
            self.iterator = iter(self.ds)
        self.i = 0
        return self

    def __next__(self):
        """
        Return the next `(batch, labels)` pair.

        Raises:
            StopIteration: once `batches_in_epoch` batches have been produced
                in the current pass, or when the underlying iterator runs out.
        """
        if self.i >= self.batches_in_epoch:
            raise StopIteration

        batch = []
        labels = []
        for _ in range(self.batch_size):
            jpeg_bytes, label = next(self.iterator)
            batch.append(jpeg_bytes)
            # int32 rather than uint8: class indices above 255 (ImageNet has
            # 1000 classes) do not fit in 8 bits and would wrap or overflow.
            labels.append(np.array([label], dtype=np.int32))
        self.i += 1
        return batch, labels

    # Python 2 style alias for the iterator protocol.
    next = __next__
def get_train_dataflow(datadir, batch, augmentors=None):
    """
    Build the batched, augmented ILSVRC12 training dataflow.

    Sec 3, Remark 4:
    Use a single random shuffling of the training data (per epoch)
    that is divided amongst all k workers.

    NOTE: Here we do not follow the paper which makes some differences.
    Here, each machine shuffles independently.

    Args:
        datadir: dataset root passed to `dataset.ILSVRC12`.
        batch (int): batch size passed to `BatchData`.
        augmentors: augmentors to apply; defaults to `fbresnet_augmentor(True)`.

    Returns:
        The batched dataflow wrapped in `MultiProcessRunnerZMQ`.
    """
    if augmentors is None:
        augmentors = fbresnet_augmentor(True)

    num_proc = min(50, mp.cpu_count())

    images = dataset.ILSVRC12(datadir, 'train', shuffle=True)
    augmented = AugmentImageComponent(images, augmentors, copy=False)
    # Batches are formed before the multi-process wrapper is applied.
    batched = BatchData(augmented, batch, remainder=False)
    return MultiProcessRunnerZMQ(batched, num_proc)
def get_sequential_loader(ds, isTrain, batch_size, augmentors, parallel=None):
    """
    Load a Single-File LMDB (Sequential Read)

    Args:
        ds: source DataFlow whose datapoints are (encoded image buffer, label);
            the first component is decoded with `cv2.imdecode`.
        isTrain (bool): selects the training or the evaluation pipeline.
        batch_size (int): batch size passed to `BatchData`.
        augmentors (list[imgaug.Augmentor]): required; must be a list.
        parallel (int): process count passed to `MultiProcessRunnerZMQ`
            (training) or `MultiProcessMapDataZMQ` (evaluation).
            Defaults to half of the CPU count, capped at 40 and at least 1.

    Returns:
        A DataFlow which produces batches (as lists) of BGR images and labels.

    Raises:
        IOError: from the evaluation mapper, if an image buffer cannot be decoded.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/tutorial/efficient-dataflow.html
    """
    assert isinstance(augmentors, list)
    aug = imgaug.AugmentorList(augmentors)

    if parallel is None:
        # assuming hyperthreading; never go below 1 (cpu_count() // 2 is 0 on a single core)
        parallel = max(1, min(40, multiprocessing.cpu_count() // 2))

    if isTrain:
        ds = LocallyShuffleData(ds, 50000)
        ds = MapDataComponent(ds, lambda x: cv2.imdecode(x, cv2.IMREAD_COLOR), 0)
        ds = AugmentImageComponent(ds, aug, copy=False)
        if parallel < 16:
            logger.warning("DataFlow may become the bottleneck when too few processes are used.")
        ds = BatchData(ds, batch_size, remainder=False, use_list=True)
        ds = MultiProcessRunnerZMQ(ds, parallel)
    else:
        def mapper(data):
            im, label = data
            im = cv2.imdecode(im, cv2.IMREAD_COLOR)
            # cv2.imdecode reports failure by returning None instead of raising.
            if im is None:
                raise IOError("Failed to decode image buffer (label: {})".format(label))
            im = aug.augment(im)
            return im, label

        ds = MultiProcessMapDataZMQ(ds, parallel, mapper, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True, use_list=True)
    return ds
def get_imagenet_dataflow(datadir, name, batch_size, augmentors=None, parallel=None):
    """
    Build a batched ImageNet (ILSVRC12) DataFlow that reads image files and
    augments them in parallel workers.

    Args:
        datadir (str): dataset root passed to `dataset.ILSVRC12Files`. Must not be None.
        name (str): one of 'train', 'val' or 'test'. Only 'train' selects the
            training pipeline.
        batch_size (int): batch size passed to `BatchData`.
        augmentors (list[imgaug.Augmentor]): Defaults to `fbresnet_augmentor(isTrain)`
        parallel (int): degree of parallelism, passed to
            `MultiProcessMapDataZMQ` for training and to `MultiThreadMapData`
            otherwise. Defaults to half of the CPU count, capped at 40 and at least 1.

    Returns:
        A DataFlow which produces BGR images and labels.

    Raises:
        IOError: from the mapper, if an image file cannot be read.
    """
    assert name in ['train', 'val', 'test']
    isTrain = name == 'train'
    assert datadir is not None
    if augmentors is None:
        augmentors = fbresnet_augmentor(isTrain)
    assert isinstance(augmentors, list)
    augmentors = AugmentorList(augmentors)

    if parallel is None:
        # assuming hyperthreading; never go below 1 (cpu_count() // 2 is 0 on a single core)
        parallel = max(1, min(40, multiprocessing.cpu_count() // 2))

    def mapf(dp):
        fname, label = dp
        img = cv2.imread(fname)
        # cv2.imread reports failure by returning None instead of raising;
        # fail here so the error names the offending file.
        if img is None:
            raise IOError("Failed to read image file: {}".format(fname))
        img = augmentors.augment(img)
        return img, label

    if isTrain:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=True)
        ds = MultiProcessMapDataZMQ(ds, parallel, mapf, buffer_size=2000)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = MultiProcessRunnerZMQ(ds, 1)
    return ds
def dump_imdb(ds, output_path, parallel=None):
    """
    Create a Single-File LMDB from raw images.

    Args:
        ds: DataFlow yielding (filename, label) datapoints.
        output_path: destination passed to `LMDBSerializer.save`.
        parallel (int): number of threads used to read the files.
            Defaults to half of the CPU count, capped at 40 and at least 1.

    Each file is stored as its raw bytes in a uint8 array, without decoding.
    """
    if parallel is None:
        # assuming hyperthreading; never go below 1 (cpu_count() // 2 is 0 on a single core)
        parallel = max(1, min(40, multiprocessing.cpu_count() // 2))

    def mapf(dp):
        fname, label = dp
        with open(fname, 'rb') as f:
            raw = f.read()
        encoded = np.asarray(bytearray(raw), dtype='uint8')
        return encoded, label

    # Parallelism belongs in the reader threads. Running the dataflow in more
    # than one forked process would make every process iterate the dataset
    # independently, so the dump would contain duplicated and missing samples.
    ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
    ds = MultiProcessRunnerZMQ(ds, num_proc=1)
    LMDBSerializer.save(ds, output_path)
def getdata(path, batchsize, isTrain):
    """
    Load an LMDB-serialized dataflow from `path` and batch it.

    Args:
        path: location passed to `LMDBSerializer.load`.
        batchsize: batch size forwarded to `getbatch`.
        isTrain (bool): enables shuffling on load, is forwarded to `getbatch`,
            and when True wraps the result in a 2-process `MultiProcessRunnerZMQ`.

    Returns:
        The batched dataflow.
    """
    source = LMDBSerializer.load(path, shuffle=isTrain)
    batched = getbatch(source, batchsize, isTrain)
    if not isTrain:
        return batched
    return MultiProcessRunnerZMQ(batched, 2)
# --------------------
# Command-line options: source ImageNet folder, which split to dump, and the
# output path (the ".lmdb" suffix is appended automatically).
parser.add_argument('--imagenet_folder')
parser.add_argument('--val', action='store_true')
parser.add_argument('--train', action='store_true')
parser.add_argument('--lmdb_file', type=str)
args = parser.parse_args()

if args.val and args.train:
    # Abort here; merely printing a message would fall through and still
    # generate the validation file.
    parser.error("Train and Validation options are mutually exclusive! Choose only one.")

if args.val or args.train:
    # Both splits go through the same steps; only the split name differs.
    split, description = ('val', 'validation') if args.val else ('train', 'training')
    print(f"We are generating the lmdb file containing {description} images of imagenet.")
    print(f"The file will be saved at {args.lmdb_file}.lmdb")
    ds0 = BinaryILSVRC12(os.path.expanduser(args.imagenet_folder), split)
    ds1 = MultiProcessRunnerZMQ(ds0, num_proc=1)
    LMDBSerializer.save(ds1, f"{os.path.expanduser(args.lmdb_file)}.lmdb")