def get_imagenet_dataflow(datadir, name, batch_size, augmentors=None, parallel=None):
    """
    Build the ImageNet input DataFlow for the split `name`.

    Args:
        datadir: dataset directory handed to `dataset.ILSVRC12` /
            `dataset.ILSVRC12Files`; must not be None.
        name (str): one of 'train', 'val', 'test'.
        batch_size (int): batch size given to `BatchData`.
        augmentors (list[imgaug.Augmentor]): Defaults to `fbresnet_augmentor(isTrain)`
        parallel (int): number of prefetch processes (train) or mapper
            threads (val/test). Defaults to half the CPU count, capped at 40
            and never lower than 1.

    Returns: A DataFlow which produces BGR images and labels.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    isTrain = name == 'train'
    assert datadir is not None
    if augmentors is None:
        augmentors = fbresnet_augmentor(isTrain)
    assert isinstance(augmentors, list)
    if parallel is None:
        # Half the logical CPUs, assuming hyperthreading. On a single-CPU host
        # `cpu_count() // 2` is 0, which would leave the pipeline without any
        # worker, so clamp the default to at least one.
        parallel = max(1, min(40, multiprocessing.cpu_count() // 2))

    if isTrain:
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = PrefetchDataZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        # Evaluation: iterate over (filename, label) pairs in a fixed order and
        # decode + augment the images in worker threads.
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchDataZMQ(ds, 1)
    return ds
def get_moco_dataflow(datadir, batch_size, augmentors):
    """
    Dataflow for training MOCO.

    Args:
        datadir: dataset directory handed to `dataset.ILSVRC12Files`.
        batch_size (int): batch size given to `MultiProcessMapAndBatchDataZMQ`.
        augmentors: augmentors wrapped into an `imgaug.AugmentorList` and
            passed to `MoCoMapper`.

    Returns:
        The `MultiProcessMapAndBatchDataZMQ` dataflow built over the shuffled
        'train' file list.
    """
    aug_list = imgaug.AugmentorList(augmentors)
    num_workers = min(30, mp.cpu_count())  # tuned on a 40-CPU 80-core machine
    train_files = dataset.ILSVRC12Files(datadir, 'train', shuffle=True)
    return MultiProcessMapAndBatchDataZMQ(
        train_files,
        num_workers,
        MoCoMapper(aug_list),
        batch_size,
        buffer_size=5000,
    )
def get_imagenet_dataflow(datadir, is_train, batch_size, augmentors, parallel=None):
    """
    Build the ImageNet input DataFlow for training or validation.

    Args:
        datadir: dataset directory handed to `dataset.ILSVRC12` /
            `dataset.ILSVRC12Files`; must not be None.
        is_train (bool): if true, read the shuffled "train" split; otherwise
            read the "val" split in a fixed order.
        batch_size (int): batch size given to `BatchData`.
        augmentors (list): image augmentors to apply; must be a list.
        parallel (int): number of prefetch processes (train) or mapper
            threads (val). Defaults to half the CPU count, capped at 40 and
            never lower than 1.

    Returns:
        The batched DataFlow.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert datadir is not None
    assert isinstance(augmentors, list)
    if parallel is None:
        # Half the logical CPUs, assuming hyperthreading. On a single-CPU host
        # `cpu_count() // 2` is 0, which would leave the pipeline without any
        # worker, so clamp the default to at least one.
        parallel = max(1, min(40, multiprocessing.cpu_count() // 2))

    if is_train:
        ds = dataset.ILSVRC12(datadir, "train", shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logging.warning(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = PrefetchDataZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, "val", shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            # cv2.imread yields BGR channel order; reversing the last axis
            # turns it into RGB before augmentation.
            im = np.flip(im, axis=2)
            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchDataZMQ(ds, 1)
    return ds
def get_data(self, name, num_gpu):
    """
    Build the input pipeline for the split `name`, batched per GPU.

    Args:
        name (str): one of 'train', 'val', 'test'.
        num_gpu (int): number of GPUs; the per-GPU batch size is
            `self.batch_size // num_gpu`.

    Returns:
        The batched DataFlow, wrapped in `QueueInput` when `num_gpu == 1`.
    """
    gpu_batch = self.batch_size // num_gpu

    assert name in ['train', 'val', 'test']
    isTrain = name == 'train'

    augmentors = fbresnet_augmentor(isTrain)
    assert isinstance(augmentors, list)

    # Half the logical CPUs, assuming hyperthreading. On a single-CPU host
    # `cpu_count() // 2` is 0, which would leave the pipeline without any
    # worker, so clamp to at least one.
    parallel = max(1, min(40, multiprocessing.cpu_count() // 2))

    if isTrain:
        ds = dataset.ILSVRC12(self.datadir, name, shuffle=True, dir_structure='train')
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        ds = MultiProcessRunnerZMQ(ds, parallel)
        ds = BatchData(ds, gpu_batch, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(self.datadir, name, shuffle=False, dir_structure='train')
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, gpu_batch, remainder=True)
        ds = MultiProcessRunnerZMQ(ds, 1)

    if num_gpu == 1:
        ds = QueueInput(ds)
    return ds
def get_imagenet_dataflow(
        datadir, name, batch_size,
        augmentors, parallel=None):
    """
    Build the semi-supervised ImageNet DataFlow for the split `name`.

    For 'train', a labeled and an unlabeled `ilsvrcsemi.ILSVRC12` flow are
    augmented independently and combined with `JoinData`. For 'val'/'test',
    every file is decoded and augmented once and returned as
    `(im, cls, im, cls)` -- presumably so evaluation datapoints have the same
    number of components as the joined training ones (TODO confirm).

    Args:
        datadir: dataset directory; must not be None.
        name (str): one of 'train', 'val', 'test'.
        batch_size (int): batch size given to `BatchData`.
        augmentors (list): image augmentors to apply; must be a list.
        parallel (int): number of prefetch processes (train) or mapper
            threads (val/test). Defaults to 16.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)

    if parallel is None:
        parallel = 16  # i.e. min(40, 16): the default is a fixed worker count

    if name != 'train':
        files = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug_list = imgaug.AugmentorList(augmentors)

        def load_twice(datapoint):
            path, label = datapoint
            image = aug_list.augment(cv2.imread(path, cv2.IMREAD_COLOR))
            return image, label, image, label

        mapped = MultiThreadMapData(
            files, parallel, load_twice, buffer_size=2000, strict=True)
        batched = BatchData(mapped, batch_size, remainder=True)
        return PrefetchDataZMQ(batched, 1)

    # Training: create the labeled flow first, then the unlabeled one, and
    # only afterwards wrap each of them with the augmentors.
    sources = [
        ilsvrcsemi.ILSVRC12(datadir, name, shuffle=True, labeled=is_labeled)
        for is_labeled in (True, False)
    ]
    joined = JoinData([
        AugmentImageComponent(src, augmentors, copy=False) for src in sources
    ])
    if parallel < 16:
        logger.warn("DataFlow may become the bottleneck when too few processes are used.")
    prefetched = PrefetchDataZMQ(joined, parallel)
    return BatchData(prefetched, batch_size, remainder=False)
parser.add_argument('--data', help='imagenet data dir') parser.add_argument('--batch', default=512, type=int, help='total batch size') parser.add_argument('--load', required=True, help='file or directory to evaluate') parser.add_argument('--top-k', type=int, default=200, help='top-k in KNN') parser.add_argument('--v2', action='store_true', help='use mocov2') args = parser.parse_args() hvd.init() local_batch_size = args.batch // hvd.size() train_files = dataset.ILSVRC12Files(args.data, 'train', shuffle=True) train_files.reset_state() all_train_files = list(train_files) all_train_files = all_train_files[:len(all_train_files) // args.batch * args.batch] # truncate num_train_images = len(all_train_files) logger.info( f"Creating graph for KNN of {num_train_images} training images ...") local_train_files = [(idx, fname, label) for idx, (fname, label) in enumerate(all_train_files) if idx % hvd.size() == hvd.rank()] image_input = tf.placeholder(tf.uint8, [None, 224, 224, 3], "image") idx_input = tf.placeholder(tf.int64, [None], "image_idx") feat_buffer = tf.get_variable("feature_buffer",
parser.add_argument('--out_dir', type=str, default="/home/sherry/datasets/ilsvrc-lmdb111")
parser.add_argument('--procs', type=int, default=20)
args = parser.parse_args()

# Create the output directory if needed. Attempting the creation and
# tolerating "already a directory" avoids the check-then-create race that
# occurs when several conversions (e.g. different splits) start at once with
# the same --out_dir.
try:
    os.makedirs(args.out_dir)
except OSError:
    if not os.path.isdir(args.out_dir):
        raise

if args.dataset == "ILSVRC":
    # NOTE: a previous variant subclassed dataset.ILSVRC12Files to yield the
    # raw file bytes and saved them via MultiProcessRunnerZMQ +
    # LMDBSerializer.save; the file list is now handed to dump_imdb instead.
    ds = dataset.ILSVRC12Files(args.data_dir, args.split)
else:
    ds = BinaryFolder(args.data_dir, args.split, IMG_EXTENSIONS)

# The database is written to <out_dir>/<dataset>-<split>.lmdb.
output_path = os.path.join(args.out_dir, '{}-{}.lmdb'.format(args.dataset, args.split))
dump_imdb(ds, output_path, parallel=args.procs)