def get_rec_data_iterators(train_db_prefix, val_db_prefix, input_size,
                           batch_size, devices):
    """Create DALI classification iterators for the train and validation sets.

    One ``HybridRecPipe`` is instantiated per entry in ``devices``. Each
    pipeline receives its index (``0 .. len(devices) - 1``) and the total
    pipeline count. Only the first pipeline of each set is built here, so the
    reader named ``"Reader"`` can be asked for its epoch size.

    Args:
        train_db_prefix: Forwarded to ``HybridRecPipe`` for the training set.
        val_db_prefix: Forwarded to ``HybridRecPipe`` for the validation set;
            a falsy value disables the validation iterator.
        input_size: Forwarded to ``HybridRecPipe``.
        batch_size: Forwarded to ``HybridRecPipe``.
        devices: Sequence whose length determines the number of pipelines.

    Returns:
        Tuple ``(train_iter, val_iter)``; ``val_iter`` is ``None`` when
        ``val_db_prefix`` is falsy.
    """
    threads_per_pipe = 2
    shard_count = len(devices)

    def _pipes_for(db_prefix, training_flag):
        # One pipeline per device index; the index doubles as the shard slot.
        return [
            HybridRecPipe(db_prefix, training_flag, input_size, batch_size,
                          threads_per_pipe, index, shard_count)
            for index in range(shard_count)
        ]

    # Build the first train pipeline to read the epoch size from its reader.
    train_pipes = _pipes_for(train_db_prefix, True)
    train_pipes[0].build()
    train_epoch = train_pipes[0].epoch_size("Reader")
    print("Training pipeline epoch size: {}".format(train_epoch))
    dali_train_iter = DALIClassificationIterator(train_pipes, train_epoch)

    if not val_db_prefix:
        return dali_train_iter, None

    # Same procedure for the validation pipelines.
    val_pipes = _pipes_for(val_db_prefix, False)
    val_pipes[0].build()
    val_epoch = val_pipes[0].epoch_size("Reader")
    print("Validation pipeline epoch size: {}".format(val_epoch))
    dali_val_iter = DALIClassificationIterator(val_pipes, val_epoch)
    return dali_train_iter, dali_val_iter
def get_rec_iter(args, trainpipes, valpipes, data_paths, kv=None):
    """Wrap pre-constructed DALI pipelines in MXNet classification iterators.

    Args:
        args: Namespace providing ``dali_cache_size``, ``lazy_init_sanity``,
            ``data_train``, ``data_train_idx``, ``data_val``, ``data_val_idx``,
            ``num_examples`` and ``separ_val``.
        trainpipes: List of training pipelines; ``trainpipes[0]`` must expose
            ``epoch_size("Reader")``.
        valpipes: List of validation pipelines. Only accessed when
            ``args.data_val`` is truthy, so it may be ``None`` otherwise.
        data_paths: Mapping with the keys ``train_data_tmp``,
            ``train_idx_tmp``, ``val_data_tmp`` and ``val_idx_tmp``.
        kv: Optional key-value store forwarded to
            ``_get_rank_and_worker_count``.

    Returns:
        Tuple ``(train_iter, val_iter)``; ``val_iter`` is ``None`` when
        ``args.data_val`` is falsy.
    """
    (rank, num_workers) = _get_rank_and_worker_count(args, kv)

    # DALIClassificationIterator() performs the pipeline initialisation, so
    # the real data must be reachable at the temporary paths before the
    # iterators are created. NOTE(review): the original comment says the
    # caller is responsible for cleaning up the links created under /tmp.
    if args.dali_cache_size > 0 and args.lazy_init_sanity:
        link_to_tmp_file(args.data_train, data_paths["train_data_tmp"])
        link_to_tmp_file(args.data_train_idx, data_paths["train_idx_tmp"])
        link_to_tmp_file(args.data_val, data_paths["val_data_tmp"])
        link_to_tmp_file(args.data_val_idx, data_paths["val_idx_tmp"])

    dali_train_iter = DALIClassificationIterator(
        trainpipes, args.num_examples // num_workers)

    train_epoch_size = trainpipes[0].epoch_size("Reader")
    if args.num_examples < train_epoch_size:
        warnings.warn(
            "{} training examples will be used, although full training set contains {} examples"
            .format(args.num_examples, train_epoch_size))

    # Validation is optional: do not touch valpipes (which may be None)
    # unless a validation set was actually requested.
    if not args.data_val:
        return dali_train_iter, None

    worker_val_examples = valpipes[0].epoch_size("Reader")
    if not args.separ_val:
        # Split the validation set across workers; the first `remainder`
        # ranks take one extra example so nothing is dropped.
        worker_val_examples, remainder = divmod(worker_val_examples,
                                                num_workers)
        if rank < remainder:
            worker_val_examples += 1

    dali_val_iter = DALIClassificationIterator(
        valpipes, worker_val_examples, fill_last_batch=False)
    return dali_train_iter, dali_val_iter
def get_dali_iter(data_dir, batch_size, kv, image_shape, num_gpus):
    """Create DALI train/validation iterators, one pipeline per GPU.

    Args:
        data_dir: Passed to the pipelines as ``db_folder``.
        batch_size: Total batch size; every pipeline gets
            ``batch_size // num_gpus``.
        kv: Not used by this function.
        image_shape: Not used by this function.
        num_gpus: Number of pipelines to create (device ids
            ``0 .. num_gpus - 1``).

    Returns:
        Tuple ``(train_iter, val_iter, num_examples)`` where ``num_examples``
        is the fixed value 1281167.
    """
    num_examples = 1281167

    def _pipes(pipe_cls):
        # Two worker threads per pipeline, per-GPU batch derived per device.
        return [
            pipe_cls(batch_size=batch_size // num_gpus,
                     num_threads=2,
                     device_id=gpu,
                     num_gpus=num_gpus,
                     db_folder=data_dir)
            for gpu in range(num_gpus)
        ]

    trainpipes = _pipes(HybridTrainPipe)
    valpipes = _pipes(HybridValPipe)

    # Only the first pipeline of each set is built, to query the epoch size.
    trainpipes[0].build()
    valpipes[0].build()

    train_epoch = trainpipes[0].epoch_size("Reader")
    print("Training pipeline epoch size: {}".format(train_epoch))
    val_epoch = valpipes[0].epoch_size("Reader")
    print("Validation pipeline epoch size: {}".format(val_epoch))

    dali_train_iter = DALIClassificationIterator(trainpipes, train_epoch)
    dali_val_iter = DALIClassificationIterator(valpipes, val_epoch)
    return dali_train_iter, dali_val_iter, num_examples
def get_rec_iter(args, kv=None):
    """Build DALI train/validation pipelines and wrap them in MXNet iterators.

    Args:
        args: Namespace providing ``resize``, ``image_shape`` (comma-separated
            string), ``gpus`` (comma-separated string), ``batch_size``,
            ``dali_threads``, ``validation_dali_threads``, ``input_layout``,
            ``data_train``, ``data_train_idx``, ``data_val``,
            ``data_val_idx``, ``separ_val``, ``dtype``,
            ``dali_nvjpeg_memory_padding``, ``dali_prefetch_queue`` and
            ``num_examples``.
        kv: Optional key-value store; its ``rank`` and ``num_workers`` are
            used for sharding. Without it a single worker (rank 0) is assumed.

    Returns:
        Tuple ``(train_iter, val_iter)``; ``val_iter`` is ``None`` when
        ``args.data_val`` is falsy.
    """
    # Base length of the shorter edge; passed to the validation pipeline.
    resize = int(args.resize)
    # Final shape of the images fed to the network (channels first entry).
    target_shape = tuple(int(dim) for dim in args.image_shape.split(','))
    pad_output = target_shape[0] == 4
    # filter(None, ...) drops empty strings caused by stray commas.
    gpus = list(map(int, filter(None, args.gpus.split(','))))
    batch_size = args.batch_size // len(gpus)
    num_threads = args.dali_threads
    num_validation_threads = args.validation_dali_threads

    # The model's input layout is the output layout of the image pipeline.
    output_layout = types.NHWC if args.input_layout == 'NHWC' else types.NCHW

    rank = kv.rank if kv else 0
    nWrk = kv.num_workers if kv else 1

    trainpipes = [
        HybridTrainPipe(
            batch_size=batch_size,
            num_threads=num_threads,
            device_id=gpu_id,
            rec_path=args.data_train,
            idx_path=args.data_train_idx,
            shard_id=gpus.index(gpu_id) + len(gpus) * rank,
            num_shards=len(gpus) * nWrk,
            crop_shape=target_shape[1:],
            output_layout=output_layout,
            pad_output=pad_output,
            dtype=args.dtype,
            nvjpeg_padding=args.dali_nvjpeg_memory_padding * 1024 * 1024,
            prefetch_queue=args.dali_prefetch_queue)
        for gpu_id in gpus
    ]

    # With separ_val every pipeline reads the whole validation set (single
    # shard); otherwise it is sharded like the training set.
    valpipes = [
        HybridValPipe(
            batch_size=batch_size,
            num_threads=num_validation_threads,
            device_id=gpu_id,
            rec_path=args.data_val,
            idx_path=args.data_val_idx,
            shard_id=0 if args.separ_val else gpus.index(gpu_id) + len(gpus) * rank,
            num_shards=1 if args.separ_val else len(gpus) * nWrk,
            crop_shape=target_shape[1:],
            resize_shp=resize,
            output_layout=output_layout,
            pad_output=pad_output,
            dtype=args.dtype,
            nvjpeg_padding=args.dali_nvjpeg_memory_padding * 1024 * 1024,
            prefetch_queue=args.dali_prefetch_queue)
        for gpu_id in gpus
    ] if args.data_val else None

    trainpipes[0].build()
    if args.data_val:
        valpipes[0].build()

    train_epoch_size = trainpipes[0].epoch_size("Reader")
    if args.num_examples < train_epoch_size:
        warnings.warn(
            "{} training examples will be used, although full training set contains {} examples"
            .format(args.num_examples, train_epoch_size))
    dali_train_iter = DALIClassificationIterator(trainpipes,
                                                 args.num_examples // nWrk)

    if not args.data_val:
        return dali_train_iter, None

    worker_val_examples = valpipes[0].epoch_size("Reader")
    if not args.separ_val:
        # Plain floor division would silently drop the remainder; the first
        # `remainder` ranks take one extra example instead.
        worker_val_examples, remainder = divmod(worker_val_examples, nWrk)
        if rank < remainder:
            worker_val_examples += 1

    dali_val_iter = DALIClassificationIterator(valpipes,
                                               worker_val_examples,
                                               fill_last_batch=False)
    return dali_train_iter, dali_val_iter
def get_rec_iter(args, trainpipes, valpipes, cvalpipes, kv=None):
    """Wrap pre-constructed DALI pipelines in MXNet classification iterators.

    Args:
        args: Namespace providing ``num_examples``, ``no_augument_epoch``,
            ``num_epochs``, ``data_val`` and ``separ_val``.
        trainpipes: List of training pipelines.
        valpipes: List of validation pipelines. Only accessed when
            ``args.data_val`` is truthy, so it may be ``None`` otherwise.
        cvalpipes: List of pipelines for the third iterator; only used when
            ``args.no_augument_epoch < args.num_epochs``.
        kv: Optional key-value store providing ``rank`` and ``num_workers``.
            Without it a single worker (rank 0) is assumed.

    Returns:
        Tuple ``(train_iter, val_iter, cval_iter)``. ``val_iter`` is ``None``
        when ``args.data_val`` is falsy; ``cval_iter`` is ``None`` when
        ``args.no_augument_epoch >= args.num_epochs``.
    """
    rank = kv.rank if kv else 0
    nWrk = kv.num_workers if kv else 1

    dali_train_iter = DALIClassificationIterator(trainpipes,
                                                 args.num_examples // nWrk)
    if args.no_augument_epoch < args.num_epochs:
        dali_cval_iter = DALIClassificationIterator(cvalpipes,
                                                    args.num_examples // nWrk)
    else:
        dali_cval_iter = None

    train_epoch_size = trainpipes[0].epoch_size("Reader")
    mx_resnet_print(key=mlperf_log.INPUT_SIZE, val=train_epoch_size)
    mx_resnet_print(key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES,
                    val=train_epoch_size)

    if args.data_val:
        val_epoch_size = valpipes[0].epoch_size("Reader")
        mx_resnet_print(key=mlperf_log.EVAL_SIZE, val=val_epoch_size)
        mx_resnet_print(key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES,
                        val=val_epoch_size)

    if args.num_examples < train_epoch_size:
        warnings.warn(
            "{} training examples will be used, although full training set contains {} examples"
            .format(args.num_examples, train_epoch_size))

    # Validation is optional: do not touch valpipes (which may be None)
    # unless a validation set was actually requested.
    if not args.data_val:
        return dali_train_iter, None, dali_cval_iter

    worker_val_examples = valpipes[0].epoch_size("Reader")
    if not args.separ_val:
        # The first `remainder` ranks take one extra example so that the
        # whole validation set is covered.
        worker_val_examples, remainder = divmod(worker_val_examples, nWrk)
        if rank < remainder:
            worker_val_examples += 1

    dali_val_iter = DALIClassificationIterator(
        valpipes, worker_val_examples, fill_last_batch=False)
    return dali_train_iter, dali_val_iter, dali_cval_iter
def get_dali_dataloder(batch_size, ctx, opt):
    """Create DALI-based train and validation loaders from RecordIO files.

    Args:
        batch_size: Batch size handed to every pipeline.
        ctx: Sequence of contexts; one pipeline is created per entry and the
            entry's position is used as ``device_id``.
        opt: Options object providing ``rec_train``, ``rec_train_idx``,
            ``rec_val``, ``rec_val_idx`` (paths, ``~`` is expanded) and
            ``input_size``.

    Returns:
        Tuple ``(train_loader, val_loader)`` of ``DALIClassificationIterator``.
    """
    # Imported lazily so the module can be loaded without DALI installed.
    from nvidia.dali.plugin.mxnet import DALIClassificationIterator
    from lib.data.loader import HybridTrainPipe, HybridValPipe

    train_rec, train_idx, val_rec, val_idx = (
        os.path.expanduser(path)
        for path in (opt.rec_train, opt.rec_train_idx,
                     opt.rec_val, opt.rec_val_idx)
    )
    image_size = opt.input_size
    device_count = len(ctx)

    def _pipes(pipe_cls, rec_file, idx_file):
        return [
            pipe_cls(rec_path=rec_file,
                     index_path=idx_file,
                     batch_size=batch_size,
                     input_size=image_size,
                     num_gpus=device_count,
                     num_threads=32,
                     device_id=dev)
            for dev in range(device_count)
        ]

    trainpipes = _pipes(HybridTrainPipe, train_rec, train_idx)
    valpipes = _pipes(HybridValPipe, val_rec, val_idx)

    # Build the first pipeline of each set so the epoch size can be queried.
    trainpipes[0].build()
    valpipes[0].build()

    train_loader = DALIClassificationIterator(
        trainpipes, trainpipes[0].epoch_size("Reader"))
    val_loader = DALIClassificationIterator(
        valpipes, valpipes[0].epoch_size("Reader"))
    logging.info('dali dataloder was loaded.')
    return train_loader, val_loader
def get_rec_iter(args, kv=None, dali_cpu=False):
    """Build DALI train/validation pipelines and wrap them in MXNet iterators.

    The worker rank and count come from Horovod when ``'horovod'`` appears in
    ``args.kv_store``, otherwise from ``kv`` (rank 0 of 1 when ``kv`` is
    falsy). The per-pipeline batch is
    ``args.batch_size // workers // len(args.gpus)``.

    Args:
        args: Namespace with the DALI / dataset options read below.
        kv: Optional key-value store providing ``rank`` and ``num_workers``.
        dali_cpu: Forwarded to both pipeline classes.

    Returns:
        Tuple ``(train_iter, val_iter)``; ``val_iter`` is ``None`` when
        ``args.data_val`` is falsy.
    """
    gpus = args.gpus
    train_threads = args.dali_threads
    val_threads = args.dali_validation_threads
    pad_output = (args.image_shape[0] == 4)

    # The model's input layout is the output layout of the image pipeline.
    output_layout = types.NHWC if args.input_layout == 'NHWC' else types.NCHW

    if 'horovod' in args.kv_store:
        rank, nWrk = hvd.rank(), hvd.size()
    elif kv:
        rank, nWrk = kv.rank, kv.num_workers
    else:
        rank, nWrk = 0, 1

    per_device_batch = args.batch_size // nWrk // len(gpus)

    # Keyword arguments shared by the train and validation pipelines.
    shared = dict(
        args=args,
        batch_size=per_device_batch,
        crop_shape=args.image_shape[1:],
        output_layout=output_layout,
        dtype=args.dtype,
        pad_output=pad_output,
        dali_cpu=dali_cpu,
        nvjpeg_padding=args.dali_nvjpeg_memory_padding * 1024 * 1024,
        prefetch_queue=args.dali_prefetch_queue,
        nvjpeg_width_hint=args.dali_nvjpeg_width_hint,
        nvjpeg_height_hint=args.dali_nvjpeg_height_hint,
    )

    def _global_shard(gpu_id):
        # Position of this GPU among all GPUs of all workers.
        return gpus.index(gpu_id) + len(gpus) * rank

    trainpipes = [
        HybridTrainPipe(num_threads=train_threads,
                        device_id=gpu_id,
                        rec_path=args.data_train,
                        idx_path=args.data_train_idx,
                        shard_id=_global_shard(gpu_id),
                        num_shards=len(gpus) * nWrk,
                        **shared)
        for gpu_id in gpus
    ]

    valpipes = None
    if args.data_val:
        valpipes = [
            HybridValPipe(num_threads=val_threads,
                          device_id=gpu_id,
                          rec_path=args.data_val,
                          idx_path=args.data_val_idx,
                          shard_id=0 if args.dali_separ_val else _global_shard(gpu_id),
                          num_shards=1 if args.dali_separ_val else len(gpus) * nWrk,
                          resize_shp=args.data_val_resize,
                          **shared)
            for gpu_id in gpus
        ]

    trainpipes[0].build()

    worker_val_examples = None
    if valpipes is not None:
        valpipes[0].build()
        worker_val_examples = valpipes[0].epoch_size("Reader")
        if not args.dali_separ_val:
            # Low ranks absorb the remainder of the split across workers.
            worker_val_examples, leftover = divmod(worker_val_examples, nWrk)
            if rank < leftover:
                worker_val_examples += 1

    train_epoch = trainpipes[0].epoch_size("Reader")
    if args.num_examples < train_epoch:
        warnings.warn("{} training examples will be used, although full training set contains {} examples".format(args.num_examples, train_epoch))
    dali_train_iter = DALIClassificationIterator(trainpipes,
                                                 args.num_examples // nWrk)

    if valpipes is None:
        return dali_train_iter, None

    dali_val_iter = DALIClassificationIterator(valpipes,
                                               worker_val_examples,
                                               fill_last_batch=False)
    return dali_train_iter, dali_val_iter
def get(data_shape, label_shape, labels_range, args, kv_store=None):
    """Create a data iterator for the benchmark.

    The source of the data is selected in this order:

    1. Synthetic data when ``args.data_dir`` is ``None`` or empty.
    2. An in-memory NDArray iterator when ``args.data_dir`` contains exactly
       one file and that file has the ``.npz`` extension.
    3. A DALI iterator when ``train.rec`` / ``train.idx`` exist and
       ``args.use_dali`` is ``True``.
    4. A standard MXNet ``ImageRecordIter`` when ``train.rec`` / ``train.idx``
       exist and DALI is not requested.

    Args:
        data_shape (tuple): Shape of input data tensor (X) including batch
            size. The batch size is the 0th dimension and must be the
            effective batch for a whole node
            (``replica_batch * num_devices``).
        label_shape (tuple): Shape of input label tensor (Y) including batch
            size (0th dimension, same convention as ``data_shape``).
        labels_range (list): List of output labels. For ImageNet, that would
            be a list with integers from 0 to 999.
        args (argparse.Namespace): Command line arguments.
        kv_store (mxnet.kvstore.KVStore): An object returned by
            ``mx.kvstore.create('...')``; used for rank / worker count when
            Horovod is not in use.

    Returns:
        Data iterator (instance of mx.io.DataIter).

    Raises:
        ValueError: If no supported data set is found, a required library is
            missing, the ``.npz`` file lacks a ``data`` field, or the
            requested layout is unsupported by the selected iterator.
    """
    logging.info("Creating data iterator: data_shape=%s, label_shape=%s.",
                 data_shape, label_shape)
    total_batches = args.num_warmup_batches + args.num_batches

    # 1. Synthetic Iterator ---------------------------------------------------
    if args.data_dir is None or args.data_dir == "":
        logging.info(
            "Creating synthetic data iterator with data shape = %s.",
            data_shape)
        return mx.io.ResizeIter(
            SyntheticDataIterator(data_shape, label_shape, labels_range,
                                  args.dtype), total_batches)

    # 2. Numpy Array Iterator -------------------------------------------------
    fnames = [
        f for f in os.listdir(args.data_dir)
        if os.path.isfile(os.path.join(args.data_dir, f))
    ]
    if len(fnames) == 1 and fnames[0].endswith('.npz'):
        npz_path = os.path.join(args.data_dir, fnames[0])
        dataset = np.load(npz_path)
        data, labels = dataset.get('data', None), dataset.get('labels', None)
        if data is None:
            raise ValueError("The dataset ({}) does not contain 'data' "
                             "field.".format(npz_path))
        # 'labels' is optional, so it may be None here; guard the .shape
        # access so that logging does not crash on label-less data sets.
        logging.info("Creating NDArray iterator: data=%s, labels=%s",
                     data.shape,
                     labels.shape if labels is not None else None)
        nd_arr_iter = mx.io.NDArrayIter(data=data,
                                        label=labels,
                                        batch_size=data_shape[0],
                                        shuffle=False,
                                        last_batch_handle='discard')
        return mx.io.ResizeIter(nd_arr_iter, total_batches)

    # 3. DALI Iterator --------------------------------------------------------
    if 'horovod' in args.kv_store:
        if not hvd:
            raise ValueError("Horovod library not found")
        rank, nworker = hvd.rank(), hvd.size()
    else:
        rank, nworker = (kv_store.rank,
                         kv_store.num_workers) if kv_store else (0, 1)

    dataset_files = [
        os.path.join(args.data_dir, 'train.rec'),
        os.path.join(args.data_dir, 'train.idx')
    ]
    if os.path.exists(dataset_files[0]) and os.path.exists(dataset_files[1]):
        if args.use_dali is True:
            # https://docs.nvidia.com/deeplearning/sdk/dali-developer-guide/docs/examples/mxnet/mxnet-resnet50.html
            if dali is None:
                raise ValueError("DALI library not found (use_dali is true).")
            if len(args.gpus) == 0:
                raise ValueError(
                    "DALI can only be used with GPU devices (gpus={})".format(
                        args.gpus))
            logging.info("Creating DALI iterator")

            is_nhwc = args.input_layout == 'NHWC'
            output_layout = dali.types.NHWC if is_nhwc else dali.types.NCHW
            cropshape = (data_shape[1], data_shape[2]) if is_nhwc else (
                data_shape[2], data_shape[3])
            channel_idx = 3 if is_nhwc else 1

            trainpipes = [
                HybridTrainPipe(
                    batch_size=data_shape[0] // len(args.gpus),  # Replica batch.
                    num_threads=3,  # Per GPU
                    device_id=gpu_id,
                    rec_path=dataset_files[0],
                    idx_path=dataset_files[1],
                    shard_id=args.gpus.index(gpu_id) + len(args.gpus) * rank,
                    num_shards=len(args.gpus) * nworker,
                    crop_shape=cropshape,
                    output_layout=output_layout,
                    pad_output=data_shape[channel_idx] == 4,
                    dtype=args.dtype,
                    nvjpeg_padding=16 * 1024 * 1024,
                    prefetch_queue=3) for gpu_id in args.gpus
            ]
            trainpipes[0].build()
            # The epoch is sized to exactly the number of benchmark batches
            # rather than the reader's real epoch size.
            epoch_size = data_shape[0] * total_batches
            return DALIClassificationIterator(
                trainpipes,  # List of pipelines to use
                epoch_size,  # Epoch size.
                'data',  # Data name for provided symbols.
                'softmax_label',  # Label name for provided symbols.
                args.input_layout  # Layout of the pipeline outputs (NCHW / NHWC).
            )

        # 4. MXNET Image Record Iterator --------------------------------------
        # https://mxnet.incubator.apache.org/api/python/io.html#mxnet.io.imagerecorditer
        # https://github.com/apache/incubator-mxnet/blob/master/example/image-classification/common/data.py
        # This iterator supports channels first format only.
        if args.input_layout != 'NCHW':
            raise ValueError(
                "Standard mxnet image record iterator only supports channel first format (NCHW), "
                "requested format: {}.".format(args.input_layout))
        logging.info(
            "Creating standard image record iterator (ImageRecordIter) with data layout = %s.",
            args.input_layout)
        num_preprocess_threads = args.preprocess_threads
        if num_preprocess_threads <= 4:
            logging.warning(
                "[Number of pre-process threads is %d. This may be too small for large number of GPUs. "
                "If you do not see speedup as you add more GPUs, increase this number.",
                num_preprocess_threads)
        img_rec_iter = mx.io.ImageRecordIter(
            path_imgrec=dataset_files[0],
            path_imgidx=dataset_files[1],
            data_name='data',
            label_name='softmax_label',
            data_shape=(data_shape[1], data_shape[2], data_shape[3]),
            batch_size=data_shape[0],
            rand_crop=True,
            rand_mirror=True,
            preprocess_threads=num_preprocess_threads,
            prefetch_buffer=args.prefetch_buffer,
            dtype='float32',
            num_parts=nworker,
            part_index=rank)
        return mx.io.ResizeIter(img_rec_iter, total_batches)

    # 5. All Failed -----------------------------------------------------------
    raise ValueError(
        "Cannot find data set files. MXNET benchmark backend supports the following data sets:\n"
        " 1. Synthetic data set. It is used when data_dir parameter is none or empty:\n"
        " -Pexp.data_dir='\"\"'\n"
        " 2. Real data set in a file with 'npz' extension. This data set is used if data_dir value\n"
        " is a valid directory and contains one file with npz extension. If found, this file\n"
        " must contain a dictionary with at least one key - `data`. It can also contain 'labels'\n"
        " key for labels.\n"
        " 3. Real image data set in standard RecordIO format. This data set is used if provided data directory\n"
        " contains 'train.rec' and 'train.idx' files.'")
def get_rec_iter(args, kv=None):
    """Build DALI pipelines, log MLPerf sizes and return MXNet iterators.

    Args:
        args: Namespace providing ``resize``, ``image_shape`` and ``gpus``
            (comma-separated strings), ``batch_size``, ``dali_threads``,
            ``input_layout``, the ``data_train*`` / ``data_val*`` paths,
            ``separ_val``, the ``*_random_area`` /
            ``*_random_aspect_ratio`` bounds, ``dali_nvjpeg_memory_padding``,
            ``dali_prefetch_queue``, ``seed``, ``dtype`` and
            ``num_examples``.
        kv: Optional key-value store providing ``rank`` and ``num_workers``.
            Without it a single worker (rank 0) is assumed.

    Returns:
        Tuple ``(train_iter, val_iter)``; ``val_iter`` is ``None`` when
        ``args.data_val`` is falsy.
    """
    # Base length of the shorter edge; passed to the validation pipeline.
    resize = int(args.resize)
    # Final shape of the images fed to the network.
    target_shape = tuple(int(dim) for dim in args.image_shape.split(','))
    pad_output = target_shape[0] == 4
    # filter(None, ...) drops empty strings caused by stray commas.
    gpus = list(map(int, filter(None, args.gpus.split(','))))
    batch_size = args.batch_size // len(gpus)

    mx_resnet_print(
        key=mlperf_log.INPUT_BATCH_SIZE,
        val=batch_size)  # TODO MPI WORLD SIZE

    num_threads = args.dali_threads

    # The model's input layout is the output layout of the image pipeline.
    output_layout = types.NHWC if args.input_layout == 'NHWC' else types.NCHW

    rank = kv.rank if kv else 0
    nWrk = kv.num_workers if kv else 1

    trainpipes = [
        HybridTrainPipe(
            batch_size=batch_size,
            num_threads=num_threads,
            device_id=gpu_id,
            rec_path=args.data_train,
            idx_path=args.data_train_idx,
            shard_id=gpus.index(gpu_id) + len(gpus) * rank,
            num_shards=len(gpus) * nWrk,
            crop_shape=target_shape[1:],
            min_random_area=args.min_random_area,
            max_random_area=args.max_random_area,
            min_random_aspect_ratio=args.min_random_aspect_ratio,
            max_random_aspect_ratio=args.max_random_aspect_ratio,
            nvjpeg_padding=args.dali_nvjpeg_memory_padding * 1024 * 1024,
            prefetch_queue=args.dali_prefetch_queue,
            seed=args.seed,
            output_layout=output_layout,
            pad_output=pad_output,
            dtype=args.dtype,
            # Only the pipeline of the first GPU emits MLPerf log lines.
            mlperf_print=gpu_id == gpus[0])
        for gpu_id in gpus
    ]

    valpipes = [
        HybridValPipe(
            batch_size=batch_size,
            num_threads=num_threads,
            device_id=gpu_id,
            rec_path=args.data_val,
            idx_path=args.data_val_idx,
            shard_id=0 if args.separ_val else gpus.index(gpu_id) + len(gpus) * rank,
            num_shards=1 if args.separ_val else len(gpus) * nWrk,
            crop_shape=target_shape[1:],
            nvjpeg_padding=args.dali_nvjpeg_memory_padding * 1024 * 1024,
            prefetch_queue=args.dali_prefetch_queue,
            seed=args.seed,
            resize_shp=resize,
            output_layout=output_layout,
            pad_output=pad_output,
            dtype=args.dtype,
            mlperf_print=gpu_id == gpus[0])
        for gpu_id in gpus
    ] if args.data_val else None

    trainpipes[0].build()
    if args.data_val:
        valpipes[0].build()

    train_epoch_size = trainpipes[0].epoch_size("Reader")
    mx_resnet_print(key=mlperf_log.INPUT_SIZE, val=train_epoch_size)
    mx_resnet_print(key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES,
                    val=train_epoch_size)

    if args.data_val:
        val_epoch_size = valpipes[0].epoch_size("Reader")
        mx_resnet_print(key=mlperf_log.EVAL_SIZE, val=val_epoch_size)
        mx_resnet_print(key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES,
                        val=val_epoch_size)

    if args.num_examples < train_epoch_size:
        warnings.warn(
            "{} training examples will be used, although full training set contains {} examples"
            .format(args.num_examples, train_epoch_size))
    dali_train_iter = DALIClassificationIterator(trainpipes,
                                                 args.num_examples // nWrk)

    # valpipes is None without a validation set; indexing it would raise a
    # TypeError, so return before the per-worker size computation.
    if not args.data_val:
        return dali_train_iter, None

    worker_val_examples = valpipes[0].epoch_size("Reader")
    if not args.separ_val:
        # The first `remainder` ranks take one extra example so that the
        # whole validation set is covered.
        worker_val_examples, remainder = divmod(worker_val_examples, nWrk)
        if rank < remainder:
            worker_val_examples += 1

    dali_val_iter = DALIClassificationIterator(valpipes,
                                               worker_val_examples,
                                               fill_last_batch=False)
    return dali_train_iter, dali_val_iter
def inst_iterators(data_train,
                   data_dev,
                   batch_size=1,
                   data_shape=(3, 224, 224),
                   resize=(-1, -1),
                   resize_scale=(1, 1),
                   resize_area=(1, 1),
                   use_svm_label=False,
                   use_dali=False):
    '''
    Instantiate specified training and developing data iterators.

    The loader type is selected by ``cfg.TRAIN.USE_DALI`` and
    ``cfg.TRAIN.USE_REC`` (the ``use_dali`` argument is accepted but not
    consulted):

        USE_DALI=False, USE_REC=True   -> mx.io.ImageRecordIter
        USE_DALI=False, USE_REC=False  -> mx.image.ImageIter
        USE_DALI=True,  USE_REC=True   -> DALIClassificationIterator

    :params:
        data_train      training rec/lst
        data_dev        developing rec/lst
        batch_size      mini batch size, sum of all device
        data_shape      input shape
        resize          resize shorter edge of (train,dev) data, -1 means no resize
        resize_scale    resize train-data into (width*s, height*s), with s randomly chosen from this range
        resize_area     Change the area (namely width * height) to a random value in
                        [min_random_area, max_random_area]. Ignored if random_resized_crop is False
        use_svm_label   set as True if classifier needs svm label name
        use_dali        set as True if nvidia dali is supposed to be used
    :return:
        train, dev      tuple of 2 iterators
    :raises:
        AssertionError  if the data paths are empty or mean/std do not have 3 items
        ValueError      if the USE_DALI / USE_REC combination is not supported
    '''
    # Explicit checks instead of `assert cond, logging.error(...)`: they keep
    # the AssertionError type, survive `python -O`, and carry a real message.
    if not (data_train and data_dev):
        message = "Please input training or developing data"
        logging.error(message)
        raise AssertionError(message)
    mean, std = cfg.TRAIN.MEAN_RGB, cfg.TRAIN.STD_RGB
    if not (len(mean) == 3 and len(std) == 3):
        message = "Mean or Std should be a list of 3 items"
        logging.error(message)
        raise AssertionError(message)

    mean_r, mean_g, mean_b, std_r, std_g, std_b = mean[:] + std[:]
    min_random_scale, max_random_scale = resize_scale
    min_random_area, max_random_area = resize_area
    min_aspect_ratio = cfg.TRAIN.MIN_ASPECT_RATIO if cfg.TRAIN.MIN_ASPECT_RATIO else None
    logging.info('Input normalization : Mean-RGB {}, Std-RGB {}'.format(
        [mean_r, mean_g, mean_b], [std_r, std_g, std_b]))
    logging.info(
        'Input scale augmentation : Max-random-sclae {}, Min-random-scale {}'.
        format(max_random_scale, min_random_scale))
    logging.info(
        'Input area augmentation : Max-random-area {}, Min-random-area {}'.
        format(max_random_area, min_random_area))
    resize_train, resize_dev = resize
    label_name = 'softmax_label' if not use_svm_label else 'svm_label'

    # build iterators
    if not cfg.TRAIN.USE_DALI and cfg.TRAIN.USE_REC:
        logging.info("Creating recordio iterators")
        train = mx.io.ImageRecordIter(
            dtype=cfg.TRAIN.DATA_TYPE,
            path_imgrec=data_train,
            preprocess_threads=cfg.TRAIN.PROCESS_THREAD,
            data_name='data',
            label_name=label_name,
            label_width=cfg.TRAIN.LABEL_WIDTH,
            data_shape=data_shape,
            batch_size=batch_size,
            resize=resize_train,
            max_random_scale=max_random_scale,
            min_random_scale=min_random_scale,
            shuffle=cfg.TRAIN.SHUFFLE,
            rand_crop=cfg.TRAIN.RAND_CROP,
            rand_mirror=cfg.TRAIN.RAND_MIRROR,
            max_rotate_angle=cfg.TRAIN.MAX_ROTATE_ANGLE,
            max_aspect_ratio=cfg.TRAIN.MAX_ASPECT_RATIO,
            min_aspect_ratio=min_aspect_ratio,
            random_resized_crop=cfg.TRAIN.RANDOM_RESIZED_CROP,
            max_random_area=max_random_area,
            min_random_area=min_random_area,
            max_img_size=cfg.TRAIN.MAX_IMG_SIZE,
            min_img_size=cfg.TRAIN.MIN_IMG_SIZE,
            max_shear_ratio=cfg.TRAIN.MAX_SHEAR_RATIO,
            brightness=cfg.TRAIN.BRIGHTNESS_JITTER,
            contrast=cfg.TRAIN.CONTRAST_JITTER,
            saturation=cfg.TRAIN.SATURATION_JITTER,
            hue=cfg.TRAIN.HUE_JITTER,
            pca_noise=cfg.TRAIN.PCA_NOISE,
            random_h=cfg.TRAIN.RANDOM_H,
            random_s=cfg.TRAIN.RANDOM_S,
            random_l=cfg.TRAIN.RANDOM_L,
            mean_r=mean_r,
            mean_g=mean_g,
            mean_b=mean_b,
            std_r=std_r,
            std_g=std_g,
            std_b=std_b,
            inter_method=cfg.TRAIN.INTERPOLATION_METHOD)
        dev = mx.io.ImageRecordIter(
            dtype=cfg.TRAIN.DATA_TYPE,
            path_imgrec=data_dev,
            preprocess_threads=cfg.TRAIN.PROCESS_THREAD,
            data_name='data',
            label_name=label_name,
            label_width=cfg.TRAIN.LABEL_WIDTH,
            batch_size=batch_size,
            data_shape=data_shape,
            resize=resize_dev,
            shuffle=False,
            rand_crop=False,  # center crop
            rand_mirror=False,
            mean_r=mean_r,
            mean_g=mean_g,
            mean_b=mean_b,
            std_r=std_r,
            std_g=std_g,
            std_b=std_b,
            inter_method=cfg.TRAIN.INTERPOLATION_METHOD)

    elif not cfg.TRAIN.USE_DALI and not cfg.TRAIN.USE_REC:
        logging.info("Creating image iterators")
        # set decoding thread number
        os.environ['MXNET_CPU_WORKER_NTHREADS'] = str(cfg.TRAIN.PROCESS_THREAD)
        # rand_crop and rand_resize are left at their defaults here; the
        # crop augmenter is appended separately below.
        aug_list_train = mx.image.CreateAugmenter(
            data_shape=data_shape,
            resize=resize_train,
            rand_mirror=cfg.TRAIN.RAND_MIRROR,
            mean=np.asarray(mean),
            std=np.asarray(std),
            brightness=cfg.TRAIN.BRIGHTNESS_JITTER,
            contrast=cfg.TRAIN.CONTRAST_JITTER,
            saturation=cfg.TRAIN.SATURATION_JITTER,
            hue=cfg.TRAIN.HUE_JITTER,
            pca_noise=cfg.TRAIN.PCA_NOISE,
            inter_method=cfg.TRAIN.INTERPOLATION_METHOD)
        if cfg.TRAIN.RAND_CROP and min_random_scale != 1:
            aug_list_train.append(
                mx.image.RandomSizedCropAug(
                    (data_shape[2], data_shape[1]), min_random_scale**2,
                    (1 - cfg.TRAIN.MAX_ASPECT_RATIO,
                     1 + cfg.TRAIN.MAX_ASPECT_RATIO),
                    cfg.TRAIN.INTERPOLATION_METHOD))
        elif cfg.TRAIN.RAND_CROP:
            aug_list_train.append(
                mx.image.RandomCropAug((data_shape[2], data_shape[1]),
                                       cfg.TRAIN.INTERPOLATION_METHOD))

        # rand_crop and rand_resize stay at their defaults to use center-crop
        aug_list_dev = mx.image.CreateAugmenter(
            data_shape=data_shape,
            resize=resize_dev,
            mean=np.asarray(mean),
            std=np.asarray(std),
            inter_method=cfg.TRAIN.INTERPOLATION_METHOD)

        try:
            train = mx.image.ImageIter(
                dtype=cfg.TRAIN.DATA_TYPE,
                path_imglist=data_train,
                data_name='data',
                label_name=label_name,
                label_width=cfg.TRAIN.LABEL_WIDTH,
                data_shape=data_shape,
                batch_size=batch_size,
                path_root=cfg.TRAIN.TRAIN_IMG_PREFIX,
                shuffle=cfg.TRAIN.SHUFFLE,
                last_batch_handle=cfg.TRAIN.LAST_BATCH_HANDLE,
                aug_list=aug_list_train)
            dev = mx.image.ImageIter(
                dtype=cfg.TRAIN.DATA_TYPE,
                path_imglist=data_dev,
                data_name='data',
                label_name=label_name,
                label_width=cfg.TRAIN.LABEL_WIDTH,
                data_shape=data_shape,
                batch_size=batch_size,
                path_root=cfg.TRAIN.DEV_IMG_PREFIX,
                shuffle=cfg.TRAIN.SHUFFLE,
                last_batch_handle=cfg.TRAIN.LAST_BATCH_HANDLE,
                aug_list=aug_list_dev)
        except Exception:
            # Swallowing the error here only postponed the failure to an
            # UnboundLocalError at the return statement; log the real cause
            # and propagate it instead.
            logging.exception("Failed to create image iterators")
            raise

    elif cfg.TRAIN.USE_DALI and cfg.TRAIN.USE_REC:
        from dali_util import HybridTrainPipe, HybridValPipe
        from nvidia.dali.plugin.mxnet import DALIClassificationIterator
        num_gpus = len(cfg.TRAIN.GPU_IDX)
        # Integer division: true division would turn the per-GPU batch size
        # into a float on Python 3.
        batch_size //= num_gpus
        train_pipes = [
            HybridTrainPipe(batch_size=batch_size,
                            num_threads=cfg.TRAIN.PROCESS_THREAD,
                            device_id=i,
                            num_gpus=num_gpus) for i in range(num_gpus)
        ]
        dev_pipes = [
            HybridValPipe(batch_size=batch_size,
                          num_threads=cfg.TRAIN.PROCESS_THREAD,
                          device_id=i,
                          num_gpus=num_gpus) for i in range(num_gpus)
        ]
        # Build the first pipeline of each set to query the epoch size.
        train_pipes[0].build()
        dev_pipes[0].build()
        train = DALIClassificationIterator(train_pipes,
                                           train_pipes[0].epoch_size("Reader"))
        dev = DALIClassificationIterator(dev_pipes,
                                         dev_pipes[0].epoch_size("Reader"))

    else:
        # USE_DALI without USE_REC: no iterator can be created. Fail with a
        # clear error instead of reporting success and then hitting an
        # UnboundLocalError on return.
        logging.error('Invalid data loader type')
        raise ValueError('Invalid data loader type')

    logging.info("Data iters created successfully")
    return train, dev
train_params.update(loss.params) trainer = gluon.Trainer(train_params, 'sgd', { 'learning_rate': lr, 'momentum': momentum, 'wd': wd }) lr_counter = 0 logger.info([lamda, r_init, lr_steps, lr, momentum, wd, batch_size]) it, epoch = 0, 0 loss_mtc, acc_mtc = mx.metric.Loss(), mx.metric.Accuracy() tic = time.time() btic = time.time() dali_iter = DALIClassificationIterator(train_pipes, size) while it < iters + 1: if it == lr_steps[lr_counter]: trainer.set_learning_rate(trainer.learning_rate * 0.1) lr_counter += 1 for batches in tqdm(dali_iter): datas, labels = split_and_load(batches, num_gpu) with ag.record(): ots = [net(X) for X in datas] embedds = [ot[0] for ot in ots] outputs = [ot[1] for ot in ots] losses = [ loss(yhat, y, emb)
# 多卡测试,速度和单卡一样,也是18000samples/s,可能主要卡在 SSD 读取速度上了,2080Ti GPU占用20%左右 # 测试 HHD 8000 samples/s, SSD 18000 samples/s # trainpipes = [HybridTrainPipe(path_imgidx, path_imgrec, batch_size=batch_size, num_threads=6, device_id = i, num_gpus = N) for i in range(N)] # htp = trainpipes[0] # 单卡测试 htp = HybridTrainPipe(path_imgrec, batch_size, 6, device_id=0, num_gpus=N, initial_fill=batch_size) trainpipes = [htp] htp.build() print("Training pipeline epoch size: {}".format(htp.epoch_size("Reader"))) dali_train_iter = DALIClassificationIterator(trainpipes, htp.epoch_size("Reader")) print([dali_train_iter.provide_data[0][:2]], [dali_train_iter.provide_label[0][:2]]) import time time_start = time.time() batch_num = 0 while True: batch = dali_train_iter.next() batch_num += 1 # # print("batch num:", len(batch)) # # # print("batch:", batch[0].asnumpy()) # # print("elem num:", len(batch[0].data)) # # print("image num:", batch[0].data[0].shape) # # print("label num:", batch[0].label[0].shape) # 查看图像结果 # for image, label in zip(batch[0].data[0], batch[0].label[0]):
def get_rec_iter(args, kv=None, batch_fn=None, dali_cpu=False):
    """Build DALI train/validation pipelines and wrap them in MXNet iterators.

    The worker rank and count come from Horovod when ``'horovod'`` appears in
    ``args.kv_store``, otherwise from ``kv`` (rank 0 of 1 when ``kv`` is
    falsy). When ``dali_cpu`` is true a single device id ``0`` is used
    instead of ``args.gpus``.

    Args:
        args: Namespace with the DALI / dataset options read below.
        kv: Optional key-value store providing ``rank`` and ``num_workers``.
        batch_fn: Returned unchanged as the third element of the result.
        dali_cpu: Forwarded to both pipeline classes; also selects the
            device list as described above.

    Returns:
        Tuple ``(train_iter, val_iter, batch_fn)``; ``val_iter`` is ``None``
        when ``args.rec_val`` is falsy.
    """
    devices = [0] if dali_cpu else args.gpus
    num_devices = len(devices)
    pad_output = (args.image_shape[0] == 4)

    # The model's input layout is the output layout of the image pipeline.
    output_layout = types.NHWC if args.input_layout == 'NHWC' else types.NCHW

    if 'horovod' in args.kv_store:
        rank, nWrk = hvd.rank(), hvd.size()
    elif kv:
        rank, nWrk = kv.rank, kv.num_workers
    else:
        rank, nWrk = 0, 1

    batch_size = args.batch_size // nWrk * num_devices
    total_shards = num_devices * nWrk

    def _global_shard(dev_id):
        # Position of this device among all devices of all workers.
        return devices.index(dev_id) + num_devices * rank

    trainpipes = []
    for dev_id in devices:
        trainpipes.append(
            HybridTrainPipe(
                args=args,
                batch_size=batch_size,
                num_threads=args.dali_threads,
                device_id=dev_id,
                rec_path=args.rec_train,
                idx_path=args.rec_train_idx,
                shard_id=_global_shard(dev_id),
                num_shards=total_shards,
                crop_shape=args.image_shape[1:],
                output_layout=output_layout,
                dtype=args.dtype,
                pad_output=pad_output,
                dali_cpu=dali_cpu,
                nvjpeg_padding=args.dali_nvjpeg_memory_padding * 1024 * 1024,
                prefetch_queue=args.dali_prefetch_queue))

    trainpipes[0].build()
    full_train_size = trainpipes[0].epoch_size("Reader")
    if args.num_examples < full_train_size:
        warnings.warn(
            "{} training examples will be used, although full training set contains {} examples"
            .format(args.num_examples, full_train_size))

    dali_train_iter = DALIClassificationIterator(trainpipes,
                                                 args.num_examples // nWrk)

    if not args.rec_val:
        return dali_train_iter, None, batch_fn

    valpipes = []
    for dev_id in devices:
        valpipes.append(
            HybridValPipe(
                args=args,
                batch_size=batch_size,
                num_threads=args.dali_validation_threads,
                device_id=dev_id,
                rec_path=args.rec_val,
                idx_path=args.rec_val_idx,
                shard_id=0 if args.dali_separ_val else _global_shard(dev_id),
                num_shards=1 if args.dali_separ_val else total_shards,
                crop_shape=args.image_shape[1:],
                resize_shp=args.data_val_resize,
                output_layout=output_layout,
                dtype=args.dtype,
                pad_output=pad_output,
                dali_cpu=dali_cpu,
                nvjpeg_padding=args.dali_nvjpeg_memory_padding * 1024 * 1024,
                prefetch_queue=args.dali_prefetch_queue))

    valpipes[0].build()
    worker_val_examples = valpipes[0].epoch_size("Reader")
    if not args.dali_separ_val:
        # Low ranks absorb the remainder of the split across workers.
        share, leftover = divmod(worker_val_examples, nWrk)
        worker_val_examples = share + (1 if rank < leftover else 0)

    dali_val_iter = DALIClassificationIterator(valpipes, worker_val_examples)
    return dali_train_iter, dali_val_iter, batch_fn
def train_net(args):
    """Train the embedding network with ``ParallModule`` on a DALI iterator.

    The function derives the device list from ``CUDA_VISIBLE_DEVICES``,
    fills in batch-size and class-partition fields on ``args``/``config``,
    builds the embedding symbol (optionally from a pretrained checkpoint)
    and calls ``model.fit`` with a batch-end callback. That callback decays
    the learning rate at the steps listed in ``args.lr_steps``, evaluates
    the verification sets every ``args.verbose`` batches and saves
    checkpoints according to ``args.ckpt``.

    Args:
        args: Parsed command-line options. Derived fields (``ctx_num``,
            ``per_batch_size``, ``batch_size``, ``rescale_threshold``,
            ``image_channel``, ``ctx_num_classes``, ``local_num_classes``,
            ``local_class_start``) are written back onto this object.

    Returns:
        None. When ``config.max_steps`` is positive the batch callback ends
        the process with ``sys.exit(0)`` once more than that many batches
        have been processed.
    """
    # One GPU context per entry of CUDA_VISIBLE_DEVICES. A missing variable
    # is treated like an empty one, so the CPU fallback below is reachable
    # instead of the lookup failing with KeyError.
    cvd = os.environ.get('CUDA_VISIBLE_DEVICES', '').strip()
    ctx = [mx.gpu(i) for i in range(len(cvd.split(',')))] if cvd else []
    if not ctx:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))

    # Checkpoints go to <models_root>/<network>-<loss>-<dataset>[-<extra>]/model
    if len(args.extra_model_name) == 0:
        model_dir = '%s-%s-%s' % (args.network, args.loss, args.dataset)
    else:
        model_dir = '%s-%s-%s-%s' % (args.network, args.loss, args.dataset,
                                     args.extra_model_name)
    prefix = os.path.join(args.models_root, model_dir, 'model')
    prefix_dir = os.path.dirname(prefix)
    print('prefix', prefix)
    if not os.path.exists(prefix_dir):
        os.makedirs(prefix_dir)

    # Batch-size bookkeeping shared through args and config.
    args.ctx_num = len(ctx)
    if args.per_batch_size == 0:
        args.per_batch_size = 128
    args.batch_size = args.per_batch_size * args.ctx_num
    args.rescale_threshold = 0
    args.image_channel = config.image_shape[2]
    config.batch_size = args.batch_size
    config.per_batch_size = args.per_batch_size

    data_dir = config.dataset_path
    image_size = config.image_shape[0:2]
    assert len(image_size) == 2
    assert image_size[0] == image_size[1]
    print('image_size', image_size)
    print('num_classes', config.num_classes)
    path_imgrec = os.path.join(data_dir, "train.rec")
    data_shape = (args.image_channel, image_size[0], image_size[1])

    # Partition the classes over every context of every worker, rounding the
    # per-context share up so all classes are covered.
    num_workers = config.num_workers
    global_num_ctx = num_workers * args.ctx_num
    if config.num_classes % global_num_ctx == 0:
        args.ctx_num_classes = config.num_classes // global_num_ctx
    else:
        args.ctx_num_classes = config.num_classes // global_num_ctx + 1
    args.local_num_classes = args.ctx_num_classes * args.ctx_num
    args.local_class_start = args.local_num_classes * args.worker_id

    print('Called with argument:', args, config)
    mean = None

    begin_epoch = 0
    base_lr = args.lr
    base_wd = args.wd
    base_mom = args.mom
    arg_params = None
    aux_params = None
    # The arcface symbol builder is passed on uncalled in both cases.
    asym = get_symbol_arcface
    if len(args.pretrained) == 0:
        esym = get_symbol_embedding()
    else:
        print('loading', args.pretrained, args.pretrained_epoch)
        pretrain_esym, arg_params, aux_params = mx.model.load_checkpoint(
            args.pretrained, args.pretrained_epoch)
        esym = get_symbol_embedding(pretrain_esym)

    if config.count_flops:
        all_layers = esym.get_internals()
        _sym = all_layers['fc1_output']
        flops = flops_counter.count_flops(_sym,
                                          data=(1, 3, image_size[0],
                                                image_size[1]))
        _str = flops_counter.flops_str(flops)
        print('Network FLOPs: %s' % _str)

    model = ParallModule(
        context=ctx,
        symbol=esym,
        data_names=['data'],
        label_names=['softmax_label'],
        asymbol=asym,
        args=args,
    )
    val_dataiter = None

    # TODO: if config.use_dali:
    if True:
        from dali_image_iter import HybridTrainPipe
        from nvidia.dali.plugin.mxnet import DALIClassificationIterator
        # NOTE(review): a single pipeline with hard-coded positional values
        # (4, 0, 4) is used regardless of ctx. An earlier call used
        # num_threads=4, device_id=i, num_gpus=4, so these are presumably
        # num_threads, device_id and num_gpus -- TODO confirm.
        htp = HybridTrainPipe(path_imgrec, args.batch_size, 4, 0, 4,
                              args.batch_size * 1000)
        trainpipes = [htp]
        htp.build()
        print("Training pipeline epoch size: {}".format(
            htp.epoch_size("Reader")))
        dali_train_iter = DALIClassificationIterator(trainpipes,
                                                     htp.epoch_size("Reader"))
        train_dataiter = dali_train_iter
    else:
        train_dataiter = FaceImageIter(
            batch_size=args.batch_size,
            data_shape=data_shape,
            path_imgrec=path_imgrec,
            shuffle=True,
            rand_mirror=config.data_rand_mirror,
            mean=mean,
            cutoff=config.data_cutoff,
            color_jittering=config.data_color,
            images_filter=config.data_images_filter,
        )

    if config.net_name in ('fresnet', 'fmobilefacenet'):
        # resnet style
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="out",
                                     magnitude=2)
    else:
        initializer = mx.init.Xavier(rnd_type='uniform',
                                     factor_type="in",
                                     magnitude=2)

    _rescale = 1.0 / args.batch_size
    opt = optimizer.SGD(learning_rate=base_lr,
                        momentum=base_mom,
                        wd=base_wd,
                        rescale_grad=_rescale)
    _cb = mx.callback.Speedometer(args.batch_size, args.frequent)

    # Load every verification set named in config.val_targets that exists
    # on disk as <data_dir>/<name>.bin.
    ver_list = []
    ver_name_list = []
    for name in config.val_targets:
        path = os.path.join(data_dir, name + ".bin")
        if os.path.exists(path):
            data_set = verification.load_bin(path, image_size)
            ver_list.append(data_set)
            ver_name_list.append(name)
            print('ver', name)

    def ver_test(nbatch):
        """Evaluate all verification sets; return their flip accuracies."""
        results = []
        for ver_name, ver_set in zip(ver_name_list, ver_list):
            _acc1, _std1, acc2, std2, xnorm, _embeddings = verification.test(
                ver_set, model, args.batch_size, 10, None, None)
            print('[%s][%d]XNorm: %f' % (ver_name, nbatch, xnorm))
            print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' %
                  (ver_name, nbatch, acc2, std2))
            results.append(acc2)
        return results

    # highest_acc[0]: best summed score, highest_acc[-1]: best accuracy on
    # the last verification target ("lfw and target" in the original note).
    highest_acc = [0.0, 0.0]
    # One-element lists act as mutable counters shared with the callback.
    global_step = [0]
    save_step = [0]
    lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)

    def _batch_callback(param):
        """Per-batch hook: lr decay, logging, verification, checkpointing."""
        global_step[0] += 1
        mbatch = global_step[0]
        if mbatch in lr_steps:
            opt.lr *= 0.1
            print('lr change to', opt.lr)

        _cb(param)
        if mbatch % 1000 == 0:
            print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch)

        if mbatch >= 0 and mbatch % args.verbose == 0:
            acc_list = ver_test(mbatch)
            save_step[0] += 1
            msave = save_step[0]
            do_save = False
            is_highest = False
            if len(acc_list) > 0:
                score = sum(acc_list)
                if acc_list[-1] >= highest_acc[-1]:
                    if acc_list[-1] > highest_acc[-1]:
                        is_highest = True
                    else:
                        # Tie on the last target: fall back to the summed
                        # score; only this path refreshes highest_acc[0].
                        if score >= highest_acc[0]:
                            is_highest = True
                            highest_acc[0] = score
                    highest_acc[-1] = acc_list[-1]
            if is_highest:
                do_save = True
            # args.ckpt: 0 = never save, 2 = always save, 3 = keep writing
            # to checkpoint index 1; any other value saves on a new best.
            if args.ckpt == 0:
                do_save = False
            elif args.ckpt == 2:
                do_save = True
            elif args.ckpt == 3:
                msave = 1

            if do_save:
                print('saving', msave)
                arg, aux = model.get_params()
                # TODO: a way to save *all* parameters is wanted so that
                # training can be resumed; the author was unsure whether
                # this change is correct.
                if config.ckpt_embedding:
                    # Export only the embedding sub-graph, without fc7*
                    # parameters.
                    all_layers = model.symbol.get_internals()
                    _sym = all_layers['fc1_output']
                    _arg = {
                        k: arg[k]
                        for k in arg if not k.startswith('fc7')
                    }
                    mx.model.save_checkpoint(prefix, msave, _sym, _arg, aux)
                else:
                    mx.model.save_checkpoint(prefix, msave, model.symbol,
                                             arg, aux)
            print('[%d]Accuracy-Highest: %1.5f' % (mbatch, highest_acc[-1]))
        if config.max_steps > 0 and mbatch > config.max_steps:
            sys.exit(0)

    epoch_cb = None

    # num_epoch is effectively unbounded; the batch callback stops training
    # through config.max_steps.
    model.fit(
        train_dataiter,
        begin_epoch=begin_epoch,
        num_epoch=999999,
        eval_data=val_dataiter,
        kvstore=args.kvstore,
        optimizer=opt,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=epoch_cb)
# Logger rooted at ./log for this local rank.
logger = Logger(
    root="./log",
    prefix="mobile_facenet",
    local_rank=local_rank,
)

# train and val pipeline
train_pipes = []
train_pipes.append(
    FacePipe(
        name="emore",
        batch_size=batch_size_per_gpu,
        num_threads=num_worker,
        device_id=local_rank,
        num_shards=num_gpu,
        shard_id=rank,
    )
)
# Dataset size and class count are read from the pipeline itself.
train_size, num_classes = train_pipes[0].size, train_pipes[0].num_classes
# The iterator is given train_size // num_gpu as its size and resets
# itself (auto_reset=True).
train_iter = DALIClassificationIterator(
    train_pipes,
    train_size // num_gpu,
    auto_reset=True,
)
validator = ParallelValidation(
    val_targets,
    batch_size_per_gpu,
    rank,
    local_rank,
    logger=logger,
)

# loss, network
net = get_mobile_facenet(num_classes, weight_norm=True)
net.initialize(init=mx.init.MSRAPrelu(), ctx=ctx)
net.hybridize(static_alloc=True)

loss = RingLoss(lamda, r_init)
loss.initialize(ctx=ctx)
# Run settings taken from the parsed arguments.
epochs, alpha = args.epochs + 1, args.alpha
max_accuracy = 0.0
ctx = mx.gpu(bps.local_rank())

# load_data
batch_size = args.batch_size * num_gpu

# Training pipeline: constructed with num_gpu and rank as its 4th and 5th
# positional arguments.
train_pipes = []
train_pipes.append(
    CifarPipe(
        args.batch_size,
        args.num_workers,
        local_rank,
        num_gpu,
        rank,
        use_float16,
    )
)
train_size = train_pipes[0].size
train_data = DALIClassificationIterator(
    train_pipes,
    train_size // num_gpu,
    auto_reset=True,
)

# Validation pipeline: same constructor, but with 1 and 0 in those two
# positions and train=False.
val_pipes = []
val_pipes.append(
    CifarPipe(
        args.batch_size,
        args.num_workers,
        local_rank,
        1,
        0,
        use_float16,
        train=False,
    )
)
val_size = val_pipes[0].size
val_data = DALIClassificationIterator(
    val_pipes,
    val_size,
    auto_reset=True,
)

# set the network and trainer
net = get_attention_cifar(10, num_layers=args.num_layers)