def run_training(self):
    """Run training benchmarks.

    Builds a data iterator, an SGD optimizer (optionally wrapped by
    Horovod), binds an MXNet Module and trains it for exactly one epoch;
    the iterator is sized to yield warm-up plus benchmark batches.

    Returns:
        Numpy array containing batch times (string, numpy array).
    """
    # Create data iterator and resize it to total number of iterations
    # (no matter what input data size is).
    train_data = DataIteratorFactory.get(
        (self.worker_batch, ) + self.model.input_shape,
        (self.worker_batch, ) + self.model.labels_shape,
        self.model.labels_range,
        self.args,
        kv_store=self.kv_store)
    # https://github.com/apache/incubator-mxnet/blob/master/example/distributed_training-horovod/resnet50_imagenet.py
    optimizer_params = {'multi_precision': True} if self.args.dtype == 'float16' else {}
    if self.is_horovod:
        # Scale summed gradients by the per-worker batch size under Horovod.
        optimizer_params['rescale_grad'] = 1.0 / self.worker_batch
    opt = mx.optimizer.create('sgd', **optimizer_params)
    if self.is_horovod:
        opt = hvd.DistributedOptimizer(opt)
    # NOTE(review): only the first device is used here — confirm multi-GPU
    # is handled elsewhere (e.g. one process per GPU under Horovod).
    mod = mx.mod.Module(symbol=self.model.output, context=self.devices[0])
    mod.bind(data_shapes=train_data.provide_data,
             label_shapes=train_data.provide_label,
             for_training=True)
    mod.init_params(
        mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2))
    if self.is_horovod:
        # Broadcast rank-0 parameters so every worker starts from the same
        # weights, then write them back into the module.
        arg_params, aux_params = mod.get_params()
        if arg_params:
            hvd.broadcast_parameters(arg_params, root_rank=0)
        if aux_params:
            hvd.broadcast_parameters(aux_params, root_rank=0)
        mod.set_params(arg_params=arg_params, aux_params=aux_params)
    # Callback records per-batch wall-clock times (skipping warm-up batches).
    batch_end_callback = BatchEndCallback(self.args.num_warmup_batches,
                                          self.args.num_batches)
    # print ("Starting benchmarks.")
    # TODO: In current implementation, number of epochs must always equal to 1.
    # It is iterator responsibility to iterate the right number of batched -
    # warm up plus benchmark batches.
    mod.fit(train_data,
            kvstore=self.kv_store,
            optimizer=opt,
            optimizer_params=optimizer_params,
            eval_metric=self.model.eval_metric,
            batch_end_callback=[batch_end_callback],
            begin_epoch=0,
            num_epoch=1)
    if self.is_horovod:
        # Drain all async MXNet operations so timings are complete.
        start_time = timeit.default_timer()
        mx.ndarray.waitall()
        logging.info(
            "(horovod) wait time for all ndarrays is %.5f seconds",
            timeit.default_timer() - start_time)
    return batch_end_callback.batch_times
def test_allreduce(use_horovod, dtype):
    """Exercise allreduce through either an MXNet kvstore or Horovod.

    Parameters
    ----------
    use_horovod : bool
        False -> plain MXNet kvstore ('dist_sync_device' when launched as a
        distributed worker, local 'device' otherwise); True -> Horovod.
    dtype : str
        Data type forwarded to test_hvd_kv.
    """
    if use_horovod is False:
        # BUG FIX: the original evaluated "... else kvstore_type", reading
        # the name before any assignment (NameError when DMLC_ROLE is not
        # "worker"); fall back to the local 'device' store instead.
        kvstore_type = ("dist_sync_device"
                        if os.environ.get("DMLC_ROLE") == "worker" else
                        "device")
        kv = mx.kvstore.create(kvstore_type)
        rank = kv.rank
        num_workers = kv.num_workers
    else:
        kvstore_type = "device"
        kv = mx.kvstore.create(kvstore_type)
        hvd.init()
        rank = hvd.rank()
        num_workers = hvd.size()
    print('use horovod: {}, rank {}/{}, kv type: {}, usetree: {}'.format(
        use_horovod, rank, num_workers, kvstore_type,
        os.environ.get("MXNET_KVSTORE_USETREE")))
    # 8 appears to be the assumed per-worker batch size — TODO confirm.
    rescale_grad = 1.0 / (8 * num_workers)
    if use_horovod:
        # Presumably Horovod's reduction already accounts for the worker
        # count, so the worker factor is undone here — verify against
        # hvd.DistributedOptimizer semantics.
        rescale_grad = rescale_grad * num_workers
    optimizer_params = dict(
        momentum=0,  # pOpt.optimizer.momentum,
        wd=0,  # pOpt.optimizer.wd,
        learning_rate=0.1,
        rescale_grad=rescale_grad,
    )
    optimizer = mx.optimizer.create("sgd", **optimizer_params)
    if use_horovod:
        # Horovod: wrap optimizer with DistributedOptimizer
        optimizer = hvd.DistributedOptimizer(optimizer)
    print("opt rescale:{}".format(optimizer.rescale_grad))
    kv.set_optimizer(optimizer)
    test_hvd_kv(rank, num_workers, kv, dtype)
def train_module():
    """Train the model using the MXNet Module API with Horovod.

    NOTE(review): relies on module-level names defined elsewhere in the
    file (args, net, context, train_data, val_data, batch_size, lr_sched,
    rank, num_workers, initializer) — confirm against the surrounding
    script.
    """
    # Create input symbol
    data = mx.sym.var('data')
    if args.dtype == 'float16':
        # Cast the input and the network itself to fp16.
        data = mx.sym.Cast(data=data, dtype=np.float16)
        net.cast(np.float16)
    # Create output symbol; loss is computed in fp32 even for fp16 models.
    out = net(data)
    if args.dtype == 'float16':
        out = mx.sym.Cast(data=out, dtype=np.float32)
    softmax = mx.sym.SoftmaxOutput(out, name='softmax')
    # Create model
    mod = mx.mod.Module(softmax, context=context)
    # Initialize parameters
    if args.use_pretrained:
        # Export pretrained Gluon weights (moved to CPU) so the Module
        # can consume them as arg_params.
        arg_params = {}
        for x in net.collect_params().values():
            x.reset_ctx(mx.cpu())
            arg_params[x.name] = x.data()
    else:
        arg_params = None
    aux_params = None
    mod.bind(data_shapes=train_data.provide_data,
             label_shapes=train_data.provide_label)
    mod.init_params(initializer, arg_params=arg_params, aux_params=aux_params)
    # Horovod: fetch and broadcast parameters so all ranks start identical.
    (arg_params, aux_params) = mod.get_params()
    if arg_params is not None:
        hvd.broadcast_parameters(arg_params, root_rank=0)
    if aux_params is not None:
        hvd.broadcast_parameters(aux_params, root_rank=0)
    mod.set_params(arg_params=arg_params, aux_params=aux_params)
    # Create optimizer
    # Note that when using Module API, we need to specify rescale_grad since
    # we create optimizer first and wrap it with DistributedOptimizer. For
    # Gluon API, it is handled in Trainer.step() function so there is no need
    # to specify rescale_grad (see above train_gluon() function).
    optimizer_params = {
        'wd': args.wd,
        'momentum': args.momentum,
        'rescale_grad': 1.0 / batch_size,
        'lr_scheduler': lr_sched
    }
    if args.dtype == 'float16':
        optimizer_params['multi_precision'] = True
    opt = mx.optimizer.create('sgd', **optimizer_params)
    # Horovod: wrap optimizer with DistributedOptimizer
    dist_opt = hvd.DistributedOptimizer(
        opt, gradient_predivide_factor=args.gradient_predivide_factor)
    # Setup validation data and callback during training
    eval_data = None
    if args.eval_epoch:
        eval_data = val_data
    batch_callback = None
    if args.log_interval > 0 and rank == 0:
        # Report throughput on rank 0 only, using the global batch size.
        batch_callback = mx.callback.Speedometer(batch_size * num_workers,
                                                 args.log_interval)
    epoch_callback = None
    if args.save_frequency > 0:
        epoch_callback = mx.callback.do_checkpoint('%s-%d' %
                                                   (args.model, rank),
                                                   period=args.save_frequency)
    # Train model
    mod.fit(train_data,
            eval_data=eval_data,
            num_epoch=args.num_epochs,
            kvstore=None,
            batch_end_callback=batch_callback,
            epoch_end_callback=epoch_callback,
            optimizer=dist_opt)
    # Evaluate performance if not using synthetic data
    if args.use_rec:
        acc_top1 = mx.metric.Accuracy()
        acc_top5 = mx.metric.TopKAccuracy(5)
        res = mod.score(val_data, [acc_top1, acc_top5])
        for name, val in res:
            logging.info('Epoch[%d] Rank[%d] Validation-%s=%f',
                         args.num_epochs - 1, rank, name, val)
def train():
    """Train a GluonCV model zoo network via the Module API with Horovod.

    NOTE(review): relies on module-level names defined elsewhere in the
    file (args, kwargs, context, train_data, val_data, batch_size,
    lr_sched, rank) — confirm against the surrounding script.
    """
    # Get model from GluonCV model zoo
    # https://gluon-cv.mxnet.io/model_zoo/index.html
    net = get_model(args.model, **kwargs)
    net.cast(args.dtype)
    # Create input symbol
    data = mx.sym.var('data')
    if args.dtype == 'float16':
        # Cast input and network to fp16.
        data = mx.sym.Cast(data=data, dtype=np.float16)
        net.cast(np.float16)
    # Create output symbol; loss computed in fp32 even for fp16 models.
    out = net(data)
    if args.dtype == 'float16':
        out = mx.sym.Cast(data=out, dtype=np.float32)
    softmax = mx.sym.SoftmaxOutput(out, name='softmax')
    if args.use_pretrained:
        # Export pretrained Gluon weights (on CPU) for the Module.
        arg_params = {}
        for x in net.collect_params().values():
            x.reset_ctx(mx.cpu())
            arg_params[x.name] = x.data()
    else:
        arg_params = None
    aux_params = None
    # Create model
    mod = mx.mod.Module(softmax, context=context)
    # Create optimizer; rescale_grad must be set explicitly because the
    # optimizer is created before being wrapped by DistributedOptimizer.
    optimizer_params = {
        'wd': args.wd,
        'momentum': args.momentum,
        'rescale_grad': 1.0 / batch_size,
        'lr_scheduler': lr_sched
    }
    if args.dtype == 'float16':
        optimizer_params['multi_precision'] = True
    opt = mx.optimizer.create('sgd', sym=out, **optimizer_params)
    # Horovod: wrap optimizer with DistributedOptimizer
    opt = hvd.DistributedOptimizer(opt)
    # Create initializer and initializer parameters
    initializer = mx.init.Xavier(rnd_type='gaussian',
                                 factor_type="in",
                                 magnitude=2)
    mod.bind(data_shapes=train_data.provide_data,
             label_shapes=train_data.provide_label)
    mod.init_params(initializer, arg_params=arg_params, aux_params=aux_params)
    # Horovod: fetch and broadcast parameters so all ranks start identical.
    (arg_params, aux_params) = mod.get_params()
    if arg_params is not None:
        hvd.broadcast_parameters(arg_params, root_rank=0)
    if aux_params is not None:
        hvd.broadcast_parameters(aux_params, root_rank=0)
    mod.set_params(arg_params=arg_params, aux_params=aux_params)
    # Setup validation data and callback during training
    eval_data = None
    if args.eval_epoch:
        eval_data = val_data
    batch_callback = None
    if args.log_interval > 0:
        batch_callback = mx.callback.Speedometer(batch_size,
                                                 max(1, args.log_interval))
    epoch_callback = None
    if args.save_frequency > 0:
        epoch_callback = mx.callback.do_checkpoint('%s-%d' %
                                                   (args.model, rank),
                                                   period=args.save_frequency)
    # Train model
    # NOTE(review): opt is already an Optimizer instance, so optimizer_params
    # here is presumably ignored by Module.fit — confirm against
    # mxnet Module.init_optimizer; harmless but redundant.
    mod.fit(train_data,
            eval_data=eval_data,
            num_epoch=args.num_epochs,
            kvstore=None,
            batch_end_callback=batch_callback,
            epoch_end_callback=epoch_callback,
            optimizer=opt,
            optimizer_params=optimizer_params)
    # Evaluate performance if not using synthetic data
    if args.use_rec:
        acc_top1 = mx.metric.Accuracy()
        acc_top5 = mx.metric.TopKAccuracy(5)
        res = mod.score(val_data, [acc_top1, acc_top5])
        for name, val in res:
            logging.info('Epoch[%d] Rank[%d] Validation-%s=%f',
                         args.num_epochs - 1, rank, name, val)
# NOTE(review): fragment — the enclosing function's header and the tail of
# the checkpoint-callback list below are outside the visible chunk.
# Assemble composite detection metrics (RPN + optional RCNN + optional mask).
eval_metrics.add(eval_metric)
eval_metrics.add(cls_metric)
eval_metrics.add(bbox_metric)
if not config.TRAIN.ONLY_PROPOSAL:
    # Second-stage (RCNN) metrics.
    eval_metrics.add(rceval_metric)
    eval_metrics.add(rccls_metric)
    eval_metrics.add(rcbbox_metric)
if config.TRAIN.WITH_MASK:
    mask_metric = metric.MaskLogLossMetric(config)
    eval_metrics.add(mask_metric)
optimizer_params = get_optim_params(config, len(train_iter), batch_size)
print('Optimizer params: {}'.format(optimizer_params))
opt = mx.optimizer.create('sgd', **optimizer_params)
opt = hvd.DistributedOptimizer(opt)
# Checkpointing
batch_end_callback = None
if rank == 0:
    # Report throughput on rank 0 only, using the global batch size.
    batch_end_callback = mx.callback.Speedometer(batch_size * num_workers,
                                                 args.display)
epoch_end_callback = None
if rank == 0:
    prefix = os.path.join(output_path, args.save_prefix)
    epoch_end_callback = [
        mx.callback.module_checkpoint(mod, prefix, period=1,
def train_module():
    """Train the model using the MXNet Module API with Horovod.

    BUG FIX: the original referenced ``opt`` in
    ``hvd.DistributedOptimizer(opt)`` without ever creating it (NameError
    at runtime); the optimizer construction below is restored to match the
    sibling ``train_module`` implementation in this file.

    NOTE(review): relies on module-level names defined elsewhere (args,
    net, context, train_data, val_data, batch_size, lr_sched, rank,
    num_workers, initializer) — confirm against the surrounding script.
    """
    # Create input symbol
    data = mx.sym.var('data')
    if args.dtype == 'float16':
        # Cast input and network to fp16.
        data = mx.sym.Cast(data=data, dtype=np.float16)
        net.cast(np.float16)
    # Create output symbol; loss computed in fp32 even for fp16 models.
    out = net(data)
    if args.dtype == 'float16':
        out = mx.sym.Cast(data=out, dtype=np.float32)
    softmax = mx.sym.SoftmaxOutput(out, name='softmax')
    # Create model
    mod = mx.mod.Module(softmax, context=context)
    # Initialize parameters
    if args.use_pretrained:
        # Export pretrained Gluon weights (on CPU) for the Module.
        arg_params = {}
        for x in net.collect_params().values():
            x.reset_ctx(mx.cpu())
            arg_params[x.name] = x.data()
    else:
        arg_params = None
    aux_params = None
    mod.bind(data_shapes=train_data.provide_data,
             label_shapes=train_data.provide_label)
    mod.init_params(initializer, arg_params=arg_params, aux_params=aux_params)
    # Horovod: fetch and broadcast parameters so all ranks start identical.
    (arg_params, aux_params) = mod.get_params()
    if arg_params is not None:
        hvd.broadcast_parameters(arg_params, root_rank=0)
    if aux_params is not None:
        hvd.broadcast_parameters(aux_params, root_rank=0)
    mod.set_params(arg_params=arg_params, aux_params=aux_params)
    # Create optimizer (was missing in the original). rescale_grad must be
    # set explicitly because the optimizer is created before being wrapped
    # by DistributedOptimizer.
    optimizer_params = {
        'wd': args.wd,
        'momentum': args.momentum,
        'rescale_grad': 1.0 / batch_size,
        'lr_scheduler': lr_sched
    }
    if args.dtype == 'float16':
        optimizer_params['multi_precision'] = True
    opt = mx.optimizer.create('sgd', **optimizer_params)
    # Horovod: wrap optimizer with DistributedOptimizer
    dist_opt = hvd.DistributedOptimizer(opt)
    # Setup validation data and callback during training
    eval_data = None
    if args.eval_epoch:
        eval_data = val_data
    batch_callback = None
    if args.log_interval > 0 and rank == 0:
        # Report throughput on rank 0 only, using the global batch size.
        batch_callback = mx.callback.Speedometer(batch_size * num_workers,
                                                 args.log_interval)
    epoch_callback = None
    if args.save_frequency > 0:
        epoch_callback = mx.callback.do_checkpoint('%s-%d' %
                                                   (args.model, rank),
                                                   period=args.save_frequency)
    # Train model
    mod.fit(train_data,
            eval_data=eval_data,
            num_epoch=args.num_epochs,
            kvstore=None,
            batch_end_callback=batch_callback,
            epoch_end_callback=epoch_callback,
            optimizer=dist_opt)
    # Evaluate performance if not using synthetic data
    if args.use_rec:
        acc_top1 = mx.metric.Accuracy()
        acc_top5 = mx.metric.TopKAccuracy(5)
        res = mod.score(val_data, [acc_top1, acc_top5])
        for name, val in res:
            logging.info('Epoch[%d] Rank[%d] Validation-%s=%f',
                         args.num_epochs - 1, rank, name, val)
def fit(args, network, data_loader, **kwargs):
    """Train a model with Horovod data parallelism.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command line arguments.
    network : mxnet Symbol
        The symbol definition of the neural network.
    data_loader : callable
        Function that returns the train and val data iterators.
    **kwargs
        May carry 'arg_params'/'aux_params' (pre-loaded weights) and
        'batch_end_callback' (extra per-batch callbacks).
    """
    # Optional MXNet profiler, one trace file per Horovod rank.
    if args.profile_worker_suffix:
        if hvd.size() > 1:
            filename = 'rank' + str(
                hvd.rank()) + '_' + args.profile_worker_suffix
        else:
            filename = args.profile_worker_suffix
        mx.profiler.set_config(filename=filename,
                               profile_symbolic=True,
                               profile_imperative=True,
                               profile_api=False,
                               profile_process='worker')
        mx.profiler.set_state(state='run', profile_process='worker')

    # logging
    head = '%(asctime)-15s Node[' + str(hvd.rank()) + '] %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)
    logging.info('start with arguments %s', args)

    # data iterators; each rank reads its own shard
    (train, val) = data_loader(args, (hvd.rank(), hvd.size()))
    if args.test_io:
        # Measure raw input-pipeline speed only, then bail out.
        tic = time.time()
        for i, batch in enumerate(train):
            for j in batch.data:
                j.wait_to_read()
            if (i + 1) % args.disp_batches == 0:
                logging.info(
                    'Batch [%d]\tSpeed: %.2f samples/sec', i,
                    args.disp_batches * args.batch_size /
                    (time.time() - tic))
                tic = time.time()
        return

    # load model: weights passed in via kwargs, or restored from checkpoint
    if 'arg_params' in kwargs and 'aux_params' in kwargs:
        arg_params = kwargs['arg_params']
        aux_params = kwargs['aux_params']
    else:
        sym, arg_params, aux_params = _load_model(args, hvd.rank())
        if sym is not None:
            # A restored symbol must match the requested network.
            assert sym.tojson() == network.tojson()

    # save model
    checkpoint = _save_model(args, hvd.rank())

    # devices for training: one GPU per process, chosen by local rank
    devs = [mx.gpu(hvd.local_rank())]

    # learning rate
    lr, lr_scheduler = _get_lr_scheduler(args)

    # create model
    model = mx.mod.Module(context=devs, symbol=network)

    # (removed a no-op "lr_scheduler = lr_scheduler" self-assignment)
    optimizer_params = {
        'learning_rate': lr,
        'wd': args.wd,
        'lr_scheduler': lr_scheduler,
        'multi_precision': True
    }

    # Only a limited number of optimizers have 'momentum' property
    has_momentum = {'sgd', 'dcasgd', 'nag'}
    if args.optimizer in has_momentum:
        optimizer_params['momentum'] = args.mom

    monitor = mx.mon.Monitor(args.monitor,
                             pattern=".*") if args.monitor > 0 else None

    # A limited number of optimizers have a warmup period
    has_warmup = {'lbsgd', 'lbnag'}
    if args.optimizer in has_warmup:
        if hvd.size() > 1:
            nworkers = hvd.size()
        else:
            nworkers = 1
        epoch_size = args.num_examples / args.batch_size / nworkers
        if epoch_size < 1:
            epoch_size = 1
        macrobatch_size = args.macrobatch_size
        if macrobatch_size < args.batch_size * nworkers:
            macrobatch_size = args.batch_size * nworkers
        batch_scale = math.ceil(
            float(macrobatch_size) / args.batch_size / nworkers)
        optimizer_params['updates_per_epoch'] = epoch_size
        optimizer_params[
            'begin_epoch'] = args.load_epoch if args.load_epoch else 0
        optimizer_params['batch_scale'] = batch_scale
        optimizer_params['warmup_strategy'] = args.warmup_strategy
        optimizer_params['warmup_epochs'] = args.warmup_epochs
        optimizer_params['num_epochs'] = args.num_epochs

    if args.initializer == 'default':
        if args.network == 'alexnet':
            # AlexNet will not converge using Xavier
            initializer = mx.init.Normal()
        elif 'vgg' in args.network:
            # VGG will not trend to converge using Xavier-Gaussian
            initializer = mx.init.Xavier()
        else:
            initializer = mx.init.Xavier(rnd_type='gaussian',
                                         factor_type="in",
                                         magnitude=2)
    elif args.initializer == 'xavier':
        initializer = mx.init.Xavier()
    elif args.initializer == 'msra':
        initializer = mx.init.MSRAPrelu()
    elif args.initializer == 'orthogonal':
        initializer = mx.init.Orthogonal()
    elif args.initializer == 'normal':
        initializer = mx.init.Normal()
    elif args.initializer == 'uniform':
        initializer = mx.init.Uniform()
    elif args.initializer == 'one':
        initializer = mx.init.One()
    elif args.initializer == 'zero':
        initializer = mx.init.Zero()

    # evaluation metrics
    eval_metrics = ['accuracy']
    if args.top_k > 0:
        eval_metrics.append(
            mx.metric.create('top_k_accuracy', top_k=args.top_k))

    supported_loss = ['ce', 'nll_loss']
    if len(args.loss) > 0:
        # ce or nll loss is only applicable to softmax output
        loss_type_list = args.loss.split(',')
        if 'softmax_output' in network.list_outputs():
            for loss_type in loss_type_list:
                loss_type = loss_type.strip()
                if loss_type == 'nll':
                    loss_type = 'nll_loss'
                if loss_type not in supported_loss:
                    logging.warning(loss_type + ' is not an valid loss type, only cross-entropy or ' \
                            'negative likelihood loss is supported!')
                else:
                    eval_metrics.append(mx.metric.create(loss_type))
        else:
            logging.warning(
                "The output is not softmax_output, loss argument will be skipped!"
            )

    # callbacks that run after each batch
    batch_end_callbacks = [
        mx.callback.Speedometer(args.batch_size, args.disp_batches)
    ]
    if 'batch_end_callback' in kwargs:
        cbs = kwargs['batch_end_callback']
        batch_end_callbacks += cbs if isinstance(cbs, list) else [cbs]

    # load bytescheduler (optional gradient-scheduling plugin)
    if os.environ.get('USE_BYTESCHEDULER') is not None and os.environ.get(
            'USE_BYTESCHEDULER') == "1":
        if args.partition:
            os.environ["BYTESCHEDULER_PARTITION"] = str(1000 * args.partition)
        if args.credit:
            os.environ["BYTESCHEDULER_CREDIT"] = str(args.credit)
        import bytescheduler.mxnet.horovod as bsc
        bsc.init()

    # Scale summed gradients by the global batch (local batch * #workers).
    optimizer_params['rescale_grad'] = 1 / (args.batch_size * hvd.size())
    # horovod wrapper, must create optimizer explicitly
    opt = mx.optimizer.create(args.optimizer, sym=network, **optimizer_params)
    opt = hvd.DistributedOptimizer(opt)

    # horovod: better to explicitly init, then broadcast rank-0 weights
    model.bind(data_shapes=train.provide_data,
               label_shapes=train.provide_label)
    if arg_params is None and aux_params is None:
        model.init_params(initializer)
    (arg_params, aux_params) = model.get_params()
    if arg_params is not None:
        hvd.broadcast_parameters(arg_params, root_rank=0)
    if aux_params is not None:
        hvd.broadcast_parameters(aux_params, root_rank=0)
    model.set_params(arg_params=arg_params, aux_params=aux_params)

    # run
    model.fit(train,
              begin_epoch=args.load_epoch if args.load_epoch else 0,
              num_epoch=args.num_epochs,
              eval_data=val,
              eval_metric=eval_metrics,
              kvstore=None,
              optimizer=opt,
              optimizer_params=optimizer_params,
              batch_end_callback=batch_end_callbacks,
              epoch_end_callback=checkpoint,
              allow_missing=True,
              monitor=monitor)

    if args.profile_worker_suffix:
        # BUG FIX: profiling was started with state='run' above; the original
        # passed 'run' again here, leaving the profiler running and the trace
        # file incomplete. 'stop' flushes and ends profiling.
        mx.profiler.set_state(state='stop', profile_process='worker')
def run(opt, model, train_data, val_data, lr_scheduler, context, arg_params,
        aux_params, logger, **kwargs):
    """Configure the optimizer and callbacks, then launch training via fit().

    ``opt`` is the parsed options namespace; ``model`` is a bound MXNet
    Module. Under Horovod each process trains on its own shard and the
    optimizer is wrapped with DistributedOptimizer.
    """
    # Worker identity: Horovod supplies rank/size; otherwise single worker.
    if opt.horovod:
        worker_rank, worker_count = hvd.rank(), hvd.size()
    else:
        worker_rank, worker_count = 0, 1

    opt_args = {
        'learning_rate': opt.lr,
        'wd': opt.wd,
        'momentum': opt.momentum,
        'lr_scheduler': lr_scheduler,
        'multi_precision': False,
    }
    if opt.horovod:
        opt_args['rescale_grad'] = 1. / opt.batch_size

    if opt.optimizer in {'sgdwfastlars'}:
        opt_args.update(lars=True, lars_eta=opt.lars_eta,
                        lars_eps=opt.lars_eps)
        # MLPerf compliance logging of the LARS hyper-parameters.
        mll.opt_name('sgdwfastlars')
        mll.lars_epsilon(opt.lars_eps)
        mll.lars_opt_base_learning_rate(opt.lr)
        mll.lars_opt_weight_decay(opt.wd)
        mll.lars_opt_learning_rate_warmup_epochs(opt.warmup_epochs)
        mll.lars_opt_momentum(opt.momentum)
        mll.lars_opt_end_lr(0.0001)
        mll.lars_opt_lr_decay_poly_power(2)
        mll.lars_opt_lr_decay_steps('pow2')

    if opt.horovod:
        # Index->name mapping is required to mask out entries for weight
        # decay when the optimizer is created explicitly.
        name_by_index = {
            i: n for i, n in enumerate(model._exec_group.param_names)
        }
        base_optimizer = mx.optimizer.create(opt.optimizer,
                                             sym=None,
                                             param_idx2name=name_by_index,
                                             **opt_args)
        # Horovod: wrap optimizer with DistributedOptimizer
        optimizer = hvd.DistributedOptimizer(base_optimizer)
    else:
        optimizer = mx.optimizer.create(opt.optimizer, **opt_args)

    # evaluation metric
    accuracy_metric = mx.metric.Accuracy()

    epoch_cbs = []
    batch_cbs = []
    if not opt.horovod:
        batch_cbs.append(
            mx.callback.Speedometer(opt.batch_size, opt.log_interval))
    elif worker_rank == 0:
        # With Horovod only rank 0 reports, using the global batch size.
        batch_cbs.append(
            mx.callback.Speedometer(worker_count * opt.batch_size,
                                    opt.log_interval))

    kvstore_name = 'horovod' if opt.horovod else ""

    # start to train model
    fit(model,
        train_data,
        eval_data=val_data,
        eval_metric=accuracy_metric,
        epoch_end_callback=epoch_cbs,
        batch_end_callback=batch_cbs,
        kvstore=kvstore_name,
        optimizer=optimizer,
        optimizer_params=opt_args,
        begin_epoch=0,
        num_epoch=opt.num_epochs,
        initializer=None,
        arg_params=arg_params,
        aux_params=aux_params,
        accuracy_target=opt.accuracy_target,
        allow_missing=True,
        eval_frequency=opt.eval_frequency,
        eval_offset=opt.eval_offset,
        logger=logger)