示例#1
0
    def run_training(self):
        """Run training benchmarks.

        Trains for exactly one epoch with SGD (wrapped in a Horovod
        DistributedOptimizer when ``self.is_horovod``); the data iterator is
        expected to yield all warm-up plus benchmark batches within that
        single epoch. Per-batch timings are collected by a BatchEndCallback.

        Returns:
            Numpy array containing batch times (string, numpy array).
        """
        # Create data iterator and resize it to total number of iterations (no matter what input data size is)
        train_data = DataIteratorFactory.get(
            (self.worker_batch, ) + self.model.input_shape,
            (self.worker_batch, ) + self.model.labels_shape,
            self.model.labels_range,
            self.args,
            kv_store=self.kv_store)
        # Optimizer setup follows the upstream Horovod/MXNet example:
        # https://github.com/apache/incubator-mxnet/blob/master/example/distributed_training-horovod/resnet50_imagenet.py
        # multi_precision keeps a float32 master copy of weights for fp16 runs.
        optimizer_params = {
            'multi_precision': True
        } if self.args.dtype == 'float16' else {}
        if self.is_horovod:
            # Average gradients over the local (per-worker) batch; Horovod
            # then averages across workers inside DistributedOptimizer.
            optimizer_params['rescale_grad'] = 1.0 / self.worker_batch
        opt = mx.optimizer.create('sgd', **optimizer_params)
        if self.is_horovod:
            opt = hvd.DistributedOptimizer(opt)

        # NOTE(review): only the first configured device is bound here —
        # confirm multi-GPU runs rely on Horovod / the kvstore rather than
        # Module's device list.
        mod = mx.mod.Module(symbol=self.model.output, context=self.devices[0])
        mod.bind(data_shapes=train_data.provide_data,
                 label_shapes=train_data.provide_label,
                 for_training=True)
        mod.init_params(
            mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2))
        if self.is_horovod:
            # Broadcast rank-0 parameters so every worker starts from
            # identical weights.
            arg_params, aux_params = mod.get_params()
            if arg_params:
                hvd.broadcast_parameters(arg_params, root_rank=0)
            if aux_params:
                hvd.broadcast_parameters(aux_params, root_rank=0)
            mod.set_params(arg_params=arg_params, aux_params=aux_params)

        # Records per-batch wall-clock times; presumably separates warm-up
        # from measured batches — see BatchEndCallback for the exact policy.
        batch_end_callback = BatchEndCallback(self.args.num_warmup_batches,
                                              self.args.num_batches)
        # print ("Starting benchmarks.")
        # TODO: In current implementation, number of epochs must always equal to 1. It is iterator responsibility to
        #       iterate the right number of batched - warm up plus benchmark batches.
        # NOTE(review): `optimizer_params` is ignored by Module.fit when
        # `optimizer` is already an Optimizer instance (MXNet logs a warning);
        # the parameters were applied at mx.optimizer.create() above.
        mod.fit(train_data,
                kvstore=self.kv_store,
                optimizer=opt,
                optimizer_params=optimizer_params,
                eval_metric=self.model.eval_metric,
                batch_end_callback=[batch_end_callback],
                begin_epoch=0,
                num_epoch=1)

        if self.is_horovod:
            # Drain all asynchronous MXNet work before reporting, so the
            # collected timings cover the full computation.
            start_time = timeit.default_timer()
            mx.ndarray.waitall()
            logging.info(
                "(horovod) wait time for all ndarrays is %.5f seconds",
                timeit.default_timer() - start_time)
        return batch_end_callback.batch_times
示例#2
0
def test_allreduce(use_horovod, dtype):
    """Exercise an allreduce-style optimizer update through either a native
    MXNet kvstore or Horovod, then run the shared kv test.

    Args:
        use_horovod: When True, initialize Horovod and use a local "device"
            kvstore; otherwise create a (possibly distributed) MXNet kvstore.
        dtype: Data type forwarded to test_hvd_kv.
    """
    if use_horovod is False:
        # Bug fix: the original read `kvstore_type` on the right-hand side of
        # its own first assignment (`... else kvstore_type`), raising
        # UnboundLocalError whenever DMLC_ROLE != "worker".  Fall back to a
        # local "device" kvstore in that case, matching the Horovod branch.
        if os.environ.get("DMLC_ROLE") == "worker":
            kvstore_type = "dist_sync_device"
        else:
            kvstore_type = "device"
        kv = mx.kvstore.create(kvstore_type)
        rank = kv.rank
        num_workers = kv.num_workers
    else:
        kvstore_type = "device"
        kv = mx.kvstore.create(kvstore_type)
        hvd.init()
        rank = hvd.rank()
        num_workers = hvd.size()
    print('use horovod: {}, rank {}/{}, kv type: {}, usetree: {}'.format(
        use_horovod, rank, num_workers, kvstore_type,
        os.environ.get("MXNET_KVSTORE_USETREE")))

    # Average over the global batch: 8 devices per worker times num_workers.
    rescale_grad = 1.0 / (8 * num_workers)
    if use_horovod:
        # Horovod already averages across workers, so undo the worker factor.
        rescale_grad = rescale_grad * num_workers

    optimizer_params = dict(
        momentum=0,  # pOpt.optimizer.momentum,
        wd=0,  # pOpt.optimizer.wd,
        learning_rate=0.1,
        rescale_grad=rescale_grad,
    )
    optimizer = mx.optimizer.create("sgd", **optimizer_params)
    if use_horovod:
        # Horovod: wrap optimizer with DistributedOptimizer
        optimizer = hvd.DistributedOptimizer(optimizer)

    print("opt rescale:{}".format(optimizer.rescale_grad))
    kv.set_optimizer(optimizer)

    test_hvd_kv(rank, num_workers, kv, dtype)
示例#3
0
def train_module():
    """Train the network with the MXNet Module API under Horovod."""
    fp16 = args.dtype == 'float16'

    # Build the symbolic graph, casting around the network when training in
    # half precision so the softmax itself runs in float32.
    data = mx.sym.var('data')
    if fp16:
        data = mx.sym.Cast(data=data, dtype=np.float16)
        net.cast(np.float16)
    out = net(data)
    if fp16:
        out = mx.sym.Cast(data=out, dtype=np.float32)
    softmax = mx.sym.SoftmaxOutput(out, name='softmax')

    mod = mx.mod.Module(softmax, context=context)

    # Seed parameters either from the pretrained Gluon weights (moved to CPU)
    # or from the initializer.
    arg_params = None
    aux_params = None
    if args.use_pretrained:
        arg_params = {}
        for param in net.collect_params().values():
            param.reset_ctx(mx.cpu())
            arg_params[param.name] = param.data()
    mod.bind(data_shapes=train_data.provide_data,
             label_shapes=train_data.provide_label)
    mod.init_params(initializer, arg_params=arg_params, aux_params=aux_params)

    # Horovod: fetch and broadcast parameters so all ranks start identical.
    arg_params, aux_params = mod.get_params()
    for params in (arg_params, aux_params):
        if params is not None:
            hvd.broadcast_parameters(params, root_rank=0)
    mod.set_params(arg_params=arg_params, aux_params=aux_params)

    # Create optimizer
    # Note that when using Module API, we need to specify rescale_grad since
    # we create optimizer first and wrap it with DistributedOptimizer. For
    # Gluon API, it is handled in Trainer.step() function so there is no need
    # to specify rescale_grad (see above train_gluon() function).
    optimizer_params = dict(wd=args.wd,
                            momentum=args.momentum,
                            rescale_grad=1.0 / batch_size,
                            lr_scheduler=lr_sched)
    if fp16:
        optimizer_params['multi_precision'] = True
    opt = mx.optimizer.create('sgd', **optimizer_params)

    # Horovod: wrap optimizer with DistributedOptimizer
    dist_opt = hvd.DistributedOptimizer(
        opt, gradient_predivide_factor=args.gradient_predivide_factor)

    # Validation data, progress reporting and checkpointing callbacks.
    eval_data = val_data if args.eval_epoch else None
    batch_callback = None
    if args.log_interval > 0 and rank == 0:
        batch_callback = mx.callback.Speedometer(batch_size * num_workers,
                                                 args.log_interval)
    epoch_callback = None
    if args.save_frequency > 0:
        epoch_callback = mx.callback.do_checkpoint('%s-%d' %
                                                   (args.model, rank),
                                                   period=args.save_frequency)

    # Train model
    mod.fit(train_data,
            eval_data=eval_data,
            num_epoch=args.num_epochs,
            kvstore=None,
            batch_end_callback=batch_callback,
            epoch_end_callback=epoch_callback,
            optimizer=dist_opt)

    # Evaluate performance if not using synthetic data
    if args.use_rec:
        metrics = [mx.metric.Accuracy(), mx.metric.TopKAccuracy(5)]
        for name, val in mod.score(val_data, metrics):
            logging.info('Epoch[%d] Rank[%d] Validation-%s=%f',
                         args.num_epochs - 1, rank, name, val)
示例#4
0
def train():
    """Train a GluonCV model-zoo network with the MXNet Module API + Horovod.

    Relies on module-level state: ``args``, ``kwargs``, ``context``,
    ``train_data``, ``val_data``, ``batch_size``, ``lr_sched``, ``rank``.
    """
    # Get model from GluonCV model zoo
    # https://gluon-cv.mxnet.io/model_zoo/index.html
    net = get_model(args.model, **kwargs)
    net.cast(args.dtype)

    # Create input symbol
    data = mx.sym.var('data')
    if args.dtype == 'float16':
        data = mx.sym.Cast(data=data, dtype=np.float16)
        net.cast(np.float16)

    # Create output symbol; compute the softmax in float32 when training fp16.
    out = net(data)
    if args.dtype == 'float16':
        out = mx.sym.Cast(data=out, dtype=np.float32)
    softmax = mx.sym.SoftmaxOutput(out, name='softmax')

    # Pretrained Gluon weights (if any) are gathered on CPU and handed to
    # init_params below.
    if args.use_pretrained:
        arg_params = {}
        for x in net.collect_params().values():
            x.reset_ctx(mx.cpu())
            arg_params[x.name] = x.data()
    else:
        arg_params = None
    aux_params = None

    # Create model
    mod = mx.mod.Module(softmax, context=context)

    # Create optimizer; rescale_grad averages over the per-worker batch.
    optimizer_params = {
        'wd': args.wd,
        'momentum': args.momentum,
        'rescale_grad': 1.0 / batch_size,
        'lr_scheduler': lr_sched
    }
    if args.dtype == 'float16':
        optimizer_params['multi_precision'] = True
    # NOTE(review): `sym=out` is only honored by older MXNet Optimizer
    # versions (deprecated parameter) — confirm against the pinned version.
    opt = mx.optimizer.create('sgd', sym=out, **optimizer_params)

    # Horovod: wrap optimizer with DistributedOptimizer
    opt = hvd.DistributedOptimizer(opt)

    # Create initializer and initializer parameters
    initializer = mx.init.Xavier(rnd_type='gaussian',
                                 factor_type="in",
                                 magnitude=2)
    mod.bind(data_shapes=train_data.provide_data,
             label_shapes=train_data.provide_label)
    mod.init_params(initializer, arg_params=arg_params, aux_params=aux_params)

    # Horovod: fetch and broadcast parameters so all ranks start identical.
    (arg_params, aux_params) = mod.get_params()
    if arg_params is not None:
        hvd.broadcast_parameters(arg_params, root_rank=0)
    if aux_params is not None:
        hvd.broadcast_parameters(aux_params, root_rank=0)
    mod.set_params(arg_params=arg_params, aux_params=aux_params)

    # Setup validation data and callback during training
    eval_data = None
    if args.eval_epoch:
        eval_data = val_data
    batch_callback = None
    if args.log_interval > 0:
        batch_callback = mx.callback.Speedometer(batch_size,
                                                 max(1, args.log_interval))
    epoch_callback = None
    if args.save_frequency > 0:
        epoch_callback = mx.callback.do_checkpoint('%s-%d' %
                                                   (args.model, rank),
                                                   period=args.save_frequency)

    # Train model.  Fix: `optimizer_params` is no longer passed here —
    # Module.fit ignores it (and logs a warning) when `optimizer` is already
    # an Optimizer instance; the parameters were applied at
    # mx.optimizer.create() above.
    mod.fit(train_data,
            eval_data=eval_data,
            num_epoch=args.num_epochs,
            kvstore=None,
            batch_end_callback=batch_callback,
            epoch_end_callback=epoch_callback,
            optimizer=opt)

    # Evaluate performance if not using synthetic data
    if args.use_rec:
        acc_top1 = mx.metric.Accuracy()
        acc_top5 = mx.metric.TopKAccuracy(5)
        res = mod.score(val_data, [acc_top1, acc_top5])
        for name, val in res:
            logging.info('Epoch[%d] Rank[%d] Validation-%s=%f',
                         args.num_epochs - 1, rank, name, val)
示例#5
0
    eval_metrics.add(eval_metric)
    eval_metrics.add(cls_metric)
    eval_metrics.add(bbox_metric)
    if not config.TRAIN.ONLY_PROPOSAL:
        eval_metrics.add(rceval_metric)
        eval_metrics.add(rccls_metric)
        eval_metrics.add(rcbbox_metric)

    if config.TRAIN.WITH_MASK:
        mask_metric = metric.MaskLogLossMetric(config)
        eval_metrics.add(mask_metric)

    optimizer_params = get_optim_params(config, len(train_iter), batch_size)
    print('Optimizer params: {}'.format(optimizer_params))
    opt = mx.optimizer.create('sgd', **optimizer_params)
    opt = hvd.DistributedOptimizer(opt)

    # Checkpointing

    batch_end_callback = None
    if rank == 0:
        batch_end_callback = mx.callback.Speedometer(batch_size * num_workers,
                                                     args.display)
    epoch_end_callback = None
    if rank == 0:
        prefix = os.path.join(output_path, args.save_prefix)

        epoch_end_callback = [
            mx.callback.module_checkpoint(mod,
                                          prefix,
                                          period=1,
def train_module():
    """Train the network with the MXNet Module API under Horovod.

    Fix: the original wrapped an undefined name ``opt`` with
    ``hvd.DistributedOptimizer`` (guaranteed NameError at runtime) — the SGD
    optimizer is now created explicitly beforehand, mirroring the sibling
    ``train_module`` example in this file.
    """
    # Create input symbol
    data = mx.sym.var('data')
    if args.dtype == 'float16':
        data = mx.sym.Cast(data=data, dtype=np.float16)
        net.cast(np.float16)

    # Create output symbol; compute the softmax in float32 when training fp16.
    out = net(data)
    if args.dtype == 'float16':
        out = mx.sym.Cast(data=out, dtype=np.float32)
    softmax = mx.sym.SoftmaxOutput(out, name='softmax')

    # Create model
    mod = mx.mod.Module(softmax, context=context)

    # Initialize parameters (pretrained Gluon weights are moved to CPU first).
    if args.use_pretrained:
        arg_params = {}
        for x in net.collect_params().values():
            x.reset_ctx(mx.cpu())
            arg_params[x.name] = x.data()
    else:
        arg_params = None
    aux_params = None
    mod.bind(data_shapes=train_data.provide_data,
             label_shapes=train_data.provide_label)
    mod.init_params(initializer, arg_params=arg_params, aux_params=aux_params)

    # Horovod: fetch and broadcast parameters so all ranks start identical.
    (arg_params, aux_params) = mod.get_params()
    if arg_params is not None:
        hvd.broadcast_parameters(arg_params, root_rank=0)
    if aux_params is not None:
        hvd.broadcast_parameters(aux_params, root_rank=0)
    mod.set_params(arg_params=arg_params, aux_params=aux_params)

    # Create optimizer (was missing — `opt` was referenced but never defined).
    # rescale_grad averages gradients over the per-worker batch; Horovod then
    # averages across workers.
    optimizer_params = {
        'wd': args.wd,
        'momentum': args.momentum,
        'rescale_grad': 1.0 / batch_size,
        'lr_scheduler': lr_sched
    }
    if args.dtype == 'float16':
        optimizer_params['multi_precision'] = True
    opt = mx.optimizer.create('sgd', **optimizer_params)

    # Horovod: wrap optimizer with DistributedOptimizer
    dist_opt = hvd.DistributedOptimizer(opt)

    # Setup validation data and callback during training
    eval_data = None
    if args.eval_epoch:
        eval_data = val_data
    batch_callback = None
    if args.log_interval > 0 and rank == 0:
        batch_callback = mx.callback.Speedometer(batch_size * num_workers,
                                                 args.log_interval)

    epoch_callback = None
    if args.save_frequency > 0:
        epoch_callback = mx.callback.do_checkpoint('%s-%d' %
                                                   (args.model, rank),
                                                   period=args.save_frequency)

    # Train model
    mod.fit(train_data,
            eval_data=eval_data,
            num_epoch=args.num_epochs,
            kvstore=None,
            batch_end_callback=batch_callback,
            epoch_end_callback=epoch_callback,
            optimizer=dist_opt)

    # Evaluate performance if not using synthetic data
    if args.use_rec:
        acc_top1 = mx.metric.Accuracy()
        acc_top5 = mx.metric.TopKAccuracy(5)
        res = mod.score(val_data, [acc_top1, acc_top5])
        for name, val in res:
            logging.info('Epoch[%d] Rank[%d] Validation-%s=%f',
                         args.num_epochs - 1, rank, name, val)
def fit(args, network, data_loader, **kwargs):
    """Train a model with Horovod-based data parallelism.

    Args:
        args: argparse namespace with training options.
        network: the symbol definition of the neural network.
        data_loader: function that returns the train and val data iterators.
        **kwargs: may carry 'arg_params'/'aux_params' (pre-loaded weights)
            and 'batch_end_callback' (extra per-batch callbacks).
    """
    # kvstore
    # kv = mx.kvstore.create(args.kv_store)
    # if args.gc_type != 'none':
    #     kv.set_gradient_compression({'type': args.gc_type,
    #                                  'threshold': args.gc_threshold})

    if args.profile_worker_suffix:
        # Per-rank profile output filename when running multi-worker.
        if hvd.size() > 1:
            filename = 'rank' + str(
                hvd.rank()) + '_' + args.profile_worker_suffix
        else:
            filename = args.profile_worker_suffix
        # mx.profiler.set_config(filename=filename, profile_all=True, profile_process='worker')
        mx.profiler.set_config(filename=filename,
                               profile_symbolic=True,
                               profile_imperative=True,
                               profile_api=False,
                               profile_process='worker')
        mx.profiler.set_state(state='run', profile_process='worker')

    # logging
    head = '%(asctime)-15s Node[' + str(hvd.rank()) + '] %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)
    logging.info('start with arguments %s', args)

    # data iterators
    (train, val) = data_loader(args, (hvd.rank(), hvd.size()))
    if args.test_io:
        # I/O-only benchmark: drain the training iterator, report throughput,
        # and skip training entirely.
        tic = time.time()
        for i, batch in enumerate(train):
            for j in batch.data:
                j.wait_to_read()
            if (i + 1) % args.disp_batches == 0:
                logging.info(
                    'Batch [%d]\tSpeed: %.2f samples/sec', i,
                    args.disp_batches * args.batch_size / (time.time() - tic))
                tic = time.time()

        return

    # load model
    if 'arg_params' in kwargs and 'aux_params' in kwargs:
        arg_params = kwargs['arg_params']
        aux_params = kwargs['aux_params']
    else:
        sym, arg_params, aux_params = _load_model(args, hvd.rank())
        if sym is not None:
            # A checkpointed symbol must match the network we are training.
            assert sym.tojson() == network.tojson()

    # save model
    checkpoint = _save_model(args, hvd.rank())

    # devices for training: one GPU per Horovod process.
    devs = [mx.gpu(hvd.local_rank())]

    # learning rate
    lr, lr_scheduler = _get_lr_scheduler(args)

    # create model
    model = mx.mod.Module(context=devs, symbol=network)

    # (removed dead no-op `lr_scheduler = lr_scheduler` from the original)
    optimizer_params = {
        'learning_rate': lr,
        'wd': args.wd,
        'lr_scheduler': lr_scheduler,
        'multi_precision': True
    }

    # Only a limited number of optimizers have 'momentum' property
    has_momentum = {'sgd', 'dcasgd', 'nag'}
    if args.optimizer in has_momentum:
        optimizer_params['momentum'] = args.mom

    monitor = mx.mon.Monitor(args.monitor,
                             pattern=".*") if args.monitor > 0 else None

    # A limited number of optimizers have a warmup period
    has_warmup = {'lbsgd', 'lbnag'}
    if args.optimizer in has_warmup:
        if hvd.size() > 1:
            nworkers = hvd.size()
        else:
            nworkers = 1
        epoch_size = args.num_examples / args.batch_size / nworkers
        if epoch_size < 1:
            epoch_size = 1
        macrobatch_size = args.macrobatch_size
        if macrobatch_size < args.batch_size * nworkers:
            macrobatch_size = args.batch_size * nworkers
        #batch_scale = round(float(macrobatch_size) / args.batch_size / nworkers +0.4999)
        batch_scale = math.ceil(
            float(macrobatch_size) / args.batch_size / nworkers)
        optimizer_params['updates_per_epoch'] = epoch_size
        optimizer_params[
            'begin_epoch'] = args.load_epoch if args.load_epoch else 0
        optimizer_params['batch_scale'] = batch_scale
        optimizer_params['warmup_strategy'] = args.warmup_strategy
        optimizer_params['warmup_epochs'] = args.warmup_epochs
        optimizer_params['num_epochs'] = args.num_epochs

    if args.initializer == 'default':
        if args.network == 'alexnet':
            # AlexNet will not converge using Xavier
            initializer = mx.init.Normal()
            # VGG will not trend to converge using Xavier-Gaussian
        elif 'vgg' in args.network:
            initializer = mx.init.Xavier()
        else:
            initializer = mx.init.Xavier(rnd_type='gaussian',
                                         factor_type="in",
                                         magnitude=2)
    # initializer   = mx.init.Xavier(factor_type="in", magnitude=2.34),
    elif args.initializer == 'xavier':
        initializer = mx.init.Xavier()
    elif args.initializer == 'msra':
        initializer = mx.init.MSRAPrelu()
    elif args.initializer == 'orthogonal':
        initializer = mx.init.Orthogonal()
    elif args.initializer == 'normal':
        initializer = mx.init.Normal()
    elif args.initializer == 'uniform':
        initializer = mx.init.Uniform()
    elif args.initializer == 'one':
        initializer = mx.init.One()
    elif args.initializer == 'zero':
        initializer = mx.init.Zero()

    # evaluation metrices
    eval_metrics = ['accuracy']
    if args.top_k > 0:
        eval_metrics.append(
            mx.metric.create('top_k_accuracy', top_k=args.top_k))

    supported_loss = ['ce', 'nll_loss']
    if len(args.loss) > 0:
        # ce or nll loss is only applicable to softmax output
        loss_type_list = args.loss.split(',')
        if 'softmax_output' in network.list_outputs():
            for loss_type in loss_type_list:
                loss_type = loss_type.strip()
                if loss_type == 'nll':
                    loss_type = 'nll_loss'
                if loss_type not in supported_loss:
                    logging.warning(loss_type + ' is not an valid loss type, only cross-entropy or ' \
                                    'negative likelihood loss is supported!')
                else:
                    eval_metrics.append(mx.metric.create(loss_type))
        else:
            logging.warning(
                "The output is not softmax_output, loss argument will be skipped!"
            )

    # callbacks that run after each batch
    batch_end_callbacks = [
        mx.callback.Speedometer(args.batch_size, args.disp_batches)
    ]
    if 'batch_end_callback' in kwargs:
        cbs = kwargs['batch_end_callback']
        batch_end_callbacks += cbs if isinstance(cbs, list) else [cbs]

    # load bytescheduler
    if os.environ.get('USE_BYTESCHEDULER') is not None and os.environ.get(
            'USE_BYTESCHEDULER') == "1":
        if args.partition:
            os.environ["BYTESCHEDULER_PARTITION"] = str(1000 * args.partition)
        if args.credit:
            os.environ["BYTESCHEDULER_CREDIT"] = str(args.credit)
        import bytescheduler.mxnet.horovod as bsc
        bsc.init()

    # Average gradients over the global batch (local batch * workers).
    optimizer_params['rescale_grad'] = 1 / (args.batch_size * hvd.size())
    # horovod wrapper, must create optimizer explicitly
    opt = mx.optimizer.create(args.optimizer, sym=network, **optimizer_params)
    opt = hvd.DistributedOptimizer(opt)

    # horovod: better to explicitly init; broadcast rank-0 parameters so all
    # workers start from identical weights.
    model.bind(data_shapes=train.provide_data,
               label_shapes=train.provide_label)
    if arg_params is None and aux_params is None:
        model.init_params(initializer)
        (arg_params, aux_params) = model.get_params()
    if arg_params is not None:
        hvd.broadcast_parameters(arg_params, root_rank=0)
    if aux_params is not None:
        hvd.broadcast_parameters(aux_params, root_rank=0)
    model.set_params(arg_params=arg_params, aux_params=aux_params)

    # run
    # NOTE(review): `optimizer_params` is ignored by Module.fit when
    # `optimizer` is already an Optimizer instance (MXNet logs a warning);
    # the parameters were applied at mx.optimizer.create() above.
    model.fit(train,
              begin_epoch=args.load_epoch if args.load_epoch else 0,
              num_epoch=args.num_epochs,
              eval_data=val,
              eval_metric=eval_metrics,
              kvstore=None,
              optimizer=opt,
              optimizer_params=optimizer_params,
              batch_end_callback=batch_end_callbacks,
              epoch_end_callback=checkpoint,
              allow_missing=True,
              monitor=monitor)

    if args.profile_worker_suffix:
        # Bug fix: was state='run', which never stopped the profiler or
        # flushed its results; 'stop' ends profiling after training.
        mx.profiler.set_state(state='stop', profile_process='worker')
def run(opt, model, train_data, val_data, lr_scheduler, context, arg_params,
        aux_params, logger, **kwargs):
    """Configure the optimizer and callbacks, then launch training via fit()."""
    if opt.horovod:
        rank, num_workers = hvd.rank(), hvd.size()
    else:
        rank, num_workers = 0, 1

    optimizer_params = dict(learning_rate=opt.lr,
                            wd=opt.wd,
                            momentum=opt.momentum,
                            lr_scheduler=lr_scheduler,
                            multi_precision=False)
    if opt.horovod:
        # Average gradients over the per-worker batch; Horovod averages
        # across workers.
        optimizer_params['rescale_grad'] = 1. / opt.batch_size

    if opt.optimizer in {'sgdwfastlars'}:
        optimizer_params['lars'] = True
        optimizer_params['lars_eta'] = opt.lars_eta
        optimizer_params['lars_eps'] = opt.lars_eps
        # MLPerf compliance logging for the LARS hyper-parameters.
        mll.opt_name('sgdwfastlars')
        mll.lars_epsilon(opt.lars_eps)
        mll.lars_opt_base_learning_rate(opt.lr)
        mll.lars_opt_weight_decay(opt.wd)
        mll.lars_opt_learning_rate_warmup_epochs(opt.warmup_epochs)
        mll.lars_opt_momentum(opt.momentum)
        mll.lars_opt_end_lr(0.0001)
        mll.lars_opt_lr_decay_poly_power(2)
        mll.lars_opt_lr_decay_steps('pow2')

    if opt.horovod:
        # Map parameter indices to names, required to mask out entries for
        # weight decay.
        idx2name = {
            idx: name
            for idx, name in enumerate(model._exec_group.param_names)
        }
        optimizer = mx.optimizer.create(opt.optimizer,
                                        sym=None,
                                        param_idx2name=idx2name,
                                        **optimizer_params)
        # Horovod: wrap optimizer with DistributedOptimizer
        optimizer = hvd.DistributedOptimizer(optimizer)
    else:
        optimizer = mx.optimizer.create(opt.optimizer, **optimizer_params)

    # evaluation metric
    eval_metrics = mx.metric.Accuracy()

    epoch_end_callbacks = []

    # Per-batch reporting: under Horovod only rank 0 reports, scaled to the
    # global batch size.
    batch_end_callbacks = []
    if not opt.horovod:
        batch_end_callbacks.append(
            mx.callback.Speedometer(opt.batch_size, opt.log_interval))
    elif rank == 0:
        batch_end_callbacks.append(
            mx.callback.Speedometer(num_workers * opt.batch_size,
                                    opt.log_interval))

    # start to train model
    fit(model,
        train_data,
        eval_data=val_data,
        eval_metric=eval_metrics,
        epoch_end_callback=epoch_end_callbacks,
        batch_end_callback=batch_end_callbacks,
        kvstore='horovod' if opt.horovod else "",
        optimizer=optimizer,
        optimizer_params=optimizer_params,
        begin_epoch=0,
        num_epoch=opt.num_epochs,
        initializer=None,
        arg_params=arg_params,
        aux_params=aux_params,
        accuracy_target=opt.accuracy_target,
        allow_missing=True,
        eval_frequency=opt.eval_frequency,
        eval_offset=opt.eval_offset,
        logger=logger)