def __init__(self, flag, batch_size, use_global_stats=True, checkpoint_interval=5,
             epochs=50, learning_rate=1.e-4, momentum=0.9, weight_decay=4.e-5,
             train_OS=16, train_split='train_aug', val_split='val', resume=None,
             test_batch_size=None,
             data_root=os.path.expanduser('~/.mxnet/datasets/voc'),
             ctx=None, norm_layer=gluon.nn.BatchNorm, num_workers=4):
    """Build the VOC-Aug data loaders, the DeepLabv3+ network, the loss,
    and the SGD optimizer with a poly learning-rate schedule.

    Parameters
    ----------
    flag : stored as ``self.running_flag`` (opaque run-control value).
    batch_size : int
        Training batch size; also the eval batch size when
        ``test_batch_size`` is None.
    use_global_stats : bool
        Forwarded to the DeepLabv3+ BatchNorm configuration.
    resume : str or None
        Path to a checkpoint file to load; when None the model is
        freshly initialized.
    ctx : list of mx.Context or None
        Compute contexts; defaults to ``[mx.gpu()]``.
    """
    # FIX: the previous signature used the mutable default ``ctx=[mx.gpu()]``,
    # which is a single list shared across all calls and eagerly creates a
    # GPU context at import time. Use a None sentinel instead.
    if ctx is None:
        ctx = [mx.gpu()]
    if test_batch_size is None:
        test_batch_size = batch_size
    self.running_flag = flag
    self.checkpoint_interval = checkpoint_interval
    self.batch_size = batch_size

    # dataset and dataloader
    train_dataset = VOCAugSegmentation(root=data_root, split=train_split)
    val_dataset = VOCAugSegmentation(root=data_root, split=val_split)  # fixed typo 'val_datset'
    self.train_data = gluon.data.DataLoader(
        train_dataset, batch_size, shuffle=True, last_batch='rollover',
        num_workers=num_workers)
    self.eval_data = gluon.data.DataLoader(
        val_dataset, test_batch_size, last_batch='keep',
        num_workers=num_workers)

    # create network (21 classes = 20 VOC classes + background)
    model = DeepLabv3p(OS=train_OS, classes=21,
                       use_global_stats=use_global_stats,
                       norm_layer=norm_layer)
    print(model)

    # resume checkpoint if needed, otherwise initialize fresh parameters
    if resume is not None:
        if os.path.isfile(resume):
            model.load_parameters(resume, ctx=ctx)
        else:
            raise RuntimeError("=> no checkpoint found at '{}'".format(resume))
    else:
        model.initialize(ctx=ctx)
    self.net = DataParallelModel(model, ctx, sync=True)
    self.evaluator = DataParallelModel(SegEvalModel(model), ctx)

    # create criterion (data-parallel wrapper around the softmax CE loss)
    self.criterion = DataParallelCriterion(SoftmaxCrossEntropyLoss(), ctx,
                                           sync=True)

    # optimizer and lr scheduling: poly decay over epochs * iters-per-epoch
    self.lr_scheduler = LRScheduler(mode='poly', baselr=learning_rate,
                                    niters=len(self.train_data),
                                    nepochs=epochs)
    self.optimizer = gluon.Trainer(
        self.net.module.collect_params(), 'sgd',
        {'lr_scheduler': self.lr_scheduler, 'wd': weight_decay,
         'momentum': momentum, 'multi_precision': True})
def init_trainer(self, trainer):
    """Attach loss, LR schedule, data-parallel wrappers, and an SGD
    optimizer to *trainer* in place.

    Parameters
    ----------
    trainer : object
        Expected to expose ``config`` (dict with 'loss_function',
        'loss_function_parameters', 'learn_rate', 'optimizer'),
        ``dataloader``, and ``model.model`` — assumed from usage here;
        TODO confirm against the caller.
    """
    # Loss: 'default' selects the auxiliary-head mixed CE loss; any other
    # name is looked up as a class in gluoncv.loss and instantiated with
    # the configured keyword arguments.
    if trainer.config['loss_function'] == 'default':
        trainer.loss_function = MixSoftmaxCrossEntropyLoss(aux=True)
    else:
        trainer.loss_function = getattr(
            gluoncv.loss, trainer.config['loss_function'])(
                **trainer.config['loss_function_parameters'])
    # Poly LR decay over a hard-coded 50 epochs.
    trainer.lr_scheduler = gluoncv.utils.LRScheduler(
        mode='poly', baselr=trainer.config['learn_rate'],
        niters=len(trainer.dataloader), nepochs=50)
    trainer.model.model = DataParallelModel(trainer.model.model,
                                            self.ctx_list)
    trainer.loss_function = DataParallelCriterion(trainer.loss_function,
                                                  self.ctx_list)
    kv = mxnet.kv.create('local')
    optimizer = trainer.config['optimizer']
    # Only 'sgd' is supported; silently fall back to it for anything else.
    # FIX: replaced un-idiomatic 'if not optimizer in [...]' with 'not in'.
    if optimizer not in ('sgd',):
        optimizer = 'sgd'
    trainer.optimizer = gluon.Trainer(
        trainer.model.model.module.collect_params(), optimizer, {
            'lr_scheduler': trainer.lr_scheduler,
            'wd': 0.0001,
            'momentum': 0.9,
            'multi_precision': True
        },
        kvstore=kv)
def __init__(self, args):
    """Set up datasets, the segmentation network, loss, and optimizer
    from the parsed command-line ``args``.

    Parameters
    ----------
    args : argparse.Namespace
        Expected fields include dataset, batch_size, test_batch_size,
        workers, model, backbone, norm_layer, norm_kwargs, aux, ctx,
        syncbn, resume, lr, epochs, kvstore, weight_decay, momentum —
        assumed from usage here.
    """
    self.args = args
    # image transform: tensor conversion + ImageNet mean/std normalization
    input_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
    ])
    # dataset and dataloader
    trainset = get_segmentation_dataset(
        args.dataset, split='train', transform=input_transform)
    valset = get_segmentation_dataset(
        args.dataset, split='val', transform=input_transform)
    self.train_data = gluon.data.DataLoader(
        trainset, args.batch_size, shuffle=True, last_batch='rollover',
        num_workers=args.workers)
    self.eval_data = gluon.data.DataLoader(valset, args.test_batch_size,
                                           last_batch='keep',
                                           num_workers=args.workers)
    # create network
    model = get_segmentation_model(model=args.model, dataset=args.dataset,
                                   backbone=args.backbone,
                                   norm_layer=args.norm_layer,
                                   aux=args.aux,
                                   norm_kwargs=args.norm_kwargs)
    # model.hybridize(static_alloc=True, static_shape=True)
    print(model)
    self.net = DataParallelModel(model, args.ctx, args.syncbn)
    self.evaluator = DataParallelModel(SegEvalModel(model), args.ctx)
    # resume checkpoint if needed
    if args.resume is not None:
        if os.path.isfile(args.resume):
            # FIX: load_parameters replaces the deprecated load_params,
            # matching the other trainer classes in this file.
            model.load_parameters(args.resume, ctx=args.ctx)
        else:
            raise RuntimeError("=> no checkpoint found at '{}'"
                               .format(args.resume))
    # create criterion
    criterion = SoftmaxCrossEntropyLossWithAux(args.aux)
    self.criterion = DataParallelCriterion(criterion, args.ctx,
                                           args.syncbn)
    # optimizer and lr scheduling (poly decay over the full training run)
    self.lr_scheduler = LRScheduler(mode='poly', baselr=args.lr,
                                    niters=len(self.train_data),
                                    nepochs=args.epochs)
    kv = mx.kv.create(args.kvstore)
    self.optimizer = gluon.Trainer(self.net.module.collect_params(), 'sgd',
                                   {'lr_scheduler': self.lr_scheduler,
                                    'wd': args.weight_decay,
                                    'momentum': args.momentum,
                                    'multi_precision': True},
                                   kvstore=kv)
def test_net_sync(net, criterion, sync, nDevices):
    """Smoke-test a network and loss under the data-parallel wrappers.

    Wraps *net* and *criterion* over ``nDevices`` CPU contexts, then runs
    100 forward/backward iterations in training mode followed by 100
    forward-only iterations in evaluation mode.
    """
    n_iters = 100
    devices = [mx.cpu(0) for _ in range(nDevices)]
    parallel_net = DataParallelModel(net, devices, sync=sync)
    parallel_criterion = DataParallelCriterion(criterion, devices, sync=sync)
    # train mode: forward, loss, and backward under autograd recording
    for _ in range(n_iters):
        batch = mx.random.uniform(shape=(8, 1, 28, 28))
        labels = nd.ones(shape=(8))
        with autograd.record():
            outputs = parallel_net(batch)
            losses = parallel_criterion(outputs, labels)
            autograd.backward(losses)
    # evaluation mode: forward pass only, no gradient recording
    for _ in range(n_iters):
        batch = mx.random.uniform(shape=(8, 1, 28, 28))
        parallel_net(batch)
def test(args):
    """Run evaluation or mask prediction over the test split.

    When ``args.eval`` is set, accumulates pixAcc/mIoU via
    SegmentationMetric; otherwise writes color-palette PNG masks to the
    output folder. ``args.tta`` switches to multi-scale evaluation.
    """
    # output folder for predicted masks (only used in the non-eval branch)
    outdir = 'train_logs/outdir'
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    # dataset and dataloader; ms_batchify_fn is needed for multi-scale
    # (TTA) inputs whose per-image sizes differ
    testset = get_custom_segm_dataset("test", args)
    test_data = gluon.data.DataLoader(
        testset, args.test_batch_size, shuffle=False, last_batch='keep',
        batchify_fn=ms_batchify_fn if args.tta else None,
        num_workers=args.workers)
    # create network: either a pretrained model-zoo model (checkpoint
    # optional) or a freshly built model (checkpoint required)
    if args.model_zoo is not None:
        model = get_pretrained_segmentation_model(args)
        if args.resume is not None:
            resume_checkpoint(model, args)
            print("loading checkpoint from %s for testing" % args.resume)
    else:
        model = get_segmentation_model(model=args.model,
                                       dataset=args.dataset,
                                       ctx=args.ctx,
                                       backbone=args.backbone,
                                       norm_layer=args.norm_layer,
                                       norm_kwargs=args.norm_kwargs,
                                       aux=args.aux,
                                       base_size=args.base_size,
                                       crop_size=args.crop_size)
        # load pretrained weight
        assert args.resume is not None, '=> Please provide the checkpoint using --resume'
        resume_checkpoint(model, args)
    # print(model)
    # TTA path evaluates at multiple scales; otherwise a plain
    # data-parallel single-scale evaluator is used
    if args.tta:
        evaluator = MultiEvalModel(model, testset.num_class,
                                   ctx_list=args.ctx,
                                   scales=[0.75, 1.0, 1.25, 1.5, 1.75])
    else:
        evaluator = DataParallelModel(SegEvalModel(model), args.ctx)
    metric = gluoncv.utils.metrics.SegmentationMetric(testset.num_class)

    tbar = tqdm(test_data)
    # dsts is either ground-truth labels (eval mode) or image paths
    # (prediction mode) depending on the dataset's test split
    for i, (data, dsts) in enumerate(tbar):
        if args.eval:
            if args.tta:
                # multi-scale forward; each prediction gets a batch dim
                # so metric.update sees 4-D tensors
                predicts = [
                    pred[0].expand_dims(0)
                    for pred in evaluator.parallel_forward(data)
                ]
                targets = [target.as_in_context(predicts[0].context).expand_dims(0) \
                           for target in dsts]
            else:
                data = data.astype(args.dtype, copy=False)
                predicts = evaluator(data)
                predicts = [x[0] for x in predicts]
                if args.test_flip:
                    # horizontal-flip TTA: average predictions of the
                    # original and width-flipped (axis 3) inputs
                    assert (data.ndim == 4)
                    fdata = data.flip(3)
                    fpredicts = evaluator(fdata)
                    predicts = [(x + y[0].flip(3)) / 2
                                for x, y in zip(predicts, fpredicts)]
                targets = mx.gluon.utils.split_and_load(dsts, args.ctx,
                                                        even_split=False)
            metric.update(targets, predicts)
            pixAcc, mIoU = metric.get()
            tbar.set_description('pixAcc: %.4f, mIoU: %.4f'
                                 % (pixAcc, mIoU))
            # block until all async GPU work for this batch completes
            mx.nd.waitall()
        else:
            # prediction mode: dsts carries the source image paths
            im_paths = dsts
            predicts = evaluator.parallel_forward(data)
            for predict, impath in zip(predicts, im_paths):
                # argmax over the class channel, then shift by the
                # dataset's prediction offset for the palette
                predict = mx.nd.squeeze(mx.nd.argmax(predict[0], 1)).asnumpy() + \
                    testset.pred_offset
                mask = get_color_pallete(predict, args.dataset)
                outname = os.path.splitext(impath)[0] + '.png'
                mask.save(os.path.join(outdir, outname))
def __init__(self, args):
    """Build datasets, model, loss, LR schedule, optimizer, and metric
    from the parsed command-line ``args``.
    """
    self.args = args
    # image transform: tensor conversion + ImageNet mean/std normalization
    input_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
    ])
    # dataset and dataloader
    data_kwargs = {
        'transform': input_transform,
        'base_size': args.base_size,
        'crop_size': args.crop_size,
        'root': args.dataset_dir
    }
    trainset = get_segmentation_dataset(args.dataset,
                                        split=args.train_split,
                                        mode='train', **data_kwargs)
    valset = get_segmentation_dataset(args.dataset, split='val',
                                      mode='val', **data_kwargs)
    self.train_data = gluon.data.DataLoader(trainset, args.batch_size,
                                            shuffle=True,
                                            last_batch='rollover',
                                            num_workers=args.workers)
    self.eval_data = gluon.data.DataLoader(valset, args.test_batch_size,
                                           last_batch='rollover',
                                           num_workers=args.workers)
    # create network: pretrained model-zoo model or freshly built one
    if args.model_zoo is not None:
        model = get_model(args.model_zoo, pretrained=True)
    else:
        model = get_segmentation_model(model=args.model,
                                       dataset=args.dataset,
                                       backbone=args.backbone,
                                       norm_layer=args.norm_layer,
                                       norm_kwargs=args.norm_kwargs,
                                       aux=args.aux,
                                       crop_size=args.crop_size)
    # cast parameters to the training dtype (e.g. float16)
    model.cast(args.dtype)
    print(model)
    self.net = DataParallelModel(model, args.ctx, args.syncbn)
    self.evaluator = DataParallelModel(SegEvalModel(model), args.ctx)
    # resume checkpoint if needed
    if args.resume is not None:
        if os.path.isfile(args.resume):
            model.load_parameters(args.resume, ctx=args.ctx)
        else:
            raise RuntimeError("=> no checkpoint found at '{}'".format(
                args.resume))
    # create criterion (mixed CE loss with optional auxiliary head)
    criterion = MixSoftmaxCrossEntropyLoss(args.aux,
                                           aux_weight=args.aux_weight)
    self.criterion = DataParallelCriterion(criterion, args.ctx,
                                           args.syncbn)
    # optimizer and lr scheduling: poly decay with power 0.9
    self.lr_scheduler = LRScheduler(mode='poly', base_lr=args.lr,
                                    nepochs=args.epochs,
                                    iters_per_epoch=len(self.train_data),
                                    power=0.9)
    kv = mx.kv.create(args.kvstore)
    optimizer_params = {
        'lr_scheduler': self.lr_scheduler,
        'wd': args.weight_decay,
        'momentum': args.momentum
    }
    # float16 training needs a float32 master copy of the weights
    if args.dtype == 'float16':
        optimizer_params['multi_precision'] = True
    # optionally exempt BatchNorm params and biases from weight decay
    if args.no_wd:
        for k, v in self.net.module.collect_params(
                '.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0
    self.optimizer = gluon.Trainer(self.net.module.collect_params(), 'sgd',
                                   optimizer_params, kvstore=kv)
    # evaluation metrics
    self.metric = gluoncv.utils.metrics.SegmentationMetric(
        trainset.num_class)