def batch_fn(batch, ctx):
    if opt.num_segments > 1:
        data = split_and_load(batch[0], ctx_list=ctx, batch_axis=0,
                              even_split=False, multiplier=opt.num_segments)
    else:
        data = split_and_load(batch[0], ctx_list=ctx, batch_axis=0,
                              even_split=False)
    label = split_and_load(batch[1], ctx_list=ctx, batch_axis=0,
                           even_split=False)
    return data, label
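# A minimal standalone sketch of the splitting step these helpers rely on, using
# mxnet.gluon.utils.split_and_load (the variant used above additionally accepts a
# `multiplier` argument); the contexts and shapes here are only illustrative.
import mxnet as mx
from mxnet.gluon.utils import split_and_load

batch = mx.nd.arange(8 * 3).reshape((8, 3))   # toy batch of 8 samples
ctx = [mx.cpu(0), mx.cpu(1)]                  # stand-ins for a GPU context list
parts = split_and_load(batch, ctx_list=ctx, batch_axis=0, even_split=False)
print([p.shape for p in parts])               # [(4, 3), (4, 3)]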
def test(ctx, val_data):
    acc_top1.reset()
    acc_top5.reset()
    L = gluon.loss.SoftmaxCrossEntropyLoss()
    num_test_iter = len(val_data)
    val_loss_epoch = 0
    for i, batch in enumerate(val_data):
        data = split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        label = split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        val_outputs = []
        for _, X in enumerate(data):
            # X = X.reshape((-1,) + X.shape[2:])
            # X = X.reshape((-1, 15) + X.shape[-2:])
            X = X.reshape((-3, -3, -2))
            pred = net(X)
            val_outputs.append(pred)
        loss = [L(yhat, y) for yhat, y in zip(val_outputs, label)]
        acc_top1.update(label, val_outputs)
        acc_top5.update(label, val_outputs)
        val_loss_epoch += sum([l.mean().asscalar() for l in loss]) / len(loss)
    _, top1 = acc_top1.get()
    _, top5 = acc_top5.get()
    val_loss = val_loss_epoch / num_test_iter
    return (top1, top5, val_loss)
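# A toy example of MXNet's special reshape codes used above: -3 merges the next
# two input dimensions into one, and -2 copies all remaining dimensions.
# The input shape (10, 8, 5, 3, 224, 224) is only illustrative.
import mxnet as mx

x = mx.nd.zeros((10, 8, 5, 3, 224, 224))
y = x.reshape((-3, -3, -2))
print(y.shape)  # (80, 15, 224, 224)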
def test(ctx, val_data):
    acc_top1.reset()
    acc_top5.reset()
    L = gluon.loss.SoftmaxCrossEntropyLoss()
    num_test_iter = len(val_data)
    val_loss_epoch = 0
    for i, batch in enumerate(val_data):
        data_bgs = split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        data_fgs = split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        label = split_and_load(batch[2], ctx_list=ctx, batch_axis=0)
        val_outputs = []
        for _, (X_bgs, X_fgs) in enumerate(zip(data_bgs, data_fgs)):
            # print('X_bgs', X_bgs.shape)  # (10, 8, 3, 224, 224)
            # print('X_fgs', X_fgs.shape)  # (10, 8, 3, 224, 224)
            X_bgs = X_bgs.reshape((-1,) + X_bgs.shape[2:])
            X_fgs = X_fgs.reshape((-1,) + X_fgs.shape[2:])
            # print('X_bgs', X_bgs.shape)  # (80, 3, 224, 224)
            # print('X_fgs', X_fgs.shape)  # (80, 3, 224, 224)
            pred = net(X_bgs, X_fgs)
            val_outputs.append(pred)
        loss = [L(yhat, y) for yhat, y in zip(val_outputs, label)]
        acc_top1.update(label, val_outputs)
        acc_top5.update(label, val_outputs)
        val_loss_epoch += sum([l.mean().asscalar() for l in loss]) / len(loss)
    _, top1 = acc_top1.get()
    _, top5 = acc_top5.get()
    val_loss = val_loss_epoch / num_test_iter
    return (top1, top5, val_loss)
def _get_data_and_label(self, batch, ctx, batch_axis=0):
    data = batch[0]
    gt_bboxes = batch[-1]
    data = split_and_load(data, ctx_list=ctx, batch_axis=batch_axis)
    targets = list(zip(*[split_and_load(batch[i], ctx_list=ctx, batch_axis=batch_axis)
                         for i in range(1, len(batch) - 1)]))
    gt_bboxes = split_and_load(gt_bboxes, ctx_list=ctx, batch_axis=batch_axis)
    return data, targets, gt_bboxes
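# A small illustration of the zip(*...) pattern above: it turns a per-target list
# of per-device splits into per-device tuples of targets (the names are hypothetical).
per_target = [["t1_dev0", "t1_dev1"],   # splits of target 1 across two devices
              ["t2_dev0", "t2_dev1"]]   # splits of target 2 across two devices
per_device = list(zip(*per_target))
print(per_device)  # [('t1_dev0', 't2_dev0'), ('t1_dev1', 't2_dev1')]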
def train_batch_fn(data, ctx):
    """Split the batch and load it onto the GPUs."""
    template = split_and_load(data[0], ctx_list=ctx, batch_axis=0)
    search = split_and_load(data[1], ctx_list=ctx, batch_axis=0)
    label_cls = split_and_load(data[2], ctx_list=ctx, batch_axis=0)
    label_loc = split_and_load(data[3], ctx_list=ctx, batch_axis=0)
    label_loc_weight = split_and_load(data[4], ctx_list=ctx, batch_axis=0)
    return template, search, label_cls, label_loc, label_loc_weight
def batch_fn(batch, ctx):
    data = split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
    label = split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False)
    return data, label
def test(ctx, val_data):
    acc_top1.reset()
    acc_top5.reset()
    L = gluon.loss.SoftmaxCrossEntropyLoss()
    num_test_iter = len(val_data)
    val_loss_epoch = 0
    for i, batch in enumerate(val_data):
        data_bgs = split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        data_fgs = split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        label = split_and_load(batch[2], ctx_list=ctx, batch_axis=0)
        val_outputs = []
        for _, (X_bgs, X_fgs) in enumerate(zip(data_bgs, data_fgs)):
            # print('X_bgs', X_bgs.shape)  # (10, 8, 3, 224, 224)
            # print('X_fgs', X_fgs.shape)  # (10, 8, 3, 224, 224)
            X_bgs = X_bgs.reshape((-1,) + X_bgs.shape[2:])
            X_fgs = X_fgs.reshape((-1,) + X_fgs.shape[2:])
            # print('X_bgs', X_bgs.shape)  # (80, 3, 224, 224)
            # print('X_fgs', X_fgs.shape)  # (80, 3, 224, 224)
            x_bgs = net_bgs(X_bgs)
            x_fgs = net_fgs(X_fgs)
            if opt.fusion_method == 'avg':
                x = nd.stack(x_bgs, x_fgs)
                x = nd.mean(x, axis=0)
            elif opt.fusion_method == 'max':
                x = nd.stack(x_bgs, x_fgs)
                x = nd.max(x, axis=0)
            elif opt.fusion_method == 'bgs':
                x = x_bgs
            elif opt.fusion_method == 'fgs':
                x = x_fgs
            else:
                raise ValueError('fusion_method not supported')
            pred = x
            # pred = net(X_bgs, X_fgs)
            val_outputs.append(pred)
        loss = [L(yhat, y) for yhat, y in zip(val_outputs, label)]
        acc_top1.update(label, val_outputs)
        acc_top5.update(label, val_outputs)
        val_loss_epoch += sum([l.mean().asscalar() for l in loss]) / len(loss)
    _, top1 = acc_top1.get()
    _, top5 = acc_top5.get()
    val_loss = val_loss_epoch / num_test_iter
    return (top1, top5, val_loss)
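# A toy check of the 'avg' and 'max' fusion branches above: nd.stack adds a new
# leading axis, so reducing over axis 0 fuses the two streams element-wise.
# The prediction values are made up.
from mxnet import nd

x_bgs = nd.array([[1.0, 3.0]])
x_fgs = nd.array([[3.0, 1.0]])
print(nd.mean(nd.stack(x_bgs, x_fgs), axis=0))  # [[2. 2.]]
print(nd.max(nd.stack(x_bgs, x_fgs), axis=0))   # [[3. 3.]]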
def _validation(self):
    """validation"""
    cudnn_auto_tune(False)
    tbar = tqdm(self.val_iter)
    for i, (data, label) in enumerate(tbar):
        gpu_datas = split_and_load(data=data, ctx_list=self.ctx, even_split=False)
        gpu_labels = split_and_load(data=label, ctx_list=self.ctx, even_split=False)
        for gpu_data, gpu_label in zip(gpu_datas, gpu_labels):
            self.metric.update(gpu_label, self.net.evaluate(gpu_data))
        tbar.set_description('pixAcc: %.4f, mIoU: %.4f' % self.metric.get())
    pixel_acc, mean_iou = self.metric.get()
    self.metric.reset()
    cudnn_auto_tune(True)
    return pixel_acc, mean_iou
def evaluate_batch(self, estimator, val_batch, batch_axis=0):
    """Evaluate the estimator model on a batch of validation data.

    Parameters
    ----------
    estimator : Estimator
        Reference to the estimator
    val_batch : tuple
        Data and label of a batch from the validation data loader.
    batch_axis : int, default 0
        Batch axis to split the validation data into devices.
    """
    data = split_and_load(val_batch[0], ctx_list=estimator.context,
                          batch_axis=0, even_split=False)
    label = split_and_load(val_batch[1], ctx_list=estimator.context,
                           batch_axis=0, even_split=False)
    mx.nd.waitall()

    det_bboxes = []
    det_ids = []
    det_scores = []
    gt_bboxes = []
    gt_ids = []
    gt_difficults = []
    for x, y in zip(data, label):
        # get prediction results
        with autograd.predict_mode():
            ids, scores, bboxes = estimator.val_net(x)
        det_ids.append(ids.copy().asnumpy())
        det_scores.append(scores.copy().asnumpy())
        # clip to image size
        det_bboxes.append(bboxes.clip(0, val_batch[0].shape[2]).copy().asnumpy())
        # split ground truths
        gt_ids.append(y.slice_axis(axis=-1, begin=4, end=5).copy().asnumpy())
        gt_bboxes.append(y.slice_axis(axis=-1, begin=0, end=4).copy().asnumpy())
        gt_difficults.append(y.slice_axis(axis=-1, begin=5, end=6).copy().asnumpy()
                             if y.shape[-1] > 5 else None)

    # pred = [estimator.val_net(x) for x in data]
    # loss = [estimator.val_loss(y_hat, y) for y_hat, y in zip(pred, label)]
    pred = (det_bboxes, det_ids, det_scores)
    label = (gt_bboxes, gt_ids, gt_difficults)
    return data, label, pred, 0.
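# A toy illustration of the slice_axis calls above, assuming the ground-truth labels
# are laid out as [xmin, ymin, xmax, ymax, class_id, difficult] along the last axis;
# the numbers are made up.
import mxnet as mx

y = mx.nd.array([[[10, 20, 110, 120, 3, 0]]])         # (batch, num_objects, 6)
gt_bbox = y.slice_axis(axis=-1, begin=0, end=4)        # (1, 1, 4) box coordinates
gt_id = y.slice_axis(axis=-1, begin=4, end=5)          # (1, 1, 1) class ids
gt_difficult = y.slice_axis(axis=-1, begin=5, end=6)   # (1, 1, 1) difficult flags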
def fit(run, ctx, log_interval=5, no_val=False, logger=None):
    net = FitFactory.get_model(wandb.config, ctx)
    train_iter, num_train = FitFactory.data_iter(
        wandb.config.data_name,
        wandb.config.bs_train,
        root=get_dataset_info(wandb.config.data_name)[0],
        split='train',  # sometimes would be 'trainval'
        mode='train',
        base_size=wandb.config.base_size,
        crop_size=wandb.config.crop_size)
    val_iter, num_valid = FitFactory.data_iter(
        wandb.config.data_name,
        wandb.config.bs_val,
        shuffle=False,
        last_batch='keep',
        root=get_dataset_info(wandb.config.data_name)[0],
        split='val',
        base_size=wandb.config.base_size,
        crop_size=wandb.config.crop_size)
    criterion = FitFactory.get_criterion(
        wandb.config.aux,
        wandb.config.aux_weight,
        # focal_kwargs={'alpha': 1.0, 'gamma': 0.5},
        # sensitive_kwargs={
        #     'nclass': get_dataset_info(wandb.config.data_name)[1],
        #     'alpha': 1.0,
        #     'gamma': 1.0}
    )
    trainer = FitFactory.create_trainer(net, wandb.config, iters_per_epoch=len(train_iter))
    metric = SegmentationMetric(nclass=get_dataset_info(wandb.config.data_name)[1])

    wandb.config.num_train = num_train
    wandb.config.num_valid = num_valid

    t_start = get_strftime()
    logger.info(f'Training start: {t_start}')
    for k, v in wandb.config.items():
        logger.info(f'{k}: {v}')
    logger.info('-----> end hyper-parameters <-----')
    wandb.config.start_time = get_strftime()

    best_score = .0
    for epoch in range(wandb.config.epochs):
        train_loss = .0
        tbar = tqdm(train_iter)
        for i, (data, target) in enumerate(tbar):
            gpu_datas = split_and_load(data, ctx_list=ctx)
            gpu_targets = split_and_load(target, ctx_list=ctx)
            with autograd.record():
                loss_gpus = [
                    criterion(*net(gpu_data), gpu_target)
                    for gpu_data, gpu_target in zip(gpu_datas, gpu_targets)
                ]
            for loss in loss_gpus:
                autograd.backward(loss)
            trainer.step(wandb.config.bs_train)
            nd.waitall()
            loss_temp = .0  # sum up all sample loss
            for loss in loss_gpus:
                loss_temp += loss.sum().asscalar()
            train_loss += (loss_temp / wandb.config.bs_train)
            tbar.set_description('Epoch %d, training loss %.5f' % (epoch, train_loss / (i + 1)))
            if (i % log_interval == 0) or (i + 1 == len(train_iter)):
                wandb.log({f'train_loss_batch, interval={log_interval}': train_loss / (i + 1)})
        wandb.log({
            'train_loss_epoch': train_loss / (len(train_iter) + 1),
            'custom_step': epoch
        })

        if not no_val:
            cudnn_auto_tune(False)
            val_loss = .0
            vbar = tqdm(val_iter)
            for i, (data, target) in enumerate(vbar):
                gpu_datas = split_and_load(data=data, ctx_list=ctx, even_split=False)
                gpu_targets = split_and_load(data=target, ctx_list=ctx, even_split=False)
                loss_temp = .0
                for gpu_data, gpu_target in zip(gpu_datas, gpu_targets):
                    loss_gpu = criterion(*net(gpu_data), gpu_target)
                    loss_temp += loss_gpu.sum().asscalar()
                    metric.update(gpu_target, net.evaluate(gpu_data))
                vbar.set_description('Epoch %d, val PA %.4f, mIoU %.4f'
                                     % (epoch, metric.get()[0], metric.get()[1]))
                val_loss += (loss_temp / wandb.config.bs_val)
            nd.waitall()
            pix_acc, mean_iou = metric.get()
            wandb.log({
                'val_PA': pix_acc,
                'val_mIoU': mean_iou,
                'val_loss': val_loss / (len(val_iter) + 1)
            })
            metric.reset()
            if mean_iou > best_score:
                save_checkpoint(model=net,
                                model_name=wandb.config.model_name.lower(),
                                backbone=wandb.config.backbone.lower(),
                                data_name=wandb.config.data_name.lower(),
                                time_stamp=wandb.config.start_time,
                                is_best=True)
                best_score = mean_iou
            cudnn_auto_tune(True)

    save_checkpoint(model=net,
                    model_name=wandb.config.model_name.lower(),
                    backbone=wandb.config.backbone.lower(),
                    data_name=wandb.config.data_name.lower(),
                    time_stamp=wandb.config.start_time,
                    is_best=False)
    run.finish()
lr_decay_count = 0
for epoch in range(epochs):
    tic = time.time()
    train_metric.reset()
    train_loss = 0

    # Learning rate decay
    if epoch == lr_decay_epoch[lr_decay_count]:
        trainer.set_learning_rate(trainer.learning_rate * lr_decay)
        lr_decay_count += 1

    # Loop through each batch of training data
    for i, batch in enumerate(train_data):
        # Extract data and label
        data = split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        label = split_and_load(batch[1], ctx_list=ctx, batch_axis=0)

        # AutoGrad
        with ag.record():
            output = []
            for _, X in enumerate(data):
                X = X.reshape((-1,) + X.shape[2:])
                pred = net(X)
                output.append(pred)
            loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]

        # Backpropagation
        for l in loss:
            l.backward()
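# A toy example of the reshape used above: it folds the segment axis into the batch
# axis, turning (N, T, C, H, W) clips into (N*T, C, H, W) frames for a 2D network.
# The concrete shape is only illustrative.
import mxnet as mx

X = mx.nd.zeros((10, 8, 3, 224, 224))
X = X.reshape((-1,) + X.shape[2:])
print(X.shape)  # (80, 3, 224, 224)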
def train(cfg, ctx_lst, project_name, log_interval=5, no_val=False, lr=None, wd=None):
    wandb.init(job_type='train', dir=my_tools.root_dir(), config=cfg, project=project_name)
    if lr and wd:
        wandb.config.lr = lr
        wandb.config.wd = wd

    ctx = my_tools.get_contexts(ctx_lst)
    wandb.config.ctx = ctx

    data_factory = DataFactory(wandb.config.data_name)
    model_factory = ModelFactory(wandb.config.model_name)

    norm_layer, norm_kwargs = my_tools.get_norm_layer(wandb.config.norm, len(ctx))
    model_kwargs = {
        'nclass': data_factory.num_class,
        'backbone': wandb.config.backbone,
        'pretrained_base': wandb.config.backbone_init.get('manner') == 'cls',
        'aux': wandb.config.aux,
        'crop_size': wandb.config.crop_size,
        'base_size': wandb.config.base_size,
        'dilate': wandb.config.dilate,
        'norm_layer': norm_layer,
        'norm_kwargs': norm_kwargs,
    }
    net = model_factory.get_model(
        model_kwargs,
        resume=wandb.config.resume,
        lr_mult=wandb.config.lr_mult,
        backbone_init_manner=wandb.config.backbone_init.get('manner'),
        backbone_ckpt=wandb.config.backbone_init.get('backbone_ckpt'),
        prior_classes=wandb.config.backbone_init.get('prior_classes'),
        ctx=ctx)

    if net.symbolize:
        net.hybridize()

    num_worker = 0 if platform.system() == 'Windows' else 16
    train_set = data_factory.seg_dataset(
        split='train',  # sometimes would be 'trainval'
        mode='train',
        transform=my_tools.image_transform(),
        base_size=wandb.config.base_size,
        crop_size=wandb.config.crop_size)
    train_iter = DataLoader(train_set,
                            wandb.config.bs_train,
                            shuffle=True,
                            last_batch='discard',
                            num_workers=num_worker)
    val_set = data_factory.seg_dataset(
        split='val',
        mode='val',
        transform=my_tools.image_transform(),
        base_size=wandb.config.base_size,
        crop_size=wandb.config.crop_size)
    val_iter = DataLoader(val_set,
                          wandb.config.bs_val,
                          shuffle=False,
                          last_batch='keep',
                          num_workers=num_worker)
    wandb.config.num_train = len(train_set)
    wandb.config.num_valid = len(val_set)

    criterion = _get_criterion(wandb.config.aux, wandb.config.aux_weight)
    criterion.initialize(ctx=ctx)
    wandb.config.criterion = type(criterion)

    if wandb.config.optimizer == 'adam':
        trainer = Trainer(net.collect_params(),
                          'adam',
                          optimizer_params={
                              'learning_rate': wandb.config.lr,
                              'wd': wandb.config.wd,
                              'beta1': wandb.config.adam.get('adam_beta1'),
                              'beta2': wandb.config.adam.get('adam_beta2')
                          })
    elif wandb.config.optimizer in ('sgd', 'nag'):
        scheduler = _lr_scheduler(
            mode=wandb.config.lr_scheduler,
            base_lr=wandb.config.lr,
            target_lr=wandb.config.target_lr,
            nepochs=wandb.config.epochs,
            iters_per_epoch=len(train_iter),
            step_epoch=wandb.config.step.get('step_epoch'),
            step_factor=wandb.config.step.get('step_factor'),
            power=wandb.config.poly.get('power'))
        trainer = Trainer(net.collect_params(),
                          wandb.config.optimizer,
                          optimizer_params={
                              'lr_scheduler': scheduler,
                              'wd': wandb.config.wd,
                              'momentum': wandb.config.momentum,
                              'multi_precision': True
                          })
    else:
        raise RuntimeError(f"Unknown optimizer: {wandb.config.optimizer}")

    metric = SegmentationMetric(data_factory.num_class)

    logger = get_logger(name='train', level=10)
    t_start = my_tools.get_strftime()
    logger.info(f'Training start: {t_start}')
    for k, v in wandb.config.items():
        logger.info(f'{k}: {v}')
    logger.info('-----> end hyper-parameters <-----')
    wandb.config.start_time = t_start

    best_score = .0
    best_epoch = 0
    for epoch in range(wandb.config.epochs):
        train_loss = .0
        tbar = tqdm(train_iter)
        for i, (data, target) in enumerate(tbar):
            gpu_datas = split_and_load(data, ctx_list=ctx)
            gpu_targets = split_and_load(target, ctx_list=ctx)
            with autograd.record():
                loss_gpus = [
                    criterion(*net(gpu_data), gpu_target)
                    for gpu_data, gpu_target in zip(gpu_datas, gpu_targets)
                ]
            for loss in loss_gpus:
                autograd.backward(loss)
            trainer.step(wandb.config.bs_train)
            nd.waitall()
            train_loss += sum([loss.mean().asscalar() for loss in loss_gpus]) / len(loss_gpus)
            tbar.set_description(
                'Epoch-%d [training], loss %.5f, %s'
                % (epoch, train_loss / (i + 1), my_tools.get_strftime('%Y-%m-%d %H:%M:%S')))
            if (i % log_interval == 0) or (i + 1 == len(train_iter)):
                wandb.log({f'train_loss_batch, interval={log_interval}': train_loss / (i + 1)})
        wandb.log({
            'train_loss_epoch': train_loss / len(train_iter),
            'custom_step': epoch
        })

        if not no_val:
            val_loss = .0
            vbar = tqdm(val_iter)
            for i, (data, target) in enumerate(vbar):
                gpu_datas = split_and_load(data=data, ctx_list=ctx, even_split=False)
                gpu_targets = split_and_load(data=target, ctx_list=ctx, even_split=False)
                loss_gpus = []
                for gpu_data, gpu_target in zip(gpu_datas, gpu_targets):
                    gpu_output = net(gpu_data)
                    loss_gpus.append(criterion(*gpu_output, gpu_target))
                    metric.update(gpu_target, gpu_output[0])
                val_loss += sum([loss.mean().asscalar() for loss in loss_gpus]) / len(loss_gpus)
                vbar.set_description(
                    'Epoch-%d [validation], PA %.4f, mIoU %.4f'
                    % (epoch, metric.get()[0], metric.get()[1]))
            nd.waitall()
            pix_acc, mean_iou = metric.get()
            wandb.log({
                'val_PA': pix_acc,
                'val_mIoU': mean_iou,
                'val_loss': val_loss / len(val_iter),
                'custom_step': epoch
            })
            metric.reset()
            if mean_iou > best_score:
                my_tools.save_checkpoint(
                    model=net,
                    model_name=wandb.config.model_name.lower(),
                    backbone=wandb.config.backbone.lower(),
                    data_name=wandb.config.data_name.lower(),
                    time_stamp=wandb.config.start_time,
                    is_best=True)
                best_score = mean_iou
                best_epoch = epoch

    logger.info(f'Best val mIoU={round(best_score * 100, 2)} at epoch: {best_epoch}')
    wandb.config.best_epoch = best_epoch
    my_tools.save_checkpoint(model=net,
                             model_name=wandb.config.model_name.lower(),
                             backbone=wandb.config.backbone.lower(),
                             data_name=wandb.config.data_name.lower(),
                             time_stamp=wandb.config.start_time,
                             is_best=False)
def train(args):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment.
    ctx = [mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()]

    # Retrieve the hyperparameters we set in the notebook (with some defaults).
    # Number of training examples utilized in one iteration.
    batch_size = args.batch_size
    # Number of times the entire dataset is passed forward and backward through the network.
    epochs = args.epochs
    # Step size at each iteration while moving toward a minimum of the loss function.
    learning_rate = args.learning_rate
    # Momentum remembers the update Δw at each iteration and determines the next update as a
    # linear combination of the gradient and the previous update.
    momentum = args.momentum
    # The optimizer adjusts the attributes of the network, such as weights and learning rate,
    # in order to reduce the loss.
    optimizer = args.optimizer
    # Weight decay: after each update, the weights are multiplied by a factor slightly less than 1.
    wd = args.wd

    optimizer_params = {'learning_rate': learning_rate, 'wd': wd, 'momentum': momentum}
    log_interval = args.log_interval

    # In this example, we use an Inflated 3D (I3D) model with a ResNet50 backbone trained on the
    # Kinetics400 dataset, and replace the last classification (dense) layer with one matching
    # the number of classes in our dataset.
    model_name = args.network
    # Number of classes in the dataset.
    nclass = 2
    # Number of workers for the data loader.
    num_workers = 1

    current_host = args.current_host
    hosts = args.hosts
    model_dir = args.model_dir
    CHECKPOINTS_DIR = '/opt/ml/checkpoints'
    checkpoints_enabled = os.path.exists(CHECKPOINTS_DIR)

    data_dir = args.train
    segments = 'rawframes'
    train = 'annotations/{}_train_list_rawframes.txt'.format(args.task)

    # Load the data with the data loader.
    train_data = load_data(data_dir, batch_size, num_workers, segments, train)

    # Define the network.
    net = define_network(ctx, model_name, nclass)
    # Define the Gluon trainer.
    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
    # Define the loss function.
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
    # Define the training metric.
    train_metric = mx.metric.Accuracy()
    train_history = TrainingHistory(['training-acc'])
    net.hybridize()

    # Learning rate decay hyperparameters.
    lr_decay_count = 0
    lr_decay = 0.1
    lr_decay_epoch = [40, 80, 100]

    for epoch in range(epochs):
        tic = time.time()
        train_metric.reset()
        train_loss = 0

        # Learning rate decay
        if epoch == lr_decay_epoch[lr_decay_count]:
            trainer.set_learning_rate(trainer.learning_rate * lr_decay)
            lr_decay_count += 1

        # Loop through each batch of training data
        for i, batch in enumerate(train_data):
            # Extract data and label
            data = split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
            label = split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False)

            # AutoGrad
            with ag.record():
                output = []
                for _, X in enumerate(data):
                    X = X.reshape((-1,) + X.shape[2:])
                    pred = net(X)
                    output.append(pred)
                loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]

            # Backpropagation
            for l in loss:
                l.backward()

            # Optimize
            trainer.step(batch_size)

            # Update metrics
            train_loss += sum([l.mean().asscalar() for l in loss])
            train_metric.update(label, output)

            if i == 100:
                break

        name, acc = train_metric.get()

        # Update history and print metrics
        train_history.update([acc])
        print('[Epoch %d] train=%f loss=%f time: %f'
              % (epoch, acc, train_loss / (i + 1), time.time() - tic))

    print('saving the model')
    save(net, model_dir)
for epoch in range(opt.resume_epoch, opt.epochs):
    tic = time.time()
    train_metric.reset()
    train_loss = 0
    btic = time.time()

    # Learning rate decay
    # if epoch == lr_decay_epoch[lr_decay_count]:
    #     trainer.set_learning_rate(trainer.learning_rate * lr_decay)
    #     lr_decay_count += 1

    # Loop through each batch of training data
    for i, batch in enumerate(train_data):
        # Extract data and label
        data_bgs = split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        data_fgs = split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        label = split_and_load(batch[2], ctx_list=ctx, batch_axis=0)

        # AutoGrad
        with ag.record():
            output = []
            for _, (X_bgs, X_fgs) in enumerate(zip(data_bgs, data_fgs)):
                if opt.reshape_type == 'tsn':
                    X_bgs = X_bgs.reshape((-1,) + X_bgs.shape[2:])
                    X_fgs = X_fgs.reshape((-1,) + X_fgs.shape[2:])
                elif opt.reshape_type == 'c3d' or '3d' in opt.model:
                    X_bgs = nd.transpose(data=X_bgs, axes=(0, 2, 1, 3, 4))
                    X_fgs = nd.transpose(data=X_fgs, axes=(0, 2, 1, 3, 4))
                    # X = nd.transpose(data=X, axes=(0, 2, 1, 3, 4))
                elif opt.new_length != 1 and opt.reshape_type == 'tsn_newlength':
                    # X = X.reshape((-3, -3, -2))
for epoch in range(epochs):
    tic = time.time()
    train_metric.reset()
    train_loss = 0

    # Learning rate decay
    if epoch == lr_decay_epoch[lr_decay_count]:
        trainer.set_learning_rate(trainer.learning_rate * lr_decay)
        lr_decay_count += 1

    # Loop through each batch of training data
    for i, batch in enumerate(train_data):
        # Extract data and label
        data = split_and_load(batch[0], ctx_list=ctx, batch_axis=0, multiplier=3)
        label = split_and_load(batch[1], ctx_list=ctx, batch_axis=0)

        # AutoGrad
        with ag.record():
            output = [net(X) for X in data]
            loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]

        # Backpropagation
        for l in loss:
            l.backward()

        # Optimize
        trainer.step(batch_size)
def _get_data_and_label(self, batch, ctx, batch_axis=0):
    data = batch[0]
    label = batch[1]
    data = split_and_load(data, ctx_list=ctx, batch_axis=batch_axis)
    label = split_and_load(label, ctx_list=ctx, batch_axis=batch_axis)
    return data, label
def fit(self):
    last_miou = .0  # record the best validation mIoU
    loss_step = 0   # step count
    for epoch in range(self.conf.epochs):
        train_loss = .0
        start = time.time()
        for i, (data, target) in enumerate(self.train_iter):
            gpu_datas = split_and_load(data, ctx_list=self.ctx)
            gpu_targets = split_and_load(target, ctx_list=self.ctx)
            with autograd.record():
                loss_gpu = [
                    self.criterion(*self.net(gpu_data), gpu_target)
                    for gpu_data, gpu_target in zip(gpu_datas, gpu_targets)
                ]
            for loss in loss_gpu:
                autograd.backward(loss)
            self.trainer.step(self.conf.bs_train)
            nd.waitall()
            loss_temp = .0
            for losses in loss_gpu:
                loss_temp += losses.sum().asscalar()
            train_loss += (loss_temp / self.conf.bs_train)
            # log every n batches; add loss to draw the curve (train_loss is a numpy.float64)
            interval = 5 if loss_step < 5000 else 50
            if (i % interval == 0) or (i + 1 == len(self.train_iter)):
                fitlog.add_loss(name='loss', value=round(train_loss / (i + 1), 5), step=loss_step)
                loss_step += 1
                self.logger.info("Epoch %d, batch %d, training loss %.5f."
                                 % (epoch, i, train_loss / (i + 1)))
        # log each epoch
        self.logger.info(">>>>>> Epoch %d complete, time cost: %.1f sec. <<<<<<"
                         % (epoch, time.time() - start))
        # validation each epoch
        if self.val:
            pixel_acc, mean_iou = self._validation()
            self.logger.info("Epoch %d validation, PixelAccuracy: %.4f, mIoU: %.4f."
                             % (epoch, pixel_acc, mean_iou))
            fitlog.add_metric(value=mean_iou, step=epoch, name='mIoU')
            fitlog.add_metric(value=pixel_acc, step=epoch, name='PA')
            if mean_iou > last_miou:
                f_name = self._save_model(tag='best')
                self.logger.info("Epoch %d mIoU: %.4f > %.4f(previous), save model: %s"
                                 % (epoch, mean_iou, last_miou, f_name))
                last_miou = mean_iou
    # save the final-epoch params
    f_name = self._save_model(tag='last')
    self.logger.info(">>>>>> Training complete, save model: %s. <<<<<<" % f_name)
    # record
    fitlog.add_best_metric(value=round(last_miou, 4), name='mIoU')
    fitlog.add_other(value=self.id, name='record_id')
    fitlog.add_other(value=self.num_train, name='train')
    fitlog.add_other(value=self.num_val, name='val')