def train(net, train_data, val_data, eval_metric, batch_size, ctx, logger, args):
    """Training pipeline"""
    args.kv_store = 'device' if (args.amp and 'nccl' in args.kv_store) else args.kv_store
    kv = mx.kvstore.create(args.kv_store)
    net.collect_params().setattr('grad_req', 'null')
    net.collect_train_params().setattr('grad_req', 'write')
    for k, v in net.collect_params('.*bias').items():
        v.wd_mult = 0.0
    optimizer_params = {'learning_rate': args.lr, 'wd': args.wd, 'momentum': args.momentum}
    if args.clip_gradient > 0.0:
        optimizer_params['clip_gradient'] = args.clip_gradient
    if args.amp:
        optimizer_params['multi_precision'] = True
    if args.horovod:
        hvd.broadcast_parameters(net.collect_params(), root_rank=0)
        trainer = hvd.DistributedTrainer(
            net.collect_train_params(),  # fix batchnorm, fix first stage, etc...
            'sgd',
            optimizer_params)
    else:
        trainer = gluon.Trainer(
            net.collect_train_params(),  # fix batchnorm, fix first stage, etc...
            'sgd',
            optimizer_params,
            update_on_kvstore=(False if args.amp else None),
            kvstore=kv)

    if args.amp:
        amp.init_trainer(trainer)

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted([float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])
    lr_warmup = float(args.lr_warmup)  # avoid int division

    rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    rpn_box_loss = mx.gluon.loss.HuberLoss(rho=args.rpn_smoothl1_rho)  # == smoothl1
    rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    rcnn_box_loss = mx.gluon.loss.HuberLoss(rho=args.rcnn_smoothl1_rho)  # == smoothl1
    rcnn_mask_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    metrics = [mx.metric.Loss('RPN_Conf'),
               mx.metric.Loss('RPN_SmoothL1'),
               mx.metric.Loss('RCNN_CrossEntropy'),
               mx.metric.Loss('RCNN_SmoothL1'),
               mx.metric.Loss('RCNN_Mask')]

    rpn_acc_metric = RPNAccMetric()
    rpn_bbox_metric = RPNL1LossMetric()
    rcnn_acc_metric = RCNNAccMetric()
    rcnn_bbox_metric = RCNNL1LossMetric()
    rcnn_mask_metric = MaskAccMetric()
    rcnn_fgmask_metric = MaskFGAccMetric()
    metrics2 = [rpn_acc_metric, rpn_bbox_metric, rcnn_acc_metric, rcnn_bbox_metric,
                rcnn_mask_metric, rcnn_fgmask_metric]

    async_eval_processes = []
    logger.info(args)
    if args.verbose:
        logger.info('Trainable parameters:')
        logger.info(net.collect_train_params().keys())
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    base_lr = trainer.learning_rate

    for epoch in range(args.start_epoch, args.epochs):
        rcnn_task = ForwardBackwardTask(net, trainer, rpn_cls_loss, rpn_box_loss, rcnn_cls_loss,
                                        rcnn_box_loss, rcnn_mask_loss, args.amp)
        executor = Parallel(args.executor_threads, rcnn_task) if not args.horovod else None
        if not args.disable_hybridization:
            net.hybridize(static_alloc=args.static_alloc)
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(epoch, new_lr))
        for metric in metrics:
            metric.reset()
        tic = time.time()
        btic = time.time()
        speed = []  # per-batch throughput samples collected for the epoch summary
        train_data_iter = iter(train_data)
        next_data_batch = next(train_data_iter)
        next_data_batch = split_and_load(next_data_batch, ctx_list=ctx)
        for i in range(len(train_data)):
            batch = next_data_batch
            if i + epoch * len(train_data) <= lr_warmup:
                # adjust based on real percentage
                new_lr = base_lr * get_lr_at_iter((i + epoch * len(train_data)) / lr_warmup,
                                                  args.lr_warmup_factor)
                if new_lr != trainer.learning_rate:
                    if i % args.log_interval == 0:
                        logger.info('[Epoch {} Iteration {}] Set learning rate to {}'
                                    .format(epoch, i, new_lr))
                    trainer.set_learning_rate(new_lr)
            metric_losses = [[] for _ in metrics]
            add_losses = [[] for _ in metrics2]
            if executor is not None:
                for data in zip(*batch):
                    executor.put(data)
            for j in range(len(ctx)):
                if executor is not None:
                    result = executor.get()
                else:
                    result = rcnn_task.forward_backward(list(zip(*batch))[0])
                if (not args.horovod) or hvd.rank() == 0:
                    for k in range(len(metric_losses)):
                        metric_losses[k].append(result[k])
                    for k in range(len(add_losses)):
                        add_losses[k].append(result[len(metric_losses) + k])
            try:
                # prefetch next batch
                next_data_batch = next(train_data_iter)
                next_data_batch = split_and_load(next_data_batch, ctx_list=ctx)
            except StopIteration:
                pass
            for metric, record in zip(metrics, metric_losses):
                metric.update(0, record)
            for metric, records in zip(metrics2, add_losses):
                for pred in records:
                    metric.update(pred[0], pred[1])
            trainer.step(batch_size)
            if (not args.horovod or hvd.rank() == 0) and args.log_interval \
                    and not (i + 1) % args.log_interval:
                msg = ','.join(['{}={:.3f}'.format(*metric.get()) for metric in metrics + metrics2])
                batch_speed = args.log_interval * args.batch_size / (time.time() - btic)
                speed.append(batch_speed)
                logger.info('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.format(
                    epoch, i, batch_speed, msg))
                btic = time.time()
        avg_batch_speed = sum(speed) / len(speed) if speed else 0.0

        # validate and save params
        if (not args.horovod) or hvd.rank() == 0:
            msg = ','.join(['{}={:.3f}'.format(*metric.get()) for metric in metrics])
            logger.info('[Epoch {}] Training cost: {:.3f}, Speed: {:.3f} samples/sec, {}'.format(
                epoch, (time.time() - tic), avg_batch_speed, msg))
            if not (epoch + 1) % args.val_interval:
                # consider reducing the frequency of validation to save time
                validate(net, val_data, async_eval_processes, ctx, eval_metric, logger, epoch,
                         best_map, args)
            elif (not args.horovod) or hvd.rank() == 0:
                current_map = 0.
                save_params(net, logger, best_map, current_map, epoch, args.save_interval,
                            args.save_prefix)
    for thread in async_eval_processes:
        thread.join()
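# The training loops in this file rely on a split_and_load helper to scatter each field of a
# batch across the available devices. Its body is not shown here; the sketch below is an
# assumption about what it typically looks like in these GluonCV-style scripts, not necessarily
# the exact implementation used, and is named split_and_load_sketch to make that clear.
def split_and_load_sketch(batch, ctx_list):
    """Split each field of `batch` so every device in `ctx_list` receives one shard."""
    new_batch = []
    for data in batch:
        # `data` holds one array per device slot; move each shard onto its target context
        new_data = [x.as_in_context(ctx) for x, ctx in zip(data, ctx_list)]
        new_batch.append(new_data)
    return new_batch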
def train(net, train_data, val_data, eval_metric, batch_size, ctx, args): """Training pipeline""" kv = mx.kvstore.create(args.kv_store) net.collect_params().setattr('grad_req', 'null') net.collect_train_params().setattr('grad_req', 'write') optimizer_params = { 'learning_rate': args.lr, 'wd': args.wd, 'momentum': args.momentum } if args.horovod: hvd.broadcast_parameters(net.collect_params(), root_rank=0) trainer = hvd.DistributedTrainer( net.collect_train_params( ), # fix batchnorm, fix first stage, etc... 'sgd', optimizer_params) else: trainer = gluon.Trainer( net.collect_train_params( ), # fix batchnorm, fix first stage, etc... 'sgd', optimizer_params, update_on_kvstore=(False if args.amp else None), kvstore=kv) if args.amp: amp.init_trainer(trainer) # lr decay policy lr_decay = float(args.lr_decay) lr_steps = sorted( [float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()]) lr_warmup = float(args.lr_warmup) # avoid int division # TODO(zhreshold) losses? rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss( from_sigmoid=False) rpn_box_loss = mx.gluon.loss.HuberLoss(rho=1 / 9.) # == smoothl1 rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss() rcnn_box_loss = mx.gluon.loss.HuberLoss() # == smoothl1 metrics = [ mx.metric.Loss('RPN_Conf'), mx.metric.Loss('RPN_SmoothL1'), mx.metric.Loss('RCNN_CrossEntropy'), mx.metric.Loss('RCNN_SmoothL1'), ] rpn_acc_metric = RPNAccMetric() rpn_bbox_metric = RPNL1LossMetric() rcnn_acc_metric = RCNNAccMetric() rcnn_bbox_metric = RCNNL1LossMetric() metrics2 = [ rpn_acc_metric, rpn_bbox_metric, rcnn_acc_metric, rcnn_bbox_metric ] # set up logger logging.basicConfig() logger = logging.getLogger() logger.setLevel(logging.INFO) log_file_path = args.save_prefix + '_train.log' log_dir = os.path.dirname(log_file_path) if log_dir and not os.path.exists(log_dir): os.makedirs(log_dir) fh = logging.FileHandler(log_file_path) logger.addHandler(fh) logger.info(args) if args.verbose: logger.info('Trainable parameters:') logger.info(net.collect_train_params().keys()) logger.info('Start training from [Epoch {}]'.format(args.start_epoch)) best_map = [0] for epoch in range(args.start_epoch, args.epochs): mix_ratio = 1.0 if not args.disable_hybridization: net.hybridize(static_alloc=args.static_alloc) rcnn_task = ForwardBackwardTask(net, trainer, rpn_cls_loss, rpn_box_loss, rcnn_cls_loss, rcnn_box_loss, mix_ratio=1.0) executor = Parallel(args.executor_threads, rcnn_task) if not args.horovod else None if args.mixup: # TODO(zhreshold) only support evenly mixup now, target generator needs to be modified otherwise train_data._dataset._data.set_mixup(np.random.uniform, 0.5, 0.5) mix_ratio = 0.5 if epoch >= args.epochs - args.no_mixup_epochs: train_data._dataset._data.set_mixup(None) mix_ratio = 1.0 while lr_steps and epoch >= lr_steps[0]: new_lr = trainer.learning_rate * lr_decay lr_steps.pop(0) trainer.set_learning_rate(new_lr) logger.info("[Epoch {}] Set learning rate to {}".format( epoch, new_lr)) for metric in metrics: metric.reset() tic = time.time() btic = time.time() base_lr = trainer.learning_rate rcnn_task.mix_ratio = mix_ratio print(len(train_data)) for i, batch in enumerate(train_data): if epoch == 0 and i <= lr_warmup: # adjust based on real percentage new_lr = base_lr * get_lr_at_iter(i / lr_warmup, args.lr_warmup_factor) if new_lr != trainer.learning_rate: if i % args.log_interval == 0: logger.info( '[Epoch 0 Iteration {}] Set learning rate to {}'. 
format(i, new_lr)) trainer.set_learning_rate(new_lr) batch = split_and_load(batch, ctx_list=ctx) metric_losses = [[] for _ in metrics] add_losses = [[] for _ in metrics2] if executor is not None: for data in zip(*batch): executor.put(data) for j in range(len(ctx)): if executor is not None: result = executor.get() else: result = rcnn_task.forward_backward(list(zip(*batch))[0]) if (not args.horovod) or hvd.rank() == 0: for k in range(len(metric_losses)): metric_losses[k].append(result[k]) for k in range(len(add_losses)): add_losses[k].append(result[len(metric_losses) + k]) for metric, record in zip(metrics, metric_losses): metric.update(0, record) for metric, records in zip(metrics2, add_losses): for pred in records: metric.update(pred[0], pred[1]) trainer.step(batch_size) # update metrics if (not args.horovod or hvd.rank() == 0) and args.log_interval \ and not (i + 1) % args.log_interval: msg = ','.join([ '{}={:.3f}'.format(*metric.get()) for metric in metrics + metrics2 ]) logger.info( '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'. format( epoch, i, args.log_interval * args.batch_size / (time.time() - btic), msg)) btic = time.time() if (not args.horovod) or hvd.rank() == 0: msg = ','.join( ['{}={:.3f}'.format(*metric.get()) for metric in metrics]) logger.info('[Epoch {}] Training cost: {:.3f}, {}'.format( epoch, (time.time() - tic), msg)) if not (epoch + 1) % args.val_interval: # consider reduce the frequency of validation to save time map_name, mean_ap = validate(net, val_data, ctx, eval_metric, args) map_name_train, mean_ap_train = validate( net, train_data, ctx, eval_metric, args) if isinstance(map_name, list): val_msg = '\n'.join([ '{}={}'.format(k, v) for k, v in zip(map_name, mean_ap) ]) train_msg = '\n'.join([ '{}={}'.format(k, v) for k, v in zip(map_name_train, mean_ap_train) ]) current_map = float(mean_ap[-1]) else: val_msg = '{}={}'.format(map_name, mean_ap) train_msg = '{}={}'.format(map_name_train, mean_ap_train) current_map = mean_ap logger.info('[Epoch {}] Validation: {}'.format(epoch, val_msg)) logger.info('[Epoch {}] Train: {}'.format(epoch, train_msg)) else: current_map = 0. save_params(net, logger, best_map, current_map, epoch, args.save_interval, os.path.join(args.model_dir, 'fastrcnn')) executor.__del__()
lr_decay = float(args.lr_decay)
lr_steps = sorted([float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])
lr_warmup = float(args.lr_warmup)  # avoid int division

# TODO(zhreshold) losses?
rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
rpn_box_loss = mx.gluon.loss.HuberLoss(rho=1 / 9.)  # == smoothl1
rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()
rcnn_box_loss = mx.gluon.loss.HuberLoss()  # == smoothl1
metrics = [mx.metric.Loss('RPN_Conf'),
           mx.metric.Loss('RPN_SmoothL1'),
           mx.metric.Loss('RCNN_CrossEntropy'),
           mx.metric.Loss('RCNN_SmoothL1')]

rpn_acc_metric = RPNAccMetric()
rpn_bbox_metric = RPNL1LossMetric()
rcnn_acc_metric = RCNNAccMetric()
rcnn_bbox_metric = RCNNL1LossMetric()
metrics2 = [rpn_acc_metric, rpn_bbox_metric, rcnn_acc_metric, rcnn_bbox_metric]

# set up logger
logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.INFO)
log_file_path = args.save_prefix + '_train.log'
log_dir = os.path.dirname(log_file_path)
if log_dir and not os.path.exists(log_dir):
    os.makedirs(log_dir)
fh = logging.FileHandler(log_file_path)
logger.addHandler(fh)
logger.info(args)
def get_faster_rcnn_metrics():
    return [mx.metric.Loss('RPN_Conf'), mx.metric.Loss('RPN_SmoothL1'),
            mx.metric.Loss('RCNN_CrossEntropy'), mx.metric.Loss('RCNN_SmoothL1')], \
           [RPNAccMetric(), RPNL1LossMetric(), RCNNAccMetric(), RCNNL1LossMetric()]
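# Usage sketch for the helper above: it packages the same loss metrics and auxiliary metrics that
# the other training loops in this file build by hand, so a caller simply unpacks the two lists.
# The metric classes themselves are assumed to be defined in the surrounding script.
metrics, metrics2 = get_faster_rcnn_metrics()
for metric in metrics + metrics2:
    metric.reset()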
def train(net, train_data, val_data, eval_metric, ctx, args): """Training pipeline""" net.collect_params().setattr('grad_req', 'null') net.collect_train_params().setattr('grad_req', 'write') for k, v in net.collect_params('.*beta|.*bias').items(): v.wd_mult = 0.0 if args.horovod: hvd.broadcast_parameters(net.collect_params(), root_rank=0) trainer = hvd.DistributedTrainer( net.collect_train_params(), # fix batchnorm, fix first stage, etc... 'sgd', {'learning_rate': args.lr, 'wd': args.wd, 'momentum': args.momentum}) else: trainer = gluon.Trainer( net.collect_train_params(), # fix batchnorm, fix first stage, etc... 'sgd', {'learning_rate': args.lr, 'wd': args.wd, 'momentum': args.momentum}, update_on_kvstore=(False if args.amp else None)) if args.amp: amp.init_trainer(trainer) # lr decay policy lr_decay = float(args.lr_decay) lr_steps = sorted([float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()]) lr_warmup = float(args.lr_warmup) # avoid int division rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False) rpn_box_loss = mx.gluon.loss.HuberLoss(rho=1 / 9.) # == smoothl1 rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss() rcnn_box_loss = mx.gluon.loss.HuberLoss() # == smoothl1 rcnn_mask_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False) metrics = [mx.metric.Loss('RPN_Conf'), mx.metric.Loss('RPN_SmoothL1'), mx.metric.Loss('RCNN_CrossEntropy'), mx.metric.Loss('RCNN_SmoothL1'), mx.metric.Loss('RCNN_Mask')] rpn_acc_metric = RPNAccMetric() rpn_bbox_metric = RPNL1LossMetric() rcnn_acc_metric = RCNNAccMetric() rcnn_bbox_metric = RCNNL1LossMetric() rcnn_mask_metric = MaskAccMetric() rcnn_fgmask_metric = MaskFGAccMetric() metrics2 = [rpn_acc_metric, rpn_bbox_metric, rcnn_acc_metric, rcnn_bbox_metric, rcnn_mask_metric, rcnn_fgmask_metric] # set up logger logging.basicConfig() logger = logging.getLogger() logger.setLevel(logging.INFO) log_file_path = args.save_prefix + '_train.log' log_dir = os.path.dirname(log_file_path) if log_dir and not os.path.exists(log_dir): os.makedirs(log_dir) fh = logging.FileHandler(log_file_path) logger.addHandler(fh) logger.info(args) if args.verbose: logger.info('Trainable parameters:') logger.info(net.collect_train_params().keys()) logger.info('Start training from [Epoch {}]'.format(args.start_epoch)) best_map = [0] for epoch in range(args.start_epoch, args.epochs): while lr_steps and epoch >= lr_steps[0]: new_lr = trainer.learning_rate * lr_decay lr_steps.pop(0) trainer.set_learning_rate(new_lr) logger.info("[Epoch {}] Set learning rate to {}".format(epoch, new_lr)) for metric in metrics: metric.reset() tic = time.time() btic = time.time() if not args.disable_hybridization: net.hybridize(static_alloc=args.static_alloc) base_lr = trainer.learning_rate for i, batch in enumerate(train_data): if epoch == 0 and i <= lr_warmup: # adjust based on real percentage new_lr = base_lr * get_lr_at_iter(i / lr_warmup) if new_lr != trainer.learning_rate: if i % args.log_interval == 0: logger.info( '[Epoch 0 Iteration {}] Set learning rate to {}'.format(i, new_lr)) trainer.set_learning_rate(new_lr) batch = split_and_load(batch, ctx_list=ctx) batch_size = len(batch[0]) losses = [] metric_losses = [[] for _ in metrics] add_losses = [[] for _ in metrics2] with autograd.record(): for data, label, gt_mask, rpn_cls_targets, rpn_box_targets, rpn_box_masks in zip( *batch): gt_label = label[:, :, 4:5] gt_box = label[:, :, :4] cls_pred, box_pred, mask_pred, roi, samples, matches, rpn_score, rpn_box, anchors = net( data, gt_box) # losses 
of rpn rpn_score = rpn_score.squeeze(axis=-1) num_rpn_pos = (rpn_cls_targets >= 0).sum() rpn_loss1 = rpn_cls_loss(rpn_score, rpn_cls_targets, rpn_cls_targets >= 0) * rpn_cls_targets.size / num_rpn_pos rpn_loss2 = rpn_box_loss(rpn_box, rpn_box_targets, rpn_box_masks) * rpn_box.size / num_rpn_pos # rpn overall loss, use sum rather than average rpn_loss = rpn_loss1 + rpn_loss2 # generate targets for rcnn cls_targets, box_targets, box_masks = net.target_generator(roi, samples, matches, gt_label, gt_box) # losses of rcnn num_rcnn_pos = (cls_targets >= 0).sum() rcnn_loss1 = rcnn_cls_loss(cls_pred, cls_targets, cls_targets >= 0) * cls_targets.size / \ cls_targets.shape[0] / num_rcnn_pos rcnn_loss2 = rcnn_box_loss(box_pred, box_targets, box_masks) * box_pred.size / \ box_pred.shape[0] / num_rcnn_pos rcnn_loss = rcnn_loss1 + rcnn_loss2 # generate targets for mask mask_targets, mask_masks = net.mask_target(roi, gt_mask, matches, cls_targets) # loss of mask mask_loss = rcnn_mask_loss(mask_pred, mask_targets, mask_masks) * \ mask_targets.size / mask_targets.shape[0] / mask_masks.sum() # overall losses losses.append(rpn_loss.sum() + rcnn_loss.sum() + mask_loss.sum()) if (not args.horovod or hvd.rank() == 0): metric_losses[0].append(rpn_loss1.sum()) metric_losses[1].append(rpn_loss2.sum()) metric_losses[2].append(rcnn_loss1.sum()) metric_losses[3].append(rcnn_loss2.sum()) metric_losses[4].append(mask_loss.sum()) add_losses[0].append([[rpn_cls_targets, rpn_cls_targets >= 0], [rpn_score]]) add_losses[1].append([[rpn_box_targets, rpn_box_masks], [rpn_box]]) add_losses[2].append([[cls_targets], [cls_pred]]) add_losses[3].append([[box_targets, box_masks], [box_pred]]) add_losses[4].append([[mask_targets, mask_masks], [mask_pred]]) add_losses[5].append([[mask_targets, mask_masks], [mask_pred]]) if args.amp: with amp.scale_loss(losses, trainer) as scaled_losses: autograd.backward(scaled_losses) else: autograd.backward(losses) if (not args.horovod or hvd.rank() == 0): for metric, record in zip(metrics, metric_losses): metric.update(0, record) for metric, records in zip(metrics2, add_losses): for pred in records: metric.update(pred[0], pred[1]) trainer.step(batch_size) # update metrics if (not args.horovod or hvd.rank() == 0) and args.log_interval and not (i + 1) % args.log_interval: msg = ','.join(['{}={:.3f}'.format(*metric.get()) for metric in metrics + metrics2]) logger.info('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.format( epoch, i, args.log_interval * args.batch_size / (time.time() - btic), msg)) btic = time.time() # validate and save params if (not args.horovod or hvd.rank() == 0): msg = ','.join(['{}={:.3f}'.format(*metric.get()) for metric in metrics]) logger.info('[Epoch {}] Training cost: {:.3f}, {}'.format( epoch, (time.time() - tic), msg)) if not (epoch + 1) % args.val_interval: # consider reduce the frequency of validation to save time map_name, mean_ap = validate(net, val_data, ctx, eval_metric, args) val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)]) logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg)) current_map = float(mean_ap[-1]) else: current_map = 0. save_params(net, logger, best_map, current_map, epoch, args.save_interval, args.save_prefix)
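# Every variant in this file ends an epoch by calling save_params(...), whose body is not shown.
# A plausible sketch following the usual GluonCV checkpointing pattern is below; the function
# name, log wording, and file naming scheme here are assumptions, not the confirmed implementation.
# It keeps the best mAP seen so far in the mutable best_map list, writes a rolling "best"
# checkpoint, and optionally writes periodic checkpoints.
def save_params_sketch(net, logger, best_map, current_map, epoch, save_interval, prefix):
    current_map = float(current_map)
    if current_map > best_map[0]:
        logger.info('[Epoch {}] mAP {} higher than current best {}, saving to {}'.format(
            epoch, current_map, best_map[0], '{:s}_best.params'.format(prefix)))
        best_map[0] = current_map
        net.save_parameters('{:s}_best.params'.format(prefix))
    if save_interval and (epoch + 1) % save_interval == 0:
        net.save_parameters('{:s}_{:04d}_{:.4f}.params'.format(prefix, epoch, current_map))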
def main():
    # Function to get the Pascal VOC iterators given a rank
    def get_voc_iterator(rank, num_workers, net, num_shards):
        data_dir = "data-%d" % rank
        try:
            s3_client = boto3.client('s3')
            for file in ['VOCtrainval_06-Nov-2007.tar', 'VOCtest_06-Nov-2007.tar',
                         'VOCtrainval_11-May-2012.tar']:
                s3_client.download_file(args.s3bucket, f'voc_tars/{file}', f'/opt/ml/code/{file}')
                # extract each downloaded archive; the target directory is assumed to match the
                # VOCdevkit root used for the datasets below
                with tarfile.open(f'/opt/ml/code/{file}') as tar:
                    tar.extractall(path=f'/opt/ml/code/{data_dir}')
        except Exception:
            print('downloading from source')
            download_voc(data_dir)

        input_shape = (1, 256, 256, 3)
        batch_size = args.batch_size
        # might want to replace with mx.io.ImageDetRecordIter, this means you need data in RecordIO format
        # train_iter = mx.io.MNISTIter(
        #     image="%s/train-images-idx3-ubyte" % data_dir,
        #     label="%s/train-labels-idx1-ubyte" % data_dir,
        #     input_shape=input_shape,
        #     batch_size=batch_size,
        #     shuffle=True,
        #     flat=False,
        #     num_parts=hvd.size(),
        #     part_index=hvd.rank()
        # )
        train_dataset = gdata.VOCDetection(root=f'/opt/ml/code/data-{rank}/VOCdevkit/',
                                           splits=[(2007, 'trainval'), (2012, 'trainval')])
        val_dataset = gdata.VOCDetection(root=f'/opt/ml/code/data-{rank}/VOCdevkit/',
                                         splits=[(2007, 'test')])
        val_metric = VOC07MApMetric(iou_thresh=0.5, class_names=val_dataset.classes)
        im_aspect_ratio = [1.] * len(train_dataset)
        train_bfn = FasterRCNNTrainBatchify(net)
        train_sampler = gluoncv.nn.sampler.SplitSortedBucketSampler(
            im_aspect_ratio,
            batch_size,
            num_parts=hvd.size() if args.horovod else 1,
            part_index=hvd.rank() if args.horovod else 0,
            shuffle=True)
        # had issue with multi_stage=True
        train_iter = mx.gluon.data.DataLoader(
            train_dataset.transform(
                FasterRCNNDefaultTrainTransform(net.short, net.max_size, net,
                                                ashape=net.ashape, multi_stage=False)),
            batch_sampler=train_sampler,
            batchify_fn=train_bfn,
            num_workers=num_workers)
        val_bfn = Tuple(*[Append() for _ in range(3)])
        short = net.short[-1] if isinstance(net.short, (tuple, list)) else net.short
        # validation uses 1 sample per device
        val_iter = mx.gluon.data.DataLoader(
            val_dataset.transform(FasterRCNNDefaultValTransform(short, net.max_size)),
            num_shards, False,
            batchify_fn=val_bfn,
            last_batch='keep',
            num_workers=num_workers)
        return train_iter, val_iter

    # Function to define the neural network
    def conv_nets(model_name):
        net = model_zoo.get_model(model_name, pretrained_base=False)
        return net

    def evaluate(net, val_data, ctx, eval_metric, args):
        """Test on validation dataset."""
        clipper = gcv.nn.bbox.BBoxClipToImage()
        eval_metric.reset()
        if not args.disable_hybridization:
            # input format is different than training, thus rehybridization is needed.
net.hybridize(static_alloc=args.static_alloc) for batch in val_data: batch = split_and_load(batch, ctx_list=ctx) det_bboxes = [] det_ids = [] det_scores = [] gt_bboxes = [] gt_ids = [] gt_difficults = [] for x, y, im_scale in zip(*batch): # get prediction results ids, scores, bboxes = net(x) det_ids.append(ids) det_scores.append(scores) # clip to image size det_bboxes.append(clipper(bboxes, x)) # rescale to original resolution im_scale = im_scale.reshape((-1)).asscalar() det_bboxes[-1] *= im_scale # split ground truths gt_ids.append(y.slice_axis(axis=-1, begin=4, end=5)) gt_bboxes.append(y.slice_axis(axis=-1, begin=0, end=4)) gt_bboxes[-1] *= im_scale gt_difficults.append( y.slice_axis(axis=-1, begin=5, end=6 ) if y.shape[-1] > 5 else None) # update metric for det_bbox, det_id, det_score, gt_bbox, gt_id, gt_diff in zip( det_bboxes, det_ids, det_scores, gt_bboxes, gt_ids, gt_difficults): eval_metric.update(det_bbox, det_id, det_score, gt_bbox, gt_id, gt_diff) return eval_metric.get() # Initialize Horovod hvd.init() # Horovod: pin context to local rank if args.horovod: ctx = [mx.gpu(hvd.local_rank())] else: ctx = [mx.gpu(int(i)) for i in args.gpus.split(',') if i.strip()] ctx = ctx if ctx else [mx.cpu()] context = mx.cpu(hvd.local_rank()) if args.no_cuda else mx.gpu( hvd.local_rank()) num_workers = hvd.size() # Build model model = conv_nets(args.model_name) model.cast(args.dtype) model.hybridize() # Initialize parameters initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2) model.initialize(initializer, ctx=context) # Create optimizer optimizer_params = { 'momentum': args.momentum, 'learning_rate': args.lr * hvd.size() } opt = mx.optimizer.create('sgd', **optimizer_params) # Load training and validation data train_data, val_data = get_voc_iterator(hvd.rank(), num_workers, model, len(ctx)) # Horovod: fetch and broadcast parameters params = model.collect_params() if params is not None: hvd.broadcast_parameters(params, root_rank=0) # Horovod: create DistributedTrainer, a subclass of gluon.Trainer trainer = hvd.DistributedTrainer(params, opt) # Create loss function and train metric loss_fn = gluon.loss.SoftmaxCrossEntropyLoss() # adding in new loss functions rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss( from_sigmoid=False) rpn_box_loss = mx.gluon.loss.HuberLoss( rho=args.rpn_smoothl1_rho) # == smoothl1 rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss() rcnn_box_loss = mx.gluon.loss.HuberLoss( rho=args.rcnn_smoothl1_rho) # == smoothl1 metrics = [ mx.metric.Loss('RPN_Conf'), mx.metric.Loss('RPN_SmoothL1'), mx.metric.Loss('RCNN_CrossEntropy'), mx.metric.Loss('RCNN_SmoothL1'), ] rpn_acc_metric = RPNAccMetric() rpn_bbox_metric = RPNL1LossMetric() rcnn_acc_metric = RCNNAccMetric() rcnn_bbox_metric = RCNNL1LossMetric() metrics2 = [ rpn_acc_metric, rpn_bbox_metric, rcnn_acc_metric, rcnn_bbox_metric ] metric = mx.metric.Accuracy() # Global training timing if hvd.rank() == 0: global_tic = time.time() # Train model # for epoch in range(args.epochs): # tic = time.time() # train_data.reset() # metric.reset() # for nbatch, batch in enumerate(train_data, start=1): # data = batch.data[0].as_in_context(context) # label = batch.label[0].as_in_context(context) # with autograd.record(): # output = model(data.astype(args.dtype, copy=False)) # loss = loss_fn(output, label) # loss.backward() # trainer.step(args.batch_size) # metric.update([label], [output]) # if nbatch % 100 == 0: # name, acc = metric.get() # logging.info('[Epoch %d Batch %d] Training: %s=%f' % # (epoch, 
nbatch, name, acc)) # if hvd.rank() == 0: # elapsed = time.time() - tic # speed = nbatch * args.batch_size * hvd.size() / elapsed # logging.info('Epoch[%d]\tSpeed=%.2f samples/s\tTime cost=%f', # epoch, speed, elapsed) # # Evaluate model accuracy # _, train_acc = metric.get() # name, val_acc = evaluate(model, val_data, context) # if hvd.rank() == 0: # logging.info('Epoch[%d]\tTrain: %s=%f\tValidation: %s=%f', epoch, name, # train_acc, name, val_acc) # if hvd.rank()==0: # global_training_time =time.time() - global_tic # print("Global elpased time on training:{}".format(global_training_time)) # device = context.device_type + str(num_workers) # train from train_faster_rcnn.py for epoch in range(args.epochs): lr_decay = float(args.lr_decay) lr_steps = sorted( [float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()]) lr_warmup = float(args.lr_warmup) # avoid int division # this simplifies dealing with all of the loss functions rcnn_task = ForwardBackwardTask(model, trainer, rpn_cls_loss, rpn_box_loss, rcnn_cls_loss, rcnn_box_loss, mix_ratio=1.0, amp_enabled=args.amp) executor = Parallel(args.executor_threads, rcnn_task) if not args.horovod else None mix_ratio = 1.0 if not args.disable_hybridization: model.hybridize(static_alloc=args.static_alloc) if args.mixup: # TODO(zhreshold) only support evenly mixup now, target generator needs to be modified otherwise train_data._dataset._data.set_mixup(np.random.uniform, 0.5, 0.5) mix_ratio = 0.5 if epoch >= args.epochs - args.no_mixup_epochs: train_data._dataset._data.set_mixup(None) mix_ratio = 1.0 while lr_steps and epoch >= lr_steps[0]: new_lr = trainer.learning_rate * lr_decay lr_steps.pop(0) trainer.set_learning_rate(new_lr) logger.info("[Epoch {}] Set learning rate to {}".format( epoch, new_lr)) for metric in metrics: metric.reset() tic = time.time() btic = time.time() base_lr = trainer.learning_rate rcnn_task.mix_ratio = mix_ratio for i, batch in enumerate(train_data): if epoch == 0 and i <= lr_warmup: # does a learning rate reset if warming up # adjust based on real percentage if (lr_warmup != 0): new_lr = base_lr * get_lr_at_iter(i / lr_warmup, args.lr_warmup_factor) if new_lr != trainer.learning_rate: if i % args.log_interval == 0: logger.info( '[Epoch 0 Iteration {}] Set learning rate to {}'. format(i, new_lr)) trainer.set_learning_rate(new_lr) batch = split_and_load( batch, ctx_list=ctx ) # does split and load function, creates a batch per device metric_losses = [[] for _ in metrics] add_losses = [[] for _ in metrics2] if executor is not None: for data in zip(*batch): executor.put(data) for j in range(len(ctx)): if executor is not None: result = executor.get() else: result = rcnn_task.forward_backward(list(zip(*batch))[0]) if (not args.horovod) or hvd.rank() == 0: for k in range(len(metric_losses)): metric_losses[k].append(result[k]) for k in range(len(add_losses)): add_losses[k].append(result[len(metric_losses) + k]) for metric, record in zip(metrics, metric_losses): metric.update(0, record) for metric, records in zip(metrics2, add_losses): for pred in records: metric.update(pred[0], pred[1]) trainer.step(batch_size) # update metrics if (not args.horovod or hvd.rank() == 0) and args.log_interval \ and not (i + 1) % args.log_interval: msg = ','.join([ '{}={:.3f}'.format(*metric.get()) for metric in metrics + metrics2 ]) logger.info( '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'. 
format( epoch, i, args.log_interval * args.batch_size / (time.time() - btic), msg)) btic = time.time() if (not args.horovod) or hvd.rank() == 0: msg = ','.join( ['{}={:.3f}'.format(*metric.get()) for metric in metrics]) logger.info('[Epoch {}] Training cost: {:.3f}, {}'.format( epoch, (time.time() - tic), msg)) if not (epoch + 1) % args.val_interval: # consider reduce the frequency of validation to save time map_name, mean_ap = validate(model, val_data, ctx, eval_metric, args) val_msg = '\n'.join( ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)]) logger.info('[Epoch {}] Validation: \n{}'.format( epoch, val_msg)) current_map = float(mean_ap[-1]) else: current_map = 0. save_params(model, logger, best_map, current_map, epoch, args.save_interval, args.save_prefix)
def train(net, train_data, val_data, eval_metric, batch_size, ctx, logger, args): """Training pipeline""" args.kv_store = "device" if (args.amp and "nccl" in args.kv_store) else args.kv_store kv = mx.kvstore.create(args.kv_store) net.collect_params().setattr("grad_req", "null") net.collect_train_params().setattr("grad_req", "write") optimizer_params = {"learning_rate": args.lr, "wd": args.wd, "momentum": args.momentum} if args.amp: optimizer_params["multi_precision"] = True if args.horovod: hvd.broadcast_parameters(net.collect_params(), root_rank=0) trainer = hvd.DistributedTrainer( net.collect_train_params(), # fix batchnorm, fix first stage, etc... "sgd", optimizer_params, ) else: trainer = gluon.Trainer( net.collect_train_params(), # fix batchnorm, fix first stage, etc... "sgd", optimizer_params, update_on_kvstore=(False if args.amp else None), kvstore=kv, ) if args.amp: amp.init_trainer(trainer) # lr decay policy lr_decay = float(args.lr_decay) lr_steps = sorted([float(ls) for ls in args.lr_decay_epoch.split(",") if ls.strip()]) lr_warmup = float(args.lr_warmup) # avoid int division # TODO(zhreshold) losses? rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False) rpn_box_loss = mx.gluon.loss.HuberLoss(rho=1 / 9.0) # == smoothl1 rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss() rcnn_box_loss = mx.gluon.loss.HuberLoss(rho=1.0) # == smoothl1 metrics = [ mx.metric.Loss("RPN_Conf"), mx.metric.Loss("RPN_SmoothL1"), mx.metric.Loss("RCNN_CrossEntropy"), mx.metric.Loss("RCNN_SmoothL1"), ] rpn_acc_metric = RPNAccMetric() rpn_bbox_metric = RPNL1LossMetric() rcnn_acc_metric = RCNNAccMetric() rcnn_bbox_metric = RCNNL1LossMetric() metrics2 = [rpn_acc_metric, rpn_bbox_metric, rcnn_acc_metric, rcnn_bbox_metric] logger.info(args) if args.verbose: logger.info("Trainable parameters:") logger.info(net.collect_train_params().keys()) logger.info("Start training from [Epoch {}]".format(args.start_epoch)) best_map = [0] for epoch in range(args.start_epoch, args.epochs): rcnn_task = ForwardBackwardTask( net, trainer, rpn_cls_loss, rpn_box_loss, rcnn_cls_loss, rcnn_box_loss, mix_ratio=1.0, amp_enabled=args.amp, ) executor = Parallel(args.executor_threads, rcnn_task) if not args.horovod else None mix_ratio = 1.0 net.hybridize() while lr_steps and epoch >= lr_steps[0]: new_lr = trainer.learning_rate * lr_decay lr_steps.pop(0) trainer.set_learning_rate(new_lr) logger.info("[Epoch {}] Set learning rate to {}".format(epoch, new_lr)) for metric in metrics: metric.reset() tic = time.time() btic = time.time() base_lr = trainer.learning_rate rcnn_task.mix_ratio = mix_ratio for i, batch in enumerate(train_data): if epoch == 0 and i <= lr_warmup: # adjust based on real percentage new_lr = base_lr * get_lr_at_iter( i / lr_warmup, args.lr_warmup_factor / args.num_gpus ) if new_lr != trainer.learning_rate: if i % args.log_interval == 0: logger.info( "[Epoch 0 Iteration {}] Set learning rate to {}".format(i, new_lr) ) trainer.set_learning_rate(new_lr) batch = split_and_load(batch, ctx_list=ctx) metric_losses = [[] for _ in metrics] add_losses = [[] for _ in metrics2] if executor is not None: for data in zip(*batch): executor.put(data) for j in range(len(ctx)): if executor is not None: result = executor.get() else: result = rcnn_task.forward_backward(list(zip(*batch))[0]) if (not args.horovod) or hvd.rank() == 0: for k in range(len(metric_losses)): metric_losses[k].append(result[k]) for k in range(len(add_losses)): add_losses[k].append(result[len(metric_losses) + k]) for metric, record in 
zip(metrics, metric_losses): metric.update(0, record) for metric, records in zip(metrics2, add_losses): for pred in records: metric.update(pred[0], pred[1]) trainer.step(batch_size) # update metrics if ( (not args.horovod or hvd.rank() == 0) and args.log_interval and not (i + 1) % args.log_interval ): msg = ",".join(["{}={:.3f}".format(*metric.get()) for metric in metrics + metrics2]) logger.info( "[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}".format( epoch, i, args.log_interval * args.batch_size / (time.time() - btic), msg ) ) btic = time.time() if (not args.horovod) or hvd.rank() == 0: msg = ",".join(["{}={:.3f}".format(*metric.get()) for metric in metrics]) logger.info( "[Epoch {}] Training cost: {:.3f}, {}".format(epoch, (time.time() - tic), msg) ) if not (epoch + 1) % args.val_interval: # consider reduce the frequency of validation to save time map_name, mean_ap = validate(net, val_data, ctx, eval_metric, args) val_msg = "\n".join(["{}={}".format(k, v) for k, v in zip(map_name, mean_ap)]) logger.info("[Epoch {}] Validation: \n{}".format(epoch, val_msg)) current_map = float(mean_ap[-1]) else: current_map = 0.0 save_params( net, logger, best_map, current_map, epoch, args.save_interval, os.path.join(args.sm_save, args.save_prefix), args, )
def train(net, train_data, val_data, eval_metric, ctx, args):
    net.collect_params().reset_ctx(ctx)
    kv = mx.kvstore.create(args.kv_store)
    net.collect_params().setattr('grad_req', 'null')
    net.collect_train_params().setattr('grad_req', 'write')
    if args.horovod:
        hvd.broadcast_parameters(net.collect_params(), root_rank=0)
        trainer = hvd.DistributedTrainer(
            net.collect_train_params(), 'sgd',
            {'learning_rate': args.lr, 'wd': args.wd, 'momentum': args.momentum})
    else:
        trainer = gluon.Trainer(
            net.collect_train_params(), 'sgd',
            {'learning_rate': args.lr, 'wd': args.wd, 'momentum': args.momentum},
            update_on_kvstore=(False if args.amp else None), kvstore=kv)
    if args.amp:
        amp.init_trainer(trainer)

    lr_decay = float(args.lr_decay)
    lr_steps = sorted([float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])
    lr_warmup = float(args.lr_warmup)

    # losses: the following four losses are consumed inside rcnn_task
    rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    rpn_box_loss = mx.gluon.loss.HuberLoss(rho=1 / 9.)
    rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    rcnn_box_loss = mx.gluon.loss.HuberLoss()
    metrics = [mx.metric.Loss('RPN_Conf'),
               mx.metric.Loss('RPN_SmoothL1'),
               mx.metric.Loss('RCNN_CrossEntropy'),
               mx.metric.Loss('RCNN_SmoothL1')]
    # metrics: [rpn_cls_loss, rpn_box_loss, rcnn_cls_loss, rcnn_box_loss]
    # metric_losses: [[rpn_cls_loss], [rpn_box_loss], [rcnn_cls_loss], [rcnn_box_loss]]
    # metric.update(0, record)
    rpn_acc_metric = RPNAccMetric()
    rpn_bbox_metric = RPNL1LossMetric()
    rcnn_acc_metric = RCNNAccMetric()
    rcnn_bbox_metric = RCNNL1LossMetric()
    metrics2 = [rpn_acc_metric, rpn_bbox_metric, rcnn_acc_metric, rcnn_bbox_metric]

    # logger set up
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    if args.verbose:
        logger.info('Trainable parameters:')
        logger.info(net.collect_train_params().keys())
    logger.info("Start training from [Epoch {}]".format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        mix_ratio = 1.0
        if not args.disable_hybridization:
            net.hybridize(static_alloc=args.static_alloc)
        rcnn_task = ForwardBackwardTask(net, trainer, rpn_cls_loss, rpn_box_loss,
                                        rcnn_cls_loss, rcnn_box_loss, mix_ratio=1.0)
        # Parallel runs forward/backward for each device shard on worker threads
        # (a single thread when Horovod is used)
        executor = Parallel(1 if args.horovod else args.executor_threads, rcnn_task)
        if args.mixup:
            train_data._dataset._data.set_mixup(np.random.uniform, 0.5, 0.5)
            mix_ratio = 0.5
            if epoch >= args.epochs - args.no_mixup_epochs:
                train_data._dataset._data.set_mixup(None)
                mix_ratio = 1.0
        # learning rate decay
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info('[Epoch {}] Set learning rate to {}'.format(epoch, new_lr))
        for metric in metrics:
            metric.reset()
        tic = time.time()   # start time of the whole epoch
        btic = time.time()  # start time of the current logging window
        base_lr = trainer.learning_rate
        rcnn_task.mix_ratio = mix_ratio
        for i, batch in enumerate(train_data):
            if epoch == 0 and i <= lr_warmup:
                new_lr = base_lr * get_lr_at_iter(i / lr_warmup)
                if new_lr != trainer.learning_rate:
                    if i % args.log_interval == 0:
                        logger.info('[Epoch 0 Iteration {}] Set learning rate to {}'.format(i, new_lr))
                    trainer.set_learning_rate(new_lr)
            batch = split_and_load(batch, ctx_list=ctx)
            batch_size = len(batch[0])
            metric_losses = [[] for _ in metrics]
            # metrics: [rpn_cls_loss, rpn_box_loss, rcnn_cls_loss, rcnn_box_loss]
            add_losses = [[] for _ in metrics2]
            # metrics2: [rpn_acc_metric, rpn_bbox_metric, rcnn_acc_metric, rcnn_bbox_metric]
            for data in zip(*batch):
                executor.put(data)
            for j in range(len(ctx)):
                result = executor.get()
                if (not args.horovod) or hvd.rank() == 0:
                    for k in range(len(metric_losses)):
                        metric_losses[k].append(result[k])
                    for k in range(len(add_losses)):
                        add_losses[k].append(result[len(metric_losses) + k])
            for metric, record in zip(metrics, metric_losses):
                # metrics: [rpn_cls_loss, rpn_box_loss, rcnn_cls_loss, rcnn_box_loss]
                # metric_losses: [[rpn_cls_loss], [rpn_box_loss], [rcnn_cls_loss], [rcnn_box_loss]]
                metric.update(0, record)
            for metric, records in zip(metrics2, add_losses):
                # metrics2 = [rpn_acc_metric, rpn_bbox_metric, rcnn_acc_metric, rcnn_bbox_metric]
                # add_losses: [[rpn_acc_metric], [rpn_bbox_metric], [rcnn_acc_metric], [rcnn_bbox_metric]]
                # rpn_acc_metric: [[rpn_label, rpn_weight], [rpn_cls_logits]]
                for pred in records:
                    # update(label, preds)
                    # label: [rpn_label, rpn_weight]
                    # preds: [rpn_cls_logits]
                    metric.update(pred[0], pred[1])
            trainer.step(batch_size)
            if (not args.horovod or hvd.rank() == 0) and args.log_interval and not (i + 1) % args.log_interval:
                msg = ','.join(['{}={:.3f}'.format(*metric.get()) for metric in metrics + metrics2])
                logger.info('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.format(
                    epoch, i, args.batch_size * args.log_interval / (time.time() - btic), msg))
                btic = time.time()
        if (not args.horovod) or hvd.rank() == 0:
            msg = ','.join(['{}={:.3f}'.format(*metric.get()) for metric in metrics])
            logger.info('[Epoch {}] Training cost: {:.3f}, {}'.format(
                epoch, (time.time() - tic), msg))
            if (epoch % args.val_interval == 0) or (args.save_interval and epoch % args.save_interval == 0):
                # every args.val_interval (or args.save_interval) epochs, evaluate on the
                # validation set to obtain current_map
                map_name, mean_ap = validate(net, val_data, ctx, eval_metric, args)
                val_msg = "\n".join('{}={}'.format(k, v) for k, v in zip(map_name, mean_ap))
                logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
                current_map = float(mean_ap[-1])  # the last entry of mean_ap is the overall mAP
            else:
                current_map = 0
            save_params(net, logger, best_map, current_map, epoch, args.save_interval,
                        args.save_prefix)
        executor.__del__()
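# All of these loops warm the learning rate up with a get_lr_at_iter(alpha, ...) helper during the
# first lr_warmup iterations. Its body is not shown in this file; the sketch below shows the
# linear warmup it is assumed to perform (ramping from lr_warmup_factor * base_lr up to base_lr),
# and is named get_lr_at_iter_sketch to flag that the default factor is an assumption.
def get_lr_at_iter_sketch(alpha, lr_warmup_factor=1. / 3.):
    """Linearly interpolate from `lr_warmup_factor` to 1 as `alpha` goes from 0 to 1."""
    return lr_warmup_factor * (1 - alpha) + alpha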
def train(net, train_loader, val_loader, eval_metric, ctx, cfg):
    kv = mx.kvstore.create(cfg["train"]["kv_store"])
    net.collect_params().setattr('grad_req', 'null')
    # the train_pattern selecting the trainable parameters was passed to the network at construction time
    net.collect_train_params().setattr('grad_req', 'write')
    optimizer_params = {
        "learning_rate": cfg["train"]["lr"],
        "wd": cfg["train"]["wd"],
        "momentum": cfg["train"]["momentum"],
    }
    trainer = gluon.Trainer(net.collect_train_params(), "sgd", optimizer_params, kvstore=kv)
    lr_decay = float(cfg["train"]["lr_decay"])
    lr_steps = sorted([float(ls) for ls in cfg["train"]["lr_decay_epoch"]])
    lr_warmup = float(cfg["train"]["lr_warmup_iteration"])
    rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    rpn_box_loss = mx.gluon.loss.HuberLoss(rho=cfg["train"]["rpn_smoothl1_rho"])
    rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    rcnn_box_loss = mx.gluon.loss.HuberLoss(rho=cfg["train"]["rcnn_smoothl1_rho"])
    metrics = [
        mx.metric.Loss("RPN_Conf"),
        mx.metric.Loss("RPN_SmoothL1"),
        mx.metric.Loss("RCNN_CrossEntropy"),
        mx.metric.Loss("RCNN_SmoothL1"),
        mx.metric.Loss('RPN_GT_Recall'),
    ]
    rpn_acc_metric = RPNAccMetric()
    rpn_bbox_metric = RPNL1LossMetric()
    rcnn_acc_metric = RCNNAccMetric()
    rcnn_bbox_metric = RCNNL1LossMetric()
    metrics2 = [rpn_acc_metric, rpn_bbox_metric, rcnn_acc_metric, rcnn_bbox_metric]

    data_prepare_time = Record("data_prepare_time")
    data_distributed_time = Record("data_distributed_time")
    net_forward_backward_time = Record("net_forward_backward_time")

    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = cfg["train"]["save_prefix"] + "_train.log"
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(cfg)
    if cfg["verbose"]:
        logger.info("Trainable parameters:")
        logger.info(net.collect_train_params().keys())
    logger.info("Start training from [Epoch {}]".format(cfg["train"]['start_epoch']))
    best_map = [0]
    for epoch in range(cfg["train"]["start_epoch"], cfg["train"]["epochs"]):
        rcnn_task = ForwardBackwardTask(net, trainer, rpn_cls_loss, rpn_box_loss,
                                        rcnn_cls_loss, rcnn_box_loss,
                                        mix_ratio=1.0, amp_enabled=None)
        # Parallel executes the computation on multiple threads.
        # Each thread handles part of the data, and that part can live on any of the GPUs;
        # the number of threads is unrelated to the number of GPUs.
        # In multi-GPU training every device needs one forward-backward call per iteration,
        # and each call would normally have to wait for its result.
        # With Parallel a call does not wait for its result, so the work runs in parallel;
        # it can also be seen as computation on background threads (if Parallel is configured
        # so the main thread does no computation).
        # cfg["executor_threads"] is set to 1. In Parallel's code, the first
        # cfg["executor_threads"] put() calls are computed in the main thread and later put()
        # calls are computed in the background threads; the goal is for the model to be
        # initialized in the main thread during the first iteration. Calling put()
        # cfg["executor_threads"] times presumably makes sure the model is initialized
        # correctly on each device.
        executor = Parallel(cfg["train"]["executor_threads"], rcnn_task)
        mix_ratio = 1.0
        if not cfg["train"]["disable_hybridization"]:
            net.hybridize(static_alloc=cfg["train"]["static_alloc"])
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(epoch, new_lr))
        for metric in metrics:
            metric.reset()
        for metric in metrics2:
            metric.reset()
        data_prepare_time.reset()
        data_distributed_time.reset()
        net_forward_backward_time.reset()
        tic = time.time()
        btic = time.time()
        base_lr = trainer.learning_rate
        rcnn_task.mix_ratio = mix_ratio
        before_data_prepare_point = time.time()
        for i, batch in enumerate(train_loader):
            data_prepare_time.update(None, time.time() - before_data_prepare_point)
            if epoch == 0 and i <= lr_warmup:
                new_lr = base_lr * get_lr_at_iter(i / lr_warmup, cfg["train"]["lr_warmup_factor"])
                if new_lr != trainer.learning_rate:
                    if i % cfg["train"]["log_interval"] == 0:
                        logger.info("[Epoch 0 Iteration {}] Set learning rate to {}".format(i, new_lr))
                    trainer.set_learning_rate(new_lr)
            before_data_distributed_point = time.time()
            # img, label, cls_targets, box_targets, box_masks = batch
            batch = split_and_load(batch, ctx_list=ctx)  # distribute the batch across devices
            data_distributed_time.update(None, time.time() - before_data_distributed_point)
            metric_losses = [[] for _ in metrics]
            add_losses = [[] for _ in metrics2]
            before_net_forward_backward_point = time.time()
            for data in zip(*batch):
                executor.put(data)
            # for j in range(len(ctx)):
            result = executor.get()
            for k in range(len(metric_losses)):
                metric_losses[k].append(result[k])
            for k in range(len(add_losses)):
                add_losses[k].append(result[len(metric_losses) + k])
            for metric, record in zip(metrics, metric_losses):
                metric.update(0, record)  # gather all the losses together
            for metric, records in zip(metrics2, add_losses):
                for pred in records:
                    metric.update(pred[0], pred[1])
            trainer.step(cfg["dataset"]["batch_size_per_device"] * len(ctx))
            net_forward_backward_time.update(None, time.time() - before_net_forward_backward_point)
            if cfg["train"]["log_interval"] and not (i + 1) % cfg["train"]["log_interval"]:
                msg = ",".join(["{}={:.3f}".format(*metric.get()) for metric in metrics + metrics2])
                logger.info("[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}".format(
                    epoch, i,
                    cfg["train"]["log_interval"] * cfg["dataset"]["batch_size_per_device"] * len(ctx) /
                    (time.time() - btic), msg))
                time_msg = ",".join(["{}={}".format(*metric.get()) for metric in [
                    data_prepare_time, data_distributed_time, net_forward_backward_time]])
                logger.info("[Epoch {}][Batch {}], {}".format(epoch, i, time_msg))
                btic = time.time()
            before_data_prepare_point = time.time()
        msg = ",".join(["{}={:.3f}".format(*metric.get()) for metric in metrics])
        logger.info("[Epoch {}] Training cost: {:.3f}, {}".format(epoch, (time.time() - tic), msg))
        if not (epoch + 1) % cfg["train"]["val_interval"]:
            map_name, mean_ap = validate(net, val_loader, ctx, eval_metric, cfg)
            val_msg = "\n".join(["{}={}".format(k, v) for k, v in zip(map_name, mean_ap)])
            logger.info("[Epoch {}] Validation: \n{}".format(epoch, val_msg))
            current_map = float(mean_ap[-1])
        else:
            current_map = 0.
        save_params(net, logger, best_map, current_map, epoch,
                    cfg["train"]["save_interval"], cfg["train"]["save_prefix"])
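# The ForwardBackwardTask / Parallel pattern used throughout these loops follows gluoncv's
# parallel utilities: the task implements forward_backward(), Parallel fans the work out to
# worker threads, and put()/get() are called once per device shard. A toy sketch of that
# interaction is below; the SquareTask class is purely illustrative and not part of any of the
# scripts above.
from gluoncv.utils.parallel import Parallel, Parallelizable

class SquareTask(Parallelizable):
    """Stand-in for ForwardBackwardTask: does some 'work' for one device shard."""
    def forward_backward(self, x):
        return x * x

task = SquareTask()
executor = Parallel(2, task)                  # 2 worker threads, independent of the device count
for shard in [1, 2, 3, 4]:                    # pretend each number is one device's batch slice
    executor.put(shard)
results = [executor.get() for _ in range(4)]  # collect one result per put()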