def train(ctx):
    """Run the training loop, validating and checkpointing after every epoch.

    Relies on module-level state that is not passed in: ``opt``, ``net``,
    ``optimizer``, ``optimizer_params``, ``kv``, ``train_data``, ``val_data``,
    ``batch_fn``, ``batch_size``, ``train_metric``, ``sw``, ``logger``,
    ``model_name`` and ``test``.

    Parameters
    ----------
    ctx : mx.Context or list of mx.Context
        Device(s) to train on; a single context is wrapped into a list.
    """
    if isinstance(ctx, mx.Context):
        ctx = [ctx]

    if opt.no_wd:
        # Disable weight decay for parameters matching beta/gamma/bias.
        for param in net.collect_params('.*beta|.*gamma|.*bias').values():
            param.wd_mult = 0.0

    # ``None`` is the default selector of ``collect_params`` (all parameters).
    train_patterns = None
    if opt.partial_bn:
        if 'inceptionv3' in opt.model:
            train_patterns = ('.*weight|.*bias'
                              '|inception30_batchnorm0_gamma'
                              '|inception30_batchnorm0_beta'
                              '|inception30_batchnorm0_running_mean'
                              '|inception30_batchnorm0_running_var')
        else:
            logger.info(
                'Current model does not support partial batch normalization.'
            )

    trainer_kwargs = {'update_on_kvstore': False}
    if opt.kvstore is not None:
        trainer_kwargs['kvstore'] = kv
    trainer = gluon.Trainer(net.collect_params(train_patterns), optimizer,
                            optimizer_params, **trainer_kwargs)

    if opt.accumulate > 1:
        # Let gradients add up across iterations instead of being overwritten.
        for param in net.collect_params().values():
            if param.grad_req != 'null':
                param.grad_req = 'add'

    # Compare by value: ``is not ''`` tests object identity, which is not a
    # reliable emptiness check and emits a SyntaxWarning on Python >= 3.8.
    if opt.resume_states != '':
        trainer.load_states(opt.resume_states)

    if opt.use_amp:
        amp.init_trainer(trainer)

    L = gluon.loss.SoftmaxCrossEntropyLoss()

    best_val_score = 0
    # The step size is scaled by the worker count only when a kvstore is used.
    num_workers = kv.num_workers if opt.kvstore is not None else 1

    for epoch in range(opt.resume_epoch, opt.num_epochs):
        tic = time.time()
        train_metric.reset()
        btic = time.time()
        num_train_iter = len(train_data)
        train_loss_epoch = 0
        train_loss_iter = 0

        for i, batch in enumerate(train_data):
            data, label = batch_fn(batch, ctx)

            with ag.record():
                outputs = []
                for X in data:
                    # Merge the first two axes before the forward pass.
                    X = X.reshape((-1, ) + X.shape[2:])
                    pred = net(X.astype(opt.dtype, copy=False))
                    outputs.append(pred)
                loss = [
                    L(yhat, y.astype(opt.dtype, copy=False))
                    for yhat, y in zip(outputs, label)
                ]

                if opt.use_amp:
                    with amp.scale_loss(loss, trainer) as scaled_loss:
                        ag.backward(scaled_loss)
                else:
                    ag.backward(loss)

            # NOTE(review): when opt.accumulate > 1 the ``else`` branch still
            # steps on every non-boundary iteration, using the gradients
            # accumulated so far. Behaviour kept as-is; confirm it is intended.
            if opt.accumulate > 1 and (i + 1) % opt.accumulate == 0:
                trainer.step(batch_size * num_workers * opt.accumulate)
                net.collect_params().zero_grad()
            else:
                trainer.step(batch_size * num_workers)

            train_metric.update(label, outputs)
            train_loss_iter = sum([l.mean().asscalar() for l in loss]) / len(loss)
            train_loss_epoch += train_loss_iter

            train_metric_name, train_metric_score = train_metric.get()
            global_step = epoch * num_train_iter + i
            sw.add_scalar(tag='train_acc_top1_iter',
                          value=train_metric_score * 100,
                          global_step=global_step)
            sw.add_scalar(tag='train_loss_iter',
                          value=train_loss_iter,
                          global_step=global_step)
            sw.add_scalar(tag='learning_rate_iter',
                          value=trainer.learning_rate,
                          global_step=global_step)

            if opt.log_interval and not (i + 1) % opt.log_interval:
                logger.info(
                    'Epoch[%03d] Batch [%04d]/[%04d]\tSpeed: %f samples/sec\t %s=%f\t loss=%f\t lr=%f'
                    % (epoch, i, num_train_iter,
                       batch_size * opt.log_interval / (time.time() - btic),
                       train_metric_name, train_metric_score * 100,
                       train_loss_epoch / (i + 1), trainer.learning_rate))
                btic = time.time()

        train_metric_name, train_metric_score = train_metric.get()
        # Count every processed batch; the last zero-based index ``i`` is one
        # short and is unbound when the loader yields nothing.
        throughput = int(batch_size * num_train_iter / (time.time() - tic))
        mx.ndarray.waitall()

        if opt.kvstore is not None and epoch == opt.resume_epoch:
            # Register the kvstore keys once, in the first epoch that runs.
            kv.init(111111, nd.zeros(1))
            kv.init(555555, nd.zeros(1))
            kv.init(999999, nd.zeros(1))

        if opt.kvstore is not None:
            acc_top1_val, acc_top5_val, loss_val = test(ctx, val_data, kv)
        else:
            acc_top1_val, acc_top5_val, loss_val = test(ctx, val_data)

        logger.info('[Epoch %03d] training: %s=%f\t loss=%f' %
                    (epoch, train_metric_name, train_metric_score * 100,
                     train_loss_epoch / num_train_iter))
        logger.info('[Epoch %03d] speed: %d samples/sec\ttime cost: %f' %
                    (epoch, throughput, time.time() - tic))
        logger.info(
            '[Epoch %03d] validation: acc-top1=%f acc-top5=%f loss=%f' %
            (epoch, acc_top1_val * 100, acc_top5_val * 100, loss_val))

        sw.add_scalar(tag='train_loss_epoch',
                      value=train_loss_epoch / num_train_iter,
                      global_step=epoch)
        sw.add_scalar(tag='val_loss_epoch', value=loss_val, global_step=epoch)
        sw.add_scalar(tag='val_acc_top1_epoch',
                      value=acc_top1_val * 100,
                      global_step=epoch)

        if acc_top1_val > best_val_score:
            best_val_score = acc_top1_val
            net.save_parameters('%s/%.4f-%s-%s-%03d-best.params' %
                                (opt.save_dir, best_val_score, opt.dataset,
                                 model_name, epoch))
            trainer.save_states('%s/%.4f-%s-%s-%03d-best.states' %
                                (opt.save_dir, best_val_score, opt.dataset,
                                 model_name, epoch))
        elif (opt.save_frequency and opt.save_dir
              and (epoch + 1) % opt.save_frequency == 0):
            net.save_parameters(
                '%s/%s-%s-%03d.params' %
                (opt.save_dir, opt.dataset, model_name, epoch))
            trainer.save_states(
                '%s/%s-%s-%03d.states' %
                (opt.save_dir, opt.dataset, model_name, epoch))

    # save the last model
    net.save_parameters(
        '%s/%s-%s-%03d.params' %
        (opt.save_dir, opt.dataset, model_name, opt.num_epochs - 1))
    trainer.save_states(
        '%s/%s-%s-%03d.states' %
        (opt.save_dir, opt.dataset, model_name, opt.num_epochs - 1))
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Train the detector with a multibox loss, validating and saving per epoch.

    Each epoch applies any due learning-rate decay and runs one pass over
    ``train_data``. On the non-Horovod process or Horovod rank 0 it then logs
    the running losses, optionally calls ``validate`` and always calls
    ``save_params``.
    """

    def on_primary():
        # True when Horovod is disabled, or when this is Horovod rank 0.
        return not args.horovod or hvd.rank() == 0

    net.collect_params().reset_ctx(ctx)

    sgd_settings = {
        'learning_rate': args.lr,
        'wd': args.wd,
        'momentum': args.momentum,
    }
    if args.horovod:
        hvd.broadcast_parameters(net.collect_params(), root_rank=0)
        trainer = hvd.DistributedTrainer(net.collect_params(), 'sgd',
                                         sgd_settings)
    else:
        # update_on_kvstore is forced off with AMP, otherwise left as None.
        kvstore_update = False if args.amp else None
        trainer = gluon.Trainer(net.collect_params(), 'sgd', sgd_settings,
                                update_on_kvstore=kvstore_update)

    if args.amp:
        amp.init_trainer(trainer)

    # Learning-rate decay policy: multiply by ``decay_factor`` at each epoch
    # listed in the comma-separated ``args.lr_decay_epoch``.
    decay_factor = float(args.lr_decay)
    decay_epochs = sorted(
        float(tok) for tok in args.lr_decay_epoch.split(',') if tok.strip())

    mbox_loss = gcv.loss.SSDMultiBoxLoss()
    ce_metric = mx.metric.Loss('CrossEntropy')
    smoothl1_metric = mx.metric.Loss('SmoothL1')

    # Root logger at INFO level, mirrored to '<save_prefix>_train.log'.
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_path = args.save_prefix + '_train.log'
    log_parent = os.path.dirname(log_path)
    if log_parent and not os.path.exists(log_parent):
        os.makedirs(log_parent)
    logger.addHandler(logging.FileHandler(log_path))
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))

    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        # Consume every decay milestone that has been reached.
        while decay_epochs and epoch >= decay_epochs[0]:
            new_lr = trainer.learning_rate * decay_factor
            decay_epochs.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(
                epoch, new_lr))

        ce_metric.reset()
        smoothl1_metric.reset()
        tic = time.time()
        btic = time.time()
        net.hybridize(static_alloc=True, static_shape=True)

        for i, batch in enumerate(train_data):
            if args.dali:
                # dali iterator returns a mxnet.io.DataBatch
                data = [shard.data[0] for shard in batch]
                box_targets = [shard.label[0] for shard in batch]
                cls_targets = [
                    nd.cast(shard.label[1], dtype='float32')
                    for shard in batch
                ]
            else:
                data = gluon.utils.split_and_load(batch[0],
                                                  ctx_list=ctx,
                                                  batch_axis=0)
                cls_targets = gluon.utils.split_and_load(batch[1],
                                                         ctx_list=ctx,
                                                         batch_axis=0)
                box_targets = gluon.utils.split_and_load(batch[2],
                                                         ctx_list=ctx,
                                                         batch_axis=0)

            with autograd.record():
                cls_preds, box_preds = [], []
                for shard in data:
                    cls_out, box_out, _ = net(shard)
                    cls_preds.append(cls_out)
                    box_preds.append(box_out)
                sum_loss, cls_loss, box_loss = mbox_loss(
                    cls_preds, box_preds, cls_targets, box_targets)
                if args.amp:
                    with amp.scale_loss(sum_loss, trainer) as scaled_loss:
                        autograd.backward(scaled_loss)
                else:
                    autograd.backward(sum_loss)
            # The loss is already normalized, so do not divide by the batch
            # size again inside the trainer.
            trainer.step(1)

            if on_primary():
                workers = hvd.size() if args.horovod else 1
                local_batch_size = int(args.batch_size // workers)
                ce_metric.update(
                    0, [part * local_batch_size for part in cls_loss])
                smoothl1_metric.update(
                    0, [part * local_batch_size for part in box_loss])
                if args.log_interval and not (i + 1) % args.log_interval:
                    ce_name, ce_value = ce_metric.get()
                    sl1_name, sl1_value = smoothl1_metric.get()
                    logger.info(
                        '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}'
                        .format(epoch, i,
                                args.batch_size / (time.time() - btic),
                                ce_name, ce_value, sl1_name, sl1_value))
                btic = time.time()

        if on_primary():
            ce_name, ce_value = ce_metric.get()
            sl1_name, sl1_value = smoothl1_metric.get()
            logger.info(
                '[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}'.
                format(epoch, (time.time() - tic), ce_name, ce_value,
                       sl1_name, sl1_value))
            validate_now = (epoch % args.val_interval == 0) or (
                args.save_interval and epoch % args.save_interval == 0)
            if validate_now:
                # Validation frequency can be reduced to save time.
                map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
                val_msg = '\n'.join(
                    '{}={}'.format(key, value)
                    for key, value in zip(map_name, mean_ap))
                logger.info('[Epoch {}] Validation: \n{}'.format(
                    epoch, val_msg))
                current_map = float(mean_ap[-1])
            else:
                current_map = 0.
            save_params(net, best_map, current_map, epoch,
                        args.save_interval, args.save_prefix)
def train(metric):
    """Fine-tune the classifier, evaluate every epoch, then run test inference.

    Uses module-level state (``args``, ``model``, ``ctx``, ``lr``,
    ``epsilon``, ``batch_size``, ``accumulate``, ``train_data``,
    ``dev_data_list``, ``test_data_list``, ``only_inference``, ...). When
    ``only_inference`` is set, the optimisation loop, checkpoint saving and
    best-checkpoint reload are skipped; dev evaluation and test inference
    still run.
    """
    if not only_inference:
        logging.info('Now we are doing BERT classification training on %s!', ctx)

    param_dict = model.collect_params()
    optimizer_params = {'learning_rate': lr, 'epsilon': epsilon, 'wd': 0.01}
    try:
        trainer = gluon.Trainer(param_dict, args.optimizer,
                                optimizer_params, update_on_kvstore=False)
    except ValueError as err:
        # Fall back to plain Adam when the requested optimizer is rejected.
        print(err)
        warnings.warn(
            'AdamW optimizer is not found. Please consider upgrading to '
            'mxnet>=1.5.0. Now the original Adam optimizer is used instead.')
        trainer = gluon.Trainer(param_dict, 'adam',
                                optimizer_params, update_on_kvstore=False)
    if args.dtype == 'float16':
        amp.init_trainer(trainer)

    if accumulate:
        step_size = batch_size * accumulate
    else:
        step_size = batch_size
    num_train_steps = int(num_train_examples / step_size * args.epochs)
    num_warmup_steps = int(num_train_steps * args.warmup_ratio)
    step_num = 0

    def scheduled_lr(step):
        # Linear warm-up to ``lr`` followed by linear decay.
        if step < num_warmup_steps:
            return lr * step / num_warmup_steps
        non_warmup_steps = step - num_warmup_steps
        offset = non_warmup_steps / (num_train_steps - num_warmup_steps)
        return lr - offset * lr

    # Do not apply weight decay on LayerNorm and bias terms
    for no_decay in model.collect_params('.*beta|.*gamma|.*bias').values():
        no_decay.wd_mult = 0.0

    # Parameters that receive gradients
    params = [p for p in param_dict.values() if p.grad_req != 'null']

    # Gradient accumulation needs gradients to add up between updates
    if accumulate and accumulate > 1:
        for p in params:
            p.grad_req = 'add'

    # (epoch, metric names, metric values) for every dev evaluation
    metric_history = []

    tic = time.time()
    for epoch_id in range(args.epochs):
        if not only_inference:
            metric.reset()
            step_loss = 0
            tic = time.time()
            param_dict.zero_grad()

            for batch_id, seqs in enumerate(train_data):
                # learning rate schedule
                trainer.set_learning_rate(scheduled_lr(step_num))

                # forward and backward
                with mx.autograd.record():
                    input_ids, valid_length, type_ids, label = seqs
                    out = model(
                        input_ids.as_in_context(ctx),
                        type_ids.as_in_context(ctx),
                        valid_length.astype('float32').as_in_context(ctx))
                    ls = loss_function(out, label.as_in_context(ctx)).mean()
                    if args.dtype == 'float16':
                        with amp.scale_loss(ls, trainer) as scaled_loss:
                            mx.autograd.backward(scaled_loss)
                    else:
                        ls.backward()

                # update once per ``accumulate`` batches (or every batch)
                if not accumulate or (batch_id + 1) % accumulate == 0:
                    trainer.allreduce_grads()
                    nlp.utils.clip_grad_global_norm(params, 1)
                    trainer.update(accumulate if accumulate else 1)
                    step_num += 1
                    if accumulate and accumulate > 1:
                        # set grad to zero for gradient accumulation
                        param_dict.zero_grad()

                step_loss += ls.asscalar()
                metric.update([label], [out])
                if (batch_id + 1) % (args.log_interval) == 0:
                    log_train(batch_id, len(train_data), metric, step_loss,
                              args.log_interval, epoch_id,
                              trainer.learning_rate)
                    step_loss = 0
            mx.nd.waitall()

        # inference on dev data
        for segment, dev_data in dev_data_list:
            metric_nm, metric_val = evaluate(dev_data, metric, segment)
            metric_history.append((epoch_id, metric_nm, metric_val))

        if not only_inference:
            # save params
            ckpt_name = 'model_bert_{0}_{1}.params'.format(task_name, epoch_id)
            params_saved = os.path.join(output_dir, ckpt_name)
            nlp.utils.save_parameters(model, params_saved)
            logging.info('params saved in: %s', params_saved)
            toc = time.time()
            logging.info('Time cost=%.2fs', toc - tic)
            tic = toc

    if not only_inference:
        # The best checkpoint is chosen by the first metric value, assuming a
        # higher score means better model quality.
        metric_history.sort(key=lambda entry: entry[2][0], reverse=True)
        epoch_id, metric_nm, metric_val = metric_history[0]
        ckpt_name = 'model_bert_{0}_{1}.params'.format(task_name, epoch_id)
        params_saved = os.path.join(output_dir, ckpt_name)
        nlp.utils.load_parameters(model, params_saved)
        metric_str = 'Best model at epoch {}. Validation metrics:'.format(
            epoch_id)
        metric_str += ','.join([name + ':%.4f' for name in metric_nm])
        logging.info(metric_str, *metric_val)

    # inference on test data
    for segment, test_data in test_data_list:
        test(test_data, segment)
def train():
    """Fine-tune ``net`` on the SQuAD training split for a fixed step budget.

    Uses module-level state (``args``, ``net``, ``ctx``, ``tokenizer``,
    ``lr``, ``batch_size``, ``accumulate``, ``size``, ``rank``, ...). The data
    loader is re-iterated until ``num_train_steps`` scheduler steps have been
    taken; rank 0 then saves the parameters to ``output_dir``.
    """
    segment = 'train'  #if not args.debug else 'dev'
    log.info('Loading %s data...', segment)
    train_data = SQuAD(segment, version='2.0' if version_2 else '1.1')
    if args.debug:
        # Debug runs use only the first 10000 records.
        sampled_data = [train_data[idx] for idx in range(10000)]
        train_data = mx.gluon.data.SimpleDataset(sampled_data)
    log.info('Number of records in Train data:{}'.format(len(train_data)))

    train_data_transform = preprocess_dataset(
        tokenizer,
        train_data,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        input_features=True)
    log.info('The number of examples after preprocessing:{}'.format(
        len(train_data_transform)))

    # Each worker reads its own part of the data.
    sampler = nlp.data.SplitSampler(len(train_data_transform),
                                    num_parts=size,
                                    part_index=rank,
                                    even_size=True)
    num_train_examples = len(sampler)
    train_dataloader = mx.gluon.data.DataLoader(train_data_transform,
                                                batchify_fn=batchify_fn,
                                                batch_size=batch_size,
                                                num_workers=4,
                                                sampler=sampler)

    log.info('Start Training')

    optimizer_params = {'learning_rate': lr}
    param_dict = net.collect_params()
    if args.comm_backend == 'horovod':
        trainer = hvd.DistributedTrainer(param_dict, optimizer,
                                         optimizer_params)
    else:
        trainer = mx.gluon.Trainer(param_dict, optimizer, optimizer_params,
                                   update_on_kvstore=False)
    if args.dtype == 'float16':
        amp.init_trainer(trainer)

    if accumulate:
        step_size = batch_size * accumulate
    else:
        step_size = batch_size
    num_train_steps = int(num_train_examples / step_size * args.epochs)
    if args.training_steps:
        num_train_steps = args.training_steps

    num_warmup_steps = int(num_train_steps * warmup_ratio)

    def set_new_lr(step_num, batch_id):
        """set new learning rate"""
        # Advance the step on every batch, or only on batches whose index is
        # a multiple of ``accumulate`` when accumulating gradients.
        if not accumulate or batch_id % accumulate == 0:
            step_num += 1
        # Linear warm-up followed by linear decay. Adapted from the
        # traditional linear scheduler where, for
        # step_num >= num_warmup_steps, new_lr = 1 - step_num/num_train_steps.
        if step_num < num_warmup_steps:
            new_lr = lr * step_num / num_warmup_steps
        else:
            offset = (step_num - num_warmup_steps) * lr / \
                (num_train_steps - num_warmup_steps)
            new_lr = lr - offset
        trainer.set_learning_rate(new_lr)
        return step_num

    # Do not apply weight decay on LayerNorm and bias terms
    for no_decay in net.collect_params('.*beta|.*gamma|.*bias').values():
        no_decay.wd_mult = 0.0
    # Collect differentiable parameters
    params = [p for p in param_dict.values() if p.grad_req != 'null']

    # Set grad_req if gradient accumulation is required
    if accumulate:
        for p in params:
            p.grad_req = 'add'
    net.collect_params().zero_grad()

    epoch_tic = time.time()
    total_num = 0
    log_num = 0
    batch_id = 0
    step_loss = 0.0
    step_num = 0
    tic = time.time()

    while step_num < num_train_steps:
        for data in train_dataloader:
            # set new lr
            step_num = set_new_lr(step_num, batch_id)

            # forward and backward
            _, inputs, token_types, valid_length, start_label, end_label = data
            num_labels = len(inputs)
            log_num += num_labels
            total_num += num_labels

            with mx.autograd.record():
                out = net(inputs.as_in_context(ctx),
                          token_types.as_in_context(ctx),
                          valid_length.as_in_context(ctx).astype('float32'))

                targets = [
                    start_label.as_in_context(ctx).astype('float32'),
                    end_label.as_in_context(ctx).astype('float32')
                ]
                loss = loss_function(out, targets).sum() / num_labels

                if accumulate:
                    loss = loss / accumulate
                if args.dtype == 'float16':
                    with amp.scale_loss(loss, trainer) as l:
                        mx.autograd.backward(l)
                        # Clip threshold follows the current loss scale.
                        norm_clip = 1.0 * size * trainer._amp_loss_scaler.loss_scale
                else:
                    mx.autograd.backward(loss)
                    norm_clip = 1.0 * size

            # update
            if not accumulate or (batch_id + 1) % accumulate == 0:
                trainer.allreduce_grads()
                nlp.utils.clip_grad_global_norm(params, norm_clip)
                trainer.update(1)
                if accumulate:
                    param_dict.zero_grad()

            if args.comm_backend == 'horovod':
                step_loss += hvd.allreduce(loss, average=True).asscalar()
            else:
                step_loss += loss.asscalar()

            if (batch_id + 1) % log_interval == 0:
                toc = time.time()
                num_batches = len(train_dataloader)
                log.info('Batch: {}/{}, Loss={:.4f}, lr={:.7f} '
                         'Thoughput={:.2f} samples/s'.format(
                             batch_id % num_batches, num_batches,
                             step_loss / log_interval, trainer.learning_rate,
                             log_num / (toc - tic)))
                tic = time.time()
                step_loss = 0.0
                log_num = 0

            if step_num >= num_train_steps:
                break
            batch_id += 1

        log.info('Finish training step: %d', step_num)
        epoch_toc = time.time()
        log.info('Time cost={:.2f} s, Thoughput={:.2f} samples/s'.format(
            epoch_toc - epoch_tic, total_num / (epoch_toc - epoch_tic)))

    if rank == 0:
        net.save_parameters(os.path.join(output_dir, 'net.params'))
def train(net, train_data, val_data, eval_metric, batch_size, ctx, args):
    """Training pipeline for the two-stage (RPN + RCNN) detector.

    Only parameters returned by ``net.collect_train_params()`` are optimised;
    all others get ``grad_req='null'``. Each epoch rebuilds the
    forward/backward task, applies any due learning-rate decay and, during
    epoch 0 only, a warm-up over the first ``args.lr_warmup`` iterations. It
    then runs one pass over ``train_data``. On the non-Horovod process or
    Horovod rank 0 it logs the losses, validates every ``args.val_interval``
    epochs and calls ``save_params``.

    Warm-up is skipped when ``args.lr_warmup`` is not positive. The warm-up
    log line is skipped when ``args.log_interval`` is 0.
    """
    args.kv_store = 'device' if (args.amp and 'nccl' in args.kv_store) else args.kv_store
    kv = mx.kvstore.create(args.kv_store)
    # Disable gradients everywhere, then re-enable them for the trainable set.
    net.collect_params().setattr('grad_req', 'null')
    net.collect_train_params().setattr('grad_req', 'write')
    optimizer_params = {'learning_rate': args.lr, 'wd': args.wd, 'momentum': args.momentum}
    if args.amp:
        optimizer_params['multi_precision'] = True
    if args.horovod:
        hvd.broadcast_parameters(net.collect_params(), root_rank=0)
        trainer = hvd.DistributedTrainer(
            net.collect_train_params(),  # fix batchnorm, fix first stage, etc...
            'sgd',
            optimizer_params)
    else:
        trainer = gluon.Trainer(
            net.collect_train_params(),  # fix batchnorm, fix first stage, etc...
            'sgd',
            optimizer_params,
            update_on_kvstore=(False if args.amp else None), kvstore=kv)

    if args.amp:
        amp.init_trainer(trainer)

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted([float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])
    lr_warmup = float(args.lr_warmup)  # avoid int division

    # TODO(zhreshold) losses?
    rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    rpn_box_loss = mx.gluon.loss.HuberLoss(rho=args.rpn_smoothl1_rho)  # == smoothl1
    rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    rcnn_box_loss = mx.gluon.loss.HuberLoss(rho=args.rcnn_smoothl1_rho)  # == smoothl1
    metrics = [mx.metric.Loss('RPN_Conf'),
               mx.metric.Loss('RPN_SmoothL1'),
               mx.metric.Loss('RCNN_CrossEntropy'),
               mx.metric.Loss('RCNN_SmoothL1'), ]

    rpn_acc_metric = RPNAccMetric()
    rpn_bbox_metric = RPNL1LossMetric()
    rcnn_acc_metric = RCNNAccMetric()
    rcnn_bbox_metric = RCNNL1LossMetric()
    metrics2 = [rpn_acc_metric, rpn_bbox_metric, rcnn_acc_metric, rcnn_bbox_metric]

    # set up logger: root logger at INFO, mirrored to '<save_prefix>_train.log'
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    if args.custom_model:
        logger.info('Custom model enabled. Expert Only!! Currently non-FPN model is not supported!!'
                    ' Default setting is for MS-COCO.')
    logger.info(args)

    if args.verbose:
        logger.info('Trainable parameters:')
        logger.info(net.collect_train_params().keys())
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        mix_ratio = 1.0
        if not args.disable_hybridization:
            net.hybridize(static_alloc=args.static_alloc)
        rcnn_task = ForwardBackwardTask(net, trainer, rpn_cls_loss, rpn_box_loss, rcnn_cls_loss,
                                        rcnn_box_loss, mix_ratio=1.0)
        # Horovod runs the task inline; otherwise work is handed to Parallel.
        executor = Parallel(args.executor_threads, rcnn_task) if not args.horovod else None
        if args.mixup:
            # TODO(zhreshold) only support evenly mixup now, target generator needs to be modified otherwise
            train_data._dataset._data.set_mixup(np.random.uniform, 0.5, 0.5)
            mix_ratio = 0.5
            if epoch >= args.epochs - args.no_mixup_epochs:
                # Mixup is switched off for the final ``no_mixup_epochs`` epochs.
                train_data._dataset._data.set_mixup(None)
                mix_ratio = 1.0
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(epoch, new_lr))
        for metric in metrics:
            metric.reset()
        tic = time.time()
        btic = time.time()
        base_lr = trainer.learning_rate
        rcnn_task.mix_ratio = mix_ratio
        for i, batch in enumerate(train_data):
            # ``lr_warmup > 0`` avoids 0 / 0.0 at i == 0 when warm-up is
            # configured as 0; non-positive values simply disable warm-up.
            if epoch == 0 and lr_warmup > 0 and i <= lr_warmup:
                # adjust based on real percentage
                new_lr = base_lr * get_lr_at_iter(i / lr_warmup, args.lr_warmup_factor)
                if new_lr != trainer.learning_rate:
                    # Guard the modulo like the batch logging below does, so
                    # log_interval == 0 cannot raise ZeroDivisionError.
                    if args.log_interval and i % args.log_interval == 0:
                        logger.info(
                            '[Epoch 0 Iteration {}] Set learning rate to {}'.format(i, new_lr))
                    trainer.set_learning_rate(new_lr)
            batch = split_and_load(batch, ctx_list=ctx)
            metric_losses = [[] for _ in metrics]
            add_losses = [[] for _ in metrics2]
            if executor is not None:
                for data in zip(*batch):
                    executor.put(data)
            for _ in range(len(ctx)):
                if executor is not None:
                    result = executor.get()
                else:
                    result = rcnn_task.forward_backward(list(zip(*batch))[0])
                if (not args.horovod) or hvd.rank() == 0:
                    # ``result`` holds the loss entries first, followed by the
                    # entries consumed by ``metrics2``.
                    for k in range(len(metric_losses)):
                        metric_losses[k].append(result[k])
                    for k in range(len(add_losses)):
                        add_losses[k].append(result[len(metric_losses) + k])
            for metric, record in zip(metrics, metric_losses):
                metric.update(0, record)
            for metric, records in zip(metrics2, add_losses):
                for pred in records:
                    metric.update(pred[0], pred[1])
            trainer.step(batch_size)

            # update metrics
            if (not args.horovod or hvd.rank() == 0) and args.log_interval \
                    and not (i + 1) % args.log_interval:
                msg = ','.join(
                    ['{}={:.3f}'.format(*metric.get()) for metric in metrics + metrics2])
                logger.info('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.format(
                    epoch, i, args.log_interval * args.batch_size / (time.time() - btic), msg))
                btic = time.time()

        if (not args.horovod) or hvd.rank() == 0:
            msg = ','.join(['{}={:.3f}'.format(*metric.get()) for metric in metrics])
            logger.info('[Epoch {}] Training cost: {:.3f}, {}'.format(
                epoch, (time.time() - tic), msg))
            if not (epoch + 1) % args.val_interval:
                # consider reduce the frequency of validation to save time
                map_name, mean_ap = validate(net, val_data, ctx, eval_metric, args)
                val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
                logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
                current_map = float(mean_ap[-1])
            else:
                current_map = 0.
            save_params(net, logger, best_map, current_map, epoch, args.save_interval,
                        args.save_prefix)
def train(net, train_data, val_data, eval_metric, batch_size, ctx, logger, args):
    """Training pipeline for the RPN + RCNN detector with a mask loss.

    Only parameters returned by ``net.collect_train_params()`` are optimised,
    and bias parameters are excluded from weight decay. Learning-rate warm-up
    is counted in global iterations (``i + epoch * len(train_data)``). Batches
    are prefetched one step ahead. Validation is handed to ``validate``
    together with ``async_eval_processes``, whose entries are joined before
    returning.

    Warm-up is skipped when ``args.lr_warmup`` is not positive. The warm-up
    log line is skipped when ``args.log_interval`` is 0.
    """
    args.kv_store = 'device' if (args.amp and 'nccl' in args.kv_store) else args.kv_store
    kv = mx.kvstore.create(args.kv_store)
    # Disable gradients everywhere, then re-enable them for the trainable set.
    net.collect_params().setattr('grad_req', 'null')
    net.collect_train_params().setattr('grad_req', 'write')
    for _, v in net.collect_params('.*bias').items():
        v.wd_mult = 0.0
    optimizer_params = {'learning_rate': args.lr, 'wd': args.wd, 'momentum': args.momentum, }
    if args.clip_gradient > 0.0:
        optimizer_params['clip_gradient'] = args.clip_gradient
    if args.amp:
        optimizer_params['multi_precision'] = True
    if args.horovod:
        hvd.broadcast_parameters(net.collect_params(), root_rank=0)
        trainer = hvd.DistributedTrainer(
            net.collect_train_params(),  # fix batchnorm, fix first stage, etc...
            'sgd',
            optimizer_params
        )
    else:
        trainer = gluon.Trainer(
            net.collect_train_params(),  # fix batchnorm, fix first stage, etc...
            'sgd',
            optimizer_params,
            update_on_kvstore=(False if args.amp else None),
            kvstore=kv)

    if args.amp:
        amp.init_trainer(trainer)

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted([float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])
    lr_warmup = float(args.lr_warmup)  # avoid int division

    rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    rpn_box_loss = mx.gluon.loss.HuberLoss(rho=1 / 9.)  # == smoothl1
    rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    rcnn_box_loss = mx.gluon.loss.HuberLoss()  # == smoothl1
    rcnn_mask_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    metrics = [mx.metric.Loss('RPN_Conf'),
               mx.metric.Loss('RPN_SmoothL1'),
               mx.metric.Loss('RCNN_CrossEntropy'),
               mx.metric.Loss('RCNN_SmoothL1'),
               mx.metric.Loss('RCNN_Mask')]

    rpn_acc_metric = RPNAccMetric()
    rpn_bbox_metric = RPNL1LossMetric()
    rcnn_acc_metric = RCNNAccMetric()
    rcnn_bbox_metric = RCNNL1LossMetric()
    rcnn_mask_metric = MaskAccMetric()
    rcnn_fgmask_metric = MaskFGAccMetric()
    metrics2 = [rpn_acc_metric, rpn_bbox_metric,
                rcnn_acc_metric, rcnn_bbox_metric,
                rcnn_mask_metric, rcnn_fgmask_metric]
    async_eval_processes = []
    logger.info(args)

    if args.verbose:
        logger.info('Trainable parameters:')
        logger.info(net.collect_train_params().keys())
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    base_lr = trainer.learning_rate
    for epoch in range(args.start_epoch, args.epochs):
        if not args.disable_hybridization:
            net.hybridize(static_alloc=args.static_alloc)
        rcnn_task = ForwardBackwardTask(net, trainer, rpn_cls_loss, rpn_box_loss, rcnn_cls_loss,
                                        rcnn_box_loss, rcnn_mask_loss)
        # Horovod runs the task inline; otherwise work is handed to Parallel.
        executor = Parallel(args.executor_threads, rcnn_task) if not args.horovod else None
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(epoch, new_lr))
        for metric in metrics:
            metric.reset()
        tic = time.time()
        btic = time.time()
        # Load the first batch up front; later batches are prefetched below.
        train_data_iter = iter(train_data)
        next_data_batch = next(train_data_iter)
        next_data_batch = split_and_load(next_data_batch, ctx_list=ctx)
        for i in range(len(train_data)):
            batch = next_data_batch
            global_iter = i + epoch * len(train_data)
            # ``lr_warmup > 0`` avoids 0 / 0.0 at the first iteration when
            # warm-up is configured as 0; non-positive values disable warm-up.
            if lr_warmup > 0 and global_iter <= lr_warmup:
                # adjust based on real percentage
                new_lr = base_lr * get_lr_at_iter(global_iter / lr_warmup,
                                                  args.lr_warmup_factor)
                if new_lr != trainer.learning_rate:
                    # Guard the modulo like the batch logging below does, so
                    # log_interval == 0 cannot raise ZeroDivisionError.
                    if args.log_interval and i % args.log_interval == 0:
                        logger.info('[Epoch {} Iteration {}] Set learning rate to {}'
                                    .format(epoch, i, new_lr))
                    trainer.set_learning_rate(new_lr)
            metric_losses = [[] for _ in metrics]
            add_losses = [[] for _ in metrics2]
            if executor is not None:
                for data in zip(*batch):
                    executor.put(data)
            for _ in range(len(ctx)):
                if executor is not None:
                    result = executor.get()
                else:
                    result = rcnn_task.forward_backward(list(zip(*batch))[0])
                if (not args.horovod) or hvd.rank() == 0:
                    # ``result`` holds the loss entries first, followed by the
                    # entries consumed by ``metrics2``.
                    for k in range(len(metric_losses)):
                        metric_losses[k].append(result[k])
                    for k in range(len(add_losses)):
                        add_losses[k].append(result[len(metric_losses) + k])
            try:
                # prefetch next batch
                next_data_batch = next(train_data_iter)
                next_data_batch = split_and_load(next_data_batch, ctx_list=ctx)
            except StopIteration:
                # Expected after the final batch of the epoch.
                pass

            for metric, record in zip(metrics, metric_losses):
                metric.update(0, record)
            for metric, records in zip(metrics2, add_losses):
                for pred in records:
                    metric.update(pred[0], pred[1])
            trainer.step(batch_size)
            if (not args.horovod or hvd.rank() == 0) and args.log_interval \
                    and not (i + 1) % args.log_interval:
                msg = ','.join(['{}={:.3f}'.format(*metric.get()) for metric in
                                metrics + metrics2])
                logger.info('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.format(
                    epoch, i, args.log_interval * args.batch_size / (time.time() - btic), msg))
                btic = time.time()
        # validate and save params
        if (not args.horovod) or hvd.rank() == 0:
            msg = ','.join(['{}={:.3f}'.format(*metric.get()) for metric in metrics])
            logger.info('[Epoch {}] Training cost: {:.3f}, {}'.format(
                epoch, (time.time() - tic), msg))
        if not (epoch + 1) % args.val_interval:
            # consider reduce the frequency of validation to save time
            validate(net, val_data, async_eval_processes, ctx, eval_metric, logger, epoch,
                     best_map, args)
        elif (not args.horovod) or hvd.rank() == 0:
            current_map = 0.
            save_params(net, logger, best_map, current_map, epoch, args.save_interval,
                        args.save_prefix)
    # Wait for everything ``validate`` registered in ``async_eval_processes``.
    for thread in async_eval_processes:
        thread.join()
def _fit(self, train_data, val_data, time_limit=math.inf):
    """Fit Mask R-CNN models.

    Trains ``self.net`` on ``self._train_data`` from
    ``self._cfg.train.start_epoch`` up to ``self._cfg.train.epochs``. Every
    ``self._cfg.valid.val_interval`` epochs it calls ``self._validate``;
    otherwise it saves parameters from the primary process. When
    ``self._reporter`` is set it is invoked once per epoch.

    ``train_data``, ``val_data`` and ``time_limit`` are accepted for interface
    compatibility but are not read by this method body.

    Warm-up is skipped when ``lr_warmup`` is not positive. The warm-up log
    line is skipped when ``log_interval`` is 0.
    """
    # TODO(zhreshold): remove 'dataset' in config, use train_data/val_data instead
    self._cfg.kv_store = 'device' \
        if (self._cfg.mask_rcnn.amp and 'nccl' in self._cfg.kv_store) else self._cfg.kv_store
    kv = mx.kvstore.create(self._cfg.kv_store)
    # Disable gradients everywhere, then re-enable them for the trainable set.
    self.net.collect_params().setattr('grad_req', 'null')
    self.net.collect_train_params().setattr('grad_req', 'write')
    for _, v in self.net.collect_params('.*bias').items():
        v.wd_mult = 0.0
    optimizer_params = {
        'learning_rate': self._cfg.train.lr,
        'wd': self._cfg.train.wd,
        'momentum': self._cfg.train.momentum,
    }
    if self._cfg.train.clip_gradient > 0.0:
        optimizer_params['clip_gradient'] = self._cfg.train.clip_gradient
    if self._cfg.mask_rcnn.amp:
        optimizer_params['multi_precision'] = True
    if self._cfg.horovod:
        hvd.broadcast_parameters(self.net.collect_params(), root_rank=0)
        trainer = hvd.DistributedTrainer(
            self.net.collect_train_params(),  # fix batchnorm, fix first stage, etc...
            'sgd',
            optimizer_params)
    else:
        trainer = gluon.Trainer(
            self.net.collect_train_params(),  # fix batchnorm, fix first stage, etc...
            'sgd',
            optimizer_params,
            update_on_kvstore=(False if self._cfg.mask_rcnn.amp else None),
            kvstore=kv)

    if self._cfg.mask_rcnn.amp:
        amp.init_trainer(trainer)

    # lr decay policy
    lr_decay = float(self._cfg.train.lr_decay)
    lr_steps = sorted([float(ls) for ls in self._cfg.train.lr_decay_epoch])
    lr_warmup = float(self._cfg.train.lr_warmup)  # avoid int division

    if self._cfg.train.verbose:
        self._logger.info('Trainable parameters:')
        self._logger.info(self.net.collect_train_params().keys())
    self._logger.info('Start training from [Epoch %d]', self._cfg.train.start_epoch)
    base_lr = trainer.learning_rate
    for epoch in range(self._cfg.train.start_epoch, self._cfg.train.epochs):
        self.epoch = epoch
        rcnn_task = ForwardBackwardTask(
            self.net, trainer, self.rpn_cls_loss, self.rpn_box_loss,
            self.rcnn_cls_loss, self.rcnn_box_loss, self.rcnn_mask_loss,
            amp_enabled=self._cfg.mask_rcnn.amp)
        # Horovod runs the task inline; otherwise work is handed to Parallel.
        executor = Parallel(self._cfg.train.executor_threads, rcnn_task) \
            if not self._cfg.horovod else None
        if not self._cfg.disable_hybridization:
            self.net.hybridize(static_alloc=self._cfg.mask_rcnn.static_alloc)
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            self._logger.info("[Epoch %d] Set learning rate to %f", epoch, new_lr)
        for metric in self.metrics:
            metric.reset()
        tic = time.time()
        btic = time.time()
        # Load the first batch up front; later batches are prefetched below.
        train_data_iter = iter(self._train_data)
        next_data_batch = next(train_data_iter)
        next_data_batch = _split_and_load(next_data_batch, ctx_list=self.ctx)
        for i in range(len(self._train_data)):
            batch = next_data_batch
            global_iter = i + epoch * len(self._train_data)
            # ``lr_warmup > 0`` avoids 0 / 0.0 at the first iteration when
            # warm-up is configured as 0; non-positive values disable warm-up.
            if lr_warmup > 0 and global_iter <= lr_warmup:
                # adjust based on real percentage
                new_lr = base_lr * _get_lr_at_iter(
                    global_iter / lr_warmup, self._cfg.train.lr_warmup_factor)
                if new_lr != trainer.learning_rate:
                    # Guard the modulo like the batch logging below does, so
                    # log_interval == 0 cannot raise ZeroDivisionError.
                    if self._cfg.train.log_interval \
                            and i % self._cfg.train.log_interval == 0:
                        self._logger.info(
                            '[Epoch %d Iteration %d] Set learning rate to %f',
                            epoch, i, new_lr)
                    trainer.set_learning_rate(new_lr)
            metric_losses = [[] for _ in self.metrics]
            add_losses = [[] for _ in self.metrics2]
            if executor is not None:
                for data in zip(*batch):
                    executor.put(data)
            for _ in range(len(self.ctx)):
                if executor is not None:
                    result = executor.get()
                else:
                    result = rcnn_task.forward_backward(list(zip(*batch))[0])
                if (not self._cfg.horovod) or hvd.rank() == 0:
                    # ``result`` holds the loss entries first, followed by the
                    # entries consumed by ``self.metrics2``.
                    for k in range(len(metric_losses)):
                        metric_losses[k].append(result[k])
                    for k in range(len(add_losses)):
                        add_losses[k].append(result[len(metric_losses) + k])
            try:
                # prefetch next batch
                next_data_batch = next(train_data_iter)
                next_data_batch = _split_and_load(next_data_batch, ctx_list=self.ctx)
            except StopIteration:
                # Expected after the final batch of the epoch.
                pass

            for metric, record in zip(self.metrics, metric_losses):
                metric.update(0, record)
            for metric, records in zip(self.metrics2, add_losses):
                for pred in records:
                    metric.update(pred[0], pred[1])
            trainer.step(self.batch_size)
            if (not self._cfg.horovod or hvd.rank() == 0) and self._cfg.train.log_interval \
                    and not (i + 1) % self._cfg.train.log_interval:
                msg = ','.join([
                    '{}={:.3f}'.format(*metric.get())
                    for metric in self.metrics + self.metrics2
                ])
                self._logger.info(
                    '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.
                    format(
                        epoch, i, self._cfg.train.log_interval *
                        self._cfg.train.batch_size / (time.time() - btic),
                        msg))
                btic = time.time()
        # validate and save params
        if (not self._cfg.horovod) or hvd.rank() == 0:
            msg = ','.join([
                '{}={:.3f}'.format(*metric.get()) for metric in self.metrics
            ])
            self._logger.info(
                '[Epoch {}] Training cost: {:.3f}, {}'.format(
                    epoch, (time.time() - tic), msg))
        # Bind ``current_map`` on every path. It used to be assigned only in
        # the save branch, so the reporter call raised UnboundLocalError on a
        # validation epoch (or on a non-primary Horovod rank) reached first.
        # ``_validate``'s result is not captured here, so 0. is the only value
        # this method ever reports.
        current_map = 0.
        if not (epoch + 1) % self._cfg.valid.val_interval:
            # consider reduce the frequency of validation to save time
            self._validate(self._val_data, self.async_eval_processes,
                           self.ctx, self.eval_metric, self._logger, epoch,
                           self.best_map)
        elif (not self._cfg.horovod) or hvd.rank() == 0:
            _save_params(self.net, self._logger, self.best_map, current_map,
                         epoch, self._cfg.save_interval,
                         os.path.join(self._logdir, self._cfg.save_prefix))
        if self._reporter:
            self._reporter(epoch=epoch, map_reward=current_map)
    # Wait for everything ``_validate`` registered in async_eval_processes.
    for thread in self.async_eval_processes:
        thread.join()
def train_text_classification(args, reporter=None):
    """Fine-tune a pretrained BERT/RoBERTa encoder for text classification.

    The function is self-contained: the model, the data pipeline and the
    logging/evaluation helpers are all created inside it so that it can be
    used as a single decorated training function.

    Parameters
    ----------
    args
        Configuration object. Attributes read here: ``verbose``,
        ``batch_size``, ``dev_batch_size``, ``lr``, ``epsilon``,
        ``accumulate``, ``log_interval``, ``seed``, ``num_gpus``, ``dataset``
        (the task object; must expose ``class_labels`` and ``metric``),
        ``dtype``, ``net``, ``pretrained_dataset``, ``max_len``,
        ``num_workers``, ``warmup_ratio``, ``epochs``, ``early_stop`` and
        ``final_fit``.
    reporter : callable, optional
        When given, called after every epoch as
        ``reporter(epoch=<1-based epoch>, accuracy=<first dev metric value>)``.

    Returns
    -------
    dict or None
        When ``args.final_fit`` is truthy, a dict with the keys
        ``'model_params'``, ``'get_model_args'``, ``'class_labels'``,
        ``'transform'`` and ``'test_transform'``; otherwise ``None``.

    Raises
    ------
    SystemExit
        If ``args.dtype == 'float16'`` but ``mxnet.contrib.amp`` cannot be
        imported.
    """
    # Step 1: every function and Python object of the original training
    # script, except the training loop itself, is set up at the beginning of
    # the decorated function.
    nlp = try_import_gluonnlp()
    logger = logging.getLogger(__name__)
    if args.verbose:
        logger.setLevel(logging.INFO)
        logger.info(args)
    batch_size = args.batch_size
    dev_batch_size = args.dev_batch_size
    lr = args.lr
    epsilon = args.epsilon
    accumulate = args.accumulate
    if accumulate:
        logger.info(
            'Using gradient accumulation. Effective batch size = '
            'batch_size * accumulate = %d', accumulate * batch_size)

    # random seed
    np.random.seed(args.seed)
    random.seed(args.seed)
    mx.random.seed(args.seed)

    # TODO support for multi-GPU: only the first GPU (or the CPU) is used.
    ctx = mx.gpu(0) if args.num_gpus > 0 else mx.cpu()

    task = args.dataset

    # data type with mixed precision training
    if args.dtype == 'float16':
        try:
            from mxnet.contrib import amp  # pylint: disable=ungrouped-imports
            # monkey patch amp list since topk does not support fp16
            amp.lists.symbol.FP32_FUNCS.append('topk')
            amp.lists.symbol.FP16_FP32_FUNCS.remove('topk')
            amp.init()
        except ValueError:
            # topk is already in the FP32_FUNCS list
            amp.init()
        except ImportError:
            # amp is not available. Log at ERROR level so the reason is
            # visible even when ``args.verbose`` is off, and exit with a
            # non-zero status instead of relying on the ``site``-provided
            # ``exit()`` helper.
            logger.error(
                'Mixed precision training with float16 requires MXNet >= '
                '1.5.0b20190627. Please consider upgrading your MXNet version.'
            )
            raise SystemExit(1)

    # model and loss
    model_name = args.net
    dataset = args.pretrained_dataset

    use_roberta = 'roberta' in model_name
    get_model_params = {
        'name': model_name,
        'dataset_name': dataset,
        'pretrained': True,
        'ctx': ctx,
        'use_decoder': False,
        'use_classifier': False,
    }
    # RoBERTa does not contain parameters for sentence pair classification
    if not use_roberta:
        get_model_params['use_pooler'] = True

    bert, vocabulary = nlp.model.get_model(**get_model_params)
    model = get_network(bert, task.class_labels, use_roberta)

    # A task without class labels is treated as regression (L2 loss).
    loss_function = (gluon.loss.SoftmaxCELoss()
                     if task.class_labels else gluon.loss.L2Loss())

    # initialize classifier
    initializer = mx.init.Normal(0.02)
    model.classifier.initialize(init=initializer, ctx=ctx)

    model.hybridize(static_alloc=True)
    loss_function.hybridize(static_alloc=True)

    # data processing
    do_lower_case = 'uncased' in dataset
    if use_roberta:
        bert_tokenizer = nlp.data.GPT2BPETokenizer()
    else:
        bert_tokenizer = nlp.data.BERTTokenizer(vocabulary,
                                                lower=do_lower_case)

    # Get the loader.
    train_data, dev_data_list, num_train_examples, trans, test_trans = \
        preprocess_data(bert_tokenizer, task, batch_size, dev_batch_size,
                        args.max_len, vocabulary, True, args.num_workers)

    def log_train(batch_id, batch_num, metric, step_loss, log_interval,
                  epoch_id, learning_rate, tbar):
        """Show the running training loss, lr and metrics on the progress bar."""
        metric_nm, metric_val = metric.get()
        if not isinstance(metric_nm, list):
            metric_nm, metric_val = [metric_nm], [metric_val]

        train_str = '[Epoch %d] loss=%.4f, lr=%.7f, metrics:' + \
            ','.join([i + ':%.4f' for i in metric_nm])
        tbar.set_description(
            train_str %
            (epoch_id, step_loss / log_interval, learning_rate, *metric_val))

    def log_eval(batch_id, batch_num, metric, step_loss, log_interval, tbar):
        """Show the running evaluation loss and metrics on the progress bar."""
        metric_nm, metric_val = metric.get()
        if not isinstance(metric_nm, list):
            metric_nm, metric_val = [metric_nm], [metric_val]

        eval_str = 'loss=%.4f, metrics:' + \
            ','.join([i + ':%.4f' for i in metric_nm])
        tbar.set_description(eval_str %
                             (step_loss / log_interval, *metric_val))

    def evaluate(loader_dev, metric, segment):
        """Evaluate the model on validation dataset."""
        metric.reset()
        step_loss = 0
        tbar = tqdm(loader_dev)
        for batch_id, seqs in enumerate(tbar):
            input_ids, valid_length, segment_ids, label = seqs
            input_ids = input_ids.as_in_context(ctx)
            valid_length = valid_length.as_in_context(ctx).astype('float32')
            label = label.as_in_context(ctx)
            if use_roberta:
                out = model(input_ids, valid_length)
            else:
                out = model(input_ids, segment_ids.as_in_context(ctx),
                            valid_length)
            ls = loss_function(out, label).mean()

            step_loss += ls.asscalar()
            metric.update([label], [out])

            if (batch_id + 1) % (args.log_interval) == 0:
                log_eval(batch_id, len(loader_dev), metric, step_loss,
                         args.log_interval, tbar)
                step_loss = 0

        metric_nm, metric_val = metric.get()
        if not isinstance(metric_nm, list):
            metric_nm, metric_val = [metric_nm], [metric_val]
        metric_str = 'validation metrics:' + ','.join(
            [i + ':%.4f' for i in metric_nm])
        logger.info(metric_str, *metric_val)

        mx.nd.waitall()
        return metric_nm, metric_val

    # Step 2: the training loop of the original training script.
    all_model_params = model.collect_params()
    optimizer_params = {'learning_rate': lr, 'epsilon': epsilon, 'wd': 0.01}
    trainer = gluon.Trainer(all_model_params,
                            'bertadam',
                            optimizer_params,
                            update_on_kvstore=False)
    if args.dtype == 'float16':
        amp.init_trainer(trainer)

    step_size = batch_size * accumulate if accumulate else batch_size
    num_train_steps = int(num_train_examples / step_size * args.epochs)
    warmup_ratio = args.warmup_ratio
    num_warmup_steps = int(num_train_steps * warmup_ratio)
    # ``num_train_steps`` is only an estimate of the number of optimizer
    # updates; guard the decay denominator so that a schedule consisting of
    # warmup only cannot divide by zero.
    num_decay_steps = max(num_train_steps - num_warmup_steps, 1)
    step_num = 0

    # Do not apply weight decay on LayerNorm and bias terms
    for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
        v.wd_mult = 0.0
    # Collect differentiable parameters
    params = [p for p in all_model_params.values() if p.grad_req != 'null']

    # Set grad_req if gradient accumulation is required
    if accumulate and accumulate > 1:
        for p in params:
            p.grad_req = 'add'
    # track best eval score
    metric_history = []
    best_metric = None
    patience = args.early_stop

    for epoch_id in range(args.epochs):
        if args.early_stop and patience == 0:
            logger.info('Early stopping at epoch %d', epoch_id)
            break
        task.metric.reset()
        step_loss = 0
        all_model_params.zero_grad()

        tbar = tqdm(train_data)
        for batch_id, seqs in enumerate(tbar):
            # learning rate schedule: linear warmup, then linear decay to 0
            if step_num < num_warmup_steps:
                new_lr = lr * step_num / num_warmup_steps
            else:
                non_warmup_steps = step_num - num_warmup_steps
                offset = non_warmup_steps / num_decay_steps
                # Clamp at zero: if more updates are performed than were
                # estimated, the linear decay would otherwise turn negative.
                new_lr = max(lr - offset * lr, 0.0)
            trainer.set_learning_rate(new_lr)

            # forward and backward
            with mx.autograd.record():
                input_ids, valid_length, segment_ids, label = seqs
                input_ids = input_ids.as_in_context(ctx)
                valid_length = valid_length.as_in_context(ctx).astype(
                    'float32')
                label = label.as_in_context(ctx)
                if use_roberta:
                    out = model(input_ids, valid_length)
                else:
                    out = model(input_ids, segment_ids.as_in_context(ctx),
                                valid_length)
                ls = loss_function(out, label).mean()
                if args.dtype == 'float16':
                    with amp.scale_loss(ls, trainer) as scaled_loss:
                        mx.autograd.backward(scaled_loss)
                else:
                    ls.backward()

            # update
            if not accumulate or (batch_id + 1) % accumulate == 0:
                trainer.allreduce_grads()
                nlp.utils.clip_grad_global_norm(params, 1)
                trainer.update(accumulate if accumulate else 1)
                step_num += 1
                if accumulate and accumulate > 1:
                    # set grad to zero for gradient accumulation
                    all_model_params.zero_grad()

            step_loss += ls.asscalar()
            task.metric.update([label], [out])
            if (batch_id + 1) % (args.log_interval) == 0:
                log_train(batch_id, len(train_data), task.metric, step_loss,
                          args.log_interval, epoch_id, trainer.learning_rate,
                          tbar)
                step_loss = 0
        mx.nd.waitall()

        # inference on dev data
        for segment, dev_data in dev_data_list:
            metric_nm, metric_val = evaluate(dev_data, task.metric, segment)
            if best_metric is None or metric_val >= best_metric:
                best_metric = metric_val
                patience = args.early_stop
            else:
                if args.early_stop is not None:
                    patience -= 1
            metric_history.append((epoch_id, metric_nm, metric_val))

        if reporter is not None:
            # Note: epoch reported back must start with 1, not with 0
            reporter(epoch=epoch_id + 1, accuracy=metric_val[0])

    if args.final_fit:
        # The context is dropped from the model arguments that are returned.
        get_model_params.pop('ctx')
        return {
            'model_params': collect_params(model),
            'get_model_args': get_model_params,
            'class_labels': task.class_labels,
            'transform': trans,
            'test_transform': test_trans
        }
def train(net, train_data, val_data, eval_metric, batch_size, ctx, args):
    """Training pipeline.

    Trains ``net`` with SGD under a warmup + ``args.lr_mode`` learning-rate
    schedule, logs the four loss terms returned by the network, optionally
    validates, and calls ``save_params`` after every epoch (on rank 0 only
    when Horovod is used).

    Parameters
    ----------
    net
        Network; in training mode it is called as
        ``net(x, gt_boxes, *fixed_targets)`` and must return
        ``(obj_loss, center_loss, scale_loss, cls_loss)``.
    train_data
        Iterable of batches; element 0 is the image data, elements 1-5 are
        the fixed targets and element 6 the ground-truth boxes.
    val_data, eval_metric
        Passed through to ``validate``.
    batch_size : int
        Value passed to ``trainer.step``.
    ctx : list
        Contexts the batch is split across.
    args
        Configuration object. Attributes read here: ``no_wd``,
        ``label_smooth``, ``lr_decay_period``, ``lr_decay_epoch``,
        ``lr_decay``, ``lr_mode``, ``lr``, ``warmup_epochs``, ``epochs``,
        ``start_epoch``, ``num_samples``, ``batch_size``, ``horovod``, ``wd``,
        ``momentum``, ``amp``, ``save_prefix``, ``save_interval``, ``mixup``,
        ``no_mixup_epochs``, ``log_interval`` and ``val_interval``.
    """
    net.collect_params().reset_ctx(ctx)
    if args.no_wd:
        # No weight decay on normalisation scale/shift and bias terms.
        for v in net.collect_params('.*beta|.*gamma|.*bias').values():
            v.wd_mult = 0.0

    if args.label_smooth:
        net._target_generator._label_smooth = True

    if args.lr_decay_period > 0:
        lr_decay_epoch = list(
            range(args.lr_decay_period, args.epochs, args.lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]
    # The decay scheduler starts after warmup, so shift its epochs.
    lr_decay_epoch = [e - args.warmup_epochs for e in lr_decay_epoch]
    num_batches = args.num_samples // args.batch_size
    lr_scheduler = LRSequential([
        LRScheduler('linear',
                    base_lr=0,
                    target_lr=args.lr,
                    nepochs=args.warmup_epochs,
                    iters_per_epoch=num_batches),
        LRScheduler(args.lr_mode,
                    base_lr=args.lr,
                    nepochs=args.epochs - args.warmup_epochs,
                    iters_per_epoch=num_batches,
                    step_epoch=lr_decay_epoch,
                    step_factor=args.lr_decay,
                    power=2),
    ])

    if args.horovod:
        hvd.broadcast_parameters(net.collect_params(), root_rank=0)
        trainer = hvd.DistributedTrainer(net.collect_params(), 'sgd', {
            'wd': args.wd,
            'momentum': args.momentum,
            'lr_scheduler': lr_scheduler
        })
    else:
        trainer = gluon.Trainer(
            net.collect_params(),
            'sgd', {
                'wd': args.wd,
                'momentum': args.momentum,
                'lr_scheduler': lr_scheduler
            },
            kvstore='local',
            update_on_kvstore=(False if args.amp else None))

    if args.amp:
        amp.init_trainer(trainer)

    # metrics
    obj_metrics = mx.metric.Loss('ObjLoss')
    center_metrics = mx.metric.Loss('BoxCenterLoss')
    scale_metrics = mx.metric.Loss('BoxScaleLoss')
    cls_metrics = mx.metric.Loss('ClassLoss')

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        if args.mixup:
            # TODO(zhreshold): more elegant way to control mixup during runtime
            try:
                train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5)
            except AttributeError:
                train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5)
            if epoch >= args.epochs - args.no_mixup_epochs:
                # Mixup is switched off for the final epochs.
                try:
                    train_data._dataset.set_mixup(None)
                except AttributeError:
                    train_data._dataset._data.set_mixup(None)

        # Start every epoch from fresh loss statistics; without the reset the
        # logged losses are averages over all epochs trained so far.
        obj_metrics.reset()
        center_metrics.reset()
        scale_metrics.reset()
        cls_metrics.reset()

        tic = time.time()
        btic = time.time()
        mx.nd.waitall()
        net.hybridize()
        for i, batch in enumerate(train_data):
            data = gluon.utils.split_and_load(batch[0],
                                              ctx_list=ctx,
                                              batch_axis=0)
            # objectness, center_targets, scale_targets, weights, class_targets
            fixed_targets = [
                gluon.utils.split_and_load(batch[it],
                                           ctx_list=ctx,
                                           batch_axis=0)
                for it in range(1, 6)
            ]
            gt_boxes = gluon.utils.split_and_load(batch[6],
                                                  ctx_list=ctx,
                                                  batch_axis=0)
            sum_losses = []
            obj_losses = []
            center_losses = []
            scale_losses = []
            cls_losses = []
            with autograd.record():
                for ix, x in enumerate(data):
                    obj_loss, center_loss, scale_loss, cls_loss = net(
                        x, gt_boxes[ix], *[ft[ix] for ft in fixed_targets])
                    sum_losses.append(obj_loss + center_loss + scale_loss +
                                      cls_loss)
                    obj_losses.append(obj_loss)
                    center_losses.append(center_loss)
                    scale_losses.append(scale_loss)
                    cls_losses.append(cls_loss)
                if args.amp:
                    with amp.scale_loss(sum_losses, trainer) as scaled_loss:
                        autograd.backward(scaled_loss)
                else:
                    autograd.backward(sum_losses)
            trainer.step(batch_size)
            if (not args.horovod or hvd.rank() == 0):
                obj_metrics.update(0, obj_losses)
                center_metrics.update(0, center_losses)
                scale_metrics.update(0, scale_losses)
                cls_metrics.update(0, cls_losses)
                if args.log_interval and not (i + 1) % args.log_interval:
                    name1, loss1 = obj_metrics.get()
                    name2, loss2 = center_metrics.get()
                    name3, loss3 = scale_metrics.get()
                    name4, loss4 = cls_metrics.get()
                    logger.info(
                        '[Epoch {}][Batch {}], LR: {:.2E}, Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'
                        .format(epoch, i, trainer.learning_rate,
                                args.batch_size / (time.time() - btic), name1,
                                loss1, name2, loss2, name3, loss3, name4,
                                loss4))
                # Restarted after every batch so the speed above is measured
                # over exactly one batch of ``args.batch_size`` samples.
                btic = time.time()

        if (not args.horovod or hvd.rank() == 0):
            name1, loss1 = obj_metrics.get()
            name2, loss2 = center_metrics.get()
            name3, loss3 = scale_metrics.get()
            name4, loss4 = cls_metrics.get()
            logger.info(
                '[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'
                .format(epoch, (time.time() - tic), name1, loss1, name2,
                        loss2, name3, loss3, name4, loss4))
            if not (epoch + 1) % args.val_interval:
                # consider reduce the frequency of validation to save time
                map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
                val_msg = '\n'.join(
                    ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
                logger.info('[Epoch {}] Validation: \n{}'.format(
                    epoch, val_msg))
                # The last entry of ``mean_ap`` is used as the score that
                # ``save_params`` compares against ``best_map``.
                current_map = float(mean_ap[-1])
            else:
                current_map = 0.
            save_params(net, best_map, current_map, epoch, args.save_interval,
                        args.save_prefix)
def train(net, train_data, val_data, eval_metric, ctx, config):
    """Train ``net`` with SGD and run validation after every epoch.

    Parameters
    ----------
    net
        Network; ``net(data)`` must return
        ``(hm_pred, offset_pred, wh_pred)``.
    train_data
        Iterable yielding ``(batch, img_ids, _)``. Each element ``d`` of
        ``batch`` provides ``d.data[0]`` (input) and ``d.label[0..2]``
        (heatmap, offset and width/height targets).
    val_data, eval_metric
        Passed through to ``validate``.
    ctx
        Context(s) the parameters are moved to.
    config : dict
        Keys read here: ``'batch_size'``, ``'lr'``, ``'wd'``, ``'momentum'``,
        ``'amp'`` and ``'epoch'``.
    """
    batch_size = config['batch_size']

    net.collect_params().reset_ctx(ctx)
    trainer = gluon.Trainer(
        params=net.collect_params(),
        optimizer='sgd',
        optimizer_params={'learning_rate': config['lr'],
                          'wd': config['wd'],
                          'momentum': config['momentum']},
        update_on_kvstore=(False if config['amp'] else None)
    )

    if config['amp']:
        amp.init_trainer(trainer)

    hm_creteria = FocalLoss(sparse_label=False)
    offset_creteria = gluon.loss.L1Loss()
    wh_creteria = gluon.loss.L1Loss()

    hm_metric = mx.metric.Loss('FocalLoss')
    offset_metric = mx.metric.Loss('Offset_L1')
    wh_metric = mx.metric.Loss('WH_L1')

    logging.info('Start training from scratch...')

    for epoch in range(config['epoch']):
        hm_metric.reset()
        offset_metric.reset()
        wh_metric.reset()
        btic = time.time()
        net.hybridize(static_alloc=True, static_shape=True)

        for i, (batch, _, _) in enumerate(train_data):
            batch_data = [d.data[0] for d in batch]
            batch_hm = [d.label[0] for d in batch]
            batch_offset = [d.label[1] for d in batch]
            batch_wh = [d.label[2] for d in batch]

            with autograd.record():
                hm_losses, offset_losses, wh_losses, sum_losses = [], [], [], []
                for data, hm, offset, wh in zip(batch_data, batch_hm,
                                                batch_offset, batch_wh):
                    outputs = net(data)
                    hm_pred, offset_pred, wh_pred = outputs
                    hm_loss = hm_creteria(hm_pred, hm)
                    offset_loss = offset_creteria(offset_pred, offset)
                    wh_loss = wh_creteria(wh_pred, wh)
                    # The width/height term is down-weighted by 0.1.
                    sum_loss = hm_loss + offset_loss + 0.1*wh_loss
                    hm_losses.append(hm_loss)
                    offset_losses.append(offset_loss)
                    wh_losses.append(wh_loss)
                    sum_losses.append(sum_loss)

                if config['amp']:
                    # A trainer prepared with ``amp.init_trainer`` expects
                    # the backward pass to run on the scaled loss so that
                    # dynamic loss scaling is applied.
                    with amp.scale_loss(sum_losses, trainer) as scaled_losses:
                        autograd.backward(scaled_losses)
                else:
                    autograd.backward(sum_losses)

            # since we have already normalized the loss, we don't want to
            # normalize by batch-size anymore
            # NOTE(review): ``trainer.step(batch_size)`` still rescales the
            # gradients by 1/batch_size, which contradicts the comment above
            # -- confirm which of the two is intended.
            trainer.step(batch_size)

            hm_metric.update(0, [l * batch_size for l in hm_losses])
            offset_metric.update(0, [l * batch_size for l in offset_losses])
            wh_metric.update(0, [l * batch_size for l in wh_losses])

            if i > 0 and i % 50 == 0:
                name1, loss1 = hm_metric.get()
                name2, loss2 = offset_metric.get()
                name3, loss3 = wh_metric.get()
                print('Epoch {} Batch {} Speed: {:.3f} samples/s, {}={:.3f}, {}={:.3f}, {}={:.3f}'.\
                      format(epoch, i, batch_size/(time.time()-btic), name1, loss1, name2, loss2, name3, loss3))
            # Restarted after every batch so the speed above covers one batch.
            btic = time.time()

        print ('starting validation')
        # The validation result is currently not used any further here.
        map_name, mean_ap = validate(net, val_data, ctx, eval_metric, config)
def train(self):
    """Run the full training loop: SGD updates, step LR decay, validation.

    Reads ``self.net``, ``self.ctx``, ``self.lr``, ``self.wd``,
    ``self.momentum``, ``self.use_amp``, ``self.lr_decay``,
    ``self.lr_decay_epoch`` (comma-separated epochs), ``self.epoch``,
    ``self.train_data`` and ``self.batch_size``. After every epoch it calls
    ``self.validation()`` and ``self.save_params(epoch)``.
    """
    self.net.collect_params().reset_ctx(self.ctx)

    sgd_settings = {
        'learning_rate': self.lr,
        'wd': self.wd,
        'momentum': self.momentum
    }
    trainer = gluon.Trainer(
        params=self.net.collect_params(),
        optimizer='sgd',
        optimizer_params=sgd_settings,
        update_on_kvstore=(False if self.use_amp else None))
    if self.use_amp:
        amp.init_trainer(trainer)

    decay_factor = self.lr_decay
    # Epoch thresholds in ascending order; blank entries are ignored.
    decay_epochs = sorted(
        float(token) for token in self.lr_decay_epoch.split(',')
        if token.strip())

    cls_criterion = FocalLoss(num_class=80)
    box_criterion = HuberLoss(rho=0.11)
    cls_metric = mx.metric.Loss('FocalLoss')
    box_metric = mx.metric.Loss('SmoothL1')

    logging.info('Start training from scratch...')

    for epoch in range(self.epoch):
        # Apply one decay for every threshold this epoch has gone past.
        # NOTE(review): the comparison is strict, so the decay takes effect
        # one epoch after the listed value -- confirm this is intended.
        while decay_epochs and decay_epochs[0] < epoch:
            decayed_lr = trainer.learning_rate * decay_factor
            del decay_epochs[0]
            trainer.set_learning_rate(decayed_lr)
            logging.info("Epoch {} Set learning rate to {}".format(
                epoch, decayed_lr))

        cls_metric.reset()
        box_metric.reset()
        tic = time.time()  # epoch start time (not used further)
        btic = time.time()
        # reset cause save params may change
        self.net.collect_params().reset_ctx(self.ctx)
        self.net.hybridize(static_alloc=True, static_shape=True)

        for step, (data, box_targets, cls_targets) in enumerate(
                self.train_data):
            with autograd.record():
                cls_outputs, box_outputs = [], []
                for shard in data:
                    cls_out, box_out, _ = self.net(shard)
                    cls_outputs.append(cls_out)
                    box_outputs.append(box_out)

                cls_loss = []
                for cls_out, cls_target in zip(cls_outputs, cls_targets):
                    cls_loss.append(cls_criterion(cls_out, cls_target))

                box_loss = []
                for box_out, box_target in zip(box_outputs, box_targets):
                    box_loss.append(box_criterion(box_out, box_target))

                total_loss = [c + b for c, b in zip(cls_loss, box_loss)]

                if not self.use_amp:
                    autograd.backward(total_loss)
                else:
                    with amp.scale_loss(total_loss, trainer) as scaled_loss:
                        autograd.backward(scaled_loss)

            # since we have already normalized the loss, we don't want to
            # normalize by batch-size anymore
            trainer.step(1)

            cls_metric.update(
                0, [loss * self.batch_size for loss in cls_loss])
            box_metric.update(
                0, [loss * self.batch_size for loss in box_loss])

            if step > 0 and step % 50 == 0:
                cls_name, cls_value = cls_metric.get()
                box_name, box_value = box_metric.get()
                speed = self.batch_size / (time.time() - btic)
                logging.info('Epoch {} Batch {} Speed: {:.3f} samples/s, {}={:.5f}, {}={:.5f}'.\
                             format(epoch, step, speed, cls_name, cls_value, box_name, box_value))
            # Restart the per-batch timer.
            btic = time.time()

        logging.info('[Epoch {}] Starting Validation.'.format(epoch))
        map_name, mean_ap = self.validation()
        report_lines = []
        for key, value in zip(map_name, mean_ap):
            report_lines.append('{}={}'.format(key, value))
        val_msg = '\n'.join(report_lines)
        logging.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
        self.save_params(epoch)
def run(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], graphviz=True, epoch=100, input_size=[512, 512], batch_size=16, batch_log=100, batch_interval=10, subdivision=4, train_dataset_path="Dataset/train", valid_dataset_path="Dataset/valid", multiscale=True, factor_scale=[8, 5], data_augmentation=True, num_workers=4, optimizer="ADAM", weight_decay=0.000001, lambda_off=1, lambda_size=0.1, save_period=5, load_period=10, learning_rate=0.001, decay_lr=0.999, decay_step=10, GPU_COUNT=0, base=18, pretrained_base=True, pretrained_path="modelparam", AMP=True, valid_size=8, eval_period=5, tensorboard=True, valid_graph_path="valid_Graph", valid_html_auto_open=True, using_mlflow=True, topk=100, iou_thresh=0.5, nms=False, except_class_thresh=0.01, nms_thresh=0.5, plot_class_thresh=0.5): ''' AMP 가 모든 연산을 지원하지는 않는다. modulated convolution을 지원하지 않음 ''' if GPU_COUNT == 0: ctx = mx.cpu(0) AMP = False elif GPU_COUNT == 1: ctx = mx.gpu(0) else: ctx = [mx.gpu(i) for i in range(GPU_COUNT)] # 운영체제 확인 if platform.system() == "Linux": logging.info(f"{platform.system()} OS") elif platform.system() == "Windows": logging.info(f"{platform.system()} OS") else: logging.info(f"{platform.system()} OS") if isinstance(ctx, (list, tuple)): for i, c in enumerate(ctx): free_memory, total_memory = mx.context.gpu_memory_info(i) free_memory = round(free_memory / (1024 * 1024 * 1024), 2) total_memory = round(total_memory / (1024 * 1024 * 1024), 2) logging.info( f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB' ) else: if GPU_COUNT == 1: free_memory, total_memory = mx.context.gpu_memory_info(0) free_memory = round(free_memory / (1024 * 1024 * 1024), 2) total_memory = round(total_memory / (1024 * 1024 * 1024), 2) logging.info( f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB' ) else: logging.info(f'Running on {ctx}') if GPU_COUNT > 0 and batch_size < GPU_COUNT: logging.info("batch size must be greater than gpu number") exit(0) if AMP: 
amp.init() if multiscale: logging.info("Using MultiScale") if data_augmentation: logging.info("Using Data Augmentation") logging.info("training Center Detector") input_shape = (1, 3) + tuple(input_size) scale_factor = 4 # 고정 logging.info(f"scale factor {scale_factor}") train_dataloader, train_dataset = traindataloader( multiscale=multiscale, factor_scale=factor_scale, augmentation=data_augmentation, path=train_dataset_path, input_size=input_size, batch_size=batch_size, batch_interval=batch_interval, num_workers=num_workers, shuffle=True, mean=mean, std=std, scale_factor=scale_factor, make_target=True) train_update_number_per_epoch = len(train_dataloader) if train_update_number_per_epoch < 1: logging.warning("train batch size가 데이터 수보다 큼") exit(0) valid_list = glob.glob(os.path.join(valid_dataset_path, "*")) if valid_list: valid_dataloader, valid_dataset = validdataloader( path=valid_dataset_path, input_size=input_size, batch_size=valid_size, num_workers=num_workers, shuffle=True, mean=mean, std=std, scale_factor=scale_factor, make_target=True) valid_update_number_per_epoch = len(valid_dataloader) if valid_update_number_per_epoch < 1: logging.warning("valid batch size가 데이터 수보다 큼") exit(0) num_classes = train_dataset.num_class # 클래스 수 name_classes = train_dataset.classes optimizer = optimizer.upper() if pretrained_base: model = str(input_size[0]) + "_" + str( input_size[1]) + "_" + optimizer + "_P" + "CENTER_RES" + str(base) else: model = str(input_size[0]) + "_" + str( input_size[1]) + "_" + optimizer + "_CENTER_RES" + str(base) weight_path = os.path.join("weights", f"{model}") sym_path = os.path.join(weight_path, f'{model}-symbol.json') param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params') optimizer_path = os.path.join(weight_path, f'{model}-{load_period:04d}.opt') if os.path.exists(param_path) and os.path.exists(sym_path): start_epoch = load_period logging.info(f"loading {os.path.basename(param_path)}\n") net = 
gluon.SymbolBlock.imports(sym_path, ['data'], param_path, ctx=ctx) else: start_epoch = 0 net = CenterNet(base=base, heads=OrderedDict([('heatmap', { 'num_output': num_classes, 'bias': -2.19 }), ('offset', { 'num_output': 2 }), ('wh', { 'num_output': 2 })]), head_conv_channel=64, pretrained=pretrained_base, root=pretrained_path, use_dcnv2=False, ctx=ctx) if isinstance(ctx, (list, tuple)): net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0])) else: net.summary(mx.nd.ones(shape=input_shape, ctx=ctx)) ''' active (bool, default True) – Whether to turn hybrid on or off. static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase. static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower. ''' if multiscale: net.hybridize(active=True, static_alloc=True, static_shape=False) else: net.hybridize(active=True, static_alloc=True, static_shape=True) if start_epoch + 1 >= epoch + 1: logging.info("this model has already been optimized") exit(0) if tensorboard: summary = SummaryWriter(logdir=os.path.join("mxboard", model), max_queue=10, flush_secs=10, verbose=False) if isinstance(ctx, (list, tuple)): net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0])) else: net.forward(mx.nd.ones(shape=input_shape, ctx=ctx)) summary.add_graph(net) if graphviz: gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model) # optimizer unit = 1 if (len(train_dataset) // batch_size) < 1 else len(train_dataset) // batch_size step = unit * decay_step lr_sch = mx.lr_scheduler.FactorScheduler(step=step, factor=decay_lr, stop_factor_lr=1e-12, base_lr=learning_rate) for p in net.collect_params().values(): if p.grad_req != "null": p.grad_req = 'add' ''' update_on_kvstore : bool, default None Whether to perform parameter updates on kvstore. 
If None, then trainer will choose the more suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored. ''' if optimizer.upper() == "ADAM": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "wd": weight_decay, "beta1": 0.9, "beta2": 0.999, 'multi_precision': False }, update_on_kvstore=False if AMP else None) # for Dynamic loss scaling elif optimizer.upper() == "RMSPROP": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "wd": weight_decay, "gamma1": 0.9, "gamma2": 0.999, 'multi_precision': False }, update_on_kvstore=False if AMP else None) # for Dynamic loss scaling elif optimizer.upper() == "SGD": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "wd": weight_decay, "momentum": 0.9, 'multi_precision': False }, update_on_kvstore=False if AMP else None) # for Dynamic loss scaling else: logging.error("optimizer not selected") exit(0) if AMP: amp.init_trainer(trainer) # optimizer weight 불러오기 if os.path.exists(optimizer_path): try: trainer.load_states(optimizer_path) except Exception as E: logging.info(E) else: logging.info(f"loading {os.path.basename(optimizer_path)}\n") heatmapfocalloss = HeatmapFocalLoss(from_sigmoid=True, alpha=2, beta=4) normedl1loss = NormedL1Loss() prediction = Prediction(batch_size=valid_size, topk=topk, scale=scale_factor, nms=nms, except_class_thresh=except_class_thresh, nms_thresh=nms_thresh) precision_recall = Voc_2007_AP(iou_thresh=iou_thresh, class_names=name_classes) ctx_list = ctx if isinstance(ctx, (list, tuple)) else [ctx] start_time = time.time() for i in tqdm(range(start_epoch + 1, epoch + 1, 1), initial=start_epoch + 1, total=epoch): heatmap_loss_sum = 0 offset_loss_sum = 0 wh_loss_sum = 0 
time_stamp = time.time() ''' target generator를 train_dataloader에서 만들어 버리는게 학습 속도가 훨씬 빠르다. ''' for batch_count, (image, _, heatmap, offset_target, wh_target, mask_target, _) in enumerate(train_dataloader, start=1): td_batch_size = image.shape[0] image_split = mx.nd.split(data=image, num_outputs=subdivision, axis=0) heatmap_split = mx.nd.split(data=heatmap, num_outputs=subdivision, axis=0) offset_target_split = mx.nd.split(data=offset_target, num_outputs=subdivision, axis=0) wh_target_split = mx.nd.split(data=wh_target, num_outputs=subdivision, axis=0) mask_target_split = mx.nd.split(data=mask_target, num_outputs=subdivision, axis=0) if subdivision == 1: image_split = [image_split] heatmap_split = [heatmap_split] offset_target_split = [offset_target_split] wh_target_split = [wh_target_split] mask_target_split = [mask_target_split] ''' autograd 설명 https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html ''' with autograd.record(train_mode=True): heatmap_all_losses = [] offset_all_losses = [] wh_all_losses = [] for image_part, heatmap_part, offset_target_part, wh_target_part, mask_target_part in zip( image_split, heatmap_split, offset_target_split, wh_target_split, mask_target_split): image_part = gluon.utils.split_and_load(image_part, ctx_list, even_split=False) heatmap_part = gluon.utils.split_and_load(heatmap_part, ctx_list, even_split=False) offset_target_part = gluon.utils.split_and_load( offset_target_part, ctx_list, even_split=False) wh_target_part = gluon.utils.split_and_load( wh_target_part, ctx_list, even_split=False) mask_target_part = gluon.utils.split_and_load( mask_target_part, ctx_list, even_split=False) # prediction, target space for Data Parallelism heatmap_losses = [] offset_losses = [] wh_losses = [] total_loss = [] # gpu N 개를 대비한 코드 (Data Parallelism) for img, heatmap_target, offset_target, wh_target, mask_target in zip( image_part, heatmap_part, offset_target_part, wh_target_part, mask_target_part): 
heatmap_pred, offset_pred, wh_pred = net(img) heatmap_loss = heatmapfocalloss( heatmap_pred, heatmap_target) offset_loss = normedl1loss(offset_pred, offset_target, mask_target) * lambda_off wh_loss = normedl1loss(wh_pred, wh_target, mask_target) * lambda_size heatmap_losses.append(heatmap_loss.asscalar()) offset_losses.append(offset_loss.asscalar()) wh_losses.append(wh_loss.asscalar()) total_loss.append(heatmap_loss + offset_loss + wh_loss) if AMP: with amp.scale_loss(total_loss, trainer) as scaled_loss: autograd.backward(scaled_loss) else: autograd.backward(total_loss) heatmap_all_losses.append(sum(heatmap_losses)) offset_all_losses.append(sum(offset_losses)) wh_all_losses.append(sum(wh_losses)) trainer.step(batch_size=td_batch_size, ignore_stale_grad=False) # 비우기 for p in net.collect_params().values(): p.zero_grad() heatmap_loss_sum += sum(heatmap_all_losses) / td_batch_size offset_loss_sum += sum(offset_all_losses) / td_batch_size wh_loss_sum += sum(wh_all_losses) / td_batch_size if batch_count % batch_log == 0: logging.info( f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],' f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],' f'[Lr = {trainer.learning_rate}]' f'[heatmap loss = {sum(heatmap_all_losses) / td_batch_size:.3f}]' f'[offset loss = {sum(offset_all_losses) / td_batch_size:.3f}]' f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]') time_stamp = time.time() train_heatmap_loss_mean = np.divide(heatmap_loss_sum, train_update_number_per_epoch) train_offset_loss_mean = np.divide(offset_loss_sum, train_update_number_per_epoch) train_wh_loss_mean = np.divide(wh_loss_sum, train_update_number_per_epoch) train_total_loss_mean = train_heatmap_loss_mean + train_offset_loss_mean + train_wh_loss_mean logging.info( f"train heatmap loss : {train_heatmap_loss_mean} / train offset loss : {train_offset_loss_mean} / train wh loss : {train_wh_loss_mean} / train total loss : {train_total_loss_mean}" ) if i % save_period == 0: 
weight_epoch_path = os.path.join(weight_path, str(i)) if not os.path.exists(weight_epoch_path): os.makedirs(weight_epoch_path) # optimizer weight 저장하기 try: trainer.save_states( os.path.join(weight_path, f'{model}-{i:04d}.opt')) except Exception as E: logging.error(f"optimizer weight export 예외 발생 : {E}") else: logging.info("optimizer weight export 성공") ''' Hybrid models can be serialized as JSON files using the export function Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface. When there are only one input, it will have name data. When there Are more than one inputs, they will be named as data0, data1, etc. ''' if GPU_COUNT >= 1: context = mx.gpu(0) else: context = mx.cpu(0) ''' mxnet1.6.0 버전 에서 AMP 사용시 위에 미리 선언한 prediction을 사용하면 문제가 될 수 있다. -yolo v3, gaussian yolo v3 에서는 문제가 발생한다. mxnet 1.5.x 버전에서는 아래와 같이 새로 선언하지 않아도 정상 동작한다. block들은 함수 인자로 보낼 경우 자기 자신이 보내진다.(복사되는 것이 아님) export_block_for_cplusplus 에서 prediction 이 hybridize 되면서 미리 선언한 prediction도 hybridize화 되면서 symbol 형태가 된다. 이런 현상을 보면 아래와같이 다시 선언해 주는게 맞는 것 같다. 
''' auxnet = Prediction(topk=topk, scale=scale_factor, nms=nms, except_class_thresh=except_class_thresh, nms_thresh=nms_thresh) postnet = PostNet(net=net, auxnet=auxnet) # 새로운 객체가 생성 try: net.export(os.path.join(weight_path, f"{model}"), epoch=i, remove_amp_cast=True) net.save_parameters(os.path.join(weight_path, f"{i}.params")) # onnx 추출용 # network inference, decoder, nms까지 처리됨 - mxnet c++에서 편리함 export_block_for_cplusplus( path=os.path.join(weight_epoch_path, f"{model}_prepost"), block=postnet, data_shape=tuple(input_size) + tuple((3, )), epoch=i, preprocess= True, # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨 layout='HWC', ctx=context, remove_amp_cast=True) except Exception as E: logging.error(f"json, param model export 예외 발생 : {E}") else: logging.info("json, param model export 성공") net.collect_params().reset_ctx(ctx) if i % eval_period == 0 and valid_list: heatmap_loss_sum = 0 offset_loss_sum = 0 wh_loss_sum = 0 # loss 구하기 for image, label, heatmap_all, offset_target_all, wh_target_all, mask_target_all, _ in valid_dataloader: vd_batch_size = image.shape[0] image = gluon.utils.split_and_load(image, ctx_list, even_split=False) label = gluon.utils.split_and_load(label, ctx_list, even_split=False) heatmap_split = gluon.utils.split_and_load(heatmap_all, ctx_list, even_split=False) offset_target_split = gluon.utils.split_and_load( offset_target_all, ctx_list, even_split=False) wh_target_split = gluon.utils.split_and_load(wh_target_all, ctx_list, even_split=False) mask_target_split = gluon.utils.split_and_load( mask_target_all, ctx_list, even_split=False) # prediction, target space for Data Parallelism heatmap_losses = [] offset_losses = [] wh_losses = [] # gpu N 개를 대비한 코드 (Data Parallelism) for img, lb, heatmap_target, offset_target, wh_target, mask_target in zip( image, label, heatmap_split, offset_target_split, wh_target_split, mask_target_split): gt_box = lb[:, :, :4] gt_id = lb[:, :, 4:5] heatmap_pred, offset_pred, wh_pred = net(img) id, score, bbox = 
prediction(heatmap_pred, offset_pred, wh_pred) precision_recall.update(pred_bboxes=bbox, pred_labels=id, pred_scores=score, gt_boxes=gt_box * scale_factor, gt_labels=gt_id) heatmap_loss = heatmapfocalloss(heatmap_pred, heatmap_target) offset_loss = normedl1loss(offset_pred, offset_target, mask_target) * lambda_off wh_loss = normedl1loss(wh_pred, wh_target, mask_target) * lambda_size heatmap_losses.append(heatmap_loss.asscalar()) offset_losses.append(offset_loss.asscalar()) wh_losses.append(wh_loss.asscalar()) heatmap_loss_sum += sum(heatmap_losses) / vd_batch_size offset_loss_sum += sum(offset_losses) / vd_batch_size wh_loss_sum += sum(wh_losses) / vd_batch_size valid_heatmap_loss_mean = np.divide(heatmap_loss_sum, valid_update_number_per_epoch) valid_offset_loss_mean = np.divide(offset_loss_sum, valid_update_number_per_epoch) valid_wh_loss_mean = np.divide(wh_loss_sum, valid_update_number_per_epoch) valid_total_loss_mean = valid_heatmap_loss_mean + valid_offset_loss_mean + valid_wh_loss_mean logging.info( f"valid heatmap loss : {valid_heatmap_loss_mean} / valid offset loss : {valid_offset_loss_mean} / valid wh loss : {valid_wh_loss_mean} / valid total loss : {valid_total_loss_mean}" ) AP_appender = [] round_position = 2 class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list( ) for j, c, p, r in zip(range(len(recall)), class_name, precision, recall): name, AP = precision_recall.get_AP(c, p, r) logging.info( f"class {j}'s {name} AP : {round(AP * 100, round_position)}%" ) AP_appender.append(AP) AP_appender = np.nan_to_num(AP_appender) mAP_result = np.mean(AP_appender) logging.info(f"mAP : {round(mAP_result * 100, round_position)}%") precision_recall.get_PR_curve(name=class_name, precision=precision, recall=recall, threshold=threshold, AP=AP_appender, mAP=mAP_result, folder_name=valid_graph_path, epoch=i, auto_open=valid_html_auto_open) precision_recall.reset() if tensorboard: # gpu N 개를 대비한 코드 (Data Parallelism) 
dataloader_iter = iter(valid_dataloader) image, label, _, _, _, _, _ = next(dataloader_iter) image = gluon.utils.split_and_load(image, ctx_list, even_split=False) label = gluon.utils.split_and_load(label, ctx_list, even_split=False) ground_truth_colors = {} for k in range(num_classes): ground_truth_colors[k] = (0, 1, 0) batch_image = [] for img, lb in zip(image, label): gt_boxes = lb[:, :, :4] gt_ids = lb[:, :, 4:5] heatmap_pred, offset_pred, wh_pred = net(img) ids, scores, bboxes = prediction(heatmap_pred, offset_pred, wh_pred) for ig, gt_id, gt_box, heatmap, id, score, bbox in zip( img, gt_ids, gt_boxes, heatmap_pred, ids, scores, bboxes): ig = ig.transpose((1, 2, 0)) * mx.nd.array( std, ctx=ig.context) + mx.nd.array(mean, ctx=ig.context) ig = (ig * 255).clip(0, 255) ig = ig.astype(np.uint8) # heatmap 그리기 heatmap = mx.nd.multiply(heatmap, 255.0) # 0 ~ 255 범위로 바꾸기 heatmap = mx.nd.max( heatmap, axis=0, keepdims=True) # channel 축으로 가장 큰것 뽑기 heatmap = mx.nd.transpose( heatmap, axes=(1, 2, 0)) # (height, width, channel=1) heatmap = mx.nd.repeat( heatmap, repeats=3, axis=-1) # (height, width, channel=3) heatmap = heatmap.asnumpy( ) # mxnet.ndarray -> numpy.ndarray heatmap = heatmap.astype("uint8") # float32 -> uint8 heatmap = cv2.resize(heatmap, dsize=(input_size[1], input_size[0])) # 사이즈 원복 heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET) heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB) # ground truth box 그리기 ground_truth = plot_bbox( ig, gt_box * scale_factor, scores=None, labels=gt_id, thresh=None, reverse_rgb=False, class_names=valid_dataset.classes, absolute_coordinates=True, colors=ground_truth_colors) # prediction box 그리기 prediction_box = plot_bbox( ground_truth, bbox, scores=score, labels=id, thresh=plot_class_thresh, reverse_rgb=False, class_names=valid_dataset.classes, absolute_coordinates=True, heatmap=heatmap) # Tensorboard에 그리기 위해 (height, width, channel) -> (channel, height, width) 를한다. 
prediction_box = np.transpose(prediction_box, axes=(2, 0, 1)) batch_image.append( prediction_box) # (batch, channel, height, width) summary.add_image(tag="valid_result", image=np.array(batch_image), global_step=i) summary.add_scalar(tag="heatmap_loss", value={ "train_heatmap_loss_mean": train_heatmap_loss_mean, "valid_heatmap_loss_mean": valid_heatmap_loss_mean }, global_step=i) summary.add_scalar(tag="offset_loss", value={ "train_offset_loss_mean": train_offset_loss_mean, "valid_offset_loss_mean": valid_offset_loss_mean }, global_step=i) summary.add_scalar(tag="wh_loss", value={ "train_wh_loss_mean": train_wh_loss_mean, "valid_wh_loss_mean": valid_wh_loss_mean }, global_step=i) summary.add_scalar(tag="total_loss", value={ "train_total_loss": train_total_loss_mean, "valid_total_loss": valid_total_loss_mean }, global_step=i) for p in net.collect_params().values(): summary.add_histogram(tag=p.name, values=p.data(ctx=ctx_list[0]), global_step=i, bins='default') end_time = time.time() learning_time = end_time - start_time logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H") logging.info("optimization completed") if using_mlflow: ml.log_metric("learning time", round(learning_time / 3600, 2))
if 'inceptionv3' in opt.model: train_patterns = '.*weight|.*bias|inception30_batchnorm0_gamma|inception30_batchnorm0_beta|inception30_batchnorm0_running_mean|inception30_batchnorm0_running_var' else: logger.info( 'Current model does not support partial batch normalization.') trainer = gluon.Trainer(net.collect_params(train_patterns), optimizer, optimizer_params) else: trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params) if opt.use_amp: amp.init() if opt.use_amp: amp.init_trainer(trainer) loss_fn = gluon.loss.SoftmaxCrossEntropyLoss() train_metric = mx.metric.Accuracy() train_history = TrainingHistory( ['training-acc', 'val-top1-acc', 'val-top5-acc']) epochs = opt.num_epochs lr_decay_count = 0 acc_top1 = mx.metric.Accuracy() acc_top5 = mx.metric.TopKAccuracy(5)
def _normalize_by_positives(loss, positive_numbers):
    """Divide a summed loss by the positive-anchor count, or zero it when the count is zero."""
    if positive_numbers:
        return mx.nd.divide(loss, positive_numbers)
    return mx.nd.multiply(loss, 0)


def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        anchor_alloc_size=[256, 256],
        box_sizes=[21, 51.2, 133.12, 215.04, 296.96, 378.88, 460.8, 542.72],
        box_ratios=[[1, 2, 0.5]] + [[1, 2, 0.5, 3, 1.0 / 3]] * 4 + [[1, 2, 0.5]] * 2,
        anchor_box_clip=True,
        graphviz=True,
        epoch=100,
        input_size=[400, 600],
        batch_log=100,
        batch_size=16,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        foreground_iou_thresh=0.5,
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        weight_decay=0.000001,
        save_period=10,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base="VGG16_512",
        pretrained_base=True,
        pretrained_path="modelparam",
        classHardNegativeMining=True,
        boxHardNegativeMining=True,
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        valid_html_auto_open=True,
        using_mlflow=True,
        decode_number=-1,
        multiperclass=True,
        nms_thresh=0.45,
        nms_topk=500,
        iou_thresh=0.5,
        except_class_thresh=0.01,
        plot_class_thresh=0.5):
    """Train an SSD (VGG16 backbone) detector, periodically saving and evaluating it.

    The function builds the network and data loaders, optionally resumes from
    the checkpoint written at ``load_period``, trains up to ``epoch``, exports
    weights every ``save_period`` epochs and, when the validation directory is
    not empty, evaluates every ``eval_period`` epochs (losses, per-class AP,
    mAP, optional TensorBoard images/scalars/histograms).

    Parameters (grouped by how the code uses them):
        mean, std: forwarded to the data loaders and used to de-normalise
            images before they are drawn for TensorBoard.
        anchor_alloc_size, box_sizes, box_ratios, anchor_box_clip: forwarded
            to ``SSD_VGG16`` (``anchor_alloc_size`` only for ``VGG16_300``).
        input_size: two-element spatial size; exported with layout ``'HWC'``
            as ``tuple(input_size) + (3,)``.
        epoch: index of the last epoch to train.
        batch_size, valid_size, num_workers, batch_interval, multiscale,
            factor_scale, data_augmentation, foreground_iou_thresh,
            train_dataset_path, valid_dataset_path: data-loader settings.
        subdivision: number of chunks each training batch is split into along
            axis 0; gradients are accumulated over the chunks and the trainer
            steps once per batch.
        batch_log: log training statistics every ``batch_log`` batches.
        optimizer: ``"ADAM"``, ``"RMSPROP"`` or ``"SGD"`` (case-insensitive).
        learning_rate, weight_decay, decay_lr, decay_step: optimizer and
            ``FactorScheduler`` settings.
        save_period, load_period, eval_period: checkpoint / resume /
            evaluation cadence in epochs.
        GPU_COUNT: 0 trains on CPU (and disables AMP), 1 on ``gpu(0)``,
            otherwise on ``gpu(0) .. gpu(GPU_COUNT - 1)``.
        base: ``"VGG16_300"`` or ``"VGG16_512"`` (case-insensitive).
        pretrained_base, pretrained_path: forwarded to ``SSD_VGG16``.
        classHardNegativeMining, boxHardNegativeMining: use hard negative
            mining for the confidence / localization training loss instead of
            the plain summed loss.
        AMP: enable MXNet automatic mixed precision.
        tensorboard, graphviz, using_mlflow: toggle the respective outputs.
        valid_graph_path, valid_html_auto_open: forwarded to the PR-curve
            plotting call.
        decode_number, multiperclass, nms_thresh, nms_topk,
            except_class_thresh: forwarded to ``Prediction``.
        iou_thresh: forwarded to ``Voc_2007_AP``.
        plot_class_thresh: score threshold used when drawing predicted boxes.

    Returns:
        None.  Configuration problems (unknown backbone or optimizer, empty
        loaders, already-trained model, batch smaller than GPU count) are
        logged and terminate the process through ``exit(0)``.
    """
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]
    ctx_list = ctx if isinstance(ctx, (list, tuple)) else [ctx]

    # Report the operating system.
    logging.info(f"{platform.system()} OS")

    # Report the device(s); GPU devices also get their memory statistics.
    if GPU_COUNT == 0:
        logging.info(f'Running on {ctx}')
    else:
        gib = 1024 * 1024 * 1024
        for device_index, device in enumerate(ctx_list):
            free_memory, total_memory = mx.context.gpu_memory_info(device_index)
            free_memory = round(free_memory / gib, 2)
            total_memory = round(total_memory / gib, 2)
            logging.info(f'Running on {device} / free memory : {free_memory}GB / total memory {total_memory}GB')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training SSD Detector")
    input_shape = (1, 3) + tuple(input_size)

    # A CPU-side network is handed to the data loaders (they receive it as
    # ``net`` together with make_target=True).
    if base.upper() == "VGG16_300":  # 300 x 300 input recommended
        net = SSD_VGG16(version=300, input_size=input_size,
                        box_sizes=box_sizes,
                        box_ratios=box_ratios,
                        anchor_box_clip=anchor_box_clip,
                        alloc_size=anchor_alloc_size,
                        ctx=mx.cpu())
    elif base.upper() == "VGG16_512":  # 512 x 512 input recommended
        net = SSD_VGG16(version=512, input_size=input_size,
                        box_sizes=box_sizes,
                        box_ratios=box_ratios,
                        anchor_box_clip=anchor_box_clip,
                        ctx=mx.cpu())
    else:
        # Without this branch an unknown backbone left ``net`` unbound and the
        # loader call below crashed with NameError.
        logging.warning("backbone 없음")
        exit(0)

    train_dataloader, train_dataset = traindataloader(multiscale=multiscale,
                                                      factor_scale=factor_scale,
                                                      augmentation=data_augmentation,
                                                      path=train_dataset_path,
                                                      input_size=input_size,
                                                      batch_size=batch_size,
                                                      batch_interval=batch_interval,
                                                      num_workers=num_workers,
                                                      shuffle=True, mean=mean, std=std,
                                                      net=net,
                                                      foreground_iou_thresh=foreground_iou_thresh,
                                                      make_target=True)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_dataloader, valid_dataset = validdataloader(path=valid_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=valid_size,
                                                          num_workers=num_workers,
                                                          shuffle=True, mean=mean, std=std,
                                                          net=net,
                                                          foreground_iou_thresh=foreground_iou_thresh,
                                                          make_target=True)
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # number of classes
    name_classes = train_dataset.classes

    # Checkpoint naming: <H>_<W>_<OPTIMIZER>_[P]<BASE>
    optimizer = optimizer.upper()
    base = base.upper()
    if pretrained_base:
        model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_P" + base
    else:
        model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_" + base

    weight_path = os.path.join("weights", f"{model}")
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')
    optimizer_path = os.path.join(weight_path, f'{model}-{load_period:04d}.opt')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        # Resume from the exported symbol + parameters.
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)}\n")
        net = gluon.SymbolBlock.imports(sym_path,
                                        ['data'],
                                        param_path, ctx=ctx)
    else:
        start_epoch = 0
        if base.upper() == "VGG16_300":  # 300 x 300 input recommended
            net = SSD_VGG16(version=300, input_size=input_size,
                            box_sizes=box_sizes,
                            box_ratios=box_ratios,
                            num_classes=num_classes,
                            pretrained=pretrained_base,
                            pretrained_path=pretrained_path,
                            anchor_box_clip=anchor_box_clip,
                            alloc_size=anchor_alloc_size,
                            ctx=ctx)
        elif base.upper() == "VGG16_512":  # 512 x 512 input recommended
            net = SSD_VGG16(version=512, input_size=input_size,
                            box_sizes=box_sizes,
                            box_ratios=box_ratios,
                            num_classes=num_classes,
                            pretrained=pretrained_base,
                            pretrained_path=pretrained_path,
                            anchor_box_clip=anchor_box_clip,
                            ctx=ctx)
        else:
            logging.warning("backbone 없음")
            exit(0)

        net.summary(mx.nd.ones(shape=input_shape, ctx=ctx_list[0]))

        # hybridize() options (from the MXNet documentation):
        #   active       - whether to turn hybrid on or off.
        #   static_alloc - statically allocate memory to improve speed;
        #                  memory usage may increase.
        #   static_shape - optimize for invariant input shapes between
        #                  iterations; requires static_alloc=True. Changing
        #                  input shapes is still allowed but slower.
        # Multi-scale training changes the input shape, so static_shape is
        # only enabled without it.
        if multiscale:
            net.hybridize(active=True, static_alloc=True, static_shape=False)
        else:
            net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model), max_queue=10, flush_secs=10,
                                verbose=False)
        # Run one forward pass before the graph is handed to the writer.
        net.forward(mx.nd.ones(shape=input_shape, ctx=ctx_list[0]))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) // batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step, factor=decay_lr, stop_factor_lr=1e-12,
                                             base_lr=learning_rate)

    # Gradients are accumulated across the sub-divisions of a batch.
    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    # update_on_kvstore (from the MXNet documentation): whether to perform
    # parameter updates on kvstore. If None, the trainer picks the more
    # suitable option for the kvstore type. When the argument is provided the
    # MXNET_UPDATE_ON_KVSTORE environment variable is ignored.
    # It is forced to False under AMP for dynamic loss scaling.
    optimizer_specific_params = {
        "ADAM": {"beta1": 0.9, "beta2": 0.999},
        "RMSPROP": {"gamma1": 0.9, "gamma2": 0.999},
        "SGD": {"momentum": 0.9},
    }
    if optimizer.upper() not in optimizer_specific_params:
        logging.error("optimizer not selected")
        exit(0)

    optimizer_params = {"learning_rate": learning_rate,
                        "lr_scheduler": lr_sch,
                        "wd": weight_decay}
    optimizer_params.update(optimizer_specific_params[optimizer.upper()])
    optimizer_params['multi_precision'] = False
    trainer = gluon.Trainer(net.collect_params(), optimizer,
                            optimizer_params=optimizer_params,
                            update_on_kvstore=False if AMP else None)

    if AMP:
        amp.init_trainer(trainer)

    # Load the optimizer state saved alongside the resumed checkpoint, if any.
    if os.path.exists(optimizer_path):
        try:
            trainer.load_states(optimizer_path)
        except Exception as E:
            logging.info(E)
        else:
            logging.info(f"loading {os.path.basename(optimizer_path)}\n")

    # localization loss -> Smooth L1 (Huber) loss, confidence loss -> softmax
    # cross entropy. Built once: training uses them only when the matching
    # hard-negative-mining flag is off, validation always uses them.
    confidence_loss = SoftmaxCrossEntropyLoss(axis=-1,
                                              sparse_label=True,
                                              from_log_softmax=False,
                                              batch_axis=None,
                                              reduction="sum",
                                              exclude=False)
    localization_loss = HuberLoss(rho=1,
                                  batch_axis=None,
                                  reduction="sum",
                                  exclude=False)

    prediction = Prediction(
        batch_size=batch_size,
        from_softmax=False,
        num_classes=num_classes,
        decode_number=decode_number,
        nms_thresh=nms_thresh,
        nms_topk=nms_topk,
        except_class_thresh=except_class_thresh,
        multiperclass=multiperclass)

    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh, class_names=name_classes)

    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1), initial=start_epoch + 1, total=epoch):

        conf_loss_sum = 0
        loc_loss_sum = 0
        time_stamp = time.time()

        for batch_count, (image, _, cls_all, box_all, _) in enumerate(train_dataloader, start=1):
            td_batch_size = image.shape[0]

            image = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            cls_all = mx.nd.split(data=cls_all, num_outputs=subdivision, axis=0)
            box_all = mx.nd.split(data=box_all, num_outputs=subdivision, axis=0)

            # With a single output, split() yields the array itself rather
            # than a list, so wrap each one to keep the zip below uniform.
            # (Previously undefined names cls_t_all / box_t_all were wrapped
            # here, raising NameError whenever subdivision == 1.)
            if subdivision == 1:
                image = [image]
                cls_all = [cls_all]
                box_all = [box_all]

            with autograd.record(train_mode=True):

                cls_all_losses = []
                box_all_losses = []

                for image_split, cls_split, box_split in zip(image, cls_all, box_all):

                    image_split = gluon.utils.split_and_load(image_split, ctx_list, even_split=False)
                    cls_split = gluon.utils.split_and_load(cls_split, ctx_list, even_split=False)
                    box_split = gluon.utils.split_and_load(box_split, ctx_list, even_split=False)

                    # Per-device accumulators (data parallelism).
                    cls_losses = []
                    box_losses = []
                    total_loss = []

                    for img, cls_target, box_target in zip(image_split, cls_split, box_split):
                        # SSD network inference
                        cls_pred, box_pred, anchor = net(img)

                        # Hard negative mining (SSD paper): after matching,
                        # most default boxes are negatives, which creates a
                        # strong imbalance between positive and negative
                        # examples. Instead of using every negative, they are
                        # sorted by confidence loss and only the top ones are
                        # kept so that negatives : positives is at most 3 : 1,
                        # which leads to faster and more stable training.
                        weight_term_alpha = 1
                        negative_mining_ratio = 3
                        positive_samples = cls_target > 0
                        positive_numbers = positive_samples.sum()

                        if classHardNegativeMining:
                            pred = mx.nd.log_softmax(cls_pred, axis=-1)
                            negative_samples = 1 - positive_samples
                            conf_loss = -mx.nd.pick(pred, cls_target, axis=-1)  # (batch, all feature number)

                            # Rank the negatives by confidence loss (highest
                            # first) and keep those ranked within
                            # positives * ratio.
                            negative_samples_conf_loss = (conf_loss * negative_samples)
                            # The next three lines follow gluoncv.loss.SSDMultiBoxLoss.
                            negative_samples_index = mx.nd.argsort(negative_samples_conf_loss, axis=-1,
                                                                   is_ascend=False)
                            selection = mx.nd.argsort(negative_samples_index, axis=-1, is_ascend=True)
                            hard_negative_samples = selection <= mx.nd.multiply(
                                positive_numbers, negative_mining_ratio).expand_dims(-1)

                            pos_hardnega = positive_samples + hard_negative_samples
                            conf_loss = mx.nd.where(pos_hardnega > 0, conf_loss, mx.nd.zeros_like(conf_loss))
                            conf_loss = mx.nd.sum(conf_loss)
                        else:
                            conf_loss = confidence_loss(cls_pred, cls_target,
                                                        positive_samples.expand_dims(axis=-1))
                        conf_loss = _normalize_by_positives(conf_loss, positive_numbers)
                        cls_losses.append(conf_loss.asscalar())

                        if boxHardNegativeMining:
                            # Apply the same hard negative selection to the
                            # localization loss.
                            pred = mx.nd.log_softmax(cls_pred, axis=-1)
                            negative_samples = 1 - positive_samples
                            conf_loss_for_box = -mx.nd.pick(pred, cls_target, axis=-1)  # (batch, all feature number)
                            negative_samples_conf_loss = (conf_loss_for_box * negative_samples)
                            negative_samples_index = mx.nd.argsort(negative_samples_conf_loss, axis=-1,
                                                                   is_ascend=False)
                            selection = mx.nd.argsort(negative_samples_index, axis=-1, is_ascend=True)
                            hard_negative_samples = selection <= mx.nd.multiply(
                                positive_numbers, negative_mining_ratio).expand_dims(-1)

                            pos_hardnega = positive_samples + hard_negative_samples
                            # Repeat the mask over the four box coordinates.
                            pos_hardnega = mx.nd.repeat(pos_hardnega.reshape(shape=(0, 0, 1)), repeats=4, axis=-1)

                            # Element-wise Huber (smooth L1) loss with rho = 1.
                            loc_loss = mx.nd.abs(box_pred - box_target)
                            loc_loss = mx.nd.where(loc_loss > 1, loc_loss - 0.5, (0.5 / 1) * mx.nd.square(loc_loss))
                            loc_loss = mx.nd.where(pos_hardnega > 0, loc_loss, mx.nd.zeros_like(loc_loss))
                            loc_loss = mx.nd.sum(loc_loss)
                        else:
                            loc_loss = localization_loss(box_pred, box_target,
                                                         positive_samples.expand_dims(axis=-1))
                        loc_loss = _normalize_by_positives(loc_loss, positive_numbers)
                        box_losses.append(loc_loss.asscalar())

                        total_loss.append(conf_loss + weight_term_alpha * loc_loss)

                    if AMP:
                        with amp.scale_loss(total_loss, trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    cls_all_losses.append(sum(cls_losses))
                    box_all_losses.append(sum(box_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # Clear the accumulated gradients (grad_req is 'add').
            for p in net.collect_params().values():
                p.zero_grad()

            conf_loss_sum += sum(cls_all_losses) / td_batch_size
            loc_loss_sum += sum(box_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                             f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                             f'[Lr = {trainer.learning_rate}]'
                             f'[confidence loss = {sum(cls_all_losses) / td_batch_size:.3f}]'
                             f'[localization loss = {sum(box_all_losses) / td_batch_size:.3f}]')
            time_stamp = time.time()

        train_conf_loss_mean = np.divide(conf_loss_sum, train_update_number_per_epoch)
        train_loc_loss_mean = np.divide(loc_loss_sum, train_update_number_per_epoch)
        train_total_loss_mean = train_conf_loss_mean + train_loc_loss_mean

        logging.info(
            f"train confidence loss : {train_conf_loss_mean} / train localization loss : {train_loc_loss_mean} / train total loss : {train_total_loss_mean}")

        if i % save_period == 0:

            weight_epoch_path = os.path.join(weight_path, str(i))
            if not os.path.exists(weight_epoch_path):
                os.makedirs(weight_epoch_path)

            # Save the optimizer state.
            try:
                trainer.save_states(os.path.join(weight_path, f'{model}-{i:04d}.opt'))
            except Exception as E:
                logging.error(f"optimizer weight export 예외 발생 : {E}")
            else:
                logging.info("optimizer weight export 성공")

            # Hybrid models can be serialized as JSON files using export():
            # the HybridBlock is written in a format loadable by
            # SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            # A single input is named "data"; several inputs are named
            # data0, data1, ...
            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            # With MXNet 1.6.0 and AMP, reusing the `prediction` block
            # declared earlier can cause problems (it does for YOLOv3 and
            # Gaussian YOLOv3); MXNet 1.5.x works without re-declaring it.
            # Blocks are passed by reference, so when
            # export_block_for_cplusplus hybridizes its block the earlier
            # `prediction` would be hybridized too. A fresh Prediction is
            # therefore created for the export.
            auxnet = Prediction(
                from_softmax=False,
                num_classes=num_classes,
                decode_number=decode_number,
                nms_thresh=nms_thresh,
                nms_topk=nms_topk,
                except_class_thresh=except_class_thresh,
                multiperclass=multiperclass)
            postnet = PostNet(net=net, auxnet=auxnet)

            try:
                net.export(os.path.join(weight_path, f"{model}"), epoch=i, remove_amp_cast=True)
                net.save_parameters(os.path.join(weight_path, f"{i}.params"))  # for ONNX export

                # Network inference, decoding and NMS in one graph - handy for
                # the MXNet C++ API / cannot be exported to ONNX.
                export_block_for_cplusplus(path=os.path.join(weight_epoch_path, f"{model}_prepost"),
                                           block=postnet,
                                           data_shape=tuple(input_size) + tuple((3,)),
                                           epoch=i,
                                           preprocess=True,  # images read with OpenCV can be fed as-is from C++
                                           layout='HWC',
                                           ctx=context,
                                           remove_amp_cast=True)
            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")

            # Put the parameters back on the training context(s) whether or
            # not the export succeeded, so training can continue.
            net.collect_params().reset_ctx(ctx)

        if i % eval_period == 0 and valid_list:

            conf_loss_sum = 0
            loc_loss_sum = 0
            for image, label, cls_all, box_all, _ in valid_dataloader:

                vd_batch_size = image.shape[0]
                image = gluon.utils.split_and_load(image, ctx_list, even_split=False)
                label = gluon.utils.split_and_load(label, ctx_list, even_split=False)
                cls_all = gluon.utils.split_and_load(cls_all, ctx_list, even_split=False)
                box_all = gluon.utils.split_and_load(box_all, ctx_list, even_split=False)

                # Per-device accumulators (data parallelism).
                cls_losses = []
                box_losses = []

                for img, lb, cls_target, box_target in zip(image, label, cls_all, box_all):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    pred_id, score, bbox = prediction(cls_pred, box_pred, anchor)

                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=pred_id,
                                            pred_scores=score,
                                            gt_boxes=gt_box,
                                            gt_labels=gt_id)

                    positive_samples = cls_target > 0
                    positive_numbers = positive_samples.sum()

                    conf_loss = confidence_loss(cls_pred, cls_target, positive_samples.expand_dims(axis=-1))
                    conf_loss = _normalize_by_positives(conf_loss, positive_numbers)
                    cls_losses.append(conf_loss.asscalar())

                    loc_loss = localization_loss(box_pred, box_target, positive_samples.expand_dims(axis=-1))
                    loc_loss = _normalize_by_positives(loc_loss, positive_numbers)
                    box_losses.append(loc_loss.asscalar())

                conf_loss_sum += sum(cls_losses) / vd_batch_size
                loc_loss_sum += sum(box_losses) / vd_batch_size

            valid_conf_loss_mean = np.divide(conf_loss_sum, valid_update_number_per_epoch)
            valid_loc_loss_mean = np.divide(loc_loss_sum, valid_update_number_per_epoch)
            valid_total_loss_mean = valid_conf_loss_mean + valid_loc_loss_mean

            logging.info(
                f"valid confidence loss : {valid_conf_loss_mean} / valid localization loss : {valid_loc_loss_mean} / valid total loss : {valid_total_loss_mean}")

            AP_appender = []
            round_position = 2
            class_name, precision, recall, _, _, threshold = precision_recall.get_PR_list()
            # zip() stops at the shortest sequence, as the original
            # zip(range(len(recall)), ...) did.
            for j, (c, p, r) in enumerate(zip(class_name, precision, recall)):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(f"class {j}'s {name} AP : {round(AP * 100, round_position)}%")
                AP_appender.append(AP)

            # NaN APs are replaced by 0 before averaging.
            AP_appender = np.nan_to_num(AP_appender)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender, mAP=mAP_result, folder_name=valid_graph_path, epoch=i,
                                          auto_open=valid_html_auto_open)
            precision_recall.reset()

            if tensorboard:
                # Draw the first validation batch (data parallelism aware).
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _ = next(dataloader_iter)

                image = gluon.utils.split_and_load(image, ctx_list, even_split=False)
                label = gluon.utils.split_and_load(label, ctx_list, even_split=False)

                ground_truth_colors = {k: (0, 1, 0) for k in range(num_classes)}

                batch_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    ids, scores, bboxes = prediction(cls_pred, box_pred, anchor)

                    for ig, gt_id, gt_box, pred_id, score, bbox in zip(img, gt_ids, gt_boxes, ids, scores, bboxes):
                        # Undo the normalisation and convert to uint8 pixels.
                        ig = ig.transpose(
                            (1, 2, 0)) * mx.nd.array(std, ctx=ig.context) + mx.nd.array(mean, ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)
                        ig = ig.astype(np.uint8)

                        # Draw the ground-truth boxes.
                        ground_truth = plot_bbox(ig, gt_box, scores=None, labels=gt_id, thresh=None,
                                                 reverse_rgb=False,
                                                 class_names=valid_dataset.classes,
                                                 absolute_coordinates=True,
                                                 colors=ground_truth_colors)
                        # Draw the predicted boxes on top.
                        prediction_box = plot_bbox(ground_truth, bbox, scores=score, labels=pred_id,
                                                   thresh=plot_class_thresh,
                                                   reverse_rgb=False,
                                                   class_names=valid_dataset.classes,
                                                   absolute_coordinates=True)

                        # TensorBoard expects (channel, height, width).
                        prediction_box = np.transpose(prediction_box, axes=(2, 0, 1))
                        batch_image.append(prediction_box)  # (batch, channel, height, width)

                summary.add_image(tag="valid_result", image=np.array(batch_image), global_step=i)
                summary.add_scalar(tag="conf_loss", value={"train_conf_loss": train_conf_loss_mean,
                                                           "valid_conf_loss": valid_conf_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="loc_loss", value={"train_loc_loss": train_loc_loss_mean,
                                                          "valid_loc_loss": valid_loc_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="total_loss", value={
                    "train_total_loss": train_total_loss_mean,
                    "valid_total_loss": valid_total_loss_mean},
                                   global_step=i)

                for p in net.collect_params().values():
                    summary.add_histogram(tag=p.name, values=p.data(ctx=ctx_list[0]), global_step=i, bins='default')

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        offset_alloc_size=(64, 64),
        anchors={"shallow": [(10, 13), (16, 30), (33, 23)],
                 "middle": [(30, 61), (62, 45), (59, 119)],
                 "deep": [(116, 90), (156, 198), (373, 326)]},
        graphviz=False,
        epoch=100,
        input_size=[416, 416],
        batch_log=100,
        batch_size=16,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=False,
        factor_scale=[13, 5],
        ignore_threshold=0.5,
        dynamic=False,
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        Darknetlayer=53,
        pretrained_base=True,
        pretrained_path="modelparam",
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        using_mlflow=True,
        multiperclass=True,
        nms_thresh=0.5,
        nms_topk=500,
        except_class_thresh=0.05,
        plot_class_thresh=0.5):
    """Train a Gaussian YOLOv3 detector, periodically validating and exporting it.

    The function builds the data loaders and the network, optionally resumes
    from ``weights/<model>/<model>-<load_period:04d>.params``, then runs the
    epoch loop.  Configuration problems are logged and terminate the process
    through ``exit(0)``.

    Args:
        mean, std: Per-channel normalisation values handed to the data loaders
            and used to de-normalise images before drawing them.
        offset_alloc_size: Forwarded to ``Yolov3`` as ``alloc_size``.
        anchors: Anchor sizes forwarded to ``Yolov3``.
        graphviz: When true, plot the network with ``gluoncv``.
        epoch: Index of the last epoch to train (epochs are 1-based).
        input_size: ``[height, width]`` pair; both must be multiples of 32.
        batch_log: Log training losses every ``batch_log`` batches.
        batch_size, batch_interval, multiscale, factor_scale,
        ignore_threshold, dynamic, data_augmentation, num_workers:
            Forwarded to the data loaders.
        subdivision: Number of sub-batches each loaded batch is split into;
            gradients are accumulated over them before one optimizer step.
        train_dataset_path, valid_dataset_path: Dataset directories.
            Validation is skipped when the validation directory is empty.
        optimizer: One of ``"ADAM"``, ``"RMSPROP"``, ``"SGD"`` (any case).
        save_period: Export the model every ``save_period`` epochs.
        load_period: Epoch number of the checkpoint to resume from.
        learning_rate, decay_lr, decay_step: Base learning rate and the
            factor/step of the ``FactorScheduler``.
        GPU_COUNT: ``0`` trains on CPU (and disables AMP), ``1`` on one GPU,
            larger values on that many GPUs.
        Darknetlayer, pretrained_base, pretrained_path: Backbone options
            forwarded to ``Yolov3``.
        AMP: Enable MXNet automatic mixed precision.
        valid_size: Validation batch size.
        eval_period: Validate every ``eval_period`` epochs.
        tensorboard: Write graphs, images, scalars and histograms to mxboard.
        valid_graph_path: Folder for the precision/recall curves.
        using_mlflow: Log the total training time to mlflow.
        multiperclass, nms_thresh, nms_topk, except_class_thresh:
            Forwarded to ``Prediction``.
        plot_class_thresh: Score threshold for drawing predicted boxes.
    """
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # Uniform list view of the device(s) so data can always be scattered the same way.
    ctx_list = ctx if isinstance(ctx, (list, tuple)) else [ctx]

    def _to_devices(array):
        """Scatter one batch array over every training device."""
        return gluon.utils.split_and_load(array, ctx_list, even_split=False)

    # Report the operating system.
    logging.info(f"{platform.system()} OS")

    bytes_per_gb = 1024 * 1024 * 1024
    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / bytes_per_gb, 2)
            total_memory = round(total_memory / bytes_per_gb, 2)
            logging.info(f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB')
    elif GPU_COUNT == 1:
        free_memory, total_memory = mx.context.gpu_memory_info(0)
        free_memory = round(free_memory / bytes_per_gb, 2)
        total_memory = round(total_memory / bytes_per_gb, 2)
        logging.info(f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB')
    else:
        logging.info(f'Running on {ctx}')

    # Force the input size to a multiple of 32 so the strides are not distorted.
    # Either dimension being off is an error (the original `and` only rejected
    # sizes where BOTH dimensions were wrong).
    if input_size[0] % 32 != 0 or input_size[1] % 32 != 0:
        logging.info("The input size must be a multiple of 32")
        exit(0)

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training Gaussian YoloV3 Detector")
    input_shape = (1, 3) + tuple(input_size)

    try:
        # A CPU-side network instance is handed to the loaders (`net=net`).
        net = Yolov3(Darknetlayer=Darknetlayer,
                     anchors=anchors,
                     pretrained=False,
                     ctx=mx.cpu())
        train_dataloader, train_dataset = traindataloader(multiscale=multiscale,
                                                          factor_scale=factor_scale,
                                                          augmentation=data_augmentation,
                                                          path=train_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=batch_size,
                                                          batch_interval=batch_interval,
                                                          num_workers=num_workers,
                                                          shuffle=True, mean=mean, std=std,
                                                          net=net, ignore_threshold=ignore_threshold,
                                                          dynamic=dynamic,
                                                          from_sigmoid=False, make_target=True)
        valid_dataloader, valid_dataset = validdataloader(path=valid_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=valid_size,
                                                          num_workers=num_workers,
                                                          shuffle=True, mean=mean, std=std,
                                                          net=net, ignore_threshold=ignore_threshold,
                                                          dynamic=dynamic,
                                                          from_sigmoid=False, make_target=True)
    except Exception as E:
        logging.info(E)
        exit(0)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # number of classes
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    backbone_tag = "_PDark_" if pretrained_base else "_Dark_"
    model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + backbone_tag + str(Darknetlayer)

    weight_path = f"weights/{model}"
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)} weights\n")
        net = gluon.SymbolBlock.imports(sym_path,
                                        ['data'],
                                        param_path, ctx=ctx)
    else:
        start_epoch = 0
        # Strategy for accepting arbitrary input images from MXNet C++:
        #   alloc_size : tuple of int, default is (128, 128)
        #   For advanced users. Define `alloc_size` to generate large enough
        #   offset maps, which will later saved in parameters. During
        #   inference, we support arbitrary input image by cropping
        #   corresponding area of the anchor map. This allow us to export to
        #   symbol so we can run it in c++, Scalar, etc.
        net = Yolov3(Darknetlayer=Darknetlayer,
                     input_size=input_size,
                     anchors=anchors,
                     num_classes=num_classes,  # foreground only
                     pretrained=pretrained_base,
                     pretrained_path=pretrained_path,
                     alloc_size=offset_alloc_size,
                     ctx=ctx)

        net.summary(mx.nd.ones(shape=input_shape, ctx=ctx_list[0]))

        # hybridize() options:
        #   active (bool, default True) - Whether to turn hybrid on or off.
        #   static_alloc (bool, default False) - Statically allocate memory to
        #     improve speed. Memory usage may increase.
        #   static_shape (bool, default False) - Optimize for invariant input
        #     shapes between iterations. Must also set static_alloc to True.
        #     Change of input shapes is still allowed but slower.
        # Multi-scale training changes the input shape, so static_shape is off there.
        net.hybridize(active=True, static_alloc=True, static_shape=not multiscale)

    if start_epoch >= epoch:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model), max_queue=10, flush_secs=10,
                                verbose=False)
        # One forward pass before add_graph, as in the original flow.
        net.forward(mx.nd.ones(shape=input_shape, ctx=ctx_list[0]))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model)

    # optimizer
    unit = max(1, len(train_dataset) // batch_size)
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step, factor=decay_lr, stop_factor_lr=1e-12,
                                             base_lr=learning_rate)

    # Accumulate gradients over the sub-batches of one loaded batch.
    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    optimizer_specific_params = {
        "ADAM": {"beta1": 0.9, "beta2": 0.999},
        "RMSPROP": {"gamma1": 0.9, "gamma2": 0.999},
        "SGD": {"wd": 0.0005, "momentum": 0.9},
    }
    if optimizer not in optimizer_specific_params:
        logging.error("optimizer not selected")
        exit(0)
    optimizer_params = {"learning_rate": learning_rate,
                        "lr_scheduler": lr_sch,
                        **optimizer_specific_params[optimizer],
                        'multi_precision': False}

    if AMP:
        # update_on_kvstore : bool, default None
        #   Whether to perform parameter updates on kvstore. If None, then
        #   trainer will choose the more suitable option depending on the type
        #   of kvstore. If the `update_on_kvstore` argument is provided,
        #   environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
        # It is set to False here for dynamic loss scaling.
        trainer = gluon.Trainer(net.collect_params(), optimizer,
                                optimizer_params=optimizer_params,
                                update_on_kvstore=False)
        amp.init_trainer(trainer)
    else:
        trainer = gluon.Trainer(net.collect_params(), optimizer,
                                optimizer_params=optimizer_params)

    loss = GaussianYolov3Loss(sparse_label=True,
                              from_sigmoid=False,
                              batch_axis=None,
                              num_classes=num_classes,
                              reduction="sum",
                              exclude=False,
                              epsilon=1e-9)

    prediction = Prediction(
        from_sigmoid=False,
        num_classes=num_classes,
        nms_thresh=nms_thresh,
        nms_topk=nms_topk,
        except_class_thresh=except_class_thresh,
        multiperclass=multiperclass)

    precision_recall = Voc_2007_AP(iou_thresh=0.5, class_names=name_classes)

    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1), initial=start_epoch + 1, total=epoch):

        xcyc_loss_sum = 0
        wh_loss_sum = 0
        object_loss_sum = 0
        class_loss_sum = 0
        time_stamp = time.time()

        for batch_count, (image, _, xcyc_all, wh_all, objectness_all, class_all, weights_all, _) in enumerate(
                train_dataloader,
                start=1):
            td_batch_size = image.shape[0]

            # Split every array of the batch into `subdivision` sub-batches.
            batch_parts = [mx.nd.split(data=part, num_outputs=subdivision, axis=0)
                           for part in (image, xcyc_all, wh_all, objectness_all, class_all, weights_all)]
            if subdivision == 1:
                # Wrap each result so the zip below still iterates over sub-batches.
                batch_parts = [[part] for part in batch_parts]

            # autograd reference:
            # https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            with autograd.record(train_mode=True):

                xcyc_all_losses = []
                wh_all_losses = []
                object_all_losses = []
                class_all_losses = []

                for sub_batch in zip(*batch_parts):
                    device_parts = [_to_devices(part) for part in sub_batch]

                    xcyc_losses = []
                    wh_losses = []
                    object_losses = []
                    class_losses = []
                    total_loss = []

                    # One iteration per device (data parallelism).
                    for img, xcyc_target, wh_target, objectness, class_target, weights in zip(*device_parts):
                        # Only the three output maps are needed for the loss.
                        output1, output2, output3, *_ = net(img)
                        xcyc_loss, wh_loss, object_loss, class_loss = loss(output1, output2, output3,
                                                                           xcyc_target, wh_target, objectness,
                                                                           class_target, weights)
                        xcyc_losses.append(xcyc_loss.asscalar())
                        wh_losses.append(wh_loss.asscalar())
                        object_losses.append(object_loss.asscalar())
                        class_losses.append(class_loss.asscalar())
                        total_loss.append(xcyc_loss + wh_loss + object_loss + class_loss)

                    if AMP:
                        with amp.scale_loss(total_loss, trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    xcyc_all_losses.append(sum(xcyc_losses))
                    wh_all_losses.append(sum(wh_losses))
                    object_all_losses.append(sum(object_losses))
                    class_all_losses.append(sum(class_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)

            # Clear the accumulated gradients (grad_req is 'add').
            for p in net.collect_params().values():
                p.zero_grad()

            xcyc_loss_sum += sum(xcyc_all_losses) / td_batch_size
            wh_loss_sum += sum(wh_all_losses) / td_batch_size
            object_loss_sum += sum(object_all_losses) / td_batch_size
            class_loss_sum += sum(class_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                             f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                             f'[Lr = {trainer.learning_rate}]'
                             f'[xcyc loss = {sum(xcyc_all_losses) / td_batch_size:.3f}]'
                             f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]'
                             f'[obj loss = {sum(object_all_losses) / td_batch_size:.3f}]'
                             f'[class loss = {sum(class_all_losses) / td_batch_size:.3f}]')
            time_stamp = time.time()

        train_xcyc_loss_mean = np.divide(xcyc_loss_sum, train_update_number_per_epoch)
        train_wh_loss_mean = np.divide(wh_loss_sum, train_update_number_per_epoch)
        train_object_loss_mean = np.divide(object_loss_sum, train_update_number_per_epoch)
        train_class_loss_mean = np.divide(class_loss_sum, train_update_number_per_epoch)
        train_total_loss_mean = train_xcyc_loss_mean + train_wh_loss_mean + train_object_loss_mean + train_class_loss_mean

        logging.info(
            f"train xcyc loss : {train_xcyc_loss_mean} / "
            f"train wh loss : {train_wh_loss_mean} / "
            f"train object loss : {train_object_loss_mean} / "
            f"train class loss : {train_class_loss_mean} / "
            f"train total loss : {train_total_loss_mean}"
        )

        if i % eval_period == 0 and valid_list:

            xcyc_loss_sum = 0
            wh_loss_sum = 0
            object_loss_sum = 0
            class_loss_sum = 0

            # Compute the validation losses and feed the AP metric.
            for image, label, xcyc_all, wh_all, objectness_all, class_all, weights_all, _ in valid_dataloader:
                vd_batch_size = image.shape[0]
                device_parts = [_to_devices(part)
                                for part in (image, label, xcyc_all, wh_all,
                                             objectness_all, class_all, weights_all)]

                xcyc_losses = []
                wh_losses = []
                object_losses = []
                class_losses = []

                # One iteration per device (data parallelism).
                for img, lb, xcyc_target, wh_target, objectness, class_target, weights in zip(*device_parts):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]

                    output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net(
                        img)
                    pred_id, pred_score, pred_bbox = prediction(output1, output2, output3,
                                                                anchor1, anchor2, anchor3,
                                                                offset1, offset2, offset3,
                                                                stride1, stride2, stride3)

                    precision_recall.update(pred_bboxes=pred_bbox,
                                            pred_labels=pred_id,
                                            pred_scores=pred_score,
                                            gt_boxes=gt_box,
                                            gt_labels=gt_id)

                    xcyc_loss, wh_loss, object_loss, class_loss = loss(output1, output2, output3,
                                                                       xcyc_target, wh_target, objectness,
                                                                       class_target, weights)
                    xcyc_losses.append(xcyc_loss.asscalar())
                    wh_losses.append(wh_loss.asscalar())
                    object_losses.append(object_loss.asscalar())
                    class_losses.append(class_loss.asscalar())

                xcyc_loss_sum += sum(xcyc_losses) / vd_batch_size
                wh_loss_sum += sum(wh_losses) / vd_batch_size
                object_loss_sum += sum(object_losses) / vd_batch_size
                class_loss_sum += sum(class_losses) / vd_batch_size

            valid_xcyc_loss_mean = np.divide(xcyc_loss_sum, valid_update_number_per_epoch)
            valid_wh_loss_mean = np.divide(wh_loss_sum, valid_update_number_per_epoch)
            valid_object_loss_mean = np.divide(object_loss_sum, valid_update_number_per_epoch)
            valid_class_loss_mean = np.divide(class_loss_sum, valid_update_number_per_epoch)
            valid_total_loss_mean = valid_xcyc_loss_mean + valid_wh_loss_mean + valid_object_loss_mean + valid_class_loss_mean

            logging.info(
                f"valid xcyc loss : {valid_xcyc_loss_mean} / "
                f"valid wh loss : {valid_wh_loss_mean} / "
                f"valid object loss : {valid_object_loss_mean} / "
                f"valid class loss : {valid_class_loss_mean} / "
                f"valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list()
            for j, (c, p, r) in enumerate(zip(class_name, precision, recall)):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(f"class {j}'s {name} AP : {round(AP * 100, round_position)}%")
                AP_appender.append(AP)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender, mAP=mAP_result, folder_name=valid_graph_path, epoch=i)
            precision_recall.reset()

            if tensorboard:
                # Draw ground truth and predictions for one validation batch.
                image, label, *_ = next(iter(valid_dataloader))
                image = _to_devices(image)
                label = _to_devices(label)

                ground_truth_colors = {k: (0, 0, 1) for k in range(num_classes)}

                batch_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net(
                        img)
                    ids, scores, bboxes = prediction(output1, output2, output3,
                                                     anchor1, anchor2, anchor3,
                                                     offset1, offset2, offset3,
                                                     stride1, stride2, stride3)

                    for ig, gt_id, gt_box, pred_id, pred_score, pred_bbox in zip(img, gt_ids, gt_boxes,
                                                                                 ids, scores, bboxes):
                        # Undo the mean/std normalisation and scale to 0..255.
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(std, ctx=ig.context) + mx.nd.array(mean,
                                                                                                      ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)

                        # Draw the ground-truth boxes.
                        ground_truth = plot_bbox(ig, gt_box, scores=None, labels=gt_id, thresh=None,
                                                 reverse_rgb=True,
                                                 class_names=valid_dataset.classes,
                                                 absolute_coordinates=True,
                                                 colors=ground_truth_colors)
                        # Draw the predicted boxes on top.
                        prediction_box = plot_bbox(ground_truth, pred_bbox, scores=pred_score, labels=pred_id,
                                                   thresh=plot_class_thresh,
                                                   reverse_rgb=False,
                                                   class_names=valid_dataset.classes,
                                                   absolute_coordinates=True)

                        # For TensorBoard: BGR -> RGB, then
                        # (height, width, channel) -> (channel, height, width).
                        prediction_box = cv2.cvtColor(prediction_box, cv2.COLOR_BGR2RGB)
                        prediction_box = np.transpose(prediction_box, axes=(2, 0, 1))
                        batch_image.append(prediction_box)  # (batch, channel, height, width)

                summary.add_image(tag="valid_result", image=np.array(batch_image), global_step=i)
                summary.add_scalar(tag="xy_loss", value={"train_xcyc_loss": train_xcyc_loss_mean,
                                                         "valid_xcyc_loss": valid_xcyc_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="wh_loss", value={"train_wh_loss": train_wh_loss_mean,
                                                         "valid_wh_loss": valid_wh_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="object_loss", value={"train_object_loss": train_object_loss_mean,
                                                             "valid_object_loss": valid_object_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="class_loss", value={"train_class_loss": train_class_loss_mean,
                                                            "valid_class_loss": valid_class_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="total_loss", value={
                    "train_total_loss": train_total_loss_mean,
                    "valid_total_loss": valid_total_loss_mean},
                                   global_step=i)

                params = net.collect_params().values()
                if GPU_COUNT > 1:
                    for c in ctx:
                        for p in params:
                            summary.add_histogram(tag=p.name, values=p.data(ctx=c), global_step=i, bins='default')
                else:
                    for p in params:
                        summary.add_histogram(tag=p.name, values=p.data(), global_step=i, bins='default')

        if i % save_period == 0:

            if not os.path.exists(weight_path):
                os.makedirs(weight_path)

            # Hybrid models can be serialized as JSON files using the export
            # function. Export HybridBlock to json format that can be loaded
            # by SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            # When there are only one input, it will have name data. When
            # there Are more than one inputs, they will be named as data0,
            # data1, etc.
            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            postnet = PostNet(net=net, auxnet=prediction)
            try:
                net.export(os.path.join(weight_path, f"{model}"), epoch=i, remove_amp_cast=True)  # for onnx
                net.save_parameters(os.path.join(weight_path, f"{i}.params"))  # for onnx extraction
                # Network inference, decoder and NMS are all included, which is
                # convenient for MXNet C++; this variant cannot be exported to onnx.
                export_block_for_cplusplus(path=os.path.join(weight_path, f"{model}_prepost"),
                                           block=postnet,
                                           data_shape=tuple(input_size) + tuple((3,)),
                                           epoch=i,
                                           preprocess=True,  # images read by opencv can be fed as-is from C++
                                           layout='HWC',
                                           ctx=context,
                                           remove_amp_cast=True)
            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
                # Put the parameters back on the training device(s).
                net.collect_params().reset_ctx(ctx)

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
def train_net(config):
    """Train an FCOS detector with MXNet, optionally distributed through Horovod.

    The function builds the network, initialises or resumes its parameters,
    creates the dataset and loader, selects the trainable parameters, and
    then trains until the optimizer's update counter exceeds
    ``config.TRAIN.end_epoch * len(train_loader)``.  Checkpoints and trainer
    states are written to ``config.TRAIN.log_path``.

    Args:
        config: Configuration object.  The fields read here are ``TRAIN.*``,
            ``network.*``, ``dataset.*``, ``gpus`` and ``use_hvd``.

    Raises:
        AssertionError: If ``config.dataset.dataset_type`` is neither
            ``"coco"`` nor ``"voc"``.
    """
    import re

    mx.random.seed(3)
    np.random.seed(3)

    if config.TRAIN.USE_FP16:
        from mxnet.contrib import amp
        amp.init()
    if config.use_hvd:
        import horovod.mxnet as hvd

    ctx_list = [mx.gpu(x) for x in config.gpus]
    from utils.blocks import FrozenBatchNorm2d
    neck = PyramidNeckFCOS(feature_dim=config.network.fpn_neck_feature_dim)
    backbone = build_backbone(config, neck=neck, norm_layer=FrozenBatchNorm2d, **config.network.BACKBONE.kwargs)
    net = FCOSFPNNet(backbone, config.dataset.NUM_CLASSES)

    # Resume parameters.
    resume = None
    if resume is not None:
        # Build a renamed copy instead of popping and re-inserting keys while
        # iterating, which is not safe on a dict.
        params_coco = {k.replace("arg:", "").replace("aux:", ""): v
                       for k, v in mx.nd.load(resume).items()}
        params = net.collect_params()

        for k in params.keys():
            try:
                params[k]._load_init(params_coco[k.replace('resnet0_', '')], ctx=mx.cpu())
                print("success load {}".format(k))
            except Exception as e:
                logging.exception(e)

    if config.TRAIN.resume is not None:
        net.collect_params().load(config.TRAIN.resume)
        logging.info("loaded resume from {}".format(config.TRAIN.resume))

    # Initialize parameters
    params = net.collect_params()
    from utils.initializer import KaMingUniform
    for key in params.keys():
        param = params[key]
        if param._data is None:
            default_init = mx.init.Zero() if "bias" in key or "offset" in key else KaMingUniform()
            default_init.set_verbosity(True)
            if param.init is not None and hasattr(param.init, "set_verbosity"):
                param.init.set_verbosity(True)
                param.initialize(init=param.init, default_init=param.init)
            else:
                param.initialize(default_init=default_init)
    params = net.collect_params()
    # for p_name, p in params.items():
    #     if p_name.endswith(('_bias')):
    #         p.wd_mult = 0
    #         p.lr_mult = 2
    #         logging.info("set wd_mult of {} to {}.".format(p_name, p.wd_mult))
    #         logging.info("set lr_mult of {} to {}.".format(p_name, p.lr_mult))

    net.collect_params().reset_ctx(list(set(ctx_list)))

    if config.dataset.dataset_type == "coco":
        from data.bbox.mscoco import COCODetection
        base_train_dataset = COCODetection(root=config.dataset.dataset_path, splits=("instances_train2017",),
                                           h_flip=config.TRAIN.FLIP, transform=None, use_crowd=False)
    elif config.dataset.dataset_type == "voc":
        from data.bbox.voc import VOCDetection
        base_train_dataset = VOCDetection(root=config.dataset.dataset_path,
                                          splits=((2007, 'trainval'), (2012, 'trainval')),
                                          preload_label=False)
    else:
        # Raised explicitly (same exception type as the former `assert False`)
        # so the check is not stripped when Python runs with -O.
        raise AssertionError(
            "unsupported dataset type: {}".format(config.dataset.dataset_type))
    train_dataset = AspectGroupingDataset(base_train_dataset, config,
                                          target_generator=FCOSTargetGenerator(config))

    if config.use_hvd:
        class SplitDataset(object):
            """Strided view of a dataset exposing one local worker's samples."""

            def __init__(self, da, local_size, local_rank):
                self.da = da
                self.local_size = local_size
                self.local_rank = local_rank

            def __len__(self):
                return len(self.da) // self.local_size

            def __getitem__(self, idx):
                return self.da[idx * self.local_size + self.local_rank]

        train_dataset = SplitDataset(train_dataset, local_size=hvd.local_size(), local_rank=hvd.local_rank())

    train_loader = mx.gluon.data.DataLoader(dataset=train_dataset, batch_size=1,
                                            num_workers=8, last_batch="discard", shuffle=True,
                                            thread_pool=False, batchify_fn=batch_fn)

    # Select the parameters handed to the trainer.
    params_all = net.collect_params()
    params_to_train = {}
    params_fixed_prefix = config.network.FIXED_PARAMS
    for p in params_all.keys():
        ignore = False
        if params_all[p].grad_req == "null" and "running" not in p:
            ignore = True
            logging.info("ignore {} because its grad req is set to null.".format(p))
        if params_fixed_prefix is not None:
            for f in params_fixed_prefix:
                if re.match(f, str(p)) is not None:
                    ignore = True
                    params_all[p].grad_req = 'null'
                    logging.info("{} is ignored when training because it matches {}.".format(p, f))
        if not ignore and params_all[p].grad_req != "null":
            params_to_train[p] = params_all[p]

    lr_steps = [len(train_loader) * int(x) for x in config.TRAIN.lr_step]
    logging.info(lr_steps)
    lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(step=lr_steps,
                                                        warmup_mode="constant", factor=.1,
                                                        base_lr=config.TRAIN.lr,
                                                        warmup_steps=config.TRAIN.warmup_step,
                                                        warmup_begin_lr=config.TRAIN.warmup_lr)

    # Both trainer variants use the same SGD options.
    optimizer_params = {
        'wd': config.TRAIN.wd,
        'momentum': config.TRAIN.momentum,
        'clip_gradient': None,
        'lr_scheduler': lr_scheduler,
        'multi_precision': True,
    }
    if config.use_hvd:
        hvd.broadcast_parameters(net.collect_params(), root_rank=0)
        trainer = hvd.DistributedTrainer(params_to_train, 'sgd', optimizer_params)
    else:
        trainer = mx.gluon.Trainer(
            params_to_train,  # fix batchnorm, fix first stage, etc...
            'sgd',
            optimizer_params,
            update_on_kvstore=(False if config.TRAIN.USE_FP16 else None),
            kvstore=mx.kvstore.create('local'))
    if config.TRAIN.USE_FP16:
        # amp.scale_loss is used below for both trainer variants.
        amp.init_trainer(trainer)
    # trainer = mx.gluon.Trainer(
    #     params_to_train,  # fix batchnorm, fix first stage, etc...
    #     'adam', {"learning_rate": 4e-4})

    # Please note that the GPU devices of the trainer states when saving must be same with that when loading.
    if config.TRAIN.trainer_resume is not None:
        trainer.load_states(config.TRAIN.trainer_resume)
        logging.info("loaded trainer states from {}.".format(config.TRAIN.trainer_resume))

    metric_loss_loc = mx.metric.Loss(name="loss_loc")
    metric_loss_cls = mx.metric.Loss(name="loss_cls")
    metric_loss_center = mx.metric.Loss(name="loss_center")
    eval_metrics = mx.metric.CompositeEvalMetric()
    for child_metric in [metric_loss_loc, metric_loss_cls, metric_loss_center]:
        eval_metrics.add(child_metric)

    net.hybridize(static_alloc=True, static_shape=False)

    def _pad_to_32(x):
        """Round ``x`` up to the next multiple of 32."""
        return int(np.ceil(x / 32) * 32)

    # Run one forward/backward pass per device on random data before training.
    for ctx in ctx_list:
        with ag.record():
            warmup_out = net(mx.nd.random.randn(config.TRAIN.batch_size // len(ctx_list),
                                                int(_pad_to_32(config.TRAIN.image_max_long_size + 32)),
                                                int(_pad_to_32(config.TRAIN.image_short_size + 32)),
                                                3, ctx=ctx))
            ag.backward(warmup_out)
        del warmup_out
        net.collect_params().zero_grad()
    mx.nd.waitall()

    # Only this worker reports metrics and writes checkpoints.
    is_primary = not config.use_hvd or hvd.local_rank() == 0

    while trainer.optimizer.num_update <= config.TRAIN.end_epoch * len(train_loader):
        epoch = trainer.optimizer.num_update // len(train_loader)
        for data_batch in (tqdm.tqdm(train_loader) if is_primary else train_loader):
            if config.use_hvd:
                data_list = [data_batch[0].as_in_context(ctx_list[0])]
                targets_list = [data_batch[1].as_in_context(ctx_list[0])]
            elif isinstance(data_batch[0], mx.nd.NDArray):
                data_list = mx.gluon.utils.split_and_load(mx.nd.array(data_batch[0]), ctx_list=ctx_list,
                                                          batch_axis=0)
                targets_list = mx.gluon.utils.split_and_load(mx.nd.array(data_batch[1]), ctx_list=ctx_list,
                                                             batch_axis=0)
            else:
                data_list = mx.gluon.utils.split_and_load(mx.nd.array(data_batch[0][0]), ctx_list=ctx_list,
                                                          batch_axis=0)
                targets_list = mx.gluon.utils.split_and_load(mx.nd.array(data_batch[0][1]), ctx_list=ctx_list,
                                                             batch_axis=0)

            losses_loc = []
            losses_center_ness = []
            losses_cls = []

            n_workers = hvd.local_size() if config.use_hvd else len(ctx_list)
            # NOTE(review): the normalisers are always read from
            # data_batch[0][1], although the targets come from data_batch[1]
            # in the Horovod and plain-NDArray branches above - confirm this
            # matches the batch layout produced by batch_fn.
            num_pos = data_batch[0][1][:, 0].sum() / n_workers
            # Floor the denominators at one to avoid dividing by zero.
            num_pos_denominator = mx.nd.maximum(num_pos, mx.nd.ones_like(num_pos))
            centerness_sum = data_batch[0][1][:, 5].sum() / n_workers
            centerness_sum_denominator = mx.nd.maximum(centerness_sum, mx.nd.ones_like(centerness_sum))

            with ag.record():
                for data, targets in zip(data_list, targets_list):
                    num_pos_denominator_ctx = num_pos_denominator.as_in_context(data.context)
                    centerness_sum_denominator_ctx = centerness_sum_denominator.as_in_context(data.context)
                    loc_preds, cls_preds = net(data)
                    iou_loss = mobula.op.IoULoss(loc_preds[:, :4], targets[:, 1:5], axis=1)
                    iou_loss = iou_loss * targets[:, 5:6] / centerness_sum_denominator_ctx
                    # iou_loss = IoULoss()(loc_preds[:, :4].exp(), targets[:, 1:5]) * targets[:, 5] / centerness_sum_denominator_ctx
                    loss_center = mobula.op.BCELoss(loc_preds[:, 4], targets[:, 5]) * targets[:, 0] / num_pos_denominator_ctx
                    loss_cls = mobula.op.FocalLoss(alpha=.25, gamma=2, logits=cls_preds,
                                                   targets=targets[:, 6:]) / num_pos_denominator_ctx
                    loss_total = loss_center.sum() + iou_loss.sum() + loss_cls.sum()
                    if config.TRAIN.USE_FP16:
                        with amp.scale_loss(loss_total, trainer) as scaled_losses:
                            ag.backward(scaled_losses)
                    else:
                        loss_total.backward()
                    losses_loc.append(iou_loss)
                    losses_center_ness.append(loss_center)
                    losses_cls.append(loss_cls)

            trainer.step(n_workers)
            if is_primary:
                for l in losses_loc:
                    metric_loss_loc.update(None, l.sum())
                for l in losses_center_ness:
                    metric_loss_center.update(None, l.sum())
                for l in losses_cls:
                    metric_loss_cls.update(None, l.sum())
                if trainer.optimizer.num_update % config.TRAIN.log_interval == 0:
                    msg = "Epoch={},Step={},lr={}, ".format(epoch, trainer.optimizer.num_update,
                                                            trainer.learning_rate)
                    msg += ','.join(['{}={:.3f}'.format(w, v) for w, v in zip(*eval_metrics.get())])
                    logging.info(msg)
                    eval_metrics.reset()

                # Intermediate checkpoint every 5000 optimizer updates.
                if trainer.optimizer.num_update % 5000 == 0:
                    save_path = os.path.join(config.TRAIN.log_path,
                                             "{}-{}.params".format(epoch, trainer.optimizer.num_update))
                    net.collect_params().save(save_path)
                    logging.info("Saved checkpoint to {}".format(save_path))
                    trainer_path = save_path + "-trainer.states"
                    trainer.save_states(trainer_path)

        # Checkpoint after each pass over the loader.
        if is_primary:
            save_path = os.path.join(config.TRAIN.log_path, "{}.params".format(epoch))
            net.collect_params().save(save_path)
            logging.info("Saved checkpoint to {}".format(save_path))
            trainer_path = save_path + "-trainer.states"
            trainer.save_states(trainer_path)