# --- smoke-test fragment (truncated in the source): builds the dataset, network
# --- and decoder on CPU to sanity-check shapes before training.
root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
input_size = (512, 512)  # assumed from the run() defaults below; the original definition is in the truncated part above
transform = EfficientTrainTransform(input_size[0], input_size[1], make_target=False)
dataset = DetectionDataset(path=os.path.join(root, 'Dataset', 'train'), transform=transform)
num_classes = dataset.num_class
image, label, _, _, _ = dataset[0]
label = mx.nd.array(label)

net = Efficient(version=0,
                input_size=input_size,
                anchor_sizes=[32, 64, 128, 256, 512],
                anchor_size_ratios=[1, pow(2, 1 / 3), pow(2, 2 / 3)],
                anchor_aspect_ratios=[0.5, 1, 2],
                num_classes=num_classes,  # foreground classes only
                anchor_box_offset=(0.5, 0.5),
                anchor_box_clip=True,
                alloc_size=[256, 256],
                ctx=mx.cpu())
net.hybridize(active=True, static_alloc=True, static_shape=True)

pred = Prediction(batch_size=8,
                  from_sigmoid=False,
                  means=(0., 0., 0., 0.),
                  stds=(0.1, 0.1, 0.2, 0.2),
                  num_classes=num_classes,
                  decode_number=1000,
                  nms_thresh=0.5,
                  # the call is truncated in the source; the remaining arguments are
                  # assumed from the defaults used by run() below
                  nms_topk=500,
                  except_class_thresh=0.05,
                  multiperclass=True)
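# Anchor-count note: with the configuration above, each feature-map cell gets
# len(anchor_size_ratios) * len(anchor_aspect_ratios) = 3 * 3 = 9 anchors per
# pyramid level (the RetinaNet-style layout), e.g.:
#
#   anchors_per_cell = len([1, pow(2, 1 / 3), pow(2, 2 / 3)]) * len([0.5, 1, 2])  # -> 9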
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        anchor_alloc_size=[256, 256],
        anchor_sizes=[32, 64, 128, 256, 512],
        anchor_size_ratios=[1, pow(2, 1 / 3), pow(2, 2 / 3)],
        anchor_aspect_ratios=[0.5, 1, 2],
        anchor_box_clip=True,
        graphviz=True,
        epoch=100,
        input_size=[512, 512],
        batch_log=100,
        batch_size=16,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        foreground_iou_thresh=0.5,
        background_iou_thresh=0.4,
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        weight_decay=0.000001,
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base=0,
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        valid_html_auto_open=True,
        using_mlflow=True,
        decode_number=5000,
        multiperclass=True,
        nms_thresh=0.5,
        nms_topk=500,
        iou_thresh=0.5,
        except_class_thresh=0.05,
        plot_class_thresh=0.5):

    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # log the operating system (Linux / Windows / other)
    logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB')
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB')
        else:
            logging.info(f'Running on {ctx}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than the number of GPUs")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training Efficient Detector")
    input_shape = (1, 3) + tuple(input_size)

    # temporary CPU network; the dataloaders use it to generate anchors and targets
    net = Efficient(version=base,
                    anchor_sizes=anchor_sizes,
                    anchor_size_ratios=anchor_size_ratios,
                    anchor_aspect_ratios=anchor_aspect_ratios,
                    anchor_box_clip=anchor_box_clip,
                    alloc_size=anchor_alloc_size,
                    ctx=mx.cpu())

    train_dataloader, train_dataset = traindataloader(multiscale=multiscale,
                                                      factor_scale=factor_scale,
                                                      augmentation=data_augmentation,
                                                      path=train_dataset_path,
                                                      input_size=input_size,
                                                      batch_size=batch_size,
                                                      batch_interval=batch_interval,
                                                      num_workers=num_workers,
                                                      shuffle=True,
                                                      mean=mean,
                                                      std=std,
                                                      net=net,
                                                      foreground_iou_thresh=foreground_iou_thresh,
                                                      background_iou_thresh=background_iou_thresh,
                                                      make_target=True)
    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size is larger than the number of training samples")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_dataloader, valid_dataset = validdataloader(path=valid_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=valid_size,
                                                          num_workers=num_workers,
                                                          shuffle=True,
                                                          mean=mean,
                                                          std=std,
                                                          net=net,
                                                          foreground_iou_thresh=foreground_iou_thresh,
                                                          background_iou_thresh=background_iou_thresh,
                                                          make_target=True)
        valid_update_number_per_epoch = len(valid_dataloader)
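        # Target-encoding sketch (assumed from the IoU thresholds above and the masks
        # used in the losses below): anchors with IoU >= foreground_iou_thresh receive
        # their class id (> 0), anchors with IoU < background_iou_thresh receive 0
        # (background), and anchors in between receive -1 and are ignored, e.g.:
        #
        #   cls_target > -1   # everything except ignored anchors -> confidence loss mask
        #   cls_target > 0    # foreground anchors only           -> localization loss mask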
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size is larger than the number of validation samples")
            exit(0)

    num_classes = train_dataset.num_class  # number of classes
    name_classes = train_dataset.classes
    optimizer = optimizer.upper()

    model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_EFF_" + str(base)
    weight_path = os.path.join("weights", f"{model}")
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')
    optimizer_path = os.path.join(weight_path, f'{model}-{load_period:04d}.opt')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)}\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'], param_path, ctx=ctx)
    else:
        start_epoch = 0
        net = Efficient(version=base,
                        input_size=input_size,
                        anchor_sizes=anchor_sizes,
                        anchor_size_ratios=anchor_size_ratios,
                        anchor_aspect_ratios=anchor_aspect_ratios,
                        num_classes=num_classes,  # foreground classes only
                        anchor_box_clip=anchor_box_clip,
                        alloc_size=anchor_alloc_size,
                        ctx=ctx)

    if isinstance(ctx, (list, tuple)):
        net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
    else:
        net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))

    '''
    active (bool, default True) – Whether to turn hybrid on or off.
    static_alloc (bool, default False) – Statically allocate memory to improve speed.
        Memory usage may increase.
    static_shape (bool, default False) – Optimize for invariant input shapes between
        iterations. Must also set static_alloc to True. Change of input shapes is still
        allowed but slower.
    '''
    if multiscale:
        # input shape changes between iterations, so static_shape must stay False
        net.hybridize(active=True, static_alloc=True, static_shape=False)
    else:
        net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch >= epoch:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model),
                                max_queue=10,
                                flush_secs=10,
                                verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)

    if graphviz:
        gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model)

    # optimizer & learning-rate schedule
    unit = 1 if (len(train_dataset) // batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step,
                                             factor=decay_lr,
                                             stop_factor_lr=1e-12,
                                             base_lr=learning_rate)

    # accumulate gradients across the `subdivision` sub-batches of each batch
    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    '''
    update_on_kvstore : bool, default None
        Whether to perform parameter updates on kvstore. If None, then trainer will choose
        the more suitable option depending on the type of kvstore. If the `update_on_kvstore`
        argument is provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
    '''
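    # Gradient-accumulation sketch (what grad_req='add' above enables): each batch is
    # split into `subdivision` sub-batches whose gradients are summed in-place, then a
    # single optimizer step is taken for the whole batch, e.g.:
    #
    #   with autograd.record():
    #       for sub in sub_batches:            # pseudocode for the training loop below
    #           autograd.backward(loss(sub))   # gradients add up, they are not overwritten
    #   trainer.step(batch_size)               # one update per batch
    #   for p in net.collect_params().values():
    #       p.zero_grad()                      # must be cleared manually when grad_req='add'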
    if optimizer == "ADAM":
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params={"learning_rate": learning_rate,
                                                  "lr_scheduler": lr_sch,
                                                  "wd": weight_decay,
                                                  "beta1": 0.9,
                                                  "beta2": 0.999,
                                                  'multi_precision': False},
                                update_on_kvstore=False if AMP else None)  # for dynamic loss scaling
    elif optimizer == "RMSPROP":
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params={"learning_rate": learning_rate,
                                                  "lr_scheduler": lr_sch,
                                                  "wd": weight_decay,
                                                  "gamma1": 0.9,
                                                  "gamma2": 0.999,
                                                  'multi_precision': False},
                                update_on_kvstore=False if AMP else None)  # for dynamic loss scaling
    elif optimizer == "SGD":
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params={"learning_rate": learning_rate,
                                                  "lr_scheduler": lr_sch,
                                                  "wd": weight_decay,
                                                  "momentum": 0.9,
                                                  'multi_precision': False},
                                update_on_kvstore=False if AMP else None)  # for dynamic loss scaling
    else:
        logging.error("optimizer not selected")
        exit(0)

    if AMP:
        amp.init_trainer(trainer)

    # load optimizer state
    if os.path.exists(optimizer_path):
        try:
            trainer.load_states(optimizer_path)
        except Exception as E:
            logging.info(E)
        else:
            logging.info(f"loading {os.path.basename(optimizer_path)}\n")

    '''
    localization loss -> Smooth L1 loss
    confidence loss -> Focal loss
    '''
    confidence_loss = FocalLoss(alpha=0.25,
                                gamma=2,
                                sparse_label=True,
                                from_sigmoid=False,
                                batch_axis=None,
                                num_class=num_classes,
                                reduction="sum",
                                exclude=False)
    localization_loss = HuberLoss(rho=1,
                                  batch_axis=None,
                                  reduction="sum",
                                  exclude=False)

    prediction = Prediction(batch_size=batch_size,
                            from_sigmoid=False,
                            num_classes=num_classes,
                            decode_number=decode_number,
                            nms_thresh=nms_thresh,
                            nms_topk=nms_topk,
                            except_class_thresh=except_class_thresh,
                            multiperclass=multiperclass)

    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh, class_names=name_classes)

    ctx_list = ctx if isinstance(ctx, (list, tuple)) else [ctx]
    start_time = time.time()

    for i in tqdm(range(start_epoch + 1, epoch + 1, 1),
                  initial=start_epoch + 1,
                  total=epoch):

        conf_loss_sum = 0
        loc_loss_sum = 0
        time_stamp = time.time()

        for batch_count, (image, _, cls_all, box_all, _) in enumerate(train_dataloader, start=1):
            td_batch_size = image.shape[0]

            image = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            cls_all = mx.nd.split(data=cls_all, num_outputs=subdivision, axis=0)
            box_all = mx.nd.split(data=box_all, num_outputs=subdivision, axis=0)

            if subdivision == 1:
                # mx.nd.split with num_outputs=1 returns a single NDArray, not a list
                image = [image]
                cls_all = [cls_all]
                box_all = [box_all]

            '''
            autograd explanation:
            https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            '''
            with autograd.record(train_mode=True):
                cls_all_losses = []
                box_all_losses = []
                for image_split, cls_split, box_split in zip(image, cls_all, box_all):
                    image_split = gluon.utils.split_and_load(image_split, ctx_list, even_split=False)
                    cls_split = gluon.utils.split_and_load(cls_split, ctx_list, even_split=False)
                    box_split = gluon.utils.split_and_load(box_split, ctx_list, even_split=False)

                    # prediction, target space for Data Parallelism
                    cls_losses = []
                    box_losses = []
                    total_loss = []

                    # handles N GPUs (Data Parallelism)
                    for img, cls_target, box_target in zip(image_split, cls_split, box_split):
                        cls_pred, box_pred, anchor = net(img)
                        except_ignore_samples = cls_target > -1
                        positive_samples = cls_target > 0
                        positive_numbers = positive_samples.sum()

                        conf_loss = confidence_loss(cls_pred, cls_target,
                                                    except_ignore_samples.expand_dims(axis=-1))
                        # normalize by the number of positive anchors (+1 avoids division by zero)
                        conf_loss = mx.nd.divide(conf_loss, positive_numbers + 1)
                        cls_losses.append(conf_loss.asscalar())
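                        # Focal loss sketch (from the FocalLoss arguments above):
                        #   FL(p_t) = -alpha * (1 - p_t)^gamma * log(p_t), alpha=0.25, gamma=2,
                        # which down-weights easy negatives so the many background anchors
                        # do not dominate the confidence loss.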
                        loc_loss = localization_loss(box_pred, box_target,
                                                     positive_samples.expand_dims(axis=-1))
                        box_losses.append(loc_loss.asscalar())
                        total_loss.append(conf_loss + loc_loss)

                    if AMP:
                        with amp.scale_loss(total_loss, trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    cls_all_losses.append(sum(cls_losses))
                    box_all_losses.append(sum(box_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)

            # clear accumulated gradients (required because grad_req='add')
            for p in net.collect_params().values():
                p.zero_grad()

            conf_loss_sum += sum(cls_all_losses) / td_batch_size
            loc_loss_sum += sum(box_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                             f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                             f'[Lr = {trainer.learning_rate}]'
                             f'[confidence loss = {sum(cls_all_losses) / td_batch_size:.3f}]'
                             f'[localization loss = {sum(box_all_losses) / td_batch_size:.3f}]')
            time_stamp = time.time()

        train_conf_loss_mean = np.divide(conf_loss_sum, train_update_number_per_epoch)
        train_loc_loss_mean = np.divide(loc_loss_sum, train_update_number_per_epoch)
        train_total_loss_mean = train_conf_loss_mean + train_loc_loss_mean
        logging.info(
            f"train confidence loss : {train_conf_loss_mean} / "
            f"train localization loss : {train_loc_loss_mean} / "
            f"train total loss : {train_total_loss_mean}")

        if i % save_period == 0:

            weight_epoch_path = os.path.join(weight_path, str(i))
            if not os.path.exists(weight_epoch_path):
                os.makedirs(weight_epoch_path)

            # save optimizer state
            try:
                trainer.save_states(os.path.join(weight_path, f'{model}-{i:04d}.opt'))
            except Exception as E:
                logging.error(f"optimizer weight export failed : {E}")
            else:
                logging.info("optimizer weight export succeeded")

            '''
            Hybrid models can be serialized as JSON files using the export function.
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports,
            mxnet.mod.Module or the C++ interface.
            When there is only one input, it will be named data. When there are more than
            one input, they will be named data0, data1, etc.
            '''
            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            '''
            In MXNet 1.6.0 with AMP enabled, reusing the `prediction` block declared above
            can cause problems (it does for yolo v3 and gaussian yolo v3). In MXNet 1.5.x
            it works without re-declaring. Blocks passed as function arguments are passed
            by reference (they are not copied), so when export_block_for_cplusplus
            hybridizes its prediction block, the previously declared `prediction` is also
            hybridized and turned into a symbol. Given this behavior, it seems right to
            declare a fresh Prediction block here, as below.
            '''
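            # Loading sketch for the exported model (file names assumed from the usual
            # MXNet export naming convention of the calls below; 'data' is the input name):
            #
            #   net = gluon.SymbolBlock.imports(f"{model}_prepost-symbol.json", ['data'],
            #                                   f"{model}_prepost-{i:04d}.params", ctx=mx.cpu())
            #   ids, scores, bboxes = net(image)  # HWC input; preprocessing is baked in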
            auxnet = Prediction(from_sigmoid=False,
                                num_classes=num_classes,
                                decode_number=decode_number,
                                nms_thresh=nms_thresh,
                                nms_topk=nms_topk,
                                except_class_thresh=except_class_thresh,
                                multiperclass=multiperclass)
            postnet = PostNet(net=net, auxnet=auxnet)

            try:
                net.export(os.path.join(weight_path, f"{model}"),
                           epoch=i,
                           remove_amp_cast=True)
                net.save_parameters(os.path.join(weight_path, f"{i}.params"))  # for ONNX export
                # exports network inference, decoding and NMS in one graph - convenient for MXNet C++
                export_block_for_cplusplus(path=os.path.join(weight_epoch_path, f"{model}_prepost"),
                                           block=postnet,
                                           data_shape=tuple(input_size) + tuple((3,)),
                                           epoch=i,
                                           preprocess=True,  # at C++ inference time, feed the image read by OpenCV as-is
                                           layout='HWC',
                                           ctx=context,
                                           remove_amp_cast=True)
            except Exception as E:
                logging.error(f"json, param model export failed : {E}")
            else:
                logging.info("json, param model export succeeded")
            net.collect_params().reset_ctx(ctx)

        if i % eval_period == 0 and valid_list:

            conf_loss_sum = 0
            loc_loss_sum = 0

            # compute validation loss
            for image, label, cls_all, box_all, _ in valid_dataloader:
                vd_batch_size = image.shape[0]
                image = gluon.utils.split_and_load(image, ctx_list, even_split=False)
                label = gluon.utils.split_and_load(label, ctx_list, even_split=False)
                cls_all = gluon.utils.split_and_load(cls_all, ctx_list, even_split=False)
                box_all = gluon.utils.split_and_load(box_all, ctx_list, even_split=False)

                # prediction, target space for Data Parallelism
                cls_losses = []
                box_losses = []

                # handles N GPUs (Data Parallelism)
                for img, lb, cls_target, box_target in zip(image, label, cls_all, box_all):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    id, score, bbox = prediction(cls_pred, box_pred, anchor)

                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box,
                                            gt_labels=gt_id)

                    except_ignore_samples = cls_target > -1
                    positive_samples = cls_target > 0
                    positive_numbers = positive_samples.sum()

                    conf_loss = confidence_loss(cls_pred, cls_target,
                                                except_ignore_samples.expand_dims(axis=-1))
                    conf_loss = mx.nd.divide(conf_loss, positive_numbers + 1)
                    cls_losses.append(conf_loss.asscalar())

                    loc_loss = localization_loss(box_pred, box_target,
                                                 positive_samples.expand_dims(axis=-1))
                    box_losses.append(loc_loss.asscalar())

                conf_loss_sum += sum(cls_losses) / vd_batch_size
                loc_loss_sum += sum(box_losses) / vd_batch_size

            valid_conf_loss_mean = np.divide(conf_loss_sum, valid_update_number_per_epoch)
            valid_loc_loss_mean = np.divide(loc_loss_sum, valid_update_number_per_epoch)
            valid_total_loss_mean = valid_conf_loss_mean + valid_loc_loss_mean

            logging.info(
                f"valid confidence loss : {valid_conf_loss_mean} / "
                f"valid localization loss : {valid_loc_loss_mean} / "
                f"valid total loss : {valid_total_loss_mean}")

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list()
            for j, c, p, r in zip(range(len(recall)), class_name, precision, recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(f"class {j}'s {name} AP : {round(AP * 100, round_position)}%")
                AP_appender.append(AP)

            AP_appender = np.nan_to_num(AP_appender)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i,
                                          auto_open=valid_html_auto_open)
            precision_recall.reset()

            if tensorboard:
                # draw one validation batch with boxes; handles N GPUs (Data Parallelism)
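                # Denormalization note (inverse of the dataloader's mean/std normalization,
                # applied when drawing below): pixel = (normalized * std + mean) * 255,
                # clipped to [0, 255] and cast to uint8 before plotting.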
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _ = next(dataloader_iter)
                image = gluon.utils.split_and_load(image, ctx_list, even_split=False)
                label = gluon.utils.split_and_load(label, ctx_list, even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 1, 0)  # draw all ground truth boxes in green

                batch_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    ids, scores, bboxes = prediction(cls_pred, box_pred, anchor)

                    for ig, gt_id, gt_box, id, score, bbox in zip(img, gt_ids, gt_boxes, ids, scores, bboxes):
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(std, ctx=ig.context) + mx.nd.array(mean, ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)
                        ig = ig.astype(np.uint8)

                        # draw ground truth boxes
                        ground_truth = plot_bbox(ig,
                                                 gt_box,
                                                 scores=None,
                                                 labels=gt_id,
                                                 thresh=None,
                                                 reverse_rgb=False,
                                                 class_names=valid_dataset.classes,
                                                 absolute_coordinates=True,
                                                 colors=ground_truth_colors)
                        # draw prediction boxes
                        prediction_box = plot_bbox(ground_truth,
                                                   bbox,
                                                   scores=score,
                                                   labels=id,
                                                   thresh=plot_class_thresh,
                                                   reverse_rgb=False,
                                                   class_names=valid_dataset.classes,
                                                   absolute_coordinates=True)

                        # for TensorBoard: (height, width, channel) -> (channel, height, width)
                        prediction_box = np.transpose(prediction_box, axes=(2, 0, 1))
                        batch_image.append(prediction_box)  # (batch, channel, height, width)

                summary.add_image(tag="valid_result",
                                  image=np.array(batch_image),
                                  global_step=i)

                summary.add_scalar(tag="conf_loss",
                                   value={"train_conf_loss": train_conf_loss_mean,
                                          "valid_conf_loss": valid_conf_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="loc_loss",
                                   value={"train_loc_loss": train_loc_loss_mean,
                                          "valid_loc_loss": valid_loc_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="total_loss",
                                   value={"train_total_loss": train_total_loss_mean,
                                          "valid_total_loss": valid_total_loss_mean},
                                   global_step=i)

                for p in net.collect_params().values():
                    summary.add_histogram(tag=p.name,
                                          values=p.data(ctx=ctx_list[0]),
                                          global_step=i,
                                          bins='default')

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : approx. {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
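# Usage sketch (illustrative; argument values mirror the defaults of run() above):
#
#   if __name__ == "__main__":
#       run(epoch=100, input_size=[512, 512], batch_size=16, subdivision=4,
#           GPU_COUNT=0, base=0, AMP=False)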