示例#1
0
 def check_add_histogram(data):
     """Write ``data`` as a 100-bin histogram event, then verify and clean up.

     A ``SummaryWriter`` is opened on ``_LOGDIR``, one histogram tagged
     ``'test_add_histogram'`` is logged at global step 0, the writer is
     closed, and ``check_event_file_and_remove_logdir()`` is called.

     Args:
         data: Values forwarded unchanged to ``SummaryWriter.add_histogram``.
     """
     sw = SummaryWriter(logdir=_LOGDIR)
     try:
         sw.add_histogram(tag='test_add_histogram',
                          values=data,
                          global_step=0,
                          bins=100)
     finally:
         # Close even when add_histogram raises so the writer is not leaked.
         sw.close()
     check_event_file_and_remove_logdir()
示例#2
0
 def __call__(self, mxb_writer: mxboard.SummaryWriter,
              samples_processed: int, *args, **kwargs):
     """Log gradient histograms once more than ``self._freq`` samples have passed.

     Nothing is written unless ``samples_processed`` exceeds the value stored
     at the previous logging call by more than ``self._freq``. When it does,
     ``self._last_call`` is updated and, for every entry of ``self._params``
     whose ``grad_req`` is not ``'null'``, the gradient is converted to a
     NumPy array and written as a 10-bin histogram under the parameter's key.

     Args:
         mxb_writer: Writer whose ``add_histogram`` receives the gradients.
         samples_processed: Running sample count; also used as the step value.
         *args: Ignored.
         **kwargs: Ignored.
     """
     samples_since_last_log = samples_processed - self._last_call
     if samples_since_last_log <= self._freq:
         return

     self._last_call = samples_processed
     for name, param in self._params.items():
         # Parameters with grad_req == 'null' are skipped.
         if param.grad_req == 'null':
             continue
         gradient = param.grad().asnumpy()
         mxb_writer.add_histogram(name, gradient, samples_processed, bins=10)
示例#3
0
 def train(self, train_data, valid_data, batch_size, n_epoch, acc):
     """Train the network and checkpoint it whenever validation loss improves.

     The validation loss is measured once before training and used as the
     initial best value (the untrained parameters are saved as
     ``best.model``). After every epoch the validation loss and accuracy are
     recomputed; a lower validation loss overwrites ``best.model``. The final
     parameters are always saved as ``last.model``. Every 1000 global steps
     histograms of the loss and of the network output are written to
     ``./logs``.

     Args:
         train_data: Sized iterable of ``(data, label)`` training batches.
         valid_data: Sized iterable of ``(data, label)`` validation batches.
         batch_size: Value passed to ``trainer.step`` for every update.
         n_epoch: Number of epochs to run.
         acc: Callable ``acc(output, label)`` whose results are averaged
             over the validation batches.

     Returns:
         Tuple ``(best_train, best_loss)``: the lowest mean training loss
         (initialised with the pre-training validation loss) and the lowest
         mean validation loss observed.
     """
     # Perform validation before training the network.
     best_loss = 0.
     global_step = 0
     for data, label in valid_data:
         data = transpose(data, (0, 3, 1, 2))
         best_loss += self.__loss(self.__net(data.copyto(self.ctx)), label.copyto(self.ctx)).mean().asscalar()
     best_loss /= len(valid_data)
     best_train = best_loss
     print("Before training: test loss %.3f" % (best_loss))
     self.__net.save_parameters(os.path.join(self.__save_path, "best.model"))

     sw = SummaryWriter(logdir='./logs')
     try:
         # Start training the network.
         for epoch in range(n_epoch):
             train_loss, valid_loss, valid_acc = 0., 0., 0.
             tic = time.time()
             for data, label in train_data:
                 # Forward + backward.
                 data = data.copyto(self.ctx)
                 # NOTE(review): round() is applied to training inputs only,
                 # not to validation inputs - confirm this is intentional.
                 data = transpose(round(data), (0, 3, 1, 2))
                 with autograd.record():
                     output = self.__net(data)
                     loss = self.__loss(output, label.copyto(self.ctx))
                 loss.backward()
                 if global_step % 1000 == 0:
                     sw.add_histogram(tag="loss", values=loss, bins=200, global_step=global_step)
                     sw.add_histogram(tag="output", values=output, bins=200, global_step=global_step)
                 # Update parameters.
                 self.__trainer.step(batch_size)
                 # Accumulate training metrics.
                 train_loss += loss.mean().asscalar()
                 global_step += 1
             # Compute validation loss and accuracy.
             for data, label in valid_data:
                 data = data.copyto(self.ctx)
                 data = transpose(data, (0, 3, 1, 2))
                 valid_out = self.__net(data)
                 label = label.copyto(self.ctx)
                 valid_loss += self.__loss(valid_out, label).mean().asscalar()
                 valid_acc += acc(valid_out, label)
             train_loss /= len(train_data)
             valid_loss /= len(valid_data)
             valid_acc /= len(valid_data)
             print("Epoch %d: loss %.5f, test loss %.5f, test acc %.5f, in %.1f sec" % (
                     epoch, train_loss, valid_loss, valid_acc, time.time()-tic))
             if train_loss < best_train:
                 best_train = train_loss
             if valid_loss < best_loss:
                 best_loss = valid_loss
                 self.__net.save_parameters(os.path.join(self.__save_path, "best.model"))
                 print("\tCurrent best epoch!")
     finally:
         # The writer was previously never closed; close it so queued
         # histogram events are flushed and the file handle is released,
         # including when training is interrupted by an exception.
         sw.close()
     self.__net.save_parameters(os.path.join(self.__save_path, "last.model"))
     return best_train, best_loss
示例#4
0
def test_add_histogram():
    """Log a histogram of normally distributed values and check the event file.

    The value shape comes from ``rand_shape_nd(4)``. The histogram is written
    with 100 bins at global step 0, the writer is closed, and
    ``check_event_file_and_remove_logdir()`` is called.
    """
    shape = rand_shape_nd(4)
    sw = SummaryWriter(logdir=_LOGDIR)
    try:
        sw.add_histogram(tag='test_add_histogram',
                         values=mx.nd.random.normal(shape=shape),
                         global_step=0,
                         bins=100)
    finally:
        # Close even when add_histogram raises so the writer is not leaked.
        sw.close()
    check_event_file_and_remove_logdir()
示例#5
0
def mytrain(net, num_classes, train_data, valid_data, ctx, start_epoch, end_epoch,
            arm_cls_loss=arm_cls_loss, cls_loss=cls_loss, box_loss=box_loss, trainer=None):
    """Train ``net`` from ``start_epoch`` to ``end_epoch`` with ARM + ODM losses.

    For every batch the network output is passed through ``multibox_layer``
    to obtain ARM and ODM predictions, targets are built with
    ``MultiBoxTarget``, the ODM targets are filtered by the
    ``negative_filtering`` custom op, and the combined loss is
    back-propagated. The per-step loss, per-epoch AP values and a gradient
    histogram of ``vgg0_conv5_weight`` are written to ``./logs`` through a
    ``SummaryWriter``. After training, the curves collected in the
    module-level ``info`` dict are plotted to ``loss_curve.png``.

    Relies on module-level names that are not passed as arguments:
    ``sizes``, ``ratios``, ``normalizations``, ``info``, ``plot``,
    ``multibox_layer``, ``refine_anchor_generator``, ``evaluate_MAP`` and
    ``evaluate_acc``.

    Args:
        net: Network to train; called as ``net(data)``.
        num_classes: Forwarded to ``multibox_layer``.
        train_data: Resettable iterator of batches exposing ``data[0]`` and
            ``label[0]``.
        valid_data: Resettable validation iterator, passed to
            ``evaluate_acc``.
        ctx: Context the batch data and labels are moved to.
        start_epoch: First epoch index (inclusive).
        end_epoch: Last epoch index (exclusive).
        arm_cls_loss: Classification loss applied to the ARM predictions.
        cls_loss: Classification loss applied to the ODM predictions.
        box_loss: Localisation loss taking ``(preds, target, mask)``.
        trainer: Optional trainer; when ``None`` an Adam trainer with
            learning rate 0.001 and gradient clipping at 2.0 is created.
    """
    if trainer is None:
        trainer = gluon.Trainer(net.collect_params(), 'adam', {
            'learning_rate': 0.001,
            'clip_gradient': 2.0
        })
    box_metric = metric.MAE()

    # Summary writer that flushes pending events to disk every 5 seconds.
    sw = SummaryWriter(logdir='./logs', flush_secs=5)
    global_step = 0

    try:
        for e in range(start_epoch, end_epoch):
            train_data.reset()
            valid_data.reset()
            box_metric.reset()
            tic = time.time()
            _loss = [0, 0]      # accumulated ODM [cls, loc] loss
            arm_loss = [0, 0]   # accumulated ARM [cls, loc] loss

            outs, labels = None, None
            for batch in train_data:
                data = batch.data[0].as_in_context(ctx)
                label = batch.label[0].as_in_context(ctx)
                with autograd.record():
                    # 1. Generate predictions from the feature-extraction network.
                    ssd_layers = net(data)
                    (arm_loc_preds, arm_cls_preds, arm_anchor_boxes,
                     odm_loc_preds, odm_cls_preds) = multibox_layer(
                        ssd_layers, num_classes, sizes, ratios, normalizations)

                    # 2. ARM targets.
                    # 2.1 Rewrite the labels with the 'modify_label' custom op.
                    label_arm = nd.Custom(label, op_type='modify_label')
                    arm_tmp = MultiBoxTarget(arm_anchor_boxes, label_arm, arm_cls_preds,
                                             overlap_threshold=.5,
                                             negative_mining_ratio=3,
                                             negative_mining_thresh=.5)
                    arm_loc_target = arm_tmp[0]       # box offsets
                    arm_loc_target_mask = arm_tmp[1]  # box mask (only 0/1)
                    arm_cls_target = arm_tmp[2]       # class index of every anchor

                    # 3. ODM targets.
                    # 3.1 Refine the ARM anchors with the ARM location predictions.
                    odm_anchor_boxes = refine_anchor_generator(
                        arm_anchor_boxes,
                        arm_loc_preds)  # (batch, h*w*num_anchors[:layers], 4)
                    odm_anchor_boxes_bs = nd.split(
                        data=odm_anchor_boxes, axis=0,
                        num_outputs=label.shape[0])

                    # 3.2 Compute the targets sample by sample: the refined
                    # anchors carry a batch dimension, so the batch is split.
                    odm_loc_target = []
                    odm_loc_target_mask = []
                    odm_cls_target = []
                    label_bs = nd.split(data=label,
                                        axis=0,
                                        num_outputs=label.shape[0])
                    odm_cls_preds_bs = nd.split(data=odm_cls_preds,
                                                axis=0,
                                                num_outputs=label.shape[0])

                    for j in range(label.shape[0]):
                        if label.shape[0] == 1:
                            # Batch of one: indexing drops the leading axis,
                            # so it is restored with expand_dims.
                            # NOTE(review): negative_mining_ratio is 2 here
                            # but 3 in the multi-sample branch - confirm.
                            odm_tmp = MultiBoxTarget(
                                odm_anchor_boxes_bs[j].expand_dims(axis=0),
                                label_bs[j].expand_dims(axis=0),
                                odm_cls_preds_bs[j].expand_dims(axis=0),
                                overlap_threshold=.5,
                                negative_mining_ratio=2,
                                negative_mining_thresh=.5)
                        else:
                            # Several samples in the batch.
                            odm_tmp = MultiBoxTarget(
                                odm_anchor_boxes_bs[j],
                                label_bs[j],
                                odm_cls_preds_bs[j],
                                overlap_threshold=.5,
                                negative_mining_ratio=3,
                                negative_mining_thresh=.5)
                        odm_loc_target.append(odm_tmp[0])
                        odm_loc_target_mask.append(odm_tmp[1])
                        odm_cls_target.append(odm_tmp[2])
                    # Re-assemble the per-sample targets into batch tensors.
                    odm_loc_target = nd.concat(*odm_loc_target, dim=0)
                    odm_loc_target_mask = nd.concat(*odm_loc_target_mask, dim=0)
                    odm_cls_target = nd.concat(*odm_cls_target, dim=0)

                    # 4. Negative filtering.
                    group = nd.Custom(arm_cls_preds,
                                      odm_cls_target,
                                      odm_loc_target_mask,
                                      op_type='negative_filtering')
                    odm_cls_target = group[0]       # ODM classes filtered by the ARM scores
                    odm_loc_target_mask = group[1]  # mask is 0 for filtered anchors

                    # 5. Losses.
                    arm_loss_cls = arm_cls_loss(arm_cls_preds.transpose((0, 2, 1)),
                                                arm_cls_target)
                    arm_loss_loc = box_loss(arm_loc_preds, arm_loc_target,
                                            arm_loc_target_mask)
                    odm_loss_cls = cls_loss(odm_cls_preds.transpose((0, 2, 1)),
                                            odm_cls_target)
                    odm_loss_loc = box_loss(odm_loc_preds, odm_loc_target,
                                            odm_loc_target_mask)
                    # Each stage is normalised by its positive-anchor count
                    # (mask sum / 4, one mask entry per box coordinate).
                    # NOTE(review): a sample without positive anchors makes
                    # the divisor 0 and the loss inf/nan - confirm handling.
                    loss = (1 / (nd.sum(arm_loc_target_mask, axis=1) / 4.0) * (arm_loss_cls + arm_loss_loc)
                            + 1 / (nd.sum(odm_loc_target_mask, axis=1) / 4.0) * (odm_loss_cls + odm_loss_loc))

                sw.add_scalar(tag='loss',
                              value=loss.mean().asscalar(),
                              global_step=global_step)
                global_step += 1
                loss.backward(retain_graph=False)

                trainer.step(data.shape[0])
                _loss[0] += nd.mean(odm_loss_cls).asscalar()
                _loss[1] += nd.mean(odm_loss_loc).asscalar()
                arm_loss[0] += nd.mean(arm_loss_cls).asscalar()
                arm_loss[1] += nd.mean(arm_loss_loc).asscalar()

                odm_cls_prob = nd.SoftmaxActivation(odm_cls_preds, mode='channel')
                out = MultiBoxDetection(odm_cls_prob, odm_loc_preds, odm_anchor_boxes,
                                        force_suppress=True, clip=False,
                                        nms_threshold=.5, nms_topk=400)
                # Accumulate detections and labels for the end-of-epoch AP.
                if outs is None:
                    outs = out
                    labels = label
                else:
                    outs = nd.concat(outs, out, dim=0)
                    labels = nd.concat(labels, label, dim=0)
                box_metric.update([odm_loc_target],
                                  [odm_loc_preds * odm_loc_target_mask])

            print('-------{} epoch end ------'.format(e))
            train_AP = evaluate_MAP(outs, labels)
            valid_AP, val_box_metric = evaluate_acc(net, valid_data, ctx)
            info["train_ap"].append(train_AP)
            info["valid_ap"].append(valid_AP)
            info["loss"].append(_loss)
            print('odm loss: ', _loss)
            print('arm loss: ', arm_loss)
            if e == 0:
                sw.add_graph(net)
            # Log the gradient distribution of one convolution weight per epoch.
            grads_4_2 = net.collect_params().get('vgg0_conv5_weight').grad()
            sw.add_histogram(tag='vgg0_conv5_weight',
                             values=grads_4_2,
                             global_step=e,
                             bins=1000)

            if (e + 1) % 5 == 0:
                print(
                    "epoch: %d time: %.2f cls loss: %.4f,reg loss: %.4f lr: %.5f" %
                    (e, time.time() - tic, _loss[0], _loss[1],
                     trainer.learning_rate))
                print("train mae: %.4f AP: %.4f" % (box_metric.get()[1], train_AP))
                print("valid mae: %.4f AP: %.4f" %
                      (val_box_metric.get()[1], valid_AP))
            sw.add_scalar(tag='train_AP', value=train_AP, global_step=e)
            sw.add_scalar(tag='valid_AP', value=valid_AP, global_step=e)
    finally:
        # Close the writer even if training raises, so buffered events are
        # flushed and the event file is released.
        sw.close()

    # NOTE(review): info["loss"] is replaced by an ndarray here, so a second
    # call to mytrain would fail on info["loss"].append - confirm callers.
    info["loss"] = np.array(info["loss"])
    info["cls_loss"] = info["loss"][:, 0]
    info["box_loss"] = info["loss"][:, 1]

    plt.figure(figsize=(12, 4))
    plt.subplot(121)
    plot("train_ap")
    plot("valid_ap")
    plt.legend(loc="upper right")
    plt.subplot(122)
    plot("cls_loss")
    plot("box_loss")
    plt.legend(loc="upper right")
    plt.savefig('loss_curve.png')
示例#6
0
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        offset_alloc_size=(64, 64),
        anchors={"shallow": [(10, 13), (16, 30), (33, 23)],
                 "middle": [(30, 61), (62, 45), (59, 119)],
                 "deep": [(116, 90), (156, 198), (373, 326)]},
        graphviz=False,
        epoch=100,
        input_size=[416, 416],
        batch_log=100,
        batch_size=16,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=False,
        factor_scale=[13, 5],
        ignore_threshold=0.5,
        dynamic=False,
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        save_period=5,
        load_period=10,
        learning_rate=0.001, decay_lr=0.999, decay_step=10,
        GPU_COUNT=0,
        Darknetlayer=53,
        pretrained_base=True,
        pretrained_path="modelparam",
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        using_mlflow=True,
        multiperclass=True,
        nms_thresh=0.5,
        nms_topk=500,
        iou_thresh=0.5,
        except_class_thresh=0.05,
        plot_class_thresh=0.5):
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # 운영체제 확인
    if platform.system() == "Linux":
        logging.info(f"{platform.system()} OS")
    elif platform.system() == "Windows":
        logging.info(f"{platform.system()} OS")
    else:
        logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB')
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB')
        else:
            logging.info(f'Running on {ctx}')

    # 입력 사이즈를 32의 배수로 지정해 버리기 - stride가 일그러지는 것을 막기 위함
    if input_size[0] % 32 != 0 and input_size[1] % 32 != 0:
        logging.info("The input size must be a multiple of 32")
        exit(0)

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training YoloV3 Detector")
    input_shape = (1, 3) + tuple(input_size)

    try:
        net = Yolov3(Darknetlayer=Darknetlayer,
                     anchors=anchors,
                     pretrained=False,
                     ctx=mx.cpu())
        train_dataloader, train_dataset = traindataloader(multiscale=multiscale,
                                                          factor_scale=factor_scale,
                                                          augmentation=data_augmentation,
                                                          path=train_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=batch_size,
                                                          batch_interval=batch_interval,
                                                          num_workers=num_workers,
                                                          shuffle=True, mean=mean, std=std,
                                                          net=net, ignore_threshold=ignore_threshold, dynamic=dynamic,
                                                          from_sigmoid=False, make_target=True)
        valid_dataloader, valid_dataset = validdataloader(path=valid_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=valid_size,
                                                          num_workers=num_workers,
                                                          shuffle=True, mean=mean, std=std,
                                                          net=net, ignore_threshold=ignore_threshold, dynamic=dynamic,
                                                          from_sigmoid=False, make_target=True)

    except Exception:
        logging.info("dataset 없음")
        exit(0)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # 클래스 수
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    if pretrained_base:
        model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_P" + "Dark_" + str(Darknetlayer)
    else:
        model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_Dark_" + str(Darknetlayer)

    weight_path = f"weights/{model}"
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)} weights\n")
        net = gluon.SymbolBlock.imports(sym_path,
                                        ['data'],
                                        param_path, ctx=ctx)
    else:
        start_epoch = 0
        '''
        mxnet c++에서 arbitrary input image 를 받기 위한 전략
        alloc_size : tuple of int, default is (128, 128)
        For advanced users. Define `alloc_size` to generate large enough offset
        maps, which will later saved in parameters. During inference, we support arbitrary
        input image by cropping corresponding area of the anchor map. This allow us
        to export to symbol so we can run it in c++, Scalar, etc.
        '''
        net = Yolov3(Darknetlayer=Darknetlayer,
                     input_size=input_size,
                     anchors=anchors,
                     num_classes=num_classes,  # foreground만
                     pretrained=pretrained_base,
                     pretrained_path=pretrained_path,
                     alloc_size=offset_alloc_size,
                     ctx=ctx)

        if isinstance(ctx, (list, tuple)):
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))

        '''
        active (bool, default True) – Whether to turn hybrid on or off.
        static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase.
        static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower.
        '''
        if multiscale:
            net.hybridize(active=True, static_alloc=True, static_shape=False)
        else:
            net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model), max_queue=10, flush_secs=10,
                                verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) // batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step, factor=decay_lr, stop_factor_lr=1e-12, base_lr=learning_rate)

    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    if AMP:
        '''
        update_on_kvstore : bool, default None
        Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
        suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
        provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
        '''
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "beta1": 0.9,
                                                                                       "beta2": 0.999,
                                                                                       'multi_precision': False},
                                    update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "gamma1": 0.9,
                                                                                       "gamma2": 0.999,
                                                                                       'multi_precision': False},
                                    update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "wd": 0.0005,
                                                                                       "momentum": 0.9,
                                                                                       'multi_precision': False},
                                    update_on_kvstore=False)  # for Dynamic loss scaling
        else:
            logging.error("optimizer not selected")
            exit(0)

        amp.init_trainer(trainer)

    else:
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "beta1": 0.9,
                                                                                       "beta2": 0.999,
                                                                                       'multi_precision': False})
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "gamma1": 0.9,
                                                                                       "gamma2": 0.999,
                                                                                       'multi_precision': False})
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "wd": 0.0005,
                                                                                       "momentum": 0.9,
                                                                                       'multi_precision': False})

        else:
            logging.error("optimizer not selected")
            exit(0)

    loss = Yolov3Loss(sparse_label=True,
                      from_sigmoid=False,
                      batch_axis=None,
                      num_classes=num_classes,
                      reduction="sum",
                      exclude=False)

    prediction = Prediction(
        from_sigmoid=False,
        num_classes=num_classes,
        nms_thresh=nms_thresh,
        nms_topk=nms_topk,
        except_class_thresh=except_class_thresh,
        multiperclass=multiperclass)

    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh, class_names=name_classes)

    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1), initial=start_epoch + 1, total=epoch):

        xcyc_loss_sum = 0
        wh_loss_sum = 0
        object_loss_sum = 0
        class_loss_sum = 0
        time_stamp = time.time()

        for batch_count, (image, _, xcyc_all, wh_all, objectness_all, class_all, weights_all, _) in enumerate(
                train_dataloader, start=1):
            td_batch_size = image.shape[0]

            image = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            xcyc_all = mx.nd.split(data=xcyc_all, num_outputs=subdivision, axis=0)
            wh_all = mx.nd.split(data=wh_all, num_outputs=subdivision, axis=0)
            objectness_all = mx.nd.split(data=objectness_all, num_outputs=subdivision, axis=0)
            class_all = mx.nd.split(data=class_all, num_outputs=subdivision, axis=0)
            weights_all = mx.nd.split(data=weights_all, num_outputs=subdivision, axis=0)

            if subdivision == 1:
                image = [image]
                xcyc_all = [xcyc_all]
                wh_all = [wh_all]
                objectness_all = [objectness_all]
                class_all = [class_all]
                weights_all = [weights_all]
            '''
            autograd 설명
            https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            '''
            with autograd.record(train_mode=True):

                xcyc_all_losses = []
                wh_all_losses = []
                object_all_losses = []
                class_all_losses = []

                for image_split, xcyc_split, wh_split, objectness_split, class_split, weights_split in zip(image,
                                                                                                           xcyc_all,
                                                                                                           wh_all,
                                                                                                           objectness_all,
                                                                                                           class_all,
                                                                                                           weights_all):

                    if GPU_COUNT <= 1:
                        image_split = gluon.utils.split_and_load(image_split, [ctx], even_split=False)
                        xcyc_split = gluon.utils.split_and_load(xcyc_split, [ctx], even_split=False)
                        wh_split = gluon.utils.split_and_load(wh_split, [ctx], even_split=False)
                        objectness_split = gluon.utils.split_and_load(objectness_split, [ctx], even_split=False)
                        class_split = gluon.utils.split_and_load(class_split, [ctx], even_split=False)
                        weights_split = gluon.utils.split_and_load(weights_split, [ctx], even_split=False)
                    else:
                        image_split = gluon.utils.split_and_load(image_split, ctx, even_split=False)
                        xcyc_split = gluon.utils.split_and_load(xcyc_split, ctx, even_split=False)
                        wh_split = gluon.utils.split_and_load(wh_split, ctx, even_split=False)
                        objectness_split = gluon.utils.split_and_load(objectness_split, ctx, even_split=False)
                        class_split = gluon.utils.split_and_load(class_split, ctx, even_split=False)
                        weights_split = gluon.utils.split_and_load(weights_split, ctx, even_split=False)

                    xcyc_losses = []
                    wh_losses = []
                    object_losses = []
                    class_losses = []
                    total_loss = []

                    # gpu N 개를 대비한 코드 (Data Parallelism)
                    for img, xcyc_target, wh_target, objectness, class_target, weights in zip(image_split, xcyc_split,
                                                                                              wh_split,
                                                                                              objectness_split,
                                                                                              class_split,
                                                                                              weights_split):
                        output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net(
                            img)
                        xcyc_loss, wh_loss, object_loss, class_loss = loss(output1, output2, output3, xcyc_target,
                                                                           wh_target, objectness,
                                                                           class_target, weights)
                        xcyc_losses.append(xcyc_loss.asscalar())
                        wh_losses.append(wh_loss.asscalar())
                        object_losses.append(object_loss.asscalar())
                        class_losses.append(class_loss.asscalar())
                        total_loss.append(xcyc_loss + wh_loss + object_loss + class_loss)
                    if AMP:
                        with amp.scale_loss(total_loss, trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    xcyc_all_losses.append(sum(xcyc_losses))
                    wh_all_losses.append(sum(wh_losses))
                    object_all_losses.append(sum(object_losses))
                    class_all_losses.append(sum(class_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # 비우기
            for p in net.collect_params().values():
                p.zero_grad()

            xcyc_loss_sum += sum(xcyc_all_losses) / td_batch_size
            wh_loss_sum += sum(wh_all_losses) / td_batch_size
            object_loss_sum += sum(object_all_losses) / td_batch_size
            class_loss_sum += sum(class_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                             f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                             f'[Lr = {trainer.learning_rate}]'
                             f'[xcyc loss = {sum(xcyc_all_losses) / td_batch_size:.3f}]'
                             f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]'
                             f'[obj loss = {sum(object_all_losses) / td_batch_size:.3f}]'
                             f'[class loss = {sum(class_all_losses) / td_batch_size:.3f}]')
            time_stamp = time.time()

        train_xcyc_loss_mean = np.divide(xcyc_loss_sum, train_update_number_per_epoch)
        train_wh_loss_mean = np.divide(wh_loss_sum, train_update_number_per_epoch)
        train_object_loss_mean = np.divide(object_loss_sum, train_update_number_per_epoch)
        train_class_loss_mean = np.divide(class_loss_sum, train_update_number_per_epoch)
        train_total_loss_mean = train_xcyc_loss_mean + train_wh_loss_mean + train_object_loss_mean + train_class_loss_mean
        logging.info(
            f"train xcyc loss : {train_xcyc_loss_mean} / "
            f"train wh loss : {train_wh_loss_mean} / "
            f"train object loss : {train_object_loss_mean} / "
            f"train class loss : {train_class_loss_mean} / "
            f"train total loss : {train_total_loss_mean}"
        )

        if i % eval_period == 0 and valid_list:

            xcyc_loss_sum = 0
            wh_loss_sum = 0
            object_loss_sum = 0
            class_loss_sum = 0

            # loss 구하기
            for image, label, xcyc_all, wh_all, objectness_all, class_all, weights_all, _ in valid_dataloader:
                vd_batch_size, _, height, width = image.shape

                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx], even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx], even_split=False)
                    xcyc_all = gluon.utils.split_and_load(xcyc_all, [ctx], even_split=False)
                    wh_all = gluon.utils.split_and_load(wh_all, [ctx], even_split=False)
                    objectness_all = gluon.utils.split_and_load(objectness_all, [ctx], even_split=False)
                    class_all = gluon.utils.split_and_load(class_all, [ctx], even_split=False)
                    weights_all = gluon.utils.split_and_load(weights_all, [ctx], even_split=False)
                else:
                    image = gluon.utils.split_and_load(image, ctx, even_split=False)
                    label = gluon.utils.split_and_load(label, ctx, even_split=False)
                    xcyc_all = gluon.utils.split_and_load(xcyc_all, ctx, even_split=False)
                    wh_all = gluon.utils.split_and_load(wh_all, ctx, even_split=False)
                    objectness_all = gluon.utils.split_and_load(objectness_all, ctx, even_split=False)
                    class_all = gluon.utils.split_and_load(class_all, ctx, even_split=False)
                    weights_all = gluon.utils.split_and_load(weights_all, ctx, even_split=False)

                xcyc_losses = []
                wh_losses = []
                object_losses = []
                class_losses = []
                total_loss = []

                # gpu N 개를 대비한 코드 (Data Parallelism)
                for img, lb, xcyc_target, wh_target, objectness, class_target, weights in zip(image, label, xcyc_all,
                                                                                              wh_all, objectness_all,
                                                                                              class_all, weights_all):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]

                    output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net(
                        img)
                    id, score, bbox = prediction(output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2,
                                                 offset3, stride1, stride2, stride3)

                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box,
                                            gt_labels=gt_id)

                    xcyc_loss, wh_loss, object_loss, class_loss = loss(output1, output2, output3, xcyc_target,
                                                                       wh_target, objectness,
                                                                       class_target, weights)
                    xcyc_losses.append(xcyc_loss.asscalar())
                    wh_losses.append(wh_loss.asscalar())
                    object_losses.append(object_loss.asscalar())
                    class_losses.append(class_loss.asscalar())
                    total_loss.append(xcyc_losses + wh_losses + object_losses + class_losses)

                xcyc_loss_sum += sum(xcyc_losses) / vd_batch_size
                wh_loss_sum += sum(wh_losses) / vd_batch_size
                object_loss_sum += sum(object_losses) / vd_batch_size
                class_loss_sum += sum(class_losses) / vd_batch_size

            valid_xcyc_loss_mean = np.divide(xcyc_loss_sum, valid_update_number_per_epoch)
            valid_wh_loss_mean = np.divide(wh_loss_sum, valid_update_number_per_epoch)
            valid_object_loss_mean = np.divide(object_loss_sum, valid_update_number_per_epoch)
            valid_class_loss_mean = np.divide(class_loss_sum, valid_update_number_per_epoch)
            valid_total_loss_mean = valid_xcyc_loss_mean + valid_wh_loss_mean + valid_object_loss_mean + valid_class_loss_mean

            logging.info(
                f"valid xcyc loss : {valid_xcyc_loss_mean} / "
                f"valid wh loss : {valid_wh_loss_mean} / "
                f"valid object loss : {valid_object_loss_mean} / "
                f"valid class loss : {valid_class_loss_mean} / "
                f"valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list()
            for j, c, p, r in zip(range(len(recall)), class_name, precision, recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(f"class {j}'s {name} AP : {round(AP * 100, round_position)}%")
                AP_appender.append(AP)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender, mAP=mAP_result, folder_name=valid_graph_path, epoch=i)
            precision_recall.reset()

            if tensorboard:
                # gpu N 개를 대비한 코드 (Data Parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _, _, _, _ = next(dataloader_iter)
                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx], even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx], even_split=False)
                else:
                    image = gluon.utils.split_and_load(image, ctx, even_split=False)
                    label = gluon.utils.split_and_load(label, ctx, even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 0, 1)

                batch_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net(
                        img)
                    ids, scores, bboxes = prediction(output1, output2, output3, anchor1, anchor2, anchor3, offset1,
                                                     offset2, offset3, stride1, stride2, stride3)

                    for ig, gt_id, gt_box, id, score, bbox in zip(img, gt_ids, gt_boxes, ids, scores, bboxes):
                        ig = ig.transpose(
                            (1, 2, 0)) * mx.nd.array(std, ctx=ig.context) + mx.nd.array(mean, ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)

                        # ground truth box 그리기
                        ground_truth = plot_bbox(ig, gt_box, scores=None, labels=gt_id, thresh=None,
                                                 reverse_rgb=True,
                                                 class_names=valid_dataset.classes, absolute_coordinates=True,
                                                 colors=ground_truth_colors)
                        # prediction box 그리기
                        prediction_box = plot_bbox(ground_truth, bbox, scores=score, labels=id,
                                                   thresh=plot_class_thresh,
                                                   reverse_rgb=False,
                                                   class_names=valid_dataset.classes, absolute_coordinates=True)

                        # Tensorboard에 그리기 위해 BGR -> RGB / (height, width, channel) -> (channel, height, width) 를한다.
                        prediction_box = cv2.cvtColor(prediction_box, cv2.COLOR_BGR2RGB)
                        prediction_box = np.transpose(prediction_box,
                                                      axes=(2, 0, 1))
                        batch_image.append(prediction_box)  # (batch, channel, height, width)

                summary.add_image(tag="valid_result", image=np.array(batch_image), global_step=i)

                summary.add_scalar(tag="xy_loss", value={"train_xcyc_loss": train_xcyc_loss_mean,
                                                         "valid_xcyc_loss": valid_xcyc_loss_mean}, global_step=i)
                summary.add_scalar(tag="wh_loss", value={"train_wh_loss": train_wh_loss_mean,
                                                         "valid_wh_loss": valid_wh_loss_mean}, global_step=i)
                summary.add_scalar(tag="object_loss", value={"train_object_loss": train_object_loss_mean,
                                                             "valid_object_loss": valid_object_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="class_loss", value={"train_class_loss": train_class_loss_mean,
                                                            "valid_class_loss": valid_class_loss_mean}, global_step=i)

                summary.add_scalar(tag="total_loss", value={
                    "train_total_loss": train_total_loss_mean,
                    "valid_total_loss": valid_total_loss_mean},
                                   global_step=i)

                params = net.collect_params().values()
                if GPU_COUNT > 1:
                    for c in ctx:
                        for p in params:
                            summary.add_histogram(tag=p.name, values=p.data(ctx=c), global_step=i, bins='default')
                else:
                    for p in params:
                        summary.add_histogram(tag=p.name, values=p.data(), global_step=i, bins='default')

        if i % save_period == 0:

            weight_epoch_path = os.path.join(weight_path, str(i))
            if not os.path.exists(weight_epoch_path):
                os.makedirs(weight_epoch_path)

            '''
            Hybrid models can be serialized as JSON files using the export function
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            When there are only one input, it will have name data. When there Are more than one inputs, they will be named as data0, data1, etc.
            '''

            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            postnet = PostNet(net=net, auxnet=prediction)

            try:
                net.export(os.path.join(weight_path, f"{model}"), epoch=i, remove_amp_cast=True)  # for onnx
                net.save_parameters(os.path.join(weight_path, f"{i}.params"))  # onnx 추출용
                # network inference, decoder, nms까지 처리됨 - mxnet c++에서 편리함 / onnx로는 추출 못함.
                export_block_for_cplusplus(path=os.path.join(weight_epoch_path, f"{model}_prepost"),
                                           block=postnet,
                                           data_shape=tuple(input_size) + tuple((3,)),
                                           epoch=i,
                                           preprocess=True,  # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨
                                           layout='HWC',
                                           ctx=context,
                                           remove_amp_cast=True)

            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
                net.collect_params().reset_ctx(ctx)

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
示例#7
0
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        graphviz=True,
        epoch=100,
        input_size=[512, 512],
        batch_size=16,
        batch_log=100,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        lambda_off=1,
        lambda_size=0.1,
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base=18,
        pretrained_base=True,
        pretrained_path="modelparam",
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        using_mlflow=True,
        topk=100,
        plot_class_thresh=0.5):
    '''
    AMP 가 모든 연산을 지원하지는 않는다.
    modulated convolution을 지원하지 않음
    '''
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # 운영체제 확인
    if platform.system() == "Linux":
        logging.info(f"{platform.system()} OS")
    elif platform.system() == "Windows":
        logging.info(f"{platform.system()} OS")
    else:
        logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
        else:
            logging.info(f'Running on {ctx}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training Center Detector")
    input_shape = (1, 3) + tuple(input_size)

    scale_factor = 4  # 고정
    logging.info(f"scale factor {scale_factor}")

    try:
        train_dataloader, train_dataset = traindataloader(
            multiscale=multiscale,
            factor_scale=factor_scale,
            augmentation=data_augmentation,
            path=train_dataset_path,
            input_size=input_size,
            batch_size=batch_size,
            batch_interval=batch_interval,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            scale_factor=scale_factor,
            make_target=True)
        valid_dataloader, valid_dataset = validdataloader(
            path=valid_dataset_path,
            input_size=input_size,
            batch_size=valid_size,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            scale_factor=scale_factor,
            make_target=True)

    except Exception as E:
        logging.info(E)
        exit(0)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # 클래스 수
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    if pretrained_base:
        model = str(input_size[0]) + "_" + str(
            input_size[1]) + "_" + optimizer + "_P" + "CENTER_RES" + str(base)
    else:
        model = str(input_size[0]) + "_" + str(
            input_size[1]) + "_" + optimizer + "_CENTER_RES" + str(base)

    weight_path = f"weights/{model}"
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)} weights\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'],
                                        param_path,
                                        ctx=ctx)
    else:
        start_epoch = 0
        net = CenterNet(base=base,
                        heads=OrderedDict([('heatmap', {
                            'num_output': num_classes,
                            'bias': -2.19
                        }), ('offset', {
                            'num_output': 2
                        }), ('wh', {
                            'num_output': 2
                        })]),
                        head_conv_channel=64,
                        pretrained=pretrained_base,
                        root=pretrained_path,
                        use_dcnv2=False,
                        ctx=ctx)

        if isinstance(ctx, (list, tuple)):
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))
        '''
        active (bool, default True) – Whether to turn hybrid on or off.
        static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase.
        static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower.
        '''
        if multiscale:
            net.hybridize(active=True, static_alloc=True, static_shape=False)
        else:
            net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model),
                                max_queue=10,
                                flush_secs=10,
                                verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net,
                                       shape=input_shape,
                                       save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) //
                 batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step,
                                             factor=decay_lr,
                                             stop_factor_lr=1e-12,
                                             base_lr=learning_rate)

    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    if AMP:
        '''
        update_on_kvstore : bool, default None
        Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
        suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
        provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
        '''
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "beta1": 0.9,
                    "beta2": 0.999,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "gamma1": 0.9,
                    "gamma2": 0.999,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "wd": 0.0001,
                    "momentum": 0.9,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        else:
            logging.error("optimizer not selected")
            exit(0)

        amp.init_trainer(trainer)

    else:
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "beta1": 0.9,
                                        "beta2": 0.999,
                                        'multi_precision': False
                                    })
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "gamma1": 0.9,
                                        "gamma2": 0.999,
                                        'multi_precision': False
                                    })
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "wd": 0.0001,
                                        "momentum": 0.9,
                                        'multi_precision': False
                                    })

        else:
            logging.error("optimizer not selected")
            exit(0)

    heatmapfocalloss = HeatmapFocalLoss(from_sigmoid=True, alpha=2, beta=4)
    normedl1loss = NormedL1Loss()
    prediction = Prediction(batch_size=valid_size,
                            topk=topk,
                            scale=scale_factor)
    precision_recall = Voc_2007_AP(iou_thresh=0.5, class_names=name_classes)

    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1),
                  initial=start_epoch + 1,
                  total=epoch):

        heatmap_loss_sum = 0
        offset_loss_sum = 0
        wh_loss_sum = 0
        time_stamp = time.time()
        '''
        target generator를 train_dataloader에서 만들어 버리는게 학습 속도가 훨씬 빠르다. 
        '''

        for batch_count, (image, _, heatmap, offset_target, wh_target,
                          mask_target, _) in enumerate(train_dataloader,
                                                       start=1):
            td_batch_size = image.shape[0]

            image_split = mx.nd.split(data=image,
                                      num_outputs=subdivision,
                                      axis=0)
            heatmap_split = mx.nd.split(data=heatmap,
                                        num_outputs=subdivision,
                                        axis=0)
            offset_target_split = mx.nd.split(data=offset_target,
                                              num_outputs=subdivision,
                                              axis=0)
            wh_target_split = mx.nd.split(data=wh_target,
                                          num_outputs=subdivision,
                                          axis=0)
            mask_target_split = mx.nd.split(data=mask_target,
                                            num_outputs=subdivision,
                                            axis=0)

            if subdivision == 1:
                image_split = [image_split]
                heatmap_split = [heatmap_split]
                offset_target_split = [offset_target_split]
                wh_target_split = [wh_target_split]
                mask_target_split = [mask_target_split]
            '''
            autograd 설명
            https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            '''
            with autograd.record(train_mode=True):

                heatmap_all_losses = []
                offset_all_losses = []
                wh_all_losses = []

                for image_part, heatmap_part, offset_target_part, wh_target_part, mask_target_part in zip(
                        image_split, heatmap_split, offset_target_split,
                        wh_target_split, mask_target_split):

                    if GPU_COUNT <= 1:
                        image_part = gluon.utils.split_and_load(
                            image_part, [ctx], even_split=False)
                        heatmap_part = gluon.utils.split_and_load(
                            heatmap_part, [ctx], even_split=False)
                        offset_target_part = gluon.utils.split_and_load(
                            offset_target_part, [ctx], even_split=False)
                        wh_target_part = gluon.utils.split_and_load(
                            wh_target_part, [ctx], even_split=False)
                        mask_target_part = gluon.utils.split_and_load(
                            mask_target_part, [ctx], even_split=False)
                    else:
                        image_part = gluon.utils.split_and_load(
                            image_part, ctx, even_split=False)
                        heatmap_part = gluon.utils.split_and_load(
                            heatmap_part, ctx, even_split=False)
                        offset_target_part = gluon.utils.split_and_load(
                            offset_target_part, ctx, even_split=False)
                        wh_target_part = gluon.utils.split_and_load(
                            wh_target_part, ctx, even_split=False)
                        mask_target_part = gluon.utils.split_and_load(
                            mask_target_part, ctx, even_split=False)

                    # prediction, target space for Data Parallelism
                    heatmap_losses = []
                    offset_losses = []
                    wh_losses = []
                    total_loss = []

                    # iterate over the per-device shards (data parallelism for N GPUs)
                    for img, heatmap_target, offset_target, wh_target, mask_target in zip(
                            image_part, heatmap_part, offset_target_part,
                            wh_target_part, mask_target_part):
                        heatmap_pred, offset_pred, wh_pred = net(img)
                        heatmap_loss = heatmapfocalloss(
                            heatmap_pred, heatmap_target)
                        offset_loss = normedl1loss(offset_pred, offset_target,
                                                   mask_target) * lambda_off
                        wh_loss = normedl1loss(wh_pred, wh_target,
                                               mask_target) * lambda_size

                        heatmap_losses.append(heatmap_loss.asscalar())
                        offset_losses.append(offset_loss.asscalar())
                        wh_losses.append(wh_loss.asscalar())

                        total_loss.append(heatmap_loss + offset_loss + wh_loss)

                    if AMP:
                        with amp.scale_loss(total_loss,
                                            trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    heatmap_all_losses.append(sum(heatmap_losses))
                    offset_all_losses.append(sum(offset_losses))
                    wh_all_losses.append(sum(wh_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # clear the gradients of every parameter after the update

            for p in net.collect_params().values():
                p.zero_grad()

            heatmap_loss_sum += sum(heatmap_all_losses) / td_batch_size
            offset_loss_sum += sum(offset_all_losses) / td_batch_size
            wh_loss_sum += sum(wh_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(
                    f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                    f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                    f'[Lr = {trainer.learning_rate}]'
                    f'[heatmap loss = {sum(heatmap_all_losses) / td_batch_size:.3f}]'
                    f'[offset loss = {sum(offset_all_losses) / td_batch_size:.3f}]'
                    f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]')
            time_stamp = time.time()

        train_heatmap_loss_mean = np.divide(heatmap_loss_sum,
                                            train_update_number_per_epoch)
        train_offset_loss_mean = np.divide(offset_loss_sum,
                                           train_update_number_per_epoch)
        train_wh_loss_mean = np.divide(wh_loss_sum,
                                       train_update_number_per_epoch)
        train_total_loss_mean = train_heatmap_loss_mean + train_offset_loss_mean + train_wh_loss_mean

        logging.info(
            f"train heatmap loss : {train_heatmap_loss_mean} / train offset loss : {train_offset_loss_mean} / train wh loss : {train_wh_loss_mean} / train total loss : {train_total_loss_mean}"
        )

        if i % eval_period == 0 and valid_list:

            heatmap_loss_sum = 0
            offset_loss_sum = 0
            wh_loss_sum = 0

            # compute the validation losses
            for image, label, heatmap_all, offset_target_all, wh_target_all, mask_target_all, _ in valid_dataloader:
                vd_batch_size = image.shape[0]

                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx],
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx],
                                                       even_split=False)
                    heatmap_split = gluon.utils.split_and_load(
                        heatmap_all, [ctx], even_split=False)
                    offset_target_split = gluon.utils.split_and_load(
                        offset_target_all, [ctx], even_split=False)
                    wh_target_split = gluon.utils.split_and_load(
                        wh_target_all, [ctx], even_split=False)
                    mask_target_split = gluon.utils.split_and_load(
                        mask_target_all, [ctx], even_split=False)
                else:
                    image = gluon.utils.split_and_load(image,
                                                       ctx,
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label,
                                                       ctx,
                                                       even_split=False)
                    heatmap_split = gluon.utils.split_and_load(
                        heatmap_all, ctx, even_split=False)
                    offset_target_split = gluon.utils.split_and_load(
                        offset_target_all, ctx, even_split=False)
                    wh_target_split = gluon.utils.split_and_load(
                        wh_target_all, ctx, even_split=False)
                    mask_target_split = gluon.utils.split_and_load(
                        mask_target_all, ctx, even_split=False)

                # prediction, target space for Data Parallelism
                heatmap_losses = []
                offset_losses = []
                wh_losses = []

                # iterate over the per-device shards (data parallelism for N GPUs)
                for img, lb, heatmap_target, offset_target, wh_target, mask_target in zip(
                        image, label, heatmap_split, offset_target_split,
                        wh_target_split, mask_target_split):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]
                    heatmap_pred, offset_pred, wh_pred = net(img)

                    id, score, bbox = prediction(heatmap_pred, offset_pred,
                                                 wh_pred)
                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box * scale_factor,
                                            gt_labels=gt_id)

                    heatmap_loss = heatmapfocalloss(heatmap_pred,
                                                    heatmap_target)
                    offset_loss = normedl1loss(offset_pred, offset_target,
                                               mask_target) * lambda_off
                    wh_loss = normedl1loss(wh_pred, wh_target,
                                           mask_target) * lambda_size

                    heatmap_losses.append(heatmap_loss.asscalar())
                    offset_losses.append(offset_loss.asscalar())
                    wh_losses.append(wh_loss.asscalar())

                heatmap_loss_sum += sum(heatmap_losses) / vd_batch_size
                offset_loss_sum += sum(offset_losses) / vd_batch_size
                wh_loss_sum += sum(wh_losses) / vd_batch_size

            valid_heatmap_loss_mean = np.divide(heatmap_loss_sum,
                                                valid_update_number_per_epoch)
            valid_offset_loss_mean = np.divide(offset_loss_sum,
                                               valid_update_number_per_epoch)
            valid_wh_loss_mean = np.divide(wh_loss_sum,
                                           valid_update_number_per_epoch)
            valid_total_loss_mean = valid_heatmap_loss_mean + valid_offset_loss_mean + valid_wh_loss_mean

            logging.info(
                f"valid heatmap loss : {valid_heatmap_loss_mean} / valid offset loss : {valid_offset_loss_mean} / valid wh loss : {valid_wh_loss_mean} / valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list(
            )
            for j, c, p, r in zip(range(len(recall)), class_name, precision,
                                  recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(
                    f"class {j}'s {name} AP : {round(AP * 100, round_position)}%"
                )
                AP_appender.append(AP)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i)
            precision_recall.reset()

            if tensorboard:
                # take one validation batch and split it across the devices (data parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _, _, _ = next(dataloader_iter)

                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx],
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx],
                                                       even_split=False)
                else:
                    image = gluon.utils.split_and_load(image,
                                                       ctx,
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label,
                                                       ctx,
                                                       even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 0, 1)

                batch_image = []
                heatmap_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    heatmap_pred, offset_pred, wh_pred = net(img)
                    ids, scores, bboxes = prediction(heatmap_pred, offset_pred,
                                                     wh_pred)

                    for ig, gt_id, gt_box, heatmap, id, score, bbox in zip(
                            img, gt_ids, gt_boxes, heatmap_pred, ids, scores,
                            bboxes):
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(
                            std, ctx=ig.context) + mx.nd.array(mean,
                                                               ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)

                        # draw the heatmap
                        heatmap = mx.nd.multiply(heatmap,
                                                 255.0)  # 0 ~ 255 범위로 바꾸기
                        heatmap = mx.nd.max(
                            heatmap, axis=0,
                            keepdims=True)  # channel 축으로 가장 큰것 뽑기
                        heatmap = mx.nd.transpose(
                            heatmap,
                            axes=(1, 2, 0))  # (height, width, channel=1)
                        heatmap = mx.nd.repeat(
                            heatmap, repeats=3,
                            axis=-1)  # (height, width, channel=3)
                        heatmap = heatmap.asnumpy(
                        )  # mxnet.ndarray -> numpy.ndarray
                        heatmap = cv2.resize(heatmap,
                                             dsize=(input_size[1],
                                                    input_size[0]))  # 사이즈 원복
                        heatmap = heatmap.astype("uint8")  # float32 -> uint8
                        heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
                        heatmap[:, :,
                                (0, 1, 2)] = heatmap[:, :,
                                                     (2, 1, 0)]  # BGR -> RGB
                        heatmap = np.transpose(
                            heatmap,
                            axes=(2, 0, 1))  # (channel=3, height, width)

                        # draw the ground-truth boxes
                        ground_truth = plot_bbox(
                            ig,
                            gt_box * scale_factor,
                            scores=None,
                            labels=gt_id,
                            thresh=None,
                            reverse_rgb=True,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True,
                            colors=ground_truth_colors)
                        # draw the predicted boxes
                        prediction_box = plot_bbox(
                            ground_truth,
                            bbox,
                            scores=score,
                            labels=id,
                            thresh=plot_class_thresh,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True)

                        # For TensorBoard: convert BGR -> RGB and (height, width, channel) -> (channel, height, width).
                        prediction_box = cv2.cvtColor(prediction_box,
                                                      cv2.COLOR_BGR2RGB)
                        prediction_box = np.transpose(prediction_box,
                                                      axes=(2, 0, 1))
                        batch_image.append(
                            prediction_box)  # (batch, channel, height, width)
                        heatmap_image.append(heatmap)

                all_image = np.concatenate(
                    [np.array(batch_image),
                     np.array(heatmap_image)], axis=-1)
                summary.add_image(tag="valid_result",
                                  image=all_image,
                                  global_step=i)
                summary.add_scalar(tag="heatmap_loss",
                                   value={
                                       "train_heatmap_loss_mean":
                                       train_heatmap_loss_mean,
                                       "valid_heatmap_loss_mean":
                                       valid_heatmap_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="offset_loss",
                                   value={
                                       "train_offset_loss_mean":
                                       train_offset_loss_mean,
                                       "valid_offset_loss_mean":
                                       valid_offset_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="wh_loss",
                                   value={
                                       "train_wh_loss_mean":
                                       train_wh_loss_mean,
                                       "valid_wh_loss_mean": valid_wh_loss_mean
                                   },
                                   global_step=i)

                summary.add_scalar(tag="total_loss",
                                   value={
                                       "train_total_loss":
                                       train_total_loss_mean,
                                       "valid_total_loss":
                                       valid_total_loss_mean
                                   },
                                   global_step=i)

                params = net.collect_params().values()
                if GPU_COUNT > 1:
                    for c in ctx:
                        for p in params:
                            summary.add_histogram(tag=p.name,
                                                  values=p.data(ctx=c),
                                                  global_step=i,
                                                  bins='default')
                else:
                    for p in params:
                        summary.add_histogram(tag=p.name,
                                              values=p.data(),
                                              global_step=i,
                                              bins='default')

        if i % save_period == 0:

            if not os.path.exists(weight_path):
                os.makedirs(weight_path)
            '''
            Hybrid models can be serialized as JSON files using the export function
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            When there are only one input, it will have name data. When there Are more than one inputs, they will be named as data0, data1, etc.
            '''
            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            postnet = PostNet(net=net, auxnet=prediction)  # 새로운 객체가 생성
            try:
                net.export(os.path.join(weight_path, f"{model}"),
                           epoch=i,
                           remove_amp_cast=True)
                net.save_parameters(os.path.join(weight_path,
                                                 f"{i}.params"))  # onnx 추출용
                # network inference, decoder, nms까지 처리됨 - mxnet c++에서 편리함
                export_block_for_cplusplus(
                    path=os.path.join(weight_path, f"{model}_prepost"),
                    block=postnet,
                    data_shape=tuple(input_size) + tuple((3, )),
                    epoch=i,
                    preprocess=
                    True,  # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨
                    layout='HWC',
                    ctx=context,
                    remove_amp_cast=True)

            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
                net.collect_params().reset_ctx(ctx)

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
示例#8
0
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        anchor_alloc_size=[256, 256],
        anchor_sizes=[32, 64, 128, 256, 512],
        anchor_size_ratios=[1, pow(2, 1 / 3), pow(2, 2 / 3)],
        anchor_aspect_ratios=[0.5, 1, 2],
        anchor_box_clip=True,
        graphviz=True,
        epoch=100,
        input_size=[512, 512],
        batch_log=100,
        batch_size=16,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        foreground_iou_thresh=0.5,
        background_iou_thresh=0.4,
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        weight_decay=0.000001,
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base=0,
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        valid_html_auto_open=True,
        using_mlflow=True,
        decode_number=5000,
        multiperclass=True,
        nms_thresh=0.5,
        nms_topk=500,
        iou_thresh=0.5,
        except_class_thresh=0.05,
        plot_class_thresh=0.5):
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # log the operating system
    if platform.system() == "Linux":
        logging.info(f"{platform.system()} OS")
    elif platform.system() == "Windows":
        logging.info(f"{platform.system()} OS")
    else:
        logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
        else:
            logging.info(f'Running on {ctx}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training Efficient Detector")
    input_shape = (1, 3) + tuple(input_size)

    net = Efficient(version=base,
                    anchor_sizes=anchor_sizes,
                    anchor_size_ratios=anchor_size_ratios,
                    anchor_aspect_ratios=anchor_aspect_ratios,
                    anchor_box_clip=anchor_box_clip,
                    alloc_size=anchor_alloc_size,
                    ctx=mx.cpu())
    train_dataloader, train_dataset = traindataloader(
        multiscale=multiscale,
        factor_scale=factor_scale,
        augmentation=data_augmentation,
        path=train_dataset_path,
        input_size=input_size,
        batch_size=batch_size,
        batch_interval=batch_interval,
        num_workers=num_workers,
        shuffle=True,
        mean=mean,
        std=std,
        net=net,
        foreground_iou_thresh=foreground_iou_thresh,
        background_iou_thresh=background_iou_thresh,
        make_target=True)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_dataloader, valid_dataset = validdataloader(
            path=valid_dataset_path,
            input_size=input_size,
            batch_size=valid_size,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            net=net,
            foreground_iou_thresh=foreground_iou_thresh,
            background_iou_thresh=background_iou_thresh,
            make_target=True)
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # 클래스 수
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    model = str(input_size[0]) + "_" + str(
        input_size[1]) + "_" + optimizer + "_EFF_" + str(base)

    weight_path = os.path.join("weights", f"{model}")
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')
    optimizer_path = os.path.join(weight_path,
                                  f'{model}-{load_period:04d}.opt')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)}\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'],
                                        param_path,
                                        ctx=ctx)
    else:
        start_epoch = 0
        net = Efficient(
            version=base,
            input_size=input_size,
            anchor_sizes=anchor_sizes,
            anchor_size_ratios=anchor_size_ratios,
            anchor_aspect_ratios=anchor_aspect_ratios,
            num_classes=num_classes,  # foreground만
            anchor_box_clip=anchor_box_clip,
            alloc_size=anchor_alloc_size,
            ctx=ctx)

        if isinstance(ctx, (list, tuple)):
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))
        '''
        active (bool, default True) – Whether to turn hybrid on or off.
        static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase.
        static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower.
        '''
        if multiscale:
            net.hybridize(active=True, static_alloc=True, static_shape=False)
        else:
            net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model),
                                max_queue=10,
                                flush_secs=10,
                                verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net,
                                       shape=input_shape,
                                       save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) //
                 batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step,
                                             factor=decay_lr,
                                             stop_factor_lr=1e-12,
                                             base_lr=learning_rate)

    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'
    '''
    update_on_kvstore : bool, default None
    Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
    suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
    provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
    '''
    if optimizer.upper() == "ADAM":
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params={
                                    "learning_rate": learning_rate,
                                    "lr_scheduler": lr_sch,
                                    "wd": weight_decay,
                                    "beta1": 0.9,
                                    "beta2": 0.999,
                                    'multi_precision': False
                                },
                                update_on_kvstore=False
                                if AMP else None)  # for Dynamic loss scaling
    elif optimizer.upper() == "RMSPROP":
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params={
                                    "learning_rate": learning_rate,
                                    "lr_scheduler": lr_sch,
                                    "wd": weight_decay,
                                    "gamma1": 0.9,
                                    "gamma2": 0.999,
                                    'multi_precision': False
                                },
                                update_on_kvstore=False
                                if AMP else None)  # for Dynamic loss scaling
    elif optimizer.upper() == "SGD":
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params={
                                    "learning_rate": learning_rate,
                                    "lr_scheduler": lr_sch,
                                    "wd": weight_decay,
                                    "momentum": 0.9,
                                    'multi_precision': False
                                },
                                update_on_kvstore=False
                                if AMP else None)  # for Dynamic loss scaling
    else:
        logging.error("optimizer not selected")
        exit(0)

    if AMP:
        amp.init_trainer(trainer)

    # load the saved optimizer states, if the file exists
    if os.path.exists(optimizer_path):
        try:
            trainer.load_states(optimizer_path)
        except Exception as E:
            logging.info(E)
        else:
            logging.info(f"loading {os.path.basename(optimizer_path)}\n")
    '''
    localization loss -> Smooth L1 loss 
    confidence loss -> Focal 
    '''
    confidence_loss = FocalLoss(alpha=0.25,
                                gamma=2,
                                sparse_label=True,
                                from_sigmoid=False,
                                batch_axis=None,
                                num_class=num_classes,
                                reduction="sum",
                                exclude=False)

    localization_loss = HuberLoss(rho=1,
                                  batch_axis=None,
                                  reduction="sum",
                                  exclude=False)

    prediction = Prediction(batch_size=batch_size,
                            from_sigmoid=False,
                            num_classes=num_classes,
                            decode_number=decode_number,
                            nms_thresh=nms_thresh,
                            nms_topk=nms_topk,
                            except_class_thresh=except_class_thresh,
                            multiperclass=multiperclass)

    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh,
                                   class_names=name_classes)

    ctx_list = ctx if isinstance(ctx, (list, tuple)) else [ctx]
    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1),
                  initial=start_epoch + 1,
                  total=epoch):

        conf_loss_sum = 0
        loc_loss_sum = 0
        time_stamp = time.time()

        for batch_count, (image, _, cls_all, box_all,
                          _) in enumerate(train_dataloader, start=1):
            td_batch_size = image.shape[0]

            image = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            cls_all = mx.nd.split(data=cls_all,
                                  num_outputs=subdivision,
                                  axis=0)
            box_all = mx.nd.split(data=box_all,
                                  num_outputs=subdivision,
                                  axis=0)

            if subdivision == 1:
                image = [image]
                cls_all = [cls_all]
                box_all = [box_all]
            '''
            autograd 설명
            https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            '''
            with autograd.record(train_mode=True):

                cls_all_losses = []
                box_all_losses = []

                for image_split, cls_split, box_split in zip(
                        image, cls_all, box_all):

                    image_split = gluon.utils.split_and_load(image_split,
                                                             ctx_list,
                                                             even_split=False)
                    cls_split = gluon.utils.split_and_load(cls_split,
                                                           ctx_list,
                                                           even_split=False)
                    box_split = gluon.utils.split_and_load(box_split,
                                                           ctx_list,
                                                           even_split=False)

                    # prediction, target space for Data Parallelism
                    cls_losses = []
                    box_losses = []
                    total_loss = []

                    # gpu N 개를 대비한 코드 (Data Parallelism)
                    for img, cls_target, box_target in zip(
                            image_split, cls_split, box_split):
                        cls_pred, box_pred, anchor = net(img)
                        except_ignore_samples = cls_target > -1
                        positive_samples = cls_target > 0
                        positive_numbers = positive_samples.sum()

                        conf_loss = confidence_loss(
                            cls_pred, cls_target,
                            except_ignore_samples.expand_dims(axis=-1))
                        conf_loss = mx.nd.divide(conf_loss,
                                                 positive_numbers + 1)
                        cls_losses.append(conf_loss.asscalar())

                        loc_loss = localization_loss(
                            box_pred, box_target,
                            positive_samples.expand_dims(axis=-1))
                        box_losses.append(loc_loss.asscalar())

                        total_loss.append(conf_loss + loc_loss)
                    if AMP:
                        with amp.scale_loss(total_loss,
                                            trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    cls_all_losses.append(sum(cls_losses))
                    box_all_losses.append(sum(box_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # 비우기
            for p in net.collect_params().values():
                p.zero_grad()

            conf_loss_sum += sum(cls_all_losses) / td_batch_size
            loc_loss_sum += sum(box_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(
                    f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                    f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                    f'[Lr = {trainer.learning_rate}]'
                    f'[confidence loss = {sum(cls_all_losses) / td_batch_size:.3f}]'
                    f'[localization loss = {sum(box_all_losses) / td_batch_size:.3f}]'
                )
            time_stamp = time.time()

        train_conf_loss_mean = np.divide(conf_loss_sum,
                                         train_update_number_per_epoch)
        train_loc_loss_mean = np.divide(loc_loss_sum,
                                        train_update_number_per_epoch)
        train_total_loss_mean = train_conf_loss_mean + train_loc_loss_mean

        logging.info(
            f"train confidence loss : {train_conf_loss_mean} / train localization loss : {train_loc_loss_mean} / train total loss : {train_total_loss_mean}"
        )

        if i % save_period == 0:

            weight_epoch_path = os.path.join(weight_path, str(i))
            if not os.path.exists(weight_epoch_path):
                os.makedirs(weight_epoch_path)

            # optimizer weight 저장하기
            try:
                trainer.save_states(
                    os.path.join(weight_path, f'{model}-{i:04d}.opt'))
            except Exception as E:
                logging.error(f"optimizer weight export 예외 발생 : {E}")
            else:
                logging.info("optimizer weight export 성공")
            '''
            Hybrid models can be serialized as JSON files using the export function
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            When there are only one input, it will have name data. When there Are more than one inputs, they will be named as data0, data1, etc.
            '''
            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)
            '''
                mxnet1.6.0 버전 에서 AMP 사용시 위에 미리 선언한 prediction을 사용하면 문제가 될 수 있다. 
                -yolo v3, gaussian yolo v3 에서는 문제가 발생한다.
                mxnet 1.5.x 버전에서는 아래와 같이 새로 선언하지 않아도 정상 동작한다.  

                block들은 함수 인자로 보낼 경우 자기 자신이 보내진다.(복사되는 것이 아님)
                export_block_for_cplusplus 에서 prediction 이 hybridize 되면서 
                미리 선언한 prediction도 hybridize화 되면서 symbol 형태가 된다. 
                이런 현상을 보면 아래와같이 다시 선언해 주는게 맞는 것 같다.
            '''
            auxnet = Prediction(from_sigmoid=False,
                                num_classes=num_classes,
                                decode_number=decode_number,
                                nms_thresh=nms_thresh,
                                nms_topk=nms_topk,
                                except_class_thresh=except_class_thresh,
                                multiperclass=multiperclass)
            postnet = PostNet(net=net, auxnet=auxnet)
            try:
                net.export(os.path.join(weight_path, f"{model}"),
                           epoch=i,
                           remove_amp_cast=True)
                net.save_parameters(os.path.join(weight_path,
                                                 f"{i}.params"))  # onnx 추출용
                # network inference, decoder, nms까지 처리됨 - mxnet c++에서 편리함
                export_block_for_cplusplus(
                    path=os.path.join(weight_epoch_path, f"{model}_prepost"),
                    block=postnet,
                    data_shape=tuple(input_size) + tuple((3, )),
                    epoch=i,
                    preprocess=
                    True,  # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨
                    layout='HWC',
                    ctx=context,
                    remove_amp_cast=True)

            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
                net.collect_params().reset_ctx(ctx)

        if i % eval_period == 0 and valid_list:

            conf_loss_sum = 0
            loc_loss_sum = 0

            # loss 구하기
            for image, label, cls_all, box_all, _ in valid_dataloader:

                vd_batch_size = image.shape[0]

                image = gluon.utils.split_and_load(image,
                                                   ctx_list,
                                                   even_split=False)
                label = gluon.utils.split_and_load(label,
                                                   ctx_list,
                                                   even_split=False)
                cls_all = gluon.utils.split_and_load(cls_all,
                                                     ctx_list,
                                                     even_split=False)
                box_all = gluon.utils.split_and_load(box_all,
                                                     ctx_list,
                                                     even_split=False)

                # prediction, target space for Data Parallelism
                cls_losses = []
                box_losses = []

                # gpu N 개를 대비한 코드 (Data Parallelism)
                for img, lb, cls_target, box_target in zip(
                        image, label, cls_all, box_all):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    id, score, bbox = prediction(cls_pred, box_pred, anchor)

                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box,
                                            gt_labels=gt_id)

                    except_ignore_samples = cls_target > -1
                    positive_samples = cls_target > 0
                    positive_numbers = positive_samples.sum()

                    conf_loss = confidence_loss(
                        cls_pred, cls_target,
                        except_ignore_samples.expand_dims(axis=-1))
                    conf_loss = mx.nd.divide(conf_loss, positive_numbers + 1)
                    cls_losses.append(conf_loss.asscalar())

                    loc_loss = localization_loss(
                        box_pred, box_target,
                        positive_samples.expand_dims(axis=-1))
                    box_losses.append(loc_loss.asscalar())

                conf_loss_sum += sum(cls_losses) / vd_batch_size
                loc_loss_sum += sum(box_losses) / vd_batch_size

            valid_conf_loss_mean = np.divide(conf_loss_sum,
                                             valid_update_number_per_epoch)
            valid_loc_loss_mean = np.divide(loc_loss_sum,
                                            valid_update_number_per_epoch)
            valid_total_loss_mean = valid_conf_loss_mean + valid_loc_loss_mean

            logging.info(
                f"valid confidence loss : {valid_conf_loss_mean} / valid localization loss : {valid_loc_loss_mean} / valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list(
            )
            for j, c, p, r in zip(range(len(recall)), class_name, precision,
                                  recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(
                    f"class {j}'s {name} AP : {round(AP * 100, round_position)}%"
                )
                AP_appender.append(AP)

            AP_appender = np.nan_to_num(AP_appender)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i,
                                          auto_open=valid_html_auto_open)
            precision_recall.reset()

            if tensorboard:
                # gpu N 개를 대비한 코드 (Data Parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _ = next(dataloader_iter)

                image = gluon.utils.split_and_load(image,
                                                   ctx_list,
                                                   even_split=False)
                label = gluon.utils.split_and_load(label,
                                                   ctx_list,
                                                   even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 1, 0)

                batch_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    ids, scores, bboxes = prediction(cls_pred, box_pred,
                                                     anchor)

                    for ig, gt_id, gt_box, id, score, bbox in zip(
                            img, gt_ids, gt_boxes, ids, scores, bboxes):
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(
                            std, ctx=ig.context) + mx.nd.array(mean,
                                                               ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)
                        ig = ig.astype(np.uint8)

                        # ground truth box 그리기
                        ground_truth = plot_bbox(
                            ig,
                            gt_box,
                            scores=None,
                            labels=gt_id,
                            thresh=None,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True,
                            colors=ground_truth_colors)
                        # prediction box 그리기
                        prediction_box = plot_bbox(
                            ground_truth,
                            bbox,
                            scores=score,
                            labels=id,
                            thresh=plot_class_thresh,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True)

                        # Tensorboard에 그리기 (height, width, channel) -> (channel, height, width) 를한다.
                        prediction_box = np.transpose(prediction_box,
                                                      axes=(2, 0, 1))
                        batch_image.append(
                            prediction_box)  # (batch, channel, height, width)

                summary.add_image(tag="valid_result",
                                  image=np.array(batch_image),
                                  global_step=i)
                summary.add_scalar(tag="conf_loss",
                                   value={
                                       "train_conf_loss": train_conf_loss_mean,
                                       "valid_conf_loss": valid_conf_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="loc_loss",
                                   value={
                                       "train_loc_loss": train_loc_loss_mean,
                                       "valid_loc_loss": valid_loc_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="total_loss",
                                   value={
                                       "train_total_loss":
                                       train_total_loss_mean,
                                       "valid_total_loss":
                                       valid_total_loss_mean
                                   },
                                   global_step=i)

                for p in net.collect_params().values():
                    summary.add_histogram(tag=p.name,
                                          values=p.data(ctx=ctx_list[0]),
                                          global_step=i,
                                          bins='default')

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
def train(epochs, ctx):
    """Train the module-level ``net`` and log the run with a SummaryWriter.

    Per batch the mean cross-entropy loss is logged as a scalar; per epoch
    the first mini-batch of images, the gradient histogram of every
    parameter, and the training/validation accuracy are logged.  The network
    graph is logged once, after the first epoch.

    Relies on names defined outside this function: ``net``, ``opt``,
    ``train_data``, ``test`` and ``SummaryWriter``.

    :param epochs: number of passes over ``train_data``.
    :param ctx: context the parameters are initialized on and the batches
        are copied to.
    """
    # Collect all parameters from net and its children, then initialize them.
    net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
    net.hybridize()

    # Trainer is for updating parameters with gradient.
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': opt.lr, 'momentum': opt.momentum})
    metric = mx.metric.Accuracy()
    loss = gluon.loss.SoftmaxCrossEntropyLoss()

    # Parameters whose gradients are logged at the end of every epoch.
    params = net.collect_params()

    # define a summary writer that logs data and flushes to the file every 5 seconds
    sw = SummaryWriter(logdir='./logs', flush_secs=5)

    # try/finally so the writer is closed (and its pending events written)
    # even when training or evaluation raises.
    try:
        global_step = 0
        for epoch in range(epochs):
            # reset data iterator and metric at begining of epoch.
            metric.reset()
            for i, (data, label) in enumerate(train_data):
                # Copy data to ctx if necessary
                data = data.as_in_context(ctx)
                label = label.as_in_context(ctx)
                # Start recording computation graph with record() section.
                # Recorded graphs can then be differentiated with backward.
                with autograd.record():
                    output = net(data)
                    L = loss(output, label)
                sw.add_scalar(tag='cross_entropy', value=L.mean().asscalar(),
                              global_step=global_step)
                global_step += 1
                L.backward()

                # take a gradient step with batch_size equal to data.shape[0]
                trainer.step(data.shape[0])
                # update metric at last.
                metric.update([label], [output])

                if i % opt.log_interval == 0 and i > 0:
                    name, train_acc = metric.get()
                    print('[Epoch %d Batch %d] Training: %s=%f' % (epoch, i, name, train_acc))

                # Log the first batch of images of each epoch.  The batch
                # dimension is inferred (-1) so a first batch that is smaller
                # than opt.batch_size does not make the reshape fail.
                if i == 0:
                    sw.add_image('minist_first_minibatch',
                                 data.reshape((-1, 1, 28, 28)), epoch)

            if epoch == 0:
                sw.add_graph(net)

            # logging the gradients of parameters for checking convergence;
            # iterating name/parameter pairs keeps each tag tied to its own
            # gradient without a separate length check.
            for param_name, param in params.items():
                sw.add_histogram(tag=param_name, values=param.grad(),
                                 global_step=epoch, bins=1000)

            name, train_acc = metric.get()
            print('[Epoch %d] Training: %s=%f' % (epoch, name, train_acc))
            # logging training accuracy
            sw.add_scalar(tag='accuracy_curves', value=('train_acc', train_acc),
                          global_step=epoch)

            name, val_acc = test(ctx)
            print('[Epoch %d] Validation: %s=%f' % (epoch, name, val_acc))
            # logging the validation accuracy
            sw.add_scalar(tag='accuracy_curves', value=('valid_acc', val_acc),
                          global_step=epoch)

        sw.export_scalars('scalar_dict.json')
    finally:
        sw.close()
#            out0,*_= net(data,*states)
            out0,*_= net(data,*states)
            loss = loss_function(out0, label)
        loss.backward()
        trainer.step(BATCH_SIZE)
        _loss=loss.asnumpy().mean()
        pbar.set_postfix(loss=str(_loss))
        pbar.update()
        metric.update(label[:,:,0].reshape(-3,1), out0.reshape((-3,-1)))
        sw.add_scalar(tag='cross_entropy', value=_loss, global_step=epoch)
    grads = [i.grad().asnumpy() for i in net.collect_params().values()]
    oldval = newval
    newval = [i.data().asnumpy() for i in net.collect_params().values()]
    diffval = [a-b for a,b in zip(newval,oldval)]
    for i, name in enumerate(net.collect_params().keys()):
      sw.add_histogram(tag='grad'+name, values=grads[i], global_step=epoch, bins=1000)
      sw.add_histogram(tag='val'+name, values=newval[i], global_step=epoch, bins=1000)
      sw.add_histogram(tag='diff'+name, values=diffval[i], global_step=epoch, bins=1000)
    sw.add_scalar(tag=name, value=acc, global_step=epoch)
    name, acc = metric.get()
    print('After epoch {}: {} = {}({}s),\n    final batch loss is {}'.format(epoch + 1, name, acc,time()-t,loss.asnumpy().mean()))
    metric.reset()
    t=time()

'''
net.collect_params().save('test2.param',net.prefix)
net.collect_params().load('test2.param',restore_prefix=b._block._prefix)#可以在这里指定ctx=ctx
with mx.gluon.Block().name_scope() as b:
  net_p=build_model(
  vocab_size = vocab_size,
  embedding_dim=embedding_dim,
示例#11
0
def train(opt, ctx):
    """Train the module-level ``net`` as configured by ``opt`` and export it.

    Steps, in order: create the kvstore, data iterators and trainer
    (optionally restoring trainer states), run a dummy forward pass, set the
    weight-decay multiplier of batch-norm parameters to zero, optionally dump
    a summary/plot of the network, then run the epoch loop with optional
    mxboard logging, validate and checkpoint after every epoch, and finally
    export the network.

    Relies on names defined outside this function, among them ``net``,
    ``metric``, ``logger``, ``test``, ``save_checkpoint``, ``batch_size`` and
    ``lr_steps``.
    NOTE(review): ``batch_size`` and ``lr_steps`` are not assigned anywhere in
    this function; presumably module-level globals - confirm.

    :param opt: parsed options object; the attributes read here include
        ``kvstore``, ``resume_states``, ``plot_network``, ``dry_run``,
        ``write_summary``, ``start_epoch``, ``epochs``, ``lr``, ``lr_factor``,
        ``clip_threshold``, ``clip_threshold_steps``, ``dtype``,
        ``log_interval``, ``dataset``, ``mode``, ``prefix`` and ``bits``.
    :param ctx: a single ``mx.Context`` or a list of contexts.
    """
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    kv = mx.kv.create(opt.kvstore)
    train_data, val_data = get_data_iters(opt, kv.num_workers, kv.rank)
    net.collect_params().reset_ctx(ctx)
    trainer = gluon.Trainer(net.collect_params(),
                            *get_optimizer(opt),
                            kvstore=kv)
    # Truthiness test instead of `is not ''`: identity comparison against a
    # string literal depends on interning (SyntaxWarning on Python >= 3.8),
    # and this also skips loading when the option is None.
    if opt.resume_states:
        trainer.load_states(opt.resume_states)
    loss = gluon.loss.SoftmaxCrossEntropyLoss()

    # dummy forward pass to initialize binary layers
    with autograd.record():
        data, _ = get_dummy_data(opt, ctx[0])
        net(data)

    # set batch norm wd to zero
    for bn_param in net.collect_params('.*batchnorm.*').values():
        bn_param.wd_mult = 0.0

    if opt.plot_network is not None:
        x = mx.sym.var('data')
        sym = net(x)
        with open('{}.txt'.format(opt.plot_network), 'w') as f:
            with redirect_stdout(f):
                mx.viz.print_summary(sym,
                                     shape={"data": get_shape(opt)},
                                     quantized_bitwidth=opt.bits)
        a = mx.viz.plot_network(sym, shape={"data": get_shape(opt)})
        try:
            a.render('{}.gv'.format(opt.plot_network))
        except (OSError, ExecutableNotFound) as e:
            # Rendering is best effort; a failure is logged, not raised.
            logger.error(e)

    if opt.dry_run:
        return

    summary_writer = None
    global_step = 0
    if opt.write_summary:
        from mxboard import SummaryWriter
        summary_writer = SummaryWriter(logdir=opt.write_summary, flush_secs=30)

    def log_param_histograms(step):
        """Log value and gradient histograms of all weight/bias parameters."""
        for name, param in net.collect_params(".*weight|.*bias").items():
            summary_writer.add_histogram(tag=name,
                                         values=param.data(ctx[0]),
                                         global_step=step,
                                         bins=1000)
            summary_writer.add_histogram(tag="%s-grad" % name,
                                         values=param.grad(ctx[0]),
                                         global_step=step,
                                         bins=1000)

    total_time = 0
    num_epochs = 0
    best_acc = [0]
    epoch_time = -1

    # try/finally so the summary writer is always closed; it is not used
    # after the epoch loop.
    try:
        if summary_writer is not None:
            log_param_histograms(global_step)

        q_activations = get_blocks(net, nn.QActivation)
        update_clip_threshold(opt.clip_threshold,
                              opt.start_epoch,
                              opt.clip_threshold_steps,
                              q_activations,
                              check_previous_epochs=True)
        for epoch in range(opt.start_epoch, opt.epochs):
            trainer = update_learning_rate(opt.lr, trainer, epoch,
                                           opt.lr_factor, lr_steps)
            if epoch != opt.start_epoch:
                update_clip_threshold(opt.clip_threshold, epoch,
                                      opt.clip_threshold_steps, q_activations)
            tic = time.time()
            train_data.reset()
            metric.reset()
            btic = time.time()
            for i, batch in enumerate(train_data):
                data = gluon.utils.split_and_load(
                    batch.data[0].astype(opt.dtype),
                    ctx_list=ctx,
                    batch_axis=0)
                label = gluon.utils.split_and_load(
                    batch.label[0].astype(opt.dtype),
                    ctx_list=ctx,
                    batch_axis=0)
                outputs = []
                Ls = []
                with autograd.record():
                    for x, y in zip(data, label):
                        z = net(x)
                        L = loss(z, y)
                        # store the loss and do backward after we have done forward
                        # on all GPUs for better speed on multiple GPUs.
                        Ls.append(L)
                        outputs.append(z)
                    autograd.backward(Ls)
                trainer.step(batch.data[0].shape[0])
                metric.update(label, outputs)
                if opt.log_interval and not (i + 1) % opt.log_interval:
                    name, acc = metric.get()
                    logger.info(
                        'Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f, %s=%f'
                        % (epoch, i, batch_size /
                           (time.time() - btic), name[0], acc[0], name[1],
                           acc[1]))
                    log_progress(get_num_examples(opt.dataset), opt, epoch, i,
                                 time.time() - tic, epoch_time)
                    if summary_writer is not None:
                        summary_writer.add_scalar("batch-%s" % name[0],
                                                  acc[0],
                                                  global_step=global_step)
                        summary_writer.add_scalar("batch-%s" % name[1],
                                                  acc[1],
                                                  global_step=global_step)
                btic = time.time()
                # global_step advances by batch_size per batch, not by one.
                global_step += batch_size

            epoch_time = time.time() - tic

            if summary_writer is not None:
                log_param_histograms(global_step)

            # First epoch will usually be much slower than the subsequent epochs,
            # so don't factor into the average
            if num_epochs > 0:
                total_time = total_time + epoch_time
            num_epochs = num_epochs + 1

            # train
            name, acc = metric.get()
            logger.info('[Epoch %d] training: %s=%f, %s=%f' %
                        (epoch, name[0], acc[0], name[1], acc[1]))
            logger.info('[Epoch %d] time cost: %f' % (epoch, epoch_time))
            if summary_writer is not None:
                summary_writer.add_scalar("epoch",
                                          epoch,
                                          global_step=global_step)
                summary_writer.add_scalar("epoch-time",
                                          epoch_time,
                                          global_step=global_step)
                summary_writer.add_scalar("training-%s" % name[0],
                                          acc[0],
                                          global_step=global_step)
                summary_writer.add_scalar("training-%s" % name[1],
                                          acc[1],
                                          global_step=global_step)

            # test
            name, val_acc = test(ctx, val_data)
            logger.info('[Epoch %d] validation: %s=%f, %s=%f' %
                        (epoch, name[0], val_acc[0], name[1], val_acc[1]))
            if summary_writer is not None:
                summary_writer.add_scalar("validation-%s" % name[0],
                                          val_acc[0],
                                          global_step=global_step)
                summary_writer.add_scalar("validation-%s" % name[1],
                                          val_acc[1],
                                          global_step=global_step)

            # save model if meet requirements
            save_checkpoint(trainer, epoch, val_acc[0], best_acc)
    finally:
        if summary_writer is not None:
            summary_writer.close()

    if num_epochs > 1:
        print('Average epoch time: {}'.format(
            float(total_time) / (num_epochs - 1)))

    if opt.mode != 'hybrid':
        net.hybridize()
        # dummy forward pass to save model
        with autograd.record():
            data, _ = get_dummy_data(opt, ctx[0])
            net(data)
    net.export(os.path.join(opt.prefix,
                            "image-classifier-{}bit".format(opt.bits)),
               epoch=0)
示例#12
0
def train(epochs, ctx):
    """Train the module-level ``net``, log the run with mxboard, export it.

    Per batch the mean cross-entropy loss is logged; per epoch the first
    mini-batch of images, gradient histograms of the weight/bias parameters
    and the training/validation accuracy are logged.  After training the
    network is exported and its graph is added to the summary.

    Relies on names defined outside this function: ``net``, ``opt``,
    ``train_data``, ``dummy_data``, ``test`` and ``SummaryWriter``.

    :param epochs: number of passes over ``train_data``.
    :param ctx: context the parameters are initialized on and the batches
        are copied to.
    """
    # Collect all parameters from net and its children, then initialize them.
    net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
    # Trainer is for updating parameters with gradient.
    trainer = gluon.Trainer(net.collect_params(), 'adam')
    metric = mx.metric.Accuracy()
    loss = gluon.loss.SoftmaxCrossEntropyLoss()

    # do forward pass with dummy data without backwards pass to initialize binary layers
    with autograd.record():
        data, label = dummy_data(ctx)
        loss(net(data), label)

    if opt.hybridize:
        net.hybridize()

    # parameters whose gradients are logged at the end of each epoch
    log_param_filter = ".*weight|.*bias"
    params = net.collect_params(log_param_filter)

    sw = SummaryWriter(logdir='./logs/{}-{}bits/'.format(
        "symbolic" if opt.hybridize else "gluon", opt.bits),
                       flush_secs=5)

    # try/finally so the writer is closed even if training or export raises.
    try:
        global_step = 0
        for epoch in range(epochs):
            # reset data iterator and metric at begining of epoch.
            metric.reset()
            for i, (data, label) in enumerate(train_data):
                # Copy data to ctx if necessary
                data = data.as_in_context(ctx)
                label = label.as_in_context(ctx)
                # Start recording computation graph with record() section.
                # Recorded graphs can then be differentiated with backward.
                with autograd.record():
                    output = net(data)
                    L = loss(output, label)
                # backward is issued after leaving the recording scope, as in
                # the usual Gluon training-loop pattern.
                L.backward()
                sw.add_scalar(tag='cross_entropy',
                              value=L.mean().asscalar(),
                              global_step=global_step)
                global_step += 1
                # take a gradient step with batch_size equal to data.shape[0]
                trainer.step(data.shape[0])
                # update metric at last.
                metric.update([label], [output])

                if i % opt.log_interval == 0 and i > 0:
                    name, acc = metric.get()
                    print('[Epoch %d Batch %d] Training: %s=%f' %
                          (epoch, i, name, acc))

                # Batch dimension is inferred (-1) so a first batch smaller
                # than opt.batch_size does not make the reshape fail.
                if i == 0:
                    sw.add_image('mnist_first_minibatch',
                                 data.reshape((-1, 1, 28, 28)), epoch)

            # logging the gradients of parameters for checking convergence;
            # name/parameter pairs keep each tag tied to its own gradient.
            for param_name, param in params.items():
                sw.add_histogram(tag=param_name,
                                 values=param.grad(),
                                 global_step=global_step,
                                 bins=1000)

            name, acc = metric.get()
            print('[Epoch %d] Training: %s=%f' % (epoch, name, acc))
            sw.add_scalar(tag='train_acc', value=acc, global_step=global_step)

            name, val_acc = test(ctx)
            print('[Epoch %d] Validation: %s=%f' % (epoch, name, val_acc))
            sw.add_scalar(tag='valid_acc',
                          value=val_acc,
                          global_step=global_step)

        # Hybridize and run one forward pass before export when the network
        # was trained without hybridization.
        if not opt.hybridize:
            net.hybridize()
            with autograd.record():
                data, label = dummy_data(ctx)
                loss(net(data), label)
        net.export("mnist-lenet-{}-{}-bit".format(
            "symbolic" if opt.hybridize else "gluon", opt.bits),
                   epoch=1)
        sw.add_graph(net)
    finally:
        sw.close()
class TrainerAgentGluon:  # Probably needs refactoring
    """Main training loop.

    Trains a gluon network that returns ``[value_out, policy_out]`` on the
    training part files and evaluates it on the validation data every
    ``train_config.batch_steps`` processed batches. Metrics are printed to the
    console and, if ``train_config.log_metrics_to_tensorboard`` is set, written
    to a tensorboard event file as well.
    """
    def __init__(
        self,
        net,
        val_data,
        train_config: TrainConfig,
        train_objects: TrainObjects,
        use_rtpt: bool,
    ):
        """
        Class for training the neural network.
        :param net: The NN with loaded parameters that shall be trained.
        :param val_data: The validation data loaded with gluon DataLoader.
        :param train_config: An instance of the TrainConfig data class.
        :param train_objects: An instance of the TrainObjects data class.
        :param use_rtpt: If True, an RTPT object will be created and modified within this class.
        :raises NotImplementedError: If train_config.optimizer_name is not "nag".
        """
        # Too many instance attributes (29/7) - Too many arguments (24/5) - Too many local variables (25/15)
        # Too few public methods (1/2)
        self.tc = train_config
        self.to = train_objects
        if self.to.metrics is None:
            self.to.metrics = {}
        self._ctx = get_context(train_config.context, train_config.device_id)
        self._net = net
        self._graph_exported = False
        self._val_data = val_data
        # The writer only exists when tensorboard logging is enabled; every use of it
        # must therefore be guarded by self.tc.log_metrics_to_tensorboard.
        self.sum_writer = None
        if self.tc.log_metrics_to_tensorboard:
            # define a summary writer that logs data and flushes to the file every 5 seconds
            self.sum_writer = SummaryWriter(logdir=self.tc.export_dir + "logs",
                                            flush_secs=5,
                                            verbose=False)
        # Define the two loss functions
        self._softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss(
            sparse_label=self.tc.sparse_policy_label)
        self._l2_loss = gluon.loss.L2Loss()
        if self.tc.optimizer_name != "nag":
            raise NotImplementedError(
                "The requested optimizer %s Isn't supported yet." %
                self.tc.optimizer_name)
        self._trainer = gluon.Trainer(
            self._net.collect_params(),
            "nag",
            {
                "learning_rate": self.to.lr_schedule(0),
                "momentum": self.to.momentum_schedule(0),
                "wd": self.tc.wd,
            },
        )

        # collect parameter names for logging the gradients of parameters in each epoch
        self._params = self._net.collect_params()
        self._param_names = self._params.keys()
        # define a list which describes the order of the processed batches
        self.ordering = list(range(self.tc.nb_parts))

        self.use_rtpt = use_rtpt
        self.rtpt = None  # Set this later in training function

    def _log_metrics(self, metric_values, global_step, prefix="train_"):
        """
        Logs a dictionary object of metric value to the console and to tensorboard
        if log_metrics_to_tensorboard is set to true in the train config
        :param metric_values: Dictionary object storing the current metrics
        :param global_step: X-Position point of all metric entries
        :param prefix: Used for labelling the metrics
        :return:
        """
        # the prefix without underscores names the curve inside each metric's plot
        curve_name = prefix.replace("_", "")
        for name, value in metric_values.items():  # show the metric stats
            print(" - %s%s: %.4f" % (prefix, name, value), end="")
            # add the metrics to the tensorboard event file
            if self.tc.log_metrics_to_tensorboard:
                self.sum_writer.add_scalar(name, [curve_name, value],
                                           global_step)

    def _finish_training(self, t_s):
        """
        Prints the total elapsed training time and closes the tensorboard writer if one is in use
        :param t_s: Timestamp (as returned by time()) at which the training was started
        :return:
        """
        print()
        print("Elapsed time for training(hh:mm:ss): " +
              str(datetime.timedelta(seconds=round(time() - t_s))))

        if self.tc.log_metrics_to_tensorboard:
            self.sum_writer.close()

    def _process_on_data_plane_file(self, train_data, batch_proc_tmp):
        """
        Runs one optimization pass over all batches of the given data loader
        :param train_data: Iterable yielding (data, value_label, policy_label) batches
        :param batch_proc_tmp: Number of batches processed so far
        :return: Tuple of the updated batch counter and the current value of the "value_loss" metric
        """
        for data, value_label, policy_label in train_data:
            data = data.as_in_context(self._ctx)
            value_label = value_label.as_in_context(self._ctx)
            policy_label = policy_label.as_in_context(self._ctx)

            with autograd.record():
                [value_out, policy_out] = self._net(data)
                if self.tc.select_policy_from_plane and not self.tc.is_policy_from_plane_data:
                    policy_out = policy_out[:, FLAT_PLANE_IDX]
                value_loss = self._l2_loss(value_out, value_label)
                policy_loss = self._softmax_cross_entropy(
                    policy_out, policy_label)
                # weight the components of the combined loss
                combined_loss = (
                    self.tc.val_loss_factor * value_loss.sum() +
                    self.tc.policy_loss_factor * policy_loss.sum())
                # update a dummy metric to see a proper progress bar
                self.to.metrics["value_loss"].update(preds=value_out,
                                                     labels=value_label)

            combined_loss.backward()
            self._trainer.step(data.shape[0])
            batch_proc_tmp += 1
        return batch_proc_tmp, self.to.metrics["value_loss"].get()[1]

    def train(self, cur_it=None):  # Probably needs refactoring
        """
        Training model
        :param cur_it: Current iteration which is used for the learning rate and momentum schedule.
         If set to None it will be initialized
        :return: The result of return_metrics_and_stop_training() for the last and the best evaluation
        :raises Exception: If there is no training part to iterate over
        """
        # Too many local variables (44/15) - Too many branches (18/12) - Too many statements (108/50)
        # set a custom seed for reproducibility
        random.seed(self.tc.seed)
        # define and initialize the variables which will be used
        t_s = time()
        # predefine the local variables that will be used in the training loop
        val_loss_best = val_p_acc_best = k_steps_best = val_metric_values_best = old_label = value_out = None
        epoch = batch_proc_tmp = 0  # track on how many batches have been processed in this epoch
        k_steps = self.tc.k_steps_initial  # counter for thousands steps
        # calculate how many log states will be processed
        k_steps_end = round(self.tc.total_it / self.tc.batch_steps)
        # we use k-steps instead of epochs here
        if k_steps_end == 0:
            k_steps_end = 1

        if self.use_rtpt:
            self.rtpt = RTPT(name_initials=self.tc.name_initials,
                             experiment_name='crazyara',
                             max_iterations=k_steps_end -
                             self.tc.k_steps_initial)
        if cur_it is None:
            cur_it = self.tc.k_steps_initial * 1000
        nb_spikes = 0  # count the number of spikes that have been detected
        # initialize the loss to compare with, with a very high value
        old_val_loss = np.inf
        graph_exported = False  # create a state variable to check if the net architecture has been reported yet

        if not self.ordering:  # safety check to prevent eternal loop
            raise Exception(
                "You must have at least one part file in your planes-dataset directory!"
            )

        if self.use_rtpt:
            # Start the RTPT tracking
            self.rtpt.start()

        while True:  # Too many nested blocks (7/5)
            # reshuffle the ordering of the training game batches (shuffle works in place)
            random.shuffle(self.ordering)

            epoch += 1
            logging.info("EPOCH %d", epoch)
            logging.info("=========================")
            t_s_steps = time()

            for part_id in tqdm_notebook(self.ordering):
                # load one chunk of the dataset from memory
                _, x_train, yv_train, yp_train, _, _ = load_pgn_dataset(
                    dataset_type="train",
                    part_id=part_id,
                    normalize=self.tc.normalize,
                    verbose=False,
                    q_value_ratio=self.tc.q_value_ratio)

                yp_train = prepare_policy(
                    y_policy=yp_train,
                    select_policy_from_plane=self.tc.select_policy_from_plane,
                    sparse_policy_label=self.tc.sparse_policy_label,
                    is_policy_from_plane_data=self.tc.is_policy_from_plane_data
                )

                # update the train_data object
                train_dataset = gluon.data.ArrayDataset(
                    nd.array(x_train), nd.array(yv_train), nd.array(yp_train))
                train_data = gluon.data.DataLoader(
                    train_dataset,
                    batch_size=self.tc.batch_size,
                    shuffle=True,
                    num_workers=self.tc.cpu_count)

                for data, value_label, policy_label in train_data:
                    data = data.as_in_context(self._ctx)
                    value_label = value_label.as_in_context(self._ctx)
                    policy_label = policy_label.as_in_context(self._ctx)

                    # update a dummy metric to see a proper progress bar
                    #  (the metrics will get evaluated at the end of 100k steps)
                    # The update uses the label/output pair of the previous batch.
                    if batch_proc_tmp > 0:
                        self.to.metrics["value_loss"].update(
                            old_label, value_out)

                    old_label = value_label
                    with autograd.record():
                        # NOTE(review): unlike _process_on_data_plane_file, the policy output
                        # is not indexed with FLAT_PLANE_IDX here - confirm this is intended.
                        [value_out, policy_out] = self._net(data)
                        value_loss = self._l2_loss(value_out, value_label)
                        policy_loss = self._softmax_cross_entropy(
                            policy_out, policy_label)
                        # weight the components of the combined loss
                        combined_loss = (
                            self.tc.val_loss_factor * value_loss +
                            self.tc.policy_loss_factor * policy_loss)

                    combined_loss.backward()
                    learning_rate = self.to.lr_schedule(
                        cur_it)  # update the learning rate
                    self._trainer.set_learning_rate(learning_rate)
                    momentum = self.to.momentum_schedule(
                        cur_it)  # update the momentum
                    self._trainer._optimizer.momentum = momentum
                    self._trainer.step(data.shape[0])
                    cur_it += 1
                    batch_proc_tmp += 1
                    # add the graph representation of the network to the tensorboard log file
                    if not graph_exported and self.tc.log_metrics_to_tensorboard:
                        self.sum_writer.add_graph(self._net)
                        graph_exported = True

                    if batch_proc_tmp < self.tc.batch_steps:
                        # not enough batches processed yet for the next evaluation
                        continue

                    # show metrics every thousands steps
                    # update batch_proc_tmp counter by subtracting the batch_steps
                    batch_proc_tmp = batch_proc_tmp - self.tc.batch_steps
                    ms_step = (
                        (time() - t_s_steps) /
                        self.tc.batch_steps) * 1000  # measure elapsed time
                    # update the counters
                    k_steps += 1
                    logging.info("Step %dK/%dK - %dms/step", k_steps,
                                 k_steps_end, ms_step)
                    logging.info("-------------------------")
                    logging.debug("Iteration %d/%d", cur_it,
                                  self.tc.total_it)
                    logging.debug("lr: %.7f - momentum: %.7f",
                                  learning_rate, momentum)
                    apply_select_policy_from_plane = (
                        self.tc.select_policy_from_plane
                        and not self.tc.is_policy_from_plane_data)
                    train_metric_values = evaluate_metrics(
                        self.to.metrics,
                        train_data,
                        self._net,
                        nb_batches=10,  #25,
                        ctx=self._ctx,
                        sparse_policy_label=self.tc.sparse_policy_label,
                        apply_select_policy_from_plane=apply_select_policy_from_plane)
                    val_metric_values = evaluate_metrics(
                        self.to.metrics,
                        self._val_data,
                        self._net,
                        nb_batches=None,
                        ctx=self._ctx,
                        sparse_policy_label=self.tc.sparse_policy_label,
                        apply_select_policy_from_plane=apply_select_policy_from_plane)
                    if self.use_rtpt:
                        # update process title according to loss
                        self.rtpt.step(
                            subtitle=
                            f"loss={val_metric_values['loss']:2.2f}")

                    # check for spikes
                    spike_detected = self.tc.use_spike_recovery and (
                        old_val_loss * self.tc.spike_thresh <
                        val_metric_values["loss"]
                        or np.isnan(val_metric_values["loss"]))

                    if spike_detected:
                        nb_spikes += 1
                        logging.warning(
                            "Spike %d/%d occurred - val_loss: %.3f",
                            nb_spikes,
                            self.tc.max_spikes,
                            val_metric_values["loss"],
                        )
                        if nb_spikes >= self.tc.max_spikes:
                            logging.debug(
                                "The maximum number of spikes has been reached. Stop training."
                            )
                            self._finish_training(t_s)
                            return return_metrics_and_stop_training(
                                k_steps, val_metric_values, k_steps_best,
                                val_metric_values_best)

                        if k_steps_best is None:
                            # A spike (e.g. a NaN loss) on the very first evaluation happens
                            # before any checkpoint was recorded, so there is nothing to load.
                            logging.warning(
                                "No checkpoint available yet - cannot recover from spike"
                            )
                        else:
                            logging.debug("Recover to latest checkpoint")
                            model_path = self.tc.export_dir + "weights/model-%.5f-%.3f-%04d.params" % (
                                val_loss_best,
                                val_p_acc_best,
                                k_steps_best,
                            )  # Load the best model once again
                            logging.debug("load current best model:%s",
                                          model_path)
                            self._net.load_parameters(model_path,
                                                      ctx=self._ctx)
                            k_steps = k_steps_best
                            logging.debug("k_step is back at %d", k_steps_best)
                        # print the elapsed time
                        t_delta = time() - t_s_steps
                        print(" - %.ds" % t_delta)
                        t_s_steps = time()
                        continue

                    # update the val_loss_value to compare with using spike recovery
                    old_val_loss = val_metric_values["loss"]
                    # log the metric values to tensorboard
                    self._log_metrics(train_metric_values,
                                      global_step=k_steps,
                                      prefix="train_")
                    self._log_metrics(val_metric_values,
                                      global_step=k_steps,
                                      prefix="val_")

                    # histograms can only be written if a summary writer was created
                    if self.tc.export_grad_histograms and self.tc.log_metrics_to_tensorboard:
                        # logging the gradients of parameters for checking convergence
                        for name in self._param_names:
                            if "bn" not in name and "batch" not in name and name != "policy_flat_plane_idx":
                                self.sum_writer.add_histogram(
                                    tag=name,
                                    values=self._params[name].grad(),
                                    global_step=k_steps,
                                    bins=20)

                    # check if a new checkpoint shall be created
                    if val_loss_best is None or val_metric_values[
                            "loss"] < val_loss_best:
                        # update val_loss_best
                        val_loss_best = val_metric_values["loss"]
                        val_p_acc_best = val_metric_values["policy_acc"]
                        val_metric_values_best = val_metric_values
                        k_steps_best = k_steps

                        if self.tc.export_weights:
                            prefix = self.tc.export_dir + "weights/model-%.5f-%.3f" \
                                     % (val_loss_best, val_p_acc_best)
                            # the export function saves both the architecture and the weights
                            self._net.export(prefix, epoch=k_steps_best)
                            print()
                            logging.info(
                                "Saved checkpoint to %s-%04d.params",
                                prefix, k_steps_best)

                    # print the elapsed time
                    t_delta = time() - t_s_steps
                    print(" - %.ds" % t_delta)
                    t_s_steps = time()

                    if self.tc.log_metrics_to_tensorboard:
                        # log the samples per second metric to tensorboard
                        self.sum_writer.add_scalar(
                            tag="samples_per_second",
                            value={
                                "hybrid_sync":
                                data.shape[0] * self.tc.batch_steps /
                                t_delta
                            },
                            global_step=k_steps,
                        )
                        # log the current learning rate
                        self.sum_writer.add_scalar(
                            tag="lr",
                            value=self.to.lr_schedule(cur_it),
                            global_step=k_steps)
                        # log the current momentum value
                        self.sum_writer.add_scalar(
                            tag="momentum",
                            value=self.to.momentum_schedule(cur_it),
                            global_step=k_steps)

                    if cur_it >= self.tc.total_it:
                        logging.debug(
                            "The number of given iterations has been reached"
                        )
                        self._finish_training(t_s)
                        return return_metrics_and_stop_training(
                            k_steps, val_metric_values, k_steps_best,
                            val_metric_values_best)
示例#14
0
class GluonLearner():
    """Trains a Gluon model with accuracy tracking and benchmarks its inference speed.

    Progress is reported through the ``logging`` module and, optionally, to
    tensorboard via mxboard. The final parameters are stored under
    ``../logs/checkpoints/<run_id>.params`` relative to this file.
    """
    def __init__(self, model, run_id, gpu_idxs=None, hybridize=False, tensorboard_logging=False):
        """

        Parameters
        ----------
        model: HybridBlock
        run_id: str
            Identifier of this run; used as the tensorboard sub-folder name and
            as the base name of the checkpoint file.
        gpu_idxs: None or list of ints
            If None will set context to CPU.
            If list of ints, will set context to given GPUs.
        hybridize: bool
            If True, `model.hybridize()` is called before training.
        tensorboard_logging: bool
            If True, an mxboard SummaryWriter is created and used by `fit`.
        """
        logging.info("Using Gluon Learner.")
        self.model = model
        self.run_id = run_id
        if hybridize:
            self.model.hybridize()
            logging.info("Hybridized model.")
        self.context = get_context(gpu_idxs)
        self.tensorboard_logging = tensorboard_logging
        if self.tensorboard_logging:
            # imported lazily so mxboard is only required when logging is requested
            from mxboard import SummaryWriter
            current_folder = os.path.dirname(os.path.realpath(__file__))
            tensorboard_folder = os.path.join(current_folder, "..", "logs", "tensorboard")
            summary_filepath = os.path.join(tensorboard_folder, self.run_id)
            self.writer = SummaryWriter(logdir=summary_filepath)

    def _checkpoint_filepath(self):
        """Return the path of this run's parameter file (../logs/checkpoints/<run_id>.params)."""
        current_folder = os.path.dirname(os.path.realpath(__file__))
        checkpoint_folder = os.path.join(current_folder, "..", "logs", "checkpoints")
        return os.path.join(checkpoint_folder, self.run_id + '.params')

    @staticmethod
    def _crosses_log_boundary(samples_processed, batch_size, log_frequency):
        """Return True if a multiple of `log_frequency` is contained in the current batch."""
        return (samples_processed // log_frequency) != ((samples_processed + batch_size) // log_frequency)

    def fit(self, train_data, valid_data,
            epochs=300,
            lr=None, lr_schedule=None,
            initializer=mx.init.Xavier(),
            optimizer=None,
            kvstore='device',
            log_frequency=10000,
            early_stopping_criteria=None
        ):
        """
        Uses accuracy as training and validation metric.

        Parameters
        ----------
        train_data : iterable of (data, label) batches
            Contains training data
        valid_data : iterable of (data, label) batches
            Contains validation data
        epochs: int
            Number of epochs to run, unless stopped early by early_stopping_criteria.
        lr: float or int
            Learning rate. Must be given if (and only if) `lr_schedule` is None.
        lr_schedule : dict
            Contains change points of learning rate.
            Key is the epoch and value is the learning rate.
            Must contain epoch 0.
        initializer : mxnet.initializer.Initializer
        optimizer: mxnet.optimizer.Optimizer
            Defaults to be `mx.optimizer.SGD(learning_rate=lr_schedule[0], momentum=0.9)`
        kvstore : str, optional
        log_frequency : int, optional
            Number of samples between logs
        early_stopping_criteria: function (float -> boolean)
            Given validation accuracy, should return True if training should be stopped early.

        Returns
        -------

        None

        Progress is reported through the logging module; the final model
        parameters are saved to the checkpoint file of this run.

        """

        if lr_schedule is None:
            assert lr is not None, "lr must be defined if not using lr_schedule"
            lr_schedule = {0: lr}
        else:
            assert lr is None, "lr should not be defined if using lr_schedule"
            assert 0 in lr_schedule, "lr for epoch 0 must be defined in lr_schedule"

        self.model.initialize(initializer, ctx=self.context)
        if optimizer is None:
            optimizer = mx.optimizer.SGD(learning_rate=lr_schedule[0], momentum=0.9)
        trainer = mx.gluon.Trainer(params=self.model.collect_params(), optimizer=optimizer, kvstore=kvstore)
        train_metric = mx.metric.Accuracy()
        criterion = mx.gluon.loss.SoftmaxCrossEntropyLoss()
        max_val_acc = {'val_acc': 0, 'trn_acc': 0, 'epoch': 0}

        for epoch in range(epochs):
            epoch_tick = time.time()

            # update learning rate
            if epoch in lr_schedule:
                trainer.set_learning_rate(lr_schedule[epoch])
                logging.info("Epoch {}, Changed learning rate.".format(epoch))
            logging.info('Epoch {}, Learning rate={}'.format(epoch, trainer.learning_rate))
            if self.tensorboard_logging:
                self.writer.add_scalar(tag='learning_rate', value=trainer.learning_rate, global_step=epoch+1)

            train_metric.reset()
            samples_processed = 0
            for batch_idx, (data, label) in enumerate(train_data):
                batch_tick = time.time()
                batch_size = data.shape[0]

                # partition data across all devices in context
                data = mx.gluon.utils.split_and_load(data, ctx_list=self.context, batch_axis=0)
                label = mx.gluon.utils.split_and_load(label, ctx_list=self.context, batch_axis=0)

                y_pred = []
                losses = []
                with mx.autograd.record():
                    # calculate loss on each partition of data
                    for x_part, y_true_part in zip(data, label):
                        y_pred_part = self.model(x_part)
                        loss = criterion(y_pred_part, y_true_part)
                        # store the losses and do backward after we have done forward on all GPUs.
                        # for better performance on multiple GPUs.
                        losses.append(loss)
                        y_pred.append(y_pred_part)
                    for loss in losses:
                        loss.backward()
                trainer.step(batch_size)
                train_metric.update(label, y_pred)

                # log to tensorboard (on first batch)
                # x_part, y_pred_part and loss refer to the last partition of the batch.
                if self.tensorboard_logging and batch_idx == 0:
                    self.writer.add_histogram(tag='input', values=x_part, global_step=epoch + 1, bins=100)
                    self.writer.add_histogram(tag='output', values=y_pred_part, global_step=epoch + 1, bins=100)
                    self.writer.add_histogram(tag='loss', values=loss, global_step=epoch + 1, bins=100)
                    self.writer.add_image(tag="batch", image=x_part, global_step=epoch + 1)

                # log batch speed (if a multiple of log_frequency is contained in the last batch)
                log_batch = self._crosses_log_boundary(samples_processed, batch_size, log_frequency)
                if ((batch_idx >= 1) and log_batch):
                    # batch estimate, not averaged over multiple batches
                    speed = batch_size / (time.time() - batch_tick)
                    logging.info('Epoch {}, Batch {}, Speed={:.2f} images/second'.format(epoch, batch_idx, speed))
                samples_processed += batch_size

            # log training accuracy
            _, trn_acc = train_metric.get()
            logging.info('Epoch {}, Training accuracy={}'.format(epoch, trn_acc))
            if self.tensorboard_logging:
                self.writer.add_scalar(tag='accuracy/training', value=trn_acc*100, global_step=epoch+1)

            # log validation accuracy
            val_acc = evaluate_accuracy(valid_data, self.model, ctx=self.context)
            logging.info('Epoch {}, Validation accuracy={}'.format(epoch, val_acc))
            if self.tensorboard_logging:
                self.writer.add_scalar(tag='accuracy/validation', value=val_acc * 100, global_step=epoch + 1)
            # log maximum validation accuracy
            if val_acc > max_val_acc['val_acc']:
                max_val_acc = {'val_acc': val_acc, 'trn_acc': trn_acc, 'epoch': epoch}
            logging.info(("Epoch {}, Max validation accuracy={} @ "
                          "Epoch {} (with training accuracy={})").format(epoch, max_val_acc['val_acc'],
                                                                         max_val_acc['epoch'], max_val_acc['trn_acc']))

            # log duration of epoch
            logging.info('Epoch {}, Duration={}'.format(epoch, time.time() - epoch_tick))

            if early_stopping_criteria and early_stopping_criteria(val_acc):
                logging.info("Epoch {}, Reached early stopping target, stopping training.".format(epoch))
                break

        # make sure the events of this run reach the event file; the writer is
        # kept open so that fit can be called again on the same learner
        if self.tensorboard_logging:
            self.writer.flush()

        # checkpoint final model
        checkpoint_filepath = self._checkpoint_filepath()
        checkpoint_folder = os.path.dirname(checkpoint_filepath)
        # create the folder on demand so a finished training run is not lost
        # because the checkpoint directory is missing
        if not os.path.isdir(checkpoint_folder):
            os.makedirs(checkpoint_folder)
        self.model.save_params(checkpoint_filepath)


    def predict(self,
              test_data,
              log_frequency=10000):
        """
        Loads the checkpoint of this run and measures inference speed on `test_data`.

        The predictions themselves are discarded; only latency and throughput
        are logged.

        Parameters
        ----------
        test_data : iterable of (data, label) batches
        log_frequency : int, optional
            Number of samples between logs
        """
        logging.info('Starting inference.')
        self.model.load_params(self._checkpoint_filepath(), ctx=self.context)

        # the first batches are excluded from the speed logs
        warm_up_period = 5
        samples_processed = 0
        for batch_idx, (data, label) in enumerate(test_data):
            batch_tick = time.time()
            batch_size = data.shape[0]

            # partition data across all devices in context
            data = mx.gluon.utils.split_and_load(data, ctx_list=self.context, batch_axis=0)
            label = mx.gluon.utils.split_and_load(label, ctx_list=self.context, batch_axis=0)

            # run the forward pass on each partition of data
            y_pred = []
            for x_part, y_true_part in zip(data, label):
                y_pred_part = self.model(x_part)
                y_pred.append(y_pred_part)

            # block until all pending computations are done before taking the time
            mx.nd.waitall()
            batch_tock = time.time()
            # log batch speed (if a multiple of log_frequency is contained in the last batch)
            log_batch = self._crosses_log_boundary(samples_processed, batch_size, log_frequency)
            if ((batch_idx >= warm_up_period) and log_batch):
                # batch estimate, not averaged over multiple batches
                latency = (batch_tock - batch_tick) # seconds
                speed = batch_size / latency
                logging.info('Inference. Batch {}, Latency={:.5f} ms, Speed={:.2f} images/second'.format(batch_idx, latency * 1000, speed))
            samples_processed += batch_size

        logging.info('Completed inference.')
示例#15
0
def train(net, train_data, valid_data, num_epochs, lr, wd, momentum, ctx):
    """Train ``net`` with Adam, log progress to MXBoard and save the results.

    Per batch the mean cross-entropy is written to the ``cross_entropy``
    scalar. Per epoch the gradient histograms, the training accuracy and,
    when validation data is given, the validation accuracy are written.
    After training the loss curves are saved to ``pngname``, the parameters
    to ``modelparams`` and the network is exported under the prefix
    ``'model'``.

    Relies on names defined elsewhere in the file: ``opt``, ``lr_steps``,
    ``update_learning_rate``, ``softmax_cross_entropy``, ``test``,
    ``get_loss``, ``pngname`` and ``modelparams``.

    :param net: Gluon network to train
    :param train_data: iterable of ``(data, label)`` batches supporting ``len()``
    :param valid_data: validation data, or None to skip every validation step
    :param num_epochs: number of passes over ``train_data``
    :param lr: base learning rate, also handed to ``update_learning_rate``
    :param wd: weight decay passed to the optimizer
    :param momentum: unused by this function; kept so existing callers still work
    :param ctx: context the batches are copied to
    :return: None
    """
    trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr, 'wd': wd})
    metric = mx.metric.Accuracy()  # accumulates the training accuracy

    # per-epoch losses used to draw the loss curves at the end
    train_loss = []
    test_loss = []

    # parameters whose gradients are logged after each epoch
    params = net.collect_params()
    # summary writer that flushes to the event file every 2 seconds
    sw = SummaryWriter(logdir='./logs', flush_secs=2)
    global_step = 0
    prev_time = datetime.datetime.now()  # start time of the current epoch
    try:
        for epoch in range(num_epochs):
            # learning-rate decay schedule
            trainer = update_learning_rate(lr, trainer, epoch, opt.lr_factor, lr_steps)
            _loss = 0.
            metric.reset()
            for i, (data, label) in enumerate(train_data):
                # move labels and data to the training context
                label = label.as_in_context(ctx)
                data = data.as_in_context(ctx)
                # record the computation graph for the forward pass
                with autograd.record():
                    output = net(data)
                    loss = softmax_cross_entropy(output, label)
                # fetch the scalar once; it feeds both the log and the epoch sum
                batch_loss = loss.mean().asscalar()
                sw.add_scalar(tag='cross_entropy', value=batch_loss, global_step=global_step)
                global_step += 1
                loss.backward()
                trainer.step(opt.batch_size)
                metric.update([label], [output])
                if i % 100 == 0 and i > 0:
                    name, acc = metric.get()
                    print('[Epoch %d Batch %d] Training: %s=%f' % (epoch, i, name, acc))
                _loss += batch_loss

            # ---- MXBoard logging for this epoch ----
            if epoch == 0:
                sw.add_graph(net)
            # log the gradients for checking convergence; parameters with
            # grad_req == 'null' have no gradient buffer and grad() would raise
            for name, param in params.items():
                if param.grad_req == 'null':
                    continue
                sw.add_histogram(tag=name, values=param.grad(), global_step=epoch, bins=1000)

            # training accuracy
            name, acc = metric.get()
            print('[Epoch %d] Training: %s=%f' % (epoch, name, acc))
            sw.add_scalar(tag='train_acc', value=acc, global_step=epoch)

            # validation accuracy, only when there is something to validate on
            if valid_data is not None:
                name, val_acc = test(valid_data, ctx, net)
                print('[Epoch %d] Validation: %s=%f' % (epoch, name, val_acc))
                sw.add_scalar(tag='valid_acc', value=val_acc, global_step=epoch)

            cur_time = datetime.datetime.now()
            # format the epoch duration as hh:mm:ss; total_seconds() keeps
            # whole days that the .seconds attribute would drop
            elapsed = int((cur_time - prev_time).total_seconds())
            h, remainder = divmod(elapsed, 3600)
            m, s = divmod(remainder, 60)
            time_str = "Time %02d:%02d:%02d" % (h, m, s)
            epoch_loss = _loss / len(train_data)
            train_loss.append(epoch_loss)

            # report training loss and, if available, validation loss
            if valid_data is not None:
                valid_loss = get_loss(valid_data, net, ctx)
                epoch_str = ("Epoch %d. Train loss: %f, Valid loss %f, "
                             % (epoch, epoch_loss, valid_loss))
                test_loss.append(valid_loss)
            else:
                epoch_str = ("Epoch %d. Train loss: %f, "
                             % (epoch, epoch_loss))
            # print duration, losses and learning rate of the epoch
            prev_time = cur_time
            print(epoch_str + time_str + ', lr ' + str(trainer.learning_rate))
    finally:
        # close the event file even when training is interrupted
        sw.close()

    # plot the loss curves once training has finished
    plt.plot(train_loss, 'r')
    if valid_data is not None:
        plt.plot(test_loss, 'g')
    plt.legend(['Train_Loss', 'Test_Loss'], loc=2)

    # save the figure, the parameters and the exported model
    plt.savefig(pngname, dpi=1000)
    net.collect_params().save(modelparams)
    net.export('model')
示例#16
0
class TrainerAgent:
    def __init__(
        self,
        net,
        val_data,
        nb_parts,
        lr_schedule,
        momentum_schedule,
        total_it,
        wd=0.0001,
        batch_steps=1000,
        k_steps_initial=0,
        cpu_count=16,
        batch_size=2048,
        normalize=True,
        export_weights=True,
        export_grad_histograms=True,
        log_metrics_to_tensorboard=True,
        ctx=mx.gpu(),
        metrics=None,
        use_spike_recovery=True,
        max_spikes=5,
        spike_thresh=1.5,
        seed=42,
        val_loss_factor=0.01,
        policy_loss_factor=0.99,
    ):
        """
        Stores the training configuration and builds the summary writer, the loss functions and the trainer

        :param net: Gluon network object
        :param val_data: Gluon dataloader object used for validation
        :param nb_parts: Sets how many different part files exist in the train directory
        :param lr_schedule: Callable returning the learning rate for an iteration index (called with 0 here)
        :param momentum_schedule: Callable returning the momentum for an iteration index (called with 0 here)
        :param total_it: Total number of training iterations
        :param wd: Weight decay handed to the optimizer
        :param batch_steps: Number of batches after which the validation loss is evaluated
        :param k_steps_initial: Initial starting point of the network in terms of processed k batches
        :param cpu_count: How many cpu threads are available for data loading
        :param batch_size: Batch size to train the network with
        :param normalize: Whether to use data normalization after loading the data
        :param export_weights: Sets if network checkpoints should be exported
        :param export_grad_histograms: Sets if the gradients of the weights should be logged to tensorboard
        :param log_metrics_to_tensorboard: Sets if a summary writer is created; self.sw only exists when this is True
        :param ctx: Context the data and the network live on
        :param metrics: Dictionary of metric objects; None is treated as an empty dictionary
        :param use_spike_recovery: Sets if spikes in the validation loss trigger a recovery
        :param max_spikes: Number of spikes after which training is stopped
        :param spike_thresh: Factor on the previous validation loss above which a spike is assumed
        :param seed: Seed for the random number generator
        :param val_loss_factor: Weight of the value loss in the combined loss
        :param policy_loss_factor: Weight of the policy loss in the combined loss
        """
        # a fresh dictionary per instance instead of one shared mutable default
        if metrics is None:
            metrics = {}

        self._log_metrics_to_tensorboard = log_metrics_to_tensorboard
        self._ctx = ctx
        self._metrics = metrics
        self._net = net
        self._graph_exported = False
        self._normalize = normalize
        self._lr_schedule = lr_schedule
        self._momentum_schedule = momentum_schedule
        self._total_it = total_it
        self._batch_size = batch_size
        self._export_grad_histograms = export_grad_histograms
        self._cpu_count = cpu_count
        self._k_steps_initial = k_steps_initial
        self._val_data = val_data
        self._export_weights = export_weights
        self._batch_steps = batch_steps
        self._use_spike_recovery = use_spike_recovery
        self._max_spikes = max_spikes
        self._spike_thresh = spike_thresh
        self._seed = seed
        self._val_loss_factor = val_loss_factor
        self._policy_loss_factor = policy_loss_factor

        # define a summary writer that logs data and flushes to the file every 5 seconds
        if log_metrics_to_tensorboard is True:
            self.sw = SummaryWriter(logdir="./logs",
                                    flush_secs=5,
                                    verbose=False)

        # Define the two loss functions
        self._softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
        self._l2_loss = gluon.loss.L2Loss()

        # the schedules are evaluated at iteration 0 for the initial optimizer settings
        self._trainer = gluon.Trainer(
            self._net.collect_params(),
            "nag",
            {
                "learning_rate": lr_schedule(0),
                "momentum": momentum_schedule(0),
                "wd": wd,
            },
        )

        # collect parameter names for logging the gradients of parameters in each epoch
        self._params = self._net.collect_params()
        self._param_names = self._params.keys()

        # define a list which describes the order of the processed batches
        self.ordering = list(range(nb_parts))

    def _log_metrics(self, metric_values, global_step, prefix="train_"):
        """
        Prints every metric to the console and, if _log_metrics_to_tensorboard is set to true, also writes it to tensorboard
        :param metric_values: Dictionary mapping metric names to their current values
        :param global_step: X-Position under which the metric entries are recorded
        :param prefix: Label put in front of the metric name on the console; with its underscores removed it is
        passed as the first element of the value list given to add_scalar
        :return:
        """
        for metric_name, metric_value in metric_values.items():
            # all metrics are printed on one line, hence no trailing newline
            print(" - %s%s: %.4f" % (prefix, metric_name, metric_value),
                  end="")

            if self._log_metrics_to_tensorboard is not True:
                continue

            # add the metric to the tensorboard event file
            series = prefix.replace("_", "")
            self.sw.add_scalar(metric_name, [series, metric_value],
                               global_step)

    def _process_on_data_plane_file(self, train_data, batch_proc_tmp):
        """
        Performs one optimizer step for every batch in train_data
        :param train_data: Iterable yielding (data, value_label, policy_label) batches
        :param batch_proc_tmp: Counter of processed batches, incremented once per batch
        :return: Tuple of the updated counter and the second entry of the "value_loss" metric's get() result
        """
        for data, value_label, policy_label in train_data:
            # move the batch to the training context
            data = data.as_in_context(self._ctx)
            value_label = value_label.as_in_context(self._ctx)
            policy_label = policy_label.as_in_context(self._ctx)

            with autograd.record():
                value_out, policy_out = self._net(data)
                value_loss = self._l2_loss(value_out, value_label)
                policy_loss = self._softmax_cross_entropy(policy_out,
                                                          policy_label)
                # weight the components of the combined loss
                weighted_value = self._val_loss_factor * value_loss.sum()
                weighted_policy = self._policy_loss_factor * policy_loss.sum()
                combined_loss = weighted_value + weighted_policy

                # update a dummy metric to see a proper progress bar
                self._metrics["value_loss"].update(preds=value_out,
                                                   labels=value_label)

            combined_loss.backward()
            # the step is given the number of samples in this batch
            self._trainer.step(data.shape[0])
            batch_proc_tmp += 1

        value_metric = self._metrics["value_loss"].get()
        return batch_proc_tmp, value_metric[1]

    def train(self):
        """

        :param net: Gluon network object
        :param val_data: Gluon dataloader object
        :param nb_parts: Sets how many different part files exist in the train directory
        :param lr: Initial learning rate
        :param momentum:
        :param wd:
        :param nb_k_steps: Number of steps in after which to drop the learning rate (assuming the patience counter
        early dropping hasn't activated beforehand)

        :param patience: Number of batches to wait until no progress on validation loss has been achieved.
        If the no progress has been done the learning rate is multiplied by the drop factor.
        :param nb_lr_drops: Number of time to drop the learning rate in total. This defines the end of the train loop
        :param batch_steps: Number of batches after which the validation loss is evaluated
        :param k_steps_initial: Initial starting point of the network in terms of process k batches (default 0)
        :param lr_drop_fac: Dropping factor to the learning rate to apply
        :param cpu_count: How many cpu threads on the current are available
        :param batch_size: Batch size to train the network with
        :param normalize: Weather to use data normalization after loading the data (recommend to set to True)
        :param export_weights: Sets if network checkpoints should be exported
        :param export_grad_histograms: Sets if the gradient updates of the weights should be logged to tensorboard
        :return:
        """

        # set a custom seed for reproducibility
        random.seed(self._seed)

        # define and initialize the variables which will be used
        t_s = time()

        # predefine the local variables that will be used in the training loop
        val_loss_best = None
        val_p_acc_best = None
        k_steps_best = None
        patience_cnt = 0

        epoch = 0
        # keep track on how many batches have been processed in this epoch so far
        batch_proc_tmp = 0
        # counter for thousands steps
        k_steps = self._k_steps_initial
        # calculate how many log states will be processed
        k_steps_end = self._total_it / self._batch_steps

        cur_it = 0

        # count the number of spikes that have been detected
        nb_spikes = 0
        # initialize the loss to compare with, with a very high value
        old_val_loss = 9000

        # self._lr = self._lr_warmup_init
        # logging.info('Warmup-Schedule')
        # logging.info('Initial learning rate: lr = %.5f', self._lr)
        # logging.info('=========================================')

        # set initial lr
        # self._trainer.set_learning_rate(self._lr)
        # log the current learning rate
        # self.sw.add_scalar(tag='lr', value=self._lr, global_step=k_steps)

        # create a state variable to check if the net architecture has been reported yet
        graph_exported = False

        old_label = None
        value_out = None

        # safety check to prevent eternal loop
        if not self.ordering:
            raise Exception(
                "You must have at least one part file in your planes-dataset directory!"
            )

        while True:
            # reshuffle the ordering of the training game batches (shuffle works in place)
            random.shuffle(self.ordering)

            epoch += 1
            logging.info("EPOCH %d", epoch)
            logging.info("=========================")
            t_s_steps = time()

            for part_id in tqdm_notebook(self.ordering):

                # load one chunk of the dataset from memory
                s_idcs_train, x_train, yv_train, yp_train, pgn_datasets_train = load_pgn_dataset(
                    dataset_type="train",
                    part_id=part_id,
                    normalize=self._normalize,
                    verbose=False)
                # update the train_data object
                train_dataset = gluon.data.ArrayDataset(
                    nd.array(x_train), nd.array(yv_train),
                    nd.array(yp_train.argmax(axis=1)))
                train_data = gluon.data.DataLoader(train_dataset,
                                                   batch_size=self._batch_size,
                                                   shuffle=True,
                                                   num_workers=self._cpu_count)
                # batch_proc_tmp, dummy = self._process_on_data_plane_file(train_data, batch_proc_tmp)

                for i, (data, value_label,
                        policy_label) in enumerate(train_data):
                    data = data.as_in_context(self._ctx)
                    value_label = value_label.as_in_context(self._ctx)
                    policy_label = policy_label.as_in_context(self._ctx)

                    # update a dummy metric to see a proper progress bar
                    #  (the metrics will get evaluated at the end of 100k steps)
                    if batch_proc_tmp > 0:
                        self._metrics["value_loss"].update(
                            old_label, value_out)

                    old_label = value_label
                    with autograd.record():
                        [value_out, policy_out] = self._net(data)
                        value_loss = self._l2_loss(value_out, value_label)
                        policy_loss = self._softmax_cross_entropy(
                            policy_out, policy_label)
                        # weight the components of the combined loss
                        combined_loss = (
                            self._val_loss_factor * value_loss.sum() +
                            self._policy_loss_factor * policy_loss.sum())

                        # update a dummy metric to see a proper progress bar
                        # self._metrics['value_loss'].update(preds=value_out, labels=value_label)

                    combined_loss.backward()

                    # update the learning rate
                    lr = self._lr_schedule(cur_it)
                    self._trainer.set_learning_rate(lr)

                    # update the momentum
                    momentum = self._momentum_schedule(cur_it)
                    self._trainer._optimizer.momentum = momentum

                    self._trainer.step(data.shape[0])
                    cur_it += 1
                    batch_proc_tmp += 1

                    # add the graph representation of the network to the tensorboard log file
                    if graph_exported is False and self._log_metrics_to_tensorboard is True:
                        self.sw.add_graph(self._net)
                        graph_exported = True

                    # show metrics every thousands steps
                    if batch_proc_tmp >= self._batch_steps:

                        # if k_steps < self._warmup_k_steps:
                        # update the learning rate
                        # self._lr *= k_steps * ((self._lr_first - self._lr_warmup_init) / self._warmup_k_steps) + self._lr_warmup_init #self._lr_drop_fac
                        # self._trainer.set_learning_rate(self._lr)
                        # logging.info('Learning rate update: lr = %.5f', self._lr)
                        # logging.info('=========================================')

                        # log the current learning rate

                        # update batch_proc_tmp counter by subtracting the batch_steps
                        batch_proc_tmp = batch_proc_tmp - self._batch_steps

                        # measure elapsed time
                        ms_step = (
                            (time() - t_s_steps) / self._batch_steps) * 1000
                        # update the counters
                        k_steps += 1
                        patience_cnt += 1

                        logging.info("Step %dK/%dK - %dms/step", k_steps,
                                     k_steps_end, ms_step)
                        logging.info("-------------------------")
                        logging.debug("Iteration %d/%d", cur_it,
                                      self._total_it)
                        logging.debug("lr: %.7f - momentum: %.7f", lr,
                                      momentum)

                        train_metric_values = evaluate_metrics(self._metrics,
                                                               train_data,
                                                               self._net,
                                                               nb_batches=25,
                                                               ctx=self._ctx)

                        val_metric_values = evaluate_metrics(self._metrics,
                                                             self._val_data,
                                                             self._net,
                                                             nb_batches=None,
                                                             ctx=self._ctx)

                        # spike_detected = False
                        # spike_detected = old_val_loss * 1.5 < val_metric_values['loss']
                        # if np.isnan(val_metric_values['loss']):
                        #    spike_detected = True

                        # check for spikes
                        if self._use_spike_recovery is True and (
                                old_val_loss * self._spike_thresh <
                                val_metric_values["loss"]
                                or np.isnan(val_metric_values["loss"])):
                            nb_spikes += 1
                            logging.warning(
                                "Spike %d/%d occurred - val_loss: %.3f",
                                nb_spikes,
                                self._max_spikes,
                                val_metric_values["loss"],
                            )
                            if nb_spikes >= self._max_spikes:

                                val_loss = val_metric_values["loss"]
                                val_p_acc = val_metric_values["policy_acc"]

                                logging.debug(
                                    "The maximum number of spikes has been reached. Stop training."
                                )
                                # finally stop training because the number of lr drops has been achieved
                                print()
                                print("Elapsed time for training(hh:mm:ss): " +
                                      str(
                                          datetime.timedelta(
                                              seconds=round(time() - t_s))))

                                if self._log_metrics_to_tensorboard is True:
                                    self.sw.close()

                                return (k_steps, val_loss,
                                        val_p_acc), (k_steps_best,
                                                     val_loss_best,
                                                     val_p_acc_best)

                            logging.debug("Recover to latest checkpoint")
                            # ## Load the best model once again
                            model_path = "./weights/model-%.5f-%.3f-%04d.params" % (
                                val_loss_best,
                                val_p_acc_best,
                                k_steps_best,
                            )
                            logging.debug("load current best model:%s" %
                                          model_path)
                            self._net.load_parameters(model_path,
                                                      ctx=self._ctx)
                            k_steps = k_steps_best
                            logging.debug("k_step is back at %d", k_steps_best)

                            # print the elapsed time
                            t_delta = time() - t_s_steps
                            print(" - %.ds" % t_delta)
                            t_s_steps = time()

                        else:

                            # update the val_loss_value to compare with using spike recovery
                            old_val_loss = val_metric_values["loss"]

                            # log the metric values to tensorboard
                            self._log_metrics(train_metric_values,
                                              global_step=k_steps,
                                              prefix="train_")
                            self._log_metrics(val_metric_values,
                                              global_step=k_steps,
                                              prefix="val_")

                            if self._export_grad_histograms is True:
                                grads = []
                                # logging the gradients of parameters for checking convergence
                                for i_p, name in enumerate(self._param_names):
                                    if "bn" not in name and "batch" not in name:
                                        grads.append(self._params[name].grad())
                                        self.sw.add_histogram(
                                            tag=name,
                                            values=grads[-1],
                                            global_step=k_steps,
                                            bins=20)

                            # check if a new checkpoint shall be created
                            if val_loss_best is None or val_metric_values[
                                    "loss"] < val_loss_best:
                                # update val_loss_best
                                val_loss_best = val_metric_values["loss"]
                                val_p_acc_best = val_metric_values[
                                    "policy_acc"]
                                k_steps_best = k_steps

                                if self._export_weights is True:
                                    prefix = "./weights/model-%.5f-%.3f" % (
                                        val_loss_best, val_p_acc_best)
                                    # the export function saves both the architecture and the weights
                                    self._net.export(prefix,
                                                     epoch=k_steps_best)
                                    print()
                                    logging.info(
                                        "Saved checkpoint to %s-%04d.params" %
                                        (prefix, k_steps_best))

                                # reset the patience counter
                                patience_cnt = 0

                            # print the elapsed time
                            t_delta = time() - t_s_steps
                            print(" - %.ds" % t_delta)
                            t_s_steps = time()

                            # log the samples per second metric to tensorbaord
                            self.sw.add_scalar(
                                tag="samples_per_second",
                                value={
                                    "hybrid_sync":
                                    data.shape[0] * self._batch_steps / t_delta
                                },
                                global_step=k_steps,
                            )

                            # log the current learning rate
                            self.sw.add_scalar(tag="lr",
                                               value=self._lr_schedule(cur_it),
                                               global_step=k_steps)

                            # log the current momentum value
                            self.sw.add_scalar(
                                tag="momentum",
                                value=self._momentum_schedule(cur_it),
                                global_step=k_steps)

                            if cur_it >= self._total_it:

                                val_loss = val_metric_values["loss"]
                                val_p_acc = val_metric_values["policy_acc"]

                                logging.debug(
                                    "The number of given iterations has been reached"
                                )
                                # finally stop training because the number of lr drops has been achieved
                                print()
                                print("Elapsed time for training(hh:mm:ss): " +
                                      str(
                                          datetime.timedelta(
                                              seconds=round(time() - t_s))))

                                if self._log_metrics_to_tensorboard is True:
                                    self.sw.close()

                                return (k_steps, val_loss,
                                        val_p_acc), (k_steps_best,
                                                     val_loss_best,
                                                     val_p_acc_best)
                            """
示例#17
0
文件: train.py 项目: zhhhzhang/ASTGCN
                output = net([train_w, train_d, train_r])
                l = loss_function(output, train_t)
            l.backward()
            trainer.step(train_t.shape[0])
            training_loss = l.mean().asscalar()

            sw.add_scalar(tag = 'training_loss', value = training_loss, global_step = global_step)
            
            print('global step: %s, training loss: %.2f, time: %.2fs'\
                %(global_step, training_loss, time() - start_time))
            global_step += 1

        # logging the gradients of parameters for checking convergence
        for name, param in net.collect_params().items():
            try:
                sw.add_histogram(tag = name + "_grad", values = param.grad(), global_step = global_step, bins = 1000)
            except:
                print(name)
                print(param.grad())

        # compute validation loss
        compute_val_loss(net, val_loader, loss_function, sw, epoch)

        # evaluate the model on testing set
        evaluate(net, test_loader, true_value, num_of_vertices, sw, epoch)

        params_filename = os.path.join(params_path, '%s_epoch_%s.params'%(model_name, epoch))
        net.save_parameters(params_filename)
        print('save parameters to file: %s'%(params_filename))
    
    # close SummaryWriter
示例#18
0
def train_net(net, train_iter, valid_iter, batch_size, trainer, ctx,
              num_epochs, lr_sch, save_prefix):
    """Train a classifier, optionally logging to MXBoard, and save the best model.

    Every 100 iterations the batch loss, the running training accuracy and
    the gradient histograms are written to the summary writer (only when the
    module-level flag ``use_mxboard`` is set). Every 10th epoch, except
    epoch 0, the network is evaluated with ``test_net``; whenever the
    validation accuracy improves the model is written to disk.

    Relies on names defined elsewhere in the file: ``logger``,
    ``use_mxboard`` and ``test_net``.

    :param net: Gluon network to train
    :param train_iter: training batches, either ``mx.io.DataBatch`` objects
        or ``(X, Y)`` pairs
    :param valid_iter: validation data handed to ``test_net``
    :param batch_size: batch size used to normalise the optimizer step and
        the reported epoch loss
    :param trainer: Gluon trainer that updates ``net``
    :param ctx: context the batches are copied to
    :param num_epochs: number of epochs to train
    :param lr_sch: callable mapping the epoch index to a learning rate
    :param save_prefix: prefix of the files the best model is saved to
    :return: None
    """
    logger.info("===================START TRAINING====================")
    sw = SummaryWriter(logdir='logs', flush_secs=5) if use_mxboard else None
    cls_loss = gluon.loss.SoftmaxCrossEntropyLoss()
    cls_acc = mx.metric.Accuracy(name="train acc")
    top_acc = 0
    iter_num = 0
    # collected once; the loop below only reads the parameters' gradients
    params = net.collect_params()
    try:
        for epoch in range(num_epochs):
            train_loss = []
            t0 = time.time()
            # every DataIter (not only MXDataIter) is exhausted after one
            # pass and has to be rewound before the next epoch
            if isinstance(train_iter, mx.io.DataIter):
                train_iter.reset()
            # start each epoch with a clean accuracy so the value reported
            # below covers this epoch only, like the epoch loss does
            cls_acc.reset()
            trainer.set_learning_rate(lr_sch(epoch))
            for batch in train_iter:
                iter_num += 1
                if isinstance(batch, mx.io.DataBatch):
                    X, Y = batch.data[0], batch.label[0]
                else:
                    X, Y = batch
                X = X.as_in_context(ctx)
                Y = Y.as_in_context(ctx)
                with autograd.record(True):
                    out = net(X)
                    loss = cls_loss(out, Y)
                loss.backward()
                train_loss.append(loss.sum().asscalar())
                trainer.step(batch_size)
                cls_acc.update(Y, out)
                nd.waitall()
                if sw is not None and iter_num % 100 == 0:
                    sw.add_scalar(tag='train_loss',
                                  value=loss.mean().asscalar(),
                                  global_step=iter_num)
                    sw.add_scalar(tag='train_acc',
                                  value=cls_acc.get(),
                                  global_step=iter_num)
                    # parameters with grad_req == "null" have no gradient
                    for name, param in params.items():
                        if param.grad_req != "null":
                            sw.add_histogram(tag=name,
                                             values=param.grad(),
                                             global_step=iter_num,
                                             bins=1000)

            logger.info("epoch {} lr {} {}sec".format(epoch, trainer.learning_rate,
                                                      time.time() - t0))
            train_loss, train_acc = np.mean(train_loss) / batch_size, cls_acc.get()
            logger.info("\ttrain loss {} {}".format(train_loss, train_acc))
            if epoch > 0 and (epoch % 10) == 0:
                test_acc, test_loss = test_net(net, valid_iter, ctx)
                if sw is not None:
                    sw.add_scalar(tag='test_acc',
                                  value=test_acc,
                                  global_step=epoch)
                    sw.add_scalar(tag='test_loss',
                                  value=test_loss,
                                  global_step=epoch)
                if top_acc < test_acc:
                    top_acc = test_acc
                    logger.info('\ttop valid acc {}'.format(test_acc))
                    # hybrid networks are exported, others only get their
                    # parameters saved
                    if isinstance(net, (mx.gluon.nn.HybridSequential,
                                        mx.gluon.nn.HybridBlock)):
                        pf = '{}_{:.3f}.params'.format(save_prefix, top_acc)
                        net.export(pf, epoch)
                    else:
                        net_path = '{}top_acc_{}_{:.3f}.params'.format(
                            save_prefix, epoch, top_acc)
                        net.save_parameters(net_path)
    finally:
        # close the event file even when training is interrupted
        if sw is not None:
            sw.close()
        (epoch, train_loss / n, train_acc / m, test_acc, time.time() - start))

    if train_loss / n <= 0.0008 and epoch >= 20:
        break
### 可視化 ###
    pn = list(net.collect_params().keys())
    param_names, grads = [], []
    for n, i in enumerate(net.collect_params().values()):
        if i.grad_req != 'null':
            grads.append(i.grad())
            param_names.append(pn[n])
    assert len(grads) == len(param_names)
    # logging the gradients of parameters for checking convergence
    for i, name in enumerate(param_names):
        sw.add_histogram(tag=name,
                         values=grads[i],
                         global_step=epoch,
                         bins=1000)
    sw.add_scalar(tag='cross_entropy', value=train_loss / n, global_step=epoch)
    sw.add_scalar(tag='train_acc',
                  value=train_acc / m * 100,
                  global_step=epoch)
    sw.add_scalar(tag='test_acc', value=test_acc * 100, global_step=epoch)

test_data.reset()
filter_image = test_data.data[0][1][0:2]
sw.add_image(tag='int_put_test_image', image=rescale_per_image(filter_image))

L1conv_net = gluon.nn.Sequential()
L1conv_net.add(net.features[0])
o_f_image = L1conv_net(filter_image)
o_f_image = o_f_image[0:1].transpose((1, 0, 2, 3))
示例#20
0
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        anchor_alloc_size=[256, 256],
        box_sizes=[21, 51.2, 133.12, 215.04, 296.96, 378.88, 460.8, 542.72],
        box_ratios=[[1, 2, 0.5]] + [[1, 2, 0.5, 3, 1.0 / 3]] * 4 +
        [[1, 2, 0.5]] * 2,
        anchor_box_clip=True,
        graphviz=True,
        epoch=100,
        input_size=[400, 600],
        batch_log=100,
        batch_size=16,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        foreground_iou_thresh=0.5,
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        save_period=10,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base="VGG16_512",
        pretrained_base=True,
        pretrained_path="modelparam",
        classHardNegativeMining=True,
        boxHardNegativeMining=True,
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        using_mlflow=True,
        decode_number=-1,
        multiperclass=True,
        nms_thresh=0.45,
        nms_topk=500,
        iou_thresh=0.5,
        except_class_thresh=0.01,
        plot_class_thresh=0.5):
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # 운영체제 확인
    if platform.system() == "Linux":
        logging.info(f"{platform.system()} OS")
    elif platform.system() == "Windows":
        logging.info(f"{platform.system()} OS")
    else:
        logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
        else:
            logging.info(f'Running on {ctx}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training SSD Detector")
    input_shape = (1, 3) + tuple(input_size)

    try:
        if base.upper() == "VGG16_300":  # 입력 사이즈 300 x 300 추천
            net = SSD_VGG16(version=300,
                            input_size=input_size,
                            box_sizes=box_sizes,
                            box_ratios=box_ratios,
                            anchor_box_clip=anchor_box_clip,
                            alloc_size=anchor_alloc_size,
                            ctx=mx.cpu())
        elif base.upper() == "VGG16_512":  # 입력 사이즈 512 x 512 추천
            net = SSD_VGG16(version=512,
                            input_size=input_size,
                            box_sizes=box_sizes,
                            box_ratios=box_ratios,
                            anchor_box_clip=anchor_box_clip,
                            ctx=mx.cpu())
        train_dataloader, train_dataset = traindataloader(
            multiscale=multiscale,
            factor_scale=factor_scale,
            augmentation=data_augmentation,
            path=train_dataset_path,
            input_size=input_size,
            batch_size=batch_size,
            batch_interval=batch_interval,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            net=net,
            foreground_iou_thresh=foreground_iou_thresh,
            make_target=True)
        valid_dataloader, valid_dataset = validdataloader(
            path=valid_dataset_path,
            input_size=input_size,
            batch_size=valid_size,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            net=net,
            foreground_iou_thresh=foreground_iou_thresh,
            make_target=True)
    except Exception:
        logging.info("dataset 없음")
        exit(0)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # 클래스 수
    name_classes = train_dataset.classes

    # 이름 다시 붙이기
    optimizer = optimizer.upper()
    base = base.upper()
    if pretrained_base:
        model = str(input_size[0]) + "_" + str(
            input_size[1]) + "_" + optimizer + "_P" + base
    else:
        model = str(input_size[0]) + "_" + str(
            input_size[1]) + "_" + optimizer + "_" + base

    weight_path = f"weights/{model}"
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)} weights\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'],
                                        param_path,
                                        ctx=ctx)
    else:
        start_epoch = 0
        if base.upper() == "VGG16_300":  # 입력 사이즈 300 x 300 추천
            net = SSD_VGG16(
                version=300,
                input_size=input_size,
                # box_sizes=[21, 45, 101.25, 157.5, 213.75, 270, 326.25],
                # box_ratios=[[1, 2, 0.5]] +  # conv4_3
                #            [[1, 2, 0.5, 3, 1.0 / 3]] * 3 +  # conv7, conv8_2, conv9_2, conv10_2
                #            [[1, 2, 0.5]] * 2,  # conv11_2, conv12_2
                box_sizes=box_sizes,
                box_ratios=box_ratios,
                num_classes=num_classes,
                pretrained=pretrained_base,
                pretrained_path=pretrained_path,
                anchor_box_clip=anchor_box_clip,
                alloc_size=anchor_alloc_size,
                ctx=ctx)

        elif base.upper() == "VGG16_512":  # 입력 사이즈 512 x 512 추천
            net = SSD_VGG16(
                version=512,
                input_size=input_size,
                # box_sizes=[21, 51.2, 133.12, 215.04, 296.96, 378.88, 460.8, 542.72],
                # box_ratios=[[1, 2, 0.5]] +  # conv4_3
                #            [[1, 2, 0.5, 3, 1.0 / 3]] * 4 +  # conv7, conv8_2, conv9_2, conv10_2
                #            [[1, 2, 0.5]] * 2,  # conv11_2, conv12_2
                box_sizes=box_sizes,
                box_ratios=box_ratios,
                num_classes=num_classes,
                pretrained=pretrained_base,
                pretrained_path=pretrained_path,
                anchor_box_clip=anchor_box_clip,
                ctx=ctx)
        else:
            logging.warning("backbone 없음")
            exit(0)

        if isinstance(ctx, (list, tuple)):
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))
        '''
        active (bool, default True) – Whether to turn hybrid on or off.
        static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase.
        static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower.
        '''
        if multiscale:
            net.hybridize(active=True, static_alloc=True, static_shape=False)
        else:
            net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model),
                                max_queue=10,
                                flush_secs=10,
                                verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)

    if graphviz:
        gluoncv.utils.viz.plot_network(net,
                                       shape=input_shape,
                                       save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) //
                 batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step,
                                             factor=decay_lr,
                                             stop_factor_lr=1e-12,
                                             base_lr=learning_rate)

    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    if AMP:
        '''
        update_on_kvstore : bool, default None
        Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
        suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
        provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
        '''
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "beta1": 0.9,
                    "beta2": 0.999,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "gamma1": 0.9,
                    "gamma2": 0.999,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "wd": 0.0005,
                    "momentum": 0.9,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        else:
            logging.error("optimizer not selected")
            exit(0)

        amp.init_trainer(trainer)

    else:
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "beta1": 0.9,
                                        "beta2": 0.999,
                                        'multi_precision': False
                                    })
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "gamma1": 0.9,
                                        "gamma2": 0.999,
                                        'multi_precision': False
                                    })
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "wd": 0.0005,
                                        "momentum": 0.9,
                                        'multi_precision': False
                                    })

        else:
            logging.error("optimizer not selected")
            exit(0)
    '''
    localization loss -> Smooth L1 loss
    confidence loss -> Softmax
    '''
    if not classHardNegativeMining:
        confidence_loss = SoftmaxCrossEntropyLoss(axis=-1,
                                                  sparse_label=True,
                                                  from_log_softmax=False,
                                                  batch_axis=None,
                                                  reduction="sum",
                                                  exclude=False)
    if not boxHardNegativeMining:
        localization_loss = HuberLoss(rho=1,
                                      batch_axis=None,
                                      reduction="sum",
                                      exclude=False)

    prediction = Prediction(from_softmax=False,
                            num_classes=num_classes,
                            decode_number=decode_number,
                            nms_thresh=nms_thresh,
                            nms_topk=nms_topk,
                            except_class_thresh=except_class_thresh,
                            multiperclass=multiperclass)

    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh,
                                   class_names=name_classes)

    start_time = time.time()

    for i in tqdm(range(start_epoch + 1, epoch + 1, 1),
                  initial=start_epoch + 1,
                  total=epoch):

        conf_loss_sum = 0
        loc_loss_sum = 0
        time_stamp = time.time()

        for batch_count, (image, _, cls_all, box_all,
                          _) in enumerate(train_dataloader, start=1):
            td_batch_size = image.shape[0]

            image = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            cls_all = mx.nd.split(data=cls_all,
                                  num_outputs=subdivision,
                                  axis=0)
            box_all = mx.nd.split(data=box_all,
                                  num_outputs=subdivision,
                                  axis=0)

            if subdivision == 1:
                image = [image]
                cls_t_all = [cls_t_all]
                box_t_all = [box_t_all]

            with autograd.record(train_mode=True):

                cls_all_losses = []
                box_all_losses = []

                for image_split, cls_split, box_split in zip(
                        image, cls_all, box_all):

                    if GPU_COUNT <= 1:
                        image_split = gluon.utils.split_and_load(
                            image_split, [ctx], even_split=False)
                        cls_split = gluon.utils.split_and_load(
                            cls_split, [ctx], even_split=False)
                        box_split = gluon.utils.split_and_load(
                            box_split, [ctx], even_split=False)
                    else:
                        image_split = gluon.utils.split_and_load(
                            image_split, ctx, even_split=False)
                        cls_split = gluon.utils.split_and_load(
                            cls_split, ctx, even_split=False)
                        box_split = gluon.utils.split_and_load(
                            box_split, ctx, even_split=False)

                    # prediction, target space for Data Parallelism
                    cls_losses = []
                    box_losses = []
                    total_loss = []

                    # gpu N 개를 대비한 코드 (Data Parallelism)
                    for img, cls_target, box_target in zip(
                            image_split, cls_split, box_split):
                        # 1. SSD network Inference
                        cls_pred, box_pred, anchor = net(img)
                        '''
                            4. Hard negative mining (class에만 loss 계산)
                            Hard negative mining After the matching step, most of the default boxes are negatives,
                            especially when the number of possible default boxes is large. This introduces a
                            significant imbalance between the positive and negative training examples. Instead of
                            using all the negative examples, we sort them using the highest confidence loss for each
                            default box and pick the top ones so that the ratio between the negatives and positives is
                            at most 3:1. We found that this leads to faster optimization and a more stable training
                        '''
                        weight_term_alpha = 1
                        negative_mining_ratio = 3
                        positive_samples = cls_target > 0  # True or False
                        positive_numbers = positive_samples.sum()
                        if classHardNegativeMining:
                            pred = mx.nd.log_softmax(cls_pred, axis=-1)
                            negative_samples = 1 - positive_samples
                            conf_loss = -mx.nd.pick(
                                pred, cls_target,
                                axis=-1)  # (batch, all feature number)
                            '''
                            we sort them using the highest confidence loss for each
                            default box and pick the top ones so that the ratio between the negatives and positives is
                            at most 3:1.
                            '''
                            negative_samples_conf_loss = (conf_loss *
                                                          negative_samples)
                            # 아래 3줄의 코드 출처 : from gluoncv.loss import SSDMultiBoxLoss
                            negative_samples_index = mx.nd.argsort(
                                negative_samples_conf_loss,
                                axis=-1,
                                is_ascend=False)
                            selection = mx.nd.argsort(negative_samples_index,
                                                      axis=-1,
                                                      is_ascend=True)
                            hard_negative_samples = selection <= mx.nd.multiply(
                                positive_numbers,
                                negative_mining_ratio).expand_dims(-1)
                            pos_hardnega = positive_samples + hard_negative_samples
                            conf_loss = mx.nd.where(
                                pos_hardnega > 0, conf_loss,
                                mx.nd.zeros_like(conf_loss))
                            conf_loss = mx.nd.sum(conf_loss)
                            if positive_numbers:
                                conf_loss = mx.nd.divide(
                                    conf_loss, positive_numbers)
                            else:
                                conf_loss = mx.nd.multiply(conf_loss, 0)
                            cls_losses.append(conf_loss.asscalar())
                        else:
                            conf_loss = confidence_loss(
                                cls_pred, cls_target,
                                positive_samples.expand_dims(axis=-1))
                            if positive_numbers:
                                conf_loss = mx.nd.divide(
                                    conf_loss, positive_numbers)
                            else:
                                conf_loss = mx.nd.multiply(conf_loss, 0)
                            cls_losses.append(conf_loss.asscalar())

                        if boxHardNegativeMining:
                            # loc loss에도 hard HardNegativeMining 적용해보자.
                            pred = mx.nd.log_softmax(cls_pred, axis=-1)
                            negative_samples = 1 - positive_samples
                            conf_loss_for_box = -mx.nd.pick(
                                pred, cls_target,
                                axis=-1)  # (batch, all feature number)
                            negative_samples_conf_loss = (conf_loss_for_box *
                                                          negative_samples)
                            negative_samples_index = mx.nd.argsort(
                                negative_samples_conf_loss,
                                axis=-1,
                                is_ascend=False)
                            selection = mx.nd.argsort(negative_samples_index,
                                                      axis=-1,
                                                      is_ascend=True)
                            hard_negative_samples = selection <= mx.nd.multiply(
                                positive_numbers,
                                negative_mining_ratio).expand_dims(-1)
                            pos_hardnega = positive_samples + hard_negative_samples
                            pos_hardnega = mx.nd.repeat(
                                pos_hardnega.reshape(shape=(0, 0, 1)),
                                repeats=4,
                                axis=-1)

                            loc_loss = mx.nd.abs(box_pred - box_target)
                            loc_loss = mx.nd.where(loc_loss > 1,
                                                   loc_loss - 0.5, (0.5 / 1) *
                                                   mx.nd.square(loc_loss))
                            loc_loss = mx.nd.where(pos_hardnega > 0, loc_loss,
                                                   mx.nd.zeros_like(loc_loss))
                            loc_loss = mx.nd.sum(loc_loss)
                            if positive_numbers:
                                loc_loss = mx.nd.divide(
                                    loc_loss, positive_numbers)
                            else:
                                loc_loss = mx.nd.multiply(loc_loss, 0)
                            box_losses.append(loc_loss.asscalar())
                        else:
                            loc_loss = localization_loss(
                                box_pred, box_target,
                                positive_samples.expand_dims(axis=-1))
                            if positive_numbers:
                                loc_loss = mx.nd.divide(
                                    loc_loss, positive_numbers)
                            else:
                                loc_loss = mx.nd.multiply(loc_loss, 0)
                            box_losses.append(loc_loss.asscalar())

                        total_loss.append(conf_loss +
                                          weight_term_alpha * loc_loss)
                    if AMP:
                        with amp.scale_loss(total_loss,
                                            trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    cls_all_losses.append(sum(cls_losses))
                    box_all_losses.append(sum(box_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # 비우기
            for p in net.collect_params().values():
                p.zero_grad()

            conf_loss_sum += sum(cls_all_losses) / td_batch_size
            loc_loss_sum += sum(box_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(
                    f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                    f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                    f'[Lr = {trainer.learning_rate}]'
                    f'[confidence loss = {sum(cls_all_losses) / td_batch_size:.3f}]'
                    f'[localization loss = {sum(box_all_losses) / td_batch_size:.3f}]'
                )
            time_stamp = time.time()

        train_conf_loss_mean = np.divide(conf_loss_sum,
                                         train_update_number_per_epoch)
        train_loc_loss_mean = np.divide(loc_loss_sum,
                                        train_update_number_per_epoch)
        train_total_loss_mean = train_conf_loss_mean + train_loc_loss_mean

        logging.info(
            f"train confidence loss : {train_conf_loss_mean} / train localization loss : {train_loc_loss_mean} / train total loss : {train_total_loss_mean}"
        )

        if i % eval_period == 0 and valid_list:

            if classHardNegativeMining:
                confidence_loss = SoftmaxCrossEntropyLoss(
                    axis=-1,
                    sparse_label=True,
                    from_log_softmax=False,
                    batch_axis=None,
                    reduction="sum",
                    exclude=False)
            if boxHardNegativeMining:
                localization_loss = HuberLoss(rho=1,
                                              batch_axis=None,
                                              reduction="sum",
                                              exclude=False)

            conf_loss_sum = 0
            loc_loss_sum = 0
            for image, label, cls_all, box_all, _ in valid_dataloader:

                vd_batch_size = image.shape[0]
                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx],
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx],
                                                       even_split=False)
                    cls_all = gluon.utils.split_and_load(cls_all, [ctx],
                                                         even_split=False)
                    box_all = gluon.utils.split_and_load(box_all, [ctx],
                                                         even_split=False)
                else:
                    image = gluon.utils.split_and_load(image,
                                                       ctx,
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label,
                                                       ctx,
                                                       even_split=False)
                    cls_all = gluon.utils.split_and_load(cls_all, [ctx],
                                                         even_split=False)
                    box_all = gluon.utils.split_and_load(box_all, [ctx],
                                                         even_split=False)

                # prediction, target space for Data Parallelism
                cls_losses = []
                box_losses = []

                # gpu N 개를 대비한 코드 (Data Parallelism)
                for img, lb, cls_target, box_target in zip(
                        image, label, cls_all, box_all):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    id, score, bbox = prediction(cls_pred, box_pred, anchor)

                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box,
                                            gt_labels=gt_id)

                    positive_samples = cls_target > 0
                    positive_numbers = positive_samples.sum()

                    conf_loss = confidence_loss(
                        cls_pred, cls_target,
                        positive_samples.expand_dims(axis=-1))
                    if positive_numbers:
                        conf_loss = mx.nd.divide(conf_loss, positive_numbers)
                    else:
                        conf_loss = mx.nd.multiply(conf_loss, 0)
                    cls_losses.append(conf_loss.asscalar())

                    loc_loss = localization_loss(
                        box_pred, box_target,
                        positive_samples.expand_dims(axis=-1))
                    if positive_numbers:
                        loc_loss = mx.nd.divide(loc_loss, positive_numbers)
                    else:
                        loc_loss = mx.nd.multiply(loc_loss, 0)
                    box_losses.append(loc_loss.asscalar())

                conf_loss_sum += sum(cls_losses) / vd_batch_size
                loc_loss_sum += sum(box_losses) / vd_batch_size

            valid_conf_loss_mean = np.divide(conf_loss_sum,
                                             valid_update_number_per_epoch)
            valid_loc_loss_mean = np.divide(loc_loss_sum,
                                            valid_update_number_per_epoch)
            valid_total_loss_mean = valid_conf_loss_mean + valid_loc_loss_mean

            logging.info(
                f"valid confidence loss : {valid_conf_loss_mean} / valid localization loss : {valid_loc_loss_mean} / valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list(
            )
            for j, c, p, r in zip(range(len(recall)), class_name, precision,
                                  recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(
                    f"class {j}'s {name} AP : {round(AP * 100, round_position)}%"
                )
                AP_appender.append(AP)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i)

            precision_recall.reset()

            if tensorboard:
                # gpu N 개를 대비한 코드 (Data Parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _ = next(dataloader_iter)
                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx],
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx],
                                                       even_split=False)
                else:
                    image = gluon.utils.split_and_load(image,
                                                       ctx,
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label,
                                                       ctx,
                                                       even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 0, 1)

                batch_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    ids, scores, bboxes = prediction(cls_pred, box_pred,
                                                     anchor)

                    for ig, gt_id, gt_box, id, score, bbox in zip(
                            img, gt_ids, gt_boxes, ids, scores, bboxes):
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(
                            std, ctx=ig.context) + mx.nd.array(mean,
                                                               ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)

                        # Draw the ground-truth boxes.
                        ground_truth = plot_bbox(
                            ig,
                            gt_box,
                            scores=None,
                            labels=gt_id,
                            thresh=None,
                            reverse_rgb=True,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True,
                            colors=ground_truth_colors)
                        # Draw the predicted boxes on top of the ground-truth image.
                        prediction_box = plot_bbox(
                            ground_truth,
                            bbox,
                            scores=score,
                            labels=id,
                            thresh=plot_class_thresh,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True)

                        # For TensorBoard: convert BGR -> RGB and (height, width, channel) -> (channel, height, width).
                        prediction_box = cv2.cvtColor(prediction_box,
                                                      cv2.COLOR_BGR2RGB)
                        prediction_box = np.transpose(prediction_box,
                                                      axes=(2, 0, 1))
                        batch_image.append(
                            prediction_box)  # (batch, channel, height, width)

                summary.add_image(tag="valid_result",
                                  image=np.array(batch_image),
                                  global_step=i)
                summary.add_scalar(tag="conf_loss",
                                   value={
                                       "train_conf_loss": train_conf_loss_mean,
                                       "valid_conf_loss": valid_conf_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="loc_loss",
                                   value={
                                       "train_loc_loss": train_loc_loss_mean,
                                       "valid_loc_loss": valid_loc_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="total_loss",
                                   value={
                                       "train_total_loss":
                                       train_total_loss_mean,
                                       "valid_total_loss":
                                       valid_total_loss_mean
                                   },
                                   global_step=i)

                params = net.collect_params().values()
                if GPU_COUNT > 1:
                    for c in ctx:
                        for p in params:
                            summary.add_histogram(tag=p.name,
                                                  values=p.data(ctx=c),
                                                  global_step=i,
                                                  bins='default')
                else:
                    for p in params:
                        summary.add_histogram(tag=p.name,
                                              values=p.data(),
                                              global_step=i,
                                              bins='default')

        if i % save_period == 0:

            weight_epoch_path = os.path.join(weight_path, str(i))
            if not os.path.exists(weight_epoch_path):
                os.makedirs(weight_epoch_path)
            '''
            Hybrid models can be serialized as JSON files using the export function.
            It exports a HybridBlock to JSON format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            When there is only one input, it is named data. When there is more than one input, they are named data0, data1, etc.
            '''

            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            postnet = PostNet(net=net, auxnet=prediction)

            try:
                net.export(os.path.join(weight_path, f"{model}"),
                           epoch=i,
                           remove_amp_cast=True)
                net.save_parameters(os.path.join(weight_path,
                                                 f"{i}.params"))  # onnx 추출용
                # network inference, decoder, nms까지 처리됨 - mxnet c++에서 편리함 / onnx로는 추출 못함.
                export_block_for_cplusplus(
                    path=os.path.join(weight_epoch_path, f"{model}_prepost"),
                    block=postnet,
                    data_shape=tuple(input_size) + tuple((3, )),
                    epoch=i,
                    preprocess=
                    True,  # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨
                    layout='HWC',
                    ctx=context,
                    remove_amp_cast=True)
            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
                net.collect_params().reset_ctx(ctx)

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))