def train(opt):
    params = Params(f'projects/{opt.project}.yml')

    if params.num_gpus == 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    # seed both CPU and CUDA RNGs so runs are reproducible either way
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(42)

    opt.saved_path = opt.saved_path + f'/{params.project_name}/'
    opt.log_path = opt.log_path + f'/{params.project_name}/tensorboard/'
    os.makedirs(opt.log_path, exist_ok=True)
    os.makedirs(opt.saved_path, exist_ok=True)

    training_params = {
        'batch_size': opt.batch_size,
        'shuffle': True,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': opt.num_workers
    }

    val_params = {
        'batch_size': opt.batch_size,
        'shuffle': False,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': opt.num_workers
    }

    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536, 1536]
    training_set = CocoDataset(root_dir=os.path.join(opt.data_path,
                                                     params.project_name),
                               set=params.train_set,
                               transform=transforms.Compose([
                                   Normalizer(mean=params.mean,
                                              std=params.std),
                                   Augmenter(),
                                   Resizer(input_sizes[opt.compound_coef])
                               ]))
    training_generator = DataLoader(training_set, **training_params)

    val_set = CocoDataset(root_dir=os.path.join(opt.data_path,
                                                params.project_name),
                          set=params.val_set,
                          transform=transforms.Compose([
                              Normalizer(mean=params.mean, std=params.std),
                              Resizer(input_sizes[opt.compound_coef])
                          ]))
    val_generator = DataLoader(val_set, **val_params)

    model = EfficientDetBackbone(num_classes=len(params.obj_list),
                                 compound_coef=opt.compound_coef,
                                 ratios=eval(params.anchors_ratios),
                                 scales=eval(params.anchors_scales))

    # load last weights
    if opt.load_weights is not None:
        if opt.load_weights.endswith('.pth'):
            weights_path = opt.load_weights
        else:
            weights_path = get_last_weights(opt.saved_path)
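        # Checkpoint names follow 'efficientdet-d{coef}_{epoch}_{step}.pth'
        # (the format used by save_checkpoint below), so the trailing number
        # is the global step; unparsable names fall back to step 0.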
        try:
            last_step = int(
                os.path.basename(weights_path).split('_')[-1].split('.')[0])
        except (ValueError, IndexError):
            last_step = 0

        try:
            ret = model.load_state_dict(torch.load(weights_path), strict=False)
        except RuntimeError as e:
            print(f'[Warning] Ignoring {e}')
            print(
                '[Warning] Don\'t panic if you see this; it usually means you loaded pretrained weights with a different number of classes. The rest of the weights should have been loaded already.'
            )

        print(
            f'[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}'
        )
    else:
        last_step = 0
        print('[Info] initializing weights...')
        init_weights(model)

    # freeze backbone if train head_only
    if opt.head_only:

        def freeze_backbone(m):
            classname = m.__class__.__name__
            for ntl in ['EfficientNet', 'BiFPN']:
                if ntl in classname:
                    for param in m.parameters():
                        param.requires_grad = False

        model.apply(freeze_backbone)
        print('[Info] froze backbone')

    # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
    # Apply sync_bn when training on multiple GPUs with a per-GPU batch size
    # below 4 (useful when GPU memory is limited): with batches that small,
    # plain BN makes training unstable or slow to converge. SyncBN normalizes
    # over the combined mini-batch gathered across all GPUs, at the cost of
    # slightly slower training.
    if params.num_gpus > 1 and opt.batch_size // params.num_gpus < 4:
        model.apply(replace_w_sync_bn)
        use_sync_bn = True
    else:
        use_sync_bn = False

    writer = SummaryWriter(
        opt.log_path +
        f'/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}/')

    # wrap the model with the loss function to reduce memory usage on gpu0 and speed up training
    model = ModelWithLoss(model, debug=opt.debug)

    if params.num_gpus > 0:
        model = model.cuda()
        if params.num_gpus > 1:
            model = CustomDataParallel(model, params.num_gpus)
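            # patch_replication_callback (from the Synchronized-BatchNorm-PyTorch
            # repo linked above) must be applied after DataParallel wrapping so
            # the SyncBN replicas can communicate during the forward pass.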
            if use_sync_bn:
                patch_replication_callback(model)

    if opt.optim == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), opt.lr)
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    opt.lr,
                                    momentum=0.9,
                                    nesterov=True)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           patience=3,
                                                           verbose=True)

    epoch = 0
    best_loss = 1e5
    best_epoch = 0
    step = max(0, last_step)
    model.train()

    num_iter_per_epoch = len(training_generator)

    try:
        for epoch in range(opt.num_epochs):
            last_epoch = step // num_iter_per_epoch
            if epoch < last_epoch:
                continue

            epoch_loss = []
            progress_bar = tqdm(training_generator)
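            # When resuming mid-epoch, fast-forward the dataloader past the
            # iterations already covered by the restored global step.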
            for iter, data in enumerate(progress_bar):
                if iter < step - last_epoch * num_iter_per_epoch:
                    progress_bar.update()
                    continue
                try:
                    imgs = data['img']
                    annot = data['annot']

                    if params.num_gpus == 1:
                        # if only one gpu, just send it to cuda:0
                        # elif multiple gpus, send it to multiple gpus in CustomDataParallel, not here
                        imgs = imgs.cuda()
                        annot = annot.cuda()

                    optimizer.zero_grad()
                    cls_loss, reg_loss = model(imgs,
                                               annot,
                                               obj_list=params.obj_list)
                    cls_loss = cls_loss.mean()
                    reg_loss = reg_loss.mean()

                    loss = cls_loss + reg_loss
                    if loss == 0 or not torch.isfinite(loss):
                        continue

                    loss.backward()
                    # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
                    optimizer.step()

                    epoch_loss.append(float(loss))

                    progress_bar.set_description(
                        'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. Reg loss: {:.5f}. Total loss: {:.5f}'
                        .format(step, epoch, opt.num_epochs, iter + 1,
                                num_iter_per_epoch, cls_loss.item(),
                                reg_loss.item(), loss.item()))
                    writer.add_scalars('Loss', {'train': loss}, step)
                    writer.add_scalars('Regression_loss', {'train': reg_loss},
                                       step)
                    writer.add_scalars('Classification_loss',
                                       {'train': cls_loss}, step)

                    # log learning_rate
                    current_lr = optimizer.param_groups[0]['lr']
                    writer.add_scalar('learning_rate', current_lr, step)

                    step += 1

                    if step % opt.save_interval == 0 and step > 0:
                        save_checkpoint(
                            model,
                            f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth'
                        )
                        print('checkpoint...')

                except Exception as e:
                    print('[Error]', traceback.format_exc())
                    print(e)
                    continue
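            # ReduceLROnPlateau is stepped on the mean *training* loss of the
            # epoch; the validation loop below does not step the scheduler.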
            scheduler.step(np.mean(epoch_loss))

            if epoch % opt.val_interval == 0:
                model.eval()
                loss_regression_ls = []
                loss_classification_ls = []
                for iter, data in enumerate(val_generator):
                    with torch.no_grad():
                        imgs = data['img']
                        annot = data['annot']

                        if params.num_gpus == 1:
                            imgs = imgs.cuda()
                            annot = annot.cuda()

                        cls_loss, reg_loss = model(imgs,
                                                   annot,
                                                   obj_list=params.obj_list)
                        cls_loss = cls_loss.mean()
                        reg_loss = reg_loss.mean()

                        loss = cls_loss + reg_loss
                        if loss == 0 or not torch.isfinite(loss):
                            continue

                        loss_classification_ls.append(cls_loss.item())
                        loss_regression_ls.append(reg_loss.item())

                cls_loss = np.mean(loss_classification_ls)
                reg_loss = np.mean(loss_regression_ls)
                loss = cls_loss + reg_loss

                print(
                    'Val. Epoch: {}/{}. Classification loss: {:1.5f}. Regression loss: {:1.5f}. Total loss: {:1.5f}'
                    .format(epoch, opt.num_epochs, cls_loss, reg_loss, loss))
                writer.add_scalars('Loss', {'val': loss}, step)
                writer.add_scalars('Regression_loss', {'val': reg_loss}, step)
                writer.add_scalars('Classification_loss', {'val': cls_loss},
                                   step)

                if loss + opt.es_min_delta < best_loss:
                    best_loss = loss
                    best_epoch = epoch

                    save_checkpoint(
                        model,
                        f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth'
                    )

                model.train()

                # Early stopping
                if epoch - best_epoch > opt.es_patience > 0:
                    print(
                        '[Info] Stop training at epoch {}. The lowest loss achieved is {}'
                        .format(epoch, best_loss))
                    break
    except KeyboardInterrupt:
        save_checkpoint(
            model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth')
        writer.close()
    writer.close()
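
# A minimal sketch of driving the train() above with an argparse-style
# namespace. The field names are exactly those the function reads; the values
# are illustrative assumptions, not the project's actual defaults.
def _example_train_invocation():
    from argparse import Namespace
    opt = Namespace(
        project='coco', compound_coef=0, batch_size=12, num_workers=4,
        optim='adamw', lr=1e-3, num_epochs=500, val_interval=1,
        save_interval=500, es_min_delta=0.0, es_patience=0,
        head_only=False, load_weights=None, debug=False,
        data_path='datasets/', log_path='logs/', saved_path='logs/')
    train(opt)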

# Example 2 — file: train.py, project: sx14/DIRV
def train(opt):
    params = Params(f'projects/{opt.project}.yml')

    if opt.project == "vcoco":
        num_obj_class = 90
        num_union_action = 25
        num_inst_action = 51
    else:
        assert opt.project == "hico-det"
        num_obj_class = 90
        num_union_action = 117
        num_inst_action = 234

    if params.num_gpus == 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    opt.saved_path = opt.saved_path + f'/{params.project_name}/'
    opt.log_path = opt.log_path + f'/{params.project_name}/tensorboard/'
    os.makedirs(opt.log_path, exist_ok=True)
    os.makedirs(opt.saved_path, exist_ok=True)

    training_params = {
        'batch_size': opt.batch_size,
        'shuffle': True,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': opt.num_workers,
        'pin_memory': False
    }

    val_params = {
        'batch_size': opt.batch_size * 2,
        'shuffle': False,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': opt.num_workers,
        'pin_memory': False
    }

    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]

    train_transform = transforms.Compose([
        Normalizer(mean=params.mean, std=params.std),
        Augmenter(),
        Resizer(input_sizes[opt.compound_coef])
    ])
    val_transform = transforms.Compose([
        Normalizer(mean=params.mean, std=params.std),
        Resizer(input_sizes[opt.compound_coef])
    ])

    if opt.project == "vcoco":
        # training_set = VCOCO_Dataset(root_dir="./datasets/vcoco", set=params.train_set, color_prob=1,
        #                        transform=train_transform)
        # val_set = VCOCO_Dataset(root_dir="./datasets/vcoco", set=params.val_set,
        #                   transform=val_transform)
        exit(-999)
    else:
        training_set = HICO_DET_Dataset(root_dir="data/hico_20160224_det",
                                        set="train",
                                        color_prob=1,
                                        transform=train_transform)
        val_set = HICO_DET_Dataset(root_dir="data/hico_20160224_det",
                                   set="test",
                                   transform=val_transform)

    training_generator = DataLoader(training_set, **training_params)

    val_generator = DataLoader(val_set, **val_params)

    model = EfficientDetBackbone(num_classes=num_obj_class,
                                 num_union_classes=num_union_action,
                                 num_inst_classes=num_inst_action,
                                 compound_coef=opt.compound_coef,
                                 ratios=eval(params.anchors_ratios),
                                 scales=eval(params.anchors_scales))

    model.train()
    print("num_classes:", num_obj_class)
    print("num_union_classes:", num_union_action)
    print("instance_action_list", num_inst_action)
    # load last weights
    if opt.load_weights is not None:
        if opt.load_weights.endswith('.pth'):
            weights_path = opt.load_weights
        else:
            weights_path = get_last_weights(opt.saved_path)
        try:
            # last_step = int(os.path.basename(weights_path).split('_')[-1].split('.')[0])
            # last_epoch = int(os.path.basename(weights_path).split('_')[-2].split('.')[0]) + 1
            last_epoch = int(
                os.path.basename(weights_path).split('_')[-1].split('.')[0])
            last_step = last_epoch * len(training_generator)
        except (ValueError, IndexError):
            last_step = 0

        try:
            init_weights(model)
            print(weights_path)
            model_dict = model.state_dict()
            pretrained_dict = torch.load(weights_path,
                                         map_location=torch.device('cpu'))
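            # Keep only keys present in this model's state_dict; detector
            # weights whose names changed are remapped onto the instance
            # branch by prefixing 'instance_branch.object_'.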
            new_pretrained_dict = {}
            for k, v in pretrained_dict.items():
                if k in model_dict:
                    new_pretrained_dict[k] = v
                elif ("instance_branch.object_" + k) in model_dict:
                    new_pretrained_dict["instance_branch.object_" + k] = v
                    # print("instance_branch.object_"+k)
            ret = model.load_state_dict(new_pretrained_dict, strict=False)
        except RuntimeError as e:
            print(f'[Warning] Ignoring {e}')
            print(
                '[Warning] Don\'t panic if you see this; it usually means you loaded pretrained weights with a different number of classes. The rest of the weights should have been loaded already.'
            )

        print(
            f'[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}'
        )
    else:
        last_step = 0
        print('[Info] initializing weights...')
        init_weights(model)

    # freeze backbone if train head_only
    if opt.head_only:
        model.apply(freeze_backbone)
        freeze_bn_backbone(model)
        print('[Info] froze backbone')

    if opt.freeze_object_detection:
        freeze_object_detection(model)
        freeze_bn_object_detection(model)
        # model.apply(freeze_object_detection)
        print('[Info] froze object detection branch')

    # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
    # Apply sync_bn when training on multiple GPUs with a per-GPU batch size
    # below 8 (useful when GPU memory is limited): with batches that small,
    # plain BN makes training unstable or slow to converge. SyncBN normalizes
    # over the combined mini-batch gathered across all GPUs, at the cost of
    # slightly slower training.
    if params.num_gpus > 1 and opt.batch_size // params.num_gpus < 8:
        model.apply(replace_w_sync_bn)
        use_sync_bn = True
    else:
        use_sync_bn = False

    # if os.path.exists('nohup.out'):
    #     os.remove('nohup.out')
    #     f = open('nohup.out', 'w')
    #     f.close()

    if os.path.exists(opt.log_path):
        import shutil
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
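    # The rmtree/makedirs above wipe any previous TensorBoard logs for this
    # project before the SummaryWriter recreates a fresh run directory.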
    writer = SummaryWriter(
        opt.log_path +
        f'/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}/')

    # wrap the model with the loss function to reduce memory usage on gpu0 and speed up training
    model = ModelWithLoss(model, dataset=opt.project, debug=opt.debug)

    if params.num_gpus > 0:
        model = model.cuda()
        if params.num_gpus > 1:
            model = CustomDataParallel(model, params.num_gpus)
            if use_sync_bn:
                patch_replication_callback(model)
                if opt.head_only:
                    print('[Info] froze SyncBN backbone')
                    freeze_bn_backbone(model.module.model)
                if opt.freeze_object_detection:
                    print('[Info] froze SyncBN object detection')
                    freeze_bn_object_detection(model.module.model)

    if opt.optim == 'adamw':
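        # filter keeps only parameters with requires_grad=True, so frozen
        # branches contribute no optimizer state and receive no updates.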
        # optimizer = torch.optim.AdamW(model.parameters(), opt.lr)
        optimizer = torch.optim.AdamW(
            filter(lambda p: p.requires_grad, model.parameters()), opt.lr)
    elif opt.optim == "adam":
        # optimizer = torch.optim.Adam(model.parameters(), opt.lr)
        optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()), opt.lr)
    else:
        # optimizer = torch.optim.SGD(model.parameters(), opt.lr, momentum=0.9, nesterov=True)
        optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                           model.parameters()),
                                    opt.lr,
                                    momentum=0.9,
                                    nesterov=True)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           patience=2,
                                                           verbose=True,
                                                           min_lr=1e-7)

    epoch = 0
    best_loss = 1e5
    best_epoch = 0
    step = max(0, last_step)

    num_iter_per_epoch = (len(training_generator) + opt.accumulate_batch -
                          1) // opt.accumulate_batch
    start_time = time.time()

    try:
        for epoch in range(opt.num_epochs):
            last_epoch = step // num_iter_per_epoch + 1
            if epoch < last_epoch:
                continue

            if epoch in [12, 16]:
                optimizer.param_groups[0]['lr'] /= 10

            epoch_loss = []
            for iter, data in enumerate(training_generator):
                try:
                    imgs = data['img']
                    annot = data['annot']
                    # torch.cuda.empty_cache()
                    if params.num_gpus == 1:
                        # if only one gpu, just send it to cuda:0
                        # elif multiple gpus, send it to multiple gpus in CustomDataParallel, not here
                        imgs = imgs.cuda()
                        for key in annot:
                            annot[key] = annot[key].cuda()

                    (union_act_cls_loss, union_sub_reg_loss, union_obj_reg_loss,
                     union_diff_reg_loss, inst_act_cls_loss, inst_obj_cls_loss,
                     inst_obj_reg_loss) = model(imgs, annot["instance"],
                                                annot["interaction"])

                    union_act_cls_loss = union_act_cls_loss.mean()
                    union_sub_reg_loss = union_sub_reg_loss.mean()
                    union_obj_reg_loss = union_obj_reg_loss.mean()
                    union_diff_reg_loss = union_diff_reg_loss.mean()

                    inst_act_cls_loss = inst_act_cls_loss.mean()
                    inst_obj_cls_loss = inst_obj_cls_loss.mean()
                    inst_obj_reg_loss = inst_obj_reg_loss.mean()

                    union_loss = union_act_cls_loss + union_sub_reg_loss + union_obj_reg_loss + union_diff_reg_loss
                    instance_loss = inst_act_cls_loss + inst_obj_cls_loss + inst_obj_reg_loss

                    loss = union_loss + inst_act_cls_loss

                    if loss == 0 or not torch.isfinite(loss):
                        continue
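                    # Gradient accumulation: the loss is scaled by
                    # 1/accumulate_batch so gradients summed over the
                    # micro-batches match one large batch; the optimizer steps
                    # and zeros only on the boundary (or the epoch's last
                    # iteration).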

                    batch_loss = loss / opt.accumulate_batch
                    batch_loss.backward()
                    if (iter + 1) % opt.accumulate_batch == 0 or iter == len(
                            training_generator) - 1:
                        optimizer.step()
                        optimizer.zero_grad()
                        step += 1

                    loss = loss.item()
                    union_loss = union_loss.item()
                    instance_loss = instance_loss.item()

                    epoch_loss.append(float(loss))
                    current_lr = optimizer.param_groups[0]['lr']

                    if step % opt.log_interval == 0:
                        writer.add_scalars('Union Action Classification Loss',
                                           {'train': union_act_cls_loss}, step)
                        writer.add_scalars('Union Subject Regression Loss',
                                           {'train': union_sub_reg_loss}, step)
                        writer.add_scalars('Union Object Regression Loss',
                                           {'train': union_obj_reg_loss}, step)
                        writer.add_scalars('Union Diff Regression Loss',
                                           {'train': union_diff_reg_loss},
                                           step)

                        writer.add_scalars(
                            'Instance Action Classification Loss',
                            {'train': inst_act_cls_loss}, step)
                        writer.add_scalars(
                            'Instance Object Classification Loss',
                            {'train': inst_obj_cls_loss}, step)
                        writer.add_scalars('Instance Regression Loss',
                                           {'train': inst_obj_reg_loss}, step)

                        writer.add_scalars('Total Loss', {'train': loss}, step)
                        writer.add_scalars('Union Loss', {'train': union_loss},
                                           step)
                        writer.add_scalars('Instance Loss',
                                           {'train': instance_loss}, step)

                        # log learning_rate
                        writer.add_scalar('learning_rate', current_lr, step)

                    if iter % 20 == 0:
                        end_time = time.time()
                        print(
                            'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Union loss: {:.5f}. Instance loss: {:.5f}.  '
                            ' Total loss: {:.5f}. Learning rate: {:.5f} Time: {:.2f}s'
                            .format(step, epoch, opt.num_epochs,
                                    (iter + 1) // opt.accumulate_batch,
                                    num_iter_per_epoch, union_loss,
                                    instance_loss, loss, current_lr,
                                    end_time - start_time))
                        start_time = time.time()

                except Exception as e:
                    print('[Error]', traceback.format_exc())
                    print(e)
                    continue

            # scheduler.step(np.mean(epoch_loss))

            save_checkpoint(model,
                            f'efficientdet-d{opt.compound_coef}_{epoch}.pth')
            print('checkpoint...')

            if epoch % opt.val_interval == 0:
                # model.eval()

                union_loss_ls = []
                instance_loss_ls = []

                union_act_cls_loss_ls = []
                union_obj_cls_loss_ls = []
                union_act_reg_loss_ls = []

                union_sub_reg_loss_ls = []
                union_obj_reg_loss_ls = []
                union_diff_reg_loss_ls = []

                inst_act_cls_loss_ls = []
                inst_obj_cls_loss_ls = []
                inst_obj_reg_loss_ls = []

                val_loss = []
                for iter, data in enumerate(val_generator):
                    if (iter + 1) % 50 == 0:
                        print("%d/%d" % (iter + 1, len(val_generator)))
                    with torch.no_grad():
                        imgs = data['img']
                        annot = data['annot']
                        if params.num_gpus == 1:
                            imgs = imgs.cuda()
                            for key in annot:
                                annot[key] = annot[key].cuda()

                        (union_act_cls_loss, union_sub_reg_loss,
                         union_obj_reg_loss, union_diff_reg_loss,
                         inst_act_cls_loss, inst_obj_cls_loss,
                         inst_obj_reg_loss) = model(imgs, annot["instance"],
                                                    annot["interaction"])

                        union_act_cls_loss = union_act_cls_loss.mean()
                        union_sub_reg_loss = union_sub_reg_loss.mean()
                        union_obj_reg_loss = union_obj_reg_loss.mean()
                        union_diff_reg_loss = union_diff_reg_loss.mean()

                        inst_act_cls_loss = inst_act_cls_loss.mean()
                        inst_obj_cls_loss = inst_obj_cls_loss.mean()
                        inst_obj_reg_loss = inst_obj_reg_loss.mean()

                        union_loss = union_act_cls_loss + union_sub_reg_loss + union_obj_reg_loss + union_diff_reg_loss
                        instance_loss = inst_act_cls_loss + inst_obj_cls_loss + inst_obj_reg_loss

                        loss = union_loss + inst_act_cls_loss

                        if loss == 0 or not torch.isfinite(loss):
                            continue
                        val_loss.append(loss.item())

                        union_act_cls_loss_ls.append(union_act_cls_loss.item())
                        union_sub_reg_loss_ls.append(union_sub_reg_loss.item())
                        union_obj_reg_loss_ls.append(union_obj_reg_loss.item())
                        union_diff_reg_loss_ls.append(
                            union_diff_reg_loss.item())
                        # union_obj_cls_loss_ls.append(union_obj_cls_loss.item())
                        # union_act_reg_loss_ls.append(union_act_reg_loss.item())

                        inst_act_cls_loss_ls.append(inst_act_cls_loss.item())
                        inst_obj_cls_loss_ls.append(inst_obj_cls_loss.item())
                        inst_obj_reg_loss_ls.append(inst_obj_reg_loss.item())

                        union_loss_ls.append(union_loss.item())
                        instance_loss_ls.append(instance_loss.item())

                union_loss = np.mean(union_loss_ls)
                instance_loss = np.mean(instance_loss_ls)

                union_act_cls_loss = np.mean(union_act_cls_loss_ls)
                union_sub_reg_loss = np.mean(union_sub_reg_loss_ls)
                union_obj_reg_loss = np.mean(union_obj_reg_loss_ls)
                union_diff_reg_loss = np.mean(union_diff_reg_loss_ls)

                inst_act_cls_loss = np.mean(inst_act_cls_loss_ls)
                inst_obj_cls_loss = np.mean(inst_obj_cls_loss_ls)
                inst_obj_reg_loss = np.mean(inst_obj_reg_loss_ls)

                loss = union_loss + inst_act_cls_loss

                print(
                    'Val. Epoch: {}/{}. Union loss: {:1.5f}. Instance loss: {:1.5f}. '
                    'Total loss: {:1.5f}'.format(epoch, opt.num_epochs,
                                                 union_loss, instance_loss,
                                                 loss))

                writer.add_scalars('Union Action Classification Loss',
                                   {'val': union_act_cls_loss}, step)
                writer.add_scalars('Union Subject Regression Loss',
                                   {'val': union_sub_reg_loss}, step)
                writer.add_scalars('Union Object Regression Loss',
                                   {'val': union_obj_reg_loss}, step)
                writer.add_scalars('Union Diff Regression Loss',
                                   {'val': union_diff_reg_loss}, step)

                writer.add_scalars('Instance Action Classification Loss',
                                   {'val': inst_act_cls_loss}, step)
                writer.add_scalars('Instance Object Classification Loss',
                                   {'val': inst_obj_cls_loss}, step)
                writer.add_scalars('Instance Regression Loss',
                                   {'val': inst_obj_reg_loss}, step)

                writer.add_scalars('Total Loss', {'val': loss}, step)
                writer.add_scalars('Union Loss', {'val': union_loss}, step)
                writer.add_scalars('Instance Loss', {'val': instance_loss},
                                   step)

                if loss + opt.es_min_delta < best_loss:
                    best_loss = loss
                    best_epoch = epoch

                    save_checkpoint(
                        model,
                        f'efficientdet-d{opt.compound_coef}_{best_epoch}_best.pth'
                    )

                # model.train()

                scheduler.step(np.mean(val_loss))
                if optimizer.param_groups[0]['lr'] < opt.lr / 100:
                    break
                # Early stopping
                # if epoch - best_epoch > opt.es_patience > 0:
                #     print('[Info] Stop training at epoch {}. The lowest loss achieved is {}'.format(epoch, loss))
                #     break
    except KeyboardInterrupt:
        # save_checkpoint(model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth')
        writer.close()
    writer.close()

# Example 3
def train_det(opt, cfg):
    # # Write history
    # if 'backlog' not in opt.config:
    #     with open(os.path.join(opt.saved_path, f'{opt.project}_backlog.yml'), 'w') as f:
    #         doc = open(f'projects/{opt.project}.yml', 'r')
    #         f.write('#History log file')
    #         f.write(f'\n__backlog__: {now.strftime("%Y/%m/%d %H:%M:%S")}\n')
    #         f.write(doc.read())
    #         f.write('\n# Manual seed used')
    #         f.write(f'\nmanual_seed: {cfg.manual_seed}')
    # else:
    #     with open(os.path.join(opt.saved_path, f'{opt.project}_history.yml'), 'w') as f:
    #         doc = open(f'projects/{opt.project}.yml', 'r')
    #         f.write(doc.read())

    training_params = {
        'batch_size': cfg.batch_size,
        'shuffle': True,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': opt.num_workers
    }

    val_params = {
        'batch_size': cfg.batch_size,
        'shuffle': False,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': opt.num_workers
    }

    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]

    training_set = DataGenerator(
        data_path=os.path.join(opt.data_path, 'Train'),
        class_ids=cfg.dictionary_class_name.keys(),
        transform=transforms.Compose([
            Augmenter(),
            Normalizer(mean=cfg.mean, std=cfg.std),
            Resizer(input_sizes[cfg.compound_coef])
        ]),
        pre_augments=['', *[f'{aug}_' for aug in cfg.augment_list]]
        if cfg.augment_list else None)
    training_generator = DataLoader(training_set, **training_params)

    val_set = DataGenerator(
        # root_dir=os.path.join(opt.data_path, cfg.project_name),
        data_path=os.path.join(opt.data_path, 'Validation'),
        class_ids=cfg.dictionary_class_name.keys(),
        transform=transforms.Compose([
            Normalizer(mean=cfg.mean, std=cfg.std),
            Resizer(input_sizes[cfg.compound_coef])
        ]))
    val_generator = DataLoader(val_set, **val_params)

    model = EfficientDetBackbone(num_classes=len(cfg.dictionary_class_name),
                                 compound_coef=cfg.compound_coef,
                                 ratios=eval(cfg.anchor_ratios),
                                 scales=eval(cfg.anchor_scales))

    # load last weights
    if opt.load_weights is not None:
        if opt.load_weights.endswith('.pth'):
            weights_path = opt.load_weights
        else:
            weights_path = get_last_weights(opt.saved_path)
        try:
            last_step = int(
                os.path.basename(weights_path).split('_')[-1].split('.')[0])
        except (ValueError, IndexError):
            last_step = 0

        try:
            ret = model.load_state_dict(torch.load(weights_path), strict=False)
        except RuntimeError as e:
            print(f'[Warning] Ignoring {e}')
            print(
                '[Warning] Don\'t panic if you see this; '
                'it usually means you loaded pretrained weights with a different number of classes. '
                'The rest of the weights should have been loaded already.')

        print(
            f'[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}'
        )
    else:
        last_step = 0
        print('[Info] initializing weights...')
        init_weights(model)

    # freeze backbone if train head_only
    if cfg.training_layer.lower() == 'heads':

        def freeze_backbone(m):
            classname = m.__class__.__name__
            for ntl in ['EfficientNet', 'BiFPN']:
                if ntl in classname:
                    for param in m.parameters():
                        param.requires_grad = False

        model.apply(freeze_backbone)
        print('[Info] froze backbone')

    # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
    # Apply sync_bn when training on multiple GPUs with a per-GPU batch size
    # below 4 (useful when GPU memory is limited): with batches that small,
    # plain BN makes training unstable or slow to converge. SyncBN normalizes
    # over the combined mini-batch gathered across all GPUs, at the cost of
    # slightly slower training.
    if cfg.num_gpus > 1 and cfg.batch_size // cfg.num_gpus < 4:
        model.apply(replace_w_sync_bn)
        use_sync_bn = True
    else:
        use_sync_bn = False

    # wrap the model with the loss function to reduce memory usage on gpu0 and speed up training
    model = ModelWithLoss(model, debug=opt.debug)

    if cfg.num_gpus > 0:
        model = model.cuda()
        if cfg.num_gpus > 1:
            model = CustomDataParallel(model, cfg.num_gpus)
            if use_sync_bn:
                patch_replication_callback(model)

    if cfg.optimizer.lower() == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), cfg.learning_rate)
    # elif (not a second if), otherwise the trailing else would overwrite AdamW
    elif cfg.optimizer.lower() == 'srsgd':
        optimizer = SRSGD(model.parameters(),
                          lr=cfg.learning_rate,
                          weight_decay=5e-4,
                          iter_count=100)
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    cfg.learning_rate,
                                    momentum=0.9,
                                    nesterov=True)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           patience=3,
                                                           verbose=True)

    # Setup complete, then start training
    now = datetime.datetime.now()
    opt.saved_path = opt.saved_path + f'/trainlogs_{now.strftime("%Y%m%d_%H%M%S")}'
    if opt.log_path is None:
        opt.log_path = opt.saved_path
    os.makedirs(opt.log_path, exist_ok=True)
    os.makedirs(opt.saved_path, exist_ok=True)

    # Write history
    if 'backlog' not in opt.config:
        with open(
                os.path.join(opt.saved_path,
                             f'{now.strftime("%Y%m%d%H%M%S")}.backlog.json'),
                'w') as f:
            backlog = dict(cfg.to_pascal_case())
            backlog['__metadata__'] = 'Backlog at ' + now.strftime(
                "%Y/%m/%d %H:%M:%S")
            json.dump(backlog, f)
    else:
        with open(
                os.path.join(opt.saved_path,
                             f'{now.strftime("%Y%m%d%H%M%S")}.history.json'),
                'w') as f:
            history = dict(cfg.to_pascal_case())
            history['__metadata__'] = now.strftime("%Y/%m/%d %H:%M:%S")
            json.dump(history, f)

    writer = SummaryWriter(opt.log_path + '/tensorboard')

    epoch = 0
    best_loss = 1e5
    best_epoch = 0
    step = max(0, last_step)
    model.train()

    num_iter_per_epoch = len(training_generator)

    try:
        for epoch in range(cfg.no_epochs):
            last_epoch = step // num_iter_per_epoch
            if epoch < last_epoch:
                continue

            epoch_loss = []
            progress_bar = tqdm(training_generator)
            for iter, data in enumerate(progress_bar):
                if iter < step - last_epoch * num_iter_per_epoch:
                    progress_bar.set_description(
                        f'Skip {iter} < {step} - {last_epoch} * {num_iter_per_epoch}'
                    )
                    progress_bar.update()
                    continue
                try:
                    imgs = data['img']
                    annot = data['annot']

                    if cfg.num_gpus == 1:
                        # if only one gpu, just send it to cuda:0
                        # elif multiple gpus, send it to multiple gpus in CustomDataParallel, not here
                        imgs = imgs.cuda()
                        annot = annot.cuda()

                    optimizer.zero_grad()
                    cls_loss, reg_loss = model(
                        imgs, annot, obj_list=cfg.dictionary_class_name.keys())
                    cls_loss = cls_loss.mean()
                    reg_loss = reg_loss.mean()

                    loss = cls_loss + reg_loss
                    if loss == 0 or not torch.isfinite(loss):
                        continue

                    loss.backward()
                    # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
                    optimizer.step()

                    epoch_loss.append(float(loss))

                    progress_bar.set_description(
                        'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. Reg loss: {:.5f}. '
                        'Total loss: {:.5f}'.format(step, epoch, cfg.no_epochs,
                                                    iter + 1,
                                                    num_iter_per_epoch,
                                                    cls_loss.item(),
                                                    reg_loss.item(),
                                                    loss.item()))
                    writer.add_scalars('Loss', {'train': loss}, step)
                    writer.add_scalars('Regression_loss', {'train': reg_loss},
                                       step)
                    writer.add_scalars('Classification_loss',
                                       {'train': cls_loss}, step)

                    # log learning_rate
                    current_lr = optimizer.param_groups[0]['lr']
                    writer.add_scalar('learning_rate', current_lr, step)

                    step += 1

                except Exception as e:
                    print('[Error]', traceback.format_exc())
                    print(e)
                    continue
            scheduler.step(np.mean(epoch_loss))

            model.eval()
            loss_regression_ls = []
            loss_classification_ls = []
            for iter, data in enumerate(val_generator):
                with torch.no_grad():
                    imgs = data['img']
                    annot = data['annot']

                    if cfg.num_gpus == 1:
                        imgs = imgs.cuda()
                        annot = annot.cuda()

                    cls_loss, reg_loss = model(
                        imgs, annot, obj_list=cfg.dictionary_class_name.keys())
                    cls_loss = cls_loss.mean()
                    reg_loss = reg_loss.mean()

                    loss = cls_loss + reg_loss
                    if loss == 0 or not torch.isfinite(loss):
                        continue

                    loss_classification_ls.append(cls_loss.item())
                    loss_regression_ls.append(reg_loss.item())

            cls_loss = np.mean(loss_classification_ls)
            reg_loss = np.mean(loss_regression_ls)
            loss = cls_loss + reg_loss

            print(
                'Val. Epoch: {}/{}. Classification loss: {:1.5f}. Regression loss: {:1.5f}.'
                ' Total loss: {:1.5f}'.format(epoch, cfg.no_epochs, cls_loss,
                                              reg_loss, loss))

            writer.add_scalars('Loss', {'val': loss}, step)
            writer.add_scalars('Regression_loss', {'val': reg_loss}, step)
            writer.add_scalars('Classification_loss', {'val': cls_loss}, step)

            if cfg.only_best_weights:
                if loss + opt.es_min_delta < best_loss:
                    best_loss = loss
                    best_epoch = epoch
                    save_checkpoint(
                        model,
                        f"{opt.saved_path}/det_d{cfg.compound_coef}_{epoch}_{step}.pth"
                    )
            else:
                if loss + opt.es_min_delta < best_loss:
                    best_loss = loss
                    best_epoch = epoch
                save_checkpoint(
                    model,
                    f"{opt.saved_path}/det_d{cfg.compound_coef}_{epoch}_{step}.pth"
                )

            model.train()

            # Early stopping
            if epoch - best_epoch > opt.es_patience > 0:
                print(
                    '[Info] Stop training at epoch {}. The lowest loss achieved is {}'
                    .format(epoch, best_loss))
                break
        print(
            f'[Info] Finished training. Best loss achieved {best_loss} at epoch {best_epoch}.'
        )
    except KeyboardInterrupt:
        save_checkpoint(
            model, f"{opt.saved_path}/d{cfg.compound_coef}_{epoch}_{step}.pth")
        writer.close()
    writer.close()
# Example 4
def train(args):
    assert args.weight_path, 'must indicate the path of initial weight'
    if os.path.exists(f'{args.weight_path}/train_log.txt'):
        os.remove(f'{args.weight_path}/train_log.txt')
    if os.path.exists(f'{args.weight_path}/pre_trained_weight.pth'):
        os.remove(f'{args.weight_path}/pre_trained_weight.pth')
    print("Hi")
    present_time = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
    params = Params('projects/eye.yml')

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    train_params = {
        'batch_size': args.batch_size,
        'shuffle': True,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': args.num_workers
    }

    val_params = {
        'batch_size': args.batch_size,
        'shuffle': False,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': args.num_workers
    }
    
    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]
    
    model = EfficientDetBackbone(num_classes=len(params.obj_list), compound_coef=args.compound_coef,
                                 ratios=eval(params.anchors_ratios), scales=eval(params.anchors_scales))
    init_weights(model)

    # wrap the model with the loss function to reduce memory usage on gpu0 and speed up training
    model = ModelWithLoss(model)
    model = model.cuda()

    if args.optim == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), args.lr)
    else:
        optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=0.9, nesterov=True)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=args.patience, verbose=True)  # unit is epoch
    
    img_list = glob.glob(f"{args.dataset_path}/train/*")
    normal_img_list = []
    yellow_img_list = []
    for img in img_list:
        if (img.find("n_") != -1):
            normal_img_list.append(img)
        else:
            yellow_img_list.append(img)
    random.shuffle(normal_img_list)
    random.shuffle(yellow_img_list)
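    # Stratified hold-out: reserve 1/5 of each class (normal vs. yellow) for
    # validation so the split preserves the class balance.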
    normal_val_num = int(len(normal_img_list) / 5)
    yellow_val_num = int(len(yellow_img_list) / 5)
    train_img_list = normal_img_list[normal_val_num:] + yellow_img_list[yellow_val_num:]
    val_img_list = normal_img_list[:normal_val_num] + yellow_img_list[:yellow_val_num]
    train_anno_txt_path = f"{args.dataset_path}/train.txt"
    val_anno_txt_path = f"{args.dataset_path}/train.txt"

    train_transform = transforms.Compose([# Normalizer(mean=params.mean, std=params.std),
                                    Augmenter(),
                                    randomScaleWidth(),
                                    randomBlur(),
                                    # randomBrightness(),
                                    # randomHue(),
                                    # randomSaturation(),
                                    Normalizer(mean=params.mean, std=params.std),
                                    Resizer(input_sizes[args.compound_coef])])

    val_transform = transforms.Compose([# Normalizer(mean=params.mean, std=params.std),
                                    Augmenter(),
                                    Normalizer(mean=params.mean, std=params.std),
                                    Resizer(input_sizes[args.compound_coef])])

    train_set = EyeDataset(train_img_list, train_anno_txt_path, train_transform)
    val_set = EyeDataset(val_img_list, val_anno_txt_path, val_transform)
    
    train_generator = DataLoader(train_set, **train_params)
    val_generator = DataLoader(val_set, **val_params)
   
    ckpt = torch.load(f'{args.weight_path}/init_weight.pth')
    model.model.load_state_dict(ckpt["model_state_dict"])
    optimizer.load_state_dict(ckpt["optimizer_state_dict"])
    scheduler.load_state_dict(ckpt["scheduler_state_dict"])
    model.train()

    best_val_loss = 1e5
    for epoch in range(args.epoch):
        model.train()
        total_loss_ls = []
        total_correct = 0
        total = 0
        for data in train_generator:
            imgs = data['img'].cuda()
            annot = data['annot'].cuda()

            optimizer.zero_grad()
            reg_loss, cls_head_loss, cls_correct_num, total_num = model(imgs, annot, obj_list=params.obj_list)
            total_correct += cls_correct_num
            total += total_num
            reg_loss = reg_loss.mean()
            loss = cls_head_loss + reg_loss
            total_loss_ls.append(loss.item())

            if loss == 0 or not torch.isfinite(loss):
                continue

            loss.backward()
            optimizer.step()
        total_loss = np.mean(total_loss_ls)
        scheduler.step(total_loss)

        with open(f'{args.weight_path}/train_log.txt', 'a') as fp:
            fp.write(f'Epoch: {epoch}  loss: {total_loss:.6f} | acc: {total_correct / total * 100:.2f}\n')

        model.eval()
        with torch.no_grad():
            total = 0
            total_correct = 0
            total_loss_ls = []
            for data in val_generator:
                imgs = data['img'].cuda()
                annot = data['annot'].cuda()

                reg_loss, cls_head_loss, cls_correct_num, total_num = model(imgs, annot, obj_list=params.obj_list)
                total += total_num
                total_correct += cls_correct_num
                reg_loss = reg_loss.mean()
                loss = cls_head_loss + reg_loss
                total_loss_ls.append(loss.item())

            total_loss = np.mean(total_loss_ls)
            with open(f'{args.weight_path}/train_log.txt', 'a') as fp:
                fp.write(f'Epoch: {epoch}  loss: {total_loss:.6f} | acc: {total_correct / total * 100:.2f}\n\n')
            if total_loss < best_val_loss:
                best_val_loss = total_loss
                torch.save({
                    "model_state_dict": model.model.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "scheduler_state_dict": scheduler.state_dict(),
                    }, f"{args.weight_path}/pre_trained_weight.pth")
# Example 5
def train(args):
    print("Hi")
    present_time = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
    params = Params('projects/eye.yml')

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    # torch.manual_seed was missing, so CPU-side ops were not deterministic
    # despite cudnn.deterministic below; seed it alongside the CUDA RNGs
    torch.manual_seed(20)
    torch.cuda.manual_seed(20)
    torch.cuda.manual_seed_all(20)
    np.random.seed(20)
    random.seed(20)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    prepare_dir(args, present_time)

    training_params = {
        'batch_size': args.batch_size,
        'shuffle': True,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': args.num_workers
    }

    val_params = {
        'batch_size': args.batch_size,
        'shuffle': False,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': args.num_workers
    }

    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]

    model = EfficientDetBackbone(num_classes=len(params.obj_list),
                                 compound_coef=args.compound_coef,
                                 ratios=eval(params.anchors_ratios),
                                 scales=eval(params.anchors_scales))

    # load last weights
    '''
    if opt.load_weights is not None:
        if opt.load_weights.endswith('.pth'):
            weights_path = opt.load_weights
        else:
            weights_path = get_last_weights(opt.saved_path)
        try:
            last_step = int(os.path.basename(weights_path).split('_')[-1].split('.')[0])
        except:
            last_step = 0

        try:
            ret = model.load_state_dict(torch.load(weights_path), strict=False)
        except RuntimeError as e:
            print(f'[Warning] Ignoring {e}')

        print(f'[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}')
    else:
        last_step = 0
        
        print('[Info] initializing weights...')
        init_weights(model)
    '''
    init_weights(model)

    # wrap the model with the loss function to reduce memory usage on gpu0 and speed up training
    model = ModelWithLoss(model)
    model = model.cuda()

    if args.optim == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), args.lr)
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    args.lr,
                                    momentum=0.9,
                                    nesterov=True)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, patience=args.patience, verbose=True)  # unit is epoch

    torch.save(
        {
            "model_state_dict": model.model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "scheduler_state_dict": scheduler.state_dict(),
        }, f"{args.saved_path}/init_weight.pth")

    k = 10
    train_img_list = glob.glob(f"{args.dataset_path}/train/*")
    normal_img_list = []
    yellow_img_list = []
    for img in train_img_list:
        if 'n_' in img:
            normal_img_list.append(img)
        else:
            yellow_img_list.append(img)
    random.shuffle(normal_img_list)
    random.shuffle(yellow_img_list)
    normal_part_num = math.ceil(len(normal_img_list) / k)
    yellow_part_num = math.ceil(len(yellow_img_list) / k)

    last_acc = []
    last_loss = []
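    # 10-fold cross-validation: fold i holds out the i-th 1/k slice of each
    # class list for testing, trains on the rest, and restarts from the
    # init_weight.pth snapshot so folds are independent.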
    for i in range(k):
        best_loss = 1e5

        ckpt = torch.load(f"{args.saved_path}/init_weight.pth")
        model.model.load_state_dict(ckpt["model_state_dict"])
        optimizer.load_state_dict(ckpt["optimizer_state_dict"])
        scheduler.load_state_dict(ckpt["scheduler_state_dict"])
        model.train()

        sub_train_img_list = (normal_img_list[:i * normal_part_num] +
                              normal_img_list[(i + 1) * normal_part_num:] +
                              yellow_img_list[:i * yellow_part_num] +
                              yellow_img_list[(i + 1) * yellow_part_num:])
        sub_test_img_list = (
            normal_img_list[i * normal_part_num:(i + 1) * normal_part_num] +
            yellow_img_list[i * yellow_part_num:(i + 1) * yellow_part_num])
        random.shuffle(sub_train_img_list)
        random.shuffle(sub_test_img_list)
        print("---")
        for img in sub_test_img_list:
            print(img)
        print("---")

        train_anno_txt_path = f"{args.dataset_path}/train.txt"
        test_anno_txt_path = f"{args.dataset_path}/train.txt"

        train_transform = transforms.Compose(
            [  # Normalizer(mean=params.mean, std=params.std),
                Augmenter(),
                randomScaleWidth(),
                randomBlur(),
                randomBrightness(),
                randomHue(),
                randomSaturation(),
                Normalizer(mean=params.mean, std=params.std),
                Resizer(input_sizes[args.compound_coef])
            ])
        test_transform = transforms.Compose(
            [  # Normalizer(mean=params.mean, std=params.std),
                Augmenter(),
                Normalizer(mean=params.mean, std=params.std),
                Resizer(input_sizes[args.compound_coef])
            ])

        train_set = EyeDataset(sub_train_img_list, train_anno_txt_path,
                               train_transform)
        test_set = EyeDataset(sub_test_img_list, test_anno_txt_path,
                              test_transform)
        training_generator = DataLoader(train_set, **training_params)
        val_generator = DataLoader(test_set, **val_params)

        for epoch in range(args.epoch):
            model.train()
            total_correct = 0
            total = 0
            total_loss_ls = []
            for data in training_generator:
                imgs = data['img']
                annot = data['annot']

                imgs = imgs.cuda()
                annot = annot.cuda()

                optimizer.zero_grad()
                reg_loss, cls_head_loss, cls_correct_num, total_num = model(
                    imgs, annot, obj_list=params.obj_list)
                total_correct += cls_correct_num
                total += total_num
                reg_loss = reg_loss.mean()
                loss = cls_head_loss + reg_loss
                total_loss_ls.append(loss.item())

                if loss == 0 or not torch.isfinite(loss):
                    continue

                loss.backward()
                optimizer.step()
            total_loss = np.mean(total_loss_ls)
            scheduler.step(total_loss)
            with open(f'./logs/{present_time}/cv_log.txt', 'a') as fp:
                fp.write(f"Epoch: {i}/{epoch}/{args.epoch}\n")
                fp.write(
                    f"Training loss: {total_loss:.6f} | acc: {total_correct / total * 100:.2f}\n"
                )

            model.eval()
            with torch.no_grad():
                total = 0
                total_correct = 0
                total_loss_ls = []
                for data in val_generator:
                    imgs = data['img'].cuda()
                    annot = data['annot'].cuda()

                    reg_loss, cls_head_loss, cls_correct_num, total_num = model(
                        imgs, annot, obj_list=params.obj_list)
                    total_correct += cls_correct_num
                    total += total_num
                    reg_loss = reg_loss.mean()
                    loss = reg_loss + cls_head_loss
                    total_loss_ls.append(loss.item())
                total_loss = np.mean(total_loss_ls)

                with open(f'./logs/{present_time}/cv_log.txt', 'a') as fp:
                    fp.write(
                        f"Testing loss: {total_loss:.6f} | acc: {total_correct / total * 100:.2f}\n\n"
                    )

                if epoch == args.epoch - 1:
                    last_loss.append(total_loss)
                    last_acc.append(total_correct / total * 100)

    with open(f'./logs/{present_time}/cv_log.txt', 'a') as fp:
        fp.write("\n===========\n\n")
        fp.write(f"Avg. loss: {np.mean(np.array(last_loss)):.2f}\n")
        fp.write(f"Avg. accuracy: {np.mean(np.array(last_acc)):.2f}\n")