Exemplo n.º 1
0
def train(model, device, config, epochs=5, batch_size=1, save_cp=True, log_step=20, img_scale=0.5):
    """Train ``model`` with gradient accumulation and TensorBoard logging.

    Each loader batch holds ``config.batch // config.subdivisions`` samples.
    Gradients are accumulated and the optimizer/scheduler are stepped once
    every ``config.subdivisions`` loader batches.

    Args:
        model: Network to train; called as ``model(images)``.
        device: ``torch.device`` the batches are moved to.
        config: Settings object. Reads ``train_label``, ``val_label``,
            ``batch``, ``subdivisions``, ``learning_rate``, ``burn_in``,
            ``steps``, ``classes``, ``width``, ``TRAIN_OPTIMIZER``,
            ``TRAIN_TENSORBOARD_DIR`` and ``checkpoints``.
        epochs: Number of passes over the training loader.
        batch_size: Unused; kept so existing callers keep working.
        save_cp: When true, save ``model.state_dict()`` after every epoch.
        log_step: Scalars are logged every ``log_step * config.subdivisions``
            loader batches.
        img_scale: Unused; kept so existing callers keep working.
    """
    train_dataset = Yolo_dataset(config.train_label, config)
    # The validation set is only used for the size shown in the start-up log.
    val_dataset = Yolo_dataset(config.val_label, config)

    n_train = len(train_dataset)
    n_val = len(val_dataset)

    train_loader = DataLoader(train_dataset, batch_size=config.batch // config.subdivisions, shuffle=True,
                              num_workers=8, pin_memory=True, drop_last=True, collate_fn=collate)

    logging.info(f'''Starting training:
        Epochs:          {epochs}
        Batch size:      {config.batch}
        Subdivisions:    {config.subdivisions}
        Learning rate:   {config.learning_rate}
        Training size:   {n_train}
        Validation size: {n_val}
        Checkpoints:     {save_cp}
        Device:          {device.type}
        Images size:     {config.width}
        Optimizer:       {config.TRAIN_OPTIMIZER}
        Dataset classes: {config.classes}
        Train label path:{config.train_label}
        Pretrained:
    ''')

    def burnin_schedule(i):
        """Return the LR multiplier for scheduler step ``i``."""
        if i < config.burn_in:
            # Quartic warm-up from 0 to 1 over the burn-in steps.
            return pow(i / config.burn_in, 4)
        if i < config.steps[0]:
            return 1.0
        if i < config.steps[1]:
            return 0.1
        return 0.01

    # The configured rate is divided by the nominal batch size here and
    # multiplied back whenever it is reported.
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate / config.batch, betas=(0.9, 0.999), eps=1e-08)
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, burnin_schedule)

    criterion = Yolo_loss(device=device, batch=config.batch // config.subdivisions, n_classes=config.classes)

    run_tag = (f'OPT_{config.TRAIN_OPTIMIZER}_LR_{config.learning_rate}_BS_{config.batch}'
               f'_Sub_{config.subdivisions}_Size_{config.width}')
    writer = SummaryWriter(log_dir=config.TRAIN_TENSORBOARD_DIR, filename_suffix=run_tag, comment=run_tag)

    global_step = 0
    model.train()
    # try/finally so the event file is flushed and closed even if training fails.
    try:
        for epoch in range(epochs):
            with tqdm(total=n_train, desc=f'Epoch {epoch + 1}/{epochs}', unit='img', ncols=50) as pbar:
                for batch in train_loader:
                    global_step += 1
                    images = batch[0].to(device=device, dtype=torch.float32)
                    bboxes = batch[1].to(device=device)

                    bboxes_pred = model(images)
                    loss, loss_xy, loss_wh, loss_obj, loss_cls, loss_l2 = criterion(bboxes_pred, bboxes)
                    loss.backward()

                    # Step only after a full set of subdivisions has been accumulated.
                    if global_step % config.subdivisions == 0:
                        optimizer.step()
                        scheduler.step()
                        model.zero_grad()

                    if global_step % (log_step * config.subdivisions) == 0:
                        # get_last_lr() is the supported accessor outside scheduler.step().
                        lr = scheduler.get_last_lr()[0] * config.batch
                        scalars = {
                            'Loss': loss.item(),
                            'loss_xy': loss_xy.item(),
                            'loss_wh': loss_wh.item(),
                            'loss_obj': loss_obj.item(),
                            'loss_cls': loss_cls.item(),
                            'loss_l2': loss_l2.item(),
                        }
                        for tag, value in scalars.items():
                            writer.add_scalar(f'train/{tag}', value, global_step)
                        writer.add_scalar('lr', lr, global_step)
                        pbar.set_postfix(**{'loss (batch)': scalars['Loss'], 'loss_xy': scalars['loss_xy'],
                                            'loss_wh': scalars['loss_wh'],
                                            'loss_obj': scalars['loss_obj'],
                                            'loss_cls': scalars['loss_cls'],
                                            'loss_l2': scalars['loss_l2'],
                                            'lr': lr
                                            })
                        logging.debug('Train step_{}: loss : {},loss xy : {},loss wh : {},'
                                      'loss obj : {},loss cls : {},loss l2 : {},lr : {}'
                                      .format(global_step, scalars['Loss'], scalars['loss_xy'],
                                              scalars['loss_wh'], scalars['loss_obj'],
                                              scalars['loss_cls'], scalars['loss_l2'],
                                              lr))

                    pbar.update(images.shape[0])

            if save_cp:
                # makedirs also creates missing parents; real failures
                # (e.g. permissions) propagate instead of being hidden.
                if not os.path.isdir(config.checkpoints):
                    os.makedirs(config.checkpoints, exist_ok=True)
                    logging.info('Created checkpoint directory')
                torch.save(model.state_dict(), os.path.join(config.checkpoints, f'Yolov4_epoch{epoch + 1}.pth'))
                logging.info(f'Checkpoint {epoch + 1} saved !')
    finally:
        writer.close()
Exemplo n.º 2
0
def train(model,
          device,
          config,
          epochs=5,
          batch_size=1,
          save_cp=True,
          log_step=20,
          img_scale=0.5):
    """Train ``model``, evaluate it after every epoch and rotate checkpoints.

    Each loader batch holds ``config.batch // config.subdivisions`` samples.
    Gradients are accumulated and the optimizer/scheduler are stepped once
    every ``config.subdivisions`` loader batches. After each epoch a fresh
    inference-mode copy of the network is evaluated on the validation loader
    and its COCO bbox statistics are written to TensorBoard.

    Args:
        model: Network to train; called as ``model(images)``. May be wrapped
            in ``DataParallel``/``DistributedDataParallel``.
        device: ``torch.device`` the batches and the evaluation model use.
        config: Settings object. Reads ``train_label``, ``val_label``,
            ``batch``, ``subdivisions``, ``learning_rate``, ``momentum``,
            ``decay``, ``burn_in``, ``steps``, ``classes``, ``width``,
            ``TRAIN_OPTIMIZER``, ``TRAIN_TENSORBOARD_DIR``, ``checkpoints``,
            ``keep_checkpoint_max``, ``use_darknet_cfg``, ``cfgfile`` and
            ``pretrained``.
        epochs: Number of passes over the training loader.
        batch_size: Unused; kept so existing callers keep working.
        save_cp: When true, save ``model.state_dict()`` after every epoch.
        log_step: Scalars are logged every ``log_step * config.subdivisions``
            loader batches.
        img_scale: Unused; kept so existing callers keep working.

    Raises:
        ValueError: If ``config.TRAIN_OPTIMIZER`` is neither ``adam`` nor
            ``sgd`` (case-insensitive).
    """
    train_dataset = Yolo_dataset(config.train_label, config, train=True)
    val_dataset = Yolo_dataset(config.val_label, config, train=False)

    n_train = len(train_dataset)
    n_val = len(val_dataset)

    mini_batch = config.batch // config.subdivisions
    train_loader = DataLoader(train_dataset,
                              batch_size=mini_batch,
                              shuffle=True,
                              num_workers=8,
                              pin_memory=True,
                              drop_last=True,
                              collate_fn=collate)

    # NOTE(review): drop_last=True skips the final partial validation batch,
    # so those images are never evaluated. Left unchanged because evaluate()
    # is not visible here; confirm it accepts a short batch before changing.
    val_loader = DataLoader(val_dataset,
                            batch_size=mini_batch,
                            shuffle=True,
                            num_workers=8,
                            pin_memory=True,
                            drop_last=True,
                            collate_fn=val_collate)

    logging.info(f'''Starting training:
        Epochs:          {epochs}
        Batch size:      {config.batch}
        Subdivisions:    {config.subdivisions}
        Learning rate:   {config.learning_rate}
        Training size:   {n_train}
        Validation size: {n_val}
        Checkpoints:     {save_cp}
        Device:          {device.type}
        Images size:     {config.width}
        Optimizer:       {config.TRAIN_OPTIMIZER}
        Dataset classes: {config.classes}
        Train label path:{config.train_label}
        Pretrained:
    ''')

    def burnin_schedule(i):
        """Return the LR multiplier for scheduler step ``i``."""
        if i < config.burn_in:
            # Quartic warm-up from 0 to 1 over the burn-in steps.
            return pow(i / config.burn_in, 4)
        if i < config.steps[0]:
            return 1.0
        if i < config.steps[1]:
            return 0.1
        return 0.01

    # The configured rate is divided by the nominal batch size here and
    # multiplied back whenever it is reported.
    optimizer_name = config.TRAIN_OPTIMIZER.lower()
    if optimizer_name == 'adam':
        optimizer = optim.Adam(
            model.parameters(),
            lr=config.learning_rate / config.batch,
            betas=(0.9, 0.999),
            eps=1e-08,
        )
    elif optimizer_name == 'sgd':
        optimizer = optim.SGD(
            params=model.parameters(),
            lr=config.learning_rate / config.batch,
            momentum=config.momentum,
            weight_decay=config.decay,
        )
    else:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError(
            f'Unsupported TRAIN_OPTIMIZER {config.TRAIN_OPTIMIZER!r}; expected "adam" or "sgd"')
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, burnin_schedule)

    criterion = Yolo_loss(device=device,
                          batch=mini_batch,
                          n_classes=config.classes)

    # Parallel wrappers keep the real network in ``.module``. Check the wrapper
    # type rather than the GPU count so an unwrapped model on a multi-GPU host
    # (or a wrapped model on one GPU) is handled correctly.
    parallel_wrappers = (torch.nn.DataParallel,
                         torch.nn.parallel.DistributedDataParallel)
    bare_model = model.module if isinstance(model, parallel_wrappers) else model

    eval_metric_names = ('AP', 'AP50', 'AP75', 'AP_small', 'AP_medium',
                         'AP_large', 'AR1', 'AR10', 'AR100', 'AR_small',
                         'AR_medium', 'AR_large')

    run_tag = (f'OPT_{config.TRAIN_OPTIMIZER}_LR_{config.learning_rate}_BS_{config.batch}'
               f'_Sub_{config.subdivisions}_Size_{config.width}')
    writer = SummaryWriter(log_dir=config.TRAIN_TENSORBOARD_DIR,
                           filename_suffix=run_tag,
                           comment=run_tag)

    save_prefix = 'Yolov4_epoch'
    saved_models = deque()
    global_step = 0
    model.train()
    # try/finally so the event file is flushed and closed even if training fails.
    try:
        for epoch in range(epochs):
            with tqdm(total=n_train,
                      desc=f'Epoch {epoch + 1}/{epochs}',
                      unit='img',
                      ncols=50) as pbar:
                for batch in train_loader:
                    global_step += 1
                    images = batch[0].to(device=device, dtype=torch.float32)
                    bboxes = batch[1].to(device=device)

                    bboxes_pred = model(images)
                    loss, loss_xy, loss_wh, loss_obj, loss_cls, loss_l2 = criterion(
                        bboxes_pred, bboxes)
                    loss.backward()

                    # Step only after a full set of subdivisions has been accumulated.
                    if global_step % config.subdivisions == 0:
                        optimizer.step()
                        scheduler.step()
                        model.zero_grad()

                    if global_step % (log_step * config.subdivisions) == 0:
                        # get_last_lr() is the supported accessor outside scheduler.step().
                        lr = scheduler.get_last_lr()[0] * config.batch
                        scalars = {
                            'Loss': loss.item(),
                            'loss_xy': loss_xy.item(),
                            'loss_wh': loss_wh.item(),
                            'loss_obj': loss_obj.item(),
                            'loss_cls': loss_cls.item(),
                            'loss_l2': loss_l2.item(),
                        }
                        for tag, value in scalars.items():
                            writer.add_scalar(f'train/{tag}', value, global_step)
                        writer.add_scalar('lr', lr, global_step)
                        pbar.set_postfix(
                            **{
                                'loss (batch)': scalars['Loss'],
                                'loss_xy': scalars['loss_xy'],
                                'loss_wh': scalars['loss_wh'],
                                'loss_obj': scalars['loss_obj'],
                                'loss_cls': scalars['loss_cls'],
                                'loss_l2': scalars['loss_l2'],
                                'lr': lr
                            })
                        logging.debug(
                            'Train step_{}: loss : {},loss xy : {},loss wh : {},'
                            'loss obj : {},loss cls : {},loss l2 : {},lr : {}'.
                            format(global_step, scalars['Loss'], scalars['loss_xy'],
                                   scalars['loss_wh'], scalars['loss_obj'],
                                   scalars['loss_cls'], scalars['loss_l2'],
                                   lr))

                    pbar.update(images.shape[0])

            # Build a separate inference-mode network from the ``config``
            # argument (not a module-level global) and copy the weights over.
            if config.use_darknet_cfg:
                eval_model = Darknet(config.cfgfile, inference=True)
            else:
                eval_model = Yolov4(config.pretrained,
                                    n_classes=config.classes,
                                    inference=True)
            eval_model.load_state_dict(bare_model.state_dict())
            eval_model.to(device)
            evaluator = evaluate(eval_model, val_loader, config, device)
            del eval_model

            stats = evaluator.coco_eval['bbox'].stats
            for idx, metric in enumerate(eval_metric_names):
                writer.add_scalar(f'train/{metric}', stats[idx], global_step)

            if save_cp:
                # Only log when the directory is really created; real failures
                # (e.g. permissions) propagate instead of being hidden.
                if not os.path.isdir(config.checkpoints):
                    os.makedirs(config.checkpoints, exist_ok=True)
                    logging.info('Created checkpoint directory')
                save_path = os.path.join(config.checkpoints,
                                         f'{save_prefix}{epoch + 1}.pth')
                torch.save(model.state_dict(), save_path)
                logging.info(f'Checkpoint {epoch + 1} saved !')
                saved_models.append(save_path)
                # Keep only the newest keep_checkpoint_max files (0 or less disables pruning).
                if len(saved_models) > config.keep_checkpoint_max > 0:
                    model_to_remove = saved_models.popleft()
                    try:
                        os.remove(model_to_remove)
                    except OSError:
                        logging.info(f'failed to remove {model_to_remove}')
    finally:
        writer.close()
def train(model, device, config, epochs=5, log_step=20, img_scale=0.5):
    """Train ``model`` with Adam and a hand-rolled burn-in/step LR schedule.

    The optimizer steps after every loader batch (no gradient accumulation).
    A checkpoint is written to ``config.checkpoints`` at the end of each epoch.

    Args:
        model: Network to train; called as ``model(images)``.
        device: ``torch.device`` the batches are moved to.
        config: Settings object. Reads ``train_label``, ``batch``,
            ``subdivisions``, ``learning_rate``, ``width``, ``classes``,
            ``TRAIN_OPTIMIZER`` and ``checkpoints``.
        epochs: Number of passes over the training loader. The value passed by
            the caller is honoured (it is no longer overwritten internally).
        log_step: Progress is logged every ``log_step`` iterations.
        img_scale: Unused; kept so existing callers keep working.
    """
    train_dataset = Yolo_dataset(config.train_label, config)

    n_train = len(train_dataset)

    train_loader = DataLoader(train_dataset,
                              batch_size=config.batch,
                              shuffle=True,
                              num_workers=0,
                              pin_memory=True,
                              drop_last=True,
                              collate_fn=collate)

    # NOTE(review): the rate logged below is config.learning_rate, but the
    # optimizer actually starts from the fixed lr_start further down.
    logging.info(f'''Starting training:
        Epochs:          {epochs}
        Batch size:      {config.batch}
        Subdivisions:    {config.subdivisions}
        Learning rate:   {config.learning_rate}
        Training size:   {n_train}
        Device:          {device.type}
        Images size:     {config.width}
        Optimizer:       {config.TRAIN_OPTIMIZER}
        Dataset classes: {config.classes}
        Train label path:{config.train_label}
        Pretrained:
    ''')

    # Schedule parameters, expressed in iterations (loader batches).
    lr_start = 0.001
    start_epoch = 0
    burn_in = 10
    batches_per_epoch = len(train_loader)
    batch_num = batches_per_epoch * epochs
    # Drop the rate 10x at 50% of the run and another 10x at 80%.
    steps = [int(0.5 * batch_num), int(0.8 * batch_num)]

    def adjust_learning_rate(optimizer, iteration, lr):
        """Set every param group's LR to ``lr`` scaled for ``iteration``.

        The factor ramps up as ``(iteration / burn_in) ** 4`` during burn-in,
        is 1.0 until ``steps[0]``, 0.1 until ``steps[1]`` and 0.01 afterwards.
        """
        if iteration < burn_in:
            factor = pow(iteration / burn_in, 4)
        elif iteration < steps[0]:
            factor = 1.0
        elif iteration < steps[1]:
            factor = 0.1
        else:
            factor = 0.01
        scaled_lr = lr * factor
        for param_group in optimizer.param_groups:
            param_group['lr'] = scaled_lr

    optimizer = optim.Adam(model.parameters(),
                           lr=lr_start,
                           betas=(0.9, 0.999),
                           eps=1e-08)

    # NOTE(review): the loader yields config.batch samples per batch while the
    # loss is built for config.batch // config.subdivisions; these only agree
    # when subdivisions == 1. Confirm which size Yolo_loss expects.
    criterion = Yolo_loss(device=device,
                          batch=config.batch // config.subdivisions,
                          n_classes=config.classes)

    # Create the checkpoint directory up front so a bad path fails before any
    # training time is spent; makedirs also creates missing parents.
    if not os.path.isdir(config.checkpoints):
        os.makedirs(config.checkpoints, exist_ok=True)
        logging.info('Created checkpoint directory')

    model.train()
    global_iter = start_epoch * batches_per_epoch
    # Apply the schedule before the first step; otherwise that step would run
    # at the full lr_start instead of the burn-in value.
    adjust_learning_rate(optimizer, global_iter, lr_start)
    for epoch in range(start_epoch, epochs):
        for i, batch in enumerate(train_loader):
            global_iter += 1
            images = batch[0].to(device=device, dtype=torch.float32)
            bboxes = batch[1].to(device=device)

            bboxes_pred = model(images)
            loss, loss_xy, loss_wh, loss_obj, loss_cls, loss_l2 = criterion(
                bboxes_pred, bboxes, images)

            loss.backward()
            optimizer.step()
            model.zero_grad()

            # Sets the rate used by the next step; that is the value logged below.
            adjust_learning_rate(optimizer, global_iter, lr_start)
            lr = optimizer.param_groups[0]['lr']

            if global_iter % log_step == 0:
                part = i / batches_per_epoch
                logging.info(
                    'Epoch:%.2f/%d  Loss:%.4f. loss_xy:%.4f  loss_wh:%.4f  loss_obj:%.4f  loss_cls:%.4f  loss_l2:%.4f  lr:%.6f'
                    % (epoch + part, epochs, loss.item(), loss_xy.item(),
                       loss_wh.item(), loss_obj.item(), loss_cls.item(),
                       loss_l2.item(), lr))

        # One checkpoint per finished epoch. The former start-of-epoch save
        # wrote to this same path and was always overwritten here.
        torch.save(
            model.state_dict(),
            os.path.join(config.checkpoints, f'Yolov4_epoch{epoch + 1}.pth'))
        logging.info(f'Checkpoint {epoch + 1} saved !')
Exemplo n.º 4
0
def train(model, device, config, epochs=5, batch_size=1, save_cp=True, log_step=20, img_scale=0.5):
    """Train ``model``, or only evaluate it when ``config.only_evaluate`` is set.

    In training mode each loader batch holds
    ``config.batch // config.subdivisions`` samples. Gradients are accumulated
    and the optimizer/scheduler are stepped once every ``config.subdivisions``
    loader batches. Weights are optionally saved after each epoch, and COCO
    metrics are optionally computed after each epoch.

    Args:
        model: Network to train; called as ``model(images)``. May be wrapped
            in ``DataParallel``/``DistributedDataParallel``.
        device: ``torch.device`` the batches are moved to.
        config: Settings object. Reads ``train_label``, ``val_label``,
            ``dataset_dir``, ``categories``, ``batch``, ``subdivisions``,
            ``learning_rate``, ``momentum``, ``decay``, ``burn_in``, ``steps``,
            ``classes``, ``width``, ``height``, ``TRAIN_OPTIMIZER``,
            ``TRAIN_TENSORBOARD_DIR``, ``checkpoints``,
            ``keep_checkpoint_max``, ``only_evaluate``,
            ``evaluate_when_train``, ``use_darknet_cfg``, ``cfgfile``,
            ``pretrainedWeight`` and ``Pretrained``.
        epochs: Number of passes over the training loader.
        batch_size: Unused; kept so existing callers keep working.
        save_cp: When true, save the weights after every epoch.
        log_step: Scalars are logged every ``log_step * config.subdivisions``
            loader batches.
        img_scale: Unused; kept so existing callers keep working.

    Raises:
        NotImplementedError: In evaluate-only mode when
            ``config.use_darknet_cfg`` is false.
        ValueError: If ``config.TRAIN_OPTIMIZER`` is neither ``adam`` nor
            ``sgd`` (case-insensitive).
    """
    # TODO: add resume support. Resuming would need the full config, the full
    # yolov4-custom.cfg contents, the weights, the epoch index and the current
    # position in the learning-rate schedule.

    # config.train_label is the path of the label text file (e.g. data/coins.txt).
    train_dataset = Yolo_dataset(config.train_label, config, train=True)
    val_dataset = Yolo_dataset(config.val_label, config, train=False)

    n_train = len(train_dataset)
    n_val = len(val_dataset)

    # Original author's note on DataLoader hangs: loading hung whenever
    # num_workers was 8 and worked with num_workers=0, regardless of
    # pin_memory. The cause was traced to OpenCV starting its own threads
    # inside the worker processes (whether it hangs may depend on the OS).
    # Recommended fix: call cv2.setNumThreads(0) right after importing cv2 in
    # the dataset module. Alternative (slower): load images with PIL.
    train_loader = DataLoader(train_dataset, batch_size=config.batch // config.subdivisions, shuffle=True,
                              num_workers=8, pin_memory=True, drop_last=True)
    val_loader = DataLoader(val_dataset, batch_size=config.batch // config.subdivisions, shuffle=False,
                            num_workers=8, pin_memory=True, drop_last=False, collate_fn=val_collate)

    if config.only_evaluate or config.evaluate_when_train:
        tgtFile = makeTgtJson(val_loader, config.categories)

    logging.info(f'''Starting training:
        Epochs:          {epochs}
        Batch size:      {config.batch}
        Subdivisions:    {config.subdivisions}
        Learning rate:   {config.learning_rate}
        Training size:   {n_train}
        Validation size: {n_val}
        Checkpoints:     {save_cp}
        Device:          {device.type}
        Images size:     {config.width}
        Optimizer:       {config.TRAIN_OPTIMIZER}
        Dataset classes: {config.classes}
        Train label path:{config.train_label}
        Pretrained:      {config.pretrainedWeight is not None or config.Pretrained is not None}
    ''')

    # Compare the device *type*: torch.device("cuda:0") == torch.device("cuda")
    # is False, which would wrongly report "no CUDA" for an indexed device.
    use_cuda = device.type == 'cuda'

    # Parallel wrappers keep the real network in ``.module``. Check the wrapper
    # type rather than the GPU count so an unwrapped model on a multi-GPU host
    # (or a wrapped model on one GPU) is handled correctly.
    parallel_wrappers = (torch.nn.DataParallel, torch.nn.parallel.DistributedDataParallel)
    bare_model = model.module if isinstance(model, parallel_wrappers) else model

    if config.only_evaluate:
        # No SummaryWriter is opened on this path, so nothing is left unclosed
        # by the early returns below.
        if config.use_darknet_cfg:
            eval_model = Darknet(config.cfgfile)
        else:
            raise NotImplementedError
        eval_model.load_state_dict(bare_model.state_dict())
        eval_model.to(device)
        eval_model.eval()
        # NOTE(review): the in-training call below also passes config.width and
        # config.height; confirm evaluate()'s defaults match them.
        resFile = evaluate(eval_model, config.val_label, config.dataset_dir, use_cuda)
        if resFile is None:
            debugPrint("detect 0 boxes in the val set")
            return
        cocoEvaluate(tgtFile, resFile)
        return

    def burnin_schedule(i):
        """Return the LR multiplier for scheduler step ``i`` (an iteration, not an epoch)."""
        if i < config.burn_in:
            # Quartic warm-up from 0 to 1 over the burn-in steps.
            return pow(i / config.burn_in, 4)
        if i < config.steps[0]:  # first plateau
            return 1.0
        if i < config.steps[1]:  # second plateau
            return 0.1
        return 0.01  # final plateau

    # The configured rate is divided by the nominal batch size here and
    # multiplied back when it is written to TensorBoard.
    optimizer_name = config.TRAIN_OPTIMIZER.lower()
    if optimizer_name == 'adam':
        optimizer = optim.Adam(
            model.parameters(),
            lr=config.learning_rate / config.batch,
            betas=(0.9, 0.999),
            eps=1e-08,
        )
    elif optimizer_name == 'sgd':
        optimizer = optim.SGD(
            params=model.parameters(),
            lr=config.learning_rate / config.batch,
            momentum=config.momentum,
            weight_decay=config.decay,
        )
    else:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError(
            f'Unsupported TRAIN_OPTIMIZER {config.TRAIN_OPTIMIZER!r}; expected "adam" or "sgd"')

    scheduler = optim.lr_scheduler.LambdaLR(optimizer, burnin_schedule)

    # Loss object applied to the network outputs.
    criterion = Yolo_loss(device=device, batch=config.batch // config.subdivisions, n_classes=config.classes)

    eval_metric_names = ('AP', 'AP50', 'AP75', 'AP_small', 'AP_medium', 'AP_large',
                         'AR1', 'AR10', 'AR100', 'AR_small', 'AR_medium', 'AR_large')

    run_tag = (f'OPT_{config.TRAIN_OPTIMIZER}_LR_{config.learning_rate}_BS_{config.batch}'
               f'_Sub_{config.subdivisions}_Size_{config.width}')
    writer = SummaryWriter(log_dir=config.TRAIN_TENSORBOARD_DIR, filename_suffix=run_tag, comment=run_tag)

    save_prefix = 'Yolov4_epoch'
    saved_models = deque()
    # Global iteration counter across all epochs.
    global_step = 0
    # try/finally so the event file is flushed and closed even if training fails.
    try:
        for epoch in range(epochs):
            model.train()
            logging.info("===Train===")
            for i, batch in enumerate(train_loader):
                global_step += 1
                images = batch[0].to(device=device, dtype=torch.float32)
                bboxes = batch[1].to(device=device)

                bboxes_pred = model(images)
                loss, loss_xy, loss_wh, loss_obj, loss_cls, loss_l2 = criterion(bboxes_pred, bboxes)
                loss.backward()

                # Step only after a full set of subdivisions has been accumulated.
                if global_step % config.subdivisions == 0:
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()

                logging.info("Epoch:[{:3}/{}],step:[{:3}/{}],total loss:{:.2f}|lr:{:.5f}".format(epoch + 1, epochs, i + 1, len(train_loader), loss.item(), scheduler.get_last_lr()[0]))

                # log_step counts optimizer steps, hence the multiplication.
                if global_step % (log_step * config.subdivisions) == 0:
                    lr = scheduler.get_last_lr()[0] * config.batch
                    scalars = {
                        'Loss': loss.item(),
                        'loss_xy': loss_xy.item(),
                        'loss_wh': loss_wh.item(),
                        'loss_obj': loss_obj.item(),
                        'loss_cls': loss_cls.item(),
                        'loss_l2': loss_l2.item(),
                    }
                    for tag, value in scalars.items():
                        writer.add_scalar(f'train/{tag}', value, global_step)
                    writer.add_scalar('lr', lr, global_step)

                    logging.debug('Train step_{}: loss : {},loss xy : {},loss wh : {},'
                                  'loss obj : {},loss cls : {},loss l2 : {},lr : {}'
                                  .format(global_step, scalars['Loss'], scalars['loss_xy'],
                                          scalars['loss_wh'], scalars['loss_obj'],
                                          scalars['loss_cls'], scalars['loss_l2'],
                                          lr))

            if save_cp:
                # Create the checkpoint directory on first use.
                if not os.path.exists(config.checkpoints):
                    os.makedirs(config.checkpoints, exist_ok=True)
                    logging.info('Created checkpoint directory')
                save_path = os.path.join(config.checkpoints, f'{save_prefix}{epoch + 1}.weights')
                bare_model.save_weights(save_path)
                logging.info(f'Checkpoint {epoch + 1} saved !')
                # Keep only the newest keep_checkpoint_max checkpoints and
                # delete older ones (0 or less disables pruning).
                saved_models.append(save_path)
                if len(saved_models) > config.keep_checkpoint_max > 0:
                    model_to_remove = saved_models.popleft()
                    try:
                        os.remove(model_to_remove)
                    except OSError:
                        logging.info(f'failed to remove {model_to_remove}')

            if config.evaluate_when_train:
                # Evaluation is best-effort: any failure is reported and
                # training continues with the next epoch.
                try:
                    model.eval()
                    resFile = evaluate(model, config.val_label, config.dataset_dir, use_cuda, config.width, config.height)
                    if resFile is None:
                        continue
                    stats = cocoEvaluate(tgtFile, resFile)

                    logging.info("===Val===")
                    logging.info("Epoch:[{:3}/{}],AP:{:.3f}|AP50:{:.3f}|AP75:{:.3f}|APs:{:.3f}|APm:{:.3f}|APl:{:.3f}".format(
                        epoch + 1, epochs, stats[0], stats[1], stats[2], stats[3], stats[4], stats[5]))
                    logging.info("Epoch:[{:3}/{}],AR1:{:.3f}|AR10:{:.3f}|AR100:{:.3f}|ARs:{:.3f}|ARm:{:.3f}|ARl:{:.3f}".format(
                        epoch + 1, epochs, stats[6], stats[7], stats[8], stats[9], stats[10], stats[11]))

                    for idx, metric in enumerate(eval_metric_names):
                        writer.add_scalar(f'train/{metric}', stats[idx], global_step)
                except Exception:
                    debugPrint("evaluate meets an exception, here is the exception info:")
                    traceback.print_exc()
                    debugPrint("ignore error in evaluate and continue training")
    finally:
        writer.close()
Exemplo n.º 5
0
def train(model,
          device,
          config,
          epochs=5,
          batch_size=1,
          save_cp=True,
          log_step=20,
          img_scale=0.5):
    """Train ``model`` with the YOLO loss and log progress to TensorBoard.

    Each iteration runs one mini-batch of
    ``config.batch // config.subdivisions`` images through the model and
    back-propagates the loss, so gradients accumulate between optimizer
    steps. Every ``10 * config.subdivisions`` iterations the optimizer
    steps, gradients are cleared, the loss terms are written to
    TensorBoard, and a sample detection is rendered with the model
    temporarily switched to eval mode. When ``save_cp`` is true the model
    weights are saved under ``config.checkpoints`` after every epoch.

    Args:
        model: Network to train; invoked as ``model(images)``.
        device: Torch device the image and box tensors are moved to.
        config: Settings object. Reads ``train_label``, ``val_label``,
            ``batch``, ``subdivisions``, ``learning_rate``, ``width``,
            ``classes``, ``pretrained``, ``TRAIN_OPTIMIZER``,
            ``TRAIN_TENSORBOARD_DIR``, ``checkpoints`` and, for SGD,
            ``momentum`` and ``decay``.
        epochs: Number of passes over the training set.
        batch_size: Unused; the batch size comes from ``config``.
        save_cp: Whether to write a checkpoint after each epoch.
        log_step: Unused in this variant (the interval is fixed at 10).
        img_scale: Unused.

    Raises:
        ValueError: If ``config.TRAIN_OPTIMIZER`` is neither ``adam`` nor
            ``sgd`` (case-insensitive).
    """
    train_dataset = Yolo_dataset(config.train_label, config, train=True)
    val_dataset = Yolo_dataset(config.val_label, config, train=False)

    n_train = len(train_dataset)
    # The validation set is only sized for the start-up log below.
    n_val = len(val_dataset)

    train_loader = DataLoader(train_dataset,
                              batch_size=config.batch // config.subdivisions,
                              shuffle=True,
                              num_workers=8,
                              pin_memory=True,
                              drop_last=True,
                              collate_fn=collate)

    logging.info(f'''Starting training:
        Epochs:          {epochs}
        Batch size:      {config.batch}
        Subdivisions:    {config.subdivisions}
        Learning rate:   {config.learning_rate}
        Training size:   {n_train}
        Validation size: {n_val}
        Checkpoints:     {save_cp}
        Device:          {device.type}
        Images size:     {config.width}
        Optimizer:       {config.TRAIN_OPTIMIZER}
        Dataset classes: {config.classes}
        Train label path:{config.train_label}
        Pretrained:      {config.pretrained}
    ''')

    # Burn-in / step-decay factor for a LambdaLR scheduler.
    # NOTE: currently unused because no scheduler is attached below, so the
    # learning rate stays constant for the whole run.
    def burnin_schedule(i):
        if i < config.burn_in:
            factor = pow(i / config.burn_in, 4)
        elif i < config.steps[0]:
            factor = 1.0
        elif i < config.steps[1]:
            factor = 0.1
        else:
            factor = 0.01
        return factor

    optimizer_name = config.TRAIN_OPTIMIZER.lower()
    if optimizer_name == 'adam':
        optimizer = optim.Adam(
            model.parameters(),
            lr=config.learning_rate,
            betas=(0.9, 0.999),
            eps=1e-08,
        )
    elif optimizer_name == 'sgd':
        optimizer = optim.SGD(
            params=model.parameters(),
            lr=config.learning_rate / config.batch,
            momentum=config.momentum,
            weight_decay=config.decay,
        )
    else:
        # Fail before any work is done instead of hitting a NameError on
        # the first optimizer.step().
        raise ValueError(
            f'Unsupported TRAIN_OPTIMIZER: {config.TRAIN_OPTIMIZER!r}')

    criterion = Yolo_loss(device=device,
                          batch=config.batch // config.subdivisions,
                          n_classes=config.classes,
                          image_size=config.width)

    save_prefix = 'Yolov4_tiny_epoch'
    # Image rendered at every logging step as a visual sanity check.
    sample_img_path = '/workspace/GitHub/YOLO/sample_data/sample_0.jpg'
    # Parenthesised on purpose: `step % 10 * subdivisions` parses as
    # `(step % 10) * subdivisions`, which ignores `subdivisions` entirely.
    update_interval = 10 * config.subdivisions

    model.train()
    global_step = 0
    writer = SummaryWriter(log_dir=config.TRAIN_TENSORBOARD_DIR)
    try:
        for epoch in range(epochs):
            with tqdm(total=n_train,
                      desc=f'Epoch {epoch + 1}/{epochs}',
                      unit='img',
                      ncols=50) as pbar:
                print("\n", "-" * 50)
                print("EVALUATION CHECK")
                print("-" * 50, "\n")

                print()
                print("START ITER")
                for batch in train_loader:
                    global_step += 1

                    images = batch[0].to(device=device, dtype=torch.float32)
                    bboxes = batch[1].to(device=device)

                    bboxes_pred = model(images)
                    loss, loss_xy, loss_wh, loss_obj, loss_cls, loss_l2 = criterion(
                        bboxes_pred, bboxes)
                    # Gradients keep accumulating until the next step below.
                    loss.backward()

                    if global_step % update_interval == 0:
                        optimizer.step()
                        model.zero_grad()

                        # Read each scalar once; every .item() call copies
                        # the value back from the device.
                        total = loss.item()
                        xy = loss_xy.item()
                        wh = loss_wh.item()
                        obj = loss_obj.item()
                        cls = loss_cls.item()
                        l2 = loss_l2.item()

                        # NOTE(review): same value as 'train/Loss' under a
                        # blank-prefixed tag; confirm it is still wanted.
                        writer.add_scalar(' /Loss', total, global_step)

                        print("ADD TO TENSORBOARD")
                        writer.add_scalar('train/Loss', total, global_step)
                        writer.add_scalar('train/loss_xy', xy, global_step)
                        writer.add_scalar('train/loss_wh', wh, global_step)
                        writer.add_scalar('train/loss_obj', obj, global_step)
                        writer.add_scalar('train/loss_cls', cls, global_step)
                        writer.add_scalar('train/loss_l2', l2, global_step)
                        pbar.set_postfix(
                            **{
                                'loss (batch)': total,
                                'loss_xy': xy,
                                'loss_wh': wh,
                                'loss_obj': obj,
                                'loss_cls': cls,
                                'loss_l2': l2,
                            })
                        logging.debug(
                            'Train step_{}: loss : {},loss xy : {},loss wh : {},'
                            'loss obj : {},loss cls : {},loss l2 : {},lr : {}'.
                            format(global_step, total, xy, wh, obj, cls, l2,
                                   config.learning_rate))

                        # Render one sample prediction in eval mode, then
                        # return to training mode.
                        model.eval()
                        detect.detect_img(model,
                                          sample_img_path,
                                          savename='prediction_sample.jpg',
                                          img_size=640)
                        model.train()

                    pbar.update(images.shape[0])

                if save_cp:
                    try:
                        os.makedirs(config.checkpoints, exist_ok=True)
                        logging.info('Created checkpoint directory')
                    except OSError:
                        pass
                    save_path = os.path.join(config.checkpoints,
                                             f'{save_prefix}{epoch + 1}.pth')
                    torch.save(model.state_dict(), save_path)
                    logging.info(f'Checkpoint {epoch + 1} saved !')
                    # Old checkpoints are not rotated out in this variant.
    finally:
        # Flush and close the event file even when training is interrupted.
        writer.close()
Exemplo n.º 6
0
def train(model,
          device,
          config,
          epochs=5,
          batch_size=1,
          save_cp=True,
          log_step=20,
          img_scale=0.5,
          freeze_backbone=False):
    """Train the YOLOv4 network with the given configuration.

    Each iteration back-propagates one mini-batch of
    ``config.batch // config.subdivisions`` images; the optimizer and the
    burn-in/step LR scheduler advance once every ``config.subdivisions``
    iterations, so gradients accumulate in between. Progress is reported
    every ``log_step * config.subdivisions`` iterations. When ``save_cp``
    is true a checkpoint is written after each epoch, and the oldest one
    is deleted once more than ``config.keep_checkpoint_max`` exist.

    Args:
        model: Network to train; invoked as ``model(images)``.
        device: Torch device the image and box tensors are moved to.
        config: Settings object. Reads ``train_label``, ``val_label``,
            ``batch``, ``subdivisions``, ``learning_rate``, ``width``,
            ``classes``, ``pretrained``, ``TRAIN_OPTIMIZER``, ``burn_in``,
            ``steps``, ``checkpoints``, ``keep_checkpoint_max`` and, for
            SGD, ``momentum`` and ``decay``.
        epochs: Number of passes over the training set.
        batch_size: Unused; the batch size comes from ``config``.
        save_cp: Whether to write a checkpoint after each epoch.
        log_step: Progress is reported every
            ``log_step * config.subdivisions`` iterations.
        img_scale: Unused.
        freeze_backbone: If true, every parameter whose top-level module
            name does not contain ``head`` is frozen during the first
            epoch and unfrozen at the start of the second.

    Raises:
        ValueError: If ``config.TRAIN_OPTIMIZER`` is neither ``adam`` nor
            ``sgd`` (case-insensitive).
    """
    train_dataset = Yolo_dataset(config.train_label, config, train=True)
    val_dataset = Yolo_dataset(config.val_label, config, train=False)

    n_train = len(train_dataset)
    # The validation set is only sized for the start-up log below.
    n_val = len(val_dataset)

    train_loader = DataLoader(train_dataset,
                              batch_size=config.batch // config.subdivisions,
                              shuffle=True,
                              num_workers=8,
                              pin_memory=True,
                              drop_last=True,
                              collate_fn=collate)

    global_step = 0
    logging.info(f'''Starting training:
        Epochs:          {epochs}
        Batch size:      {config.batch}
        Subdivisions:    {config.subdivisions}
        Learning rate:   {config.learning_rate}
        Training size:   {n_train}
        Validation size: {n_val}
        Checkpoints:     {save_cp}
        Device:          {device.type}
        Images size:     {config.width}
        Optimizer:       {config.TRAIN_OPTIMIZER}
        Dataset classes: {config.classes}
        Train label path:{config.train_label}
        Pretrained:      {config.pretrained}
    ''')

    # LR multiplier: quartic ramp during burn-in, then 1.0 / 0.1 / 0.01
    # across the two thresholds in config.steps.
    def burnin_schedule(i):
        if i < config.burn_in:
            factor = pow(i / config.burn_in, 4)
        elif i < config.steps[0]:
            factor = 1.0
        elif i < config.steps[1]:
            factor = 0.1
        else:
            factor = 0.01
        return factor

    optimizer_name = config.TRAIN_OPTIMIZER.lower()
    if optimizer_name == 'adam':
        optimizer = optim.Adam(
            model.parameters(),
            lr=config.learning_rate / config.batch,
            betas=(0.9, 0.999),
            eps=1e-08,
        )
    elif optimizer_name == 'sgd':
        optimizer = optim.SGD(
            params=model.parameters(),
            lr=config.learning_rate / config.batch,
            momentum=config.momentum,
            weight_decay=config.decay,
        )
    else:
        # Fail here rather than with a NameError when the scheduler is built.
        raise ValueError(
            f'Unsupported TRAIN_OPTIMIZER: {config.TRAIN_OPTIMIZER!r}')
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, burnin_schedule)

    criterion = Yolo_loss(device=device,
                          batch=config.batch // config.subdivisions,
                          n_classes=config.classes)

    save_prefix = 'Yolov4_epoch'
    saved_models = deque()
    log_interval = log_step * config.subdivisions

    model.train()
    for epoch in range(epochs):
        # Freeze everything but the head for epoch 0, release it at epoch 1.
        if freeze_backbone and epoch < 2:
            trainable = epoch != 0
            for name, param in model.named_parameters():
                if 'head' not in name.split('.')[0]:
                    param.requires_grad = trainable

        with tqdm(total=n_train,
                  desc=f'Epoch {epoch + 1}/{epochs}',
                  unit='img',
                  ncols=200) as progress_bar:
            for batch in train_loader:
                global_step += 1

                images = batch[0].to(device=device, dtype=torch.float32)
                bboxes = batch[1].to(device=device)

                bboxes_pred = model(images)
                loss, loss_xy, loss_wh, loss_obj, loss_cls, loss_l2 = criterion(
                    bboxes_pred, bboxes)
                # Gradients keep accumulating until the next step below.
                loss.backward()

                if global_step % config.subdivisions == 0:
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()

                if global_step % log_interval == 0:
                    # Read the LR the scheduler last applied directly from
                    # the optimizer; scheduler.get_lr() warns when called
                    # outside of step().
                    shown_lr = optimizer.param_groups[0]['lr'] * config.batch
                    total = loss.item()
                    xy = loss_xy.item()
                    wh = loss_wh.item()
                    obj = loss_obj.item()
                    cls = loss_cls.item()
                    l2 = loss_l2.item()

                    progress_bar.set_postfix(
                        **{
                            'loss (batch)': total,
                            'loss_xy': xy,
                            'loss_wh': wh,
                            'loss_obj': obj,
                            'loss_cls': cls,
                            'loss_l2': l2,
                            'lr': shown_lr
                        })
                    logging.debug(
                        'Train step_{}: loss : {}, loss xy : {}, loss wh : {},'
                        ' loss obj : {}, loss cls : {}, loss l2 : {}, lr : {}'.
                        format(global_step, total, xy, wh, obj, cls, l2,
                               shown_lr))

                progress_bar.update(images.shape[0])

            # NOTE: per-epoch COCO evaluation is disabled in this variant.
            # Re-enabling it needs a validation DataLoader, an inference-mode
            # copy of the model built from `config`, and a SummaryWriter for
            # the AP/AR scalars. No evaluation model is built until then.

            if save_cp:
                try:
                    os.makedirs(config.checkpoints, exist_ok=True)
                    logging.info('Created checkpoint directory')
                except OSError:
                    pass
                save_path = os.path.join(config.checkpoints,
                                         f'{save_prefix}{epoch + 1}.pth')
                torch.save(model.state_dict(), save_path)
                logging.info(f'Checkpoint {epoch + 1} saved !')
                saved_models.append(save_path)
                # Rotate out the oldest checkpoint once the cap is exceeded;
                # a cap of 0 or less disables rotation.
                if len(saved_models) > config.keep_checkpoint_max > 0:
                    model_to_remove = saved_models.popleft()
                    try:
                        os.remove(model_to_remove)
                    except OSError:
                        logging.info(f'failed to remove {model_to_remove}')
Exemplo n.º 7
0
def train(model,
          device,
          config,
          epochs=5,
          batch_size=1,
          save_cp=True,
          log_step=20,
          img_scale=0.5):
    """Train ``model`` with Adam and append loss values to ``loss.txt``.

    Each iteration back-propagates one mini-batch of
    ``config.batch // config.subdivisions`` images; the optimizer and the
    burn-in/step LR scheduler advance once every ``config.subdivisions``
    iterations within an epoch. Every ``log_step * config.subdivisions``
    global iterations one line of space-separated values (total loss, xy,
    wh, obj, cls, l2, learning rate, each rounded to 3 decimals) is
    written to ``loss.txt`` and the same values are logged. When
    ``save_cp`` is true a checkpoint is written every 10th epoch.
    Validation is disabled in this variant.

    Args:
        model: Network to train; invoked as ``model(images)``.
        device: Torch device the image and box tensors are moved to.
        config: Settings object. Reads ``train_label``, ``batch``,
            ``subdivisions``, ``learning_rate``, ``width``, ``classes``,
            ``TRAIN_OPTIMIZER`` (logged only), ``burn_in``, ``steps`` and
            ``checkpoints``.
        epochs: Number of passes over the training set.
        batch_size: Unused; the batch size comes from ``config``.
        save_cp: Whether to write checkpoints.
        log_step: Loss values are recorded every
            ``log_step * config.subdivisions`` iterations.
        img_scale: Unused.
    """
    train_dataset = Yolo_dataset(config.train_label, config)

    n_train = len(train_dataset)
    # No validation set is loaded; the size is reported as 0 in the log.
    n_val = 0

    train_loader = DataLoader(train_dataset,
                              batch_size=config.batch // config.subdivisions,
                              shuffle=True,
                              num_workers=8,
                              pin_memory=True,
                              drop_last=True,
                              collate_fn=collate)

    global_step = 0
    logging.info(f'''Starting training:
        Epochs:          {epochs}
        Batch size:      {config.batch}
        Subdivisions:    {config.subdivisions}
        Learning rate:   {config.learning_rate}
        Training size:   {n_train}
        Validation size: {n_val}
        Checkpoints:     {save_cp}
        Device:          {device.type}
        Images size:     {config.width}
        Optimizer:       {config.TRAIN_OPTIMIZER}
        Dataset classes: {config.classes}
        Train label path:{config.train_label}
        Pretrained:
    ''')

    # LR multiplier: quartic ramp during burn-in, then 1.0 / 0.1 / 0.01
    # across the two thresholds in config.steps.
    def burnin_schedule(i):
        if i < config.burn_in:
            factor = pow(i / config.burn_in, 4)
        elif i < config.steps[0]:
            factor = 1.0
        elif i < config.steps[1]:
            factor = 0.1
        else:
            factor = 0.01
        return factor

    optimizer = optim.Adam(model.parameters(),
                           lr=config.learning_rate / config.batch,
                           betas=(0.9, 0.999),
                           eps=1e-08)
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, burnin_schedule)

    criterion = Yolo_loss(device=device,
                          batch=config.batch // config.subdivisions,
                          n_classes=config.classes)

    log_interval = log_step * config.subdivisions

    model.train()
    # The context manager closes loss.txt even if training raises.
    with open('loss.txt', 'w') as outfile:
        for epoch in range(epochs):
            with tqdm(total=n_train,
                      desc=f'Epoch {epoch + 1}/{epochs}',
                      unit='img',
                      ncols=50) as pbar:
                for epoch_step, batch in enumerate(train_loader):
                    global_step += 1

                    images = batch[0].to(device=device, dtype=torch.float32)
                    bboxes = batch[1].to(device=device)

                    bboxes_pred = model(images)
                    loss, loss_xy, loss_wh, loss_obj, loss_cls, loss_l2 = criterion(
                        bboxes_pred, bboxes)
                    # Gradients keep accumulating until the next step below.
                    loss.backward()

                    # The accumulation window is counted per epoch, so it
                    # restarts at the beginning of every epoch.
                    if (epoch_step + 1) % config.subdivisions == 0:
                        optimizer.step()
                        scheduler.step()
                        model.zero_grad()

                    if global_step % log_interval == 0:
                        # Read the LR the scheduler last applied directly
                        # from the optimizer; scheduler.get_lr() warns when
                        # called outside of step().
                        shown_lr = (optimizer.param_groups[0]['lr'] *
                                    config.batch)
                        loss_values = (loss.item(), loss_xy.item(),
                                       loss_wh.item(), loss_obj.item(),
                                       loss_cls.item(), loss_l2.item())

                        outfile.write(
                            " ".join(
                                str(round(value, 3))
                                for value in (*loss_values, shown_lr)) +
                            "\n")

                        logging.info(
                            'Train step_{}: loss : {},loss xy : {},loss wh : {},'
                            'loss obj : {},loss cls : {},loss l2 : {},lr : {}'.
                            format(global_step, *loss_values, shown_lr))

                    pbar.update(images.shape[0])

                # Logical `and`: bitwise `&` gives the wrong answer for
                # truthy non-bool values of save_cp (e.g. 2 & True == 0).
                if save_cp and (epoch + 1) % 10 == 0:
                    if not os.path.isdir(config.checkpoints):
                        # makedirs also creates missing parent directories,
                        # which os.mkdir would fail on.
                        os.makedirs(config.checkpoints, exist_ok=True)
                        logging.info('Created checkpoint directory')
                    torch.save(
                        model.state_dict(),
                        os.path.join(config.checkpoints,
                                     f'Yolov4_epoch{epoch + 1}.pth'))
                    logging.info(f'Checkpoint {epoch + 1} saved !')