예제 #1
0
def main():
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True

    # parse the configurations
    parser = argparse.ArgumentParser(
        description='Additioal configurations for training',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--gpu_ids',
        type=str,
        default='-1',
        help=
        'IDs of GPUs to use (please use `,` to split multiple IDs); -1 means CPU only'
    )
    parser.add_argument('--tr_list',
                        type=str,
                        required=True,
                        help='Path to the list of training files')
    parser.add_argument('--cv_file',
                        type=str,
                        required=True,
                        help='Path to the cross validation file')
    parser.add_argument('--ckpt_dir',
                        type=str,
                        required=True,
                        help='Name of the directory to dump checkpoint')
    parser.add_argument('--unit',
                        type=str,
                        required=True,
                        help='Unit of sample, can be either `seg` or `utt`')
    parser.add_argument(
        '--logging_period',
        type=int,
        default=1000,
        help=
        'Logging period (also the period of cross validation) represented by the number of iterations'
    )
    parser.add_argument('--time_log',
                        type=str,
                        default='',
                        help='Log file for timing batch processing')
    parser.add_argument('--batch_size',
                        type=int,
                        default=16,
                        help='Minibatch size')
    parser.add_argument('--buffer_size',
                        type=int,
                        default=32,
                        help='Buffer size')
    parser.add_argument('--segment_size',
                        type=float,
                        default=4.0,
                        help='Length of segments used for training (seconds)')
    parser.add_argument('--segment_shift',
                        type=float,
                        default=1.0,
                        help='Shift of segments used for training (seconds)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help='Initial learning rate for training')
    parser.add_argument('--lr_decay_factor',
                        type=float,
                        default=0.98,
                        help='Decaying factor of learning rate')
    parser.add_argument('--lr_decay_period',
                        type=int,
                        default=2,
                        help='Decaying period of learning rate (epochs)')
    parser.add_argument('--clip_norm',
                        type=float,
                        default=-1.0,
                        help='Gradient clipping (L2-norm)')
    parser.add_argument('--max_n_epochs',
                        type=int,
                        default=100,
                        help='Maximum number of epochs')
    parser.add_argument('--loss_log',
                        type=str,
                        default='loss.txt',
                        help='Filename of the loss log')
    parser.add_argument('--resume_model',
                        type=str,
                        default='',
                        help='Existing model to resume training from')

    args = parser.parse_args()
    logger.info('Arguments in command:\n{}'.format(pprint.pformat(vars(args))))

    model = Model()
    model.train(args)
예제 #2
0
def main():
    """ Train and test

    :param opt: args
    :param writer: tensorboard
    :return:
    """

    global opt
    opt = parse()

    arc = opt.arc
    cfg = opt.cfg
    teacher_cfg = opt.teacher_cfg
    img_size = opt.img_size
    epochs = opt.epochs
    batch_size = opt.batch_size
    accumulate = opt.accumulate  # effective bs = batch_size * accumulate = 16 * 4 = 64
    weights = opt.weights
    teacher_weights = opt.teacher_weights
    multi_scale = opt.multi_scale
    sparsity_training = opt.st

    opt.weights = last if opt.resume else opt.weights

    # Initial logging
    logging.basicConfig(
        format="%(message)s",
        level=logging.INFO if opt.local_rank in [-1, 0] else logging.WARN)

    # Train
    logger.info(opt)
    if opt.local_rank in [-1, 0]:
        logger.info('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
        writer = SummaryWriter()

    # Hyperparameters
    with open(opt.hyp) as f_hyp:
        hyp = yaml.safe_load(f_hyp)
    # data dict
    with open(opt.data) as f_data:
        data = yaml.safe_load(f_data)

    # Distributed training initialize
    device = select_device(opt.device)
    if opt.local_rank != -1:
        dist.init_process_group(init_method="env://", backend='nccl')
        torch.cuda.set_device(opt.local_rank)
        device = torch.device(f"cuda:{opt.local_rank}")
        # world_size = torch.distributed.get_world_size()

    init_seeds()
    cuda = device.type != 'cpu'
    torch.backends.cudnn.benchmark = True

    if multi_scale:
        img_size_min = round(img_size / 32 / 1.5) + 1
        img_size_max = round(img_size / 32 * 1.5) - 1
        img_size = img_size_max * 32  # initiate with maximum multi_scale size
        logger.info(f'Using multi-scale  {img_size_min * 32} - {img_size}')

    train_path = data['train']
    num_classes = int(data['num_classes'])  # number of classes

    # Load dataset
    dataset = LoadImagesAndLabels(train_path,
                                  img_size,
                                  batch_size,
                                  augment=True,
                                  hyp=hyp,
                                  rect=opt.rect)
    train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) if opt.local_rank != -1 else None
    num_worker = os.cpu_count() // torch.cuda.device_count()
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             num_workers=min([num_worker, batch_size, 8]),
                                             shuffle=not (opt.rect or train_sampler),
                                             sampler=train_sampler,
                                             pin_memory=True,
                                             collate_fn=dataset.collate_fn)

    # Load model
    model = Model(cfg, img_size, arc=arc).to(device)

    # Load teacher model
    if teacher_cfg:
        teacher_model = Model(teacher_cfg, img_size, arc).to(device)

    # optimizer parameter groups
    param_group0, param_group1 = [], []
    for key, value in model.named_parameters():
        if 'Conv2d.weight' in key:
            param_group1.append(value)
        else:
            param_group0.append(value)
    if opt.adam:
        optimizer = optim.Adam(param_group0, lr=hyp['lr0'])
    else:
        optimizer = optim.SGD(param_group0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
    # add param_group1 with weight_decay
    optimizer.add_param_group({'params': param_group1, 'weight_decay': hyp['weight_decay']})
    logger.info(f'Optimizer groups: {len(param_group1)} conv.weight, {len(param_group0)} other')
    del param_group0, param_group1

    start_epoch = 0
    best_fitness = 0.
    if weights.endswith('.pt'):
        checkpoint = torch.load(weights, map_location=device)
        state_dict = intersect_dicts(checkpoint['model'], model.state_dict())
        model.load_state_dict(state_dict, strict=False)
        print('loaded weights from', weights, '\n')

        # load optimizer
        if checkpoint['optimizer'] is not None:
            optimizer.load_state_dict(checkpoint['optimizer'])
            best_fitness = checkpoint['best_fitness']
        # load results
        if checkpoint.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(checkpoint['training_results'])
        # resume
        if opt.resume:
            start_epoch = checkpoint['epoch'] + 1
        del checkpoint

    elif len(weights) > 0:
        # weights are 'yolov4.weights', 'darknet53.conv.74' etc.
        load_darknet_weights(model, weights)
        logger.info(f'loaded weights from {weights}\n')

    # Load teacher weights
    if teacher_cfg:
        if teacher_weights.endswith('.pt'):
            teacher_model.load_state_dict(torch.load(teacher_weights, map_location=device)['model'])
        elif teacher_weights.endswith('.weights'):
            load_darknet_weights(teacher_model, teacher_weights)
        else:
            raise Exception('pls provide proper teacher weights for knowledge distillation')
        if not mixed_precision:
            teacher_model.eval()
        logger.info('<......................using knowledge distillation....................>')
        logger.info(f'teacher model: {teacher_weights}\n')

    # Sparsity training
    if opt.prune == 0:
        _, _, prune_index = parse_module_index(model.module_dicts)
        if sparsity_training:
            logger.info('normal sparse training')

    if mixed_precision:
        if teacher_cfg:
            [model, teacher_model], optimizer = amp.initialize([model, teacher_model], optimizer,
                                                               opt_level='O1', verbosity=1)
        else:
            model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=1)

    # SyncBatchNorm and distributed training
    if cuda and opt.local_rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        model = model.to(device)
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[opt.local_rank])
        model.module_list = model.module.module_list
        model.yolo_layers = model.module.yolo_layers

    for index in prune_index:
        bn_weights = gather_bn_weights(model.module_list, [index])
        if opt.local_rank == 0:
            writer.add_histogram('before_train_per_layer_bn_weights/hist', bn_weights.numpy(), index, bins='doane')

    # Start training
    model.num_classes = num_classes
    model.arc = opt.arc
    model.hyp = hyp
    num_batch_size = len(dataloader)
    # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    results = (0, 0, 0, 0, 0, 0, 0)
    start_train_time = time.time()
    logger.info('Image sizes %d \n Starting training for %d epochs...', img_size, epochs)

    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()

        mean_losses = torch.zeros(4).to(device)
        mean_soft_target = torch.zeros(1).to(device)
        pbar = enumerate(dataloader)
        logger.info(('\n %10s %10s %10s %10s %10s %10s %10s %10s'), 'Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total',
                    'targets', 'img_size')
        if opt.local_rank in [-1, 0]:
            pbar = tqdm(pbar, total=num_batch_size)
        optimizer.zero_grad()

        for i, (imgs, targets, _, _) in pbar:  # batch -------------------------------------------------------------
            num_integrated_batches = i + num_batch_size * epoch

            # Adjust the learning rate
            learning_rate = adjust_learning_rate(optimizer, num_integrated_batches, num_batch_size, hyp, epoch, epochs)
            if i == 0 and opt.local_rank in [-1, 0]:
                logger.info(f'learning rate: {learning_rate}')
            imgs = imgs.to(device) / 255.0
            targets = targets.to(device)

            # Multi-Scale training
            if multi_scale:
                if num_integrated_batches / accumulate % 10 == 0:
                    img_size = random.randrange(img_size_min, img_size_max + 1) * 32
                scale_factor = img_size / max(imgs.shape[2:])
                if scale_factor != 1:
                    new_shape = [math.ceil(x * scale_factor / 32.) * 32 for x in imgs.shape[2:]]
                    imgs = F.interpolate(imgs, size=new_shape, mode='bilinear', align_corners=False)

            pred = model(imgs)

            # Compute loss
            loss, loss_items = compute_loss(pred, targets, model)

            # knowledge distillation
            soft_target = 0
            if teacher_cfg:
                if mixed_precision:
                    with torch.no_grad():
                        output_teacher = teacher_model(imgs)
                else:
                    _, output_teacher = teacher_model(imgs)
                soft_target = distillation_loss(pred, output_teacher, model.num_classes, imgs.size(0))
                loss += soft_target

            # Scale loss by nominal batch_size of 64
            loss *= batch_size / 64

            # Compute gradient
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Sparse the BN layer that needs pruning
            if sparsity_training:
                # bn_l1_regularization(model.module_list, opt.penalty_factor, cba_index, epoch, epochs)
                bn_l1_regularization(model.module_list, opt.penalty_factor, prune_index, epoch, epochs)

            # Accumulate gradient for x batches before optimizing
            if num_integrated_batches % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()

            if opt.local_rank in [-1, 0]:
                mean_losses = (mean_losses * i + loss_items) / (i + 1)
                mean_soft_target = (mean_soft_target * i + soft_target) / (i + 1)
                memory = torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0  # (GB)
                description = ('%10s' * 2 + '%10.3g' * 6) % (
                    '%g/%g' % (epoch, epochs - 1), '%.3gG' % memory, *mean_losses, mean_soft_target, img_size)
                pbar.set_description(description)

            # end batch ------------------------------------------------------------------------------------------------

        # Update scheduler
        # scheduler.step()

        if opt.local_rank in [-1, 0]:
            final_epoch = epoch + 1 == epochs
            # Calculate mAP
            if not (opt.notest or opt.nosave) or final_epoch:
                with torch.no_grad():
                    results, _ = test(cfg, data,
                                      batch_size=batch_size,
                                      img_size=opt.img_size,
                                      model=model,
                                      conf_thres=0.001 if final_epoch and epoch > 0 else 0.1,  # 0.1 for speed
                                      save_json=final_epoch and epoch > 0)

            # Write epoch results
            with open(results_file, 'a') as file:
                # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
                file.write(description + '%10.3g' * 7 % results + '\n')

            # Write Tensorboard results
            if writer:
                outputs = list(mean_losses) + list(results)
                titles = ['GIoU', 'Objectness', 'Classification', 'Train loss',
                          'Precision', 'Recall', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification']
                for output, title in zip(outputs, titles):
                    writer.add_scalar(title, output, epoch)
                bn_weights = gather_bn_weights(model.module_list, prune_index)
                writer.add_histogram('bn_weights/hist', bn_weights.numpy(), epoch, bins='doane')

            # Update best mAP
            fitness = results[2]
            if fitness > best_fitness:
                best_fitness = fitness

            # Save training results
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save and opt.local_rank == 0:
                with open(results_file, 'r') as file:
                    # Create checkpoint
                    checkpoint = {'epoch': epoch,
                                  'best_fitness': best_fitness,
                                  'training_results': file.read(),
                                  'model': model.module.state_dict() if isinstance(
                                   model, nn.parallel.DistributedDataParallel) else model.state_dict(),
                                  'optimizer': None if final_epoch else optimizer.state_dict()}

                # Save last checkpoint
                torch.save(checkpoint, last)

                # Save best checkpoint
                if best_fitness == fitness:
                    torch.save(checkpoint, best)

                # Delete checkpoint
                del checkpoint

            # end epoch -----------------------------------------------------------------------------------------------
    # end training

    if opt.local_rank in [-1, 0]:
        if len(opt.name):
            os.rename('results.txt', 'results_%s.txt' % opt.name)
        plot_results()  # save as results.png
        print(f'{epoch - start_epoch + 1} epochs completed in {(time.time() - start_train_time) / 3600:.3f} hours.\n')
    if torch.cuda.device_count() > 1:
        dist.destroy_process_group()
    torch.cuda.empty_cache()
    return results