Example #1
import os
import shutil

import torch
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

# Project-local imports; the module paths are assumptions inferred from
# the identifiers used below.
from src.model import SSD
from src.utils import generate_dboxes, Encoder
from src.transform import SimpleTransformer
from src.loss import Loss
from src.dataset import OIDataset, collate_fn
from src.process import train, evaluate


def main(opt):
    # Seed the CPU RNG unconditionally (data shuffling uses it), and the
    # CUDA RNG as well when a GPU is present.
    torch.manual_seed(123)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)

    train_params = {
        "batch_size": opt.batch_size,
        "shuffle": True,
        "drop_last": False,
        "num_workers": opt.num_workers,
        "collate_fn": collate_fn
    }

    eval_params = {
        "batch_size": opt.batch_size,
        "shuffle": True,
        "drop_last": False,
        "num_workers": opt.num_workers,
        "collate_fn": collate_fn
    }

    dboxes = generate_dboxes()
    model = SSD()
    train_set = OIDataset(SimpleTransformer(dboxes), train=True)
    train_loader = DataLoader(train_set, **train_params)
    val_set = OIDataset(SimpleTransformer(dboxes, eval=True), validation=True)
    val_loader = DataLoader(val_set, **eval_params)

    encoder = Encoder(dboxes)

    # Linear scaling rule: scale the base learning rate with the batch
    # size, relative to a reference batch size of 32.
    opt.lr = opt.lr * (opt.batch_size / 32)
    criterion = Loss(dboxes)

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=opt.lr,
                                momentum=opt.momentum,
                                weight_decay=opt.weight_decay,
                                nesterov=True)
    scheduler = MultiStepLR(optimizer=optimizer,
                            milestones=opt.multistep,
                            gamma=0.1)

    if torch.cuda.is_available():
        model.cuda()
        criterion.cuda()

    # DataParallel degrades to a transparent wrapper when no GPU is
    # present, so the model.module references below stay valid on CPU.
    model = torch.nn.DataParallel(model)

    # Start each run with a fresh TensorBoard log directory.
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)

    if not os.path.isdir(opt.save_folder):
        os.makedirs(opt.save_folder)
    checkpoint_path = os.path.join(opt.save_folder, "SSD.pth")

    writer = SummaryWriter(opt.log_path)

    if os.path.isfile(checkpoint_path):
        # map_location="cpu" keeps resuming possible on a CPU-only machine;
        # load_state_dict moves the weights back to the parameters' device.
        checkpoint = torch.load(checkpoint_path, map_location="cpu")
        first_epoch = checkpoint["epoch"] + 1
        model.module.load_state_dict(checkpoint["model_state_dict"])
        scheduler.load_state_dict(checkpoint["scheduler"])
        optimizer.load_state_dict(checkpoint["optimizer"])
    else:
        first_epoch = 0

    for epoch in range(first_epoch, opt.epochs):
        train(model, train_loader, epoch, writer, criterion, optimizer,
              scheduler)
        evaluate(model, val_loader, encoder, opt.nms_threshold)

        checkpoint = {
            "epoch": epoch,
            "model_state_dict": model.module.state_dict(),
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict()
        }
        torch.save(checkpoint, checkpoint_path)
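
Both loaders in Example #1 rely on a project-defined collate_fn that is not shown. As a rough sketch of what such a function typically looks like for detection data, assuming each dataset item is an (image, boxes, labels) triple (the real implementation may differ, e.g. when targets are pre-encoded against the default boxes):

import torch

def collate_fn(batch):
    # Drop samples whose image or annotations failed to load.
    batch = [sample for sample in batch if sample is not None]
    # Images share a fixed size after the transform, so they stack.
    images = torch.stack([sample[0] for sample in batch], dim=0)
    # Per-image box/label tensors can differ in length, so keep lists.
    boxes = [sample[1] for sample in batch]
    labels = [sample[2] for sample in batch]
    return images, boxes, labels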
Example #2
import os
import shutil

import torch
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

# Project-local imports; the module paths are assumptions inferred from
# the identifiers used below.
from src.model import SSD, SSDLite, ResNet, MobileNetV2
from src.utils import generate_dboxes, Encoder, coco_classes
from src.transform import SSDTransformer
from src.loss import Loss
from src.dataset import CocoDataset, collate_fn
from src.process import train, evaluate


def main(opt):
    # Seed the CPU RNG unconditionally, and the CUDA RNG when present.
    torch.manual_seed(123)
    if torch.cuda.is_available():
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        num_gpus = torch.distributed.get_world_size()
        torch.cuda.manual_seed(123)
    else:
        num_gpus = 1

    # NOTE: under multi-process DDP each process would normally draw a
    # distinct shard via DistributedSampler (see the sketch after this
    # example) rather than shuffling the full dataset.
    train_params = {
        "batch_size": opt.batch_size * num_gpus,
        "shuffle": True,
        "drop_last": False,
        "num_workers": opt.num_workers,
        "collate_fn": collate_fn
    }

    test_params = {
        "batch_size": opt.batch_size * num_gpus,
        "shuffle": False,
        "drop_last": False,
        "num_workers": opt.num_workers,
        "collate_fn": collate_fn
    }

    if opt.model == "ssd":
        dboxes = generate_dboxes(model="ssd")
        model = SSD(backbone=ResNet(), num_classes=len(coco_classes))
    else:
        dboxes = generate_dboxes(model="ssdlite")
        model = SSDLite(backbone=MobileNetV2(), num_classes=len(coco_classes))
    train_set = CocoDataset(opt.data_path, 2017, "train",
                            SSDTransformer(dboxes, (300, 300), val=False))
    train_loader = DataLoader(train_set, **train_params)
    test_set = CocoDataset(opt.data_path, 2017, "val",
                           SSDTransformer(dboxes, (300, 300), val=True))
    test_loader = DataLoader(test_set, **test_params)

    encoder = Encoder(dboxes)

    # Linear scaling rule, accounting for both the per-GPU batch size and
    # the number of GPUs (reference batch size 32).
    opt.lr = opt.lr * num_gpus * (opt.batch_size / 32)
    criterion = Loss(dboxes)

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=opt.lr,
                                momentum=opt.momentum,
                                weight_decay=opt.weight_decay,
                                nesterov=True)
    scheduler = MultiStepLR(optimizer=optimizer,
                            milestones=opt.multistep,
                            gamma=0.1)

    if torch.cuda.is_available():
        model.cuda()
        criterion.cuda()

        if opt.amp:
            # NVIDIA apex amp is deprecated in favor of torch.cuda.amp;
            # it is kept here because the rest of the script expects it.
            from apex import amp
            from apex.parallel import DistributedDataParallel as DDP
            model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
        else:
            from torch.nn.parallel import DistributedDataParallel as DDP
        # DistributedDataParallel is recommended over DataParallel for
        # multi-GPU training, even on a single node.
        model = DDP(model)

    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)

    if not os.path.isdir(opt.save_folder):
        os.makedirs(opt.save_folder)
    checkpoint_path = os.path.join(opt.save_folder, "SSD.pth")

    writer = SummaryWriter(opt.log_path)

    if os.path.isfile(checkpoint_path):
        # map_location="cpu" keeps resuming possible without a GPU.
        checkpoint = torch.load(checkpoint_path, map_location="cpu")
        first_epoch = checkpoint["epoch"] + 1
        # The model is only DDP-wrapped when CUDA is available.
        raw_model = model.module if hasattr(model, "module") else model
        raw_model.load_state_dict(checkpoint["model_state_dict"])
        scheduler.load_state_dict(checkpoint["scheduler"])
        optimizer.load_state_dict(checkpoint["optimizer"])
    else:
        first_epoch = 0

    for epoch in range(first_epoch, opt.epochs):
        train(model, train_loader, epoch, writer, criterion, optimizer,
              scheduler, opt.amp)
        evaluate(model, test_loader, epoch, writer, encoder, opt.nms_threshold)

        checkpoint = {
            "epoch": epoch,
            # Unwrap DDP if present so the checkpoint stays loadable
            # without the wrapper.
            "model_state_dict": (model.module if hasattr(model, "module")
                                 else model).state_dict(),
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict()
        }
        torch.save(checkpoint, checkpoint_path)
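
The script above recommends DistributedDataParallel but still builds its loaders with plain shuffle=True, so under multi-process DDP every process would iterate the full dataset. A minimal sketch of the loader setup that usually pairs with DDP, using torch.utils.data.distributed.DistributedSampler (the helper name is illustrative; it assumes one process per GPU, e.g. launched via torchrun):

from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

def make_ddp_loader(dataset, batch_size, num_workers, collate_fn):
    # Each process draws a distinct shard; call sampler.set_epoch(epoch)
    # at the start of every epoch so the shuffle order changes.
    sampler = DistributedSampler(dataset, shuffle=True)
    return DataLoader(dataset,
                      batch_size=batch_size,  # per-process, not global
                      sampler=sampler,        # replaces shuffle=True
                      num_workers=num_workers,
                      collate_fn=collate_fn)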
Example #3
import torch
from torch.utils.data import DataLoader

# Project-local imports; the module paths are assumptions inferred from
# the identifiers used below.
from model import SSD
from priors import PriorBox
from loss import MultiBoxLoss
from datasets import VOCxx
from utils import (adjust_lr, clip_gradient, init_optimizer,
                   load_checkpoint, plot_losses, save_checkpoint)


def main(args):
    start_epoch = 0

    # Initialize model or load trained checkpoint
    if args.resume:
        start_epoch, model, optimizer = load_checkpoint(args.trained_model)
    else:
        model = SSD('train', args)
        optimizer = init_optimizer(model, args)

    # Move to the GPU (this script assumes CUDA is available) and set
    # 'train' mode
    model = model.cuda()
    model.train()

    # Create multibox loss
    criterion = MultiBoxLoss(PriorBox().forward().cuda(),
                             args.overlap_threshold, args.negpos_ratio,
                             args.alpha)

    # VOC dataloaders
    train_dataset = VOCxx('train',
                          args.dataroot,
                          args.datayears,
                          args.datanames,
                          discard_difficult=args.discard_difficult,
                          use_augment=args.use_augment)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=train_dataset.collate_fn,
                              num_workers=1,
                              pin_memory=True)

    # Convert the iteration-based schedule into epochs:
    # one epoch = len(train_dataset) // batch_size iterations
    iters_per_epoch = len(train_dataset) // args.batch_size
    epochs = args.iterations // iters_per_epoch
    decay_iters = [int(it) for it in args.lr_decay.split(',')]
    decay_lr_at = [it // iters_per_epoch for it in decay_iters]
    print('total length of dataset:', len(train_dataset))
    print('total epochs:', epochs)
    print('decay lr at:', decay_lr_at)

    # Epochs
    loc_losses, conf_losses = [], []
    for epoch in range(start_epoch, epochs):

        # Decay learning rate at particular epochs
        if epoch in decay_lr_at:
            optimizer = adjust_lr(optimizer)

        for i, (images, targets) in enumerate(train_loader):
            # Move to default device
            images = images.cuda()
            targets = [t.cuda() for t in targets]

            # Forward prop
            preds = model(images)

            # Loss
            loc_loss, conf_loss = criterion(preds, targets)
            loss = loc_loss + conf_loss

            # Backward prop
            optimizer.zero_grad()
            loss.backward()

            # Clip gradients if necessary (clip_gradient is sketched
            # after this example)
            if args.clip_grad:
                clip_gradient(model.parameters(), args.clip_grad)

            # Update model
            optimizer.step()

            # Print status
            if i % 200 == 0:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Loss : {loss:.4f}\t'.format(epoch,
                                                   i,
                                                   len(train_loader),
                                                   loss=loss.item()))
                loc_losses.append(loc_loss.item())
                conf_losses.append(conf_loss.item())

        # Plot losses
        plot_losses(loc_losses, 'regression', args.model_save_name)
        plot_losses(conf_losses, 'classification', args.model_save_name)

        # Save checkpoint
        save_checkpoint(epoch, model, optimizer, args.model_save_name)
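
Example #3 calls two helpers, adjust_lr and clip_gradient, that are defined elsewhere in the project. Hypothetical versions consistent with how they are called above (the real implementations may differ):

def adjust_lr(optimizer, gamma=0.1):
    # Multiply every parameter group's learning rate by gamma; 0.1
    # matches the MultiStepLR gamma used in the other examples.
    for group in optimizer.param_groups:
        group['lr'] *= gamma
    return optimizer

def clip_gradient(parameters, clip_value):
    # Clamp each gradient element to [-clip_value, clip_value] in place.
    for p in parameters:
        if p.grad is not None:
            p.grad.data.clamp_(-clip_value, clip_value)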