def main(opt):
    """Train an SSD model on OIDataset, evaluating and checkpointing each epoch.

    Args:
        opt: parsed command-line options; uses batch_size, num_workers, lr,
            momentum, weight_decay, multistep, epochs, nms_threshold,
            log_path and save_folder.

    Side effects: wipes and recreates ``opt.log_path``, creates
    ``opt.save_folder``, and writes ``SSD.pth`` checkpoints there.
    """
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)

    train_params = {
        "batch_size": opt.batch_size,
        "shuffle": True,
        "drop_last": False,
        "num_workers": opt.num_workers,
        "collate_fn": collate_fn,
    }
    eval_params = {
        "batch_size": opt.batch_size,
        # BUGFIX: validation data must not be shuffled — shuffling makes
        # evaluation order non-deterministic for no benefit.
        "shuffle": False,
        "drop_last": False,
        "num_workers": opt.num_workers,
        "collate_fn": collate_fn,
    }

    dboxes = generate_dboxes()
    model = SSD()
    train_set = OIDataset(SimpleTransformer(dboxes), train=True)
    train_loader = DataLoader(train_set, **train_params)
    val_set = OIDataset(SimpleTransformer(dboxes, eval=True), validation=True)
    val_loader = DataLoader(val_set, **eval_params)
    encoder = Encoder(dboxes)

    # Linear LR scaling rule: the base learning rate is tuned for batch 32.
    opt.lr = opt.lr * (opt.batch_size / 32)
    criterion = Loss(dboxes)
    optimizer = torch.optim.SGD(model.parameters(), lr=opt.lr, momentum=opt.momentum,
                                weight_decay=opt.weight_decay, nesterov=True)
    scheduler = MultiStepLR(optimizer=optimizer, milestones=opt.multistep, gamma=0.1)

    if torch.cuda.is_available():
        model.cuda()
        criterion.cuda()
        model = torch.nn.DataParallel(model)

    # BUGFIX: the original accessed model.module unconditionally, which only
    # exists when the model was wrapped in DataParallel (CUDA path). Use the
    # unwrapped module for state-dict load/save so CPU runs work too.
    core_model = model.module if isinstance(model, torch.nn.DataParallel) else model

    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.save_folder):
        os.makedirs(opt.save_folder)
    checkpoint_path = os.path.join(opt.save_folder, "SSD.pth")

    writer = SummaryWriter(opt.log_path)

    if os.path.isfile(checkpoint_path):
        # BUGFIX: map_location lets a GPU-saved checkpoint resume on CPU.
        checkpoint = torch.load(
            checkpoint_path,
            map_location=None if torch.cuda.is_available() else "cpu",
        )
        first_epoch = checkpoint["epoch"] + 1
        core_model.load_state_dict(checkpoint["model_state_dict"])
        scheduler.load_state_dict(checkpoint["scheduler"])
        optimizer.load_state_dict(checkpoint["optimizer"])
    else:
        first_epoch = 0

    for epoch in range(first_epoch, opt.epochs):
        train(model, train_loader, epoch, writer, criterion, optimizer, scheduler)
        evaluate(model, val_loader, encoder, opt.nms_threshold)

        checkpoint = {
            "epoch": epoch,
            "model_state_dict": core_model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
        }
        torch.save(checkpoint, checkpoint_path)
def main(opt):
    """Distributed training entry point for SSD/SSDLite on COCO 2017.

    Args:
        opt: parsed command-line options; uses model ("ssd"/"ssdlite"),
            data_path, batch_size, num_workers, lr, momentum, weight_decay,
            multistep, epochs, nms_threshold, amp, log_path and save_folder.

    Side effects: initializes the NCCL process group when CUDA is available,
    wipes and recreates ``opt.log_path``, creates ``opt.save_folder``, and
    writes ``SSD.pth`` checkpoints there.
    """
    if torch.cuda.is_available():
        # One process per GPU, rendezvousing through environment variables
        # (torch.distributed.launch / torchrun style).
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        num_gpus = torch.distributed.get_world_size()
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
        num_gpus = 1

    # NOTE(review): with DDP, shuffle=True without a DistributedSampler means
    # every rank iterates the full (identically ordered-per-seed) dataset;
    # consider torch.utils.data.distributed.DistributedSampler — confirm intent.
    train_params = {
        "batch_size": opt.batch_size * num_gpus,
        "shuffle": True,
        "drop_last": False,
        "num_workers": opt.num_workers,
        "collate_fn": collate_fn,
    }
    test_params = {
        "batch_size": opt.batch_size * num_gpus,
        "shuffle": False,
        "drop_last": False,
        "num_workers": opt.num_workers,
        "collate_fn": collate_fn,
    }

    if opt.model == "ssd":
        dboxes = generate_dboxes(model="ssd")
        model = SSD(backbone=ResNet(), num_classes=len(coco_classes))
    else:
        dboxes = generate_dboxes(model="ssdlite")
        model = SSDLite(backbone=MobileNetV2(), num_classes=len(coco_classes))

    train_set = CocoDataset(opt.data_path, 2017, "train",
                            SSDTransformer(dboxes, (300, 300), val=False))
    train_loader = DataLoader(train_set, **train_params)
    test_set = CocoDataset(opt.data_path, 2017, "val",
                           SSDTransformer(dboxes, (300, 300), val=True))
    test_loader = DataLoader(test_set, **test_params)
    encoder = Encoder(dboxes)

    # Linear LR scaling rule across total batch size (base tuned for 32).
    opt.lr = opt.lr * num_gpus * (opt.batch_size / 32)
    criterion = Loss(dboxes)
    optimizer = torch.optim.SGD(model.parameters(), lr=opt.lr, momentum=opt.momentum,
                                weight_decay=opt.weight_decay, nesterov=True)
    scheduler = MultiStepLR(optimizer=optimizer, milestones=opt.multistep, gamma=0.1)

    if torch.cuda.is_available():
        model.cuda()
        criterion.cuda()
        if opt.amp:
            from apex import amp
            from apex.parallel import DistributedDataParallel as DDP
            model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
        else:
            from torch.nn.parallel import DistributedDataParallel as DDP
        # It is recommended to use DistributedDataParallel, instead of
        # DataParallel, to do multi-GPU training, even on a single node.
        model = DDP(model)

    # BUGFIX: the original accessed model.module unconditionally, but the DDP
    # wrap only happens on the CUDA path — CPU runs crashed on checkpoint
    # load/save. Resolve the unwrapped module once.
    core_model = model.module if hasattr(model, "module") else model

    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.save_folder):
        os.makedirs(opt.save_folder)
    checkpoint_path = os.path.join(opt.save_folder, "SSD.pth")

    writer = SummaryWriter(opt.log_path)

    if os.path.isfile(checkpoint_path):
        # BUGFIX: map_location lets a GPU-saved checkpoint resume on CPU.
        checkpoint = torch.load(
            checkpoint_path,
            map_location=None if torch.cuda.is_available() else "cpu",
        )
        first_epoch = checkpoint["epoch"] + 1
        core_model.load_state_dict(checkpoint["model_state_dict"])
        scheduler.load_state_dict(checkpoint["scheduler"])
        optimizer.load_state_dict(checkpoint["optimizer"])
    else:
        first_epoch = 0

    for epoch in range(first_epoch, opt.epochs):
        train(model, train_loader, epoch, writer, criterion, optimizer,
              scheduler, opt.amp)
        evaluate(model, test_loader, epoch, writer, encoder, opt.nms_threshold)

        checkpoint = {
            "epoch": epoch,
            "model_state_dict": core_model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
        }
        torch.save(checkpoint, checkpoint_path)
def main(args):
    """Train an SSD detector on Pascal VOC, plotting losses and checkpointing.

    BUGFIX: the parameter was misspelled ``agrs`` while the entire body
    referenced ``args``, which raises NameError at runtime (or silently picks
    up an unrelated module-level global).

    Args:
        args: parsed command-line options; uses resume, trained_model,
            overlap_threshold, negpos_ratio, alpha, dataroot, datayears,
            datanames, discard_difficult, use_augment, batch_size, iterations,
            lr_decay, clip_grad and model_save_name.
    """
    start_epoch = 0

    # Initialize model or load trained checkpoint.
    if args.resume:
        start_epoch, model, optimizer = load_checkpoint(args.trained_model)
    else:
        model = SSD('train', args)
        optimizer = init_optimizer(model, args)

    # Move to default device and set 'train' mode.
    model = model.cuda()
    model.train()

    # Create multibox loss over the fixed prior boxes.
    criterion = MultiBoxLoss(PriorBox().forward().cuda(), args.overlap_threshold,
                             args.negpos_ratio, args.alpha)

    # VOC dataloaders.
    train_dataset = VOCxx('train', args.dataroot, args.datayears, args.datanames,
                          discard_difficult=args.discard_difficult,
                          use_augment=args.use_augment)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=train_dataset.collate_fn,
                              num_workers=1,
                              pin_memory=True)

    # Convert the iteration budget into whole epochs; the same conversion
    # maps iteration-based LR decay points onto epoch indices.
    iters_per_epoch = len(train_dataset) // args.batch_size
    epochs = args.iterations // iters_per_epoch
    decay_iters = [int(it) for it in args.lr_decay.split(',')]
    decay_lr_at = [it // iters_per_epoch for it in decay_iters]
    print('total length of dataset : ', len(train_dataset))
    print('total epochs : ', epochs)
    print('decay lr at : ', decay_lr_at)

    # Epochs
    loc_losses, conf_losses = [], []
    for epoch in range(start_epoch, epochs):
        # Decay learning rate at particular epochs.
        if epoch in decay_lr_at:
            optimizer = adjust_lr(optimizer)

        for i, (images, targets) in enumerate(train_loader):
            # Move to default device.
            images = images.cuda()
            targets = [t.cuda() for t in targets]

            # Forward prop.
            preds = model(images)

            # Loss: localization (regression) + confidence (classification).
            loc_loss, conf_loss = criterion(preds, targets)
            loss = loc_loss + conf_loss

            # Backward prop.
            optimizer.zero_grad()
            loss.backward()

            # Clip gradients if necessary.
            if args.clip_grad:
                clip_gradient(model.parameters(), args.clip_grad)

            # Update model.
            optimizer.step()

            # Print status.
            if i % 200 == 0:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Loss : {loss:.4f}\t'.format(epoch, i, len(train_loader),
                                                   loss=loss.item()))

        # Record the last batch's loss components for this epoch.
        loc_losses.append(loc_loss.item())
        conf_losses.append(conf_loss.item())

        # Plot losses
        plot_losses(loc_losses, 'regression', args.model_save_name)
        plot_losses(conf_losses, 'classification', args.model_save_name)

        # Save checkpoint
        save_checkpoint(epoch, model, optimizer, args.model_save_name)