def main(): global best_acc start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) # Data loading code traindir = os.path.join(args.data, 'train') valdir = os.path.join(args.data, 'valf') normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) data_aug_scale = (0.08, 1.0) if args.modelsize == 'large' else (0.2, 1.0) train_loader = torch.utils.data.DataLoader( datasets.ImageFolder(traindir, transforms.Compose([ transforms.RandomResizedCrop(224, scale = data_aug_scale), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize, ])), batch_size=args.train_batch, shuffle=True, num_workers=args.workers, pin_memory=True) val_loader = torch.utils.data.DataLoader( datasets.ImageFolder(valdir, transforms.Compose([ transforms.Scale(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize, ])), batch_size=args.test_batch, shuffle=True, num_workers=args.workers, pin_memory=True) # create model # if args.pretrained: # print("=> using pre-trained model '{}'".format(args.arch)) # model = models.__dict__[args.arch](pretrained=True) # elif 'resnext' in args.arch: # model = models.__dict__[args.arch]( # baseWidth=args.base_width, # cardinality=args.cardinality, # ) # else: # print("=> creating model '{}'".format(args.arch)) # model = models.__dict__[args.arch]() model = se_resnet101().cuda() flops, params = get_model_complexity_info(model, (224, 224), as_strings=False, print_per_layer_stat=False) print('Flops: %.3f' % (flops / 1e9)) print('Params: %.2fM' % (params / 1e6)) model = torch.nn.DataParallel(model).cuda() cudnn.benchmark = True # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # Resume title = 'ImageNet-' + args.arch if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..', args.resume) assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!' args.checkpoint = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] # model may have more keys t = model.state_dict() c = checkpoint['state_dict'] flag = True for k in t: if k not in c: print('not in loading dict! fill it', k, t[k]) c[k] = t[k] flag = False model.load_state_dict(c) if flag: print('optimizer load old state') optimizer.load_state_dict(checkpoint['optimizer']) else: print('new optimizer !') logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.']) if args.evaluate: print('\nEvaluation only') test_loss, test_acc = test(val_loader, model, criterion, start_epoch, use_cuda) print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc)) return # Train and val for epoch in range(start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr'])) train_loss, train_acc = train(train_loader, model, criterion, optimizer, epoch, use_cuda) test_loss, test_acc = test(val_loader, model, criterion, epoch, use_cuda) # append logger file logger.append([state['lr'], train_loss, test_loss, train_acc, test_acc]) # save model is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) save_checkpoint({ 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'acc': test_acc, 'best_acc': best_acc, 'optimizer' : optimizer.state_dict(), }, is_best, checkpoint=args.checkpoint) logger.close() print('Best acc:') print(best_acc)
def main(): global best_acc start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.checkpoint) and args.local_rank == 0: mkdir_p(args.checkpoint) args.distributed = True args.gpu = args.local_rank torch.cuda.set_device(args.gpu) torch.distributed.init_process_group(backend='nccl', init_method='env://') args.world_size = torch.distributed.get_world_size() print('world_size = ', args.world_size) assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled." # create model if args.pretrained: print("=> using pre-trained model '{}'".format(args.arch)) model = models.__dict__[args.arch]() elif 'resnext' in args.arch: model = models.__dict__[args.arch]( baseWidth=args.base_width, cardinality=args.cardinality, ) else: print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch]() flops, params = get_model_complexity_info(model, (224, 224), as_strings=False, print_per_layer_stat=False) print('Flops: %.3f' % (flops / 1e9)) print('Params: %.2fM' % (params / 1e6)) cudnn.benchmark = True # define loss function (criterion) and optimizer # criterion = nn.CrossEntropyLoss().cuda() criterion = SoftCrossEntropyLoss(label_smoothing=args.label_smoothing).cuda() model = model.cuda() args.lr = float(0.1 * float(args.train_batch * args.world_size) / 256.) state['lr'] = args.lr optimizer = set_optimizer(model) # optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, keep_batchnorm_fp32=args.keep_batchnorm_fp32, loss_scale=args.loss_scale) # model = torch.nn.DataParallel(model).cuda() # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) model = DDP(model, delay_allreduce=True) # Data loading code traindir = os.path.join(args.data, 'img_train') valdir = os.path.join(args.data, 'img_val') # normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) data_aug_scale = (0.08, 1.0) train_dataset = datasets.ImageFolder(traindir, transforms.Compose([ transforms.RandomResizedCrop(224, scale=data_aug_scale), transforms.RandomHorizontalFlip(), # transforms.ToTensor(), # normalize, ])) val_dataset = datasets.ImageFolder(valdir, transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), # transforms.ToTensor(), # normalize, ])) train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.train_batch, shuffle=False, num_workers=args.workers, pin_memory=True, sampler=train_sampler, collate_fn=fast_collate) val_loader = torch.utils.data.DataLoader( val_dataset, batch_size=args.test_batch, shuffle=False, num_workers=args.workers, pin_memory=True, sampler=val_sampler, collate_fn=fast_collate) # Resume title = 'ImageNet-' + args.arch if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..', args.resume) assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!' args.checkpoint = os.path.dirname(args.resume) checkpoint = torch.load(args.resume, map_location='cpu') best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] # model may have more keys t = model.state_dict() c = checkpoint['state_dict'] flag = True for k in t: if k not in c: print('not in loading dict! fill it', k, t[k]) c[k] = t[k] flag = False model.load_state_dict(c) print('new optimizer !') if args.local_rank == 0: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: if args.local_rank == 0: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.']) if args.evaluate: print('\nEvaluation only') test_loss, test_acc = test(val_loader, model, criterion, start_epoch, use_cuda) print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc)) return # Train and val for epoch in range(start_epoch, args.epochs): train_sampler.set_epoch(epoch) adjust_learning_rate(optimizer, epoch) if args.local_rank == 0: print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr'])) train_loss, train_acc = train(train_loader, model, criterion, optimizer, epoch, use_cuda) test_loss, test_acc = test(val_loader, model, criterion, epoch, use_cuda) # save model if args.local_rank == 0: # append logger file logger.append([state['lr'], train_loss, test_loss, train_acc, test_acc]) is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) save_checkpoint({ 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'acc': test_acc, 'best_acc': best_acc, 'optimizer': optimizer.state_dict(), }, is_best, checkpoint=args.checkpoint) if args.local_rank == 0: logger.close() print('Best acc:') print(best_acc)