def main():
    global args
    args = parser.parse_args()
    print('config: wd', args.weight_decay, 'lr', args.lr, 'batch_size',
          args.batch_size, 'num_gpus', args.num_gpus)

    # torch.cuda.device_count() returns the number of visible GPUs.
    iteration_size = args.num_gpus // torch.cuda.device_count()  # do multiple iterations
    assert iteration_size >= 1
    args.weight_decay = args.weight_decay * iteration_size  # will cancel out with lr
    args.lr = args.lr / iteration_size
    args.batch_size = args.batch_size // iteration_size  # unchanged when only one GPU is used
    print('real: wd', args.weight_decay, 'lr', args.lr, 'batch_size',
          args.batch_size, 'iteration_size', iteration_size)

    # distributed setup
    args.distributed = args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # create model
    print("=> creating model '{}'".format(args.arch))
    # backbone without the Elastic structure
    from resnext_MulTask_clothes_conv1_split import resnext50
    model = resnext50(num_classes=data_class)  # two tasks, hence two different class counts
    # backbone with the Elastic structure
    # from resnext_MulTask_clothes_conv1_split import resnext50_elastic
    # model = resnext50_elastic(num_classes=data_class)
    # other variants tried earlier:
    # from resnext import resnext50_elastic
    # from resnext_MulTask_11 import resnext50_elastic
    # from resnext_MulTask_12 import resnext50_elastic
    # from resnext_MulTask_11 import resnext50
    # from resnext import resnext50
    # model = resnext50(num_classes=80)
    # model = models.__dict__[args.arch](num_classes=80)  # load a torchvision model

    # count number of parameters
    count = 0
    params = list()
    for n, p in model.named_parameters():
        if '.ups.' not in n:
            params.append(p)
            count += np.prod(p.size())
    print('Parameters:', count / 1000000.0, '(million)')

    # count flops
    model = add_flops_counting_methods(model)
    model.eval()
    image = torch.randn(1, 3, 224, 224)  # nominal input size
    model.start_flops_count()
    model(image)[0].sum()  # the multi-task model returns one output per task; use the first head
    model.stop_flops_count()
    print("GFLOPs", model.compute_average_flops_cost() / 1000000000.0)

    # normal code
    model = torch.nn.DataParallel(model).cuda()

    # BCE loss for multi-label targets
    criterion = nn.BCEWithLogitsLoss().cuda()

    # SGD optimizer
    optimizer = torch.optim.SGD([{'params': iter(params), 'lr': args.lr}],
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)  # load the checkpoint file
            # The original classifier-shape check is disabled; ImageNet transfer is always used.
            # resume = ('module.fc.bias' in checkpoint['state_dict'] and
            #           checkpoint['state_dict']['module.fc.bias'].size() == model.module.fc.bias.size()) or \
            #          ('module.classifier.bias' in checkpoint['state_dict'] and
            #           checkpoint['state_dict']['module.classifier.bias'].size() == model.module.classifier.bias.size())
            resume = False
            if resume:
                # True resume: resume training on MS-COCO
                print()
                print("resume training on MS-COCO...")
                print()
                model.load_state_dict(checkpoint['state_dict'], strict=False)
                if 'optimizer' in checkpoint:
                    optimizer.load_state_dict(checkpoint['optimizer'])
                else:
                    print('no optimizer found')
                args.start_epoch = checkpoint['epoch'] if 'epoch' in checkpoint else args.start_epoch
            else:
                # Fake resume: transfer the ImageNet weights to the clothes dataset
                print()
                print("transfer from ImageNet to the clothes dataset...")
                print()
                pretrained_dict = checkpoint['state_dict']
                model_dict = model.state_dict()
                # keep only the pretrained weights whose names exist in the current model
                pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
                model_dict.update(pretrained_dict)
                model.load_state_dict(model_dict)
                # for n, p in list(checkpoint['state_dict'].items()):
                #     if 'classifier' in n or 'fc' in n:
                #         print(n, 'deleted from state_dict')
                #         del checkpoint['state_dict'][n]
                # model.load_state_dict(checkpoint['state_dict'], strict=False)
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch'] if 'epoch' in checkpoint else 'unknown'))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Slight training speed-up at no extra cost; only suitable for fixed input sizes
    # (it hurts performance with multi-scale training).
    cudnn.benchmark = True

    #################################################################################################
    # ontology labels
    train_txt_path = os.path.join("train.txt")
    val_txt_path = os.path.join("test.txt")
    # latent-semantic labels
    train_sem_txt_path = os.path.join("train-sem.txt")
    val_sem_txt_path = os.path.join("test-sem.txt")

    # normalization statistics of the clothes dataset
    normTransform = transforms.Normalize(mean=[0.56391764, 0.43714827, 0.4107524],
                                         std=[0.22986116, 0.21178758, 0.20076773])

    # training transforms
    trainTransform = transforms.Compose([
        transforms.RandomResizedCrop(224),  # random resized crop
        transforms.RandomHorizontalFlip(),  # random horizontal flip
        transforms.ToTensor(),
        normTransform                       # normalization
    ])
    # validation transforms
    valTransform = transforms.Compose([
        transforms.Resize((224, 224)),      # resize
        transforms.ToTensor(),
        normTransform                       # normalization
    ])

    # build MyDataset instances (label-file paths + transforms)
    train_data = MyDataset(txt_path=train_txt_path, txt_sem_path=train_sem_txt_path,
                           transform=trainTransform, sd='训练')  # sd: split tag ("train")
    val_data = MyDataset(txt_path=val_txt_path, txt_sem_path=val_sem_txt_path,
                         transform=valTransform, sd='测试')      # sd: split tag ("test")
    print("---------------------")

    train_sampler = torch.utils.data.sampler.RandomSampler(train_data)  # random sampler

    # build the DataLoaders
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True)
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)
    #################################################################################################

    if args.evaluate:
        validate_multi(val_loader, model, criterion)  # evaluate on the validation set, then return
        return

    for epoch in range(args.start_epoch, args.epochs):
        # adjust the learning rate
        coco_adjust_learning_rate(optimizer, epoch)
        # train for one epoch
        train_multi(train_loader, model, criterion, optimizer, epoch, iteration_size)
        print("***********************************************")
        print("Finished training epoch " + str(epoch + 1) + ", running validation...")
        print("***********************************************")
        # evaluate on the validation set
        validate_multi(val_loader, model, criterion)
        # save a checkpoint
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            },
            False,
            filename='5-clothes-result/' + args.arch + '_checkpoint.pth.tar')
def main():
    global args, best_err1
    args = parser.parse_args()
    print('config: wd', args.weight_decay, 'lr', args.lr, 'batch_size',
          args.batch_size, 'num_gpus', args.num_gpus)
    iteration_size = args.num_gpus // torch.cuda.device_count()  # do multiple iterations
    assert iteration_size >= 1
    args.weight_decay = args.weight_decay * iteration_size  # will cancel out with lr
    args.lr = args.lr / iteration_size
    args.batch_size = args.batch_size // iteration_size
    print('real: wd', args.weight_decay, 'lr', args.lr, 'batch_size',
          args.batch_size, 'iteration_size', iteration_size)

    args.distributed = args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # create model
    print("=> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch]()

    # count number of parameters
    count = 0
    params = list()
    for n, p in model.named_parameters():
        if '.ups.' not in n:
            params.append(p)
            count += np.prod(p.size())
    print('Parameters:', count)

    # count flops
    model = add_flops_counting_methods(model)
    model.eval()
    image = torch.randn(1, 3, 224, 224)
    model.start_flops_count()
    model(image).sum()
    model.stop_flops_count()
    print("GFLOPs", model.compute_average_flops_cost() / 1000000000.0)

    # normal code
    if not args.distributed:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    else:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    # cuda warm up
    model = model.cuda()
    image = torch.randn(args.batch_size, 3, 224, 224)
    image_cuda = image.cuda()
    for i in range(3):
        start = time.time()
        model(image_cuda).sum().backward()  # warm up the CUDA memory allocator
        print(time.time() - start)

    # profiling snippets kept for reference:
    # with torch.autograd.profiler.profile(use_cuda=True) as prof:
    #     start = time.time()
    #     model(image_cuda).sum().backward()
    #     print(time.time() - start)
    # prof.export_chrome_trace('trace_gpu')
    #
    # import cProfile, pstats, io
    # pr = cProfile.Profile(time.perf_counter)
    # pr.enable()
    # model(image_cuda).sum().backward()
    # pr.disable()
    # s = io.StringIO()
    # sortby = 'cumulative'
    # ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
    # ps.print_stats()
    # print(s.getvalue())

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD([{'params': iter(params), 'lr': args.lr}],
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            if 'state_dict' in checkpoint:
                model.load_state_dict(checkpoint['state_dict'], strict=False)
            else:
                print('no state_dict found')
            if 'optimizer' in checkpoint:
                optimizer.load_state_dict(checkpoint['optimizer'])
            else:
                print('no optimizer found')
            args.start_epoch = checkpoint['epoch'] if 'epoch' in checkpoint else args.start_epoch
            # note: the saved key is 'best_err1' (the original looked up the missing key 'best_err')
            best_err1 = checkpoint['best_err1'] if 'best_err1' in checkpoint else best_err1
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch'] if 'epoch' in checkpoint else 'unknown'))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, iteration_size)

        # evaluate on validation set
        err1 = validate(val_loader, model, criterion)

        # remember best err@1 and save checkpoint
        is_best = err1 < best_err1
        best_err1 = min(err1, best_err1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_err1': best_err1,
            'optimizer': optimizer.state_dict(),
        }, is_best, filename=args.arch + '_checkpoint.pth.tar')
    print(str(float(best_err1)))
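

# adjust_learning_rate and save_checkpoint are defined elsewhere in the repo and are not
# shown here. The two helpers below are only a minimal sketch, assuming the conventions of
# the standard PyTorch ImageNet example (step decay by 10x every 30 epochs, plus a copy of
# the best checkpoint); the schedule and filenames actually used by the authors may differ.
# `args` refers to the module-level parsed arguments used throughout this file.
import shutil


def adjust_learning_rate_sketch(optimizer, epoch):
    # assumed schedule: decay the base lr by a factor of 10 every 30 epochs
    lr = args.lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def save_checkpoint_sketch(state, is_best, filename='checkpoint.pth.tar'):
    # always save the latest state; additionally copy it when it is the best so far
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')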
def main():
    global best_err1, args
    iteration_size = args.num_gpus // args.world_size
    args.weight_decay = args.weight_decay * iteration_size  # will cancel out with lr
    args.lr = args.lr / iteration_size
    print('real: wd', args.weight_decay, 'lr', args.lr, 'batch_size',
          args.batch_size, 'iteration_size', iteration_size)

    args.distributed = args.world_size > 1
    args.gpu = 0
    if args.distributed:
        args.gpu = args.rank % torch.cuda.device_count()
    if args.distributed:
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    # create model
    print("=> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch]()

    # count number of parameters
    count = 0
    params = list()
    for n, p in model.named_parameters():
        if '.ups.' not in n:
            params.append(p)
            count += np.prod(p.size())
    print('Parameters:', count)

    # count flops
    model = add_flops_counting_methods(model)
    model.eval()
    image = torch.randn(1, 3, 224, 224)
    model.start_flops_count()
    model(image).sum()
    model.stop_flops_count()
    print("GFLOPs", model.compute_average_flops_cost() / 1000000000.0)

    model = model.cuda()
    if args.fp16:
        model = network_to_half(model)
    if args.distributed:
        # shared_param turns off bucketing in DDP; for lower-latency runs this can improve perf
        model = DDP(model, shared_param=True)

    global model_params, master_params
    if args.fp16:
        model_params, master_params = prep_param_lists(model)
    else:
        master_params = list(model.parameters())

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD([{'params': iter(params), 'lr': args.lr}],
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            if 'state_dict' in checkpoint:
                model.load_state_dict(checkpoint['state_dict'], strict=False)
            else:
                print('no state_dict found')
            if 'optimizer' in checkpoint:
                optimizer.load_state_dict(checkpoint['optimizer'])
            else:
                print('no optimizer found')
            args.start_epoch = checkpoint['epoch'] if 'epoch' in checkpoint else args.start_epoch
            # note: the saved key is 'best_err1' (the original looked up the missing key 'best_err')
            best_err1 = checkpoint['best_err1'] if 'best_err1' in checkpoint else best_err1
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch'] if 'epoch' in checkpoint else 'unknown'))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    crop_size = 224
    val_size = 256

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(crop_size),
            transforms.RandomHorizontalFlip(),
            # transforms.ToTensor(),  # too slow; tensor conversion and normalization
            # normalize,              # are handled after fast_collate instead
        ]))
    val_dataset = datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(val_size),
            transforms.CenterCrop(crop_size),
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)  # (not used below)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               collate_fn=fast_collate,
                                               drop_last=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True,
                                             collate_fn=fast_collate)
    # print(len(train_loader), len(val_loader))

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        print('allocated before', torch.cuda.memory_allocated())
        print('cached before', torch.cuda.memory_cached())
        gc.collect()
        torch.cuda.empty_cache()
        print('allocated after', torch.cuda.memory_allocated())
        print('cached after', torch.cuda.memory_cached())

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, iteration_size)

        # # sync models on multiple GPUs
        # if args.rank == 0:
        #     save_checkpoint({
        #         'epoch': epoch + 1,
        #         'arch': args.arch,
        #         'state_dict': model.state_dict(),
        #         'optimizer': optimizer.state_dict(),
        #     }, False, 'temp.pth.tar')
        # # barrier
        # loss = torch.FloatTensor([args.rank]).cuda()
        # reduced_loss = reduce_tensor(loss.data)
        # print(loss.data, reduced_loss)
        # if os.path.isfile('temp.pth.tar'):
        #     print("=> loading checkpoint '{}'".format('temp.pth.tar'))
        #     checkpoint = torch.load('temp.pth.tar',
        #                             map_location=lambda storage, loc: storage.cuda(args.gpu))
        #     model.load_state_dict(checkpoint['state_dict'], strict=False)
        #     optimizer.load_state_dict(checkpoint['optimizer'])
        #     print("=> loaded checkpoint '{}' (epoch {})"
        #           .format('temp.pth.tar', checkpoint['epoch']))
        #     assert checkpoint['epoch'] == epoch + 1

        # evaluate on validation set
        err1 = validate(val_loader, model, criterion)

        # remember best err@1 and save checkpoint
        if args.rank == 0:
            is_best = err1 < best_err1
            best_err1 = min(err1, best_err1)
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_err1': best_err1,
                'optimizer': optimizer.state_dict(),
            }, is_best)
    print(str(float(best_err1)))
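

# fast_collate is referenced above but defined elsewhere. The function below is a minimal
# sketch following the NVIDIA apex ImageNet example that this script appears to mirror:
# it stacks the raw PIL images into a uint8 NCHW tensor so that float conversion and
# normalization (the ToTensor/normalize steps omitted from the transforms above) can be
# done later on the GPU. It uses the numpy/torch imports already at the top of this file.
def fast_collate_sketch(batch):
    imgs = [sample[0] for sample in batch]
    targets = torch.tensor([sample[1] for sample in batch], dtype=torch.int64)
    w, h = imgs[0].size[0], imgs[0].size[1]
    tensor = torch.zeros((len(imgs), 3, h, w), dtype=torch.uint8)
    for i, img in enumerate(imgs):
        arr = np.asarray(img, dtype=np.uint8)
        if arr.ndim < 3:
            arr = np.expand_dims(arr, axis=-1)  # grayscale: add a channel dimension
        arr = np.rollaxis(arr, 2)               # HWC to CHW
        tensor[i] += torch.from_numpy(arr)
    return tensor, targets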
def main():
    global args
    args = parser.parse_args()
    print('config: wd', args.weight_decay, 'lr', args.lr, 'batch_size',
          args.batch_size, 'num_gpus', args.num_gpus)
    iteration_size = args.num_gpus // torch.cuda.device_count()  # do multiple iterations
    assert iteration_size >= 1
    args.weight_decay = args.weight_decay * iteration_size  # will cancel out with lr
    args.lr = args.lr / iteration_size
    args.batch_size = args.batch_size // iteration_size
    print('real: wd', args.weight_decay, 'lr', args.lr, 'batch_size',
          args.batch_size, 'iteration_size', iteration_size)

    args.distributed = args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # create model
    print("=> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch](num_classes=80)

    # count number of parameters
    count = 0
    params = list()
    for n, p in model.named_parameters():
        if '.ups.' not in n:
            params.append(p)
            count += np.prod(p.size())
    print('Parameters:', count)

    # count flops
    model = add_flops_counting_methods(model)
    model.eval()
    image = torch.randn(1, 3, 224, 224)
    model.start_flops_count()
    model(image).sum()
    model.stop_flops_count()
    print("GFLOPs", model.compute_average_flops_cost() / 1000000000.0)

    # normal code
    model = torch.nn.DataParallel(model).cuda()
    criterion = nn.BCEWithLogitsLoss().cuda()
    optimizer = torch.optim.SGD([{'params': iter(params), 'lr': args.lr}],
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            resume = ('module.fc.bias' in checkpoint['state_dict'] and
                      checkpoint['state_dict']['module.fc.bias'].size() == model.module.fc.bias.size()) or \
                     ('module.classifier.bias' in checkpoint['state_dict'] and
                      checkpoint['state_dict']['module.classifier.bias'].size() == model.module.classifier.bias.size())
            if resume:
                # True resume: resume training on COCO
                model.load_state_dict(checkpoint['state_dict'], strict=False)
                if 'optimizer' in checkpoint:
                    optimizer.load_state_dict(checkpoint['optimizer'])
                else:
                    print('no optimizer found')
                args.start_epoch = checkpoint['epoch'] if 'epoch' in checkpoint else args.start_epoch
            else:
                # Fake resume: transfer from ImageNet
                for n, p in list(checkpoint['state_dict'].items()):
                    if 'classifier' in n or 'fc' in n:
                        print(n, 'deleted from state_dict')
                        del checkpoint['state_dict'][n]
                model.load_state_dict(checkpoint['state_dict'], strict=False)
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch'] if 'epoch' in checkpoint else 'unknown'))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = CocoDetection(
        os.path.join(args.data, 'train2014'),
        os.path.join(args.data, 'annotations/instances_train2014.json'),
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    val_dataset = CocoDetection(
        os.path.join(args.data, 'val2014'),
        os.path.join(args.data, 'annotations/instances_val2014.json'),
        transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            normalize,
        ]))

    train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        validate_multi(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        coco_adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train_multi(train_loader, model, criterion, optimizer, epoch, iteration_size)

        # evaluate on validation set
        validate_multi(val_loader, model, criterion)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            },
            False,
            filename='coco_' + args.arch + '_checkpoint.pth.tar')
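

# CocoDetection here is a custom wrapper rather than torchvision's class: BCEWithLogitsLoss
# above implies an 80-dimensional multi-hot target per image instead of a list of annotation
# dicts. The class below is a minimal sketch of such a wrapper, assuming pycocotools is
# available; the category-id remapping and return format of the real implementation may differ.
from pycocotools.coco import COCO
from PIL import Image


class CocoMultiLabelSketch(torch.utils.data.Dataset):
    def __init__(self, root, ann_file, transform=None):
        self.root = root
        self.coco = COCO(ann_file)
        self.ids = list(self.coco.imgs.keys())
        self.transform = transform
        # COCO uses 90 sparse category ids; map them onto contiguous indices 0..79
        self.cat2label = {cat: i for i, cat in enumerate(sorted(self.coco.getCatIds()))}

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, index):
        img_id = self.ids[index]
        anns = self.coco.loadAnns(self.coco.getAnnIds(imgIds=img_id))
        target = torch.zeros(80)
        for ann in anns:
            target[self.cat2label[ann['category_id']]] = 1.0  # multi-hot label vector
        path = self.coco.loadImgs(img_id)[0]['file_name']
        img = Image.open(os.path.join(self.root, path)).convert('RGB')
        if self.transform is not None:
            img = self.transform(img)
        return img, target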
def main():
    global args
    args = parser.parse_args()
    print('config: wd', args.weight_decay, 'lr', args.lr, 'batch_size',
          args.batch_size, 'num_gpus', args.num_gpus)

    # torch.cuda.device_count() returns the number of visible GPUs.
    iteration_size = args.num_gpus // torch.cuda.device_count()  # do multiple iterations
    assert iteration_size >= 1
    args.weight_decay = args.weight_decay * iteration_size  # will cancel out with lr
    args.lr = args.lr / iteration_size
    args.batch_size = args.batch_size // iteration_size  # unchanged when only one GPU is used
    print('real: wd', args.weight_decay, 'lr', args.lr, 'batch_size',
          args.batch_size, 'iteration_size', iteration_size)

    # distributed setup
    args.distributed = args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # create model
    print("=> creating model '{}'".format(args.arch))
    # from resnext import resnext50
    # model = resnext50(num_classes=80)
    from resnext import resnext50_elastic
    model = resnext50_elastic(num_classes=80)
    # from resnext_MulTask_12 import resnext50_elastic
    # model = resnext50_elastic(num_classes=80)  # pretrained model
    # opts["num_labels"] = 14

    # count number of parameters
    count = 0
    params = list()
    for n, p in model.named_parameters():
        if '.ups.' not in n:
            params.append(p)
            count += np.prod(p.size())
    print('Parameters:', count / 1000000.0)  # number of parameters (millions)

    # count flops
    model = add_flops_counting_methods(model)
    model.eval()
    image = torch.randn(1, 3, 224, 224)
    model.start_flops_count()
    model(image).sum()
    model.stop_flops_count()
    print("GFLOPs", model.compute_average_flops_cost() / 1000000000.0)  # FLOP count

    # normal code
    model = torch.nn.DataParallel(model).cuda()

    # BCE loss for multi-label targets
    criterion = nn.BCEWithLogitsLoss().cuda()

    # SGD optimizer
    optimizer = torch.optim.SGD([{'params': iter(params), 'lr': args.lr}],
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)  # load the checkpoint file
            resume = ('module.fc.bias' in checkpoint['state_dict'] and
                      checkpoint['state_dict']['module.fc.bias'].size() == model.module.fc.bias.size()) or \
                     ('module.classifier.bias' in checkpoint['state_dict'] and
                      checkpoint['state_dict']['module.classifier.bias'].size() == model.module.classifier.bias.size())
            if resume:
                # True resume: resume training on MS-COCO
                print()
                print("resume training on MS-COCO...")
                print()
                model.load_state_dict(checkpoint['state_dict'], strict=False)
                if 'optimizer' in checkpoint:
                    optimizer.load_state_dict(checkpoint['optimizer'])
                else:
                    print('no optimizer found')
                args.start_epoch = checkpoint['epoch'] if 'epoch' in checkpoint else args.start_epoch
            else:
                # Fake resume: transfer from ImageNet to MS-COCO
                print()
                print("transfer from ImageNet to MS-COCO...")
                print()
                for n, p in list(checkpoint['state_dict'].items()):
                    if 'classifier' in n or 'fc' in n:
                        print(n, 'deleted from state_dict')
                        del checkpoint['state_dict'][n]
                model.load_state_dict(checkpoint['state_dict'], strict=False)
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch'] if 'epoch' in checkpoint else 'unknown'))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Slight training speed-up at no extra cost; only suitable for fixed input sizes
    # (it hurts performance with multi-scale training).
    cudnn.benchmark = True

    # Data loading code
    # MS-COCO normalization statistics
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    # training set
    train_dataset = CocoDetection(
        os.path.join(args.data, 'train2014'),
        os.path.join(args.data, 'annotations/instances_train2014.json'),
        transforms.Compose([
            transforms.RandomResizedCrop(224),  # random resized crop
            transforms.RandomHorizontalFlip(),  # random horizontal flip
            transforms.ToTensor(),
            normalize,                          # normalization
        ]))
    # print("train_dataset: ", train_dataset)
    # validation set
    val_dataset = CocoDetection(
        os.path.join(args.data, 'val2014'),
        os.path.join(args.data, 'annotations/instances_val2014.json'),
        transforms.Compose([
            transforms.Resize((224, 224)),      # resize
            transforms.ToTensor(),
            normalize,                          # normalization
        ]))

    train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)  # random sampler

    # torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False, sampler=None, batch_sampler=None,
    #                             num_workers=0, collate_fn=default_collate, pin_memory=False,
    #                             drop_last=False, timeout=0, worker_init_fn=None)
    # 1. dataset (Dataset): a data-reading interface yielding (image, label) pairs, e.g.
    #    torchvision.datasets.ImageFolder or a custom torch.utils.data.Dataset subclass.
    # 2. batch_size: number of samples per training batch (default: 1).
    # 3. shuffle: shuffle the data; usually enabled for training (default: False).
    # 4. sampler (Sampler, optional): strategy for drawing samples from the dataset;
    #    if specified, shuffle must be False.
    # 5. num_workers: must be >= 0; values > 0 load data in separate worker processes,
    #    which speeds up data loading (default: 0).
    # 6. pin_memory (bool, optional): copy tensors into pinned (page-locked) host memory
    #    before returning them, which speeds up transfer to the GPU.
    # 7. drop_last (bool, optional): if True, drop the last incomplete batch when the dataset
    #    size is not divisible by the batch size; if False, the last batch is smaller (default: False).
    # 8. timeout (numeric, optional): timeout for collecting a batch from the workers;
    #    an error is raised if it is exceeded (default: 0).

    # training loader
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True)
    # validation loader
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        validate_multi(val_loader, model, criterion)  # evaluate on the validation set, then return
        return

    for epoch in range(args.start_epoch, args.epochs):
        # adjust the learning rate
        coco_adjust_learning_rate(optimizer, epoch)
        # train for one epoch
        train_multi(train_loader, model, criterion, optimizer, epoch, iteration_size)
        print("***********************************************")
        print("Finished training epoch " + str(epoch + 1) + ", running validation...")
        print("***********************************************")
        # evaluate on the validation set
        validate_multi(val_loader, model, criterion)
        # save a checkpoint
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            },
            False,
            filename='3-MSCOCO--model-train-demo/' + 'coco_' + args.arch +
                     '_checkpoint_' + str(epoch + 1) + '.pth.tar')
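

# coco_adjust_learning_rate is defined elsewhere in the repo. The helper below is only a
# minimal sketch, assuming a simple step schedule with fixed milestones; the actual decay
# epochs and factor used by the authors may differ. `args` is the module-level parsed
# arguments, and `milestones`/`gamma` are illustrative defaults, not values from the source.
def coco_adjust_learning_rate_sketch(optimizer, epoch, milestones=(24, 30), gamma=0.1):
    # assumed schedule: multiply the base lr by `gamma` once for each milestone passed
    lr = args.lr
    for m in milestones:
        if epoch >= m:
            lr *= gamma
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr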