def main(args):
    """Train and evaluate a detection model on COCO, one process per device.

    Builds the COCO train/val datasets and loaders, creates the model
    (wrapped in DistributedDataParallel when ``args.distributed`` is set) and
    runs the train -> LR step -> evaluate cycle for every epoch in
    ``[args.start_epoch, args.epochs)``.  The main process (``args.rank`` of
    -1 or 0) appends one metrics line per epoch to a timestamped
    ``results*.txt`` file and plots the loss / LR / mAP curves once training
    has finished.  When ``args.output_dir`` is set, a checkpoint is handed to
    ``save_on_master`` after every epoch.

    Args:
        args: Namespace of run options.  Fields read here: ``device``,
            ``data_path``, ``aspect_ratio_group_factor``, ``batch_size``,
            ``workers``, ``num_classes``, ``lr``, ``momentum``,
            ``weight_decay``, ``lr_steps``, ``lr_gamma``, ``resume``,
            ``start_epoch``, ``epochs``, ``print_freq``, ``output_dir``,
            ``distributed``, ``rank`` and ``gpu``.
            NOTE(review): ``distributed``/``rank``/``gpu`` are presumably
            filled in by ``init_distributed_mode`` - confirm.
    """
    init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # File that collects the per-epoch COCO metrics, loss and learning rate.
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    results_file = f"results{timestamp}.txt"

    # Data loading code
    print("Loading data")
    train_transform = transforms.Compose(
        [transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)])
    val_transform = transforms.Compose([transforms.ToTensor()])

    coco_root = args.data_path

    # Training split: coco2017 -> annotations -> instances_train2017.json
    train_dataset = CocoDetection(coco_root, "train", train_transform)
    # Validation split: coco2017 -> annotations -> instances_val2017.json
    val_dataset = CocoDetection(coco_root, "val", val_transform)

    print("Creating data loaders")
    if not args.distributed:
        train_sampler = torch.utils.data.RandomSampler(train_dataset)
        test_sampler = torch.utils.data.SequentialSampler(val_dataset)
    else:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(
            val_dataset)

    if args.aspect_ratio_group_factor < 0:
        train_batch_sampler = torch.utils.data.BatchSampler(
            train_sampler, args.batch_size, drop_last=True)
    else:
        # Bucket every image by aspect ratio so batches hold similar shapes.
        group_ids = create_aspect_ratio_groups(
            train_dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(
            train_sampler, group_ids, args.batch_size)

    data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_sampler=train_batch_sampler,
        num_workers=args.workers,
        collate_fn=train_dataset.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        sampler=test_sampler,
        num_workers=args.workers,
        collate_fn=train_dataset.collate_fn)

    print("Creating model")
    # The model gets one extra class for the background.
    model = create_model(num_classes=args.num_classes + 1, device=device)
    model.to(device)

    # Keep a handle on the bare module so state dicts carry no DDP prefix.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    trainable_params = [
        param for param in model.parameters() if param.requires_grad
    ]
    optimizer = torch.optim.SGD(trainable_params,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    # Continue from a previous checkpoint when a path is passed via --resume.
    if args.resume:
        # If map_location is missing, torch.load will first load the module
        # to CPU and then copy each parameter to where it was saved, which
        # would result in all processes on the same machine using the same
        # set of devices.
        ckpt = torch.load(args.resume, map_location='cpu')
        # Restore weights together with optimizer and LR-schedule state.
        model_without_ddp.load_state_dict(ckpt['model'])
        optimizer.load_state_dict(ckpt['optimizer'])
        lr_scheduler.load_state_dict(ckpt['lr_scheduler'])
        args.start_epoch = ckpt['epoch'] + 1

    train_loss = []
    learning_rate = []
    val_map = []

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        mean_loss, lr = utils.train_one_epoch(model, optimizer, data_loader,
                                              device, epoch, args.print_freq,
                                              warmup=True)

        # Advance the learning-rate schedule once per epoch.
        lr_scheduler.step()

        # Evaluate after every epoch.
        coco_info = utils.evaluate(model, data_loader_test, device=device)

        # Only the main process records history and writes to disk.
        if args.rank in (-1, 0):
            loss_value = mean_loss.item()
            train_loss.append(loss_value)
            learning_rate.append(lr)
            val_map.append(coco_info[1])  # pascal mAP

            # One line per epoch: COCO metrics, then loss and learning rate.
            with open(results_file, "a") as f:
                rounded = (str(round(value, 4))
                           for value in coco_info + [loss_value, lr])
                joined = ' '.join(rounded)
                f.write(f"epoch:{epoch} {joined}" + "\n")

        if args.output_dir:
            # save_on_master only writes from the master process.
            state = {
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args,
                'epoch': epoch,
            }
            target = os.path.join(args.output_dir, f'model_{epoch}.pth')
            save_on_master(state, target)

    elapsed_seconds = int(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed_seconds))
    print(f'Training time {elapsed}')

    if args.rank not in (-1, 0):
        return

    # Plot the loss and learning-rate curves.
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # Plot the mAP curve.
    if val_map:
        from plot_curve import plot_map
        plot_map(val_map)
def main(args):
    """Train and evaluate an SSD-style detector on Pascal VOC 2012.

    Sets up the VOC train/val datasets with their augmentation pipelines,
    builds the loaders and the model (wrapped in DistributedDataParallel when
    ``args.distributed`` is set) and then either runs a single evaluation
    (``args.test_only``) or the full train -> LR step -> evaluate loop.  The
    main process (``args.rank`` of -1 or 0) logs one metrics line per epoch
    to a timestamped ``results*.txt`` file and plots the curves at the end.
    A checkpoint is handed to ``save_on_master`` after each epoch when
    ``args.output_dir`` is set.

    Args:
        args: Namespace of run options.  Fields read here: ``device``,
            ``data_path``, ``aspect_ratio_group_factor``, ``batch_size``,
            ``workers``, ``num_classes``, ``lr``, ``momentum``,
            ``weight_decay``, ``lr_step_size``, ``lr_gamma``, ``resume``,
            ``test_only``, ``start_epoch``, ``epochs``, ``print_freq``,
            ``output_dir``, ``distributed``, ``rank`` and ``gpu``.
            NOTE(review): ``distributed``/``rank``/``gpu`` are presumably
            filled in by ``init_distributed_mode`` - confirm.

    Raises:
        FileNotFoundError: If ``<args.data_path>/VOCdevkit`` does not exist.
    """
    print(args)
    init_distributed_mode(args)

    device = torch.device(args.device)

    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    results_file = f"results{timestamp}.txt"

    # Data loading code
    print("Loading data")
    train_transform = transform.Compose([
        transform.SSDCropping(),
        transform.Resize(),
        transform.ColorJitter(),
        transform.ToTensor(),
        transform.RandomHorizontalFlip(),
        transform.Normalization(),
        transform.AssignGTtoDefaultBox(),
    ])
    val_transform = transform.Compose([
        transform.Resize(),
        transform.ToTensor(),
        transform.Normalization(),
    ])

    voc_root = args.data_path
    # The dataset root must contain the VOCdevkit folder.
    if not os.path.exists(os.path.join(voc_root, "VOCdevkit")):
        raise FileNotFoundError(
            "VOCdevkit dose not in path:'{}'.".format(voc_root))

    # Training and validation splits.
    train_dataset = VOC2012DataSet(voc_root, train_transform,
                                   train_set='train.txt')
    val_dataset = VOC2012DataSet(voc_root, val_transform,
                                 train_set='val.txt')

    print("Creating data loaders")
    if not args.distributed:
        train_sampler = torch.utils.data.RandomSampler(train_dataset)
        test_sampler = torch.utils.data.SequentialSampler(val_dataset)
    else:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(
            val_dataset)

    if args.aspect_ratio_group_factor < 0:
        train_batch_sampler = torch.utils.data.BatchSampler(
            train_sampler, args.batch_size, drop_last=True)
    else:
        # Bucket every image by aspect ratio so batches hold similar shapes.
        group_ids = create_aspect_ratio_groups(
            train_dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(
            train_sampler, group_ids, args.batch_size)

    data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_sampler=train_batch_sampler,
        num_workers=args.workers,
        collate_fn=train_dataset.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        sampler=test_sampler,
        num_workers=args.workers,
        collate_fn=train_dataset.collate_fn)

    print("Creating model")
    model = create_model(num_classes=args.num_classes + 1, device=device)

    # Keep a handle on the bare module so state dicts carry no DDP prefix.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    trainable_params = [
        param for param in model.parameters() if param.requires_grad
    ]
    optimizer = torch.optim.SGD(trainable_params,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    lr_scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)

    # Continue from a previous checkpoint when a path is passed via --resume.
    if args.resume:
        # If map_location is missing, torch.load will first load the module
        # to CPU and then copy each parameter to where it was saved, which
        # would result in all processes on the same machine using the same
        # set of devices.
        ckpt = torch.load(args.resume, map_location='cpu')
        # Restore weights together with optimizer and LR-schedule state.
        model_without_ddp.load_state_dict(ckpt['model'])
        optimizer.load_state_dict(ckpt['optimizer'])
        lr_scheduler.load_state_dict(ckpt['lr_scheduler'])
        args.start_epoch = ckpt['epoch'] + 1

    if args.test_only:
        utils.evaluate(model, data_loader_test, device=device)
        return

    train_loss = []
    learning_rate = []
    val_map = []

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        mean_loss, lr = utils.train_one_epoch(model, optimizer, data_loader,
                                              device, epoch, args.print_freq)

        # Only the first process keeps the training history.
        if args.rank in (-1, 0):
            train_loss.append(mean_loss.item())
            learning_rate.append(lr)

        # Advance the learning-rate schedule once per epoch.
        lr_scheduler.step()

        # Evaluate after every epoch.
        coco_info = utils.evaluate(model, data_loader_test, device=device)

        if args.rank in (-1, 0):
            # One line per epoch: COCO metrics, then loss and learning rate.
            with open(results_file, "a") as f:
                rounded = (str(round(value, 4))
                           for value in coco_info + [mean_loss.item(), lr])
                joined = ' '.join(rounded)
                f.write(f"epoch:{epoch} {joined}" + "\n")

            val_map.append(coco_info[1])  # pascal mAP

        if args.output_dir:
            # save_on_master only writes from the master process.
            state = {
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args,
                'epoch': epoch,
            }
            target = os.path.join(args.output_dir, f'model_{epoch}.pth')
            save_on_master(state, target)

    elapsed_seconds = int(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed_seconds))
    print(f'Training time {elapsed}')

    if args.rank not in (-1, 0):
        return

    # Plot the loss and learning-rate curves.
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # Plot the mAP curve.
    if val_map:
        from plot_curve import plot_map
        plot_map(val_map)
def main(opt, hyp):
    """Train a Darknet (YOLOv3-SPP) model with DistributedDataParallel.

    The function initialises the process group, scales a few hyper-parameters
    to the effective batch size / class count / image size, optionally loads
    pretrained ``.pt`` weights, freezes part of the network, wraps the model
    in DDP, and runs the train -> scheduler step -> evaluate loop.  The main
    process (``opt.rank`` of -1 or 0) logs to TensorBoard and to a timestamped
    ``results*.txt`` file and writes checkpoints under ``weights/``.

    Args:
        opt: Namespace of run options.  Fields read here: ``rank``, ``name``,
            ``device``, ``world_size``, ``batch_size``, ``cfg``, ``data``,
            ``epochs``, ``weights``, ``img_size``, ``multi_scale``,
            ``single_cls``, ``freeze_layers``, ``gpu``, ``rect``,
            ``cache_images``, ``notest`` and ``savebest``.
        hyp: Dict of hyper-parameters.  Keys read here: ``lr0``,
            ``momentum``, ``weight_decay``, ``lrf``, ``cls`` and ``obj``.
            ``lr0``, ``cls`` and ``obj`` are rescaled IN PLACE.

    Raises:
        EnvironmentError: If ``opt.device`` is not a CUDA device.
        AssertionError: If ``opt.img_size`` is not a multiple of 32.
        KeyError: If the checkpoint in ``opt.weights`` does not match the
            model built from ``opt.cfg``.
    """
    # Initialise the distributed state of this process.
    init_distributed_mode(opt)

    is_main_process = opt.rank in [-1, 0]

    # Only the main process owns a TensorBoard writer.
    tb_writer = None
    if is_main_process:
        print(opt)
        print(
            'Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/'
        )
        tb_writer = SummaryWriter(comment=opt.name)

    device = torch.device(opt.device)
    if "cuda" not in device.type:
        raise EnvironmentError("not find GPU device for training.")

    # DDP averages the gradients across devices, so the learning rate is
    # scaled up with the effective batch size.
    hyp["lr0"] *= max(1., opt.world_size * opt.batch_size / 64)

    wdir = "weights" + os.sep  # weights dir
    best = wdir + "best.pt"
    if is_main_process:
        # Create the checkpoint directory up front so the first torch.save
        # cannot fail after a whole epoch of training.
        os.makedirs(wdir, exist_ok=True)

    results_file = "results{}.txt".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    cfg = opt.cfg
    data = opt.data
    epochs = opt.epochs
    batch_size = opt.batch_size
    # accumulate n times before optimizer update (bs 64)
    accumulate = max(round(64 / (opt.world_size * opt.batch_size)), 1)
    weights = opt.weights  # initial training weights
    imgsz_train = opt.img_size
    imgsz_test = opt.img_size  # test image sizes
    multi_scale = opt.multi_scale

    # Image sizes must be a multiple of the 32-pixel grid size.
    gs = 32  # (pixels) grid size
    assert math.fmod(
        imgsz_test,
        gs) == 0, "--img-size %g must be a %g-multiple" % (imgsz_test, gs)
    grid_min, grid_max = imgsz_test // gs, imgsz_test // gs
    if multi_scale:
        imgsz_min = opt.img_size // 1.5
        imgsz_max = opt.img_size // 0.667

        # Round the min / max input sizes down to a multiple of 32.
        grid_min, grid_max = imgsz_min // gs, imgsz_max // gs
        imgsz_min, imgsz_max = int(grid_min * gs), int(grid_max * gs)
        imgsz_train = imgsz_max  # initialize with max size
        if is_main_process:  # only the first process prints
            print("Using multi_scale training, image range[{}, {}]".format(
                imgsz_min, imgsz_max))

    # configure run
    random.seed(0)  # fix the random seed
    data_dict = parse_data_cfg(data)
    train_path = data_dict["train"]
    test_path = data_dict["valid"]
    nc = 1 if opt.single_cls else int(
        data_dict["classes"])  # number of classes
    hyp["cls"] *= nc / 80  # update coco-tuned hyp['cls'] to current dataset
    hyp["obj"] *= imgsz_test / 320

    if is_main_process:
        # Remove previous results
        for stale in glob.glob(results_file) + glob.glob("tmp.pk"):
            os.remove(stale)

    # Initialize model
    model = Darknet(cfg).to(device)

    start_epoch = 0
    best_map = 0.0
    # Load the pretrained weights when a .pt file is given.
    if weights.endswith(".pt"):
        ckpt = torch.load(weights, map_location=device)

        # load model
        try:
            # Build the model's state dict once instead of once per key.
            model_state = model.state_dict()
            ckpt["model"] = {
                k: v
                for k, v in ckpt["model"].items()
                if model_state[k].numel() == v.numel()
            }
            model.load_state_dict(ckpt["model"], strict=False)
        except KeyError as e:
            s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " \
                "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights)
            raise KeyError(s) from e

        if is_main_process:
            # load results
            if ckpt.get("training_results") is not None:
                with open(results_file, "w") as file:
                    file.write(ckpt["training_results"])  # write results.txt

        # epochs
        start_epoch = ckpt["epoch"] + 1
        if epochs < start_epoch:
            print(
                '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.'
                % (opt.weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt

    # Optionally freeze everything except the predictors.
    if opt.freeze_layers:
        # The module right before each YOLOLayer is its predictor; the
        # YOLOLayer itself is not the predictor.
        output_layer_indices = [
            idx - 1 for idx, module in enumerate(model.module_list)
            if isinstance(module, YOLOLayer)
        ]
        # Freeze every layer except the predictors and the YOLOLayers.
        freeze_layer_indeces = [
            x for x in range(len(model.module_list))
            if (x not in output_layer_indices) and (
                x - 1 not in output_layer_indices)
        ]
        # Freeze non-output layers
        # (leaves 3x2=6 parameters to train)
        for idx in freeze_layer_indeces:
            for parameter in model.module_list[idx].parameters():
                parameter.requires_grad_(False)
    else:
        # With freeze_layers off, only the part after darknet53 is trained
        # by default.  Delete the lines below to train all the weights.
        darknet_end_layer = 74  # only yolov3spp cfg
        # Freeze darknet53 layers
        # (leaves 21x3+3x2=69 parameters to train)
        for idx in range(darknet_end_layer + 1):  # [0, 74]
            for parameter in model.module_list[idx].parameters():
                parameter.requires_grad_(False)

    # SyncBatchNorm
    # It has no effect when only the final predictors (which contain no BN
    # layers) are trained.
    if opt.freeze_layers is False:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(
            device)
    model = torch.nn.parallel.DistributedDataParallel(model,
                                                      device_ids=[opt.gpu])

    model.yolo_layers = model.module.yolo_layers  # move yolo layer indices to top level

    # optimizer
    pg = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(pg,
                          lr=hyp["lr0"],
                          momentum=hyp["momentum"],
                          weight_decay=hyp["weight_decay"],
                          nesterov=True)

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    def lf(x):
        """Cosine decay factor going from 1 down to hyp['lrf']."""
        cosine = (1 + math.cos(x * math.pi / epochs)) / 2
        return cosine * (1 - hyp["lrf"]) + hyp["lrf"]

    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    scheduler.last_epoch = start_epoch  # epoch to start from

    # dataset
    # The training set uses the largest size of the multi-scale range.
    # Make sure only the first process in DDP process the dataset first, and
    # the following others can use the cache.
    with torch_distributed_zero_first(opt.rank):
        train_dataset = LoadImagesAndLabels(
            train_path,
            imgsz_train,
            batch_size,
            augment=True,
            hyp=hyp,  # augmentation hyperparameters
            rect=opt.rect,  # rectangular training
            cache_images=opt.cache_images,
            single_cls=opt.single_cls,
            rank=opt.rank)
        # The validation set uses the plain img_size.
        val_dataset = LoadImagesAndLabels(test_path,
                                          imgsz_test,
                                          batch_size,
                                          hyp=hyp,
                                          cache_images=opt.cache_images,
                                          single_cls=opt.single_cls,
                                          rank=opt.rank)

    # Give every rank its own share of the sample indices.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset)
    val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
    # Group the sample indices into lists of batch_size elements.
    train_batch_sampler = torch.utils.data.BatchSampler(train_sampler,
                                                        batch_size,
                                                        drop_last=True)

    # dataloader
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0,
              8])  # number of workers
    if is_main_process:
        print('Using %g dataloader workers' % nw)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_sampler=train_batch_sampler,
        num_workers=nw,
        pin_memory=True,
        collate_fn=train_dataset.collate_fn)

    val_data_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=batch_size,
        sampler=val_sampler,
        num_workers=nw,
        pin_memory=True,
        collate_fn=val_dataset.collate_fn)

    # Model parameters
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)

    # start training
    # caching val_data when you have plenty of memory(RAM)
    with torch_distributed_zero_first(opt.rank):
        if os.path.exists("tmp.pk") is False:
            coco = get_coco_api_from_dataset(val_dataset)
            with open("tmp.pk", "wb") as f:
                pickle.dump(coco, f)
        else:
            # NOTE(review): pickle.load executes arbitrary code from the
            # file; "tmp.pk" is expected to be the cache written just above
            # by this run - never point this at an untrusted file.
            with open("tmp.pk", "rb") as f:
                coco = pickle.load(f)

    if is_main_process:
        print("starting traning for %g epochs..." % epochs)

    start_time = time.time()
    for epoch in range(start_epoch, epochs):
        train_sampler.set_epoch(epoch)
        mloss, lr = train_util.train_one_epoch(
            model,
            optimizer,
            train_data_loader,
            device,
            epoch,
            accumulate=accumulate,  # batches needed to accumulate 64 images
            img_size=imgsz_train,  # input image size
            multi_scale=multi_scale,
            grid_min=grid_min,  # smallest grid size
            grid_max=grid_max,  # largest grid size
            gs=gs,  # grid step: 32
            print_freq=50,  # print every this many steps
            warmup=True)
        # update scheduler
        scheduler.step()

        # Without fresh metrics there is nothing to log or checkpoint.
        if not (opt.notest is False or epoch == epochs - 1):
            continue

        # evaluate on the test dataset
        result_info = train_util.evaluate(model,
                                          val_data_loader,
                                          coco=coco,
                                          device=device)

        # only first process in DDP process to record info and save weights
        if not is_main_process:
            continue

        coco_mAP = result_info[0]
        voc_mAP = result_info[1]
        coco_mAR = result_info[8]

        # write into tensorboard
        if tb_writer is not None:
            tags = [
                'train/giou_loss', 'train/obj_loss', 'train/cls_loss',
                'train/loss', "learning_rate", "mAP@[IoU=0.50:0.95]",
                "mAP@[IoU=0.5]", "mAR@[IoU=0.50:0.95]"
            ]
            scalars = mloss.tolist() + [lr, coco_mAP, voc_mAP, coco_mAR]
            for x, tag in zip(scalars, tags):
                tb_writer.add_scalar(tag, x, epoch)

        # write into txt
        with open(results_file, "a") as f:
            # The evaluation metrics, then the total training loss and lr.
            # A separate name keeps `result_info` bound to the numeric
            # metrics instead of rebinding it to strings.
            line_items = [
                str(round(i, 4)) for i in result_info + [mloss.tolist()[-1]]
            ] + [str(round(lr, 6))]
            txt = "epoch:{} {}".format(epoch, ' '.join(line_items))
            f.write(txt + "\n")

        # update best mAP(IoU=0.50:0.95)
        if coco_mAP > best_map:
            best_map = coco_mAP

        if opt.savebest is False:
            # save weights every epoch
            save_path = "./weights/yolov3spp-{}.pt".format(epoch)
        elif best_map == coco_mAP:
            # only save best weights
            save_path = best
        else:
            save_path = None

        if save_path is not None:
            with open(results_file, 'r') as f:
                training_results = f.read()
            save_files = {
                'model': model.module.state_dict(),
                'optimizer': optimizer.state_dict(),
                'training_results': training_results,
                'epoch': epoch,
                'best_map': best_map
            }
            torch.save(save_files, save_path)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    if is_main_process:
        print('Training time {}'.format(total_time_str))

    if tb_writer is not None:
        # Flush pending events and release the event file.
        tb_writer.close()
def main(args):
    """Train and evaluate a segmentation model on the DRIVE dataset.

    Builds the DRIVE train/val datasets and loaders, creates the model
    (optionally with SyncBatchNorm and DistributedDataParallel), and then
    either runs a single evaluation (``args.test_only``) or the training
    loop.  After each epoch the model is evaluated; the main process
    (``args.rank`` of -1 or 0) appends the loss, learning rate, dice score
    and confusion-matrix summary to a timestamped ``results*.txt`` file.
    When ``args.output_dir`` is set a checkpoint is handed to
    ``save_on_master``: every epoch, or only on a new best dice score when
    ``args.save_best`` is True.

    Args:
        args: Namespace of run options.  Fields read here: ``device``,
            ``num_classes``, ``data_path``, ``batch_size``, ``workers``,
            ``sync_bn``, ``lr``, ``momentum``, ``weight_decay``, ``amp``,
            ``epochs``, ``resume``, ``test_only``, ``start_epoch``,
            ``print_freq``, ``save_best``, ``output_dir``, ``distributed``,
            ``rank`` and ``gpu``.
            NOTE(review): ``distributed``/``rank``/``gpu`` are presumably
            filled in by ``init_distributed_mode`` - confirm.

    Raises:
        FileNotFoundError: If ``<args.data_path>/DRIVE`` does not exist.
    """
    init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)
    # Segmentation classes plus one for the background.
    num_classes = args.num_classes + 1

    # Per-channel normalisation statistics handed to get_transform.
    # NOTE(review): presumably computed on the DRIVE images - confirm.
    mean = (0.709, 0.381, 0.224)
    std = (0.127, 0.079, 0.043)

    # File that collects the per-epoch training and validation info.
    results_file = "results{}.txt".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    data_root = args.data_path
    # The data root must contain the DRIVE folder.
    if not os.path.exists(os.path.join(data_root, "DRIVE")):
        raise FileNotFoundError(
            "DRIVE dose not in path:'{}'.".format(data_root))

    train_dataset = DriveDataset(args.data_path,
                                 train=True,
                                 transforms=get_transform(train=True,
                                                          mean=mean,
                                                          std=std))

    val_dataset = DriveDataset(args.data_path,
                               train=False,
                               transforms=get_transform(train=False,
                                                        mean=mean,
                                                        std=std))

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(
            val_dataset)
    else:
        train_sampler = torch.utils.data.RandomSampler(train_dataset)
        test_sampler = torch.utils.data.SequentialSampler(val_dataset)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        sampler=train_sampler,
        num_workers=args.workers,
        collate_fn=train_dataset.collate_fn,
        drop_last=True)

    val_data_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        sampler=test_sampler,
        num_workers=args.workers,
        collate_fn=train_dataset.collate_fn)

    print("Creating model")
    # create model num_classes equal background + foreground classes
    model = create_model(num_classes=num_classes)
    model.to(device)

    if args.sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    # Keep a handle on the bare module so state dicts carry no DDP prefix.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params_to_optimize = [
        p for p in model_without_ddp.parameters() if p.requires_grad
    ]

    optimizer = torch.optim.SGD(params_to_optimize,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    scaler = torch.cuda.amp.GradScaler() if args.amp else None

    # Learning-rate schedule; it is advanced once per step, not per epoch.
    lr_scheduler = create_lr_scheduler(optimizer,
                                       len(train_data_loader),
                                       args.epochs,
                                       warmup=True)

    # Continue from a previous checkpoint when a path is passed via --resume.
    if args.resume:
        # If map_location is missing, torch.load will first load the module
        # to CPU and then copy each parameter to where it was saved, which
        # would result in all processes on the same machine using the same
        # set of devices.
        checkpoint = torch.load(args.resume, map_location='cpu')
        # Restore weights together with optimizer and LR-schedule state.
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1
        if args.amp:
            scaler.load_state_dict(checkpoint["scaler"])

    if args.test_only:
        # evaluate() yields (confmat, dice), exactly as it is unpacked in
        # the training loop below; binding the whole tuple to one name would
        # print the tuple's repr instead of the confusion-matrix summary.
        confmat, dice = evaluate(model,
                                 val_data_loader,
                                 device=device,
                                 num_classes=num_classes)
        val_info = str(confmat)
        print(val_info)
        print(f"dice coefficient: {dice:.3f}")
        return

    best_dice = 0.
    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        mean_loss, lr = train_one_epoch(model,
                                        optimizer,
                                        train_data_loader,
                                        device,
                                        epoch,
                                        num_classes,
                                        lr_scheduler=lr_scheduler,
                                        print_freq=args.print_freq,
                                        scaler=scaler)

        confmat, dice = evaluate(model,
                                 val_data_loader,
                                 device=device,
                                 num_classes=num_classes)
        val_info = str(confmat)
        print(val_info)
        print(f"dice coefficient: {dice:.3f}")

        # Only the main process writes to disk.
        if args.rank in [-1, 0]:
            # write into txt
            with open(results_file, "a") as f:
                # Per-epoch train loss, lr and validation metrics.
                train_info = f"[epoch: {epoch}]\n" \
                             f"train_loss: {mean_loss:.4f}\n" \
                             f"lr: {lr:.6f}\n" \
                             f"dice coefficient: {dice:.3f}\n"
                f.write(train_info + val_info + "\n\n")

        if args.save_best is True:
            # In save-best mode only a strictly better dice score is saved.
            if best_dice < dice:
                best_dice = dice
            else:
                continue

        if args.output_dir:
            # save_on_master only writes from the master process.
            save_file = {
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args,
                'epoch': epoch
            }
            if args.amp:
                save_file["scaler"] = scaler.state_dict()

            if args.save_best is True:
                save_on_master(
                    save_file, os.path.join(args.output_dir,
                                            'best_model.pth'))
            else:
                save_on_master(
                    save_file,
                    os.path.join(args.output_dir,
                                 'model_{}.pth'.format(epoch)))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Train and evaluate a detector on Pascal VOC 2012 (21 classes).

    Sets up the VOC train/val datasets and loaders, creates the model
    (wrapped in DistributedDataParallel when ``args.distributed`` is set) and
    then either runs a single evaluation (``args.test_only``) or, for every
    epoch in ``[args.start_epoch, args.epochs)``: trains, steps the LR
    schedule, hands a checkpoint to ``save_on_master`` (when
    ``args.output_dir`` is set) and evaluates on the validation split.

    Args:
        args: Namespace of run options.  Fields read here: ``device``,
            ``data_path``, ``aspect_ratio_group_factor``, ``batch_size``,
            ``workers``, ``lr``, ``momentum``, ``weight_decay``,
            ``lr_steps``, ``lr_gamma``, ``resume``, ``test_only``,
            ``start_epoch``, ``epochs``, ``print_freq``, ``output_dir``,
            ``distributed`` and ``gpu``.
            NOTE(review): ``distributed``/``gpu`` are presumably filled in
            by ``init_distributed_mode`` - confirm.

    Raises:
        FileNotFoundError: If ``<args.data_path>/VOCdevkit`` does not exist.
    """
    print(args)
    init_distributed_mode(args)

    device = torch.device(args.device)

    # Data loading code
    print("Loading data")
    train_transform = transforms.Compose(
        [transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)])
    val_transform = transforms.Compose([transforms.ToTensor()])

    voc_root = args.data_path
    # The dataset root must contain the VOCdevkit folder.
    if not os.path.exists(os.path.join(voc_root, "VOCdevkit")):
        raise FileNotFoundError(
            "VOCdevkit dose not in path:'{}'.".format(voc_root))

    # Training split: VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt
    train_dataset = VOC2012DataSet(voc_root, train_transform, "train.txt")
    # Validation split: VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt
    val_dataset = VOC2012DataSet(voc_root, val_transform, "val.txt")

    print("Creating data loaders")
    if not args.distributed:
        train_sampler = torch.utils.data.RandomSampler(train_dataset)
        test_sampler = torch.utils.data.SequentialSampler(val_dataset)
    else:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(
            val_dataset)

    if args.aspect_ratio_group_factor < 0:
        train_batch_sampler = torch.utils.data.BatchSampler(
            train_sampler, args.batch_size, drop_last=True)
    else:
        # Bucket every image by aspect ratio so batches hold similar shapes.
        group_ids = create_aspect_ratio_groups(
            train_dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(
            train_sampler, group_ids, args.batch_size)

    data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_sampler=train_batch_sampler,
        num_workers=args.workers,
        collate_fn=train_dataset.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        sampler=test_sampler,
        num_workers=args.workers,
        collate_fn=train_dataset.collate_fn)

    print("Creating model")
    model = create_model(num_classes=21, device=device)
    model.to(device)

    # Keep a handle on the bare module so state dicts carry no DDP prefix.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    trainable_params = [
        param for param in model.parameters() if param.requires_grad
    ]
    optimizer = torch.optim.SGD(trainable_params,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    # Continue from a previous checkpoint when a path is passed via --resume.
    if args.resume:
        # If map_location is missing, torch.load will first load the module
        # to CPU and then copy each parameter to where it was saved, which
        # would result in all processes on the same machine using the same
        # set of devices.
        ckpt = torch.load(args.resume, map_location='cpu')
        # Restore weights together with optimizer and LR-schedule state.
        model_without_ddp.load_state_dict(ckpt['model'])
        optimizer.load_state_dict(ckpt['optimizer'])
        lr_scheduler.load_state_dict(ckpt['lr_scheduler'])
        args.start_epoch = ckpt['epoch'] + 1

    if args.test_only:
        utils.evaluate(model, data_loader_test, device=device)
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        utils.train_one_epoch(model, optimizer, data_loader, device, epoch,
                              args.print_freq)
        lr_scheduler.step()

        if args.output_dir:
            # save_on_master only writes from the master process.
            state = {
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args,
                'epoch': epoch,
            }
            target = os.path.join(args.output_dir, f'model_{epoch}.pth')
            save_on_master(state, target)

        # Evaluate after every epoch.
        utils.evaluate(model, data_loader_test, device=device)

    elapsed_seconds = int(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed_seconds))
    print(f'Training time {elapsed}')