def train_network(self, args):
    self.optimizer = torch.optim.Adam(self.model.parameters(), args.init_lr)
    self.scheduler = torch.optim.lr_scheduler.ExponentialLR(self.optimizer, gamma=0.96, last_epoch=-1)
    save_path = 'weights_' + args.dataset
    start_epoch = 1

    # resume support for continuing training after an interruption, 10-16-2020
    if args.resume_train:
        self.model, self.optimizer, start_epoch = self.load_model(self.model,
                                                                  self.optimizer,
                                                                  args.resume_train,
                                                                  strict=True)
    # end

    if not os.path.exists(save_path):
        os.mkdir(save_path)
    if args.ngpus > 1:
        if torch.cuda.device_count() > 1:
            print("Let's use", torch.cuda.device_count(), "GPUs!")
            # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
            self.model = nn.DataParallel(self.model)
    self.model.to(self.device)

    criterion = loss.LossAll()
    print('Setting up data...')

    dataset_module = self.dataset[args.dataset]
    dsets = {x: dataset_module(data_dir=args.data_dir,
                               phase=x,
                               input_h=args.input_h,
                               input_w=args.input_w,
                               down_ratio=self.down_ratio)
             for x in self.dataset_phase[args.dataset]}

    dsets_loader = {}
    dsets_loader['train'] = torch.utils.data.DataLoader(dsets['train'],
                                                        batch_size=args.batch_size,
                                                        shuffle=True,
                                                        num_workers=args.num_workers,
                                                        pin_memory=True,
                                                        drop_last=True,
                                                        collate_fn=collater)

    print('Starting training...')
    train_loss = []
    ap_list = []
    for epoch in range(start_epoch, args.num_epoch + 1):
        print('-' * 10)
        print('Epoch: {}/{} '.format(epoch, args.num_epoch))
        epoch_loss = self.run_epoch(phase='train',
                                    data_loader=dsets_loader['train'],
                                    criterion=criterion)
        train_loss.append(epoch_loss)
        self.scheduler.step(epoch)

        np.savetxt(os.path.join(save_path, 'train_loss.txt'), train_loss, fmt='%.6f')

        if epoch % 5 == 0 or epoch > 20:
            self.save_model(os.path.join(save_path, 'model_{}.pth'.format(epoch)),
                            epoch,
                            self.model,
                            self.optimizer)

        if 'test' in self.dataset_phase[args.dataset] and epoch % 5 == 0:
            mAP = self.dec_eval(args, dsets['test'])
            ap_list.append(mAP)
            np.savetxt(os.path.join(save_path, 'ap_list.txt'), ap_list, fmt='%.6f')

        self.save_model(os.path.join(save_path, 'model_last.pth'),
                        epoch,
                        self.model,
                        self.optimizer)
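# The checkpoint helpers save_model/load_model called above are defined elsewhere
# and are not shown in this section. A minimal sketch of what they could look like,
# matching the call signatures used here, is given below as an assumption -- the
# real helpers may store additional fields.
import torch

def save_model(path, epoch, model, optimizer=None):
    # Unwrap nn.DataParallel so the checkpoint also loads on a single GPU.
    state_dict = model.module.state_dict() if hasattr(model, 'module') else model.state_dict()
    data = {'epoch': epoch, 'state_dict': state_dict}
    if optimizer is not None:
        data['optimizer'] = optimizer.state_dict()
    torch.save(data, path)

def load_model(model, optimizer, resume_path, strict=True):
    checkpoint = torch.load(resume_path, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'], strict=strict)
    if optimizer is not None and 'optimizer' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])
    start_epoch = checkpoint['epoch'] + 1
    return model, optimizer, start_epoch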
def train_network(self, args):
    save_path = 'weights_' + args.dataset
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    self.optimizer = torch.optim.Adam(self.model.parameters(), args.init_lr)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(self.optimizer, gamma=0.96, last_epoch=-1)
    if args.ngpus > 0:
        if torch.cuda.device_count() > 1:
            print("Let's use", torch.cuda.device_count(), "GPUs!")
            self.model = nn.DataParallel(self.model)
    self.model.to(self.device)

    criterion = loss.LossAll()
    print('Setting up data...')

    dataset_module = self.dataset[args.dataset]
    dsets = {x: dataset_module(data_dir=args.data_dir,
                               phase=x,
                               input_h=args.input_h,
                               input_w=args.input_w,
                               down_ratio=args.down_ratio)
             for x in ['train', 'val']}
    dsets_loader = {'train': torch.utils.data.DataLoader(dsets['train'],
                                                         batch_size=args.batch_size,
                                                         shuffle=True,
                                                         num_workers=args.num_workers,
                                                         pin_memory=True,
                                                         drop_last=True,
                                                         collate_fn=collater),
                    'val': torch.utils.data.DataLoader(dsets['val'],
                                                       batch_size=1,
                                                       shuffle=False,
                                                       num_workers=1,
                                                       pin_memory=True,
                                                       collate_fn=collater)}

    print('Starting training...')
    train_loss = []
    val_loss = []
    for epoch in range(1, args.num_epoch + 1):
        print('-' * 10)
        print('Epoch: {}/{} '.format(epoch, args.num_epoch))
        epoch_loss = self.run_epoch(phase='train',
                                    data_loader=dsets_loader['train'],
                                    criterion=criterion)
        train_loss.append(epoch_loss)
        scheduler.step(epoch)

        epoch_loss = self.run_epoch(phase='val',
                                    data_loader=dsets_loader['val'],
                                    criterion=criterion)
        val_loss.append(epoch_loss)

        np.savetxt(os.path.join(save_path, 'train_loss.txt'), train_loss, fmt='%.6f')
        np.savetxt(os.path.join(save_path, 'val_loss.txt'), val_loss, fmt='%.6f')

        if epoch % 10 == 0 or epoch == 1:
            self.save_model(os.path.join(save_path, 'model_{}.pth'.format(epoch)),
                            epoch,
                            self.model)
        if len(val_loss) > 1:
            if val_loss[-1] < np.min(val_loss[:-1]):
                self.save_model(os.path.join(save_path, 'model_last.pth'),
                                epoch,
                                self.model)
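# run_epoch is called with phase 'train' or 'val' but is not reproduced in this
# section. A minimal sketch, assuming the collated batch is a dict of tensors
# with an 'input' key and that the method returns the mean loss over the loader;
# the actual method may differ.
import torch

def run_epoch(self, phase, data_loader, criterion):
    if phase == 'train':
        self.model.train()
    else:
        self.model.eval()
    running_loss = 0.
    for data_dict in data_loader:
        # Move every tensor of the collated batch onto the training device.
        for name in data_dict:
            data_dict[name] = data_dict[name].to(device=self.device, non_blocking=True)
        if phase == 'train':
            self.optimizer.zero_grad()
            pr_decs = self.model(data_dict['input'])
            loss_value = criterion(pr_decs, data_dict)
            loss_value.backward()
            self.optimizer.step()
        else:
            with torch.no_grad():
                pr_decs = self.model(data_dict['input'])
                loss_value = criterion(pr_decs, data_dict)
        running_loss += loss_value.item()
    return running_loss / len(data_loader)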
def train_network(self, args):
    optimizer = torch.optim.AdamW(self.model.parameters(), args.init_lr)
    self.optimizer = Lookahead(optimizer)
    milestones = [5 + x * 80 for x in range(5)]
    # print(f'milestones:{milestones}')
    # self.scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.96, last_epoch=-1)
    scheduler_c = CyclicCosAnnealingLR(optimizer, milestones=milestones, eta_min=5e-5)
    self.scheduler = LearningRateWarmUP(optimizer=optimizer,
                                        target_iteration=5,
                                        target_lr=0.003,
                                        after_scheduler=scheduler_c)
    save_path = 'weights_' + args.dataset
    start_epoch = 1
    best_loss = 1000

    # try:
    #     self.model, _, _ = self.load_model(self.model, self.optimizer, args.resume)
    # except:
    #     print('load pretrained model failed')
    # self.model = self.load_model(self.model, self.optimizer, args.resume)

    if not os.path.exists(save_path):
        os.mkdir(save_path)
    if args.ngpus > 1:
        if torch.cuda.device_count() > 1:
            print("Let's use", torch.cuda.device_count(), "GPUs!")
            # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
            self.model = nn.DataParallel(self.model)
    self.model.to(self.device)

    criterion = loss.LossAll()
    print('Setting up data...')

    dataset_module = self.dataset[args.dataset]
    dsets = {x: dataset_module(data_dir=args.data_dir,
                               phase=x,
                               input_h=args.input_h,
                               input_w=args.input_w,
                               down_ratio=self.down_ratio)
             for x in self.dataset_phase[args.dataset]}

    dsets_loader = {}
    dsets_loader['train'] = torch.utils.data.DataLoader(dsets['train'],
                                                        batch_size=args.batch_size,
                                                        shuffle=True,
                                                        num_workers=args.num_workers,
                                                        pin_memory=True,
                                                        drop_last=True,
                                                        collate_fn=collater)
    dsets_loader['valid'] = torch.utils.data.DataLoader(dsets['valid'],
                                                        batch_size=args.batch_size,
                                                        shuffle=True,
                                                        num_workers=args.num_workers,
                                                        pin_memory=True,
                                                        drop_last=True,
                                                        collate_fn=collater)

    print('Starting training...')
    train_loss = []
    valid_loss = []
    ap_list = []
    for epoch in range(start_epoch, args.num_epoch + 1):
        print('-' * 10)
        print('Epoch: {}/{} '.format(epoch, args.num_epoch))
        epoch_loss = self.run_epoch(phase='train',
                                    data_loader=dsets_loader['train'],
                                    criterion=criterion)
        train_loss.append(epoch_loss)
        epoch_loss = self.run_epoch(phase='valid',
                                    data_loader=dsets_loader['valid'],
                                    criterion=criterion)
        valid_loss.append(epoch_loss)
        self.scheduler.step(epoch)

        np.savetxt(os.path.join(save_path, 'train_loss.txt'), train_loss, fmt='%.6f')
        np.savetxt(os.path.join(save_path, 'valid_loss.txt'), valid_loss, fmt='%.6f')

        # if epoch % 5 == 0 or epoch > 20:
        #     self.save_model(os.path.join(save_path, 'model_{}.pth'.format(epoch)),
        #                     epoch,
        #                     self.model,
        #                     self.optimizer)
        if epoch_loss < best_loss:
            self.save_model(os.path.join(save_path, 'model_{}.pth'.format(epoch)),
                            epoch,
                            self.model,
                            self.optimizer)
            print(f'found a better model, {best_loss} ==> {epoch_loss}')
            best_loss = epoch_loss

        self.save_model(os.path.join(save_path, 'model_last.pth'),
                        epoch,
                        self.model,
                        self.optimizer)
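# Lookahead, CyclicCosAnnealingLR and LearningRateWarmUP are custom/third-party
# classes whose implementations are not shown in this section. A roughly
# equivalent schedule (linear warm-up for 5 epochs, then cosine annealing with
# restarts every 80 epochs) can be sketched with built-in PyTorch schedulers
# (>= 1.10); the hyper-parameters below only mirror the ones above and are
# illustrative, not taken from the original code.
import torch

def build_warmup_cosine_scheduler(optimizer):
    warmup = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.1, total_iters=5)
    cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=80, eta_min=5e-5)
    # Hand over from the warm-up schedule to the cosine schedule after epoch 5.
    return torch.optim.lr_scheduler.SequentialLR(optimizer,
                                                 schedulers=[warmup, cosine],
                                                 milestones=[5])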
def train_network(self, args):
    # The multi-GPU case is not handled in this variant; the DataParallel branch
    # is simply removed (prune unused code whenever possible to keep complexity down).
    # if args.ngpus > 1:
    #     if torch.cuda.device_count() > 1:
    #         print("Let's use", torch.cuda.device_count(), "GPUs!")
    #         # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    #         self.model = nn.DataParallel(self.model)

    self.optimizer = torch.optim.Adam(self.model.parameters(), args.init_lr)
    # !!!! The LR scheduler may not work on older PyTorch versions.
    self.scheduler = torch.optim.lr_scheduler.ExponentialLR(self.optimizer, gamma=0.96, last_epoch=-1)
    save_path = 'weights_' + args.dataset
    start_epoch = 1

    # Pass resume_train when a pretrained/checkpointed model should be loaded.
    if args.resume_train:
        self.model, self.optimizer, start_epoch = self.load_model(self.model,
                                                                  self.optimizer,
                                                                  args.resume_train,
                                                                  strict=True)
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    self.model.to(self.device)

    # Create a loss.LossAll instance, used to compute the loss on the detection outputs.
    criterion = loss.LossAll()
    print('Setting up data...')

    # self.dataset is a dict {dataset name: dataset class (under datasets)};
    # if only one dataset is ever used, this could be simplified.
    # The args decide which dataset is selected here.
    dataset_module = self.dataset[args.dataset]
    # Build one dataset instance per phase listed in self.dataset_phase
    # (see the dataset class for the interface it implements).
    dsets = {x: dataset_module(data_dir=args.data_dir,
                               phase=x,
                               input_h=args.input_h,
                               input_w=args.input_w,
                               down_ratio=self.down_ratio)
             for x in self.dataset_phase[args.dataset]}
    # The author kept this extensible, but only dsets_loader['train'] is actually used.
    dsets_loader = {}
    # The DataLoader can be iterated directly inside run_epoch:
    # for batch_data in dsets_loader['train']
    dsets_loader['train'] = torch.utils.data.DataLoader(dsets['train'],
                                                        batch_size=args.batch_size,
                                                        shuffle=True,
                                                        num_workers=args.num_workers,
                                                        pin_memory=True,
                                                        drop_last=True,
                                                        collate_fn=collater)

    print('Starting training...')
    train_loss = []
    ap_list = []
    for epoch in range(start_epoch, args.num_epoch + 1):
        print('-' * 10)
        print('Epoch: {}/{} '.format(epoch, args.num_epoch))
        # run_epoch wraps the forward pass, the loss computation, and the backward pass.
        epoch_loss = self.run_epoch(phase='train',
                                    data_loader=dsets_loader['train'],
                                    criterion=criterion)
        train_loss.append(epoch_loss)
        # !!! Note: the lr-scheduler call below may cause version issues.
        self.scheduler.step(epoch)

        np.savetxt(os.path.join(save_path, 'train_loss.txt'), train_loss, fmt='%.6f')

        # Save a checkpoint every 5 epochs.
        if epoch % 5 == 0:  # or epoch > 20:
            self.save_model(os.path.join(save_path, 'model_{}.pth'.format(epoch)),
                            epoch,
                            self.model,
                            self.optimizer)
        # Evaluate every 5 epochs (does 'test' refer to the validation set?).
        if 'test' in self.dataset_phase[args.dataset] and epoch % 5 == 0:
            # Use dec_eval to compute the evaluation metric.
            mAP = self.dec_eval(args, dsets['test'])
            ap_list.append(mAP)
            np.savetxt(os.path.join(save_path, 'ap_list.txt'), ap_list, fmt='%.6f')

        self.save_model(os.path.join(save_path, 'model_last.pth'),
                        epoch,
                        self.model,
                        self.optimizer)
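# The dataset classes and the collater function live elsewhere in the repo and
# are not reproduced here. The constructor call above implies the interface
# sketched below; the body is an assumption, not the original implementation.
import torch.utils.data

class DatasetSkeleton(torch.utils.data.Dataset):
    def __init__(self, data_dir, phase, input_h, input_w, down_ratio):
        self.data_dir = data_dir      # root directory of images/annotations
        self.phase = phase            # 'train' / 'test' ...
        self.input_h = input_h        # network input height
        self.input_w = input_w        # network input width
        self.down_ratio = down_ratio  # stride between input and output feature map
        self.img_ids = []             # filled by the concrete dataset

    def __len__(self):
        return len(self.img_ids)

    def __getitem__(self, index):
        # A concrete dataset would load the image here, build the ground-truth
        # maps, and return everything as a dict of arrays that collater can batch.
        raise NotImplementedError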
def train_network(self, args):
    self.optimizer = torch.optim.Adam(self.model.parameters(), args.init_lr)
    start_epoch = 1
    path = 'weights_dota/model_last.pth'
    # Resume from a checkpoint; the optimizer state stored in it is discarded
    # (self.optimizer keeps the freshly set learning rate).
    if '.pth' in args.resume:
        self.model, _, start_epoch = self.load_model(self.model, self.optimizer, path, strict=True)

    # Decay the learning rate exponentially (worth tuning!).
    self.scheduler = torch.optim.lr_scheduler.ExponentialLR(self.optimizer, gamma=0.96, last_epoch=-1)
    save_path = 'weights_' + args.dataset
    # Create the checkpoint directory.
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    # Multi-GPU data parallelism.
    if args.ngpus > 1:
        if torch.cuda.device_count() > 1:
            print("Let's use", torch.cuda.device_count(), "GPUs!")
            self.model = nn.DataParallel(self.model)
    self.model.to(self.device)

    criterion = loss.LossAll()  # the loss function
    print('Setting up data...')

    dataset_module = self.dataset[args.dataset]  # the DOTA dataset class
    # Instantiate the DOTA dataset for each phase.
    dsets = {x: dataset_module(data_dir=args.data_dir,
                               phase=x,
                               input_h=args.input_h,
                               input_w=args.input_w,
                               down_ratio=self.down_ratio)
             for x in self.dataset_phase[args.dataset]}
    # Build the data iterator with torch's built-in DataLoader.
    dsets_loader = {}
    dsets_loader['train'] = torch.utils.data.DataLoader(dsets['train'],
                                                        batch_size=args.batch_size,
                                                        shuffle=True,
                                                        num_workers=args.num_workers,
                                                        pin_memory=True,
                                                        drop_last=True,
                                                        collate_fn=collater)

    print('Starting training...')
    train_loss = []
    ap_list = []
    # Training loop.
    for epoch in range(start_epoch, args.num_epoch + 1):
        print('-' * 10)
        print('Epoch: {}/{} '.format(epoch, args.num_epoch))
        # Loss for a single epoch.
        epoch_loss = self.run_epoch(phase='train',
                                    data_loader=dsets_loader['train'],
                                    criterion=criterion)
        train_loss.append(epoch_loss)
        self.scheduler.step()  # adjust the learning rate per epoch

        # Save the training-loss history.
        np.savetxt(os.path.join(save_path, 'train_loss.txt'), train_loss, fmt='%.6f')

        # Save a checkpoint every 10 epochs (the "or epoch > 40" condition was dropped).
        if epoch % 10 == 0:
            self.save_model_yan(os.path.join(save_path, 'model_{}.pth'.format(epoch)),
                                epoch,
                                self.model)
        '''
        # If a 'test' phase exists, run evaluation every 5 epochs.
        if 'test' in self.dataset_phase[args.dataset] and epoch % 5 == 0:
            mAP = self.dec_eval(args, dsets['test'])
            ap_list.append(mAP)
            np.savetxt(os.path.join(save_path, 'ap_list.txt'), ap_list, fmt='%.6f')
        '''
        # Save the model at every epoch.
        self.save_model(os.path.join(save_path, 'model_last.pth'),
                        epoch,
                        self.model,
                        self.optimizer)
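# With ExponentialLR(gamma=0.96) stepped once per epoch, the learning rate after
# n epochs is init_lr * 0.96 ** n. A small stand-alone check; the init_lr value
# below is only for illustration.
import torch

_params = [torch.nn.Parameter(torch.zeros(1))]
_opt = torch.optim.Adam(_params, lr=1.25e-4)
_sched = torch.optim.lr_scheduler.ExponentialLR(_opt, gamma=0.96)
for _epoch in range(3):
    _opt.step()
    _sched.step()
    print(_epoch, _sched.get_last_lr())  # ~1.20e-4, ~1.15e-4, ~1.11e-4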