def train(self):
    """Run the full training loop.

    Iterates ``self.num_epochs`` epochs over ``self.train_loader``; every
    ``self.print_every`` steps prints running stats, every ``self.val_every``
    steps runs validation, saves the best checkpoint, and (optionally)
    logs scalars to ``self.writer``.
    """
    # Meters track running averages over the current epoch.
    avg_train_loss = AverageMeter()
    avg_train_acc = AverageMeter()
    best_val_acc = -np.inf  # any finite accuracy beats the initial value
    for epoch in range(self.num_epochs):
        avg_train_loss.reset()
        avg_train_acc.reset()
        # Mini batch loop
        for batch_idx, batch in enumerate(tqdm(self.train_loader)):
            # Global step counter used for logging / validation cadence.
            step = epoch * len(self.train_loader) + batch_idx
            # One optimization step; returns the loss tensor and accuracy.
            train_loss, train_acc = self.train_step(batch)
            # Cyclic LR schedulers advance once per batch.
            if self.args.scheduler == 'cycle':
                self.scheduler.step()
            avg_train_loss.update([train_loss.item()])
            avg_train_acc.update([train_acc])
            # Step-checkpointing, currently disabled:
            # if step % self.save_every == 0:
            #     step_chkpt_path = os.path.join(self.model_dir,
            #                                    'step_chkpt_{}_{}.pth'.format(epoch, step))
            #     print("Saving the model checkpoint for epoch {} at step {}".format(epoch, step))
            #     torch.save(self.model.state_dict(), step_chkpt_path)
            # Periodic console logging.
            if step % self.print_every == 0:
                print(
                    'Epoch {}, batch {}, step {}, '
                    'loss = {:.4f}, acc = {:.4f}, '
                    'running averages: loss = {:.4f}, acc = {:.4f}'.format(
                        epoch, batch_idx, step, train_loss.item(), train_acc,
                        avg_train_loss.get(), avg_train_acc.get()))
            # Periodic validation + best-checkpoint saving.
            if step % self.val_every == 0:
                val_loss, val_acc = self.val()
                print('Val acc = {:.4f}, Val loss = {:.4f}'.format(
                    val_acc, val_loss))
                if self.visualize:
                    self.writer.add_scalar('Val/loss', val_loss, step)
                    self.writer.add_scalar('Val/acc', val_acc, step)
                # Save the best-so-far validation checkpoint.
                if val_acc > best_val_acc:
                    best_val_acc = val_acc
                    best_chkpt_path = os.path.join(self.model_dir,
                                                   'best_ckpt.pth')
                    torch.save(self.model.state_dict(), best_chkpt_path)
                # Plateau scheduler steps on the validation metric.
                if self.args.scheduler == 'plateau':
                    self.scheduler.step(val_acc)
            if self.visualize:
                # Log per-step training scalars.
                self.writer.add_scalar('Train/loss', train_loss.item(), step)
                self.writer.add_scalar('Train/acc', train_acc, step)
def evaluate(loader, model): print("Evaluate") # Set model to eval model.eval() accuracy = AverageMeter() positive_accuracy = AverageMeter() negative_accuracy = AverageMeter() y_true = None y_scores = None with torch.no_grad(): for batch_idx, (x, y) in enumerate(loader): x = x.to(device=device).to(torch.float32) y = y.to(device=device).to(torch.float32) scores = model(x) scores = torch.squeeze(scores, 2) y = torch.unsqueeze(y, 1) loss = criterion(scores, y) scores = torch.squeeze(scores, 1) y = torch.squeeze(y, 1) if y_true is None: y_true = y y_scores = scores else: y_true = torch.cat((y_true, y)) y_scores = torch.cat((y_scores, scores)) acc = get_accuracy(y, scores) # neg_acc, pos_acc = get_accuracy_per_class(y.cpu(), scores.cpu()) accuracy.update(acc) # positive_accuracy.update(pos_acc) # negative_accuracy.update(neg_acc) auc = roc_auc_score(y_true.cpu(), y_scores.cpu()) wandb.log({ "valid_acc": accuracy.avg, # "positive_acc": positive_accuracy.avg, # "negative_acc": negative_accuracy.avg, "valid_loss": loss.item(), "AUC": auc }) accuracy.reset() # Set model back to train model.train()
def test(cfg, model, logger, writer, metrics, tid_done):
    """Evaluate *model* on every task seen so far (continual-learning test).

    Builds one loader per task id in ``0..tid_done``, feeds predictions into
    the *metrics* tracker, then logs cumulative accuracy and forgetting.
    Relies on the module-level ``device``, ``get_loader`` and ``AverageMeter``.
    """
    model.eval()
    criterion = torch.nn.CrossEntropyLoss()
    # One (task id, loader) pair for each task finished so far.
    test_loaders = [(tid, get_loader(cfg, False, tid)) for tid in range(tid_done+1)]
    avg_meter = AverageMeter()  # NOTE(review): reset below but never updated
    for tid, test_loader in test_loaders:
        avg_meter.reset()
        for idx, data in enumerate(test_loader):
            x, y = data
            x = x.to(device)
            y = y.to(device)
            output = model(x)
            test_loss = criterion(output, y)  # NOTE(review): computed but unused
            pred = output.argmax(dim=1, keepdim=True)
            # Per-batch accuracy is accumulated inside the metrics object.
            acc = metrics.accuracy(tid, tid_done, pred, y)
        # Finalize per-task metrics once the task's loader is consumed;
        # presumably avg_accuracy divides the accumulated count by the
        # dataset size — TODO confirm against the metrics implementation.
        metrics.avg_accuracy(tid, tid_done, len(test_loader.dataset))
        metrics.forgetting(tid, tid_done)
    logger.info(f'Task Done:{tid_done},\
 Test Acc:{metrics.acc_task(tid_done)},\
 Test Forgetting:{metrics.forgetting_task(tid_done)}')
def train(self):
    """Training loop for the joint audio/text model.

    Tracks separate audio and text accuracies, validates every
    ``self.val_every`` steps, checkpoints on the best audio/text average
    accuracy, and progressively unfreezes one model layer per epoch.
    """
    avg_train_loss = AverageMeter()
    avg_train_acc = AverageMeter()
    text_avg_train_acc = AverageMeter()
    best_val_acc = -np.inf  # any finite accuracy beats the initial value
    for epoch in range(self.num_epochs):
        # Show which layers are currently frozen (progressive unfreezing).
        self.model.print_frozen()
        avg_train_loss.reset()
        avg_train_acc.reset()
        text_avg_train_acc.reset()
        # Mini batch loop
        for batch_idx, batch in enumerate(tqdm(self.train_loader)):
            step = epoch * len(self.train_loader) + batch_idx
            # One optimization step; returns loss plus audio & text accuracy.
            train_loss, train_acc, text_train_acc = self.train_step(batch)
            if self.args.scheduler == 'cycle':
                self.scheduler.step()
            avg_train_loss.update([train_loss.item()])
            avg_train_acc.update([train_acc])
            text_avg_train_acc.update([text_train_acc])
            # Periodic console logging.
            if step % self.print_every == 0:
                print(
                    'Epoch {}, batch {}, step {}, '
                    'loss = {:.4f}, acc_audio = {:.4f}, acc_text = {:.4f}, '
                    'running averages: loss = {:.4f}, acc_audio = {:.4f}, acc_text = {:.4f}'
                    .format(epoch, batch_idx, step, train_loss.item(),
                            train_acc, text_train_acc, avg_train_loss.get(),
                            avg_train_acc.get(), text_avg_train_acc.get()))
            # Periodic validation + best-checkpoint saving.
            if step % self.val_every == 0:
                val_loss, val_acc, text_val_acc = self.val()
                print(
                    'Val acc (audio) = {:.4f}, Val acc (text) = {:.4f}, Val loss = {:.4f}'
                    .format(val_acc, text_val_acc, val_loss))
                # Model selection uses the mean of audio and text accuracy.
                audio_text_avg_acc = (val_acc + text_val_acc) / 2
                if audio_text_avg_acc > best_val_acc:
                    best_val_acc = audio_text_avg_acc
                    #print('Start saving best checkpoint...)
                    best_chkpt_path = os.path.join(self.model_dir,
                                                   'best_ckpt.pth')
                    torch.save(self.model.state_dict(), best_chkpt_path)
                    #print('Done saving best checkpoint!!!)
                if self.args.scheduler == 'plateau':
                    self.scheduler.step(audio_text_avg_acc)
        # Unfreeze the next layer after each full epoch.
        self.model.unfreeze_one_layer()
def train_net(param, model, train_data, valid_data, plot=False, device='cuda'):
    """Train a segmentation network with AMP, a cosine-restart LR schedule
    and mIoU-based best-model selection.

    Fixes vs. the original:
      * ``scaler = GradScaler()`` was commented out although ``scaler`` is
        used in the loop (NameError on the first batch) — restored.
      * validation's ``image_loss = loss.item()`` was commented out, so the
        validation meters silently reused the LAST TRAINING batch's loss —
        restored.
      * backward/step moved OUT of the ``autocast()`` region, as the AMP
        docs require autocast to cover only the forward pass.
      * the per-epoch timer no longer shadows the resume-epoch variable.

    Args:
        param (dict): hyper-parameters and paths (epochs, batch_size,
            weight_decay, logging intervals, checkpoint dirs, ...).
        model: network to train, already on *device*.
        train_data / valid_data: map-style datasets yielding dicts with
            'image' (CxHxW) and 'label' tensors.
        plot (bool): show loss / learning-rate curves when done.
        device (str): torch device string.

    Returns:
        tuple: (best_model, model) — deep copy of the best-mIoU snapshot
        and the final model.
    """
    # Unpack hyper-parameters.
    model_name = param['model_name']
    epochs = param['epochs']
    batch_size = param['batch_size']
    lr = param['lr']
    gamma = param['gamma']
    step_size = param['step_size']
    momentum = param['momentum']
    weight_decay = param['weight_decay']
    disp_inter = param['disp_inter']
    save_inter = param['save_inter']
    min_inter = param['min_inter']
    iter_inter = param['iter_inter']
    save_log_dir = param['save_log_dir']
    save_ckpt_dir = param['save_ckpt_dir']
    load_ckpt_dir = param['load_ckpt_dir']

    # FIX: restore the AMP gradient scaler used below.
    scaler = torch.cuda.amp.GradScaler()

    # Dataset / loader bookkeeping.
    train_data_size = len(train_data)
    valid_data_size = len(valid_data)
    c, y, x = train_data[0]['image'].shape
    train_loader = DataLoader(dataset=train_data, batch_size=batch_size,
                              shuffle=True, num_workers=1)
    valid_loader = DataLoader(dataset=valid_data, batch_size=batch_size,
                              shuffle=False, num_workers=1)
    # NOTE(review): lr is hard-coded to 3e-4 and param['lr'] is ignored —
    # confirm this is intentional.
    optimizer = optim.AdamW(model.parameters(), lr=3e-4,
                            weight_decay=weight_decay)
    #optimizer = optim.SGD(model.parameters(), lr=1e-2, momentum=momentum, weight_decay=weight_decay)
    #scheduler = StepLR(optimizer, step_size=step_size, gamma=gamma)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=3, T_mult=2, eta_min=1e-5, last_epoch=-1)
    #criterion = nn.CrossEntropyLoss(reduction='mean').to(device)
    # Joint Dice + label-smoothed cross-entropy objective.
    DiceLoss_fn = DiceLoss(mode='multiclass')
    SoftCrossEntropy_fn = SoftCrossEntropyLoss(smooth_factor=0.1)
    criterion = L.JointLoss(first=DiceLoss_fn, second=SoftCrossEntropy_fn,
                            first_weight=0.5, second_weight=0.5).cuda()
    logger = inial_logger(
        os.path.join(save_log_dir,
                     time.strftime("%m-%d %H:%M:%S", time.localtime())
                     + '_' + model_name + '.log'))

    # Main-loop bookkeeping.
    train_loss_total_epochs, valid_loss_total_epochs, epoch_lr = [], [], []
    train_loader_size = len(train_loader)
    valid_loader_size = len(valid_loader)
    best_iou = 0
    best_model = copy.deepcopy(model)
    epoch_start = 0
    # Optionally resume from a checkpoint.
    if load_ckpt_dir is not None:
        ckpt = torch.load(load_ckpt_dir)
        epoch_start = ckpt['epoch']
        model.load_state_dict(ckpt['state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
    logger.info('Total Epoch:{} Image_size:({}, {}) Training num:{} Validation num:{}'.format(
        epochs, x, y, train_data_size, valid_data_size))

    for epoch in range(epoch_start, epochs):
        # FIX: dedicated timer name (the original reused `epoch_start`).
        epoch_tic = time.time()
        # --- training phase ---
        model.train()
        train_epoch_loss = AverageMeter()
        train_iter_loss = AverageMeter()
        for batch_idx, batch_samples in enumerate(train_loader):
            data, target = batch_samples['image'], batch_samples['label']
            data, target = Variable(data.to(device)), Variable(target.to(device))
            with autocast():  # mixed-precision forward, needs pytorch>=1.6
                pred = model(data)
                loss = criterion(pred, target)
            # Scaled backward + optimizer step outside autocast (per AMP docs).
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            # Fractional-epoch step drives the cosine warm restarts.
            scheduler.step(epoch + batch_idx / train_loader_size)
            image_loss = loss.item()
            train_epoch_loss.update(image_loss)
            train_iter_loss.update(image_loss)
            if batch_idx % iter_inter == 0:
                spend_time = time.time() - epoch_tic
                logger.info('[train] epoch:{} iter:{}/{} {:.2f}% lr:{:.6f} loss:{:.6f} ETA:{}min'.format(
                    epoch, batch_idx, train_loader_size,
                    batch_idx / train_loader_size * 100,
                    optimizer.param_groups[-1]['lr'],
                    train_iter_loss.avg,
                    spend_time / (batch_idx + 1) * train_loader_size // 60 - spend_time // 60))
                train_iter_loss.reset()

        # --- validation phase ---
        model.eval()
        valid_epoch_loss = AverageMeter()
        valid_iter_loss = AverageMeter()
        iou = IOUMetric(10)
        with torch.no_grad():
            for batch_idx, batch_samples in enumerate(valid_loader):
                data, target = batch_samples['image'], batch_samples['label']
                data, target = Variable(data.to(device)), Variable(target.to(device))
                pred = model(data)
                loss = criterion(pred, target)
                pred = pred.cpu().data.numpy()
                pred = np.argmax(pred, axis=1)
                iou.add_batch(pred, target.cpu().data.numpy())
                # FIX: track the CURRENT validation loss, not the stale
                # training-batch value.
                image_loss = loss.item()
                valid_epoch_loss.update(image_loss)
                valid_iter_loss.update(image_loss)
                # if batch_idx % iter_inter == 0:
                #     logger.info('[val] epoch:{} iter:{}/{} {:.2f}% loss:{:.6f}'.format(
                #         epoch, batch_idx, valid_loader_size,
                #         batch_idx / valid_loader_size * 100, valid_iter_loss.avg))
        val_loss = valid_iter_loss.avg
        acc, acc_cls, iu, mean_iu, fwavacc = iou.evaluate()
        logger.info('[val] epoch:{} miou:{:.2f}'.format(epoch, mean_iu))

        # Record per-epoch loss and learning rate.
        train_loss_total_epochs.append(train_epoch_loss.avg)
        valid_loss_total_epochs.append(valid_epoch_loss.avg)
        epoch_lr.append(optimizer.param_groups[0]['lr'])
        # Periodic checkpoint.
        if epoch % save_inter == 0 and epoch > min_inter:
            state = {'epoch': epoch, 'state_dict': model.state_dict(),
                     'optimizer': optimizer.state_dict()}
            filename = os.path.join(save_ckpt_dir,
                                    'checkpoint-epoch{}.pth'.format(epoch))
            # pytorch>=1.6 zips checkpoints; older versions cannot load them.
            torch.save(state, filename)
        # Best-mIoU checkpoint.
        if mean_iu > best_iou:
            state = {'epoch': epoch, 'state_dict': model.state_dict(),
                     'optimizer': optimizer.state_dict()}
            filename = os.path.join(save_ckpt_dir, 'checkpoint-best.pth')
            torch.save(state, filename)
            best_iou = mean_iu
            best_model = copy.deepcopy(model)
            logger.info('[save] Best Model saved at epoch:{} ============================='.format(epoch))
        #scheduler.step()

    # Plot the training loss and learning-rate curves.
    if plot:
        x = [i for i in range(epochs)]  # NOTE: shadows the image width above
        fig = plt.figure(figsize=(12, 4))
        ax = fig.add_subplot(1, 2, 1)
        ax.plot(x, smooth(train_loss_total_epochs, 0.6), label='train loss')
        ax.plot(x, smooth(valid_loss_total_epochs, 0.6), label='val loss')
        ax.set_xlabel('Epoch', fontsize=15)
        ax.set_ylabel('CrossEntropy', fontsize=15)
        ax.set_title('train curve', fontsize=15)
        ax.grid(True)
        plt.legend(loc='upper right', fontsize=15)
        ax = fig.add_subplot(1, 2, 2)
        ax.plot(x, epoch_lr, label='Learning Rate')
        ax.set_xlabel('Epoch', fontsize=15)
        ax.set_ylabel('Learning Rate', fontsize=15)
        ax.set_title('lr curve', fontsize=15)
        ax.grid(True)
        plt.legend(loc='upper right', fontsize=15)
        plt.show()

    return best_model, model
def train_model(epoch, model, optimizer, lr_scheduler, loader, test_loader):
    """Train *model* for one epoch, interleaving periodic dev-set evaluation,
    TensorBoard logging and checkpointing.

    Fix vs. original: the bare ``except:`` around ``next(test_loader_it)``
    also swallowed KeyboardInterrupt and real loader errors; it now catches
    only ``StopIteration`` (loader exhausted -> restart the iterator).

    Relies on the globals ``GLOBAL_STEP``, ``args``, ``device``,
    ``tb_writer``, ``batch_eval`` and ``save_mode``.
    """
    global GLOBAL_STEP
    test_loader_it = iter(test_loader)
    loss_meter = AverageMeter()
    val_loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    val_acc_meter = AverageMeter()
    model.train()
    model.to(device)
    print('=' * 20 + "Model Training" + '=' * 20)
    loss_func = nn.CrossEntropyLoss()
    for i, batch in tqdm(enumerate(loader)):
        start = time.time()
        model.train()
        optimizer.zero_grad()
        model.zero_grad()
        sentence1, sentence2, label = batch
        label = label.to(device)
        pred = model((sentence1, sentence2))
        loss = loss_func(pred, label)
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.grad_clip_bound)
        optimizer.step()
        # Fraction of correctly-classified examples in this batch.
        acc = torch.mean((torch.max(pred, 1)[1] == label).type(torch.float))
        loss_meter.update(loss.item())
        acc_meter.update(acc.item())
        end = time.time()
        used_time = end - start
        if (GLOBAL_STEP) % args.log_every == 0:
            # Pull one dev batch; restart the iterator when exhausted.
            try:
                batch = next(test_loader_it)
            except StopIteration:  # FIX: was a bare except
                test_loader_it = iter(test_loader)
                batch = next(test_loader_it)
            eval_loss, eval_acc = batch_eval(batch, model)
            val_loss_meter.update(eval_loss.item())
            val_acc_meter.update(eval_acc.item())
            lr = optimizer.param_groups[0]['lr']
            display = 'epoch=' + str(epoch) + \
                      '\tglobal_step=%d' % (GLOBAL_STEP) + \
                      '\tloss=%.4f' % (loss_meter.val) + \
                      '\tloss_avg=%.4f' % (loss_meter.avg) + \
                      '\tval_loss=%.4f' % (val_loss_meter.avg) + \
                      '\tacc=%.4f' % (acc_meter.avg) + \
                      '\tval_acc=%.4f' % (val_acc_meter.avg) + \
                      '\tlr=%.6f' % (lr) + \
                      '\t|g|=%.4f' % (grad_norm) + \
                      '\ttime=%.2fit/s' % (1. / used_time)
            tb_writer.add_scalar('Training/training_loss', loss_meter.avg, GLOBAL_STEP)
            tb_writer.add_scalar('Training/training_acc', acc_meter.avg, GLOBAL_STEP)
            tb_writer.add_scalar('Training/dev_loss', val_loss_meter.avg, GLOBAL_STEP)
            tb_writer.add_scalar('Training/dev_acc', val_acc_meter.avg, GLOBAL_STEP)
            tqdm.write(display)
            # Meters report averages per logging window.
            loss_meter.reset()
            acc_meter.reset()
            val_loss_meter.reset()
            val_acc_meter.reset()
        if (GLOBAL_STEP) % (args.log_every * 20) == 0:
            save_mode(epoch=epoch, model=model, optimizer=optimizer,
                      lr_scheduler=lr_scheduler)
        GLOBAL_STEP += 1
    return
def validate(args):
    """Run detection validation (COCO or Waymo) for a prediction bench.

    Sets up optional distributed state, builds the model and dataset,
    times warmed-up inference, evaluates / saves predictions, and logs
    metrics through dllogger.  Returns the (possibly empty) results list.
    """
    setup_dllogger(0, filename=args.dllogger_file)
    if args.checkpoint != '':
        args.pretrained = True
    args.prefetcher = not args.no_prefetcher
    if args.waymo:
        assert args.waymo_val is not None
    # Channels-last (NHWC) vs contiguous (NCHW) tensor memory layout.
    memory_format = (torch.channels_last if args.memory_format == "nhwc"
                     else torch.contiguous_format)
    # Distributed setup: WORLD_SIZE is injected by the process launcher.
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    args.device = 'cuda:0'
    args.world_size = 1
    args.rank = 0  # global rank
    if args.distributed:
        torch.cuda.manual_seed_all(args.seed)
        args.device = 'cuda:%d' % args.local_rank
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()
        args.rank = torch.distributed.get_rank()
    # Set device limit on the current device
    # cudaLimitMaxL2FetchGranularity = 0x05
    pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
    _libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128))
    _libcudart.cudaDeviceGetLimit(pValue, ctypes.c_int(0x05))
    assert pValue.contents.value == 128
    assert args.rank >= 0
    # create model
    bench = create_model(args.model,
                         input_size=args.input_size,
                         num_classes=args.num_classes,
                         bench_task='predict',
                         pretrained=args.pretrained,
                         redundant_bias=args.redundant_bias,
                         checkpoint_path=args.checkpoint,
                         checkpoint_ema=args.use_ema,
                         soft_nms=args.use_soft_nms,
                         strict_load=False)
    input_size = bench.config.image_size
    data_config = bench.config
    param_count = sum([m.numel() for m in bench.parameters()])
    print('Model %s created, param count: %d' % (args.model, param_count))
    bench = bench.cuda().to(memory_format=memory_format)
    # NOTE(review): args.distributed is a bool, so `> 1` is always False and
    # this guard — and the DDP wrap below the raise — never executes.
    # Confirm whether this should test world_size / num_gpu instead.
    if args.distributed > 1:
        raise ValueError(
            "Evaluation is supported only on single GPU. args.num_gpu must be 1"
        )
        bench = DDP(
            bench, device_ids=[args.device]
        )  # torch.nn.DataParallel(bench, device_ids=list(range(args.num_gpu)))
    # Resolve the annotation file and image directory.
    if args.waymo:
        annotation_path = args.waymo_val_annotation
        image_dir = args.waymo_val
    else:
        if 'test' in args.anno:
            annotation_path = os.path.join(args.data, 'annotations',
                                           f'image_info_{args.anno}.json')
            image_dir = 'test2017'
        else:
            annotation_path = os.path.join(args.data, 'annotations',
                                           f'instances_{args.anno}.json')
            image_dir = args.anno
    dataset = CocoDetection(os.path.join(args.data, image_dir),
                            annotation_path, data_config)
    evaluator = COCOEvaluator(dataset.coco, distributed=args.distributed,
                              waymo=args.waymo)
    loader = create_loader(dataset,
                           input_size=input_size,
                           batch_size=args.batch_size,
                           use_prefetcher=args.prefetcher,
                           interpolation=args.interpolation,
                           fill_color=args.fill_color,
                           num_workers=args.workers,
                           distributed=args.distributed,
                           pin_mem=args.pin_mem,
                           memory_format=memory_format)
    img_ids = []
    results = []
    dllogger_metric = {}
    bench.eval()
    batch_time = AverageMeter()
    throughput = AverageMeter()
    end = time.time()
    total_time_start = time.time()
    with torch.no_grad():
        for i, (input, target) in enumerate(loader):
            with torch.cuda.amp.autocast(enabled=args.amp):
                output = bench(input, target['img_scale'], target['img_size'])
            batch_time.update(time.time() - end)
            throughput.update(input.size(0) / batch_time.val)
            evaluator.add_predictions(output, target)
            torch.cuda.synchronize()
            # measure elapsed time
            # Drop timings from the first 10 batches (warm-up).
            if i == 9:
                batch_time.reset()
                throughput.reset()
            if args.rank == 0 and i % args.log_freq == 0:
                print(
                    'Test: [{0:>4d}/{1}] '
                    'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) '
                    .format(
                        i,
                        len(loader),
                        batch_time=batch_time,
                        rate_avg=input.size(0) / batch_time.avg,
                    ))
            end = time.time()
    dllogger_metric['total_inference_time'] = time.time() - total_time_start
    dllogger_metric['inference_throughput'] = throughput.avg
    dllogger_metric['inference_time'] = 1000 / throughput.avg
    total_time_start = time.time()
    mean_ap = 0.
    if not args.inference:
        # Accuracy evaluation is only possible when ground truth exists.
        if 'test' not in args.anno:
            mean_ap = evaluator.evaluate()
        else:
            evaluator.save_predictions(args.results)
        dllogger_metric['map'] = mean_ap
        dllogger_metric['total_eval_time'] = time.time() - total_time_start
    else:
        evaluator.save_predictions(args.results)
    if not args.distributed or args.rank == 0:
        dllogger.log(step=(), data=dllogger_metric, verbosity=0)
    return results
def train(self):
    """Train the multi-head fashion-attribute classifier.

    Jointly optimizes the color/style/season/category heads (style loss
    weighted x2), logs losses and running accuracies, saves a state-dict
    checkpoint plus a traced TorchScript module every epoch, and plots
    per-epoch losses to visdom.
    """
    criterion = nn.CrossEntropyLoss().to(self.device)
    optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate)
    total_step = len(self.train_loader)
    scheduler = StepLR(optimizer, self.decay_epoch, gamma=0.5)
    # Per-attribute loss meters.
    color_avg = AverageMeter('color')
    season_avg = AverageMeter('season')
    # part_avg = AverageMeter('part')
    style_avg = AverageMeter('style')
    category_avg = AverageMeter('category')
    # One visdom plot per attribute.
    color_plot = create_vis_plot('Epoch', 'Loss', 'Color')
    season_plot = create_vis_plot('Epoch', 'Loss', 'Season')
    # part_plot = create_vis_plot('Epoch', 'Loss', 'Part')
    style_plot = create_vis_plot('Epoch', 'Loss', 'Style')
    category_plot = create_vis_plot('Epoch', 'Loss', 'Category')
    for epoch in range(self.epoch, self.num_epoch):
        color_avg.reset()
        season_avg.reset()
        # part_avg.reset()
        style_avg.reset()
        category_avg.reset()
        # Running correct-prediction counts for epoch-level accuracy.
        correct_color = 0
        correct_style = 0
        # correct_part = 0
        correct_season = 0
        correct_category = 0
        for step, (images, color, style, season, category) in enumerate(self.train_loader):
            images = images.to(self.device)
            color = color.to(self.device)
            style = style.to(self.device)
            # part = part.to(self.device)
            season = season.to(self.device)
            category = category.to(self.device)
            color_prediction, style_prediction, season_prediction, category_prediction = self.net(
                images)
            color_loss = criterion(color_prediction, color)
            style_loss = criterion(style_prediction, style)
            # part_loss = criterion(part_prediction, part)
            season_loss = criterion(season_prediction, season)
            category_loss = criterion(category_prediction, category)
            # Joint objective; style is weighted double.
            loss = color_loss + 2 * style_loss + season_loss + category_loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            color_avg.update(color_loss.item())
            style_avg.update(style_loss.item())
            # part_avg.update(part_loss.item())
            season_avg.update(season_loss.item())
            category_avg.update(category_loss.item())
            correct_color += color_prediction.argmax(
                dim=1).eq(color).sum().item()
            correct_style += style_prediction.argmax(
                dim=1).eq(style).sum().item()
            # correct_part += part_prediction.argmax(dim=1).eq(part).sum().item()
            correct_season += season_prediction.argmax(
                dim=1).eq(season).sum().item()
            correct_category += category_prediction.argmax(
                dim=1).eq(category).sum().item()
            if step % 10 == 1:
                print(
                    f'Epoch [{epoch}/{self.num_epoch}], Step: [{step}/{total_step}], Color Loss: {color_avg.avg:.4f},'
                    f'Season Loss: {season_avg.avg:.4f}, Style Loss: {style_avg.avg:.4f}, '
                    f'Category Loss: {category_avg.avg:.4f}')
                # NOTE(review): denominators assume every batch is full
                # (self.batch_size); the last batch may be smaller.
                print(
                    f'Color: {correct_color/((step+1)*self.batch_size)*100:.4f}%, Style: {correct_style/((step+1)*self.batch_size)*100:.4f}%, '
                    f'Season:{correct_season/((step+1)*self.batch_size)*100:.4f}%, Category: {correct_category/((step+1)*self.batch_size)*100:.4f}%'
                )
        torch.save(
            self.net.state_dict(),
            f'{self.checkpoint_dir}/{self.backbone}_checkpoint-{epoch}.pth'
        )
        # Trace the model for C++ (libtorch) deployment.
        input_tensor = torch.rand(64, 3, 224, 224).to(self.device)
        #base_network = mobilenet_v2(pretrained=True)
        script_module = torch.jit.trace(self.net, input_tensor)
        # Saving deep-learning models: Keras stores architecture+parameters;
        # Python PyTorch stores parameters (weights) only; a TorchScript .pt
        # module stores parameters + architecture for C++ libtorch.
        script_module.save("jyson_classification0116.pt")
        scheduler.step()
        update_vis_plot(epoch, color_avg.avg, color_plot, 'append')
        update_vis_plot(epoch, season_avg.avg, season_plot, 'append')
        # update_vis_plot(epoch, part_avg.avg, part_plot, 'append')
        update_vis_plot(epoch, style_avg.avg, style_plot, 'append')
        update_vis_plot(epoch, category_avg.avg, category_plot, 'append')
def train_val(model, args):
    """Train a 6-stage CPM-style pose model on LSPET, optionally validating
    on LSP every ``config.test_interval`` iterations.

    Fixes vs. the original:
      * ``tensor.cuda(async=True)`` -> ``non_blocking=True`` (``async`` is a
        reserved keyword since Python 3.7, so the old form is a SyntaxError).
      * ``loss.data[0]`` -> ``loss.item()`` (indexing a 0-dim tensor fails
        on PyTorch >= 0.5).
      * validation now runs under ``torch.no_grad()`` so no autograd graph
        is built while evaluating.
    """
    train_dir = args.train_dir
    val_dir = args.val_dir
    config = Config(args.config)
    cudnn.benchmark = True
    # lspet dataset contains 10000 images, lsp dataset contains 2000 images.
    # train
    train_loader = torch.utils.data.DataLoader(lsp_lspet_data.LSP_Data(
        'lspet', train_dir, 8,
        Mytransforms.Compose([
            Mytransforms.RandomResized(),
            Mytransforms.RandomRotate(40),
            Mytransforms.RandomCrop(368),
            Mytransforms.RandomHorizontalFlip(),
        ])),
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=config.workers,
        pin_memory=True)
    # val
    if args.val_dir is not None and config.test_interval != 0:
        val_loader = torch.utils.data.DataLoader(lsp_lspet_data.LSP_Data(
            'lsp', val_dir, 8,
            Mytransforms.Compose([
                Mytransforms.TestResized(368),
            ])),
            batch_size=config.batch_size,
            shuffle=True,
            num_workers=config.workers,
            pin_memory=True)
    criterion = nn.MSELoss().cuda()
    params, multiple = get_parameters(model, config, False)
    optimizer = torch.optim.SGD(params, config.base_lr,
                                momentum=config.momentum,
                                weight_decay=config.weight_decay)
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    losses_list = [AverageMeter() for i in range(6)]
    end = time.time()
    iters = config.start_iters
    best_model = config.best_model
    # Scale so the per-element MSE is comparable to a sum over the
    # 46x46x15 heat-map volume.
    heat_weight = 46 * 46 * 15 / 1.0

    while iters < config.max_iter:
        for i, (input, heatmap, centermap, img_path) in enumerate(train_loader):
            learning_rate = adjust_learning_rate(
                optimizer, iters, config.base_lr,
                policy=config.lr_policy,
                policy_parameter=config.policy_parameter,
                multiple=multiple)
            data_time.update(time.time() - end)
            # FIX: non_blocking replaces the py2-era `async` keyword arg.
            heatmap = heatmap.cuda(non_blocking=True)
            centermap = centermap.cuda(non_blocking=True)
            input_var = torch.autograd.Variable(input)
            heatmap_var = torch.autograd.Variable(heatmap)
            centermap_var = torch.autograd.Variable(centermap)
            heat1, heat2, heat3, heat4, heat5, heat6 = model(
                input_var, centermap_var)
            # One weighted MSE term per CPM stage.
            loss1 = criterion(heat1, heatmap_var) * heat_weight
            loss2 = criterion(heat2, heatmap_var) * heat_weight
            loss3 = criterion(heat3, heatmap_var) * heat_weight
            loss4 = criterion(heat4, heatmap_var) * heat_weight
            loss5 = criterion(heat5, heatmap_var) * heat_weight
            loss6 = criterion(heat6, heatmap_var) * heat_weight
            loss = loss1 + loss2 + loss3 + loss4 + loss5 + loss6
            # FIX: .item() replaces the removed 0-dim indexing .data[0].
            losses.update(loss.item(), input.size(0))
            for cnt, l in enumerate([loss1, loss2, loss3, loss4, loss5, loss6]):
                losses_list[cnt].update(l.item(), input.size(0))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            batch_time.update(time.time() - end)
            end = time.time()
            iters += 1
            if iters % config.display == 0:
                print(
                    'Train Iteration: {0}\t'
                    'Time {batch_time.sum:.3f}s / {1}iters, ({batch_time.avg:.3f})\t'
                    'Data load {data_time.sum:.3f}s / {1}iters, ({data_time.avg:3f})\n'
                    'Learning rate = {2}\n'
                    'Loss = {loss.val:.8f} (ave = {loss.avg:.8f})\n'.format(
                        iters, config.display, learning_rate,
                        batch_time=batch_time, data_time=data_time,
                        loss=losses))
                for cnt in range(0, 6):
                    print(
                        'Loss{0} = {loss1.val:.8f} (ave = {loss1.avg:.8f})\t'.
                        format(cnt + 1, loss1=losses_list[cnt]))
                print(
                    time.strftime(
                        '%Y-%m-%d %H:%M:%S -----------------------------------------------------------------------------------------------------------------\n',
                        time.localtime()))
                # image write: render predicted keypoints for this batch
                for cnt in range(config.batch_size):
                    kpts = get_kpts(heat6[cnt], img_h=368.0, img_w=368.0)
                    draw_paint(img_path[cnt], kpts, i, cnt)
                batch_time.reset()
                data_time.reset()
                losses.reset()
                for cnt in range(6):
                    losses_list[cnt].reset()
                save_checkpoint({
                    'iter': iters,
                    'state_dict': model.state_dict(),
                }, 0, args.model_name)

            # val
            if args.val_dir is not None and config.test_interval != 0 and iters % config.test_interval == 0:
                model.eval()
                with torch.no_grad():  # FIX: no autograd graph during eval
                    for j, (input, heatmap, centermap) in enumerate(val_loader):
                        heatmap = heatmap.cuda(non_blocking=True)
                        centermap = centermap.cuda(non_blocking=True)
                        input_var = torch.autograd.Variable(input)
                        heatmap_var = torch.autograd.Variable(heatmap)
                        centermap_var = torch.autograd.Variable(centermap)
                        heat1, heat2, heat3, heat4, heat5, heat6 = model(
                            input_var, centermap_var)
                        loss1 = criterion(heat1, heatmap_var) * heat_weight
                        loss2 = criterion(heat2, heatmap_var) * heat_weight
                        loss3 = criterion(heat3, heatmap_var) * heat_weight
                        loss4 = criterion(heat4, heatmap_var) * heat_weight
                        loss5 = criterion(heat5, heatmap_var) * heat_weight
                        loss6 = criterion(heat6, heatmap_var) * heat_weight
                        loss = loss1 + loss2 + loss3 + loss4 + loss5 + loss6
                        losses.update(loss.item(), input.size(0))
                        for cnt, l in enumerate(
                                [loss1, loss2, loss3, loss4, loss5, loss6]):
                            losses_list[cnt].update(l.item(), input.size(0))
                        batch_time.update(time.time() - end)
                        end = time.time()
                        # Checkpoint whenever the running average improves.
                        is_best = losses.avg < best_model
                        best_model = min(best_model, losses.avg)
                        save_checkpoint(
                            {
                                'iter': iters,
                                'state_dict': model.state_dict(),
                            }, is_best, args.model_name)
                        if j % config.display == 0:
                            print(
                                'Test Iteration: {0}\t'
                                'Time {batch_time.sum:.3f}s / {1}iters, ({batch_time.avg:.3f})\t'
                                'Data load {data_time.sum:.3f}s / {1}iters, ({data_time.avg:3f})\n'
                                'Loss = {loss.val:.8f} (ave = {loss.avg:.8f})\n'.
                                format(j, config.display,
                                       batch_time=batch_time,
                                       data_time=data_time, loss=losses))
                            for cnt in range(0, 6):
                                print(
                                    'Loss{0} = {loss1.val:.8f} (ave = {loss1.avg:.8f})\t'
                                    .format(cnt + 1, loss1=losses_list[cnt]))
                            print(
                                time.strftime(
                                    '%Y-%m-%d %H:%M:%S -----------------------------------------------------------------------------------------------------------------\n',
                                    time.localtime()))
                            batch_time.reset()
                            losses.reset()
                            for cnt in range(6):
                                losses_list[cnt].reset()
                model.train()
def train(self, train_loader, eval_loader):
    """Run one training epoch, logging averaged losses and periodically
    evaluating + checkpointing.

    Fix vs. original: when the performance index is ``'triplet_loss'``
    (lower is better), a WORSE (higher) value could still satisfy the old
    ``elif performance > best_performance`` branch and be saved as "best".
    Improvement is now computed with the correct direction per metric.
    """
    losses_1 = AverageMeter()
    losses_2 = AverageMeter()
    data_time = AverageMeter()
    batch_time = AverageMeter()
    end_time = time.time()
    if self.iter > self.max_iter:
        logging.info("Optimization is done !")
        sys.exit(0)
    for data in train_loader:
        self.model.train()
        # forward
        data_time.update(time.time() - end_time)
        data = self._read_inputs(data)
        # get loss
        loss, train_prec = self._forward(data)
        if isinstance(loss, tuple):
            losses_1.update(loss[0].item())
            losses_2.update(loss[1].item())
            total_loss = sum(loss)
        else:
            losses_1.update(loss.item())
            total_loss = loss
        # optimization
        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()
        # time for training (forward & loss computation & optimization) on one batch
        batch_time.update(time.time() - end_time)
        # log avg loss
        if self.iter > 0 and self.iter % self.cfg.TRAIN.PRINT_FREQ == 0:
            if isinstance(loss, tuple):
                self.writer.add_scalar('loss/cls', losses_1.avg, self.iter)
                self.writer.add_scalar('loss/box', losses_2.avg, self.iter)
                loss_msg = f'avg_cls_loss:{losses_1.avg:.04f} avg_box_loss:{losses_2.avg:.04f}'
            else:
                if self.replace_model_name:
                    self.writer.add_scalar(f'{self.model_name}_loss',
                                           losses_1.avg, self.iter)
                    loss_msg = f'avg_{self.model_name}_loss:{losses_1.avg:.04f}'
                else:
                    self.writer.add_scalar('loss', losses_1.avg, self.iter)
                    loss_msg = f'avg_loss:{losses_1.avg:.04f}'
            logging.info(
                f'epoch:{self.epoch:03d} '
                f'{loss_msg:s} '
                f'io_rate:{data_time.avg / batch_time.avg:.04f} '
                f'samples/(gpu*s):{self.cfg.DATASET.IMG_NUM_PER_GPU / batch_time.avg:.02f}'
            )
            self.writer.add_scalar(
                'speed/samples_per_second_per_gpu',
                self.cfg.DATASET.IMG_NUM_PER_GPU / batch_time.avg, self.iter)
            self.writer.add_scalar('speed/io_rate',
                                   data_time.avg / batch_time.avg, self.iter)
            if train_prec is not None:
                logging.info(f'train precision: {train_prec}')
            # Meters report averages per logging window.
            losses_1.reset()
            losses_2.reset()
        # save checkpoint
        if self.iter > 0 and self.iter % self.cfg.TRAIN.SAVE_INTERVAL == 0:
            # evaluation
            if self.cfg.TRAIN.VAL_WHEN_TRAIN:
                self.model.eval()
                performance = self.evaluate(eval_loader)
                self.writer.add_scalar(self.PI, performance, self.iter)
                # FIX: lower is better for a loss metric, higher otherwise.
                if self.PI == 'triplet_loss':
                    improved = performance < self.best_performance
                else:
                    improved = performance > self.best_performance
                self.is_best = improved
                if improved:
                    self.best_performance = performance
                logging.info(
                    f'Now: best {self.PI} is {self.best_performance}')
                # NOTE(review): the model is left in eval mode here until the
                # next loop iteration calls self.model.train() — confirm.
            else:
                performance = -1
            # save checkpoint
            try:
                state_dict = self.model.module.state_dict(
                )  # remove prefix of multi GPUs
            except AttributeError:
                state_dict = self.model.state_dict()
            if self.rank == 0:
                if self.cfg.TRAIN.SAVE_EVERY_CHECKPOINT:
                    filename = f"{self.model_name}_epoch{self.epoch:03d}_iter{self.iter:06d}_checkpoint.pth"
                else:
                    filename = "checkpoint.pth"
                save_checkpoint(
                    {
                        'iter': self.iter,
                        'model': self.model_name,
                        f'performance/{self.PI}': performance,
                        'state_dict': state_dict,
                        'optimizer': self.optimizer.state_dict(),
                    },
                    self.is_best,
                    self.log_dir,
                    filename=filename)
        self.iter += 1
        end_time = time.time()
    self.epoch += 1
def train(self):
    """Train the classifier, logging loss/accuracy/learning-rate curves to
    visdom and checkpointing every epoch.

    Fix vs. original: the accuracy meter was never reset, so the reported
    'Accuracy' mixed statistics from ALL previous epochs; it is now reset
    alongside the loss meter at the start of each epoch.
    """
    criterion = nn.CrossEntropyLoss().to(self.device)
    optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate)
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        LambdaLR(self.num_epoch, self.epoch, self.decay_epoch).step)
    total_step = len(self.train_loader)
    losses = AverageMeter()
    accuracy = AverageMeter()
    # Resume previously plotted history, if any.
    accuracy_set, loss_set, lr_set, epoch_set = self.read_loss_info()
    loss_window = self.visdom.line(Y=[1])
    lr_window = self.visdom.line(Y=[1])
    accuracy_window = self.visdom.line(Y=[1])
    for epoch in range(self.epoch, self.num_epoch):
        losses.reset()
        accuracy.reset()  # FIX: was missing — accuracy leaked across epochs
        for step, (images, labels) in enumerate(self.train_loader):
            images = images.to(self.device)
            labels = labels.to(self.device)
            outputs = self.net(images)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            _, predicted = torch.max(outputs.data, 1)
            correct = (predicted == labels).sum().item()
            # NOTE(review): uses self.batch_size even for a short last batch.
            losses.update(loss.item(), self.batch_size)
            accuracy.update(correct / self.batch_size, self.batch_size)
            if step % 10 == 0:
                print(
                    f'Epoch [{epoch}/{self.num_epoch}], Step [{step}/{total_step}], Loss: {losses.avg:.4f}, '
                    f'Accuracy: {accuracy.avg:.4f}')
        # Append this epoch's stats and redraw all visdom curves.
        accuracy_set += [accuracy.avg]
        loss_set += [losses.avg]
        lr_set += [optimizer.param_groups[0]['lr']]
        epoch_set += [epoch]
        loss_window = self.visdom.line(Y=loss_set, X=epoch_set,
                                       win=loss_window, update='replace')
        lr_window = self.visdom.line(Y=lr_set, X=epoch_set,
                                     win=lr_window, update='replace')
        accuracy_window = self.visdom.line(Y=accuracy_set, X=epoch_set,
                                           win=accuracy_window,
                                           update='replace')
        self.save_loss_info(accuracy_set, loss_set, lr_set, epoch_set)
        torch.save(self.net.state_dict(),
                   '%s/vgg16-%d.pth' % (self.checkpoint_dir, epoch))
        lr_scheduler.step()
def train(self):
    """Train the compressive autoencoder and its adversarial discriminator.

    Each step: encode the batch, entropy-code the latent with paq (for
    size measurement only — see NOTE below), decode, optimise the
    Encoder/Decoder on a weighted sum of L1 content, perceptual and
    adversarial losses, then optimise the discriminator on real/fake BCE
    plus a penalty term. Losses are plotted to visdom and all three
    networks are checkpointed once per epoch.
    """
    # One optimizer over both halves of the autoencoder; a separate one
    # for the discriminator ("Disciminator" spelling kept from the model).
    optimizer_ae = Adam(chain(self.Encoder.parameters(),
                              self.Decoder.parameters()),
                        self.lr,
                        betas=(self.b1, self.b2),
                        weight_decay=self.weight_decay)
    optimizer_discriminator = Adam(self.Disciminator.parameters(),
                                   self.lr,
                                   betas=(self.b1, self.b2),
                                   weight_decay=self.weight_decay)
    # LR decay schedule (kicks in at self.decay_epoch) is applied to the
    # AE optimizer only — the discriminator LR stays fixed.
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer_ae,
        LambdaLR(self.num_epoch, self.epoch, self.decay_epoch).step)
    total_step = len(self.data_loader)

    perceptual_criterion = PerceptualLoss().to(self.device)
    content_criterion = nn.L1Loss().to(self.device)
    adversarial_criterion = nn.BCELoss().to(self.device)

    self.Encoder.train()
    self.Decoder.train()

    # Per-epoch running averages, one meter per loss component.
    content_losses = AverageMeter()
    generator_losses = AverageMeter()
    perceptual_losses = AverageMeter()
    discriminator_losses = AverageMeter()
    ae_losses = AverageMeter()

    # visdom windows: one curve per loss component plus the learning rate.
    lr_window = create_vis_plot('Epoch', 'Learning rate', 'Learning rate')
    loss_window = create_vis_plot('Epoch', 'Loss', 'Total Loss')
    generator_loss_window = create_vis_plot('Epoch', 'Loss', 'Generator Loss')
    discriminator_loss_window = create_vis_plot('Epoch', 'Loss',
                                                'Discriminator Loss')
    content_loss_window = create_vis_plot('Epoch', 'Loss', 'Content Loss')
    perceptual_loss_window = create_vis_plot('Epoch', 'Loss',
                                             'Perceptual Loss')

    if not os.path.exists(self.sample_dir):
        os.makedirs(self.sample_dir)
    if not os.path.exists(self.checkpoint_dir):
        os.makedirs(self.checkpoint_dir)

    for epoch in range(self.epoch, self.num_epoch):
        content_losses.reset()
        perceptual_losses.reset()
        generator_losses.reset()
        ae_losses.reset()
        discriminator_losses.reset()
        for step, images in enumerate(self.data_loader):
            images = images.to(self.device)

            # BCE targets: 1 for real images, 0 for reconstructions.
            real_labels = torch.ones((images.size(0), 1)).to(self.device)
            fake_labels = torch.zeros((images.size(0), 1)).to(self.device)

            encoded_image = self.Encoder(images)
            # Entropy-code the latent with paq.
            # NOTE(review): the compressed bytes are never used and the
            # decompress/round-trip path below is commented out, so the
            # Decoder currently sees the raw (uncompressed) latent.
            binary_decoded_image = paq.compress(
                encoded_image.cpu().detach().numpy().tobytes())
            # encoded_image = paq.decompress(binary_decoded_image)
            #
            # encoded_image = torch.from_numpy(np.frombuffer(encoded_image, dtype=np.float32)
            #                                  .reshape(-1, self.storing_channels, self.image_size // 8,
            #                                           self.image_size // 8)).to(self.device)

            decoded_image = self.Decoder(encoded_image)

            content_loss = content_criterion(images, decoded_image)
            perceptual_loss = perceptual_criterion(images, decoded_image)
            # Generator term: make reconstructions look "real" to the critic.
            generator_loss = adversarial_criterion(
                self.Disciminator(decoded_image), real_labels)
            # generator_loss = -self.Disciminator(decoded_image).mean()

            ae_loss = content_loss * self.content_loss_factor + perceptual_loss * self.perceptual_loss_factor + \
                generator_loss * self.generator_loss_factor

            content_losses.update(content_loss.item())
            perceptual_losses.update(perceptual_loss.item())
            generator_losses.update(generator_loss.item())
            ae_losses.update(ae_loss.item())

            optimizer_ae.zero_grad()
            # retain_graph so the discriminator pass below can reuse the
            # decoded_image graph built during this forward pass.
            ae_loss.backward(retain_graph=True)
            optimizer_ae.step()

            # Discriminator update: real/fake BCE plus a penalty evaluated
            # on an interpolation between real and reconstructed images.
            # NOTE(review): "gravity_penalty" is just the mean critic
            # output on the interpolate, not a WGAN-GP gradient penalty —
            # confirm this is intended. Also decoded_image is not
            # detached, so discriminator gradients flow into the AE graph.
            interpolated_image = self.eta * images + (
                1 - self.eta) * decoded_image
            gravity_penalty = self.Disciminator(interpolated_image).mean()
            real_loss = adversarial_criterion(self.Disciminator(images),
                                              real_labels)
            fake_loss = adversarial_criterion(
                self.Disciminator(decoded_image), fake_labels)
            discriminator_loss = (real_loss + fake_loss) * self.discriminator_loss_factor / 2 +\
                gravity_penalty * self.penalty_loss_factor
            # discriminator_loss = self.Disciminator(decoded_image).mean() - self.Disciminator(images).mean() + \
            #     gravity_penalty * self.penalty_loss_factor

            optimizer_discriminator.zero_grad()
            discriminator_loss.backward(retain_graph=True)
            optimizer_discriminator.step()

            discriminator_losses.update(discriminator_loss.item())

            if step % 100 == 0:
                print(
                    f"[Epoch {epoch}/{self.num_epoch}] [Batch {step}/{total_step}] [Learning rate {get_lr(optimizer_ae)}] "
                    f"[Content {content_loss:.4f}] [Perceptual {perceptual_loss:.4f}] [Gan {generator_loss:.4f}]"
                    f"[Discriminator {discriminator_loss:.4f}]")
                # Save originals stacked above reconstructions.
                save_image(
                    torch.cat([images, decoded_image], dim=2),
                    os.path.join(self.sample_dir,
                                 f"Sample-epoch-{epoch}-step-{step}.png"))

        # Per-epoch visdom updates and checkpoints.
        update_vis_plot(epoch, ae_losses.avg, loss_window, 'append')
        update_vis_plot(epoch, generator_losses.avg, generator_loss_window,
                        'append')
        update_vis_plot(epoch, discriminator_losses.avg,
                        discriminator_loss_window, 'append')
        update_vis_plot(epoch, content_losses.avg, content_loss_window,
                        'append')
        update_vis_plot(epoch, perceptual_losses.avg, perceptual_loss_window,
                        'append')
        update_vis_plot(epoch, get_lr(optimizer_ae), lr_window, 'append')

        lr_scheduler.step()

        torch.save(self.Encoder.state_dict(),
                   os.path.join(self.checkpoint_dir, f"Encoder-{epoch}.pth"))
        torch.save(self.Decoder.state_dict(),
                   os.path.join(self.checkpoint_dir, f"Decoder-{epoch}.pth"))
        torch.save(self.Disciminator.state_dict(),
                   os.path.join(self.checkpoint_dir,
                                f"Discriminator-{epoch}.pth"))
def val(model, args, val_loader, criterion, config):
    """Run one validation pass over the detector+flownet model.

    Logs the total and per-stage weighted MSE heatmap losses every
    ``config.display`` batches.

    Returns:
        (AverageMeter, AverageMeter): the combined loss meter, twice.
        NOTE(review): detector vs. flownet losses are not computed
        separately here; callers unpack two meters (val_loss_d,
        val_loss_f), so the combined meter is returned for both — split
        them once per-network losses exist.
    """
    global e
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    losses_list = [AverageMeter() for i in range(6)]
    end = time.time()
    iters = config.start_iters
    best_model = config.best_model
    # 46x46 heatmap with 15 joints: scale the mean MSE up to a per-map sum.
    heat_weight = 46 * 46 * 15 / 1.0

    model.detector.eval()
    model.flownet.eval()

    for j, (input, heatmap, centermap) in enumerate(val_loader):
        # BUGFIX: `.cuda(async=True)` is a SyntaxError on Python >= 3.7;
        # the argument was renamed to `non_blocking` in PyTorch 0.4.
        heatmap = heatmap.cuda(non_blocking=True)
        centermap = centermap.cuda(non_blocking=True)
        input_var = torch.autograd.Variable(input)
        heatmap_var = torch.autograd.Variable(heatmap)
        centermap_var = torch.autograd.Variable(centermap)

        output = model(input_var, centermap_var)
        loss_ = [criterion(ht, heatmap_var) * heat_weight for ht in output]
        # BUGFIX: `loss += l for l in loss_` was invalid syntax.
        loss = sum(loss_)

        # BUGFIX: `.data[0]` fails on 0-dim tensors in modern PyTorch.
        losses.update(loss.item(), input.size(0))
        # BUGFIX: the original iterated over undefined names loss1..loss6;
        # use the per-stage losses computed above.
        for cnt, l in enumerate(loss_[:6]):
            losses_list[cnt].update(l.item(), input.size(0))

        if j % config.display == 0:
            print('Valepoch: {2}/{3}\t'
                  'Test Iteration: {0}\t'
                  'Time {batch_time.sum:.3f}s / {1}iters, ({batch_time.avg:.3f})\t'
                  'Data load {data_time.sum:.3f}s / {1}iters, ({data_time.avg:3f})\n'
                  'Loss = {loss.val:.8f} (ave = {loss.avg:.8f})\n'.format(
                      j, config.display, e, args.n_epochs,
                      batch_time=batch_time, data_time=data_time,
                      loss=losses))
            for cnt in range(0, 6):
                print('Loss{0} = {loss1.val:.8f} (ave = {loss1.avg:.8f})\t'
                      .format(cnt + 1, loss1=losses_list[cnt]))
            print(time.strftime(
                '%Y-%m-%d %H:%M:%S -----------------------------------------------------------------------------------------------------------------\n',
                time.localtime()))
            batch_time.reset()
            losses.reset()
            for cnt in range(6):
                losses_list[cnt].reset()
    return losses, losses


def main(args):
    """Build the model and the train/val data loaders from CLI args."""
    train_dir = args.train_dir
    val_dir = args.val_dir
    config = Config(args.config)
    cudnn.benchmark = True
    # train set: LSPET with random augmentation
    train_loader = torch.utils.data.DataLoader(
        lsp_lspet_data.LSP_Data('lspet', train_dir, 8,
                                Mytransforms.Compose([
                                    Mytransforms.RandomResized(),
                                    Mytransforms.RandomRotate(40),
                                    Mytransforms.RandomCrop(368),
                                    Mytransforms.RandomHorizontalFlip(),
                                ])),
        batch_size=config.batch_size, shuffle=True,
        num_workers=config.workers, pin_memory=True)
    # BUGFIX: val_loader was unbound (NameError at return) whenever
    # validation was disabled; default it to None.
    val_loader = None
    if args.val_dir is not None and config.test_interval != 0:
        # val set: deterministic resize only
        val_loader = torch.utils.data.DataLoader(
            lsp_lspet_data.LSP_Data('lsp', val_dir, 8,
                                    Mytransforms.Compose([
                                        Mytransforms.TestResized(368),
                                    ])),
            batch_size=config.batch_size, shuffle=False,
            num_workers=config.workers, pin_memory=True)
    # build model
    model = MSBR(config=config, args=args, k=14, stages=config.stages)
    model.build_nets()
    return model, train_loader, val_loader


if __name__ == '__main__':
    # os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    args = parse()
    model, train_loader, val_loader = main(args)
    # BUGFIX: `config` and `criterion` were referenced below but never
    # defined at module scope; rebuild them the same way main()/train_val()
    # do. NOTE(review): confirm MSE matches the training criterion.
    config = Config(args.config)
    criterion = nn.MSELoss().cuda()
    # BUGFIX: `is not 'None'` identity-compares against a string literal
    # (SyntaxWarning, unreliable); use !=.
    if args.pretrained_d != 'None' and args.val_dir is not None and config.test_interval != 0:
        val_loss_d, val_loss_f = val(model, args, val_loader, criterion,
                                     config)
    for e in range(args.n_epochs):
        # (removed redundant `global e`: this loop already runs at module
        # scope, so `e` is visible to val() via its global statement)
        train_loss_d, train_loss_f = train(model, args, train_loader)
        if args.val_dir is not None and config.test_interval != 0:
            with torch.no_grad():
                # BUGFIX: this call was missing the `config` argument.
                val_loss_d, val_loss_f = val(model, args, val_loader,
                                             criterion, config)
            is_best_d = val_loss_d.avg < config.best_model_d
            is_best_f = val_loss_f.avg < config.best_model_f
            # BUGFIX: the original compared against the undefined `losses`;
            # track each net's own best validation loss.
            config.best_model_d = min(config.best_model_d, val_loss_d.avg)
            config.best_model_f = min(config.best_model_f, val_loss_f.avg)
            save_checkpoint({
                'epoch': e,
                'state_dict': model.detector.state_dict(),
            }, is_best_d, args.detector_name)
            save_checkpoint({
                'epoch': e,
                'state_dict': model.flownet.state_dict(),
            }, is_best_f, args.flownet_name)
def training(train_loader, epochs, n_subact=0, save=True, **kwargs):
    """Training pipeline for embedding.

    Args:
        train_loader: iterator over the dataset
        epochs: number of training epochs to perform
        n_subact: number of subactions in the current complex activity
        save: whether to checkpoint the model after training
        **kwargs: optionally 'model', 'loss' and 'optimizer' to resume
            training; 'name' is required when save=True (checkpoint
            subdirectory name).
    Returns:
        trained pytorch model
    """
    logger.debug('create model')
    torch.manual_seed(opt.seed)
    np.random.seed(opt.seed)
    try:
        # Resume with an externally supplied model/loss/optimizer triple.
        model = kwargs['model']
        loss = kwargs['loss']
        optimizer = kwargs['optimizer']
    except KeyError:
        # Fresh run: build everything from the global options.
        model = Embedding(embed_dim=opt.embed_dim,
                          feature_dim=opt.feature_dim,
                          n_subact=n_subact).cuda()
        loss = RankLoss(margin=0.2).cuda()
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=opt.lr,
                                    momentum=opt.momentum,
                                    weight_decay=opt.weight_decay)
    cudnn.benchmark = True

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    # NOTE(review): `vis` is never used below; kept in case construction
    # has side effects (e.g. opening a visualization session) — confirm.
    vis = Visual()
    _lr = opt.lr

    logger.debug('epochs: %s', epochs)
    for epoch in range(epochs):
        model.cuda()
        model.train()

        logger.debug('Epoch # %d' % epoch)
        if opt.lr_adj:
            # Step-decay the learning rate every 30 epochs.
            # if epoch in [int(epochs * 0.3), int(epochs * 0.7)]:
            # if epoch in [int(epochs * 0.5)]:
            if epoch % 30 == 0 and epoch > 0:
                _lr = adjust_lr(optimizer, _lr)
                logger.debug('lr: %f' % _lr)
        end = time.time()
        for i, (input, k, _) in enumerate(train_loader):
            data_time.update(time.time() - end)
            input = input.float().cuda(non_blocking=True)
            k = k.float().cuda()

            output = model(input)
            loss_values = loss(output, k)
            losses.update(loss_values.item(), input.size(0))

            optimizer.zero_grad()
            loss_values.backward()
            optimizer.step()

            batch_time.update(time.time() - end)
            end = time.time()

            if i % 100 == 0 and i:
                logger.debug(
                    'Epoch: [{0}][{1}/{2}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                        epoch, i, len(train_loader),
                        batch_time=batch_time,
                        data_time=data_time, loss=losses))
        logger.debug('loss: %f' % losses.avg)
        losses.reset()
    if save:
        save_dict = {
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        dir_check(join(opt.dataset_root, 'models'))
        dir_check(join(opt.dataset_root, 'models', kwargs['name']))
        torch.save(
            save_dict,
            join(opt.dataset_root, 'models', kwargs['name'],
                 '%s.pth.tar' % opt.log_str))
    return model
def train_val(model, args):
    """Train the six-stage pose model and periodically validate it.

    Builds LSPET/LSP data loaders, optimises the weighted per-stage
    heatmap MSE loss until ``config.max_iter`` iterations, logs and
    checkpoints every ``config.display`` iterations, runs validation
    every ``config.test_interval`` iterations (keeping the best
    checkpoint), and finally dumps the per-stage validation loss history
    to loss1..loss6 .npy files.
    """
    train_dir = args.train_dir
    val_dir = args.val_dir
    config = Config(args.config)
    cudnn.benchmark = True
    # train loader: LSPET with random augmentation
    train_loader = torch.utils.data.DataLoader(lsp_lspet_data.LSP_Data(
        'lspet', train_dir, 8,
        Mytransforms.Compose([
            Mytransforms.RandomResized(),
            Mytransforms.RandomRotate(40),
            Mytransforms.RandomCrop(368),
            Mytransforms.RandomHorizontalFlip(),
        ])),
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=config.workers,
        pin_memory=True)
    # val loader: deterministic resize only
    if args.val_dir is not None and config.test_interval != 0:
        val_loader = torch.utils.data.DataLoader(lsp_lspet_data.LSP_Data(
            'lsp', val_dir, 8,
            Mytransforms.Compose([
                Mytransforms.TestResized(368),
            ])),
            batch_size=config.batch_size,
            shuffle=True,
            num_workers=config.workers,
            pin_memory=True)

    if args.gpu[0] < 0:
        criterion = nn.MSELoss()
    else:
        criterion = nn.MSELoss().cuda()

    params, multiple = get_parameters(model, config, True)
    # params, multiple = get_parameters(model, config, False)

    optimizer = torch.optim.SGD(params,
                                config.base_lr,
                                momentum=config.momentum,
                                weight_decay=config.weight_decay)

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    losses_list = [AverageMeter() for i in range(6)]

    end = time.time()
    iters = config.start_iters
    best_model = config.best_model
    # 46x46 heatmap with 15 joints: scale the mean MSE up to a per-map sum.
    heat_weight = 46 * 46 * 15 / 1.0
    # Per-stage validation loss history, saved to lossN.npy at the end.
    losstrackers = [[] for _ in range(6)]
    stage_losses = []

    while iters < config.max_iter:
        for i, (input, heatmap, centermap) in enumerate(train_loader):
            learning_rate = adjust_learning_rate(
                optimizer, iters, config.base_lr,
                policy=config.lr_policy,
                policy_parameter=config.policy_parameter,
                multiple=multiple)
            data_time.update(time.time() - end)

            if args.gpu[0] >= 0:
                # BUGFIX: `.cuda(async=True)` is a SyntaxError on
                # Python >= 3.7; use non_blocking (PyTorch >= 0.4).
                heatmap = heatmap.cuda(non_blocking=True)
                centermap = centermap.cuda(non_blocking=True)

            input_var = torch.autograd.Variable(input)
            heatmap_var = torch.autograd.Variable(heatmap)
            centermap_var = torch.autograd.Variable(centermap)

            heat1, heat2, heat3, heat4, heat5, heat6 = model(
                input_var, centermap_var)

            # Weighted MSE per refinement stage; total is their sum.
            stage_losses = [
                criterion(h, heatmap_var) * heat_weight
                for h in (heat1, heat2, heat3, heat4, heat5, heat6)
            ]
            loss = sum(stage_losses)

            losses.update(loss.item(), input.size(0))
            for cnt, l in enumerate(stage_losses):
                losses_list[cnt].update(l.item(), input.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            batch_time.update(time.time() - end)
            end = time.time()

            iters += 1
            if iters % config.display == 0:
                print(
                    'Train Iteration: {0}\t'
                    'Time {batch_time.sum:.3f}s / {1}iters, ({batch_time.avg:.3f})\t'
                    'Data load {data_time.sum:.3f}s / {1}iters, ({data_time.avg:3f})\n'
                    'Learning rate = {2}\n'
                    'Loss = {loss.val:.8f} (ave = {loss.avg:.8f})\n'.format(
                        iters, config.display, learning_rate,
                        batch_time=batch_time,
                        data_time=data_time, loss=losses))
                for cnt in range(0, 6):
                    print(
                        'Loss{0} = {loss1.val:.8f} (ave = {loss1.avg:.8f})\t'.
                        format(cnt + 1, loss1=losses_list[cnt]))
                print(
                    time.strftime(
                        '%Y-%m-%d %H:%M:%S -----------------------------------------------------------------------------------------------------------------\n',
                        time.localtime()))
                batch_time.reset()
                data_time.reset()
                losses.reset()
                for cnt in range(6):
                    losses_list[cnt].reset()

            save_checkpoint({
                'iter': iters,
                'state_dict': model.state_dict(),
            }, 0, args.model_name)

            # val
            if args.val_dir is not None and config.test_interval != 0 and iters % config.test_interval == 0:
                model.eval()
                for j, (input, heatmap, centermap) in enumerate(val_loader):
                    # BUGFIX: the original checked `args.cuda[0]` here; the
                    # rest of this function uses `args.gpu`.
                    if args.gpu[0] >= 0:
                        heatmap = heatmap.cuda(non_blocking=True)
                        centermap = centermap.cuda(non_blocking=True)

                    input_var = torch.autograd.Variable(input)
                    heatmap_var = torch.autograd.Variable(heatmap)
                    centermap_var = torch.autograd.Variable(centermap)

                    heat1, heat2, heat3, heat4, heat5, heat6 = model(
                        input_var, centermap_var)

                    stage_losses = [
                        criterion(h, heatmap_var) * heat_weight
                        for h in (heat1, heat2, heat3, heat4, heat5, heat6)
                    ]
                    loss = sum(stage_losses)

                    # BUGFIX: `.data[0]` fails on 0-dim tensors in modern
                    # PyTorch; use `.item()`.
                    losses.update(loss.item(), input.size(0))
                    for cnt, l in enumerate(stage_losses):
                        losses_list[cnt].update(l.item(), input.size(0))
                    batch_time.update(time.time() - end)
                    end = time.time()

                    is_best = losses.avg < best_model
                    best_model = min(best_model, losses.avg)
                    save_checkpoint(
                        {
                            'iter': iters,
                            'state_dict': model.state_dict(),
                        }, is_best, args.model_name)

                    if j % config.display == 0:
                        print(
                            'Test Iteration: {0}\t'
                            'Time {batch_time.sum:.3f}s / {1}iters, ({batch_time.avg:.3f})\t'
                            'Data load {data_time.sum:.3f}s / {1}iters, ({data_time.avg:3f})\n'
                            'Loss = {loss.val:.8f} (ave = {loss.avg:.8f})\n'.
                            format(j, config.display,
                                   batch_time=batch_time,
                                   data_time=data_time,
                                   loss=losses))
                        for cnt in range(0, 6):
                            print(
                                'Loss{0} = {loss1.val:.8f} (ave = {loss1.avg:.8f})\t'
                                .format(cnt + 1, loss1=losses_list[cnt]))
                        print(
                            time.strftime(
                                '%Y-%m-%d %H:%M:%S -----------------------------------------------------------------------------------------------------------------\n',
                                time.localtime()))
                        batch_time.reset()
                        losses.reset()
                        for cnt in range(6):
                            losses_list[cnt].reset()
                # BUGFIX: the original appended CUDA loss tensors and then
                # np.save()d them, which fails; store plain floats of the
                # last validation batch's per-stage losses instead.
                for tracker, l in zip(losstrackers, stage_losses):
                    tracker.append(l.item())
                model.train()
    for idx, tracker in enumerate(losstrackers, start=1):
        np.save('loss%d' % idx, np.asarray(tracker))
def train(self):
    """Main training loop for the joint audio+text model.

    Resumes from a checkpoint if available, trains with per-step logging
    and periodic in-epoch validation, keeps the best checkpoint according
    to ``args.model_save_criteria``, early-stops after
    ``self.max_patience`` non-improving epochs, and unfreezes one model
    layer at the end of every epoch.
    """
    # Setting the variables before starting the training
    print('Loading checkpoint if checkpoint_dir is given...')
    self.load_checkpoint()
    avg_train_loss = AverageMeter()
    avg_train_acc = AverageMeter()
    text_avg_train_acc = AverageMeter()
    # NOTE(review): this meter is never updated or reset below — confirm
    # whether combined train accuracy should be tracked.
    combined_avg_train_acc = AverageMeter()
    best_val_acc = -np.inf
    patience_counter = 0
    best_epoch = self.num_epochs
    for epoch in range(self.num_epochs):
        self.model.print_frozen()
        avg_train_loss.reset()
        avg_train_acc.reset()
        text_avg_train_acc.reset()

        # Mini batch loop
        for batch_idx, batch in enumerate(tqdm(self.train_loader)):
            step = epoch * len(self.train_loader) + batch_idx

            # Get the model output for the batch and update the loss and
            # accuracy meters
            train_loss, train_acc, text_train_acc = self.train_step(batch)
            if self.args.scheduler == 'cycle':
                self.scheduler.step()
            avg_train_loss.update([train_loss.item()])
            avg_train_acc.update([train_acc])
            text_avg_train_acc.update([text_train_acc])

            # Logging and validation check
            if step % self.print_every == 0:
                print(
                    'Epoch {}, batch {}, step {}, '
                    'loss = {:.4f}, acc_audio = {:.4f}, acc_text = {:.4f}, '
                    'running averages: loss = {:.4f}, acc_audio = {:.4f}, acc_text = {:.4f}'
                    .format(epoch, batch_idx, step, train_loss.item(),
                            train_acc, text_train_acc,
                            avg_train_loss.get(), avg_train_acc.get(),
                            text_avg_train_acc.get()))

            if step % self.val_every == 0:
                val_loss, val_acc, text_val_acc, combined_val_acc = self.val(
                )
                print(
                    'Val acc (audio) = {:.4f}, Val acc (text) = {:.4f}, Val acc (combined) = {:.4f}, Val loss = {:.4f}'
                    .format(val_acc, text_val_acc, combined_val_acc,
                            val_loss))
                # Update the save the best validation checkpoint if needed
                if self.args.model_save_criteria == 'audio_text':
                    cur_avg_acc = (val_acc + text_val_acc) / 2
                else:  #'combined'
                    cur_avg_acc = combined_val_acc
                if cur_avg_acc > best_val_acc:
                    #print('Start saving best check point at step{}...'.format(step))
                    best_val_acc = cur_avg_acc
                    best_chkpt_path = os.path.join(self.model_dir,
                                                   'best_ckpt.pth')
                    torch.save(self.model.state_dict(), best_chkpt_path)
                    print('Done saving best check point!')
                if self.args.scheduler == 'plateau':
                    # BUGFIX: the original passed the undefined name
                    # `audio_text_avg_acc` (NameError); step the plateau
                    # scheduler on the metric actually computed above.
                    self.scheduler.step(cur_avg_acc)

        print('------ End of epoch validation ------')
        val_loss, val_acc, text_val_acc, combined_val_acc = self.val()
        # Update the save the best validation checkpoint if needed
        if self.args.model_save_criteria == 'audio_text':
            cur_avg_acc = (val_acc + text_val_acc) / 2
        else:  #'combined'
            cur_avg_acc = combined_val_acc
        if cur_avg_acc > best_val_acc:
            #print('Start saving best check point at step{}...'.format(step))
            best_val_acc = cur_avg_acc
            best_chkpt_path = os.path.join(self.model_dir, 'best_ckpt.pth')
            torch.save(self.model.state_dict(), best_chkpt_path)
            patience_counter = 0
            best_epoch = epoch
            print('Done saving best check point! Patience counter reset!')
        else:
            patience_counter += 1
            if patience_counter > self.max_patience:
                # BUGFIX: report the epoch of the best checkpoint
                # (best_epoch), not the epoch at which training stopped.
                print(
                    'Reach max patience limit. Training stops! Best val acc achieved at epoch: {}.'
                    .format(best_epoch))
                break
        self.model.unfreeze_one_layer()
class Classifier:
    """Image classifier wrapper: training (with optional distiller pruning),
    testing, single-image recognition, post-training quantization and
    pruning-sensitivity analysis.

    Configuration comes from the module-level ``opt`` object, overridable
    via constructor kwargs.
    """

    def __init__(self, **kwargs):
        # Apply kwarg overrides onto the global options object.
        opt._parse(kwargs)
        self.opt = opt
        self.model = getattr(models, self.opt.model)()
        self.criterion = t.nn.CrossEntropyLoss().to(self.opt.device)
        # Loss-function overview (translated from the original notes):
        # 1. Hinge loss: mainly used in support vector machines (SVM);
        # 2. Cross-entropy loss (softmax loss): used in logistic
        #    regression and softmax classification;
        # 3. Square loss: mainly in ordinary least squares (OLS);
        # 4. Exponential loss: mainly in the AdaBoost ensemble algorithm;
        # 5. Others (0-1 loss, absolute-value loss).
        self.optimizer = self.model.get_optimizer(self.opt.lr,
                                                  self.opt.weight_decay)
        self.compression_scheduler = distiller.CompressionScheduler(self.model)
        self.train_losses = AverageMeter()  # training loss meter
        self.train_top1 = AverageMeter()  # top-1 accuracy meter
        self.train_top5 = AverageMeter()  # top-5 accuracy meter
        self.best_precision = 0  # best validation precision so far
        self.start_epoch = 0
        self.train_writer = None
        self.value_writer = None

    def load_data(self):
        """Build the train/valid/test DataLoaders from opt.data_root."""
        test_data = DatasetFromFilename(self.opt.data_root, flag='test')
        train_data = DatasetFromFilename(self.opt.data_root, flag='train')  # training set
        val_data = DatasetFromFilename(self.opt.data_root, flag='valid')  # validation set
        self.test_dataloader = DataLoader(test_data,
                                          batch_size=self.opt.batch_size,
                                          shuffle=True,
                                          num_workers=self.opt.num_workers)
        self.train_dataloader = DataLoader(
            train_data,
            self.opt.batch_size,
            shuffle=True,
            num_workers=self.opt.num_workers)  # train loader
        self.val_dataloader = DataLoader(
            val_data,
            self.opt.batch_size,
            shuffle=True,
            num_workers=self.opt.num_workers)  # validation loader

    def create_write(self):
        """Create timestamped TensorBoard writers when opt.vis is set."""
        if self.opt.vis:
            self.train_writer = SummaryWriter(
                log_dir='./runs/train_' +
                datetime.now().strftime('%y%m%d-%H-%M-%S'))
            self.value_writer = SummaryWriter(
                log_dir='./runs/val_' +
                datetime.now().strftime('%y%m%d-%H-%M-%S'))

    def train_save_model(self, epoch, val_loss, val_top1, val_top5):
        """Checkpoint the model together with optimizer/pruning state."""
        self.model.save({
            "epoch": epoch + 1,
            "model_name": self.opt.model,
            "state_dict": self.model.state_dict(),
            "best_precision": self.best_precision,
            "optimizer": self.optimizer,
            "valid_loss": [val_loss, val_top1, val_top5],
            'compression_scheduler':
            self.compression_scheduler.state_dict()
        })  # save the model

    def train_load_model(self):
        """Resume training state (epoch, precision, weights, optimizer)
        from opt.load_model_path, if given."""
        if self.opt.load_model_path:
            # # Load all tensors onto the CPU:
            # t.load(opt.load_model_path, map_location=lambda storage, loc: storage)
            # # Load all tensors onto GPU 1:
            # t.load(opt.load_model_path, map_location=lambda storage, loc: storage.cuda(1))
            # # Remap tensors from GPU 1 to GPU 0:
            # t.load(opt.load_model_path, map_location={'cuda:1': 'cuda:0'})
            checkpoint = t.load(self.opt.load_model_path)
            self.start_epoch = checkpoint["epoch"]
            # compression_scheduler.load_state_dict(checkpoint['compression_scheduler'], False)
            self.best_precision = checkpoint["best_precision"]
            self.model.load_state_dict(checkpoint["state_dict"])
            self.optimizer = checkpoint['optimizer']
            self.model.to(self.opt.device)  # move the model to the device

    def load_model(self):
        """Load only the model weights (inference use)."""
        if self.opt.load_model_path:
            checkpoint = t.load(self.opt.load_model_path)
            self.model.load_state_dict(checkpoint["state_dict"])  # load weights
            self.model.to(self.opt.device)

    def save_quantize_model(self):
        """Persist the quantized model when opt.quantize_eval is set."""
        if self.opt.quantize_eval:
            self.model.save(
                {
                    "model_name": self.opt.model,
                    "state_dict": self.model.state_dict(),
                    'quantizer_metadata': self.model.quantizer_metadata
                }, './checkpoint/ResNet152_quantize.pth')

    def quantize_model(self):
        """Apply distiller post-training linear quantization in place."""
        if self.opt.quantize_eval:
            self.model.cpu()
            quantizer = quantization.PostTrainLinearQuantizer.from_args(
                self.model, self.opt)  # quantize the model
            quantizer.prepare_model()
            self.model.to(self.opt.device)

    def load_compress(self):
        """Load the distiller pruning schedule from opt.compress."""
        if self.opt.compress:
            self.compression_scheduler = distiller.file_config(
                self.model, self.optimizer, self.opt.compress,
                self.compression_scheduler)  # load the pruning schedule
            self.model.to(self.opt.device)

    def visualization_train(self, input, ii, epoch):
        """Log a training image grid, loss and accuracy to TensorBoard.

        NOTE(review): ``if ii % self.opt.print_freq:`` is truthy for every
        step that is NOT a multiple of print_freq — this looks inverted
        (compare the ``(ii + 1) % print_freq == 0`` gate in train());
        confirm before changing.
        """
        if ii % self.opt.print_freq:
            if self.train_writer:
                grid = make_grid(
                    (input.data.cpu() * 0.225 + 0.45).clamp(min=0, max=1))
                self.train_writer.add_image('train_images', grid,
                                            ii * (epoch + 1))  # training images
                self.train_writer.add_scalar('loss', self.train_losses.avg,
                                             ii * (epoch + 1))  # training loss
                self.train_writer.add_text(
                    'top1', 'train accuracy top1 %.2f%%' % self.train_top1.avg,
                    ii * (epoch + 1))  # top-1 accuracy as text
                self.train_writer.add_scalars(
                    'accuracy', {
                        'top1': self.train_top1.avg,
                        'top5': self.train_top5.avg,
                        'loss': self.train_losses.avg
                    }, ii * (epoch + 1))

    def test(self):
        """Evaluate on the test split (optionally quantized), log overall
        accuracy and write misclassified images to a CSV."""
        self.load_model()
        self.load_data()
        self.model.eval()  # eval mode: affects Dropout and BatchNorm
        correct = 0
        total = 0
        # NOTE(review): %-style logging with a non-format first argument —
        # the size will not appear in the message; confirm intended.
        msglogger.info('测试数据集大小', len(self.test_dataloader))
        # quantization (no-op unless opt.quantize_eval)
        self.quantize_model()
        self.model.eval()  # eval mode: affects Dropout and BatchNorm
        err_img = [('img_path', 'result', 'label')]
        for ii, (data, labels,
                 img_path) in tqdm(enumerate(self.test_dataloader)):
            input = data.to(self.opt.device)
            labels = labels.to(self.opt.device)
            score = self.model(input)
            # probability = t.nn.functional.softmax(score, dim=1)[:, 1].detach().tolist()  # [:, i] weight of class i
            # Softmax maps a K-dim real vector to a K-dim vector whose
            # entries lie in (0, 1) and sum to 1 (a probability
            # distribution); for multi-class prediction take the largest.
            results = score.max(dim=1)[1].detach(
            )  # max returns (values, indices) per row; take the argmax class
            # batch_results = [(labels_.item(), self.opt.cate_classes[label_]) for labels_, label_ in zip(labels, label)]
            total += input.size(0)
            correct += (results == labels).sum().item()
            error_list = (results != labels).tolist()
            err_img.extend([(img_path[i], self.opt.cate_classes[results[i]],
                             self.opt.cate_classes[labels[i]])
                            for i, j in enumerate(error_list)
                            if j == 1])  # (path, predicted, true) of each misclassified image
        msglogger.info(
            'Test Accuracy of the model on the {} test images: {} %'.format(
                total, 100 * correct / total))
        # write misclassified images to CSV
        write_err_img(err_img)
        # save the quantized model (no-op unless opt.quantize_eval)
        self.save_quantize_model()

    def recognition(self):
        """Classify the single image at opt.url; return class->probability
        pairs sorted by descending probability."""
        self.load_model()
        self.model.eval()
        img = image_loader(self.opt.url)
        image = img.view(1, 3, self.opt.image_size,
                         self.opt.image_size).to(self.opt.device)  # reshape for the net
        outputs = self.model(image)
        result = {}
        for i in range(self.opt.num_classes):  # per-class probability
            result[self.opt.cate_classes[i]] = t.nn.functional.softmax(
                outputs, dim=1)[:, i].detach().tolist()[0]
        result = sorted(result.items(),
                        key=lambda x: x[1], reverse=True)
        return result

    def sensitivity(self):
        """Run distiller pruning-sensitivity analysis over the range given
        by opt.sensitivity_range."""
        self.load_data()
        self.load_model()
        sensitivities = np.arange(self.opt.sensitivity_range[0],
                                  self.opt.sensitivity_range[1],
                                  self.opt.sensitivity_range[2])
        return sensitivity_analysis(self.model, self.criterion,
                                    self.test_dataloader, self.opt,
                                    sensitivities, msglogger)

    def train(self):
        """Train for opt.max_epoch epochs with optional distiller pruning,
        validating once per epoch and checkpointing the best model (ranked
        by sparsity, then top-1/top-5 accuracy)."""
        previous_loss = 1e10  # loss of the previous pass
        lr = self.opt.lr
        perf_scores_history = []
        pylogger = PythonLogger(msglogger)
        self.train_load_model()
        self.load_compress()
        self.create_write()
        lr_scheduler = get_scheduler(self.optimizer, opt)
        for epoch in range(self.start_epoch, self.opt.max_epoch):
            self.model.train()
            # NOTE(review): load_data() rebuilds all three DataLoaders on
            # every epoch — likely only needed once, before the loop.
            self.load_data()
            if self.opt.pruning:
                self.compression_scheduler.on_epoch_begin(epoch)  # pruning: epoch begin
            self.train_losses.reset()  # reset meter
            self.train_top1.reset()  # reset meter
            # NOTE(review): train_top5 is never reset, so it averages
            # across epochs — confirm.
            # print('training set size', len(train_dataloader))
            total_samples = len(self.train_dataloader.sampler)
            steps_per_epoch = math.ceil(total_samples / self.opt.batch_size)
            train_progressor = ProgressBar(mode="Train ",
                                           epoch=epoch,
                                           total_epoch=self.opt.max_epoch,
                                           model_name=self.opt.model,
                                           total=len(self.train_dataloader))
            lr = lr_scheduler.get_lr()
            for ii, (data, labels,
                     img_path) in enumerate(self.train_dataloader):
                if self.opt.pruning:
                    self.compression_scheduler.on_minibatch_begin(
                        epoch, ii, steps_per_epoch,
                        self.optimizer)  # pruning: minibatch begin
                train_progressor.current = ii + 1  # current progress within the epoch
                # train model
                input = data.to(self.opt.device)
                target = labels.to(self.opt.device)
                score = self.model(input)  # network forward pass
                loss = self.criterion(score, target)  # compute the loss
                if self.opt.pruning:
                    # Before running the backward phase, we allow the
                    # scheduler to modify the loss (e.g. add a
                    # regularization term).
                    agg_loss = self.compression_scheduler.before_backward_pass(
                        epoch,
                        ii,
                        steps_per_epoch,
                        loss,
                        optimizer=self.optimizer,
                        return_loss_components=True)  # pruning-adjusted loss
                    loss = agg_loss.overall_loss
                # NOTE(review): train_losses.update is called again a few
                # lines below — each sample is counted twice; confirm.
                self.train_losses.update(loss.item(), input.size(0))
                # loss = criterion(score[0], target)  # loss for the Inception3 network
                self.optimizer.zero_grad()  # zero parameter gradients
                loss.backward()  # backpropagate
                self.optimizer.step()  # update parameters
                if opt.pruning:
                    self.compression_scheduler.on_minibatch_end(
                        epoch, ii, steps_per_epoch,
                        self.optimizer)  # pruning: minibatch end
                precision1_train, precision5_train = accuracy(
                    score, target, topk=(1, 5))  # top-1 and top-5 accuracy
                # precision1_train, precision2_train = accuracy(score[0], target, topk=(1, 2))  # Inception3 network
                self.train_losses.update(loss.item(), input.size(0))
                self.train_top1.update(precision1_train[0].item(),
                                       input.size(0))
                self.train_top5.update(precision5_train[0].item(),
                                       input.size(0))
                train_progressor.current_loss = self.train_losses.avg
                train_progressor.current_top1 = self.train_top1.avg
                train_progressor.current_top5 = self.train_top5.avg
                train_progressor()  # print progress
                if (ii + 1) % self.opt.print_freq == 0:
                    self.visualization_train(input, ii, epoch)
            if self.opt.pruning:
                distiller.log_weights_sparsity(
                    self.model, epoch,
                    loggers=[pylogger])  # report pruning results
                self.compression_scheduler.on_epoch_end(
                    epoch, self.optimizer)  # pruning: epoch end
            val_loss, val_top1, val_top5 = val(self.model, self.criterion,
                                               self.val_dataloader, epoch,
                                               self.value_writer)  # validate the model
            sparsity = distiller.model_sparsity(self.model)
            perf_scores_history.append(
                distiller.MutableNamedTuple(
                    {
                        'sparsity': sparsity,
                        'top1': val_top1,
                        'top5': val_top5,
                        'epoch': epoch + 1,
                        'lr': lr,
                        'loss': val_loss
                    }, ))
            # Keep the performance history sorted best-to-worst:
            # sparsity is the primary key, then top-1, top-5 and epoch.
            perf_scores_history.sort(key=operator.attrgetter(
                'sparsity', 'top1', 'top5', 'epoch'),
                                     reverse=True)
            for score in perf_scores_history[:1]:
                msglogger.info(
                    '==> Best [Top1: %.3f Top5: %.3f Sparsity: %.2f on epoch: %d Lr: %f Loss: %f]',
                    score.top1, score.top5, score.sparsity, score.epoch, lr,
                    score.loss)
            is_best = epoch == perf_scores_history[
                0].epoch  # current epoch equals the best epoch
            # NOTE(review): perf_scores_history stores epoch + 1, so this
            # comparison looks off by one — confirm.
            self.best_precision = max(perf_scores_history[0].top1,
                                      self.best_precision)  # max top-1 accuracy
            if is_best:
                self.train_save_model(epoch, val_loss, val_top1, val_top5)
            # update learning rate
            lr = lr_scheduler.get_lr()
            # # If the training loss grew, lower the learning rate:
            # if self.train_losses.val > previous_loss:
            #     lr = lr * self.opt.lr_decay
            #     # when loss exceeds the previous loss, decay the LR
            #     for param_group in self.optimizer.param_groups:
            #         param_group['lr'] = lr
            #
            # previous_loss = self.train_losses.val
            t.cuda.empty_cache()
def train(net, criterion, optimizer, train_loader, val_loader, config,
          scheduler=None):
    """Train DarkNet19, validating once per epoch.

    Logs train/validation loss and the learning rate to TensorBoard and
    saves a checkpoint after every epoch.

    Args:
        net: network to train (moved to GPU when available).
        criterion: loss module (moved to the same device).
        optimizer: optimizer; its first param_group supplies the logged LR.
        train_loader: training data iterator of (images, targets).
        val_loader: validation data iterator of (images, targets).
        config: provides epoch (start), num_epoch, log_path, checkpoint_dir.
        scheduler: optional LR scheduler, stepped once per epoch (and
            fast-forwarded to config.epoch when resuming).
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    train_loss_avg = AverageMeter()
    val_loss_avg = AverageMeter()
    train_step = len(train_loader)
    # BUGFIX: the original read `len(test_loader)`, an undefined name —
    # the validation loader is `val_loader`.
    val_step = len(val_loader)
    writer = SummaryWriter(config.log_path)

    criterion.to(device)
    net.train()
    net.to(device)
    if scheduler:
        # Fast-forward the scheduler when resuming from config.epoch.
        scheduler.step(config.epoch)
    for epoch in range(config.epoch, config.num_epoch):
        train_loss_avg.reset()
        val_loss_avg.reset()

        # renamed from `iter`, which shadowed the builtin
        progress = tqdm(enumerate(train_loader))
        progress.set_description(
            f'Train Step in {epoch} total step: {train_step}')
        for step, (images, targets) in progress:
            images = images.to(device)
            targets = targets.to(device)
            preds = net(images)
            loss = criterion(preds, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss_avg.update(loss.item())

        progress = tqdm(enumerate(val_loader))
        progress.set_description(
            f'Validation Step in {epoch} total step: {val_step}')
        # BUGFIX: validation needs no gradients; without no_grad() every
        # batch builds and retains an autograd graph.
        # NOTE(review): the net is left in train() mode during validation —
        # switching to eval() would change BatchNorm/Dropout behaviour and
        # the logged metric; confirm before changing.
        with torch.no_grad():
            for step, (images, targets) in progress:
                images = images.to(device)
                targets = targets.to(device)
                preds = net(images)
                loss = criterion(preds, targets)
                val_loss_avg.update(loss.item())

        writer.add_scalars('DarkNet19/loss', {
            'train': train_loss_avg.avg,
            'validation': val_loss_avg.avg
        }, epoch)
        writer.add_scalar('DarkNet19/LearningRate',
                          optimizer.param_groups[0]['lr'], epoch)
        torch.save(net.state_dict(),
                   f'{config.checkpoint_dir}/DarkNet19-{epoch}.pth')
        if scheduler:
            scheduler.step()
def train(**kwargs):
    """Full training entry point with optional distiller pruning.

    Builds the model named by ``opt.model``, optionally resumes from a
    checkpoint, applies a compression/pruning schedule, trains with top-1 /
    top-5 tracking, validates every epoch and checkpoints whenever the current
    epoch is the best in the (sparsity-first) performance history.

    Args:
        **kwargs: command-line overrides applied onto the global ``opt`` config.
    """
    opt._parse(kwargs)
    train_writer = None
    value_writer = None
    # TensorBoard writers are only created when visualisation is enabled.
    if opt.vis:
        train_writer = SummaryWriter(
            log_dir='./runs/train_' + datetime.now().strftime('%y%m%d-%H-%M-%S'))
        value_writer = SummaryWriter(
            log_dir='./runs/val_' + datetime.now().strftime('%y%m%d-%H-%M-%S'))
    previous_loss = 1e10  # previous loss (only used by the disabled decay logic below)
    best_precision = 0  # best top-1 accuracy seen so far
    start_epoch = 0
    lr = opt.lr
    perf_scores_history = []  # per-epoch performance records
    # step1: criterion and optimizer
    # Common loss choices, for reference:
    # 1. Hinge loss: mainly used with SVMs;
    # 2. Cross-entropy (softmax) loss: logistic regression / softmax classifiers;
    # 3. Square loss: ordinary least squares;
    # 4. Exponential loss: AdaBoost ensembles;
    # 5. Others (0-1 loss, absolute loss).
    criterion = t.nn.CrossEntropyLoss().to(opt.device)  # loss function
    # step2: meters
    train_losses = AverageMeter()  # running training loss
    train_top1 = AverageMeter()  # running top-1 accuracy
    train_top5 = AverageMeter()  # running top-5 accuracy
    pylogger = PythonLogger(msglogger)
    # step3: configure model
    model = getattr(models, opt.model)()  # instantiate the network by name
    compression_scheduler = distiller.CompressionScheduler(model)
    optimizer = model.get_optimizer(lr, opt.weight_decay)  # optimizer
    if opt.load_model_path:
        # # map_location examples for loading a checkpoint onto other devices:
        # # load all tensors onto the CPU
        # t.load(opt.load_model_path, map_location=lambda storage, loc: storage)
        # t.load(opt.load_model_path, map_location='cpu')
        # # load all tensors onto GPU 1
        # t.load(opt.load_model_path, map_location=lambda storage, loc: storage.cuda(1))
        # # move tensors from GPU 1 to GPU 0
        # t.load(opt.load_model_path, map_location={'cuda:1': 'cuda:0'})
        checkpoint = t.load(opt.load_model_path)
        start_epoch = checkpoint["epoch"]
        # compression_scheduler.load_state_dict(checkpoint['compression_scheduler'], False)
        best_precision = checkpoint["best_precision"]
        model.load_state_dict(checkpoint["state_dict"])
        optimizer = checkpoint['optimizer']
    model.to(opt.device)  # move the model to the configured device
    if opt.compress:
        compression_scheduler = distiller.file_config(
            model, optimizer, opt.compress,
            compression_scheduler)  # load the pruning schedule from file
        model.to(opt.device)
    # learning-rate scheduler
    lr_scheduler = get_scheduler(optimizer, opt)
    # step4: data_image
    train_data = DatasetFromFilename(opt.data_root, flag='train')  # training set
    val_data = DatasetFromFilename(opt.data_root, flag='test')  # validation set
    train_dataloader = DataLoader(train_data,
                                  opt.batch_size,
                                  shuffle=True,
                                  num_workers=opt.num_workers)  # training loader
    val_dataloader = DataLoader(val_data,
                                opt.batch_size,
                                shuffle=True,
                                num_workers=opt.num_workers)  # validation loader
    # train
    for epoch in range(start_epoch, opt.max_epoch):
        model.train()
        if opt.pruning:
            compression_scheduler.on_epoch_begin(epoch)  # start-of-epoch pruning hook
        train_losses.reset()  # reset meters for the new epoch
        train_top1.reset()
        # print('training set size', len(train_dataloader))
        total_samples = len(train_dataloader.sampler)
        steps_per_epoch = math.ceil(total_samples / opt.batch_size)
        train_progressor = ProgressBar(mode="Train ",
                                       epoch=epoch,
                                       total_epoch=opt.max_epoch,
                                       model_name=opt.model,
                                       lr=lr,
                                       total=len(train_dataloader))
        lr = lr_scheduler.get_lr()[0]
        for ii, (data, labels, img_path, tag) in enumerate(train_dataloader):
            # Abort the whole run if the sample metadata fails validation.
            if not check_date(img_path, tag, msglogger):
                return
            if opt.pruning:
                compression_scheduler.on_minibatch_begin(
                    epoch, ii, steps_per_epoch, optimizer)  # start-of-batch pruning hook
            train_progressor.current = ii + 1  # progress within the epoch
            # train model
            input = data.to(opt.device)
            target = labels.to(opt.device)
            if train_writer:
                # Roughly un-normalise for display before logging images.
                grid = make_grid(
                    (input.data.cpu() * 0.225 + 0.45).clamp(min=0, max=1))
                train_writer.add_image('train_images', grid,
                                       ii * (epoch + 1))  # training images
            score = model(input)  # forward pass
            # compute the loss
            loss = criterion(score, target)
            if opt.pruning:
                # Before running the backward phase, we allow the scheduler to modify the loss
                # (e.g. add regularization loss)
                agg_loss = compression_scheduler.before_backward_pass(
                    epoch,
                    ii,
                    steps_per_epoch,
                    loss,
                    optimizer=optimizer,
                    return_loss_components=True)  # pruning-adjusted loss
                loss = agg_loss.overall_loss
                # BUG FIX: the original also called
                # train_losses.update(loss.item(), input.size(0)) here, and then
                # again unconditionally below -- double-counting every batch in
                # the running average whenever pruning was enabled. The single
                # unconditional update below is kept.
            # loss = criterion(score[0], target)  # loss for the Inception3 network
            optimizer.zero_grad()  # zero parameter gradients
            loss.backward()  # backpropagate
            optimizer.step()  # update parameters
            if opt.pruning:
                compression_scheduler.on_minibatch_end(
                    epoch, ii, steps_per_epoch, optimizer)  # end-of-batch pruning hook
            precision1_train, precision5_train = accuracy(
                score, target, topk=(1, 5))  # top-1 and top-5 accuracy
            # writer.add_graph(model, input)
            # precision1_train, precision2_train = accuracy(score[0], target, topk=(1, 2))  # Inception3 network
            train_losses.update(loss.item(), input.size(0))
            train_top1.update(precision1_train[0].item(), input.size(0))
            train_top5.update(precision5_train[0].item(), input.size(0))
            train_progressor.current_loss = train_losses.avg
            train_progressor.current_top1 = train_top1.avg
            train_progressor.current_top5 = train_top5.avg
            train_progressor()  # print progress
            if ii % opt.print_freq == 0:
                if train_writer:
                    train_writer.add_scalar('loss', train_losses.avg,
                                            ii * (epoch + 1))  # training loss
                    train_writer.add_text(
                        'top1', 'train accuracy top1 %s' % train_top1.avg,
                        ii * (epoch + 1))  # top-1 accuracy text
                    train_writer.add_scalars(
                        'accuracy', {
                            'top1': train_top1.avg,
                            'top5': train_top5.avg,
                            'loss': train_losses.avg
                        }, ii * (epoch + 1))
        # train_progressor.done()  # save training results to txt
        # validate and visualize
        if opt.pruning:
            distiller.log_weights_sparsity(
                model, epoch, loggers=[pylogger])  # log pruning results
            compression_scheduler.on_epoch_end(epoch, optimizer)  # end-of-epoch pruning hook
        val_loss, val_top1, val_top5 = val(model, criterion, val_dataloader,
                                           epoch, value_writer, lr)  # validate
        sparsity = distiller.model_sparsity(model)
        perf_scores_history.append(
            distiller.MutableNamedTuple(
                {
                    'sparsity': sparsity,
                    'top1': val_top1,
                    'top5': val_top5,
                    'epoch': epoch + 1,
                    'lr': lr,
                    'loss': val_loss
                }, ))
        # Keep the performance history sorted best-to-worst:
        # sparsity is the primary sort key, then top1, top5, epoch.
        perf_scores_history.sort(key=operator.attrgetter(
            'sparsity', 'top1', 'top5', 'epoch'),
                                 reverse=True)
        for score in perf_scores_history[:1]:
            msglogger.info(
                '==> Best [Top1: %.3f Top5: %.3f Sparsity: %.2f on epoch: %d Lr: %f Loss: %f]',
                score.top1, score.top5, score.sparsity, score.epoch, lr,
                score.loss)
        best_precision = max(perf_scores_history[0].top1,
                             best_precision)  # best top-1 accuracy
        is_best = epoch + 1 == perf_scores_history[
            0].epoch  # current epoch is the best epoch so far
        if is_best:
            model.save({
                "epoch": epoch + 1,
                "model_name": opt.model,
                "state_dict": model.state_dict(),
                "best_precision": best_precision,
                "optimizer": optimizer,
                "valid_loss": [val_loss, val_top1, val_top5],
                'compression_scheduler': compression_scheduler.state_dict(),
            })  # save a checkpoint
        # update learning rate
        lr_scheduler.step(epoch)
        # If the training loss grew, decay the learning rate (disabled):
        # if train_losses.val > previous_loss:
        #     lr = lr * opt.lr_decay
        #     # when loss exceeds the previous loss, lower the learning rate
        #     for param_group in optimizer.param_groups:
        #         param_group['lr'] = lr
        #
        # previous_loss = train_losses.val
        t.cuda.empty_cache()  # release cached GPU memory between epochs
def valid(self, t_max, epoch, model):
    """Evaluate `model` on every task seen so far (tasks 0..t_max).

    For each past task a fresh ImageFolder loader is built over that task's
    class indices, per-task top-1/top-5 accuracies are logged to the task's
    writer, and running averages across tasks are logged to `self.writer`.

    NOTE(review): the return value is `top1.avg` of the LAST task only, not
    the cross-task average -- confirm callers expect that.
    """
    top1 = AverageMeter()
    top5 = AverageMeter()
    # switch to evaluate mode
    model.eval()
    # eval on each class separately
    acc_av = 0
    acc_av5 = 0
    with torch.no_grad():
        for t_past in range(t_max + 1):
            # Class indices for this task: assumes 100 classes per task
            # offset -- TODO confirm against how self.idx is built.
            idx_ = [i + (t_past * 100) for i in self.idx]
            top1.reset()
            top5.reset()
            print(t_past)
            dataset_test = ImageFolder(
                root=self.dataroot_test,
                transform=transforms.Compose([
                    transforms.Resize(self.imageSize),
                    transforms.CenterCrop(self.imageSize),
                    transforms.ToTensor(),
                    transforms.Normalize((0.5, 0.5, 0.5),
                                         (0.5, 0.5, 0.5)),
                ]),
                classes_idx=(idx_),
            )
            val_loader_task = torch.utils.data.DataLoader(
                dataset_test,
                batch_size=self.batchSize,
                shuffle=True,
                num_workers=int(1))
            for i, (input, target) in enumerate(val_loader_task):
                # if args.gpu is not None:
                input = input.cuda(self.cuda0)
                # Shift local task labels into the global label space --
                # presumably each task contributes len(unique_classes[t])
                # classes; verify against the dataset layout.
                target = target.cuda(self.cuda0) + (
                    t_past * len(self.unique_classes[t_past]))
                # print(target)
                # In-place resize/copy of the shared label buffer.
                self.c_label.data.resize_(target.shape[0]).copy_(target)
                # compute output
                _, output = model(input)
                output = torch.nn.functional.softmax(output, dim=1)
                topk = ([1, 5]
                        )  # min(t_max+len(self.unique_classes[t_past]),5)])
                acc1, acc5 = accuracy(output, target, topk=topk)
                top1.update(acc1)  # , input.size(0))
                top5.update(acc5)  # , input.size(0))
            acc_av += top1.avg
            print(
                'Test: {}, Acc@1 {top1.val[0]:.3f} ({top1.avg[0]:.3f})), Acc@5 {top5.val[0]:.3f} ({top5.avg[0]:.3f}))'
                .format(  # Acc@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                    t_past, top1=top1, top5=top5))
            self.acc_writers[t_past].scalar_summary(
                "Accuracy top 1", top1.avg[0], self.global_step)
            self.acc_writers[t_past].scalar_summary(
                "Accuracy top 5", top5.avg[0], self.global_step)
            self.acc_writers[t_past].scalar_summary(
                "Accuracy top 1_val", top1.val[0], self.global_step)
            self.acc_writers[t_past].scalar_summary(
                "Accuracy top 5_val", top5.val[0], self.global_step)
            acc_av5 += top5.avg
        # Averages over all evaluated tasks.
        self.writer.scalar_summary("Average_Acc. top 1", acc_av / (t_max + 1),
                                   self.global_step)
        self.writer.scalar_summary("Average_Acc. top 5", acc_av5 / (t_max + 1),
                                   self.global_step)
    # restore training mode before returning to the caller
    model.train()
    return top1.avg