def valid(epoch):
    net.eval()
    valid_loss = 0.0
    correct = 0.0
    total = 0.0
    with torch.no_grad():
        for step, data in enumerate(valid_loader):
            x, y = data
            x, y = x.to(device), y.to(device)
            out = net(x)
            loss = criterion(out, y)
            _, pred = torch.max(out.data, 1)
            valid_loss += loss.item()
            total += y.size(0)
            correct += (pred == y).squeeze().sum().cpu().numpy()
    valid_acc = correct / total
    print("valid accuracy", valid_acc)
    logx.metric('val', {'loss': valid_loss, 'accuracy': valid_acc}, epoch=epoch)
    return valid_acc
def train(epoch):
    net.train()
    train_loss = 0.0
    correct = 0.0
    total = 0.0
    for step, data in enumerate(train_loader):
        x, y = data
        x, y = x.to(device), y.to(device)
        out = net(x)
        loss = criterion(out, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        _, pred = torch.max(out.data, 1)
        total += y.size(0)
        correct += (pred == y).squeeze().sum().cpu().numpy()
        train_loss += loss.item()
        if step % 100 == 0:
            print("epoch", epoch, "step", step, "loss", loss.item())
    train_acc = correct / total
    print("train accuracy", train_acc)
    logx.metric('train', {'loss': train_loss, 'accuracy': train_acc}, epoch=epoch)
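# A minimal sketch (not from the original source) of the driver that the
# train()/valid() pair above assumes: `net`, `criterion`, `optimizer`,
# `device`, and the two loaders are defined elsewhere in the script.
from runx.logx import logx

logx.initialize(logdir='./logs', coolname=True, tensorboard=True)

best_acc = 0.0
for epoch in range(20):  # epoch count is illustrative
    train(epoch)
    acc = valid(epoch)
    best_acc = max(best_acc, acc)
logx.msg('best accuracy {:.4f}'.format(best_acc))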
def __train_per_epoch(self, epoch_idx: int, steps_per_eval: int):
    with tqdm(total=len(self.train_dataloader), desc=f"Epoch {epoch_idx}") as pbar:
        for batch_idx, batch in enumerate(self.train_dataloader):
            global_step = epoch_idx * len(self.train_dataloader) + batch_idx
            loss = self.__training_step(batch)
            if self.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu
            loss.backward()
            logx.metric('train',
                        {"tr_loss": loss.item(),
                         "learning_rate": self.scheduler.get_last_lr()[0]},
                        global_step)
            pbar.set_postfix_str(f"tr_loss: {loss.item():.5f}")
            # update weights
            self.optimizer.step()
            self.scheduler.step()  # update learning rate schedule
            if batch_idx % steps_per_eval == 0:
                # validate and save checkpoints
                # downsample a subset of the dev dataset
                eval_dataset = self.dev_dataloader.dataset
                subset_size = len(eval_dataset) // 500
                eval_sampled_dataloader = DataLoader(
                    Subset(eval_dataset,
                           random.sample(range(len(eval_dataset)), subset_size)),
                    shuffle=True,
                    batch_size=self.batch_size,
                    pin_memory=True)
                mean_loss, metrics_scores, _, _ = self.validate(
                    eval_sampled_dataloader)
                logx.metric('val', metrics_scores, global_step)
                # unwrap DataParallel before saving on multi-GPU
                model = self.model.module if self.n_gpu > 1 else self.model
                save_dict = {
                    "model_construct_params_dict": model.param_dict(),
                    "state_dict": model.state_dict(),
                    "solver_construct_params_dict": self.state_dict(),
                    "optimizer": self.optimizer.state_dict()
                }
                logx.save_model(save_dict,
                                metric=mean_loss,
                                epoch=global_step,
                                higher_better=False)
            pbar.update(1)
def train(train_loader, model, criterion, optimizer, epoch, args):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        if args.gpu is not None:
            input = input.cuda(args.gpu, non_blocking=True)
            target = target.cuda(args.gpu, non_blocking=True)

        # compute output
        output = model(input)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), input.size(0))
        top1.update(acc1[0], input.size(0))
        top5.update(acc5[0], input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        metrics = {'loss': losses.avg,
                   'top1': float(top1.avg),
                   'top5': float(top5.avg)}
        logx.metric('train', metrics, i + epoch * len(train_loader))

        if i % args.print_freq == 0:
            logx.msg('Epoch: [{0}][{1}/{2}]\t'
                     'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                     'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                     'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                     'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                     'Acc@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                         epoch, i, len(train_loader), batch_time=batch_time,
                         data_time=data_time, loss=losses, top1=top1,
                         top5=top5))
def validation(args, model, device, val_loader, optimizer, epoch, criterion):
    model.eval()
    n_val = len(val_loader)
    val_loss = 0
    val_psnr = 0
    for batch_idx, batch_data in enumerate(val_loader):
        batch_ldr0 = batch_data['input0'].to(device)
        batch_ldr1 = batch_data['input1'].to(device)
        batch_ldr2 = batch_data['input2'].to(device)
        label = batch_data['label'].to(device)
        with torch.no_grad():
            pred = model(batch_ldr0, batch_ldr1, batch_ldr2)
            pred = range_compressor_tensor(pred)
            pred = torch.clamp(pred, 0., 1.)
            loss = criterion(pred, label)
        psnr = batch_PSNR(pred, label, 1.0)
        logx.msg('Validation set: PSNR: {:.4f}'.format(psnr))
        iteration = (epoch - 1) * len(val_loader) + batch_idx
        if epoch % 100 == 0:
            logx.add_image('val/input1', batch_ldr0[0][[2, 1, 0], :, :], iteration)
            logx.add_image('val/input2', batch_ldr1[0][[2, 1, 0], :, :], iteration)
            logx.add_image('val/input3', batch_ldr2[0][[2, 1, 0], :, :], iteration)
            logx.add_image('val/pred', pred[0][[2, 1, 0], :, :], iteration)
            logx.add_image('val/gt', label[0][[2, 1, 0], :, :], iteration)
        val_loss += loss.item()  # accumulate a float, not a tensor
        val_psnr += psnr
    val_loss /= n_val
    val_psnr /= n_val
    logx.msg('Validation set: Average loss: {:.4f}'.format(val_loss))
    logx.msg('Validation set: Average PSNR: {:.4f}\n'.format(val_psnr))

    # capture metrics
    metrics = {'psnr': val_psnr}
    logx.metric('val', metrics, epoch)

    # save model; lower validation loss is better
    save_dict = {
        'epoch': epoch + 1,
        'arch': 'AHDRNet',
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }
    logx.save_model(save_dict, epoch=epoch, metric=val_loss, higher_better=False)
def validate(val_loader, model, criterion, args, epoch):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (input, target) in enumerate(val_loader):
            if args.gpu is not None:
                input = input.cuda(args.gpu, non_blocking=True)
                target = target.cuda(args.gpu, non_blocking=True)

            # compute output
            output = model(input)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), input.size(0))
            top1.update(acc1[0], input.size(0))
            top5.update(acc5[0], input.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                logx.msg('Test: [{0}/{1}]\t'
                         'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                         'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                         'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                         'Acc@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                             i, len(val_loader), batch_time=batch_time,
                             loss=losses, top1=top1, top5=top5))

        logx.msg(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(
            top1=top1, top5=top5))

    metrics = {'top1': float(top1.avg), 'top5': float(top5.avg)}
    logx.metric('val', metrics, epoch)

    return top1.avg
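# The train()/validate() pair above relies on two helpers that are standard in
# PyTorch classification examples. A self-contained sketch of both, matching
# how `.val`, `.avg`, and `topk` are used above (a plausible reconstruction,
# not necessarily the authors' exact code):
import torch

class AverageMeter:
    """Tracks the current value and running average of a scalar."""
    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def accuracy(output, target, topk=(1,)):
    """Top-k accuracy in percent for a batch of logits."""
    maxk = max(topk)
    batch_size = target.size(0)
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res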
def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            logx.msg('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

            # capture metrics
            metrics = {'loss': loss.item()}
            iteration = epoch * len(train_loader) + batch_idx
            logx.metric('train', metrics, iteration)
def test_epoch(epoch):
    model.eval()
    losses = 0.0
    total, correct = 0.0, 0.0
    with torch.no_grad():
        for step, (x, y) in enumerate(val_loader):
            x, y = x.to(config.device), y.to(config.device)
            out = model(x)
            loss = criterion(out, y)
            losses += loss.cpu().detach().numpy()
            _, pred = torch.max(out.data, 1)
            total += y.size(0)
            correct += (pred == y).squeeze().sum().cpu().numpy()
    save_dict = {'state_dict': model.state_dict()}
    logx.msg("epoch {} validation loss {} validation acc {}".format(
        epoch, losses / (step + 1), correct / total))
    logx.metric('val', {'loss': losses / (step + 1), 'acc': correct / total},
                epoch)  # pass the epoch explicitly so the x-axis matches
    logx.save_model(save_dict, losses, epoch, higher_better=False,
                    delete_old=True)
def train(args, model, device, train_loader, optimizer, epoch, criterion):
    model.train()
    epoch_loss = 0
    for batch_idx, batch_data in enumerate(train_loader):
        batch_ldr0 = batch_data['input0'].to(device)
        batch_ldr1 = batch_data['input1'].to(device)
        batch_ldr2 = batch_data['input2'].to(device)
        label = batch_data['label'].to(device)
        pred = model(batch_ldr0, batch_ldr1, batch_ldr2)
        pred = range_compressor_tensor(pred)
        pred = torch.clamp(pred, 0., 1.)
        loss = criterion(pred, label)
        psnr = batch_PSNR(pred, label, 1.0)
        # psnr = batch_PSNR(torch.clamp(pred, 0., 1.), label, 1.0)
        epoch_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        # nn.utils.clip_grad_value_(model.parameters(), 0.01)
        optimizer.step()
        iteration = (epoch - 1) * len(train_loader) + batch_idx
        if batch_idx % args.log_interval == 0:
            logx.msg('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch,
                # count samples seen; len(batch_data) would count dict keys
                batch_idx * batch_ldr0.size(0),
                len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            logx.add_scalar('train/learning_rate',
                            optimizer.param_groups[0]['lr'], iteration)
            logx.add_scalar('train/psnr', psnr, iteration)
            logx.add_image('train/input1', batch_ldr0[0][[2, 1, 0], :, :], iteration)
            logx.add_image('train/input2', batch_ldr1[0][[2, 1, 0], :, :], iteration)
            logx.add_image('train/input3', batch_ldr2[0][[2, 1, 0], :, :], iteration)
            logx.add_image('train/pred', pred[0][[2, 1, 0], :, :], iteration)
            logx.add_image('train/gt', label[0][[2, 1, 0], :, :], iteration)

            # capture metrics
            metrics = {'loss': loss.item()}
            logx.metric('train', metrics, iteration)
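# The two HDR helpers called in the snippets above follow a common pattern in
# HDR-merging code. A hedged, self-contained sketch (μ = 5000 is the usual
# μ-law constant, an assumption; this is not necessarily the authors'
# implementation):
import math
import torch

def range_compressor_tensor(x, mu=5000.0):
    # μ-law tonemapping: log(1 + μx) / log(1 + μ), maps HDR values into [0, 1]
    return torch.log(1.0 + mu * x) / math.log(1.0 + mu)

def batch_PSNR(pred, target, data_range):
    # mean PSNR over a batch of (N, C, H, W) tensors
    mse = torch.mean((pred - target) ** 2, dim=[1, 2, 3])
    psnr = 10.0 * torch.log10(data_range ** 2 / mse)
    return psnr.mean().item()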
def train_epoch(epoch):
    model.train()
    losses = 0.0
    total, correct = 0.0, 0.0
    for step, (x, y) in enumerate(train_loader):
        x, y = x.to(config.device), y.to(config.device)
        out = model(x)
        loss = criterion(out, y)
        losses += loss.cpu().detach().numpy()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        _, pred = torch.max(out.data, 1)
        total += y.size(0)
        correct += (pred == y).squeeze().sum().cpu().numpy()
        if step % 100 == 0:
            logx.msg("epoch {} step {} training loss {}".format(
                epoch, step, loss.item()))
    logx.msg("epoch {} training loss {} training acc {}".format(
        epoch, losses / (step + 1), correct / total))
    logx.metric("train", {"loss": losses / (step + 1), 'acc': correct / total},
                epoch)  # pass the epoch explicitly so the x-axis matches
    return losses
def callback_func(env):
    """Callback that records R^2 and RMSE from the evaluation results."""
    if (env.evaluation_result_list[0][0] == "dev-NegrSquare"
            and env.evaluation_result_list[1][0] == "dev-rmse"):
        eval_dict = {
            "R2": -env.evaluation_result_list[0][1],
            "MSE": env.evaluation_result_list[1][1],
        }
    elif (env.evaluation_result_list[0][0] == "dev-rmse"
            and env.evaluation_result_list[1][0] == "dev-NegrSquare"):
        eval_dict = {
            "MSE": env.evaluation_result_list[0][1],
            "R2": -env.evaluation_result_list[1][1],
        }
    else:
        eval_dict = {
            env.evaluation_result_list[0][0]: env.evaluation_result_list[0][1],
            env.evaluation_result_list[1][0]: env.evaluation_result_list[1][1],
        }
    logx.metric('val', eval_dict, env.iteration)
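# A hedged sketch of attaching callback_func, assuming the legacy xgboost
# callback interface (pre-1.3), where each callback receives an env carrying
# `evaluation_result_list` and `iteration`. The data is synthetic; the custom
# "NegrSquare" metric seen above would be a feval defined elsewhere, so with
# two built-in eval metrics here the callback's fallback branch fires.
import numpy as np
import xgboost as xgb

X, y = np.random.rand(200, 8), np.random.rand(200)
dtrain = xgb.DMatrix(X[:160], label=y[:160])
ddev = xgb.DMatrix(X[160:], label=y[160:])

booster = xgb.train({'objective': 'reg:squarederror',
                     'eval_metric': ['rmse', 'mae']},
                    dtrain,
                    num_boost_round=50,
                    evals=[(ddev, 'dev')],
                    callbacks=[callback_func])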
def test(args, model, device, test_loader, epoch, optimizer):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.max(1, keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)
    logx.msg(
        '\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset), accuracy))

    # capture metrics
    metrics = {'loss': test_loss, 'accuracy': accuracy}
    logx.metric('val', metrics, epoch)

    # save model
    save_dict = {
        'epoch': epoch + 1,
        'arch': 'lenet',
        'state_dict': model.state_dict(),
        'accuracy': accuracy,
        'optimizer': optimizer.state_dict()
    }
    logx.save_model(save_dict, metric=accuracy, epoch=epoch, higher_better=True)
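# Restoring a checkpoint written by logx.save_model() above: runx names its
# checkpoint files like `best_checkpoint_ep<N>.pth` / `last_checkpoint_ep<N>.pth`
# inside the logdir (a later snippet loads one by that name). The exact path
# here is an assumption, and `model`/`optimizer` come from the surrounding script.
import torch

checkpoint = torch.load('./logs/best_checkpoint_ep9.pth', map_location='cpu')
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
start_epoch = checkpoint['epoch']  # resume from the epoch stored at save time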
def __train_per_epoch(self, epoch_idx, steps_per_eval):
    # with tqdm(total=len(self.train_dataloader), desc=f"Epoch {epoch_idx}") as pbar:
    for batch_idx, batch in enumerate(self.train_dataloader):
        # assume that the whole input matrix fits the GPU memory
        global_step = epoch_idx * len(self.train_dataloader) + batch_idx
        training_set_loss, training_set_outputs, training_set_output_similarity = \
            self.__training_step(batch)
        if batch_idx + 1 == len(self.train_dataloader):
            # validate and save checkpoints
            developing_set_outputs, developing_set_metrics_scores, \
                developing_set_output_similarity = self.validate(self.dev_dataloader)
            # TODO: this part can be optimized to batchwise computing
            if self.record_training_loss_per_epoch:
                training_set_metrics_scores, _ = self.get_scores(
                    self.train_decoder, training_set_outputs,
                    self.train_dataloader.dataset.anchor_idx)
            else:
                training_set_metrics_scores = dict()
            training_set_metrics_scores['loss'] = training_set_loss.item()
            if self.scheduler:
                training_set_metrics_scores['learning_rate'] = \
                    self.scheduler.get_last_lr()[0]
            logx.metric('train', training_set_metrics_scores, global_step)
            logx.metric('val', developing_set_metrics_scores, global_step)
            # unwrap DataParallel before saving on multi-GPU
            model = self.model.module if self.n_gpu > 1 else self.model
            save_dict = {
                "model_construct_dict": model.config,
                "model_state_dict": model.state_dict(),
                "solver_construct_params_dict": self.construct_param_dict,
                "optimizer": self.optimizer.state_dict(),
                "train_scores": training_set_metrics_scores,
                "train_input_embedding": self.train_dataloader.dataset.x,
                "train_input_similarity": self.train_dataloader.dataset.input_similarity,
                "train_output_embedding": training_set_outputs,
                "train_output_similarity": training_set_output_similarity,
                "dev_scores": developing_set_metrics_scores,
                "dev_input_embeddings": self.dev_dataloader.dataset.x,
                "dev_input_similarity": self.dev_dataloader.dataset.input_similarity,
                "dev_output_embedding": developing_set_outputs,
                "dev_output_similarity": developing_set_output_similarity,
            }
            logx.save_model(save_dict,
                            metric=developing_set_metrics_scores['Recall@1'],
                            epoch=global_step,
                            higher_better=True)
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=args.batch_size,
                          shuffle=True)
valid_dataset = Train_Dataset('./data/new_valid.csv', './data/train/',
                              transform=valid_transformer)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=args.batch_size)

best_accuracy = 0
for epoch in range(args.epochs):
    print("epoch:" + str(epoch))
    train_acc, train_loss = train(my_model, train_loader, optimizer,
                                  scheduler=scheduler)
    metric_train = {'train_acc': train_acc, 'train_loss': train_loss}
    logx.metric('train', metric_train, epoch)
    # torch.save({'state_dict': my_model.state_dict()}, './weights/resnet50_last.pth')
    valid_acc, valid_loss = valid(my_model, valid_loader)
    metric_valid = {'valid_acc': valid_acc, 'valid_loss': valid_loss}
    logx.metric('val', metric_valid, epoch)
    if valid_acc > best_accuracy:
        best_accuracy = valid_acc
        torch.save({'state_dict': my_model.state_dict()},
                   './logs/exp9/highest_valid_acc.pth')
    logx.save_model({'state_dict': my_model.state_dict()}, valid_loss, epoch,
                    higher_better=False, delete_old=True)
    print("current_acc:{0}, best_acc:{1}".format(valid_acc, best_accuracy))
def train(train_loader, net, optim, curr_epoch, scaler):
    """
    Runs the training loop for one epoch
    train_loader: data loader for training
    net: the network
    optim: optimizer
    curr_epoch: current epoch
    scaler: AMP gradient scaler
    return: average loss, mean batch time, full epoch time
    """
    full_bt = time.perf_counter()
    net.train()

    train_main_loss = AverageMeter()
    start_time = None
    warmup_iter = 10
    optim.last_batch = len(train_loader) - 1
    btimes = []
    batch_time = time.perf_counter()

    for i, data in enumerate(train_loader):
        lr_warmup(optim, curr_epoch, i, len(train_loader), max_lr=0.4)
        if i <= warmup_iter:
            start_time = time.time()
        # inputs = (bs,3,713,713)
        # gts    = (bs,713,713)
        images, gts, _img_name, scale_float = data
        batch_pixel_size = images.size(0) * images.size(2) * images.size(3)
        images, gts, scale_float = images.cuda(), gts.cuda(), scale_float.cuda()
        inputs = {'images': images, 'gts': gts}

        optim.zero_grad()
        if args.amp:
            with amp.autocast():
                main_loss = net(inputs)
                log_main_loss = main_loss.clone().detach_()
                # torch.distributed.all_reduce(log_main_loss,
                #                              torch.distributed.ReduceOp.SUM)
                log_wait = optim.comm.Iallreduce(MPI.IN_PLACE, log_main_loss,
                                                 MPI.SUM)
                # log_main_loss = log_main_loss / args.world_size
                # train_main_loss.update(log_main_loss.item(), batch_pixel_size)
            scaler.scale(main_loss).backward()
        else:
            main_loss = net(inputs)
            main_loss = main_loss.mean()
            log_main_loss = main_loss.clone().detach_()
            log_wait = None
            # train_main_loss.update(log_main_loss.item(), batch_pixel_size)
            main_loss.backward()

        # the scaler update is within the optim step
        optim.step()

        if i >= warmup_iter:
            curr_time = time.time()
            batches = i - warmup_iter + 1
            batchtime = (curr_time - start_time) / batches
        else:
            batchtime = 0

        if log_wait is not None:
            log_wait.Wait()
            log_main_loss = log_main_loss / args.world_size
        train_main_loss.update(log_main_loss.item(), batch_pixel_size)

        msg = ('[epoch {}], [iter {} / {}], [train main loss {:0.6f}],'
               ' [lr {:0.6f}] [batchtime {:0.3g}]')
        msg = msg.format(curr_epoch, i + 1, len(train_loader),
                         train_main_loss.avg,
                         optim.local_optimizer.param_groups[-1]['lr'],
                         batchtime)
        logx.msg(msg)

        metrics = {'loss': train_main_loss.avg,
                   'lr': optim.local_optimizer.param_groups[-1]['lr']}
        curr_iter = curr_epoch * len(train_loader) + i
        logx.metric('train', metrics, curr_iter)

        if i >= 10 and args.test_mode:
            del data, inputs, gts
            return

        btimes.append(time.perf_counter() - batch_time)
        batch_time = time.perf_counter()

    if args.benchmarking:
        train_loss_tens = torch.tensor(train_main_loss.avg)
        optim.comm.Allreduce(MPI.IN_PLACE, train_loss_tens, MPI.SUM)
        train_loss_tens = train_loss_tens.to(torch.float)
        train_loss_tens /= float(optim.comm.size)
        train_main_loss.avg = train_loss_tens.item()

    return train_main_loss.avg, torch.mean(torch.tensor(btimes)), \
        time.perf_counter() - full_bt
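# train() above calls lr_warmup(...) with max_lr=0.4 on every iteration. A
# plausible linear warmup matching that call signature; this is only a sketch
# under the assumption that `optim` is the MPI optimizer wrapper whose
# underlying torch optimizer is `optim.local_optimizer` (as the loop's lr
# logging suggests), not the project's real schedule.
def lr_warmup(optim, curr_epoch, batch_idx, num_batches, max_lr=0.4,
              warmup_epochs=1):
    if curr_epoch >= warmup_epochs:
        return
    # ramp the learning rate linearly from 0 to max_lr over the warmup epochs
    progress = (curr_epoch * num_batches + batch_idx + 1) / \
               (warmup_epochs * num_batches)
    for group in optim.local_optimizer.param_groups:
        group['lr'] = max_lr * progress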
def train(train_loader, net, optim, curr_epoch):
    """
    Runs the training loop for one epoch
    train_loader: data loader for training
    net: the network
    optim: optimizer
    curr_epoch: current epoch
    """
    net.train()

    train_main_loss = AverageMeter()
    start_time = None
    warmup_iter = 10
    loss_metric = dict([('epoch', []), ('loss', []), ('lr', [])])

    for i, data in enumerate(train_loader):
        if i <= warmup_iter:
            start_time = time.time()
        # inputs = (bs,3,713,713)
        # gts    = (bs,713,713)
        images, gts, _img_name, scale_float = data
        batch_pixel_size = images.size(0) * images.size(2) * images.size(3)
        images, gts, scale_float = images.cuda(), gts.cuda(), scale_float.cuda()
        inputs = {'images': images, 'gts': gts}

        optim.zero_grad()
        main_loss = net(inputs)

        if args.apex:
            log_main_loss = main_loss.clone().detach_()
            torch.distributed.all_reduce(log_main_loss,
                                         torch.distributed.ReduceOp.SUM)
            log_main_loss = log_main_loss / args.world_size
        else:
            main_loss = main_loss.mean()
            log_main_loss = main_loss.clone().detach_()

        train_main_loss.update(log_main_loss.item(), batch_pixel_size)
        if args.fp16:
            with amp.scale_loss(main_loss, optim) as scaled_loss:
                scaled_loss.backward()
        else:
            main_loss.backward()

        optim.step()

        if i >= warmup_iter:
            curr_time = time.time()
            batches = i - warmup_iter + 1
            batchtime = (curr_time - start_time) / batches
        else:
            batchtime = 0

        msg = ('[epoch {}], [iter {} / {}], [train main loss {:0.6f}],'
               ' [lr {:0.6f}] [batchtime {:0.3g}]')
        msg = msg.format(curr_epoch, i + 1, len(train_loader),
                         train_main_loss.avg,
                         optim.param_groups[-1]['lr'], batchtime)
        logx.msg(msg)

        metrics = {'loss': train_main_loss.avg,
                   'lr': optim.param_groups[-1]['lr']}
        curr_iter = curr_epoch * len(train_loader) + i
        logx.metric('train', metrics, curr_iter)

        loss_metric['epoch'].append(curr_epoch)
        loss_metric['loss'].append(train_main_loss.avg)
        loss_metric['lr'].append(optim.param_groups[-1]['lr'])

        if i >= 10 and args.test_mode:
            del data, inputs, gts
            return

    del data
def train():
    for fold in range(5):  # five-fold cross-validation
        logx.initialize(get_logdir("../runs"), tensorboard=True, coolname=False)
        # warm-start from a previous checkpoint
        model.load_state_dict(
            torch.load("../runs/exp10/last_checkpoint_ep0.pth")['state_dict'])
        dataset_train = TrainDataset(
            '../' + cfg.root_folder + '/five_fold/train_kfold_{}.csv'.format(fold),
            '../' + cfg.root_folder + '/train/', train_transform)
        train_loader = DataLoader(dataset_train, batch_size=cfg.bs, shuffle=True)
        test_data = TrainDataset(
            '../' + cfg.root_folder + '/five_fold/test_kfold_{}.csv'.format(fold),
            '../' + cfg.root_folder + '/train/',
        )
        test_load = DataLoader(test_data, batch_size=cfg.bs, shuffle=False)

        # train
        for epoch in range(cfg.epoch):
            model.train()
            loss_epoch = 0
            total = 0
            correct = 0
            for i, (x, y) in enumerate(train_loader, 1):
                x, y = x.to(device), y.to(device)
                y_hat = model(x)
                # accumulate accuracy
                total += x.size(0)
                _, predict = torch.max(y_hat.data, dim=1)
                correct += (predict == y).sum().item()
                # loss
                loss = criterion(y_hat, y)
                loss_epoch += loss.item()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                # progress logging
                if i % 30 == 0:
                    print('epoch:%d, enumerate:%d, loss_avg:%f, now_acc:%f'
                          % (epoch, i, loss_epoch / i, correct / total))
            # per-epoch training metrics
            train_loss = loss_epoch / i
            train_acc = (correct / total) * 100
            logx.metric('train', {'loss': train_loss, 'acc': train_acc}, epoch)

            # validation accuracy on the held-out fold
            model.eval()
            correct = 0
            total = 0
            val_loss = 0
            with torch.no_grad():
                for i, (img, label) in enumerate(test_load, 1):
                    img, label = img.to(device), label.to(device)
                    output = model(img)
                    loss = criterion(output, label)
                    val_loss += loss.cpu().item()
                    _, predicted = torch.max(output.data, dim=1)  # max value, index
                    total += img.size(0)
                    correct += (predicted == label).sum().item()
            val_acc = (100 * correct / total)
            val_loss /= i
            logx.metric('val', {'loss': val_loss, 'acc': val_acc}, epoch)

            # per-epoch loss and other metrics
            print('epoch over; train_loss:%f, val_loss:%f, train_acc=%f, val_acc:%f'
                  % (train_loss, val_loss, train_acc, val_acc))
            logx.save_model({'state_dict': model.state_dict(), 'epoch': epoch},
                            val_acc, higher_better=True, epoch=epoch,
                            delete_old=True)
            scheduler.step()
def eval_metrics(iou_acc, args, net, optim, val_loss, epoch, mf_score=None):
    """
    Modified IOU mechanism for on-the-fly IOU calculations (prevents memory
    overflow for large datasets). Only applies to eval/eval.py
    """
    was_best = False

    iou_per_scale = {}
    iou_per_scale[1.0] = iou_acc
    if args.amp or args.apex:
        iou_acc_tensor = torch.cuda.FloatTensor(iou_acc)
        torch.distributed.all_reduce(iou_acc_tensor,
                                     op=torch.distributed.ReduceOp.SUM)
        iou_per_scale[1.0] = iou_acc_tensor.cpu().numpy()
    scales = [1.0]

    # Only rank 0 should save models and calculate metrics
    if args.global_rank != 0:
        return None, 0

    hist = iou_per_scale[args.default_scale]
    iu, acc, acc_cls = calculate_iou(hist)
    iou_per_scale = {args.default_scale: iu}

    # calculate iou for other scales
    for scale in scales:
        if scale != args.default_scale:
            iou_per_scale[scale], _, _ = calculate_iou(iou_per_scale[scale])

    print_evaluate_results(hist, iu, epoch=epoch,
                           iou_per_scale=iou_per_scale,
                           log_multiscale_tb=args.log_msinf_to_tb)

    freq = hist.sum(axis=1) / hist.sum()
    mean_iu = np.nanmean(iu)
    fwavacc = (freq[freq > 0] * iu[freq > 0]).sum()

    metrics = {
        'loss': val_loss.avg,
        'mean_iu': mean_iu,
        'acc_cls': acc_cls,
        'acc': acc,
    }
    logx.metric('val', metrics, epoch)
    logx.msg('Mean: {:2.2f}'.format(mean_iu * 100))

    save_dict = {
        'epoch': epoch,
        'arch': args.arch,
        'num_classes': cfg.DATASET_INST.num_classes,
        'state_dict': net.state_dict(),
        'optimizer': optim.lcl_optimizer.state_dict() if args.heat
                     else optim.state_dict(),
        'mean_iu': mean_iu,
        'command': ' '.join(sys.argv[1:])
    }
    logx.save_model(save_dict, metric=mean_iu, epoch=epoch)
    torch.cuda.synchronize()

    if mean_iu > args.best_record['mean_iu']:
        was_best = True
        args.best_record['val_loss'] = val_loss.avg
        if mf_score is not None:
            args.best_record['mask_f1_score'] = mf_score.avg
        args.best_record['acc'] = acc
        args.best_record['acc_cls'] = acc_cls
        args.best_record['fwavacc'] = fwavacc
        args.best_record['mean_iu'] = mean_iu
        args.best_record['epoch'] = epoch

    logx.msg('-' * 107)
    if mf_score is None:
        fmt_str = ('{:5}: [epoch {}], [val loss {:0.5f}], [acc {:0.5f}], '
                   '[acc_cls {:.5f}], [mean_iu {:.5f}], [fwavacc {:0.5f}]')
        current_scores = fmt_str.format('this', epoch, val_loss.avg, acc,
                                        acc_cls, mean_iu, fwavacc)
        logx.msg(current_scores)
        best_scores = fmt_str.format('best', args.best_record['epoch'],
                                     args.best_record['val_loss'],
                                     args.best_record['acc'],
                                     args.best_record['acc_cls'],
                                     args.best_record['mean_iu'],
                                     args.best_record['fwavacc'])
        logx.msg(best_scores)
    else:
        fmt_str = ('{:5}: [epoch {}], [val loss {:0.5f}], [mask f1 {:.5f}] '
                   '[acc {:0.5f}], '
                   '[acc_cls {:.5f}], [mean_iu {:.5f}], [fwavacc {:0.5f}]')
        current_scores = fmt_str.format('this', epoch, val_loss.avg,
                                        mf_score.avg, acc, acc_cls, mean_iu,
                                        fwavacc)
        logx.msg(current_scores)
        best_scores = fmt_str.format('best', args.best_record['epoch'],
                                     args.best_record['val_loss'],
                                     args.best_record['mask_f1_score'],
                                     args.best_record['acc'],
                                     args.best_record['acc_cls'],
                                     args.best_record['mean_iu'],
                                     args.best_record['fwavacc'])
        logx.msg(best_scores)
    logx.msg('-' * 107)

    return was_best, mean_iu
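# A sketch of the calculate_iou() helper used above, following the standard
# confusion-matrix formulation (hist[i, j] counts pixels of class i predicted
# as class j); a plausible reconstruction, not necessarily the project's code.
import numpy as np

def calculate_iou(hist):
    # per-class IoU: TP / (TP + FP + FN)
    iu = np.diag(hist) / (hist.sum(axis=1) + hist.sum(axis=0) - np.diag(hist))
    acc = np.diag(hist).sum() / hist.sum()  # overall pixel accuracy
    acc_cls = np.nanmean(np.diag(hist) / hist.sum(axis=1))  # mean per-class accuracy
    return iu, acc, acc_cls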
def train_net():
    header = ['epoch', 'train_loss', 'val_loss', 'val_dice', 'val_iou', 'lr',
              'time(s)']
    start_epoch, global_step, best_score, total_list = -1, 1, 0.0, []
    if args.vis:
        viz = Visualizer(port=args.port,
                         env=f"EXP_{args.exp_id}_NET_{args.arch}")

    # Resume the training process
    if args.resume:
        start_epoch = resume(args=args)

    # automatic mixed-precision training
    if args.amp_available:
        scaler = torch.cuda.amp.GradScaler()

    for epoch in range(start_epoch + 1, args.epochs):
        args.net.train()
        epoch_loss, epoch_start_time, rows = 0., time(), [epoch + 1]
        # get the current learning rate
        new_lr = get_lr(args=args, epoch=epoch)

        # Training process
        with tqdm(total=n_train, desc=f'Epoch-{epoch + 1}/{args.epochs}',
                  unit='img') as p_bar:
            for batch in train_loader:
                # args.optimizer.zero_grad()
                image, label = batch['image'], batch['label']
                assert image.shape[1] == args.n_channels

                # Prepare the image and the corresponding label.
                image = image.to(device=args.device, dtype=torch.float32)
                mask_type = torch.float32 if args.n_classes == 1 else torch.long
                label = label.to(device=args.device, dtype=mask_type)

                # Forward propagation.
                if args.amp_available:
                    with torch.cuda.amp.autocast():
                        try:
                            output = args.net(image)
                        except RuntimeError as exception:
                            if "out of memory" in str(exception):
                                print("WARNING: out of memory")
                                if hasattr(torch.cuda, 'empty_cache'):
                                    torch.cuda.empty_cache()
                                exit(0)
                            else:
                                raise exception
                        loss = criterion(output, label)
                else:
                    output = args.net(image)
                    loss = criterion(output, label)

                # Visualize the image.
                if args.vis:
                    try:
                        viz.img(name='ground_truth', img_=label[0])
                        tmp = output[0]
                        tmp[tmp > 0.5] = 1.0
                        tmp[tmp < 0.5] = 0.0
                        viz.img(name='prediction', img_=tmp)
                    except ConnectionError:
                        pass

                args.optimizer.zero_grad()
                # Back propagation.
                if args.amp_available:
                    scaler.scale(loss).backward()
                    scaler.step(args.optimizer)
                    scaler.update()
                else:
                    loss.backward()
                    args.optimizer.step()

                global_step += 1
                epoch_loss += loss.item()
                logx.add_scalar('Loss/train', loss.item(), global_step)
                p_bar.set_postfix(**{'loss (batch)': loss.item()})
                p_bar.update(image.shape[0])

        # Calculate the train loss
        train_loss = epoch_loss / (n_train // args.batch_size)
        metrics = {'train_loss': train_loss}
        logx.metric(phase='train', metrics=metrics, epoch=epoch)

        # Validation process
        val_score, val_loss = eval_net(criterion, logx, epoch, val_loader,
                                       n_val, args)

        # Update the current learning rate; if you use the ReduceLROnPlateau
        # scheduler, pass the monitored metric to step().
if args.sche != "Poly": args.scheduler.step() # Calculating and logging the metrics metrics = { 'val_loss': val_loss, 'iou': val_score['iou'], 'dc': val_score['dc'], 'sp': val_score['sp'], 'se': val_score['se'], 'acc': val_score['acc'], } logx.metric(phase='val', metrics=metrics, epoch=epoch) # Print the metrics print( "\033[1;33;44m=============================Evaluation result=============================\033[0m" ) logx.msg("[Train] Loss: %.4f | LR: %.6f" % (train_loss, new_lr)) logx.msg("[Valid] Loss: %.4f | ACC: %.4f | IoU: %.4f | DC: %.4f" % ( val_loss, metrics['acc'], metrics['iou'], metrics['dc'], )) rows += [train_loss, val_loss, metrics['dc'], metrics['iou'], new_lr] # Logging the image to tensorboard logx.add_image('image', torch.cat([i for i in image], 2), epoch) logx.add_image('label/gt', torch.cat([j for j in label], 2), epoch) logx.add_image('label/pd', torch.cat([k > 0.5 for k in output], 2), epoch) # Update the best score best_score, tm = update_score(args, best_score, val_score, logx, epoch, epoch_start_time) rows.append(tm) total_list.append(rows) # Saving the model with relevant parameters save_model(args, epoch, new_lr, interval=10) data = pd.DataFrame(total_list) file_path = os.path.join(os.path.join(args.dir_log, 'metrics.csv')) data.to_csv(file_path, header=header, index=False, mode='w', encoding='utf-8') plot_curve(file_path, args.dir_log, show=True)
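# A sketch of what plot_curve() above might do with the metrics.csv that
# train_net() writes: the column names follow the `header` list defined at the
# top of that function. The matplotlib usage here is illustrative, not the
# project's original plotting code.
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('logs/metrics.csv')
plt.plot(df['epoch'], df['train_loss'], label='train_loss')
plt.plot(df['epoch'], df['val_loss'], label='val_loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.savefig('logs/loss_curve.png')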