def train(train_loader, model, criterion, optimizer, args):
    model.train()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    running_metric_text = runningScore(2)
    running_metric_kernel = runningScore(2)

    end = time.time()
    for batch_idx, (imgs, gt_texts, gt_kernels, training_masks) in enumerate(train_loader):
        data_time.update(time.time() - end)

        imgs = Variable(imgs.cuda())
        gt_texts = Variable(gt_texts.cuda())
        gt_kernels = Variable(gt_kernels.cuda())
        training_masks = Variable(training_masks.cuda())

        outputs = model(imgs)
        texts = outputs[:, 0, :, :]
        kernels = outputs[:, 1:, :, :]

        loss = criterion(texts, gt_texts, kernels, gt_kernels, training_masks)
        losses.update(loss.item(), imgs.size(0))

        optimizer.zero_grad()
        loss.backward()
        if args.sr_lr is not None:
            updateBN(model, args)
        optimizer.step()

        score_text = cal_text_score(texts, gt_texts, training_masks, running_metric_text)
        score_kernel = cal_kernel_score(kernels, gt_kernels, gt_texts, training_masks,
                                        running_metric_kernel)

        batch_time.update(time.time() - end)
        end = time.time()

        if batch_idx % 20 == 0:
            output_log = '({batch}/{size}) Batch: {bt:.3f}s | TOTAL: {total:.0f}min | ETA: {eta:.0f}min | Loss: {loss:.4f} | Acc_t: {acc: .4f} | IOU_t: {iou_t: .4f} | IOU_k: {iou_k: .4f}'.format(
                batch=batch_idx + 1,
                size=len(train_loader),
                bt=batch_time.avg,
                total=batch_time.avg * batch_idx / 60.0,
                eta=batch_time.avg * (len(train_loader) - batch_idx) / 60.0,
                loss=losses.avg,
                acc=score_text['Mean Acc'],
                iou_t=score_text['Mean IoU'],
                iou_k=score_kernel['Mean IoU'])
            print(output_log)
            sys.stdout.flush()

    return (losses.avg, score_text['Mean Acc'], score_kernel['Mean Acc'],
            score_text['Mean IoU'], score_kernel['Mean IoU'])
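# `runningScore` and `AverageMeter` are used throughout these snippets but are not defined
# here. Below is a minimal sketch of what they are assumed to look like (confusion-matrix
# metrics in the style of pytorch-semseg), not the exact implementation used above. Note the
# exact score-dict keys vary across the snippets ('Mean IoU' here vs 'Mean IoU : \t' elsewhere).
import numpy as np


class AverageMeter(object):
    """Tracks a running average of a scalar (e.g. loss or batch time)."""

    def __init__(self):
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


class runningScore(object):
    """Accumulates a confusion matrix and derives pixel accuracy / IoU scores."""

    def __init__(self, n_classes):
        self.n_classes = n_classes
        self.confusion_matrix = np.zeros((n_classes, n_classes))

    def _fast_hist(self, label_true, label_pred, n_class):
        # histogram of (true, pred) pairs, ignoring labels outside [0, n_class)
        mask = (label_true >= 0) & (label_true < n_class)
        return np.bincount(
            n_class * label_true[mask].astype(int) + label_pred[mask].astype(int),
            minlength=n_class ** 2).reshape(n_class, n_class)

    def update(self, label_trues, label_preds):
        for lt, lp in zip(label_trues, label_preds):
            self.confusion_matrix += self._fast_hist(lt.flatten(), lp.flatten(), self.n_classes)

    def get_scores(self):
        hist = self.confusion_matrix
        acc = np.diag(hist).sum() / hist.sum()
        acc_cls = np.nanmean(np.diag(hist) / hist.sum(axis=1))
        iu = np.diag(hist) / (hist.sum(axis=1) + hist.sum(axis=0) - np.diag(hist))
        score = {'Overall Acc': acc, 'Mean Acc': acc_cls, 'Mean IoU': np.nanmean(iu)}
        class_iou = dict(zip(range(self.n_classes), iu))
        return score, class_iou

    def reset(self):
        self.confusion_matrix = np.zeros((self.n_classes, self.n_classes))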
def validate(valloader, net, criterion):
    # validation pass
    net.eval()

    # initialization
    n_classes = 3
    val_loss = 0
    running_metrics = runningScore(n_classes)
    running_metrics.reset()

    with torch.no_grad():
        pbar = tqdm(total=len(valloader), desc='Validation')
        for val_idx, data_samples in enumerate(valloader):
            volume, labels = data_samples['data'], data_samples['target']
            volume = volume.cuda()
            labels = labels.long().cuda()

            outputs = net(volume)
            val_loss += criterion(input_=outputs, target=labels)

            pred = outputs.data.max(1)[1].cpu().numpy()
            gt = labels.cpu().numpy()
            running_metrics.update(gt, pred)
            pbar.update(1)
        pbar.close()

    # average the loss over the number of validation batches
    validation_loss = val_loss / (val_idx + 1)
    # print("Validation Loss: {}".format(validation_loss))
    return (validation_loss, running_metrics.get_scores())
def eval_model(model, valid_dl, test_dl, wandb_log, args):
    device = next(model.parameters()).device
    model.eval()
    eval_running_metrics = [runningScore(20) for i in range(5)]

    with torch.no_grad():
        pbar = tqdm.tqdm(enumerate(valid_dl), total=len(valid_dl))
        for _, ((b_clear, ), (b_beta_005, b_beta_01, b_beta_02, ), (b_sparse, _, )) in pbar:
            for running_metrics, b_input in zip(eval_running_metrics[:4],
                                                [b_clear, b_beta_005, b_beta_01, b_beta_02]):
                b_sparse_pred = model(b_input.to(device)).argmax(1).cpu()
                running_metrics.update(b_sparse.numpy(), b_sparse_pred.numpy())
            pbar.set_description("Valid Epoch {:3d}".format(wandb_log.running_metrics_epoch_step))
        if wandb_log.use_wandb:
            for name, running_metrics in zip(['clear', 'beta_0.005', 'beta_0.01', 'beta_0.02'],
                                             eval_running_metrics[:4]):
                wandb_log.running_metrics_epoch_log(name, running_metrics)

        pbar = tqdm.tqdm(enumerate(test_dl), total=len(test_dl))
        for _, (b_input, b_sparse, _, ) in pbar:
            b_sparse_pred = model(b_input.to(device)).argmax(1).cpu()
            eval_running_metrics[-1].update(b_sparse.numpy(), b_sparse_pred.numpy())
            pbar.set_description("Test Epoch {:3d}".format(wandb_log.running_metrics_epoch_step))
        if wandb_log.use_wandb:
            wandb_log.running_metrics_epoch_log('testv2', eval_running_metrics[-1])

    for name, running_metrics in zip(['clear', 'beta_0.005', 'beta_0.01', 'beta_0.02', 'testv2'],
                                     eval_running_metrics):
        metrics, per_class_IoU = running_metrics.get_scores()
        pbar.write("{} Evaluation Metrics={}".format(name, metrics))
        pbar.write("{} Evaluation per_class_IoU={}".format(name, per_class_IoU))
    return eval_running_metrics
def validate(model, dataloader, checkpoint_path, save_path=None):
    since = time.time()

    # set the device to gpu if possible
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Testing Device: {}\n".format(device))
    model.to(device)

    metrics = runningScore(model.num_classes)

    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    # Iterate over data.
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            pred = outputs.data.max(1)[1].cpu().numpy()
            gt = labels.cpu().numpy()
            metrics.update(gt, pred)

    time_elapsed = time.time() - since

    score, class_iou = metrics.get_scores()
    for k, v in score.items():
        print(k, v)
    for i in range(model.num_classes):
        print(i, class_iou[i])
    return model
def main(test_args):
    testset = "/mnt/iusers01/eee01/mchiwml4/CamVid/test"
    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize(mean, std)])
    test_dataset = DataLoader(Loaddata(testset, transform=transform,
                                       target_transform=MaskToTensor()),
                              batch_size=1, shuffle=False, num_workers=8)

    label_num = 11
    model = linknetmodel.linknet(label_num)
    model = model.cuda()
    model.load_state_dict(torch.load(test_args.load_param))
    model.eval()

    total = np.zeros((label_num, ))
    running_metrics = runningScore(label_num)
    time_elapsed = 0

    for j, data in enumerate(test_dataset):
        since = time.time()
        inputs, labels = data
        inputs = Variable(inputs.cuda(), volatile=True)
        outputs = model(inputs)
        time_elapsed += time.time() - since

        pred = outputs.data.max(1)[1].cpu().numpy()
        gt = labels.numpy()
        running_metrics.update(gt, pred)

        for i in range(label_num):
            mask = gt == i  # ground truth mask of class i
            total[i] += np.sum(mask)  # total number of pixels of class i (tp+fn)

    print('Inference speed: {:.0f}ms, {:.0f}fps '.format(
        time_elapsed / len(test_dataset) * 1000,
        1 / (time_elapsed / len(test_dataset))))

    score, class_iou, class_acc = running_metrics.get_scores()
    for k, v in score.items():
        print(k, v)
    print('class iou: ')
    for i in range(label_num):
        print(i, class_iou[i])
    print('class acc: ')
    for i in range(label_num):
        print(i, class_acc[i])
    print('number of pixels:')
    print(total)
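# `MaskToTensor` (the target_transform above) is not defined in this snippet. A minimal
# sketch of the assumed helper: convert a PIL label image to a LongTensor of class ids
# without rescaling (unlike transforms.ToTensor, which would map values into [0, 1]).
import numpy as np
import torch


class MaskToTensor(object):
    def __call__(self, img):
        # keep the raw integer class ids intact
        return torch.from_numpy(np.array(img, dtype=np.int32)).long()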
def update_metric(outputs_dict, targets_dict, running_metrics, metric_fn_dict,
                  config, summary_all=False, prefix_note='train'):
    """
    update running_metrics and the metric_fn_dict summary

    running_metrics: updates seg mIoU and acc for the summary
    metric_fn_dict: updates aux/edge mIoU and acc for the summary
    """
    if summary_all:
        # convert tensors to numpy
        np_outputs_dict = {}
        for key, value in outputs_dict.items():
            np_outputs_dict[key] = torch.argmax(value, dim=1).data.cpu().numpy()
            if key not in metric_fn_dict.keys():
                if key.startswith(('seg', 'aux')):
                    metric_fn_dict[key] = runningScore(config.model.class_number)
                elif key.startswith('edge'):
                    metric_fn_dict[key] = runningScore(config.dataset.edge_class_num)
                else:
                    assert False, 'unexpected key %s in outputs_dict' % key

        np_targets_dict = {}
        for key, value in targets_dict.items():
            np_targets_dict[key] = value.data.cpu().numpy()

        # main metric, run for each epoch
        running_metrics.update(np_targets_dict['seg'], np_outputs_dict['seg'])
        for key, value in np_outputs_dict.items():
            if key.startswith(('seg', 'aux')):
                metric_fn_dict[key].update(np_targets_dict['seg'], value)
            elif key.startswith('edge'):
                metric_fn_dict[key].update(np_targets_dict['edge'], value)
            else:
                assert False, 'unexpected key %s in outputs_dict' % key
    else:
        # main metric, run for each epoch
        running_metrics.update(targets_dict['seg'].data.cpu().numpy(),
                               torch.argmax(outputs_dict['seg'], dim=1).data.cpu().numpy())

    return running_metrics, metric_fn_dict
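# Hypothetical usage sketch for update_metric (not from the original code). It assumes
# outputs_dict holds raw logits keyed by 'seg' / 'aux*' / 'edge*' and targets_dict holds
# integer label maps keyed by 'seg' / 'edge'; the class counts below are placeholders for
# config.model.class_number and config.dataset.edge_class_num, and `config` is the same
# experiment config object used by the surrounding code.
import torch

B, H, W = 2, 64, 64
seg_classes, edge_classes = 19, 2

outputs_dict = {
    'seg': torch.randn(B, seg_classes, H, W),      # main segmentation head
    'aux_1': torch.randn(B, seg_classes, H, W),    # auxiliary head, same label space as 'seg'
    'edge': torch.randn(B, edge_classes, H, W),    # edge head, its own label space
}
targets_dict = {
    'seg': torch.randint(0, seg_classes, (B, H, W)),
    'edge': torch.randint(0, edge_classes, (B, H, W)),
}

running_metrics = runningScore(seg_classes)
metric_fn_dict = {}
running_metrics, metric_fn_dict = update_metric(
    outputs_dict, targets_dict, running_metrics, metric_fn_dict,
    config, summary_all=True, prefix_note='train')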
def eval(cfg):
    # Setup seeds
    torch.manual_seed(cfg.get("seed", 1337))
    torch.cuda.manual_seed(cfg.get("seed", 1337))
    np.random.seed(cfg.get("seed", 1337))
    random.seed(cfg.get("seed", 1337))

    # Setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Setup evaluation data
    data_eval_images = utils.recursive_glob(os.path.join(cfg["data"]["path"], 'images'))
    # data_eval_labels = utils.recursive_glob(os.path.join(cfg["data"]["path"], 'labels'))

    # Setup model
    model = DRCN(cfg).to(device)
    checkpoint = torch.load(cfg["training"]["checkpoint"])
    model.load_state_dict(checkpoint["model_state"])

    # Setup Metrics and visualizer
    running_metrics_val = runningScore(cfg["data"]["n_classes"])

    # Start evaluation
    utils.mkdirs(cfg["training"]["checkpoint"])
    s = cfg["data"]["img_rows"]
    for img_name in tqdm.tqdm(data_eval_images):
        img = np.array(Image.open(img_name))
        lbl = np.array(Image.open(img_name.replace('images', 'labels')))
        w, h, _ = img.shape
        out = np.zeros((6, w, h))

        # slide an s x s window with stride 200 and accumulate the logits
        for x in range(0, w - s, 200):
            for y in range(0, h - s, 200):
                img_input, lbl_input = threeCityLoader.transform(
                    img[x:x + s, y:y + s, :], lbl[x:x + s, y:y + s])
                model.set_input(img_input.unsqueeze(0), lbl_input.unsqueeze(0))
                model.inference()
                out[:, x:x + s, y:y + s] += model.out1.cpu().detach().numpy().squeeze()

        max_x = (w - s) // 200 * 200
        max_y = (h - s) // 200 * 200
        pred = out[:, :max_x, :max_y]
        pred = pred.argmax(0).squeeze()
        gt = lbl[:max_x, :max_y]
        running_metrics_val.update(gt, pred)

    score, class_iou = running_metrics_val.get_scores()
    for k, v in score.items():
        print(k, v)
    for k, v in class_iou.items():
        print("{}: {}".format(k, v))
    running_metrics_val.reset()
def train(cfg, logger):
    # Setup seeds
    torch.manual_seed(cfg.get("seed", 1337))
    torch.cuda.manual_seed(cfg.get("seed", 1337))
    np.random.seed(cfg.get("seed", 1337))
    random.seed(cfg.get("seed", 1337))

    # Setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Setup Dataloader
    loader_train = get_loader(cfg, "train")
    loader_val = get_loader(cfg, "val")

    # Setup model
    model = DRCN(cfg).to(device)

    start_epoch = 1
    if cfg["training"]["resume"] is not None:
        if os.path.isfile(cfg["training"]["resume"]):
            print("Loading model and optimizer from checkpoint '{}'".format(
                cfg["training"]["resume"]))
            checkpoint = torch.load(cfg["training"]["resume"])
            model.load_state_dict(checkpoint["model_state"])
            start_epoch = checkpoint["epoch"]
            del checkpoint
        else:
            print("No checkpoint found at '{}'".format(cfg["training"]["resume"]))

    # Setup Metrics and visualizer
    running_metrics_val = runningScore(cfg["data"]["n_classes"])
    val_loss1_meter = averageMeter()
    val_loss2_meter = averageMeter()
    opt = BaseOptions()
    visualizer = Visualizer(opt)

    # Start training
    utils.mkdirs(cfg["training"]["checkpoint"])
    best_iou = -100.0
    epoch = start_epoch
    train_epochs = cfg["training"]["epochs"]
    iters_per_epoch = len(loader_train)

    while epoch < train_epochs:
        visualizer.reset()
        for iter, (images, labels) in enumerate(loader_train):
            model.set_input(images, labels)
            model.optimize_parameters()

            if iter % cfg["training"]["print_interval"] == 0 and iter != 0:
                print_info = "Epoch:[{:2d}/{:2d}] Iter: [{:4d}/{:4d}] loss1: {:.5f} loss2: {:.5f} lr: {:.5f}"\
                    .format(epoch, train_epochs, iter, iters_per_epoch,
                            model.loss1.item(), model.loss2.item(),
                            model.optimizer1.defaults['lr'])
                print(print_info)

            if iter % cfg["training"]["val_interval"] == 0 and iter != 0:
                for images, labels in loader_val:
                    model.set_input(images, labels)
                    model.inference()
                    preds = torch.argmax(model.out1, 1).cpu().numpy()
                    labels = labels.data.numpy().squeeze()
                    running_metrics_val.update(labels, preds)
                    val_loss1_meter.update(model.loss1.item())
                    val_loss2_meter.update(model.loss2.item())

                # visualizer.display_current_results(model.get_current_visuals(), epoch, save_result)
                losses = {
                    'loss1': val_loss1_meter.avg,
                    '5loss2': val_loss2_meter.avg * 5
                }
                score, class_iou = running_metrics_val.get_scores()
                accs = []
                accs.append(score["Overall Acc: \t"])
                accs.extend(list(class_iou.values()))
                accs = dict(zip(AccNames, accs))

                tmp = iter / iters_per_epoch
                visualizer.plot_current_losses(epoch, tmp, losses)
                visualizer.plot_current_accuracy(epoch, tmp, accs)

                logger.info("Epoch:{:03d} val_loss1:{:.05f} val_loss2:{:.05f}".format(
                    epoch, val_loss1_meter.avg, val_loss2_meter.avg))
                for k, v in score.items():
                    print(k, v)
                    logger.info("{}: {}".format(k, v))
                for k, v in class_iou.items():
                    print("{}: {}".format(k, v))
                    logger.info("{}: {}".format(k, v))
                running_metrics_val.reset()

                if score["Mean IoU : \t"] >= best_iou:
                    best_iou = score["Mean IoU : \t"]
                    state = {
                        "epoch": epoch,
                        "model_state": model.state_dict(),
                        "optimizer1_state": model.optimizer1.state_dict(),
                        "scheduler1_state": model.scheduler1.state_dict(),
                        "optimizer2_state": model.optimizer2.state_dict(),
                        "scheduler2_state": model.scheduler2.state_dict(),
                        "best_iou": best_iou,
                    }
                    save_path = os.path.join(
                        cfg["training"]["checkpoint"],
                        "{}_{}_best_model.pkl".format(cfg["model"]["arch"],
                                                      cfg["data"]["dataset"]),
                    )
                    torch.save(state, save_path)
        epoch += 1
def keras_fit(model, train_loader=None, val_loader=None, config=None):
    """
    target to a multiple-output model
    remove args (deprecated)
    """
    # support for a pure model without config
    if config is None:
        config = model.config

    # support for cpu/gpu
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    if config.args.checkpoint_path is not None:
        ckpt_path = get_ckpt_path(config.args.checkpoint_path)
        print('load checkpoint file from', ckpt_path)
        state_dict = torch.load(ckpt_path)
        if 'model_state' in state_dict.keys():
            model.load_state_dict(state_dict['model_state'])
        else:
            model.load_state_dict(state_dict)

    optimizer = get_optimizer(model, config)
    scheduler = get_scheduler(optimizer, config)

    if config.args.center_loss is not None:
        center_loss_model = CenterLoss(model.center_channels, model.class_number,
                                       ignore_index=config.dataset.ignore_index,
                                       loss_fn=config.args.center_loss).to(device)
        center_optimizer = torch.optim.SGD(center_loss_model.parameters(), lr=0.5)
    else:
        center_loss_model = None
        center_optimizer = None

    loss_fn_dict = get_loss_fn_dict(config)

    # for the different outputs, generate the metric_fn_dict automatically
    metric_fn_dict = {}
    # metric for the main output
    running_metrics = runningScore(config.model.class_number)

    time_str = time.strftime("%Y-%m-%d___%H-%M-%S", time.localtime())
    log_dir = os.path.join(config.args.log_dir, model.name,
                           config.dataset.name, config.args.note, time_str)
    # checkpoint_path = os.path.join(
    #     log_dir, "{}_{}_best_model.pkl".format(model.name, config.dataset.name))
    writer = None
    best_iou = 0.0

    # create loaders from config
    if train_loader is None and val_loader is None:
        train_loader, val_loader = get_loader(config)
    loaders = [train_loader, val_loader]
    loader_names = ['train', 'val']

    # support for multiple gpus; the model will be wrapped, so model.name will not exist
    if device.type == 'cuda':
        gpu_num = torch.cuda.device_count()
        if gpu_num > 1:
            device_ids = [i for i in range(gpu_num)]
            model = torch.nn.DataParallel(model, device_ids=device_ids)

    # eval-only mode
    if train_loader is None:
        config.args.n_epoch = 1

    summary_all_step = max(1, config.args.n_epoch // 10)
    # 1 <= summary_metric_step <= 10
    summary_metric_step = max(min(10, config.args.n_epoch // 10), 1)

    tqdm_epoch = trange(config.args.n_epoch, desc='epochs', leave=True)
    for epoch in tqdm_epoch:
        tqdm_epoch.set_postfix(best_iou=best_iou)
        for loader, loader_name in zip(loaders, loader_names):
            if loader is None:
                continue

            # summary all only ~10 times over training
            if epoch % summary_all_step == 0:
                summary_all = True
                summary_metric = True
            else:
                summary_all = False
                if epoch % summary_metric_step == 0 or epoch == config.args.n_epoch - 1:
                    summary_metric = True
                else:
                    summary_metric = False
            # summary_all=True ==> summary_metric=True

            if loader_name == 'val':
                # val at summary time, and val for the plateau scheduler
                if (not summary_metric) and (scheduler is None):
                    continue
                with torch.no_grad():
                    outputs_dict, targets_dict, \
                        running_metrics, metric_fn_dict, \
                        grads_dict, losses_dict, loss_weight_dict = train_val(
                            model=model,
                            optimizer=optimizer,
                            scheduler=scheduler,
                            loss_fn_dict=loss_fn_dict,
                            metric_fn_dict=metric_fn_dict,
                            running_metrics=running_metrics,
                            loader=loader,
                            config=config,
                            epoch=epoch,
                            summary_all=summary_all,
                            summary_metric=summary_metric,
                            loader_name=loader_name,
                            center_loss_model=center_loss_model,
                            center_optimizer=center_optimizer)
                # use rop/poly_rop to schedule the learning rate
                if isinstance(scheduler, (poly_rop, rop)):
                    total_loss = sum(losses_dict['%s/total_loss' % loader_name])
                    scheduler.step(total_loss)
            else:
                outputs_dict, targets_dict, \
                    running_metrics, metric_fn_dict, \
                    grads_dict, losses_dict, loss_weight_dict = train_val(
                        model=model,
                        optimizer=optimizer,
                        scheduler=scheduler,
                        loss_fn_dict=loss_fn_dict,
                        metric_fn_dict=metric_fn_dict,
                        running_metrics=running_metrics,
                        loader=loader,
                        config=config,
                        epoch=epoch,
                        summary_all=summary_all,
                        summary_metric=summary_metric,
                        loader_name=loader_name,
                        center_loss_model=center_loss_model,
                        center_optimizer=center_optimizer)
                # use cos_lr to schedule the learning rate
                if isinstance(scheduler, cos_lr):
                    scheduler.step()

            metric_dict, class_iou_dict = get_metric(
                running_metrics, metric_fn_dict,
                summary_all=summary_all,
                prefix_note=loader_name,
                summary_metric=summary_metric)

            if loader_name == 'val' and summary_metric:
                val_iou = metric_dict['val/iou']
                tqdm.write('epoch %d, current val iou is %0.5f' % (epoch, val_iou))
                iou_save_threshold = config.args.iou_save_threshold
                if val_iou >= best_iou:
                    best_iou = val_iou
                    # save the best model if it is good enough
                    if best_iou >= iou_save_threshold:
                        print('save current best model', '*' * 30)
                        checkpoint_path = os.path.join(log_dir, 'model-best-%d.pkl' % epoch)
                        save_model_if_necessary(model, config, checkpoint_path)

                # save the last model if the best model is not good enough
                if epoch == config.args.n_epoch - 1 and best_iou < iou_save_threshold:
                    print('save the last model', '*' * 30)
                    checkpoint_path = os.path.join(log_dir, 'model-last-%d.pkl' % epoch)
                    save_model_if_necessary(model, config, checkpoint_path)

            # return valid images when summary_all=True
            image_dict = get_image_dict(outputs_dict, targets_dict, config,
                                        summary_all=summary_all, prefix_note=loader_name)

            if writer is None:
                writer = init_writer(config=config, log_dir=log_dir)

            # log loss weights and learning rate (train only)
            if loader_name == 'train':
                weight_dict = {}
                for k, v in loss_weight_dict.items():
                    weight_dict['%s/weight_%s' % (loader_name, k)] = v
                lr_dict = get_lr_dict(optimizer, prefix_note=loader_name)
            else:
                weight_dict = {}
                lr_dict = {}

            write_summary(writer=writer,
                          losses_dict=losses_dict,
                          metric_dict=metric_dict,
                          class_iou_dict=class_iou_dict,
                          lr_dict=lr_dict,
                          image_dict=image_dict,
                          weight_dict=weight_dict,
                          grads_dict=grads_dict,
                          epoch=epoch)

    writer.close()
    print('total epoch is %d, best iou is' % config.args.n_epoch, best_iou)
    return best_iou
def _train_epoch(self, epoch):
    self.model.train()
    epoch_start = time.time()
    batch_start = time.time()
    train_loss = 0.
    running_metric_text = runningScore(2)
    lr = self.optimizer.param_groups[0]['lr']

    for i, batch in enumerate(self.train_loader):
        if i >= self.train_loader_len:
            break
        self.global_step += 1
        lr = self.optimizer.param_groups[0]['lr']

        # move the batch tensors to the GPU
        for key, value in batch.items():
            if value is not None:
                if isinstance(value, torch.Tensor):
                    batch[key] = value.to(self.device)

        cur_batch_size = batch['img'].size()[0]

        preds = self.model(batch['img'])
        loss_dict = self.criterion(preds, batch)

        # backward
        self.optimizer.zero_grad()
        loss_dict['loss'].backward()
        self.optimizer.step()
        if self.config.lr_scheduler_type == 'WarmupPolyLR':
            self.scheduler.step()

        # acc / iou of the shrink map
        score_shrink_map = cal_text_score(
            preds[:, 0, :, :],
            batch['shrink_map'],
            batch['shrink_mask'],
            running_metric_text,
            thred=self.config.post_processing_thresh)

        # record loss and acc in the log
        loss_str = 'loss: {:.4f}, '.format(loss_dict['loss'].item())
        for idx, (key, value) in enumerate(loss_dict.items()):
            loss_dict[key] = value.item()
            if key == 'loss':
                continue
            loss_str += '{}: {:.4f}'.format(key, loss_dict[key])
            if idx < len(loss_dict) - 1:
                loss_str += ', '

        train_loss += loss_dict['loss']
        acc = score_shrink_map['Mean Acc']
        iou_shrink_map = score_shrink_map['Mean IoU']

        if self.global_step % self.log_iter == 0:
            batch_time = time.time() - batch_start
            self.logger_info(
                '[{}/{}], [{}/{}], global_step: {}, speed: {:.1f} samples/sec, acc: {:.4f}, iou_shrink_map: {:.4f}, {}, lr:{:.6}, time:{:.2f}'
                .format(epoch, self.epochs, i + 1, self.train_loader_len,
                        self.global_step, self.log_iter * cur_batch_size / batch_time,
                        acc, iou_shrink_map, loss_str, lr, batch_time))
            batch_start = time.time()

    return {
        'train_loss': train_loss / self.train_loader_len,
        'lr': lr,
        'time': time.time() - epoch_start,
        'epoch': epoch
    }
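# `cal_text_score` (used above and in the first snippet) is not defined in these snippets.
# Below is a minimal sketch of the assumed behaviour: binarize the predicted text/shrink map,
# restrict prediction and ground truth to the training mask, and update the shared
# runningScore object. The sigmoid + fixed threshold and the exact masking are assumptions,
# not the original code.
import torch


def cal_text_score(texts, gt_texts, training_masks, running_metric_text, thred=0.5):
    masks = training_masks.data.cpu().numpy()
    # predicted text probability, restricted to pixels kept by the training mask
    pred_text = torch.sigmoid(texts).data.cpu().numpy() * masks
    pred_text[pred_text <= thred] = 0
    pred_text[pred_text > thred] = 1
    gt_text = gt_texts.data.cpu().numpy() * masks
    running_metric_text.update(gt_text.astype(int), pred_text.astype(int))
    # return the score dict ('Mean Acc', 'Mean IoU', ...) expected by the callers
    return running_metric_text.get_scores()[0]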
def train(cfg, writer, logger):
    # Setup seeds
    torch.manual_seed(cfg.get("seed", 1337))
    torch.cuda.manual_seed(cfg.get("seed", 1337))
    np.random.seed(cfg.get("seed", 1337))
    random.seed(cfg.get("seed", 1337))

    # Setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Setup Dataloader
    trainloader = get_loader(cfg, "train")
    valloader = get_loader(cfg, "val")
    n_classes = cfg["data"]["n_classes"]
    n_channels = cfg["data"]["channels"]

    # Setup Metrics
    running_metrics_val = runningScore(n_classes)

    # Setup Model
    model = get_model(cfg, n_classes, n_channels).to(device)
    model = torch.nn.DataParallel(model, device_ids=range(torch.cuda.device_count()))

    # Setup optimizer, lr_scheduler and loss function
    optimizer_cls = get_optimizer(cfg)
    optimizer_params = {
        k: v
        for k, v in cfg["training"]["optimizer"].items() if k != "name"
    }
    optimizer = optimizer_cls(model.parameters(), **optimizer_params)
    logger.info("Using optimizer {}".format(optimizer))

    scheduler = get_scheduler(optimizer, cfg["training"]["lr_schedule"])
    loss_fn = get_loss_function(cfg)
    logger.info("Using loss {}".format(loss_fn))

    start_iter = 0
    if cfg["training"]["resume"] is not None:
        if os.path.isfile(cfg["training"]["resume"]):
            logger.info("Loading model and optimizer from checkpoint '{}'".format(
                cfg["training"]["resume"]))
            checkpoint = torch.load(cfg["training"]["resume"])
            model.module.load_state_dict(checkpoint["model_state"])
            optimizer.load_state_dict(checkpoint["optimizer_state"])
            scheduler.load_state_dict(checkpoint["scheduler_state"])
            start_iter = checkpoint["epoch"]
            logger.info("Loaded checkpoint '{}' (iter {})".format(
                cfg["training"]["resume"], checkpoint["epoch"]))
        else:
            logger.info("No checkpoint found at '{}'".format(cfg["training"]["resume"]))

    val_loss_meter = averageMeter()
    time_meter = averageMeter()

    best_iou = -100.0
    i = start_iter
    flag = True

    # fig = plt.figure()
    # plt.rcParams['xtick.major.pad'] = '15'
    # fig.show()
    # fig.canvas.draw()

    while i <= cfg["training"]["train_iters"] and flag:
        for (images, labels) in trainloader:
            i += 1
            start_ts = time.time()
            model.train()
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = loss_fn(input=outputs, target=labels)
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            # plot_grad_flow(model.named_parameters(), fig)
            optimizer.step()
            scheduler.step()

            # zero-mean conv for layer 1 of the DSM encoder
            # m = model._modules['module'].encoderDSM._modules['0']._modules['0']
            # model._modules['module'].encoderDSM._modules['0']._modules['0'].weight = m.weight - torch.mean(m.weight)
            model = zero_mean(model, all=False)

            time_meter.update(time.time() - start_ts)

            if (i + 1) % cfg["training"]["print_interval"] == 0:
                fmt_str = "Iter [{:d}/{:d}] Loss: {:.4f} Time/Image: {:.4f}"
                print_str = fmt_str.format(
                    i + 1,
                    cfg["training"]["train_iters"],
                    loss.item(),
                    time_meter.avg / cfg["training"]["batch_size"],
                )
                print(print_str)
                logger.info(print_str)
                writer.add_scalar("loss/train_loss", loss.item(), i + 1)
                time_meter.reset()

            if (i + 1) % cfg["training"]["val_interval"] == 0 or \
                    (i + 1) == cfg["training"]["train_iters"]:
                model.eval()
                with torch.no_grad():
                    for i_val, (images_val, labels_val) in tqdm(enumerate(valloader)):
                        images_val = images_val.to(device)
                        labels_val = labels_val.to(device)

                        outputs = model(images_val)
                        val_loss = loss_fn(input=outputs, target=labels_val)

                        pred = outputs.data.max(1)[1].cpu().numpy()
                        gt = labels_val.data.cpu().numpy()
                        # plt.imshow(v_loader.decode_segmap(gt[0, :, :]))
                        # plt.imshow(v_loader.decode_segmap(pred[0, :, :]))

                        running_metrics_val.update(gt, pred)
                        val_loss_meter.update(val_loss.item())

                writer.add_scalar("loss/val_loss", val_loss_meter.avg, i + 1)
                logger.info("Iter %d Loss: %.4f" % (i + 1, val_loss_meter.avg))

                score, class_iou = running_metrics_val.get_scores()
                for k, v in score.items():
                    # print(k, v)
                    logger.info("{}: {}".format(k, v))
                    writer.add_scalar("val_metrics/{}".format(k), v, i + 1)

                for k, v in class_iou.items():
                    logger.info("{}: {}".format(k, v))
                    writer.add_scalar("val_metrics/cls_{}".format(k), v, i + 1)

                val_loss_meter.reset()
                running_metrics_val.reset()

                if score["Mean IoU : \t"] >= best_iou:
                    best_iou = score["Mean IoU : \t"]
                    state = {
                        "epoch": i + 1,
                        "model_state": model.state_dict(),
                        "optimizer_state": optimizer.state_dict(),
                        "scheduler_state": scheduler.state_dict(),
                        "best_iou": best_iou,
                    }
                    save_path = os.path.join(
                        writer.file_writer.get_logdir(),
                        "{}_{}_best_model.pkl".format(cfg["model"]["arch"],
                                                      cfg["data"]["dataset"]),
                    )
                    torch.save(state, save_path)

            if (i + 1) == cfg["training"]["train_iters"]:
                flag = False
                break
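# `zero_mean` is not defined in these snippets. A minimal sketch of the assumed behaviour,
# based on the commented-out lines above: subtract the mean of the weight tensor from the
# first convolution of the DSM (depth) encoder so that it has zero mean; with all=True every
# convolution would be re-centered. The module path `model.module.encoderDSM[0][0]` and the
# all=True branch are assumptions.
import torch


def zero_mean(model, all=False):
    with torch.no_grad():
        if all:
            convs = [m for m in model.modules() if isinstance(m, torch.nn.Conv2d)]
        else:
            # first conv layer of the DSM encoder, as in the commented-out code above
            convs = [model.module.encoderDSM[0][0]]
        for conv in convs:
            conv.weight.data -= conv.weight.data.mean()
    return model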
def train_net(config):
    os.environ["CUDA_VISIBLE_DEVICES"] = config['pruned']['gpu_id']

    data_loader = DataLoader(config)
    train_loader = torch.utils.data.DataLoader(
        data_loader,
        batch_size=config['train']['batch_size'],
        shuffle=True,
        num_workers=config['train']['num_workers'],
        worker_init_fn=worker_init_fn,
        drop_last=True,
        pin_memory=False)

    start_epoch = 0
    running_metric_binary = runningScore(2)

    if not os.path.exists(config['train']['checkpoints']):
        os.mkdir(config['train']['checkpoints'])
    checkpoints = os.path.join(
        config['pruned']['save_checkpoints'],
        "DB_%s_bs_%d_ep_%d" % (config['train']['backbone'],
                               config['train']['batch_size'],
                               config['train']['n_epoch']))
    if not os.path.exists(checkpoints):
        os.mkdir(checkpoints)

    model = DBNet(config)
    criterion = L1BalanceCELoss()
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=config['pruned']['finetune_lr'],
                                momentum=0.99,
                                weight_decay=5e-4)

    if config['pruned']['restore']:
        print('Resuming from checkpoint.')
        assert os.path.isfile(config['pruned']['resume']), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(config['pruned']['resume'])
        start_epoch = checkpoint['epoch']
        model = load_prune_model(model, config['pruned']['checkpoints_dict']).cuda()
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        log_write = Logger(os.path.join(checkpoints, 'log.txt'),
                           title=config['train']['backbone'],
                           resume=True)
    else:
        print('Training from scratch.')
        model_dict = torch.load(config['pruned']['pruned_checkpoints'])
        model = load_prune_model(model, config['pruned']['checkpoints_dict']).cuda()
        print(model)
        try:
            model.load_state_dict(model_dict)
        except:
            # checkpoints saved from DataParallel models carry a 'module.' prefix
            state = model.state_dict()
            for key in state.keys():
                state[key] = model_dict['module.' + key]
            model.load_state_dict(state)
        log_write = Logger(os.path.join(checkpoints, 'log.txt'),
                           title=config['train']['backbone'])
        log_write.set_names([
            ' epoch', 'Total loss', ' Bce loss', 'Thresh loss', ' L1 loss',
            'Binary Acc', 'Binary IoU', ' recall', ' precision', ' hmean'
        ])

    max_hmean = -1
    for epoch in range(start_epoch, config['pruned']['n_epoch']):
        model.train()
        bce_loss_list = []
        thresh_loss_list = []
        l1_loss_list = []
        total_loss_list = []

        if config['train']['decay_method'] == 'e_decay':
            adjust_learning_rate_poly(config['pruned']['finetune_lr'],
                                      optimizer,
                                      epoch,
                                      max_epoch=config['pruned']['n_epoch'],
                                      factor=0.9)
        else:
            adjust_learning_rate(config, optimizer, epoch, config['train']['gama'])

        for batch_idx, (imgs, gts, gt_masks, thresh_maps, thresh_masks) in enumerate(train_loader):
            imgs = Variable(imgs.cuda())
            gts = Variable(gts.cuda())
            gt_masks = Variable(gt_masks.cuda())
            thresh_maps = Variable(thresh_maps.cuda())
            thresh_masks = Variable(thresh_masks.cuda())

            batch = {}
            batch['gt'] = gts
            batch['mask'] = gt_masks
            batch['thresh_map'] = thresh_maps
            batch['thresh_mask'] = thresh_masks

            pre = model(imgs)
            loss, metrics = criterion(pre, batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            score_binary = cal_binary_score(pre['binary'], gts,
                                            gt_masks.unsqueeze(1),
                                            running_metric_binary)
            bce_loss_list.append(metrics['bce_loss'].item())
            thresh_loss_list.append(metrics['thresh_loss'].item())
            l1_loss_list.append(metrics['l1_loss'].item())
            total_loss_list.append(loss.item())

            if batch_idx % config['train']['show_step'] == 0:
                if config['train']['print_format'] == 'linux':
                    headers = [
                        'epoch/epochs', 'batch/batchs', 'TotalLoss', 'BceLoss',
                        ' ThreshLoss', 'L1Loss', 'Binary Acc', 'Binary IoU', 'Lr Rate'
                    ]
                    show_item = [[
                        str(epoch) + '/' + str(config['pruned']['n_epoch']),
                        str(batch_idx + 1) + '/' + str(len(train_loader)),
                        get_str(np.mean(total_loss_list)),
                        get_str(np.mean(bce_loss_list)),
                        get_str(np.mean(thresh_loss_list)),
                        get_str(np.mean(l1_loss_list)),
                        get_str(score_binary['Mean Acc']),
                        get_str(score_binary['Mean IoU']),
                        get_str(optimizer.param_groups[0]['lr'])
                    ]]
                    print_table(headers, show_item, type_str='train')
                else:
                    output_log = '({epoch}/{epochs}/{batch}/{size}) | TotalLoss: {total_loss:.4f} | BceLoss: {bce_loss:.4f} | ThreshLoss: {thresh_loss: .4f} | L1Loss: {l1_loss: .4f} | Binary Acc: {bin_acc: .4f} | Binary IoU: {bin_iou: .4f} | Lr: {lr: .4f}'.format(
                        epoch=epoch,
                        epochs=config['pruned']['n_epoch'],
                        batch=batch_idx + 1,
                        size=len(train_loader),
                        total_loss=np.mean(total_loss_list),
                        bce_loss=np.mean(bce_loss_list),
                        thresh_loss=np.mean(thresh_loss_list),
                        l1_loss=np.mean(l1_loss_list),
                        bin_acc=score_binary['Mean Acc'],
                        bin_iou=score_binary['Mean IoU'],
                        lr=optimizer.param_groups[0]['lr'])
                    print(output_log)
                    sys.stdout.flush()

        if epoch > config['pruned']['start_val_epoch']:
            result_dict = val(model, config)
            recall, precision, hmean = (result_dict['recall'],
                                        result_dict['precision'],
                                        result_dict['hmean'])
            print('epoch:', epoch, 'recall:', recall, 'precision:', precision, 'hmean:', hmean)
        else:
            recall = 0
            precision = 0
            hmean = 0

        log_write.append([
            epoch,
            np.mean(total_loss_list),
            np.mean(bce_loss_list),
            np.mean(thresh_loss_list),
            np.mean(l1_loss_list),
            score_binary['Mean Acc'],
            score_binary['Mean IoU'], recall, precision, hmean
        ])

        if hmean > max_hmean and config['pruned']['start_val_epoch'] < config['pruned']['n_epoch']:
            max_hmean = hmean
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'lr': optimizer.param_groups[0]['lr'],
                    'optimizer': optimizer.state_dict(),
                },
                checkpoint=checkpoints,
                filename='best_model.pth.tar')

        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'lr': optimizer.param_groups[0]['lr'],
                'optimizer': optimizer.state_dict(),
            },
            checkpoint=checkpoints)
def do_train_or_val(net, args=None, train_loader=None, val_loader=None):
    gpu_config = tf.ConfigProto()
    # gpu_config.gpu_options.per_process_gpu_memory_fraction = 0.3
    gpu_config.gpu_options.allow_growth = True
    session = tf.Session(config=gpu_config)
    KTF.set_session(session)
    session.run(tf.global_variables_initializer())

    if args is None:
        args = net.config.training

    metrics = net.get_metrics(net.class_number)
    opt = net.get_optimizer(args)
    net.model.compile(loss='categorical_crossentropy',
                      optimizer=opt,
                      metrics=metrics)

    if train_loader is None and val_loader is None:
        train_dataset = dataset_generalize(net.config.dataset, split='train', bchw=False)
        train_loader = TD.DataLoader(dataset=train_dataset,
                                     batch_size=net.config.dataset.batch_size,
                                     shuffle=True,
                                     drop_last=False)

        val_dataset = dataset_generalize(net.config.dataset, split='val', bchw=False)
        val_loader = TD.DataLoader(dataset=val_dataset,
                                   batch_size=net.config.dataset.batch_size,
                                   shuffle=True,
                                   drop_last=False)

    running_metrics = runningScore(net.class_number)
    time_str = time.strftime("%Y-%m-%d___%H-%M-%S", time.localtime())
    log_dir = os.path.join(args.log_dir, net.name, args.dataset_name, args.note, time_str)
    checkpoint_path = os.path.join(
        log_dir, "{}_{}_best_model.pkl".format(net.name, args.dataset_name))
    os.makedirs(log_dir, exist_ok=True)
    writer = SummaryWriter(log_dir=log_dir)

    config = net.config
    config_str = json.dumps(config, indent=2, sort_keys=True).replace(
        '\n', '\n\n').replace(' ', '\t')
    writer.add_text(tag='config', text_string=config_str)

    # write config to config.txt
    config_path = os.path.join(log_dir, 'config.txt')
    config_file = open(config_path, 'w')
    json.dump(config, config_file, sort_keys=True)
    config_file.close()

    best_iou = 0.6
    loaders = [train_loader, val_loader]
    loader_names = ['train', 'val']
    for epoch in range(args.n_epoch):
        for loader, loader_name in zip(loaders, loader_names):
            if loader is None:
                continue

            if loader_name == 'val':
                if epoch % 5 != 0:
                    continue

            print(loader_name + '.' * 50)
            n_step = len(loader)
            losses = []
            for i, (images, labels) in enumerate(loader):
                x = images.data.numpy()
                trues = labels.data.numpy()
                y = to_categorical(trues, net.class_number)

                if loader_name == 'train':
                    outputs = net.model.train_on_batch(x, y)
                else:
                    outputs = net.model.test_on_batch(x, y)

                predict_outputs = net.model.predict_on_batch(x)
                predicts = np.argmax(predict_outputs, axis=-1)
                losses.append(outputs[0])

                if epoch % 5 == 0:
                    print('keras metrics as follow:', '*' * 30)
                    print("%s Epoch [%d/%d] Step [%d/%d]" %
                          (loader_name, epoch + 1, args.n_epoch, i, n_step))
                    for name, value in zip(net.model.metrics_names, outputs):
                        print(name, value)

                    print('running metrics as follow:', '*' * 30)
                    running_metrics.update(trues, predicts)
                    score, class_iou = running_metrics.get_scores()
                    for k, v in score.items():
                        print(k, v)

            if epoch % 5 == 0:
                writer.add_scalar('%s/loss' % loader_name, np.mean(losses), epoch)
                writer.add_scalar('%s/acc' % loader_name, score['Overall Acc: \t'], epoch)
                writer.add_scalar('%s/iou' % loader_name, score['Mean IoU : \t'], epoch)
                running_metrics.reset()

                if loader_name == 'val':
                    if score['Mean IoU : \t'] >= best_iou:
                        best_iou = score['Mean IoU : \t']
                        net.model.save(checkpoint_path)

                    # if epoch % (1+args.n_epoch//10) == 0:
                    #     print('write image to tensorboard'+'.'*50)
                    #     idx = np.random.choice(predicts.shape[0])
                    #     writer.add_image('val/images', x[idx,:,:,:], epoch)
                    #     writer.add_image('val/predicts', torch.from_numpy(predicts[idx,:,:]), epoch)
                    #     writer.add_image('val/trues', torch.from_numpy(trues[idx,:,:]), epoch)
                    #     diff_img = (predicts[idx,:,:]==trues[idx,:,:]).astype(np.uint8)
                    #     writer.add_image('val/difference', torch.from_numpy(diff_img), epoch)

    print('best iou is', best_iou)
    writer.close()
def train(self, args=None, train_loader=None, val_loader=None):
    if args is None:
        args = self.config.training

    self.model.compile(loss='categorical_crossentropy',
                       optimizer='adam',
                       metrics=['acc'])

    if train_loader is None and val_loader is None:
        train_dataset = dataset_generalize(config.dataset, split='train', bchw=False)
        train_loader = TD.DataLoader(dataset=train_dataset,
                                     batch_size=self.config.dataset.batch_size,
                                     shuffle=True,
                                     drop_last=False)

        val_dataset = dataset_generalize(config.dataset, split='val', bchw=False)
        val_loader = TD.DataLoader(dataset=val_dataset,
                                   batch_size=self.config.dataset.batch_size,
                                   shuffle=True,
                                   drop_last=False)

    running_metrics = runningScore(self.class_number)
    time_str = time.strftime("%Y-%m-%d___%H-%M-%S", time.localtime())
    log_dir = os.path.join(args.log_dir, self.name, self.dataset_name, args.note, time_str)
    checkpoint_path = os.path.join(
        log_dir, "{}_{}_best_model.pkl".format(self.name, self.dataset_name))
    os.makedirs(log_dir, exist_ok=True)
    writer = SummaryWriter(log_dir=log_dir)

    best_iou = 0.6
    loaders = [train_loader, val_loader]
    loader_names = ['train', 'val']
    for epoch in range(args.n_epoch):
        for loader, loader_name in zip(loaders, loader_names):
            if loader is None:
                continue

            if loader_name == 'val':
                if epoch % 10 != 0:
                    continue

            print(loader_name + '.' * 50)
            n_step = len(loader)
            losses = []
            for i, (images, labels) in enumerate(loader):
                x = images.data.numpy()
                trues = labels.data.numpy()
                y = to_categorical(trues, self.class_number)

                if loader_name == 'train':
                    outputs = self.model.train_on_batch(x, y)
                else:
                    outputs = self.model.test_on_batch(x, y)

                predict_outputs = self.model.predict_on_batch(x)
                predicts = np.argmax(predict_outputs, axis=-1)
                losses.append(outputs[0])

                if i % 5 == 0:
                    print("%s Epoch [%d/%d] Step [%d/%d]" %
                          (loader_name, epoch + 1, args.n_epoch, i, n_step))
                    for name, value in zip(self.model.metrics_names, outputs):
                        print(name, value)

                running_metrics.update(trues, predicts)
                score, class_iou = running_metrics.get_scores()
                for k, v in score.items():
                    print(k, v)

            writer.add_scalar('%s/loss' % loader_name, np.mean(losses), epoch)
            writer.add_scalar('%s/acc' % loader_name, score['Overall Acc: \t'], epoch)
            writer.add_scalar('%s/iou' % loader_name, score['Mean IoU : \t'], epoch)
            running_metrics.reset()

            if loader_name == 'val':
                if score['Mean IoU : \t'] >= best_iou:
                    best_iou = score['Mean IoU : \t']
                    self.model.save(checkpoint_path)

                if epoch % (1 + args.n_epoch // 10) == 0:
                    print('write image to tensorboard' + '.' * 50)
                    idx = np.random.choice(predicts.shape[0])
                    writer.add_image('val/images', x[idx, :, :, :], epoch)
                    writer.add_image('val/predicts', torch.from_numpy(predicts[idx, :, :]), epoch)
                    writer.add_image('val/trues', torch.from_numpy(trues[idx, :, :]), epoch)
                    diff_img = (predicts[idx, :, :] == trues[idx, :, :]).astype(np.uint8)
                    writer.add_image('val/difference', torch.from_numpy(diff_img), epoch)

    writer.close()
def train(data_path, models_path, backend, snapshot, crop_x, crop_y, batch_size,
          alpha, epochs, start_lr, milestones, gpu):
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    net, starting_epoch = build_network(snapshot, backend)
    data_path = os.path.abspath(os.path.expanduser(data_path))
    models_path = os.path.abspath(os.path.expanduser(models_path))
    os.makedirs(models_path, exist_ok=True)

    '''
    To follow this training routine you need a DataLoader that yields the tuples
    of the following format:
    (Bx3xHxW FloatTensor x, BxHxW LongTensor y, BxN LongTensor y_cls) where
    x - batch of input images,
    y - batch of ground truth seg maps,
    y_cls - batch of 1D tensors of dimensionality N: N total number of classes,
    y_cls[i, T] = 1 if class T is present in image i, 0 otherwise
    '''
    voc_data = pascalVOCLoader(root=data_path, is_transform=True, augmentations=None)
    # train_loader, class_weights, n_images = None, None, None
    train_loader = DataLoader(voc_data, batch_size=batch_size, shuffle=True, num_workers=0)
    max_steps = len(voc_data)
    class_weights = None

    optimizer = optim.Adam(net.parameters(), lr=start_lr)
    scheduler = MultiStepLR(optimizer,
                            milestones=[int(x) for x in milestones.split(',')],
                            gamma=0.1)
    running_score = runningScore(21)

    for epoch in range(starting_epoch, starting_epoch + epochs):
        seg_criterion = nn.NLLLoss(weight=class_weights)
        cls_criterion = nn.BCEWithLogitsLoss(weight=class_weights)
        epoch_losses = []
        # train_iterator = tqdm(train_loader, total=max_steps // batch_size + 1)
        net.train()
        print('------------epoch[{}]----------'.format(epoch + 1))

        for i, (x, y, y_cls) in enumerate(train_loader):
            optimizer.zero_grad()
            x, y, y_cls = Variable(x).cuda(), Variable(y).cuda(), Variable(y_cls).float().cuda()
            out, out_cls = net(x)
            pred = out.data.max(1)[1].cpu().numpy()
            seg_loss, cls_loss = seg_criterion(out, y), cls_criterion(out_cls, y_cls)
            loss = seg_loss + alpha * cls_loss
            epoch_losses.append(loss.item())
            running_score.update(y.data.cpu().numpy(), pred)

            if (i + 1) % 138 == 0:
                score, class_iou = running_score.get_scores()
                for k, v in score.items():
                    print(k, v)
                    logger.info('{}:{}'.format(k, v))
                running_score.reset()

                print_format_str = "Epoch[{}] batch[{}] loss = {:.4f} LR = {}"
                print_str = print_format_str.format(epoch + 1, i + 1, loss.item(),
                                                    scheduler.get_lr()[0])
                print(print_str)
                logger.info(print_str)
            '''
            status = '[{}] loss = {:.4f} avg = {:.4f}, LR = {}'.format(
                epoch + 1, loss.item(), np.mean(epoch_losses), scheduler.get_lr()[0])
            train_iterator.set_description(status)
            '''
            loss.backward()
            optimizer.step()

        scheduler.step()

        # save a checkpoint every 20 epochs
        if (epoch + 1) % 20 == 0:
            train_loss = ('%.4f' % np.mean(epoch_losses))
            torch.save(
                net.state_dict(),
                os.path.join(models_path,
                             '_'.join(["PSPNet", str(epoch + 1), train_loss]) + '.pth'))
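# The docstring in the training routine above describes the (x, y, y_cls) tuple the loader
# must yield. A minimal sketch, under that description, of how y_cls can be derived from a
# single ground-truth segmentation map y; the helper name below is hypothetical and only
# illustrates the expected shapes.
import torch


def make_class_presence(y, n_classes=21):
    """y: HxW LongTensor of class ids -> 1D LongTensor of length n_classes with
    y_cls[T] = 1 if class T is present in the image, 0 otherwise."""
    y_cls = torch.zeros(n_classes, dtype=torch.long)
    present = torch.unique(y)
    present = present[(present >= 0) & (present < n_classes)]  # drop void/ignore labels
    y_cls[present] = 1
    return y_cls


# Example: a 2x2 map containing classes 0 and 15
y = torch.tensor([[0, 15], [15, 0]])
print(make_class_presence(y))  # ones at positions 0 and 15, zeros elsewhere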
output_path = 'output/pspnet/voc2012/voc_val'
label_path = '/home/yzbx/.cv/datasets/VOC/VOCdevkit/VOC2012/SegmentationClass'

val_output_files = glob.glob(os.path.join(output_path, '*.png'))
# all label files for train and validation
label_files = glob.glob(os.path.join(label_path, '*.png'))

val_label_files = []
for f in val_output_files:
    basename = os.path.basename(f)
    val_label_file = os.path.join(label_path, basename)
    assert val_label_file in label_files, '%s %s %s' % (basename, val_label_file, label_files[0])
    val_label_files.append(val_label_file)

run_score = runningScore(21)
for output_file, label_file in tqdm(zip(val_output_files, val_label_files)):
    label_img_pil = Image.open(label_file)
    label_img = np.array(label_img_pil, dtype=np.uint8)

    output_img_pil = Image.open(output_file)
    output_img = np.array(output_img_pil, dtype=np.uint8)

    assert label_img.shape == output_img.shape
    run_score.update(label_trues=label_img, label_preds=output_img)
    # run_score.update(label_trues=output_img, label_preds=label_img)

score, class_iou = run_score.get_scores()
for k, v in score.items():
    print(k, v)

labels = [
    'params': [p for p in model.parameters() if p.requires_grad]
}]

if config.optimizer == 'adam':
    optimizer = torch.optim.Adam(optimizer_params, lr=config['init_lr'], amsgrad=False)
else:
    assert config.init_lr > 1e-3
    optimizer = torch.optim.SGD(optimizer_params,
                                lr=config['init_lr'],
                                momentum=0.9,
                                weight_decay=1e-4)

metric_mask_loss = Metric_Mean()
metric_total_loss = Metric_Mean()
running_metrics = runningScore(config.class_number)

tqdm_epoch = trange(config['epoch'], desc='{} epochs'.format(config.note), leave=True)
for epoch in tqdm_epoch:
    for split in ['train', 'val']:
        if split == 'train':
            model.train()
        else:
            model.eval()

        metric_mask_loss.reset()
        metric_total_loss.reset()
        running_metrics.reset()

        tqdm_step = tqdm(dataset_loaders[split], desc='steps', leave=False)