ax[2].imshow(lbl) ax[0].set_xticks([]) ax[0].set_yticks([]) ax[1].set_xticks([]) ax[1].set_yticks([]) ax[2].set_xticks([]) ax[2].set_yticks([]) (log_dir / 'eval_vis').mkdir(exist_ok=True, parents=True) plt.savefig( str(log_dir / 'eval_vis' / f'{i_epoch:04d}_{valid_iou:.4f}_{valid_fp}{"_best" if best_metrics == valid_iou else ""}_{i:03d}.png' )) plt.close() else: valid_loss = None valid_iou = None loss_history.append([train_loss, valid_loss]) iou_history.append([train_iou, valid_iou]) history_ploter(loss_history, log_dir.joinpath('loss.png')) history_ploter(iou_history, log_dir.joinpath('iou.png')) history_dict = { 'loss': loss_history, 'iou': iou_history, 'best_metrics': best_metrics } with open(log_dir.joinpath('history.pkl'), 'wb') as f: pickle.dump(history_dict, f)
def main():
    """Train the model described by the YAML file at ``args.config_path``.

    The config must provide ``Net``, ``Data`` and ``Train`` sections.  Model
    weights and full checkpoints are written under
    ``<saved_dir>/models/<config stem>`` and plots under
    ``<saved_dir>/logs/<config stem>``.

    Raises:
        ValueError: if ``Train.loss_type`` is not one of the loss types the
            training loop knows how to feed ("RANK", "MSE", "wrapped").
    """
    config_path = Path(args.config_path)
    # safe_load: yaml.load() without an explicit Loader is rejected by recent
    # PyYAML releases; the context manager also closes the file handle.
    with open(config_path) as config_file:
        config = yaml.safe_load(config_file)
    net_config = config['Net']
    data_config = config['Data']
    train_config = config['Train']

    # Config for data:
    train_dir = data_config["train_dir"]
    train_name = data_config["train_name"]
    train_type = data_config["train_type"]
    val_dir = data_config["val_dir"]
    val_name = data_config["val_name"]
    val_type = data_config["val_type"]
    target_size = data_config["target_size"]
    num_workers = data_config["num_worker"]

    # Config for train:
    num_epoch = train_config["num_epoch"]
    batch_size = train_config["batch_size"]
    val_every = train_config["val_every"]
    resume = train_config["resume"]
    pretrained_path = train_config["pretrained_path"]
    saved_dir = train_config["saved_dir"]
    epoch_start = 0
    loss_type = train_config["loss_type"]
    optimizer_config = train_config["optimizer"]
    del data_config
    del train_config

    # Fail early: any other value would make every epoch a silent no-op.
    if loss_type not in ("RANK", "MSE", "wrapped"):
        raise ValueError(f"Unsupported loss_type: {loss_type!r}")

    model = load_model(**net_config)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"[INFO] Device: {device}")
    model = model.to(device)

    modelname = config_path.stem
    output_dir = Path(saved_dir) / "models" / modelname
    output_dir.mkdir(parents=True, exist_ok=True)
    log_dir = Path(saved_dir) / "logs" / modelname
    log_dir.mkdir(parents=True, exist_ok=True)

    loss_fn = Criterion(loss_type=loss_type).to(device)
    params = model.parameters()
    optimizer, scheduler = create_optimizer(params, **optimizer_config)

    # Dataset
    affine_augmenter = albu.Compose([
        albu.GaussNoise(var_limit=(0, 25), p=.2),
        albu.GaussianBlur(3, p=0.2),
        albu.JpegCompression(50, 100, p=0.2)
    ])
    image_augmenter = albu.Compose([
        albu.OneOf([
            albu.RandomBrightnessContrast(0.25, 0.25),
            albu.CLAHE(clip_limit=2),
            albu.RandomGamma(),
        ], p=0.5),
        albu.HueSaturationValue(hue_shift_limit=20, sat_shift_limit=30,
                                val_shift_limit=20, p=0.2),
        albu.RGBShift(p=0.2),
        albu.RandomSizedCrop(min_max_height=[45, 64], height=64, width=64,
                             p=0.5),
    ])

    train_dataset = load_dataset(data_type=train_type,
                                 base_dir=train_dir,
                                 filename=train_name,
                                 n_class=net_config['n_class'],
                                 target_size=target_size,
                                 affine_augmenter=affine_augmenter,
                                 image_augmenter=image_augmenter,
                                 debug=False)
    val_dataset = load_dataset(data_type=val_type,
                               base_dir=val_dir,
                               filename=val_name,
                               n_class=net_config['n_class'],
                               target_size=target_size,
                               debug=False)
    train_loader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              num_workers=num_workers,
                              shuffle=True,
                              pin_memory=True,
                              drop_last=True)
    valid_loader = DataLoader(val_dataset,
                              batch_size=batch_size,
                              shuffle=False,
                              num_workers=num_workers,
                              pin_memory=True)

    if torch.cuda.is_available():
        model = nn.DataParallel(model)

    if resume:
        # map_location lets a checkpoint written on another device be loaded
        # on the device selected for this run.
        checkpoint = torch.load(pretrained_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch_start = checkpoint['epoch'] + 1
        loss_history = checkpoint['loss_history']
    else:
        loss_history = []

    is_perceiver = net_config["net_type"] == "Perceiver"

    model.train()
    for i_epoch in range(epoch_start, num_epoch):
        print(f"Epoch: {i_epoch}")
        print(f'Learning rate: {optimizer.param_groups[0]["lr"]}')
        train_losses = []
        train_diffs = []
        model.train()
        with tqdm(train_loader) as _tqdm:
            for batched in _tqdm:
                optimizer.zero_grad()
                if loss_type == "RANK":
                    # Pairwise mode: two images plus a label relating them.
                    img1, img2, lbl1, lbl2, labels = batched
                    img1, img2 = img1.to(device), img2.to(device)
                    lbl1, lbl2 = lbl1.to(device), lbl2.to(device)
                    labels = labels.to(device)
                    preds1 = model(img1).to(device)
                    preds2 = model(img2).to(device)
                    loss = loss_fn([preds1, preds2], [lbl1, lbl2, labels])
                    diff = (calculate_diff(preds1, lbl1) +
                            calculate_diff(preds2, lbl2)) / 2
                else:
                    # "MSE" / "wrapped": only the first image/label of the
                    # batch tuple is used.
                    img1, lbl1, _, _, _ = batched
                    img1, lbl1 = img1.to(device), lbl1.to(device)
                    if is_perceiver:
                        img1 = img1.permute(0, 2, 3, 1)
                    preds1 = model(img1)
                    loss = loss_fn([preds1, []], [lbl1, []])
                    diff = calculate_diff(preds1, lbl1)

                _tqdm.set_postfix(
                    OrderedDict(loss=f'{loss.item():.3f}',
                                mae=f'{diff:.1f}'))
                train_losses.append(loss.item())
                # The running loss curve is re-plotted after every batch.
                history_ploter(train_losses, log_dir.joinpath('loss.png'))
                train_diffs.append(diff)
                loss.backward()
                optimizer.step()

        train_loss = np.mean(train_losses)
        train_diff = np.nanmean(train_diffs)
        print(f'[INFO] train loss: {train_loss}')
        print(f'[INFO] train diff: {train_diff}')
        scheduler.step()

        if (i_epoch + 1) % val_every == 0:
            valid_diffs = []
            model.eval()
            with torch.no_grad():
                with tqdm(valid_loader) as _tqdm:
                    for batched in _tqdm:
                        images, labels, _, _, _ = batched
                        if is_perceiver:
                            images = images.permute(0, 2, 3, 1)
                        images, labels = images.to(device), labels.to(device)
                        preds = model(images)
                        diff = calculate_diff(preds, labels)
                        _tqdm.set_postfix(OrderedDict(mae=f'{diff:.2f}'))
                        valid_diffs.append(diff)

            valid_diff = np.mean(valid_diffs)
            # Despite its name, loss_history stores [train_diff, valid_diff]
            # pairs, one per validated epoch.
            loss_history.append([train_diff, valid_diff])
            history_ploter(loss_history, log_dir.joinpath('diff.png'))
            print(f'[INFO] valid diff: {valid_diff}')

            torch.save(
                model.state_dict(),
                output_dir.joinpath(f'model_epoch_{i_epoch}_{valid_diff}.pth'))
            torch.save(
                {
                    'epoch': i_epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss_history': loss_history,
                },
                output_dir.joinpath(
                    f'checkpoint_epoch_{i_epoch}_{valid_diff}.pth'))
def _find_epoch_checkpoint(directory, prefix, epoch):
    """Return the path of the ``prefix`` checkpoint saved for ``epoch``.

    The bare name ``{prefix}_epoch_{epoch}.pth`` is preferred.  Otherwise the
    ``{prefix}_epoch_{epoch}_<metric>.pth`` form written by ``main`` is looked
    up.  When nothing matches, the bare path is returned so that the caller's
    load fails on the missing file instead of on a ``None``.
    """
    exact = directory.joinpath(f'{prefix}_epoch_{epoch}.pth')
    if exact.exists():
        return exact
    matches = sorted(directory.glob(f'{prefix}_epoch_{epoch}_*.pth'))
    return matches[-1] if matches else exact


def main():
    """Train the model described by the YAML file at ``args.config_path``.

    The config must provide ``Net``, ``Data``, ``Train``, ``Loss`` and
    ``Optimizer`` sections.  Checkpoints go to ``../model/<config stem>``,
    plots and ``history.pkl`` to ``../logs/<config stem>``, and one
    ``loss,diff`` record per epoch is appended to ``file_train_log.txt`` /
    ``file_val_log.txt`` in the working directory.
    """
    config_path = Path(args.config_path)
    # safe_load: yaml.load() without an explicit Loader is rejected by recent
    # PyYAML releases; the context manager also closes the file handle.
    with open(config_path) as config_file:
        config = yaml.safe_load(config_file)
    net_config = config['Net']
    data_config = config['Data']
    train_config = config['Train']
    loss_config = config['Loss']
    opt_config = config['Optimizer']
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    n_class = net_config['n_class']
    max_epoch = train_config['max_epoch']
    batch_size = train_config['batch_size']
    num_workers = train_config['num_workers']
    test_every = train_config['test_every']
    resume = train_config['resume']
    pretrained_path = train_config['pretrained_path']
    use_rank = train_config.pop('use_rank')
    use_bined = train_config.pop('use_bined')

    # These keys are popped because the remainder of data_config is forwarded
    # verbatim to the dataset constructor below.
    train_dir = data_config.pop('train_dir')
    val_dir = data_config.pop('val_dir')
    train_name = data_config.pop('train_name')
    val_name = data_config.pop('val_name')
    train_type = data_config.pop('train_type')
    val_type = data_config.pop('val_type')

    model = load_model(**net_config)
    model = model.to(device)

    modelname = config_path.stem
    output_dir = Path('../model') / modelname
    output_dir.mkdir(parents=True, exist_ok=True)
    log_dir = Path('../logs') / modelname
    log_dir.mkdir(parents=True, exist_ok=True)
    logger = debug_logger(log_dir)
    logger.debug(config)
    logger.info(f'Device: {device}')
    logger.info(f'Max Epoch: {max_epoch}')

    loss_fn = Criterion(**loss_config).to(device)
    params = model.parameters()
    optimizer, scheduler = create_optimizer(params, **opt_config)

    # History
    if resume:
        with open(log_dir.joinpath('history.pkl'), 'rb') as f:
            history_dict = pickle.load(f)
        best_metrics = history_dict['best_metrics']
        loss_history = history_dict['loss']
        diff_history = history_dict['diff']
        # One history row is appended per finished epoch, so the row count is
        # the epoch to continue from (previously hard-coded to 47).
        start_epoch = len(diff_history)
        # Replay the schedule so the learning rate matches the resumed epoch.
        for _ in range(start_epoch):
            scheduler.step()
    else:
        start_epoch = 0
        best_metrics = float('inf')
        loss_history = []
        diff_history = []

    # Dataset
    affine_augmenter = albu.Compose([
        albu.GaussNoise(var_limit=(0, 25), p=.2),
        albu.GaussianBlur(3, p=0.2),
        albu.JpegCompression(50, 100, p=0.2),
    ])
    image_augmenter = albu.Compose([
        albu.OneOf([
            albu.RandomBrightnessContrast(0.25, 0.25),
            albu.CLAHE(clip_limit=2),
            albu.RandomGamma(),
        ], p=0.5),
        albu.HueSaturationValue(hue_shift_limit=20, sat_shift_limit=30,
                                val_shift_limit=20, p=0.2),
        albu.RGBShift(p=0.2),
    ])

    train_dataset = laod_dataset(data_type=train_type,
                                 affine_augmenter=affine_augmenter,
                                 image_augmenter=image_augmenter,
                                 base_dir=train_dir,
                                 filename=train_name,
                                 use_bined=use_bined,
                                 n_class=n_class,
                                 **data_config)
    valid_dataset = laod_dataset(data_type=val_type,
                                 split='valid',
                                 base_dir=val_dir,
                                 filename=val_name,
                                 use_bined=use_bined,
                                 n_class=n_class,
                                 **data_config)

    # NOTE(review): the training loader is not shuffled — confirm whether the
    # dataset randomises internally or shuffle=True was dropped by accident.
    train_loader = DataLoader(train_dataset, batch_size=batch_size,
                              num_workers=num_workers, pin_memory=True,
                              drop_last=True)
    valid_loader = DataLoader(valid_dataset, batch_size=1, shuffle=False,
                              num_workers=num_workers, pin_memory=True)

    if torch.cuda.is_available():
        model = nn.DataParallel(model)

    # Pretrained model
    if pretrained_path:
        logger.info(f'Load pretrained from {pretrained_path}')
        param = torch.load(pretrained_path, map_location='cpu')
        if "state_dict" in param:
            model.load_state_dict(param['state_dict'], strict=False)
        else:
            model.load_state_dict(param)
        del param

    # Restore model
    if resume:
        print("[INFO] resume training.")
        # Checkpoints are written with the validation metric appended to the
        # file name, so the exact name cannot be rebuilt from the epoch alone.
        model_path = _find_epoch_checkpoint(output_dir, 'model',
                                            start_epoch - 1)
        logger.info(f'Resume from {model_path}')
        param = torch.load(model_path, map_location='cpu')
        model.load_state_dict(param)
        del param
        opt_path = _find_epoch_checkpoint(output_dir, 'opt', start_epoch - 1)
        param = torch.load(opt_path, map_location='cpu')
        optimizer.load_state_dict(param)
        del param

    # The context manager closes both logs even if training raises.
    with open("file_train_log.txt", "a") as file_train_log, \
            open("file_val_log.txt", "a") as file_val_log:
        # Train
        for i_epoch in range(start_epoch, max_epoch):
            logger.info(f'Epoch: {i_epoch}')
            logger.info(f'Learning rate: {optimizer.param_groups[0]["lr"]}')

            train_losses = []
            train_diffs = []
            model.train()
            with tqdm(train_loader) as _tqdm:
                for batched in _tqdm:
                    optimizer.zero_grad()
                    if use_rank:
                        if use_bined:
                            (img1, img2, lbl1, lbl2, labels,
                             yaw_lbl1, pitch_lbl1, roll_lbl1,
                             yaw_lbl2, pitch_lbl2, roll_lbl2) = batched
                            img1, img2 = img1.to(device), img2.to(device)
                            lbl1, lbl2 = lbl1.to(device), lbl2.to(device)
                            labels = labels.to(device)
                            yaw_lbl1 = yaw_lbl1.to(device)
                            pitch_lbl1 = pitch_lbl1.to(device)
                            roll_lbl1 = roll_lbl1.to(device)
                            yaw_lbl2 = yaw_lbl2.to(device)
                            pitch_lbl2 = pitch_lbl2.to(device)
                            roll_lbl2 = roll_lbl2.to(device)

                            preds1, y_pres1, p_pres1, r_pres1 = model(img1, True)
                            preds2, y_pres2, p_pres2, r_pres2 = model(img2, True)

                            pre_list = [preds1, preds2,
                                        y_pres1, p_pres1, r_pres1,
                                        y_pres2, p_pres2, r_pres2]
                            lbl_list = [lbl1, lbl2,
                                        yaw_lbl1, pitch_lbl1, roll_lbl1,
                                        yaw_lbl2, pitch_lbl2, roll_lbl2,
                                        labels]
                            loss = loss_fn(pre_list, lbl_list, use_bined=True)
                        else:
                            img1, img2, lbl1, lbl2, labels = batched
                            img1, img2 = img1.to(device), img2.to(device)
                            lbl1, lbl2 = lbl1.to(device), lbl2.to(device)
                            labels = labels.to(device)

                            preds1 = model(img1, False)
                            preds2 = model(img2, False)
                            loss = loss_fn([preds1, preds2],
                                           [lbl1, lbl2, labels],
                                           use_bined=False)
                        # Report the mean error over both images of the pair.
                        diff = (calculate_diff(preds1, lbl1) +
                                calculate_diff(preds2, lbl2)) / 2
                    elif use_bined:
                        (images, labels,
                         yaw_labels, pitch_labels, roll_labels) = batched
                        images, labels = images.to(device), labels.to(device)
                        yaw_labels = yaw_labels.to(device)
                        pitch_labels = pitch_labels.to(device)
                        roll_labels = roll_labels.to(device)

                        preds, y_pres, p_pres, r_pres = model(images, use_bined)
                        loss = loss_fn(
                            [preds, y_pres, p_pres, r_pres],
                            [labels, yaw_labels, pitch_labels, roll_labels],
                            use_bined)
                        diff = calculate_diff(preds, labels)
                    else:
                        images, labels = batched
                        images, labels = images.to(device), labels.to(device)
                        preds = model(images, use_bined)
                        loss = loss_fn([preds], [labels])
                        diff = calculate_diff(preds, labels, mean=True)

                    _tqdm.set_postfix(OrderedDict(loss=f'{loss.item():.3f}',
                                                  mae=f'{diff:.1f}'))
                    train_losses.append(loss.item())
                    train_diffs.append(diff)

                    loss.backward()
                    optimizer.step()

            scheduler.step()

            train_loss = np.mean(train_losses)
            train_diff = np.nanmean(train_diffs)
            logger.info(f'train loss: {train_loss}')
            logger.info(f'train diff: {train_diff}')
            # One record per line; without the newline successive epochs run
            # together into a single unparsable line.
            file_train_log.write(f"{train_loss},{train_diff}\n")

            if (i_epoch + 1) % test_every == 0:
                valid_losses = []
                valid_diffs = []
                model.eval()
                with torch.no_grad():
                    with tqdm(valid_loader) as _tqdm:
                        for batched in _tqdm:
                            if use_bined:
                                (images, labels,
                                 yaw_labels, pitch_labels, roll_labels) = batched
                                images = images.to(device)
                                labels = labels.to(device)
                                preds, y_pres, p_pres, r_pres = model(
                                    images, use_bined)
                            else:
                                images, labels = batched
                                images = images.to(device)
                                labels = labels.to(device)
                                preds = model(images, use_bined)
                            diff = calculate_diff(preds, labels)

                            _tqdm.set_postfix(OrderedDict(mae=f'{diff:.2f}'))
                            # No validation loss is computed; 0 is recorded as
                            # a placeholder so the history keeps its shape.
                            valid_losses.append(0)
                            valid_diffs.append(diff)

                valid_loss = np.mean(valid_losses)
                valid_diff = np.mean(valid_diffs)
                logger.info(f'valid seg loss: {valid_loss}')
                logger.info(f'valid diff: {valid_diff}')
                file_val_log.write(f"{valid_loss},{valid_diff}\n")

                if best_metrics >= valid_diff:
                    best_metrics = valid_diff
                    logger.info('Best Model!\n')

                # Saved once for every validated epoch (the original issued
                # the same two saves twice): resuming needs the checkpoint of
                # the last epoch, not only of the best one.
                torch.save(
                    model.state_dict(),
                    output_dir.joinpath(
                        f'model_epoch_{i_epoch}_{valid_diff}.pth'))
                torch.save(
                    optimizer.state_dict(),
                    output_dir.joinpath(
                        f'opt_epoch_{i_epoch}_{valid_diff}.pth'))
            else:
                valid_loss = None
                valid_diff = None

            loss_history.append([train_loss, valid_loss])
            diff_history.append([train_diff, valid_diff])
            history_ploter(loss_history, log_dir.joinpath('loss.png'))
            history_ploter(diff_history, log_dir.joinpath('diff.png'))

            history_dict = {'loss': loss_history,
                            'diff': diff_history,
                            'best_metrics': best_metrics}
            with open(log_dir.joinpath('history.pkl'), 'wb') as f:
                pickle.dump(history_dict, f)
else: valid_loss = None valid_iou = None valid_iou1 = None valid_iou2 = None valid_iou3 = None loss_history.append([train_loss_all, valid_loss]) iou_history.append([train_iou_all, valid_iou]) iou_history1.append([train_iou1, valid_iou1]) iou_history2.append([train_iou2, valid_iou2]) iou_history3.append([train_iou3, valid_iou3]) history_ploter(loss_history, log_dir.joinpath('loss.png')) history_ploter(iou_history, log_dir.joinpath('iou.png')) history_ploter(iou_history1, log_dir.joinpath('iou1.png')) history_ploter(iou_history2, log_dir.joinpath('iou2.png')) history_ploter(iou_history3, log_dir.joinpath('iou3.png')) history_dict = { 'loss': loss_history, 'iou': iou_history, 'best_metrics': best_metrics } with open(log_dir.joinpath('history.pkl'), 'wb') as f: pickle.dump(history_dict, f) #cuda memory usage #print(torch.cuda.max_memory_allocated(device=device))
def train():
    """Run the train/validate loop from ``start_epoch`` to ``max_epoch``.

    Relies on names defined outside this function (``model``, ``optimizer``,
    ``scheduler``, ``loss_fn``, the data loaders, ``device``, ``logger``,
    ``output_dir``, ``log_dir``, ``resume``, ``start_epoch``, ``max_epoch``).
    After every epoch it writes ``model_tmp.pth``, refreshes the loss/IoU
    plots and ``history.pkl``, and overwrites ``model.pth`` whenever the
    validation IoU improves.
    """
    best_metrics = 0
    loss_history = []
    iou_history = []

    if resume:
        model_path = output_dir.joinpath('model.pth')
        logger.info(f'Resume from {model_path}')
        # map_location lets a checkpoint written on another device be loaded
        # on the device used by this run.
        param = torch.load(model_path, map_location=device)
        model.load_state_dict(param)
        del param
        # Replay the schedule so the learning rate matches start_epoch.
        for _ in range(start_epoch):
            scheduler.step()
        history_path = log_dir.joinpath('history.pkl')
        if history_path.exists():
            with open(history_path, 'rb') as f:
                history_dict = pickle.load(f)
            best_metrics = history_dict['best_metrics']
            # This function stores the losses under 'loss'; the old lookup of
            # 'seg_loss' raised KeyError on its own history files.  The old
            # key is still accepted in case such a file exists.
            if 'loss' in history_dict:
                loss_history = history_dict['loss']
            else:
                loss_history = history_dict['seg_loss']
            iou_history = history_dict['iou']

    for i_epoch in range(start_epoch, max_epoch):
        logger.info(f'Epoch: {i_epoch}')
        logger.info(f'Learning rate: {optimizer.param_groups[0]["lr"]}')

        train_losses = []
        train_ious = []
        with tqdm(train_loader) as _tqdm:
            for batched in _tqdm:
                images, labels = batched
                images, labels = images.to(device), labels.to(device)
                optimizer.zero_grad()
                preds = model(images)
                # Resize predictions to the label resolution before the loss.
                preds = F.interpolate(preds,
                                      size=labels.shape[2:],
                                      mode='bilinear',
                                      align_corners=True)
                loss = loss_fn(preds, labels)

                preds_np = preds.detach().cpu().numpy()
                labels_np = labels.detach().cpu().numpy().squeeze()
                iou = compute_iou_batch(preds_np, labels_np)

                _tqdm.set_postfix(
                    OrderedDict(seg_loss=f'{loss.item():.5f}',
                                iou=f'{iou:.3f}'))
                train_losses.append(loss.item())
                train_ious.append(iou)

                loss.backward()
                optimizer.step()

        scheduler.step()

        train_loss = np.mean(train_losses)
        train_iou = np.mean(train_ious)
        logger.info(f'train loss: {train_loss}')
        logger.info(f'train iou: {train_iou}')

        valid_losses = []
        valid_ious = []
        model.eval()
        with torch.no_grad():
            with tqdm(valid_loader) as _tqdm:
                for batched in _tqdm:
                    images, labels = batched
                    images, labels = images.to(device), labels.to(device)
                    preds = model(images)
                    preds = F.interpolate(preds,
                                          size=labels.shape[2:],
                                          mode='bilinear',
                                          align_corners=True)
                    loss = loss_fn(preds, labels)

                    preds_np = preds.detach().cpu().numpy()
                    labels_np = labels.detach().cpu().numpy()
                    iou = compute_iou_batch(preds_np, labels_np)

                    _tqdm.set_postfix(
                        OrderedDict(seg_loss=f'{loss.item():.5f}',
                                    iou=f'{iou:.3f}'))
                    valid_losses.append(loss.item())
                    valid_ious.append(iou)
        # Back to training mode for the next epoch.
        model.train()

        valid_loss = np.mean(valid_losses)
        valid_iou = np.mean(valid_ious)
        logger.info(f'valid seg loss: {valid_loss}')
        logger.info(f'valid iou: {valid_iou}')

        loss_history.append([train_loss, valid_loss])
        iou_history.append([train_iou, valid_iou])
        history_ploter(loss_history, log_dir.joinpath('loss.png'))
        history_ploter(iou_history, log_dir.joinpath('iou.png'))

        torch.save(model.state_dict(), output_dir.joinpath('model_tmp.pth'))
        if best_metrics < valid_iou:
            best_metrics = valid_iou
            logger.info('Best Model!')
            torch.save(model.state_dict(), output_dir.joinpath('model.pth'))

        history_dict = {
            'loss': loss_history,
            'iou': iou_history,
            'best_metrics': best_metrics
        }
        with open(log_dir.joinpath('history.pkl'), 'wb') as f:
            pickle.dump(history_dict, f)
def process(config_path):
    """Train a segmentation network from the YAML config at ``config_path``.

    The config must provide ``Net``, ``Data``, ``Train``, ``Loss`` and
    ``Optimizer`` sections.  Only ``Data.dataset == 'deepglobe-dynamic'`` is
    supported.  Outputs go to timestamped ``model/`` and ``logs/`` directories
    under ``ROOT_DIR``; per-iteration and per-epoch curves are sent to the
    module-level ``plotter``.

    Args:
        config_path: path of the YAML config (``str`` or ``pathlib.Path``);
            its stem names the run.

    Raises:
        NotImplementedError: for any other ``Data.dataset`` value.
    """
    gc.collect()
    torch.cuda.empty_cache()

    config_path = Path(config_path)
    # safe_load: yaml.load() without an explicit Loader is rejected by recent
    # PyYAML releases; the context manager also closes the file handle.
    with open(config_path) as config_file:
        config = yaml.safe_load(config_file)
    net_config = config['Net']
    data_config = config['Data']
    train_config = config['Train']
    loss_config = config['Loss']
    opt_config = config['Optimizer']
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Collect training parameters
    max_epoch = train_config['max_epoch']
    batch_size = train_config['batch_size']
    fp16 = train_config['fp16']
    resume = train_config['resume']
    pretrained_path = train_config['pretrained_path']
    freeze_enabled = train_config['freeze']
    seed_enabled = train_config['seed']

    # Deterministic training
    if seed_enabled:
        import random
        seed = 100
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed=seed)
        random.seed(seed)

    # Network
    if 'unet' in net_config['dec_type']:
        net_type = 'unet'
        model = EncoderDecoderNet(**net_config)
    else:
        net_type = 'deeplab'
        # The deeplab model is built with 19 output channels; its logits layer
        # is swapped for the dataset's class count when pretrained weights are
        # loaded below.
        net_config['output_channels'] = 19
        model = SPPNet(**net_config)

    dataset_name = data_config['dataset']
    if dataset_name == 'deepglobe-dynamic':
        from dataset.deepglobe_dynamic import DeepGlobeDatasetDynamic as Dataset
        net_config['output_channels'] = 7
        classes = np.arange(0, 7)
    else:
        raise NotImplementedError
    del data_config['dataset']

    # An optional 'base_dir' key in the Data section overrides the historical
    # hard-coded location.  It is popped because the rest of data_config is
    # forwarded to the dataset constructor alongside base_dir.
    dataset_dir = data_config.pop(
        'base_dir',
        '/home/sfoucher/DEV/pytorch-segmentation/data/deepglobe_as_pascalvoc/VOCdevkit/VOC2012')

    modelname = config_path.stem
    timestamp = datetime.timestamp(datetime.now())
    run_stamp = datetime.fromtimestamp(timestamp)
    print("timestamp =", run_stamp)
    output_dir = Path(os.path.join(ROOT_DIR, f'model/{modelname}_{run_stamp}'))
    output_dir.mkdir(parents=True, exist_ok=True)
    log_dir = Path(os.path.join(ROOT_DIR, f'logs/{modelname}_{run_stamp}'))
    log_dir.mkdir(parents=True, exist_ok=True)

    logger = debug_logger(log_dir)
    logger.debug(config)
    logger.info(f'Device: {device}')
    logger.info(f'Max Epoch: {max_epoch}')

    # Loss
    loss_fn = MultiClassCriterion(**loss_config).to(device)
    # This first optimizer only provides the parameter list that the freezing
    # step indexes into; the optimizer used for training is created after it.
    params = model.parameters()
    optimizer, scheduler = create_optimizer(params, **opt_config)

    # History
    # NOTE(review): log_dir/output_dir carry a fresh timestamp on every call,
    # so the files read when resuming look unlikely to exist — confirm how
    # resume is meant to be used.
    if resume:
        with open(log_dir.joinpath('history.pkl'), 'rb') as f:
            history_dict = pickle.load(f)
        best_metrics = history_dict['best_metrics']
        loss_history = history_dict['loss']
        iou_history = history_dict['iou']
        start_epoch = len(iou_history)
    else:
        start_epoch = 0
        best_metrics = 0
        loss_history = []
        iou_history = []

    affine_augmenter = albu.Compose([albu.HorizontalFlip(p=.5),
                                     albu.VerticalFlip(p=.5)])
    image_augmenter = None
    # Datasets and loaders are built inside the epoch loop (dynamic training).

    # Pretrained model
    if pretrained_path:
        logger.info(f'Resume from {pretrained_path}')
        param = torch.load(pretrained_path, map_location=device)
        model.load_state_dict(param)
        model.logits = torch.nn.Conv2d(256, net_config['output_channels'], 1)
        del param

    # To device
    model = model.to(device)

    if freeze_enabled:
        # Freeze the first half of the first parameter group.
        group_params = optimizer.param_groups[0]['params']
        for frozen in group_params[:int(len(group_params) * 0.5)]:
            frozen.requires_grad = False

    params_to_update = model.parameters()
    print("Params to learn:")
    if freeze_enabled:
        params_to_update = []
        for name, param in model.named_parameters():
            if param.requires_grad:
                params_to_update.append(param)
                print("\t", name)

    optimizer, scheduler = create_optimizer(params_to_update, **opt_config)
    # Fast-forward the scheduler that is actually used for training.  The old
    # code stepped the first scheduler, which the line above throws away.
    for _ in range(start_epoch):
        scheduler.step()

    # fp16
    if fp16:
        # Only the pure-Python parts of apex are vendored, so the C backend is
        # not required.
        from utils.apex.apex.fp16_utils.fp16util import BN_convert_float
        from utils.apex.apex.fp16_utils.fp16_optimizer import FP16_Optimizer
        model = BN_convert_float(model.half())
        optimizer = FP16_Optimizer(optimizer, verbose=False,
                                   dynamic_loss_scale=True)
        logger.info('Apply fp16')

    # Restore model
    if resume:
        model_path = output_dir.joinpath('model_tmp.pth')
        logger.info(f'Resume from {model_path}')
        param = torch.load(model_path, map_location=device)
        model.load_state_dict(param)
        del param
        opt_path = output_dir.joinpath('opt_tmp.pth')
        param = torch.load(opt_path, map_location=device)
        optimizer.load_state_dict(param)
        del param

    i_iter = 0
    ma_loss = 0
    ma_iou = 0

    # Train
    for i_epoch in range(start_epoch, max_epoch):
        logger.info(f'Epoch: {i_epoch}')
        logger.info(f'Learning rate: {optimizer.param_groups[0]["lr"]}')

        train_losses = []
        train_ious = []
        model.train()

        # Initialize randomized but balanced datasets
        train_dataset = Dataset(base_dir=dataset_dir,
                                affine_augmenter=affine_augmenter,
                                image_augmenter=image_augmenter,
                                net_type=net_type, **data_config)
        valid_dataset = Dataset(base_dir=dataset_dir, split='valid',
                                net_type=net_type, **data_config)
        train_loader = DataLoader(train_dataset, batch_size=batch_size,
                                  shuffle=True, num_workers=4,
                                  pin_memory=True, drop_last=True)
        valid_loader = DataLoader(valid_dataset, batch_size=1, shuffle=False,
                                  num_workers=4, pin_memory=True)

        with tqdm(train_loader) as _tqdm:
            for batched in _tqdm:
                images, labels = batched
                if fp16:
                    images = images.half()
                images, labels = images.to(device), labels.to(device)
                optimizer.zero_grad()
                preds = model(images)
                if net_type == 'deeplab':
                    preds = F.interpolate(preds, size=labels.shape[1:],
                                          mode='bilinear', align_corners=True)
                # The loss is always evaluated on float32 predictions.
                loss = loss_fn(preds.float() if fp16 else preds, labels)

                preds_np = preds.detach().cpu().numpy()
                labels_np = labels.detach().cpu().numpy()
                iou = compute_iou_batch(np.argmax(preds_np, axis=1),
                                        labels_np, classes)

                _tqdm.set_postfix(OrderedDict(seg_loss=f'{loss.item():.5f}',
                                              iou=f'{iou:.3f}'))
                train_losses.append(loss.item())
                train_ious.append(iou)

                # Exponential moving averages of the per-iteration metrics.
                ma_loss = 0.01 * loss.item() + 0.99 * ma_loss
                ma_iou = 0.01 * iou + 0.99 * ma_iou
                plotter.plot('loss', 'train', 'iteration Loss', i_iter, loss.item())
                plotter.plot('iou', 'train', 'iteration iou', i_iter, iou)
                plotter.plot('loss', 'ma_loss', 'iteration Loss', i_iter, ma_loss)
                plotter.plot('iou', 'ma_iou', 'iteration iou', i_iter, ma_iou)

                if fp16:
                    # The FP16 optimizer wrapper drives the backward pass.
                    optimizer.backward(loss)
                else:
                    loss.backward()
                optimizer.step()
                i_iter += 1

        scheduler.step()

        train_loss = np.mean(train_losses)
        train_iou = np.nanmean(train_ious)
        logger.info(f'train loss: {train_loss}')
        logger.info(f'train iou: {train_iou}')
        plotter.plot('loss-epoch', 'train', 'iteration Loss', i_epoch, train_loss)
        plotter.plot('iou-epoch', 'train', 'iteration iou', i_epoch, train_iou)

        torch.save(model.state_dict(), output_dir.joinpath('model_tmp.pth'))
        torch.save(optimizer.state_dict(), output_dir.joinpath('opt_tmp.pth'))

        valid_losses = []
        valid_ious = []
        model.eval()
        with torch.no_grad():
            with tqdm(valid_loader) as _tqdm:
                for batched in _tqdm:
                    images, labels = batched
                    if fp16:
                        images = images.half()
                    images, labels = images.to(device), labels.to(device)
                    preds = model.tta(images, net_type=net_type)
                    loss = loss_fn(preds.float() if fp16 else preds, labels)

                    preds_np = preds.detach().cpu().numpy()
                    labels_np = labels.detach().cpu().numpy()
                    iou = compute_iou_batch(np.argmax(preds_np, axis=1),
                                            labels_np, classes)

                    _tqdm.set_postfix(OrderedDict(seg_loss=f'{loss.item():.5f}',
                                                  iou=f'{iou:.3f}'))
                    valid_losses.append(loss.item())
                    valid_ious.append(iou)

        valid_loss = np.mean(valid_losses)
        valid_iou = np.mean(valid_ious)
        logger.info(f'valid seg loss: {valid_loss}')
        logger.info(f'valid iou: {valid_iou}')
        plotter.plot('loss-epoch', 'valid', 'iteration Loss', i_epoch, valid_loss)
        plotter.plot('iou-epoch', 'valid', 'iteration iou', i_epoch, valid_iou)

        if best_metrics < valid_iou:
            best_metrics = valid_iou
            logger.info('Best Model!')
            torch.save(model.state_dict(), output_dir.joinpath('model.pth'))
            torch.save(optimizer.state_dict(), output_dir.joinpath('opt.pth'))

        loss_history.append([train_loss, valid_loss])
        iou_history.append([train_iou, valid_iou])
        history_ploter(loss_history, log_dir.joinpath('loss.png'))
        history_ploter(iou_history, log_dir.joinpath('iou.png'))

        history_dict = {'loss': loss_history,
                        'iou': iou_history,
                        'best_metrics': best_metrics}
        with open(log_dir.joinpath('history.pkl'), 'wb') as f:
            pickle.dump(history_dict, f)