def get_test_dataset(dataset: str, prepare, test_or_train='test'):
    """Build a mask-less dataset over the test or the train images.

    Args:
        dataset: Dataset mode name. Kept only for backward compatibility of
            the signature: the previous implementation derived two flags
            from it (``use_cumsum`` / ``use_depth``) that were never read,
            so the value has no influence on the result.
        prepare: Forwarded unchanged to ``D.ImageAndMaskDataset`` as
            ``prepare_fn``.
        test_or_train: ``'test'`` selects the test ids and images; any other
            value selects the train ids and images.

    Returns:
        A ``D.ImageAndMaskDataset`` constructed from the ids, the images,
        ``None`` in place of masks, and the depths read for those ids.
    """
    if test_or_train == 'test':
        ids = D.all_test_ids()
        images = D.read_test_images(ids)
    else:
        ids = D.all_train_ids()
        images = D.read_train_images(ids)

    depths = D.read_depths(ids)

    # Masks are passed as None: this dataset is built without ground truth.
    return D.ImageAndMaskDataset(ids, images, None, depths, prepare_fn=prepare)
def test_folds_coverage():
    """Plot per-fold histograms of mask coverage for a 10-fold split.

    Reads all train ids, depths, images and masks, computes the number of
    non-zero pixels of every mask, asks ``D.get_folds_vector`` for a
    ``'coverage'`` fold assignment and draws, for each fold, a histogram of
    the coverage of the samples outside the fold (left axis) and inside the
    fold (right axis). The figure is shown; nothing is returned or asserted.
    """
    train_ids = D.all_train_ids()
    depths = D.read_depths(train_ids)
    images = D.read_train_images(train_ids)
    masks = D.read_train_masks(train_ids)

    n_folds = 10

    # ``np.int`` was only an alias of the builtin ``int``; it was deprecated
    # in NumPy 1.20 and removed in 1.24, so use the builtin directly.
    coverage = np.array([cv2.countNonZero(mask) for mask in masks], dtype=int)
    folds_d = D.get_folds_vector('coverage', images, masks, depths,
                                 n_folds=n_folds)

    fig, axes = plt.subplots(1, 2)
    for fold in range(n_folds):
        label = f'Fold {fold}'
        train = coverage[folds_d != fold]
        val = coverage[folds_d == fold]
        axes[0].hist(train, label=label)
        axes[1].hist(val, label=label)

    fig.show()
def main():
    """Run a three-phase training session (warmup, main, cosine annealing).

    Steps, in order:

    1. Parse command-line arguments, validate mutually exclusive options and
       seed the RNGs via ``U.set_manual_seed``.
    2. Create the experiment directory and read train ids, depths, images and
       masks; optionally fix masks and record the changed ids in
       ``fixed_masks.txt``.
    3. Split into train/validation (by fold when ``args.fold`` is given,
       otherwise an unshuffled 80/20 split) and drop some samples from the
       train part only via ``D.drop_some``.
    4. Build datasets, data loaders and the model (moved to CUDA).
    5. Optionally restore a checkpoint. The restored score is only printed;
       ``best_metric_val`` still starts from 0.
    6. Train in three phases with different losses and optimizers, saving a
       checkpoint after the warmup and main phases, and collecting five
       annealing snapshots.
    7. Write ``train_history.csv`` and call ``generate_model_submission`` for
       the best checkpoint and every snapshot.

    Raises:
        ValueError: If both fine-tuning and encoder freezing are requested.
        RuntimeError: If the train or validation ids are not sorted.
    """
    parser = U.get_argparser()
    args = parser.parse_args()

    # Fail fast: reject incompatible options before any directory is created,
    # any data is read or any GPU memory is allocated.
    if args.fine_tune and args.freeze_encoder > 0:
        raise ValueError(
            'Incompatible options --fine-tune and --freeze-encoder')

    U.set_manual_seed(args.seed)

    # NOTE: vars() returns the namespace's own dict, so the update() below
    # also becomes visible through ``args``.
    train_session_args = vars(args)
    train_session = U.get_random_name()
    current_time = datetime.now().strftime('%b%d_%H_%M')
    prefix = f'{current_time}_{args.model}_{args.prepare}_{args.augmentation}_{train_session}'
    if args.fold is not None:
        prefix += f'_fold_{args.stratify}_{args.fold}'

    log_dir = os.path.join('runs', prefix)
    exp_dir = os.path.join('experiments', args.model, args.prepare,
                           args.augmentation, prefix)
    os.makedirs(exp_dir, exist_ok=True)

    train_ids = D.all_train_ids()
    depths = D.read_depths(train_ids)
    images = D.read_train_images(train_ids)
    masks = D.read_train_masks(train_ids)

    if args.fix_masks:
        masks, changed_ids = D.fix_masks(masks, train_ids)
        with open(os.path.join(exp_dir, 'fixed_masks.txt'), 'w') as fixed_file:
            for sample_id in changed_ids:
                fixed_file.write(sample_id)
                fixed_file.write('\n')
        print(f'Fixed {len(changed_ids)} masks')

    if args.fold is not None:
        train_indexes, test_indexes = D.get_train_test_split_for_fold(
            args.stratify, args.fold, train_ids)
    else:
        train_indexes, test_indexes = train_test_split(
            np.arange(len(train_ids)),
            shuffle=False,
            random_state=args.split_seed,
            test_size=0.2)

    ids_train, ids_test = train_ids[train_indexes], train_ids[test_indexes]
    img_train, img_test = images[train_indexes], images[test_indexes]
    mask_train, mask_test = masks[train_indexes], masks[test_indexes]
    depth_train, depth_test = depths[train_indexes], depths[test_indexes]

    # Here we can exclude some images from training, but keep in validation
    train_mask = D.drop_some(img_train, mask_train,
                             drop_black=True,
                             drop_vstrips=args.drop_vstrips,
                             drop_few=args.drop_few)
    ids_train = ids_train[train_mask]
    img_train = img_train[train_mask]
    mask_train = mask_train[train_mask]
    depth_train = depth_train[train_mask]

    if not is_sorted(ids_train):
        raise RuntimeError("ids_train is not sorted")
    if not is_sorted(ids_test):
        raise RuntimeError("ids_test_sorted is not sorted")

    prepare_fn = D.get_prepare_fn(args.prepare, **train_session_args)

    # This line valid if we apply prepare_fn first and then do augmentation
    target_size = prepare_fn.target_size if prepare_fn is not None else D.ORIGINAL_SIZE

    build_augmentation_fn = D.AUGMENTATION_MODES[args.augmentation]
    aug = build_augmentation_fn(target_size, border_mode=args.border_mode)

    # Validation only gets the (optional) prepare transform; training gets
    # prepare followed by augmentation.
    train_transform_list = []
    valid_transform_list = []
    if prepare_fn is not None:
        train_transform_list.append(prepare_fn.t_forward)
        valid_transform_list.append(prepare_fn.t_forward)
    train_transform_list.append(aug)

    trainset = D.ImageAndMaskDataset(ids_train, img_train, mask_train,
                                     depth_train,
                                     augment=A.Compose(train_transform_list))
    validset = D.ImageAndMaskDataset(ids_test, img_test, mask_test,
                                     depth_test,
                                     augment=A.Compose(valid_transform_list))

    trainloader = DataLoader(trainset,
                             batch_size=args.batch_size,
                             num_workers=args.workers,
                             pin_memory=True,
                             drop_last=True,
                             shuffle=True)
    validloader = DataLoader(validset,
                             batch_size=args.batch_size,
                             pin_memory=True,
                             drop_last=False,
                             shuffle=False)

    # Save train/val split for future use
    train_session_args.update({
        'train_set': list(ids_train),
        'valid_set': list(ids_test)
    })

    # Declare variables we will use during training
    start_epoch = 0
    train_history = pd.DataFrame()
    target_metric = args.target_metric
    target_metric_mode = 'max'
    best_metric_val = 0
    best_lb_checkpoint = os.path.join(exp_dir, f'{prefix}_{target_metric}.pth')

    model = U.get_model(args.model,
                        num_classes=args.num_classes,
                        num_channels=trainset.channels(),
                        abn=args.abn,
                        use_dropout=not args.no_dropout,
                        pretrained=not args.no_pretrain).cuda()

    print('Train set size :', len(ids_train), 'batch size', trainloader.batch_size)
    print('Valid set size :', len(ids_test), 'batch size', validloader.batch_size)
    print('Tile transform :', prepare_fn if prepare_fn is not None else "None")
    print('Model :', args.model, count_parameters(model))
    print('Augmentations :', args.augmentation, args.border_mode)
    print('Input channels :', trainset.channels())
    print('Output classes :', args.num_classes)
    print('Optimizer :', args.optimizer, 'wd', args.weight_decay)
    print('Use of dropout :', not args.no_dropout)
    print('Train session :', train_session)
    print('Freeze encoder :', args.freeze_encoder)
    print('Seed :', args.seed, args.split_seed)
    print('Restart every :', args.restart_every)
    print('Fold :', args.fold, args.stratify)
    print('Fine-tune :', args.fine_tune)
    print('ABN Mode :', args.abn)
    print('Fix masks :', args.fix_masks)

    if args.resume:
        fname = U.auto_file(args.resume)
        start_epoch, train_history, best_score = U.restore_checkpoint(
            fname, model)
        print(train_history)
        print('Resuming training from epoch', start_epoch, ' and score',
              best_score, args.resume)

    writer = SummaryWriter(log_dir)
    writer.add_text('train/params',
                    '```' + json.dumps(train_session_args, indent=2) + '```',
                    0)

    config_fname = os.path.join(exp_dir, f'{train_session}.json')
    with open(config_fname, 'w') as config_file:
        json.dump(train_session_args, config_file, indent=2)

    # Per-output loss weights, shared by all phases.
    weights = {
        'mask': 1.0,
        'class': 0.05,
        'dsv': 0.1,
    }

    bce = U.get_loss('bce')
    bce_lovasz = U.get_loss('bce_lovasz')
    bce_jaccard = U.get_loss('bce_jaccard')

    # Only the mask loss changes between phases.
    losses = {
        'warmup': {
            'mask': bce,
            'class': bce,
            'dsv': bce,
        },
        'main': {
            'mask': bce_jaccard,
            'class': bce,
            'dsv': bce,
        },
        'annealing': {
            'mask': bce_lovasz,
            'class': bce,
            'dsv': bce,
        }
    }

    epochs = {'warmup': 50, 'main': 250, 'annealing': 50}
    if args.fast:
        # Fast mode: a single epoch per phase.
        for key in epochs.keys():
            epochs[key] = 1

    learning_rates = {
        'warmup': args.learning_rate,
        'main': 1e-3,
        'annealing': 1e-2
    }

    # Warmup phase
    if epochs['warmup']:
        print(torch.cuda.max_memory_allocated(),
              torch.cuda.max_memory_cached())
        trainable_parameters = filter(lambda p: p.requires_grad,
                                      model.parameters())
        optimizer = U.get_optimizer(args.optimizer,
                                    trainable_parameters,
                                    learning_rates['warmup'],
                                    weight_decay=args.weight_decay)
        scheduler = None  # no LR schedule during warmup

        train_history, best_metric_val, start_epoch = train(
            model, losses['warmup'], weights, optimizer, scheduler,
            trainloader, validloader, writer, start_epoch,
            epochs=epochs['warmup'],
            early_stopping=args.early_stopping,
            train_history=train_history,
            experiment_dir=exp_dir,
            target_metric=target_metric,
            best_metric_val=best_metric_val,
            target_metric_mode=target_metric_mode,
            checkpoint_filename=best_lb_checkpoint)

        U.save_checkpoint(os.path.join(exp_dir, f'{prefix}_warmup.pth'),
                          model, start_epoch, train_history,
                          metric_name=target_metric,
                          metric_score=best_metric_val)

        # Drop optimizer state before the next phase allocates its own.
        del trainable_parameters, optimizer, scheduler
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        print('Finished warmup phase. Main train loop.')

    # Main training phase
    print(torch.cuda.max_memory_allocated(), torch.cuda.max_memory_cached())
    trainable_parameters = filter(lambda p: p.requires_grad,
                                  model.parameters())
    optimizer = U.get_optimizer(args.optimizer,
                                trainable_parameters,
                                learning_rates['main'],
                                weight_decay=args.weight_decay)
    scheduler = ReduceLROnPlateau(optimizer, mode='max', patience=50,
                                  factor=0.5, min_lr=1e-5)

    train_history, best_metric_val, start_epoch = train(
        model, losses['main'], weights, optimizer, scheduler,
        trainloader, validloader, writer, start_epoch,
        epochs=epochs['main'],
        early_stopping=args.early_stopping,
        train_history=train_history,
        experiment_dir=exp_dir,
        target_metric=target_metric,
        best_metric_val=best_metric_val,
        target_metric_mode=target_metric_mode,
        checkpoint_filename=best_lb_checkpoint)

    del trainable_parameters, optimizer, scheduler
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

    snapshots = [best_lb_checkpoint]

    U.save_checkpoint(os.path.join(exp_dir, f'{prefix}_main.pth'),
                      model, start_epoch, train_history,
                      metric_name=target_metric,
                      metric_score=best_metric_val)
    print('Finished train phase.')

    # Cosine annealing
    if epochs['annealing']:
        for snapshot in range(5):
            print(f'Starting annealing phase {snapshot}')
            print(torch.cuda.max_memory_allocated(),
                  torch.cuda.max_memory_cached())

            trainable_parameters = filter(lambda p: p.requires_grad,
                                          model.parameters())
            optimizer = U.get_optimizer('sgd',
                                        trainable_parameters,
                                        learning_rates['annealing'],
                                        weight_decay=args.weight_decay)
            scheduler = CosineAnnealingLR(optimizer, epochs['annealing'],
                                          eta_min=1e-7)

            snapshot_name = os.path.join(
                exp_dir, f'{prefix}_{target_metric}_snapshot_{snapshot}.pth')
            snapshots.append(snapshot_name)

            # best_metric_val is reset to 0 for every cycle so that each
            # snapshot file tracks the best epoch of its own cycle.
            train_history, best_metric_val, start_epoch = train(
                model, losses['annealing'], weights, optimizer, scheduler,
                trainloader, validloader, writer, start_epoch,
                epochs=epochs['annealing'],
                early_stopping=args.early_stopping,
                train_history=train_history,
                experiment_dir=exp_dir,
                target_metric=target_metric,
                best_metric_val=0,
                target_metric_mode=target_metric_mode,
                checkpoint_filename=snapshot_name)

            del trainable_parameters, optimizer, scheduler
            torch.cuda.empty_cache()
            torch.cuda.synchronize()

    print('Training finished')
    train_history.to_csv(os.path.join(exp_dir, 'train_history.csv'),
                         index=False)

    for snapshot_file in snapshots:
        generate_model_submission(snapshot_file, config_fname,
                                  mine_on_val=True)
def main():
    """Run a single-loop training session with periodic optimizer restarts.

    Steps, in order:

    1. Parse command-line arguments, validate mutually exclusive options and
       seed the RNGs via ``U.set_manual_seed``.
    2. Create the experiment directory and read the (pre-filtered) train ids,
       depths, images and masks; optionally fix masks and record the changed
       ids in ``fixed_masks.txt``.
    3. Split into train/validation (by fold when ``args.fold`` is given,
       otherwise an unshuffled 80/20 split).
    4. Build datasets, data loaders and the model (moved to CUDA).
    5. Optionally restore a checkpoint. The restored score is only printed;
       ``best_metric_val`` still starts from 0.
    6. Train for ``args.epochs`` epochs, recreating the optimizer (and the
       LR scheduler, if configured) on the first epoch, every
       ``args.restart_every`` epochs and when the encoder gets unfrozen.
       The best checkpoint by ``args.target_metric`` is saved as it improves.
    7. Call ``generate_model_submission`` for the best checkpoint.

    Raises:
        ValueError: If both fine-tuning and encoder freezing are requested.
        RuntimeError: If the train or validation ids are not sorted.
    """
    parser = U.get_argparser()
    args = parser.parse_args()

    # Fail fast: reject incompatible options before any directory is created,
    # any data is read or any GPU memory is allocated.
    if args.fine_tune and args.freeze_encoder > 0:
        raise ValueError(
            'Incompatible options --fine-tune and --freeze-encoder')

    U.set_manual_seed(args.seed)

    # NOTE: vars() returns the namespace's own dict, so the update() below
    # also becomes visible through ``args``.
    train_session_args = vars(args)
    train_session = U.get_random_name()
    current_time = datetime.now().strftime('%b%d_%H_%M')
    prefix = f'{current_time}_{args.model}_{args.prepare}_{args.augmentation}_{train_session}'
    if args.fold is not None:
        prefix += f'_fold_{args.stratify}_{args.fold}'

    log_dir = os.path.join('runs', prefix)
    exp_dir = os.path.join('experiments', args.model, args.prepare,
                           args.augmentation, prefix)
    os.makedirs(exp_dir, exist_ok=True)

    train_ids = D.get_train_ids(drop_black=True,
                                drop_vstrips=args.drop_vstrips,
                                drop_empty=args.drop_empty,
                                drop_few=args.drop_few,
                                fast=args.fast)
    depths = D.read_depths(train_ids)
    images = D.read_train_images(train_ids)
    masks = D.read_train_masks(train_ids)

    if args.fix_masks:
        masks, changed_ids = D.fix_masks(masks, train_ids)
        with open(os.path.join(exp_dir, 'fixed_masks.txt'), 'w') as fixed_file:
            for sample_id in changed_ids:
                fixed_file.write(sample_id)
                fixed_file.write('\n')
        print(f'Fixed {len(changed_ids)} masks')

    if args.fold is not None:
        train_indexes, test_indexes = D.get_train_test_split_for_fold(
            args.stratify, args.fold, train_ids)
    else:
        train_indexes, test_indexes = train_test_split(
            np.arange(len(train_ids)),
            shuffle=False,
            random_state=args.split_seed,
            test_size=0.2)

    ids_train, ids_test = train_ids[train_indexes], train_ids[test_indexes]

    if not is_sorted(ids_train):
        raise RuntimeError("ids_train is not sorted")
    if not is_sorted(ids_test):
        raise RuntimeError("ids_test_sorted is not sorted")

    img_train, img_test = images[train_indexes], images[test_indexes]
    mask_train, mask_test = masks[train_indexes], masks[test_indexes]
    depth_train, depth_test = depths[train_indexes], depths[test_indexes]

    prepare_fn = D.get_prepare_fn(args.prepare, **train_session_args)

    # This line valid if we apply prepare_fn first and then do augmentation
    target_size = prepare_fn.target_size if prepare_fn is not None else D.ORIGINAL_SIZE

    build_augmentation_fn = D.AUGMENTATION_MODES[args.augmentation]
    aug = build_augmentation_fn(target_size, border_mode=args.border_mode)

    # Validation only gets the (optional) prepare transform; training gets
    # prepare followed by augmentation.
    train_transform_list = []
    valid_transform_list = []
    if prepare_fn is not None:
        train_transform_list.append(prepare_fn.t_forward)
        valid_transform_list.append(prepare_fn.t_forward)
    train_transform_list.append(aug)

    trainset = D.ImageAndMaskDataset(ids_train, img_train, mask_train,
                                     depth_train,
                                     augment=A.Compose(train_transform_list))
    validset = D.ImageAndMaskDataset(ids_test, img_test, mask_test,
                                     depth_test,
                                     augment=A.Compose(valid_transform_list))

    trainloader = DataLoader(trainset,
                             batch_size=args.batch_size,
                             num_workers=args.workers,
                             pin_memory=True,
                             drop_last=True,
                             shuffle=True)
    validloader = DataLoader(validset,
                             batch_size=args.batch_size,
                             pin_memory=True,
                             drop_last=False,
                             shuffle=False)

    # Save train/val split for future use
    train_session_args.update({
        'train_set': list(ids_train),
        'valid_set': list(ids_test)
    })

    # Declare variables we will use during training
    start_epoch = 0
    train_history = pd.DataFrame()
    scheduler = None
    optimizer = None
    target_metric = args.target_metric
    target_metric_mode = 'max'
    best_metric_val = 0
    best_lb_checkpoint = os.path.join(exp_dir, f'{prefix}_{target_metric}.pth')

    model = U.get_model(args.model,
                        num_classes=args.num_classes,
                        num_channels=trainset.channels(),
                        abn=args.abn,
                        use_dropout=not args.no_dropout,
                        pretrained=not args.no_pretrain).cuda()

    # NOTE: len(loader) is the number of batches, not the number of samples.
    print('Train set size :', len(trainloader), 'batch size', trainloader.batch_size)
    print('Valid set size :', len(validloader), 'batch size', validloader.batch_size)
    print('Tile transform :', prepare_fn if prepare_fn is not None else "None")
    print('Model :', args.model, count_parameters(model))
    print('Augmentations :', args.augmentation, args.border_mode)
    print('Input channels :', trainset.channels())
    print('Output classes :', args.num_classes)
    print('Criterion :', args.loss)
    print('Optimizer :', args.optimizer, args.learning_rate, args.weight_decay)
    print('Use of dropout :', not args.no_dropout)
    print('Train session :', train_session)
    print('Freeze encoder :', args.freeze_encoder)
    print('Seed :', args.seed, args.split_seed)
    print('Restart every :', args.restart_every)
    print('Fold :', args.fold, args.stratify)
    print('Fine-tune :', args.fine_tune)
    print('ABN Mode :', args.abn)
    print('Fix masks :', args.fix_masks)

    if args.resume:
        fname = U.auto_file(args.resume)
        start_epoch, train_history, best_score = U.restore_checkpoint(
            fname, model)
        print(train_history)
        print('Resuming training from epoch', start_epoch, ' and score',
              best_score, args.resume)

    segmentation_loss = U.get_loss(args.loss)

    writer = SummaryWriter(log_dir)
    writer.add_text('train/params',
                    '```' + json.dumps(train_session_args, indent=2) + '```',
                    0)

    config_fname = os.path.join(exp_dir, f'{train_session}.json')
    with open(config_fname, 'w') as config_file:
        json.dump(train_session_args, config_file, indent=2)

    # prepare_fn is treated as optional above, so do not dereference it
    # unconditionally here.
    # NOTE(review): assumes process_epoch accepts mask_postprocess=None as
    # "no post-processing" - confirm against its signature.
    mask_postprocess = prepare_fn.backward if prepare_fn is not None else None

    # Start training loop
    no_improvement_epochs = 0
    for epoch in range(start_epoch, start_epoch + args.epochs):
        # On Epoch begin
        stop_early = (args.early_stopping is not None
                      and no_improvement_epochs > args.early_stopping)
        if U.should_quit(exp_dir) or stop_early:
            break

        epochs_trained = epoch - start_epoch
        should_restart_optimizer = (
            (args.restart_every > 0
             and epochs_trained % args.restart_every == 0)
            or epochs_trained == args.freeze_encoder
            or optimizer is None)

        if should_restart_optimizer:
            # Release the previous optimizer state before building a new one.
            del optimizer

            if args.fine_tune:
                model.set_fine_tune(args.fine_tune)
            else:
                model.set_encoder_training_enabled(
                    epochs_trained >= args.freeze_encoder)

            trainable_parameters = filter(lambda p: p.requires_grad,
                                          model.parameters())
            optimizer = U.get_optimizer(args.optimizer,
                                        trainable_parameters,
                                        args.learning_rate,
                                        weight_decay=args.weight_decay)
            print('Restarting optimizer state', epoch, count_parameters(model))

            if args.lr_scheduler:
                # The scheduler is bound to the optimizer instance, so it is
                # rebuilt together with it.
                scheduler = U.get_lr_scheduler(args.lr_scheduler, optimizer,
                                               args.epochs)

        # Epoch-driven schedulers step at epoch start; ReduceLROnPlateau
        # needs the metric and therefore steps at epoch end.
        if scheduler is not None and not isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(epochs_trained)

        U.log_learning_rate(writer, optimizer, epoch)

        # Epoch
        train_metrics = process_epoch(model, segmentation_loss, optimizer,
                                      trainloader, epoch, True, writer,
                                      mask_postprocess=mask_postprocess)
        valid_metrics = process_epoch(model, segmentation_loss, None,
                                      validloader, epoch, False, writer,
                                      mask_postprocess=mask_postprocess)

        # Validation metrics win on key collisions, as before.
        all_metrics = {**train_metrics, **valid_metrics}

        # On Epoch End
        summary = {
            'epoch': [int(epoch)],
            'lr': [float(optimizer.param_groups[0]['lr'])]
        }
        summary.update({name: [value] for name, value in all_metrics.items()})

        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent and exists in older versions as well.
        train_history = pd.concat(
            [train_history, pd.DataFrame.from_dict(summary)],
            ignore_index=True)
        print(epoch, summary)

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(all_metrics[target_metric], epochs_trained)

        if U.is_better(all_metrics[target_metric], best_metric_val,
                       target_metric_mode):
            best_metric_val = all_metrics[target_metric]
            U.save_checkpoint(best_lb_checkpoint, model, epoch, train_history,
                              metric_name=target_metric,
                              metric_score=best_metric_val)
            print('Checkpoint saved', epoch, best_metric_val,
                  best_lb_checkpoint)
            no_improvement_epochs = 0
        else:
            no_improvement_epochs += 1

    print('Training finished')
    generate_model_submission(best_lb_checkpoint, config_fname,
                              mine_on_val=True)
from lib import dataset as D import pandas as pd if __name__ == '__main__': train_ids = D.get_train_ids(drop_black=False, drop_vstrips=False, drop_empty=False, drop_few=False) images = D.read_train_images(train_ids) masks = D.read_train_masks(train_ids) depths = D.read_depths(train_ids) folds_by_salt = D.get_folds_vector('coverage', images, masks, depths, n_folds=5) folds_by_depth = D.get_folds_vector('depth', images, masks, depths, n_folds=5) folds_by_rnd = D.get_folds_vector(None, images, masks, depths, n_folds=5, random_state=42) pd.DataFrame.from_dict({ 'id': train_ids,